Add OCR, Background Removal, and PDF Editor features with tests
- Implemented OCR functionality using pytesseract for image and PDF text extraction.
- Added Background Removal service using rembg for image processing.
- Developed PDF Editor service for applying text annotations to PDF files.
- Created corresponding API routes for OCR, Background Removal, and PDF Editor.
- Added frontend components for OCR and Background Removal tools.
- Integrated feature flagging for the new tools, ensuring they are disabled by default.
- Implemented unit tests for the OCR service, PDF editor, and background removal.
- Updated documentation to reflect the new features and usage instructions.
- Added translations for the new features in English, Arabic, and French.
This commit is contained in:
163
backend/tests/test_ocr.py
Normal file
163
backend/tests/test_ocr.py
Normal file
@@ -0,0 +1,163 @@
|
||||
"""Tests for OCR routes — /api/ocr/image, /api/ocr/pdf, /api/ocr/languages."""
|
||||
import io
|
||||
import json
|
||||
import os
|
||||
import tempfile
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
from tests.conftest import make_png_bytes, make_pdf_bytes
|
||||
|
||||
|
||||
# =========================================================================
|
||||
# Feature flag enforcement
|
||||
# =========================================================================
|
||||
class TestOcrFeatureFlag:
    """Feature-flag gating of the OCR endpoints under the default app config."""

    def test_ocr_image_disabled_by_default(self, client):
        """OCR image should return 403 when FEATURE_EDITOR is off."""
        upload = (io.BytesIO(make_png_bytes()), "test.png")
        resp = client.post(
            "/api/ocr/image",
            data={"file": upload},
            content_type="multipart/form-data",
        )
        assert resp.status_code == 403
        error_message = resp.get_json()["error"]
        assert "not enabled" in error_message

    def test_ocr_pdf_disabled_by_default(self, client):
        """OCR PDF should return 403 when FEATURE_EDITOR is off."""
        upload = (io.BytesIO(make_pdf_bytes()), "scan.pdf")
        resp = client.post(
            "/api/ocr/pdf",
            data={"file": upload},
            content_type="multipart/form-data",
        )
        assert resp.status_code == 403

    def test_languages_always_available(self, client):
        """GET /api/ocr/languages should work even when the feature is disabled."""
        resp = client.get("/api/ocr/languages")
        assert resp.status_code == 200
        available = resp.get_json()["languages"]
        # Each expected language code must be advertised by the endpoint.
        for code in ("eng", "ara", "fra"):
            assert code in available
|
||||
|
||||
|
||||
# =========================================================================
|
||||
# Validation
|
||||
# =========================================================================
|
||||
class TestOcrValidation:
    """Request validation for the OCR upload endpoints with the feature enabled."""

    def test_ocr_image_no_file(self, client, app, monkeypatch):
        """POST /api/ocr/image without a file part should return 400."""
        # setitem (instead of a bare assignment) restores the flag at teardown,
        # so the "disabled by default" tests hold whatever the app fixture's scope.
        monkeypatch.setitem(app.config, "FEATURE_EDITOR", True)
        response = client.post("/api/ocr/image")
        assert response.status_code == 400
        assert "No file" in response.get_json()["error"]

    def test_ocr_pdf_no_file(self, client, app, monkeypatch):
        """POST /api/ocr/pdf without a file part should return 400."""
        # See test_ocr_image_no_file: the flag change is undone after the test.
        monkeypatch.setitem(app.config, "FEATURE_EDITOR", True)
        response = client.post("/api/ocr/pdf")
        assert response.status_code == 400
        assert "No file" in response.get_json()["error"]
|
||||
|
||||
|
||||
# =========================================================================
|
||||
# Success paths
|
||||
# =========================================================================
|
||||
class TestOcrSuccess:
    """Happy-path tests: valid OCR uploads are accepted and handed to a task."""

    @staticmethod
    def _stub_route(monkeypatch, tmp_path, *, filename, extension, task_attr, task_id):
        """Stub the OCR route's validation, path and task-dispatch helpers.

        Args:
            monkeypatch: pytest ``monkeypatch`` fixture used to install the stubs.
            tmp_path: pytest-managed temporary directory for the fake save path.
            filename: Name the stubbed ``validate_actor_file`` reports.
            extension: Extension the stubbed ``validate_actor_file`` reports.
            task_attr: Attribute of ``app.routes.ocr`` whose ``delay`` is mocked.
            task_id: ``id`` carried by the object the mocked ``delay`` returns.

        Returns:
            The ``MagicMock`` installed as ``delay``, for call inspection.
        """
        # tmp_path is cleaned up by pytest; tempfile.mkdtemp() directories
        # were never removed and accumulated on every run.
        save_path = str(tmp_path / f"mock.{extension}")

        queued_task = MagicMock()
        queued_task.id = task_id
        delay = MagicMock(return_value=queued_task)

        # Parameter names of the lambdas are kept as-is in case the route
        # passes these arguments by keyword.
        monkeypatch.setattr(
            "app.routes.ocr.validate_actor_file",
            lambda f, allowed_types, actor: (filename, extension),
        )
        monkeypatch.setattr(
            "app.routes.ocr.generate_safe_path",
            lambda ext, folder_type: ("mock-id", save_path),
        )
        monkeypatch.setattr(f"app.routes.ocr.{task_attr}.delay", delay)
        return delay

    def test_ocr_image_success(self, client, app, monkeypatch, tmp_path):
        """Should return 202 with task_id when valid image provided."""
        # setitem restores the flag after the test instead of leaking it.
        monkeypatch.setitem(app.config, "FEATURE_EDITOR", True)
        self._stub_route(
            monkeypatch,
            tmp_path,
            filename="test.png",
            extension="png",
            task_attr="ocr_image_task",
            task_id="ocr-img-task-1",
        )

        data = {"file": (io.BytesIO(make_png_bytes()), "test.png"), "lang": "eng"}
        response = client.post(
            "/api/ocr/image",
            data=data,
            content_type="multipart/form-data",
        )
        assert response.status_code == 202
        assert response.get_json()["task_id"] == "ocr-img-task-1"

    def test_ocr_pdf_success(self, client, app, monkeypatch, tmp_path):
        """Should return 202 with task_id when valid PDF provided."""
        monkeypatch.setitem(app.config, "FEATURE_EDITOR", True)
        self._stub_route(
            monkeypatch,
            tmp_path,
            filename="scan.pdf",
            extension="pdf",
            task_attr="ocr_pdf_task",
            task_id="ocr-pdf-task-1",
        )

        data = {"file": (io.BytesIO(make_pdf_bytes()), "scan.pdf"), "lang": "ara"}
        response = client.post(
            "/api/ocr/pdf",
            data=data,
            content_type="multipart/form-data",
        )
        assert response.status_code == 202
        assert response.get_json()["task_id"] == "ocr-pdf-task-1"

    def test_ocr_image_invalid_lang_falls_back(self, client, app, monkeypatch, tmp_path):
        """Invalid lang should fall back to 'eng' without error."""
        monkeypatch.setitem(app.config, "FEATURE_EDITOR", True)
        mock_delay = self._stub_route(
            monkeypatch,
            tmp_path,
            filename="test.png",
            extension="png",
            task_attr="ocr_image_task",
            task_id="ocr-lang-task",
        )

        data = {"file": (io.BytesIO(make_png_bytes()), "test.png"), "lang": "invalid"}
        response = client.post(
            "/api/ocr/image",
            data=data,
            content_type="multipart/form-data",
        )
        assert response.status_code == 202

        # Fail with a clear message if the task was never dispatched, rather
        # than a TypeError from indexing call_args == None below.
        mock_delay.assert_called_once()
        # Verify 'eng' was passed to the task (4th positional arg is lang).
        assert mock_delay.call_args[0][3] == "eng"
|
||||
66
backend/tests/test_ocr_service.py
Normal file
66
backend/tests/test_ocr_service.py
Normal file
@@ -0,0 +1,66 @@
|
||||
"""Tests for OCR service and PDF editor service — unit tests with mocking."""
|
||||
import os
|
||||
import sys
|
||||
import tempfile
|
||||
|
||||
import pytest
|
||||
from unittest.mock import patch, MagicMock
|
||||
|
||||
from app.services.ocr_service import ocr_image, OCRError, SUPPORTED_LANGUAGES
|
||||
|
||||
|
||||
class TestOcrServiceConstants:
    """Checks on the constants exported by the OCR service module."""

    def test_supported_languages(self):
        """SUPPORTED_LANGUAGES holds exactly the three expected language codes."""
        for code in ("eng", "ara", "fra"):
            assert code in SUPPORTED_LANGUAGES
        assert len(SUPPORTED_LANGUAGES) == 3
|
||||
|
||||
|
||||
class TestOcrImage:
    """Unit tests for ``ocr_image`` with pytesseract and PIL replaced by mocks."""

    @staticmethod
    def _run_ocr(ocr_text, lang):
        """Call ``ocr_image`` on a fake path with pytesseract and ``Image`` mocked.

        ``ocr_text`` is what the fake ``image_to_string`` returns; the result
        of ``ocr_image`` is handed back for the caller to assert on.
        """
        fake_tesseract = MagicMock()
        fake_tesseract.image_to_string.return_value = ocr_text
        fake_tesseract.pytesseract.tesseract_cmd = ""

        # The fake image works as a context manager that yields itself.
        fake_image = MagicMock()
        fake_image.mode = "RGB"
        fake_image.__enter__.return_value = fake_image
        fake_image.__exit__.return_value = False

        with patch.dict(sys.modules, {"pytesseract": fake_tesseract}):
            with patch("app.services.ocr_service.Image") as fake_pil:
                fake_pil.open.return_value = fake_image
                return ocr_image("/fake/path.png", lang="eng" if lang == "eng" else lang)

    def test_ocr_image_success(self):
        """Should return text and char_count from image (mocked pytesseract)."""
        outcome = self._run_ocr(" Hello World ", "eng")

        assert outcome["text"] == "Hello World"
        assert outcome["char_count"] == 11
        assert outcome["lang"] == "eng"

    def test_ocr_image_invalid_lang_fallback(self):
        """Invalid language should fall back to 'eng'."""
        outcome = self._run_ocr("Test", "zzzz")

        assert outcome["lang"] == "eng"
|
||||
|
||||
|
||||
class TestPdfEditorService:
    """Unit tests for the PDF editor service."""

    def test_no_edits_raises(self):
        """An empty edit list should raise PDFEditorError matching "No edits"."""
        # Imported inside the test, so an import problem in the PDF editor
        # service fails only this test rather than the whole module.
        from app.services.pdf_editor_service import PDFEditorError, apply_pdf_edits

        no_edits = []
        with pytest.raises(PDFEditorError, match="No edits"):
            apply_pdf_edits("/fake.pdf", "/out.pdf", no_edits)
|
||||
144
backend/tests/test_pdf_editor.py
Normal file
144
backend/tests/test_pdf_editor.py
Normal file
@@ -0,0 +1,144 @@
|
||||
"""Tests for PDF editor route — /api/pdf-editor/edit."""
|
||||
import io
|
||||
import json
|
||||
import os
|
||||
import tempfile
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
from tests.conftest import make_pdf_bytes
|
||||
|
||||
|
||||
# =========================================================================
|
||||
# Feature flag enforcement
|
||||
# =========================================================================
|
||||
class TestPdfEditorFeatureFlag:
    """Feature-flag gating of the PDF editor endpoint under the default config."""

    def test_pdf_editor_disabled_by_default(self, client):
        """Should return 403 when FEATURE_EDITOR is off."""
        single_edit = [{"type": "text", "page": 1, "x": 100, "y": 200, "content": "Hello"}]
        form = {
            "file": (io.BytesIO(make_pdf_bytes()), "doc.pdf"),
            "edits": json.dumps(single_edit),
        }
        resp = client.post(
            "/api/pdf-editor/edit",
            data=form,
            content_type="multipart/form-data",
        )
        assert resp.status_code == 403
        error_message = resp.get_json()["error"]
        assert "not enabled" in error_message
|
||||
|
||||
|
||||
# =========================================================================
|
||||
# Validation
|
||||
# =========================================================================
|
||||
class TestPdfEditorValidation:
    """Request validation for /api/pdf-editor/edit with the feature enabled."""

    @staticmethod
    def _post_edits(client, edits_field):
        """POST a sample PDF plus the raw ``edits`` form value to the edit route."""
        return client.post(
            "/api/pdf-editor/edit",
            data={
                "file": (io.BytesIO(make_pdf_bytes()), "doc.pdf"),
                "edits": edits_field,
            },
            content_type="multipart/form-data",
        )

    def test_pdf_editor_no_file(self, client, app, monkeypatch):
        """Should return 400 when no file provided."""
        # setitem (instead of a bare assignment) restores the flag at teardown,
        # so the "disabled by default" test holds whatever the app fixture's scope.
        monkeypatch.setitem(app.config, "FEATURE_EDITOR", True)
        response = client.post("/api/pdf-editor/edit")
        assert response.status_code == 400
        assert "No file" in response.get_json()["error"]

    def test_pdf_editor_invalid_json(self, client, app, monkeypatch):
        """Should return 400 when edits is invalid JSON."""
        monkeypatch.setitem(app.config, "FEATURE_EDITOR", True)
        response = self._post_edits(client, "not valid json{")
        assert response.status_code == 400
        assert "Invalid JSON" in response.get_json()["error"]

    def test_pdf_editor_edits_not_array(self, client, app, monkeypatch):
        """Should return 400 when edits is not an array."""
        monkeypatch.setitem(app.config, "FEATURE_EDITOR", True)
        response = self._post_edits(client, json.dumps({"type": "text"}))
        assert response.status_code == 400
        assert "JSON array" in response.get_json()["error"]

    def test_pdf_editor_empty_edits(self, client, app, monkeypatch):
        """Should return 400 when edits array is empty."""
        monkeypatch.setitem(app.config, "FEATURE_EDITOR", True)
        response = self._post_edits(client, json.dumps([]))
        assert response.status_code == 400
        assert "At least one edit" in response.get_json()["error"]

    def test_pdf_editor_too_many_edits(self, client, app, monkeypatch):
        """Should return 400 when more than 500 edits."""
        monkeypatch.setitem(app.config, "FEATURE_EDITOR", True)
        # 501 entries: one more than the limit named in the expected error.
        edits = [{"type": "text", "page": 1, "x": 10, "y": 10, "content": "x"}] * 501
        response = self._post_edits(client, json.dumps(edits))
        assert response.status_code == 400
        assert "500" in response.get_json()["error"]
|
||||
|
||||
|
||||
# =========================================================================
|
||||
# Success paths
|
||||
# =========================================================================
|
||||
class TestPdfEditorSuccess:
    """Happy-path test: a valid edit request is accepted and handed to a task."""

    def test_pdf_editor_success(self, client, app, monkeypatch, tmp_path):
        """Should return 202 with task_id when valid request provided."""
        # setitem restores the flag after the test instead of leaking it.
        monkeypatch.setitem(app.config, "FEATURE_EDITOR", True)

        mock_task = MagicMock()
        mock_task.id = "edit-task-1"

        # tmp_path is cleaned up by pytest; the tempfile.mkdtemp() directory
        # used before was never removed.
        save_path = str(tmp_path / "mock.pdf")

        # Stub validation, path generation and task dispatch on the route.
        monkeypatch.setattr(
            "app.routes.pdf_editor.validate_actor_file",
            lambda f, allowed_types, actor: ("doc.pdf", "pdf"),
        )
        monkeypatch.setattr(
            "app.routes.pdf_editor.generate_safe_path",
            lambda ext, folder_type: ("mock-id", save_path),
        )
        monkeypatch.setattr(
            "app.routes.pdf_editor.edit_pdf_task.delay",
            MagicMock(return_value=mock_task),
        )

        edits = [
            {"type": "text", "page": 1, "x": 100, "y": 200, "content": "Hello World", "fontSize": 14},
        ]
        data = {
            "file": (io.BytesIO(make_pdf_bytes()), "doc.pdf"),
            "edits": json.dumps(edits),
        }
        response = client.post(
            "/api/pdf-editor/edit",
            data=data,
            content_type="multipart/form-data",
        )
        assert response.status_code == 202
        body = response.get_json()
        assert body["task_id"] == "edit-task-1"
        assert "PDF editing started" in body["message"]
|
||||
73
backend/tests/test_removebg.py
Normal file
73
backend/tests/test_removebg.py
Normal file
@@ -0,0 +1,73 @@
|
||||
"""Tests for background removal route — /api/remove-bg."""
|
||||
import io
|
||||
import os
|
||||
import tempfile
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
from tests.conftest import make_png_bytes, make_pdf_bytes
|
||||
|
||||
|
||||
# =========================================================================
|
||||
# Feature flag enforcement
|
||||
# =========================================================================
|
||||
class TestRemoveBgFeatureFlag:
    """Feature-flag gating of the background-removal endpoint by default."""

    def test_removebg_disabled_by_default(self, client):
        """Should return 403 when FEATURE_EDITOR is off."""
        upload = (io.BytesIO(make_png_bytes()), "photo.png")
        resp = client.post(
            "/api/remove-bg",
            data={"file": upload},
            content_type="multipart/form-data",
        )
        assert resp.status_code == 403
        error_message = resp.get_json()["error"]
        assert "not enabled" in error_message
|
||||
|
||||
|
||||
# =========================================================================
|
||||
# Validation
|
||||
# =========================================================================
|
||||
class TestRemoveBgValidation:
    """Request validation for /api/remove-bg with the feature enabled."""

    def test_removebg_no_file(self, client, app, monkeypatch):
        """Should return 400 when no file provided."""
        # setitem (instead of a bare assignment) restores the flag at teardown,
        # so the "disabled by default" test holds whatever the app fixture's scope.
        monkeypatch.setitem(app.config, "FEATURE_EDITOR", True)
        response = client.post("/api/remove-bg")
        assert response.status_code == 400
        assert "No file" in response.get_json()["error"]
|
||||
|
||||
|
||||
# =========================================================================
|
||||
# Success paths
|
||||
# =========================================================================
|
||||
class TestRemoveBgSuccess:
    """Happy-path test: a valid image upload is accepted and handed to a task."""

    def test_removebg_success(self, client, app, monkeypatch, tmp_path):
        """Should return 202 with task_id when valid image provided."""
        # setitem restores the flag after the test instead of leaking it.
        monkeypatch.setitem(app.config, "FEATURE_EDITOR", True)

        mock_task = MagicMock()
        mock_task.id = "rembg-task-1"

        # tmp_path is cleaned up by pytest; the tempfile.mkdtemp() directory
        # used before was never removed.
        save_path = str(tmp_path / "mock.png")

        # Stub validation, path generation and task dispatch on the route.
        monkeypatch.setattr(
            "app.routes.removebg.validate_actor_file",
            lambda f, allowed_types, actor: ("photo.png", "png"),
        )
        monkeypatch.setattr(
            "app.routes.removebg.generate_safe_path",
            lambda ext, folder_type: ("mock-id", save_path),
        )
        monkeypatch.setattr(
            "app.routes.removebg.remove_bg_task.delay",
            MagicMock(return_value=mock_task),
        )

        data = {"file": (io.BytesIO(make_png_bytes()), "photo.png")}
        response = client.post(
            "/api/remove-bg",
            data=data,
            content_type="multipart/form-data",
        )
        assert response.status_code == 202
        body = response.get_json()
        assert body["task_id"] == "rembg-task-1"
        assert "Background removal started" in body["message"]
|
||||
Reference in New Issue
Block a user