Add OCR, Background Removal, and PDF Editor features with tests

- Implemented OCR functionality using pytesseract for image and PDF text extraction.
- Added Background Removal service using rembg for image processing.
- Developed PDF Editor service for applying text annotations to PDF files.
- Created corresponding API routes for OCR, Background Removal, and PDF Editor.
- Added frontend components for OCR and Background Removal tools.
- Integrated feature flagging for new tools, ensuring they are disabled by default.
- Implemented comprehensive unit tests for OCR service, PDF editor, and background removal.
- Updated documentation to reflect new features and usage instructions.
- Added translations for new features in English, Arabic, and French.
2026-03-07 21:29:08 +02:00
parent 71f7d0382d
commit 6bb76e3f1b
28 changed files with 1975 additions and 0 deletions
--- a/backend/app/services/ocr_service.py
+++ b/backend/app/services/ocr_service.py
@@ -0,0 +1,121 @@
+"""OCR service — extract text from images and PDFs using Tesseract."""
+import logging
+import os
+import subprocess
+import tempfile
+
+from PIL import Image
+
+logger = logging.getLogger(__name__)
+
+
+class OCRError(Exception):
+    """Raised by the OCR service when text extraction cannot be completed."""
+
+
+# Tesseract language code -> human-readable language name.
+SUPPORTED_LANGUAGES: dict[str, str] = {
+    "eng": "English",
+    "ara": "Arabic",
+    "fra": "French",
+}
+
+# Language code used as the default ``lang`` argument of the OCR functions.
+DEFAULT_LANG: str = "eng"
+
+
+def _get_tesseract_cmd() -> str:
+    """Resolve the tesseract executable to invoke.
+
+    Returns:
+        The value of the ``TESSERACT_CMD`` environment variable when it is
+        set, otherwise the bare command name ``"tesseract"``.
+    """
+    return os.environ.get("TESSERACT_CMD", "tesseract")
+
+
+def ocr_image(input_path: str, lang: str = DEFAULT_LANG) -> dict:
+    """Extract text from an image file using Tesseract.
+
+    Args:
+        input_path: Path to the input image.
+        lang: Tesseract language code (e.g. "eng", "ara", "fra"). A code
+            that is not a key of ``SUPPORTED_LANGUAGES`` is silently
+            replaced by ``DEFAULT_LANG``.
+
+    Returns:
+        dict with ``text`` (OCR output with surrounding whitespace
+        stripped), ``lang`` (the language code actually used) and
+        ``char_count`` (length of ``text``).
+
+    Raises:
+        OCRError: If pytesseract cannot be imported or the OCR operation
+            fails for any other reason. The original exception is kept
+            as ``__cause__``.
+    """
+    if lang not in SUPPORTED_LANGUAGES:
+        lang = DEFAULT_LANG
+
+    try:
+        # Imported lazily so the module can be loaded without pytesseract.
+        import pytesseract
+
+        pytesseract.pytesseract.tesseract_cmd = _get_tesseract_cmd()
+
+        with Image.open(input_path) as img:
+            # Normalise palette/alpha/CMYK/etc. images to RGB before OCR.
+            # NOTE(review): convert("RGB") drops an alpha channel without
+            # compositing onto a background — confirm this is acceptable
+            # for transparent inputs.
+            if img.mode not in ("RGB", "L"):
+                img = img.convert("RGB")
+            text = pytesseract.image_to_string(img, lang=lang)
+
+        text = text.strip()
+        return {
+            "text": text,
+            "lang": lang,
+            "char_count": len(text),
+        }
+    except ImportError as e:
+        raise OCRError("pytesseract is not installed.") from e
+    except Exception as e:
+        # Chain the cause so the underlying traceback is not lost.
+        raise OCRError(f"OCR failed: {e}") from e
+
+
+def ocr_pdf(input_path: str, output_path: str, lang: str = DEFAULT_LANG) -> dict:
+    """Extract text from a scanned PDF by converting pages to images first.
+
+    Every page is rendered at 300 DPI, OCR'd, and prefixed with a
+    ``--- Page N ---`` header; pages are joined with a blank line and the
+    result is written to ``output_path`` as UTF-8.
+
+    Args:
+        input_path: Path to the input PDF.
+        output_path: Path for the output text file. Its parent directory
+            is created if missing; a bare filename (no directory part)
+            is written to the current working directory.
+        lang: Tesseract language code. A code that is not a key of
+            ``SUPPORTED_LANGUAGES`` is silently replaced by
+            ``DEFAULT_LANG``.
+
+    Returns:
+        dict with ``text``, ``page_count``, ``char_count``.
+
+    Raises:
+        OCRError: If a dependency is missing, the PDF yields no pages, or
+            the OCR operation fails. The original exception is kept as
+            ``__cause__`` where there is one.
+    """
+    if lang not in SUPPORTED_LANGUAGES:
+        lang = DEFAULT_LANG
+
+    try:
+        # Imported lazily so the module can be loaded without these packages.
+        from pdf2image import convert_from_path
+        import pytesseract
+
+        pytesseract.pytesseract.tesseract_cmd = _get_tesseract_cmd()
+
+        images = convert_from_path(input_path, dpi=300)
+        if not images:
+            raise OCRError("Could not convert PDF to images — file may be empty.")
+
+        all_text = []
+        for i, img in enumerate(images, 1):
+            if img.mode not in ("RGB", "L"):
+                img = img.convert("RGB")
+            page_text = pytesseract.image_to_string(img, lang=lang)
+            all_text.append(f"--- Page {i} ---\n{page_text.strip()}")
+
+        full_text = "\n\n".join(all_text)
+
+        # os.makedirs("") raises FileNotFoundError, so only create the
+        # parent directory when output_path actually has one.
+        output_dir = os.path.dirname(output_path)
+        if output_dir:
+            os.makedirs(output_dir, exist_ok=True)
+        with open(output_path, "w", encoding="utf-8") as f:
+            f.write(full_text)
+
+        return {
+            "text": full_text,
+            "page_count": len(images),
+            "char_count": len(full_text),
+        }
+    except ImportError as e:
+        raise OCRError(f"Missing dependency: {e}") from e
+    except OCRError:
+        # Already a service-level error (e.g. empty PDF); pass it through.
+        raise
+    except Exception as e:
+        # Chain the cause so the underlying traceback is not lost.
+        raise OCRError(f"PDF OCR failed: {e}") from e
--- a/backend/app/services/pdf_editor_service.py
+++ b/backend/app/services/pdf_editor_service.py
@@ -0,0 +1,120 @@
+"""PDF Editor service — add text annotations and simple edits to PDFs."""
+import io
+import logging
+import os
+
+logger = logging.getLogger(__name__)
+
+
+class PDFEditorError(Exception):
+    """Raised by the PDF editor service when an edit cannot be applied."""
+
+
+def apply_pdf_edits(input_path: str, output_path: str, edits: list[dict]) -> dict:
+    """Apply a list of edits (text annotations) to an existing PDF.
+
+    Each edit dict can contain:
+        - type: "text" (default); edits of any other type are ignored
+        - page: 1-based page number (default 1); edits whose page is
+          outside the document are skipped
+        - x, y: position in points from bottom-left (default 72, 72)
+        - content: text string to place
+        - fontSize: optional, default 12
+        - color: optional hex e.g. "#000000"; an unparsable value falls
+          back to black
+
+    Text is drawn in Helvetica on a per-page overlay that is merged onto
+    the original page. Pages without edits are copied unchanged.
+
+    Args:
+        input_path: Path to the source PDF.
+        output_path: Path for the edited PDF. Its parent directory is
+            created if missing; a bare filename (no directory part) is
+            written to the current working directory.
+        edits: List of edit operation dicts.
+
+    Returns:
+        dict with ``page_count``, ``edits_applied`` (number of text edits
+        actually drawn), ``output_size`` (bytes).
+
+    Raises:
+        PDFEditorError: If ``edits`` is empty, the PDF has no pages, or
+            reading/annotating/writing fails. The original exception is
+            kept as ``__cause__`` where there is one.
+    """
+    if not edits:
+        raise PDFEditorError("No edits provided.")
+
+    # os.makedirs("") raises FileNotFoundError, so only create the parent
+    # directory when output_path actually has one.
+    output_dir = os.path.dirname(output_path)
+    if output_dir:
+        os.makedirs(output_dir, exist_ok=True)
+
+    try:
+        # Imported lazily so the module can be loaded without these packages.
+        from PyPDF2 import PdfReader, PdfWriter
+        from reportlab.pdfgen import canvas
+        from reportlab.lib.colors import HexColor
+
+        reader = PdfReader(input_path)
+        writer = PdfWriter()
+        page_count = len(reader.pages)
+
+        if page_count == 0:
+            raise PDFEditorError("PDF has no pages.")
+
+        # Group edits by page, dropping those that target a missing page.
+        edits_by_page: dict[int, list[dict]] = {}
+        for edit in edits:
+            page_num = int(edit.get("page", 1))
+            if page_num < 1 or page_num > page_count:
+                continue
+            edits_by_page.setdefault(page_num, []).append(edit)
+
+        edits_applied = 0
+
+        for page_num, page in enumerate(reader.pages, 1):
+            page_edits = edits_by_page.get(page_num, [])
+
+            if page_edits:
+                # Size the overlay to the page it will be merged onto.
+                media_box = page.mediabox
+                page_width = float(media_box.width)
+                page_height = float(media_box.height)
+
+                # Draw the annotations on an in-memory single-page PDF.
+                packet = io.BytesIO()
+                c = canvas.Canvas(packet, pagesize=(page_width, page_height))
+
+                for edit in page_edits:
+                    edit_type = edit.get("type", "text")
+                    if edit_type == "text":
+                        x = float(edit.get("x", 72))
+                        y = float(edit.get("y", 72))
+                        content = str(edit.get("content", ""))
+                        font_size = int(edit.get("fontSize", 12))
+                        color = str(edit.get("color", "#000000"))
+
+                        # Best-effort: an invalid color must not abort the
+                        # whole edit, so fall back to black.
+                        try:
+                            c.setFillColor(HexColor(color))
+                        except Exception:
+                            c.setFillColor(HexColor("#000000"))
+
+                        c.setFont("Helvetica", font_size)
+                        c.drawString(x, y, content)
+                        edits_applied += 1
+
+                c.save()
+                packet.seek(0)
+
+                overlay_reader = PdfReader(packet)
+                if len(overlay_reader.pages) > 0:
+                    page.merge_page(overlay_reader.pages[0])
+
+            writer.add_page(page)
+
+        with open(output_path, "wb") as f:
+            writer.write(f)
+
+        output_size = os.path.getsize(output_path)
+
+        return {
+            "page_count": page_count,
+            "edits_applied": edits_applied,
+            "output_size": output_size,
+        }
+
+    except PDFEditorError:
+        # Already a service-level error (e.g. no pages); pass it through.
+        raise
+    except Exception as e:
+        # Chain the cause so the underlying traceback is not lost.
+        raise PDFEditorError(f"PDF editing failed: {e}") from e
--- a/backend/app/services/removebg_service.py
+++ b/backend/app/services/removebg_service.py
@@ -0,0 +1,60 @@
+"""Background removal service using rembg."""
+import logging
+import os
+
+from PIL import Image
+
+logger = logging.getLogger(__name__)
+
+
+class RemoveBGError(Exception):
+    """Raised by the background removal service when processing fails."""
+
+
+def remove_background(input_path: str, output_path: str) -> dict:
+    """Remove the background from an image.
+
+    Args:
+        input_path: Path to the input image.
+        output_path: Path for the output PNG (always PNG — transparency).
+            Its parent directory is created if missing; a bare filename
+            (no directory part) is written to the current working
+            directory.
+
+    Returns:
+        dict with ``original_size`` and ``output_size`` (file sizes in
+        bytes), ``width``, ``height`` (dimensions of the input image).
+
+    Raises:
+        RemoveBGError: If rembg cannot be imported or the operation
+            fails. The original exception is kept as ``__cause__``.
+    """
+    # os.makedirs("") raises FileNotFoundError, so only create the parent
+    # directory when output_path actually has one.
+    output_dir = os.path.dirname(output_path)
+    if output_dir:
+        os.makedirs(output_dir, exist_ok=True)
+
+    try:
+        # Imported lazily so the module can be loaded without rembg.
+        from rembg import remove as rembg_remove
+
+        with Image.open(input_path) as img:
+            if img.mode != "RGBA":
+                img = img.convert("RGBA")
+            width, height = img.size
+            original_size = os.path.getsize(input_path)
+
+            result = rembg_remove(img)
+            result.save(output_path, format="PNG", optimize=True)
+
+        output_size = os.path.getsize(output_path)
+
+        logger.info(
+            "Background removed: %s → %s (%d → %d bytes)",
+            input_path, output_path, original_size, output_size,
+        )
+
+        return {
+            "original_size": original_size,
+            "output_size": output_size,
+            "width": width,
+            "height": height,
+        }
+    except ImportError as e:
+        raise RemoveBGError("rembg is not installed.") from e
+    except Exception as e:
+        # I/O errors and every other failure share one message; chain the
+        # cause so the underlying traceback is not lost.
+        raise RemoveBGError(f"Background removal failed: {e}") from e