Add OCR, Background Removal, and PDF Editor features with tests

- Implemented OCR functionality using pytesseract for image and PDF text extraction.
- Added Background Removal service using rembg for image processing.
- Developed PDF Editor service for applying text annotations to PDF files.
- Created corresponding API routes for OCR, Background Removal, and PDF Editor.
- Added frontend components for OCR and Background Removal tools.
- Integrated feature flagging for new tools, ensuring they are disabled by default.
- Implemented comprehensive unit tests for OCR service, PDF editor, and background removal.
- Updated documentation to reflect new features and usage instructions.
- Added translations for new features in English, Arabic, and French.
2026-03-07 21:29:08 +02:00
parent 71f7d0382d
commit 6bb76e3f1b
28 changed files with 1975 additions and 0 deletions
--- a/backend/app/services/ocr_service.py
+++ b/backend/app/services/ocr_service.py
@@ -0,0 +1,121 @@
+"""OCR service — extract text from images and PDFs using Tesseract."""
+import logging
+import os
+import subprocess
+import tempfile
+
+from PIL import Image
+
+# Module-level logger, named after this module via ``__name__``.
+logger = logging.getLogger(__name__)
+
+
+class OCRError(Exception):
+    """Signal that an OCR operation could not be completed."""
+
+
+# Tesseract language codes mapped to their human-readable names.
+SUPPORTED_LANGUAGES = dict(
+    eng="English",
+    ara="Arabic",
+    fra="French",
+)
+
+# Code used when the caller does not pass a language.
+DEFAULT_LANG = "eng"
+
+
+def _get_tesseract_cmd() -> str:
+    """Return the Tesseract executable to invoke.
+
+    The ``TESSERACT_CMD`` environment variable takes precedence. When it is
+    unset or set to an empty string, the bare name ``"tesseract"`` is
+    returned instead.
+
+    Returns:
+        The configured command, or ``"tesseract"`` as the fallback.
+    """
+    # ``or`` (rather than a getenv default) also covers an exported-but-empty
+    # variable, which would otherwise be returned as an empty command string.
+    return os.getenv("TESSERACT_CMD") or "tesseract"
+
+
+def ocr_image(input_path: str, lang: str = DEFAULT_LANG) -> dict:
+    """Extract text from an image file using Tesseract.
+
+    Args:
+        input_path: Path to the input image.
+        lang: Tesseract language code (e.g. "eng", "ara", "fra"). A code that
+            is not a key of ``SUPPORTED_LANGUAGES`` is silently replaced by
+            ``DEFAULT_LANG``.
+
+    Returns:
+        dict with ``text`` (surrounding whitespace stripped), ``lang`` (the
+        code that was actually used) and ``char_count`` (length of ``text``).
+
+    Raises:
+        OCRError: If pytesseract cannot be imported or the OCR operation
+            fails. The underlying exception is attached as ``__cause__``.
+    """
+    if lang not in SUPPORTED_LANGUAGES:
+        lang = DEFAULT_LANG
+
+    # Import in its own ``try`` so that only a failed pytesseract import is
+    # reported as "not installed"; an ImportError raised later while opening
+    # or processing the image is reported as an OCR failure instead.
+    try:
+        import pytesseract
+    except ImportError as exc:
+        raise OCRError("pytesseract is not installed.") from exc
+
+    try:
+        pytesseract.pytesseract.tesseract_cmd = _get_tesseract_cmd()
+
+        with Image.open(input_path) as img:
+            # Anything other than RGB or greyscale ("L") is converted to RGB
+            # before being handed to Tesseract.
+            page = img if img.mode in ("RGB", "L") else img.convert("RGB")
+            text = pytesseract.image_to_string(page, lang=lang).strip()
+    except Exception as exc:
+        # Single boundary: callers only ever have to catch OCRError.
+        raise OCRError(f"OCR failed: {exc}") from exc
+
+    return {
+        "text": text,
+        "lang": lang,
+        "char_count": len(text),
+    }
+
+
+def ocr_pdf(input_path: str, output_path: str, lang: str = DEFAULT_LANG) -> dict:
+    """Extract text from a scanned PDF by converting pages to images first.
+
+    Every page is rendered at 300 DPI, passed to Tesseract, and prefixed with
+    a ``--- Page N ---`` header (1-based). The pages are joined with blank
+    lines, written to ``output_path`` as UTF-8, and also returned.
+
+    Args:
+        input_path: Path to the input PDF.
+        output_path: Path for the output text file. Missing parent
+            directories are created; a bare filename without a directory
+            component is accepted as well.
+        lang: Tesseract language code. A code that is not a key of
+            ``SUPPORTED_LANGUAGES`` is silently replaced by ``DEFAULT_LANG``.
+
+    Returns:
+        dict with ``text``, ``page_count``, ``char_count``.
+
+    Raises:
+        OCRError: If a dependency cannot be imported, the PDF yields no page
+            images, or the OCR operation fails. For wrapped failures the
+            underlying exception is attached as ``__cause__``.
+    """
+    if lang not in SUPPORTED_LANGUAGES:
+        lang = DEFAULT_LANG
+
+    # Imports get their own ``try`` so that only a failed import is reported
+    # as a missing dependency.
+    try:
+        from pdf2image import convert_from_path
+        import pytesseract
+    except ImportError as exc:
+        raise OCRError(f"Missing dependency: {exc}") from exc
+
+    try:
+        pytesseract.pytesseract.tesseract_cmd = _get_tesseract_cmd()
+
+        images = convert_from_path(input_path, dpi=300)
+        if not images:
+            raise OCRError("Could not convert PDF to images — file may be empty.")
+
+        all_text = []
+        for i, img in enumerate(images, 1):
+            if img.mode not in ("RGB", "L"):
+                img = img.convert("RGB")
+            page_text = pytesseract.image_to_string(img, lang=lang)
+            all_text.append(f"--- Page {i} ---\n{page_text.strip()}")
+
+        full_text = "\n\n".join(all_text)
+
+        # ``dirname`` is "" for a bare filename and ``os.makedirs("")``
+        # raises, so only create the directory when there is one to create.
+        out_dir = os.path.dirname(output_path)
+        if out_dir:
+            os.makedirs(out_dir, exist_ok=True)
+        with open(output_path, "w", encoding="utf-8") as f:
+            f.write(full_text)
+    except OCRError:
+        # Already the right type (the empty-PDF case above); do not re-wrap.
+        raise
+    except Exception as exc:
+        raise OCRError(f"PDF OCR failed: {exc}") from exc
+
+    return {
+        "text": full_text,
+        "page_count": len(images),
+        "char_count": len(full_text),
+    }