Add OCR, Background Removal, and PDF Editor features with tests
- Implemented OCR functionality using pytesseract for image and PDF text extraction.
- Added Background Removal service using rembg for image processing.
- Developed PDF Editor service for applying text annotations to PDF files.
- Created corresponding API routes for OCR, Background Removal, and PDF Editor.
- Added frontend components for OCR and Background Removal tools.
- Integrated feature flagging for new tools, ensuring they are disabled by default.
- Implemented comprehensive unit tests for OCR service, PDF editor, and background removal.
- Updated documentation to reflect new features and usage instructions.
- Added translations for new features in English, Arabic, and French.
This commit is contained in:
121
backend/app/services/ocr_service.py
Normal file
121
backend/app/services/ocr_service.py
Normal file
@@ -0,0 +1,121 @@
|
||||
"""OCR service — extract text from images and PDFs using Tesseract."""
|
||||
import logging
|
||||
import os
|
||||
import subprocess
|
||||
import tempfile
|
||||
|
||||
from PIL import Image
|
||||
|
||||
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class OCRError(Exception):
    """Raised when an OCR operation cannot be completed."""
|
||||
|
||||
|
||||
# Tesseract language codes mapped to their human-readable names.
SUPPORTED_LANGUAGES = dict(
    eng="English",
    ara="Arabic",
    fra="French",
)

# Language code the OCR functions use by default.
DEFAULT_LANG = "eng"
|
||||
|
||||
|
||||
def _get_tesseract_cmd() -> str:
    """Return the tesseract binary path.

    The path is read from the ``TESSERACT_CMD`` environment variable.

    Returns:
        The configured path with surrounding whitespace removed, or
        ``"tesseract"`` when the variable is unset, empty, or only
        whitespace.
    """
    # A set-but-empty variable would otherwise yield "" as the command,
    # which can never be executed; treat it the same as "not configured".
    configured = os.getenv("TESSERACT_CMD", "").strip()
    return configured or "tesseract"
|
||||
|
||||
|
||||
def ocr_image(input_path: str, lang: str = DEFAULT_LANG) -> dict:
    """Extract text from an image file using Tesseract.

    Args:
        input_path: Path to the input image.
        lang: Tesseract language code (e.g. "eng", "ara", "fra"). A code
            that is not a key of ``SUPPORTED_LANGUAGES`` is replaced by
            ``DEFAULT_LANG`` and a warning is logged.

    Returns:
        dict with ``text`` (the stripped OCR output), ``lang`` (the
        language code actually used) and ``char_count`` (length of
        ``text``).

    Raises:
        OCRError: If pytesseract is not installed or the OCR operation
            fails. The underlying exception is chained as ``__cause__``.
    """
    if lang not in SUPPORTED_LANGUAGES:
        logger.warning(
            "Unsupported OCR language %r; falling back to %r",
            lang,
            DEFAULT_LANG,
        )
        lang = DEFAULT_LANG

    # Import lazily, and in its own try block, so that only a genuinely
    # missing pytesseract is reported as "not installed".
    try:
        import pytesseract
    except ImportError as e:
        raise OCRError("pytesseract is not installed.") from e

    try:
        pytesseract.pytesseract.tesseract_cmd = _get_tesseract_cmd()

        with Image.open(input_path) as img:
            # Convert to RGB if needed (tesseract works best with RGB)
            source = img if img.mode in ("RGB", "L") else img.convert("RGB")
            text = pytesseract.image_to_string(source, lang=lang)
    except Exception as e:
        # Boundary handler: callers only need to catch OCRError.
        raise OCRError(f"OCR failed: {e}") from e

    text = text.strip()
    return {
        "text": text,
        "lang": lang,
        "char_count": len(text),
    }
|
||||
|
||||
|
||||
def ocr_pdf(input_path: str, output_path: str, lang: str = DEFAULT_LANG) -> dict:
    """Extract text from a scanned PDF by converting pages to images first.

    Each page is rendered at 300 DPI, run through Tesseract, and prefixed
    with a ``--- Page N ---`` header; pages are joined with blank lines and
    the result is written to ``output_path`` as UTF-8.

    Args:
        input_path: Path to the input PDF.
        output_path: Path for the output text file. Its parent directory
            is created if it does not exist; a bare filename (no directory
            component) is written relative to the current directory.
        lang: Tesseract language code. A code that is not a key of
            ``SUPPORTED_LANGUAGES`` is replaced by ``DEFAULT_LANG`` and a
            warning is logged.

    Returns:
        dict with ``text`` (the combined text), ``page_count`` and
        ``char_count`` (length of ``text``).

    Raises:
        OCRError: If a dependency is missing, the PDF yields no pages, or
            the OCR operation fails. The underlying exception, when there
            is one, is chained as ``__cause__``.
    """
    if lang not in SUPPORTED_LANGUAGES:
        logger.warning(
            "Unsupported OCR language %r; falling back to %r",
            lang,
            DEFAULT_LANG,
        )
        lang = DEFAULT_LANG

    # Import lazily, and in their own try block, so that only a genuinely
    # missing package is reported as a missing dependency.
    try:
        from pdf2image import convert_from_path
        import pytesseract
    except ImportError as e:
        raise OCRError(f"Missing dependency: {e}") from e

    try:
        pytesseract.pytesseract.tesseract_cmd = _get_tesseract_cmd()

        images = convert_from_path(input_path, dpi=300)
        if not images:
            raise OCRError("Could not convert PDF to images — file may be empty.")

        page_sections = []
        for page_number, img in enumerate(images, 1):
            if img.mode not in ("RGB", "L"):
                img = img.convert("RGB")
            page_text = pytesseract.image_to_string(img, lang=lang)
            page_sections.append(f"--- Page {page_number} ---\n{page_text.strip()}")

        full_text = "\n\n".join(page_sections)

        # os.path.dirname() is "" for a bare filename, and os.makedirs("")
        # raises FileNotFoundError, so only create a directory when one
        # is actually named.
        output_dir = os.path.dirname(output_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(full_text)
    except OCRError:
        # Already the right type (empty-PDF case above); do not re-wrap.
        raise
    except Exception as e:
        # Boundary handler: callers only need to catch OCRError.
        raise OCRError(f"PDF OCR failed: {e}") from e

    return {
        "text": full_text,
        "page_count": len(images),
        "char_count": len(full_text),
    }
|
||||
Reference in New Issue
Block a user