- Implemented OCR functionality using pytesseract for image and PDF text extraction. - Added Background Removal service using rembg for image processing. - Developed PDF Editor service for applying text annotations to PDF files. - Created corresponding API routes for OCR, Background Removal, and PDF Editor. - Added frontend components for OCR and Background Removal tools. - Integrated feature flagging for new tools, ensuring they are disabled by default. - Implemented comprehensive unit tests for OCR service, PDF editor, and background removal. - Updated documentation to reflect new features and usage instructions. - Added translations for new features in English, Arabic, and French.
122 lines
3.2 KiB
Python
122 lines
3.2 KiB
Python
"""OCR service — extract text from images and PDFs using Tesseract."""
|
|
import logging
|
|
import os
|
|
import subprocess
|
|
import tempfile
|
|
|
|
from PIL import Image
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class OCRError(Exception):
|
|
"""Custom exception for OCR failures."""
|
|
pass
|
|
|
|
|
|
# Tesseract language codes
|
|
SUPPORTED_LANGUAGES = {
|
|
"eng": "English",
|
|
"ara": "Arabic",
|
|
"fra": "French",
|
|
}
|
|
|
|
DEFAULT_LANG = "eng"
|
|
|
|
|
|
def _get_tesseract_cmd() -> str:
|
|
"""Return the tesseract binary path."""
|
|
return os.getenv("TESSERACT_CMD", "tesseract")
|
|
|
|
|
|
def ocr_image(input_path: str, lang: str = DEFAULT_LANG) -> dict:
|
|
"""Extract text from an image file using Tesseract.
|
|
|
|
Args:
|
|
input_path: Path to the input image.
|
|
lang: Tesseract language code (e.g. "eng", "ara", "fra").
|
|
|
|
Returns:
|
|
dict with ``text``, ``lang``, ``char_count``.
|
|
|
|
Raises:
|
|
OCRError: If the OCR operation fails.
|
|
"""
|
|
if lang not in SUPPORTED_LANGUAGES:
|
|
lang = DEFAULT_LANG
|
|
|
|
try:
|
|
import pytesseract
|
|
|
|
pytesseract.pytesseract.tesseract_cmd = _get_tesseract_cmd()
|
|
|
|
with Image.open(input_path) as img:
|
|
# Convert to RGB if needed (tesseract works best with RGB)
|
|
if img.mode not in ("RGB", "L"):
|
|
img = img.convert("RGB")
|
|
text = pytesseract.image_to_string(img, lang=lang)
|
|
|
|
text = text.strip()
|
|
return {
|
|
"text": text,
|
|
"lang": lang,
|
|
"char_count": len(text),
|
|
}
|
|
except ImportError:
|
|
raise OCRError("pytesseract is not installed.")
|
|
except Exception as e:
|
|
raise OCRError(f"OCR failed: {str(e)}")
|
|
|
|
|
|
def ocr_pdf(input_path: str, output_path: str, lang: str = DEFAULT_LANG) -> dict:
|
|
"""Extract text from a scanned PDF by converting pages to images first.
|
|
|
|
Args:
|
|
input_path: Path to the input PDF.
|
|
output_path: Path for the output text file.
|
|
lang: Tesseract language code.
|
|
|
|
Returns:
|
|
dict with ``text``, ``page_count``, ``char_count``.
|
|
|
|
Raises:
|
|
OCRError: If the OCR operation fails.
|
|
"""
|
|
if lang not in SUPPORTED_LANGUAGES:
|
|
lang = DEFAULT_LANG
|
|
|
|
try:
|
|
from pdf2image import convert_from_path
|
|
import pytesseract
|
|
|
|
pytesseract.pytesseract.tesseract_cmd = _get_tesseract_cmd()
|
|
|
|
images = convert_from_path(input_path, dpi=300)
|
|
if not images:
|
|
raise OCRError("Could not convert PDF to images — file may be empty.")
|
|
|
|
all_text = []
|
|
for i, img in enumerate(images, 1):
|
|
if img.mode not in ("RGB", "L"):
|
|
img = img.convert("RGB")
|
|
page_text = pytesseract.image_to_string(img, lang=lang)
|
|
all_text.append(f"--- Page {i} ---\n{page_text.strip()}")
|
|
|
|
full_text = "\n\n".join(all_text)
|
|
|
|
os.makedirs(os.path.dirname(output_path), exist_ok=True)
|
|
with open(output_path, "w", encoding="utf-8") as f:
|
|
f.write(full_text)
|
|
|
|
return {
|
|
"text": full_text,
|
|
"page_count": len(images),
|
|
"char_count": len(full_text),
|
|
}
|
|
except ImportError as e:
|
|
raise OCRError(f"Missing dependency: {e}")
|
|
except OCRError:
|
|
raise
|
|
except Exception as e:
|
|
raise OCRError(f"PDF OCR failed: {str(e)}")
|