Add OCR, Background Removal, and PDF Editor features with tests

- Implemented OCR functionality using pytesseract for image and PDF text extraction.
- Added Background Removal service using rembg for image processing.
- Developed PDF Editor service for applying text annotations to PDF files.
- Created corresponding API routes for OCR, Background Removal, and PDF Editor.
- Added frontend components for OCR and Background Removal tools.
- Integrated feature flagging for new tools, ensuring they are disabled by default.
- Implemented comprehensive unit tests for OCR service, PDF editor, and background removal.
- Updated documentation to reflect new features and usage instructions.
- Added translations for new features in English, Arabic, and French.
This commit is contained in:
Your Name
2026-03-07 21:29:08 +02:00
parent 71f7d0382d
commit 6bb76e3f1b
28 changed files with 1975 additions and 0 deletions

View File

@@ -0,0 +1,121 @@
"""OCR service — extract text from images and PDFs using Tesseract."""
import logging
import os
import subprocess
import tempfile
from PIL import Image
logger = logging.getLogger(__name__)
class OCRError(Exception):
    """Raised when text extraction with Tesseract cannot be completed."""
# Tesseract language codes accepted by this service, mapped to the
# human-readable language name.
SUPPORTED_LANGUAGES: dict[str, str] = {
    "eng": "English",
    "ara": "Arabic",
    "fra": "French",
}

# Language code used when the caller supplies none or an unsupported one.
DEFAULT_LANG: str = "eng"
def _get_tesseract_cmd() -> str:
"""Return the tesseract binary path."""
return os.getenv("TESSERACT_CMD", "tesseract")
def ocr_image(input_path: str, lang: str = DEFAULT_LANG) -> dict:
    """Extract text from an image file using Tesseract.

    Args:
        input_path: Path to the input image.
        lang: Tesseract language code (e.g. "eng", "ara", "fra"). A code that
            is not in ``SUPPORTED_LANGUAGES`` is replaced by ``DEFAULT_LANG``.

    Returns:
        dict with ``text`` (the stripped OCR output), ``lang`` (the language
        code actually used) and ``char_count`` (length of ``text``).

    Raises:
        OCRError: If pytesseract is not installed or the OCR operation fails.
            The original exception is attached as ``__cause__``.
    """
    if lang not in SUPPORTED_LANGUAGES:
        lang = DEFAULT_LANG

    # Import on its own so that only a missing pytesseract is reported as
    # "not installed"; an ImportError raised later is a plain OCR failure.
    try:
        import pytesseract
    except ImportError as e:
        raise OCRError("pytesseract is not installed.") from e

    try:
        pytesseract.pytesseract.tesseract_cmd = _get_tesseract_cmd()
        with Image.open(input_path) as img:
            # Anything other than RGB / greyscale is converted to RGB before
            # being handed to Tesseract.
            source = img if img.mode in ("RGB", "L") else img.convert("RGB")
            text = pytesseract.image_to_string(source, lang=lang).strip()
    except Exception as e:
        raise OCRError(f"OCR failed: {e}") from e

    return {
        "text": text,
        "lang": lang,
        "char_count": len(text),
    }
def ocr_pdf(input_path: str, output_path: str, lang: str = DEFAULT_LANG) -> dict:
    """Extract text from a scanned PDF by converting pages to images first.

    Every page is rendered at 300 DPI, run through Tesseract, and prefixed
    with a ``--- Page N ---`` header. The combined text is written to
    ``output_path`` as UTF-8 and also returned.

    Args:
        input_path: Path to the input PDF.
        output_path: Path for the output text file. Its parent directory is
            created if it does not exist; a bare filename is written to the
            current working directory.
        lang: Tesseract language code. A code that is not in
            ``SUPPORTED_LANGUAGES`` is replaced by ``DEFAULT_LANG``.

    Returns:
        dict with ``text``, ``page_count``, ``char_count``. ``char_count`` is
        the length of ``text`` including the page headers.

    Raises:
        OCRError: If a dependency is missing or the OCR operation fails. The
            original exception is attached as ``__cause__``.
    """
    if lang not in SUPPORTED_LANGUAGES:
        lang = DEFAULT_LANG

    try:
        from pdf2image import convert_from_path
        import pytesseract
    except ImportError as e:
        raise OCRError(f"Missing dependency: {e}") from e

    try:
        pytesseract.pytesseract.tesseract_cmd = _get_tesseract_cmd()

        images = convert_from_path(input_path, dpi=300)
        if not images:
            raise OCRError("Could not convert PDF to images — file may be empty.")

        page_sections = []
        for page_number, img in enumerate(images, 1):
            if img.mode not in ("RGB", "L"):
                img = img.convert("RGB")
            page_text = pytesseract.image_to_string(img, lang=lang)
            page_sections.append(f"--- Page {page_number} ---\n{page_text.strip()}")
        full_text = "\n\n".join(page_sections)

        # os.path.dirname() is "" for a bare filename and os.makedirs("")
        # raises FileNotFoundError, so only create a directory when one is
        # actually named.
        output_dir = os.path.dirname(output_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(full_text)
    except OCRError:
        raise
    except Exception as e:
        raise OCRError(f"PDF OCR failed: {e}") from e

    return {
        "text": full_text,
        "page_count": len(images),
        "char_count": len(full_text),
    }

View File

@@ -0,0 +1,120 @@
"""PDF Editor service — add text annotations and simple edits to PDFs."""
import io
import logging
import os
logger = logging.getLogger(__name__)
class PDFEditorError(Exception):
    """Raised when applying edits to a PDF cannot be completed."""
def apply_pdf_edits(input_path: str, output_path: str, edits: list[dict]) -> dict:
    """Apply a list of edits (text annotations) to an existing PDF.

    Each edit dict can contain:
        - type: "text" (the default; edits of any other type are ignored)
        - page: 1-based page number (default 1); edits whose page is outside
          the document are skipped
        - x, y: position in points from bottom-left (default 72, 72)
        - content: text string to place
        - fontSize: optional, default 12
        - color: optional hex e.g. "#000000"; a value that cannot be parsed
          falls back to black

    Args:
        input_path: Path to the source PDF.
        output_path: Path for the edited PDF. Its parent directory is created
            if it does not exist; a bare filename is written to the current
            working directory.
        edits: List of edit operation dicts.

    Returns:
        dict with ``page_count``, ``edits_applied`` (number of text edits
        actually drawn) and ``output_size`` (bytes written).

    Raises:
        PDFEditorError: If ``edits`` is empty, the PDF has no pages, or
            reading/editing/writing fails. The original exception is attached
            as ``__cause__``.
        OSError: If the output directory cannot be created.
    """
    if not edits:
        raise PDFEditorError("No edits provided.")

    # os.path.dirname() is "" for a bare filename and os.makedirs("") raises
    # FileNotFoundError, so only create a directory when one is named.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    try:
        from PyPDF2 import PdfReader, PdfWriter
        from reportlab.lib.colors import HexColor
        from reportlab.pdfgen import canvas

        reader = PdfReader(input_path)
        writer = PdfWriter()
        page_count = len(reader.pages)
        if page_count == 0:
            raise PDFEditorError("PDF has no pages.")

        # Group edits by page, dropping those that target a missing page.
        edits_by_page: dict[int, list[dict]] = {}
        for edit in edits:
            page_num = int(edit.get("page", 1))
            if 1 <= page_num <= page_count:
                edits_by_page.setdefault(page_num, []).append(edit)

        edits_applied = 0
        for page_num, page in enumerate(reader.pages, start=1):
            page_edits = edits_by_page.get(page_num)
            if page_edits:
                # The overlay is sized to the page's media box.
                media_box = page.mediabox
                page_size = (float(media_box.width), float(media_box.height))

                packet = io.BytesIO()
                overlay = canvas.Canvas(packet, pagesize=page_size)
                for edit in page_edits:
                    if edit.get("type", "text") != "text":
                        continue
                    x = float(edit.get("x", 72))
                    y = float(edit.get("y", 72))
                    content = str(edit.get("content", ""))
                    font_size = int(edit.get("fontSize", 12))
                    color = str(edit.get("color", "#000000"))
                    try:
                        overlay.setFillColor(HexColor(color))
                    except Exception:
                        # Deliberate best effort: a bad colour must not fail
                        # the whole edit, so draw in black instead.
                        overlay.setFillColor(HexColor("#000000"))
                    overlay.setFont("Helvetica", font_size)
                    overlay.drawString(x, y, content)
                    edits_applied += 1
                overlay.save()

                packet.seek(0)
                overlay_reader = PdfReader(packet)
                if len(overlay_reader.pages) > 0:
                    page.merge_page(overlay_reader.pages[0])
            writer.add_page(page)

        with open(output_path, "wb") as f:
            writer.write(f)
        output_size = os.path.getsize(output_path)
    except PDFEditorError:
        raise
    except Exception as e:
        raise PDFEditorError(f"PDF editing failed: {e}") from e

    return {
        "page_count": page_count,
        "edits_applied": edits_applied,
        "output_size": output_size,
    }

View File

@@ -0,0 +1,60 @@
"""Background removal service using rembg."""
import logging
import os
from PIL import Image
logger = logging.getLogger(__name__)
class RemoveBGError(Exception):
    """Raised when removing the background from an image cannot be completed."""
def remove_background(input_path: str, output_path: str) -> dict:
    """Remove the background from an image.

    The image is converted to RGBA if necessary, passed to rembg, and the
    result is saved as an optimized PNG.

    Args:
        input_path: Path to the input image.
        output_path: Path for the output PNG (always PNG — transparency).
            Its parent directory is created if it does not exist; a bare
            filename is written to the current working directory.

    Returns:
        dict with ``original_size`` and ``output_size`` (file sizes in bytes)
        and ``width`` / ``height`` of the input image.

    Raises:
        RemoveBGError: If rembg is not installed or the operation fails. The
            original exception is attached as ``__cause__``.
        OSError: If the output directory cannot be created.
    """
    # os.path.dirname() is "" for a bare filename and os.makedirs("") raises
    # FileNotFoundError, so only create a directory when one is named.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Import on its own so that only a missing rembg is reported as
    # "not installed"; an ImportError raised later is a plain failure.
    try:
        from rembg import remove as rembg_remove
    except ImportError as e:
        raise RemoveBGError("rembg is not installed.") from e

    try:
        original_size = os.path.getsize(input_path)
        with Image.open(input_path) as img:
            rgba = img if img.mode == "RGBA" else img.convert("RGBA")
            width, height = rgba.size
            # Process and save while the source image is still open.
            result = rembg_remove(rgba)
            result.save(output_path, format="PNG", optimize=True)
        output_size = os.path.getsize(output_path)
    except Exception as e:
        raise RemoveBGError(f"Background removal failed: {e}") from e

    logger.info(
        "Background removed: %s -> %s (%d -> %d bytes)",
        input_path, output_path, original_size, output_size,
    )
    return {
        "original_size": original_size,
        "output_size": output_size,
        "width": width,
        "height": height,
    }