# SaaS-PDF/backend/app/services/ocr_service.py

"""OCR service — extract text from images and PDFs using Tesseract."""
import logging
import os
import subprocess
import tempfile

from PIL import Image

# Module-level logger, named after this module via ``__name__``.
logger = logging.getLogger(__name__)


class OCRError(Exception):
    """Raised when an OCR operation in this module cannot be completed."""


# Languages accepted by the OCR helpers: Tesseract language code mapped to a
# human-readable language name.
SUPPORTED_LANGUAGES = dict(
    eng="English",
    ara="Arabic",
    fra="French",
)

# Language code used as the default for OCR calls.
DEFAULT_LANG = "eng"


def _get_tesseract_cmd() -> str:
    """Return the tesseract binary path.

    Reads the ``TESSERACT_CMD`` environment variable. When the variable is
    unset, empty, or only whitespace, the bare command name ``"tesseract"``
    is returned instead.

    Returns:
        The configured command/path with surrounding whitespace removed, or
        ``"tesseract"`` as the fallback.
    """
    # An empty value (e.g. ``TESSERACT_CMD=`` in an env file) would otherwise
    # be returned verbatim and produce an unusable empty command.
    configured = os.getenv("TESSERACT_CMD", "").strip()
    return configured or "tesseract"


def ocr_image(input_path: str, lang: str = DEFAULT_LANG) -> dict:
    """Extract text from an image file using Tesseract.

    Args:
        input_path: Path to the input image.
        lang: Tesseract language code (e.g. "eng", "ara", "fra"). A code that
            is not a key of ``SUPPORTED_LANGUAGES`` is replaced by
            ``DEFAULT_LANG`` and a warning is logged.

    Returns:
        dict with ``text`` (the OCR output with surrounding whitespace
        stripped), ``lang`` (the language code actually used) and
        ``char_count`` (length of ``text``).

    Raises:
        OCRError: If pytesseract cannot be imported or the OCR operation
            fails. The underlying exception is chained as ``__cause__``.
    """
    if lang not in SUPPORTED_LANGUAGES:
        # Keep the best-effort fallback, but make it visible in the logs.
        logger.warning(
            "Unsupported OCR language %r; falling back to %r", lang, DEFAULT_LANG
        )
        lang = DEFAULT_LANG

    # Imported lazily, in its own try block, so that only a failure of this
    # import is reported as "pytesseract is not installed".
    try:
        import pytesseract
    except ImportError as e:
        raise OCRError("pytesseract is not installed.") from e

    try:
        pytesseract.pytesseract.tesseract_cmd = _get_tesseract_cmd()

        with Image.open(input_path) as source:
            # RGB and grayscale ("L") images are passed through unchanged;
            # every other mode is converted to RGB before OCR.
            if source.mode in ("RGB", "L"):
                img = source
            else:
                img = source.convert("RGB")
            text = pytesseract.image_to_string(img, lang=lang)
    except Exception as e:
        # Single boundary: callers only need to handle OCRError.
        raise OCRError(f"OCR failed: {e}") from e

    text = text.strip()
    return {
        "text": text,
        "lang": lang,
        "char_count": len(text),
    }


def ocr_pdf(input_path: str, output_path: str, lang: str = DEFAULT_LANG) -> dict:
    """Extract text from a scanned PDF by converting pages to images first.

    Every page is rendered at 300 DPI, run through Tesseract, and prefixed
    with a ``--- Page N ---`` header. The combined text is written to
    ``output_path`` as UTF-8 and also returned.

    Args:
        input_path: Path to the input PDF.
        output_path: Path for the output text file. Its parent directory is
            created if it does not exist; a bare filename (no directory
            component) is written relative to the current working directory.
        lang: Tesseract language code. A code that is not a key of
            ``SUPPORTED_LANGUAGES`` is replaced by ``DEFAULT_LANG`` and a
            warning is logged.

    Returns:
        dict with ``text`` (all pages joined by blank lines), ``page_count``
        and ``char_count`` (length of ``text``).

    Raises:
        OCRError: If a dependency cannot be imported, the PDF yields no
            pages, or any step of the OCR/write pipeline fails. The
            underlying exception, when there is one, is chained as
            ``__cause__``.
    """
    if lang not in SUPPORTED_LANGUAGES:
        # Keep the best-effort fallback, but make it visible in the logs.
        logger.warning(
            "Unsupported OCR language %r; falling back to %r", lang, DEFAULT_LANG
        )
        lang = DEFAULT_LANG

    # Imported lazily, in their own try block, so that only a failure of
    # these imports is reported as a missing dependency.
    try:
        from pdf2image import convert_from_path
        import pytesseract
    except ImportError as e:
        raise OCRError(f"Missing dependency: {e}") from e

    try:
        pytesseract.pytesseract.tesseract_cmd = _get_tesseract_cmd()

        images = convert_from_path(input_path, dpi=300)
        if not images:
            raise OCRError("Could not convert PDF to images — file may be empty.")

        all_text = []
        for page_number, page in enumerate(images, 1):
            # RGB and grayscale ("L") pages are passed through unchanged;
            # every other mode is converted to RGB before OCR.
            if page.mode not in ("RGB", "L"):
                page = page.convert("RGB")
            page_text = pytesseract.image_to_string(page, lang=lang)
            all_text.append(f"--- Page {page_number} ---\n{page_text.strip()}")

        full_text = "\n\n".join(all_text)

        # os.makedirs("") raises FileNotFoundError, so only create the parent
        # directory when output_path actually has a directory component.
        output_dir = os.path.dirname(output_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(full_text)
    except OCRError:
        # Already the public error type (e.g. the empty-PDF case above).
        raise
    except Exception as e:
        # Single boundary: callers only need to handle OCRError.
        raise OCRError(f"PDF OCR failed: {e}") from e

    return {
        "text": full_text,
        "page_count": len(images),
        "char_count": len(full_text),
    }