ميزة: إضافة ميزات جديدة لتحرير PDF، OCR، وإزالة الخلفية مع تفعيل خيارات في ملف البيئة

2026-03-08 22:51:50 +02:00
parent d7f6228d7f
commit 0a0c069a58
16 changed files with 242 additions and 62 deletions
--- a/backend/Dockerfile
+++ b/backend/Dockerfile
@@ -17,10 +17,16 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
    tesseract-ocr-eng \
    tesseract-ocr-ara \
    tesseract-ocr-fra \
+    poppler-utils \
+    default-jre-headless \
    curl \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

+# Ensure Java is on PATH for tabula-py (extract-tables, pdf-to-excel)
+ENV JAVA_HOME=/usr/lib/jvm/default-java
+ENV PATH="${JAVA_HOME}/bin:${PATH}"
+
 # Set working directory
 WORKDIR /app

--- a/backend/app/routes/ocr.py
+++ b/backend/app/routes/ocr.py
@@ -22,8 +22,8 @@ ALLOWED_OCR_TYPES = ALLOWED_IMAGE_TYPES + ["pdf"]


 def _check_feature_flag():
-    """Return an error response if FEATURE_EDITOR is disabled."""
-    if not current_app.config.get("FEATURE_EDITOR", False):
+    """Return a 403 JSON error if FEATURE_OCR is falsy (a missing key counts as enabled), else None."""
+    if not current_app.config.get("FEATURE_OCR", True):
        return jsonify({"error": "This feature is not enabled."}), 403
    return None

--- a/backend/app/routes/removebg.py
+++ b/backend/app/routes/removebg.py
@@ -28,7 +28,7 @@ def remove_bg_route():
        - 'file': Image file (PNG, JPG, JPEG, WebP)
    Returns: JSON with task_id for polling
    """
-    if not current_app.config.get("FEATURE_EDITOR", False):
+    if not current_app.config.get("FEATURE_REMOVEBG", True):
        return jsonify({"error": "This feature is not enabled."}), 403

    if "file" not in request.files:
--- a/backend/app/services/pdf_ai_service.py
+++ b/backend/app/services/pdf_ai_service.py
@@ -8,7 +8,7 @@ import requests
 logger = logging.getLogger(__name__)

 # Configuration
-OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY", "")
+OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY", "")  # secret must come from the environment; never commit a fallback key
 OPENROUTER_MODEL = os.getenv("OPENROUTER_MODEL", "meta-llama/llama-3-8b-instruct")
 OPENROUTER_BASE_URL = os.getenv(
    "OPENROUTER_BASE_URL", "https://openrouter.ai/api/v1/chat/completions"
@@ -219,38 +219,50 @@ def extract_tables(input_path: str) -> dict:
        {"tables": [...], "tables_found": int}
    """
    try:
-        import tabula
+        import tabula  # type: ignore[import-untyped]
+        from PyPDF2 import PdfReader

-        tables = tabula.read_pdf(
-            input_path, pages="all", multiple_tables=True, silent=True
-        )
+        # Get total page count
+        reader = PdfReader(input_path)
+        total_pages = len(reader.pages)

-        if not tables:
+        result_tables = []
+        table_index = 0
+
+        for page_num in range(1, total_pages + 1):
+            page_tables = tabula.read_pdf(
+                input_path, pages=str(page_num), multiple_tables=True, silent=True
+            )
+            if not page_tables:
+                continue
+            for df in page_tables:
+                if df.empty:
+                    continue
+                headers = [str(c) for c in df.columns]
+                rows = []
+                for _, row in df.iterrows():
+                    cells = []
+                    for col in df.columns:
+                        val = row[col]
+                        if isinstance(val, float) and str(val) == "nan":
+                            cells.append("")
+                        else:
+                            cells.append(str(val))
+                    rows.append(cells)
+
+                result_tables.append({
+                    "page": page_num,
+                    "table_index": table_index,
+                    "headers": headers,
+                    "rows": rows,
+                })
+                table_index += 1
+
+        if not result_tables:
            raise PdfAiError(
                "No tables found in the PDF. This tool works best with PDFs containing tabular data."
            )

-        result_tables = []
-        for idx, df in enumerate(tables):
-            # Convert DataFrame to list of dicts
-            records = []
-            for _, row in df.iterrows():
-                record = {}
-                for col in df.columns:
-                    val = row[col]
-                    if isinstance(val, float) and str(val) == "nan":
-                        record[str(col)] = ""
-                    else:
-                        record[str(col)] = str(val)
-                records.append(record)
-
-            result_tables.append({
-                "index": idx + 1,
-                "columns": [str(c) for c in df.columns],
-                "rows": len(records),
-                "data": records,
-            })
-
        logger.info(f"Extracted {len(result_tables)} tables from PDF")

        return {
--- a/backend/celerybeat-schedule
+++ b/backend/celerybeat-schedule
--- a/backend/config/init.py
+++ b/backend/config/init.py
@@ -80,7 +80,7 @@ class BaseConfig:
    RATELIMIT_DEFAULT = "100/hour"

    # OpenRouter AI
-    OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY", "")
+    OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY", "")  # secret must come from the environment; never commit a fallback key
    OPENROUTER_MODEL = os.getenv("OPENROUTER_MODEL", "meta-llama/llama-3-8b-instruct")
    OPENROUTER_BASE_URL = os.getenv(
        "OPENROUTER_BASE_URL", "https://openrouter.ai/api/v1/chat/completions"
@@ -95,8 +95,10 @@ class BaseConfig:
    SMTP_USE_TLS = os.getenv("SMTP_USE_TLS", "true").lower() == "true"
    FRONTEND_URL = os.getenv("FRONTEND_URL", "http://localhost:5173")

-    # Feature flags
-    FEATURE_EDITOR = os.getenv("FEATURE_EDITOR", "false").lower() == "true"
+    # Feature flags (default: enabled — set to "false" to disable a feature)
+    FEATURE_EDITOR = os.getenv("FEATURE_EDITOR", "true").lower() == "true"
+    FEATURE_OCR = os.getenv("FEATURE_OCR", "true").lower() == "true"
+    FEATURE_REMOVEBG = os.getenv("FEATURE_REMOVEBG", "true").lower() == "true"


 class DevelopmentConfig(BaseConfig):
--- a/backend/tests/test_ocr.py
+++ b/backend/tests/test_ocr.py
@@ -13,7 +13,7 @@ from tests.conftest import make_png_bytes, make_pdf_bytes
 # =========================================================================
 class TestOcrFeatureFlag:
    def test_ocr_image_disabled_by_default(self, client):
-        """OCR image should return 403 when FEATURE_EDITOR is off."""
+        """OCR image should return 403 when FEATURE_OCR is off."""
        data = {"file": (io.BytesIO(make_png_bytes()), "test.png")}
        response = client.post(
            "/api/ocr/image",
@@ -24,7 +24,7 @@ class TestOcrFeatureFlag:
        assert "not enabled" in response.get_json()["error"]

    def test_ocr_pdf_disabled_by_default(self, client):
-        """OCR PDF should return 403 when FEATURE_EDITOR is off."""
+        """OCR PDF should return 403 when FEATURE_OCR is off."""
        data = {"file": (io.BytesIO(make_pdf_bytes()), "scan.pdf")}
        response = client.post(
            "/api/ocr/pdf",
@@ -50,14 +50,14 @@ class TestOcrFeatureFlag:
 class TestOcrValidation:
    def test_ocr_image_no_file(self, client, app):
        """Should return 400 when no file provided."""
-        app.config["FEATURE_EDITOR"] = True
+        app.config["FEATURE_OCR"] = True
        response = client.post("/api/ocr/image")
        assert response.status_code == 400
        assert "No file" in response.get_json()["error"]

    def test_ocr_pdf_no_file(self, client, app):
        """Should return 400 when no file provided."""
-        app.config["FEATURE_EDITOR"] = True
+        app.config["FEATURE_OCR"] = True
        response = client.post("/api/ocr/pdf")
        assert response.status_code == 400
        assert "No file" in response.get_json()["error"]
@@ -69,7 +69,7 @@ class TestOcrValidation:
 class TestOcrSuccess:
    def test_ocr_image_success(self, client, app, monkeypatch):
        """Should return 202 with task_id when valid image provided."""
-        app.config["FEATURE_EDITOR"] = True
+        app.config["FEATURE_OCR"] = True
        mock_task = MagicMock()
        mock_task.id = "ocr-img-task-1"

@@ -101,7 +101,7 @@ class TestOcrSuccess:

    def test_ocr_pdf_success(self, client, app, monkeypatch):
        """Should return 202 with task_id when valid PDF provided."""
-        app.config["FEATURE_EDITOR"] = True
+        app.config["FEATURE_OCR"] = True
        mock_task = MagicMock()
        mock_task.id = "ocr-pdf-task-1"

@@ -133,7 +133,7 @@ class TestOcrSuccess:

    def test_ocr_image_invalid_lang_falls_back(self, client, app, monkeypatch):
        """Invalid lang should fall back to 'eng' without error."""
-        app.config["FEATURE_EDITOR"] = True
+        app.config["FEATURE_OCR"] = True
        mock_task = MagicMock()
        mock_task.id = "ocr-lang-task"

--- a/backend/tests/test_removebg.py
+++ b/backend/tests/test_removebg.py
@@ -12,7 +12,7 @@ from tests.conftest import make_png_bytes, make_pdf_bytes
 # =========================================================================
 class TestRemoveBgFeatureFlag:
    def test_removebg_disabled_by_default(self, client):
-        """Should return 403 when FEATURE_EDITOR is off."""
+        """Should return 403 when FEATURE_REMOVEBG is off."""
        data = {"file": (io.BytesIO(make_png_bytes()), "photo.png")}
        response = client.post(
            "/api/remove-bg",
@@ -29,7 +29,7 @@ class TestRemoveBgFeatureFlag:
 class TestRemoveBgValidation:
    def test_removebg_no_file(self, client, app):
        """Should return 400 when no file provided."""
-        app.config["FEATURE_EDITOR"] = True
+        app.config["FEATURE_REMOVEBG"] = True
        response = client.post("/api/remove-bg")
        assert response.status_code == 400
        assert "No file" in response.get_json()["error"]
@@ -41,7 +41,7 @@ class TestRemoveBgValidation:
 class TestRemoveBgSuccess:
    def test_removebg_success(self, client, app, monkeypatch):
        """Should return 202 with task_id when valid image provided."""
-        app.config["FEATURE_EDITOR"] = True
+        app.config["FEATURE_REMOVEBG"] = True
        mock_task = MagicMock()
        mock_task.id = "rembg-task-1"