ميزة: إضافة ميزات جديدة لتحرير PDF، OCR، وإزالة الخلفية مع تفعيل خيارات في ملف البيئة

This commit is contained in:
Your Name
2026-03-08 22:51:50 +02:00
parent d7f6228d7f
commit 0a0c069a58
16 changed files with 242 additions and 62 deletions

View File

@@ -17,10 +17,16 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
tesseract-ocr-eng \
tesseract-ocr-ara \
tesseract-ocr-fra \
poppler-utils \
default-jre-headless \
curl \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*
# Ensure Java is on PATH for tabula-py (extract-tables, pdf-to-excel)
ENV JAVA_HOME=/usr/lib/jvm/default-java
ENV PATH="${JAVA_HOME}/bin:${PATH}"
# Set working directory
WORKDIR /app

View File

@@ -22,8 +22,8 @@ ALLOWED_OCR_TYPES = ALLOWED_IMAGE_TYPES + ["pdf"]
def _check_feature_flag():
    """Return an error response if FEATURE_OCR is disabled.

    Returns:
        A ``(response, 403)`` tuple built with ``jsonify`` when the
        ``FEATURE_OCR`` config value is falsy, otherwise ``None`` so the
        caller can continue handling the request.
    """
    # A missing config key is treated as "enabled" (the lookup defaults to True).
    if not current_app.config.get("FEATURE_OCR", True):
        return jsonify({"error": "This feature is not enabled."}), 403
    return None

View File

@@ -28,7 +28,7 @@ def remove_bg_route():
- 'file': Image file (PNG, JPG, JPEG, WebP)
Returns: JSON with task_id for polling
"""
if not current_app.config.get("FEATURE_EDITOR", False):
if not current_app.config.get("FEATURE_REMOVEBG", True):
return jsonify({"error": "This feature is not enabled."}), 403
if "file" not in request.files:

View File

@@ -8,7 +8,7 @@ import requests
logger = logging.getLogger(__name__)
# Configuration
# SECURITY: the API key is read from the environment only. A real key must
# never be used as the fallback default, because it ends up in version
# control; any key that has ever been committed should be treated as
# compromised and rotated.
OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY", "")
# Model identifier; overridable through the OPENROUTER_MODEL variable.
OPENROUTER_MODEL = os.getenv("OPENROUTER_MODEL", "meta-llama/llama-3-8b-instruct")
OPENROUTER_BASE_URL = os.getenv(
"OPENROUTER_BASE_URL", "https://openrouter.ai/api/v1/chat/completions"
@@ -219,38 +219,50 @@ def extract_tables(input_path: str) -> dict:
{"tables": [...], "tables_found": int}
"""
try:
import tabula
import tabula # type: ignore[import-untyped]
from PyPDF2 import PdfReader
tables = tabula.read_pdf(
input_path, pages="all", multiple_tables=True, silent=True
)
# Get total page count
reader = PdfReader(input_path)
total_pages = len(reader.pages)
if not tables:
result_tables = []
table_index = 0
for page_num in range(1, total_pages + 1):
page_tables = tabula.read_pdf(
input_path, pages=str(page_num), multiple_tables=True, silent=True
)
if not page_tables:
continue
for df in page_tables:
if df.empty:
continue
headers = [str(c) for c in df.columns]
rows = []
for _, row in df.iterrows():
cells = []
for col in df.columns:
val = row[col]
if isinstance(val, float) and str(val) == "nan":
cells.append("")
else:
cells.append(str(val))
rows.append(cells)
result_tables.append({
"page": page_num,
"table_index": table_index,
"headers": headers,
"rows": rows,
})
table_index += 1
if not result_tables:
raise PdfAiError(
"No tables found in the PDF. This tool works best with PDFs containing tabular data."
)
result_tables = []
for idx, df in enumerate(tables):
# Convert DataFrame to list of dicts
records = []
for _, row in df.iterrows():
record = {}
for col in df.columns:
val = row[col]
if isinstance(val, float) and str(val) == "nan":
record[str(col)] = ""
else:
record[str(col)] = str(val)
records.append(record)
result_tables.append({
"index": idx + 1,
"columns": [str(c) for c in df.columns],
"rows": len(records),
"data": records,
})
logger.info(f"Extracted {len(result_tables)} tables from PDF")
return {

BIN
backend/celerybeat-schedule Normal file

Binary file not shown.

View File

@@ -80,7 +80,7 @@ class BaseConfig:
RATELIMIT_DEFAULT = "100/hour"
# OpenRouter AI
# SECURITY: the API key is read from the environment only. A real key must
# never be used as the fallback default, because it ends up in version
# control; any key that has ever been committed should be treated as
# compromised and rotated.
OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY", "")
# Model identifier; overridable through the OPENROUTER_MODEL variable.
OPENROUTER_MODEL = os.getenv("OPENROUTER_MODEL", "meta-llama/llama-3-8b-instruct")
OPENROUTER_BASE_URL = os.getenv(
"OPENROUTER_BASE_URL", "https://openrouter.ai/api/v1/chat/completions"
@@ -95,8 +95,10 @@ class BaseConfig:
SMTP_USE_TLS = os.getenv("SMTP_USE_TLS", "true").lower() == "true"
FRONTEND_URL = os.getenv("FRONTEND_URL", "http://localhost:5173")
# Feature flags
FEATURE_EDITOR = os.getenv("FEATURE_EDITOR", "false").lower() == "true"
# Feature flags (default: enabled — set to "false" to disable a feature)
FEATURE_EDITOR = os.getenv("FEATURE_EDITOR", "true").lower() == "true"
FEATURE_OCR = os.getenv("FEATURE_OCR", "true").lower() == "true"
FEATURE_REMOVEBG = os.getenv("FEATURE_REMOVEBG", "true").lower() == "true"
class DevelopmentConfig(BaseConfig):

View File

@@ -13,7 +13,7 @@ from tests.conftest import make_png_bytes, make_pdf_bytes
# =========================================================================
class TestOcrFeatureFlag:
def test_ocr_image_disabled_by_default(self, client):
"""OCR image should return 403 when FEATURE_EDITOR is off."""
"""OCR image should return 403 when FEATURE_OCR is off."""
data = {"file": (io.BytesIO(make_png_bytes()), "test.png")}
response = client.post(
"/api/ocr/image",
@@ -24,7 +24,7 @@ class TestOcrFeatureFlag:
assert "not enabled" in response.get_json()["error"]
def test_ocr_pdf_disabled_by_default(self, client):
"""OCR PDF should return 403 when FEATURE_EDITOR is off."""
"""OCR PDF should return 403 when FEATURE_OCR is off."""
data = {"file": (io.BytesIO(make_pdf_bytes()), "scan.pdf")}
response = client.post(
"/api/ocr/pdf",
@@ -50,14 +50,14 @@ class TestOcrFeatureFlag:
class TestOcrValidation:
def test_ocr_image_no_file(self, client, app):
"""Should return 400 when no file provided."""
app.config["FEATURE_EDITOR"] = True
app.config["FEATURE_OCR"] = True
response = client.post("/api/ocr/image")
assert response.status_code == 400
assert "No file" in response.get_json()["error"]
def test_ocr_pdf_no_file(self, client, app):
"""Should return 400 when no file provided."""
app.config["FEATURE_EDITOR"] = True
app.config["FEATURE_OCR"] = True
response = client.post("/api/ocr/pdf")
assert response.status_code == 400
assert "No file" in response.get_json()["error"]
@@ -69,7 +69,7 @@ class TestOcrValidation:
class TestOcrSuccess:
def test_ocr_image_success(self, client, app, monkeypatch):
"""Should return 202 with task_id when valid image provided."""
app.config["FEATURE_EDITOR"] = True
app.config["FEATURE_OCR"] = True
mock_task = MagicMock()
mock_task.id = "ocr-img-task-1"
@@ -101,7 +101,7 @@ class TestOcrSuccess:
def test_ocr_pdf_success(self, client, app, monkeypatch):
"""Should return 202 with task_id when valid PDF provided."""
app.config["FEATURE_EDITOR"] = True
app.config["FEATURE_OCR"] = True
mock_task = MagicMock()
mock_task.id = "ocr-pdf-task-1"
@@ -133,7 +133,7 @@ class TestOcrSuccess:
def test_ocr_image_invalid_lang_falls_back(self, client, app, monkeypatch):
"""Invalid lang should fall back to 'eng' without error."""
app.config["FEATURE_EDITOR"] = True
app.config["FEATURE_OCR"] = True
mock_task = MagicMock()
mock_task.id = "ocr-lang-task"

View File

@@ -12,7 +12,7 @@ from tests.conftest import make_png_bytes, make_pdf_bytes
# =========================================================================
class TestRemoveBgFeatureFlag:
def test_removebg_disabled_by_default(self, client):
"""Should return 403 when FEATURE_EDITOR is off."""
"""Should return 403 when FEATURE_REMOVEBG is off."""
data = {"file": (io.BytesIO(make_png_bytes()), "photo.png")}
response = client.post(
"/api/remove-bg",
@@ -29,7 +29,7 @@ class TestRemoveBgFeatureFlag:
class TestRemoveBgValidation:
def test_removebg_no_file(self, client, app):
"""Should return 400 when no file provided."""
app.config["FEATURE_EDITOR"] = True
app.config["FEATURE_REMOVEBG"] = True
response = client.post("/api/remove-bg")
assert response.status_code == 400
assert "No file" in response.get_json()["error"]
@@ -41,7 +41,7 @@ class TestRemoveBgValidation:
class TestRemoveBgSuccess:
def test_removebg_success(self, client, app, monkeypatch):
"""Should return 202 with task_id when valid image provided."""
app.config["FEATURE_EDITOR"] = True
app.config["FEATURE_REMOVEBG"] = True
mock_task = MagicMock()
mock_task.id = "rembg-task-1"