Add OCR, Background Removal, and PDF Editor features with tests
- Implemented OCR functionality using pytesseract for image and PDF text extraction. - Added Background Removal service using rembg for image processing. - Developed PDF Editor service for applying text annotations to PDF files. - Created corresponding API routes for OCR, Background Removal, and PDF Editor. - Added frontend components for OCR and Background Removal tools. - Integrated feature flagging for new tools, ensuring they are disabled by default. - Implemented comprehensive unit tests for OCR service, PDF editor, and background removal. - Updated documentation to reflect new features and usage instructions. - Added translations for new features in English, Arabic, and French.
This commit is contained in:
159
backend/app/tasks/ocr_tasks.py
Normal file
159
backend/app/tasks/ocr_tasks.py
Normal file
@@ -0,0 +1,159 @@
|
||||
"""Celery tasks for OCR processing."""
|
||||
import os
|
||||
import logging
|
||||
|
||||
from flask import current_app
|
||||
|
||||
from app.extensions import celery
|
||||
from app.services.ocr_service import ocr_image, ocr_pdf, OCRError
|
||||
from app.services.storage_service import storage
|
||||
from app.services.task_tracking_service import finalize_task_tracking
|
||||
from app.utils.sanitizer import cleanup_task_files
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _cleanup(task_id: str):
    """Remove the working files of *task_id* via ``cleanup_task_files``.

    Outputs are kept whenever ``storage.use_s3`` is falsy.
    """
    # NOTE(review): presumably local outputs must survive because they are
    # served from disk when S3 is not in use — confirm against storage_service.
    keep_local_outputs = not storage.use_s3
    cleanup_task_files(task_id, keep_outputs=keep_local_outputs)
|
||||
|
||||
|
||||
def _get_output_dir(task_id: str) -> str:
    """Create (if needed) and return the output directory for *task_id*.

    The directory is ``<OUTPUT_FOLDER>/<task_id>``, where ``OUTPUT_FOLDER``
    is read from the current Flask application's config.
    """
    base_dir = current_app.config["OUTPUT_FOLDER"]
    task_dir = os.path.join(base_dir, task_id)
    # exist_ok=True: an already existing directory is not an error.
    os.makedirs(task_dir, exist_ok=True)
    return task_dir
|
||||
|
||||
|
||||
def _finalize_task(
    task_id, user_id, tool, original_filename, result,
    usage_source, api_key_id, celery_task_id,
):
    """Record the outcome of a task, clean up its files and return *result*.

    All arguments except *task_id* are forwarded to
    ``finalize_task_tracking``; *task_id* is only used for the cleanup step.
    """
    tracking_fields = {
        "user_id": user_id,
        "tool": tool,
        "original_filename": original_filename,
        "result": result,
        "usage_source": usage_source,
        "api_key_id": api_key_id,
        "celery_task_id": celery_task_id,
    }
    finalize_task_tracking(**tracking_fields)

    # Cleanup runs only after tracking has been recorded.
    _cleanup(task_id)
    return result
|
||||
|
||||
|
||||
@celery.task(bind=True, name="app.tasks.ocr_tasks.ocr_image_task")
def ocr_image_task(
    self,
    input_path: str,
    task_id: str,
    original_filename: str,
    lang: str = "eng",
    user_id: int | None = None,
    usage_source: str = "web",
    api_key_id: int | None = None,
):
    """Async task: Extract text from an image via OCR.

    Runs ``ocr_image`` on *input_path*, writes the extracted text to
    ``<output dir>/<task_id>.txt``, uploads that file through ``storage``
    and builds a download URL for it.

    Args:
        input_path: Path handed to ``ocr_image``.
        task_id: Identifier used for the output directory, the output file
            name, the upload and the final cleanup.
        original_filename: Name whose stem is reused for the download name
            (``<stem>_ocr.txt``); also forwarded to task tracking.
        lang: Language code forwarded to ``ocr_image``.
        user_id: Forwarded to task tracking.
        usage_source: Forwarded to task tracking.
        api_key_id: Forwarded to task tracking.

    Returns:
        The result dict passed through ``_finalize_task``. On success it has
        ``status == "completed"`` plus download info, a text preview and the
        OCR stats; on failure ``status == "failed"`` and an ``error`` message.
    """
    try:
        # Directory setup lives inside the ``try`` so that a failure here is
        # tracked and cleaned up like any other failure instead of escaping
        # the task without finalization.
        output_dir = _get_output_dir(task_id)
        output_path = os.path.join(output_dir, f"{task_id}.txt")

        self.update_state(state="PROCESSING", meta={"step": "Running OCR on image..."})

        stats = ocr_image(input_path, lang=lang)

        # Write text to file for download
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(stats["text"])

        self.update_state(state="PROCESSING", meta={"step": "Uploading result..."})
        s3_key = storage.upload_file(output_path, task_id, folder="outputs")

        name_without_ext = os.path.splitext(original_filename)[0]
        download_name = f"{name_without_ext}_ocr.txt"

        download_url = storage.generate_presigned_url(s3_key, original_filename=download_name)

        result = {
            "status": "completed",
            "download_url": download_url,
            "filename": download_name,
            "text": stats["text"][:5000],  # preview (first 5k chars)
            "char_count": stats["char_count"],
            "lang": stats["lang"],
        }

        logger.info("Task %s: OCR image completed (%d chars)", task_id, stats["char_count"])
        return _finalize_task(
            task_id, user_id, "ocr-image", original_filename,
            result, usage_source, api_key_id, self.request.id,
        )

    except OCRError as e:
        # Service-level errors carry a message that is returned to the caller.
        logger.error("Task %s: OCR error — %s", task_id, e)
        return _finalize_task(
            task_id, user_id, "ocr-image", original_filename,
            {"status": "failed", "error": str(e)},
            usage_source, api_key_id, self.request.id,
        )
    except Exception as e:
        # logger.exception records the traceback; the caller only receives a
        # generic message.
        logger.exception("Task %s: Unexpected error — %s", task_id, e)
        return _finalize_task(
            task_id, user_id, "ocr-image", original_filename,
            {"status": "failed", "error": "An unexpected error occurred."},
            usage_source, api_key_id, self.request.id,
        )
|
||||
|
||||
|
||||
@celery.task(bind=True, name="app.tasks.ocr_tasks.ocr_pdf_task")
def ocr_pdf_task(
    self,
    input_path: str,
    task_id: str,
    original_filename: str,
    lang: str = "eng",
    user_id: int | None = None,
    usage_source: str = "web",
    api_key_id: int | None = None,
):
    """Async task: Extract text from a scanned PDF via OCR.

    Calls ``ocr_pdf`` with *input_path* and the output path
    ``<output dir>/<task_id>.txt``, uploads that file through ``storage``
    and builds a download URL for it.

    Args:
        input_path: Path handed to ``ocr_pdf``.
        task_id: Identifier used for the output directory, the output file
            name, the upload and the final cleanup.
        original_filename: Name whose stem is reused for the download name
            (``<stem>_ocr.txt``); also forwarded to task tracking.
        lang: Language code forwarded to ``ocr_pdf`` and echoed in the result.
        user_id: Forwarded to task tracking.
        usage_source: Forwarded to task tracking.
        api_key_id: Forwarded to task tracking.

    Returns:
        The result dict passed through ``_finalize_task``. On success it has
        ``status == "completed"`` plus download info, a text preview, page
        and character counts; on failure ``status == "failed"`` and an
        ``error`` message.
    """
    try:
        # Directory setup lives inside the ``try`` so that a failure here is
        # tracked and cleaned up like any other failure instead of escaping
        # the task without finalization.
        output_dir = _get_output_dir(task_id)
        output_path = os.path.join(output_dir, f"{task_id}.txt")

        self.update_state(state="PROCESSING", meta={"step": "Converting PDF pages & running OCR..."})

        stats = ocr_pdf(input_path, output_path, lang=lang)

        self.update_state(state="PROCESSING", meta={"step": "Uploading result..."})
        s3_key = storage.upload_file(output_path, task_id, folder="outputs")

        name_without_ext = os.path.splitext(original_filename)[0]
        download_name = f"{name_without_ext}_ocr.txt"

        download_url = storage.generate_presigned_url(s3_key, original_filename=download_name)

        result = {
            "status": "completed",
            "download_url": download_url,
            "filename": download_name,
            "text": stats["text"][:5000],  # preview only: first 5000 characters
            "page_count": stats["page_count"],
            "char_count": stats["char_count"],
            "lang": lang,
        }

        logger.info("Task %s: OCR PDF completed (%d pages, %d chars)", task_id, stats["page_count"], stats["char_count"])
        return _finalize_task(
            task_id, user_id, "ocr-pdf", original_filename,
            result, usage_source, api_key_id, self.request.id,
        )

    except OCRError as e:
        # Service-level errors carry a message that is returned to the caller.
        logger.error("Task %s: OCR error — %s", task_id, e)
        return _finalize_task(
            task_id, user_id, "ocr-pdf", original_filename,
            {"status": "failed", "error": str(e)},
            usage_source, api_key_id, self.request.id,
        )
    except Exception as e:
        # logger.exception records the traceback; the caller only receives a
        # generic message.
        logger.exception("Task %s: Unexpected error — %s", task_id, e)
        return _finalize_task(
            task_id, user_id, "ocr-pdf", original_filename,
            {"status": "failed", "error": "An unexpected error occurred."},
            usage_source, api_key_id, self.request.id,
        )
|
||||
95
backend/app/tasks/pdf_editor_tasks.py
Normal file
95
backend/app/tasks/pdf_editor_tasks.py
Normal file
@@ -0,0 +1,95 @@
|
||||
"""Celery tasks for PDF editing."""
|
||||
import os
|
||||
import logging
|
||||
|
||||
from flask import current_app
|
||||
|
||||
from app.extensions import celery
|
||||
from app.services.pdf_editor_service import apply_pdf_edits, PDFEditorError
|
||||
from app.services.storage_service import storage
|
||||
from app.services.task_tracking_service import finalize_task_tracking
|
||||
from app.utils.sanitizer import cleanup_task_files
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _cleanup(task_id: str):
    """Delete the temporary files of *task_id*.

    ``keep_outputs`` is the negation of ``storage.use_s3``, so outputs are
    kept when S3 is not in use.
    """
    using_s3 = storage.use_s3
    cleanup_task_files(task_id, keep_outputs=not using_s3)
|
||||
|
||||
|
||||
def _get_output_dir(task_id: str) -> str:
    """Return the per-task output directory, creating it when missing.

    The path is built by joining the app's ``OUTPUT_FOLDER`` config value
    with *task_id*.
    """
    output_root = current_app.config["OUTPUT_FOLDER"]
    target_dir = os.path.join(output_root, task_id)
    os.makedirs(target_dir, exist_ok=True)  # no error if it already exists
    return target_dir
|
||||
|
||||
|
||||
def _finalize_task(
    task_id, user_id, tool, original_filename, result,
    usage_source, api_key_id, celery_task_id,
):
    """Persist tracking data for a finished task, then clean up its files.

    Returns *result* unchanged so callers can ``return _finalize_task(...)``.
    """
    finalize_task_tracking(
        user_id=user_id,
        tool=tool,
        original_filename=original_filename,
        result=result,
        usage_source=usage_source,
        api_key_id=api_key_id,
        celery_task_id=celery_task_id,
    )
    # File cleanup happens after the tracking call has returned.
    _cleanup(task_id)
    return result
|
||||
|
||||
|
||||
@celery.task(bind=True, name="app.tasks.pdf_editor_tasks.edit_pdf_task")
def edit_pdf_task(
    self,
    input_path: str,
    task_id: str,
    original_filename: str,
    edits: list[dict],
    user_id: int | None = None,
    usage_source: str = "web",
    api_key_id: int | None = None,
):
    """Async task: Apply text annotations to a PDF.

    Calls ``apply_pdf_edits`` with *input_path*, the output path
    ``<output dir>/<task_id>.pdf`` and *edits*, uploads the output through
    ``storage`` and builds a download URL for it.

    Args:
        input_path: Path handed to ``apply_pdf_edits``.
        task_id: Identifier used for the output directory, the output file
            name, the upload and the final cleanup.
        original_filename: Name whose stem is reused for the download name
            (``<stem>_edited.pdf``); also forwarded to task tracking.
        edits: Edit descriptions forwarded unchanged to ``apply_pdf_edits``.
        user_id: Forwarded to task tracking.
        usage_source: Forwarded to task tracking.
        api_key_id: Forwarded to task tracking.

    Returns:
        The result dict passed through ``_finalize_task``. On success it has
        ``status == "completed"`` plus download info, page count, number of
        edits applied and output size; on failure ``status == "failed"`` and
        an ``error`` message.
    """
    try:
        # Directory setup lives inside the ``try`` so that a failure here is
        # tracked and cleaned up like any other failure instead of escaping
        # the task without finalization.
        output_dir = _get_output_dir(task_id)
        output_path = os.path.join(output_dir, f"{task_id}.pdf")

        self.update_state(state="PROCESSING", meta={"step": "Applying edits to PDF..."})

        stats = apply_pdf_edits(input_path, output_path, edits)

        self.update_state(state="PROCESSING", meta={"step": "Uploading result..."})
        s3_key = storage.upload_file(output_path, task_id, folder="outputs")

        name_without_ext = os.path.splitext(original_filename)[0]
        download_name = f"{name_without_ext}_edited.pdf"

        download_url = storage.generate_presigned_url(s3_key, original_filename=download_name)

        result = {
            "status": "completed",
            "download_url": download_url,
            "filename": download_name,
            "page_count": stats["page_count"],
            "edits_applied": stats["edits_applied"],
            "output_size": stats["output_size"],
        }

        logger.info("Task %s: PDF edit completed (%d edits)", task_id, stats["edits_applied"])
        return _finalize_task(
            task_id, user_id, "pdf-edit", original_filename,
            result, usage_source, api_key_id, self.request.id,
        )

    except PDFEditorError as e:
        # Service-level errors carry a message that is returned to the caller.
        logger.error("Task %s: PDF edit error — %s", task_id, e)
        return _finalize_task(
            task_id, user_id, "pdf-edit", original_filename,
            {"status": "failed", "error": str(e)},
            usage_source, api_key_id, self.request.id,
        )
    except Exception as e:
        # logger.exception records the traceback; the caller only receives a
        # generic message.
        logger.exception("Task %s: Unexpected error — %s", task_id, e)
        return _finalize_task(
            task_id, user_id, "pdf-edit", original_filename,
            {"status": "failed", "error": "An unexpected error occurred."},
            usage_source, api_key_id, self.request.id,
        )
|
||||
95
backend/app/tasks/removebg_tasks.py
Normal file
95
backend/app/tasks/removebg_tasks.py
Normal file
@@ -0,0 +1,95 @@
|
||||
"""Celery tasks for background removal."""
|
||||
import os
|
||||
import logging
|
||||
|
||||
from flask import current_app
|
||||
|
||||
from app.extensions import celery
|
||||
from app.services.removebg_service import remove_background, RemoveBGError
|
||||
from app.services.storage_service import storage
|
||||
from app.services.task_tracking_service import finalize_task_tracking
|
||||
from app.utils.sanitizer import cleanup_task_files
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _cleanup(task_id: str):
    """Run ``cleanup_task_files`` for *task_id*.

    Outputs are preserved unless ``storage.use_s3`` is truthy.
    """
    preserve_outputs = not storage.use_s3
    cleanup_task_files(task_id, keep_outputs=preserve_outputs)
|
||||
|
||||
|
||||
def _get_output_dir(task_id: str) -> str:
    """Ensure ``<OUTPUT_FOLDER>/<task_id>`` exists and return its path.

    ``OUTPUT_FOLDER`` comes from the current Flask application's config.
    """
    path_parts = (current_app.config["OUTPUT_FOLDER"], task_id)
    directory = os.path.join(*path_parts)
    # A pre-existing directory is accepted silently.
    os.makedirs(directory, exist_ok=True)
    return directory
|
||||
|
||||
|
||||
def _finalize_task(
    task_id, user_id, tool, original_filename, result,
    usage_source, api_key_id, celery_task_id,
):
    """Report the task outcome to tracking, clean up, and hand back *result*.

    *task_id* selects the files to clean up; every other argument is passed
    to ``finalize_task_tracking`` as a keyword argument of the same name.
    """
    tracking_payload = dict(
        user_id=user_id,
        tool=tool,
        original_filename=original_filename,
        result=result,
        usage_source=usage_source,
        api_key_id=api_key_id,
        celery_task_id=celery_task_id,
    )
    finalize_task_tracking(**tracking_payload)
    _cleanup(task_id)
    return result
|
||||
|
||||
|
||||
@celery.task(bind=True, name="app.tasks.removebg_tasks.remove_bg_task")
def remove_bg_task(
    self,
    input_path: str,
    task_id: str,
    original_filename: str,
    user_id: int | None = None,
    usage_source: str = "web",
    api_key_id: int | None = None,
):
    """Async task: Remove background from an image.

    Calls ``remove_background`` with *input_path* and the output path
    ``<output dir>/<task_id>.png``, uploads the output through ``storage``
    and builds a download URL for it.

    Args:
        input_path: Path handed to ``remove_background``.
        task_id: Identifier used for the output directory, the output file
            name, the upload and the final cleanup.
        original_filename: Name whose stem is reused for the download name
            (``<stem>_nobg.png``); also forwarded to task tracking.
        user_id: Forwarded to task tracking.
        usage_source: Forwarded to task tracking.
        api_key_id: Forwarded to task tracking.

    Returns:
        The result dict passed through ``_finalize_task``. On success it has
        ``status == "completed"`` plus download info, original/output sizes
        and image dimensions; on failure ``status == "failed"`` and an
        ``error`` message.
    """
    try:
        # Directory setup lives inside the ``try`` so that a failure here is
        # tracked and cleaned up like any other failure instead of escaping
        # the task without finalization.
        output_dir = _get_output_dir(task_id)
        output_path = os.path.join(output_dir, f"{task_id}.png")

        self.update_state(state="PROCESSING", meta={"step": "Removing background..."})

        stats = remove_background(input_path, output_path)

        self.update_state(state="PROCESSING", meta={"step": "Uploading result..."})
        s3_key = storage.upload_file(output_path, task_id, folder="outputs")

        name_without_ext = os.path.splitext(original_filename)[0]
        download_name = f"{name_without_ext}_nobg.png"

        download_url = storage.generate_presigned_url(s3_key, original_filename=download_name)

        result = {
            "status": "completed",
            "download_url": download_url,
            "filename": download_name,
            "original_size": stats["original_size"],
            "output_size": stats["output_size"],
            "width": stats["width"],
            "height": stats["height"],
        }

        logger.info("Task %s: Background removal completed", task_id)
        return _finalize_task(
            task_id, user_id, "remove-bg", original_filename,
            result, usage_source, api_key_id, self.request.id,
        )

    except RemoveBGError as e:
        # Service-level errors carry a message that is returned to the caller.
        logger.error("Task %s: RemoveBG error — %s", task_id, e)
        return _finalize_task(
            task_id, user_id, "remove-bg", original_filename,
            {"status": "failed", "error": str(e)},
            usage_source, api_key_id, self.request.id,
        )
    except Exception as e:
        # logger.exception records the traceback; the caller only receives a
        # generic message.
        logger.exception("Task %s: Unexpected error — %s", task_id, e)
        return _finalize_task(
            task_id, user_id, "remove-bg", original_filename,
            {"status": "failed", "error": "An unexpected error occurred."},
            usage_source, api_key_id, self.request.id,
        )
|
||||
Reference in New Issue
Block a user