Add OCR, Background Removal, and PDF Editor features with tests
- Implemented OCR functionality using pytesseract for image and PDF text extraction.
- Added Background Removal service using rembg for image processing.
- Developed PDF Editor service for applying text annotations to PDF files.
- Created corresponding API routes for OCR, Background Removal, and PDF Editor.
- Added frontend components for OCR and Background Removal tools.
- Integrated feature flagging for new tools, ensuring they are disabled by default.
- Implemented comprehensive unit tests for OCR service, PDF editor, and background removal.
- Updated documentation to reflect new features and usage instructions.
- Added translations for new features in English, Arabic, and French.
This commit is contained in:
134
backend/app/routes/ocr.py
Normal file
134
backend/app/routes/ocr.py
Normal file
@@ -0,0 +1,134 @@
|
||||
"""OCR routes — extract text from images and PDFs."""
|
||||
from flask import Blueprint, request, jsonify, current_app
|
||||
|
||||
from app.extensions import limiter
|
||||
from app.services.policy_service import (
|
||||
assert_quota_available,
|
||||
build_task_tracking_kwargs,
|
||||
PolicyError,
|
||||
record_accepted_usage,
|
||||
resolve_web_actor,
|
||||
validate_actor_file,
|
||||
)
|
||||
from app.services.ocr_service import SUPPORTED_LANGUAGES
|
||||
from app.utils.file_validator import FileValidationError
|
||||
from app.utils.sanitizer import generate_safe_path
|
||||
from app.tasks.ocr_tasks import ocr_image_task, ocr_pdf_task
|
||||
|
||||
# Blueprint that the OCR route handlers in this module attach to.
ocr_bp = Blueprint("ocr", __name__)
|
||||
|
||||
# Extensions passed as allowed_types when validating uploads to the image OCR route.
ALLOWED_IMAGE_TYPES = ["png", "jpg", "jpeg", "webp", "tiff", "bmp"]
|
||||
# The image extensions plus "pdf".
# NOTE(review): no route visible in this module reads this constant —
# confirm it is used elsewhere or drop it.
ALLOWED_OCR_TYPES = ALLOWED_IMAGE_TYPES + ["pdf"]
|
||||
|
||||
|
||||
def _check_feature_flag():
    """Gate the OCR endpoints behind the FEATURE_EDITOR config flag.

    Returns:
        None when ``FEATURE_EDITOR`` is truthy in the app config; otherwise a
        ``(response, 403)`` tuple the caller can return directly. A missing
        key counts as disabled.
    """
    editor_enabled = current_app.config.get("FEATURE_EDITOR", False)
    if editor_enabled:
        return None
    return jsonify({"error": "This feature is not enabled."}), 403
|
||||
|
||||
|
||||
@ocr_bp.route("/image", methods=["POST"])
@limiter.limit("10/minute")
def ocr_image_route():
    """Queue a background OCR job for an uploaded image.

    Expects multipart/form-data with:
        file: the image to read; its type is validated against
            ALLOWED_IMAGE_TYPES.
        lang: optional language code (default "eng"). A code that is not in
            SUPPORTED_LANGUAGES is replaced by "eng" rather than rejected.

    Returns:
        202 with ``task_id`` and a polling hint once the job is queued;
        403 when the feature flag is off; 400 when no file part is present;
        otherwise the status carried by the raised PolicyError or
        FileValidationError.
    """
    denied = _check_feature_flag()
    if denied:
        return denied

    if "file" not in request.files:
        return jsonify({"error": "No file provided."}), 400
    upload = request.files["file"]

    # Unsupported language codes silently fall back to English.
    requested_lang = request.form.get("lang", "eng").lower()
    lang = requested_lang if requested_lang in SUPPORTED_LANGUAGES else "eng"

    actor = resolve_web_actor()
    try:
        assert_quota_available(actor)
    except PolicyError as quota_err:
        return jsonify({"error": quota_err.message}), quota_err.status_code

    try:
        original_filename, ext = validate_actor_file(
            upload, allowed_types=ALLOWED_IMAGE_TYPES, actor=actor
        )
    except FileValidationError as invalid:
        return jsonify({"error": invalid.message}), invalid.code

    # Persist the upload first so the background task can read it from disk.
    task_id, input_path = generate_safe_path(ext, folder_type="upload")
    upload.save(input_path)

    tracking_kwargs = build_task_tracking_kwargs(actor)
    job = ocr_image_task.delay(
        input_path, task_id, original_filename, lang, **tracking_kwargs
    )
    record_accepted_usage(actor, "ocr-image", job.id)

    # The id handed to the client is the queued job's id, not the path id
    # produced by generate_safe_path (that one is passed to the task).
    payload = {
        "task_id": job.id,
        "message": "OCR started. Poll /api/tasks/{task_id}/status for progress.",
    }
    return jsonify(payload), 202
|
||||
|
||||
|
||||
@ocr_bp.route("/pdf", methods=["POST"])
@limiter.limit("5/minute")
def ocr_pdf_route():
    """Queue a background OCR job for an uploaded PDF.

    Expects multipart/form-data with:
        file: the PDF to read; only the "pdf" type passes validation.
        lang: optional language code (default "eng"). A code that is not in
            SUPPORTED_LANGUAGES is replaced by "eng" rather than rejected.

    Returns:
        202 with ``task_id`` and a polling hint once the job is queued;
        403 when the feature flag is off; 400 when no file part is present;
        otherwise the status carried by the raised PolicyError or
        FileValidationError.
    """
    denied = _check_feature_flag()
    if denied:
        return denied

    if "file" not in request.files:
        return jsonify({"error": "No file provided."}), 400
    upload = request.files["file"]

    # Unsupported language codes silently fall back to English.
    requested_lang = request.form.get("lang", "eng").lower()
    lang = requested_lang if requested_lang in SUPPORTED_LANGUAGES else "eng"

    actor = resolve_web_actor()
    try:
        assert_quota_available(actor)
    except PolicyError as quota_err:
        return jsonify({"error": quota_err.message}), quota_err.status_code

    try:
        original_filename, ext = validate_actor_file(
            upload, allowed_types=["pdf"], actor=actor
        )
    except FileValidationError as invalid:
        return jsonify({"error": invalid.message}), invalid.code

    # Persist the upload first so the background task can read it from disk.
    task_id, input_path = generate_safe_path(ext, folder_type="upload")
    upload.save(input_path)

    tracking_kwargs = build_task_tracking_kwargs(actor)
    job = ocr_pdf_task.delay(
        input_path, task_id, original_filename, lang, **tracking_kwargs
    )
    record_accepted_usage(actor, "ocr-pdf", job.id)

    # The id handed to the client is the queued job's id, not the path id
    # produced by generate_safe_path (that one is passed to the task).
    payload = {
        "task_id": job.id,
        "message": "OCR started. Poll /api/tasks/{task_id}/status for progress.",
    }
    return jsonify(payload), 202
|
||||
|
||||
|
||||
@ocr_bp.route("/languages", methods=["GET"])
def ocr_languages_route():
    """Expose SUPPORTED_LANGUAGES to clients as JSON under the "languages" key."""
    payload = {"languages": SUPPORTED_LANGUAGES}
    return jsonify(payload), 200
|
||||
80
backend/app/routes/pdf_editor.py
Normal file
80
backend/app/routes/pdf_editor.py
Normal file
@@ -0,0 +1,80 @@
|
||||
"""PDF Editor route — apply text annotations to PDFs."""
|
||||
import json
|
||||
|
||||
from flask import Blueprint, request, jsonify, current_app
|
||||
|
||||
from app.extensions import limiter
|
||||
from app.services.policy_service import (
|
||||
assert_quota_available,
|
||||
build_task_tracking_kwargs,
|
||||
PolicyError,
|
||||
record_accepted_usage,
|
||||
resolve_web_actor,
|
||||
validate_actor_file,
|
||||
)
|
||||
from app.utils.file_validator import FileValidationError
|
||||
from app.utils.sanitizer import generate_safe_path
|
||||
from app.tasks.pdf_editor_tasks import edit_pdf_task
|
||||
|
||||
# Blueprint that the PDF editor route handler in this module attaches to.
pdf_editor_bp = Blueprint("pdf_editor", __name__)
|
||||
|
||||
|
||||
@pdf_editor_bp.route("/edit", methods=["POST"])
@limiter.limit("10/minute")
def edit_pdf_route():
    """Queue a background job that applies text annotations to a PDF.

    Expects multipart/form-data with:
        file: the PDF to annotate; only the "pdf" type passes validation.
        edits: JSON string holding an array of 1 to 500 edit objects, e.g.
            { type: "text", page: 1, x: 100, y: 200, content: "Hello",
              fontSize: 14, color: "#000000" }

    Returns:
        202 with ``task_id`` and a polling hint once the job is queued;
        403 when FEATURE_EDITOR is off; 400 when the file part is missing or
        ``edits`` is not valid JSON, not an array, empty, longer than 500
        entries, or contains an entry that is not a JSON object; otherwise
        the status carried by the raised PolicyError or FileValidationError.
    """
    if not current_app.config.get("FEATURE_EDITOR", False):
        return jsonify({"error": "This feature is not enabled."}), 403

    if "file" not in request.files:
        return jsonify({"error": "No file provided."}), 400

    file = request.files["file"]
    edits_raw = request.form.get("edits", "[]")

    try:
        edits = json.loads(edits_raw)
    except (json.JSONDecodeError, TypeError, RecursionError):
        # RecursionError: pathologically nested client JSON must yield a 400,
        # not an unhandled 500.
        return jsonify({"error": "Invalid JSON in 'edits' field."}), 400

    if not isinstance(edits, list):
        return jsonify({"error": "Edits must be a JSON array."}), 400

    if not edits:
        return jsonify({"error": "At least one edit is required."}), 400

    if len(edits) > 500:
        return jsonify({"error": "Maximum 500 edits allowed."}), 400

    # Reject scalars, nulls and nested arrays here instead of queueing a job
    # with entries that are not edit objects.
    if not all(isinstance(edit, dict) for edit in edits):
        return jsonify({"error": "Each edit must be a JSON object."}), 400

    actor = resolve_web_actor()
    try:
        assert_quota_available(actor)
    except PolicyError as e:
        return jsonify({"error": e.message}), e.status_code

    try:
        original_filename, ext = validate_actor_file(
            file, allowed_types=["pdf"], actor=actor
        )
    except FileValidationError as e:
        return jsonify({"error": e.message}), e.code

    # Persist the upload first so the background task can read it from disk.
    task_id, input_path = generate_safe_path(ext, folder_type="upload")
    file.save(input_path)

    task = edit_pdf_task.delay(
        input_path, task_id, original_filename, edits,
        **build_task_tracking_kwargs(actor),
    )
    record_accepted_usage(actor, "pdf-edit", task.id)

    return jsonify({
        "task_id": task.id,
        "message": "PDF editing started. Poll /api/tasks/{task_id}/status for progress.",
    }), 202
|
||||
64
backend/app/routes/removebg.py
Normal file
64
backend/app/routes/removebg.py
Normal file
@@ -0,0 +1,64 @@
|
||||
"""Background removal route."""
|
||||
from flask import Blueprint, request, jsonify, current_app
|
||||
|
||||
from app.extensions import limiter
|
||||
from app.services.policy_service import (
|
||||
assert_quota_available,
|
||||
build_task_tracking_kwargs,
|
||||
PolicyError,
|
||||
record_accepted_usage,
|
||||
resolve_web_actor,
|
||||
validate_actor_file,
|
||||
)
|
||||
from app.utils.file_validator import FileValidationError
|
||||
from app.utils.sanitizer import generate_safe_path
|
||||
from app.tasks.removebg_tasks import remove_bg_task
|
||||
|
||||
# Blueprint that the background-removal route handler in this module attaches to.
removebg_bp = Blueprint("removebg", __name__)
|
||||
|
||||
# Extensions passed as allowed_types when validating uploads to the background-removal route.
ALLOWED_IMAGE_TYPES = ["png", "jpg", "jpeg", "webp"]
|
||||
|
||||
|
||||
@removebg_bp.route("", methods=["POST"])
@limiter.limit("5/minute")
def remove_bg_route():
    """Queue a background job that strips the background from an image.

    Expects multipart/form-data with:
        file: the image to process; its type is validated against
            ALLOWED_IMAGE_TYPES (PNG, JPG, JPEG, WebP).

    Returns:
        202 with ``task_id`` and a polling hint once the job is queued;
        403 when FEATURE_EDITOR is off; 400 when no file part is present;
        otherwise the status carried by the raised PolicyError or
        FileValidationError.
    """
    editor_enabled = current_app.config.get("FEATURE_EDITOR", False)
    if not editor_enabled:
        return jsonify({"error": "This feature is not enabled."}), 403

    if "file" not in request.files:
        return jsonify({"error": "No file provided."}), 400
    upload = request.files["file"]

    actor = resolve_web_actor()
    try:
        assert_quota_available(actor)
    except PolicyError as quota_err:
        return jsonify({"error": quota_err.message}), quota_err.status_code

    try:
        original_filename, ext = validate_actor_file(
            upload, allowed_types=ALLOWED_IMAGE_TYPES, actor=actor
        )
    except FileValidationError as invalid:
        return jsonify({"error": invalid.message}), invalid.code

    # Persist the upload first so the background task can read it from disk.
    task_id, input_path = generate_safe_path(ext, folder_type="upload")
    upload.save(input_path)

    tracking_kwargs = build_task_tracking_kwargs(actor)
    job = remove_bg_task.delay(
        input_path, task_id, original_filename, **tracking_kwargs
    )
    record_accepted_usage(actor, "remove-bg", job.id)

    # The id handed to the client is the queued job's id, not the path id
    # produced by generate_safe_path (that one is passed to the task).
    payload = {
        "task_id": job.id,
        "message": "Background removal started. Poll /api/tasks/{task_id}/status for progress.",
    }
    return jsonify(payload), 202
|
||||
Reference in New Issue
Block a user