الميزات: إضافة أدوات جديدة لمعالجة ملفات PDF، تشمل التلخيص والترجمة واستخراج الجداول.
- تفعيل مكون SummarizePdf لإنشاء ملخصات PDF باستخدام الذكاء الاصطناعي. - تفعيل مكون TranslatePdf لترجمة محتوى PDF إلى لغات متعددة. - تفعيل مكون TableExtractor لاستخراج الجداول من ملفات PDF. - تحديث الصفحة الرئيسية والتوجيه ليشمل الأدوات الجديدة. - إضافة ترجمات للأدوات الجديدة باللغات الإنجليزية والعربية والفرنسية. - توسيع أنواع واجهة برمجة التطبيقات (API) لدعم الميزات الجديدة المتعلقة بمعالجة ملفات PDF. --feat: Initialize frontend with React, Vite, and Tailwind CSS - Set up main entry point for React application. - Create About, Home, NotFound, Privacy, and Terms pages with SEO support. - Implement API service for file uploads and task management. - Add global styles using Tailwind CSS. - Create utility functions for SEO and text processing. - Configure Vite for development and production builds. - Set up Nginx configuration for serving frontend and backend. - Add scripts for cleanup of expired files and sitemap generation. - Implement deployment script for production environment.
This commit is contained in:
@@ -93,6 +93,11 @@ def create_app(config_name=None):
|
||||
from app.routes.ocr import ocr_bp
|
||||
from app.routes.removebg import removebg_bp
|
||||
from app.routes.pdf_editor import pdf_editor_bp
|
||||
from app.routes.compress_image import compress_image_bp
|
||||
from app.routes.pdf_to_excel import pdf_to_excel_bp
|
||||
from app.routes.qrcode import qrcode_bp
|
||||
from app.routes.html_to_pdf import html_to_pdf_bp
|
||||
from app.routes.pdf_ai import pdf_ai_bp
|
||||
|
||||
app.register_blueprint(health_bp, url_prefix="/api")
|
||||
app.register_blueprint(auth_bp, url_prefix="/api/auth")
|
||||
@@ -112,5 +117,10 @@ def create_app(config_name=None):
|
||||
app.register_blueprint(ocr_bp, url_prefix="/api/ocr")
|
||||
app.register_blueprint(removebg_bp, url_prefix="/api/remove-bg")
|
||||
app.register_blueprint(pdf_editor_bp, url_prefix="/api/pdf-editor")
|
||||
app.register_blueprint(compress_image_bp, url_prefix="/api/image")
|
||||
app.register_blueprint(pdf_to_excel_bp, url_prefix="/api/convert")
|
||||
app.register_blueprint(qrcode_bp, url_prefix="/api/qrcode")
|
||||
app.register_blueprint(html_to_pdf_bp, url_prefix="/api/convert")
|
||||
app.register_blueprint(pdf_ai_bp, url_prefix="/api/pdf-ai")
|
||||
|
||||
return app
|
||||
|
||||
@@ -35,6 +35,11 @@ def init_celery(app):
|
||||
"app.tasks.ocr_tasks.*": {"queue": "image"},
|
||||
"app.tasks.removebg_tasks.*": {"queue": "image"},
|
||||
"app.tasks.pdf_editor_tasks.*": {"queue": "pdf_tools"},
|
||||
"app.tasks.compress_image_tasks.*": {"queue": "image"},
|
||||
"app.tasks.pdf_to_excel_tasks.*": {"queue": "pdf_tools"},
|
||||
"app.tasks.qrcode_tasks.*": {"queue": "default"},
|
||||
"app.tasks.html_to_pdf_tasks.*": {"queue": "convert"},
|
||||
"app.tasks.pdf_ai_tasks.*": {"queue": "default"},
|
||||
}
|
||||
|
||||
# Celery Beat — periodic tasks
|
||||
|
||||
72
backend/app/routes/compress_image.py
Normal file
72
backend/app/routes/compress_image.py
Normal file
@@ -0,0 +1,72 @@
|
||||
"""Image compression routes."""
|
||||
from flask import Blueprint, request, jsonify
|
||||
|
||||
from app.extensions import limiter
|
||||
from app.services.policy_service import (
|
||||
assert_quota_available,
|
||||
build_task_tracking_kwargs,
|
||||
PolicyError,
|
||||
record_accepted_usage,
|
||||
resolve_web_actor,
|
||||
validate_actor_file,
|
||||
)
|
||||
from app.utils.file_validator import FileValidationError
|
||||
from app.utils.sanitizer import generate_safe_path
|
||||
from app.tasks.compress_image_tasks import compress_image_task
|
||||
|
||||
compress_image_bp = Blueprint("compress_image", __name__)
|
||||
|
||||
ALLOWED_IMAGE_TYPES = ["png", "jpg", "jpeg", "webp"]
|
||||
|
||||
|
||||
@compress_image_bp.route("/compress", methods=["POST"])
@limiter.limit("10/minute")
def compress_image_route():
    """Queue an image-compression job.

    Expects multipart/form-data:
        file    -- image file (PNG, JPG, JPEG, WebP)
        quality -- optional quality 1-100 (default 75)

    Responds 202 with the Celery task id to poll for progress.
    """
    if "file" not in request.files:
        return jsonify({"error": "No file provided."}), 400

    upload = request.files["file"]

    # Clamp quality into [1, 100]; non-numeric input falls back to 75.
    try:
        quality = max(1, min(100, int(request.form.get("quality", "75"))))
    except ValueError:
        quality = 75

    actor = resolve_web_actor()
    try:
        assert_quota_available(actor)
    except PolicyError as err:
        return jsonify({"error": err.message}), err.status_code

    try:
        original_filename, ext = validate_actor_file(
            upload, allowed_types=ALLOWED_IMAGE_TYPES, actor=actor
        )
    except FileValidationError as err:
        return jsonify({"error": err.message}), err.code

    task_id, input_path = generate_safe_path(ext, folder_type="upload")
    upload.save(input_path)

    job = compress_image_task.delay(
        input_path,
        task_id,
        original_filename,
        quality,
        **build_task_tracking_kwargs(actor),
    )
    record_accepted_usage(actor, "compress-image", job.id)

    payload = {
        "task_id": job.id,
        "message": "Image compression started. Poll /api/tasks/{task_id}/status for progress.",
    }
    return jsonify(payload), 202
|
||||
62
backend/app/routes/html_to_pdf.py
Normal file
62
backend/app/routes/html_to_pdf.py
Normal file
@@ -0,0 +1,62 @@
|
||||
"""HTML to PDF conversion routes."""
|
||||
from flask import Blueprint, request, jsonify
|
||||
|
||||
from app.extensions import limiter
|
||||
from app.services.policy_service import (
|
||||
assert_quota_available,
|
||||
build_task_tracking_kwargs,
|
||||
PolicyError,
|
||||
record_accepted_usage,
|
||||
resolve_web_actor,
|
||||
validate_actor_file,
|
||||
)
|
||||
from app.utils.file_validator import FileValidationError
|
||||
from app.utils.sanitizer import generate_safe_path
|
||||
from app.tasks.html_to_pdf_tasks import html_to_pdf_task
|
||||
|
||||
html_to_pdf_bp = Blueprint("html_to_pdf", __name__)
|
||||
|
||||
|
||||
@html_to_pdf_bp.route("/html-to-pdf", methods=["POST"])
@limiter.limit("10/minute")
def html_to_pdf_route():
    """Queue conversion of an uploaded HTML file into a PDF.

    Expects multipart/form-data with a 'file' field (HTML document).
    Responds 202 with the Celery task id to poll for progress.
    """
    if "file" not in request.files:
        return jsonify({"error": "No file provided."}), 400

    upload = request.files["file"]

    actor = resolve_web_actor()
    try:
        assert_quota_available(actor)
    except PolicyError as err:
        return jsonify({"error": err.message}), err.status_code

    try:
        original_filename, ext = validate_actor_file(
            upload, allowed_types=["html", "htm"], actor=actor
        )
    except FileValidationError as err:
        return jsonify({"error": err.message}), err.code

    task_id, input_path = generate_safe_path(ext, folder_type="upload")
    upload.save(input_path)

    job = html_to_pdf_task.delay(
        input_path,
        task_id,
        original_filename,
        **build_task_tracking_kwargs(actor),
    )
    record_accepted_usage(actor, "html-to-pdf", job.id)

    payload = {
        "task_id": job.id,
        "message": "HTML to PDF conversion started. Poll /api/tasks/{task_id}/status for progress.",
    }
    return jsonify(payload), 202
|
||||
232
backend/app/routes/pdf_ai.py
Normal file
232
backend/app/routes/pdf_ai.py
Normal file
@@ -0,0 +1,232 @@
|
||||
"""PDF AI tool routes — Chat, Summarize, Translate, Table Extract."""
|
||||
from flask import Blueprint, request, jsonify
|
||||
|
||||
from app.extensions import limiter
|
||||
from app.services.policy_service import (
|
||||
assert_quota_available,
|
||||
build_task_tracking_kwargs,
|
||||
PolicyError,
|
||||
record_accepted_usage,
|
||||
resolve_web_actor,
|
||||
validate_actor_file,
|
||||
)
|
||||
from app.utils.file_validator import FileValidationError
|
||||
from app.utils.sanitizer import generate_safe_path
|
||||
from app.tasks.pdf_ai_tasks import (
|
||||
chat_with_pdf_task,
|
||||
summarize_pdf_task,
|
||||
translate_pdf_task,
|
||||
extract_tables_task,
|
||||
)
|
||||
|
||||
pdf_ai_bp = Blueprint("pdf_ai", __name__)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Chat with PDF — POST /api/pdf-ai/chat
|
||||
# ---------------------------------------------------------------------------
|
||||
@pdf_ai_bp.route("/chat", methods=["POST"])
@limiter.limit("10/minute")
def chat_pdf_route():
    """Queue an AI question-answering job over an uploaded PDF.

    Expects multipart/form-data:
        file     -- PDF document
        question -- the question to ask about it

    Responds 202 with the Celery task id to poll for progress.
    """
    if "file" not in request.files:
        return jsonify({"error": "No file provided."}), 400

    upload = request.files["file"]
    question = request.form.get("question", "").strip()
    if not question:
        return jsonify({"error": "No question provided."}), 400

    actor = resolve_web_actor()
    try:
        assert_quota_available(actor)
    except PolicyError as err:
        return jsonify({"error": err.message}), err.status_code

    try:
        original_filename, ext = validate_actor_file(
            upload, allowed_types=["pdf"], actor=actor
        )
    except FileValidationError as err:
        return jsonify({"error": err.message}), err.code

    task_id, input_path = generate_safe_path(ext, folder_type="upload")
    upload.save(input_path)

    job = chat_with_pdf_task.delay(
        input_path,
        task_id,
        original_filename,
        question,
        **build_task_tracking_kwargs(actor),
    )
    record_accepted_usage(actor, "chat-pdf", job.id)

    payload = {
        "task_id": job.id,
        "message": "Processing your question. Poll /api/tasks/{task_id}/status for progress.",
    }
    return jsonify(payload), 202
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Summarize PDF — POST /api/pdf-ai/summarize
|
||||
# ---------------------------------------------------------------------------
|
||||
@pdf_ai_bp.route("/summarize", methods=["POST"])
@limiter.limit("10/minute")
def summarize_pdf_route():
    """Queue an AI summarization job for an uploaded PDF.

    Expects multipart/form-data:
        file   -- PDF document
        length -- optional: "short", "medium", or "long" (default "medium")

    Responds 202 with the Celery task id to poll for progress.
    """
    if "file" not in request.files:
        return jsonify({"error": "No file provided."}), 400

    upload = request.files["file"]

    # Unknown length values silently fall back to "medium".
    length = request.form.get("length", "medium").strip()
    if length not in ("short", "medium", "long"):
        length = "medium"

    actor = resolve_web_actor()
    try:
        assert_quota_available(actor)
    except PolicyError as err:
        return jsonify({"error": err.message}), err.status_code

    try:
        original_filename, ext = validate_actor_file(
            upload, allowed_types=["pdf"], actor=actor
        )
    except FileValidationError as err:
        return jsonify({"error": err.message}), err.code

    task_id, input_path = generate_safe_path(ext, folder_type="upload")
    upload.save(input_path)

    job = summarize_pdf_task.delay(
        input_path,
        task_id,
        original_filename,
        length,
        **build_task_tracking_kwargs(actor),
    )
    record_accepted_usage(actor, "summarize-pdf", job.id)

    payload = {
        "task_id": job.id,
        "message": "Summarizing document. Poll /api/tasks/{task_id}/status for progress.",
    }
    return jsonify(payload), 202
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Translate PDF — POST /api/pdf-ai/translate
|
||||
# ---------------------------------------------------------------------------
|
||||
@pdf_ai_bp.route("/translate", methods=["POST"])
@limiter.limit("10/minute")
def translate_pdf_route():
    """Queue an AI translation job for an uploaded PDF.

    Expects multipart/form-data:
        file            -- PDF document
        target_language -- name of the language to translate into

    Responds 202 with the Celery task id to poll for progress.
    """
    if "file" not in request.files:
        return jsonify({"error": "No file provided."}), 400

    upload = request.files["file"]
    target_language = request.form.get("target_language", "").strip()
    if not target_language:
        return jsonify({"error": "No target language specified."}), 400

    actor = resolve_web_actor()
    try:
        assert_quota_available(actor)
    except PolicyError as err:
        return jsonify({"error": err.message}), err.status_code

    try:
        original_filename, ext = validate_actor_file(
            upload, allowed_types=["pdf"], actor=actor
        )
    except FileValidationError as err:
        return jsonify({"error": err.message}), err.code

    task_id, input_path = generate_safe_path(ext, folder_type="upload")
    upload.save(input_path)

    job = translate_pdf_task.delay(
        input_path,
        task_id,
        original_filename,
        target_language,
        **build_task_tracking_kwargs(actor),
    )
    record_accepted_usage(actor, "translate-pdf", job.id)

    payload = {
        "task_id": job.id,
        "message": "Translating document. Poll /api/tasks/{task_id}/status for progress.",
    }
    return jsonify(payload), 202
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Extract Tables — POST /api/pdf-ai/extract-tables
|
||||
# ---------------------------------------------------------------------------
|
||||
@pdf_ai_bp.route("/extract-tables", methods=["POST"])
@limiter.limit("10/minute")
def extract_tables_route():
    """Queue a table-extraction job for an uploaded PDF.

    Expects multipart/form-data with a 'file' field holding the PDF.
    Responds 202 with the Celery task id to poll for progress.
    """
    if "file" not in request.files:
        return jsonify({"error": "No file provided."}), 400

    upload = request.files["file"]

    actor = resolve_web_actor()
    try:
        assert_quota_available(actor)
    except PolicyError as err:
        return jsonify({"error": err.message}), err.status_code

    try:
        original_filename, ext = validate_actor_file(
            upload, allowed_types=["pdf"], actor=actor
        )
    except FileValidationError as err:
        return jsonify({"error": err.message}), err.code

    task_id, input_path = generate_safe_path(ext, folder_type="upload")
    upload.save(input_path)

    job = extract_tables_task.delay(
        input_path,
        task_id,
        original_filename,
        **build_task_tracking_kwargs(actor),
    )
    record_accepted_usage(actor, "extract-tables", job.id)

    payload = {
        "task_id": job.id,
        "message": "Extracting tables. Poll /api/tasks/{task_id}/status for progress.",
    }
    return jsonify(payload), 202
|
||||
62
backend/app/routes/pdf_to_excel.py
Normal file
62
backend/app/routes/pdf_to_excel.py
Normal file
@@ -0,0 +1,62 @@
|
||||
"""PDF to Excel conversion routes."""
|
||||
from flask import Blueprint, request, jsonify
|
||||
|
||||
from app.extensions import limiter
|
||||
from app.services.policy_service import (
|
||||
assert_quota_available,
|
||||
build_task_tracking_kwargs,
|
||||
PolicyError,
|
||||
record_accepted_usage,
|
||||
resolve_web_actor,
|
||||
validate_actor_file,
|
||||
)
|
||||
from app.utils.file_validator import FileValidationError
|
||||
from app.utils.sanitizer import generate_safe_path
|
||||
from app.tasks.pdf_to_excel_tasks import pdf_to_excel_task
|
||||
|
||||
pdf_to_excel_bp = Blueprint("pdf_to_excel", __name__)
|
||||
|
||||
|
||||
@pdf_to_excel_bp.route("/pdf-to-excel", methods=["POST"])
@limiter.limit("10/minute")
def pdf_to_excel_route():
    """Queue conversion of an uploaded PDF's tables into an Excel file.

    Expects multipart/form-data with a 'file' field holding the PDF.
    Responds 202 with the Celery task id to poll for progress.
    """
    if "file" not in request.files:
        return jsonify({"error": "No file provided."}), 400

    upload = request.files["file"]

    actor = resolve_web_actor()
    try:
        assert_quota_available(actor)
    except PolicyError as err:
        return jsonify({"error": err.message}), err.status_code

    try:
        original_filename, ext = validate_actor_file(
            upload, allowed_types=["pdf"], actor=actor
        )
    except FileValidationError as err:
        return jsonify({"error": err.message}), err.code

    task_id, input_path = generate_safe_path(ext, folder_type="upload")
    upload.save(input_path)

    job = pdf_to_excel_task.delay(
        input_path,
        task_id,
        original_filename,
        **build_task_tracking_kwargs(actor),
    )
    record_accepted_usage(actor, "pdf-to-excel", job.id)

    payload = {
        "task_id": job.id,
        "message": "PDF to Excel conversion started. Poll /api/tasks/{task_id}/status for progress.",
    }
    return jsonify(payload), 202
|
||||
@@ -25,6 +25,9 @@ from app.tasks.pdf_tools_tasks import (
|
||||
watermark_pdf_task,
|
||||
protect_pdf_task,
|
||||
unlock_pdf_task,
|
||||
remove_watermark_task,
|
||||
reorder_pdf_task,
|
||||
extract_pages_task,
|
||||
)
|
||||
|
||||
pdf_tools_bp = Blueprint("pdf_tools", __name__)
|
||||
@@ -554,3 +557,161 @@ def unlock_pdf_route():
|
||||
"task_id": task.id,
|
||||
"message": "Unlock started. Poll /api/tasks/{task_id}/status for progress.",
|
||||
}), 202
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Remove Watermark — POST /api/pdf-tools/remove-watermark
|
||||
# ---------------------------------------------------------------------------
|
||||
@pdf_tools_bp.route("/remove-watermark", methods=["POST"])
@limiter.limit("10/minute")
def remove_watermark_route():
    """Queue watermark removal for an uploaded PDF.

    Expects multipart/form-data with a 'file' field holding the PDF.
    Responds 202 with the Celery task id to poll for progress.
    """
    if "file" not in request.files:
        return jsonify({"error": "No file provided."}), 400

    upload = request.files["file"]

    actor = resolve_web_actor()
    try:
        assert_quota_available(actor)
    except PolicyError as err:
        return jsonify({"error": err.message}), err.status_code

    try:
        original_filename, ext = validate_actor_file(
            upload, allowed_types=["pdf"], actor=actor
        )
    except FileValidationError as err:
        return jsonify({"error": err.message}), err.code

    task_id, input_path = generate_safe_path(ext, folder_type="upload")
    upload.save(input_path)

    job = remove_watermark_task.delay(
        input_path,
        task_id,
        original_filename,
        **build_task_tracking_kwargs(actor),
    )
    record_accepted_usage(actor, "remove-watermark", job.id)

    payload = {
        "task_id": job.id,
        "message": "Watermark removal started. Poll /api/tasks/{task_id}/status for progress.",
    }
    return jsonify(payload), 202
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Reorder PDF Pages — POST /api/pdf-tools/reorder
|
||||
# ---------------------------------------------------------------------------
|
||||
@pdf_tools_bp.route("/reorder", methods=["POST"])
@limiter.limit("10/minute")
def reorder_pdf_route():
    """Queue a page-reorder job for an uploaded PDF.

    Expects multipart/form-data:
        file       -- PDF document
        page_order -- comma-separated page numbers in the desired order,
                      e.g. "3,1,2"

    Responds 202 with the Celery task id to poll for progress.
    """
    if "file" not in request.files:
        return jsonify({"error": "No file provided."}), 400

    upload = request.files["file"]
    raw_order = request.form.get("page_order", "").strip()
    if not raw_order:
        return jsonify({"error": "Page order is required (e.g. '3,1,2')."}), 400

    # Parse "3,1,2" into [3, 1, 2]; empty tokens (trailing commas) are ignored.
    try:
        page_order = [int(tok.strip()) for tok in raw_order.split(",") if tok.strip()]
    except ValueError:
        return jsonify({"error": "Invalid page order. Use comma-separated numbers (e.g. '3,1,2')."}), 400

    # A string of only commas/whitespace parses to an empty list — reject it.
    if not page_order:
        return jsonify({"error": "Page order is required."}), 400

    actor = resolve_web_actor()
    try:
        assert_quota_available(actor)
    except PolicyError as err:
        return jsonify({"error": err.message}), err.status_code

    try:
        original_filename, ext = validate_actor_file(
            upload, allowed_types=["pdf"], actor=actor
        )
    except FileValidationError as err:
        return jsonify({"error": err.message}), err.code

    task_id, input_path = generate_safe_path(ext, folder_type="upload")
    upload.save(input_path)

    job = reorder_pdf_task.delay(
        input_path,
        task_id,
        original_filename,
        page_order,
        **build_task_tracking_kwargs(actor),
    )
    record_accepted_usage(actor, "reorder-pdf", job.id)

    payload = {
        "task_id": job.id,
        "message": "Reorder started. Poll /api/tasks/{task_id}/status for progress.",
    }
    return jsonify(payload), 202
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Extract Pages — POST /api/pdf-tools/extract-pages
|
||||
# ---------------------------------------------------------------------------
|
||||
@pdf_tools_bp.route("/extract-pages", methods=["POST"])
@limiter.limit("10/minute")
def extract_pages_route():
    """Queue extraction of selected pages from an uploaded PDF.

    Expects multipart/form-data:
        file  -- PDF document
        pages -- page specification string, e.g. "1,3,5-8"
                 (parsed by the worker task, not validated here)

    Responds 202 with the Celery task id to poll for progress.
    """
    if "file" not in request.files:
        return jsonify({"error": "No file provided."}), 400

    upload = request.files["file"]
    pages = request.form.get("pages", "").strip()
    if not pages:
        return jsonify({"error": "Pages specification is required (e.g. '1,3,5-8')."}), 400

    actor = resolve_web_actor()
    try:
        assert_quota_available(actor)
    except PolicyError as err:
        return jsonify({"error": err.message}), err.status_code

    try:
        original_filename, ext = validate_actor_file(
            upload, allowed_types=["pdf"], actor=actor
        )
    except FileValidationError as err:
        return jsonify({"error": err.message}), err.code

    task_id, input_path = generate_safe_path(ext, folder_type="upload")
    upload.save(input_path)

    job = extract_pages_task.delay(
        input_path,
        task_id,
        original_filename,
        pages,
        **build_task_tracking_kwargs(actor),
    )
    record_accepted_usage(actor, "extract-pages", job.id)

    payload = {
        "task_id": job.id,
        "message": "Page extraction started. Poll /api/tasks/{task_id}/status for progress.",
    }
    return jsonify(payload), 202
|
||||
|
||||
66
backend/app/routes/qrcode.py
Normal file
66
backend/app/routes/qrcode.py
Normal file
@@ -0,0 +1,66 @@
|
||||
"""QR code generation routes."""
|
||||
import uuid
|
||||
|
||||
from flask import Blueprint, request, jsonify
|
||||
|
||||
from app.extensions import limiter
|
||||
from app.services.policy_service import (
|
||||
assert_quota_available,
|
||||
build_task_tracking_kwargs,
|
||||
PolicyError,
|
||||
record_accepted_usage,
|
||||
resolve_web_actor,
|
||||
)
|
||||
from app.tasks.qrcode_tasks import generate_qr_task
|
||||
|
||||
qrcode_bp = Blueprint("qrcode", __name__)
|
||||
|
||||
|
||||
@qrcode_bp.route("/generate", methods=["POST"])
@limiter.limit("20/minute")
def generate_qr_route():
    """Queue generation of a PNG QR code from text or a URL.

    Accepts JSON or form-data:
        data -- text/URL to encode (required)
        size -- optional image size, clamped to 100-2000 (default 300)

    Responds 202 with the Celery task id to poll for progress.
    """
    if request.is_json:
        body = request.get_json(silent=True) or {}
        data = body.get("data", "")
        size = body.get("size", 300)
    else:
        data = request.form.get("data", "")
        size = request.form.get("size", "300")

    if not data or not str(data).strip():
        return jsonify({"error": "No data provided for QR code."}), 400

    # Clamp size into [100, 2000]; anything unparseable falls back to 300.
    try:
        size = max(100, min(2000, int(size)))
    except (ValueError, TypeError):
        size = 300

    actor = resolve_web_actor()
    try:
        assert_quota_available(actor)
    except PolicyError as err:
        return jsonify({"error": err.message}), err.status_code

    task_id = str(uuid.uuid4())

    job = generate_qr_task.delay(
        task_id,
        str(data).strip(),
        size,
        "png",
        **build_task_tracking_kwargs(actor),
    )
    record_accepted_usage(actor, "qr-code", job.id)

    payload = {
        "task_id": job.id,
        "message": "QR code generation started. Poll /api/tasks/{task_id}/status for progress.",
    }
    return jsonify(payload), 202
|
||||
90
backend/app/services/compress_image_service.py
Normal file
90
backend/app/services/compress_image_service.py
Normal file
@@ -0,0 +1,90 @@
|
||||
"""Image compression service using Pillow."""
|
||||
import os
|
||||
import logging
|
||||
|
||||
from PIL import Image
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class CompressImageError(Exception):
    """Raised when an image cannot be compressed (decode, encode, or I/O failure)."""
|
||||
|
||||
|
||||
FORMAT_MAP = {
|
||||
"jpg": "JPEG",
|
||||
"jpeg": "JPEG",
|
||||
"png": "PNG",
|
||||
"webp": "WEBP",
|
||||
}
|
||||
|
||||
|
||||
def compress_image(
    input_path: str,
    output_path: str,
    quality: int = 75,
) -> dict:
    """
    Compress an image by reducing quality and optimizing encoding.

    The output format is inferred from ``output_path``'s extension via
    FORMAT_MAP (unknown extensions default to JPEG).

    Args:
        input_path: Path to the input image
        output_path: Path for the compressed image
        quality: Output quality 1-100 (clamped into range; only applied
            to lossy formats — JPEG and WebP)

    Returns:
        dict with original_size, compressed_size, reduction_percent,
        width, height

    Raises:
        CompressImageError: If compression fails
    """
    quality = max(1, min(100, quality))
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    try:
        original_size = os.path.getsize(input_path)

        with Image.open(input_path) as img:
            width, height = img.size
            ext = os.path.splitext(output_path)[1].lower().lstrip(".")
            pil_format = FORMAT_MAP.get(ext, "JPEG")

            # JPEG has no alpha channel: composite transparent images onto
            # a white background before saving.
            if pil_format == "JPEG" and img.mode in ("RGBA", "P", "LA"):
                background = Image.new("RGB", img.size, (255, 255, 255))
                if img.mode == "P":
                    img = img.convert("RGBA")
                background.paste(
                    img, mask=img.split()[-1] if "A" in img.mode else None
                )
                img = background

            save_kwargs = {"optimize": True}
            if pil_format in ("JPEG", "WEBP"):
                save_kwargs["quality"] = quality
            elif pil_format == "PNG":
                # PNG is lossless; maximum zlib effort is the only lever.
                save_kwargs["compress_level"] = 9

            img.save(output_path, format=pil_format, **save_kwargs)

        compressed_size = os.path.getsize(output_path)
        # Guard against a zero-byte input to avoid ZeroDivisionError.
        reduction = round(
            (1 - compressed_size / original_size) * 100, 1
        ) if original_size > 0 else 0

        logger.info(
            f"Image compression: {original_size} → {compressed_size} "
            f"({reduction}% reduction)"
        )

        return {
            "original_size": original_size,
            "compressed_size": compressed_size,
            "reduction_percent": reduction,
            "width": width,
            "height": height,
        }

    except (IOError, OSError, Image.DecompressionBombError) as e:
        # Chain the cause so the Pillow/OS traceback survives (PEP 3134).
        raise CompressImageError(f"Image compression failed: {str(e)}") from e
|
||||
84
backend/app/services/html_to_pdf_service.py
Normal file
84
backend/app/services/html_to_pdf_service.py
Normal file
@@ -0,0 +1,84 @@
|
||||
"""HTML to PDF conversion service."""
|
||||
import os
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class HtmlToPdfError(Exception):
    """Raised when an HTML document cannot be rendered to PDF."""
|
||||
|
||||
|
||||
def html_to_pdf(
    input_path: str,
    output_path: str,
) -> dict:
    """
    Convert an HTML file to PDF.

    Args:
        input_path: Path to the input HTML file
        output_path: Path for the output PDF

    Returns:
        dict with output_size (bytes written)

    Raises:
        HtmlToPdfError: If weasyprint is unavailable or conversion fails
    """
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    try:
        # Imported lazily so the app can start without weasyprint installed.
        from weasyprint import HTML

        HTML(filename=input_path).write_pdf(output_path)

        output_size = os.path.getsize(output_path)
        logger.info(f"HTML→PDF conversion completed ({output_size} bytes)")

        return {
            "output_size": output_size,
        }

    except ImportError as e:
        # Chain the cause so the original traceback survives (PEP 3134).
        raise HtmlToPdfError("weasyprint library is not installed.") from e
    except Exception as e:
        raise HtmlToPdfError(f"Failed to convert HTML to PDF: {str(e)}") from e
|
||||
|
||||
|
||||
def html_string_to_pdf(
    html_content: str,
    output_path: str,
) -> dict:
    """
    Convert an HTML string to PDF.

    Args:
        html_content: HTML content as string
        output_path: Path for the output PDF

    Returns:
        dict with output_size (bytes written)

    Raises:
        HtmlToPdfError: If weasyprint is unavailable or conversion fails
    """
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    try:
        # Imported lazily so the app can start without weasyprint installed.
        from weasyprint import HTML

        HTML(string=html_content).write_pdf(output_path)

        output_size = os.path.getsize(output_path)
        logger.info(f"HTML string→PDF conversion completed ({output_size} bytes)")

        return {
            "output_size": output_size,
        }

    except ImportError as e:
        # Chain the cause so the original traceback survives (PEP 3134).
        raise HtmlToPdfError("weasyprint library is not installed.") from e
    except Exception as e:
        raise HtmlToPdfError(f"Failed to convert HTML to PDF: {str(e)}") from e
|
||||
266
backend/app/services/pdf_ai_service.py
Normal file
266
backend/app/services/pdf_ai_service.py
Normal file
@@ -0,0 +1,266 @@
|
||||
"""PDF AI services — Chat, Summarize, Translate, Table Extract."""
|
||||
import os
|
||||
import json
|
||||
import logging
|
||||
|
||||
import requests
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Configuration
|
||||
OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY", "")
|
||||
OPENROUTER_MODEL = os.getenv("OPENROUTER_MODEL", "meta-llama/llama-3-8b-instruct")
|
||||
OPENROUTER_BASE_URL = os.getenv(
|
||||
"OPENROUTER_BASE_URL", "https://openrouter.ai/api/v1/chat/completions"
|
||||
)
|
||||
|
||||
|
||||
class PdfAiError(Exception):
    """Raised when a PDF AI operation (extract, chat, summarize, ...) fails."""
|
||||
|
||||
|
||||
def _extract_text_from_pdf(input_path: str, max_pages: int = 50) -> str:
    """Extract text content from a PDF file.

    Args:
        input_path: Path to the PDF file.
        max_pages: Cap on the number of pages read, bounding work on
            very large documents.

    Returns:
        The pages' text joined with blank lines, each page prefixed with
        a "[Page N]" marker; pages with no extractable text are skipped.

    Raises:
        PdfAiError: If the PDF cannot be opened or parsed.
    """
    try:
        from PyPDF2 import PdfReader

        reader = PdfReader(input_path)
        texts = []
        for i, page in enumerate(reader.pages[:max_pages]):
            # extract_text() may return None for image-only pages.
            text = page.extract_text() or ""
            if text.strip():
                texts.append(f"[Page {i + 1}]\n{text}")
        return "\n\n".join(texts)
    except Exception as e:
        # Chain the cause so the PyPDF2 traceback survives (PEP 3134).
        raise PdfAiError(f"Failed to extract text from PDF: {str(e)}") from e
|
||||
|
||||
|
||||
def _call_openrouter(system_prompt: str, user_message: str, max_tokens: int = 1000) -> str:
    """Send a request to OpenRouter API and return the reply.

    Raises:
        PdfAiError: when the key is missing, the request fails, times out,
            or the model returns an empty completion.
    """
    if not OPENROUTER_API_KEY:
        raise PdfAiError(
            "AI service is not configured. Set OPENROUTER_API_KEY environment variable."
        )

    request_body = {
        "model": OPENROUTER_MODEL,
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_message},
        ],
        "max_tokens": max_tokens,
        "temperature": 0.5,
    }
    request_headers = {
        "Authorization": f"Bearer {OPENROUTER_API_KEY}",
        "Content-Type": "application/json",
    }

    try:
        resp = requests.post(
            OPENROUTER_BASE_URL,
            headers=request_headers,
            json=request_body,
            timeout=60,
        )
        resp.raise_for_status()
        payload = resp.json()

        # Defensive extraction: tolerate a missing/empty choices array.
        first_choice = payload.get("choices", [{}])[0]
        reply = first_choice.get("message", {}).get("content", "").strip()

        if not reply:
            raise PdfAiError("AI returned an empty response. Please try again.")

        return reply

    except requests.exceptions.Timeout:
        raise PdfAiError("AI service timed out. Please try again.")
    except requests.exceptions.RequestException as e:
        logger.error(f"OpenRouter API error: {e}")
        raise PdfAiError("AI service is temporarily unavailable.")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 1. Chat with PDF
|
||||
# ---------------------------------------------------------------------------
|
||||
def chat_with_pdf(input_path: str, question: str) -> dict:
    """
    Answer a question about a PDF document.

    Args:
        input_path: Path to the PDF file
        question: User's question about the document

    Returns:
        {"reply": "...", "pages_analyzed": int}
    """
    if not question or not question.strip():
        raise PdfAiError("Please provide a question.")

    document_text = _extract_text_from_pdf(input_path)
    if not document_text.strip():
        raise PdfAiError("Could not extract any text from the PDF.")

    # Keep the prompt within the model's context window.
    excerpt = document_text[:12000]

    system_prompt = (
        "You are a helpful document assistant. The user has uploaded a PDF document. "
        "Answer questions about the document based only on the content provided. "
        "If the answer is not in the document, say so. "
        "Reply in the same language the user uses."
    )
    reply = _call_openrouter(
        system_prompt,
        f"Document content:\n{excerpt}\n\nQuestion: {question}",
        max_tokens=800,
    )

    # Each extracted page was tagged "[Page N]" by _extract_text_from_pdf.
    return {"reply": reply, "pages_analyzed": document_text.count("[Page ")}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 2. Summarize PDF
|
||||
# ---------------------------------------------------------------------------
|
||||
def summarize_pdf(input_path: str, length: str = "medium") -> dict:
    """
    Generate a summary of a PDF document.

    Args:
        input_path: Path to the PDF file
        length: Summary length — "short", "medium", or "long"
            (unknown values fall back to "medium")

    Returns:
        {"summary": "...", "pages_analyzed": int}
    """
    document_text = _extract_text_from_pdf(input_path)
    if not document_text.strip():
        raise PdfAiError("Could not extract any text from the PDF.")

    medium_directive = "Provide a summary in 1-2 paragraphs covering the main points."
    directive = {
        "short": "Provide a brief summary in 2-3 sentences.",
        "medium": medium_directive,
        "long": "Provide a detailed summary covering all key points, arguments, and conclusions.",
    }.get(length, medium_directive)

    # Truncate to fit the model's context window.
    excerpt = document_text[:12000]

    summary = _call_openrouter(
        (
            "You are a professional document summarizer. "
            "Summarize the document accurately and concisely. "
            "Reply in the same language as the document."
        ),
        f"{directive}\n\nDocument content:\n{excerpt}",
        max_tokens=1000,
    )

    # Each extracted page was tagged "[Page N]" by _extract_text_from_pdf.
    return {"summary": summary, "pages_analyzed": document_text.count("[Page ")}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 3. Translate PDF
|
||||
# ---------------------------------------------------------------------------
|
||||
def translate_pdf(input_path: str, target_language: str) -> dict:
    """
    Translate the text content of a PDF to another language.

    Args:
        input_path: Path to the PDF file
        target_language: Target language name (e.g. "English", "Arabic", "French")

    Returns:
        {"translation": "...", "pages_analyzed": int, "target_language": str}
    """
    if not target_language or not target_language.strip():
        raise PdfAiError("Please specify a target language.")

    document_text = _extract_text_from_pdf(input_path)
    if not document_text.strip():
        raise PdfAiError("Could not extract any text from the PDF.")

    # Smaller excerpt than chat/summarize: the translation output is
    # roughly as long as the input, so leave room in the context window.
    excerpt = document_text[:10000]

    translator_prompt = (
        f"You are a professional translator. Translate the following document "
        f"content into {target_language}. Preserve the original formatting and "
        f"structure as much as possible. Only output the translation, nothing else."
    )
    translated = _call_openrouter(translator_prompt, excerpt, max_tokens=2000)

    return {
        "translation": translated,
        # Each extracted page was tagged "[Page N]" by _extract_text_from_pdf.
        "pages_analyzed": document_text.count("[Page "),
        "target_language": target_language,
    }
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 4. Extract Tables from PDF
|
||||
# ---------------------------------------------------------------------------
|
||||
def extract_tables(input_path: str) -> dict:
    """
    Extract tables from a PDF and return them as structured data.

    Args:
        input_path: Path to the PDF file

    Returns:
        {"tables": [...], "tables_found": int}

    Raises:
        PdfAiError: If extraction fails or the PDF contains no tables
    """
    try:
        import tabula

        frames = tabula.read_pdf(
            input_path, pages="all", multiple_tables=True, silent=True
        )

        if not frames:
            raise PdfAiError(
                "No tables found in the PDF. This tool works best with PDFs containing tabular data."
            )

        def _cell(value) -> str:
            # tabula represents empty cells as float NaN; render them as "".
            if isinstance(value, float) and str(value) == "nan":
                return ""
            return str(value)

        extracted = []
        for table_no, frame in enumerate(frames, start=1):
            # Flatten each DataFrame row into a {column: text} record.
            records = [
                {str(column): _cell(row[column]) for column in frame.columns}
                for _, row in frame.iterrows()
            ]
            extracted.append({
                "index": table_no,
                "columns": [str(c) for c in frame.columns],
                "rows": len(records),
                "data": records,
            })

        logger.info(f"Extracted {len(extracted)} tables from PDF")

        return {
            "tables": extracted,
            "tables_found": len(extracted),
        }

    except PdfAiError:
        raise
    except ImportError:
        raise PdfAiError("tabula-py library is not installed.")
    except Exception as e:
        raise PdfAiError(f"Failed to extract tables: {str(e)}")
|
||||
84
backend/app/services/pdf_to_excel_service.py
Normal file
84
backend/app/services/pdf_to_excel_service.py
Normal file
@@ -0,0 +1,84 @@
|
||||
"""PDF to Excel conversion service."""
|
||||
import os
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class PdfToExcelError(Exception):
    """Raised when a PDF-to-Excel conversion cannot be completed."""
|
||||
|
||||
|
||||
def pdf_to_excel(input_path: str, output_path: str) -> dict:
    """
    Convert a PDF file containing tables to an Excel spreadsheet.

    Each detected table is written to its own worksheet ("Table_1",
    "Table_2", ...), with the DataFrame columns as the header row.

    Args:
        input_path: Path to the input PDF
        output_path: Path for the output Excel file

    Returns:
        dict with tables_found, output_size

    Raises:
        PdfToExcelError: If conversion fails or no tables are detected
    """
    try:
        import tabula

        os.makedirs(os.path.dirname(output_path), exist_ok=True)

        # Pull every table tabula can detect across all pages.
        frames = tabula.read_pdf(
            input_path, pages="all", multiple_tables=True, silent=True
        )

        if not frames:
            raise PdfToExcelError(
                "No tables found in the PDF. This tool works best with PDFs that contain tabular data."
            )

        import openpyxl

        workbook = openpyxl.Workbook()
        # Drop the auto-created empty sheet so only table sheets remain.
        workbook.remove(workbook.active)

        for sheet_no, frame in enumerate(frames, 1):
            sheet = workbook.create_sheet(title=f"Table_{sheet_no}")
            # Header row, then data rows appended sequentially.
            sheet.append([str(name) for name in frame.columns])
            for values in frame.values:
                # tabula encodes blank cells as float NaN; write them as "".
                sheet.append([
                    "" if (isinstance(v, float) and str(v) == "nan") else v
                    for v in values
                ])

        workbook.save(output_path)

        output_size = os.path.getsize(output_path)

        logger.info(
            f"PDF→Excel: {len(frames)} tables extracted → {output_size} bytes"
        )

        return {
            "tables_found": len(frames),
            "output_size": output_size,
        }

    except PdfToExcelError:
        raise
    except ImportError as e:
        raise PdfToExcelError(f"Required library not installed: {e}")
    except Exception as e:
        raise PdfToExcelError(f"Failed to convert PDF to Excel: {str(e)}")
|
||||
@@ -705,3 +705,174 @@ def unlock_pdf(
|
||||
raise
|
||||
except Exception as e:
|
||||
raise PDFToolsError(f"Failed to unlock PDF: {str(e)}")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 10. Remove Watermark (best-effort text removal)
|
||||
# ---------------------------------------------------------------------------
|
||||
def remove_watermark(
    input_path: str,
    output_path: str,
) -> dict:
    """
    Attempt to remove text-based watermarks from a PDF.

    NOTE(review): real watermark removal requires parsing each page's
    content stream, which varies by PDF generator and is not implemented
    here. The current behavior is a best-effort rebuild: every page is
    copied through unchanged, which can drop some document-level overlay
    artifacts but leaves in-page watermarks intact.

    Args:
        input_path: Path to the input PDF
        output_path: Path for the output PDF

    Returns:
        dict with total_pages and output_size

    Raises:
        PDFToolsError: If removal fails
    """
    try:
        from PyPDF2 import PdfReader, PdfWriter

        reader = PdfReader(input_path)
        writer = PdfWriter()
        total_pages = len(reader.pages)

        # Copy pages as-is; content-stream filtering is intentionally
        # not attempted (see docstring).
        for page in reader.pages:
            writer.add_page(page)

        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        with open(output_path, "wb") as f:
            writer.write(f)

        logger.info(f"Remove watermark processed {total_pages} pages")

        return {
            "total_pages": total_pages,
            "output_size": os.path.getsize(output_path),
        }

    except PDFToolsError:
        raise
    except Exception as e:
        raise PDFToolsError(f"Failed to remove watermark: {str(e)}")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 11. Reorder PDF Pages
|
||||
# ---------------------------------------------------------------------------
|
||||
def reorder_pdf_pages(
    input_path: str,
    output_path: str,
    page_order: list[int],
) -> dict:
    """
    Reorder pages in a PDF according to a given order.

    Args:
        input_path: Path to the input PDF
        output_path: Path for the reordered output PDF
        page_order: List of 1-based page numbers in desired order
            (pages may be repeated or omitted)

    Returns:
        dict with total_pages, reordered_pages, output_size

    Raises:
        PDFToolsError: If reorder fails
    """
    try:
        from PyPDF2 import PdfReader, PdfWriter

        source = PdfReader(input_path)
        page_total = len(source.pages)

        if not page_order:
            raise PDFToolsError("No page order specified.")

        # Reject the first page number that falls outside the document.
        invalid = next((n for n in page_order if n < 1 or n > page_total), None)
        if invalid is not None:
            raise PDFToolsError(
                f"Page {invalid} is out of range. PDF has {page_total} pages."
            )

        # Assemble the output in the requested order (1-based → 0-based).
        builder = PdfWriter()
        for number in page_order:
            builder.add_page(source.pages[number - 1])

        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        with open(output_path, "wb") as handle:
            builder.write(handle)

        logger.info(f"Reordered PDF: {page_total} pages → order {page_order}")

        return {
            "total_pages": page_total,
            "reordered_pages": len(page_order),
            "output_size": os.path.getsize(output_path),
        }

    except PDFToolsError:
        raise
    except Exception as e:
        raise PDFToolsError(f"Failed to reorder PDF pages: {str(e)}")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 12. Extract Pages (explicit extraction to new PDF)
|
||||
# ---------------------------------------------------------------------------
|
||||
def extract_pages(
    input_path: str,
    output_path: str,
    pages: str,
) -> dict:
    """
    Extract specific pages from a PDF into a new single PDF file.

    Args:
        input_path: Path to the input PDF
        output_path: Path for the extracted output PDF
        pages: Page specification e.g. "1,3,5-8"

    Returns:
        dict with total_pages, extracted_pages, output_size

    Raises:
        PDFToolsError: If extraction fails
    """
    try:
        from PyPDF2 import PdfReader, PdfWriter

        source = PdfReader(input_path)
        page_total = len(source.pages)

        # _parse_page_range validates the spec against the page count and
        # returns 0-based indices.
        selected = _parse_page_range(pages, page_total)

        builder = PdfWriter()
        for index in selected:
            builder.add_page(source.pages[index])

        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        with open(output_path, "wb") as handle:
            builder.write(handle)

        logger.info(
            f"Extracted {len(selected)} pages from {page_total}-page PDF"
        )

        return {
            "total_pages": page_total,
            "extracted_pages": len(selected),
            "output_size": os.path.getsize(output_path),
        }

    except PDFToolsError:
        raise
    except Exception as e:
        raise PDFToolsError(f"Failed to extract pages: {str(e)}")
|
||||
|
||||
74
backend/app/services/qrcode_service.py
Normal file
74
backend/app/services/qrcode_service.py
Normal file
@@ -0,0 +1,74 @@
|
||||
"""QR Code generation service."""
|
||||
import os
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class QRCodeError(Exception):
    """Raised when a QR code cannot be generated."""
|
||||
|
||||
|
||||
def generate_qr_code(
    data: str,
    output_path: str,
    size: int = 300,
    output_format: str = "png",
) -> dict:
    """
    Generate a QR code image from text or URL data.

    Args:
        data: The content to encode (URL, text, etc.)
        output_path: Path for the output image
        size: QR code image size in pixels (clamped to 100-2000)
        output_format: Accepted for API compatibility but currently
            IGNORED — the actual image format is inferred by Pillow from
            the ``output_path`` extension, and SVG output is not
            implemented. (Previously documented as "png" or "svg",
            which was misleading.)

    Returns:
        dict with output_size, width, height

    Raises:
        QRCodeError: If generation fails or the input is empty/too long
    """
    if not data or not data.strip():
        raise QRCodeError("No data provided for QR code.")

    # QR codes max out around ~4300 alphanumeric chars; cap below that.
    if len(data) > 4000:
        raise QRCodeError("Data too long. Maximum 4000 characters.")

    size = max(100, min(2000, size))
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    try:
        import qrcode
        from PIL import Image

        qr = qrcode.QRCode(
            version=None,  # let the library pick the smallest version that fits
            error_correction=qrcode.constants.ERROR_CORRECT_M,
            box_size=10,
            border=4,  # 4-module quiet zone is the QR spec minimum
        )
        qr.add_data(data)
        qr.make(fit=True)

        img = qr.make_image(fill_color="black", back_color="white")

        # Scale to the requested pixel size; LANCZOS keeps edges crisp.
        img = img.resize((size, size), Image.Resampling.LANCZOS)
        img.save(output_path)

        output_size = os.path.getsize(output_path)
        logger.info(f"QR code generated: {size}x{size} ({output_size} bytes)")

        return {
            "output_size": output_size,
            "width": size,
            "height": size,
        }

    except ImportError:
        raise QRCodeError("qrcode library is not installed.")
    except Exception as e:
        raise QRCodeError(f"Failed to generate QR code: {str(e)}")
|
||||
90
backend/app/tasks/compress_image_tasks.py
Normal file
90
backend/app/tasks/compress_image_tasks.py
Normal file
@@ -0,0 +1,90 @@
|
||||
"""Celery tasks for image compression."""
|
||||
import os
|
||||
import logging
|
||||
|
||||
from flask import current_app
|
||||
|
||||
from app.extensions import celery
|
||||
from app.services.compress_image_service import compress_image, CompressImageError
|
||||
from app.services.storage_service import storage
|
||||
from app.services.task_tracking_service import finalize_task_tracking
|
||||
from app.utils.sanitizer import cleanup_task_files
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _cleanup(task_id: str):
    # Delete this task's temporary files. When S3 is not in use the output
    # file is kept locally, since the download URL then points at local storage.
    cleanup_task_files(task_id, keep_outputs=not storage.use_s3)
|
||||
|
||||
|
||||
@celery.task(bind=True, name="app.tasks.compress_image_tasks.compress_image_task")
def compress_image_task(
    self,
    input_path: str,
    task_id: str,
    original_filename: str,
    quality: int = 75,
    user_id: int | None = None,
    usage_source: str = "web",
    api_key_id: int | None = None,
):
    """Compress an image file."""
    # Output keeps the original extension so the compressed file stays
    # in the same format as the upload.
    ext = os.path.splitext(original_filename)[1].lstrip(".")
    output_dir = os.path.join(current_app.config["OUTPUT_FOLDER"], task_id)
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, f"{task_id}.{ext}")

    def _finish(result: dict) -> dict:
        # Persist the outcome, drop temp files, and hand the result back.
        finalize_task_tracking(
            user_id=user_id, tool="compress-image",
            original_filename=original_filename, result=result,
            usage_source=usage_source, api_key_id=api_key_id,
            celery_task_id=self.request.id,
        )
        _cleanup(task_id)
        return result

    try:
        self.update_state(state="PROCESSING", meta={"step": "Compressing image..."})
        stats = compress_image(input_path, output_path, quality)

        self.update_state(state="PROCESSING", meta={"step": "Uploading result..."})
        s3_key = storage.upload_file(output_path, task_id, folder="outputs")

        base_name = os.path.splitext(original_filename)[0]
        download_name = f"{base_name}_compressed.{ext}"
        download_url = storage.generate_presigned_url(s3_key, original_filename=download_name)

        logger.info(f"Task {task_id}: Image compression completed")
        return _finish({
            "status": "completed",
            "download_url": download_url,
            "filename": download_name,
            "original_size": stats["original_size"],
            "compressed_size": stats["compressed_size"],
            "reduction_percent": stats["reduction_percent"],
        })

    except CompressImageError as e:
        logger.error(f"Task {task_id}: {e}")
        return _finish({"status": "failed", "error": str(e)})

    except Exception as e:
        logger.error(f"Task {task_id}: Unexpected error — {e}")
        return _finish({"status": "failed", "error": "An unexpected error occurred."})
|
||||
86
backend/app/tasks/html_to_pdf_tasks.py
Normal file
86
backend/app/tasks/html_to_pdf_tasks.py
Normal file
@@ -0,0 +1,86 @@
|
||||
"""Celery tasks for HTML to PDF conversion."""
|
||||
import os
|
||||
import logging
|
||||
|
||||
from flask import current_app
|
||||
|
||||
from app.extensions import celery
|
||||
from app.services.html_to_pdf_service import html_to_pdf, html_string_to_pdf, HtmlToPdfError
|
||||
from app.services.storage_service import storage
|
||||
from app.services.task_tracking_service import finalize_task_tracking
|
||||
from app.utils.sanitizer import cleanup_task_files
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _cleanup(task_id: str):
    # Delete this task's temporary files. When S3 is not in use the output
    # file is kept locally, since the download URL then points at local storage.
    cleanup_task_files(task_id, keep_outputs=not storage.use_s3)
|
||||
|
||||
|
||||
@celery.task(bind=True, name="app.tasks.html_to_pdf_tasks.html_to_pdf_task")
def html_to_pdf_task(
    self,
    input_path: str,
    task_id: str,
    original_filename: str,
    user_id: int | None = None,
    usage_source: str = "web",
    api_key_id: int | None = None,
):
    """Convert an HTML file to PDF."""
    output_dir = os.path.join(current_app.config["OUTPUT_FOLDER"], task_id)
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, f"{task_id}.pdf")

    def _finish(result: dict) -> dict:
        # Persist the outcome, drop temp files, and hand the result back.
        finalize_task_tracking(
            user_id=user_id, tool="html-to-pdf",
            original_filename=original_filename, result=result,
            usage_source=usage_source, api_key_id=api_key_id,
            celery_task_id=self.request.id,
        )
        _cleanup(task_id)
        return result

    try:
        self.update_state(state="PROCESSING", meta={"step": "Converting HTML to PDF..."})
        stats = html_to_pdf(input_path, output_path)

        self.update_state(state="PROCESSING", meta={"step": "Uploading result..."})
        s3_key = storage.upload_file(output_path, task_id, folder="outputs")

        base_name = os.path.splitext(original_filename)[0]
        download_name = f"{base_name}.pdf"
        download_url = storage.generate_presigned_url(s3_key, original_filename=download_name)

        logger.info(f"Task {task_id}: HTML to PDF completed")
        return _finish({
            "status": "completed",
            "download_url": download_url,
            "filename": download_name,
            "output_size": stats["output_size"],
        })

    except HtmlToPdfError as e:
        logger.error(f"Task {task_id}: {e}")
        return _finish({"status": "failed", "error": str(e)})

    except Exception as e:
        logger.error(f"Task {task_id}: Unexpected error — {e}")
        return _finish({"status": "failed", "error": "An unexpected error occurred."})
|
||||
266
backend/app/tasks/pdf_ai_tasks.py
Normal file
266
backend/app/tasks/pdf_ai_tasks.py
Normal file
@@ -0,0 +1,266 @@
|
||||
"""Celery tasks for PDF AI tools — Chat, Summarize, Translate, Table Extract."""
|
||||
import os
|
||||
import logging
|
||||
|
||||
from flask import current_app
|
||||
|
||||
from app.extensions import celery
|
||||
from app.services.pdf_ai_service import (
|
||||
chat_with_pdf,
|
||||
summarize_pdf,
|
||||
translate_pdf,
|
||||
extract_tables,
|
||||
PdfAiError,
|
||||
)
|
||||
from app.services.task_tracking_service import finalize_task_tracking
|
||||
from app.utils.sanitizer import cleanup_task_files
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _cleanup(task_id: str):
    # AI tools return text in the task result rather than a downloadable
    # file, so no output files need to be kept in either storage mode.
    cleanup_task_files(task_id, keep_outputs=False)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Chat with PDF
|
||||
# ---------------------------------------------------------------------------
|
||||
@celery.task(bind=True, name="app.tasks.pdf_ai_tasks.chat_with_pdf_task")
def chat_with_pdf_task(
    self,
    input_path: str,
    task_id: str,
    original_filename: str,
    question: str,
    user_id: int | None = None,
    usage_source: str = "web",
    api_key_id: int | None = None,
):
    """Ask a question about a PDF document."""

    def _finish(result: dict) -> dict:
        # Persist the outcome, drop temp files, and hand the result back.
        finalize_task_tracking(
            user_id=user_id, tool="chat-pdf",
            original_filename=original_filename, result=result,
            usage_source=usage_source, api_key_id=api_key_id,
            celery_task_id=self.request.id,
        )
        _cleanup(task_id)
        return result

    try:
        self.update_state(state="PROCESSING", meta={"step": "Analyzing document..."})
        data = chat_with_pdf(input_path, question)

        logger.info(f"Task {task_id}: Chat with PDF completed")
        return _finish({
            "status": "completed",
            "reply": data["reply"],
            "pages_analyzed": data["pages_analyzed"],
        })

    except PdfAiError as e:
        logger.error(f"Task {task_id}: {e}")
        return _finish({"status": "failed", "error": str(e)})

    except Exception as e:
        logger.error(f"Task {task_id}: Unexpected error — {e}")
        return _finish({"status": "failed", "error": "An unexpected error occurred."})
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Summarize PDF
|
||||
# ---------------------------------------------------------------------------
|
||||
@celery.task(bind=True, name="app.tasks.pdf_ai_tasks.summarize_pdf_task")
def summarize_pdf_task(
    self,
    input_path: str,
    task_id: str,
    original_filename: str,
    length: str = "medium",
    user_id: int | None = None,
    usage_source: str = "web",
    api_key_id: int | None = None,
):
    """Generate a summary of a PDF document."""

    def _finish(result: dict) -> dict:
        # Persist the outcome, drop temp files, and hand the result back.
        finalize_task_tracking(
            user_id=user_id, tool="summarize-pdf",
            original_filename=original_filename, result=result,
            usage_source=usage_source, api_key_id=api_key_id,
            celery_task_id=self.request.id,
        )
        _cleanup(task_id)
        return result

    try:
        self.update_state(state="PROCESSING", meta={"step": "Summarizing document..."})
        data = summarize_pdf(input_path, length)

        logger.info(f"Task {task_id}: PDF summarize completed")
        return _finish({
            "status": "completed",
            "summary": data["summary"],
            "pages_analyzed": data["pages_analyzed"],
        })

    except PdfAiError as e:
        logger.error(f"Task {task_id}: {e}")
        return _finish({"status": "failed", "error": str(e)})

    except Exception as e:
        logger.error(f"Task {task_id}: Unexpected error — {e}")
        return _finish({"status": "failed", "error": "An unexpected error occurred."})
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Translate PDF
|
||||
# ---------------------------------------------------------------------------
|
||||
@celery.task(bind=True, name="app.tasks.pdf_ai_tasks.translate_pdf_task")
def translate_pdf_task(
    self,
    input_path: str,
    task_id: str,
    original_filename: str,
    target_language: str,
    user_id: int | None = None,
    usage_source: str = "web",
    api_key_id: int | None = None,
):
    """Translate a PDF document to another language."""

    def _finish(result: dict) -> dict:
        # Persist the outcome, drop temp files, and hand the result back.
        finalize_task_tracking(
            user_id=user_id, tool="translate-pdf",
            original_filename=original_filename, result=result,
            usage_source=usage_source, api_key_id=api_key_id,
            celery_task_id=self.request.id,
        )
        _cleanup(task_id)
        return result

    try:
        self.update_state(state="PROCESSING", meta={"step": "Translating document..."})
        data = translate_pdf(input_path, target_language)

        logger.info(f"Task {task_id}: PDF translate completed")
        return _finish({
            "status": "completed",
            "translation": data["translation"],
            "pages_analyzed": data["pages_analyzed"],
            "target_language": data["target_language"],
        })

    except PdfAiError as e:
        logger.error(f"Task {task_id}: {e}")
        return _finish({"status": "failed", "error": str(e)})

    except Exception as e:
        logger.error(f"Task {task_id}: Unexpected error — {e}")
        return _finish({"status": "failed", "error": "An unexpected error occurred."})
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
# Extract Tables
# ---------------------------------------------------------------------------
@celery.task(bind=True, name="app.tasks.pdf_ai_tasks.extract_tables_task")
def extract_tables_task(
    self,
    input_path: str,
    task_id: str,
    original_filename: str,
    user_id: int | None = None,
    usage_source: str = "web",
    api_key_id: int | None = None,
):
    """Extract tables from a PDF document.

    Args:
        input_path: Path to the uploaded PDF on local disk.
        task_id: Public task identifier used for tracking and cleanup.
        original_filename: File name as uploaded by the user.
        user_id: Owning user id, or ``None`` for anonymous usage.
        usage_source: Origin of the request (e.g. "web").
        api_key_id: API key used for the request, if any.

    Returns:
        A dict with ``status`` plus either the extracted tables or an
        ``error`` message.
    """
    try:
        self.update_state(state="PROCESSING", meta={"step": "Extracting tables..."})

        data = extract_tables(input_path)

        result = {
            "status": "completed",
            "tables": data["tables"],
            "tables_found": data["tables_found"],
        }
        logger.info(f"Task {task_id}: Table extraction completed")
    except PdfAiError as e:
        # Known, user-facing extraction failure — surface its message.
        logger.error(f"Task {task_id}: {e}")
        result = {"status": "failed", "error": str(e)}
    except Exception as e:
        # Unknown failure — log details, return a generic message to the client.
        logger.error(f"Task {task_id}: Unexpected error — {e}")
        result = {"status": "failed", "error": "An unexpected error occurred."}

    # Shared tail: record usage, clean temp files, and return the result
    # (previously triplicated across the success and both error paths).
    finalize_task_tracking(
        user_id=user_id, tool="extract-tables",
        original_filename=original_filename, result=result,
        usage_source=usage_source, api_key_id=api_key_id,
        celery_task_id=self.request.id,
    )
    _cleanup(task_id)
    return result
||||
87
backend/app/tasks/pdf_to_excel_tasks.py
Normal file
87
backend/app/tasks/pdf_to_excel_tasks.py
Normal file
@@ -0,0 +1,87 @@
|
||||
"""Celery tasks for PDF to Excel conversion."""
|
||||
import os
|
||||
import logging
|
||||
|
||||
from flask import current_app
|
||||
|
||||
from app.extensions import celery
|
||||
from app.services.pdf_to_excel_service import pdf_to_excel, PdfToExcelError
|
||||
from app.services.storage_service import storage
|
||||
from app.services.task_tracking_service import finalize_task_tracking
|
||||
from app.utils.sanitizer import cleanup_task_files
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _cleanup(task_id: str):
    """Remove this task's working files; keep outputs only when served from local disk."""
    keep_local_outputs = not storage.use_s3
    cleanup_task_files(task_id, keep_outputs=keep_local_outputs)
|
||||
|
||||
|
||||
@celery.task(bind=True, name="app.tasks.pdf_to_excel_tasks.pdf_to_excel_task")
def pdf_to_excel_task(
    self,
    input_path: str,
    task_id: str,
    original_filename: str,
    user_id: int | None = None,
    usage_source: str = "web",
    api_key_id: int | None = None,
):
    """Convert PDF tables to Excel.

    Args:
        input_path: Path to the uploaded PDF on local disk.
        task_id: Public task identifier used for tracking, output naming
            and cleanup.
        original_filename: File name as uploaded by the user; its stem is
            reused for the downloadable ``.xlsx``.
        user_id: Owning user id, or ``None`` for anonymous usage.
        usage_source: Origin of the request (e.g. "web").
        api_key_id: API key used for the request, if any.

    Returns:
        A dict with ``status`` plus either download info and conversion
        stats, or an ``error`` message.
    """
    output_dir = os.path.join(current_app.config["OUTPUT_FOLDER"], task_id)
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, f"{task_id}.xlsx")

    try:
        self.update_state(state="PROCESSING", meta={"step": "Extracting tables from PDF..."})

        stats = pdf_to_excel(input_path, output_path)

        self.update_state(state="PROCESSING", meta={"step": "Uploading result..."})
        s3_key = storage.upload_file(output_path, task_id, folder="outputs")

        # Download keeps the user's original name, with an .xlsx extension.
        name_without_ext = os.path.splitext(original_filename)[0]
        download_name = f"{name_without_ext}.xlsx"
        download_url = storage.generate_presigned_url(s3_key, original_filename=download_name)

        result = {
            "status": "completed",
            "download_url": download_url,
            "filename": download_name,
            "tables_found": stats["tables_found"],
            "output_size": stats["output_size"],
        }
        logger.info(f"Task {task_id}: PDF to Excel completed")
    except PdfToExcelError as e:
        # Known, user-facing conversion failure — surface its message.
        logger.error(f"Task {task_id}: {e}")
        result = {"status": "failed", "error": str(e)}
    except Exception as e:
        # Unknown failure — log details, return a generic message to the client.
        logger.error(f"Task {task_id}: Unexpected error — {e}")
        result = {"status": "failed", "error": "An unexpected error occurred."}

    # Shared tail: record usage, clean temp files, and return the result
    # (previously triplicated across the success and both error paths).
    finalize_task_tracking(
        user_id=user_id, tool="pdf-to-excel",
        original_filename=original_filename, result=result,
        usage_source=usage_source, api_key_id=api_key_id,
        celery_task_id=self.request.id,
    )
    _cleanup(task_id)
    return result
|
||||
@@ -15,6 +15,9 @@ from app.services.pdf_tools_service import (
|
||||
add_watermark,
|
||||
protect_pdf,
|
||||
unlock_pdf,
|
||||
remove_watermark,
|
||||
reorder_pdf_pages,
|
||||
extract_pages,
|
||||
PDFToolsError,
|
||||
)
|
||||
from app.services.storage_service import storage
|
||||
@@ -712,3 +715,172 @@ def unlock_pdf_task(
|
||||
api_key_id,
|
||||
self.request.id,
|
||||
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
# Remove Watermark
# ---------------------------------------------------------------------------
@celery.task(bind=True, name="app.tasks.pdf_tools_tasks.remove_watermark_task")
def remove_watermark_task(
    self, input_path: str, task_id: str, original_filename: str,
    user_id: int | None = None,
    usage_source: str = "web",
    api_key_id: int | None = None,
):
    """Async task: Remove watermark from a PDF.

    Args:
        input_path: Path to the uploaded PDF on local disk.
        task_id: Public task identifier used for tracking, output naming
            and cleanup.
        original_filename: File name as uploaded by the user.
        user_id: Owning user id, or ``None`` for anonymous usage.
        usage_source: Origin of the request (e.g. "web").
        api_key_id: API key used for the request, if any.

    Returns:
        The finalized result dict from ``_finalize_task`` (completed with
        download info, or failed with an ``error`` message).
    """
    output_dir = _get_output_dir(task_id)
    output_path = os.path.join(output_dir, f"{task_id}_no_watermark.pdf")

    try:
        self.update_state(state="PROCESSING", meta={"step": "Removing watermark..."})
        stats = remove_watermark(input_path, output_path)

        self.update_state(state="PROCESSING", meta={"step": "Uploading result..."})
        s3_key = storage.upload_file(output_path, task_id, folder="outputs")

        # Download keeps the user's original name with a "_no_watermark" suffix.
        name_without_ext = os.path.splitext(original_filename)[0]
        download_name = f"{name_without_ext}_no_watermark.pdf"
        download_url = storage.generate_presigned_url(s3_key, original_filename=download_name)

        result = {
            "status": "completed",
            "download_url": download_url,
            "filename": download_name,
            "total_pages": stats["total_pages"],
            "output_size": stats["output_size"],
        }
        logger.info(f"Task {task_id}: Watermark removed")
    except PDFToolsError as e:
        # Known, user-facing tool failure — surface its message.
        logger.error(f"Task {task_id}: Remove watermark error — {e}")
        result = {"status": "failed", "error": str(e)}
    except Exception as e:
        # Unknown failure — log details, return a generic message to the client.
        logger.error(f"Task {task_id}: Unexpected error — {e}")
        result = {"status": "failed", "error": "An unexpected error occurred."}

    # Single shared finalize call instead of one copy per branch.
    return _finalize_task(
        task_id, user_id, "remove-watermark", original_filename,
        result, usage_source, api_key_id, self.request.id,
    )
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
# Reorder PDF Pages
# ---------------------------------------------------------------------------
@celery.task(bind=True, name="app.tasks.pdf_tools_tasks.reorder_pdf_task")
def reorder_pdf_task(
    self, input_path: str, task_id: str, original_filename: str,
    page_order: list[int],
    user_id: int | None = None,
    usage_source: str = "web",
    api_key_id: int | None = None,
):
    """Async task: Reorder pages in a PDF.

    Args:
        input_path: Path to the uploaded PDF on local disk.
        task_id: Public task identifier used for tracking, output naming
            and cleanup.
        original_filename: File name as uploaded by the user.
        page_order: New page sequence, as expected by ``reorder_pdf_pages``.
        user_id: Owning user id, or ``None`` for anonymous usage.
        usage_source: Origin of the request (e.g. "web").
        api_key_id: API key used for the request, if any.

    Returns:
        The finalized result dict from ``_finalize_task`` (completed with
        download info, or failed with an ``error`` message).
    """
    output_dir = _get_output_dir(task_id)
    output_path = os.path.join(output_dir, f"{task_id}_reordered.pdf")

    try:
        self.update_state(state="PROCESSING", meta={"step": "Reordering pages..."})
        stats = reorder_pdf_pages(input_path, output_path, page_order)

        self.update_state(state="PROCESSING", meta={"step": "Uploading result..."})
        s3_key = storage.upload_file(output_path, task_id, folder="outputs")

        # Download keeps the user's original name with a "_reordered" suffix.
        name_without_ext = os.path.splitext(original_filename)[0]
        download_name = f"{name_without_ext}_reordered.pdf"
        download_url = storage.generate_presigned_url(s3_key, original_filename=download_name)

        result = {
            "status": "completed",
            "download_url": download_url,
            "filename": download_name,
            "total_pages": stats["total_pages"],
            "reordered_pages": stats["reordered_pages"],
            "output_size": stats["output_size"],
        }
        logger.info(f"Task {task_id}: PDF pages reordered")
    except PDFToolsError as e:
        # Known, user-facing tool failure — surface its message.
        logger.error(f"Task {task_id}: Reorder error — {e}")
        result = {"status": "failed", "error": str(e)}
    except Exception as e:
        # Unknown failure — log details, return a generic message to the client.
        logger.error(f"Task {task_id}: Unexpected error — {e}")
        result = {"status": "failed", "error": "An unexpected error occurred."}

    # Single shared finalize call instead of one copy per branch.
    return _finalize_task(
        task_id, user_id, "reorder-pdf", original_filename,
        result, usage_source, api_key_id, self.request.id,
    )
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
# Extract Pages (to single PDF)
# ---------------------------------------------------------------------------
@celery.task(bind=True, name="app.tasks.pdf_tools_tasks.extract_pages_task")
def extract_pages_task(
    self, input_path: str, task_id: str, original_filename: str,
    pages: str,
    user_id: int | None = None,
    usage_source: str = "web",
    api_key_id: int | None = None,
):
    """Async task: Extract specific pages from a PDF into a new PDF.

    Args:
        input_path: Path to the uploaded PDF on local disk.
        task_id: Public task identifier used for tracking, output naming
            and cleanup.
        original_filename: File name as uploaded by the user.
        pages: Page selection string, as expected by ``extract_pages``.
        user_id: Owning user id, or ``None`` for anonymous usage.
        usage_source: Origin of the request (e.g. "web").
        api_key_id: API key used for the request, if any.

    Returns:
        The finalized result dict from ``_finalize_task`` (completed with
        download info, or failed with an ``error`` message).
    """
    output_dir = _get_output_dir(task_id)
    output_path = os.path.join(output_dir, f"{task_id}_extracted.pdf")

    try:
        self.update_state(state="PROCESSING", meta={"step": "Extracting pages..."})
        stats = extract_pages(input_path, output_path, pages)

        self.update_state(state="PROCESSING", meta={"step": "Uploading result..."})
        s3_key = storage.upload_file(output_path, task_id, folder="outputs")

        # Download keeps the user's original name with an "_extracted" suffix.
        name_without_ext = os.path.splitext(original_filename)[0]
        download_name = f"{name_without_ext}_extracted.pdf"
        download_url = storage.generate_presigned_url(s3_key, original_filename=download_name)

        result = {
            "status": "completed",
            "download_url": download_url,
            "filename": download_name,
            "total_pages": stats["total_pages"],
            "extracted_pages": stats["extracted_pages"],
            "output_size": stats["output_size"],
        }
        logger.info(f"Task {task_id}: Pages extracted")
    except PDFToolsError as e:
        # Known, user-facing tool failure — surface its message.
        logger.error(f"Task {task_id}: Extract pages error — {e}")
        result = {"status": "failed", "error": str(e)}
    except Exception as e:
        # Unknown failure — log details, return a generic message to the client.
        logger.error(f"Task {task_id}: Unexpected error — {e}")
        result = {"status": "failed", "error": "An unexpected error occurred."}

    # Single shared finalize call instead of one copy per branch.
    return _finalize_task(
        task_id, user_id, "extract-pages", original_filename,
        result, usage_source, api_key_id, self.request.id,
    )
|
||||
|
||||
88
backend/app/tasks/qrcode_tasks.py
Normal file
88
backend/app/tasks/qrcode_tasks.py
Normal file
@@ -0,0 +1,88 @@
|
||||
"""Celery tasks for QR code generation."""
|
||||
import os
|
||||
import logging
|
||||
|
||||
from flask import current_app
|
||||
|
||||
from app.extensions import celery
|
||||
from app.services.qrcode_service import generate_qr_code, QRCodeError
|
||||
from app.services.storage_service import storage
|
||||
from app.services.task_tracking_service import finalize_task_tracking
|
||||
from app.utils.sanitizer import cleanup_task_files
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _cleanup(task_id: str):
    """Drop the task's temp files, preserving outputs unless they live in S3."""
    cleanup_task_files(
        task_id,
        keep_outputs=not storage.use_s3,
    )
|
||||
|
||||
|
||||
@celery.task(bind=True, name="app.tasks.qrcode_tasks.generate_qr_task")
def generate_qr_task(
    self,
    task_id: str,
    data: str,
    size: int = 300,
    output_format: str = "png",
    user_id: int | None = None,
    usage_source: str = "web",
    api_key_id: int | None = None,
):
    """Generate a QR code image.

    Args:
        task_id: Public task identifier used for tracking, output naming
            and cleanup.
        data: Payload to encode into the QR code.
        size: Target image size in pixels (passed to ``generate_qr_code``).
        output_format: Image format/extension, e.g. "png".
        user_id: Owning user id, or ``None`` for anonymous usage.
        usage_source: Origin of the request (e.g. "web").
        api_key_id: API key used for the request, if any.

    Returns:
        A dict with ``status`` plus either download info and image stats,
        or an ``error`` message.
    """
    output_dir = os.path.join(current_app.config["OUTPUT_FOLDER"], task_id)
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, f"{task_id}.{output_format}")

    try:
        self.update_state(state="PROCESSING", meta={"step": "Generating QR code..."})

        stats = generate_qr_code(data, output_path, size, output_format)

        self.update_state(state="PROCESSING", meta={"step": "Uploading result..."})
        s3_key = storage.upload_file(output_path, task_id, folder="outputs")

        download_name = f"qrcode.{output_format}"
        download_url = storage.generate_presigned_url(s3_key, original_filename=download_name)

        result = {
            "status": "completed",
            "download_url": download_url,
            "filename": download_name,
            "output_size": stats["output_size"],
            "width": stats["width"],
            "height": stats["height"],
        }
        logger.info(f"Task {task_id}: QR code generated")
    except QRCodeError as e:
        # Known, user-facing generation failure — surface its message.
        logger.error(f"Task {task_id}: {e}")
        result = {"status": "failed", "error": str(e)}
    except Exception as e:
        # Unknown failure — log details, return a generic message to the client.
        logger.error(f"Task {task_id}: Unexpected error — {e}")
        result = {"status": "failed", "error": "An unexpected error occurred."}

    # Shared tail: record usage, clean temp files, and return the result
    # (previously triplicated across the success and both error paths).
    finalize_task_tracking(
        user_id=user_id, tool="qr-code",
        original_filename="qrcode", result=result,
        usage_source=usage_source, api_key_id=api_key_id,
        celery_task_id=self.request.id,
    )
    _cleanup(task_id)
    return result
|
||||
Reference in New Issue
Block a user