feat: harden PDF translation workflow

This commit is contained in:
Your Name
2026-03-30 14:24:18 +02:00
parent 499ebe3ce8
commit 6e8cf6f83a
17 changed files with 1358 additions and 1931 deletions

View File

@@ -20,6 +20,11 @@ OPENROUTER_API_KEY=
OPENROUTER_MODEL=nvidia/nemotron-3-super-120b-a12b:free
OPENROUTER_BASE_URL=https://openrouter.ai/api/v1/chat/completions
# Premium document translation (recommended for Translate PDF)
DEEPL_API_KEY=
DEEPL_API_URL=https://api-free.deepl.com/v2/translate
DEEPL_TIMEOUT_SECONDS=90
# AWS S3
AWS_ACCESS_KEY_ID=
AWS_SECRET_ACCESS_KEY=

View File

@@ -1,4 +1,5 @@
"""PDF AI tool routes — Chat, Summarize, Translate, Table Extract."""
from flask import Blueprint, request, jsonify
from app.extensions import limiter
@@ -70,10 +71,12 @@ def chat_pdf_route():
)
record_accepted_usage(actor, "chat-pdf", task.id)
return jsonify({
return jsonify(
{
"task_id": task.id,
"message": "Processing your question. Poll /api/tasks/{task_id}/status for progress.",
}), 202
}
), 202
# ---------------------------------------------------------------------------
@@ -124,10 +127,12 @@ def summarize_pdf_route():
)
record_accepted_usage(actor, "summarize-pdf", task.id)
return jsonify({
return jsonify(
{
"task_id": task.id,
"message": "Summarizing document. Poll /api/tasks/{task_id}/status for progress.",
}), 202
}
), 202
# ---------------------------------------------------------------------------
@@ -149,6 +154,7 @@ def translate_pdf_route():
file = request.files["file"]
target_language = request.form.get("target_language", "").strip()
source_language = request.form.get("source_language", "auto").strip()
if not target_language:
return jsonify({"error": "No target language specified."}), 400
@@ -174,14 +180,17 @@ def translate_pdf_route():
task_id,
original_filename,
target_language,
source_language,
**build_task_tracking_kwargs(actor),
)
record_accepted_usage(actor, "translate-pdf", task.id)
return jsonify({
return jsonify(
{
"task_id": task.id,
"message": "Translating document. Poll /api/tasks/{task_id}/status for progress.",
}), 202
}
), 202
# ---------------------------------------------------------------------------
@@ -226,7 +235,9 @@ def extract_tables_route():
)
record_accepted_usage(actor, "extract-tables", task.id)
return jsonify({
return jsonify(
{
"task_id": task.id,
"message": "Extracting tables. Poll /api/tasks/{task_id}/status for progress.",
}), 202
}
), 202

View File

@@ -1,4 +1,5 @@
"""B2B API v1 tool routes — authenticated via X-API-Key, Pro plan only."""
import os
import uuid
import logging
@@ -37,16 +38,25 @@ from app.tasks.flowchart_tasks import extract_flowchart_task
from app.tasks.ocr_tasks import ocr_image_task, ocr_pdf_task
from app.tasks.removebg_tasks import remove_bg_task
from app.tasks.pdf_ai_tasks import (
chat_with_pdf_task, summarize_pdf_task, translate_pdf_task, extract_tables_task,
chat_with_pdf_task,
summarize_pdf_task,
translate_pdf_task,
extract_tables_task,
)
from app.tasks.pdf_to_excel_tasks import pdf_to_excel_task
from app.tasks.html_to_pdf_tasks import html_to_pdf_task
from app.tasks.qrcode_tasks import generate_qr_task
from app.tasks.pdf_convert_tasks import (
pdf_to_pptx_task, excel_to_pdf_task, pptx_to_pdf_task, sign_pdf_task,
pdf_to_pptx_task,
excel_to_pdf_task,
pptx_to_pdf_task,
sign_pdf_task,
)
from app.tasks.pdf_extra_tasks import (
crop_pdf_task, flatten_pdf_task, repair_pdf_task, edit_metadata_task,
crop_pdf_task,
flatten_pdf_task,
repair_pdf_task,
edit_metadata_task,
)
from app.tasks.image_extra_tasks import crop_image_task, rotate_flip_image_task
from app.tasks.barcode_tasks import generate_barcode_task
@@ -80,6 +90,7 @@ def _resolve_and_check() -> tuple:
# Task status — GET /api/v1/tasks/<task_id>/status
# ---------------------------------------------------------------------------
@v1_bp.route("/tasks/<task_id>/status", methods=["GET"])
@limiter.limit("300/minute", override_defaults=True)
def get_task_status(task_id: str):
@@ -113,6 +124,7 @@ def get_task_status(task_id: str):
# Compress — POST /api/v1/compress/pdf
# ---------------------------------------------------------------------------
@v1_bp.route("/compress/pdf", methods=["POST"])
@limiter.limit("10/minute")
def compress_pdf_route():
@@ -130,7 +142,9 @@ def compress_pdf_route():
quality = "medium"
try:
original_filename, ext = validate_actor_file(file, allowed_types=["pdf"], actor=actor)
original_filename, ext = validate_actor_file(
file, allowed_types=["pdf"], actor=actor
)
except FileValidationError as e:
return jsonify({"error": e.message}), e.code
@@ -138,7 +152,10 @@ def compress_pdf_route():
file.save(input_path)
task = compress_pdf_task.delay(
input_path, task_id, original_filename, quality,
input_path,
task_id,
original_filename,
quality,
**build_task_tracking_kwargs(actor),
)
record_accepted_usage(actor, "compress-pdf", task.id)
@@ -150,6 +167,7 @@ def compress_pdf_route():
# Convert — POST /api/v1/convert/pdf-to-word & /api/v1/convert/word-to-pdf
# ---------------------------------------------------------------------------
@v1_bp.route("/convert/pdf-to-word", methods=["POST"])
@limiter.limit("10/minute")
def pdf_to_word_route():
@@ -163,7 +181,9 @@ def pdf_to_word_route():
file = request.files["file"]
try:
original_filename, ext = validate_actor_file(file, allowed_types=["pdf"], actor=actor)
original_filename, ext = validate_actor_file(
file, allowed_types=["pdf"], actor=actor
)
except FileValidationError as e:
return jsonify({"error": e.message}), e.code
@@ -171,7 +191,9 @@ def pdf_to_word_route():
file.save(input_path)
task = convert_pdf_to_word.delay(
input_path, task_id, original_filename,
input_path,
task_id,
original_filename,
**build_task_tracking_kwargs(actor),
)
record_accepted_usage(actor, "pdf-to-word", task.id)
@@ -201,7 +223,9 @@ def word_to_pdf_route():
file.save(input_path)
task = convert_word_to_pdf.delay(
input_path, task_id, original_filename,
input_path,
task_id,
original_filename,
**build_task_tracking_kwargs(actor),
)
record_accepted_usage(actor, "word-to-pdf", task.id)
@@ -212,6 +236,7 @@ def word_to_pdf_route():
# Image — POST /api/v1/image/convert & /api/v1/image/resize
# ---------------------------------------------------------------------------
@v1_bp.route("/image/convert", methods=["POST"])
@limiter.limit("10/minute")
def convert_image_route():
@@ -226,7 +251,9 @@ def convert_image_route():
file = request.files["file"]
output_format = request.form.get("format", "").lower()
if output_format not in ALLOWED_OUTPUT_FORMATS:
return jsonify({"error": f"Invalid format. Supported: {', '.join(ALLOWED_OUTPUT_FORMATS)}"}), 400
return jsonify(
{"error": f"Invalid format. Supported: {', '.join(ALLOWED_OUTPUT_FORMATS)}"}
), 400
try:
quality = max(1, min(100, int(request.form.get("quality", "85"))))
@@ -244,7 +271,11 @@ def convert_image_route():
file.save(input_path)
task = convert_image_task.delay(
input_path, task_id, original_filename, output_format, quality,
input_path,
task_id,
original_filename,
output_format,
quality,
**build_task_tracking_kwargs(actor),
)
record_accepted_usage(actor, "image-convert", task.id)
@@ -292,7 +323,12 @@ def resize_image_route():
file.save(input_path)
task = resize_image_task.delay(
input_path, task_id, original_filename, width, height, quality,
input_path,
task_id,
original_filename,
width,
height,
quality,
**build_task_tracking_kwargs(actor),
)
record_accepted_usage(actor, "image-resize", task.id)
@@ -303,6 +339,7 @@ def resize_image_route():
# Video — POST /api/v1/video/to-gif
# ---------------------------------------------------------------------------
@v1_bp.route("/video/to-gif", methods=["POST"])
@limiter.limit("5/minute")
def video_to_gif_route():
@@ -343,7 +380,13 @@ def video_to_gif_route():
file.save(input_path)
task = create_gif_task.delay(
input_path, task_id, original_filename, start_time, duration, fps, width,
input_path,
task_id,
original_filename,
start_time,
duration,
fps,
width,
**build_task_tracking_kwargs(actor),
)
record_accepted_usage(actor, "video-to-gif", task.id)
@@ -354,6 +397,7 @@ def video_to_gif_route():
# PDF Tools — all single-file and multi-file routes
# ---------------------------------------------------------------------------
@v1_bp.route("/pdf-tools/merge", methods=["POST"])
@limiter.limit("10/minute")
def merge_pdfs_route():
@@ -372,7 +416,9 @@ def merge_pdfs_route():
input_paths, original_filenames = [], []
for f in files:
try:
original_filename, ext = validate_actor_file(f, allowed_types=["pdf"], actor=actor)
original_filename, ext = validate_actor_file(
f, allowed_types=["pdf"], actor=actor
)
except FileValidationError as e:
return jsonify({"error": e.message}), e.code
upload_dir = os.path.join(current_app.config["UPLOAD_FOLDER"], task_id)
@@ -383,7 +429,9 @@ def merge_pdfs_route():
original_filenames.append(original_filename)
task = merge_pdfs_task.delay(
input_paths, task_id, original_filenames,
input_paths,
task_id,
original_filenames,
**build_task_tracking_kwargs(actor),
)
record_accepted_usage(actor, "merge-pdf", task.id)
@@ -410,14 +458,20 @@ def split_pdf_route():
return jsonify({"error": "Please specify which pages to extract."}), 400
try:
original_filename, ext = validate_actor_file(file, allowed_types=["pdf"], actor=actor)
original_filename, ext = validate_actor_file(
file, allowed_types=["pdf"], actor=actor
)
except FileValidationError as e:
return jsonify({"error": e.message}), e.code
task_id, input_path = generate_safe_path(ext, folder_type="upload")
file.save(input_path)
task = split_pdf_task.delay(
input_path, task_id, original_filename, mode, pages,
input_path,
task_id,
original_filename,
mode,
pages,
**build_task_tracking_kwargs(actor),
)
record_accepted_usage(actor, "split-pdf", task.id)
@@ -445,14 +499,20 @@ def rotate_pdf_route():
pages = request.form.get("pages", "all")
try:
original_filename, ext = validate_actor_file(file, allowed_types=["pdf"], actor=actor)
original_filename, ext = validate_actor_file(
file, allowed_types=["pdf"], actor=actor
)
except FileValidationError as e:
return jsonify({"error": e.message}), e.code
task_id, input_path = generate_safe_path(ext, folder_type="upload")
file.save(input_path)
task = rotate_pdf_task.delay(
input_path, task_id, original_filename, rotation, pages,
input_path,
task_id,
original_filename,
rotation,
pages,
**build_task_tracking_kwargs(actor),
)
record_accepted_usage(actor, "rotate-pdf", task.id)
@@ -473,8 +533,12 @@ def add_page_numbers_route():
file = request.files["file"]
position = request.form.get("position", "bottom-center")
valid_positions = [
"bottom-center", "bottom-right", "bottom-left",
"top-center", "top-right", "top-left",
"bottom-center",
"bottom-right",
"bottom-left",
"top-center",
"top-right",
"top-left",
]
if position not in valid_positions:
position = "bottom-center"
@@ -484,14 +548,20 @@ def add_page_numbers_route():
start_number = 1
try:
original_filename, ext = validate_actor_file(file, allowed_types=["pdf"], actor=actor)
original_filename, ext = validate_actor_file(
file, allowed_types=["pdf"], actor=actor
)
except FileValidationError as e:
return jsonify({"error": e.message}), e.code
task_id, input_path = generate_safe_path(ext, folder_type="upload")
file.save(input_path)
task = add_page_numbers_task.delay(
input_path, task_id, original_filename, position, start_number,
input_path,
task_id,
original_filename,
position,
start_number,
**build_task_tracking_kwargs(actor),
)
record_accepted_usage(actor, "page-numbers", task.id)
@@ -519,14 +589,20 @@ def pdf_to_images_route():
dpi = 200
try:
original_filename, ext = validate_actor_file(file, allowed_types=["pdf"], actor=actor)
original_filename, ext = validate_actor_file(
file, allowed_types=["pdf"], actor=actor
)
except FileValidationError as e:
return jsonify({"error": e.message}), e.code
task_id, input_path = generate_safe_path(ext, folder_type="upload")
file.save(input_path)
task = pdf_to_images_task.delay(
input_path, task_id, original_filename, output_format, dpi,
input_path,
task_id,
original_filename,
output_format,
dpi,
**build_task_tracking_kwargs(actor),
)
record_accepted_usage(actor, "pdf-to-images", task.id)
@@ -564,7 +640,9 @@ def images_to_pdf_route():
original_filenames.append(original_filename)
task = images_to_pdf_task.delay(
input_paths, task_id, original_filenames,
input_paths,
task_id,
original_filenames,
**build_task_tracking_kwargs(actor),
)
record_accepted_usage(actor, "images-to-pdf", task.id)
@@ -594,14 +672,20 @@ def watermark_pdf_route():
opacity = 0.3
try:
original_filename, ext = validate_actor_file(file, allowed_types=["pdf"], actor=actor)
original_filename, ext = validate_actor_file(
file, allowed_types=["pdf"], actor=actor
)
except FileValidationError as e:
return jsonify({"error": e.message}), e.code
task_id, input_path = generate_safe_path(ext, folder_type="upload")
file.save(input_path)
task = watermark_pdf_task.delay(
input_path, task_id, original_filename, watermark_text, opacity,
input_path,
task_id,
original_filename,
watermark_text,
opacity,
**build_task_tracking_kwargs(actor),
)
record_accepted_usage(actor, "watermark-pdf", task.id)
@@ -627,14 +711,19 @@ def protect_pdf_route():
return jsonify({"error": "Password must be at least 4 characters."}), 400
try:
original_filename, ext = validate_actor_file(file, allowed_types=["pdf"], actor=actor)
original_filename, ext = validate_actor_file(
file, allowed_types=["pdf"], actor=actor
)
except FileValidationError as e:
return jsonify({"error": e.message}), e.code
task_id, input_path = generate_safe_path(ext, folder_type="upload")
file.save(input_path)
task = protect_pdf_task.delay(
input_path, task_id, original_filename, password,
input_path,
task_id,
original_filename,
password,
**build_task_tracking_kwargs(actor),
)
record_accepted_usage(actor, "protect-pdf", task.id)
@@ -658,14 +747,19 @@ def unlock_pdf_route():
return jsonify({"error": "Password is required."}), 400
try:
original_filename, ext = validate_actor_file(file, allowed_types=["pdf"], actor=actor)
original_filename, ext = validate_actor_file(
file, allowed_types=["pdf"], actor=actor
)
except FileValidationError as e:
return jsonify({"error": e.message}), e.code
task_id, input_path = generate_safe_path(ext, folder_type="upload")
file.save(input_path)
task = unlock_pdf_task.delay(
input_path, task_id, original_filename, password,
input_path,
task_id,
original_filename,
password,
**build_task_tracking_kwargs(actor),
)
record_accepted_usage(actor, "unlock-pdf", task.id)
@@ -685,18 +779,24 @@ def extract_flowchart_route():
file = request.files["file"]
try:
original_filename, ext = validate_actor_file(file, allowed_types=["pdf"], actor=actor)
original_filename, ext = validate_actor_file(
file, allowed_types=["pdf"], actor=actor
)
except FileValidationError as e:
return jsonify({"error": e.message}), e.code
task_id, input_path = generate_safe_path(ext)
file.save(input_path)
task = extract_flowchart_task.delay(
input_path, task_id, original_filename,
input_path,
task_id,
original_filename,
**build_task_tracking_kwargs(actor),
)
record_accepted_usage(actor, "pdf-flowchart", task.id)
return jsonify({"task_id": task.id, "message": "Flowchart extraction started."}), 202
return jsonify(
{"task_id": task.id, "message": "Flowchart extraction started."}
), 202
# ===========================================================================
@@ -707,6 +807,7 @@ def extract_flowchart_route():
# OCR — POST /api/v1/ocr/image & /api/v1/ocr/pdf
# ---------------------------------------------------------------------------
@v1_bp.route("/ocr/image", methods=["POST"])
@limiter.limit("10/minute")
def ocr_image_route():
@@ -731,7 +832,10 @@ def ocr_image_route():
task_id, input_path = generate_safe_path(ext, folder_type="upload")
file.save(input_path)
task = ocr_image_task.delay(
input_path, task_id, original_filename, lang,
input_path,
task_id,
original_filename,
lang,
**build_task_tracking_kwargs(actor),
)
record_accepted_usage(actor, "ocr-image", task.id)
@@ -753,14 +857,19 @@ def ocr_pdf_route():
lang = request.form.get("lang", "eng")
try:
original_filename, ext = validate_actor_file(file, allowed_types=["pdf"], actor=actor)
original_filename, ext = validate_actor_file(
file, allowed_types=["pdf"], actor=actor
)
except FileValidationError as e:
return jsonify({"error": e.message}), e.code
task_id, input_path = generate_safe_path(ext, folder_type="upload")
file.save(input_path)
task = ocr_pdf_task.delay(
input_path, task_id, original_filename, lang,
input_path,
task_id,
original_filename,
lang,
**build_task_tracking_kwargs(actor),
)
record_accepted_usage(actor, "ocr-pdf", task.id)
@@ -771,6 +880,7 @@ def ocr_pdf_route():
# Remove Background — POST /api/v1/image/remove-bg
# ---------------------------------------------------------------------------
@v1_bp.route("/image/remove-bg", methods=["POST"])
@limiter.limit("5/minute")
def remove_bg_route():
@@ -793,7 +903,9 @@ def remove_bg_route():
task_id, input_path = generate_safe_path(ext, folder_type="upload")
file.save(input_path)
task = remove_bg_task.delay(
input_path, task_id, original_filename,
input_path,
task_id,
original_filename,
**build_task_tracking_kwargs(actor),
)
record_accepted_usage(actor, "remove-bg", task.id)
@@ -804,6 +916,7 @@ def remove_bg_route():
# PDF AI — POST /api/v1/pdf-ai/chat, summarize, translate, extract-tables
# ---------------------------------------------------------------------------
@v1_bp.route("/pdf-ai/chat", methods=["POST"])
@limiter.limit("5/minute")
def chat_pdf_route():
@@ -821,14 +934,19 @@ def chat_pdf_route():
return jsonify({"error": "Question is required."}), 400
try:
original_filename, ext = validate_actor_file(file, allowed_types=["pdf"], actor=actor)
original_filename, ext = validate_actor_file(
file, allowed_types=["pdf"], actor=actor
)
except FileValidationError as e:
return jsonify({"error": e.message}), e.code
task_id, input_path = generate_safe_path(ext, folder_type="upload")
file.save(input_path)
task = chat_with_pdf_task.delay(
input_path, task_id, original_filename, question,
input_path,
task_id,
original_filename,
question,
**build_task_tracking_kwargs(actor),
)
record_accepted_usage(actor, "chat-pdf", task.id)
@@ -852,14 +970,19 @@ def summarize_pdf_route():
length = "medium"
try:
original_filename, ext = validate_actor_file(file, allowed_types=["pdf"], actor=actor)
original_filename, ext = validate_actor_file(
file, allowed_types=["pdf"], actor=actor
)
except FileValidationError as e:
return jsonify({"error": e.message}), e.code
task_id, input_path = generate_safe_path(ext, folder_type="upload")
file.save(input_path)
task = summarize_pdf_task.delay(
input_path, task_id, original_filename, length,
input_path,
task_id,
original_filename,
length,
**build_task_tracking_kwargs(actor),
)
record_accepted_usage(actor, "summarize-pdf", task.id)
@@ -879,18 +1002,25 @@ def translate_pdf_route():
file = request.files["file"]
target_language = request.form.get("target_language", "").strip()
source_language = request.form.get("source_language", "auto").strip()
if not target_language:
return jsonify({"error": "Target language is required."}), 400
try:
original_filename, ext = validate_actor_file(file, allowed_types=["pdf"], actor=actor)
original_filename, ext = validate_actor_file(
file, allowed_types=["pdf"], actor=actor
)
except FileValidationError as e:
return jsonify({"error": e.message}), e.code
task_id, input_path = generate_safe_path(ext, folder_type="upload")
file.save(input_path)
task = translate_pdf_task.delay(
input_path, task_id, original_filename, target_language,
input_path,
task_id,
original_filename,
target_language,
source_language,
**build_task_tracking_kwargs(actor),
)
record_accepted_usage(actor, "translate-pdf", task.id)
@@ -910,14 +1040,18 @@ def extract_tables_route():
file = request.files["file"]
try:
original_filename, ext = validate_actor_file(file, allowed_types=["pdf"], actor=actor)
original_filename, ext = validate_actor_file(
file, allowed_types=["pdf"], actor=actor
)
except FileValidationError as e:
return jsonify({"error": e.message}), e.code
task_id, input_path = generate_safe_path(ext, folder_type="upload")
file.save(input_path)
task = extract_tables_task.delay(
input_path, task_id, original_filename,
input_path,
task_id,
original_filename,
**build_task_tracking_kwargs(actor),
)
record_accepted_usage(actor, "extract-tables", task.id)
@@ -928,6 +1062,7 @@ def extract_tables_route():
# PDF to Excel — POST /api/v1/convert/pdf-to-excel
# ---------------------------------------------------------------------------
@v1_bp.route("/convert/pdf-to-excel", methods=["POST"])
@limiter.limit("10/minute")
def pdf_to_excel_route():
@@ -941,14 +1076,18 @@ def pdf_to_excel_route():
file = request.files["file"]
try:
original_filename, ext = validate_actor_file(file, allowed_types=["pdf"], actor=actor)
original_filename, ext = validate_actor_file(
file, allowed_types=["pdf"], actor=actor
)
except FileValidationError as e:
return jsonify({"error": e.message}), e.code
task_id, input_path = generate_safe_path(ext, folder_type="upload")
file.save(input_path)
task = pdf_to_excel_task.delay(
input_path, task_id, original_filename,
input_path,
task_id,
original_filename,
**build_task_tracking_kwargs(actor),
)
record_accepted_usage(actor, "pdf-to-excel", task.id)
@@ -959,6 +1098,7 @@ def pdf_to_excel_route():
# HTML to PDF — POST /api/v1/convert/html-to-pdf
# ---------------------------------------------------------------------------
@v1_bp.route("/convert/html-to-pdf", methods=["POST"])
@limiter.limit("10/minute")
def html_to_pdf_route():
@@ -981,7 +1121,9 @@ def html_to_pdf_route():
task_id, input_path = generate_safe_path(ext, folder_type="upload")
file.save(input_path)
task = html_to_pdf_task.delay(
input_path, task_id, original_filename,
input_path,
task_id,
original_filename,
**build_task_tracking_kwargs(actor),
)
record_accepted_usage(actor, "html-to-pdf", task.id)
@@ -992,6 +1134,7 @@ def html_to_pdf_route():
# QR Code — POST /api/v1/qrcode/generate
# ---------------------------------------------------------------------------
@v1_bp.route("/qrcode/generate", methods=["POST"])
@limiter.limit("20/minute")
def generate_qr_route():
@@ -1018,7 +1161,10 @@ def generate_qr_route():
task_id = str(uuid.uuid4())
task = generate_qr_task.delay(
task_id, str(data).strip(), size, "png",
task_id,
str(data).strip(),
size,
"png",
**build_task_tracking_kwargs(actor),
)
record_accepted_usage(actor, "qr-code", task.id)
@@ -1033,6 +1179,7 @@ def generate_qr_route():
# PDF to PowerPoint — POST /api/v1/convert/pdf-to-pptx
# ---------------------------------------------------------------------------
@v1_bp.route("/convert/pdf-to-pptx", methods=["POST"])
@limiter.limit("10/minute")
def v1_pdf_to_pptx_route():
@@ -1046,14 +1193,18 @@ def v1_pdf_to_pptx_route():
file = request.files["file"]
try:
original_filename, ext = validate_actor_file(file, allowed_types=["pdf"], actor=actor)
original_filename, ext = validate_actor_file(
file, allowed_types=["pdf"], actor=actor
)
except FileValidationError as e:
return jsonify({"error": e.message}), e.code
task_id, input_path = generate_safe_path(ext, folder_type="upload")
file.save(input_path)
task = pdf_to_pptx_task.delay(
input_path, task_id, original_filename,
input_path,
task_id,
original_filename,
**build_task_tracking_kwargs(actor),
)
record_accepted_usage(actor, "pdf-to-pptx", task.id)
@@ -1064,6 +1215,7 @@ def v1_pdf_to_pptx_route():
# Excel to PDF — POST /api/v1/convert/excel-to-pdf
# ---------------------------------------------------------------------------
@v1_bp.route("/convert/excel-to-pdf", methods=["POST"])
@limiter.limit("10/minute")
def v1_excel_to_pdf_route():
@@ -1086,7 +1238,9 @@ def v1_excel_to_pdf_route():
task_id, input_path = generate_safe_path(ext, folder_type="upload")
file.save(input_path)
task = excel_to_pdf_task.delay(
input_path, task_id, original_filename,
input_path,
task_id,
original_filename,
**build_task_tracking_kwargs(actor),
)
record_accepted_usage(actor, "excel-to-pdf", task.id)
@@ -1097,6 +1251,7 @@ def v1_excel_to_pdf_route():
# PowerPoint to PDF — POST /api/v1/convert/pptx-to-pdf
# ---------------------------------------------------------------------------
@v1_bp.route("/convert/pptx-to-pdf", methods=["POST"])
@limiter.limit("10/minute")
def v1_pptx_to_pdf_route():
@@ -1119,7 +1274,9 @@ def v1_pptx_to_pdf_route():
task_id, input_path = generate_safe_path(ext, folder_type="upload")
file.save(input_path)
task = pptx_to_pdf_task.delay(
input_path, task_id, original_filename,
input_path,
task_id,
original_filename,
**build_task_tracking_kwargs(actor),
)
record_accepted_usage(actor, "pptx-to-pdf", task.id)
@@ -1130,6 +1287,7 @@ def v1_pptx_to_pdf_route():
# Sign PDF — POST /api/v1/pdf-tools/sign
# ---------------------------------------------------------------------------
@v1_bp.route("/pdf-tools/sign", methods=["POST"])
@limiter.limit("10/minute")
def v1_sign_pdf_route():
@@ -1147,12 +1305,16 @@ def v1_sign_pdf_route():
sig_file = request.files["signature"]
try:
original_filename, ext = validate_actor_file(pdf_file, allowed_types=["pdf"], actor=actor)
original_filename, ext = validate_actor_file(
pdf_file, allowed_types=["pdf"], actor=actor
)
except FileValidationError as e:
return jsonify({"error": e.message}), e.code
try:
_, sig_ext = validate_actor_file(sig_file, allowed_types=ALLOWED_IMAGE_TYPES, actor=actor)
_, sig_ext = validate_actor_file(
sig_file, allowed_types=ALLOWED_IMAGE_TYPES, actor=actor
)
except FileValidationError as e:
return jsonify({"error": f"Signature: {e.message}"}), e.code
@@ -1174,8 +1336,15 @@ def v1_sign_pdf_route():
sig_file.save(signature_path)
task = sign_pdf_task.delay(
input_path, signature_path, task_id, original_filename,
page, x, y, width, height,
input_path,
signature_path,
task_id,
original_filename,
page,
x,
y,
width,
height,
**build_task_tracking_kwargs(actor),
)
record_accepted_usage(actor, "sign-pdf", task.id)
@@ -1186,6 +1355,7 @@ def v1_sign_pdf_route():
# Crop PDF — POST /api/v1/pdf-tools/crop
# ---------------------------------------------------------------------------
@v1_bp.route("/pdf-tools/crop", methods=["POST"])
@limiter.limit("10/minute")
def v1_crop_pdf_route():
@@ -1209,15 +1379,23 @@ def v1_crop_pdf_route():
pages = request.form.get("pages", "all")
try:
original_filename, ext = validate_actor_file(file, allowed_types=["pdf"], actor=actor)
original_filename, ext = validate_actor_file(
file, allowed_types=["pdf"], actor=actor
)
except FileValidationError as e:
return jsonify({"error": e.message}), e.code
task_id, input_path = generate_safe_path(ext, folder_type="upload")
file.save(input_path)
task = crop_pdf_task.delay(
input_path, task_id, original_filename,
margin_left, margin_right, margin_top, margin_bottom, pages,
input_path,
task_id,
original_filename,
margin_left,
margin_right,
margin_top,
margin_bottom,
pages,
**build_task_tracking_kwargs(actor),
)
record_accepted_usage(actor, "crop-pdf", task.id)
@@ -1228,6 +1406,7 @@ def v1_crop_pdf_route():
# Flatten PDF — POST /api/v1/pdf-tools/flatten
# ---------------------------------------------------------------------------
@v1_bp.route("/pdf-tools/flatten", methods=["POST"])
@limiter.limit("10/minute")
def v1_flatten_pdf_route():
@@ -1241,14 +1420,18 @@ def v1_flatten_pdf_route():
file = request.files["file"]
try:
original_filename, ext = validate_actor_file(file, allowed_types=["pdf"], actor=actor)
original_filename, ext = validate_actor_file(
file, allowed_types=["pdf"], actor=actor
)
except FileValidationError as e:
return jsonify({"error": e.message}), e.code
task_id, input_path = generate_safe_path(ext, folder_type="upload")
file.save(input_path)
task = flatten_pdf_task.delay(
input_path, task_id, original_filename,
input_path,
task_id,
original_filename,
**build_task_tracking_kwargs(actor),
)
record_accepted_usage(actor, "flatten-pdf", task.id)
@@ -1259,6 +1442,7 @@ def v1_flatten_pdf_route():
# Repair PDF — POST /api/v1/pdf-tools/repair
# ---------------------------------------------------------------------------
@v1_bp.route("/pdf-tools/repair", methods=["POST"])
@limiter.limit("10/minute")
def v1_repair_pdf_route():
@@ -1272,14 +1456,18 @@ def v1_repair_pdf_route():
file = request.files["file"]
try:
original_filename, ext = validate_actor_file(file, allowed_types=["pdf"], actor=actor)
original_filename, ext = validate_actor_file(
file, allowed_types=["pdf"], actor=actor
)
except FileValidationError as e:
return jsonify({"error": e.message}), e.code
task_id, input_path = generate_safe_path(ext, folder_type="upload")
file.save(input_path)
task = repair_pdf_task.delay(
input_path, task_id, original_filename,
input_path,
task_id,
original_filename,
**build_task_tracking_kwargs(actor),
)
record_accepted_usage(actor, "repair-pdf", task.id)
@@ -1290,6 +1478,7 @@ def v1_repair_pdf_route():
# Edit PDF Metadata — POST /api/v1/pdf-tools/metadata
# ---------------------------------------------------------------------------
@v1_bp.route("/pdf-tools/metadata", methods=["POST"])
@limiter.limit("10/minute")
def v1_edit_metadata_route():
@@ -1312,15 +1501,23 @@ def v1_edit_metadata_route():
return jsonify({"error": "At least one metadata field required."}), 400
try:
original_filename, ext = validate_actor_file(file, allowed_types=["pdf"], actor=actor)
original_filename, ext = validate_actor_file(
file, allowed_types=["pdf"], actor=actor
)
except FileValidationError as e:
return jsonify({"error": e.message}), e.code
task_id, input_path = generate_safe_path(ext, folder_type="upload")
file.save(input_path)
task = edit_metadata_task.delay(
input_path, task_id, original_filename,
title, author, subject, keywords, creator,
input_path,
task_id,
original_filename,
title,
author,
subject,
keywords,
creator,
**build_task_tracking_kwargs(actor),
)
record_accepted_usage(actor, "edit-metadata", task.id)
@@ -1331,6 +1528,7 @@ def v1_edit_metadata_route():
# Image Crop — POST /api/v1/image/crop
# ---------------------------------------------------------------------------
@v1_bp.route("/image/crop", methods=["POST"])
@limiter.limit("10/minute")
def v1_crop_image_route():
@@ -1364,8 +1562,13 @@ def v1_crop_image_route():
task_id, input_path = generate_safe_path(ext, folder_type="upload")
file.save(input_path)
task = crop_image_task.delay(
input_path, task_id, original_filename,
left, top, right, bottom,
input_path,
task_id,
original_filename,
left,
top,
right,
bottom,
**build_task_tracking_kwargs(actor),
)
record_accepted_usage(actor, "image-crop", task.id)
@@ -1376,6 +1579,7 @@ def v1_crop_image_route():
# Image Rotate/Flip — POST /api/v1/image/rotate-flip
# ---------------------------------------------------------------------------
@v1_bp.route("/image/rotate-flip", methods=["POST"])
@limiter.limit("10/minute")
def v1_rotate_flip_image_route():
@@ -1408,8 +1612,12 @@ def v1_rotate_flip_image_route():
task_id, input_path = generate_safe_path(ext, folder_type="upload")
file.save(input_path)
task = rotate_flip_image_task.delay(
input_path, task_id, original_filename,
rotation, flip_horizontal, flip_vertical,
input_path,
task_id,
original_filename,
rotation,
flip_horizontal,
flip_vertical,
**build_task_tracking_kwargs(actor),
)
record_accepted_usage(actor, "image-rotate-flip", task.id)
@@ -1420,6 +1628,7 @@ def v1_rotate_flip_image_route():
# Barcode — POST /api/v1/barcode/generate
# ---------------------------------------------------------------------------
@v1_bp.route("/barcode/generate", methods=["POST"])
@limiter.limit("20/minute")
def v1_generate_barcode_route():
@@ -1442,14 +1651,21 @@ def v1_generate_barcode_route():
return jsonify({"error": "Barcode data is required."}), 400
if barcode_type not in SUPPORTED_BARCODE_TYPES:
return jsonify({"error": f"Unsupported type. Supported: {', '.join(SUPPORTED_BARCODE_TYPES)}"}), 400
return jsonify(
{
"error": f"Unsupported type. Supported: {', '.join(SUPPORTED_BARCODE_TYPES)}"
}
), 400
if output_format not in ("png", "svg"):
output_format = "png"
task_id = str(uuid.uuid4())
task = generate_barcode_task.delay(
data, barcode_type, task_id, output_format,
data,
barcode_type,
task_id,
output_format,
**build_task_tracking_kwargs(actor),
)
record_accepted_usage(actor, "barcode", task.id)

View File

@@ -1,6 +1,11 @@
"""PDF AI services — Chat, Summarize, Translate, Table Extract."""
import json
import logging
import os
import tempfile
import time
from dataclasses import dataclass
import requests
@@ -11,9 +16,84 @@ from app.services.openrouter_config_service import (
logger = logging.getLogger(__name__)
# Default DeepL endpoint (free tier); override with the DEEPL_API_URL env var.
DEFAULT_DEEPL_API_URL = "https://api-free.deepl.com/v2/translate"
# Default per-request timeout in seconds; override with DEEPL_TIMEOUT_SECONDS.
DEFAULT_DEEPL_TIMEOUT_SECONDS = 90
# Upper bound on characters per chunk sent to a translation provider.
MAX_TRANSLATION_CHUNK_CHARS = 3500
# Retry policy for transient provider failures (see _translate_with_retry):
# up to 3 attempts with a linearly growing delay (2s, 4s, ...).
TRANSLATION_RETRY_ATTEMPTS = 3
TRANSLATION_RETRY_DELAY_SECONDS = 2

# Internal language code -> human-readable label (used in prompts and labels).
LANGUAGE_LABELS = {
    "auto": "Auto Detect",
    "en": "English",
    "ar": "Arabic",
    "fr": "French",
    "es": "Spanish",
    "de": "German",
    "zh": "Chinese",
    "ja": "Japanese",
    "ko": "Korean",
    "pt": "Portuguese",
    "ru": "Russian",
    "tr": "Turkish",
    "it": "Italian",
}

# Internal language code -> DeepL API language code (note PT-PT for Portuguese).
DEEPL_LANGUAGE_CODES = {
    "ar": "AR",
    "de": "DE",
    "en": "EN",
    "es": "ES",
    "fr": "FR",
    "it": "IT",
    "ja": "JA",
    "ko": "KO",
    "pt": "PT-PT",
    "ru": "RU",
    "tr": "TR",
    "zh": "ZH",
}

# Internal language code -> 3-letter OCR language code (e.g. "eng").
# NOTE(review): not referenced elsewhere in this module; presumably consumed
# by the OCR fallback path — confirm against callers before removing.
OCR_LANGUAGE_CODES = {
    "ar": "ara",
    "en": "eng",
    "fr": "fra",
}
@dataclass(frozen=True)
class DeepLSettings:
    """Immutable snapshot of DeepL configuration read from the environment."""

    # API key; an empty string means DeepL is not configured and callers
    # should skip the premium provider.
    api_key: str
    # Translate endpoint URL (defaults to the free-tier endpoint).
    base_url: str
    # Per-request timeout passed to requests.post().
    timeout_seconds: int
def _normalize_language_code(value: str | None, default: str = "") -> str:
normalized = str(value or "").strip().lower()
return normalized or default
def _language_label(value: str | None) -> str:
    """Return the display label for a language code, or the code / 'Unknown'."""
    code = _normalize_language_code(value)
    if code in LANGUAGE_LABELS:
        return LANGUAGE_LABELS[code]
    return code or "Unknown"
def _get_deepl_settings() -> DeepLSettings:
    """Read DeepL credentials, endpoint and timeout from the environment.

    Returns:
        DeepLSettings; ``api_key`` is an empty string when DeepL is not
        configured.
    """
    api_key = str(os.getenv("DEEPL_API_KEY", "")).strip()
    base_url = (
        str(os.getenv("DEEPL_API_URL", DEFAULT_DEEPL_API_URL)).strip()
        or DEFAULT_DEEPL_API_URL
    )
    # A malformed DEEPL_TIMEOUT_SECONDS (empty string — as in the shipped
    # .env template — or a non-numeric value) previously raised ValueError
    # here and broke every translation request. Fall back to the default
    # instead of crashing, and reject non-positive values.
    raw_timeout = str(os.getenv("DEEPL_TIMEOUT_SECONDS", "")).strip()
    try:
        timeout_seconds = (
            int(raw_timeout) if raw_timeout else DEFAULT_DEEPL_TIMEOUT_SECONDS
        )
    except ValueError:
        logger.warning(
            "Invalid DEEPL_TIMEOUT_SECONDS=%r; using default %s",
            raw_timeout,
            DEFAULT_DEEPL_TIMEOUT_SECONDS,
        )
        timeout_seconds = DEFAULT_DEEPL_TIMEOUT_SECONDS
    if timeout_seconds <= 0:
        timeout_seconds = DEFAULT_DEEPL_TIMEOUT_SECONDS
    return DeepLSettings(
        api_key=api_key, base_url=base_url, timeout_seconds=timeout_seconds
    )
class PdfAiError(Exception):
"""Custom exception for PDF AI service failures."""
def __init__(
self,
user_message: str,
@@ -26,6 +106,42 @@ class PdfAiError(Exception):
self.detail = detail
class RetryableTranslationError(PdfAiError):
    """Transient provider failure that _translate_with_retry may retry.

    Raised for timeouts, connection errors, HTTP 429 rate limits and 5xx
    server errors from either translation provider.
    """
def _translate_with_retry(action, provider_name: str) -> dict:
    """Invoke *action*, retrying transient provider failures.

    Retries up to TRANSLATION_RETRY_ATTEMPTS times on
    RetryableTranslationError, sleeping TRANSLATION_RETRY_DELAY_SECONDS *
    attempt between tries (linear backoff, no sleep after the final try).
    The last failure is re-raised as a plain PdfAiError so callers see a
    non-retryable error.
    """
    failure: PdfAiError | None = None
    attempt = 0
    while attempt < TRANSLATION_RETRY_ATTEMPTS:
        attempt += 1
        try:
            return action()
        except RetryableTranslationError as error:
            failure = error
            logger.warning(
                "%s translation attempt %s/%s failed with retryable error %s",
                provider_name,
                attempt,
                TRANSLATION_RETRY_ATTEMPTS,
                error.error_code,
            )
            if attempt < TRANSLATION_RETRY_ATTEMPTS:
                time.sleep(TRANSLATION_RETRY_DELAY_SECONDS * attempt)
    if failure is not None:
        raise PdfAiError(
            failure.user_message,
            error_code=failure.error_code,
            detail=failure.detail,
        )
    raise PdfAiError(
        "Translation provider failed unexpectedly.",
        error_code="TRANSLATION_PROVIDER_FAILED",
    )
def _estimate_tokens(text: str) -> int:
"""Rough token estimate: ~4 chars per token for English."""
return max(1, len(text) // 4)
@@ -49,7 +165,30 @@ def _extract_text_from_pdf(input_path: str, max_pages: int = 50) -> str:
text = page.extract_text() or ""
if text.strip():
texts.append(f"[Page {i + 1}]\n{text}")
return "\n\n".join(texts)
extracted = "\n\n".join(texts)
if extracted.strip():
return extracted
# Fall back to OCR for scanned/image-only PDFs instead of failing fast.
try:
from app.services.ocr_service import ocr_pdf
with tempfile.NamedTemporaryFile(suffix=".txt", delete=False) as handle:
ocr_output_path = handle.name
try:
data = ocr_pdf(input_path, ocr_output_path, lang="eng")
ocr_text = str(data.get("text", "")).strip()
if ocr_text:
return ocr_text
finally:
if os.path.exists(ocr_output_path):
os.unlink(ocr_output_path)
except Exception as ocr_error:
logger.warning("OCR fallback for PDF text extraction failed: %s", ocr_error)
return ""
except PdfAiError:
raise
except Exception as e:
@@ -70,14 +209,17 @@ def _call_openrouter(
# Budget guard
try:
from app.services.ai_cost_service import check_ai_budget, AiBudgetExceededError
check_ai_budget()
except AiBudgetExceededError:
except ImportError:
pass
except Exception as error:
if error.__class__.__name__ == "AiBudgetExceededError":
raise PdfAiError(
"Monthly AI processing budget has been reached. Please try again next month.",
error_code="AI_BUDGET_EXCEEDED",
)
except Exception:
pass # Don't block if cost service unavailable
pass
settings = get_openrouter_settings()
@@ -127,14 +269,14 @@ def _call_openrouter(
if status_code == 429:
logger.warning("OpenRouter rate limit reached (429).")
raise PdfAiError(
raise RetryableTranslationError(
"AI service is experiencing high demand. Please wait a moment and try again.",
error_code="OPENROUTER_RATE_LIMIT",
)
if status_code >= 500:
logger.error("OpenRouter server error (%s).", status_code)
raise PdfAiError(
raise RetryableTranslationError(
"AI service provider is experiencing issues. Please try again shortly.",
error_code="OPENROUTER_SERVER_ERROR",
)
@@ -144,7 +286,11 @@ def _call_openrouter(
# Handle model-level errors returned inside a 200 response
if data.get("error"):
error_msg = data["error"].get("message", "") if isinstance(data["error"], dict) else str(data["error"])
error_msg = (
data["error"].get("message", "")
if isinstance(data["error"], dict)
else str(data["error"])
)
logger.error("OpenRouter returned an error payload: %s", error_msg)
raise PdfAiError(
"AI service encountered an issue. Please try again.",
@@ -163,6 +309,7 @@ def _call_openrouter(
# Log usage
try:
from app.services.ai_cost_service import log_ai_usage
usage = data.get("usage", {})
log_ai_usage(
tool=tool_name,
@@ -178,13 +325,13 @@ def _call_openrouter(
except PdfAiError:
raise
except requests.exceptions.Timeout:
raise PdfAiError(
raise RetryableTranslationError(
"AI service timed out. Please try again.",
error_code="OPENROUTER_TIMEOUT",
)
except requests.exceptions.ConnectionError:
logger.error("Cannot connect to OpenRouter API at %s", settings.base_url)
raise PdfAiError(
raise RetryableTranslationError(
"AI service is unreachable. Please try again shortly.",
error_code="OPENROUTER_CONNECTION_ERROR",
)
@@ -197,6 +344,218 @@ def _call_openrouter(
)
def _split_translation_chunks(
text: str, max_chars: int = MAX_TRANSLATION_CHUNK_CHARS
) -> list[str]:
"""Split extracted PDF text into stable chunks while preserving page markers."""
chunks: list[str] = []
current: list[str] = []
current_length = 0
for block in text.split("\n\n"):
normalized = block.strip()
if not normalized:
continue
block_length = len(normalized) + 2
if current and current_length + block_length > max_chars:
chunks.append("\n\n".join(current))
current = [normalized]
current_length = block_length
continue
current.append(normalized)
current_length += block_length
if current:
chunks.append("\n\n".join(current))
return chunks or [text]
def _call_deepl_translate(
    chunk: str, target_language: str, source_language: str | None = None
) -> dict:
    """Translate one text chunk with DeepL when premium credentials exist.

    Args:
        chunk: Plain-text chunk to translate.
        target_language: Internal language code (e.g. "fr"); must map to a
            DeepL target code.
        source_language: Optional internal source code; unmapped values
            (including "auto") let DeepL auto-detect.

    Returns:
        {"translation": str, "provider": "deepl",
         "detected_source_language": lower-cased code (may be "")}

    Raises:
        RetryableTranslationError: Transient failures (timeout, connection
            error, HTTP 429, HTTP 5xx) the caller may retry.
        PdfAiError: Missing configuration, unsupported target language,
            quota/permission problems, other client errors, or malformed
            responses.
    """
    settings = _get_deepl_settings()
    if not settings.api_key:
        raise PdfAiError(
            "DeepL is not configured.",
            error_code="DEEPL_NOT_CONFIGURED",
        )
    target_code = DEEPL_LANGUAGE_CODES.get(_normalize_language_code(target_language))
    if not target_code:
        raise PdfAiError(
            f"Target language '{target_language}' is not supported by the premium translation provider.",
            error_code="DEEPL_UNSUPPORTED_TARGET_LANGUAGE",
        )

    payload: dict[str, object] = {
        "text": [chunk],
        "target_lang": target_code,
        "preserve_formatting": True,
        "tag_handling": "xml",
        "split_sentences": "nonewlines",
    }
    source_code = DEEPL_LANGUAGE_CODES.get(_normalize_language_code(source_language))
    if source_code:
        payload["source_lang"] = source_code

    try:
        response = requests.post(
            settings.base_url,
            headers={
                "Authorization": f"DeepL-Auth-Key {settings.api_key}",
                "Content-Type": "application/json",
            },
            json=payload,
            timeout=settings.timeout_seconds,
        )
    except requests.exceptions.Timeout:
        raise RetryableTranslationError(
            "Premium translation service timed out. Retrying...",
            error_code="DEEPL_TIMEOUT",
        )
    except requests.exceptions.ConnectionError:
        raise RetryableTranslationError(
            "Premium translation service is temporarily unreachable. Retrying...",
            error_code="DEEPL_CONNECTION_ERROR",
        )
    except requests.exceptions.RequestException as error:
        raise PdfAiError(
            "Premium translation service is temporarily unavailable.",
            error_code="DEEPL_REQUEST_ERROR",
            detail=str(error),
        )

    if response.status_code == 429:
        raise RetryableTranslationError(
            "Premium translation service is busy. Retrying...",
            error_code="DEEPL_RATE_LIMIT",
        )
    if response.status_code >= 500:
        raise RetryableTranslationError(
            "Premium translation service is experiencing issues. Retrying...",
            error_code="DEEPL_SERVER_ERROR",
        )
    # 456 is DeepL's "quota exceeded" status.
    if response.status_code in {403, 456}:
        raise PdfAiError(
            "Premium translation provider credits or permissions need attention.",
            error_code="DEEPL_CREDITS_OR_PERMISSIONS",
        )
    # Previously any other 4xx (400, 401, 404, ...) fell through to
    # raise_for_status() and escaped as a raw requests.HTTPError, which is
    # not a PdfAiError and therefore bypassed the OpenRouter fallback in
    # _translate_document_text. Normalize it here instead.
    if response.status_code >= 400:
        raise PdfAiError(
            "Premium translation service rejected the request.",
            error_code="DEEPL_HTTP_ERROR",
            detail=f"HTTP {response.status_code}",
        )

    try:
        data = response.json()
    except ValueError:
        # Non-JSON body (proxy error page, truncated response, ...).
        raise PdfAiError(
            "Premium translation provider returned an invalid response.",
            error_code="DEEPL_INVALID_RESPONSE",
        )
    translations = data.get("translations") or []
    if not translations:
        raise PdfAiError(
            "Premium translation provider returned an empty response.",
            error_code="DEEPL_EMPTY_RESPONSE",
        )
    first = translations[0]
    translated_text = str(first.get("text", "")).strip()
    if not translated_text:
        raise PdfAiError(
            "Premium translation provider returned an empty response.",
            error_code="DEEPL_EMPTY_TEXT",
        )
    return {
        "translation": translated_text,
        "provider": "deepl",
        "detected_source_language": str(first.get("detected_source_language", ""))
        .strip()
        .lower(),
    }
def _call_openrouter_translate(
    chunk: str, target_language: str, source_language: str | None = None
) -> dict:
    """Translate one chunk through OpenRouter (fallback provider).

    Returns the same shape as _call_deepl_translate:
    {"translation", "provider", "detected_source_language"}.
    """
    normalized_source = _normalize_language_code(source_language)
    if source_language and normalized_source != "auto":
        source_hint = f"treat {_language_label(source_language)} as the source language"
    else:
        source_hint = "auto-detect the source language"
    system_prompt = (
        "You are a professional document translator. "
        f"Translate the provided PDF content into {_language_label(target_language)}. "
        f"Please {source_hint}. Preserve headings, lists, tables, and page markers. "
        "Return only the translated text."
    )
    translated_text = _call_openrouter(
        system_prompt,
        chunk,
        max_tokens=2200,
        tool_name="pdf_translate_fallback",
    )
    return {
        "translation": translated_text,
        "provider": "openrouter",
        "detected_source_language": normalized_source,
    }
def _translate_document_text(
    text: str, target_language: str, source_language: str | None = None
) -> dict:
    """Translate full document text chunk-by-chunk with provider fallback.

    DeepL is tried first (when an API key is configured); any PdfAiError
    from DeepL makes that chunk fall back to OpenRouter. The first detected
    source language reported by a provider is surfaced when the caller did
    not specify one.

    Args:
        text: Extracted document text (with "[Page N]" markers).
        target_language: Internal target language code.
        source_language: Internal source code or "auto"/None.

    Returns:
        {"translation": str, "provider": comma-joined provider names,
         "detected_source_language": str, "chunks_translated": int}
    """
    chunks = _split_translation_chunks(text)
    translations: list[str] = []
    providers_used: list[str] = []
    detected_source_language = _normalize_language_code(source_language)
    if detected_source_language == "auto":
        detected_source_language = ""
    # Hoisted out of the loop: env-derived settings do not change per chunk.
    deepl_configured = bool(_get_deepl_settings().api_key)
    for chunk in chunks:
        chunk_result: dict | None = None
        if deepl_configured:
            try:
                chunk_result = _translate_with_retry(
                    # Bind the loop variable as a default argument so the
                    # retry callback is self-contained (late-binding hygiene).
                    lambda chunk=chunk: _call_deepl_translate(
                        chunk, target_language, source_language
                    ),
                    provider_name="DeepL",
                )
            except PdfAiError as deepl_error:
                logger.warning(
                    "DeepL translation failed for chunk; falling back to OpenRouter. code=%s detail=%s",
                    deepl_error.error_code,
                    deepl_error.detail,
                )
        if chunk_result is None:
            chunk_result = _translate_with_retry(
                lambda chunk=chunk: _call_openrouter_translate(
                    chunk, target_language, source_language
                ),
                provider_name="OpenRouter",
            )
        translations.append(str(chunk_result["translation"]).strip())
        providers_used.append(str(chunk_result["provider"]))
        if not detected_source_language and chunk_result.get(
            "detected_source_language"
        ):
            detected_source_language = _normalize_language_code(
                chunk_result["detected_source_language"]
            )
    return {
        "translation": "\n\n".join(part for part in translations if part),
        "provider": ", ".join(sorted(set(providers_used))),
        "detected_source_language": detected_source_language,
        "chunks_translated": len(translations),
    }
# ---------------------------------------------------------------------------
# 1. Chat with PDF
# ---------------------------------------------------------------------------
@@ -212,11 +571,15 @@ def chat_with_pdf(input_path: str, question: str) -> dict:
{"reply": "...", "pages_analyzed": int}
"""
if not question or not question.strip():
raise PdfAiError("Please provide a question.", error_code="PDF_AI_INVALID_INPUT")
raise PdfAiError(
"Please provide a question.", error_code="PDF_AI_INVALID_INPUT"
)
text = _extract_text_from_pdf(input_path)
if not text.strip():
raise PdfAiError("Could not extract any text from the PDF.", error_code="PDF_TEXT_EMPTY")
raise PdfAiError(
"Could not extract any text from the PDF.", error_code="PDF_TEXT_EMPTY"
)
# Truncate to fit context window
max_chars = 12000
@@ -230,7 +593,9 @@ def chat_with_pdf(input_path: str, question: str) -> dict:
)
user_msg = f"Document content:\n{truncated}\n\nQuestion: {question}"
reply = _call_openrouter(system_prompt, user_msg, max_tokens=800, tool_name="pdf_chat")
reply = _call_openrouter(
system_prompt, user_msg, max_tokens=800, tool_name="pdf_chat"
)
page_count = text.count("[Page ")
return {"reply": reply, "pages_analyzed": page_count}
@@ -252,7 +617,9 @@ def summarize_pdf(input_path: str, length: str = "medium") -> dict:
"""
text = _extract_text_from_pdf(input_path)
if not text.strip():
raise PdfAiError("Could not extract any text from the PDF.", error_code="PDF_TEXT_EMPTY")
raise PdfAiError(
"Could not extract any text from the PDF.", error_code="PDF_TEXT_EMPTY"
)
length_instruction = {
"short": "Provide a brief summary in 2-3 sentences.",
@@ -270,7 +637,9 @@ def summarize_pdf(input_path: str, length: str = "medium") -> dict:
)
user_msg = f"{length_instruction}\n\nDocument content:\n{truncated}"
summary = _call_openrouter(system_prompt, user_msg, max_tokens=1000, tool_name="pdf_summarize")
summary = _call_openrouter(
system_prompt, user_msg, max_tokens=1000, tool_name="pdf_summarize"
)
page_count = text.count("[Page ")
return {"summary": summary, "pages_analyzed": page_count}
@@ -279,7 +648,9 @@ def summarize_pdf(input_path: str, length: str = "medium") -> dict:
# ---------------------------------------------------------------------------
# 3. Translate PDF
# ---------------------------------------------------------------------------
def translate_pdf(input_path: str, target_language: str) -> dict:
def translate_pdf(
input_path: str, target_language: str, source_language: str | None = None
) -> dict:
"""
Translate the text content of a PDF to another language.
@@ -290,29 +661,46 @@ def translate_pdf(input_path: str, target_language: str) -> dict:
Returns:
{"translation": "...", "pages_analyzed": int, "target_language": str}
"""
if not target_language or not target_language.strip():
raise PdfAiError("Please specify a target language.", error_code="PDF_AI_INVALID_INPUT")
normalized_target_language = _normalize_language_code(target_language)
normalized_source_language = _normalize_language_code(
source_language, default="auto"
)
if not normalized_target_language:
raise PdfAiError(
"Please specify a target language.", error_code="PDF_AI_INVALID_INPUT"
)
if (
normalized_target_language == normalized_source_language
and normalized_source_language != "auto"
):
raise PdfAiError(
"Please choose different source and target languages.",
error_code="PDF_AI_INVALID_INPUT",
)
text = _extract_text_from_pdf(input_path)
if not text.strip():
raise PdfAiError("Could not extract any text from the PDF.", error_code="PDF_TEXT_EMPTY")
max_chars = 10000
truncated = text[:max_chars]
system_prompt = (
f"You are a professional translator. Translate the following document "
f"content into {target_language}. Preserve the original formatting and "
f"structure as much as possible. Only output the translation, nothing else."
raise PdfAiError(
"Could not extract any text from the PDF.", error_code="PDF_TEXT_EMPTY"
)
translation = _call_openrouter(system_prompt, truncated, max_tokens=2000, tool_name="pdf_translate")
translated = _translate_document_text(
text,
target_language=normalized_target_language,
source_language=normalized_source_language,
)
page_count = text.count("[Page ")
return {
"translation": translation,
"translation": translated["translation"],
"pages_analyzed": page_count,
"target_language": target_language,
"target_language": normalized_target_language,
"source_language": normalized_source_language,
"detected_source_language": translated["detected_source_language"],
"provider": translated["provider"],
"chunks_translated": translated["chunks_translated"],
}
@@ -361,12 +749,14 @@ def extract_tables(input_path: str) -> dict:
cells.append(str(val))
rows.append(cells)
result_tables.append({
result_tables.append(
{
"page": page_num,
"table_index": table_index,
"headers": headers,
"rows": rows,
})
}
)
table_index += 1
if not result_tables:
@@ -385,7 +775,9 @@ def extract_tables(input_path: str) -> dict:
except PdfAiError:
raise
except ImportError:
raise PdfAiError("tabula-py library is not installed.", error_code="TABULA_NOT_INSTALLED")
raise PdfAiError(
"tabula-py library is not installed.", error_code="TABULA_NOT_INSTALLED"
)
except Exception as e:
raise PdfAiError(
"Failed to extract tables.",

View File

@@ -1,4 +1,5 @@
"""Celery tasks for PDF AI tools — Chat, Summarize, Translate, Table Extract."""
import os
import logging
import json
@@ -28,7 +29,8 @@ def _build_pdf_ai_error_payload(task_id: str, error: PdfAiError, tool: str) -> d
payload = {
"status": "failed",
"error_code": getattr(error, "error_code", "PDF_AI_ERROR"),
"user_message": getattr(error, "user_message", str(error)) or "AI processing failed.",
"user_message": getattr(error, "user_message", str(error))
or "AI processing failed.",
"task_id": task_id,
}
@@ -80,9 +82,12 @@ def chat_with_pdf_task(
logger.info(f"Task {task_id}: Chat with PDF completed")
finalize_task_tracking(
user_id=user_id, tool="chat-pdf",
original_filename=original_filename, result=result,
usage_source=usage_source, api_key_id=api_key_id,
user_id=user_id,
tool="chat-pdf",
original_filename=original_filename,
result=result,
usage_source=usage_source,
api_key_id=api_key_id,
celery_task_id=self.request.id,
)
_cleanup(task_id)
@@ -91,9 +96,12 @@ def chat_with_pdf_task(
except PdfAiError as e:
result = _build_pdf_ai_error_payload(task_id, e, "chat-pdf")
finalize_task_tracking(
user_id=user_id, tool="chat-pdf",
original_filename=original_filename, result=result,
usage_source=usage_source, api_key_id=api_key_id,
user_id=user_id,
tool="chat-pdf",
original_filename=original_filename,
result=result,
usage_source=usage_source,
api_key_id=api_key_id,
celery_task_id=self.request.id,
)
_cleanup(task_id)
@@ -103,9 +111,12 @@ def chat_with_pdf_task(
logger.error(f"Task {task_id}: Unexpected error — {e}")
result = {"status": "failed", "error": "An unexpected error occurred."}
finalize_task_tracking(
user_id=user_id, tool="chat-pdf",
original_filename=original_filename, result=result,
usage_source=usage_source, api_key_id=api_key_id,
user_id=user_id,
tool="chat-pdf",
original_filename=original_filename,
result=result,
usage_source=usage_source,
api_key_id=api_key_id,
celery_task_id=self.request.id,
)
_cleanup(task_id)
@@ -140,9 +151,12 @@ def summarize_pdf_task(
logger.info(f"Task {task_id}: PDF summarize completed")
finalize_task_tracking(
user_id=user_id, tool="summarize-pdf",
original_filename=original_filename, result=result,
usage_source=usage_source, api_key_id=api_key_id,
user_id=user_id,
tool="summarize-pdf",
original_filename=original_filename,
result=result,
usage_source=usage_source,
api_key_id=api_key_id,
celery_task_id=self.request.id,
)
_cleanup(task_id)
@@ -151,9 +165,12 @@ def summarize_pdf_task(
except PdfAiError as e:
result = _build_pdf_ai_error_payload(task_id, e, "summarize-pdf")
finalize_task_tracking(
user_id=user_id, tool="summarize-pdf",
original_filename=original_filename, result=result,
usage_source=usage_source, api_key_id=api_key_id,
user_id=user_id,
tool="summarize-pdf",
original_filename=original_filename,
result=result,
usage_source=usage_source,
api_key_id=api_key_id,
celery_task_id=self.request.id,
)
_cleanup(task_id)
@@ -163,9 +180,12 @@ def summarize_pdf_task(
logger.error(f"Task {task_id}: Unexpected error — {e}")
result = {"status": "failed", "error": "An unexpected error occurred."}
finalize_task_tracking(
user_id=user_id, tool="summarize-pdf",
original_filename=original_filename, result=result,
usage_source=usage_source, api_key_id=api_key_id,
user_id=user_id,
tool="summarize-pdf",
original_filename=original_filename,
result=result,
usage_source=usage_source,
api_key_id=api_key_id,
celery_task_id=self.request.id,
)
_cleanup(task_id)
@@ -182,28 +202,41 @@ def translate_pdf_task(
task_id: str,
original_filename: str,
target_language: str,
source_language: str | None = None,
user_id: int | None = None,
usage_source: str = "web",
api_key_id: int | None = None,
):
"""Translate a PDF document to another language."""
try:
self.update_state(state="PROCESSING", meta={"step": "Translating document..."})
self.update_state(
state="PROCESSING",
meta={"step": "Translating document with provider fallback..."},
)
data = translate_pdf(input_path, target_language)
data = translate_pdf(
input_path, target_language, source_language=source_language
)
result = {
"status": "completed",
"translation": data["translation"],
"pages_analyzed": data["pages_analyzed"],
"target_language": data["target_language"],
"source_language": data.get("source_language"),
"detected_source_language": data.get("detected_source_language"),
"provider": data.get("provider"),
"chunks_translated": data.get("chunks_translated"),
}
logger.info(f"Task {task_id}: PDF translate completed")
finalize_task_tracking(
user_id=user_id, tool="translate-pdf",
original_filename=original_filename, result=result,
usage_source=usage_source, api_key_id=api_key_id,
user_id=user_id,
tool="translate-pdf",
original_filename=original_filename,
result=result,
usage_source=usage_source,
api_key_id=api_key_id,
celery_task_id=self.request.id,
)
_cleanup(task_id)
@@ -212,9 +245,12 @@ def translate_pdf_task(
except PdfAiError as e:
result = _build_pdf_ai_error_payload(task_id, e, "translate-pdf")
finalize_task_tracking(
user_id=user_id, tool="translate-pdf",
original_filename=original_filename, result=result,
usage_source=usage_source, api_key_id=api_key_id,
user_id=user_id,
tool="translate-pdf",
original_filename=original_filename,
result=result,
usage_source=usage_source,
api_key_id=api_key_id,
celery_task_id=self.request.id,
)
_cleanup(task_id)
@@ -224,9 +260,12 @@ def translate_pdf_task(
logger.error(f"Task {task_id}: Unexpected error — {e}")
result = {"status": "failed", "error": "An unexpected error occurred."}
finalize_task_tracking(
user_id=user_id, tool="translate-pdf",
original_filename=original_filename, result=result,
usage_source=usage_source, api_key_id=api_key_id,
user_id=user_id,
tool="translate-pdf",
original_filename=original_filename,
result=result,
usage_source=usage_source,
api_key_id=api_key_id,
celery_task_id=self.request.id,
)
_cleanup(task_id)
@@ -260,9 +299,12 @@ def extract_tables_task(
logger.info(f"Task {task_id}: Table extraction completed")
finalize_task_tracking(
user_id=user_id, tool="extract-tables",
original_filename=original_filename, result=result,
usage_source=usage_source, api_key_id=api_key_id,
user_id=user_id,
tool="extract-tables",
original_filename=original_filename,
result=result,
usage_source=usage_source,
api_key_id=api_key_id,
celery_task_id=self.request.id,
)
_cleanup(task_id)
@@ -271,9 +313,12 @@ def extract_tables_task(
except PdfAiError as e:
result = _build_pdf_ai_error_payload(task_id, e, "extract-tables")
finalize_task_tracking(
user_id=user_id, tool="extract-tables",
original_filename=original_filename, result=result,
usage_source=usage_source, api_key_id=api_key_id,
user_id=user_id,
tool="extract-tables",
original_filename=original_filename,
result=result,
usage_source=usage_source,
api_key_id=api_key_id,
celery_task_id=self.request.id,
)
_cleanup(task_id)
@@ -283,9 +328,12 @@ def extract_tables_task(
logger.error(f"Task {task_id}: Unexpected error — {e}")
result = {"status": "failed", "error": "An unexpected error occurred."}
finalize_task_tracking(
user_id=user_id, tool="extract-tables",
original_filename=original_filename, result=result,
usage_source=usage_source, api_key_id=api_key_id,
user_id=user_id,
tool="extract-tables",
original_filename=original_filename,
result=result,
usage_source=usage_source,
api_key_id=api_key_id,
celery_task_id=self.request.id,
)
_cleanup(task_id)

View File

@@ -26,20 +26,21 @@ def _env_or_default(name: str, default: str) -> str:
class BaseConfig:
"""Base configuration."""
SECRET_KEY = os.getenv("SECRET_KEY", "change-me-in-production")
INTERNAL_ADMIN_SECRET = os.getenv("INTERNAL_ADMIN_SECRET", "")
INTERNAL_ADMIN_EMAILS = _parse_csv_env("INTERNAL_ADMIN_EMAILS")
# File upload settings
MAX_CONTENT_LENGTH = int(
os.getenv("ABSOLUTE_MAX_CONTENT_LENGTH_MB", 100)
) * 1024 * 1024
MAX_CONTENT_LENGTH = (
int(os.getenv("ABSOLUTE_MAX_CONTENT_LENGTH_MB", 100)) * 1024 * 1024
)
UPLOAD_FOLDER = _env_or_default("UPLOAD_FOLDER", "/tmp/uploads")
OUTPUT_FOLDER = _env_or_default("OUTPUT_FOLDER", "/tmp/outputs")
FILE_EXPIRY_SECONDS = int(os.getenv("FILE_EXPIRY_SECONDS", 1800))
STORAGE_ALLOW_LOCAL_FALLBACK = os.getenv(
"STORAGE_ALLOW_LOCAL_FALLBACK", "true"
).lower() == "true"
STORAGE_ALLOW_LOCAL_FALLBACK = (
os.getenv("STORAGE_ALLOW_LOCAL_FALLBACK", "true").lower() == "true"
)
DATABASE_PATH = _env_or_default(
"DATABASE_PATH", os.path.join(BASE_DIR, "data", "dociva.db")
)
@@ -69,9 +70,7 @@ class BaseConfig:
"application/vnd.openxmlformats-officedocument.presentationml.presentation"
],
"ppt": ["application/vnd.ms-powerpoint"],
"xlsx": [
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
],
"xlsx": ["application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"],
"xls": ["application/vnd.ms-excel"],
}
@@ -118,11 +117,20 @@ class BaseConfig:
# OpenRouter AI
OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY", "")
OPENROUTER_MODEL = os.getenv("OPENROUTER_MODEL", "nvidia/nemotron-3-super-120b-a12b:free")
OPENROUTER_MODEL = os.getenv(
"OPENROUTER_MODEL", "nvidia/nemotron-3-super-120b-a12b:free"
)
OPENROUTER_BASE_URL = os.getenv(
"OPENROUTER_BASE_URL", "https://openrouter.ai/api/v1/chat/completions"
)
# Premium translation provider (recommended for Translate PDF)
DEEPL_API_KEY = os.getenv("DEEPL_API_KEY", "")
DEEPL_API_URL = os.getenv(
"DEEPL_API_URL", "https://api-free.deepl.com/v2/translate"
)
DEEPL_TIMEOUT_SECONDS = int(os.getenv("DEEPL_TIMEOUT_SECONDS", 90))
# SMTP (for password reset emails)
SMTP_HOST = os.getenv("SMTP_HOST", "")
SMTP_PORT = int(os.getenv("SMTP_PORT", 587))
@@ -156,12 +164,14 @@ class BaseConfig:
class DevelopmentConfig(BaseConfig):
"""Development configuration."""
DEBUG = True
TESTING = False
class ProductionConfig(BaseConfig):
"""Production configuration."""
DEBUG = False
TESTING = False
SESSION_COOKIE_SECURE = True
@@ -172,6 +182,7 @@ class ProductionConfig(BaseConfig):
class TestingConfig(BaseConfig):
"""Testing configuration."""
DEBUG = True
TESTING = True
UPLOAD_FOLDER = "/tmp/test_uploads"

View File

@@ -0,0 +1,93 @@
"""Tests for the resilient PDF translation workflow."""
from app.services.pdf_ai_service import DeepLSettings, PdfAiError, translate_pdf
def test_translate_pdf_prefers_premium_provider(monkeypatch):
    """Should use the premium provider when configured and available."""
    service = "app.services.pdf_ai_service"
    monkeypatch.setattr(
        f"{service}._extract_text_from_pdf",
        lambda _path: "[Page 1]\nHello world\n\n[Page 2]\nSecond page",
    )
    monkeypatch.setattr(
        f"{service}._get_deepl_settings",
        lambda: DeepLSettings(
            api_key="key",
            base_url="https://api-free.deepl.com/v2/translate",
            timeout_seconds=90,
        ),
    )
    # Collapse the retry wrapper so the fake provider is called directly.
    monkeypatch.setattr(
        f"{service}._translate_with_retry",
        lambda action, provider_name: action(),
    )

    def fake_deepl(chunk, target_language, source_language=None):
        return {
            "translation": f"translated::{chunk}",
            "provider": "deepl",
            "detected_source_language": "en",
        }

    monkeypatch.setattr(f"{service}._call_deepl_translate", fake_deepl)

    result = translate_pdf("/tmp/demo.pdf", "fr", source_language="en")

    assert result["provider"] == "deepl"
    assert result["target_language"] == "fr"
    assert result["detected_source_language"] == "en"
    assert "translated::" in result["translation"]
def test_translate_pdf_falls_back_when_premium_provider_fails(monkeypatch):
    """Should fall back to OpenRouter if the premium provider fails."""
    service = "app.services.pdf_ai_service"
    monkeypatch.setattr(
        f"{service}._extract_text_from_pdf",
        lambda _path: "[Page 1]\nHello world",
    )
    monkeypatch.setattr(
        f"{service}._get_deepl_settings",
        lambda: DeepLSettings(
            api_key="key",
            base_url="https://api-free.deepl.com/v2/translate",
            timeout_seconds=90,
        ),
    )
    # Collapse the retry wrapper so provider fakes are called directly.
    monkeypatch.setattr(
        f"{service}._translate_with_retry",
        lambda action, provider_name: action(),
    )

    def fail_deepl(*_args, **_kwargs):
        raise PdfAiError("DeepL unavailable", error_code="DEEPL_SERVER_ERROR")

    def fake_openrouter(chunk, target_language, source_language=None):
        return {
            "translation": f"fallback::{chunk}",
            "provider": "openrouter",
            "detected_source_language": "en",
        }

    monkeypatch.setattr(f"{service}._call_deepl_translate", fail_deepl)
    monkeypatch.setattr(f"{service}._call_openrouter_translate", fake_openrouter)

    result = translate_pdf("/tmp/demo.pdf", "de", source_language="auto")

    assert result["provider"] == "openrouter"
    assert result["detected_source_language"] == "en"
    assert result["translation"].startswith("fallback::")
def test_translate_pdf_rejects_identical_languages(monkeypatch):
    """Should reject no-op translation requests."""
    # Patched defensively; validation should fire before extraction runs.
    monkeypatch.setattr(
        "app.services.pdf_ai_service._extract_text_from_pdf",
        lambda _path: "[Page 1]\nHello world",
    )
    raised: PdfAiError | None = None
    try:
        translate_pdf("/tmp/demo.pdf", "fr", source_language="fr")
    except PdfAiError as error:
        raised = error
    if raised is None:
        raise AssertionError("Expected identical language validation to fail")
    assert raised.error_code == "PDF_AI_INVALID_INPUT"
    assert "different source and target languages" in raised.user_message

File diff suppressed because it is too large Load Diff

View File

@@ -2,31 +2,31 @@
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<url>
<loc>https://dociva.io/blog/how-to-compress-pdf-online</loc>
<lastmod>2026-03-29</lastmod>
<lastmod>2026-03-30</lastmod>
<changefreq>monthly</changefreq>
<priority>0.6</priority>
</url>
<url>
<loc>https://dociva.io/blog/convert-images-without-losing-quality</loc>
<lastmod>2026-03-29</lastmod>
<lastmod>2026-03-30</lastmod>
<changefreq>monthly</changefreq>
<priority>0.6</priority>
</url>
<url>
<loc>https://dociva.io/blog/ocr-extract-text-from-images</loc>
<lastmod>2026-03-29</lastmod>
<lastmod>2026-03-30</lastmod>
<changefreq>monthly</changefreq>
<priority>0.6</priority>
</url>
<url>
<loc>https://dociva.io/blog/merge-split-pdf-files</loc>
<lastmod>2026-03-29</lastmod>
<lastmod>2026-03-30</lastmod>
<changefreq>monthly</changefreq>
<priority>0.6</priority>
</url>
<url>
<loc>https://dociva.io/blog/ai-chat-with-pdf-documents</loc>
<lastmod>2026-03-29</lastmod>
<lastmod>2026-03-30</lastmod>
<changefreq>monthly</changefreq>
<priority>0.6</priority>
</url>

File diff suppressed because it is too large Load Diff

View File

@@ -2,55 +2,55 @@
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<url>
<loc>https://dociva.io/</loc>
<lastmod>2026-03-29</lastmod>
<lastmod>2026-03-30</lastmod>
<changefreq>daily</changefreq>
<priority>1.0</priority>
</url>
<url>
<loc>https://dociva.io/tools</loc>
<lastmod>2026-03-29</lastmod>
<lastmod>2026-03-30</lastmod>
<changefreq>weekly</changefreq>
<priority>0.8</priority>
</url>
<url>
<loc>https://dociva.io/about</loc>
<lastmod>2026-03-29</lastmod>
<lastmod>2026-03-30</lastmod>
<changefreq>monthly</changefreq>
<priority>0.4</priority>
</url>
<url>
<loc>https://dociva.io/contact</loc>
<lastmod>2026-03-29</lastmod>
<lastmod>2026-03-30</lastmod>
<changefreq>monthly</changefreq>
<priority>0.4</priority>
</url>
<url>
<loc>https://dociva.io/privacy</loc>
<lastmod>2026-03-29</lastmod>
<lastmod>2026-03-30</lastmod>
<changefreq>yearly</changefreq>
<priority>0.3</priority>
</url>
<url>
<loc>https://dociva.io/terms</loc>
<lastmod>2026-03-29</lastmod>
<lastmod>2026-03-30</lastmod>
<changefreq>yearly</changefreq>
<priority>0.3</priority>
</url>
<url>
<loc>https://dociva.io/pricing</loc>
<lastmod>2026-03-29</lastmod>
<lastmod>2026-03-30</lastmod>
<changefreq>monthly</changefreq>
<priority>0.7</priority>
</url>
<url>
<loc>https://dociva.io/blog</loc>
<lastmod>2026-03-29</lastmod>
<lastmod>2026-03-30</lastmod>
<changefreq>weekly</changefreq>
<priority>0.6</priority>
</url>
<url>
<loc>https://dociva.io/developers</loc>
<lastmod>2026-03-29</lastmod>
<lastmod>2026-03-30</lastmod>
<changefreq>monthly</changefreq>
<priority>0.5</priority>
</url>

View File

@@ -2,265 +2,265 @@
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<url>
<loc>https://dociva.io/tools/pdf-to-word</loc>
<lastmod>2026-03-29</lastmod>
<lastmod>2026-03-30</lastmod>
<changefreq>weekly</changefreq>
<priority>0.9</priority>
</url>
<url>
<loc>https://dociva.io/tools/word-to-pdf</loc>
<lastmod>2026-03-29</lastmod>
<lastmod>2026-03-30</lastmod>
<changefreq>weekly</changefreq>
<priority>0.9</priority>
</url>
<url>
<loc>https://dociva.io/tools/compress-pdf</loc>
<lastmod>2026-03-29</lastmod>
<lastmod>2026-03-30</lastmod>
<changefreq>weekly</changefreq>
<priority>0.9</priority>
</url>
<url>
<loc>https://dociva.io/tools/merge-pdf</loc>
<lastmod>2026-03-29</lastmod>
<lastmod>2026-03-30</lastmod>
<changefreq>weekly</changefreq>
<priority>0.9</priority>
</url>
<url>
<loc>https://dociva.io/tools/split-pdf</loc>
<lastmod>2026-03-29</lastmod>
<lastmod>2026-03-30</lastmod>
<changefreq>weekly</changefreq>
<priority>0.8</priority>
</url>
<url>
<loc>https://dociva.io/tools/rotate-pdf</loc>
<lastmod>2026-03-29</lastmod>
<lastmod>2026-03-30</lastmod>
<changefreq>weekly</changefreq>
<priority>0.7</priority>
</url>
<url>
<loc>https://dociva.io/tools/pdf-to-images</loc>
<lastmod>2026-03-29</lastmod>
<lastmod>2026-03-30</lastmod>
<changefreq>weekly</changefreq>
<priority>0.8</priority>
</url>
<url>
<loc>https://dociva.io/tools/images-to-pdf</loc>
<lastmod>2026-03-29</lastmod>
<lastmod>2026-03-30</lastmod>
<changefreq>weekly</changefreq>
<priority>0.8</priority>
</url>
<url>
<loc>https://dociva.io/tools/watermark-pdf</loc>
<lastmod>2026-03-29</lastmod>
<lastmod>2026-03-30</lastmod>
<changefreq>weekly</changefreq>
<priority>0.7</priority>
</url>
<url>
<loc>https://dociva.io/tools/protect-pdf</loc>
<lastmod>2026-03-29</lastmod>
<lastmod>2026-03-30</lastmod>
<changefreq>weekly</changefreq>
<priority>0.8</priority>
</url>
<url>
<loc>https://dociva.io/tools/unlock-pdf</loc>
<lastmod>2026-03-29</lastmod>
<lastmod>2026-03-30</lastmod>
<changefreq>weekly</changefreq>
<priority>0.8</priority>
</url>
<url>
<loc>https://dociva.io/tools/page-numbers</loc>
<lastmod>2026-03-29</lastmod>
<lastmod>2026-03-30</lastmod>
<changefreq>weekly</changefreq>
<priority>0.7</priority>
</url>
<url>
<loc>https://dociva.io/tools/pdf-editor</loc>
<lastmod>2026-03-29</lastmod>
<lastmod>2026-03-30</lastmod>
<changefreq>weekly</changefreq>
<priority>0.8</priority>
</url>
<url>
<loc>https://dociva.io/tools/pdf-flowchart</loc>
<lastmod>2026-03-29</lastmod>
<lastmod>2026-03-30</lastmod>
<changefreq>weekly</changefreq>
<priority>0.7</priority>
</url>
<url>
<loc>https://dociva.io/tools/pdf-to-excel</loc>
<lastmod>2026-03-29</lastmod>
<lastmod>2026-03-30</lastmod>
<changefreq>weekly</changefreq>
<priority>0.8</priority>
</url>
<url>
<loc>https://dociva.io/tools/remove-watermark-pdf</loc>
<lastmod>2026-03-29</lastmod>
<lastmod>2026-03-30</lastmod>
<changefreq>weekly</changefreq>
<priority>0.7</priority>
</url>
<url>
<loc>https://dociva.io/tools/reorder-pdf</loc>
<lastmod>2026-03-29</lastmod>
<lastmod>2026-03-30</lastmod>
<changefreq>weekly</changefreq>
<priority>0.7</priority>
</url>
<url>
<loc>https://dociva.io/tools/extract-pages</loc>
<lastmod>2026-03-29</lastmod>
<lastmod>2026-03-30</lastmod>
<changefreq>weekly</changefreq>
<priority>0.7</priority>
</url>
<url>
<loc>https://dociva.io/tools/image-converter</loc>
<lastmod>2026-03-29</lastmod>
<lastmod>2026-03-30</lastmod>
<changefreq>weekly</changefreq>
<priority>0.8</priority>
</url>
<url>
<loc>https://dociva.io/tools/image-resize</loc>
<lastmod>2026-03-29</lastmod>
<lastmod>2026-03-30</lastmod>
<changefreq>weekly</changefreq>
<priority>0.8</priority>
</url>
<url>
<loc>https://dociva.io/tools/compress-image</loc>
<lastmod>2026-03-29</lastmod>
<lastmod>2026-03-30</lastmod>
<changefreq>weekly</changefreq>
<priority>0.8</priority>
</url>
<url>
<loc>https://dociva.io/tools/ocr</loc>
<lastmod>2026-03-29</lastmod>
<lastmod>2026-03-30</lastmod>
<changefreq>weekly</changefreq>
<priority>0.8</priority>
</url>
<url>
<loc>https://dociva.io/tools/remove-background</loc>
<lastmod>2026-03-29</lastmod>
<lastmod>2026-03-30</lastmod>
<changefreq>weekly</changefreq>
<priority>0.8</priority>
</url>
<url>
<loc>https://dociva.io/tools/image-to-svg</loc>
<lastmod>2026-03-29</lastmod>
<lastmod>2026-03-30</lastmod>
<changefreq>weekly</changefreq>
<priority>0.8</priority>
</url>
<url>
<loc>https://dociva.io/tools/html-to-pdf</loc>
<lastmod>2026-03-29</lastmod>
<lastmod>2026-03-30</lastmod>
<changefreq>weekly</changefreq>
<priority>0.7</priority>
</url>
<url>
<loc>https://dociva.io/tools/chat-pdf</loc>
<lastmod>2026-03-29</lastmod>
<lastmod>2026-03-30</lastmod>
<changefreq>weekly</changefreq>
<priority>0.8</priority>
</url>
<url>
<loc>https://dociva.io/tools/summarize-pdf</loc>
<lastmod>2026-03-29</lastmod>
<lastmod>2026-03-30</lastmod>
<changefreq>weekly</changefreq>
<priority>0.8</priority>
</url>
<url>
<loc>https://dociva.io/tools/translate-pdf</loc>
<lastmod>2026-03-29</lastmod>
<lastmod>2026-03-30</lastmod>
<changefreq>weekly</changefreq>
<priority>0.8</priority>
</url>
<url>
<loc>https://dociva.io/tools/extract-tables</loc>
<lastmod>2026-03-29</lastmod>
<lastmod>2026-03-30</lastmod>
<changefreq>weekly</changefreq>
<priority>0.8</priority>
</url>
<url>
<loc>https://dociva.io/tools/qr-code</loc>
<lastmod>2026-03-29</lastmod>
<lastmod>2026-03-30</lastmod>
<changefreq>weekly</changefreq>
<priority>0.7</priority>
</url>
<url>
<loc>https://dociva.io/tools/video-to-gif</loc>
<lastmod>2026-03-29</lastmod>
<lastmod>2026-03-30</lastmod>
<changefreq>weekly</changefreq>
<priority>0.7</priority>
</url>
<url>
<loc>https://dociva.io/tools/word-counter</loc>
<lastmod>2026-03-29</lastmod>
<lastmod>2026-03-30</lastmod>
<changefreq>weekly</changefreq>
<priority>0.6</priority>
</url>
<url>
<loc>https://dociva.io/tools/text-cleaner</loc>
<lastmod>2026-03-29</lastmod>
<lastmod>2026-03-30</lastmod>
<changefreq>weekly</changefreq>
<priority>0.6</priority>
</url>
<url>
<loc>https://dociva.io/tools/pdf-to-pptx</loc>
<lastmod>2026-03-29</lastmod>
<lastmod>2026-03-30</lastmod>
<changefreq>weekly</changefreq>
<priority>0.8</priority>
</url>
<url>
<loc>https://dociva.io/tools/excel-to-pdf</loc>
<lastmod>2026-03-29</lastmod>
<lastmod>2026-03-30</lastmod>
<changefreq>weekly</changefreq>
<priority>0.8</priority>
</url>
<url>
<loc>https://dociva.io/tools/pptx-to-pdf</loc>
<lastmod>2026-03-29</lastmod>
<lastmod>2026-03-30</lastmod>
<changefreq>weekly</changefreq>
<priority>0.8</priority>
</url>
<url>
<loc>https://dociva.io/tools/sign-pdf</loc>
<lastmod>2026-03-29</lastmod>
<lastmod>2026-03-30</lastmod>
<changefreq>weekly</changefreq>
<priority>0.8</priority>
</url>
<url>
<loc>https://dociva.io/tools/crop-pdf</loc>
<lastmod>2026-03-29</lastmod>
<lastmod>2026-03-30</lastmod>
<changefreq>weekly</changefreq>
<priority>0.7</priority>
</url>
<url>
<loc>https://dociva.io/tools/flatten-pdf</loc>
<lastmod>2026-03-29</lastmod>
<lastmod>2026-03-30</lastmod>
<changefreq>weekly</changefreq>
<priority>0.7</priority>
</url>
<url>
<loc>https://dociva.io/tools/repair-pdf</loc>
<lastmod>2026-03-29</lastmod>
<lastmod>2026-03-30</lastmod>
<changefreq>weekly</changefreq>
<priority>0.7</priority>
</url>
<url>
<loc>https://dociva.io/tools/pdf-metadata</loc>
<lastmod>2026-03-29</lastmod>
<lastmod>2026-03-30</lastmod>
<changefreq>weekly</changefreq>
<priority>0.6</priority>
</url>
<url>
<loc>https://dociva.io/tools/image-crop</loc>
<lastmod>2026-03-29</lastmod>
<lastmod>2026-03-30</lastmod>
<changefreq>weekly</changefreq>
<priority>0.7</priority>
</url>
<url>
<loc>https://dociva.io/tools/image-rotate-flip</loc>
<lastmod>2026-03-29</lastmod>
<lastmod>2026-03-30</lastmod>
<changefreq>weekly</changefreq>
<priority>0.7</priority>
</url>
<url>
<loc>https://dociva.io/tools/barcode-generator</loc>
<lastmod>2026-03-29</lastmod>
<lastmod>2026-03-30</lastmod>
<changefreq>weekly</changefreq>
<priority>0.7</priority>
</url>

View File

@@ -1,7 +1,7 @@
import { useState, useEffect } from 'react';
import { useTranslation } from 'react-i18next';
import { Helmet } from 'react-helmet-async';
import { Languages } from 'lucide-react';
import { Languages, ShieldCheck, Sparkles } from 'lucide-react';
import FileUploader from '@/components/shared/FileUploader';
import ProgressBar from '@/components/shared/ProgressBar';
import AdSlot from '@/components/layout/AdSlot';
@@ -26,11 +26,22 @@ const LANGUAGES = [
{ value: 'it', label: 'Italiano' },
];
const getLanguageLabel = (value: string) => {
if (!value || value === 'auto') {
return null;
}
return LANGUAGES.find((language) => language.value === value)?.label ?? value;
};
export default function TranslatePdf() {
const { t } = useTranslation();
const [phase, setPhase] = useState<'upload' | 'processing' | 'done'>('upload');
const [sourceLang, setSourceLang] = useState('auto');
const [targetLang, setTargetLang] = useState('en');
const [translation, setTranslation] = useState('');
const [provider, setProvider] = useState('');
const [detectedSourceLanguage, setDetectedSourceLanguage] = useState('');
const {
file, uploadProgress, isUploading, taskId,
@@ -39,7 +50,7 @@ export default function TranslatePdf() {
endpoint: '/pdf-ai/translate',
maxSizeMB: 20,
acceptedTypes: ['pdf'],
extraData: { target_language: targetLang },
extraData: { target_language: targetLang, source_language: sourceLang },
});
const { status, result, error: taskError } = useTaskPolling({
@@ -47,6 +58,8 @@ export default function TranslatePdf() {
onComplete: (r) => {
setPhase('done');
setTranslation(r.translation || '');
setProvider(r.provider || '');
setDetectedSourceLanguage(r.detected_source_language || '');
dispatchRatingPrompt('translate-pdf');
},
onError: () => setPhase('done'),
@@ -63,7 +76,17 @@ export default function TranslatePdf() {
if (id) setPhase('processing');
};
const handleReset = () => { reset(); setPhase('upload'); setTargetLang('en'); setTranslation(''); };
const handleReset = () => {
reset();
setPhase('upload');
setSourceLang('auto');
setTargetLang('en');
setTranslation('');
setProvider('');
setDetectedSourceLanguage('');
};
const resolvedDetectedLanguage = getLanguageLabel(detectedSourceLanguage) || getLanguageLabel(sourceLang);
const schema = generateToolSchema({
name: t('tools.translatePdf.title'),
@@ -103,6 +126,33 @@ export default function TranslatePdf() {
{file && !isUploading && (
<>
<div className="rounded-2xl bg-white p-5 ring-1 ring-slate-200 dark:bg-slate-800 dark:ring-slate-700">
<div className="mb-4 flex items-start gap-3 rounded-xl bg-slate-50 p-4 dark:bg-slate-900/60">
<ShieldCheck className="mt-0.5 h-5 w-5 text-emerald-600 dark:text-emerald-400" />
<div>
<p className="text-sm font-semibold text-slate-900 dark:text-slate-100">
{t('tools.translatePdf.engineTitle')}
</p>
<p className="mt-1 text-sm text-slate-600 dark:text-slate-400">
{t('tools.translatePdf.engineDescription')}
</p>
</div>
</div>
<div className="grid gap-4 md:grid-cols-2">
<div>
<label className="mb-2 block text-sm font-medium text-slate-700 dark:text-slate-300">
{t('tools.translatePdf.sourceLang')}
</label>
<select value={sourceLang} onChange={(e) => setSourceLang(e.target.value)}
className="w-full rounded-lg border border-slate-300 px-3 py-2 text-sm dark:border-slate-600 dark:bg-slate-700 dark:text-slate-200">
<option value="auto">{t('tools.translatePdf.autoDetect')}</option>
{LANGUAGES.map((lang) => (
<option key={`source-${lang.value}`} value={lang.value}>{lang.label}</option>
))}
</select>
</div>
<div>
<label className="mb-2 block text-sm font-medium text-slate-700 dark:text-slate-300">
{t('tools.translatePdf.targetLang')}
</label>
@@ -113,6 +163,8 @@ export default function TranslatePdf() {
))}
</select>
</div>
</div>
</div>
<button onClick={handleUpload} className="btn-primary w-full">
{t('tools.translatePdf.shortDesc')}
</button>
@@ -122,11 +174,39 @@ export default function TranslatePdf() {
)}
{phase === 'processing' && !result && (
<div className="space-y-4">
<ProgressBar state={status?.state || 'PENDING'} message={status?.progress} />
<div className="rounded-xl bg-white p-4 ring-1 ring-slate-200 dark:bg-slate-800 dark:ring-slate-700">
<div className="flex items-start gap-3">
<Sparkles className="mt-0.5 h-5 w-5 text-purple-600 dark:text-purple-400" />
<p className="text-sm text-slate-600 dark:text-slate-400">
{t('tools.translatePdf.processingHint')}
</p>
</div>
</div>
</div>
)}
{phase === 'done' && translation && (
<div className="space-y-4">
<div className="grid gap-3 sm:grid-cols-2">
<div className="rounded-xl bg-white p-4 ring-1 ring-slate-200 dark:bg-slate-800 dark:ring-slate-700">
<p className="text-xs font-semibold uppercase tracking-wide text-slate-500 dark:text-slate-400">
{t('tools.translatePdf.sourceDetected')}
</p>
<p className="mt-1 text-sm font-medium text-slate-900 dark:text-slate-100">
{resolvedDetectedLanguage || t('tools.translatePdf.autoDetect')}
</p>
</div>
<div className="rounded-xl bg-white p-4 ring-1 ring-slate-200 dark:bg-slate-800 dark:ring-slate-700">
<p className="text-xs font-semibold uppercase tracking-wide text-slate-500 dark:text-slate-400">
{t('tools.translatePdf.translationEngine')}
</p>
<p className="mt-1 text-sm font-medium text-slate-900 dark:text-slate-100">
{provider || 'auto'}
</p>
</div>
</div>
<div className="rounded-2xl bg-white p-6 ring-1 ring-slate-200 dark:bg-slate-800 dark:ring-slate-700">
<h3 className="mb-3 text-sm font-semibold text-slate-700 dark:text-slate-300">
{t('tools.translatePdf.resultTitle')}

View File

@@ -827,9 +827,16 @@
},
"translatePdf": {
"title": "ترجمة PDF",
"description": "ترجم محتوى مستند PDF إلى أي لغة باستخدام الذكاء الاصطناعي.",
"description": "ترجم ملفات PDF عبر مسار ترجمة احترافي مع fallback تلقائي وتعامل أفضل مع الملفات الممسوحة ضوئياً.",
"shortDesc": "ترجمة PDF",
"sourceLang": "لغة المصدر",
"targetLang": "اللغة المستهدفة",
"autoDetect": "اكتشاف تلقائي",
"engineTitle": "ترجمة مستندات بجاهزية إنتاجية",
"engineDescription": "يتم إرسال الملف أولاً إلى مزود ترجمة احترافي، ثم يتم التحويل تلقائياً إلى مسار AI فقط عند الحاجة. هذا يقلل مشاكل الضغط ويحسن ثبات النتيجة.",
"processingHint": "قد تتم ترجمة المستندات الكبيرة على عدة أجزاء مع retries وfallback بين المزودات. اترك الصفحة مفتوحة حتى يكتمل الطلب.",
"sourceDetected": "لغة المصدر المكتشفة",
"translationEngine": "محرك الترجمة",
"resultTitle": "الترجمة"
},
"tableExtractor": {

View File

@@ -827,9 +827,16 @@
},
"translatePdf": {
"title": "Translate PDF",
"description": "Translate your PDF document content to any language using AI.",
"description": "Translate PDF documents with a premium translation pipeline, automatic fallback, and better handling for scanned files.",
"shortDesc": "Translate PDF",
"sourceLang": "Source Language",
"targetLang": "Target Language",
"autoDetect": "Auto detect",
"engineTitle": "Production-grade document translation",
"engineDescription": "Your file is translated with a premium translation provider first, then automatically falls back to AI only if needed. This reduces high-demand failures and improves consistency.",
"processingHint": "Large documents may be translated in multiple chunks with retries and provider fallback. Keep this page open until the job completes.",
"sourceDetected": "Detected source",
"translationEngine": "Translation engine",
"resultTitle": "Translation"
},
"tableExtractor": {

View File

@@ -827,9 +827,16 @@
},
"translatePdf": {
"title": "Traduire un PDF",
"description": "Traduisez le contenu de votre document PDF dans n'importe quelle langue grâce à l'IA.",
"description": "Traduisez vos PDF avec un pipeline premium, un fallback automatique et une meilleure prise en charge des fichiers scannés.",
"shortDesc": "Traduire le PDF",
"sourceLang": "Langue source",
"targetLang": "Langue cible",
"autoDetect": "Détection automatique",
"engineTitle": "Traduction documentaire de niveau production",
"engineDescription": "Votre fichier passe d'abord par un fournisseur de traduction premium, puis bascule automatiquement vers l'IA seulement si nécessaire. Cela réduit les erreurs de forte demande et améliore la stabilité.",
"processingHint": "Les documents volumineux peuvent être traduits en plusieurs segments avec retries et fallback entre fournisseurs. Laissez cette page ouverte jusqu'à la fin du traitement.",
"sourceDetected": "Source détectée",
"translationEngine": "Moteur de traduction",
"resultTitle": "Traduction"
},
"tableExtractor": {

View File

@@ -237,6 +237,10 @@ export interface TaskResult {
summary?: string;
translation?: string;
target_language?: string;
source_language?: string;
detected_source_language?: string;
provider?: string;
chunks_translated?: number;
pages_analyzed?: number;
// Table extraction fields
tables?: Array<{ page: number; table_index: number; headers: string[]; rows: string[][] }>;