feat: Initialize frontend with React, Vite, and Tailwind CSS

- Set up main entry point for React application.
- Create About, Home, NotFound, Privacy, and Terms pages with SEO support.
- Implement API service for file uploads and task management.
- Add global styles using Tailwind CSS.
- Create utility functions for SEO and text processing.
- Configure Vite for development and production builds.
- Set up Nginx configuration for serving frontend and backend.
- Add scripts for cleanup of expired files and sitemap generation.
- Implement deployment script for production environment.
This commit is contained in:
Your Name
2026-02-28 23:31:19 +02:00
parent 3b84ebb916
commit 85d98381df
93 changed files with 5940 additions and 0 deletions

73
backend/app/__init__.py Normal file
View File

@@ -0,0 +1,73 @@
"""Flask Application Factory."""
import os
from flask import Flask
from config import config
from app.extensions import cors, limiter, talisman, init_celery
def create_app(config_name=None):
    """Build and return a configured Flask application.

    Args:
        config_name: Key into the project's ``config`` mapping. When ``None``
            the ``FLASK_ENV`` environment variable is used, defaulting to
            ``"development"``.

    Returns:
        The Flask app with extensions initialised, Celery configured and all
        API blueprints registered.
    """
    env_name = (
        os.getenv("FLASK_ENV", "development") if config_name is None else config_name
    )

    flask_app = Flask(__name__)
    flask_app.config.from_object(config[env_name])

    # Working directories for incoming uploads and processed results.
    for folder_key in ("UPLOAD_FOLDER", "OUTPUT_FOLDER"):
        os.makedirs(flask_app.config[folder_key], exist_ok=True)

    # Extensions
    cors.init_app(flask_app, origins=flask_app.config["CORS_ORIGINS"])
    limiter.init_app(flask_app)

    # Security headers via Talisman; the CSP is relaxed for AdSense/Analytics.
    content_security_policy = {
        "default-src": "'self'",
        "script-src": [
            "'self'",
            "'unsafe-inline'",
            "https://pagead2.googlesyndication.com",
            "https://www.googletagmanager.com",
            "https://www.google-analytics.com",
        ],
        "style-src": ["'self'", "'unsafe-inline'", "https://fonts.googleapis.com"],
        "font-src": ["'self'", "https://fonts.gstatic.com"],
        "img-src": ["'self'", "data:", "https://pagead2.googlesyndication.com"],
        "frame-src": ["https://googleads.g.doubleclick.net"],
        "connect-src": [
            "'self'",
            "https://www.google-analytics.com",
            "https://*.amazonaws.com",
        ],
    }
    talisman.init_app(
        flask_app,
        content_security_policy=content_security_policy,
        # HTTPS redirects are only enforced for the production configuration.
        force_https=(env_name == "production"),
    )

    init_celery(flask_app)

    # Route modules are imported here, after the extensions are bound.
    from app.routes.health import health_bp
    from app.routes.convert import convert_bp
    from app.routes.compress import compress_bp
    from app.routes.image import image_bp
    from app.routes.video import video_bp
    from app.routes.tasks import tasks_bp
    from app.routes.download import download_bp

    mounted_blueprints = (
        (health_bp, "/api"),
        (convert_bp, "/api/convert"),
        (compress_bp, "/api/compress"),
        (image_bp, "/api/image"),
        (video_bp, "/api/video"),
        (tasks_bp, "/api/tasks"),
        (download_bp, "/api/download"),
    )
    for blueprint, prefix in mounted_blueprints:
        flask_app.register_blueprint(blueprint, url_prefix=prefix)

    return flask_app

43
backend/app/extensions.py Normal file
View File

@@ -0,0 +1,43 @@
"""Flask extensions initialization."""
from celery import Celery
from flask_cors import CORS
from flask_limiter import Limiter
from flask_limiter.util import get_remote_address
from flask_talisman import Talisman
# Initialize extensions (will be bound to app in create_app)
cors = CORS()
limiter = Limiter(key_func=get_remote_address)
talisman = Talisman()
celery = Celery()
def init_celery(app):
    """Configure the shared Celery instance from the Flask app's config.

    Args:
        app: The Flask application whose config supplies the broker URL,
            result backend and result expiry.

    Returns:
        The module-level ``celery`` instance, whose ``Task`` base class now
        runs every task inside ``app.app_context()``.
    """
    settings = {
        "broker_url": app.config["CELERY_BROKER_URL"],
        "result_backend": app.config["CELERY_RESULT_BACKEND"],
        "result_expires": app.config.get("FILE_EXPIRY_SECONDS", 1800),
        "task_serializer": "json",
        "result_serializer": "json",
        "accept_content": ["json"],
        "timezone": "UTC",
        "task_track_started": True,
        # One queue per task module.
        "task_routes": {
            "app.tasks.convert_tasks.*": {"queue": "convert"},
            "app.tasks.compress_tasks.*": {"queue": "compress"},
            "app.tasks.image_tasks.*": {"queue": "image"},
            "app.tasks.video_tasks.*": {"queue": "video"},
        },
    }
    for option, value in settings.items():
        setattr(celery.conf, option, value)

    base_task = celery.Task

    class ContextTask(base_task):
        """Task base class that wraps execution in the Flask app context."""

        abstract = True

        def __call__(self, *args, **kwargs):
            with app.app_context():
                return self.run(*args, **kwargs)

    celery.Task = ContextTask
    return celery

View File

@@ -0,0 +1 @@
"""Backend application middleware."""

View File

@@ -0,0 +1,18 @@
"""Rate limiting middleware configuration."""
from app.extensions import limiter
# Rate-limit strings in "<count>/<period>" form, one per kind of operation.
UPLOAD_LIMIT = "10/minute"
DOWNLOAD_LIMIT = "30/minute"
API_LIMIT = "100/hour"


def get_upload_limit() -> str:
    """Return the rate-limit string for file upload endpoints."""
    return UPLOAD_LIMIT


def get_download_limit() -> str:
    """Return the rate-limit string for file download endpoints."""
    return DOWNLOAD_LIMIT

View File

@@ -0,0 +1 @@
"""Backend application routes."""

View File

@@ -0,0 +1,47 @@
"""PDF compression routes."""
from flask import Blueprint, request, jsonify
from app.extensions import limiter
from app.utils.file_validator import validate_file, FileValidationError
from app.utils.sanitizer import generate_safe_path
from app.tasks.compress_tasks import compress_pdf_task
compress_bp = Blueprint("compress", __name__)
@compress_bp.route("/pdf", methods=["POST"])
@limiter.limit("10/minute")
def compress_pdf_route():
    """Queue a PDF compression job for an uploaded file.

    Accepts: multipart/form-data with a 'file' field (PDF) and an optional
    'quality' field: "low", "medium" or "high". Any other value falls back
    to "medium".

    Returns: 202 with a JSON body containing the Celery task id to poll, or
    a JSON error with the matching status code when validation fails.
    """
    if "file" not in request.files:
        return jsonify({"error": "No file provided."}), 400

    upload = request.files["file"]

    requested_quality = request.form.get("quality", "medium")
    quality = (
        requested_quality
        if requested_quality in ("low", "medium", "high")
        else "medium"
    )

    try:
        original_filename, ext = validate_file(upload, allowed_types=["pdf"])
    except FileValidationError as err:
        return jsonify({"error": err.message}), err.code

    # Persist the upload under a generated path before handing off to the worker.
    task_id, input_path = generate_safe_path(ext, folder_type="upload")
    upload.save(input_path)

    job = compress_pdf_task.delay(input_path, task_id, original_filename, quality)

    payload = {
        "task_id": job.id,
        "message": "Compression started. Poll /api/tasks/{task_id}/status for progress.",
    }
    return jsonify(payload), 202

View File

@@ -0,0 +1,73 @@
"""PDF conversion routes (PDF↔Word)."""
from flask import Blueprint, request, jsonify
from app.extensions import limiter
from app.utils.file_validator import validate_file, FileValidationError
from app.utils.sanitizer import generate_safe_path
from app.tasks.convert_tasks import convert_pdf_to_word, convert_word_to_pdf
convert_bp = Blueprint("convert", __name__)
@convert_bp.route("/pdf-to-word", methods=["POST"])
@limiter.limit("10/minute")
def pdf_to_word_route():
    """Queue a PDF to Word (DOCX) conversion for an uploaded file.

    Accepts: multipart/form-data with a 'file' field (PDF).

    Returns: 202 with a JSON body containing the Celery task id to poll, or
    a JSON error with the matching status code when validation fails.
    """
    if "file" not in request.files:
        return jsonify({"error": "No file provided."}), 400

    upload = request.files["file"]

    try:
        original_filename, ext = validate_file(upload, allowed_types=["pdf"])
    except FileValidationError as err:
        return jsonify({"error": err.message}), err.code

    # Persist the upload under a generated path before handing off to the worker.
    task_id, input_path = generate_safe_path(ext, folder_type="upload")
    upload.save(input_path)

    job = convert_pdf_to_word.delay(input_path, task_id, original_filename)

    payload = {
        "task_id": job.id,
        "message": "Conversion started. Poll /api/tasks/{task_id}/status for progress.",
    }
    return jsonify(payload), 202
@convert_bp.route("/word-to-pdf", methods=["POST"])
@limiter.limit("10/minute")
def word_to_pdf_route():
    """Queue a Word (DOC/DOCX) to PDF conversion for an uploaded file.

    Accepts: multipart/form-data with a 'file' field (DOC/DOCX).

    Returns: 202 with a JSON body containing the Celery task id to poll, or
    a JSON error with the matching status code when validation fails.
    """
    if "file" not in request.files:
        return jsonify({"error": "No file provided."}), 400

    upload = request.files["file"]

    try:
        original_filename, ext = validate_file(upload, allowed_types=["doc", "docx"])
    except FileValidationError as err:
        return jsonify({"error": err.message}), err.code

    # Persist the upload under a generated path before handing off to the worker.
    task_id, input_path = generate_safe_path(ext, folder_type="upload")
    upload.save(input_path)

    job = convert_word_to_pdf.delay(input_path, task_id, original_filename)

    payload = {
        "task_id": job.id,
        "message": "Conversion started. Poll /api/tasks/{task_id}/status for progress.",
    }
    return jsonify(payload), 202

View File

@@ -0,0 +1,35 @@
"""Local file download route — used when S3 is not configured."""
import os
from flask import Blueprint, send_file, abort, request, current_app
download_bp = Blueprint("download", __name__)
@download_bp.route("/<task_id>/<filename>", methods=["GET"])
def download_file(task_id: str, filename: str):
    """Send a processed file stored on the local filesystem as an attachment.

    The file is looked up at ``OUTPUT_FOLDER/<task_id>/<filename>``. The
    optional ``name`` query argument overrides the name offered to the
    browser. Aborts with 400 on path-like components and with 404 when the
    file does not exist.
    """
    # Reject anything that could escape the per-task output directory.
    traversal_tokens = ("..", "/", "\\")
    if any(token in task_id for token in traversal_tokens):
        abort(400, "Invalid task ID.")
    if any(token in filename for token in traversal_tokens):
        abort(400, "Invalid filename.")

    target_path = os.path.join(
        current_app.config["OUTPUT_FOLDER"], task_id, filename
    )
    if not os.path.isfile(target_path):
        abort(404, "File not found or expired.")

    return send_file(
        target_path,
        as_attachment=True,
        download_name=request.args.get("name", filename),
    )

View File

@@ -0,0 +1,14 @@
"""Health check endpoint."""
from flask import Blueprint, jsonify
health_bp = Blueprint("health", __name__)
@health_bp.route("/health", methods=["GET"])
def health_check():
    """Liveness probe: responds with a static JSON payload (HTTP 200)."""
    payload = {
        "status": "healthy",
        "service": "SaaS-PDF API",
        "version": "1.0.0",
    }
    return jsonify(payload)

122
backend/app/routes/image.py Normal file
View File

@@ -0,0 +1,122 @@
"""Image processing routes."""
from flask import Blueprint, request, jsonify
from app.extensions import limiter
from app.utils.file_validator import validate_file, FileValidationError
from app.utils.sanitizer import generate_safe_path
from app.tasks.image_tasks import convert_image_task, resize_image_task
image_bp = Blueprint("image", __name__)
ALLOWED_IMAGE_TYPES = ["png", "jpg", "jpeg", "webp"]
ALLOWED_OUTPUT_FORMATS = ["jpg", "png", "webp"]
@image_bp.route("/convert", methods=["POST"])
@limiter.limit("10/minute")
def convert_image_route():
    """Queue an image format conversion for an uploaded file.

    Accepts: multipart/form-data with:
    - 'file': Image file (PNG, JPG, JPEG, WebP)
    - 'format': Target format ("jpg", "png", "webp")
    - 'quality' (optional): 1-100, clamped into range; a non-integer value
      falls back to 85

    Returns: 202 with a JSON body containing the Celery task id to poll, or
    a JSON error with the matching status code when validation fails.
    """
    if "file" not in request.files:
        return jsonify({"error": "No file provided."}), 400

    upload = request.files["file"]
    output_format = request.form.get("format", "").lower()
    raw_quality = request.form.get("quality", "85")

    if output_format not in ALLOWED_OUTPUT_FORMATS:
        supported = ", ".join(ALLOWED_OUTPUT_FORMATS)
        return jsonify({"error": f"Invalid format. Supported: {supported}"}), 400

    try:
        quality = int(raw_quality)
    except ValueError:
        quality = 85
    else:
        quality = max(1, min(100, quality))

    try:
        original_filename, ext = validate_file(
            upload, allowed_types=ALLOWED_IMAGE_TYPES
        )
    except FileValidationError as err:
        return jsonify({"error": err.message}), err.code

    # Persist the upload under a generated path before handing off to the worker.
    task_id, input_path = generate_safe_path(ext, folder_type="upload")
    upload.save(input_path)

    job = convert_image_task.delay(
        input_path, task_id, original_filename, output_format, quality
    )

    payload = {
        "task_id": job.id,
        "message": "Image conversion started. Poll /api/tasks/{task_id}/status for progress.",
    }
    return jsonify(payload), 202
@image_bp.route("/resize", methods=["POST"])
@limiter.limit("10/minute")
def resize_image_route():
    """Queue an image resize for an uploaded file.

    Accepts: multipart/form-data with:
    - 'file': Image file
    - 'width' (optional): Target width, 1-10000
    - 'height' (optional): Target height, 1-10000
    - 'quality' (optional): 1-100, clamped into range; a non-integer value
      falls back to 85

    At least one of 'width' or 'height' must be supplied.

    Returns: 202 with a JSON body containing the Celery task id to poll, or
    a JSON error with the matching status code when validation fails.
    """
    if "file" not in request.files:
        return jsonify({"error": "No file provided."}), 400

    file = request.files["file"]
    width = request.form.get("width")
    height = request.form.get("height")
    quality = request.form.get("quality", "85")

    # Missing or empty fields mean "not specified".
    try:
        width = int(width) if width else None
        height = int(height) if height else None
    except ValueError:
        return jsonify({"error": "Width and height must be integers."}), 400

    if width is None and height is None:
        return jsonify({"error": "At least one of width or height is required."}), 400

    # Compare against None explicitly: a truthiness test would let an
    # explicit 0 skip the range check and reach the worker.
    if width is not None and not 1 <= width <= 10000:
        return jsonify({"error": "Width must be between 1 and 10000."}), 400
    if height is not None and not 1 <= height <= 10000:
        return jsonify({"error": "Height must be between 1 and 10000."}), 400

    try:
        quality = max(1, min(100, int(quality)))
    except ValueError:
        quality = 85

    try:
        original_filename, ext = validate_file(file, allowed_types=ALLOWED_IMAGE_TYPES)
    except FileValidationError as e:
        return jsonify({"error": e.message}), e.code

    # Persist the upload under a generated path before handing off to the worker.
    task_id, input_path = generate_safe_path(ext, folder_type="upload")
    file.save(input_path)

    task = resize_image_task.delay(
        input_path, task_id, original_filename, width, height, quality
    )

    return jsonify({
        "task_id": task.id,
        "message": "Image resize started. Poll /api/tasks/{task_id}/status for progress.",
    }), 202

View File

@@ -0,0 +1,39 @@
"""Task status polling endpoint."""
from flask import Blueprint, jsonify
from celery.result import AsyncResult
from app.extensions import celery
tasks_bp = Blueprint("tasks", __name__)
@tasks_bp.route("/<task_id>/status", methods=["GET"])
def get_task_status(task_id: str):
    """Report the state of an asynchronous task.

    Returns:
        JSON with ``task_id`` and ``state``, plus one of:
        - ``progress`` while the task is PENDING or PROCESSING,
        - ``result`` once it reached SUCCESS,
        - ``error`` when it ended in FAILURE.
    """
    result = AsyncResult(task_id, app=celery)

    # Read the state once: each access to ``result.state`` can query the
    # result backend, so re-reading it per branch could report one state and
    # then act on another if the task advances in between.
    state = result.state
    response = {
        "task_id": task_id,
        "state": state,
    }

    if state == "PENDING":
        response["progress"] = "Task is waiting in queue..."
    elif state == "PROCESSING":
        # "PROCESSING" is not a built-in Celery state; presumably the tasks
        # publish it with a {"step": ...} meta dict — confirm in app.tasks.
        info = result.info
        meta = info if isinstance(info, dict) else {}
        response["progress"] = meta.get("step", "Processing...")
    elif state == "SUCCESS":
        response["result"] = result.result or {}
    elif state == "FAILURE":
        failure = result.info
        response["error"] = str(failure) if failure else "Task failed."

    return jsonify(response)

View File

@@ -0,0 +1,70 @@
"""Video processing routes."""
from flask import Blueprint, request, jsonify
from app.extensions import limiter
from app.utils.file_validator import validate_file, FileValidationError
from app.utils.sanitizer import generate_safe_path
from app.tasks.video_tasks import create_gif_task
video_bp = Blueprint("video", __name__)
ALLOWED_VIDEO_TYPES = ["mp4", "webm"]
@video_bp.route("/to-gif", methods=["POST"])
@limiter.limit("5/minute")
def video_to_gif_route():
    """Queue a video clip to animated GIF conversion.

    Accepts: multipart/form-data with:
    - 'file': Video file (MP4, WebM)
    - 'start_time' (optional): Start time in seconds, >= 0 (default: 0)
    - 'duration' (optional): Duration in seconds, 0.5-15 (default: 5)
    - 'fps' (optional): Frames per second, 1-20 (default: 10)
    - 'width' (optional): Output width in pixels, 100-640 (default: 480)

    Returns: 202 with a JSON body containing the Celery task id to poll, or
    a JSON error with the matching status code when validation fails.
    """
    import math  # local import: only needed for the finiteness check below

    if "file" not in request.files:
        return jsonify({"error": "No file provided."}), 400

    file = request.files["file"]

    # Parse and validate parameters
    try:
        start_time = float(request.form.get("start_time", 0))
        duration = float(request.form.get("duration", 5))
        fps = int(request.form.get("fps", 10))
        width = int(request.form.get("width", 480))
    except (ValueError, TypeError):
        return jsonify({"error": "Invalid parameters. Must be numeric."}), 400

    # float() parses "nan", "inf" and "infinity"; every comparison against
    # NaN is False, so such values would pass all range checks below.
    if not (math.isfinite(start_time) and math.isfinite(duration)):
        return jsonify({"error": "Invalid parameters. Must be numeric."}), 400

    # Enforce limits
    if start_time < 0:
        return jsonify({"error": "Start time cannot be negative."}), 400
    # The lower bound now matches the range stated in the error message.
    if duration < 0.5 or duration > 15:
        return jsonify({"error": "Duration must be between 0.5 and 15 seconds."}), 400
    if fps < 1 or fps > 20:
        return jsonify({"error": "FPS must be between 1 and 20."}), 400
    if width < 100 or width > 640:
        return jsonify({"error": "Width must be between 100 and 640 pixels."}), 400

    try:
        original_filename, ext = validate_file(file, allowed_types=ALLOWED_VIDEO_TYPES)
    except FileValidationError as e:
        return jsonify({"error": e.message}), e.code

    # Persist the upload under a generated path before handing off to the worker.
    task_id, input_path = generate_safe_path(ext, folder_type="upload")
    file.save(input_path)

    task = create_gif_task.delay(
        input_path, task_id, original_filename,
        start_time, duration, fps, width,
    )

    return jsonify({
        "task_id": task.id,
        "message": "GIF creation started. Poll /api/tasks/{task_id}/status for progress.",
    }), 202

View File

@@ -0,0 +1 @@
"""Backend application services."""

View File

@@ -0,0 +1,109 @@
"""PDF compression service using Ghostscript."""
import os
import subprocess
import logging
logger = logging.getLogger(__name__)
class PDFCompressionError(Exception):
    """Raised when a PDF cannot be compressed."""
# Ghostscript quality presets: maps the public quality names accepted by
# compress_pdf() to the value passed to Ghostscript's -dPDFSETTINGS option.
QUALITY_PRESETS = {
    "low": "/screen", # 72 dpi — smallest file, lowest quality
    "medium": "/ebook", # 150 dpi — good balance (default)
    "high": "/printer", # 300 dpi — high quality, moderate compression
}
def compress_pdf(
    input_path: str, output_path: str, quality: str = "medium"
) -> dict:
    """
    Compress a PDF file using Ghostscript.

    If Ghostscript's output is not smaller than the input, the original file
    is copied to ``output_path`` instead and a 0% reduction is reported.

    Args:
        input_path: Path to the input PDF file
        output_path: Path for the compressed output file
        quality: Compression quality — "low", "medium", or "high"; unknown
            values fall back to "medium"

    Returns:
        dict with original_size, compressed_size, reduction_percent

    Raises:
        PDFCompressionError: If the input cannot be read, Ghostscript is
            missing, times out, fails, or produces no output
    """
    gs_quality = QUALITY_PRESETS.get(quality, QUALITY_PRESETS["medium"])

    # Measure the input up front so a missing upload is reported as such and
    # not mistaken for a missing Ghostscript binary (both raise
    # FileNotFoundError).
    try:
        original_size = os.path.getsize(input_path)
    except OSError as exc:
        raise PDFCompressionError("Input file not found or unreadable.") from exc

    # os.makedirs("") raises, so only create a directory when there is one.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    cmd = [
        "gs",
        "-sDEVICE=pdfwrite",
        "-dSAFER",  # uploaded PDFs are untrusted: keep file access restricted
        "-dCompatibilityLevel=1.4",
        f"-dPDFSETTINGS={gs_quality}",
        "-dNOPAUSE",
        "-dQUIET",
        "-dBATCH",
        # NOTE(review): these fixed 150 dpi values are passed for every
        # preset and may override the preset's own image resolution — confirm
        # that is intended for "low" and "high".
        "-dColorImageResolution=150",
        "-dGrayImageResolution=150",
        "-dMonoImageResolution=150",
        f"-sOutputFile={output_path}",
        input_path,
    ]

    try:
        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            timeout=120,
        )
    except subprocess.TimeoutExpired as exc:
        raise PDFCompressionError(
            "Compression timed out. File may be too large."
        ) from exc
    except FileNotFoundError as exc:
        # Raised by subprocess when the "gs" executable cannot be found.
        raise PDFCompressionError(
            "Ghostscript is not installed on the server."
        ) from exc

    if result.returncode != 0:
        logger.error(f"Ghostscript compression failed: {result.stderr}")
        raise PDFCompressionError(
            f"Compression failed: {result.stderr or 'Unknown error'}"
        )

    if not os.path.exists(output_path):
        raise PDFCompressionError("Compressed file was not created.")

    compressed_size = os.path.getsize(output_path)

    # If the compressed file is not smaller, keep the original instead.
    if compressed_size >= original_size:
        import shutil

        shutil.copy2(input_path, output_path)
        compressed_size = original_size

    reduction = (
        ((original_size - compressed_size) / original_size) * 100
        if original_size > 0
        else 0
    )

    logger.info(
        "PDF compression: %d → %d bytes (%.1f%% reduction)",
        original_size,
        compressed_size,
        reduction,
    )

    return {
        "original_size": original_size,
        "compressed_size": compressed_size,
        "reduction_percent": round(reduction, 1),
    }

View File

@@ -0,0 +1,169 @@
"""Image processing service using Pillow."""
import os
import logging
from PIL import Image
logger = logging.getLogger(__name__)
class ImageProcessingError(Exception):
    """Raised when an image cannot be converted or resized."""
# Supported format mappings: lowercase format name / file extension → the
# format name handed to Pillow's Image.save(format=...).
FORMAT_MAP = {
    "jpg": "JPEG",
    "jpeg": "JPEG",
    "png": "PNG",
    "webp": "WEBP",
}
def convert_image(
    input_path: str,
    output_path: str,
    output_format: str,
    quality: int = 85,
) -> dict:
    """
    Convert an image to a different format.

    Args:
        input_path: Path to the input image
        output_path: Path for the output image
        output_format: Target format ("jpg", "jpeg", "png", "webp"),
            case-insensitive
        quality: Output quality 1-100 (used for JPEG and WebP only; clamped
            into range)

    Returns:
        dict with original_size, converted_size, width, height, format

    Raises:
        ImageProcessingError: If the format is unsupported or conversion fails
    """
    output_format = output_format.lower()
    if output_format not in FORMAT_MAP:
        raise ImageProcessingError(
            f"Unsupported output format: {output_format}. "
            f"Supported: {', '.join(FORMAT_MAP.keys())}"
        )

    pil_format = FORMAT_MAP[output_format]

    # os.makedirs("") raises, so only create a directory when there is one.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    try:
        original_size = os.path.getsize(input_path)

        # Open and re-encode the image rather than copying bytes through.
        with Image.open(input_path) as img:
            # JPEG has no alpha channel: flatten transparent or palette
            # images onto a white background first.
            if pil_format == "JPEG" and img.mode in ("RGBA", "P", "LA"):
                background = Image.new("RGB", img.size, (255, 255, 255))
                if img.mode == "P":
                    img = img.convert("RGBA")
                background.paste(
                    img, mask=img.split()[-1] if "A" in img.mode else None
                )
                img = background

            width, height = img.size

            save_kwargs = {"optimize": True}
            if pil_format in ("JPEG", "WEBP"):
                save_kwargs["quality"] = max(1, min(100, quality))

            img.save(output_path, format=pil_format, **save_kwargs)

        converted_size = os.path.getsize(output_path)

        logger.info(
            "Image conversion: %s → %s (%d → %d bytes)",
            input_path,
            output_format,
            original_size,
            converted_size,
        )

        return {
            "original_size": original_size,
            "converted_size": converted_size,
            "width": width,
            "height": height,
            "format": output_format,
        }
    except (OSError, Image.DecompressionBombError) as e:
        # Chain the cause so the underlying Pillow/OS error stays in tracebacks.
        raise ImageProcessingError(f"Image processing failed: {str(e)}") from e
def resize_image(
    input_path: str,
    output_path: str,
    width: int | None = None,
    height: int | None = None,
    quality: int = 85,
) -> dict:
    """
    Resize an image.

    When only one of ``width`` / ``height`` is given, the other is derived
    from the original aspect ratio. When both are given the image is resized
    to exactly that size, which may change the aspect ratio. The output
    format is taken from the extension of ``output_path`` (PNG if unknown).

    Args:
        input_path: Path to the input image
        output_path: Path for the resized image
        width: Target width (None or 0 to auto-calculate from height)
        height: Target height (None or 0 to auto-calculate from width)
        quality: Output quality 1-100 (used for JPEG and WebP only; clamped
            into range)

    Returns:
        dict with original_width, original_height, new_width, new_height

    Raises:
        ImageProcessingError: If no usable dimension is given, a dimension is
            negative, or the resize fails
    """
    # 0 is treated as "not specified", as it already was when the other
    # dimension was given; two missing dimensions is now always an error.
    width = width or None
    height = height or None

    if width is None and height is None:
        raise ImageProcessingError("At least one of width or height must be specified.")
    if (width is not None and width < 0) or (height is not None and height < 0):
        raise ImageProcessingError("Width and height must be positive.")

    # os.makedirs("") raises, so only create a directory when there is one.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    try:
        with Image.open(input_path) as img:
            orig_width, orig_height = img.size

            # Derive the missing dimension from the aspect ratio. Clamp to at
            # least 1 px: extreme ratios would otherwise truncate to 0.
            if height is None:
                height = max(1, int(orig_height * (width / orig_width)))
            elif width is None:
                width = max(1, int(orig_width * (height / orig_height)))

            # Resize using high-quality resampling
            resized = img.resize((width, height), Image.Resampling.LANCZOS)

            # Detect format from output extension
            ext = os.path.splitext(output_path)[1].lower().strip(".")
            pil_format = FORMAT_MAP.get(ext, "PNG")

            save_kwargs = {"optimize": True}
            if pil_format in ("JPEG", "WEBP"):
                save_kwargs["quality"] = max(1, min(100, quality))

            # Only JPEG lacks an alpha channel. WebP keeps transparency, so
            # it is no longer flattened (this matches convert_image).
            if pil_format == "JPEG" and resized.mode in ("RGBA", "P", "LA"):
                background = Image.new("RGB", resized.size, (255, 255, 255))
                if resized.mode == "P":
                    resized = resized.convert("RGBA")
                background.paste(
                    resized, mask=resized.split()[-1] if "A" in resized.mode else None
                )
                resized = background

            resized.save(output_path, format=pil_format, **save_kwargs)

        return {
            "original_width": orig_width,
            "original_height": orig_height,
            "new_width": width,
            "new_height": height,
        }
    except (OSError, Image.DecompressionBombError) as e:
        # Chain the cause so the underlying Pillow/OS error stays in tracebacks.
        raise ImageProcessingError(f"Image resize failed: {str(e)}") from e

View File

@@ -0,0 +1,170 @@
"""PDF conversion service using LibreOffice headless."""
import os
import subprocess
import logging
import tempfile
logger = logging.getLogger(__name__)
class PDFConversionError(Exception):
    """Raised when a LibreOffice document conversion fails."""
def pdf_to_word(input_path: str, output_dir: str) -> str:
    """
    Convert a PDF to DOCX by running LibreOffice in headless mode.

    Args:
        input_path: Path to the input PDF file
        output_dir: Directory that receives the converted file

    Returns:
        Path to the DOCX file: ``<output_dir>/<input basename>.docx``

    Raises:
        PDFConversionError: If LibreOffice is missing, times out, fails, or
            produces no output file
    """
    os.makedirs(output_dir, exist_ok=True)

    # Each run gets its own throwaway LibreOffice profile so concurrent
    # conversions do not fight over a shared profile lock.
    profile_dir = tempfile.mkdtemp(prefix="lo_pdf2word_")

    soffice_args = [
        "soffice",
        "--headless",
        "--norestore",
        f"-env:UserInstallation=file://{profile_dir}",
        "--infilter=writer_pdf_import",
        "--convert-to", "docx",
        "--outdir", output_dir,
        input_path,
    ]

    try:
        logger.info(f"Running LibreOffice PDF→Word: {' '.join(soffice_args)}")
        proc = subprocess.run(
            soffice_args,
            capture_output=True,
            text=True,
            timeout=120,  # seconds
            env={**os.environ, "HOME": profile_dir},
        )
        logger.info(f"LibreOffice stdout: {proc.stdout}")
        logger.info(f"LibreOffice stderr: {proc.stderr}")
        logger.info(f"LibreOffice returncode: {proc.returncode}")

        # LibreOffice derives the output name from the input file name.
        stem, _ = os.path.splitext(os.path.basename(input_path))
        docx_path = os.path.join(output_dir, f"{stem}.docx")

        # Trust a non-empty output file over the exit status: LibreOffice can
        # exit non-zero on harmless warnings (e.g. javaldx) yet still succeed.
        if os.path.exists(docx_path) and os.path.getsize(docx_path) > 0:
            logger.info(f"PDF→Word conversion successful: {docx_path}")
            return docx_path

        if proc.returncode == 0:
            # Clean exit but nothing was written: log what is there instead.
            leftovers = os.listdir(output_dir) if os.path.exists(output_dir) else []
            logger.error(
                f"Expected output not found at {docx_path}. "
                f"Files in output dir: {leftovers}"
            )
            raise PDFConversionError("Output file was not created.")

        # Genuine failure: report stderr without the known harmless warning.
        raw_stderr = proc.stderr or ""
        meaningful = [
            entry
            for entry in raw_stderr.strip().splitlines()
            if not entry.startswith("Warning: failed to launch javaldx")
        ]
        error_msg = "\n".join(meaningful) if meaningful else raw_stderr
        logger.error(f"LibreOffice PDF→Word failed: {error_msg}")
        raise PDFConversionError(
            f"Conversion failed: {error_msg or 'Unknown error'}"
        )
    except subprocess.TimeoutExpired:
        raise PDFConversionError("Conversion timed out. File may be too large.")
    except FileNotFoundError:
        raise PDFConversionError("LibreOffice is not installed on the server.")
    finally:
        # The temporary profile is removed whatever the outcome.
        import shutil

        shutil.rmtree(profile_dir, ignore_errors=True)
def word_to_pdf(input_path: str, output_dir: str) -> str:
    """
    Convert a Word (DOC/DOCX) file to PDF by running LibreOffice headless.

    Args:
        input_path: Path to the input Word file
        output_dir: Directory that receives the converted file

    Returns:
        Path to the PDF file: ``<output_dir>/<input basename>.pdf``

    Raises:
        PDFConversionError: If LibreOffice is missing, times out, fails, or
            produces no output file
    """
    os.makedirs(output_dir, exist_ok=True)

    # Each run gets its own throwaway LibreOffice profile so concurrent
    # conversions do not fight over a shared profile lock.
    profile_dir = tempfile.mkdtemp(prefix="lo_word2pdf_")

    soffice_args = [
        "soffice",
        "--headless",
        "--norestore",
        f"-env:UserInstallation=file://{profile_dir}",
        "--convert-to", "pdf",
        "--outdir", output_dir,
        input_path,
    ]

    try:
        proc = subprocess.run(
            soffice_args,
            capture_output=True,
            text=True,
            timeout=120,  # seconds
            env={**os.environ, "HOME": profile_dir},
        )

        # LibreOffice derives the output name from the input file name.
        stem, _ = os.path.splitext(os.path.basename(input_path))
        pdf_path = os.path.join(output_dir, f"{stem}.pdf")

        # Trust a non-empty output file over the exit status: LibreOffice can
        # exit non-zero on harmless warnings (e.g. javaldx) yet still succeed.
        if os.path.exists(pdf_path) and os.path.getsize(pdf_path) > 0:
            logger.info(f"Word→PDF conversion successful: {pdf_path}")
            return pdf_path

        if proc.returncode == 0:
            raise PDFConversionError("Output file was not created.")

        # Genuine failure: report stderr without the known harmless warning.
        raw_stderr = proc.stderr or ""
        meaningful = [
            entry
            for entry in raw_stderr.strip().splitlines()
            if not entry.startswith("Warning: failed to launch javaldx")
        ]
        error_msg = "\n".join(meaningful) if meaningful else raw_stderr
        logger.error(f"LibreOffice Word→PDF failed: {error_msg}")
        raise PDFConversionError(
            f"Conversion failed: {error_msg or 'Unknown error'}"
        )
    except subprocess.TimeoutExpired:
        raise PDFConversionError("Conversion timed out. File may be too large.")
    except FileNotFoundError:
        raise PDFConversionError("LibreOffice is not installed on the server.")
    finally:
        # The temporary profile is removed whatever the outcome.
        import shutil

        shutil.rmtree(profile_dir, ignore_errors=True)

View File

@@ -0,0 +1,154 @@
"""Storage service — S3 in production, local files in development."""
import os
import shutil
import logging
from flask import current_app
logger = logging.getLogger(__name__)
def _is_s3_configured() -> bool:
    """Return True when both AWS credentials are set and non-blank."""
    settings = current_app.config
    access_key = settings.get("AWS_ACCESS_KEY_ID")
    secret_key = settings.get("AWS_SECRET_ACCESS_KEY")

    if not (access_key and secret_key):
        return False
    # Whitespace-only values count as "not configured".
    return bool(access_key.strip() and secret_key.strip())
class StorageService:
"""Handle file storage — uses S3 when configured, local filesystem otherwise."""
def __init__(self):
self._client = None
@property
def use_s3(self) -> bool:
return _is_s3_configured()
@property
def client(self):
"""Lazy-initialize S3 client (only when S3 is configured)."""
if self._client is None:
import boto3
self._client = boto3.client(
"s3",
region_name=current_app.config["AWS_S3_REGION"],
aws_access_key_id=current_app.config["AWS_ACCESS_KEY_ID"],
aws_secret_access_key=current_app.config["AWS_SECRET_ACCESS_KEY"],
)
return self._client
@property
def bucket(self):
return current_app.config["AWS_S3_BUCKET"]
def upload_file(self, local_path: str, task_id: str, folder: str = "outputs") -> str:
"""
Upload / store a file.
In S3 mode: uploads to S3 bucket.
In local mode: copies file to the outputs directory.
Returns:
S3 key or local relative path (used as identifier)
"""
filename = os.path.basename(local_path)
key = f"{folder}/{task_id}/{filename}"
if self.use_s3:
from botocore.exceptions import ClientError
try:
self.client.upload_file(local_path, self.bucket, key)
return key
except ClientError as e:
raise RuntimeError(f"Failed to upload file to S3: {e}")
else:
# Local mode — keep file in the outputs directory
output_dir = current_app.config["OUTPUT_FOLDER"]
dest_dir = os.path.join(output_dir, task_id)
os.makedirs(dest_dir, exist_ok=True)
dest_path = os.path.join(dest_dir, filename)
if os.path.abspath(local_path) != os.path.abspath(dest_path):
shutil.copy2(local_path, dest_path)
logger.info(f"[Local] Stored file: {dest_path}")
return key
def generate_presigned_url(
self, s3_key: str, expiry: int | None = None, original_filename: str | None = None
) -> str:
"""
Generate a download URL.
S3 mode: presigned URL.
Local mode: /api/download/<task_id>/<filename>
"""
if self.use_s3:
from botocore.exceptions import ClientError
if expiry is None:
expiry = current_app.config.get("FILE_EXPIRY_SECONDS", 1800)
params = {
"Bucket": self.bucket,
"Key": s3_key,
}
if original_filename:
params["ResponseContentDisposition"] = (
f'attachment; filename="{original_filename}"'
)
try:
url = self.client.generate_presigned_url(
"get_object",
Params=params,
ExpiresIn=expiry,
)
return url
except ClientError as e:
raise RuntimeError(f"Failed to generate presigned URL: {e}")
else:
# Local mode — return path to Flask download route
parts = s3_key.strip("/").split("/")
# key = "outputs/<task_id>/<filename>"
if len(parts) >= 3:
task_id = parts[1]
filename = parts[2]
else:
task_id = parts[0]
filename = parts[-1]
download_name = original_filename or filename
return f"/api/download/{task_id}/{filename}?name={download_name}"
def delete_file(self, s3_key: str):
"""Delete a file from S3 (no-op in local mode)."""
if self.use_s3:
from botocore.exceptions import ClientError
try:
self.client.delete_object(Bucket=self.bucket, Key=s3_key)
except ClientError:
pass
def file_exists(self, s3_key: str) -> bool:
"""Check if a file exists."""
if self.use_s3:
from botocore.exceptions import ClientError
try:
self.client.head_object(Bucket=self.bucket, Key=s3_key)
return True
except ClientError:
return False
else:
parts = s3_key.strip("/").split("/")
if len(parts) >= 3:
task_id = parts[1]
filename = parts[2]
else:
task_id = parts[0]
filename = parts[-1]
output_dir = current_app.config["OUTPUT_FOLDER"]
return os.path.isfile(os.path.join(output_dir, task_id, filename))
# Singleton instance
storage = StorageService()

View File

@@ -0,0 +1,176 @@
"""Video to GIF conversion service using ffmpeg."""
import os
import re
import subprocess
import logging
logger = logging.getLogger(__name__)
class VideoProcessingError(Exception):
    """Raised when a video cannot be processed or converted."""
# Safety constraints
# video_to_gif clamps its numeric arguments to the MAX_* bounds below and
# uses the DEFAULT_* values when the caller does not pass fps/width.
MAX_DURATION = 15 # seconds
MAX_WIDTH = 640 # pixels
MAX_FPS = 20  # frames per second
DEFAULT_FPS = 10  # frames per second
DEFAULT_WIDTH = 480  # pixels
def _run_ffmpeg(cmd: list[str], timeout: int, step: str, error_message: str) -> None:
    """Run one ffmpeg command; log stderr and raise VideoProcessingError on a non-zero exit."""
    result = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout)
    if result.returncode != 0:
        logger.error(f"ffmpeg {step} failed: {result.stderr}")
        raise VideoProcessingError(error_message)


def video_to_gif(
    input_path: str,
    output_path: str,
    start_time: float = 0,
    duration: float = 5,
    fps: int = DEFAULT_FPS,
    width: int = DEFAULT_WIDTH,
) -> dict:
    """
    Convert a video clip to an animated GIF using ffmpeg.

    Uses two ffmpeg passes: the first builds an optimized colour palette,
    the second renders the GIF with that palette. The temporary palette
    file is always removed afterwards.

    Args:
        input_path: Path to the input video (MP4/WebM)
        output_path: Path for the output GIF
        start_time: Start time in seconds (negative values become 0)
        duration: Duration in seconds (clamped to 0.5..MAX_DURATION)
        fps: Frames per second (clamped to 1..MAX_FPS)
        width: Output width in pixels (clamped to 100..MAX_WIDTH)

    Returns:
        dict with output_size, duration, fps, width, height

    Raises:
        VideoProcessingError: If conversion fails, times out, or ffmpeg
            is not available
    """
    # Sanitize numeric parameters (prevent injection)
    start_time = max(0, float(start_time))
    duration = max(0.5, min(MAX_DURATION, float(duration)))
    fps = max(1, min(MAX_FPS, int(fps)))
    width = max(100, min(MAX_WIDTH, int(width)))

    # A bare filename has an empty dirname, and os.makedirs("") raises.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Two-pass palette approach for high-quality GIF
    palette_path = output_path + ".palette.png"
    clip_args = ["-ss", str(start_time), "-t", str(duration), "-i", input_path]
    scale_filter = f"fps={fps},scale={width}:-1:flags=lanczos"

    try:
        # Pass 1: Generate optimized palette
        _run_ffmpeg(
            [
                "ffmpeg",
                "-y",
                *clip_args,
                "-vf", f"{scale_filter},palettegen=stats_mode=diff",
                palette_path,
            ],
            timeout=60,
            step="palette generation",
            error_message="Failed to process video for GIF creation.",
        )

        # Pass 2: Create GIF using palette
        _run_ffmpeg(
            [
                "ffmpeg",
                "-y",
                *clip_args,
                "-i", palette_path,
                "-lavfi", f"{scale_filter} [x]; [x][1:v] paletteuse=dither=bayer:bayer_scale=5",
                output_path,
            ],
            timeout=120,
            step="GIF creation",
            error_message="Failed to create GIF from video.",
        )

        if not os.path.exists(output_path):
            raise VideoProcessingError("GIF file was not created.")

        output_size = os.path.getsize(output_path)
        # Get actual output dimensions
        actual_width, actual_height = _get_gif_dimensions(output_path)

        logger.info(
            f"Video→GIF: {input_path} → {output_path} "
            f"({output_size} bytes, {duration}s, {fps}fps, {actual_width}x{actual_height})"
        )
        return {
            "output_size": output_size,
            "duration": duration,
            "fps": fps,
            "width": actual_width,
            "height": actual_height,
        }
    except subprocess.TimeoutExpired as e:
        raise VideoProcessingError(
            "GIF creation timed out. Video may be too large."
        ) from e
    except FileNotFoundError as e:
        # subprocess raises FileNotFoundError when the ffmpeg binary is missing
        raise VideoProcessingError("ffmpeg is not installed on the server.") from e
    finally:
        # Cleanup palette file
        if os.path.exists(palette_path):
            os.remove(palette_path)
def get_video_duration(input_path: str) -> float:
    """
    Get the duration of a video file in seconds.

    Args:
        input_path: Path to the video file to probe

    Returns:
        Duration in seconds as printed by ffprobe, or 0.0 when it cannot
        be determined (ffprobe missing, timed out, or no numeric output)
    """
    cmd = [
        "ffprobe",
        "-v", "error",
        "-show_entries", "format=duration",
        "-of", "default=noprint_wrappers=1:nokey=1",
        input_path,
    ]
    try:
        result = subprocess.run(
            cmd, capture_output=True, text=True, timeout=10
        )
        return float(result.stdout.strip())
    except (subprocess.TimeoutExpired, ValueError, OSError):
        # OSError covers a missing or non-executable ffprobe binary, so the
        # "0.0 on failure" contract holds in that case as well.
        return 0.0
def _get_gif_dimensions(gif_path: str) -> tuple[int, int]:
"""Get GIF dimensions using ffprobe."""
cmd = [
"ffprobe",
"-v", "error",
"-select_streams", "v:0",
"-show_entries", "stream=width,height",
"-of", "csv=p=0",
gif_path,
]
try:
result = subprocess.run(
cmd, capture_output=True, text=True, timeout=10
)
parts = result.stdout.strip().split(",")
if len(parts) == 2:
return int(parts[0]), int(parts[1])
except (subprocess.TimeoutExpired, ValueError):
pass
return 0, 0

View File

@@ -0,0 +1 @@
"""Celery tasks for async file processing."""

View File

@@ -0,0 +1,88 @@
"""Celery tasks for PDF compression."""
import os
import logging
from app.extensions import celery
from app.services.compress_service import compress_pdf, PDFCompressionError
from app.services.storage_service import storage
from app.utils.sanitizer import cleanup_task_files
def _cleanup(task_id: str):
    """Remove the task's temporary files; outputs are kept when storage is not S3."""
    keep_local_outputs = not storage.use_s3
    cleanup_task_files(task_id, keep_outputs=keep_local_outputs)
logger = logging.getLogger(__name__)
@celery.task(bind=True, name="app.tasks.compress_tasks.compress_pdf_task")
def compress_pdf_task(
    self,
    input_path: str,
    task_id: str,
    original_filename: str,
    quality: str = "medium",
):
    """
    Async task: Compress a PDF file.

    Failures are reported through the returned dict rather than raised.
    Temporary files are cleaned up on every exit path.

    Args:
        input_path: Path to the uploaded PDF file
        task_id: Unique task identifier
        original_filename: Original filename for download
        quality: Compression quality ("low", "medium", "high")

    Returns:
        dict with download_url, compression stats, and file info on
        success, or {"status": "failed", "error": ...} on failure
    """
    # NOTE(review): this path is hard-coded while cleanup_task_files reads
    # OUTPUT_FOLDER from the app config — confirm the two always match.
    output_dir = os.path.join("/tmp/outputs", task_id)
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, f"{task_id}.pdf")

    try:
        self.update_state(
            state="PROCESSING",
            meta={"step": f"Compressing PDF ({quality} quality)..."},
        )
        stats = compress_pdf(input_path, output_path, quality)

        self.update_state(state="PROCESSING", meta={"step": "Uploading result..."})
        s3_key = storage.upload_file(output_path, task_id, folder="outputs")

        # Generate download filename
        name_without_ext = os.path.splitext(original_filename)[0]
        download_name = f"{name_without_ext}_compressed.pdf"
        download_url = storage.generate_presigned_url(
            s3_key, original_filename=download_name
        )

        result = {
            "status": "completed",
            "download_url": download_url,
            "filename": download_name,
            "original_size": stats["original_size"],
            "compressed_size": stats["compressed_size"],
            "reduction_percent": stats["reduction_percent"],
        }
        logger.info(
            f"Task {task_id}: PDF compression completed — "
            f"{stats['reduction_percent']}% reduction"
        )
        return result
    except PDFCompressionError as e:
        logger.error(f"Task {task_id}: Compression error — {e}")
        return {"status": "failed", "error": str(e)}
    except Exception as e:
        # logger.exception keeps the traceback; the caller only sees a
        # generic message.
        logger.exception(f"Task {task_id}: Unexpected error — {e}")
        return {"status": "failed", "error": "An unexpected error occurred."}
    finally:
        # Single cleanup point for success and both failure paths
        _cleanup(task_id)

View File

@@ -0,0 +1,128 @@
"""Celery tasks for PDF conversion (PDF↔Word)."""
import os
import logging
from app.extensions import celery
from app.services.pdf_service import pdf_to_word, word_to_pdf, PDFConversionError
from app.services.storage_service import storage
from app.utils.sanitizer import cleanup_task_files
def _cleanup(task_id: str):
    """Remove the task's temporary files; outputs are kept when storage is not S3."""
    keep_local_outputs = not storage.use_s3
    cleanup_task_files(task_id, keep_outputs=keep_local_outputs)
logger = logging.getLogger(__name__)
@celery.task(bind=True, name="app.tasks.convert_tasks.convert_pdf_to_word")
def convert_pdf_to_word(self, input_path: str, task_id: str, original_filename: str):
    """
    Async task: Convert PDF to Word document.

    Failures are reported through the returned dict rather than raised.
    Temporary files are cleaned up on every exit path.

    Args:
        input_path: Path to the uploaded PDF file
        task_id: Unique task identifier
        original_filename: Original filename for download

    Returns:
        dict with download_url and file info on success, or
        {"status": "failed", "error": ...} on failure
    """
    output_dir = os.path.join("/tmp/outputs", task_id)

    try:
        self.update_state(state="PROCESSING", meta={"step": "Converting PDF to Word..."})
        output_path = pdf_to_word(input_path, output_dir)

        self.update_state(state="PROCESSING", meta={"step": "Uploading result..."})
        s3_key = storage.upload_file(output_path, task_id, folder="outputs")

        # Generate download filename
        name_without_ext = os.path.splitext(original_filename)[0]
        download_name = f"{name_without_ext}.docx"
        download_url = storage.generate_presigned_url(
            s3_key, original_filename=download_name
        )

        # The size is read here, before the finally block removes local files.
        result = {
            "status": "completed",
            "download_url": download_url,
            "filename": download_name,
            "output_size": os.path.getsize(output_path),
        }
        logger.info(f"Task {task_id}: PDF→Word conversion completed")
        return result
    except PDFConversionError as e:
        logger.error(f"Task {task_id}: Conversion error — {e}")
        return {"status": "failed", "error": str(e)}
    except Exception as e:
        # logger.exception keeps the traceback; the caller only sees a
        # generic message.
        logger.exception(f"Task {task_id}: Unexpected error — {e}")
        return {"status": "failed", "error": "An unexpected error occurred."}
    finally:
        # Single cleanup point for success and both failure paths
        _cleanup(task_id)
@celery.task(bind=True, name="app.tasks.convert_tasks.convert_word_to_pdf")
def convert_word_to_pdf(self, input_path: str, task_id: str, original_filename: str):
    """
    Async task: Convert Word document to PDF.

    Failures are reported through the returned dict rather than raised.
    Temporary files are cleaned up on every exit path.

    Args:
        input_path: Path to the uploaded Word file
        task_id: Unique task identifier
        original_filename: Original filename for download

    Returns:
        dict with download_url and file info on success, or
        {"status": "failed", "error": ...} on failure
    """
    output_dir = os.path.join("/tmp/outputs", task_id)

    try:
        self.update_state(state="PROCESSING", meta={"step": "Converting Word to PDF..."})
        output_path = word_to_pdf(input_path, output_dir)

        self.update_state(state="PROCESSING", meta={"step": "Uploading result..."})
        s3_key = storage.upload_file(output_path, task_id, folder="outputs")

        name_without_ext = os.path.splitext(original_filename)[0]
        download_name = f"{name_without_ext}.pdf"
        download_url = storage.generate_presigned_url(
            s3_key, original_filename=download_name
        )

        # The size is read here, before the finally block removes local files.
        result = {
            "status": "completed",
            "download_url": download_url,
            "filename": download_name,
            "output_size": os.path.getsize(output_path),
        }
        logger.info(f"Task {task_id}: Word→PDF conversion completed")
        return result
    except PDFConversionError as e:
        logger.error(f"Task {task_id}: Conversion error — {e}")
        return {"status": "failed", "error": str(e)}
    except Exception as e:
        # logger.exception keeps the traceback; the caller only sees a
        # generic message.
        logger.exception(f"Task {task_id}: Unexpected error — {e}")
        return {"status": "failed", "error": "An unexpected error occurred."}
    finally:
        # Single cleanup point for success and both failure paths
        _cleanup(task_id)

View File

@@ -0,0 +1,160 @@
"""Celery tasks for image processing."""
import os
import logging
from app.extensions import celery
from app.services.image_service import convert_image, resize_image, ImageProcessingError
from app.services.storage_service import storage
from app.utils.sanitizer import cleanup_task_files
def _cleanup(task_id: str):
    """Remove the task's temporary files; outputs are kept when storage is not S3."""
    keep_local_outputs = not storage.use_s3
    cleanup_task_files(task_id, keep_outputs=keep_local_outputs)
logger = logging.getLogger(__name__)
@celery.task(bind=True, name="app.tasks.image_tasks.convert_image_task")
def convert_image_task(
    self,
    input_path: str,
    task_id: str,
    original_filename: str,
    output_format: str,
    quality: int = 85,
):
    """
    Async task: Convert an image to a different format.

    Failures are reported through the returned dict rather than raised.
    Temporary files are cleaned up on every exit path.

    Args:
        input_path: Path to the uploaded image
        task_id: Unique task identifier
        original_filename: Original filename for download
        output_format: Target format ("jpg", "png", "webp")
        quality: Output quality 1-100

    Returns:
        dict with download_url and conversion stats on success, or
        {"status": "failed", "error": ...} on failure
    """
    output_dir = os.path.join("/tmp/outputs", task_id)
    os.makedirs(output_dir, exist_ok=True)
    # NOTE(review): output_format becomes part of the file path; presumably
    # the caller restricts it to known formats — confirm.
    output_path = os.path.join(output_dir, f"{task_id}.{output_format}")

    try:
        self.update_state(
            state="PROCESSING",
            meta={"step": f"Converting image to {output_format.upper()}..."},
        )
        stats = convert_image(input_path, output_path, output_format, quality)

        self.update_state(state="PROCESSING", meta={"step": "Uploading result..."})
        s3_key = storage.upload_file(output_path, task_id, folder="outputs")

        name_without_ext = os.path.splitext(original_filename)[0]
        download_name = f"{name_without_ext}.{output_format}"
        download_url = storage.generate_presigned_url(
            s3_key, original_filename=download_name
        )

        result = {
            "status": "completed",
            "download_url": download_url,
            "filename": download_name,
            "original_size": stats["original_size"],
            "converted_size": stats["converted_size"],
            "width": stats["width"],
            "height": stats["height"],
            "format": stats["format"],
        }
        logger.info(f"Task {task_id}: Image conversion to {output_format} completed")
        return result
    except ImageProcessingError as e:
        logger.error(f"Task {task_id}: Image error — {e}")
        return {"status": "failed", "error": str(e)}
    except Exception as e:
        # logger.exception keeps the traceback; the caller only sees a
        # generic message.
        logger.exception(f"Task {task_id}: Unexpected error — {e}")
        return {"status": "failed", "error": "An unexpected error occurred."}
    finally:
        # Single cleanup point for success and both failure paths
        _cleanup(task_id)
@celery.task(bind=True, name="app.tasks.image_tasks.resize_image_task")
def resize_image_task(
    self,
    input_path: str,
    task_id: str,
    original_filename: str,
    width: int | None = None,
    height: int | None = None,
    quality: int = 85,
):
    """
    Async task: Resize an image.

    The output keeps the extension of the original filename. Failures are
    reported through the returned dict rather than raised. Temporary files
    are cleaned up on every exit path.

    Args:
        input_path: Path to the uploaded image
        task_id: Unique task identifier
        original_filename: Original filename for download
        width: Target width
        height: Target height
        quality: Output quality 1-100

    Returns:
        dict with download_url and resize info on success, or
        {"status": "failed", "error": ...} on failure
    """
    # NOTE(review): if original_filename has no extension, ext is "" and the
    # output/download names end with a bare dot — confirm callers always
    # pass a filename with an extension.
    ext = os.path.splitext(original_filename)[1].lstrip(".")
    output_dir = os.path.join("/tmp/outputs", task_id)
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, f"{task_id}.{ext}")

    try:
        self.update_state(
            state="PROCESSING",
            meta={"step": "Resizing image..."},
        )
        stats = resize_image(input_path, output_path, width, height, quality)

        self.update_state(state="PROCESSING", meta={"step": "Uploading result..."})
        s3_key = storage.upload_file(output_path, task_id, folder="outputs")

        name_without_ext = os.path.splitext(original_filename)[0]
        download_name = f"{name_without_ext}_resized.{ext}"
        download_url = storage.generate_presigned_url(
            s3_key, original_filename=download_name
        )

        result = {
            "status": "completed",
            "download_url": download_url,
            "filename": download_name,
            "original_width": stats["original_width"],
            "original_height": stats["original_height"],
            "new_width": stats["new_width"],
            "new_height": stats["new_height"],
        }
        logger.info(f"Task {task_id}: Image resize completed")
        return result
    except ImageProcessingError as e:
        logger.error(f"Task {task_id}: Image error — {e}")
        return {"status": "failed", "error": str(e)}
    except Exception as e:
        # logger.exception keeps the traceback; the caller only sees a
        # generic message.
        logger.exception(f"Task {task_id}: Unexpected error — {e}")
        return {"status": "failed", "error": "An unexpected error occurred."}
    finally:
        # Single cleanup point for success and both failure paths
        _cleanup(task_id)

View File

@@ -0,0 +1,96 @@
"""Celery tasks for video processing."""
import os
import logging
from app.extensions import celery
from app.services.video_service import video_to_gif, VideoProcessingError
from app.services.storage_service import storage
from app.utils.sanitizer import cleanup_task_files
def _cleanup(task_id: str):
    """Remove the task's temporary files; outputs are kept when storage is not S3."""
    keep_local_outputs = not storage.use_s3
    cleanup_task_files(task_id, keep_outputs=keep_local_outputs)
logger = logging.getLogger(__name__)
@celery.task(bind=True, name="app.tasks.video_tasks.create_gif_task")
def create_gif_task(
    self,
    input_path: str,
    task_id: str,
    original_filename: str,
    start_time: float = 0,
    duration: float = 5,
    fps: int = 10,
    width: int = 480,
):
    """
    Async task: Convert video clip to animated GIF.

    Failures are reported through the returned dict rather than raised.
    Temporary files are cleaned up on every exit path.

    Args:
        input_path: Path to the uploaded video
        task_id: Unique task identifier
        original_filename: Original filename for download
        start_time: Start time in seconds
        duration: Duration in seconds
        fps: Frames per second
        width: Output width in pixels

    Returns:
        dict with download_url and GIF info on success, or
        {"status": "failed", "error": ...} on failure
    """
    output_dir = os.path.join("/tmp/outputs", task_id)
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, f"{task_id}.gif")

    try:
        self.update_state(
            state="PROCESSING",
            meta={"step": "Creating GIF from video..."},
        )
        stats = video_to_gif(
            input_path, output_path,
            start_time=start_time,
            duration=duration,
            fps=fps,
            width=width,
        )

        self.update_state(state="PROCESSING", meta={"step": "Uploading result..."})
        s3_key = storage.upload_file(output_path, task_id, folder="outputs")

        name_without_ext = os.path.splitext(original_filename)[0]
        download_name = f"{name_without_ext}.gif"
        download_url = storage.generate_presigned_url(
            s3_key, original_filename=download_name
        )

        result = {
            "status": "completed",
            "download_url": download_url,
            "filename": download_name,
            "output_size": stats["output_size"],
            "duration": stats["duration"],
            "fps": stats["fps"],
            "width": stats["width"],
            "height": stats["height"],
        }
        logger.info(f"Task {task_id}: Video→GIF creation completed")
        return result
    except VideoProcessingError as e:
        logger.error(f"Task {task_id}: Video error — {e}")
        return {"status": "failed", "error": str(e)}
    except Exception as e:
        # logger.exception keeps the traceback; the caller only sees a
        # generic message.
        logger.exception(f"Task {task_id}: Unexpected error — {e}")
        return {"status": "failed", "error": "An unexpected error occurred."}
    finally:
        # Single cleanup point for success and both failure paths
        _cleanup(task_id)

View File

@@ -0,0 +1 @@
"""Backend application utilities."""

View File

@@ -0,0 +1,31 @@
"""Scheduled cleanup of expired temporary files."""
import os
import shutil
import time
from flask import current_app
def cleanup_expired_files() -> int:
    """
    Remove files older than FILE_EXPIRY_SECONDS from upload/output dirs.

    Each sub-directory of UPLOAD_FOLDER and OUTPUT_FOLDER is treated as one
    task directory; it is deleted when its modification time is older than
    the expiry (default 1800 seconds). Directories that disappear or become
    unreadable while being scanned are skipped.

    Returns:
        Number of task directories that were actually removed
    """
    expiry = current_app.config.get("FILE_EXPIRY_SECONDS", 1800)
    now = time.time()
    removed_count = 0

    for folder_key in ("UPLOAD_FOLDER", "OUTPUT_FOLDER"):
        folder = current_app.config.get(folder_key)
        if not folder:
            continue
        try:
            entries = os.listdir(folder)
        except OSError:
            # Folder missing, unreadable, or not a directory
            continue

        for entry in entries:
            task_dir = os.path.join(folder, entry)
            if not os.path.isdir(task_dir):
                continue
            try:
                # Check directory age based on modification time
                expired = now - os.path.getmtime(task_dir) > expiry
            except OSError:
                # Directory was removed between the listing and the stat
                continue
            if not expired:
                continue
            shutil.rmtree(task_dir, ignore_errors=True)
            # rmtree errors are ignored, so count only confirmed removals.
            if not os.path.exists(task_dir):
                removed_count += 1

    return removed_count

View File

@@ -0,0 +1,111 @@
"""File validation utilities — multi-layer security checks."""
import os
import magic
from flask import current_app
from werkzeug.utils import secure_filename
class FileValidationError(Exception):
    """Raised when an uploaded file fails validation.

    Exposes the human-readable ``message`` and a numeric ``code``
    (400 unless specified otherwise).
    """

    def __init__(self, message: str, code: int = 400):
        super().__init__(message)
        self.message = message
        self.code = code
def validate_file(file_storage, allowed_types: list[str] | None = None):
    """
    Run an uploaded file through a sequence of validation layers.

    Layers, in order: presence/filename, extension whitelist, size limit,
    MIME type detected from the first 8192 bytes, and an extra content
    check for PDFs. The stream is rewound to the start after each read.

    Args:
        file_storage: Flask FileStorage object from request.files
        allowed_types: List of allowed extensions (e.g., ["pdf", "docx"]).
            If None, uses all allowed extensions from config.

    Returns:
        tuple: (sanitized_filename, detected_extension)

    Raises:
        FileValidationError: If validation fails at any layer.
    """
    settings = current_app.config

    # Layer 1: a file must be present and carry a usable filename
    if not file_storage or file_storage.filename == "":
        raise FileValidationError("No file provided.")
    safe_name = secure_filename(file_storage.filename)
    if not safe_name:
        raise FileValidationError("Invalid filename.")

    # Layer 2: extension must be whitelisted (optionally narrowed by allowed_types)
    ext = _get_extension(safe_name)
    configured = settings.get("ALLOWED_EXTENSIONS", {})
    if allowed_types:
        permitted = {
            name: mimes for name, mimes in configured.items() if name in allowed_types
        }
    else:
        permitted = configured
    if ext not in permitted:
        allowed_list = ", ".join(permitted.keys())
        raise FileValidationError(
            f"File type '.{ext}' is not allowed. "
            f"Allowed types: {allowed_list}"
        )

    # Layer 3: measure the size by seeking to the end, then rewind
    file_storage.seek(0, os.SEEK_END)
    size_bytes = file_storage.tell()
    file_storage.seek(0)
    limit_bytes = settings.get("FILE_SIZE_LIMITS", {}).get(ext, 20 * 1024 * 1024)  # Default 20MB
    if size_bytes > limit_bytes:
        limit_mb = limit_bytes / (1024 * 1024)
        raise FileValidationError(
            f"File too large. Maximum size for .{ext} files is {limit_mb:.0f}MB."
        )
    if size_bytes == 0:
        raise FileValidationError("File is empty.")

    # Layer 4: MIME type sniffed from the leading bytes must match the extension
    head_bytes = file_storage.read(8192)
    file_storage.seek(0)
    sniffed_mime = magic.from_buffer(head_bytes, mime=True)
    if sniffed_mime not in permitted.get(ext, []):
        raise FileValidationError(
            f"File content does not match extension '.{ext}'. "
            f"Detected type: {sniffed_mime}"
        )

    # Layer 5: PDFs get an extra scan of the bytes read above
    if ext == "pdf":
        _check_pdf_safety(head_bytes)

    return safe_name, ext
def _get_extension(filename: str) -> str:
"""Extract and normalize file extension."""
if "." not in filename:
return ""
return filename.rsplit(".", 1)[1].lower()
def _check_pdf_safety(file_header: bytes):
"""Check PDF for potentially dangerous embedded content."""
dangerous_patterns = [b"/JS", b"/JavaScript", b"/Launch", b"/EmbeddedFile"]
header_str = file_header
for pattern in dangerous_patterns:
if pattern in header_str:
raise FileValidationError(
"PDF contains potentially unsafe content (embedded scripts)."
)

View File

@@ -0,0 +1,77 @@
"""Filename sanitization and temporary file management."""
import os
import uuid
from flask import current_app
def generate_safe_path(extension: str, folder_type: str = "upload") -> tuple[str, str]:
    """
    Create a fresh UUID-named task directory and return a file path inside it.

    Args:
        extension: File extension (without dot)
        folder_type: "upload" selects UPLOAD_FOLDER; any other value
            selects OUTPUT_FOLDER

    Returns:
        tuple: (task_id, full_file_path), where the file is named
        "<task_id>.<extension>"
    """
    task_id = str(uuid.uuid4())
    config_key = "UPLOAD_FOLDER" if folder_type == "upload" else "OUTPUT_FOLDER"

    # Each task gets its own directory under the selected base folder
    task_dir = os.path.join(current_app.config[config_key], task_id)
    os.makedirs(task_dir, exist_ok=True)

    return task_id, os.path.join(task_dir, f"{task_id}.{extension}")
def get_output_path(task_id: str, extension: str) -> str:
    """
    Build the output file path for a task, creating its directory if needed.

    Args:
        task_id: The task UUID
        extension: Output file extension

    Returns:
        Full path "<OUTPUT_FOLDER>/<task_id>/<task_id>.<extension>"
    """
    task_dir = os.path.join(current_app.config["OUTPUT_FOLDER"], task_id)
    os.makedirs(task_dir, exist_ok=True)
    return os.path.join(task_dir, f"{task_id}.{extension}")
def cleanup_task_files(task_id: str, keep_outputs: bool = False):
    """
    Remove temporary files for a given task.

    The upload directory for the task is always removed; the output
    directory is removed unless keep_outputs is True. Removal is
    best-effort (errors are ignored). A task_id that is empty, ".", "..",
    or contains a path separator is refused, because joining it onto the
    base folder would point at the base folder itself or outside it.

    Args:
        task_id: The task UUID
        keep_outputs: If True, only clean uploads (used in local storage mode)
    """
    import logging
    import shutil

    # os.path.join(base, "") resolves to the base folder itself, which would
    # delete every task's files; ".." and separators escape the folder.
    if (
        not task_id
        or task_id in (".", "..")
        or os.path.basename(task_id) != task_id
    ):
        logging.getLogger(__name__).warning(
            "Refusing to clean up files for unsafe task id %r", task_id
        )
        return

    upload_dir = current_app.config.get("UPLOAD_FOLDER", "/tmp/uploads")
    output_dir = current_app.config.get("OUTPUT_FOLDER", "/tmp/outputs")

    # Always clean uploads (ignore_errors also covers a missing directory)
    shutil.rmtree(os.path.join(upload_dir, task_id), ignore_errors=True)

    # Only clean outputs when the caller does not need them kept locally
    if not keep_outputs:
        shutil.rmtree(os.path.join(output_dir, task_id), ignore_errors=True)