ميزة: إضافة مكوني ProcedureSelection و StepProgress لأداة مخططات التدفق بصيغة PDF

- تنفيذ مكون ProcedureSelection لتمكين المستخدمين من اختيار الإجراءات من قائمة، وإدارة الاختيارات، ومعالجة الإجراءات المرفوضة.

- إنشاء مكون StepProgress لعرض تقدم معالج متعدد الخطوات بشكل مرئي.

- تعريف أنواع مشتركة للإجراءات، وخطوات التدفق، ورسائل الدردشة في ملف types.ts.

- إضافة اختبارات وحدة لخطافات useFileUpload و useTaskPolling لضمان الأداء السليم ومعالجة الأخطاء.

- تنفيذ اختبارات واجهة برمجة التطبيقات (API) للتحقق من تنسيقات نقاط النهاية وضمان اتساق ربط الواجهة الأمامية بالخلفية.
This commit is contained in:
Your Name
2026-03-06 17:16:09 +02:00
parent 2e97741d60
commit cfbcc8bd79
62 changed files with 10567 additions and 101 deletions

View File

@@ -62,6 +62,7 @@ def create_app(config_name=None):
from app.routes.tasks import tasks_bp
from app.routes.download import download_bp
from app.routes.pdf_tools import pdf_tools_bp
from app.routes.flowchart import flowchart_bp
app.register_blueprint(health_bp, url_prefix="/api")
app.register_blueprint(convert_bp, url_prefix="/api/convert")
@@ -69,6 +70,7 @@ def create_app(config_name=None):
app.register_blueprint(image_bp, url_prefix="/api/image")
app.register_blueprint(video_bp, url_prefix="/api/video")
app.register_blueprint(pdf_tools_bp, url_prefix="/api/pdf-tools")
app.register_blueprint(flowchart_bp, url_prefix="/api/flowchart")
app.register_blueprint(tasks_bp, url_prefix="/api/tasks")
app.register_blueprint(download_bp, url_prefix="/api/download")

View File

@@ -0,0 +1,103 @@
"""Flowchart route — POST /api/flowchart/extract, /chat, /generate-manual."""
import logging
from flask import Blueprint, request, jsonify
from app.extensions import limiter
from app.utils.file_validator import validate_file, FileValidationError
from app.utils.sanitizer import generate_safe_path
from app.tasks.flowchart_tasks import extract_flowchart_task
logger = logging.getLogger(__name__)
flowchart_bp = Blueprint("flowchart", __name__)
@flowchart_bp.route("/extract", methods=["POST"])
@limiter.limit("10/minute")
def extract_flowchart_route():
    """Queue a PDF for procedure extraction and flowchart generation.

    Expects multipart/form-data carrying a single ``file`` field (PDF).

    Returns:
        A ``(response, status)`` pair: 202 with ``task_id`` and ``message``
        once the task is queued, 400 when no ``file`` field is present, or
        the validator's own message/code when the upload is rejected.
    """
    if "file" not in request.files:
        return jsonify({"error": "No file uploaded."}), 400

    upload = request.files["file"]
    try:
        validated = validate_file(upload, allowed_types=["pdf"])
    except FileValidationError as exc:
        return jsonify({"error": exc.message}), exc.code
    source_name, extension = validated

    # Persist the upload under a generated path before handing it to the worker.
    job_id, saved_path = generate_safe_path(extension)
    upload.save(saved_path)
    queued = extract_flowchart_task.delay(saved_path, job_id, source_name)

    payload = {
        "task_id": queued.id,
        "message": "Flowchart extraction started.",
    }
    return jsonify(payload), 202
@flowchart_bp.route("/chat", methods=["POST"])
@limiter.limit("20/minute")
def flowchart_chat_route():
    """AI chat endpoint for flowchart improvement suggestions.

    Accepts JSON: ``{ message, flow_id, flow_data }``.
    Returns JSON: ``{ reply, updated_flow? }``.

    Returns:
        400 when the body is not a JSON object or has no ``message``;
        otherwise 200 with the chat service result, or 200 with an apology
        reply if the chat service raises.
    """
    data = request.get_json(silent=True)
    # get_json may return a list/str/number for valid non-object JSON;
    # only a dict supports the lookups below.
    if not isinstance(data, dict) or not data.get("message"):
        return jsonify({"error": "Message is required."}), 400

    message = str(data["message"])[:2000]  # Limit message length

    # The chat service reads flow_data with dict lookups; ignore anything else.
    flow_data = data.get("flow_data")
    if not isinstance(flow_data, dict):
        flow_data = None

    try:
        from app.services.ai_chat_service import chat_about_flowchart

        result = chat_about_flowchart(message, flow_data)
        return jsonify(result), 200
    except Exception as e:
        # logger.exception keeps the traceback that a plain error() call drops.
        logger.exception(f"Flowchart chat error: {e}")
        return jsonify(
            {"reply": "Sorry, I couldn't process your request. Please try again."}
        ), 200
@flowchart_bp.route("/generate-manual", methods=["POST"])
@limiter.limit("10/minute")
def generate_manual_flowchart_route():
    """Generate a flowchart from manually specified procedure data.

    Accepts JSON: ``{ title, description, pages }`` where ``pages`` is a list
    whose entries are either plain page-text strings or ``{"text": ...}``
    objects.
    Returns JSON: ``{ flowchart }``.

    Returns:
        400 when the body is not a JSON object, ``title`` is missing, or
        ``pages`` is not a list; otherwise 200 with the generated flowchart.
    """
    import zlib

    data = request.get_json(silent=True)
    # Valid JSON that is not an object (e.g. an array) has no .get().
    if not isinstance(data, dict) or not data.get("title"):
        return jsonify({"error": "Title is required."}), 400

    title = str(data["title"])[:200]
    description = str(data.get("description", ""))[:500]

    page_entries = data.get("pages") or []
    if not isinstance(page_entries, list):
        return jsonify({"error": "Pages must be a list."}), 400

    from app.services.flowchart_service import generate_flowchart

    pages_data = []
    for number, entry in enumerate(page_entries, start=1):
        # Accept both {"text": "..."} objects and bare strings.
        raw_text = entry.get("text", "") if isinstance(entry, dict) else entry
        text = "" if raw_text is None else str(raw_text)
        pages_data.append({"page": number, "text": text[:5000]})

    # Build a synthetic procedure. crc32 is used instead of the built-in
    # hash(), which is salted per process for str and so would yield a
    # different id for the same title on different workers.
    procedure = {
        "id": f"manual-{zlib.crc32(title.encode('utf-8')) % 100000}",
        "title": title,
        "description": description,
        "pages": [page["page"] for page in pages_data],
    }

    flowchart = generate_flowchart(procedure, pages_data)
    return jsonify({"flowchart": flowchart}), 200

View File

@@ -93,6 +93,11 @@ def split_pdf_route():
if mode not in ("all", "range"):
mode = "all"
if mode == "range" and (not pages or not pages.strip()):
return jsonify({
"error": "Please specify which pages to extract (e.g. 1,3,5-8)."
}), 400
try:
original_filename, ext = validate_file(file, allowed_types=["pdf"])
except FileValidationError as e:

View File

@@ -3,11 +3,13 @@ from flask import Blueprint, jsonify
from celery.result import AsyncResult
from app.extensions import celery
from app.middleware.rate_limiter import limiter
tasks_bp = Blueprint("tasks", __name__)
@tasks_bp.route("/<task_id>/status", methods=["GET"])
@limiter.limit("300/minute", override_defaults=True)
def get_task_status(task_id: str):
"""
Get the status of an async task.

View File

@@ -0,0 +1,142 @@
"""AI Chat Service — OpenRouter integration for flowchart improvement."""
import os
import json
import logging
import requests
logger = logging.getLogger(__name__)
# Configuration
OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY", "")
OPENROUTER_MODEL = os.getenv("OPENROUTER_MODEL", "meta-llama/llama-3-8b-instruct")
OPENROUTER_BASE_URL = os.getenv(
"OPENROUTER_BASE_URL", "https://openrouter.ai/api/v1/chat/completions"
)
SYSTEM_PROMPT = """You are a flowchart improvement assistant. You help users improve their flowcharts by:
1. Suggesting better step titles and descriptions
2. Identifying missing steps or decision points
3. Recommending better flow structure
4. Simplifying complex flows
When the user asks you to modify the flowchart, respond with your suggestion in plain text.
Keep responses concise and actionable. Reply in the same language the user uses."""
def chat_about_flowchart(message: str, flow_data: dict | None = None) -> dict:
    """Send a message to the AI about a flowchart and get improvement suggestions.

    Args:
        message: User message.
        flow_data: Current flowchart data (optional). Its ``title`` and the
            ``type``/``title`` of each entry in ``steps`` are appended to the
            user message as context.

    Returns:
        ``{"reply": "...", "updated_flow": None}``. The reply comes from the
        OpenRouter response when available; otherwise it is a timeout notice
        or the local fallback text.
    """
    if not OPENROUTER_API_KEY:
        return {
            "reply": _fallback_response(message, flow_data),
            "updated_flow": None,
        }

    # Build context. Malformed step entries are skipped so that bad client
    # data cannot raise before the request is even attempted.
    context = ""
    if flow_data:
        steps_summary = [
            f"- [{s.get('type', 'process')}] {s.get('title', '')}"
            for s in (flow_data.get("steps") or [])
            if isinstance(s, dict)
        ]
        context = (
            f"\nCurrent flowchart: {flow_data.get('title', 'Untitled')}\n"
            f"Steps:\n" + "\n".join(steps_summary)
        )

    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": f"{message}{context}"},
    ]

    try:
        response = requests.post(
            OPENROUTER_BASE_URL,
            headers={
                "Authorization": f"Bearer {OPENROUTER_API_KEY}",
                "Content-Type": "application/json",
            },
            json={
                "model": OPENROUTER_MODEL,
                "messages": messages,
                "max_tokens": 500,
                "temperature": 0.7,
            },
            timeout=30,
        )
        response.raise_for_status()
        data = response.json()

        # Handle an empty "choices" list and a null/non-text "content"
        # explicitly instead of relying on the generic handler below.
        choices = data.get("choices") or [{}]
        content = (choices[0].get("message") or {}).get("content")
        reply = content.strip() if isinstance(content, str) else ""

        if not reply:
            reply = "I couldn't generate a response. Please try again."

        return {"reply": reply, "updated_flow": None}
    except requests.exceptions.Timeout:
        logger.warning("OpenRouter API timeout")
        return {
            "reply": "The AI service is taking too long. Please try again.",
            "updated_flow": None,
        }
    except Exception as e:
        logger.error(f"OpenRouter API error: {e}")
        return {
            "reply": _fallback_response(message, flow_data),
            "updated_flow": None,
        }
def _fallback_response(message: str, flow_data: dict | None) -> str:
"""Provide a helpful response when the AI API is unavailable."""
msg_lower = message.lower()
if flow_data:
steps = flow_data.get("steps", [])
title = flow_data.get("title", "your flowchart")
step_count = len(steps)
decision_count = sum(1 for s in steps if s.get("type") == "decision")
if any(
w in msg_lower for w in ["simplify", "reduce", "shorter", "بسط", "اختصر"]
):
return (
f"Your flowchart '{title}' has {step_count} steps. "
f"To simplify, consider merging consecutive process steps "
f"that perform related actions into a single step."
)
if any(
w in msg_lower for w in ["missing", "add", "more", "ناقص", "أضف"]
):
return (
f"Your flowchart has {decision_count} decision points. "
f"Consider adding error handling or validation steps "
f"between critical process nodes."
)
return (
f"Your flowchart '{title}' contains {step_count} steps "
f"({decision_count} decisions). To get AI-powered suggestions, "
f"please configure the OPENROUTER_API_KEY environment variable."
)
return (
"AI chat requires the OPENROUTER_API_KEY to be configured. "
"Please set up the environment variable for full AI functionality."
)

View File

@@ -0,0 +1,410 @@
"""Flowchart service — Extract procedures from PDF and generate flowchart data."""
import os
import re
import json
import logging
logger = logging.getLogger(__name__)
class FlowchartError(Exception):
    """Error raised by the flowchart service, e.g. when PDF text extraction fails."""
# ---------------------------------------------------------------------------
# Heuristic keywords that signal procedural content
# ---------------------------------------------------------------------------
_PROCEDURE_KEYWORDS = [
"procedure", "protocol", "checklist", "sequence", "instruction",
"steps", "process", "workflow", "troubleshoot", "maintenance",
"startup", "shutdown", "emergency", "inspection", "replacement",
"installation", "calibration", "operation", "safety", "guide",
]
_STEP_PATTERNS = re.compile(
r"(?:^|\n)\s*(?:"
r"(?:step\s*\d+)|" # Step 1, Step 2 …
r"(?:\d+[\.\)]\s+)|" # 1. or 1) …
r"(?:[a-z][\.\)]\s+)|" # a. or a) …
r"(?:•\s)|" # bullet •
r"(?:-\s)|" # dash -
r"(?:✓\s)" # checkmark ✓
r")",
re.IGNORECASE,
)
_DECISION_KEYWORDS = re.compile(
r"\b(?:if|whether|check|verify|confirm|decide|inspect|compare|ensure|"
r"is\s+\w+\s*\?|does|should|can)\b",
re.IGNORECASE,
)
def extract_text_from_pdf(input_path: str) -> list[dict]:
    """Extract text from each page of a PDF.

    Args:
        input_path: Path of the PDF file to read.

    Returns:
        List of dicts, one per page, numbered from 1:
        ``[{"page": 1, "text": "..."}, ...]``. Pages with no extractable
        text yield an empty string.

    Raises:
        FlowchartError: If the file does not exist or reading/extraction
            fails for any reason.
    """
    try:
        from PyPDF2 import PdfReader

        if not os.path.exists(input_path):
            raise FlowchartError(f"File not found: {input_path}")

        reader = PdfReader(input_path)
        return [
            {"page": number, "text": (page.extract_text() or "").strip()}
            for number, page in enumerate(reader.pages, start=1)
        ]
    except FlowchartError:
        raise
    except Exception as e:
        # Chain the original exception so the root cause stays in tracebacks.
        raise FlowchartError(f"Failed to extract text from PDF: {str(e)}") from e
def identify_procedures(pages: list[dict]) -> list[dict]:
    """Group extracted PDF pages into procedures using text heuristics.

    For every non-empty page:
      1. Collect heading candidates: lines shorter than 80 characters that do
         not end with a comma and are either all upper case, title-cased with
         at most 8 words, or contain a procedure keyword.
      2. Treat the page as procedural when it has a heading candidate and
         either step-like markers or a procedure keyword.
      3. Attach the page to the running procedure when its first heading is a
         continuation of it; otherwise start a new procedure.
      4. A page without a qualifying heading but with step markers is
         appended to the running procedure.

    When nothing is found this way, the per-page fallback extraction is used.

    Args:
        pages: Page dicts with ``"page"`` (number) and ``"text"`` keys.

    Returns:
        List of procedure dicts, e.g.::

            {
                "id": "proc-1",
                "title": "Emergency Shutdown Protocol",
                "description": "Extracted first paragraph...",
                "pages": [8, 9],
                "step_count": 6
            }

        Heading-based results also carry the accumulated ``"_text"``.
    """
    found = []
    active = None
    counter = 0

    for entry in pages:
        page_text = entry["text"]
        page_no = entry["page"]
        if not page_text:
            continue

        # Gather heading candidates in document order.
        headings = []
        for raw_line in page_text.split("\n"):
            candidate = raw_line.strip()
            if not candidate:
                continue
            if len(candidate) >= 80 or candidate.endswith(","):
                continue
            looks_like_heading = (
                candidate.isupper()
                or (candidate == candidate.title() and len(candidate.split()) <= 8)
                or any(kw in candidate.lower() for kw in _PROCEDURE_KEYWORDS)
            )
            if looks_like_heading:
                headings.append(candidate)

        step_like = bool(_STEP_PATTERNS.search(page_text))
        keyword_hit = any(kw in page_text.lower() for kw in _PROCEDURE_KEYWORDS)

        if headings and (step_like or keyword_hit):
            top_heading = headings[0]

            if active and _is_continuation(active["title"], top_heading, page_text):
                # Same procedure carried over onto this page.
                active["pages"].append(page_no)
                active["_text"] += "\n" + page_text
                continue

            # Close the running procedure before opening a new one.
            if active:
                _finalize_procedure(active)
                found.append(active)

            counter += 1
            summary = _extract_first_paragraph(page_text, top_heading)
            active = {
                "id": f"proc-{counter}",
                "title": _clean_title(top_heading),
                "description": summary,
                "pages": [page_no],
                "_text": page_text,
            }
        elif active and step_like:
            # No usable heading, but more steps: extend the running procedure.
            active["pages"].append(page_no)
            active["_text"] += "\n" + page_text

    # Flush the procedure still open at the end of the document.
    if active:
        _finalize_procedure(active)
        found.append(active)

    if not found:
        found = _fallback_extraction(pages)

    return found
def generate_flowchart(procedure: dict, page_texts: list[dict]) -> dict:
    """Build the flowchart structure for a single procedure.

    Args:
        procedure: Procedure dict providing ``id``, ``title`` and ``pages``.
        page_texts: Page dicts (``page``, ``text``) for the whole document;
            only pages listed in ``procedure["pages"]`` are used.

    Returns:
        Flowchart dict::

            {
                "id": "flow-<procedure id>",
                "procedureId": "<procedure id>",
                "title": "...",
                "steps": [ {id, type, title, description, connections}, ... ]
            }
    """
    wanted_pages = procedure["pages"]
    # Concatenate the text of the procedure's pages, one newline after each.
    combined = "".join(
        item["text"] + "\n" for item in page_texts if item["page"] in wanted_pages
    )

    heading = procedure["title"]
    proc_id = procedure["id"]
    nodes = _extract_steps_from_text(combined, heading)

    return {
        "id": f"flow-{proc_id}",
        "procedureId": proc_id,
        "title": heading,
        "steps": nodes,
    }
def extract_and_generate(input_path: str) -> dict:
    """Run the full pipeline: extract text, identify procedures, build flowcharts.

    Args:
        input_path: Path of the PDF to process.

    Returns:
        ::

            {
                "procedures": [...],   # internal "_text" field removed
                "flowcharts": [...],   # one per procedure, same order
                "total_pages": int,
                "pages": [...]         # per-page extracted text
            }
    """
    page_records = extract_text_from_pdf(input_path)
    detected = identify_procedures(page_records)

    charts = [generate_flowchart(item, page_records) for item in detected]

    # The accumulated raw text is an internal working field; drop it from output.
    for item in detected:
        item.pop("_text", None)

    return {
        "procedures": detected,
        "flowcharts": charts,
        "total_pages": len(page_records),
        "pages": page_records,
    }
# ---------------------------------------------------------------------------
# Internal helpers
# ---------------------------------------------------------------------------
def _is_continuation(current_title: str, new_heading: str, text: str) -> bool:
"""Check if a page is a continuation of the current procedure."""
continued_markers = ["(continued)", "(cont.)", "(cont'd)"]
heading_lower = new_heading.lower()
# Explicit continuation marker
if any(m in heading_lower for m in continued_markers):
return True
# Same title repeated
if current_title.lower().rstrip() in heading_lower:
return True
return False
def _clean_title(title: str) -> str:
"""Clean up a procedure title."""
# Remove continuation markers
title = re.sub(r"\s*\(continued\).*", "", title, flags=re.IGNORECASE)
title = re.sub(r"\s*\(cont[\.\']?d?\).*", "", title, flags=re.IGNORECASE)
# Remove leading numbers like "3.1"
title = re.sub(r"^\d+[\.\)]\s*", "", title)
title = re.sub(r"^\d+\.\d+\s*", "", title)
return title.strip()
def _extract_first_paragraph(text: str, heading: str) -> str:
"""Extract the first meaningful paragraph after a heading."""
idx = text.find(heading)
if idx >= 0:
after_heading = text[idx + len(heading):].strip()
else:
after_heading = text.strip()
lines = after_heading.split("\n")
paragraph = []
for line in lines:
stripped = line.strip()
if not stripped:
if paragraph:
break
continue
if stripped.isupper() and len(stripped) > 10:
break
paragraph.append(stripped)
desc = " ".join(paragraph)[:200]
return desc if desc else "Procedural content extracted from document."
def _finalize_procedure(proc: dict):
    """Set ``proc["step_count"]`` from the step markers in its accumulated text.

    The count is the number of step-pattern matches in ``proc["_text"]``
    (empty when absent), with a floor of 2.
    """
    accumulated = proc.get("_text", "")
    marker_count = len(_STEP_PATTERNS.findall(accumulated))
    proc["step_count"] = marker_count if marker_count > 2 else 2
def _fallback_extraction(pages: list[dict]) -> list[dict]:
"""When no heading-based procedures found, detect pages with step-like content."""
procedures = []
proc_counter = 0
for page_data in pages:
text = page_data["text"]
if not text:
continue
has_steps = bool(_STEP_PATTERNS.search(text))
if has_steps:
proc_counter += 1
first_line = text.split("\n")[0].strip()[:60]
procedures.append({
"id": f"proc-{proc_counter}",
"title": first_line or f"Procedure (Page {page_data['page']})",
"description": text[:150].strip(),
"pages": [page_data["page"]],
"step_count": len(_STEP_PATTERNS.findall(text)),
})
return procedures
def _extract_steps_from_text(text: str, procedure_title: str) -> list[dict]:
"""
Parse text into flowchart steps (nodes).
Strategy:
1. Split text by numbered/bulleted lines
2. Classify each as process or decision
3. Add start/end nodes
4. Wire connections
"""
lines = text.split("\n")
raw_steps = []
current_step_lines = []
step_counter = 0
for line in lines:
stripped = line.strip()
if not stripped:
continue
# Is this the start of a new step?
is_step_start = bool(re.match(
r"^\s*(?:\d+[\.\)]\s+|[a-z][\.\)]\s+|•\s|-\s|✓\s|step\s*\d+)",
stripped,
re.IGNORECASE,
))
if is_step_start:
if current_step_lines:
raw_steps.append(" ".join(current_step_lines))
current_step_lines = [re.sub(r"^\s*(?:\d+[\.\)]\s*|[a-z][\.\)]\s*|•\s*|-\s*|✓\s*|step\s*\d+[:\.\)]\s*)", "", stripped, flags=re.IGNORECASE)]
elif current_step_lines:
current_step_lines.append(stripped)
if current_step_lines:
raw_steps.append(" ".join(current_step_lines))
# Limit to reasonable number of steps
if len(raw_steps) > 15:
raw_steps = raw_steps[:15]
# Build flowchart nodes
nodes = []
step_id = 0
# Start node
step_id += 1
nodes.append({
"id": str(step_id),
"type": "start",
"title": f"Begin: {procedure_title[:40]}",
"description": "Start of procedure",
"connections": [str(step_id + 1)] if raw_steps else [],
})
for i, step_text in enumerate(raw_steps):
step_id += 1
# Classify as decision or process
is_decision = bool(_DECISION_KEYWORDS.search(step_text))
node_type = "decision" if is_decision else "process"
title = step_text[:60]
description = step_text[:150]
connections = []
if i < len(raw_steps) - 1:
if is_decision:
# Decision: Yes goes to next, No could loop back or skip
connections = [str(step_id + 1)]
else:
connections = [str(step_id + 1)]
else:
connections = [str(step_id + 1)] # Connect to end
nodes.append({
"id": str(step_id),
"type": node_type,
"title": title,
"description": description,
"connections": connections,
})
# End node
step_id += 1
nodes.append({
"id": str(step_id),
"type": "end",
"title": "Procedure Complete",
"description": "End of procedure",
"connections": [],
})
return nodes

View File

@@ -140,20 +140,75 @@ def split_pdf(
def _parse_page_range(spec: str, total: int) -> list[int]:
"""Parse a page specification like '1,3,5-8' into 0-based indices."""
if not spec or not spec.strip():
raise PDFToolsError("Please specify at least one page (e.g. 1,3,5-8).")
indices = set()
for part in spec.split(","):
part = part.strip()
invalid_tokens = []
out_of_range_tokens = []
for raw_part in spec.split(","):
part = raw_part.strip()
if not part:
continue
if "-" in part:
if part.count("-") != 1:
invalid_tokens.append(part)
continue
start_s, end_s = part.split("-", 1)
start = max(1, int(start_s.strip()))
end = min(total, int(end_s.strip()))
start_s = start_s.strip()
end_s = end_s.strip()
if not start_s.isdigit() or not end_s.isdigit():
invalid_tokens.append(part)
continue
start = int(start_s)
end = int(end_s)
if start > end:
invalid_tokens.append(part)
continue
if start < 1 or end > total:
out_of_range_tokens.append(f"{start}-{end}")
continue
indices.update(range(start - 1, end))
else:
if not part.isdigit():
invalid_tokens.append(part)
continue
page = int(part)
if 1 <= page <= total:
indices.add(page - 1)
if page < 1 or page > total:
out_of_range_tokens.append(str(page))
continue
indices.add(page - 1)
if invalid_tokens:
tokens = ", ".join(invalid_tokens)
raise PDFToolsError(
f"Invalid page format: {tokens}. Use a format like 1,3,5-8."
)
if out_of_range_tokens:
tokens = ", ".join(out_of_range_tokens)
page_word = "page" if total == 1 else "pages"
raise PDFToolsError(
f"Selected pages ({tokens}) are out of range. This PDF has only {total} {page_word}."
)
if not indices:
raise PDFToolsError("No valid pages specified.")
page_word = "page" if total == 1 else "pages"
raise PDFToolsError(
f"No pages selected. This PDF has {total} {page_word}."
)
return sorted(indices)

View File

@@ -0,0 +1,79 @@
"""Celery tasks for PDF-to-Flowchart extraction and generation."""
import os
import json
import logging
from app.extensions import celery
from app.services.flowchart_service import extract_and_generate, FlowchartError
from app.services.storage_service import storage
from app.utils.sanitizer import cleanup_task_files
logger = logging.getLogger(__name__)
def _cleanup(task_id: str):
    """Remove the task's working files, keeping outputs unless S3 storage is in use."""
    keep_local_outputs = not storage.use_s3
    cleanup_task_files(task_id, keep_outputs=keep_local_outputs)
@celery.task(bind=True, name="app.tasks.flowchart_tasks.extract_flowchart_task")
def extract_flowchart_task(
    self, input_path: str, task_id: str, original_filename: str
):
    """Async task: extract procedures from a PDF and generate flowcharts.

    Progress is reported through the ``PROCESSING`` state with a ``step``
    message. The combined result is written to a JSON file, uploaded through
    the storage service, and also returned inline.

    Args:
        input_path: Path of the uploaded PDF.
        task_id: Identifier used for the output directory and for cleanup.
        original_filename: Name of the uploaded file (not used here).

    Returns:
        On success, a dict with ``status="completed"``, ``download_url``,
        ``filename``, ``procedures``, ``flowcharts``, ``pages``,
        ``total_pages`` and ``procedures_count``. On failure, a dict with
        ``status="failed"`` and an ``error`` message.
    """
    output_dir = os.path.join("/tmp/outputs", task_id)
    os.makedirs(output_dir, exist_ok=True)

    try:
        self.update_state(
            state="PROCESSING",
            meta={"step": "Extracting text from PDF..."},
        )
        result = extract_and_generate(input_path)

        self.update_state(
            state="PROCESSING",
            meta={"step": "Saving flowchart data..."},
        )

        # Save flowchart JSON to a file and upload
        output_path = os.path.join(output_dir, f"{task_id}_flowcharts.json")
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(result, f, ensure_ascii=False, indent=2)

        s3_key = storage.upload_file(output_path, task_id, folder="outputs")
        download_url = storage.generate_presigned_url(
            s3_key, original_filename="flowcharts.json"
        )

        final_result = {
            "status": "completed",
            "download_url": download_url,
            "filename": "flowcharts.json",
            "procedures": result["procedures"],
            "flowcharts": result["flowcharts"],
            "pages": result["pages"],
            "total_pages": result["total_pages"],
            "procedures_count": len(result["procedures"]),
        }

        logger.info(
            f"Task {task_id}: Flowchart extraction completed — "
            f"{len(result['procedures'])} procedures, "
            f"{result['total_pages']} pages"
        )
        return final_result
    except FlowchartError as e:
        logger.error(f"Task {task_id}: Flowchart error — {e}")
        return {"status": "failed", "error": str(e)}
    except Exception as e:
        # Keep the traceback: the client only sees a generic message, so the
        # log is the only place the real cause is recorded.
        logger.exception(f"Task {task_id}: Unexpected error — {e}")
        return {"status": "failed", "error": "An unexpected error occurred."}
    finally:
        # Single cleanup point for the success and both failure paths.
        _cleanup(task_id)

View File

@@ -1,7 +1,12 @@
"""File validation utilities — multi-layer security checks."""
import os
import magic
try:
import magic
HAS_MAGIC = True
except (ImportError, OSError):
HAS_MAGIC = False
from flask import current_app
from werkzeug.utils import secure_filename
@@ -72,18 +77,19 @@ def validate_file(file_storage, allowed_types: list[str] | None = None):
if file_size == 0:
raise FileValidationError("File is empty.")
# Layer 4: Check MIME type using magic bytes
# Layer 4: Check MIME type using magic bytes (if libmagic is available)
file_header = file_storage.read(8192)
file_storage.seek(0)
detected_mime = magic.from_buffer(file_header, mime=True)
expected_mimes = valid_extensions.get(ext, [])
if HAS_MAGIC:
detected_mime = magic.from_buffer(file_header, mime=True)
expected_mimes = valid_extensions.get(ext, [])
if detected_mime not in expected_mimes:
raise FileValidationError(
f"File content does not match extension '.{ext}'. "
f"Detected type: {detected_mime}"
)
if detected_mime not in expected_mimes:
raise FileValidationError(
f"File content does not match extension '.{ext}'. "
f"Detected type: {detected_mime}"
)
# Layer 5: Additional content checks for specific types
if ext == "pdf":