- تنفيذ مكون ProcedureSelection لتمكين المستخدمين من اختيار الإجراءات من قائمة، وإدارة الاختيارات، ومعالجة الإجراءات المرفوضة. - إنشاء مكون StepProgress لعرض تقدم معالج متعدد الخطوات بشكل مرئي. - تعريف أنواع مشتركة للإجراءات، وخطوات التدفق، ورسائل الدردشة في ملف types.ts. - إضافة اختبارات وحدة لخطافات useFileUpload و useTaskPolling لضمان الأداء السليم ومعالجة الأخطاء. - تنفيذ اختبارات واجهة برمجة التطبيقات (API) للتحقق من تنسيقات نقاط النهاية وضمان اتساق ربط الواجهة الأمامية بالخلفية.
411 lines
12 KiB
Python
411 lines
12 KiB
Python
"""Flowchart service — Extract procedures from PDF and generate flowchart data."""
|
|
import os
|
|
import re
|
|
import json
|
|
import logging
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class FlowchartError(Exception):
|
|
"""Custom exception for flowchart operations."""
|
|
pass
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Heuristic keywords that signal procedural content
|
|
# ---------------------------------------------------------------------------
|
|
_PROCEDURE_KEYWORDS = [
|
|
"procedure", "protocol", "checklist", "sequence", "instruction",
|
|
"steps", "process", "workflow", "troubleshoot", "maintenance",
|
|
"startup", "shutdown", "emergency", "inspection", "replacement",
|
|
"installation", "calibration", "operation", "safety", "guide",
|
|
]
|
|
|
|
_STEP_PATTERNS = re.compile(
|
|
r"(?:^|\n)\s*(?:"
|
|
r"(?:step\s*\d+)|" # Step 1, Step 2 …
|
|
r"(?:\d+[\.\)]\s+)|" # 1. or 1) …
|
|
r"(?:[a-z][\.\)]\s+)|" # a. or a) …
|
|
r"(?:•\s)|" # bullet •
|
|
r"(?:-\s)|" # dash -
|
|
r"(?:✓\s)" # checkmark ✓
|
|
r")",
|
|
re.IGNORECASE,
|
|
)
|
|
|
|
_DECISION_KEYWORDS = re.compile(
|
|
r"\b(?:if|whether|check|verify|confirm|decide|inspect|compare|ensure|"
|
|
r"is\s+\w+\s*\?|does|should|can)\b",
|
|
re.IGNORECASE,
|
|
)
|
|
|
|
|
|
def extract_text_from_pdf(input_path: str) -> list[dict]:
|
|
"""
|
|
Extract text from each page of a PDF.
|
|
|
|
Returns:
|
|
List of dicts: [{"page": 1, "text": "..."}, ...]
|
|
"""
|
|
try:
|
|
from PyPDF2 import PdfReader
|
|
|
|
if not os.path.exists(input_path):
|
|
raise FlowchartError(f"File not found: {input_path}")
|
|
|
|
reader = PdfReader(input_path)
|
|
pages = []
|
|
for i, page in enumerate(reader.pages, start=1):
|
|
text = page.extract_text() or ""
|
|
pages.append({"page": i, "text": text.strip()})
|
|
|
|
return pages
|
|
|
|
except FlowchartError:
|
|
raise
|
|
except Exception as e:
|
|
raise FlowchartError(f"Failed to extract text from PDF: {str(e)}")
|
|
|
|
|
|
def identify_procedures(pages: list[dict]) -> list[dict]:
|
|
"""
|
|
Analyse extracted PDF text and identify procedures/sections.
|
|
|
|
Uses heuristic analysis:
|
|
1. Look for headings (lines in UPPER CASE or short bold-like lines)
|
|
2. Match procedure keywords
|
|
3. Group consecutive pages under the same heading
|
|
|
|
Returns:
|
|
List of procedures: [
|
|
{
|
|
"id": "proc-1",
|
|
"title": "Emergency Shutdown Protocol",
|
|
"description": "Extracted first paragraph...",
|
|
"pages": [8, 9],
|
|
"step_count": 6
|
|
},
|
|
...
|
|
]
|
|
"""
|
|
procedures = []
|
|
current_proc = None
|
|
proc_counter = 0
|
|
|
|
for page_data in pages:
|
|
text = page_data["text"]
|
|
page_num = page_data["page"]
|
|
|
|
if not text:
|
|
continue
|
|
|
|
lines = text.split("\n")
|
|
heading_candidates = []
|
|
|
|
for line in lines:
|
|
stripped = line.strip()
|
|
if not stripped:
|
|
continue
|
|
|
|
# Heading heuristic: short line, mostly uppercase or title-like
|
|
is_heading = (
|
|
len(stripped) < 80
|
|
and (
|
|
stripped.isupper()
|
|
or (stripped == stripped.title() and len(stripped.split()) <= 8)
|
|
or any(kw in stripped.lower() for kw in _PROCEDURE_KEYWORDS)
|
|
)
|
|
and not stripped.endswith(",")
|
|
)
|
|
|
|
if is_heading:
|
|
heading_candidates.append(stripped)
|
|
|
|
# Check if this page has procedural content
|
|
has_steps = bool(_STEP_PATTERNS.search(text))
|
|
has_keywords = any(kw in text.lower() for kw in _PROCEDURE_KEYWORDS)
|
|
|
|
if heading_candidates and (has_steps or has_keywords):
|
|
best_heading = heading_candidates[0]
|
|
|
|
# Check if this is a continuation of the current procedure
|
|
if current_proc and _is_continuation(current_proc["title"], best_heading, text):
|
|
current_proc["pages"].append(page_num)
|
|
current_proc["_text"] += "\n" + text
|
|
else:
|
|
# Save previous procedure
|
|
if current_proc:
|
|
_finalize_procedure(current_proc)
|
|
procedures.append(current_proc)
|
|
|
|
proc_counter += 1
|
|
first_paragraph = _extract_first_paragraph(text, best_heading)
|
|
current_proc = {
|
|
"id": f"proc-{proc_counter}",
|
|
"title": _clean_title(best_heading),
|
|
"description": first_paragraph,
|
|
"pages": [page_num],
|
|
"_text": text,
|
|
}
|
|
elif current_proc and has_steps:
|
|
# Continuation — same procedure on next page
|
|
current_proc["pages"].append(page_num)
|
|
current_proc["_text"] += "\n" + text
|
|
|
|
# Don't forget the last one
|
|
if current_proc:
|
|
_finalize_procedure(current_proc)
|
|
procedures.append(current_proc)
|
|
|
|
# If no procedures found via headings, try splitting by page with step content
|
|
if not procedures:
|
|
procedures = _fallback_extraction(pages)
|
|
|
|
return procedures
|
|
|
|
|
|
def generate_flowchart(procedure: dict, page_texts: list[dict]) -> dict:
|
|
"""
|
|
Generate a flowchart (list of nodes + connections) from a procedure.
|
|
|
|
Args:
|
|
procedure: Procedure dict with id, title, pages
|
|
page_texts: All page text data
|
|
|
|
Returns:
|
|
Flowchart dict: {
|
|
"id": "flow-1",
|
|
"procedureId": "proc-1",
|
|
"title": "...",
|
|
"steps": [ {id, type, title, description, connections}, ... ]
|
|
}
|
|
"""
|
|
# Gather text for the procedure's pages
|
|
text = ""
|
|
for pt in page_texts:
|
|
if pt["page"] in procedure["pages"]:
|
|
text += pt["text"] + "\n"
|
|
|
|
steps = _extract_steps_from_text(text, procedure["title"])
|
|
|
|
return {
|
|
"id": f"flow-{procedure['id']}",
|
|
"procedureId": procedure["id"],
|
|
"title": procedure["title"],
|
|
"steps": steps,
|
|
}
|
|
|
|
|
|
def extract_and_generate(input_path: str) -> dict:
|
|
"""
|
|
Full pipeline: extract text → identify procedures → generate flowcharts.
|
|
|
|
Returns:
|
|
{
|
|
"procedures": [...],
|
|
"flowcharts": [...],
|
|
"total_pages": int
|
|
}
|
|
"""
|
|
pages = extract_text_from_pdf(input_path)
|
|
procedures = identify_procedures(pages)
|
|
|
|
flowcharts = []
|
|
for proc in procedures:
|
|
flow = generate_flowchart(proc, pages)
|
|
flowcharts.append(flow)
|
|
|
|
# Remove internal text field
|
|
for proc in procedures:
|
|
proc.pop("_text", None)
|
|
|
|
return {
|
|
"procedures": procedures,
|
|
"flowcharts": flowcharts,
|
|
"total_pages": len(pages),
|
|
"pages": pages,
|
|
}
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Internal helpers
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def _is_continuation(current_title: str, new_heading: str, text: str) -> bool:
|
|
"""Check if a page is a continuation of the current procedure."""
|
|
continued_markers = ["(continued)", "(cont.)", "(cont'd)"]
|
|
heading_lower = new_heading.lower()
|
|
|
|
# Explicit continuation marker
|
|
if any(m in heading_lower for m in continued_markers):
|
|
return True
|
|
|
|
# Same title repeated
|
|
if current_title.lower().rstrip() in heading_lower:
|
|
return True
|
|
|
|
return False
|
|
|
|
|
|
def _clean_title(title: str) -> str:
|
|
"""Clean up a procedure title."""
|
|
# Remove continuation markers
|
|
title = re.sub(r"\s*\(continued\).*", "", title, flags=re.IGNORECASE)
|
|
title = re.sub(r"\s*\(cont[\.\']?d?\).*", "", title, flags=re.IGNORECASE)
|
|
# Remove leading numbers like "3.1"
|
|
title = re.sub(r"^\d+[\.\)]\s*", "", title)
|
|
title = re.sub(r"^\d+\.\d+\s*", "", title)
|
|
return title.strip()
|
|
|
|
|
|
def _extract_first_paragraph(text: str, heading: str) -> str:
|
|
"""Extract the first meaningful paragraph after a heading."""
|
|
idx = text.find(heading)
|
|
if idx >= 0:
|
|
after_heading = text[idx + len(heading):].strip()
|
|
else:
|
|
after_heading = text.strip()
|
|
|
|
lines = after_heading.split("\n")
|
|
paragraph = []
|
|
for line in lines:
|
|
stripped = line.strip()
|
|
if not stripped:
|
|
if paragraph:
|
|
break
|
|
continue
|
|
if stripped.isupper() and len(stripped) > 10:
|
|
break
|
|
paragraph.append(stripped)
|
|
|
|
desc = " ".join(paragraph)[:200]
|
|
return desc if desc else "Procedural content extracted from document."
|
|
|
|
|
|
def _finalize_procedure(proc: dict):
|
|
"""Calculate step count from the accumulated text."""
|
|
text = proc.get("_text", "")
|
|
matches = _STEP_PATTERNS.findall(text)
|
|
proc["step_count"] = max(len(matches), 2)
|
|
|
|
|
|
def _fallback_extraction(pages: list[dict]) -> list[dict]:
|
|
"""When no heading-based procedures found, detect pages with step-like content."""
|
|
procedures = []
|
|
proc_counter = 0
|
|
|
|
for page_data in pages:
|
|
text = page_data["text"]
|
|
if not text:
|
|
continue
|
|
|
|
has_steps = bool(_STEP_PATTERNS.search(text))
|
|
if has_steps:
|
|
proc_counter += 1
|
|
first_line = text.split("\n")[0].strip()[:60]
|
|
procedures.append({
|
|
"id": f"proc-{proc_counter}",
|
|
"title": first_line or f"Procedure (Page {page_data['page']})",
|
|
"description": text[:150].strip(),
|
|
"pages": [page_data["page"]],
|
|
"step_count": len(_STEP_PATTERNS.findall(text)),
|
|
})
|
|
|
|
return procedures
|
|
|
|
|
|
def _extract_steps_from_text(text: str, procedure_title: str) -> list[dict]:
|
|
"""
|
|
Parse text into flowchart steps (nodes).
|
|
|
|
Strategy:
|
|
1. Split text by numbered/bulleted lines
|
|
2. Classify each as process or decision
|
|
3. Add start/end nodes
|
|
4. Wire connections
|
|
"""
|
|
lines = text.split("\n")
|
|
raw_steps = []
|
|
current_step_lines = []
|
|
step_counter = 0
|
|
|
|
for line in lines:
|
|
stripped = line.strip()
|
|
if not stripped:
|
|
continue
|
|
|
|
# Is this the start of a new step?
|
|
is_step_start = bool(re.match(
|
|
r"^\s*(?:\d+[\.\)]\s+|[a-z][\.\)]\s+|•\s|-\s|✓\s|step\s*\d+)",
|
|
stripped,
|
|
re.IGNORECASE,
|
|
))
|
|
|
|
if is_step_start:
|
|
if current_step_lines:
|
|
raw_steps.append(" ".join(current_step_lines))
|
|
current_step_lines = [re.sub(r"^\s*(?:\d+[\.\)]\s*|[a-z][\.\)]\s*|•\s*|-\s*|✓\s*|step\s*\d+[:\.\)]\s*)", "", stripped, flags=re.IGNORECASE)]
|
|
elif current_step_lines:
|
|
current_step_lines.append(stripped)
|
|
|
|
if current_step_lines:
|
|
raw_steps.append(" ".join(current_step_lines))
|
|
|
|
# Limit to reasonable number of steps
|
|
if len(raw_steps) > 15:
|
|
raw_steps = raw_steps[:15]
|
|
|
|
# Build flowchart nodes
|
|
nodes = []
|
|
step_id = 0
|
|
|
|
# Start node
|
|
step_id += 1
|
|
nodes.append({
|
|
"id": str(step_id),
|
|
"type": "start",
|
|
"title": f"Begin: {procedure_title[:40]}",
|
|
"description": "Start of procedure",
|
|
"connections": [str(step_id + 1)] if raw_steps else [],
|
|
})
|
|
|
|
for i, step_text in enumerate(raw_steps):
|
|
step_id += 1
|
|
# Classify as decision or process
|
|
is_decision = bool(_DECISION_KEYWORDS.search(step_text))
|
|
|
|
node_type = "decision" if is_decision else "process"
|
|
title = step_text[:60]
|
|
description = step_text[:150]
|
|
|
|
connections = []
|
|
if i < len(raw_steps) - 1:
|
|
if is_decision:
|
|
# Decision: Yes goes to next, No could loop back or skip
|
|
connections = [str(step_id + 1)]
|
|
else:
|
|
connections = [str(step_id + 1)]
|
|
else:
|
|
connections = [str(step_id + 1)] # Connect to end
|
|
|
|
nodes.append({
|
|
"id": str(step_id),
|
|
"type": node_type,
|
|
"title": title,
|
|
"description": description,
|
|
"connections": connections,
|
|
})
|
|
|
|
# End node
|
|
step_id += 1
|
|
nodes.append({
|
|
"id": str(step_id),
|
|
"type": "end",
|
|
"title": "Procedure Complete",
|
|
"description": "End of procedure",
|
|
"connections": [],
|
|
})
|
|
|
|
return nodes
|