- تفعيل مكون SummarizePdf لإنشاء ملخصات PDF باستخدام الذكاء الاصطناعي. - تفعيل مكون TranslatePdf لترجمة محتوى PDF إلى لغات متعددة. - تفعيل مكون TableExtractor لاستخراج الجداول من ملفات PDF. - تحديث الصفحة الرئيسية والتوجيه ليشمل الأدوات الجديدة. - إضافة ترجمات للأدوات الجديدة باللغات الإنجليزية والعربية والفرنسية. - توسيع أنواع واجهة برمجة التطبيقات (API) لدعم الميزات الجديدة المتعلقة بمعالجة ملفات PDF. --feat: Initialize frontend with React, Vite, and Tailwind CSS - Set up main entry point for React application. - Create About, Home, NotFound, Privacy, and Terms pages with SEO support. - Implement API service for file uploads and task management. - Add global styles using Tailwind CSS. - Create utility functions for SEO and text processing. - Configure Vite for development and production builds. - Set up Nginx configuration for serving frontend and backend. - Add scripts for cleanup of expired files and sitemap generation. - Implement deployment script for production environment.
85 lines
2.4 KiB
Python
85 lines
2.4 KiB
Python
"""PDF to Excel conversion service."""
|
|
import os
|
|
import logging
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class PdfToExcelError(Exception):
    """Custom exception for PDF to Excel conversion failures.

    Raised by ``pdf_to_excel`` for every failure mode it reports, so callers
    only need to catch this single exception type.
    """
|
|
|
|
|
|
def pdf_to_excel(input_path: str, output_path: str) -> dict:
    """
    Convert a PDF file containing tables to an Excel spreadsheet.

    Every table returned by ``tabula.read_pdf`` is written to its own
    worksheet named ``Table_1``, ``Table_2``, ... with the column labels in
    row 1 and the data rows starting at row 2. NaN cells are written as
    empty strings.

    Args:
        input_path: Path to the input PDF
        output_path: Path for the output Excel file. Its parent directory
            is created if it does not exist.

    Returns:
        dict with tables_found (number of tables extracted) and
        output_size (size of the written Excel file in bytes)

    Raises:
        PdfToExcelError: If no tables are found, a required library is not
            installed, or any other step of the conversion fails
    """
    import math

    try:
        # Imported lazily so a missing dependency surfaces as PdfToExcelError.
        import tabula

        # os.path.dirname() returns "" for a bare filename, and
        # os.makedirs("") raises FileNotFoundError, so only create the
        # directory when the path actually has one.
        output_dir = os.path.dirname(output_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        # Read all tables from the PDF
        tables = tabula.read_pdf(
            input_path, pages="all", multiple_tables=True, silent=True
        )

        if not tables:
            raise PdfToExcelError(
                "No tables found in the PDF. This tool works best with PDFs that contain tabular data."
            )

        # Write tables to Excel, each table on its own sheet
        import openpyxl

        wb = openpyxl.Workbook()
        # Remove default sheet so the workbook only holds the Table_N sheets
        wb.remove(wb.active)

        for idx, df in enumerate(tables, 1):
            ws = wb.create_sheet(title=f"Table_{idx}")

            # Write header
            for col_idx, col_name in enumerate(df.columns, 1):
                ws.cell(row=1, column=col_idx, value=str(col_name))

            # Write data (row 1 is the header, so data starts at row 2)
            for row_idx, row in enumerate(df.values, 2):
                for col_idx, value in enumerate(row, 1):
                    # Convert NaN to empty string
                    if isinstance(value, float) and math.isnan(value):
                        value = ""
                    ws.cell(row=row_idx, column=col_idx, value=value)

        wb.save(output_path)

        output_size = os.path.getsize(output_path)
        tables_found = len(tables)

        logger.info(
            "PDF→Excel: %s tables extracted → %s bytes",
            tables_found,
            output_size,
        )

        return {
            "tables_found": tables_found,
            "output_size": output_size,
        }

    except PdfToExcelError:
        # Already the domain error (e.g. "no tables found"); pass it through.
        raise
    except ImportError as e:
        raise PdfToExcelError(f"Required library not installed: {e}") from e
    except Exception as e:
        # Boundary handler: callers are promised a single exception type.
        raise PdfToExcelError(f"Failed to convert PDF to Excel: {e}") from e
|