# SaaS-PDF/backend/app/services/pdf_to_excel_service.py

"""PDF to Excel conversion service."""
import logging
import math
import os

logger = logging.getLogger(__name__)


class PdfToExcelError(Exception):
    """Raised when a PDF cannot be converted to an Excel workbook."""


def _clean_cell(value):
    """Return *value* unchanged, except that a float NaN becomes ""."""
    if isinstance(value, float) and math.isnan(value):
        return ""
    return value


def pdf_to_excel(input_path: str, output_path: str) -> dict:
    """
    Convert a PDF file containing tables to an Excel spreadsheet.

    Every table extracted from the PDF is written to its own worksheet
    (``Table_1``, ``Table_2``, ...), with the column labels in the first
    row and the table rows below it.

    Args:
        input_path: Path to the input PDF
        output_path: Path for the output Excel file

    Returns:
        dict with tables_found (number of tables extracted) and
        output_size (size of the written file in bytes)

    Raises:
        PdfToExcelError: If no tables are found, a required library is
            not installed, or the conversion fails for any other reason
    """
    try:
        import tabula

        # os.path.dirname() is "" for a bare filename and os.makedirs("")
        # raises, so only create the directory when there is one to create.
        output_dir = os.path.dirname(output_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        # Read all tables from the PDF
        tables = tabula.read_pdf(
            input_path, pages="all", multiple_tables=True, silent=True
        )

        if not tables:
            raise PdfToExcelError(
                "No tables found in the PDF. This tool works best with PDFs that contain tabular data."
            )

        # Write tables to Excel, each table on its own sheet
        import openpyxl

        wb = openpyxl.Workbook()
        # Remove the default sheet so the workbook holds only table sheets
        wb.remove(wb.active)

        for idx, df in enumerate(tables, 1):
            ws = wb.create_sheet(title=f"Table_{idx}")

            # Header row: column labels are stringified before writing
            ws.append([str(col_name) for col_name in df.columns])

            # Data rows, with NaN cells written as empty strings
            for row in df.values:
                ws.append([_clean_cell(value) for value in row])

        wb.save(output_path)

        output_size = os.path.getsize(output_path)

        logger.info(
            "PDF→Excel: %s tables extracted → %s bytes",
            len(tables),
            output_size,
        )

        return {
            "tables_found": len(tables),
            "output_size": output_size,
        }

    except PdfToExcelError:
        # Already a domain error (e.g. "no tables found"); pass it through
        raise
    except ImportError as e:
        raise PdfToExcelError(f"Required library not installed: {e}") from e
    except Exception as e:
        # Service boundary: surface any failure as the domain error while
        # keeping the original exception attached as the cause.
        raise PdfToExcelError(f"Failed to convert PDF to Excel: {str(e)}") from e