# SaaS-PDF/backend/app/services/pdf_service.py

"""PDF conversion service using LibreOffice headless."""
import logging
import os
import shutil
import subprocess
import tempfile
from pathlib import Path

# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)


class PDFConversionError(Exception):
    """Raised when a document conversion cannot be completed.

    The conversion functions in this module raise it with a human-readable
    message describing why the conversion failed.
    """


def _convert_with_libreoffice(
    input_path: str,
    output_dir: str,
    *,
    target_ext: str,
    label: str,
    profile_prefix: str,
    extra_args: tuple = (),
    timeout: float = 120,
) -> str:
    """
    Run a single LibreOffice headless conversion and return the output path.

    Shared implementation behind ``pdf_to_word`` and ``word_to_pdf``.

    Args:
        input_path: Path to the file to convert
        output_dir: Directory for the output file (created if missing)
        target_ext: Extension passed to ``--convert-to`` (e.g. ``"docx"``)
        label: Short description used in log messages (e.g. ``"PDF→Word"``)
        profile_prefix: Prefix for the temporary LibreOffice profile directory
        extra_args: Additional ``soffice`` options placed before ``--convert-to``
        timeout: Maximum number of seconds to wait for LibreOffice

    Returns:
        Path to the converted file inside ``output_dir``

    Raises:
        PDFConversionError: If the input is missing, LibreOffice is not
            available, the conversion times out, or no output is produced
    """
    # Fail fast with a clear message instead of a cryptic LibreOffice error.
    if not os.path.isfile(input_path):
        raise PDFConversionError(f"Input file not found: {input_path}")

    os.makedirs(output_dir, exist_ok=True)

    # LibreOffice names output based on input filename
    input_basename = os.path.splitext(os.path.basename(input_path))[0]
    output_path = os.path.join(output_dir, f"{input_basename}.{target_ext}")

    # Success is detected by the presence of the output file, so a leftover
    # file from an earlier run would make a failed conversion look successful.
    # Remove it first (unless it is the input file itself).
    if os.path.isfile(output_path) and not os.path.samefile(
        output_path, input_path
    ):
        try:
            os.remove(output_path)
        except OSError as exc:
            raise PDFConversionError(
                f"Could not replace existing output file: {output_path}"
            ) from exc

    # Use a unique user profile per process to avoid lock conflicts
    user_install_dir = tempfile.mkdtemp(prefix=profile_prefix)

    cmd = [
        "soffice",
        "--headless",
        "--norestore",
        # as_uri() builds a well-formed, percent-encoded file:// URI on every
        # platform; resolve() guarantees the absolute path it requires.
        f"-env:UserInstallation={Path(user_install_dir).resolve().as_uri()}",
        *extra_args,
        "--convert-to", target_ext,
        "--outdir", output_dir,
        input_path,
    ]

    try:
        logger.info("Running LibreOffice %s: %s", label, " ".join(cmd))

        # Only the subprocess call is guarded by these handlers, so that a
        # FileNotFoundError raised elsewhere is never misreported as
        # "LibreOffice is not installed".
        try:
            result = subprocess.run(
                cmd,
                capture_output=True,
                text=True,
                timeout=timeout,
                env={**os.environ, "HOME": user_install_dir},
            )
        except subprocess.TimeoutExpired as exc:
            logger.error(
                "LibreOffice %s timed out after %s seconds", label, timeout
            )
            raise PDFConversionError(
                "Conversion timed out. File may be too large."
            ) from exc
        except FileNotFoundError as exc:
            raise PDFConversionError(
                "LibreOffice is not installed on the server."
            ) from exc

        logger.info("LibreOffice stdout: %s", result.stdout)
        logger.info("LibreOffice stderr: %s", result.stderr)
        logger.info("LibreOffice returncode: %s", result.returncode)

        # Check output file first — LibreOffice may return non-zero
        # due to harmless warnings (e.g. javaldx) even on success
        if os.path.isfile(output_path) and os.path.getsize(output_path) > 0:
            logger.info("%s conversion successful: %s", label, output_path)
            return output_path

        # No output file — now treat as real error
        if result.returncode != 0:
            # Filter out known harmless warnings
            stderr = result.stderr or ""
            real_errors = [
                line for line in stderr.strip().splitlines()
                if not line.startswith("Warning: failed to launch javaldx")
            ]
            error_msg = "\n".join(real_errors) if real_errors else stderr
            logger.error("LibreOffice %s failed: %s", label, error_msg)
            raise PDFConversionError(
                f"Conversion failed: {error_msg or 'Unknown error'}"
            )

        # Return code 0 but no output file
        files_in_dir = os.listdir(output_dir) if os.path.isdir(output_dir) else []
        logger.error(
            "Expected output not found at %s. Files in output dir: %s",
            output_path,
            files_in_dir,
        )
        raise PDFConversionError("Output file was not created.")
    finally:
        # Cleanup temporary user profile
        shutil.rmtree(user_install_dir, ignore_errors=True)


def pdf_to_word(input_path: str, output_dir: str, timeout: float = 120) -> str:
    """
    Convert a PDF file to Word (DOCX) format using LibreOffice headless.

    Args:
        input_path: Path to the input PDF file
        output_dir: Directory for the output file
        timeout: Maximum number of seconds to wait for LibreOffice
            (defaults to 2 minutes)

    Returns:
        Path to the converted DOCX file

    Raises:
        PDFConversionError: If conversion fails
    """
    return _convert_with_libreoffice(
        input_path,
        output_dir,
        target_ext="docx",
        label="PDF→Word",
        profile_prefix="lo_pdf2word_",
        # PDFs must be opened through the Writer PDF import filter.
        extra_args=("--infilter=writer_pdf_import",),
        timeout=timeout,
    )


def word_to_pdf(input_path: str, output_dir: str, timeout: float = 120) -> str:
    """
    Convert a Word (DOC/DOCX) file to PDF format using LibreOffice headless.

    Args:
        input_path: Path to the input Word file
        output_dir: Directory for the output file
        timeout: Maximum number of seconds to wait for LibreOffice
            (defaults to 2 minutes)

    Returns:
        Path to the converted PDF file

    Raises:
        PDFConversionError: If conversion fails
    """
    return _convert_with_libreoffice(
        input_path,
        output_dir,
        target_ext="pdf",
        label="Word→PDF",
        profile_prefix="lo_word2pdf_",
        timeout=timeout,
    )