# poc_system/extraction/pdf_inspector.py

import logging
from core.models import DocumentType, ProcessingPolicy, PdfType

# Module-level logger shared by everything in this file; it is registered under
# the fixed name "PDFInspector" rather than the module's import path.
logger = logging.getLogger("PDFInspector")

class PDFInspector:
    """
    Inspects PDF files to determine if they are TEXT, SCAN, DRAWING or AMBIGUOUS.

    Classification samples the leading pages of a document and combines two
    signals: the average amount of extractable text per sampled page, and the
    largest page dimension seen among the sampled pages.
    """

    # Number of leading pages sampled per document.
    MAX_PAGES_TO_CHECK = 3
    # A sampled page whose longer side exceeds this value flags the document
    # as a drawing candidate (compared against ``page.rect`` dimensions).
    DRAWING_DIMENSION_THRESHOLD = 3000
    # A sampled page whose longer side exceeds this value (but not the drawing
    # threshold) flags the document as ambiguous.
    AMBIGUOUS_DIMENSION_THRESHOLD = 1000

    def __init__(self, text_density_threshold: int = 100):
        """
        Create an inspector.

        Args:
            text_density_threshold: Minimum average number of (stripped) text
                characters per sampled page for a document to be classified
                as ``PdfType.TEXT_PDF``.
        """
        self.text_density_threshold = text_density_threshold

    def inspect_pdf_from_bytes(self, pdf_bytes: bytes) -> PdfType:
        """
        Deep inspects a PDF file from a byte stream.

        The checks are applied in this order:

        1. Average extracted text length over the sampled pages is at least
           ``text_density_threshold`` -> ``PdfType.TEXT_PDF``.
        2. Any sampled page is larger than ``DRAWING_DIMENSION_THRESHOLD``
           -> ``PdfType.DRAWING_PDF``.
        3. Any sampled page is larger than ``AMBIGUOUS_DIMENSION_THRESHOLD``
           -> ``PdfType.AMBIGUOUS_PDF``.
        4. Otherwise -> ``PdfType.SCAN_PDF``.

        Args:
            pdf_bytes: Raw content of the PDF file.

        Returns:
            The detected ``PdfType``. ``PdfType.SCAN_PDF`` is also the fallback
            when PyMuPDF is not installed, when the document has no pages, or
            when any error occurs while opening or reading the document; this
            method does not propagate those errors.
        """
        try:
            import fitz  # PyMuPDF
        except ImportError:
            logger.error("PyMuPDF (fitz) is not installed. Returning default SCAN_PDF.")
            return PdfType.SCAN_PDF

        try:
            # The context manager guarantees the document is closed on every
            # exit path, including early returns and exceptions.
            with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
                pages_to_check = min(self.MAX_PAGES_TO_CHECK, len(doc))

                # Guard explicitly instead of relying on a ZeroDivisionError
                # being swallowed by the generic handler below.
                if pages_to_check == 0:
                    logger.warning("PDF stream contains no pages. Returning default SCAN_PDF.")
                    return PdfType.SCAN_PDF

                total_text_length = 0
                is_huge = False
                is_ambiguous_size = False

                for i in range(pages_to_check):
                    page = doc[i]
                    rect = page.rect
                    max_dim = max(rect.width, rect.height)

                    if max_dim > self.DRAWING_DIMENSION_THRESHOLD:
                        is_huge = True
                    elif max_dim > self.AMBIGUOUS_DIMENSION_THRESHOLD:
                        is_ambiguous_size = True

                    # Surrounding whitespace is not counted as text content.
                    total_text_length += len(page.get_text().strip())

                avg_text = total_text_length / pages_to_check
        except Exception as e:
            # Best-effort inspection: any failure degrades to the SCAN default.
            logger.error("Error inspecting PDF stream: %s", e)
            return PdfType.SCAN_PDF

        # Text density takes precedence over page size.
        if avg_text >= self.text_density_threshold:
            return PdfType.TEXT_PDF

        if is_huge:
            return PdfType.DRAWING_PDF

        if is_ambiguous_size:
            return PdfType.AMBIGUOUS_PDF

        return PdfType.SCAN_PDF