import logging

from core.models import DocumentType, ProcessingPolicy, PdfType

logger = logging.getLogger("PDFInspector")


class PDFInspector:
    """
    Inspects PDF files to determine if they are TEXT, SCAN, DRAWING or AMBIGUOUS.

    The decision is based on the first few pages only: the average amount of
    extractable text per inspected page, and the largest page dimension seen.
    """

    # Number of leading pages that are sampled for classification.
    MAX_PAGES_TO_CHECK = 3
    # A page whose larger side exceeds this value (in the units reported by
    # ``page.rect``) marks the document as a drawing candidate.
    HUGE_PAGE_DIMENSION = 3000
    # A page whose larger side exceeds this value, but not the "huge" limit,
    # marks the document as ambiguous.
    AMBIGUOUS_PAGE_DIMENSION = 1000

    def __init__(self, text_density_threshold: int = 100):
        """
        Args:
            text_density_threshold: Minimum average number of stripped text
                characters per inspected page for a document to be
                classified as ``PdfType.TEXT_PDF``.
        """
        self.text_density_threshold = text_density_threshold

    def inspect_pdf_from_bytes(self, pdf_bytes: bytes) -> PdfType:
        """
        Deep inspects a PDF file from a byte stream.

        Args:
            pdf_bytes: Raw content of the PDF file.

        Returns:
            ``PdfType.TEXT_PDF`` when the average text length of the inspected
            pages reaches ``text_density_threshold``; otherwise
            ``PdfType.DRAWING_PDF`` if any inspected page is huge,
            ``PdfType.AMBIGUOUS_PDF`` if any inspected page is of ambiguous
            size, and ``PdfType.SCAN_PDF`` in every other case. This includes
            a missing PyMuPDF installation, a document without pages, and any
            error raised while reading the document.
        """
        try:
            import fitz  # PyMuPDF
        except ImportError:
            logger.error("PyMuPDF (fitz) is not installed. Returning default SCAN_PDF.")
            return PdfType.SCAN_PDF

        try:
            # The context manager closes the document on every exit path,
            # including when page access or text extraction raises.
            with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
                return self._classify_document(doc)
        except Exception as e:
            # Best-effort boundary: any failure degrades to the SCAN default
            # instead of propagating to the caller.
            logger.error("Error inspecting PDF stream: %s", e)
            return PdfType.SCAN_PDF

    def _classify_document(self, doc) -> PdfType:
        """Classify an already opened document from its leading pages."""
        pages_to_check = min(self.MAX_PAGES_TO_CHECK, len(doc))
        if pages_to_check == 0:
            # Without pages there is nothing to average over; return the
            # default explicitly rather than hitting a division by zero.
            logger.warning("PDF stream contains no pages. Returning default SCAN_PDF.")
            return PdfType.SCAN_PDF

        total_text_length = 0
        is_huge = False
        is_ambiguous_size = False

        for page_index in range(pages_to_check):
            page = doc[page_index]
            rect = page.rect
            max_dim = max(rect.width, rect.height)
            if max_dim > self.HUGE_PAGE_DIMENSION:
                is_huge = True
            elif max_dim > self.AMBIGUOUS_PAGE_DIMENSION:
                is_ambiguous_size = True

            total_text_length += len(page.get_text().strip())

        # Text density takes precedence over page size.
        avg_text = total_text_length / pages_to_check
        if avg_text >= self.text_density_threshold:
            return PdfType.TEXT_PDF
        if is_huge:
            return PdfType.DRAWING_PDF
        if is_ambiguous_size:
            return PdfType.AMBIGUOUS_PDF
        return PdfType.SCAN_PDF