Files
poc_system/extraction/pdf_inspector.py

62 lines
1.9 KiB
Python

import logging
from core.models import DocumentType, ProcessingPolicy, PdfType
# Module-level logger shared by everything in this file; it is registered under
# the fixed name "PDFInspector" rather than the module's import path.
logger = logging.getLogger("PDFInspector")
class PDFInspector:
    """
    Inspects PDF files to determine if they are TEXT, SCAN, DRAWING or AMBIGUOUS.

    Classification samples the first few pages of the document and looks at
    two signals: how much extractable text the pages carry on average, and
    how large the biggest page dimension is.
    """

    def __init__(
        self,
        text_density_threshold: int = 100,
        *,
        max_pages_to_check: int = 3,
        huge_dimension_threshold: float = 3000,
        ambiguous_dimension_threshold: float = 1000,
    ):
        """
        Configure the inspection heuristics.

        Args:
            text_density_threshold: Minimum average number of stripped text
                characters per sampled page for the PDF to count as TEXT_PDF.
            max_pages_to_check: How many leading pages are sampled at most.
            huge_dimension_threshold: A sampled page whose larger side exceeds
                this value (as reported by ``page.rect``) marks the document
                as a drawing candidate.
            ambiguous_dimension_threshold: A sampled page whose larger side
                exceeds this value, but not the huge threshold, marks the
                document as an ambiguous candidate.

        Raises:
            ValueError: If ``max_pages_to_check`` is smaller than 1.
        """
        if max_pages_to_check < 1:
            raise ValueError("max_pages_to_check must be at least 1")
        self.text_density_threshold = text_density_threshold
        self.max_pages_to_check = max_pages_to_check
        self.huge_dimension_threshold = huge_dimension_threshold
        self.ambiguous_dimension_threshold = ambiguous_dimension_threshold

    def inspect_pdf_from_bytes(self, pdf_bytes: bytes) -> PdfType:
        """
        Deep inspects a PDF file from a byte stream.

        Args:
            pdf_bytes: Raw content of the PDF file.

        Returns:
            ``PdfType.TEXT_PDF`` when the sampled pages carry enough text,
            otherwise ``PdfType.DRAWING_PDF`` / ``PdfType.AMBIGUOUS_PDF``
            depending on page size, otherwise ``PdfType.SCAN_PDF``.
            ``PdfType.SCAN_PDF`` is also the fallback when PyMuPDF is not
            installed, the document has no pages, or inspection fails.
        """
        # Imported lazily so the module can still be loaded without PyMuPDF.
        try:
            import fitz  # PyMuPDF
        except ImportError:
            logger.error("PyMuPDF (fitz) is not installed. Returning default SCAN_PDF.")
            return PdfType.SCAN_PDF

        try:
            doc = fitz.open(stream=pdf_bytes, filetype="pdf")
            try:
                return self._classify_document(doc)
            finally:
                # Always release the document, including on early return or error.
                doc.close()
        except Exception as e:
            # Best-effort API: any failure degrades to the SCAN_PDF default.
            logger.error("Error inspecting PDF stream: %s", e)
            return PdfType.SCAN_PDF

    def _classify_document(self, doc) -> PdfType:
        """Classify an already opened document from its leading pages."""
        pages_to_check = min(self.max_pages_to_check, len(doc))
        if pages_to_check == 0:
            # Nothing to sample; avoid dividing by zero further down.
            logger.warning("PDF stream contains no pages. Returning default SCAN_PDF.")
            return PdfType.SCAN_PDF

        total_text_length = 0
        is_huge = False
        is_ambiguous_size = False
        for page_index in range(pages_to_check):
            page = doc[page_index]
            rect = page.rect
            max_dim = max(rect.width, rect.height)
            if max_dim > self.huge_dimension_threshold:
                is_huge = True
            elif max_dim > self.ambiguous_dimension_threshold:
                is_ambiguous_size = True
            total_text_length += len(page.get_text().strip())

        # Text density wins over page size: a large page full of text is TEXT.
        avg_text = total_text_length / pages_to_check
        if avg_text >= self.text_density_threshold:
            return PdfType.TEXT_PDF
        if is_huge:
            return PdfType.DRAWING_PDF
        if is_ambiguous_size:
            return PdfType.AMBIGUOUS_PDF
        return PdfType.SCAN_PDF