62 lines
1.9 KiB
Python
62 lines
1.9 KiB
Python
import logging
|
|
from core.models import DocumentType, ProcessingPolicy, PdfType
|
|
|
|
# Module-level logger; registered under the fixed name "PDFInspector"
# rather than the module's import path.
logger = logging.getLogger("PDFInspector")
|
|
|
|
class PDFInspector:
    """
    Inspects PDF files to determine if they are TEXT, SCAN, DRAWING or AMBIGUOUS.

    Only the first ``MAX_PAGES_TO_CHECK`` pages are sampled. The decision is
    based on the average amount of extractable text per sampled page and on
    the largest page side seen among the sampled pages.
    """

    # Number of leading pages sampled per document.
    MAX_PAGES_TO_CHECK = 3
    # A sampled page whose longest side (in the units of ``page.rect``)
    # exceeds this value marks the document as "huge".
    HUGE_PAGE_DIMENSION = 3000
    # A sampled page whose longest side exceeds this value (but not the huge
    # limit) marks the document as having an ambiguous size.
    AMBIGUOUS_PAGE_DIMENSION = 1000

    def __init__(self, text_density_threshold: int = 100) -> None:
        """
        Args:
            text_density_threshold: Minimum average number of stripped text
                characters per sampled page for a document to be classified
                as ``PdfType.TEXT_PDF``.
        """
        self.text_density_threshold = text_density_threshold

    def inspect_pdf_from_bytes(self, pdf_bytes: bytes) -> PdfType:
        """
        Deep inspects a PDF file from a byte stream.

        Args:
            pdf_bytes: Raw content of the PDF file.

        Returns:
            ``PdfType.TEXT_PDF`` when the average stripped text length of the
            sampled pages reaches ``text_density_threshold``; otherwise
            ``PdfType.DRAWING_PDF`` if any sampled page is huge, otherwise
            ``PdfType.AMBIGUOUS_PDF`` if any sampled page has an ambiguous
            size, otherwise ``PdfType.SCAN_PDF``.

            ``PdfType.SCAN_PDF`` is also the fallback when PyMuPDF is not
            installed, when the document has no pages, or when any error
            occurs during inspection. This method does not raise.
        """
        try:
            import fitz  # PyMuPDF
        except ImportError:
            logger.error("PyMuPDF (fitz) is not installed. Returning default SCAN_PDF.")
            return PdfType.SCAN_PDF

        try:
            # The context manager guarantees the document is closed on every
            # exit path, including early returns and exceptions.
            with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
                pages_to_check = min(self.MAX_PAGES_TO_CHECK, len(doc))

                # Guard explicitly instead of relying on a ZeroDivisionError
                # being swallowed by the generic handler below.
                if pages_to_check == 0:
                    logger.error("PDF stream contains no pages. Returning default SCAN_PDF.")
                    return PdfType.SCAN_PDF

                total_text_length = 0
                is_huge = False
                is_ambiguous_size = False

                for i in range(pages_to_check):
                    page = doc[i]
                    rect = page.rect
                    max_dim = max(rect.width, rect.height)

                    if max_dim > self.HUGE_PAGE_DIMENSION:
                        is_huge = True
                    elif max_dim > self.AMBIGUOUS_PAGE_DIMENSION:
                        is_ambiguous_size = True

                    total_text_length += len(page.get_text().strip())

                avg_text = total_text_length / pages_to_check

                # Text density takes precedence over page size: a large page
                # with enough extractable text is still a text PDF.
                if avg_text >= self.text_density_threshold:
                    return PdfType.TEXT_PDF

                if is_huge:
                    return PdfType.DRAWING_PDF

                if is_ambiguous_size:
                    return PdfType.AMBIGUOUS_PDF

                return PdfType.SCAN_PDF

        except Exception as e:
            # Best-effort boundary: any failure while opening or reading the
            # PDF falls back to SCAN_PDF rather than propagating to callers.
            logger.error("Error inspecting PDF stream: %s", e)
            return PdfType.SCAN_PDF
|