Phase 7: Hoàn thiện Modular RAG Backend với FastAPI và Đa LLM Provider
This commit is contained in:
61
extraction/pdf_inspector.py
Normal file
61
extraction/pdf_inspector.py
Normal file
@@ -0,0 +1,61 @@
|
||||
import logging
|
||||
from core.models import DocumentType, ProcessingPolicy, PdfType
|
||||
|
||||
# Module-level logger, registered under the fixed name "PDFInspector".
logger = logging.getLogger("PDFInspector")
|
||||
|
||||
class PDFInspector:
    """
    Inspects PDF files to determine if they are TEXT, SCAN, DRAWING or AMBIGUOUS.

    The classification samples the first few pages of the document and
    combines two signals: the average amount of extractable text per page
    and the largest page dimension seen (as reported by ``page.rect``).
    """

    def __init__(
        self,
        text_density_threshold: int = 100,
        *,
        max_pages_to_check: int = 3,
        drawing_dim_threshold: float = 3000,
        ambiguous_dim_threshold: float = 1000,
    ):
        """
        Configure the inspector.

        Args:
            text_density_threshold: Minimum average number of stripped text
                characters per sampled page for the PDF to count as TEXT.
            max_pages_to_check: Upper bound on how many leading pages are
                sampled.
            drawing_dim_threshold: A sampled page whose larger side exceeds
                this value marks the document as a DRAWING candidate.
            ambiguous_dim_threshold: A sampled page whose larger side exceeds
                this value (but not ``drawing_dim_threshold``) marks the
                document as an AMBIGUOUS candidate.
        """
        self.text_density_threshold = text_density_threshold
        self.max_pages_to_check = max_pages_to_check
        self.drawing_dim_threshold = drawing_dim_threshold
        self.ambiguous_dim_threshold = ambiguous_dim_threshold

    def inspect_pdf_from_bytes(self, pdf_bytes: bytes) -> PdfType:
        """
        Deep inspects a PDF file from a byte stream.

        Args:
            pdf_bytes: Raw content of the PDF file.

        Returns:
            ``PdfType.TEXT_PDF`` when the sampled pages carry enough text;
            otherwise ``PdfType.DRAWING_PDF`` if any sampled page is huge,
            ``PdfType.AMBIGUOUS_PDF`` if any sampled page is moderately
            large, and ``PdfType.SCAN_PDF`` in every other case -- including
            when PyMuPDF is missing, the document has no pages, or the
            stream cannot be parsed. This method never raises.
        """
        try:
            import fitz  # PyMuPDF
        except ImportError:
            logger.error("PyMuPDF (fitz) is not installed. Returning default SCAN_PDF.")
            return PdfType.SCAN_PDF

        try:
            # The context manager guarantees the document is closed even if
            # a page fails to load or text extraction raises.
            with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
                pages_to_check = min(self.max_pages_to_check, len(doc))

                # Nothing to sample: avoid dividing by zero below and fall
                # back to the same default the error path uses.
                if pages_to_check <= 0:
                    logger.warning("PDF stream has no pages to inspect. Returning default SCAN_PDF.")
                    return PdfType.SCAN_PDF

                total_text_length = 0
                is_huge = False
                is_ambiguous_size = False

                for page_index in range(pages_to_check):
                    page = doc[page_index]
                    rect = page.rect
                    max_dim = max(rect.width, rect.height)

                    # Each page sets at most one flag, but the flags
                    # accumulate across all sampled pages.
                    if max_dim > self.drawing_dim_threshold:
                        is_huge = True
                    elif max_dim > self.ambiguous_dim_threshold:
                        is_ambiguous_size = True

                    total_text_length += len(page.get_text().strip())

            avg_text = total_text_length / pages_to_check

            # Text density wins over page size: a large page with plenty of
            # extractable text is still treated as a text PDF.
            if avg_text >= self.text_density_threshold:
                return PdfType.TEXT_PDF

            if is_huge:
                return PdfType.DRAWING_PDF

            if is_ambiguous_size:
                return PdfType.AMBIGUOUS_PDF

            return PdfType.SCAN_PDF

        except Exception as e:
            # Best-effort boundary: any parsing failure degrades to SCAN_PDF
            # instead of propagating to the caller.
            logger.exception("Error inspecting PDF stream: %s", e)
            return PdfType.SCAN_PDF
|
||||
Reference in New Issue
Block a user