100 lines
4.2 KiB
Python
100 lines
4.2 KiB
Python
import os
|
|
import httpx
|
|
import logging
|
|
from core.models import IngestedDocument, DocumentClassificationResult, DocumentType, ProcessingPolicy, PdfType
|
|
from extraction.magic_numbers import MagicNumberValidator
|
|
from extraction.pdf_inspector import PDFInspector
|
|
|
|
# Module-level logger shared by the classification engine; all records are
# emitted under the fixed logger name "DCE".
logger = logging.getLogger("DCE")
|
|
|
|
class DocumentClassificationEngine:
    """Document Classification Engine (DCE).

    Maps one ingested document to a document type and a processing policy.
    A ``.pdf`` file is routed by the PDF type the inspector reports for its
    downloaded bytes; every other file is routed purely by its lower-cased
    file extension. The magic-number check on the download URL is advisory:
    its outcome is logged and never changes the routing decision.
    """

    # Extension groups consulted, in this order, by the non-PDF routing rules.
    _TEXTUAL_EXTENSIONS = (".docx", ".doc", ".txt", ".md")
    _SPREADSHEET_EXTENSIONS = (".xlsx", ".xls", ".csv")
    _CAD_EXTENSIONS = (".dwg", ".dxf", ".cad")

    def __init__(self):
        """Create the engine together with the PDF inspector it delegates to."""
        self.pdf_inspector = PDFInspector()

    def classify(self, document: IngestedDocument) -> DocumentClassificationResult:
        """Classify ``document`` and return the routing decision.

        Args:
            document: The ingested item. Its ``name``, ``item_id`` and
                ``download_url`` attributes are read.

        Returns:
            A ``DocumentClassificationResult`` carrying the item id, document
            type, processing policy, file extension, a supported flag (true
            unless the policy is ``UNSUPPORTED``) and a human-readable reason.
        """
        logger.info(f"Classifying document: {document.name} (ID: {document.item_id})")

        suffix = os.path.splitext(document.name)[1].lower()

        # 1. Magic number validation (informational only).
        self._log_magic_number_check(document)

        # 2. Routing rules.
        if suffix == ".pdf":
            kind, handling, why = self._route_pdf(self._detect_pdf_type(document))
        else:
            kind, handling, why = self._route_by_extension(suffix)

        outcome = DocumentClassificationResult(
            item_id=document.item_id,
            doc_type=kind,
            processing_policy=handling,
            file_extension=suffix,
            is_supported=handling != ProcessingPolicy.UNSUPPORTED,
            reason=why,
        )

        logger.info(f"Result -> Type: {kind.value}, Policy: {handling.value}, Reason: {why}")
        return outcome

    def _log_magic_number_check(self, document) -> None:
        """Log whether the file header matches a known signature, if a URL exists."""
        if not document.download_url:
            return

        header = MagicNumberValidator.fetch_header_bytes(document.download_url)
        # The detected type is part of the validator's 3-tuple but is not used here.
        matched, _detected, signature = MagicNumberValidator.validate_from_bytes(header)
        if matched:
            logger.info(f"Magic Number match: {signature}")
        else:
            logger.warning(f"Could not verify magic number for {document.name}. Trusting extension fallback.")

    def _detect_pdf_type(self, document):
        """Download the PDF and ask the inspector for its type.

        Falls back to ``PdfType.SCAN_PDF`` when there is no download URL or
        when the download or inspection raises.
        """
        if not document.download_url:
            logger.warning("No download_url available for PDF. Defaulting to SCAN_PDF.")
            return PdfType.SCAN_PDF

        logger.info("Downloading PDF into memory for PyMuPDF inspection...")
        try:
            with httpx.Client() as http:
                response = http.get(document.download_url)
                response.raise_for_status()
                return self.pdf_inspector.inspect_pdf_from_bytes(response.content)
        except Exception as exc:
            # Best effort: any failure is logged and treated as a scanned PDF.
            logger.error(f"Failed to download/inspect PDF: {exc}")
            return PdfType.SCAN_PDF

    @staticmethod
    def _route_pdf(pdf_type):
        """Translate an inspected PDF type into ``(doc_type, policy, reason)``."""
        if pdf_type == PdfType.TEXT_PDF:
            return (
                DocumentType.TEXTUAL_DOCUMENT,
                ProcessingPolicy.SKIP_OCR,
                "PDF has text layer (TEXT_PDF)",
            )
        if pdf_type == PdfType.DRAWING_PDF:
            return (
                DocumentType.DRAWING,
                ProcessingPolicy.METADATA_ONLY,
                "PDF has large vector dimensions (DRAWING_PDF)",
            )
        if pdf_type == PdfType.AMBIGUOUS_PDF:
            # Runtime reason text is Vietnamese; in English: "Unusually large PDF
            # dimensions (A3/A2 size or high DPI); a human must confirm whether
            # it is a scan or a drawing".
            return (
                DocumentType.UNKNOWN,
                ProcessingPolicy.REQUIRES_REVIEW,
                "Kích thước PDF lớn bất thường (khổ A3/A2 hoặc DPI cao), cần con người xác nhận là bản Scan hay Bản vẽ",
            )
        # Any other value is handled as a scanned PDF that needs OCR.
        return (
            DocumentType.TEXTUAL_DOCUMENT,
            ProcessingPolicy.REQUIRES_OCR,
            "PDF has no text layer (SCAN_PDF)",
        )

    @classmethod
    def _route_by_extension(cls, suffix):
        """Translate a non-PDF extension into ``(doc_type, policy, reason)``."""
        if suffix in cls._TEXTUAL_EXTENSIONS:
            return (
                DocumentType.TEXTUAL_DOCUMENT,
                ProcessingPolicy.SKIP_OCR,
                "Standard textual document format",
            )
        if suffix in cls._SPREADSHEET_EXTENSIONS:
            return (
                DocumentType.SPREADSHEET,
                ProcessingPolicy.SKIP_OCR,
                "Spreadsheet document format",
            )
        if suffix in cls._CAD_EXTENSIONS:
            return (
                DocumentType.DRAWING,
                ProcessingPolicy.METADATA_ONLY,
                "Native CAD drawing format",
            )
        return (
            DocumentType.BINARY,
            ProcessingPolicy.UNSUPPORTED,
            f"Unsupported or binary extension: {suffix}",
        )
|