Xu ly SSO

2026-05-09 10:31:28 +00:00
parent 9d04e7484c
commit f937d1a98e
21 changed files with 2515 additions and 271 deletions
--- a/extraction/dce.py
+++ b/extraction/dce.py
@@ -1,20 +1,35 @@
 import os
 import httpx
 import logging
+from typing import Optional
 from core.models import IngestedDocument, DocumentClassificationResult, DocumentType, ProcessingPolicy, PdfType
 from extraction.magic_numbers import MagicNumberValidator
 from extraction.pdf_inspector import PDFInspector

 logger = logging.getLogger("DCE")

+
 class DocumentClassificationEngine:
    """
    Document Classification Engine (DCE).
+    Phân loại file trước khi quyết định OCR / MarkItDown / Skip.
    """
-    def __init__(self):
+    def __init__(self, provider=None):
+        """
+        Args:
+            provider: BaseStorageProvider instance (optional). When given, it is used to download files.
+        """
        self.pdf_inspector = PDFInspector()
+        self.provider = provider

-    def classify(self, document: IngestedDocument) -> DocumentClassificationResult:
+    def classify(self, document: IngestedDocument, target_item: dict = None) -> DocumentClassificationResult:
+        """
+        Phân loại tài liệu.
+        
+        Args:
+            document: IngestedDocument từ ingestion output
+            target_item: Original item dict từ provider (dùng để download qua provider)
+        """
        logger.info(f"Classifying document: {document.name} (ID: {document.item_id})")
        
        ext = os.path.splitext(document.name)[1].lower()
@@ -23,31 +38,9 @@ class DocumentClassificationEngine:
        policy = ProcessingPolicy.UNSUPPORTED
        reason = "Initial state"
        
-        # 1. Magic Number Validation
-        if document.download_url:
-            header_bytes = MagicNumberValidator.fetch_header_bytes(document.download_url)
-            is_valid, detected_type, sig_desc = MagicNumberValidator.validate_from_bytes(header_bytes)
-            if is_valid:
-                logger.info(f"Magic Number match: {sig_desc}")
-            else:
-                logger.warning(f"Could not verify magic number for {document.name}. Trusting extension fallback.")
-
-        # 2. Routing Rules
+        # 1. Routing Rules
        if ext == ".pdf":
-            pdf_type = PdfType.SCAN_PDF # Simulated default
-            if document.download_url:
-                logger.info("Downloading PDF into memory for PyMuPDF inspection...")
-                try:
-                    with httpx.Client() as client:
-                        resp = client.get(document.download_url)
-                        resp.raise_for_status()
-                        pdf_bytes = resp.content
-                    pdf_type = self.pdf_inspector.inspect_pdf_from_bytes(pdf_bytes)
-                except Exception as e:
-                    logger.error(f"Failed to download/inspect PDF: {e}")
-                    pdf_type = PdfType.SCAN_PDF
-            else:
-                logger.warning("No download_url available for PDF. Defaulting to SCAN_PDF.")
+            pdf_type = self._classify_pdf(document, target_item)
            
            if pdf_type == PdfType.TEXT_PDF:
                doc_type = DocumentType.TEXTUAL_DOCUMENT
@@ -60,7 +53,7 @@ class DocumentClassificationEngine:
            elif pdf_type == PdfType.AMBIGUOUS_PDF:
                doc_type = DocumentType.UNKNOWN
                policy = ProcessingPolicy.REQUIRES_REVIEW
-                reason = "Kích thước PDF lớn bất thường (khổ A3/A2 hoặc DPI cao), cần con người xác nhận là bản Scan hay Bản vẽ"
+                reason = "PDF size lớn bất thường (A3/A2 hoặc DPI cao), cần con người xác nhận"
            else:
                doc_type = DocumentType.TEXTUAL_DOCUMENT
                policy = ProcessingPolicy.REQUIRES_OCR
@@ -81,6 +74,11 @@ class DocumentClassificationEngine:
            policy = ProcessingPolicy.METADATA_ONLY
            reason = "Native CAD drawing format"
            
+        elif ext in [".pptx", ".ppt"]:
+            doc_type = DocumentType.PRESENTATION
+            policy = ProcessingPolicy.SKIP_OCR
+            reason = "Presentation document format"
+            
        else:
            doc_type = DocumentType.BINARY
            policy = ProcessingPolicy.UNSUPPORTED
@@ -97,3 +95,46 @@ class DocumentClassificationEngine:
        
        logger.info(f"Result -> Type: {doc_type.value}, Policy: {policy.value}, Reason: {reason}")
        return result
+
+    def _classify_pdf(self, document: IngestedDocument, target_item: Optional[dict] = None) -> PdfType:
+        """Classify a PDF by downloading its bytes and handing them to the PDF inspector.
+
+        Args:
+            document: Ingested document; its name is used in log messages and
+                it is passed on to the download step.
+            target_item: Original provider item dict, forwarded to the download
+                step. May be None.
+
+        Returns:
+            The PdfType reported by the inspector, or PdfType.SCAN_PDF when the
+            file cannot be downloaded or the inspection itself fails.
+        """
+        pdf_bytes = self._download_pdf(document, target_item)
+
+        if not pdf_bytes:
+            logger.warning(f"Cannot download PDF {document.name}. Defaulting to SCAN_PDF.")
+            return PdfType.SCAN_PDF
+
+        # Magic-number validation is advisory only: a mismatch is logged, but
+        # the bytes are still handed to the inspector.
+        header = pdf_bytes[:256]
+        is_valid, _detected_type, sig_desc = MagicNumberValidator.validate_from_bytes(header)
+        if is_valid:
+            logger.info(f"Magic Number match: {sig_desc}")
+        else:
+            logger.warning(f"Magic number mismatch for {document.name}. Continuing with inspection.")
+
+        # PDF inspection. An inspector failure must not abort classification of
+        # the document, so fall back to SCAN_PDF just like the download-failure
+        # path above does.
+        try:
+            return self.pdf_inspector.inspect_pdf_from_bytes(pdf_bytes)
+        except Exception as e:
+            logger.error(f"Failed to inspect PDF {document.name}: {e}")
+            return PdfType.SCAN_PDF
+
+    def _download_pdf(self, document: IngestedDocument, target_item: Optional[dict] = None) -> Optional[bytes]:
+        """Download the PDF bytes, preferring the provider and falling back to httpx.
+
+        Args:
+            document: Ingested document; its download_url is used for the
+                direct httpx fallback.
+            target_item: Original provider item dict. Only used when a provider
+                was configured on this engine.
+
+        Returns:
+            The downloaded content, or None when no source produced any.
+        """
+        # Option 1: go through the provider (preferred, it uses the right auth).
+        if self.provider and target_item:
+            # Normalise the item: make sure an 'id' field exists, because the
+            # ingestion output may carry 'item_id' instead.
+            if "id" not in target_item and "item_id" in target_item:
+                target_item = {**target_item, "id": target_item["item_id"]}
+            try:
+                content = self.provider.download_file(target_item)
+            except Exception as e:
+                logger.warning(f"Provider download failed: {e}. Falling back to httpx.")
+            else:
+                if content:
+                    return content
+                # An empty/None result is a failed download as well; returning
+                # it directly would skip the httpx fallback below.
+                logger.warning("Provider download returned no content. Falling back to httpx.")
+
+        # Option 2: fetch download_url directly with httpx.
+        if document.download_url:
+            try:
+                with httpx.Client(follow_redirects=True, timeout=60.0) as client:
+                    resp = client.get(document.download_url)
+                    resp.raise_for_status()
+                    return resp.content
+            except Exception as e:
+                logger.error(f"httpx download failed: {e}")
+
+        return None
--- a/extraction/text_extractor.py
+++ b/extraction/text_extractor.py
@@ -0,0 +1,96 @@
+import logging
+from typing import List, Optional
+from core.models import OCRPageResult
+
+logger = logging.getLogger("TextExtractor")
+
+
+class TextExtractor:
+    """
+    Extract text from document formats that do not need OCR:
+    - DOCX (python-docx)
+    - XLSX (openpyxl)
+    - TXT/MD (decoded directly)
+
+    Every extractor returns a list of OCRPageResult with confidence 1.0 and
+    returns an empty list instead of raising when extraction fails.
+    """
+
+    @staticmethod
+    def extract_from_docx(file_bytes: bytes) -> List[OCRPageResult]:
+        """Extract text from a DOCX file: body paragraphs first, then table rows.
+
+        Args:
+            file_bytes: Raw content of the .docx file.
+
+        Returns:
+            A single-element list (page 1) holding all text, or an empty list
+            when the document has no text or cannot be read.
+        """
+        try:
+            from docx import Document
+            import io
+
+            doc = Document(io.BytesIO(file_bytes))
+            stripped = (para.text.strip() for para in doc.paragraphs)
+            paragraphs = [text for text in stripped if text]
+
+            # Table text is appended after the paragraphs, one line per row.
+            for table in doc.tables:
+                for row in table.rows:
+                    cell_texts = []
+                    previous_tc = None
+                    for cell in row.cells:
+                        # python-docx lists a horizontally merged cell once per
+                        # grid column it spans; skip the repeats so its text
+                        # is emitted only once.
+                        if cell._tc is previous_tc:
+                            continue
+                        previous_tc = cell._tc
+                        cell_text = cell.text.strip()
+                        if cell_text:
+                            cell_texts.append(cell_text)
+                    if cell_texts:
+                        paragraphs.append(" | ".join(cell_texts))
+
+            # Every collected entry is non-empty, so an empty list means no text.
+            if not paragraphs:
+                logger.warning("DOCX file is empty or has no readable text.")
+                return []
+
+            full_text = "\n\n".join(paragraphs)
+            return [OCRPageResult(page=1, text=full_text, confidence=1.0)]
+
+        except ImportError:
+            logger.error("python-docx not installed. Run: pip install python-docx")
+            return []
+        except Exception as e:
+            logger.error(f"Failed to extract text from DOCX: {e}")
+            return []
+
+    @staticmethod
+    def extract_from_xlsx(file_bytes: bytes) -> List[OCRPageResult]:
+        """Extract text from an XLSX file, producing one page per non-empty sheet.
+
+        Args:
+            file_bytes: Raw content of the .xlsx file.
+
+        Returns:
+            One OCRPageResult per sheet that has data; the page number is the
+            1-based sheet position and the text starts with a "[Sheet: name]"
+            header. Empty list when nothing is readable or loading fails.
+        """
+        try:
+            from openpyxl import load_workbook
+            import io
+
+            wb = load_workbook(io.BytesIO(file_bytes), read_only=True, data_only=True)
+            results = []
+
+            try:
+                for sheet_idx, sheet_name in enumerate(wb.sheetnames, 1):
+                    ws = wb[sheet_name]
+                    rows = []
+                    for row in ws.iter_rows(values_only=True):
+                        values = (str(c).strip() for c in row if c is not None)
+                        cells = [value for value in values if value]
+                        if cells:
+                            rows.append(" | ".join(cells))
+
+                    if rows:
+                        sheet_text = f"[Sheet: {sheet_name}]\n" + "\n".join(rows)
+                        results.append(OCRPageResult(page=sheet_idx, text=sheet_text, confidence=1.0))
+            finally:
+                # Read-only workbooks must be closed explicitly; do it even
+                # when reading a sheet raises.
+                wb.close()
+
+            if not results:
+                logger.warning("XLSX file is empty or has no readable data.")
+            return results
+
+        except ImportError:
+            logger.error("openpyxl not installed. Run: pip install openpyxl")
+            return []
+        except Exception as e:
+            logger.error(f"Failed to extract text from XLSX: {e}")
+            return []
+
+    @staticmethod
+    def extract_from_text(file_bytes: bytes) -> List[OCRPageResult]:
+        """Extract text from a plain-text file (TXT, MD, CSV).
+
+        Args:
+            file_bytes: Raw file content, decoded as UTF-8 with undecodable
+                bytes replaced.
+
+        Returns:
+            A single-element list (page 1), or an empty list when the file
+            contains only whitespace.
+        """
+        try:
+            # "utf-8-sig" decodes like "utf-8" but drops a leading BOM, which
+            # str.strip() would otherwise leave at the start of the text.
+            text = file_bytes.decode("utf-8-sig", errors="replace").strip()
+            if not text:
+                return []
+            return [OCRPageResult(page=1, text=text, confidence=1.0)]
+        except Exception as e:
+            logger.error(f"Failed to extract text from text file: {e}")
+            return []