Phase 7: Hoàn thiện Modular RAG Backend với FastAPI và Đa LLM Provider

2026-05-08 07:30:30 +00:00
commit 26d1298cf6
51 changed files with 5360 additions and 0 deletions
--- a/extraction/pdf_inspector.py
+++ b/extraction/pdf_inspector.py
@@ -0,0 +1,61 @@
+import logging
+from core.models import DocumentType, ProcessingPolicy, PdfType
+
+logger = logging.getLogger("PDFInspector")
+
+class PDFInspector:
+    """
+    Inspects PDF files to determine if they are TEXT, SCAN, DRAWING or AMBIGUOUS.
+
+    At most the first three pages are sampled: their average extractable-text
+    length and their largest page dimension drive the verdict.
+    """
+
+    def __init__(self, text_density_threshold: int = 100):
+        # Minimum average stripped-text length per sampled page for TEXT_PDF.
+        self.text_density_threshold = text_density_threshold
+
+    def inspect_pdf_from_bytes(self, pdf_bytes: bytes) -> PdfType:
+        """
+        Deep inspects a PDF file from a byte stream.
+
+        A missing PyMuPDF install, a page-less document or any error while
+        reading the stream falls back to PdfType.SCAN_PDF.
+        """
+        try:
+            import fitz  # PyMuPDF
+        except ImportError:
+            logger.error("PyMuPDF (fitz) is not installed. Returning default SCAN_PDF.")
+            return PdfType.SCAN_PDF
+
+        try:
+            # Open via context manager so the document is closed on every
+            # exit path, including early returns and parse errors.
+            with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
+                pages_to_check = min(3, len(doc))
+                if pages_to_check == 0:
+                    # Nothing to sample; also avoids dividing by zero below.
+                    logger.error("PDF stream has no pages. Returning default SCAN_PDF.")
+                    return PdfType.SCAN_PDF
+
+                total_text_length = 0
+                largest_dim = 0.0
+                for i in range(pages_to_check):
+                    page = doc[i]
+                    rect = page.rect
+                    largest_dim = max(largest_dim, rect.width, rect.height)
+                    total_text_length += len(page.get_text().strip())
+
+            avg_text = total_text_length / pages_to_check
+            if avg_text >= self.text_density_threshold:
+                return PdfType.TEXT_PDF
+            # Text is sparse: fall back to page size (units as reported by page.rect).
+            if largest_dim > 3000:
+                return PdfType.DRAWING_PDF
+            if largest_dim > 1000:
+                return PdfType.AMBIGUOUS_PDF
+            return PdfType.SCAN_PDF
+
+        except Exception as e:
+            logger.error("Error inspecting PDF stream: %s", e)
+            return PdfType.SCAN_PDF