Phase 7: Hoàn thiện Modular RAG Backend với FastAPI và Đa LLM Provider

2026-05-08 07:30:30 +00:00
commit 26d1298cf6
51 changed files with 5360 additions and 0 deletions
--- a/extraction/ocr_service.py
+++ b/extraction/ocr_service.py
@@ -0,0 +1,111 @@
+import io
+import logging
+import base64
+import httpx
+import fitz
+from PIL import Image
+from typing import List, Tuple
+from core.models import OCRPageResult
+from core.config import settings
+
+logger = logging.getLogger("OCRService")
+
+class OCRService:
+    """OCR service that sends rendered PDF pages to a VLM chat endpoint.
+
+    Each page is rasterized, sent as a Base64 JPEG data URL to the VLM endpoint
+    (``settings.VLM_ENDPOINT``; Vintern-3B over LAN), and its reply is the OCR text.
+    """
+
+    # Render zoom is kept low to limit image tokens and avoid HTTP 500 errors
+    # caused by exceeding the llama.cpp context window.
+    RENDER_ZOOM = 1.2
+    JPEG_QUALITY = 85
+    # The VLM returns no per-character confidence, so a fixed value is reported.
+    VLM_CONFIDENCE = 0.99
+    # Runtime prompt sent to the model; it is Vietnamese on purpose.
+    OCR_PROMPT = "Hãy trích xuất chính xác toàn bộ văn bản có trong hình ảnh này. Giữ nguyên định dạng và các dấu câu tiếng Việt."
+
+    def __init__(self):
+        self.vlm_url = settings.VLM_ENDPOINT
+        logger.info(f"Initialized VLM OCR Service connecting to {self.vlm_url}")
+
+    def _image_to_base64(self, img: Image.Image) -> str:
+        """Return *img* encoded as a ``data:image/jpeg;base64,...`` URL."""
+        # JPEG cannot store alpha, so any non-RGB mode is converted first.
+        if img.mode != 'RGB':
+            img = img.convert('RGB')
+        buffered = io.BytesIO()
+        img.save(buffered, format="JPEG", quality=self.JPEG_QUALITY)
+        img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
+        return f"data:image/jpeg;base64,{img_str}"
+
+    def _render_page(self, page) -> Image.Image:
+        """Rasterize one PDF page to an RGB PIL image at ``RENDER_ZOOM``."""
+        matrix = fitz.Matrix(self.RENDER_ZOOM, self.RENDER_ZOOM)
+        pix = page.get_pixmap(matrix=matrix)
+        return Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
+
+    def _build_payload(self, image_url: str) -> dict:
+        """Build the chat request body for one page image."""
+        return {
+            "messages": [{
+                "role": "user",
+                "content": [
+                    {"type": "image_url", "image_url": {"url": image_url}},
+                    {"type": "text", "text": self.OCR_PROMPT},
+                ],
+            }],
+            "temperature": settings.VLM_TEMPERATURE,
+            "max_tokens": settings.VLM_MAX_TOKENS,
+        }
+
+    def _ocr_page(self, client, page_no: int, image_url: str) -> OCRPageResult:
+        """Post one page to the VLM; return its text or an error placeholder."""
+        try:
+            response = client.post(self.vlm_url, json=self._build_payload(image_url))
+            response.raise_for_status()
+            vlm_text = response.json()['choices'][0]['message']['content'].strip()
+            result = OCRPageResult(
+                page=page_no, text=vlm_text, confidence=self.VLM_CONFIDENCE,
+                paddle_text="", paddle_confidence=0.0,  # legacy comparison columns
+            )
+        except Exception as api_err:
+            # Best effort: record the failed page so the remaining pages still run.
+            logger.error(f"VLM API Error: {api_err}")
+            return OCRPageResult(
+                page=page_no, text=f"[LỖI KẾT NỐI VLM: {api_err}]",
+                confidence=0.0, paddle_text="", paddle_confidence=0.0,
+            )
+        logger.info(f"VLM extraction successful for page {page_no}")
+        return result
+
+    def process_pdf_bytes(self, pdf_bytes: bytes) -> List[OCRPageResult]:
+        """Run VLM OCR over every page of an in-memory PDF.
+
+        Args:
+            pdf_bytes: Raw PDF content.
+
+        Returns:
+            One ``OCRPageResult`` per page (1-based ``page``). A page whose VLM
+            call failed carries an error marker as text and confidence 0.0.
+            ``[]`` is returned for empty input or when the PDF cannot be processed.
+        """
+        if not pdf_bytes:
+            logger.warning("Empty PDF bytes received.")
+            return []
+
+        results = []
+        try:
+            # Context managers close the document and the shared HTTP client.
+            with fitz.open(stream=pdf_bytes, filetype="pdf") as doc, \
+                    httpx.Client(timeout=settings.VLM_TIMEOUT) as client:
+                total = len(doc)
+                for index in range(total):
+                    logger.info(f"VLM Processing page {index + 1}/{total} via LAN...")
+                    image_url = self._image_to_base64(self._render_page(doc[index]))
+                    results.append(self._ocr_page(client, index + 1, image_url))
+            return results
+        except Exception as e:
+            logger.exception(f"Failed to process PDF: {e}")
+            return []