# poc_system/extraction/ocr_service.py

import io
import logging
import base64
import httpx
import fitz
from PIL import Image
from typing import List, Tuple
from core.models import OCRPageResult
from core.config import settings

logger = logging.getLogger("OCRService")

class OCRService:
    """
    OCR service implemented as a client of a remote vision-language model (VLM).

    Every PDF page is rasterized with PyMuPDF, encoded as a base64 JPEG data
    URL and posted in a chat-style request to ``settings.VLM_ENDPOINT``; the
    content of the first returned choice becomes the OCR text of that page.
    """

    # Zoom factor used when rasterizing a page. Deliberately low to reduce the
    # number of image tokens and avoid HTTP 500 errors caused by exceeding the
    # llama.cpp context window.
    _RENDER_ZOOM = 1.2

    # Instruction sent to the model together with the page image (Vietnamese:
    # extract all text exactly, keep formatting and Vietnamese punctuation).
    _PROMPT = "Hãy trích xuất chính xác toàn bộ văn bản có trong hình ảnh này. Giữ nguyên định dạng và các dấu câu tiếng Việt."

    def __init__(self):
        self.vlm_url = settings.VLM_ENDPOINT
        logger.info("Initialized VLM OCR Service connecting to %s", self.vlm_url)

    def _image_to_base64(self, img: Image.Image) -> str:
        """Encode a PIL image as a base64 JPEG data URL.

        Args:
            img: Image to encode. Any mode other than ``RGB`` (e.g. one with
                an alpha channel) is converted to ``RGB`` first, because JPEG
                cannot store it.

        Returns:
            A string of the form ``data:image/jpeg;base64,<payload>``.
        """
        if img.mode != 'RGB':
            img = img.convert('RGB')
        buffered = io.BytesIO()
        img.save(buffered, format="JPEG", quality=85)
        img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
        return f"data:image/jpeg;base64,{img_str}"

    def _render_page(self, page) -> Image.Image:
        """Rasterize one PyMuPDF page into an RGB PIL image at ``_RENDER_ZOOM``."""
        import gc

        matrix = fitz.Matrix(self._RENDER_ZOOM, self._RENDER_ZOOM)
        # alpha=False is requested explicitly: the "RGB" raw mode below
        # requires exactly three bytes per pixel.
        pix = page.get_pixmap(matrix=matrix, alpha=False)
        img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)

        # Release the pixmap buffer right away to keep peak memory low.
        del pix
        gc.collect()
        return img

    def _build_payload(self, b64_image: str) -> dict:
        """Build the chat-completion request body for one page image."""
        return {
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": b64_image
                            }
                        },
                        {
                            "type": "text",
                            "text": self._PROMPT
                        }
                    ]
                }
            ],
            "temperature": settings.VLM_TEMPERATURE,
            "max_tokens": settings.VLM_MAX_TOKENS
        }

    def _ocr_page(self, client, page_no: int, payload: dict) -> OCRPageResult:
        """Send one page to the VLM and wrap the answer in an ``OCRPageResult``.

        Any failure (transport error, non-2xx status, unexpected response
        shape) is logged and converted into a placeholder result with
        ``confidence=0.0`` so that the remaining pages are still processed.
        """
        try:
            response = client.post(self.vlm_url, json=payload)
            response.raise_for_status()

            data = response.json()
            vlm_text = data['choices'][0]['message']['content'].strip()

            result = OCRPageResult(
                page=page_no,
                text=vlm_text,
                # The VLM does not report per-character confidence, so a
                # fixed value of 0.99 is used.
                confidence=0.99,
                # The legacy comparison column is no longer populated.
                paddle_text="",
                paddle_confidence=0.0
            )
            logger.info("VLM extraction successful for page %s", page_no)
            return result
        except Exception as api_err:
            logger.error("VLM API Error: %s", api_err)
            # Record the failed page but let the caller continue with the rest.
            return OCRPageResult(
                page=page_no,
                text=f"[LỖI KẾT NỐI VLM: {api_err}]",
                confidence=0.0,
                paddle_text="",
                paddle_confidence=0.0
            )

    def process_pdf_bytes(self, pdf_bytes: bytes) -> List[OCRPageResult]:
        """Run VLM-based OCR over every page of an in-memory PDF.

        Args:
            pdf_bytes: Raw content of the PDF file.

        Returns:
            One ``OCRPageResult`` per page, in page order (1-based ``page``).
            Pages whose VLM request failed carry an error placeholder text and
            ``confidence=0.0``. An empty list is returned when ``pdf_bytes``
            is empty or when the PDF cannot be opened or rendered.
        """
        if not pdf_bytes:
            logger.warning("Empty PDF bytes received.")
            return []

        results: List[OCRPageResult] = []
        try:
            doc = fitz.open(stream=pdf_bytes, filetype="pdf")
            try:
                total_pages = len(doc)
                # One HTTP client for the whole document so the connection to
                # the VLM server is reused across pages.
                with httpx.Client(timeout=settings.VLM_TIMEOUT) as client:
                    for page_index in range(total_pages):
                        page_no = page_index + 1
                        logger.info(
                            "VLM Processing page %s/%s via LAN...",
                            page_no,
                            total_pages,
                        )

                        # The rendered image is not kept in a local, so it can
                        # be freed as soon as it has been encoded.
                        b64_image = self._image_to_base64(
                            self._render_page(doc[page_index])
                        )
                        payload = self._build_payload(b64_image)
                        results.append(self._ocr_page(client, page_no, payload))
            finally:
                # Always release the document, even when a page fails to render.
                doc.close()

            return results
        except Exception as e:
            import traceback
            logger.error("Failed to process PDF: %s\n%s", e, traceback.format_exc())
            return []