Phase 7: Hoàn thiện Modular RAG Backend với FastAPI và Đa LLM Provider

2026-05-08 07:30:30 +00:00
commit 26d1298cf6
51 changed files with 5360 additions and 0 deletions
--- a/chunking/markdown_chunker.py
+++ b/chunking/markdown_chunker.py
@@ -0,0 +1,211 @@
+import bisect
+import logging
+import re
+import uuid
+from typing import Any, Callable, Dict, Iterator, List, Optional, Tuple
+
+from core.models import OCRPageResult, DocumentChunk
+
+logger = logging.getLogger("MarkdownChunker")
+
+# One paragraph of the concatenated document: (stripped text, start, end).
+# start/end are character offsets into the concatenated text; end is exclusive.
+_Piece = Tuple[str, int, int]
+
+
+class MarkdownChunker:
+    """Split OCR'd pages into overlapping chunks along Markdown boundaries.
+
+    Pages are concatenated into one string, which is split before Markdown
+    headings (levels 1-4); sections longer than ``max_chunk_size`` are split
+    again on blank lines, and paragraphs that are still too long are cut at
+    whitespace. The paragraphs are then packed greedily into chunks, each new
+    chunk starting with the last ``overlap`` characters of the previous one.
+    Every paragraph keeps its character offsets in the concatenated text, so
+    ``page_from``/``page_to`` are looked up from real positions.
+    """
+
+    _SEPARATOR = "\n\n"
+    _HEADING_BOUNDARY = re.compile(r'(?=\n#{1,4}\s)')
+    _BLANK_LINE = re.compile(r'\n\n')
+
+    def __init__(self, max_chunk_size: int = 1500, overlap: int = 200):
+        """Store the chunking limits.
+
+        Args:
+            max_chunk_size: Upper bound, in characters, for the content packed
+                into a chunk. A chunk can exceed it by at most ``overlap``
+                plus one separator because of the carried-over prefix.
+            overlap: Number of trailing characters of a chunk repeated at the
+                start of the next one; ``0`` or less disables the overlap.
+        """
+        self.max_chunk_size = max_chunk_size
+        self.overlap = overlap
+
+    def chunk_document(self,
+                       pages: List[OCRPageResult],
+                       metadata: Dict[str, Any]) -> List[DocumentChunk]:
+        """Turn OCR/VLM page results into a list of chunks.
+
+        Args:
+            pages: Page results; ``page`` is used for ordering and reported as
+                the page number, ``text`` is the page content.
+            metadata: Source-file attributes. The keys ``item_id``, ``name``,
+                ``web_url`` and ``site_id`` are copied onto every chunk and
+                default to ``""`` when absent.
+
+        Returns:
+            The chunks in document order; an empty list when no page
+            contains text.
+        """
+        separator = self._SEPARATOR
+        limit = max(1, self.max_chunk_size)
+
+        # 1. Concatenate the pages in page order, remembering where each page
+        #    starts so that any offset can be mapped back to its page number.
+        page_starts: List[int] = []
+        page_numbers: List[int] = []
+        page_texts: List[str] = []
+        cursor = 0
+        for page in sorted(pages, key=lambda p: p.page):
+            page_starts.append(cursor)
+            page_numbers.append(page.page)
+            page_text = (page.text or "") + separator
+            page_texts.append(page_text)
+            cursor += len(page_text)
+        full_text = "".join(page_texts)
+
+        def page_at(char_index: int) -> int:
+            """Return the number of the page that contains ``char_index``."""
+            slot = bisect.bisect_right(page_starts, char_index) - 1
+            return page_numbers[max(slot, 0)]
+
+        # 2. Pack the paragraphs greedily. ``current`` holds the pieces of the
+        #    chunk being built and ``current_len`` the length of their
+        #    separator-joined text.
+        chunks: List[DocumentChunk] = []
+        current: List[_Piece] = []
+        current_len = 0
+        for piece in self._iter_pieces(full_text, limit):
+            piece_len = len(piece[0])
+            if current and current_len + len(separator) + piece_len > limit:
+                chunks.append(self._build_chunk(current, page_at, metadata))
+                carried = self._carry_over(current)
+                current = [carried] if carried else []
+                current_len = len(carried[0]) if carried else 0
+            if current:
+                current_len += len(separator)
+            current.append(piece)
+            current_len += piece_len
+
+        if current:
+            chunks.append(self._build_chunk(current, page_at, metadata))
+
+        logger.info("Chunked document %s into %d chunks.",
+                    metadata.get("name"), len(chunks))
+        return chunks
+
+    def _build_chunk(self,
+                     pieces: List[_Piece],
+                     page_at: Callable[[int], int],
+                     metadata: Dict[str, Any]) -> DocumentChunk:
+        """Create the ``DocumentChunk`` for one finished group of pieces."""
+        first_offset = pieces[0][1]
+        last_offset = pieces[-1][2] - 1  # end offsets are exclusive
+        return DocumentChunk(
+            chunk_id=f"chk_{uuid.uuid4().hex[:10]}",
+            file_id=metadata.get("item_id", ""),
+            file_name=metadata.get("name", ""),
+            text=self._SEPARATOR.join(text for text, _, _ in pieces).strip(),
+            page_from=page_at(first_offset),
+            page_to=page_at(last_offset),
+            source_url=metadata.get("web_url", ""),
+            site_id=metadata.get("site_id", ""),
+            permissions=["*"],  # TODO: assign the real SharePoint permissions
+        )
+
+    def _carry_over(self, pieces: List[_Piece]) -> Optional[_Piece]:
+        """Return the overlap prefix for the chunk that follows ``pieces``.
+
+        The prefix is the last ``overlap`` characters of the finished chunk.
+        Its start offset is found by walking the pieces backwards, so the next
+        chunk's ``page_from`` covers the repeated text as well.
+        """
+        if self.overlap <= 0:
+            # Explicit guard: ``text[-0:]`` would return the whole text.
+            return None
+        separator = self._SEPARATOR
+        tail = separator.join(text for text, _, _ in pieces)[-self.overlap:]
+        remaining = len(tail)
+        tail_start = pieces[0][1]
+        for text, start, end in reversed(pieces):
+            if remaining <= len(text):
+                # The tail begins inside this piece. ``max`` covers a
+                # carried-over first piece, whose span is only approximate.
+                tail_start = max(start, end - remaining)
+                break
+            remaining -= len(text) + len(separator)
+            if remaining <= 0:
+                # The tail begins in the separator just before this piece.
+                tail_start = start
+                break
+        return tail, tail_start, pieces[-1][2]
+
+    def _iter_pieces(self, full_text: str, limit: int) -> Iterator[_Piece]:
+        """Yield the non-empty paragraphs of ``full_text`` with their offsets."""
+        bounds = [0]
+        bounds.extend(m.start() for m in self._HEADING_BOUNDARY.finditer(full_text))
+        bounds.append(len(full_text))
+        for section_start, section_end in zip(bounds, bounds[1:]):
+            if section_end - section_start > limit:
+                # Oversized section: split it further on blank lines.
+                spans = self._blank_line_spans(full_text, section_start, section_end)
+            else:
+                spans = [(section_start, section_end)]
+            for start, end in spans:
+                piece = self._trim(full_text, start, end)
+                if piece is None:
+                    continue
+                if len(piece[0]) > limit:
+                    yield from self._hard_split(full_text, piece[1], piece[2], limit)
+                else:
+                    yield piece
+
+    def _blank_line_spans(self, full_text: str, start: int,
+                          end: int) -> Iterator[Tuple[int, int]]:
+        """Yield the spans of ``full_text[start:end]`` between blank lines."""
+        cursor = start
+        for match in self._BLANK_LINE.finditer(full_text, start, end):
+            yield cursor, match.start()
+            cursor = match.end()
+        yield cursor, end
+
+    def _hard_split(self, full_text: str, start: int, end: int,
+                    limit: int) -> Iterator[_Piece]:
+        """Cut a paragraph without blank lines into pieces of at most ``limit``.
+
+        Each cut is moved back to the last space or newline in the window when
+        there is one, so words are only broken when no whitespace is available.
+        """
+        position = start
+        while position < end:
+            stop = min(position + limit, end)
+            if stop < end:
+                whitespace = max(full_text.rfind(" ", position, stop),
+                                 full_text.rfind("\n", position, stop))
+                if whitespace > position:
+                    stop = whitespace
+            piece = self._trim(full_text, position, stop)
+            if piece is not None:
+                yield piece
+            position = stop
+
+    @staticmethod
+    def _trim(full_text: str, start: int, end: int) -> Optional[_Piece]:
+        """Strip ``full_text[start:end]``; return ``None`` if nothing is left."""
+        raw = full_text[start:end]
+        body = raw.strip()
+        if not body:
+            return None
+        body_start = start + len(raw) - len(raw.lstrip())
+        return body, body_start, body_start + len(body)