Phase 7: Hoàn thiện Modular RAG Backend với FastAPI và Đa LLM Provider

2026-05-08 07:30:30 +00:00
commit 26d1298cf6
51 changed files with 5360 additions and 0 deletions
--- a/.env.example
+++ b/.env.example
@@ -0,0 +1,49 @@
+# Microsoft Entra ID (Azure AD) Config
+TENANT_ID=your_tenant_id_here
+CLIENT_ID=your_client_id_here
+CLIENT_SECRET=your_client_secret_here
+
+# SharePoint Config
+SHAREPOINT_SITE_ID=your_sharepoint_site_id_here
+SHAREPOINT_DRIVE_ID="b!..."
+
+# ========================================
+# VLM (Vision-Language Model) CONFIGURATION
+# ========================================
+# The URL to your local/LAN VLM server (e.g. llama.cpp)
+VLM_ENDPOINT="http://10.202.50.3:8080/v1/chat/completions"
+# Set to low value (0.1) for exact extraction, higher (0.7) for more creativity
+VLM_TEMPERATURE="0.1"
+# Max tokens to generate per page
+VLM_MAX_TOKENS="2000"
+# Connection timeout in seconds
+VLM_TIMEOUT="120.0"
+
+# OpenSearch Config
+OPENSEARCH_HOST=localhost
+OPENSEARCH_PORT=9200
+OPENSEARCH_USER=admin
+OPENSEARCH_PASS=admin
+
+# ========================================
+# CHAT LLM CONFIGURATION (Phase 7)
+# ========================================
+# LLM_PROVIDER can be: 'gemini', 'groq', or 'local'
+LLM_PROVIDER=gemini
+
+# 1. Gemini Config (Default)
+GEMINI_API_KEY=your_gemini_api_key_here
+
+# 2. Groq Config
+GROQ_API_KEY=your_groq_api_key_here
+GROQ_MODEL=llama-3.3-70b-versatile
+
+# 3. Local Llama.cpp Config
+LOCAL_LLM_ENDPOINT="http://10.202.50.3:8081/v1/chat/completions"
+
+# (Legacy OpenAI - Can be removed if not used)
+OPENAI_API_KEY=your_openai_api_key_here
+
+# App Settings
+LOG_LEVEL=INFO
+ENVIRONMENT=development
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,22 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# Environments
+.venv/
+venv/
+ENV/
+env/
+
+# Configuration & Secrets
+.env
+delta_state.json
+
+# Databases / Outputs
+ingestion_output.json
+*.log
+
+# OS generated files
+.DS_Store
+Thumbs.db
--- a/23
+++ b/23
@@ -0,0 +1,23 @@
+FROM python:3.11-slim
+
+# System dependencies needed by OCR (PaddleOCR/OpenCV) and for building wheels.
+# `libgl1-mesa-glx` is not installable on the Debian 12+ base used by current
+# python:3.11-slim images; `libgl1` ships libGL.so.1 there.
+RUN apt-get update && apt-get install -y \
+    libgl1 \
+    libglib2.0-0 \
+    build-essential \
+    && rm -rf /var/lib/apt/lists/*
+
+WORKDIR /app
+
+# Copy requirements first so the dependency layer is cached across code changes
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Copy the full source tree
+COPY . .
+
+# Port served by FastAPI
+EXPOSE 8000
+
+# Start the FastAPI server when the container boots.
+# `--reload` is a development-only file watcher, so it is not baked into the
+# image; pass it via a command override when developing with a mounted volume.
+# NOTE(review): the FastAPI app in this commit is defined in api/main.py --
+# confirm that `main:app` resolves from /app (otherwise use `api.main:app`).
+CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
--- a/api/main.py
+++ b/api/main.py
@@ -0,0 +1,93 @@
+import logging
+from fastapi import FastAPI, HTTPException, BackgroundTasks
+from pydantic import BaseModel
+from typing import List, Optional, Dict, Any
+import uvicorn
+import sys
+import os
+
+# Thêm thư mục gốc vào PYTHONPATH để tìm thấy các module chat, search, ingestion...
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from chat.rag_engine import RAGEngine
+from core.config import settings
+
+# Logging configuration for the whole API process
+logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
+logger = logging.getLogger("API")
+
+app = FastAPI(
+    title="PoC SharePoint RAG API",
+    description="Hệ thống hỏi đáp nội bộ dựa trên SharePoint và Distributed VLM",
+    version="1.0.0"
+)
+
+# Build the RAG engine once at import time and share it across requests.
+# If construction fails the API still starts; /chat then answers 503.
+try:
+    # NOTE(review): this rewrites the host "opensearch" to "localhost" for local
+    # development; confirm it is not applied where "opensearch" is the real host.
+    if settings.opensearch_host == "opensearch":
+        settings.opensearch_host = "localhost"
+    rag_engine = RAGEngine()
+except Exception as e:
+    # logger.exception keeps the traceback so start-up failures can be diagnosed
+    logger.exception(f"Không thể khởi động RAG Engine: {e}")
+    rag_engine = None
+
+# --- MODELS ---
+class ChatRequest(BaseModel):
+    """Request body accepted by POST /chat."""
+    # The user's question; logged and passed to RAGEngine.chat
+    query: str
+    # Earlier conversation turns, forwarded unchanged as `history`
+    history: Optional[List[Dict[str, str]]] = []
+
+class ChatResponse(BaseModel):
+    """Response body returned by POST /chat."""
+    # Text taken from the "answer" key of the RAG engine result
+    answer: str
+    # Citation entries taken from the "sources" key of the RAG engine result
+    sources: List[Dict[str, Any]] = []
+
+# --- ENDPOINTS ---
+
+@app.get("/health")
+def health_check():
+    """Report service health and the active configuration.
+
+    Returns a dict with `status`, `llm_provider` and `opensearch_host`.
+    `status` is "healthy" when the RAG engine was built at start-up and
+    "degraded" when it was not (in which case /chat answers 503).
+    """
+    return {
+        "status": "healthy" if rag_engine else "degraded",
+        "llm_provider": settings.llm_provider,
+        "opensearch_host": settings.opensearch_host
+    }
+
+@app.post("/chat", response_model=ChatResponse)
+async def chat_endpoint(request: ChatRequest):
+    """Answer a question through the RAG pipeline.
+
+    Args:
+        request: The question plus optional chat history.
+
+    Returns:
+        ChatResponse built from the engine's "answer" and "sources".
+
+    Raises:
+        HTTPException: 503 when the RAG engine failed to initialise,
+            500 when the engine raises while handling the question.
+    """
+    # Function-scope import keeps this handler self-contained
+    import asyncio
+
+    if not rag_engine:
+        raise HTTPException(status_code=503, detail="RAG Engine chưa được khởi tạo thành công.")
+
+    try:
+        logger.info(f"Nhận câu hỏi: {request.query}")
+        # rag_engine.chat is a synchronous call; running it in a worker thread
+        # keeps the event loop free to serve other requests in the meantime.
+        result = await asyncio.to_thread(
+            rag_engine.chat, request.query, history=request.history
+        )
+
+        return ChatResponse(
+            answer=result["answer"],
+            sources=result["sources"]
+        )
+    except Exception as e:
+        # Keep the traceback in the log and chain the cause on the HTTP error
+        logger.exception(f"Lỗi xử lý chat: {e}")
+        raise HTTPException(status_code=500, detail=str(e)) from e
+
+@app.post("/ingest")
+async def start_ingestion(background_tasks: BackgroundTasks):
+    """Schedule the ingestion job as a background task and reply immediately.
+
+    The job is queued on `background_tasks` so the request does not wait for it.
+    """
+    # TODO: wire this to sync.py and the extraction pipeline; for now a
+    # placeholder coroutine is scheduled.
+    background_tasks.add_task(dummy_ingest_task)
+    acknowledgement = {"message": "Quá trình đồng bộ dữ liệu đã bắt đầu chạy ngầm."}
+    return acknowledgement
+
+async def dummy_ingest_task():
+    """Placeholder ingestion job: only logs that it started."""
+    # The logic from test_rag_pipeline.py is meant to be integrated here later
+    logger.info("Bắt đầu Ingestion task...")
+
+if __name__ == "__main__":
+    # When executed directly, serve the app on port 8000 on all interfaces
+    uvicorn.run(app, host="0.0.0.0", port=8000)
--- a/chat/llm_factory.py
+++ b/chat/llm_factory.py
@@ -0,0 +1,26 @@
+import logging
+from core.config import settings
+from .llm_providers.base_llm import BaseLLMProvider
+from .llm_providers.gemini_llm import GeminiLLM
+from .llm_providers.groq_llm import GroqLLM
+from .llm_providers.local_llm import LocalLLM
+
+logger = logging.getLogger("LLMFactory")
+
+class LLMFactory:
+    """Factory that builds the chat LLM provider named by `settings.llm_provider`."""
+
+    @staticmethod
+    def get_provider() -> BaseLLMProvider:
+        """Instantiate the configured provider.
+
+        The configured name is lower-cased and stripped before lookup. An
+        unrecognised name logs a warning and falls back to GeminiLLM.
+        """
+        requested = settings.llm_provider.lower().strip()
+        known_providers = {
+            "gemini": GeminiLLM,
+            "groq": GroqLLM,
+            "local": LocalLLM,
+        }
+
+        provider_cls = known_providers.get(requested)
+        if provider_cls is None:
+            logger.warning(f"Provider '{requested}' không được hỗ trợ. Khởi động mặc định: GeminiLLM")
+            provider_cls = GeminiLLM
+        return provider_cls()
--- a/chat/llm_providers/base_llm.py
+++ b/chat/llm_providers/base_llm.py
@@ -0,0 +1,16 @@
+from abc import ABC, abstractmethod
+from typing import List, Dict
+
+class BaseLLMProvider(ABC):
+    """Abstract base class every chat LLM provider implements.
+
+    Keeping providers behind one interface lets the RAG pipeline swap
+    back-ends without changing its own code.
+    """
+
+    @abstractmethod
+    def generate_response(self, prompt: str, context: str, history: List[Dict[str, str]] = None) -> str:
+        """Produce the final answer text.
+
+        Args:
+            prompt: The user's question.
+            context: Retrieved context text to ground the answer on.
+            history: Optional earlier conversation turns.
+
+        Returns:
+            The answer as plain text.
+        """
--- a/chat/llm_providers/gemini_llm.py
+++ b/chat/llm_providers/gemini_llm.py
@@ -0,0 +1,44 @@
+import logging
+from typing import List, Dict
+from core.config import settings
+from .base_llm import BaseLLMProvider
+
+logger = logging.getLogger("GeminiLLM")
+
+class GeminiLLM(BaseLLMProvider):
+    """Chat provider backed by the `google.generativeai` SDK."""
+
+    def __init__(self):
+        """Configure the SDK and create the model handle.
+
+        Raises:
+            ImportError: `google-generativeai` is not installed.
+            ValueError: `settings.gemini_api_key` is empty.
+        """
+        try:
+            import google.generativeai as genai
+        except ImportError:
+            logger.error("LỖI: Chưa cài đặt thư viện Gemini. Chạy lệnh: pip install google-generativeai")
+            raise
+
+        if not settings.gemini_api_key:
+            raise ValueError("GEMINI_API_KEY chưa được cấu hình trong .env!")
+
+        genai.configure(api_key=settings.gemini_api_key)
+        self.model = genai.GenerativeModel('gemini-1.5-flash')
+        logger.info("Đã khởi tạo Gemini 1.5 Flash Provider.")
+
+    @staticmethod
+    def _format_history(history) -> str:
+        """Render chat history as one `role: content` line per turn."""
+        if not history:
+            return 'Không có'
+        return "\n".join(
+            f"{turn.get('role', 'user')}: {turn.get('content', '')}"
+            for turn in history
+        )
+
+    def generate_response(self, prompt: str, context: str, history: List[Dict[str, str]] = None) -> str:
+        """Build a single text prompt and return the model's answer.
+
+        History is rendered as readable `role: content` lines (the same keys
+        the other providers read) rather than a raw Python list repr.
+
+        Returns:
+            The generated text, or an apology message containing the error
+            when the API call fails.
+        """
+        full_prompt = f"""Bạn là một trợ lý ảo thông minh chuyên giải đáp thông tin dựa trên cơ sở dữ liệu nội bộ.
+Hãy trả lời câu hỏi của người dùng một cách chính xác, lịch sự và DỰA HOÀN TOÀN vào ngữ cảnh được cung cấp.
+
+=== NGỮ CẢNH (CONTEXT) ===
+{context}
+
+=== LỊCH SỬ CHAT ===
+{self._format_history(history)}
+
+=== CÂU HỎI CỦA NGƯỜI DÙNG ===
+{prompt}
+
+=== TRẢ LỜI ===
+"""
+        try:
+            response = self.model.generate_content(full_prompt)
+            return response.text
+        except Exception as e:
+            # Best-effort: the failure is returned as the answer text, but the
+            # traceback is kept in the log.
+            logger.exception(f"Lỗi khi gọi Gemini API: {e}")
+            return f"Xin lỗi, tôi đang gặp sự cố khi xử lý câu trả lời: {e}"
--- a/chat/llm_providers/groq_llm.py
+++ b/chat/llm_providers/groq_llm.py
@@ -0,0 +1,54 @@
+import logging
+from typing import List, Dict
+from core.config import settings
+from .base_llm import BaseLLMProvider
+
+logger = logging.getLogger("GroqLLM")
+
+class GroqLLM(BaseLLMProvider):
+    """Chat provider that talks to Groq through the OpenAI SDK."""
+
+    def __init__(self):
+        """Create the client and remember the configured model name.
+
+        Raises:
+            ImportError: the `openai` package is not installed.
+            ValueError: `settings.groq_api_key` is empty.
+        """
+        try:
+            from openai import OpenAI
+        except ImportError:
+            logger.error("LỖI: Thư viện openai chưa được cài đặt.")
+            raise
+
+        api_key = settings.groq_api_key
+        if not api_key:
+            raise ValueError("GROQ_API_KEY chưa được cấu hình trong .env!")
+
+        # The OpenAI SDK is pointed at Groq by overriding the base URL
+        self.client = OpenAI(api_key=api_key, base_url="https://api.groq.com/openai/v1")
+        self.model_name = settings.groq_model
+        logger.info(f"Đã khởi tạo Groq Provider với model: {self.model_name}")
+
+    def generate_response(self, prompt: str, context: str, history: List[Dict[str, str]] = None) -> str:
+        """Send system prompt + history + question and return the reply text.
+
+        Returns the first choice's message content, or an apology message
+        containing the error when the call fails.
+        """
+        system_prompt = f"""Bạn là một trợ lý ảo thông minh chuyên giải đáp thông tin dựa trên cơ sở dữ liệu nội bộ.
+Hãy trả lời câu hỏi của người dùng một cách chính xác, lịch sự và DỰA HOÀN TOÀN vào ngữ cảnh được cung cấp.
+
+=== NGỮ CẢNH (CONTEXT) ===
+{context}
+"""
+        conversation = [{"role": "system", "content": system_prompt}]
+        # Earlier turns, if any, sit between the system prompt and the question
+        conversation.extend(
+            {"role": turn.get("role", "user"), "content": turn.get("content", "")}
+            for turn in (history or [])
+        )
+        conversation.append({"role": "user", "content": prompt})
+
+        try:
+            completion = self.client.chat.completions.create(
+                model=self.model_name,
+                messages=conversation,
+                temperature=0.2,  # low temperature to discourage made-up answers
+                max_tokens=1024,
+            )
+            return completion.choices[0].message.content
+        except Exception as e:
+            logger.error(f"Lỗi khi gọi Groq API: {e}")
+            return f"Xin lỗi, tôi đang gặp sự cố khi xử lý câu trả lời qua Groq: {e}"
--- a/chat/llm_providers/local_llm.py
+++ b/chat/llm_providers/local_llm.py
@@ -0,0 +1,54 @@
+import logging
+from typing import List, Dict
+from core.config import settings
+from .base_llm import BaseLLMProvider
+
+logger = logging.getLogger("LocalLLM")
+
+class LocalLLM(BaseLLMProvider):
+    """Chat provider for a llama.cpp server reached through the OpenAI SDK."""
+
+    def __init__(self):
+        """Create the client from `settings.local_llm_endpoint`.
+
+        Raises:
+            ImportError: the `openai` package is not installed.
+            ValueError: `settings.local_llm_endpoint` is empty.
+        """
+        try:
+            from openai import OpenAI
+        except ImportError:
+            logger.error("LỖI: Thư viện openai chưa được cài đặt.")
+            raise
+
+        endpoint = settings.local_llm_endpoint
+        if not endpoint:
+            raise ValueError("LOCAL_LLM_ENDPOINT chưa được cấu hình trong .env!")
+
+        # The configured endpoint includes /chat/completions; dropping that
+        # part yields the base URL the SDK expects.
+        base_url = endpoint.replace("/chat/completions", "")
+
+        # A dummy key is passed because the SDK requires one
+        self.client = OpenAI(api_key="sk-no-key-required", base_url=base_url)
+        logger.info(f"Đã khởi tạo Local Llama.cpp Provider kết nối tới: {base_url}")
+
+    def generate_response(self, prompt: str, context: str, history: List[Dict[str, str]] = None) -> str:
+        """Send system prompt + history + question and return the reply text.
+
+        Returns the first choice's message content, or an apology message
+        containing the error when the call fails.
+        """
+        system_prompt = f"""Bạn là trợ lý RAG nội bộ. 
+Chỉ trả lời dựa trên NGỮ CẢNH dưới đây. Nếu không biết thì nói không biết.
+
+=== NGỮ CẢNH ===
+{context}
+"""
+        conversation = [{"role": "system", "content": system_prompt}]
+        conversation.extend(
+            {"role": turn.get("role", "user"), "content": turn.get("content", "")}
+            for turn in (history or [])
+        )
+        conversation.append({"role": "user", "content": prompt})
+
+        try:
+            completion = self.client.chat.completions.create(
+                model="local-model",  # placeholder model name sent to the local server
+                messages=conversation,
+                temperature=0.1,
+                max_tokens=1024,
+            )
+            return completion.choices[0].message.content
+        except Exception as e:
+            logger.error(f"Lỗi khi gọi Local LLM: {e}")
+            return f"Xin lỗi, máy chủ AI nội bộ đang bận hoặc mất kết nối: {e}"
--- a/chat/rag_engine.py
+++ b/chat/rag_engine.py
@@ -0,0 +1,49 @@
+import logging
+from typing import List, Dict
+from search.retriever import SearchRetriever
+from .llm_factory import LLMFactory
+
+logger = logging.getLogger("RAGEngine")
+
+class RAGEngine:
+    """Ties the retriever and the configured LLM provider together."""
+
+    def __init__(self):
+        """Create the retriever and obtain the provider from LLMFactory."""
+        self.retriever = SearchRetriever()
+        self.llm = LLMFactory.get_provider()
+        logger.info(f"RAG Engine đã sẵn sàng với LLM Provider: {type(self.llm).__name__}")
+
+    def chat(self, user_query: str, history: List[Dict[str, str]] = None) -> Dict:
+        """Run retrieve -> build context -> generate for one question.
+
+        Returns:
+            A dict with "answer", "context_used" and "sources" (one entry
+            with file_name, page and url per retrieved chunk).
+        """
+        # 1. Retrieval: fetch the top 5 chunks for the question
+        hits = self.retriever.retrieve(user_query, top_k=5)
+
+        if hits:
+            # One labelled section per chunk, separated by a rule
+            sections = []
+            for hit in hits:
+                sections.append(
+                    f"[Nguồn: {hit.file_name}, Trang: {hit.page_from}]\nNội dung: {hit.text}"
+                )
+            context_text = "\n---\n".join(sections)
+        else:
+            context_text = "Không tìm thấy thông tin liên quan trong cơ sở dữ liệu nội bộ."
+
+        # 2. Generation: let the LLM compose the answer from the context
+        logger.info("Đang yêu cầu LLM tổng hợp câu trả lời...")
+        answer = self.llm.generate_response(
+            prompt=user_query,
+            context=context_text,
+            history=history,
+        )
+
+        # 3. Return the answer together with its citations
+        citations = [
+            {"file_name": hit.file_name, "page": hit.page_from, "url": hit.source_url}
+            for hit in hits
+        ]
+        return {
+            "answer": answer,
+            "context_used": context_text,
+            "sources": citations,
+        }
--- a/chunking/markdown_chunker.py
+++ b/chunking/markdown_chunker.py
@@ -0,0 +1,117 @@
+import logging
+import uuid
+import re
+from typing import List, Dict, Any
+from core.models import OCRPageResult, DocumentChunk
+
+logger = logging.getLogger("MarkdownChunker")
+
+class MarkdownChunker:
+    """Split OCR/VLM page text into chunks along Markdown structure.
+
+    Text is split at Markdown headings (`#` to `####`); sections longer than
+    `max_chunk_size` are split again at blank lines. The pieces are merged
+    into chunks of roughly `max_chunk_size` characters, each new chunk
+    starting with up to `overlap` characters from the end of the previous
+    one. Every chunk records the first and last page it covers.
+
+    A single paragraph longer than `max_chunk_size` is not split further.
+    """
+
+    def __init__(self, max_chunk_size: int = 1500, overlap: int = 200):
+        self.max_chunk_size = max_chunk_size
+        self.overlap = overlap
+
+    @staticmethod
+    def _split_paragraphs(section: str, base: int):
+        """Split `section` at blank lines, yielding (raw_piece, absolute_offset)."""
+        pieces = []
+        position = 0
+        for raw in section.split("\n\n"):
+            pieces.append((raw, base + position))
+            position += len(raw) + 2  # +2 for the consumed "\n\n"
+        return pieces
+
+    @staticmethod
+    def _build_chunk(text: str, page_from: int, page_to: int,
+                     metadata: Dict[str, Any]) -> DocumentChunk:
+        """Create a DocumentChunk with a fresh id from text, pages and metadata."""
+        return DocumentChunk(
+            chunk_id=f"chk_{uuid.uuid4().hex[:10]}",
+            file_id=metadata.get("item_id", ""),
+            file_name=metadata.get("name", ""),
+            text=text.strip(),
+            page_from=page_from,
+            page_to=page_to,
+            source_url=metadata.get("web_url", ""),
+            site_id=metadata.get("site_id", ""),
+            permissions=["*"]  # TODO: assign the real SharePoint permissions
+        )
+
+    def chunk_document(self,
+                       pages: List[OCRPageResult],
+                       metadata: Dict[str, Any]) -> List[DocumentChunk]:
+        """Turn a list of page results into a list of chunks.
+
+        Args:
+            pages: Page results; they are processed in ascending `page` order.
+            metadata: Source metadata; the keys `item_id`, `name`, `web_url`
+                and `site_id` are read, each defaulting to "".
+
+        Returns:
+            The chunks in document order (empty when there is no text).
+        """
+        # 1. Join all pages and remember the offset at which each page starts,
+        #    so any offset in the joined text can be mapped back to a page.
+        page_markers = []  # (start_offset, page_number), ascending
+        page_texts = []
+        offset = 0
+        for page in sorted(pages, key=lambda p: p.page):
+            page_markers.append((offset, page.page))
+            page_text = page.text + "\n\n"
+            page_texts.append(page_text)
+            offset += len(page_text)
+        full_text = "".join(page_texts)
+
+        def find_page(char_index):
+            """Return the page whose text contains `char_index`."""
+            found_page = page_markers[0][1]
+            for start, page_num in page_markers:
+                if char_index >= start:
+                    found_page = page_num
+                else:
+                    break
+            return found_page
+
+        # 2. Split at headings, then at blank lines for oversized sections,
+        #    keeping the real offset of every paragraph. The lookahead split
+        #    drops no characters, so offsets are running sums of the lengths.
+        paragraphs = []  # (stripped_text, start_offset, end_offset)
+        section_start = 0
+        for section in re.split(r'(?=\n#{1,4}\s)', full_text):
+            if len(section) > self.max_chunk_size:
+                pieces = self._split_paragraphs(section, section_start)
+            else:
+                pieces = [(section, section_start)]
+            for raw, raw_start in pieces:
+                text = raw.strip()
+                if text:
+                    start = raw_start + (len(raw) - len(raw.lstrip()))
+                    paragraphs.append((text, start, start + len(text)))
+            section_start += len(section)
+
+        # 3. Merge paragraphs into chunks of about max_chunk_size characters
+        chunks = []
+        current_chunk = ""
+        chunk_start = 0  # offset of the first character covered by the chunk
+        chunk_end = 0    # offset just past the last paragraph in the chunk
+
+        for text, start, end in paragraphs:
+            if current_chunk and len(current_chunk) + len(text) > self.max_chunk_size:
+                chunks.append(self._build_chunk(
+                    current_chunk, find_page(chunk_start), find_page(chunk_end - 1), metadata
+                ))
+
+                # Start the next chunk with the tail of the one just closed.
+                # overlap == 0 is handled explicitly because s[-0:] is the
+                # whole string, not an empty one.
+                tail = current_chunk.rstrip()[-self.overlap:] if self.overlap > 0 else ""
+                if tail:
+                    current_chunk = tail + "\n\n" + text + "\n\n"
+                    chunk_start = max(chunk_end - len(tail), 0)
+                else:
+                    current_chunk = text + "\n\n"
+                    chunk_start = start
+            else:
+                if not current_chunk:
+                    chunk_start = start
+                current_chunk += text + "\n\n"
+            chunk_end = end
+
+        # Flush the last chunk
+        if current_chunk.strip():
+            chunks.append(self._build_chunk(
+                current_chunk, find_page(chunk_start), find_page(chunk_end - 1), metadata
+            ))
+
+        logger.info(f"Chunked document {metadata.get('name')} into {len(chunks)} chunks.")
+        return chunks
--- a/core/config.py
+++ b/core/config.py
@@ -0,0 +1,40 @@
+import os
+from pydantic_settings import BaseSettings, SettingsConfigDict
+
+class Settings(BaseSettings):
+    """Application settings, configured to read a UTF-8 `.env` file.
+
+    Every field has a default, so construction succeeds without a `.env`.
+    Keys that match no field are ignored (`extra="ignore"`).
+    """
+    # Azure AD / Microsoft Graph
+    tenant_id: str = ""
+    client_id: str = ""
+    client_secret: str = ""
+    
+    # VLM (Vision-Language Model) Configuration
+    # NOTE(review): these four fields are upper-case while every other field
+    # is lower-case; callers must use the exact attribute names.
+    VLM_ENDPOINT: str = "http://10.202.50.3:8080/v1/chat/completions"
+    VLM_TEMPERATURE: float = 0.1
+    VLM_MAX_TOKENS: int = 2000
+    VLM_TIMEOUT: float = 120.0
+    
+    # SharePoint
+    sharepoint_site_id: str = ""
+    sharepoint_drive_id: str = ""
+
+    # OpenSearch
+    opensearch_host: str = "localhost"
+    opensearch_port: int = 9200
+    opensearch_user: str = "admin"
+    opensearch_pass: str = "admin"
+
+    # Chat LLM Config
+    # llm_provider is matched against 'gemini', 'groq' and 'local' by LLMFactory
+    llm_provider: str = "gemini"
+    gemini_api_key: str = ""
+    groq_api_key: str = ""
+    groq_model: str = "llama-3.3-70b-versatile"
+    local_llm_endpoint: str = "http://10.202.50.3:8081/v1/chat/completions"
+    openai_api_key: str = ""
+
+    # General Settings
+    log_level: str = "INFO"
+    environment: str = "development"
+
+    model_config = SettingsConfigDict(env_file=".env", env_file_encoding="utf-8", extra="ignore")
+
+# Module-level instance imported by the rest of the code as `settings`
+settings = Settings()
--- a/core/models.py
+++ b/core/models.py
@@ -0,0 +1,69 @@
+from pydantic import BaseModel
+from typing import Optional
+from enum import Enum
+
+class DocumentType(str, Enum):
+    """Document categories; members are also `str` values via the mixin."""
+    TEXTUAL_DOCUMENT = "textual_document"
+    SPREADSHEET = "spreadsheet"
+    PRESENTATION = "presentation"
+    DRAWING = "drawing"
+    BINARY = "binary"
+    UNKNOWN = "unknown"
+
+class ProcessingPolicy(str, Enum):
+    """How a classified document should be processed downstream."""
+    REQUIRES_OCR = "requires_ocr"          # Needs OCR (e.g., SCAN_PDF)
+    SKIP_OCR = "skip_ocr"                  # No OCR needed, pure text extraction (e.g., TEXT_PDF, DOCX)
+    METADATA_ONLY = "metadata_only"        # Search by metadata only, no text extraction (e.g., CAD, DRAWING_PDF)
+    REQUIRES_REVIEW = "requires_review"    # Doubtful cases (e.g. Ambiguous PDF)
+    UNSUPPORTED = "unsupported"            # Ignored
+
+class PdfType(str, Enum):
+    """PDF sub-type labels, plus NOT_PDF for inputs that are not PDFs."""
+    TEXT_PDF = "TEXT_PDF"
+    SCAN_PDF = "SCAN_PDF"
+    DRAWING_PDF = "DRAWING_PDF"
+    AMBIGUOUS_PDF = "AMBIGUOUS_PDF"
+    NOT_PDF = "NOT_PDF"
+
+class IngestedDocument(BaseModel):
+    """Data contract: Output of Ingestion -> Input for Document Classification Engine (DCE)"""
+    # Required identifiers and location of the item
+    site_id: str
+    drive_id: str
+    item_id: str
+    name: str
+    web_url: str
+    # Optional attributes; each defaults to None when not supplied
+    download_url: Optional[str] = None
+    mime_type: Optional[str] = None
+    parent_path: Optional[str] = None
+    # Required flags/measurements
+    is_folder: bool
+    size: int
+    # Kept as a string; NOTE(review): format is not defined here -- confirm with the producer
+    last_modified: Optional[str] = None
+
+class DocumentClassificationResult(BaseModel):
+    """Data contract: Output of DCE -> Input for Inspection & Routing"""
+    # Identifier of the classified item (same field name as IngestedDocument.item_id)
+    item_id: str
+    doc_type: DocumentType
+    processing_policy: ProcessingPolicy
+    file_extension: str
+    is_supported: bool
+    # Free-text explanation of the classification
+    reason: str
+
+class OCRPageResult(BaseModel):
+    """Data contract: Output of OCR Service -> Input for Normalization / RAG"""
+    # Page number; the chunker sorts pages by this value
+    page: int
+    text: str
+    confidence: float
+    # Optional PaddleOCR output; defaults to empty text and 0.0 when not supplied
+    paddle_text: str = ""
+    paddle_confidence: float = 0.0
+
+class DocumentChunk(BaseModel):
+    """Data contract: Output of Chunking -> Input for Embedding & Indexing"""
+    chunk_id: str
+    file_id: str
+    file_name: str
+    text: str
+    # None until an embedding vector is attached
+    embedding: Optional[list[float]] = None
+    # First and last page covered by the chunk text
+    page_from: int
+    page_to: int
+    source_url: str
+    # Defaults to an empty list when not supplied
+    permissions: list[str] = []
+    site_id: str = ""
--- a/doc/00.AGENT_ARCHITECTURE_MAP.md
+++ b/doc/00.AGENT_ARCHITECTURE_MAP.md
@@ -0,0 +1,88 @@
+# 🧭 AGENT ARCHITECTURE MAP (LIVING DOCUMENT)
+*Đây là tài liệu dẫn đường dành riêng cho các AI Agent tương lai và lập trình viên bảo trì. Không quét toàn bộ code, hãy đọc file này trước.*
+
+**Lần cập nhật cuối:** Phase 6 (Hoàn thiện Semantic Chunking & Vector Indexing)
+**Trạng thái Dự án:** Đã hoàn thành Ingestion, Extraction, Chunking & Indexing. Chuẩn bị bước vào Phase 7 (RAG Search & Chat API).
+
+---
+
+## 1. Bản Đồ Kiến Trúc Lõi (Core Architecture Patterns)
+
+### A. Tầng Ingestion (Thu thập dữ liệu) - Mẫu Modular Provider Pattern
+- **Mục tiêu:** Tách biệt lõi hệ thống khỏi nền tảng lưu trữ (SharePoint, Google Drive, v.v.).
+- **Interface gốc:** `ingestion/providers/base_provider.py` (Bắt buộc phải implement `fetch_changes` và `download_file`).
+- **Implement hiện tại:** `ingestion/providers/sharepoint_provider.py`. Nó bọc lại `GraphClient` và tự động xử lý thuật toán phân trang (pagination) để lấy dữ liệu Delta.
+- **Nếu cần thêm nguồn dữ liệu mới (ví dụ: NAS, Google Drive):** Chỉ cần tạo một class mới kế thừa `BaseStorageProvider`. Lõi hệ thống không cần biết về API của nguồn đó.
+
+### B. Tầng Extraction (Xử lý chữ & Ảnh) - Mẫu Distributed VLM Pattern
+- **Lịch sử:** Đã từng dùng PaddleOCR + VietOCR nhưng gặp lỗi "Rụng dấu" và "Ảo giác" do cắt ảnh sai.
+- **Kiến trúc hiện tại:** Hệ thống đóng vai trò như một **VLM Client**.
+- **Cách hoạt động:** `extraction/ocr_service.py` render file PDF thành ảnh (DPI=86), nén Base64 và bắn POST Request sang một Server LLM khác trong mạng LAN (chạy `llama.cpp` với model `Vintern-3B`).
+- **Lợi ích:** Giải phóng hoàn toàn RAM cho máy chủ RAG, loại bỏ các thư viện AI nặng nề (Torch, Paddle). Lấy được Markdown nguyên bản, không gãy vỡ layout bảng biểu.
+
+### C. Tầng Chunking & Vector DB (Semantic Indexing)
+- **Chunking:** `chunking/markdown_chunker.py` chia nhỏ văn bản bằng Markdown Rules (nhận biết Header `#`, duy trì overlap chống đứt gãy ngữ cảnh), tự động theo dõi `page_from`, `page_to` chuẩn xác.
+- **Embedding:** Dùng thư viện `sentence-transformers` với model `keepitreal/vietnamese-sbert` chạy Local/Offline. Tạo ra Vector 768 chiều chuyên biệt cho Tiếng Việt.
+- **Database:** `indexing/vector_store.py` cấu hình OpenSearch với thuật toán `k-NN HNSW`. Index mặc định là `poc_sharepoint_docs` hoặc `sharepoint_docs`.
+
+### D. Tầng Cấu hình (Decoupled Configuration)
+- Toàn bộ thông số hệ thống, đặc biệt là IP máy chủ VLM, Token của SharePoint đều nằm trong `.env`.
+- Mã nguồn load cấu hình thông qua `core/config.py`.
+- **Tuyệt đối KHÔNG hardcode URL, Token hay Password trong code.**
+
+---
+
+## 2. Bản Đồ File & Thư Mục Quan Trọng
+
+```text
+📁 poc_system/
+├── 📁 core/
+│   ├── config.py         # ⚙️ Trái tim cấu hình (Load từ .env)
+│   └── models.py         # 🧩 Định nghĩa Data Classes (OCRPageResult, v.v.)
+├── 📁 ingestion/
+│   ├── sync.py           # 🔄 Bộ điều phối đồng bộ (Đang chuẩn bị ghép với BaseStorageProvider)
+│   ├── graph_client.py   # 🌐 Microsoft Graph API Client (Bọc Auth)
+│   └── 📁 providers/     # 🔌 Nơi chứa các plugin kết nối dữ liệu
+│       ├── base_provider.py
+│       └── sharepoint_provider.py
+├── 📁 extraction/
+│   └── ocr_service.py    # 👁️ VLM Client (Chuyển ảnh -> Text Markdown qua LAN)
+├── .env                  # 🔑 Chìa khoá và địa chỉ mạng (KHÔNG commit file này)
+└── test_modular_architecture.py # 🧪 Script kiểm tra nhanh kết nối các module
+```
+
+---
+
+## 3. Lịch Sử Các Lỗi Khét Tiếng & Cách Xử Lý (Known Gotchas)
+1. **Lỗi 401 Unauthorized khi tải file từ SharePoint:**
+   - *Nguyên nhân:* Microsoft chặn download trực tiếp bằng `@microsoft.graph.downloadUrl` nếu dùng App-Only Token.
+   - *Giải pháp:* Dùng endpoint `.../items/{item_id}/content` kèm Bearer Token (Đã cài đặt trong `graph_client.py`).
+
+2. **Lỗi 500 Internal Server Error từ Llama.cpp VLM:**
+   - *Nguyên nhân:* Bức ảnh ném vào VLM có độ phân giải quá cao (Matrix 2.0) làm tràn Context Window (ví dụ: Token ảnh > 4096).
+   - *Giải pháp:* Hạ `Matrix` xuống `1.2`, hoặc khởi chạy Server Llama.cpp với `-c 8192`. Bắt buộc phải có file `--mmproj`.
+
+3. **Lỗi Rụng dấu / Ảo giác của VietOCR:**
+   - *Nguyên nhân:* PaddleOCR bắt khung quá khít, làm cụt phần đuôi của các chữ tiếng Việt có dấu. Mô hình `vgg_seq2seq` tự nội suy ra từ tiếng Anh linh tinh.
+   - *Giải pháp triệt để:* Đã loại bỏ hoàn toàn VietOCR, chuyển sang dùng VLM (Vintern-3B).
+
+4. **Lỗi UTF-8 Surrogate (\udcc3) trong Terminal WSL:**
+   - *Hiện tượng:* Câu hỏi đầu tiên đúng, nhưng từ câu thứ 2 bị lỗi mã hóa khi dùng `input()`.
+   - *Nguyên nhân:* Do sự không đồng nhất giữa `sys.stdin` và bộ đệm Terminal sau khi in lượng lớn dữ liệu từ LLM.
+   - *Giải pháp:* Sử dụng `sys.stdin.buffer.readline()` để đọc dữ liệu thô (Bytes) và tự decode bằng UTF-8. Đây là giải pháp cho môi trường CLI, khi lên Web API (FastAPI) sẽ không bị ảnh hưởng.
+
+---
+
+## 4. Nhiệm Vụ Tiếp Theo (Dành cho Lập Trình Viên/AI Agent)
+- [ ] **Phase 7:** Bọc thành API Backend bằng FastAPI.
+
+---
+
+## 5. Tiêu chuẩn Lập trình & Môi trường (Coding Standards)
+
+### A. Quản lý Mã hóa (Encoding)
+- **Quy tắc vàng:** Luôn sử dụng `encoding='utf-8'` trong mọi lệnh `open()`. Tuyệt đối không dựa dẫm vào encoding mặc định của hệ điều hành.
+- **Môi trường:** Hệ thống được thiết kế để chạy trong môi trường UTF-8. Trong Docker hoặc WSL, luôn đảm bảo biến môi trường `PYTHONIOENCODING=utf-8` được thiết lập. Điều này giúp hệ thống tương thích 100% với các ký tự Tiếng Việt từ LLM mà không cần hack code.
+
+### B. Mẫu Provider (Provider Pattern)
+- Mọi kết nối tới dịch vụ bên thứ ba (Storage, LLM) phải thông qua Interface/BaseClass để đảm bảo tính "Cắm rút" (Pluggable).
--- a/doc/1.phan-tich-kien-truc-tra-cuu-sharepoint.md
+++ b/doc/1.phan-tich-kien-truc-tra-cuu-sharepoint.md
@@ -0,0 +1,243 @@
+# Kiến trúc và định hướng xây dựng hệ thống tra cứu tài liệu scan SharePoint
+
+## 1. Mục tiêu tổng thể
+Hệ thống nhằm giải quyết ba vấn đề cốt lõi trong doanh nghiệp:
+1. **Biến kho tài liệu scan trên SharePoint thành dữ liệu có thể tìm kiếm theo ngữ nghĩa**.
+2. **Tìm nhanh → đúng tài liệu → đúng vị trí trong tài liệu**.
+3. **Giữ nguyên tính kiểm soát, phân quyền và tuân thủ (compliance)** của Microsoft 365.
+
+Hệ thống không thay thế SharePoint, mà **tăng cường khả năng tra cứu và hiểu nội dung**.
+
+---
+
+## 2. Nguyên tắc thiết kế
+
+### 2.1. Không lock-in
+- Ưu tiên **Open Source** cho lõi xử lý.
+- Có thể thay OCR / embedding / LLM trong tương lai.
+
+### 2.2. File gốc là "nguồn chân lý"
+- File **luôn ở SharePoint**.
+- Hệ thống chỉ lưu **nội dung trích xuất + metadata + index**.
+
+### 2.3. Permission-aware từ đầu
+- Không index kiểu "public rồi filter sau".
+- Mỗi kết quả search phải map được tới **ACL tương ứng trên SharePoint**.
+
+---
+
+## 3. Kiến trúc tổng thể (Logical Architecture)
+
+```
+SharePoint (Files, PDFs scan)
+   │
+   ▼
+Ingestion & Sync Layer (tự viết – vibecode)
+   │
+   ├── Fetch file
+   ├── Fetch metadata & permissions
+   └── Versioning
+   ▼
+Extraction Layer
+   │
+   ├── OCR (scan PDF)
+   └── MarkItDown (→ Markdown)
+   ▼
+Normalization & Enrichment
+   │
+   ├── Chunking
+   ├── Metadata mapping
+   └── Page / section anchoring
+   ▼
+Index Layer (OpenSearch)
+   │
+   ├── Full-text index
+   ├── Vector index
+   └── Metadata filter
+   ▼
+Search / Chat UI
+   └── Click → mở đúng file SharePoint
+```
+
+---
+
+## 4. Thành phần chi tiết và trách nhiệm
+
+### 4.1. Ingestion Layer (then chốt nhất)
+
+**Nhiệm vụ**:
+- Kết nối SharePoint bằng Microsoft Graph API.
+- Theo dõi thay đổi file (delta query / webhook).
+- Tải file + metadata.
+
+**Metadata bắt buộc**:
+- site_id
+- drive_id / library
+- file_id
+- file_name
+- file_url (mở trực tiếp trên SharePoint)
+- created_by, modified_by
+- created_at, modified_at
+- permission_groups / users
+
+> Lưu ý: permission cần map thành dạng filter-friendly (list group/user IDs).
+
+---
+
+### 4.2. Extraction Layer
+
+#### OCR
+- Scan PDF thường là image → cần OCR.
+- Định hướng:
+  - Tesseract / PaddleOCR cho local.
+  - Có thể thay bằng model mới sau.
+
+#### MarkItDown
+- Chuyển output OCR → Markdown sạch.
+- Ưu điểm:
+  - Giữ cấu trúc
+  - Thuận lợi cho chunk & embedding
+
+Output cần lưu:
+```json
+{
+  "file_id": "...",
+  "page": 12,
+  "content_md": "## Điều 3...",
+  "bbox_hint": "page=12"
+}
+```
+
+---
+
+### 4.3. Normalization & Chunking
+
+**Mục tiêu**: mỗi đơn vị index phải vừa đủ nhỏ để search chính xác, vừa đủ lớn để có ngữ cảnh.
+
+Chiến lược chunk:
+- Ưu tiên theo:
+  1. Heading (##, ###)
+  2. Trang (page)
+  3. Đoạn văn
+
+Mỗi chunk gồm:
+- chunk_id
+- text
+- file_id
+- page_from / page_to
+- sharepoint_url_with_anchor
+
+---
+
+### 4.4. Index Layer (OpenSearch)
+
+**Hai loại index song song**:
+
+1. **Full-text index**
+- Phục vụ search truyền thống
+- Có highlight
+
+2. **Vector index**
+- Phục vụ semantic search
+- Dùng embedding OSS (bge, e5, etc.)
+
+**Schema gợi ý**:
+
+```json
+{
+  "chunk_id": "...",
+  "text": "...",
+  "embedding": [ ... ],
+  "file_id": "...",
+  "file_name": "...",
+  "sharepoint_url": "...",
+  "page_from": 10,
+  "page_to": 11,
+  "site_id": "...",
+  "permissions": ["groupA", "userB"],
+  "updated_at": "..."
+}
+```
+
+---
+
+## 5. Search & UX định hướng
+
+### 5.1. Search mode
+- Keyword search
+- Semantic search
+- Hybrid (khuyên dùng)
+
+### 5.2. Kết quả search bắt buộc có
+- Trích đoạn nội dung
+- Tên file
+- Thư mục / site
+- Trang (page)
+- Link mở đúng file
+
+### 5.3. UX nguyên tắc
+- Tìm ≤ 3s
+- Click ≤ 1 lần để mở file
+- Người dùng **không cần biết file nằm ở đâu**
+
+---
+
+## 6. Vibecode: nên tập trung viết gì?
+
+✅ NÊN tự viết:
+- SharePoint ingestion & sync
+- Permission mapping
+- Metadata chuẩn hóa
+- UI search phù hợp nghiệp vụ
+
+❌ KHÔNG nên tự viết:
+- OCR engine
+- Search engine
+- Vector index
+
+---
+
+## 7. Lộ trình triển khai thực tế
+
+### Phase 1 – PoC (2–4 tuần)
+- Lấy 1 site SharePoint
+- OCR + MarkItDown
+- Full-text search
+
+### Phase 2 – Semantic search
+- Chunking
+- Embedding
+- Vector search
+
+### Phase 3 – Permission & scale
+- ACL filtering
+- Delta sync
+- Monitoring
+
+---
+
+## 8. Tương tác AI agent trong tương lai
+
+Hệ thống này có thể:
+- Gắn Chat UI (RAG)
+- Tóm tắt tài liệu
+- So sánh nhiều file
+- Trả lời câu hỏi trích dẫn rõ nguồn
+
+---
+
+## 9. Kết luận
+
+Bài toán của bạn **không mới nhưng rất ít hệ thống làm đúng**.
+
+Việc kết hợp:
+- SharePoint (nguồn tin cậy)
+- MarkItDown (chuẩn hóa nội dung)
+- OpenSearch (index & search)
+- Vibecode (điều phối thông minh)
+
+là **hướng kiến trúc bền vững, mở rộng được nhiều năm**.
+
+---
+
+*Tài liệu này được thiết kế để AI agent hoặc dev khác có thể đọc và triển khai tiếp mà không cần giải thích lại.*
--- a/doc/10.Appendix-Document-Type-Classification-and-Processing-Strategy.md
+++ b/doc/10.Appendix-Document-Type-Classification-and-Processing-Strategy.md
@@ -0,0 +1,220 @@
+# 10.Appendix-Document-Type-Classification-and-Processing-Strategy.md
+
+> **Phụ lục chiến lược phân loại & xử lý tài liệu** cho toàn bộ hệ thống SharePoint → Search → RAG.
+>
+> ⚠️ **Nguyên tắc sử dụng**
+> - Đây là **phụ lục độc lập**, không chỉnh sửa các file 1–9.
+> - Được coi là **luật xử lý file của hệ thống**.
+> - Mọi pipeline hiện tại và mở rộng trong tương lai **PHẢI tuân theo phụ lục này**.
+> - Mục tiêu: an toàn, mở rộng được, không xử lý sai loại tài liệu.
+
+---
+
+## 1. Mục tiêu của phụ lục
+
+Phụ lục này nhằm:
+
+- Chuẩn hoá **cách hệ thống hiểu và xử lý từng loại file**
+- Ngăn việc:
+  - OCR / MarkItDown bừa bãi
+  - Đưa bản vẽ kỹ thuật vào RAG
+  - Xử lý sai Excel/dữ liệu bảng
+- Tạo **luồng ngoại lệ có kiểm soát** cho file mới hoặc hiếm gặp
+
+---
+
+## 2. Nguyên tắc cốt lõi (BẮT BUỘC)
+
+1. **Không có "one-size-fits-all" cho tài liệu**
+2. **Phân loại file xảy ra TRƯỚC OCR và MarkItDown**
+3. **Extension là rule đầu, KHÔNG phải rule cuối**
+4. **Tài liệu không có giá trị ngôn ngữ → không RAG**
+5. **File lạ vẫn được ingest, nhưng xử lý an toàn**
+
+---
+
+## 3. Document Classification Engine (DCE)
+
+### 3.1 Vai trò
+
+DCE chịu trách nhiệm quyết định **file đi vào pipeline nào**.
+
+Nếu chưa quyết định được → file **KHÔNG được xử lý sâu hơn**.
+
+---
+
+### 3.2 Input của DCE
+
+- Extension (vd: `.pdf`, `.dwg`, `.xlsx`)
+- MIME type
+- File size
+- Header bytes (chống giả extension)
+- Với PDF:
+  - Có text layer hay không
+  - Pattern nhanh (drawing vs text)
+
+---
+
+### 3.3 Output của DCE (bắt buộc)
+
+```json
+{
+  "doc_type": "textual_document | technical_drawing | structured_data | binary_unsupported",
+  "processing_policy": "text_pipeline | metadata_only | table_pipeline | skip",
+  "confidence": 0.95,
+  "reason": "PDF detected as engineering drawing based on layout"
+}
+```
+
+---
+
+## 4. Các nhóm tài liệu và chiến lược xử lý
+
+## 4.1 Nhóm A – Textual Documents (Tài liệu ngôn ngữ)
+
+### Ví dụ
+- PDF văn bản (scan hoặc text)
+- DOCX, PPTX
+- PDF quy định, hợp đồng
+
+### Xử lý
+- ✅ OCR (nếu scan)
+- ✅ MarkItDown
+- ✅ Chunking
+- ✅ Search
+- ✅ RAG Chat
+
+### Ghi chú
+- Đây là **nguồn chính cho RAG**
+
+---
+
+## 4.2 Nhóm B – Technical Drawings (Bản vẽ kỹ thuật)
+
+### Ví dụ
+- PDF bản vẽ kỹ thuật
+- DWG / DXF / IFC
+
+### Xử lý
+- ✅ Metadata indexing
+- ✅ Thumbnail / preview (nếu có)
+- ✅ OCR giới hạn title block (tuỳ chọn)
+
+### KHÔNG làm
+- ❌ OCR toàn bộ
+- ❌ MarkItDown
+- ❌ RAG
+
+### Lý do
+- Nội dung chính là hình học, không phải ngôn ngữ
+- RAG sẽ dễ hallucinate và gây rủi ro nghiệp vụ
+
+---
+
+## 4.3 Nhóm C – Structured Data Documents (Dữ liệu bảng)
+
+### Ví dụ
+- Excel (XLSX)
+- CSV
+
+### Sub-case 1: Excel có mô tả nghiệp vụ
+
+- ✅ Trích xuất giới hạn
+- ✅ Convert sang Markdown có kiểm soát
+- ✅ Search
+- ⚠️ RAG chỉ áp dụng phần text
+
+### Sub-case 2: Excel số liệu / danh sách lớn
+
+- ✅ Extract header + key columns
+- ✅ Index theo field
+- ❌ MarkItDown toàn file
+- ❌ OCR
+- ❌ RAG ngôn ngữ tự do
+
+---
+
+## 4.4 Nhóm D – Binary / Unsupported Documents
+
+### Ví dụ
+- `.acad`, `.psd`, `.step`, `.zip`
+
+### Xử lý
+- ✅ Metadata-only ingestion
+- ✅ Search theo tên / project
+- ❌ Không OCR
+- ❌ Không convert
+- ❌ Không RAG
+
+---
+
+## 5. Luồng xử lý ngoại lệ (Exception Flow)
+
+### 5.1 Khi nào kích hoạt
+
+- Extension chưa có rule
+- MIME / header không khớp extension
+- File có pattern bất thường
+
+---
+
+### 5.2 Luồng chi tiết
+
+```text
+File mới
+  ↓
+Basic metadata ingest
+  ↓
+DCE classify = UNKNOWN
+  ↓
+Flag: Pending Classification
+  ↓
+Rule-based auto guess (nếu có)
+  ↓
+Manual admin review (hiếm)
+  ↓
+Update classification rule
+  ↓
+Optionally re-process file
+```
+
+---
+
+### 5.3 Nguyên tắc an toàn
+
+- ❌ Không OCR file chưa rõ loại
+- ❌ Không cho RAG đọc file pending
+- ✅ Vẫn search được theo metadata
+
+---
+
+## 6. Tác động tới Search & RAG
+
+- Search
+  - Phải tôn trọng `processing_policy`
+- RAG
+  - Chỉ dùng `textual_document` (và phần text của nhóm C sub-case 1)
+  - Skip toàn bộ nhóm B, C (sub-case 2), D
+
+---
+
+## 7. Lợi ích dài hạn của chiến lược này
+
+- ✅ Không phá pipeline khi có file mới
+- ✅ An toàn AI (no hallucination)
+- ✅ Tối ưu chi phí OCR
+- ✅ Mở rộng không cần sửa core
+- ✅ Dễ audit
+
+---
+
+## 8. Kết luận phụ lục
+
+- Phân loại tài liệu là **điểm quyết định sống còn** của hệ thống
+- Extension là **điểm bắt đầu**, không phải kết luận
+- OCR và MarkItDown **là công cụ, không phải mục tiêu**
+- RAG chỉ dùng khi **có giá trị ngôn ngữ thật**
+
+---
+
+*Phụ lục này là tài liệu tham chiếu nền tảng khi mở rộng hoặc audit toàn hệ thống.*
--- a/doc/11.Appendix-PDF-Inspection-and-TextLayer-Detection.md
+++ b/doc/11.Appendix-PDF-Inspection-and-TextLayer-Detection.md
@@ -0,0 +1,193 @@
+# 11.Appendix-PDF-Inspection-and-TextLayer-Detection.md
+
+> **Phụ lục chiến lược nhận diện PDF (text-based vs scan vs drawing)**.
+>
+> ⚠️ Phụ lục này **KHÔNG chỉnh sửa** các file 1–10.
+> Mục tiêu: **chuẩn hoá cách hệ thống hiểu PDF trước khi quyết định OCR / MarkItDown**.
+>
+> Đây là phụ lục then chốt để **không xử lý OCR sai loại PDF**, đặc biệt trong môi trường có nhiều bản vẽ kỹ thuật.
+
+---
+
+## 1. Vì sao cần inspection PDF riêng?
+
+Trong hệ thống doanh nghiệp, `.pdf` là extension **dễ gây hiểu nhầm nhất**:
+
+- PDF có thể là:
+  - Văn bản thuần (text layer)
+  - Scan ảnh (image-only)
+  - Bản vẽ kỹ thuật (vector + text rải rác)
+
+⚠️ **Không được quyết định OCR chỉ dựa trên việc "là PDF"**.
+
+---
+
+## 2. Ba loại PDF cần phân biệt (BẮT BUỘC)
+
+### 2.1 PDF loại 1 – Text-based PDF
+
+**Đặc điểm**
+- Có text layer thật
+- Có thể select/copy text
+- Thường sinh ra từ Word, InDesign, LaTeX
+
+**Xử lý**
+- ❌ Không OCR
+- ✅ Đưa thẳng vào MarkItDown
+
+---
+
+### 2.2 PDF loại 2 – Scan / Image-based PDF
+
+**Đặc điểm**
+- Không có text layer
+- Toàn bộ là ảnh
+- Scan từ giấy
+
+**Xử lý**
+- ✅ OCR là BẮT BUỘC
+- ✅ OCR page-wise
+- ✅ Sau OCR mới đưa vào MarkItDown
+
+---
+
+### 2.3 PDF loại 3 – Technical Drawing PDF
+
+**Đặc điểm**
+- Nội dung chính là hình vẽ kỹ thuật
+- Text rải rác (ký hiệu, mã số)
+- Có thể có vector + text layer
+
+**Xử lý**
+- ❌ Không OCR toàn bộ
+- ❌ Không MarkItDown
+- ✅ Metadata-only indexing
+- ✅ OCR giới hạn (title block) nếu cần
+
+---
+
+## 3. PDF Inspection Flow (Luồng quyết định)
+
+```text
+PDF file
+  ↓
+Quick inspection
+  ├─ Has text layer?
+  ├─ Text density?
+  ├─ Page structure?
+  ↓
+Classify:
+  - TEXT_PDF
+  - SCAN_PDF
+  - DRAWING_PDF
+```
+
+---
+
+## 4. Heuristics xác định Text-based PDF
+
+Một PDF được coi là **TEXT_PDF** khi:
+
+- Có text layer ở hơn 70% số trang
+- Text density đủ lớn (nhiều hơn nhãn/ký hiệu)
+- Không có layout dạng drawing
+
+✅ Ví dụ tiêu biểu
+- Quy định
+- Hợp đồng
+- Hướng dẫn
+
+---
+
+## 5. Heuristics xác định Scan PDF
+
+Một PDF được coi là **SCAN_PDF** khi:
+
+- Không có text layer
+- Page rendering = image
+- Độ dài text trích xuất trực tiếp từ PDF = 0 (trước khi chạy OCR)
+
+✅ Đây là target chính của OCR tiếng Việt
+
+---
+
+## 6. Heuristics xác định Drawing PDF (RẤT QUAN TRỌNG)
+
+Một PDF bị coi là **DRAWING_PDF** khi:
+
+- Text layer rất ít nhưng:
+  - Có vector shapes
+  - Có line dày đặc
+  - Khổ trang lớn (A1, A0)
+- Text xuất hiện chủ yếu ở:
+  - Title block
+  - Legend
+
+⚠️ Thoả các dấu hiệu trên → **KHÔNG đưa vào OCR/MarkItDown dù có text layer**.
+
+---
+
+## 7. Vì sao Drawing PDF KHÔNG được OCR đại trà
+
+- OCR sinh rất nhiều noise
+- Text không mang ý nghĩa ngôn ngữ
+- RAG dễ hallucinate nguy hiểm
+
+✅ Nguyên tắc:
+> **Drawing PDF chỉ để TRA CỨU, không để ĐỌC bằng AI**.
+
+---
+
+## 8. Quyết định xử lý theo loại PDF
+
+| PDF Type | OCR | MarkItDown | Search | RAG |
+|--------|-----|------------|--------|-----|
+| TEXT_PDF | ❌ | ✅ | ✅ | ✅ |
+| SCAN_PDF | ✅ | ✅ | ✅ | ✅ |
+| DRAWING_PDF | ❌ (limited) | ❌ | ✅ (metadata) | ❌ |
+
+---
+
+## 9. OCR Scope áp dụng cho PDF
+
+### Chỉ OCR khi:
+- Classified = SCAN_PDF
+
+### OCR giới hạn khi:
+- Classified = DRAWING_PDF
+- Mục tiêu: title, revision, project code
+
+---
+
+## 10. Logging & Audit bắt buộc
+
+Mỗi PDF phải có log:
+
+```json
+{
+  "pdf_type": "SCAN_PDF",
+  "decision_reason": "No text layer detected",
+  "ocr_applied": true
+}
+```
+
+---
+
+## 11. Lợi ích kiến trúc
+
+- ✅ Không OCR sai PDF
+- ✅ Không đưa bản vẽ vào RAG
+- ✅ Giảm chi phí OCR
+- ✅ Tránh hallucination nguy hiểm
+
+---
+
+## 12. Kết luận phụ lục
+
+- PDF inspection là **bước bắt buộc trước OCR**
+- TEXT ≠ SCAN ≠ DRAWING
+- OCR và MarkItDown chỉ là công cụ, **không phải đích đến**
+
+---
+
+*Phụ lục này là nền tảng để thiết kế bước tích hợp OCR trong hệ thống.*
--- a/doc/12.Appendix-OCR-Integration-Strategy-and-Flow.md
+++ b/doc/12.Appendix-OCR-Integration-Strategy-and-Flow.md
@@ -0,0 +1,213 @@
+# 12.Appendix-OCR-Integration-Strategy-and-Flow.md
+
+> **Phụ lục tích hợp OCR – Giải pháp khả thi, rõ ràng, triển khai được ngay** cho hệ thống SharePoint → Search → RAG.
+>
+> ⚠️ Nguyên tắc của file này:
+> - **Đứng độc lập**: có thể load lại trong session khác và tiếp tục thảo luận
+> - **Chọn MỘT giải pháp chính** (không liệt kê mơ hồ nhiều hướng)
+> - **Open‑source, on‑prem friendly**, không lock‑in
+> - Thiết kế **mở để nâng cấp** về sau (fine‑tune / cloud nếu cần)
+
+---
+
+## 1. Tóm tắt nhanh quyết định (Executive Summary)
+
+### Giải pháp OCR được chọn
+
+> ✅ **PaddleOCR (Detection) + VietOCR (Recognition)** – triển khai on‑prem
+
+### Vì sao chọn giải pháp này?
+
+- Phù hợp **99% tài liệu tiếng Việt có dấu**
+- Đã được cộng đồng Việt Nam sử dụng thực tế
+- Hoàn toàn **open‑source**
+- Không phụ thuộc cloud / API bên ngoài
+- Dễ audit, dễ debug, dễ fine‑tune
+
+➡️ Đây là giải pháp **khả thi nhất cho phase PoC → Pilot → Production**.
+
+---
+
+## 2. Vị trí của OCR trong toàn pipeline (nhắc lại để neo tư duy)
+
+```text
+File Ingestion
+   ↓
+Document Classification (file 10)
+   ↓
+PDF Inspection (file 11)
+   ↓
+IF SCAN_PDF
+   → OCR Layer (file này)
+   ↓
+Normalized Text (page-wise)
+   ↓
+MarkItDown
+   ↓
+Chunking → Search → RAG
+```
+
+⚠️ OCR **KHÔNG** đứng độc lập, mà **chỉ là 1 stage có điều kiện**.
+
+---
+
+## 3. OCR Integration – Phạm vi áp dụng
+
+OCR **CHỈ được gọi khi**:
+
+- `doc_type = textual_document`
+- `pdf_type = SCAN_PDF`
+
+OCR **KHÔNG được gọi khi**:
+
+- Drawing PDF
+- Binary / CAD / Excel số liệu
+- File pending classification
+
+---
+
+## 4. Kiến trúc OCR chi tiết (Logical Architecture)
+
+```text
+OCR Service (microservice)
+ ├── Detector: PaddleOCR (DB / SAST)
+ ├── Recognizer: VietOCR (Transformer)
+ ├── Preprocess:
+ │    - Deskew
+ │    - Binarization
+ │    - Resize
+ ├── Output:
+ │    - text
+ │    - page
+ │    - confidence
+```
+
+➡️ OCR được đóng gói dưới dạng **service riêng**, không embed cứng vào ingestion.
+
+---
+## 5. Tại sao chuyển sang VLM (Vision-Language Model) phân tán?
+
+Trong giai đoạn đầu của PoC, chúng ta đã thử nghiệm kiến trúc cũ: **PaddleOCR** (để bóc khung chữ) kết hợp với **VietOCR** (để nhận dạng chữ tiếng Việt).
+Tuy nhiên, phương pháp này đã bộc lộ những điểm yếu chí mạng:
+- **PaddleOCR** cực kỳ yếu trong việc nhận diện dấu tiếng Việt (rụng dấu tả tơi).
+- **VietOCR (vgg_seq2seq)** bị lỗi "Ảo giác" (Hallucination) sinh ra chữ tiếng Anh rác khi bị cắt ảnh không chuẩn xác.
+- Quy trình `Cắt ảnh (Crop) -> Nhận dạng -> Ghép lại` làm đứt gãy cấu trúc tự nhiên (Layout) của văn bản.
+
+Vì vậy, kiến trúc đã được nâng cấp lên chuẩn **Enterprise Distributed AI**:
+- Sử dụng mô hình **Vintern-3B** (Vision-Language Model) chạy trên một máy chủ chuyên dụng trong mạng LAN thông qua `llama.cpp` server.
+- Hệ thống RAG (WSL) chỉ đóng vai trò là một VLM Client: Gửi ảnh sang máy chủ LAN và nhận về nguyên văn Markdown chuẩn xác, giữ nguyên cấu trúc bảng biểu, tiêu đề, và không bao giờ rớt dấu.
+
+---
+
+## 6. Distributed VLM Integration Flow (Kiến trúc phân tán)
+
+```mermaid
+sequenceDiagram
+    participant P as Pipeline (sync.py)
+    participant DCE as Classification (DCE)
+    participant OS as OCR Service (Client)
+    participant LAN as VLM Server (10.202.50.3)
+    participant DB as Vector DB (OpenSearch)
+
+    P->>DCE: Gửi PDF/Image
+    DCE-->>P: file_type = "image/scanned_pdf"
+    P->>OS: process_bytes(pdf_bytes)
+    
+    rect rgb(20, 50, 100)
+    note right of OS: VLM Client Logic
+    OS->>OS: Render trang PDF thành Ảnh (DPI=86)
+    OS->>OS: Encode Ảnh sang Base64
+    OS->>LAN: POST /v1/chat/completions (Base64 + Prompt)
+    end
+    
+    rect rgb(80, 20, 20)
+    note right of LAN: GPU/CPU Heavy Lifting
+    LAN->>LAN: Chạy InternVL2 / Vintern-3B
+    LAN->>LAN: Trích xuất toàn bộ Text và Bảng biểu
+    LAN-->>OS: Trả về văn bản chuẩn Markdown
+    end
+    
+    OS-->>P: List[OCRPageResult(text, confidence)]
+    
+    P->>P: Chunking (Semantic)
+    P->>DB: Indexing
+```
+
+---
+
+## 7. Kiến Trúc Modular Data Provider
+
+Để hệ thống hoàn chỉnh ở mức độ cao cấp, chúng tôi đã tái cấu trúc (Refactor) lại tầng Ingestion:
+1. **BaseStorageProvider:** Interface trừu tượng hoá việc lấy dữ liệu `fetch_changes()` và tải dữ liệu `download_file()`.
+2. **SharePointProvider:** Kế thừa từ BaseStorageProvider, đóng gói toàn bộ logic Graph API.
+3. Trong tương lai, chỉ cần viết thêm `GoogleDriveProvider`, `LocalDriveProvider` hoặc `S3Provider` là hệ thống RAG có thể nuốt mọi nguồn dữ liệu mà không cần sửa đổi Core logic.
+
+---
+
+## 8. Tách Biệt Cấu Hình (Decoupled Configuration)
+
+Các thông số cấu hình của VLM đã được tách biệt hoàn toàn khỏi mã nguồn:
+- Thông số kết nối mạng LAN: `VLM_ENDPOINT`
+- Thông số tinh chỉnh: `VLM_TEMPERATURE`, `VLM_MAX_TOKENS`
+Tất cả được quản lý qua `core/config.py` và file `.env`. Việc đổi máy chủ AI giờ đây chỉ mất 1 giây thay đổi biến môi trường.
+
+---
+
+## 9. Confidence Handling (rất quan trọng cho RAG)
+
+### Nguyên tắc
+
+- OCR confidence < threshold → **KHÔNG đưa vào RAG**
+- OCR confidence < threshold → vẫn search keyword + metadata
+
+### Threshold gợi ý
+
+- `confidence ≥ 0.90` → OK cho RAG
+- `0.80 ≤ confidence < 0.90` → Search only
+- `confidence < 0.80` → Flag review
+
+---
+
+## 10. Logging & Audit OCR
+
+Mỗi OCR job phải log:
+
+```json
+{
+  "file_id": "...",
+  "page": 5,
+  "ocr_engine": "PaddleOCR+VietOCR",
+  "confidence": 0.91
+}
+```
+
+➡️ Phục vụ:
+- audit
+- debug
+- cải tiến model
+
+---
+
+## 11. Mở rộng tương lai (NHƯNG KHÔNG LÀM NGAY)
+
+✅ Đã được thiết kế sẵn, **không lock‑in**:
+
+- Fine‑tune VietOCR theo domain
+- Thêm GPU để tăng tốc
+- Thay recognition engine nếu cần
+- Thêm OCR cloud fallback (optional)
+
+➡️ **Không phá pipeline hiện tại**.
+
+---
+
+## 12. Tóm tắt quyết định kỹ thuật
+
+- OCR là **service có điều kiện**, không phải default
+- PaddleOCR + VietOCR là **giải pháp chính**
+- On‑prem, open‑source, audit‑friendly
+- Sẵn sàng cho PoC → Production
+
+---
+
+*Phụ lục này là tài liệu chốt cách tích hợp OCR cho toàn hệ thống.*
--- a/doc/13.PoC-Implementation-Checklist.md
+++ b/doc/13.PoC-Implementation-Checklist.md
@@ -0,0 +1,193 @@
+# 13.PoC-Implementation-Checklist.md
+
+> **Checklist triển khai PoC (Proof of Concept)** cho hệ thống tra cứu tài liệu SharePoint → Search → RAG Chat.
+>
+> ⚠️ Nguyên tắc của checklist này:
+> - Dựa **100% trên các quyết định đã chốt trong file 1–12**
+> - Dùng để **triển khai PoC thật**, không phải tài liệu lý thuyết
+> - Checklist theo thứ tự **chuẩn kỹ thuật**, làm xong mục trước mới sang mục sau
+> - Có thể dùng cho: Tech Lead, Dev, Vendor, AI Agent
+
+---
+
+## 0. Phạm vi PoC (BẮT BUỘC CHỐT TRƯỚC KHI LÀM)
+
+### ✅ Mục tiêu PoC
+- Tra cứu tài liệu nội bộ (SharePoint)
+- Hỗ trợ **search + RAG chat**
+- Đúng quyền người dùng
+- Ưu tiên **tài liệu tiếng Việt**
+
+### ❌ Ngoài phạm vi PoC
+- Fine‑tune OCR
+- Phân tích bản vẽ kỹ thuật bằng AI
+- Multi‑tenant phức tạp
+
+---
+
+## 1. Chuẩn bị môi trường
+
+- [ ] Tenant SharePoint test
+- [ ] App Registration (Graph API – app‑only)
+- [ ] Server PoC (CPU ≥ 16 cores, RAM ≥ 32GB)
+- [ ] Docker / Container runtime
+- [ ] OpenSearch cluster (dev size)
+
+---
+
+## 2. Ingestion – SharePoint
+
+- [ ] Kết nối Graph API thành công
+- [ ] Lấy được site / library metadata
+- [ ] Delta query hoạt động
+- [ ] Persist delta token
+- [ ] Detect create / update / delete file
+
+---
+
+## 3. Document Classification Engine (File 10)
+
+- [ ] Parse extension + MIME type
+- [ ] Header byte validation (chống giả extension)
+- [ ] Gán `doc_type`
+- [ ] Gán `processing_policy`
+- [ ] Log lý do classify
+
+---
+
+## 4. PDF Inspection (File 11)
+
+- [ ] Detect text layer PDF
+- [ ] Calculate text density
+- [ ] Detect vector / drawing layout
+- [ ] Classify: TEXT_PDF / SCAN_PDF / DRAWING_PDF
+- [ ] Log quyết định inspection
+
+---
+
+## 5. OCR Integration (File 12)
+
+### 5.1 Điều kiện gọi OCR
+- [ ] Chỉ OCR khi `SCAN_PDF`
+- [ ] Skip DRAWING_PDF
+- [ ] Skip non‑textual documents
+
+### 5.2 OCR Service
+- [ ] PaddleOCR detector chạy ổn định
+- [ ] VietOCR recognizer nhận tiếng Việt có dấu
+- [ ] OCR page‑wise
+- [ ] OCR chạy được trên CPU
+
+### 5.3 OCR Output
+- [ ] Có page number
+- [ ] Có text
+- [ ] Có confidence
+- [ ] OCR output chuẩn JSON contract
+
+---
+
+## 6. MarkItDown & Normalization
+
+- [ ] Nhận text từ OCR hoặc text‑PDF
+- [ ] Convert sang Markdown
+- [ ] Giữ heading / paragraph / list
+- [ ] Gắn page marker
+
+---
+
+## 7. Chunking
+
+- [ ] Chunk theo heading / page
+- [ ] Token size phù hợp cho search & RAG
+- [ ] Gắn metadata: file_id, page, URL
+
+---
+
+## 8. Index & Search (OpenSearch)
+
+- [ ] Mapping chunk‑first
+- [ ] Index text field
+- [ ] Index metadata field
+- [ ] Index ACL
+- [ ] Index embedding (nếu dùng)
+
+---
+
+## 9. Permission Enforcement
+
+- [ ] Resolve user identity
+- [ ] Resolve group membership
+- [ ] Filter search theo ACL
+- [ ] Không lộ file user không có quyền
+
+---
+
+## 10. Search Experience
+
+- [ ] Keyword search hoạt động
+- [ ] Search trả đúng file & page
+- [ ] Highlight nội dung
+- [ ] Click mở file gốc
+
+---
+
+## 11. RAG Chat (File 5)
+
+### 11.1 Điều kiện vào RAG
+- [ ] Có kết quả search
+- [ ] OCR confidence đạt threshold
+
+### 11.2 Prompt & Answer
+- [ ] System prompt chặt (no hallucination)
+- [ ] Chỉ dùng context search
+- [ ] Trả câu trả lời + citation
+
+### 11.3 Fallback
+- [ ] Không có dữ liệu → trả lời chuẩn
+
+---
+
+## 12. Logging & Audit
+
+- [ ] Log ingestion
+- [ ] Log OCR
+- [ ] Log search query
+- [ ] Log RAG answer + context
+
+---
+
+## 13. User Validation (PoC Acceptance)
+
+- [ ] User tìm được tài liệu mong muốn
+- [ ] User hiểu citation
+- [ ] AI không trả lời bịa
+- [ ] UX dễ sử dụng
+
+---
+
+## 14. KPI PoC (Đủ/Không đủ)
+
+- [ ] Search latency < 3s
+- [ ] OCR Vietnamese readable
+- [ ] RAG trả lời đúng phạm vi
+- [ ] Không lộ quyền
+
+---
+
+## 15. Quyết định sau PoC
+
+- [ ] Go Pilot
+- [ ] Điều chỉnh OCR threshold
+- [ ] Thêm loại document
+- [ ] Fine‑tune sau (nếu cần)
+
+---
+
+## Kết luận Checklist
+
+- Checklist này hoàn thành → PoC **đã thành công về mặt kỹ thuật**
+- Chưa hoàn thành → **không nên lên production**
+
+---
+
+*Tài liệu này là checklist thực thi, không phải tài liệu mô tả.*
--- a/doc/14.Project-Bridge-Context-for-New-Chat.md
+++ b/doc/14.Project-Bridge-Context-for-New-Chat.md
@@ -0,0 +1,194 @@
+# 14.Project-Bridge-Context-for-New-Chat.md
+
+> Mục tiêu file này: **cầu nối ngữ cảnh** để mở New Chat và tiếp tục dự án mà **không cần attach lại toàn bộ 13 file**.
+>
+> ⚠️ Lưu ý quan trọng:
+> - File này là **bản tóm lược có cấu trúc**, **KHÔNG thay thế** hoàn toàn cho các file 1–13.
+> - Dùng khi giới hạn số file đính kèm thấp.
+> - Nếu cần chi tiết kỹ thuật sâu cho một mô-đun, vẫn nên attach thêm file gốc liên quan.
+
+---
+
+## 1. Dự án đang làm là gì?
+
+Xây dựng hệ thống tra cứu tài liệu nội bộ theo pipeline:
+
+```text
+SharePoint → Ingestion → Document Classification → PDF Inspection → OCR (có điều kiện) → MarkItDown → Chunking → Search → RAG Chat
+```
+
+Mục tiêu:
+- Tra cứu nhanh nội dung tài liệu công ty lưu trong SharePoint
+- Tìm đúng file, đúng trang, đúng quyền truy cập
+- Hỗ trợ Search và RAG Chat có dẫn nguồn
+
+---
+
+## 2. Các quyết định kiến trúc đã CHỐT (không thay đổi)
+
+### 2.1 Phân loại tài liệu là bắt buộc trước xử lý
+- Không có one-size-fits-all cho mọi file
+- Mỗi file phải đi qua **Document Classification Engine (DCE)** trước
+
+### 2.2 PDF phải được inspect trước OCR
+PDF được chia thành 3 loại:
+- `TEXT_PDF` → không OCR, đưa thẳng vào MarkItDown
+- `SCAN_PDF` → OCR bắt buộc, page-wise
+- `DRAWING_PDF` → không OCR đại trà, không MarkItDown, không RAG
+
+### 2.3 Bản vẽ kỹ thuật / CAD / binary không đi vào RAG
+- DWG/DXF/IFC/CAD/Binary → metadata search only
+- Drawing PDF → metadata + optional title-block OCR, không RAG
+
+### 2.4 OCR chính thức được chọn
+- **PaddleOCR (Detection) + VietOCR (Recognition)**
+- On-prem, open-source, không lock-in cloud
+- OCR chỉ gọi khi:
+  - `doc_type = textual_document`
+  - `pdf_type = SCAN_PDF`
+
+### 2.5 RAG là tầng sau Search, có điều kiện
+- Search-first, LLM-second
+- Không có context tốt → không trả lời
+- Trả lời phải có citation
+- OCR confidence thấp → không đưa vào RAG
+
+---
+
+## 3. Các file gốc đã tồn tại trước đó
+
+### Nhóm kiến trúc lõi
+1. Kiến trúc tổng thể
+2. SharePoint ingestion playbook
+3. Extraction & normalization playbook
+4. OpenSearch index & search playbook
+5. RAG chat application playbook
+6. Operations / monitoring / governance playbook
+
+### Nhóm review & điều phối
+7. Review / double-check / gap analysis
+8. End-to-end processing flows bullets
+
+### Nhóm phụ lục kỹ thuật quan trọng
+9. Appendix – Vietnamese OCR strategy
+10. Appendix – Document type classification and processing strategy
+11. Appendix – PDF inspection and text-layer detection
+12. Appendix – OCR integration strategy and flow
+13. PoC implementation checklist
+
+---
+
+## 4. Trạng thái hiện tại của dự án
+
+- Đã chốt kiến trúc end-to-end
+- Đã chốt luật xử lý tài liệu
+- Đã chốt chiến lược OCR tiếng Việt
+- Đã có checklist PoC chi tiết
+- Đang chuyển sang giai đoạn **bắt tay triển khai PoC thật**
+
+Trọng tâm hiện tại:
+> Tạo / dùng **SharePoint Team Site test** làm nguồn dữ liệu đầu vào cho PoC.
+
+---
+
+## 5. Site SharePoint test dự kiến / khuyến nghị
+
+### Loại site
+- **Team site**
+- Privacy: Private
+
+### Tên gợi ý
+- `SP-RAG-Test`
+
+### Cấu trúc thư mục test khuyến nghị
+```text
+/Documents
+  /01-PDF-Text
+  /02-PDF-Scan
+  /03-PDF-Drawing
+  /04-DOCX
+  /05-XLSX-Textual
+  /06-XLSX-Structured
+  /07-CAD-Binary
+  /99-Pending-Classification
+```
+
+### ACL test
+- Có ít nhất 1 thư mục bị giới hạn quyền (restricted) để test permission filtering
+
+---
+
+## 6. Thứ tự triển khai PoC đã chốt
+
+1. Tạo SharePoint site test
+2. Upload bộ dữ liệu test nhỏ nhưng đủ loại
+3. Kiểm tra quyền truy cập khác nhau giữa user
+4. Bắt đầu ingestion
+5. Chạy document classification
+6. Chạy PDF inspection
+7. Chỉ OCR cho SCAN_PDF
+8. MarkItDown / normalization / chunking
+9. Index vào OpenSearch
+10. Bật search
+11. Chỉ bật RAG sau khi search + ACL + citation ổn
+
+---
+
+## 7. Các ngưỡng / guardrails đã chốt
+
+### OCR confidence
+- `>= 0.90` → có thể dùng cho RAG
+- `>= 0.80` và `< 0.90` → search only
+- `< 0.80` → flag review
+
+### Không được làm
+- Không OCR drawing PDF đại trà
+- Không đưa CAD/binary vào RAG
+- Không bỏ qua bước classification/PDF inspection
+- Không để AI tự thay đổi kiến trúc đã chốt
+
+---
+
+## 8. Khi mở New Chat, AI cần tuân thủ gì?
+
+### Bắt buộc
+1. Không thay đổi các quyết định đã chốt ở trên
+2. Không đề xuất đi ngược các guardrails
+3. Nếu mở rộng thì chỉ tạo phụ lục mới hoặc ghi rõ là đề xuất mở rộng
+4. Tiếp tục đúng ngữ cảnh: **từ SharePoint site test → PoC triển khai**
+
+### Câu mở đầu gợi ý cho New Chat
+```text
+Tôi đang tiếp tục dự án SharePoint → Search → RAG.
+Hãy coi file bridge context này là source of truth tạm thời.
+Mục tiêu hiện tại: dùng SharePoint site test làm input cho PoC và đi tiếp đúng theo roadmap đã chốt.
+Không thay đổi kiến trúc, không nhảy bước.
+```
+
+---
+
+## 9. Gợi ý attach file khi chỉ được add ít file
+
+### Tối thiểu nên attach
+1. **File bridge này**
+2. `10.Appendix-Document-Type-Classification-and-Processing-Strategy.md`
+3. `13.PoC-Implementation-Checklist.md`
+
+### Nếu cần đi sâu OCR
+Thay file 13 hoặc attach tiếp ở lượt sau:
+- `11.Appendix-PDF-Inspection-and-TextLayer-Detection.md`
+- `12.Appendix-OCR-Integration-Strategy-and-Flow.md`
+
+---
+
+## 10. Mục tiêu thảo luận tiếp theo sau khi mở New Chat
+
+Một trong các hướng sau:
+- Chuẩn bị dataset test cho SharePoint site
+- Map SharePoint site test vào PoC pipeline
+- Lập execution plan theo checklist PoC
+- Bắt đầu dựng skeleton triển khai kỹ thuật
+
+---
+
+*Kết thúc file bridge. Đây là file cầu nối ngữ cảnh, tối ưu cho trường hợp giới hạn số file đính kèm.*
--- a/doc/2.sharepoint-ingestion-playbook.md
+++ b/doc/2.sharepoint-ingestion-playbook.md
@@ -0,0 +1,264 @@
+# SharePoint Ingestion – Tài liệu triển khai tuần tự (Reusable)
+
+> Mục tiêu tài liệu: **Có thể nạp lại bất kỳ lúc nào để tiếp tục đúng hướng**. Viết theo kiểu **triển khai được ngay**, tránh phụ thuộc ngữ cảnh hội thoại.
+
+---
+
+## Mục lục
+1. Phạm vi & Mục tiêu
+2. Nguyên tắc thiết kế
+3. Luồng xử lý tổng thể (End-to-End Flow)
+4. Sequence Diagram (Mô tả tuần tự)
+5. Thiết kế Ingestion Service
+6. Delta Sync & State Management
+7. Permission Flattening (ACL)
+8. Download & File Lifecycle
+9. Orchestration, Retry, Idempotency
+10. Thiết kế CSDL (Schema)
+11. Internal APIs (Spec)
+12. Observability & Ops
+13. Roadmap thực thi
+
+---
+
+## 1. Phạm vi & Mục tiêu
+- Ingest tài liệu từ **SharePoint Online (Microsoft 365)** bằng **Microsoft Graph API**.
+- Phát hiện thay đổi bằng **Delta Query**.
+- Thu thập **metadata + quyền truy cập**.
+- Đưa file hợp lệ vào pipeline xử lý (OCR/MarkItDown ở phase sau).
+- Đảm bảo **permission-aware ngay từ ingestion**.
+
+---
+
+## 2. Nguyên tắc thiết kế
+- **App-only authentication** (service-to-service).
+- **Delta-first**: không full scan lặp lại.
+- **Permission-first**: flatten ACL khi ingest.
+- **Idempotent**: chạy nhiều lần cho cùng version → kết quả giống nhau.
+- **File gốc là nguồn chân lý** (chỉ lưu bản trích xuất + index).
+
+---
+
+## 3. Luồng xử lý tổng thể (End-to-End)
+
+```text
+[Scheduler]
+  ↓
+[List Configured Sites]
+  ↓
+[List Drives / Libraries]
+  ↓
+[Delta Query Items]
+  ↓
+[For each Changed Item]
+   ├─ Fetch Metadata
+   ├─ Fetch Permissions (ACL)
+   ├─ Decide Eligibility (whitelist extension/size)
+   ├─ Enqueue Processing Job
+   └─ Persist State (deltaToken, etag)
+```
+
+---
+
+## 4. Sequence Diagram (Mô tả tuần tự)
+
+```text
+Scheduler
+  |
+  | trigger
+  v
+Ingestion Service
+  |
+  | GET sites/drives
+  v
+Microsoft Graph
+  |
+  | 200 OK (sites/drives)
+  v
+Ingestion Service
+  |
+  | GET delta(items, token)
+  v
+Microsoft Graph
+  |
+  | items[] + deltaToken
+  v
+Ingestion Service
+  |
+  | For item changed:
+  |   - GET item metadata
+  |   - GET item permissions
+  |   - Persist ingest record
+  |   - Push job to Queue
+  v
+Queue / Pipeline
+```
+
+---
+
+## 5. Thiết kế Ingestion Service
+
+### 5.1 Thành phần
+- **Auth Module**: OAuth2 client-credentials + certificate.
+- **Site/Drive Scanner**: liệt kê phạm vi được cấu hình.
+- **Delta Engine**: quản lý deltaToken.
+- **Metadata Collector**: chuẩn hóa metadata.
+- **ACL Collector**: flatten permission.
+- **Job Producer**: đẩy sự kiện sang pipeline.
+
+### 5.2 Công nghệ gợi ý
+- Backend: Python (FastAPI) hoặc Node.js
+- Queue: Azure Service Bus / RabbitMQ / Redis
+- Storage state: PostgreSQL hoặc CosmosDB (metadata)
+
+---
+
+## 6. Delta Sync & State Management
+
+### 6.1 Delta Token
+- Mỗi **drive/library** có một `delta_token` riêng.
+- Lưu bền vững (DB), không lưu memory.
+
+### 6.2 Logic
+```text
+If first_run:
+  call delta without token → snapshot + token
+Else:
+  call delta with token → changes only
+```
+
+### 6.3 Event Handling
+| Event | Hành động |
+|------|----------|
+| Created | Full ingest |
+| Updated | Re-ingest (new version) |
+| Deleted | Soft-delete index |
+
+---
+
+## 7. Permission Flattening (ACL)
+
+### 7.1 Nguyên tắc
+- Thu thập **users + groups** ngay khi ingest.
+- Expand nested groups **tại ingestion time**.
+
+### 7.2 Schema ACL
+```json
+{
+  "users": ["aad-user-id"],
+  "groups": ["aad-group-id"],
+  "inherited": true
+}
+```
+
+### 7.3 Tối ưu
+- Cache membership group.
+- Refresh định kỳ (daily/weekly).
+
+---
+
+## 8. Download & File Lifecycle
+
+### 8.1 Eligibility Rules
+- Whitelist: pdf, docx, pptx, image.
+- Size limit (ví dụ ≤ 100MB).
+
+### 8.2 Lifecycle
+```text
+Download → Process (OCR/MD) → Persist result → Delete local file
+```
+
+---
+
+## 9. Orchestration, Retry, Idempotency
+
+### 9.1 Idempotency Key
+- `(site_id, drive_id, item_id, etag)`
+
+### 9.2 Retry Strategy
+| Error | Strategy |
+|------|----------|
+| 429 | Exponential backoff |
+| Timeout | Retry N lần |
+| Permission denied | Log + skip |
+
+---
+
+## 10. Thiết kế CSDL (Schema)
+
+### 10.1 Bảng `ingest_state`
+```sql
+site_id TEXT
+drive_id TEXT
+delta_token TEXT
+updated_at TIMESTAMP
+PRIMARY KEY (site_id, drive_id)
+```
+
+### 10.2 Bảng `files`
+```sql
+item_id TEXT
+site_id TEXT
+drive_id TEXT
+file_name TEXT
+etag TEXT
+version TEXT
+status TEXT
+last_processed_at TIMESTAMP
+PRIMARY KEY (item_id)
+```
+
+### 10.3 Bảng `permissions`
+```sql
+item_id TEXT
+principal_type TEXT  -- user|group
+principal_id TEXT
+```
+
+---
+
+## 11. Internal APIs (Spec)
+
+### POST /ingest/run
+- Trigger ingestion cycle.
+
+### POST /ingest/item
+- Input: `{ site_id, drive_id, item_id }`
+- Output: job_id
+
+### GET /ingest/state
+- Trả về deltaToken, health.
+
+---
+
+## 12. Observability & Ops
+
+### Logs (per item)
+```json
+{ "item_id": "...", "step": "permission", "status": "ok" }
+```
+
+### Metrics
+- Files/min
+- OCR queue length
+- Error rate
+
+---
+
+## 13. Roadmap thực thi
+
+### Step 1 (Tuần 1–2)
+- Auth + list sites/drives
+- Delta sync cơ bản
+
+### Step 2 (Tuần 3)
+- Metadata + ACL flatten
+- Queue integration
+
+### Step 3 (Tuần 4)
+- Observability
+- Hardening & scale
+
+---
+
+*Kết thúc tài liệu ingestion. Có thể tiếp tục với Extraction/OCR hoặc Index/Search ở file tiếp theo.*
--- a/doc/3.extraction-markdown-normalization-playbook.md
+++ b/doc/3.extraction-markdown-normalization-playbook.md
@@ -0,0 +1,241 @@
+# Extraction & Normalization – OCR + MarkItDown Playbook (Reusable)
+
+> Tài liệu này nối tiếp **SharePoint Ingestion Playbook**. Mục tiêu: biến file ingest thành **Markdown sạch, có cấu trúc, có thể search – RAG – truy vết trang gốc**. Viết để **tái sử dụng lâu dài**, không phụ thuộc session.
+
+---
+
+## Mục lục
+1. Vị trí của Extraction trong kiến trúc
+2. Nguyên tắc thiết kế
+3. Luồng xử lý tổng thể
+4. Phân loại tài liệu & chiến lược xử lý
+5. OCR Strategy cho tài liệu scan
+6. MarkItDown – tích hợp & cấu hình
+7. Page Mapping & Anchoring (đi tới đúng trang)
+8. Normalization & Cleanup
+9. Chunking Strategy (chuẩn cho Search & RAG)
+10. Output Contract (interface dữ liệu)
+11. Error handling & Retry
+12. Performance & Scale
+13. Checklist triển khai
+
+---
+
+## 1. Vị trí của Extraction trong kiến trúc
+
+```text
+Ingestion (file + metadata + ACL)
+        │
+        ▼
+Extraction Layer (OCR + MarkItDown)
+        │
+        ▼
+Normalization & Chunking
+        │
+        ▼
+Index / Search / RAG
+```
+
+Extraction **không biết Search**, **không biết User**, chỉ biết:
+- Input: file + metadata
+- Output: Markdown + page mapping + structural hints
+
+---
+
+## 2. Nguyên tắc thiết kế
+
+1. **Deterministic**: cùng file + version → output giống nhau
+2. **Lossless về nội dung**: ưu tiên giữ text hơn làm đẹp
+3. **Trang là đơn vị neo (anchor)** – đặc biệt với PDF
+4. **Tách OCR khỏi Markdown** (để thay engine sau này)
+5. **Không embed business logic** vào bước này
+
+---
+
+## 3. Luồng xử lý tổng thể
+
+```text
+[Receive File Job]
+  ↓
+[Detect File Type]
+  ↓
+[If Scan → OCR]
+  ↓
+[MarkItDown Convert → Markdown]
+  ↓
+[Normalize Markdown]
+  ↓
+[Split by Page / Heading]
+  ↓
+[Emit Document Units]
+```
+
+---
+
+## 4. Phân loại tài liệu & chiến lược xử lý
+
+| Loại | Dấu hiệu | Chiến lược |
+|----|--------|-----------|
+| PDF scan | Không có text layer | OCR → MD |
+| PDF text | Có selectable text | Direct → MD |
+| DOCX | Word | Direct → MD |
+| PPTX | Slide | Slide-wise MD |
+| Image | jpg/png | OCR → MD |
+
+✅ Phân loại phải tự động, **không chỉ dựa vào extension** (cần kiểm tra cả nội dung file).
+
+---
+
+## 5. OCR Strategy
+
+### 5.1 Khi nào OCR?
+- PDF không có text layer
+- Image-based document
+
+### 5.2 Output OCR yêu cầu tối thiểu
+```json
+{
+  "page": 5,
+  "text": "Nội dung nhận dạng...",
+  "confidence": 0.92
+}
+```
+
+### 5.3 Nguyên tắc
+- OCR **theo từng trang**
+- Không gộp toàn file thành một blob text
+
+---
+
+## 6. MarkItDown – tích hợp & cấu hình
+
+### 6.1 Vai trò
+- Biến input (PDF/DOCX/image) thành **Markdown có cấu trúc**
+
+### 6.2 Mode sử dụng
+- Input: file path hoặc stream
+- Output: Markdown + page breaks
+
+### 6.3 Quy ước page break (rất quan trọng)
+```markdown
+<!-- page:1 -->
+# Trang 1
+...
+<!-- page:2 -->
+# Trang 2
+```
+
+➡️ Page marker này là **khóa để click mở đúng trang PDF**.
+
+---
+
+## 7. Page Mapping & Anchoring
+
+### 7.1 Mục tiêu
+- Người dùng search → click → mở **đúng trang, đúng vị trí**
+
+### 7.2 Cách làm
+- Mỗi block Markdown phải mang theo:
+```json
+{
+  "file_id": "...",
+  "page_from": 3,
+  "page_to": 4,
+  "sharepoint_url": "...?page=3"
+}
+```
+
+---
+
+## 8. Normalization & Cleanup
+
+### 8.1 Những việc NÊN làm
+- Remove header/footer lặp
+- Strip ký tự OCR rác
+- Normalize whitespace
+- Chuẩn hóa heading
+
+### 8.2 Những việc KHÔNG NÊN làm
+- Rewrite nội dung
+- Tóm tắt
+- Suy diễn
+
+➡️ Bước này **không dùng LLM**.
+
+---
+
+## 9. Chunking Strategy
+
+### 9.1 Thứ tự ưu tiên
+1. Heading (##, ###)
+2. Trang PDF
+3. Đoạn văn
+
+### 9.2 Kích thước gợi ý
+- 300–800 tokens
+- Không cắt giữa câu
+
+### 9.3 Schema chunk
+```json
+{
+  "chunk_id": "uuid",
+  "text": "...",
+  "file_id": "...",
+  "page_from": 5,
+  "page_to": 6,
+  "section": "Điều 3",
+  "source_url": "..."
+}
+```
+
+---
+
+## 10. Output Contract
+
+### 10.1 Đầu ra cho Index Layer
+```json
+{
+  "file_id": "...",
+  "chunks": [ ... ],
+  "metadata": { ... }
+}
+```
+
+### 10.2 Tính chất
+- Self-contained
+- Không cần gọi lại SharePoint
+- Có thể re-index lại từ output này
+
+---
+
+## 11. Error handling & Retry
+
+| Lỗi | Xử lý |
+|---|------|
+| OCR fail | Retry / flag manual |
+| File corrupt | Log + skip |
+| MarkItDown error | Retry with fallback |
+
+Mỗi file có **processing status riêng**.
+
+---
+
+## 12. Performance & Scale
+
+- OCR là bottleneck → async
+- 1 file lớn = nhiều page jobs
+- Scale theo số trang, không theo số file
+
+---
+
+## 13. Checklist triển khai
+
+✅ Detect scan vs text
+✅ OCR page-wise
+✅ Page marker trong Markdown
+✅ Chunk có page mapping
+✅ Output contract rõ ràng
+
+---
+
+*Kết thúc Extraction & Normalization Playbook. File này nối trực tiếp sang Index/Search.*
--- a/doc/4.OpenSearch-Index-Search-Playbook.md
+++ b/doc/4.OpenSearch-Index-Search-Playbook.md
@@ -0,0 +1,246 @@
+# 4.OpenSearch-Index-Search-Playbook.md
+
+> Tài liệu này kế thừa trực tiếp các file trước (1–3). Mục tiêu: thiết kế **Index & Search Layer** đảm bảo tra cứu nhanh, đúng quyền, hỗ trợ full‑text, semantic search và RAG về sau. Nội dung viết để **tái sử dụng lâu dài**, có thể nạp lại để tiếp tục triển khai.
+
+---
+
+## Mục lục
+1. Vai trò của Index/Search trong pipeline
+2. Nguyên tắc thiết kế
+3. Kiến trúc logical của Search Layer
+4. Thiết kế Index (mapping chi tiết)
+5. ACL‑aware indexing & filtering
+6. Full‑text Search
+7. Vector & Semantic Search
+8. Hybrid Search (khuyến nghị)
+9. Highlight, Ranking & Relevance tuning
+10. Query contract (API level)
+11. Performance & Scale
+12. Lifecycle management (update/delete)
+13. Checklist triển khai
+
+---
+
+## 1. Vai trò của Index/Search trong pipeline
+
+```text
+Extraction & Chunking
+        │
+        ▼
+Index Layer (OpenSearch)
+        │
+        ▼
+Search UI / Chat (RAG)
+```
+
+Index/Search chịu trách nhiệm:
+- Lưu trữ **chunk nội dung + metadata + ACL**
+- Trả kết quả **nhanh, chính xác, đúng quyền**
+- Cho phép search theo **từ khóa lẫn ngữ nghĩa**
+
+---
+
+## 2. Nguyên tắc thiết kế
+
+1. **Chunk‑first, không file‑first**
+2. **Permission filter ở query time nhưng dữ liệu đã chuẩn hóa từ ingestion**
+3. **Hybrid search mặc định** (keyword + vector)
+4. **Index schema ổn định**, embedding thay đổi được
+5. **Re‑index được từ output Extraction mà không cần SharePoint**
+
+---
+
+## 3. Kiến trúc logical của Search Layer
+
+```text
+OpenSearch Cluster
+ ├── text index
+ ├── vector index (hoặc combined)
+ ├── analyzer (vi, en)
+ └── ACL filter
+```
+
+Có thể:
+- Dùng **1 index combined** (text + vector)
+- Hoặc **2 index song song** (giai đoạn đầu nên dùng 1 index combined cho đơn giản)
+
+---
+
+## 4. Thiết kế Index (Mapping)
+
+### 4.1 Document unit
+Mỗi document trong index = **1 chunk**.
+
+### 4.2 Mapping gợi ý
+
+```json
+{
+  "mappings": {
+    "properties": {
+      "chunk_id": { "type": "keyword" },
+      "file_id": { "type": "keyword" },
+      "file_name": { "type": "text" },
+      "text": {
+        "type": "text",
+        "analyzer": "standard",
+        "search_analyzer": "standard"
+      },
+      "embedding": {
+        "type": "knn_vector",
+        "dimension": 768
+      },
+      "site_id": { "type": "keyword" },
+      "page_from": { "type": "integer" },
+      "page_to": { "type": "integer" },
+      "source_url": { "type": "keyword" },
+      "permissions": { "type": "keyword" },
+      "updated_at": { "type": "date" }
+    }
+  }
+}
+```
+
+---
+
+## 5. ACL‑aware Indexing & Filtering
+
+### 5.1 Nguyên tắc
+- Không index dữ liệu như public rồi mới lọc payload kết quả sau khi search
+- Mỗi chunk mang theo **list principal IDs** (user/group)
+
+### 5.2 Filter tại query
+```json
+{
+  "terms": {
+    "permissions": ["current_user_id", "group_id_1"]
+  }
+}
+```
+
+---
+
+## 6. Full‑text Search
+
+Sử dụng khi:
+- Tìm chính xác điều khoản, mã số, tên riêng
+
+Ví dụ query:
+```json
+{
+  "match": {
+    "text": "hợp đồng lao động không xác định thời hạn"
+  }
+}
+```
+
+---
+
+## 7. Vector & Semantic Search
+
+Sử dụng khi:
+- Câu hỏi tự nhiên
+- Nội dung diễn đạt khác từ khóa
+
+```json
+{
+  "knn": {
+    "embedding": {
+      "vector": [ ... ],
+      "k": 10
+    }
+  }
+}
+```
+
+---
+
+## 8. Hybrid Search (Khuyến nghị mặc định)
+
+```json
+{
+  "bool": {
+    "must": [
+      {
+        "match": { "text": "chấm dứt hợp đồng" }
+      }
+    ],
+    "should": [
+      {
+        "knn": {
+          "embedding": {
+            "vector": [ ... ],
+            "k": 10
+          }
+        }
+      }
+    ]
+  }
+}
+```
+
+Score cuối = combine(keyword_score, semantic_score)
+
+---
+
+## 9. Highlight, Ranking & Relevance
+
+### 9.1 Highlight
+- Highlight ở field `text`
+- Trả snippet cho UI
+
+### 9.2 Ranking hints
+- Boost theo:
+  - page proximity
+  - recency (updated_at)
+  - heading match
+
+---
+
+## 10. Query Contract (API nội bộ)
+
+### POST /search
+Input:
+```json
+{ "query": "...", "user_id": "...", "groups": ["..."] }
+```
+
+Output:
+```json
+{
+  "results": [
+    { "file_name": "...", "page": 5, "snippet": "...", "url": "..." }
+  ]
+}
+```
+
+---
+
+## 11. Performance & Scale
+
+- Index theo chunk, không theo file
+- Shard theo dữ liệu, không theo tenant sớm
+- Cache query phổ biến
+
+---
+
+## 12. Lifecycle Management
+
+| Event | Action |
+|------|-------|
+| File update | Delete chunks cũ → index lại |
+| File delete | Soft delete hoặc remove |
+| Re‑embedding | Update `embedding` field |
+
+---
+
+## 13. Checklist triển khai
+
+✅ Mapping ổn định
+✅ ACL filter hoạt động
+✅ Hybrid search default
+✅ Highlight trả đúng trang
+✅ Re‑index không cần SharePoint
+
+---
+
+*Kết thúc Index & Search Playbook. File tiếp theo sẽ là RAG / Chat Layer.*
--- a/doc/5.RAG-Chat-Application-Playbook.md
+++ b/doc/5.RAG-Chat-Application-Playbook.md
@@ -0,0 +1,242 @@
+# 5.RAG-Chat-Application-Playbook.md
+
+> Tài liệu này mô tả **tầng RAG Chat** xây dựng trên Search (file 4). Mục tiêu là:
+> - Giúp **AI trả lời đúng – có dẫn chứng – không hallucinate**
+> - Giúp **người dùng cuối dễ hiểu, dễ tin, dễ dùng**
+> - Có thể dùng làm **tài liệu tham khảo khi soạn Hướng dẫn sử dụng (User Guide)**
+>
+> File được viết để **AI agent đọc là triển khai được**, **con người đọc là hiểu và sử dụng được**.
+
+---
+
+## Mục lục
+1. RAG Chat là gì trong hệ thống này?
+2. Nguyên tắc thiết kế RAG Chat
+3. Kiến trúc tổng thể RAG Chat
+4. Vai trò của Search trong RAG
+5. Luồng xử lý câu hỏi (End‑to‑End)
+6. Prompt Strategy (Quan trọng nhất)
+7. Context Assembly & Citation
+8. Tránh Hallucination & Guardrails
+9. Trải nghiệm người dùng (UX Guidelines)
+10. Các chế độ Chat khuyến nghị
+11. Permission propagation
+12. Logging & Explainability
+13. Checklist triển khai & Checklist hướng dẫn sử dụng
+
+---
+
+## 1. RAG Chat là gì trong hệ thống này?
+
+**RAG (Retrieval‑Augmented Generation)** = AI **KHÔNG tự nghĩ**, mà:
+1. Tìm dữ liệu liên quan trong hệ thống Search
+2. Chỉ dùng dữ liệu đó để trả lời
+3. Luôn nói rõ **lấy thông tin từ đâu**
+
+Trong hệ thống này:
+- ❌ Chat **không** thay thế Search
+- ✅ Chat **dựa hoàn toàn trên Search**
+
+---
+
+## 2. Nguyên tắc thiết kế RAG Chat
+
+1. **Search‑first, LLM‑second**
+2. **Không có dữ liệu → không trả lời**
+3. **Trả lời = Nội dung + Dẫn chứng**
+4. **Ngôn ngữ rõ ràng, không học thuật thừa**
+5. **Người dùng luôn có thể click mở tài liệu gốc**
+
+---
+
+## 3. Kiến trúc tổng thể RAG Chat
+
+```text
+User Question
+     │
+     ▼
+Query Understanding
+     │
+     ▼
+Search Layer (OpenSearch)
+     │
+     ├─ Top K chunks (ACL‑aware)
+     ▼
+Context Assembly
+     │
+     ▼
+LLM (Answer generation)
+     │
+     ▼
+Answer + Citations
+```
+
+---
+
+## 4. Vai trò của Search trong RAG
+
+Search quyết định:
+- AI **được phép biết gì**
+- AI **không được phép bịa gì**
+
+Quy tắc:
+- LLM **chỉ được dùng context do Search trả về**
+- Không cho LLM truy cập internet hoặc dựa vào kiến thức từ training data
+
+---
+
+## 5. Luồng xử lý câu hỏi (End‑to‑End)
+
+```text
+[User hỏi]
+  ↓
+[Normalize câu hỏi]
+  ↓
+[Hybrid Search (keyword + vector)]
+  ↓
+[Lọc theo permission user]
+  ↓
+[Chọn top K chunks]
+  ↓
+[Build context + citation map]
+  ↓
+[LLM sinh câu trả lời]
+  ↓
+[Trả lời + link tài liệu]
+```
+
+---
+
+## 6. Prompt Strategy (Rất quan trọng)
+
+### 6.1 System Prompt (bắt buộc)
+
+```text
+Bạn là trợ lý tra cứu tài liệu nội bộ.
+Bạn CHỈ được trả lời dựa trên thông tin được cung cấp trong CONTEXT.
+Nếu CONTEXT không đủ, hãy trả lời: 
+"Tôi không tìm thấy thông tin trong các tài liệu hiện có." 
+Mỗi câu trả lời phải kèm theo nguồn trích dẫn.
+```
+
+---
+
+### 6.2 User Prompt Template
+
+```text
+CÂU HỎI:
+{{user_question}}
+
+CONTEXT:
+{{retrieved_chunks}}
+```
+
+---
+
+### 6.3 Output Format (khuyến nghị)
+
+```text
+TRẢ LỜI NGẮN GỌN:
+...
+
+CHI TIẾT:
+...
+
+NGUỒN THAM KHẢO:
+- Tài liệu A – trang 5
+- Tài liệu B – trang 12
+```
+
+➡️ Format này **rất phù hợp để đưa vào User Guide**.
+
+---
+
+## 7. Context Assembly & Citation
+
+### 7.1 Context Assembly
+- Giữ đúng thứ tự logic
+- Không quá dài (token budget)
+- Mỗi chunk có ID và nguồn
+
+### 7.2 Citation Object
+
+```json
+{
+  "chunk_id": "...",
+  "file_name": "...",
+  "page": 7,
+  "url": "..."
+}
+```
+
+---
+
+## 8. Tránh Hallucination & Guardrails
+
+✅ BẮT BUỘC:
+- Nếu không có chunk phù hợp → không trả lời
+- Không "suy luận thêm" ngoài context
+
+✅ Câu trả lời chuẩn khi không đủ dữ liệu:
+> "Hiện tại tôi không tìm thấy thông tin trong các tài liệu nội bộ để trả lời câu hỏi này."
+
+---
+
+## 9. Trải nghiệm người dùng (UX Guidelines)
+
+### 9.1 Người dùng nên cảm nhận gì?
+- AI **đáng tin**
+- Nói **giống đồng nghiệp biết tài liệu**, không giống chatbot chung chung
+
+### 9.2 UX khuyến nghị
+- Mỗi đoạn trả lời có nút "Mở tài liệu"
+- Highlight đoạn liên quan trong PDF
+
+---
+
+## 10. Các chế độ Chat khuyến nghị
+
+| Mode | Mô tả | Đối tượng |
+|----|------|----------|
+| Tra cứu nhanh | Trả lời ngắn + link | Nhân viên |
+| Giải thích | Có diễn giải | Đào tạo |
+| So sánh | So nhiều tài liệu | Quản lý |
+
+---
+
+## 11. Permission Propagation
+
+Nguyên tắc:
+- Chat **không trả về thứ Search không trả**
+- Không lộ tên file tồn tại nếu user không có quyền
+
+---
+
+## 12. Logging & Explainability
+
+Lưu lại:
+- Question
+- Retrieved chunks
+- Answer
+- Citation
+
+➡️ Phục vụ audit, cải tiến prompt, training nội bộ
+
+---
+
+## 13. Checklist triển khai & Checklist hướng dẫn sử dụng
+
+### 13.1 Checklist kỹ thuật
+✅ Search trước Chat
+✅ Prompt chặt chẽ
+✅ Citation bắt buộc
+✅ Permission end‑to‑end
+
+### 13.2 Checklist cho User Guide
+✅ Giải thích AI làm gì / không làm gì
+✅ Hướng dẫn đọc citation
+✅ Cách phản hồi khi AI không tìm thấy dữ liệu
+
+---
+
+*Kết thúc RAG Chat Playbook. Đây là tầng trên cùng của hệ thống.*
--- a/doc/6.Operations-Monitoring-Governance-Playbook.md
+++ b/doc/6.Operations-Monitoring-Governance-Playbook.md
@@ -0,0 +1,241 @@
+# 6.Operations-Monitoring-Governance-Playbook.md
+
+> Tài liệu này mô tả **cách vận hành, giám sát và kiểm soát** toàn bộ hệ thống SharePoint → Search → RAG Chat trong môi trường doanh nghiệp.
+> Đây là tài liệu **rất quan trọng sau PoC**, đảm bảo hệ thống **bền vững, an toàn, có thể mở rộng và được tin cậy lâu dài**.
+>
+> File được viết để:
+> - IT / Ops / Security đọc là hiểu
+> - AI agent đọc là biết cách triển khai monitoring
+> - Là nền tảng cho **quy trình vận hành & governance nội bộ**
+
+---
+
+## Mục lục
+1. Mục tiêu Operations & Governance
+2. Nguyên tắc vận hành
+3. Tổng quan các tầng cần giám sát
+4. Monitoring chi tiết theo từng tầng
+5. KPI & Quality Metrics
+6. Alerting & Incident Response
+7. Logging & Audit Trail
+8. Data Governance & Compliance
+9. Model / Prompt Governance
+10. User Feedback Loop
+11. Backup, Rollback & Disaster Recovery
+12. Phân quyền vận hành (Ops Roles)
+13. Checklist vận hành định kỳ
+
+---
+
+## 1. Mục tiêu Operations & Governance
+
+Hệ thống không chỉ cần "chạy được", mà phải:
+- ✅ Ổn định
+- ✅ Đúng dữ liệu
+- ✅ Đúng quyền
+- ✅ Truy vết được
+- ✅ Giải thích được
+
+Mục tiêu cuối cùng:
+> **Người dùng tin – IT kiểm soát – Ban lãnh đạo yên tâm**
+
+---
+
+## 2. Nguyên tắc vận hành
+
+1. **Fail rõ ràng, không fail im lặng**
+2. **Mỗi tài liệu đều có lifecycle**
+3. **Mỗi câu trả lời đều có log & nguồn**
+4. **AI không được vượt quyền dữ liệu**
+5. **Có thể rollback bất kỳ tầng nào**
+
+---
+
+## 3. Tổng quan các tầng cần giám sát
+
+```text
+[SharePoint]
+   ↓
+[Ingestion]
+   ↓
+[Extraction / OCR]
+   ↓
+[Index / Search]
+   ↓
+[RAG Chat]
+```
+
+Mỗi tầng phải có:
+- Health check
+- Metric
+- Log
+
+---
+
+## 4. Monitoring chi tiết theo từng tầng
+
+### 4.1 Ingestion Layer
+
+**Theo dõi:**
+- Số file quét / phút
+- Delta sync success rate
+- File fail theo loại lỗi
+
+**Cảnh báo khi:**
+- Delta token không cập nhật > N giờ
+- Lỗi permission tăng đột biến
+
+---
+
+### 4.2 Extraction / OCR Layer
+
+**Theo dõi:**
+- Thời gian OCR / trang
+- OCR failure rate
+- Queue length
+
+**Cảnh báo khi:**
+- OCR latency vượt ngưỡng
+- OCR fail liên tục cùng 1 file type
+
+---
+
+### 4.3 Index / Search Layer
+
+**Theo dõi:**
+- Query latency (p50/p95)
+- Index size
+- Search error rate
+
+**Cảnh báo khi:**
+- Query > SLA
+- Index out-of-sync
+
+---
+
+### 4.4 RAG Chat Layer
+
+**Theo dõi:**
+- Số câu hỏi / ngày
+- % câu trả lời "không tìm thấy dữ liệu"
+- Thời gian phản hồi
+
+**Cảnh báo khi:**
+- Error LLM
+- Citation missing
+
+---
+
+## 5. KPI & Quality Metrics
+
+### 5.1 KPI kỹ thuật
+- Ingestion success rate ≥ 99%
+- OCR success rate ≥ 95%
+- Search latency ≤ 3s
+
+### 5.2 KPI người dùng
+- Query có kết quả hữu ích ≥ X%
+- Click vào tài liệu gốc ≥ Y%
+
+---
+
+## 6. Alerting & Incident Response
+
+### 6.1 Nguyên tắc
+- Alert phải **actionable**
+- Có owner rõ ràng
+
+### 6.2 Incident Flow
+```text
+Detect → Triage → Mitigate → Root cause → Post-mortem
+```
+
+---
+
+## 7. Logging & Audit Trail
+
+### 7.1 Log bắt buộc
+- Ingestion log (file-level)
+- Search log (query-level)
+- Chat log (question, context, answer, citation)
+
+### 7.2 Audit sử dụng khi:
+- Khiếu nại kết quả AI
+- Compliance / kiểm toán
+
+---
+
+## 8. Data Governance & Compliance
+
+### 8.1 Nguyên tắc dữ liệu
+- File gốc không rời SharePoint
+- Không lưu dữ liệu ngoài phạm vi cho phép
+
+### 8.2 Retention
+- Metadata & log theo chính sách công ty
+- Cho phép purge theo yêu cầu
+
+---
+
+## 9. Model / Prompt Governance
+
+### 9.1 Versioning
+- Prompt phải có version
+- Model phải có version
+
+### 9.2 Thay đổi phải:
+- Có test
+- Có rollback plan
+
+---
+
+## 10. User Feedback Loop
+
+### 10.1 Thu thập feedback
+- Nút "Câu trả lời hữu ích / không hữu ích"
+- Comment ngắn
+
+### 10.2 Sử dụng feedback để:
+- Điều chỉnh prompt
+- Điều chỉnh search ranking
+
+---
+
+## 11. Backup, Rollback & DR
+
+- Backup index định kỳ
+- Có thể rebuild index từ Extraction output
+- DR plan cho:
+  - OpenSearch
+  - Metadata DB
+
+---
+
+## 12. Phân quyền vận hành (Ops Roles)
+
+| Role | Trách nhiệm |
+|----|-------------|
+| System Admin | Hạ tầng |
+| Data Admin | Ingestion/Search |
+| AI Admin | Prompt/Model |
+| Auditor | Log/Compliance |
+
+---
+
+## 13. Checklist vận hành định kỳ
+
+### Hàng ngày
+✅ Ingestion health
+✅ OCR queue
+
+### Hàng tuần
+✅ Re-index test
+✅ Permission sync
+
+### Hàng tháng
+✅ Prompt review
+✅ Compliance review
+
+---
+
+*Kết thúc Operations & Governance Playbook. Đây là file hoàn thiện hệ thống ở mức enterprise.*
--- a/doc/8.EndToEnd-Processing-Flows-Bullets.md
+++ b/doc/8.EndToEnd-Processing-Flows-Bullets.md
@@ -0,0 +1,130 @@
+# 8.EndToEnd-Processing-Flows-Bullets.md
+
+> File này liệt kê **TOÀN BỘ các luồng xử lý cần thiết** để triển khai và mở rộng hệ thống.
+> Dạng **gạch đầu dòng**, dùng như **bản đồ tư duy kỹ thuật**, hoặc checklist khi mở rộng.
+
+---
+
+## A. Luồng Ingestion
+
+- Load config site / library
+- Authenticate Graph (app-only)
+- First full delta snapshot
+- Persist delta token
+- Poll delta định kỳ
+- Detect create / update / delete
+- Fetch metadata
+- Fetch & flatten permissions
+- Decide eligible file
+- Emit job downstream
+
+---
+
+## B. Luồng Permission
+
+- Resolve direct users
+- Resolve AAD groups
+- Expand nested groups
+- Cache membership
+- Attach ACL to file/chunk
+
+---
+
+## C. Luồng Extraction
+
+- Receive file job
+- Detect file type
+- Detect scan vs text PDF
+- OCR per page (if needed)
+- Convert to Markdown
+- Insert page markers
+- Normalize text
+- Remove noise
+
+---
+
+## D. Luồng Chunking
+
+- Split by heading
+- Split by page
+- Validate token length
+- Attach page range
+- Attach source URL
+
+---
+
+## E. Luồng Indexing
+
+- Validate mapping version
+- Generate embedding
+- Attach ACL
+- Index chunk
+- Remove old chunks (on update)
+
+---
+
+## F. Luồng Search
+
+- Receive query
+- Resolve user identity
+- Resolve user groups
+- Hybrid search
+- Apply ACL filter
+- Score & rank
+- Highlight text
+- Return results
+
+---
+
+## G. Luồng RAG Chat
+
+- Receive question
+- Decide Search vs Chat
+- Retrieve top K chunks
+- Build context
+- Enforce token budget
+- Generate answer
+- Attach citations
+- Return answer
+
+---
+
+## H. Luồng Feedback
+
+- Collect user feedback
+- Store feedback
+- Aggregate metrics
+- Feed prompt tuning
+- Feed ranking tuning
+
+---
+
+## I. Luồng Ops & Monitoring
+
+- Health check all services
+- Collect metrics
+- Trigger alerts
+- Incident handling
+- Post-mortem
+
+---
+
+## J. Luồng Governance & Change
+
+- Prompt version change
+- Model version change
+- Embedding change
+- Re-index strategy
+- Rollback
+
+---
+
+## Cách sử dụng file này
+
+- Dùng làm checklist triển khai
+- Dùng để chia task cho AI agent
+- Dùng làm reference mở rộng hệ thống
+
+---
+
+*File này intentionally không chi tiết – nó là xương sống logic cho mọi mở rộng sau này.*
--- a/doc/9.Appendix-Vietnamese-OCR-Strategy.md
+++ b/doc/9.Appendix-Vietnamese-OCR-Strategy.md
@@ -0,0 +1,163 @@
+# 9.Appendix-Vietnamese-OCR-Strategy.md
+
+> **Phụ lục chiến lược OCR Tiếng Việt** cho toàn bộ hệ thống SharePoint → Search → RAG.
+>
+> ⚠️ **LƯU Ý QUAN TRỌNG**
+> - File này là **phụ lục độc lập**, **KHÔNG chỉnh sửa** các file 1–6.
+> - Dùng để:
+>   - Thống nhất nhận thức kỹ thuật về OCR tiếng Việt
+>   - Làm tài liệu tham chiếu khi triển khai / audit / mở rộng
+>   - Tránh tranh luận lại từ đầu khi đổi AI agent hoặc dev
+
+---
+
+## 1. Khẳng định phạm vi ngôn ngữ
+
+- **~99% tài liệu là Tiếng Việt (có dấu)**
+- Chủ yếu: văn bản hành chính, hợp đồng, quy định, scan nhiều đời
+- Yêu cầu:
+  - Giữ **đúng dấu tiếng Việt**
+  - Chấp nhận được với search ngữ nghĩa & RAG
+
+➡️ OCR tiếng Việt được coi là **constraint nền**, không phải optional.
+
+---
+
+## 2. Vì sao OCR tiếng Việt khó hơn OCR tiếng Anh?
+
+- Tiếng Việt có:
+  - Dấu thanh (sắc, huyền, hỏi, ngã, nặng)
+  - Dấu phụ (â, ê, ô, ă, ơ, ư, đ)
+- Sai dấu = **đổi nghĩa hoàn toàn**
+- Các engine OCR chung thường:
+  - Nhận đúng chữ cái
+  - **Sai hoặc rơi dấu** trong scan mờ
+
+Nghiên cứu về Vietnamese Document Recognition chỉ ra đây là vấn đề cố hữu nhiều năm.
+
+---
+
+## 3. Đánh giá các engine OCR phổ biến cho tiếng Việt (2025–2026)
+
+### 3.1 Tesseract (vie.traineddata)
+
+- ✅ Dễ triển khai, chạy CPU
+- ❌ Độ chính xác dấu **không ổn định** với scan thực tế
+- ❌ Không phù hợp cho văn bản pháp lý, hợp đồng
+
+Kết luận: **KHÔNG đủ** làm engine chính cho hệ thống này.
+
+---
+
+### 3.2 EasyOCR
+
+- ✅ Setup nhanh
+- ❌ Nhận dạng tiếng Việt chỉ ở mức trung bình
+- ❌ Không tối ưu cho tài liệu nhiều trang
+
+Kết luận: chỉ dùng thử nghiệm, **không dùng production** cho tiếng Việt.
+
+---
+
+### 3.3 PaddleOCR (base model)
+
+- ✅ Text detection rất tốt
+- ✅ Xử lý layout, xoay, nghiêng tốt
+- ❌ Text recognition tiếng Việt **chưa tối ưu nếu dùng model mặc định**
+
+Được đánh giá cao hơn Tesseract nhưng cần **nâng cấp recognition**.
+
+---
+
+### 3.4 PaddleOCR + VietOCR (Khuyến nghị chính)
+
+**Cấu hình thực tế được cộng đồng Việt Nam sử dụng nhiều nhất**:
+
+```
+Text Detection   : PaddleOCR (DB / SAST)
+Text Recognition : VietOCR (Transformer, tiếng Việt có dấu)
+```
+
+Ưu điểm:
+- ✅ Nhận dạng tiếng Việt có dấu **tốt nhất trong open‑source**
+- ✅ Chạy on‑prem
+- ✅ Có thể fine‑tune theo domain
+
+Đây là pipeline được dùng trong nhiều dự án OCR tiếng Việt thực tế.
+
+---
+
+### 3.5 Fine‑tuned PaddleOCR cho tiếng Việt (Advanced)
+
+- Fine‑tune detection + recognition bằng dataset tiếng Việt
+- Có thể đạt độ chính xác rất cao cho từng domain (hành chính, y tế…)
+
+Nghiên cứu và repo gần đây cho thấy hiệu quả rõ rệt khi fine‑tune PaddleOCR cho VN.
+
+⚠️ Tuy nhiên:
+- Tốn effort huấn luyện
+- **KHÔNG cần làm ngay ở phase đầu**
+
+---
+
+## 4. Mức độ chính xác thực tế có thể kỳ vọng
+
+| Chất lượng scan | OCR tiếng Việt tốt |
+|---|---|
+| Scan rõ, font chuẩn | 97–99% |
+| Scan nhiều đời | 90–95% |
+| Scan rất xấu | Không OCR nào cứu hoàn toàn |
+
+➡️ Vì vậy hệ thống **không được phụ thuộc 100% vào OCR**.
+
+---
+
+## 5. OCR Tiếng Việt trong kiến trúc hiện tại (đã được tính trước)
+
+Kiến trúc hệ thống **không giả định OCR hoàn hảo**:
+
+| Vấn đề OCR | Cách kiến trúc xử lý |
+|---|---|
+| Sai 1 từ | Chunk nhỏ, search vẫn trúng |
+| Sai dấu | Search hybrid + semantic |
+| OCR fail | File bị đánh cờ, không đưa vào RAG |
+| OCR kém | Người dùng mở file gốc |
+
+➡️ Đây là **thiết kế có chủ đích**, không phải workaround.
+
+---
+
+## 6. Chiến lược triển khai OCR tiếng Việt theo phase
+
+### Phase 1 – An toàn, triển khai nhanh
+- PaddleOCR (Detection) + VietOCR (Recognition)
+- OCR page‑wise
+- Lưu confidence theo trang
+
+### Phase 2 – Tối ưu dần
+- Tune preprocessing (deskew, binarization)
+- Dictionary post‑process tiếng Việt
+
+### Phase 3 – Khi thực sự cần
+- Fine‑tune PaddleOCR/VietOCR theo domain
+
+---
+
+## 7. Nguyên tắc vận hành bắt buộc
+
+- OCR confidence < threshold → **KHÔNG đưa vào RAG Chat**
+- OCR output luôn link về file gốc
+- OCR có log & audit
+
+---
+
+## 8. Kết luận phụ lục
+
+- ✅ OCR tiếng Việt **đã được tính đến từ đầu**
+- ✅ Có giải pháp **đủ tốt cho production**
+- ❌ Không có OCR tiếng Việt "100% đúng cho mọi trường hợp"
+- ✅ Kiến trúc đã được thiết kế để **chịu sai có kiểm soát**
+
+---
+
+*Phụ lục này dùng để tham chiếu lâu dài khi nói về OCR tiếng Việt trong hệ thống.*
--- a/doc/Entra-Graph-AppOnly-Checklist-and-Test-Notes.md
+++ b/doc/Entra-Graph-AppOnly-Checklist-and-Test-Notes.md
@@ -0,0 +1,371 @@
+# Ghi chú tránh lặp lại sai sót — Entra App + Microsoft Graph app-only cho SharePoint Ingestion
+
+## 1) Tóm tắt sự cố đã gặp
+
+### Hiện tượng
+- Lấy token **thành công** bằng `client_credentials`.
+- Token có các claim hợp lệ kiểu app-only:
+  - `aud = https://graph.microsoft.com`
+  - `idtyp = app`
+- Nhưng gọi các API Graph như:
+  - `GET /sites/{hostname}`
+  - `GET /sites/{hostname}:/{server-relative-path}`
+  đều bị **401 Unauthorized**.
+
+### Nguyên nhân gốc
+App Registration đã được cấp **Delegated permissions** thay vì **Application permissions**.
+
+Cụ thể lúc xảy ra lỗi, app đang có kiểu quyền như:
+- `Files.Read.All` → **Delegated**
+- `Sites.Read.All` → **Delegated**
+- `User.Read` → **Delegated**
+
+Trong khi luồng đang dùng là:
+- **OAuth 2.0 client credentials flow**
+- **app-only / daemon / service-to-service**
+- **không có user đăng nhập interactive**
+
+=> Kết quả: token app-only **không có `roles`**, nên Microsoft Graph không chấp nhận token cho các API cần application permissions.
+
+---
+
+## 2) Bài học rút ra (phải kiểm tra trước khi test Graph)
+
+### Quy tắc số 1
+Nếu dùng **client credentials flow** thì **bắt buộc** phải cấp quyền ở dạng:
+- **Microsoft Graph → Application permissions**
+
+**Không dùng Delegated permissions** cho bài toán ingestion app-only.
+
+### Quy tắc số 2
+Sau khi sửa permission trong Entra App:
+1. Bấm **Grant admin consent**
+2. Lấy **token mới hoàn toàn**
+3. Decode token và **kiểm tra `roles`** trước khi gọi Graph
+
+### Quy tắc số 3
+Nếu token app-only **không có `roles`**, thì **không test Graph tiếp**.
+Phải quay lại kiểm tra:
+- Permission type có phải **Application** không
+- Có phải **Microsoft Graph** không
+- Đã **Grant admin consent** chưa
+- Có cần đợi propagation vài phút không
+
+---
+
+## 3) Cấu hình đúng cho PoC hiện tại
+
+### Tối thiểu cần có trong Entra App
+#### Microsoft Graph → Application permissions
+- `Sites.Read.All`
+- `Files.Read.All`
+
+### Không cần cho app-only ingestion hiện tại
+- `User.Read` (Delegated)
+- Interactive login
+- Redirect URI cho user login
+
+---
+
+## 4) Checklist fail-fast trước khi test Graph
+
+### Checklist nhanh
+- [ ] App dùng **client credentials flow**
+- [ ] API permissions nằm dưới **Microsoft Graph**
+- [ ] Permission type là **Application**
+- [ ] Có `Sites.Read.All`
+- [ ] Có `Files.Read.All`
+- [ ] Đã **Grant admin consent**
+- [ ] Đã lấy **token mới** sau khi sửa permission
+- [ ] Token decode ra có `roles`
+- [ ] `roles` chứa ít nhất:
+  - `Sites.Read.All`
+  - `Files.Read.All`
+
+Nếu **một trong các dòng trên fail**, không chạy tiếp phần test Graph site/drive.
+
+---
+
+## 5) Script test hoàn chỉnh (đã thêm guard để tránh lặp lỗi)
+
+> Script này làm đúng 4 việc:
+> 1. Lấy token mới
+> 2. Decode token
+> 3. **Kiểm tra `roles` trước**
+> 4. Chỉ khi roles đúng mới test Graph site endpoints
+
+```powershell
+$ErrorActionPreference = "Stop"
+
+# ==========================================
+# Điền thông tin tại đây
+# ==========================================
+$TenantId       = "<TENANT_ID_GUID>"
+$ClientId       = "<CLIENT_ID_GUID>"
+$ClientSecret   = "<CLIENT_SECRET_VALUE>"
+$SharePointHost = "285pdg.sharepoint.com"
+$SitePath       = "/sites/poc_system"
+# ==========================================
+
+function Get-GraphToken {
+    # Requests an app-only access token for Microsoft Graph using the
+    # OAuth 2.0 client credentials grant and returns the parsed token response.
+    param(
+        [string]$TenantId,
+        [string]$ClientId,
+        [string]$ClientSecret
+    )
+
+    $endpoint = "https://login.microsoftonline.com/" + $TenantId + "/oauth2/v2.0/token"
+
+    # Form fields of the token request; ".default" asks for every application
+    # permission already granted to the app registration.
+    $form = @{
+        grant_type    = "client_credentials"
+        scope         = "https://graph.microsoft.com/.default"
+        client_id     = $ClientId
+        client_secret = $ClientSecret
+    }
+
+    # Splat the request arguments instead of using backtick line continuations.
+    $request = @{
+        Method      = "Post"
+        Uri         = $endpoint
+        ContentType = "application/x-www-form-urlencoded"
+        Body        = $form
+    }
+
+    return Invoke-RestMethod @request
+}
+
+function Decode-JwtPayload {
+    # Decodes the payload (second dot-separated segment) of a JWT and returns
+    # it as an object. The signature is NOT verified; this is for inspection only.
+    param([string]$Jwt)
+
+    $segments = $Jwt.Split('.')
+    if ($segments.Length -lt 2) {
+        throw "JWT không hợp lệ."
+    }
+
+    # Convert base64url alphabet to standard base64.
+    $b64 = $segments[1].Replace('-', '+').Replace('_', '/')
+
+    # Restore the '=' padding that JWT encoding strips.
+    $remainder = $b64.Length % 4
+    if ($remainder -eq 2) {
+        $b64 += '=='
+    }
+    elseif ($remainder -eq 3) {
+        $b64 += '='
+    }
+
+    $rawBytes = [System.Convert]::FromBase64String($b64)
+    $jsonText = [System.Text.Encoding]::UTF8.GetString($rawBytes)
+
+    return $jsonText | ConvertFrom-Json
+}
+
+function Invoke-GraphGet {
+    # Performs an authenticated GET against a Microsoft Graph URL.
+    # Prints the outcome (status, body, diagnostics) to the host and returns
+    # the parsed JSON object on success, or $null on failure / empty body.
+    param(
+        [string]$Url,
+        [string]$AccessToken,
+        [string]$Label
+    )
+
+    Write-Host ""
+    Write-Host ("=== " + $Label + " ===") -ForegroundColor Cyan
+    Write-Host $Url -ForegroundColor DarkGray
+
+    $headers = @{
+        Authorization = "Bearer " + $AccessToken
+    }
+
+    try {
+        $resp = Invoke-WebRequest `
+            -Method Get `
+            -Uri $Url `
+            -Headers $headers `
+            -UseBasicParsing
+
+        Write-Host "OK" -ForegroundColor Green
+        Write-Host ("HTTP " + [int]$resp.StatusCode) -ForegroundColor Green
+
+        if ($resp.Content) {
+            $json = $resp.Content | ConvertFrom-Json
+            # Print with Write-Host: a bare pipeline expression would be added
+            # to the function's output, so the caller would capture the JSON
+            # text together with the object and nothing would be displayed.
+            Write-Host ($json | ConvertTo-Json -Depth 10)
+            return $json
+        }
+
+        return $null
+    }
+    catch {
+        # Keep the outer error record: $_ is rebound inside nested catch blocks.
+        $errorRecord = $_
+
+        Write-Host "FAILED" -ForegroundColor Red
+        Write-Host $errorRecord.Exception.Message -ForegroundColor Red
+
+        if ($errorRecord.Exception.Response -ne $null) {
+            $resp = $errorRecord.Exception.Response
+
+            Write-Host ""
+            Write-Host "Status code:" -ForegroundColor Yellow
+            try {
+                Write-Host ([int]$resp.StatusCode)
+            }
+            catch {
+                Write-Host "Không đọc được StatusCode"
+            }
+
+            Write-Host ""
+            Write-Host "WWW-Authenticate:" -ForegroundColor Yellow
+            try {
+                Write-Host $resp.Headers["WWW-Authenticate"]
+            }
+            catch {
+                Write-Host "Không có WWW-Authenticate header"
+            }
+
+            Write-Host ""
+            Write-Host "Response body:" -ForegroundColor Yellow
+            $bodyText = $null
+            if ($errorRecord.ErrorDetails -and $errorRecord.ErrorDetails.Message) {
+                # Works on both Windows PowerShell and PowerShell 7+, where the
+                # response object no longer exposes GetResponseStream().
+                $bodyText = $errorRecord.ErrorDetails.Message
+            }
+            else {
+                try {
+                    $stream = $resp.GetResponseStream()
+                    $reader = New-Object System.IO.StreamReader($stream)
+                    $bodyText = $reader.ReadToEnd()
+                }
+                catch {
+                    $bodyText = $null
+                }
+            }
+
+            if ($bodyText) {
+                Write-Host $bodyText
+            }
+            else {
+                Write-Host "Không đọc được response detail."
+            }
+        }
+
+        return $null
+    }
+}
+
+# =======================================================
+# 1) Lấy token mới
+# =======================================================
+$tokenResponse = Get-GraphToken `
+    -TenantId $TenantId `
+    -ClientId $ClientId `
+    -ClientSecret $ClientSecret
+
+$accessToken = $tokenResponse.access_token
+
+Write-Host "TOKEN OK" -ForegroundColor Green
+Write-Host ("token_type : " + $tokenResponse.token_type)
+Write-Host ("expires_in : " + $tokenResponse.expires_in)
+
+# =======================================================
+# 2) Decode token + kiểm tra claims quan trọng
+# =======================================================
+$claims = Decode-JwtPayload -Jwt $accessToken
+
+Write-Host ""
+Write-Host "=== TOKEN CLAIMS ===" -ForegroundColor Cyan
+Write-Host ("aud   : " + $claims.aud)
+Write-Host ("appid : " + $claims.appid)
+
+if ($claims.PSObject.Properties.Name -contains "idtyp") {
+    Write-Host ("idtyp : " + $claims.idtyp)
+}
+
+$roles = @()
+if ($claims.PSObject.Properties.Name -contains "roles") {
+    $roles = @($claims.roles)
+    Write-Host "roles:" -ForegroundColor Yellow
+    $roles | ForEach-Object { Write-Host (" - " + $_) }
+}
+else {
+    Write-Host "roles: <KHÔNG CÓ>" -ForegroundColor Red
+}
+
+# =======================================================
+# 3) Guard bắt buộc: nếu không có roles đúng thì stop luôn
+# =======================================================
+$hasSitesRead = $roles -contains "Sites.Read.All"
+$hasFilesRead = $roles -contains "Files.Read.All"
+
+if (-not $hasSitesRead) {
+    Write-Host "" 
+    Write-Host "STOP: Token chưa có role 'Sites.Read.All'." -ForegroundColor Red
+    Write-Host "=> Kiểm tra lại Microsoft Graph -> Application permissions -> Sites.Read.All -> Grant admin consent." -ForegroundColor Yellow
+    return
+}
+
+if (-not $hasFilesRead) {
+    Write-Host "" 
+    Write-Host "WARNING: Token chưa có role 'Files.Read.All'." -ForegroundColor Yellow
+    Write-Host "=> Site test có thể pass, nhưng drive/delta về sau sẽ fail." -ForegroundColor Yellow
+}
+
+# =======================================================
+# 4) TEST 1 - GET /sites/{hostname}
+# =======================================================
+$rootByHostUrl = "https://graph.microsoft.com/v1.0/sites/" + $SharePointHost + "?`$select=id,webUrl"
+$rootSite = Invoke-GraphGet `
+    -Url $rootByHostUrl `
+    -AccessToken $accessToken `
+    -Label "TEST 1 - GET /sites/{hostname}"
+
+# =======================================================
+# 5) TEST 2 - GET /sites/{hostname}:/{server-relative-path}
+# =======================================================
+$siteRef = $SharePointHost + ":" + $SitePath
+$resolveSiteUrl = "https://graph.microsoft.com/v1.0/sites/" + $siteRef + "?`$select=id,displayName,webUrl"
+$resolvedSite = Invoke-GraphGet `
+    -Url $resolveSiteUrl `
+    -AccessToken $accessToken `
+    -Label "TEST 2 - GET /sites/{hostname}:/{server-relative-path}"
+
+# =======================================================
+# 6) Summary
+# =======================================================
+Write-Host ""
+Write-Host "=== SUMMARY ===" -ForegroundColor Cyan
+
+if ($null -ne $rootSite) {
+    Write-Host "TEST 1: PASS" -ForegroundColor Green
+}
+else {
+    Write-Host "TEST 1: FAIL" -ForegroundColor Red
+}
+
+if ($null -ne $resolvedSite) {
+    Write-Host "TEST 2: PASS" -ForegroundColor Green
+}
+else {
+    Write-Host "TEST 2: FAIL" -ForegroundColor Red
+}
+```
+
+---
+
+## 6) Kỳ vọng đúng sau khi sửa quyền
+
+### Token claim đúng
+```text
+aud   : https://graph.microsoft.com
+appid : <app-id>
+idtyp : app
+roles:
+ - Sites.Read.All
+ - Files.Read.All
+```
+
+### Test site đúng
+- `TEST 1 - GET /sites/{hostname}` → PASS
+- `TEST 2 - GET /sites/{hostname}:/{server-relative-path}` → PASS
+
+---
+
+## 7) Ghi chú vận hành nội bộ
+
+### Khi nào được đi tiếp sang ingestion skeleton?
+Chỉ khi đạt đủ 3 điều kiện:
+1. Token có `roles`
+2. `TEST 1` pass
+3. `TEST 2` pass
+
+### Nếu token vẫn không có `roles`
+- Kiểm tra lại **Type** của permission có phải **Application** không
+- Kiểm tra lại permission có nằm dưới **Microsoft Graph** không
+- Kiểm tra đã bấm **Grant admin consent** chưa
+- Đợi vài phút rồi lấy **token mới** lần nữa
+
+---
+
+## 8) Tên lỗi nội bộ để nhớ
+
+> **Sai lầm đã gặp:**
+> “Dùng `client_credentials` nhưng lại cấp **Delegated permissions** trong App Registration.”
+
+### Cách nhớ nhanh
+- **App-only** → **Application permissions**
+- **Có user login** → **Delegated permissions**
+
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -0,0 +1,67 @@
+version: '3.8'
+
+services:
+  # 1. OpenSearch: Vector & Full-text Search Engine
+  opensearch:
+    image: opensearchproject/opensearch:2.11.1
+    container_name: poc_opensearch
+    environment:
+      - discovery.type=single-node
+      - OPENSEARCH_INITIAL_ADMIN_PASSWORD=Admin123!@#
+      - bootstrap.memory_lock=true
+      - "OPENSEARCH_JAVA_OPTS=-Xms512m -Xmx512m"
+      - DISABLE_SECURITY_PLUGIN=true # Tạm tắt security plugin để dev dễ dàng (không cần HTTPS/certs phức tạp)
+    ulimits:
+      memlock:
+        soft: -1
+        hard: -1
+      nofile:
+        soft: 65536
+        hard: 65536
+    ports:
+      - "9200:9200"
+      - "9600:9600"
+    volumes:
+      - opensearch-data:/usr/share/opensearch/data
+    networks:
+      - poc_network
+
+  # 2. OpenSearch Dashboards: Giao diện web quản lý Index (Tùy chọn nhưng rất tiện)
+  opensearch-dashboards:
+    image: opensearchproject/opensearch-dashboards:2.11.1
+    container_name: poc_os_dashboards
+    environment:
+      - OPENSEARCH_HOSTS=http://opensearch:9200
+      - DISABLE_SECURITY_DASHBOARDS_PLUGIN=true
+    ports:
+      - "5601:5601"
+    depends_on:
+      - opensearch
+    networks:
+      - poc_network
+
+  # 3. Backend App: Ứng dụng Python Ingestion & Search
+  backend:
+    build: .
+    container_name: poc_backend
+    volumes:
+      - .:/app # Mount source code để sửa code nhận ngay không cần rebuild
+    ports:
+      - "8000:8000"
+    environment:
+      - OPENSEARCH_HOST=opensearch
+      - OPENSEARCH_PORT=9200
+      # Các biến môi trường khác sẽ được mount từ file .env local
+    env_file:
+      - .env
+    depends_on:
+      - opensearch
+    networks:
+      - poc_network
+
+volumes:
+  opensearch-data:
+
+networks:
+  poc_network:
+    driver: bridge
--- a/extraction/dce.py
+++ b/extraction/dce.py
@@ -0,0 +1,99 @@
+import os
+import httpx
+import logging
+from core.models import IngestedDocument, DocumentClassificationResult, DocumentType, ProcessingPolicy, PdfType
+from extraction.magic_numbers import MagicNumberValidator
+from extraction.pdf_inspector import PDFInspector
+
+logger = logging.getLogger("DCE")
+
+class DocumentClassificationEngine:
+    """
+    Document Classification Engine (DCE).
+    """
+    def __init__(self):
+        self.pdf_inspector = PDFInspector()
+
+    def classify(self, document: IngestedDocument) -> DocumentClassificationResult:
+        logger.info(f"Classifying document: {document.name} (ID: {document.item_id})")
+        
+        ext = os.path.splitext(document.name)[1].lower()
+        
+        doc_type = DocumentType.UNKNOWN
+        policy = ProcessingPolicy.UNSUPPORTED
+        reason = "Initial state"
+        
+        # 1. Magic Number Validation
+        if document.download_url:
+            header_bytes = MagicNumberValidator.fetch_header_bytes(document.download_url)
+            is_valid, detected_type, sig_desc = MagicNumberValidator.validate_from_bytes(header_bytes)
+            if is_valid:
+                logger.info(f"Magic Number match: {sig_desc}")
+            else:
+                logger.warning(f"Could not verify magic number for {document.name}. Trusting extension fallback.")
+
+        # 2. Routing Rules
+        if ext == ".pdf":
+            pdf_type = PdfType.SCAN_PDF # Simulated default
+            if document.download_url:
+                logger.info("Downloading PDF into memory for PyMuPDF inspection...")
+                try:
+                    with httpx.Client() as client:
+                        resp = client.get(document.download_url)
+                        resp.raise_for_status()
+                        pdf_bytes = resp.content
+                    pdf_type = self.pdf_inspector.inspect_pdf_from_bytes(pdf_bytes)
+                except Exception as e:
+                    logger.error(f"Failed to download/inspect PDF: {e}")
+                    pdf_type = PdfType.SCAN_PDF
+            else:
+                logger.warning("No download_url available for PDF. Defaulting to SCAN_PDF.")
+            
+            if pdf_type == PdfType.TEXT_PDF:
+                doc_type = DocumentType.TEXTUAL_DOCUMENT
+                policy = ProcessingPolicy.SKIP_OCR
+                reason = "PDF has text layer (TEXT_PDF)"
+            elif pdf_type == PdfType.DRAWING_PDF:
+                doc_type = DocumentType.DRAWING
+                policy = ProcessingPolicy.METADATA_ONLY
+                reason = "PDF has large vector dimensions (DRAWING_PDF)"
+            elif pdf_type == PdfType.AMBIGUOUS_PDF:
+                doc_type = DocumentType.UNKNOWN
+                policy = ProcessingPolicy.REQUIRES_REVIEW
+                reason = "Kích thước PDF lớn bất thường (khổ A3/A2 hoặc DPI cao), cần con người xác nhận là bản Scan hay Bản vẽ"
+            else:
+                doc_type = DocumentType.TEXTUAL_DOCUMENT
+                policy = ProcessingPolicy.REQUIRES_OCR
+                reason = "PDF has no text layer (SCAN_PDF)"
+                
+        elif ext in [".docx", ".doc", ".txt", ".md"]:
+            doc_type = DocumentType.TEXTUAL_DOCUMENT
+            policy = ProcessingPolicy.SKIP_OCR
+            reason = "Standard textual document format"
+            
+        elif ext in [".xlsx", ".xls", ".csv"]:
+            doc_type = DocumentType.SPREADSHEET
+            policy = ProcessingPolicy.SKIP_OCR
+            reason = "Spreadsheet document format"
+            
+        elif ext in [".dwg", ".dxf", ".cad"]:
+            doc_type = DocumentType.DRAWING
+            policy = ProcessingPolicy.METADATA_ONLY
+            reason = "Native CAD drawing format"
+            
+        else:
+            doc_type = DocumentType.BINARY
+            policy = ProcessingPolicy.UNSUPPORTED
+            reason = f"Unsupported or binary extension: {ext}"
+
+        result = DocumentClassificationResult(
+            item_id=document.item_id,
+            doc_type=doc_type,
+            processing_policy=policy,
+            file_extension=ext,
+            is_supported=policy != ProcessingPolicy.UNSUPPORTED,
+            reason=reason
+        )
+        
+        logger.info(f"Result -> Type: {doc_type.value}, Policy: {policy.value}, Reason: {reason}")
+        return result
--- a/extraction/magic_numbers.py
+++ b/extraction/magic_numbers.py
@@ -0,0 +1,39 @@
+from typing import Dict, Any, Tuple
+import httpx
+import logging
+from core.models import IngestedDocument, DocumentClassificationResult, DocumentType, ProcessingPolicy
+
+logger = logging.getLogger("DCE")
+
+class MagicNumberValidator:
+    """Validates file types using magic numbers (file signatures)."""
+    
+    SIGNATURES = {
+        b"%PDF-": (DocumentType.TEXTUAL_DOCUMENT, "PDF Document"),
+        b"PK\x03\x04": (DocumentType.UNKNOWN, "ZIP Archive / Office Open XML"), # Needs further check
+        b"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1": (DocumentType.UNKNOWN, "Legacy Office Document"),
+        # Add CAD magic numbers here if needed (e.g., AutoCAD DWG: b"AC10")
+        b"AC10": (DocumentType.DRAWING, "AutoCAD Drawing (DWG)")
+    }
+
+    @classmethod
+    def validate_from_bytes(cls, header_bytes: bytes) -> Tuple[bool, DocumentType, str]:
+        """Checks if the bytes match any known signature."""
+        for sig, (doc_type, desc) in cls.SIGNATURES.items():
+            if header_bytes.startswith(sig):
+                return True, doc_type, desc
+        return False, DocumentType.UNKNOWN, "Unknown Signature"
+
+    @classmethod
+    def fetch_header_bytes(cls, download_url: str, num_bytes: int = 256) -> bytes:
+        """Fetches only the first N bytes of a file using HTTP Range request."""
+        try:
+            # Idea: HTTP Range request prevents downloading huge files just to check headers
+            headers = {"Range": f"bytes=0-{num_bytes - 1}"}
+            with httpx.Client() as client:
+                response = client.get(download_url, headers=headers)
+                response.raise_for_status()
+                return response.content
+        except Exception as e:
+            logger.error(f"Failed to fetch header bytes: {e}")
+            return b""
--- a/extraction/ocr_service.py
+++ b/extraction/ocr_service.py
@@ -0,0 +1,111 @@
+import io
+import logging
+import base64
+import httpx
+import fitz
+from PIL import Image
+from typing import List, Tuple
+from core.models import OCRPageResult
+from core.config import settings
+
+logger = logging.getLogger("OCRService")
+
+class OCRService:
+    """
+    OCR Service implementation acting as a VLM client.
+    """
+    def __init__(self):
+        self.vlm_url = settings.VLM_ENDPOINT
+        logger.info(f"Initialized VLM OCR Service connecting to {self.vlm_url}")
+
+    def _image_to_base64(self, img: Image.Image) -> str:
+        """Chuyển đổi PIL Image sang chuẩn Base64 JPEG"""
+        buffered = io.BytesIO()
+        # Chuyển sang RGB nếu ảnh có kênh Alpha
+        if img.mode != 'RGB':
+            img = img.convert('RGB')
+        img.save(buffered, format="JPEG", quality=85)
+        img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
+        return f"data:image/jpeg;base64,{img_str}"
+
+    def process_pdf_bytes(self, pdf_bytes: bytes) -> List[OCRPageResult]:
+        """Process a PDF from memory using Vintern-3B VLM via LAN"""
+        if not pdf_bytes:
+            logger.warning("Empty PDF bytes received.")
+            return []
+
+        results = []
+        try:
+            import gc
+            doc = fitz.open(stream=pdf_bytes, filetype="pdf")
+            for page_num in range(len(doc)):
+                logger.info(f"VLM Processing page {page_num + 1}/{len(doc)} via LAN...")
+                
+                # Render trang PDF thành ảnh. Hạ độ phân giải xuống 1.2 để giảm thiểu số lượng token
+                # Tránh lỗi 500 do vượt quá Context Window của Llama.cpp
+                matrix = fitz.Matrix(1.2, 1.2)
+                pix = doc[page_num].get_pixmap(matrix=matrix)
+                img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
+                
+                del pix
+                gc.collect()
+                
+                # Chuyển ảnh sang Base64
+                b64_image = self._image_to_base64(img)
+                
+                # Gọi API Llama.cpp Server
+                payload = {
+                    "messages": [
+                        {
+                            "role": "user",
+                            "content": [
+                                {
+                                    "type": "image_url",
+                                    "image_url": {
+                                        "url": b64_image
+                                    }
+                                },
+                                {
+                                    "type": "text",
+                                    "text": "Hãy trích xuất chính xác toàn bộ văn bản có trong hình ảnh này. Giữ nguyên định dạng và các dấu câu tiếng Việt."
+                                }
+                            ]
+                        }
+                    ],
+                    "temperature": settings.VLM_TEMPERATURE,
+                    "max_tokens": settings.VLM_MAX_TOKENS
+                }
+                
+                try:
+                    with httpx.Client(timeout=settings.VLM_TIMEOUT) as client:
+                        response = client.post(self.vlm_url, json=payload)
+                        response.raise_for_status()
+                        
+                        data = response.json()
+                        vlm_text = data['choices'][0]['message']['content'].strip()
+                        
+                        results.append(OCRPageResult(
+                            page=page_num + 1,
+                            text=vlm_text,
+                            confidence=0.99, # VLM thường không trả về độ tự tin từng chữ, set cứng 0.99
+                            paddle_text="", # Bỏ qua cột so sánh cũ
+                            paddle_confidence=0.0
+                        ))
+                        logger.info(f"VLM extraction successful for page {page_num + 1}")
+                        
+                except Exception as api_err:
+                    logger.error(f"VLM API Error: {api_err}")
+                    # Ghi nhận trang lỗi nhưng vẫn tiếp tục các trang sau
+                    results.append(OCRPageResult(
+                        page=page_num + 1,
+                        text=f"[LỖI KẾT NỐI VLM: {api_err}]",
+                        confidence=0.0,
+                        paddle_text="",
+                        paddle_confidence=0.0
+                    ))
+                    
+            return results
+        except Exception as e:
+            import traceback
+            logger.error(f"Failed to process PDF: {e}\n{traceback.format_exc()}")
+            return []
--- a/extraction/pdf_inspector.py
+++ b/extraction/pdf_inspector.py
@@ -0,0 +1,61 @@
+import logging
+from core.models import DocumentType, ProcessingPolicy, PdfType
+
+logger = logging.getLogger("PDFInspector")
+
+class PDFInspector:
+    """
+    Inspects PDF files to determine if they are TEXT, SCAN, DRAWING or AMBIGUOUS.
+    """
+    
+    def __init__(self, text_density_threshold: int = 100):
+        self.text_density_threshold = text_density_threshold
+
+    def inspect_pdf_from_bytes(self, pdf_bytes: bytes) -> PdfType:
+        """
+        Deep inspects a PDF file from a byte stream.
+        """
+        try:
+            import fitz  # PyMuPDF
+        except ImportError:
+            logger.error("PyMuPDF (fitz) is not installed. Returning default SCAN_PDF.")
+            return PdfType.SCAN_PDF
+
+        try:
+            doc = fitz.open(stream=pdf_bytes, filetype="pdf")
+            num_pages = len(doc)
+            
+            pages_to_check = min(3, num_pages)
+            total_text_length = 0
+            is_huge = False
+            is_ambiguous_size = False
+            
+            for i in range(pages_to_check):
+                page = doc[i]
+                rect = page.rect
+                max_dim = max(rect.width, rect.height)
+                
+                if max_dim > 3000:
+                    is_huge = True
+                elif max_dim > 1000:
+                    is_ambiguous_size = True
+                
+                text = page.get_text()
+                total_text_length += len(text.strip())
+            
+            avg_text = total_text_length / pages_to_check
+            
+            if avg_text >= self.text_density_threshold:
+                return PdfType.TEXT_PDF
+                
+            if is_huge:
+                return PdfType.DRAWING_PDF
+                
+            if is_ambiguous_size:
+                return PdfType.AMBIGUOUS_PDF
+                
+            return PdfType.SCAN_PDF
+            
+        except Exception as e:
+            logger.error(f"Error inspecting PDF stream: {e}")
+            return PdfType.SCAN_PDF
--- a/indexing/vector_store.py
+++ b/indexing/vector_store.py
@@ -0,0 +1,101 @@
+import logging
+from typing import List
+from opensearchpy import OpenSearch, RequestsHttpConnection
+from core.models import DocumentChunk
+from core.config import settings
+
+logger = logging.getLogger("VectorStore")
+
+class VectorStore:
+    def __init__(self, index_name: str = "sharepoint_docs"):
+        self.index_name = index_name
+        
+        # Kết nối tới OpenSearch Cluster
+        self.client = OpenSearch(
+            hosts=[{'host': settings.opensearch_host, 'port': settings.opensearch_port}],
+            http_auth=(settings.opensearch_user, settings.opensearch_pass),
+            use_ssl=False,
+            verify_certs=False,
+            connection_class=RequestsHttpConnection
+        )
+        
+        # Load Local Embedding Model (Chạy bằng CPU/GPU nội bộ)
+        logger.info("Đang nạp Local Embedding Model (keepitreal/vietnamese-sbert)...")
+        try:
+            from sentence_transformers import SentenceTransformer
+            # Vietnamese-SBERT (768 dimensions), cực kỳ tốt cho văn bản Tiếng Việt
+            self.embedder = SentenceTransformer('keepitreal/vietnamese-sbert')
+            logger.info("Nạp Embedding Model thành công!")
+        except ImportError:
+            logger.error("LỖI: Chưa cài thư viện. Hãy chạy: pip install sentence-transformers opensearch-py")
+            raise
+            
+        self._ensure_index_exists()
+
+    def _ensure_index_exists(self):
+        """Khởi tạo Mapping cho Index nếu chưa có"""
+        if not self.client.indices.exists(index=self.index_name):
+            mapping = {
+                "settings": {
+                    "index": {
+                        "knn": True,
+                        "knn.algo_param.ef_search": 100
+                    }
+                },
+                "mappings": {
+                    "properties": {
+                        "chunk_id": { "type": "keyword" },
+                        "file_id": { "type": "keyword" },
+                        "file_name": { "type": "text" },
+                        "text": { 
+                            "type": "text",
+                            "analyzer": "standard" # Có thể đổi sang analyzer tiếng Việt nếu cài plugin vi
+                        },
+                        "embedding": {
+                            "type": "knn_vector",
+                            "dimension": 768, # Chiều dài vector của vietnamese-sbert
+                            "method": {
+                                "name": "hnsw",
+                                "space_type": "l2",
+                                "engine": "nmslib"
+                            }
+                        },
+                        "site_id": { "type": "keyword" },
+                        "page_from": { "type": "integer" },
+                        "page_to": { "type": "integer" },
+                        "source_url": { "type": "keyword" },
+                        "permissions": { "type": "keyword" }
+                    }
+                }
+            }
+            self.client.indices.create(index=self.index_name, body=mapping)
+            logger.info(f"Đã tạo OpenSearch Index: {self.index_name}")
+
+    def embed_and_index(self, chunks: List[DocumentChunk]):
+        """Biến đổi Text thành Vector và lưu vào Database"""
+        if not chunks:
+            return
+            
+        logger.info(f"Đang băm (Embedding) {len(chunks)} chunks thành Vector...")
+        
+        texts = [chunk.text for chunk in chunks]
+        # Chạy model AI để tạo vector
+        embeddings = self.embedder.encode(texts, show_progress_bar=False)
+        
+        actions = []
+        for i, chunk in enumerate(chunks):
+            chunk.embedding = embeddings[i].tolist()
+            
+            # Chuẩn bị dữ liệu JSON cho OpenSearch Bulk API
+            action = {
+                "_op_type": "index",
+                "_index": self.index_name,
+                "_id": chunk.chunk_id,
+                "_source": chunk.dict()
+            }
+            actions.append(action)
+            
+        logger.info("Đang nạp dữ liệu vào OpenSearch...")
+        from opensearchpy.helpers import bulk
+        success, failed = bulk(self.client, actions)
+        logger.info(f"Hoàn tất! Bơm thành công: {success} chunks. Thất bại: {len(failed) if isinstance(failed, list) else failed}")
--- a/ingestion/graph_client.py
+++ b/ingestion/graph_client.py
@@ -0,0 +1,134 @@
+import httpx
+import base64
+import json
+import logging
+from azure.identity import ClientSecretCredential
+from core.config import settings
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger("GraphClient")
+
+class GraphClient:
+    """
+    Microsoft Graph API Client using app-only authentication.
+    """
+    def __init__(self):
+        self.tenant_id = settings.tenant_id
+        self.client_id = settings.client_id
+        self.client_secret = settings.client_secret
+        
+        self.credential = ClientSecretCredential(
+            tenant_id=self.tenant_id,
+            client_id=self.client_id,
+            client_secret=self.client_secret
+        )
+        self.scopes = ["https://graph.microsoft.com/.default"]
+        self.base_url = "https://graph.microsoft.com/v1.0"
+        self._token = None
+
+    def decode_jwt_payload(self, token: str) -> dict:
+        parts = token.split('.')
+        if len(parts) != 3:
+            raise ValueError("Invalid JWT token format")
+        payload_b64 = parts[1]
+        payload_b64 += "=" * ((4 - len(payload_b64) % 4) % 4)
+        return json.loads(base64.urlsafe_b64decode(payload_b64))
+
+    def validate_required_roles(self, roles: list):
+        if "Sites.Read.All" not in roles:
+            raise PermissionError("FATAL: Token is missing 'Sites.Read.All' role. Stop immediately.")
+        if "Files.Read.All" not in roles:
+            logger.warning("WARNING: Token is missing 'Files.Read.All' role. Drive/delta steps will fail.")
+            raise PermissionError("FATAL: Token is missing 'Files.Read.All' role. Stop immediately.")
+
+    def get_access_token(self) -> str:
+        if not self._token:
+            token_response = self.credential.get_token(*self.scopes)
+            self._token = token_response.token
+            
+            payload = self.decode_jwt_payload(self._token)
+            
+            aud = payload.get("aud")
+            appid = payload.get("appid")
+            idtyp = payload.get("idtyp")
+            roles = payload.get("roles", [])
+            
+            logger.info(f"Token decoded -> aud: {aud}, appid: {appid}, idtyp: {idtyp}, roles: {roles}")
+            
+            self.validate_required_roles(roles)
+            
+        return self._token
+
+    def _get_headers(self) -> dict:
+        token = self.get_access_token()
+        return {
+            "Authorization": f"Bearer {token}",
+            "Accept": "application/json"
+        }
+
+    def _make_get_request(self, url: str) -> dict:
+        logger.info(f"GET Request to: {url}")
+        headers = self._get_headers()
+        try:
+            response = httpx.get(url, headers=headers)
+            logger.info(f"Response Status: {response.status_code}")
+            response.raise_for_status()
+            return response.json()
+        except httpx.HTTPStatusError as e:
+            logger.error(f"HTTP Error: {e.response.status_code}")
+            logger.error(f"Response Body: {e.response.text}")
+            raise e
+        except Exception as e:
+            logger.error(f"Error making request: {str(e)}")
+            raise e
+
+    def _download_file(self, url: str) -> bytes:
+        logger.info(f"GET Request (Download) to: {url}")
+        headers = self._get_headers()
+        try:
+            # Follow redirects is True by default in httpx.Client, but httpx.get() might need follow_redirects=True
+            with httpx.Client(follow_redirects=True, timeout=60.0) as client:
+                response = client.get(url, headers=headers)
+                logger.info(f"Response Status: {response.status_code}")
+                response.raise_for_status()
+                return response.content
+        except httpx.HTTPStatusError as e:
+            logger.error(f"HTTP Error: {e.response.status_code}")
+            logger.error(f"Response Body: {e.response.text}")
+            raise e
+        except Exception as e:
+            logger.error(f"Error making download request: {str(e)}")
+            raise e
+
+    def get_site_by_hostname(self, hostname: str):
+        """GET /sites/{hostname}"""
+        url = f"{self.base_url}/sites/{hostname}"
+        return self._make_get_request(url)
+
+    def get_site_by_path(self, hostname: str, server_relative_path: str):
+        """GET /sites/{hostname}:/{server-relative-path}"""
+        url = f"{self.base_url}/sites/{hostname}:{server_relative_path}"
+        return self._make_get_request(url)
+
+    def get_drive(self, site_id: str):
+        """GET /sites/{siteId}/drive"""
+        url = f"{self.base_url}/sites/{site_id}/drive"
+        return self._make_get_request(url)
+
+    def get_drive_root_children(self, site_id: str):
+        """GET /sites/{siteId}/drive/root/children"""
+        url = f"{self.base_url}/sites/{site_id}/drive/root/children"
+        return self._make_get_request(url)
+
+    def get_drive_root_delta(self, site_id: str):
+        """GET /sites/{siteId}/drive/root/delta"""
+        url = f"{self.base_url}/sites/{site_id}/drive/root/delta"
+        return self._make_get_request(url)
+
+    def delta_query(self, drive_id: str, delta_link: str = None):
+        """Perform a delta query on a drive."""
+        if delta_link:
+            url = delta_link
+        else:
+            url = f"{self.base_url}/drives/{drive_id}/root/delta"
+        return self._make_get_request(url)
--- a/ingestion/providers/base_provider.py
+++ b/ingestion/providers/base_provider.py
@@ -0,0 +1,36 @@
+from abc import ABC, abstractmethod
+from typing import Dict, List, Tuple
+
+class BaseStorageProvider(ABC):
+    """
+    Abstract Base Class for all Document Storage Providers (SharePoint, Google Drive, Local, NAS, etc.)
+    Any new storage source must implement these methods to be seamlessly integrated into the ingestion pipeline.
+    """
+    
+    @abstractmethod
+    def fetch_changes(self, sync_state: Dict) -> Tuple[List[Dict], Dict]:
+        """
+        Fetch incremental changes (new, updated, or deleted files).
+        
+        Args:
+            sync_state (Dict): The last known synchronization state/token.
+            
+        Returns:
+            Tuple[List[Dict], Dict]: 
+                - A list of standardized item dictionaries.
+                - The new sync state to be saved for the next run.
+        """
+        pass
+
+    @abstractmethod
+    def download_file(self, target_item: Dict) -> bytes:
+        """
+        Download the raw file bytes for a given item.
+        
+        Args:
+            target_item (Dict): The standardized item dictionary returned by fetch_changes.
+            
+        Returns:
+            bytes: The raw file content.
+        """
+        pass
--- a/ingestion/providers/sharepoint_provider.py
+++ b/ingestion/providers/sharepoint_provider.py
@@ -0,0 +1,99 @@
+import logging
+from typing import Dict, List, Tuple
+from .base_provider import BaseStorageProvider
+from ingestion.graph_client import GraphClient
+
+from core.config import settings
+
+logger = logging.getLogger(__name__)
+
+class SharePointProvider(BaseStorageProvider):
+    """
+    Storage Provider implementation for Microsoft SharePoint using Graph API.
+    """
+    def __init__(self, hostname: str = "285pdg.sharepoint.com", site_path: str = "/sites/poc_system"):
+        self.graph = GraphClient()
+        
+        logger.info(f"Resolving site: {hostname}:{site_path}")
+        site_info = self.graph.get_site_by_path(hostname, site_path)
+        site_id = site_info["id"]
+        
+        logger.info(f"Resolving drive for site: {site_id}")
+        drive_info = self.graph.get_drive(site_id)
+        self.drive_id = drive_info["id"]
+        
+        logger.info(f"Initialized SharePoint Provider for Drive ID: {self.drive_id}")
+
+    def fetch_changes(self, sync_state: Dict) -> Tuple[List[Dict], Dict]:
+        """
+        Fetch all delta changes from SharePoint, handling pagination internally.
+        """
+        delta_link = sync_state.get("sharepoint_delta_link")
+        
+        items_collected = []
+        current_url = delta_link
+        
+        # Loop over pagination
+        while True:
+            # We need to construct the URL manually or let graph_client do it
+            if not current_url:
+                current_url = f"{self.graph.base_url}/drives/{self.drive_id}/root/delta"
+                
+            response = self.graph._make_get_request(current_url)
+            values = response.get("value", [])
+            items_collected.extend(values)
+            
+            if "@odata.nextLink" in response:
+                current_url = response["@odata.nextLink"]
+                logger.info("Fetching next page of SharePoint delta results...")
+            elif "@odata.deltaLink" in response:
+                new_delta_link = response["@odata.deltaLink"]
+                logger.info("Reached end of SharePoint delta changes.")
+                break
+            else:
+                logger.warning("No nextLink or deltaLink found in response! Breaking loop.")
+                new_delta_link = current_url
+                break
+            
+        # Standardize output for the ingestion pipeline
+        standardized_items = []
+        for item in items_collected:
+            # Bỏ qua root drive
+            if "folder" in item and "root" in item.get("folder", {}):
+                continue
+                
+            is_deleted = "deleted" in item
+            
+            std_item = {
+                "id": item.get("id"),
+                "name": item.get("name"),
+                "is_deleted": is_deleted,
+                "is_folder": "folder" in item,
+                "provider": "sharepoint",
+                "last_modified": item.get("lastModifiedDateTime"),
+                "size": item.get("size"),
+                "raw_data": item 
+            }
+            standardized_items.append(std_item)
+            
+        new_state = sync_state.copy()
+        new_state["sharepoint_delta_link"] = new_delta_link
+        
+        return standardized_items, new_state
+
+    def download_file(self, target_item: Dict) -> bytes:
+        """
+        Download file content from SharePoint.
+        """
+        try:
+            raw_data = target_item.get("raw_data", {})
+            item_id = target_item.get("id")
+            
+            # Gọi thẳng endpoint /content qua Graph API để tránh lỗi 401
+            url = f"https://graph.microsoft.com/v1.0/drives/{self.drive_id}/items/{item_id}/content"
+            file_bytes = self.graph._download_file(url)
+            
+            return file_bytes
+        except Exception as e:
+            logger.error(f"SharePoint download_file failed for {target_item.get('name')}: {e}")
+            raise e
--- a/ingestion/sync.py
+++ b/ingestion/sync.py
@@ -0,0 +1,147 @@
+import os
+import json
+import logging
+from typing import List, Dict, Any
+
+# Ensure we can import from the root module if run directly
+import sys
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from ingestion.graph_client import GraphClient
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger("IngestionSync")
+
+class SharePointSync:
+    def __init__(self, hostname: str, site_path: str):
+        self.graph_client = GraphClient()
+        self.hostname = hostname
+        self.site_path = site_path
+        self.state_file = "delta_state.json"
+        self.output_file = "ingestion_output.json"
+        
+    def _load_delta_link(self) -> str:
+        """Load delta link from local state file."""
+        if os.path.exists(self.state_file):
+            with open(self.state_file, "r", encoding="utf-8") as f:
+                data = json.load(f)
+                return data.get("delta_link")
+        return None
+        
+    def _save_delta_link(self, delta_link: str):
+        """Save delta link to local state file for next incremental sync."""
+        with open(self.state_file, "w", encoding="utf-8") as f:
+            json.dump({"delta_link": delta_link}, f, indent=2)
+            
+    def _extract_metadata(self, item: Dict[Any, Any], site_id: str, drive_id: str) -> Dict[str, Any]:
+        """Convert Graph API item payload to our target schema."""
+        download_url = item.get("@microsoft.graph.downloadUrl")
+        if not download_url and "folder" not in item and "deleted" not in item:
+            try:
+                # Delta query might not return downloadUrl, so fetch it directly
+                item_id = item.get("id")
+                url = f"https://graph.microsoft.com/v1.0/drives/{drive_id}/items/{item_id}"
+                full_item = self.graph_client._make_get_request(url)
+                download_url = full_item.get("@microsoft.graph.downloadUrl")
+            except Exception as e:
+                logger.error(f"Failed to fetch download_url for {item.get('name')}: {e}")
+
+        return {
+            "site_id": site_id,
+            "drive_id": drive_id,
+            "item_id": item.get("id"),
+            "name": item.get("name"),
+            "web_url": item.get("webUrl"),
+            "download_url": download_url,
+            "mime_type": item.get("file", {}).get("mimeType") if "file" in item else None,
+            "parent_path": item.get("parentReference", {}).get("path"),
+            "is_folder": "folder" in item,
+            "size": item.get("size"),
+            "last_modified": item.get("lastModifiedDateTime"),
+            "created": item.get("createdDateTime"),
+            "eTag": item.get("eTag"),
+            "cTag": item.get("cTag"),
+            "deleted": "deleted" in item
+        }
+
+    def _upsert_to_local_db(self, new_items: List[Dict[str, Any]]):
+        """Simulate upsert into a database by writing to a JSON file."""
+        db = {}
+        if os.path.exists(self.output_file):
+            with open(self.output_file, "r", encoding="utf-8") as f:
+                try:
+                    existing = json.load(f)
+                    for item in existing:
+                        db[item["item_id"]] = item
+                except json.JSONDecodeError:
+                    pass
+                    
+        for item in new_items:
+            if item.get("deleted"):
+                # If deleted, we mark it as deleted in our db (or we could remove it)
+                if item["item_id"] in db:
+                    db[item["item_id"]]["deleted"] = True
+                else:
+                    # It's deleted but we didn't have it anyway
+                    db[item["item_id"]] = item
+            else:
+                db[item["item_id"]] = item
+                
+        final_list = list(db.values())
+        with open(self.output_file, "w", encoding="utf-8") as f:
+            json.dump(final_list, f, indent=2, ensure_ascii=False)
+            
+        logger.info(f"Local database updated. Total items currently stored: {len(final_list)}")
+
+    def run_sync(self):
+        logger.info("=== STARTING SHAREPOINT SYNC ===")
+        
+        # 1. & 2. Resolve Site and Drive
+        logger.info(f"Resolving site: {self.hostname}:{self.site_path}")
+        site_info = self.graph_client.get_site_by_path(self.hostname, self.site_path)
+        site_id = site_info["id"]
+        
+        logger.info(f"Resolving drive for site: {site_id}")
+        drive_info = self.graph_client.get_drive(site_id)
+        drive_id = drive_info["id"]
+        
+        # 3. Delta Query setup
+        delta_link = self._load_delta_link()
+        if delta_link:
+            logger.info("Found existing delta_link. Performing INCREMENTAL sync.")
+        else:
+            logger.info("No delta_link found. Performing FULL sync.")
+            
+        items_collected = []
+        current_url = delta_link
+        
+        # Loop over pagination
+        while True:
+            response = self.graph_client.delta_query(drive_id, current_url)
+            values = response.get("value", [])
+            items_collected.extend(values)
+            
+            if "@odata.nextLink" in response:
+                current_url = response["@odata.nextLink"]
+                logger.info("Fetching next page of delta results...")
+            elif "@odata.deltaLink" in response:
+                new_delta_link = response["@odata.deltaLink"]
+                self._save_delta_link(new_delta_link)
+                logger.info("Reached end of delta changes. Saved new delta_link.")
+                break
+            else:
+                logger.warning("No nextLink or deltaLink found in response! Breaking loop.")
+                break
+                
+        logger.info(f"Delta query returned {len(items_collected)} change(s).")
+        
+        # 4. Extract metadata and save
+        if items_collected:
+            processed_items = [self._extract_metadata(item, site_id, drive_id) for item in items_collected]
+            self._upsert_to_local_db(processed_items)
+        else:
+            logger.info("No items to process.")
+
+if __name__ == "__main__":
+    sync = SharePointSync("285pdg.sharepoint.com", "/sites/poc_system")
+    sync.run_sync()
--- a/paddle_debug.txt
+++ b/paddle_debug.txt
@@ -0,0 +1,59 @@
+[{'input_path': None, 'page_index': None, 'doc_preprocessor_res': {'input_path': None, 'page_index': None, 'input_img': array([[[246, ..., 246],
+        ...,
+        [248, ..., 248]],
+
+       ...,
+
+       [[248, ..., 248],
+        ...,
+        [255, ..., 255]]], shape=(1684, 1191, 3), dtype=uint8), 'model_settings': {'use_doc_orientation_classify': True, 'use_doc_unwarping': True}, 'angle': 0, 'rot_img': array([[[246, ..., 246],
+        ...,
+        [248, ..., 248]],
+
+       ...,
+
+       [[248, ..., 248],
+        ...,
+        [255, ..., 255]]], shape=(1684, 1191, 3), dtype=uint8), 'output_img': array([[[255, ..., 255],
+        ...,
+        [254, ..., 254]],
+
+       ...,
+
+       [[255, ..., 255],
+        ...,
+        [255, ..., 255]]], shape=(1684, 1191, 3), dtype=uint8)}, 'dt_polys': [array([[36, 28],
+       ...,
+       [36, 61]], shape=(4, 2), dtype=int16), array([[ 39, 134],
+       ...,
+       [ 39, 161]], shape=(4, 2), dtype=int16), array([[ 41, 206],
+       ...,
+       [ 41, 233]], shape=(4, 2), dtype=int16), array([[ 44, 280],
+       ...,
+       [ 43, 304]], shape=(4, 2), dtype=int16), array([[ 45, 350],
+       ...,
+       [ 45, 374]], shape=(4, 2), dtype=int16), array([[ 47, 422],
+       ...,
+       [ 47, 445]], shape=(4, 2), dtype=int16), array([[ 46, 490],
+       ...,
+       [ 46, 513]], shape=(4, 2), dtype=int16), array([[ 854, 1345],
+       ...,
+       [ 851, 1393]], shape=(4, 2), dtype=int16)], 'model_settings': {'use_doc_preprocessor': True, 'use_textline_orientation': False}, 'text_det_params': {'limit_side_len': 64, 'limit_type': 'min', 'thresh': 0.3, 'max_side_limit': 4000, 'box_thresh': 0.6, 'unclip_ratio': 1.5}, 'text_type': 'general', 'text_rec_score_thresh': 0.0, 'return_word_box': False, 'rec_texts': ['Biên bn bàn giao h sơ lưu tr', 'Đây là tài liu scan mu đ kim th nhánh SCAN PDF.', 'Ni dung đưσc vê thành nh ri nhúng vào PDF đ không có text layer.', 'Ngưi nhn: Nguyn Văn A', 'Phòng ban: Hành chính - Tng hp', 'Ngày bàn giao: 06/05/2026', 'Ghi chú: kim tra OCR ting Viêt có du.', 'ĐÃNHN'], 'rec_scores': [0.9541749954223633, 0.9710000157356262, 0.9784880876541138, 0.981657862663269, 0.9962664246559143, 0.9966237545013428, 0.9740114212036133, 0.9314918518066406], 'rec_polys': [array([[36, 28],
+       ...,
+       [36, 61]], shape=(4, 2), dtype=int16), array([[ 39, 134],
+       ...,
+       [ 39, 161]], shape=(4, 2), dtype=int16), array([[ 41, 206],
+       ...,
+       [ 41, 233]], shape=(4, 2), dtype=int16), array([[ 44, 280],
+       ...,
+       [ 43, 304]], shape=(4, 2), dtype=int16), array([[ 45, 350],
+       ...,
+       [ 45, 374]], shape=(4, 2), dtype=int16), array([[ 47, 422],
+       ...,
+       [ 47, 445]], shape=(4, 2), dtype=int16), array([[ 46, 490],
+       ...,
+       [ 46, 513]], shape=(4, 2), dtype=int16), array([[ 854, 1345],
+       ...,
+       [ 851, 1393]], shape=(4, 2), dtype=int16)], 'vis_fonts': [<paddlex.utils.fonts.Font object at 0x7afeb51ed190>, <paddlex.utils.fonts.Font object at 0x7afeb51ed190>, <paddlex.utils.fonts.Font object at 0x7afeb51ed190>, <paddlex.utils.fonts.Font object at 0x7afeb51ed190>, <paddlex.utils.fonts.Font object at 0x7afeb51ed190>, <paddlex.utils.fonts.Font object at 0x7afeb51ed190>, <paddlex.utils.fonts.Font object at 0x7afeb51ed190>, <paddlex.utils.fonts.Font object at 0x7afeb51ed190>], 'textline_orientation_angles': [-1, -1, -1, -1, -1, -1, -1, -1], 'rec_boxes': array([[  36, ...,   61],
+       ...,
+       [ 851, ..., 1401]], shape=(8, 4), dtype=int16)}]
--- a/requirements.txt
+++ b/requirements.txt
@@ -0,0 +1,17 @@
+fastapi>=0.104.1
+uvicorn>=0.24.0
+pydantic>=2.4.2
+pydantic-settings>=2.0.3
+requests>=2.31.0
+opensearch-py>=2.3.1
+azure-identity>=1.15.0
+msgraph-core>=1.0.0
+markitdown>=0.0.1
+paddlepaddle>=2.5.2
+paddleocr>=2.7.0.3
+python-dotenv>=1.0.0
+httpx>=0.25.1
+openai>=1.3.5
+sentence-transformers>=2.2.2
+PyMuPDF>=1.23.0
+Pillow>=10.0.0
--- a/scratch/get_drive.py
+++ b/scratch/get_drive.py
@@ -0,0 +1,26 @@
+import logging
+import json
+from ingestion.graph_client import GraphClient
+
+logging.basicConfig(level=logging.INFO)
+
+def get_real_drive_id():
+    client = GraphClient()
+    try:
+        site = client.get_site_by_path("285pdg.sharepoint.com", "/sites/poc_system")
+        site_id = site["id"]
+        print(f"Site ID: {site_id}")
+        
+        drive = client.get_drive(site_id)
+        drive_id = drive["id"]
+        print(f"Drive ID: {drive_id}")
+        
+        # Write the correct drive ID to a file so we know what it is
+        with open("correct_drive_id.txt", "w") as f:
+            f.write(drive_id)
+            
+    except Exception as e:
+        print(f"Error: {e}")
+
+if __name__ == "__main__":
+    get_real_drive_id()
--- a/scratch/test_item.py
+++ b/scratch/test_item.py
@@ -0,0 +1,12 @@
+import os
+import sys
+import json
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+from ingestion.graph_client import GraphClient
+
+client = GraphClient()
+drive_id = "b!15GOzaN4pU2LRNmRYc8vat7d48GJXyJBj-eKaLgrGv9svCswiraBQalAnVnRMl79"
+item_id = "01BP532D2O74Z6FYQPOVBKE5DBMYDWCWCK"
+url = f"https://graph.microsoft.com/v1.0/drives/{drive_id}/items/{item_id}"
+resp = client._make_request("GET", url)
+print(json.dumps(resp, indent=2))
--- a/scratch/test_paddle.py
+++ b/scratch/test_paddle.py
@@ -0,0 +1,14 @@
+import cv2
+import numpy as np
+from paddleocr import PaddleOCR
+
+# Create a dummy image with some text
+img = np.zeros((100, 300, 3), dtype=np.uint8)
+cv2.putText(img, "Hello World", (10, 50), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2)
+
+ocr = PaddleOCR(use_angle_cls=False, lang="vi", enable_mkldnn=False)
+result = ocr.ocr(img)
+
+print("===== RAW RESULT =====")
+print(result)
+print("======================")
--- a/search/retriever.py
+++ b/search/retriever.py
@@ -0,0 +1,70 @@
+import logging
+from typing import List
+from opensearchpy import OpenSearch, RequestsHttpConnection
+from core.config import settings
+from core.models import DocumentChunk
+
+logger = logging.getLogger("Retriever")
+
+class SearchRetriever:
+    def __init__(self, index_name: str = "poc_sharepoint_docs"):
+        self.index_name = index_name
+        
+        # Kết nối OpenSearch
+        self.client = OpenSearch(
+            hosts=[{'host': settings.opensearch_host, 'port': settings.opensearch_port}],
+            http_auth=(settings.opensearch_user, settings.opensearch_pass),
+            use_ssl=False,
+            verify_certs=False,
+            connection_class=RequestsHttpConnection
+        )
+        
+        # Load Local Embedding Model (để biến câu hỏi thành vector cùng không gian với dữ liệu)
+        logger.info("Đang nạp Embedding Model cho Retriever...")
+        from sentence_transformers import SentenceTransformer
+        self.embedder = SentenceTransformer('keepitreal/vietnamese-sbert')
+
+    def retrieve(self, query: str, top_k: int = 5) -> List[DocumentChunk]:
+        """
+        Tìm kiếm ngữ nghĩa (Semantic Search) dựa trên Vector k-NN
+        """
+        logger.info(f"Đang tìm kiếm ngữ nghĩa cho câu hỏi: '{query}'")
+        
+        # 1. Chuyển câu hỏi thành Vector
+        query_vector = self.embedder.encode(query).tolist()
+        
+        # 2. Xây dựng k-NN Query cho OpenSearch
+        # Ta có thể kết hợp Hybrid Search (Vector + Text) ở đây nếu muốn
+        search_query = {
+            "size": top_k,
+            "query": {
+                "knn": {
+                    "embedding": {
+                        "vector": query_vector,
+                        "k": top_k
+                    }
+                }
+            }
+        }
+        
+        try:
+            response = self.client.search(
+                index=self.index_name,
+                body=search_query
+            )
+            
+            hits = response.get("hits", {}).get("hits", [])
+            results = []
+            
+            for hit in hits:
+                source = hit["_source"]
+                # Chuyển từ JSON sang DocumentChunk model
+                chunk = DocumentChunk(**source)
+                results.append(chunk)
+                
+            logger.info(f"Tìm thấy {len(results)} đoạn văn phù hợp nhất.")
+            return results
+            
+        except Exception as e:
+            logger.error(f"Lỗi khi truy vấn OpenSearch: {e}")
+            return []
--- a/test_chat.py
+++ b/test_chat.py
@@ -0,0 +1,71 @@
+import logging
+import sys
+import os
+
+# Để import được các module từ thư mục gốc
+sys.path.append(os.path.dirname(os.path.abspath(__file__)))
+
+from chat.rag_engine import RAGEngine
+from core.config import settings
+
+logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")
+logger = logging.getLogger("TestChat")
+
+def start_chat():
+    # Ép buộc dùng localhost cho OpenSearch khi chạy ngoài Docker
+    if settings.opensearch_host == "opensearch":
+        settings.opensearch_host = "localhost"
+
+    logger.info("=== HỆ THỐNG RAG SHAREPOINT ĐÃ SẴN SÀNG ===")
+    logger.info(f"Đang sử dụng bộ não: {settings.llm_provider.upper()}")
+    
+    try:
+        engine = RAGEngine()
+    except Exception as e:
+        logger.error(f"Không thể khởi động RAG Engine: {e}")
+        return
+
+    print("\n" + "="*50)
+    print("MỜI BẠN NHẬP CÂU HỎI (Gõ 'exit' để thoát)")
+    print("="*50)
+
+    history = []
+
+    while True:
+        # Đọc trực tiếp dữ liệu thô (Bytes) từ bàn phím để tránh mọi lỗi mã hóa của Terminal
+        print("\nBạn: ", end='', flush=True)
+        try:
+            raw_line = sys.stdin.buffer.readline()
+            if not raw_line:
+                break
+            # Tự tay giải mã về UTF-8, bỏ qua các byte lỗi nếu có
+            query = raw_line.decode('utf-8', errors='ignore').strip()
+        except Exception as e:
+            logger.error(f"Lỗi khi đọc dữ liệu nhập: {e}")
+            break
+            
+        if not query:
+            continue
+            
+        if query.lower() in ['exit', 'quit']:
+            break
+            
+        print("\nAI đang suy nghĩ...")
+        result = engine.chat(query, history=history)
+        
+        print(f"\nAI: {result['answer']}")
+        
+        print("\n--- NGUỒN TRÍCH DẪN (CITATIONS) ---")
+        for i, src in enumerate(result['sources']):
+            print(f"[{i+1}] {src['file_name']} (Trang {src['page']})")
+        
+        # Lưu vào lịch sử chat để có ngữ cảnh cho câu hỏi tiếp theo
+        history.append({"role": "user", "content": query})
+        history.append({"role": "assistant", "content": result['answer']})
+        
+        # Giữ lịch sử ngắn gọn (3 cặp câu hỏi - trả lời gần nhất)
+        if len(history) > 6:
+            history = history[-6:]
+
+if __name__ == "__main__":
+    start_chat()
--- a/test_dce_pipeline.py
+++ b/test_dce_pipeline.py
@@ -0,0 +1,38 @@
+import os
+import sys
+import json
+import logging
+
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+
+from core.models import IngestedDocument
+from extraction.dce import DocumentClassificationEngine
+
+logging.basicConfig(level=logging.INFO, format="%(levelname)s:%(name)s:%(message)s")
+
+def main():
+    print("=== STARTING DCE PIPELINE TEST ===")
+    
+    if not os.path.exists("ingestion_output.json"):
+        print("File ingestion_output.json not found! Please run ingestion sync first.")
+        return
+        
+    with open("ingestion_output.json", "r", encoding="utf-8") as f:
+        items = json.load(f)
+        
+    dce = DocumentClassificationEngine()
+    
+    print(f"Loaded {len(items)} items from ingestion_output.json\n")
+    
+    for item in items:
+        if item.get("is_folder"):
+            continue # DCE only processes files
+            
+        doc = IngestedDocument(**item)
+        
+        print(f"\n--- Processing: {doc.name} ---")
+        result = dce.classify(doc)
+        print(f">> Policy: {result.processing_policy.value} | Reason: {result.reason}")
+
+if __name__ == "__main__":
+    main()
--- a/test_graph_smoke.py
+++ b/test_graph_smoke.py
@@ -0,0 +1,69 @@
+import os
+import sys
+
+# Ensure the root path is in sys.path
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+
+from ingestion.graph_client import GraphClient
+
+def main():
+    print("=== PHASE 1 & 2 SMOKE TEST ===")
+    client = GraphClient()
+    
+    print(f"Tenant ID: {client.tenant_id}")
+    print(f"Client ID: {client.client_id}")
+    
+    try:
+        token = client.get_access_token()
+        print(">> PHASE 1 PASS: Successfully obtained and validated token.")
+    except Exception as e:
+        print(f">> PHASE 1 FAIL: {e}")
+        return
+        
+    hostname = "285pdg.sharepoint.com"
+    site_path = "/sites/poc_system"
+    
+    site_id = None
+    try:
+        print(f"\n--- GET /sites/{hostname} ---")
+        site_info = client.get_site_by_hostname(hostname)
+        print(">> SUCCESS")
+    except Exception as e:
+        print(f">> FAIL: {e}")
+
+    try:
+        print(f"\n--- GET /sites/{hostname}:{site_path} ---")
+        site_info = client.get_site_by_path(hostname, site_path)
+        site_id = site_info.get("id")
+        print(f">> SUCCESS - Site ID: {site_id}")
+    except Exception as e:
+        print(f">> FAIL: {e}")
+
+    if not site_id:
+        print("Cannot proceed to drive steps without site_id.")
+        return
+
+    try:
+        print(f"\n--- GET /sites/{site_id}/drive ---")
+        drive_info = client.get_drive(site_id)
+        drive_id = drive_info.get("id")
+        print(f">> SUCCESS - Drive ID: {drive_id}")
+    except Exception as e:
+        print(f">> FAIL: {e}")
+
+    try:
+        print(f"\n--- GET /sites/{site_id}/drive/root/children ---")
+        children_info = client.get_drive_root_children(site_id)
+        print(f">> SUCCESS - Found {len(children_info.get('value', []))} items.")
+    except Exception as e:
+        print(f">> FAIL: {e}")
+
+    try:
+        print(f"\n--- GET /sites/{site_id}/drive/root/delta ---")
+        delta_info = client.get_drive_root_delta(site_id)
+        print(">> SUCCESS")
+    except Exception as e:
+        print(f">> FAIL: {e}")
+
+if __name__ == "__main__":
+    main()
--- a/test_modular_architecture.py
+++ b/test_modular_architecture.py
@@ -0,0 +1,41 @@
+import logging
+import sys
+from core.config import settings
+from ingestion.providers.sharepoint_provider import SharePointProvider
+from extraction.ocr_service import OCRService
+
+logging.basicConfig(level=logging.INFO, format="%(levelname)s:%(name)s:%(message)s")  # Root logging at INFO, one "LEVEL:logger:message" line per record.
+logger = logging.getLogger("TestModularArch")  # Named logger used by run_tests() below.
+
+def run_tests():
+    """Smoke-check the VLM settings, the SharePoint provider and the OCR service; only an empty VLM endpoint exits (status 1)."""
+    logger.info("=== 1. KIỂM TRA CẤU HÌNH (CONFIG) ===")
+    logger.info(f"VLM Endpoint: {settings.VLM_ENDPOINT}")  # NOTE(review): upper-case attributes here vs lower-case settings.opensearch_host in test_rag_pipeline.py — confirm against core.config.
+    logger.info(f"VLM Temperature: {settings.VLM_TEMPERATURE}")
+    if not settings.VLM_ENDPOINT:
+        logger.error("LỖI: Chưa nạp được cấu hình VLM từ .env")
+        sys.exit(1)
+    
+    logger.info("\n=== 2. KIỂM TRA MÔ HÌNH PROVIDER (SHAREPOINT) ===")
+    try:
+        provider = SharePointProvider()
+        logger.info("Khởi tạo SharePointProvider thành công.")
+        logger.info("Thử nghiệm fetch_changes (Lấy danh sách file delta)...")
+        items, next_state = provider.fetch_changes({})  # Called with an empty state dict; next_state is not used afterwards.
+        logger.info(f"Thành công! Lấy được {len(items)} items thông qua kiến trúc Provider mới.")
+        if items:
+            logger.info(f"Item mẫu: {items[0]['name']} (ID: {items[0]['id'][:10]}...)")
+    except Exception as e:  # Covers construction, fetch_changes and the sample-item lookups alike.
+        logger.error(f"LỖI Khởi tạo Provider: {e}")
+
+    logger.info("\n=== 3. KIỂM TRA VLM CLIENT (OCR SERVICE) ===")
+    try:
+        ocr = OCRService()
+        logger.info(f"Khởi tạo OCRService thành công. Đang kết nối tới: {ocr.vlm_url}")
+    except Exception as e:
+        logger.error(f"LỖI Khởi tạo OCR Service: {e}")
+        
+    logger.info("\n=== KIỂM TRA KIẾN TRÚC HOÀN TẤT ===")  # NOTE(review): also reached when checks 2/3 logged errors; the function then returns normally.
+
+if __name__ == "__main__":  # Run the checks only when executed as a script, not on import.
+    run_tests()
--- a/test_ocr.py
+++ b/test_ocr.py
@@ -0,0 +1,73 @@
+import os
+import sys
+import json
+import httpx
+import logging
+
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))  # Put this script's directory first on sys.path; presumably so the package imports below resolve regardless of the working directory.
+
+from extraction.ocr_service import OCRService
+
+logging.basicConfig(level=logging.INFO, format="%(levelname)s:%(name)s:%(message)s")  # Root logging at INFO, one "LEVEL:logger:message" line per record.
+
+def main():
+    print("=== STARTING OCR SERVICE TEST ===")
+    
+    if not os.path.exists("ingestion_output.json"):
+        print("File ingestion_output.json not found! Vui lòng chạy lại ingestion/sync.py")
+        return
+        
+    with open("ingestion_output.json", "r", encoding="utf-8") as f:
+        items = json.load(f)
+        
+    # Tìm đúng file SCAN_PDF để test
+    target_name = "Bien-ban-ban-giao-scan.pdf"
+    target_item = next((item for item in items if item.get("name") == target_name), None)
+    
+    if not target_item:
+        print(f"Không tìm thấy file {target_name} trong ingestion_output.json")
+        return
+        
+    download_url = target_item.get("download_url")
+    if not download_url:
+        print(f"File {target_name} không có download_url. Vui lòng xoá delta_state.json và chạy lại ingestion/sync.py")
+        return
+        
+    print(f"\n1. Đang tải file: {target_name} trực tiếp từ SharePoint...")
+    try:
+        # Thay vì dùng downloadUrl dễ bị 401 Unauthorized do cơ chế tempauth, 
+        # ta sẽ dùng endpoint /content qua Graph API bằng token xịn của hệ thống.
+        from ingestion.graph_client import GraphClient
+        graph = GraphClient()
+        drive_id = target_item.get("drive_id")
+        item_id = target_item.get("item_id")
+        
+        url = f"https://graph.microsoft.com/v1.0/drives/{drive_id}/items/{item_id}/content"
+        pdf_bytes = graph._download_file(url)
+        
+        print(f"   => Tải thành công {len(pdf_bytes)} bytes.")
+    except Exception as e:
+        print(f"Tải file thất bại: {e}")
+        return
+        
+    print("\n2. Khởi tạo PaddleOCR & VietOCR và bắt đầu nhận diện...")
+    print("   (LƯU Ý: Lần chạy đầu tiên sẽ khá lâu do hệ thống phải tải model AI về máy)")
+    ocr_service = OCRService()
+    
+    # Đẩy byte stream vào mổ xẻ
+    results = ocr_service.process_pdf_bytes(pdf_bytes)
+    
+    print(f"\n3. Quá trình OCR hoàn tất! Tổng số trang đã dịch: {len(results)}")
+    for result in results:
+        print(f"\n==================== TRANG {result.page} ====================")
+        
+        print(f"\n--- [1] KẾT QUẢ TỪ PADDLEOCR MẶC ĐỊNH (Độ tự tin: {result.paddle_confidence}) ---")
+        print(result.paddle_text)
+        
+        print(f"\n--- [2] KẾT QUẢ TỪ VIETOCR (Độ tự tin: {result.confidence}) ---")
+        print(result.text)
+        
+        print("====================================================\n")
+
+if __name__ == "__main__":  # Run the OCR test only when executed as a script, not on import.
+    main()
--- a/test_rag_pipeline.py
+++ b/test_rag_pipeline.py
@@ -0,0 +1,78 @@
+import logging
+import sys
+
+from core.config import settings
+from ingestion.providers.sharepoint_provider import SharePointProvider
+from extraction.ocr_service import OCRService
+from chunking.markdown_chunker import MarkdownChunker
+from indexing.vector_store import VectorStore
+
+logging.basicConfig(level=logging.INFO, format="%(levelname)s:%(name)s:%(message)s")  # Root logging at INFO, one "LEVEL:logger:message" line per record.
+logger = logging.getLogger("RAGPipeline")  # Named logger used by run_pipeline() below.
+
+def run_pipeline():
+    logger.info("=== BẮT ĐẦU TEST TOÀN BỘ ĐƯỜNG ỐNG RAG ===")
+    
+    # Ép buộc dùng localhost cho OpenSearch khi chạy trực tiếp trên WSL
+    if settings.opensearch_host == "opensearch":
+        settings.opensearch_host = "localhost"
+
+    # 1. Tầng Ingestion
+    logger.info("\n--- BƯỚC 1: Lấy file từ SharePoint ---")
+    provider = SharePointProvider()
+    items, _ = provider.fetch_changes({})
+    
+    target_item = None
+    for item in items:
+        if item.get("name", "").lower().endswith(".pdf"):
+            target_item = item
+            break
+            
+    if not target_item:
+        logger.error("Không tìm thấy file PDF nào trên SharePoint để test!")
+        sys.exit(1)
+        
+    logger.info(f"Đã chọn file: {target_item['name']}. Đang tải...")
+    pdf_bytes = provider.download_file(target_item)
+    logger.info(f"Tải thành công {len(pdf_bytes)} bytes.")
+
+    # 2. Tầng Extraction (VLM)
+    logger.info("\n--- BƯỚC 2: OCR / VLM Trích xuất Markdown ---")
+    ocr = OCRService()
+    pages = ocr.process_pdf_bytes(pdf_bytes)
+    
+    if not pages:
+        logger.error("VLM không trích xuất được nội dung nào!")
+        sys.exit(1)
+        
+    logger.info(f"VLM đã trích xuất thành công {len(pages)} trang.")
+
+    # 3. Tầng Chunking
+    logger.info("\n--- BƯỚC 3: Băm nhỏ văn bản (Semantic Chunking) ---")
+    chunker = MarkdownChunker(max_chunk_size=1000, overlap=100)
+    
+    # Tạo metadata giả lập để lưu vào Chunk
+    metadata = {
+        "item_id": target_item["id"],
+        "name": target_item["name"],
+        "web_url": "https://285pdg.sharepoint.com/...",
+        "site_id": settings.sharepoint_site_id
+    }
+    
+    chunks = chunker.chunk_document(pages, metadata)
+    logger.info(f"Đã băm thành {len(chunks)} chunks độc lập.")
+    if chunks:
+        logger.info(f"Ví dụ Chunk đầu tiên:\n[ID: {chunks[0].chunk_id}] {chunks[0].text[:150]}...")
+
+    # 4. Tầng Vector Database (OpenSearch)
+    logger.info("\n--- BƯỚC 4: Mã hóa Vector & Indexing ---")
+    try:
+        vector_db = VectorStore(index_name="poc_sharepoint_docs")
+        vector_db.embed_and_index(chunks)
+        logger.info("🎉 CHÚC MỪNG! DỮ LIỆU ĐÃ NẰM TRONG OPENSEARCH SẴN SÀNG ĐỂ CHAT!")
+    except Exception as e:
+        logger.error(f"LỖI trong quá trình Embedding / Indexing: {e}")
+        logger.warning("Gợi ý: Hãy chắc chắn Docker OpenSearch đang chạy trên cổng 9200!")
+
+if __name__ == "__main__":  # Run the pipeline only when executed as a script, not on import.
+    run_pipeline()
--- a/yeucau.md
+++ b/yeucau.md