Xu ly SSO

2026-05-09 10:31:28 +00:00
parent 9d04e7484c
commit f937d1a98e
21 changed files with 2515 additions and 271 deletions
--- a/api/main.py
+++ b/api/main.py
@@ -1,17 +1,29 @@
 import logging
 import sys
 import os
 import secrets
 from enum import Enum
 from typing import List, Optional, Dict, Any
-from fastapi import FastAPI, HTTPException, BackgroundTasks, status
+from fastapi import FastAPI, HTTPException, BackgroundTasks, Request, status
 from fastapi.responses import RedirectResponse
 from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel, Field, validator
 import uvicorn
 import msal
 # Đảm bảo đường dẫn module
 sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 from chat.rag_engine import RAGEngine
 from core.config import settings
 from core.models import IngestedDocument, ProcessingPolicy
 from ingestion.providers.sharepoint_provider import SharePointProvider
 from ingestion.sync import SyncEngine
 from extraction.dce import DocumentClassificationEngine
 from extraction.ocr_service import OCRService
 from extraction.text_extractor import TextExtractor
 from chunking.markdown_chunker import MarkdownChunker
 from indexing.vector_store import VectorStore
 # --- Cấu hình Logging chuyên nghiệp ---
 logging.basicConfig(
@@ -29,8 +41,30 @@ app = FastAPI(
    redoc_url="/redoc"
 )
 # Register CORS so the browser frontend can call this API.
 # NOTE(review): a wildcard origin together with allow_credentials=True is very
 # permissive; restrict allow_origins to the real frontend origin before production.
 app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"], # Allow every origin (acceptable for the PoC)
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
 )
 # --- Singleton Engine Instance ---
 # rag_engine stays None until it is initialised elsewhere (presumably in the
 # startup hook - confirm); the chat endpoint rejects requests while it is falsy.
 rag_engine = None
 # Progress record of the SharePoint sync; returned as-is by GET /sync/status.
 sync_status = {"running": False, "last_run": None, "processed": 0, "skipped": 0, "errors": []}
 # --- Azure AD SSO Config ---
 # NOTE(review): REDIRECT_URI is hard-coded to localhost and has to match the
 # redirect URI registered in the Azure AD App Registration; consider moving it
 # into the settings object before deploying anywhere else.
 REDIRECT_URI = "http://localhost:8000/auth/callback"
 # Tenant-specific Microsoft identity platform authority.
 AUTHORITY = f"https://login.microsoftonline.com/{settings.tenant_id}"
 # Scopes requested during sign-in and again when redeeming the authorization code.
 SCOPE = ["User.Read"]
 def _build_msal_app():
    """Create a confidential MSAL client for the configured app registration and tenant authority."""
    client_app = msal.ConfidentialClientApplication(
        client_id=settings.client_id,
        client_credential=settings.client_secret,
        authority=AUTHORITY,
    )
    return client_app
@app.on_event("startup")
 async def startup_event():
@@ -89,6 +123,18 @@ class ChatResponse(BaseModel):
    sources: List[SourceCitation] = Field(default_factory=list, description="Danh sách các nguồn trích dẫn từ tài liệu")
    context_used: Optional[str] = Field(None, description="Ngữ cảnh thực tế đã được trích xuất từ VectorDB (Dùng cho Debug/UI)")
 class SyncResponse(BaseModel):
    # Response body of POST /sync.
    # status: machine-readable state; the endpoint emits "started" or "already_running".
    status: str
    # message: human-readable explanation for the caller.
    message: str
 class LoginRequest(BaseModel):
    # Request body of POST /auth/login-email (the non-SSO fallback login).
    # email: address typed by the user; the endpoint strips and lower-cases it.
    email: str = Field(..., description="Email người dùng")
 class LoginResponse(BaseModel):
    # Response body of POST /auth/login-email.
    # email: the address after strip() + lower() normalisation.
    email: str
    # display_name: derived from the part before "@" (dots become spaces, title-cased).
    display_name: str
    # role: "admin" or "user", as decided by the login endpoint.
    role: str
 # --- ENDPOINTS ---
@app.get("/health", tags=["System"])
@@ -103,11 +149,82 @@ async def health_check():
        }
    }
@app.get("/auth/login", tags=["Auth"])
 async def sso_login():
    """
    Redirect sang Azure AD login page.
    Dùng chung App Registration với SharePoint ingestion.
    """
    msal_app = _build_msal_app()
    auth_url = msal_app.get_authorization_request_url(
        SCOPE,
        redirect_uri=REDIRECT_URI,
        state=secrets.token_hex(16)
    )
    return RedirectResponse(url=auth_url)
@app.get("/auth/callback", tags=["Auth"])
 async def sso_callback(request: Request):
    """
    Azure AD redirect về đây với authorization code.
    Đổi code lấy token, lấy thông tin user, redirect về frontend.
    """
    code = request.query_params.get("code")
    if not code:
        raise HTTPException(status_code=400, detail="Missing authorization code")
    msal_app = _build_msal_app()
    result = msal_app.acquire_token_by_authorization_code(
        code,
        scopes=SCOPE,
        redirect_uri=REDIRECT_URI
    )
    if "error" in result:
        logger.error(f"SSO error: {result.get('error_description', result.get('error'))}")
        raise HTTPException(status_code=401, detail="Authentication failed")
    # Lấy thông tin user từ token
    id_token_claims = result.get("id_token_claims", {})
    email = id_token_claims.get("preferred_username", id_token_claims.get("email", ""))
    name = id_token_claims.get("name", email.split("@")[0])
    oid = id_token_claims.get("oid", "")
    # Xác định role
    role = "admin" if "admin" in email.lower() else "user"
    logger.info(f"SSO login: {email} (role={role})")
    # Redirect về frontend với user info
    import json
    import urllib.parse
    user_data = json.dumps({"email": email, "display_name": name, "role": role})
    encoded = urllib.parse.quote(user_data)
    return RedirectResponse(url=f"http://localhost:8000?user={encoded}")
@app.post("/auth/login-email", response_model=LoginResponse, tags=["Auth"])
 async def login_email_endpoint(request: LoginRequest):
    """
    Đăng nhập bằng email (fallback khi không dùng SSO).
    """
    email = request.email.strip().lower()
    if not email or "@" not in email:
        raise HTTPException(status_code=400, detail="Email không hợp lệ.")
    local_part = email.split("@")[0]
    display_name = local_part.replace(".", " ").title()
    role = "admin" if "admin" in email else "user"
    logger.info(f"Email login: {email} (role={role})")
    return LoginResponse(email=email, display_name=display_name, role=role)
@app.post("/chat", response_model=ChatResponse, tags=["RAG"], status_code=status.HTTP_200_OK)
-async def chat_endpoint(request: ChatRequest):
+async def chat_endpoint(request: ChatRequest, http_request: Request):
    """
    Điểm cuối xử lý hội thoại RAG.
-    Hệ thống sẽ tự động trích xuất ngữ cảnh từ OpenSearch và sử dụng Provider đã cấu hình để trả lời.
+    Header 'X-User-Email' (optional): Email user để filter quyền.
    Header 'X-User-Role' (optional): "admin" = bypass ACL.
    """
    if not rag_engine:
        raise HTTPException(
@@ -116,11 +233,14 @@ async def chat_endpoint(request: ChatRequest):
        )
    try:
-        # Chuyển đổi ChatHistoryItem sang format dict cho RAGEngine
+        user_email = http_request.headers.get("X-User-Email")
        user_role = http_request.headers.get("X-User-Role", "user")
        is_admin = user_role == "admin" or not user_email
        history_data = [item.dict() for item in request.history]
-        logger.info(f"Xử lý truy vấn: {request.query[:50]}...")
+        logger.info(f"Chat query: {request.query[:50]} (user={user_email or 'none'}, role={user_role})")
-        result = rag_engine.chat(request.query, history=history_data)
+        result = rag_engine.chat(request.query, history=history_data, user_email=user_email, is_admin=is_admin)
        return ChatResponse(
            answer=result["answer"],
@@ -134,5 +254,136 @@ async def chat_endpoint(request: ChatRequest):
            detail="Đã xảy ra lỗi nội bộ trong quá trình xử lý ngôn ngữ."
        )
 def extract_text_from_pdf_bytes(pdf_bytes: bytes) -> str:
    """Extract the text layer of an in-memory PDF.

    Args:
        pdf_bytes: Raw content of the PDF file.

    Returns:
        The text of all pages joined by blank lines, or ``""`` when the
        document cannot be read (best effort: callers treat empty text as
        "nothing to index").
    """
    try:
        import fitz
        # The context manager closes the document even if a page fails to render.
        with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
            return "\n\n".join(page.get_text() for page in doc)
    except Exception:
        # Keep the best-effort contract, but leave a trace of why extraction failed.
        logging.getLogger(__name__).warning("PDF text extraction failed", exc_info=True)
        return ""
 def run_sync_background():
    """Run one full SharePoint sync: fetch -> classify (DCE) -> extract/OCR -> chunk -> index.

    Progress is published through the module-level ``sync_status`` dict, which
    is replaced by a fresh record at the start of every run:

    - ``processed``: files that produced chunks and were indexed
    - ``skipped``: files excluded by their processing policy, or that yielded
      no pages / no chunks
    - ``errors``: per-file download problems plus the message of a fatal failure
    - ``last_run``: ``"completed"`` or ``"failed"`` once the run has ended

    A failed or empty download only skips that file. Any other exception aborts
    the run, is recorded in ``errors`` and logged with its traceback. The
    ``running`` flag is always cleared at the end.
    """
    global sync_status
    sync_status = {"running": True, "last_run": None, "processed": 0, "skipped": 0, "errors": []}
    try:
        provider = SharePointProvider()
        dce = DocumentClassificationEngine(provider=provider)
        ocr = OCRService()
        chunker = MarkdownChunker(max_chunk_size=1000, overlap=100)
        vector_db = VectorStore(index_name="poc_sharepoint_docs")
        # An empty state dict is passed, so no previous delta state is supplied.
        items, _ = provider.fetch_changes({})
        logger.info(f"Sync: Found {len(items)} items from SharePoint")
        for item in items:
            if item.get("is_folder") or item.get("is_deleted"):
                continue
            name = item.get("name", "")
            item_id = item.get("id", "")
            # URLs and permissions are fetched per item rather than read from the change feed.
            item_details = provider.get_item_details(item_id)
            permissions = provider.get_item_permissions(item_id)
            doc = IngestedDocument(
                site_id=settings.sharepoint_site_id,
                drive_id="",
                item_id=item_id,
                name=name,
                web_url=item_details.get("web_url", ""),
                download_url=item_details.get("download_url"),
                is_folder=False,
                size=item.get("size", 0),
            )
            classification = dce.classify(doc, target_item=item)
            # These policies never reach extraction.
            if classification.processing_policy in (ProcessingPolicy.UNSUPPORTED, ProcessingPolicy.METADATA_ONLY, ProcessingPolicy.REQUIRES_REVIEW):
                sync_status["skipped"] += 1
                continue
            try:
                file_bytes = provider.download_file(item)
            except Exception as e:
                # Keep the reason in the log; the status record only carries a short note.
                logger.warning(f"Sync: download failed for {name}: {e}")
                sync_status["errors"].append(f"{name}: download failed")
                continue
            if not file_bytes:
                sync_status["errors"].append(f"{name}: empty file")
                continue
            pages = []
            ext = name.lower().rsplit(".", 1)[-1] if "." in name else ""
            if classification.processing_policy == ProcessingPolicy.SKIP_OCR:
                # Text can be read directly; pick the extractor by file extension.
                if ext == "pdf":
                    text = extract_text_from_pdf_bytes(file_bytes)
                    if text.strip():
                        from core.models import OCRPageResult
                        # The whole text layer is wrapped as a single page.
                        pages = [OCRPageResult(page=1, text=text, confidence=1.0)]
                elif ext in ("docx", "doc"):
                    pages = TextExtractor.extract_from_docx(file_bytes)
                elif ext in ("xlsx", "xls"):
                    pages = TextExtractor.extract_from_xlsx(file_bytes)
                elif ext in ("txt", "md", "csv"):
                    pages = TextExtractor.extract_from_text(file_bytes)
            elif classification.processing_policy == ProcessingPolicy.REQUIRES_OCR:
                pages = ocr.process_pdf_bytes(file_bytes)
            if not pages:
                sync_status["skipped"] += 1
                continue
            metadata = {
                "item_id": item_id,
                "name": name,
                "web_url": item_details.get("web_url"),
                "download_url": item_details.get("download_url"),
                "site_id": settings.sharepoint_site_id,
                "permissions": permissions
            }
            chunks = chunker.chunk_document(pages, metadata)
            if chunks:
                # Remove the file's previous chunks first so re-runs do not duplicate them.
                vector_db.delete_by_file_id(item_id)
                vector_db.embed_and_index(chunks)
                sync_status["processed"] += 1
                logger.info(f"Sync: Indexed {name} → {len(chunks)} chunks")
            else:
                sync_status["skipped"] += 1
        sync_status["last_run"] = "completed"
        logger.info(f"Sync completed: {sync_status['processed']} processed, {sync_status['skipped']} skipped")
    except Exception as e:
        sync_status["last_run"] = "failed"
        sync_status["errors"].append(str(e))
        # logger.exception keeps the traceback of the failure that aborted the run.
        logger.exception(f"Sync failed: {e}")
    finally:
        sync_status["running"] = False
@app.post("/sync", response_model=SyncResponse, tags=["Ingestion"])
 async def sync_endpoint(background_tasks: BackgroundTasks):
    """
    Trigger đồng bộ dữ liệu từ SharePoint.
    Chạy trong background, trả về trạng thái ngay lập tức.
    """
    if sync_status["running"]:
        return SyncResponse(status="already_running", message="Đồng bộ đang chạy, vui lòng đợi.")
    background_tasks.add_task(run_sync_background)
    return SyncResponse(status="started", message="Đồng bộ đã bắt đầu trong background.")
@app.get("/sync/status", tags=["Ingestion"])
 async def sync_status_endpoint():
    """Return the current sync status record (the module-level ``sync_status`` dict)."""
    return sync_status
 if __name__ == "__main__":
    uvicorn.run("main:app", host="0.0.0.0", port=8000, reload=True)
--- a/chat/rag_engine.py
+++ b/chat/rag_engine.py
@@ -5,37 +5,46 @@ from .llm_factory import LLMFactory
 logger = logging.getLogger("RAGEngine")
 class RAGEngine:
    def __init__(self):
        self.retriever = SearchRetriever()
        self.llm = LLMFactory.get_provider()
-        logger.info(f"RAG Engine đã sẵn sàng với LLM Provider: {type(self.llm).__name__}")
+        logger.info(f"RAG Engine ready with LLM Provider: {type(self.llm).__name__}")
-    def chat(self, user_query: str, history: List[Dict[str, str]] = None) -> Dict:
+    def chat(self, user_query: str, history: List[Dict[str, str]] = None, user_email: str = None, is_admin: bool = False) -> Dict:
        """
-        Quy trình RAG hoàn chỉnh: Search -> Augment -> Generate
+        Quy trình RAG: Search -> Augment -> Generate
        Args:
            user_query: Câu hỏi
            history: Lịch sử chat
            user_email: Email user để filter quyền
            is_admin: True = bypass ACL
        """
-        # 1. RETRIEVAL: Tìm kiếm ngữ cảnh liên quan
+        logger.info(f"Search query: {user_query[:100]} (user={user_email or 'none'}, admin={is_admin})")
-        relevant_chunks = self.retriever.retrieve(user_query, top_k=5)
+        relevant_chunks = self.retriever.retrieve(user_query, top_k=5, user_email=user_email, is_admin=is_admin)
        if not relevant_chunks:
            context_text = "Không tìm thấy thông tin liên quan trong cơ sở dữ liệu nội bộ."
            logger.info("Search result: 0 chunks found")
        else:
            # Gộp text từ các chunks lại thành 1 khối context
            context_text = "\n---\n".join([
                f"[Nguồn: {c.file_name}, Trang: {c.page_from}]\nNội dung: {c.text}" 
                for c in relevant_chunks
            ])
            logger.info(f"Search result: {len(relevant_chunks)} chunks from {len(set(c.file_name for c in relevant_chunks))} files")
-        # 2. GENERATION: Gửi sang LLM để trả lời
+        # 2. GENERATION
-        logger.info("Đang yêu cầu LLM tổng hợp câu trả lời...")
+        logger.info("Requesting LLM to generate answer...")
        answer = self.llm.generate_response(
            prompt=user_query,
            context=context_text,
            history=history
        )
        logger.info(f"LLM response length: {len(answer)} chars")
-        # 3. Trả về kết quả kèm theo nguồn trích dẫn (Citations)
+        # 3. Return with citations
        return {
            "answer": answer,
            "context_used": context_text,
@@ -43,7 +52,8 @@ class RAGEngine:
                {
                    "file_name": c.file_name,
                    "page": c.page_from,
-                    "url": c.source_url
+                    "url": c.source_url,
                    "download_url": c.download_url
                } for c in relevant_chunks
            ]
        }
--- a/chunking/markdown_chunker.py
+++ b/chunking/markdown_chunker.py
@@ -82,8 +82,9 @@ class MarkdownChunker:
                    page_from=p_from,
                    page_to=p_to,
                    source_url=metadata.get("web_url", ""),
                    download_url=metadata.get("download_url", ""),
                    site_id=metadata.get("site_id", ""),
-                    permissions=["*"] # TODO: Sẽ gán quyền thật từ SharePoint
+                    permissions=metadata.get("permissions", ["*"])
                ))
                # Bắt đầu chunk mới với một chút Overlap (chối gối)
@@ -109,8 +110,9 @@ class MarkdownChunker:
                page_from=p_from,
                page_to=p_to,
                source_url=metadata.get("web_url", ""),
                download_url=metadata.get("download_url", ""),
                site_id=metadata.get("site_id", ""),
-                permissions=["*"]
+                permissions=metadata.get("permissions", ["*"])
            ))
        logger.info(f"Chunked document {metadata.get('name')} into {len(chunks)} chunks.")
--- a/core/logging.py
+++ b/core/logging.py
@@ -0,0 +1,62 @@
 import logging
 import json
 import sys
 from datetime import datetime
 from typing import Optional
 class StructuredFormatter(logging.Formatter):
    """JSON structured log formatter for production.

    Each record becomes one JSON object with ``timestamp`` (UTC, ISO 8601 with a
    ``Z`` suffix), ``level``, ``logger`` and ``message``. ``exception`` is added
    when the record carries exception info, ``data`` when it carries an
    ``extra_data`` attribute.
    """
    def format(self, record):
        """Serialize *record* to a JSON string."""
        # Local import: only ``datetime`` itself is imported at module level.
        from datetime import timezone
        # Stamp the entry with the time the event was created, not the time it
        # happens to be formatted.
        created = datetime.fromtimestamp(record.created, tz=timezone.utc)
        log_entry = {
            "timestamp": created.replace(tzinfo=None).isoformat() + "Z",
            "level": record.levelname,
            "logger": record.name,
            "message": record.getMessage(),
        }
        if record.exc_info and record.exc_info[0]:
            log_entry["exception"] = self.formatException(record.exc_info)
        if hasattr(record, "extra_data"):
            log_entry["data"] = record.extra_data
        # default=str: a value json cannot encode must not make the log call fail.
        return json.dumps(log_entry, ensure_ascii=False, default=str)
 class HumanFormatter(logging.Formatter):
    """Human-readable log formatter for development.

    Output: ``HH:MM:SS [LEVEL] logger: message`` in local time, followed by the
    formatted traceback on the next lines when the record carries exception info.
    """
    def format(self, record):
        """Render *record* as a single line, plus a traceback when present."""
        # Use the record's own creation time rather than the formatting time.
        stamp = datetime.fromtimestamp(record.created).strftime('%H:%M:%S')
        line = f"{stamp} [{record.levelname}] {record.name}: {record.getMessage()}"
        # Without this, logger.exception() would silently lose its traceback.
        if record.exc_info and record.exc_info[0]:
            line = f"{line}\n{self.formatException(record.exc_info)}"
        return line
 def setup_logging(level: str = "INFO", structured: bool = False):
    """
    Configure logging for the whole application.

    Replaces every handler on the root logger with a single stdout handler.

    Args:
        level: Log level name (DEBUG, INFO, WARNING, ERROR, ...), case-insensitive.
            A numeric level is accepted as well. Unknown names fall back to INFO.
        structured: True = JSON format (production), False = human readable (development)
    """
    if isinstance(level, int):
        resolved_level = level
    else:
        resolved_level = getattr(logging, str(level).strip().upper(), None)
        # The logging module also exposes non-level attributes under upper-case
        # names (e.g. BASIC_FORMAT); only accept real numeric levels.
        if not isinstance(resolved_level, int):
            resolved_level = logging.INFO
    root_logger = logging.getLogger()
    root_logger.setLevel(resolved_level)
    # Drop existing handlers so repeated calls do not duplicate output.
    root_logger.handlers.clear()
    handler = logging.StreamHandler(sys.stdout)
    handler.setFormatter(StructuredFormatter() if structured else HumanFormatter())
    root_logger.addHandler(handler)
 def log_event(logger: logging.Logger, level: str, message: str, **kwargs):
    """
    Write a log record that carries structured data.

    The keyword arguments are attached to the record as ``extra_data`` (omitted
    when none are given), which ``StructuredFormatter`` emits under ``data``.

    Args:
        logger: Logger to write to.
        level: Name of the logging method to use (debug, info, warning, error,
            exception, critical), case-insensitive. Anything else logs at INFO.
        message: Log message.
        **kwargs: Structured fields to attach.

    Usage:
        log_event(logger, "info", "File processed", file_name="test.pdf", pages=5)
    """
    # Only dispatch to real logging methods: a plain getattr would also return
    # non-callable attributes such as ``name`` or ``handlers``.
    level_methods = {
        "debug": "debug",
        "info": "info",
        "warning": "warning",
        "warn": "warning",
        "error": "error",
        "exception": "exception",
        "critical": "critical",
        "fatal": "critical",
    }
    method_name = level_methods.get(level.lower(), "info")
    extra = {"extra_data": kwargs} if kwargs else {}
    getattr(logger, method_name)(message, extra=extra)
--- a/core/models.py
+++ b/core/models.py
@@ -64,6 +64,7 @@ class DocumentChunk(BaseModel):
    embedding: Optional[list[float]] = None
    page_from: int
    page_to: int
-    source_url: str
+    source_url: Optional[str] = None
    download_url: Optional[str] = None
    permissions: list[str] = []
    site_id: str = ""
--- a/doc/00.AGENT_ARCHITECTURE_MAP.md
+++ b/doc/00.AGENT_ARCHITECTURE_MAP.md
@@ -1,88 +1,202 @@
 # 🧭 AGENT ARCHITECTURE MAP (LIVING DOCUMENT)
 *Đây là tài liệu dẫn đường dành riêng cho các AI Agent tương lai và lập trình viên bảo trì. Không quét toàn bộ code, hãy đọc file này trước.*
-**Lần cập nhật cuối:** Phase 6 (Hoàn thiện Semantic Chunking & Vector Indexing)
+**Lần cập nhật cuối:** Phase 8 Complete (DCE, Text Extraction, ACL, SSO, Logging)
-**Trạng thái Dự án:** Đã hoàn thành Ingestion, Extraction, Chunking & Indexing. Chuẩn bị bước vào Phase 7 (RAG Search & Chat API).
+**Trạng thái Dự án:** Phase 8 hoàn thành. Sẵn sàng cho Phase 9 (Production Ready).
 ---
 ## 1. Bản Đồ Kiến Trúc Lõi (Core Architecture Patterns)
 ### Pipeline hiện tại (ĐÃ HOẠT ĐỘNG)
 ```
 SharePoint → Ingestion → DCE → [OCR/Extract/Skip] → Chunking → OpenSearch → Search → RAG Chat → FastAPI → Frontend
 ```
 ### A. Tầng Ingestion (Thu thập dữ liệu) - Mẫu Modular Provider Pattern
 - **Mục tiêu:** Tách biệt lõi hệ thống khỏi nền tảng lưu trữ (SharePoint, Google Drive, v.v.).
- **Interface gốc:** `ingestion/providers/base_provider.py` (Bắt buộc phải implement `fetch_changes` và `download_file`).
+- **Interface:** `ingestion/providers/base_provider.py` (`fetch_changes`, `download_file`, `get_item_details`, `get_item_permissions`).
- **Implement hiện tại:** `ingestion/providers/sharepoint_provider.py`. Nó bọc lại `GraphClient` và tự động xử lý thuật toán phân trang (pagination) để lấy dữ liệu Delta.
+- **Implement hiện tại:** `ingestion/providers/sharepoint_provider.py`. Bọc lại `GraphClient`, tự động xử lý pagination Delta Query.
- **Nếu cần thêm nguồn dữ liệu mới (ví dụ: NAS, Google Drive):** Chỉ cần tạo một class mới kế thừa `BaseStorageProvider`. Lõi hệ thống không cần biết về API của nguồn đó.
+- **Sync Engine:** `ingestion/sync.py` → `SyncEngine` nhận `BaseStorageProvider` qua constructor, provider-agnostic.
 - **Nếu cần thêm nguồn dữ liệu mới:** Chỉ cần tạo class mới kế thừa `BaseStorageProvider`.
 ### B. Tầng Extraction (Xử lý chữ & Ảnh) - Mẫu Distributed VLM Pattern
- **Lịch sử:** Đã từng dùng PaddleOCR + VietOCR nhưng gặp lỗi "Rụng dấu" và "Ảo giác" do cắt ảnh sai.
+- **Lịch sử:** Đã từng dùng PaddleOCR + VietOCR nhưng gặp lỗi "Rụng dấu" và "Ảo giác". Đã loại bỏ hoàn toàn.
- **Kiến trúc hiện tại:** Hệ thống đóng vai trò như một **VLM Client**.
+- **Kiến trúc hiện tại:** Hệ thống đóng vai trò **VLM Client**.
- **Cách hoạt động:** `extraction/ocr_service.py` render file PDF thành ảnh (DPI=86), nén Base64 và bắn POST Request sang một Server LLM khác trong mạng LAN (chạy `llama.cpp` với model `Vintern-3B`).
+- **Cách hoạt động:** `extraction/ocr_service.py` render PDF thành ảnh (Matrix=1.2), nén Base64, POST sang server LAN (`10.202.50.3:8080`) chạy `llama.cpp` + `Vintern-3B`.
- **Lợi ích:** Giải phóng hoàn toàn RAM cho máy chủ RAG, loại bỏ các thư viện AI nặng nền (Torch, Paddle). Lấy được Markdown nguyên bản, không gãy vỡ layout bảng biểu.
+- **Lợi ích:** Giải phóng RAM cho máy chủ RAG, lấy được Markdown nguyên bản.
 ### C. Tầng Chunking & Vector DB (Semantic Indexing)
- **Chunking:** `chunking/markdown_chunker.py` chia nhỏ văn bản bằng Markdown Rules (nhận biết Header `#`, duy trì overlap chống đứt gãy ngữ cảnh), tự động theo dõi `page_from`, `page_to` chuẩn xác.
+- **Chunking:** `chunking/markdown_chunker.py` chia nhỏ bằng Markdown Rules (Header `#`, overlap), theo dõi `page_from`, `page_to`.
- **Embedding:** Dùng thư viện `sentence-transformers` với model `keepitreal/vietnamese-sbert` chạy Local/Offline. Tạo ra Vector 768 chiều chuyên biệt cho Tiếng Việt.
+- **Embedding:** `sentence-transformers` với model `keepitreal/vietnamese-sbert` (local, 768 chiều).
- **Database:** `indexing/vector_store.py` cấu hình OpenSearch với thuật toán `k-NN HNSW`. Index mặc định là `poc_sharepoint_docs` hoặc `sharepoint_docs`.
+- **Database:** `indexing/vector_store.py` → OpenSearch `k-NN HNSW`. Index: `poc_sharepoint_docs`.
 - **Dedup:** `VectorStore.delete_by_file_id()` xóa chunks cũ trước khi nạp lại.
-### D. Tầng Cấu hình (Decoupled Configuration)
+### D. Tầng Search & RAG Chat
- Toàn bộ thông số hệ thống, đặc biệt là IP máy chủ VLM, Token của SharePoint đều nằm trong `.env`.
+- **Retriever:** `search/retriever.py` → Semantic Search (k-NN vector) trên OpenSearch.
- Mã nguồn load cấu hình thông qua `core/config.py`.
+- **RAG Engine:** `chat/rag_engine.py` → Search → Augment Context → LLM Generate.
 - **LLM Factory:** `chat/llm_factory.py` → Hỗ trợ Gemini, Groq, Local (config trong `.env`).
 ### E. Tầng API & Frontend
 - **Backend:** `api/main.py` → FastAPI tại port 8000. Endpoint: `/health`, `/auth/login` (SSO), `/auth/callback`, `/auth/login-email`, `/chat`, `/sync`, `/sync/status`.
 - **Frontend:** `frontend/` → Glassmorphism UI với SSO login + email fallback + sync button. Gọi `http://localhost:8000`.
 ### F. Tầng Cấu hình (Decoupled Configuration)
 - Toàn bộ thông số trong `.env`. Load qua `core/config.py`.
 - **Tuyệt đối KHÔNG hardcode URL, Token hay Password trong code.**
 ---
-## 2. Bản Đồ File & Thư Mục Quan Trọng
+## 2. Bản Đồ File & Thư Mục Hoàn Chỉnh
 ```text
 📁 poc_system/
 ├── 📁 core/
 │   ├── config.py              # ⚙️ Trái tim cấu hình (Load từ .env)
-│   └── models.py         # 🧩 Định nghĩa Data Classes (OCRPageResult, v.v.)
+│   ├── models.py              # 🧩 Data Classes (OCRPageResult, DocumentChunk, IngestedDocument)
 │   └── logging.py             # 📝 Structured logging (JSON/human formatter)
 ├── 📁 ingestion/
-│   ├── sync.py           # 🔄 Bộ điều phối đồng bộ (Đang chuẩn bị ghép với BaseStorageProvider)
+│   ├── sync.py                # 🔄 SyncEngine (Provider-agnostic)
-│   ├── graph_client.py   # 🌐 Microsoft Graph API Client (Bọc Auth)
+│   ├── graph_client.py        # 🌐 Microsoft Graph API Client
-│   └── 📁 providers/     # 🔌 Nơi chứa các plugin kết nối dữ liệu
+│   └── 📁 providers/
-│       ├── base_provider.py
+│       ├── base_provider.py   # 🔌 Interface: fetch_changes, download_file, get_item_details, get_item_permissions
 │       └── sharepoint_provider.py
 ├── 📁 extraction/
-│   └── ocr_service.py    # 👁️ VLM Client (Chuyển ảnh -> Text Markdown qua LAN)
+│   ├── dce.py               # 🏷️ Document Classification Engine (phân loại trước khi xử lý)
-├── .env                  # 🔑 Chìa khoá và địa chỉ mạng (KHÔNG commit file này)
+│   ├── pdf_inspector.py     # 🔎 PDF Inspection (TEXT_PDF / SCAN_PDF / DRAWING_PDF)
-└── test_modular_architecture.py # 🧪 Script kiểm tra nhanh kết nối các module
+│   ├── magic_numbers.py     # 🔢 Magic Number validation (chống giả extension)
 │   ├── text_extractor.py    # 📄 Text extraction: DOCX (python-docx), XLSX (openpyxl), TXT
 │   └── ocr_service.py       # 👁️ VLM Client (PDF → Markdown qua LAN)
 ├── 📁 chunking/
 │   └── markdown_chunker.py    # ✂️ Semantic Chunking theo Markdown rules
 ├── 📁 indexing/
 │   └── vector_store.py        # 📦 OpenSearch k-NN Index + Embedding
 ├── 📁 search/
 │   └── retriever.py           # 🔍 Semantic Search (k-NN vector)
 ├── 📁 chat/
 │   ├── rag_engine.py          # 🤖 RAG: Search → Context → LLM
 │   ├── llm_factory.py         # 🏭 Factory: Gemini / Groq / Local
 │   └── 📁 llm_providers/
 │       ├── base_llm.py
 │       ├── gemini_llm.py
 │       ├── groq_llm.py
 │       └── local_llm.py
 ├── 📁 api/
 │   └── main.py                # 🚀 FastAPI Backend (port 8000)
 ├── 📁 frontend/
 │   ├── index.html             # 🎨 Glassmorphism UI (Login + Chat + Sync)
 │   ├── app.js                 # 💬 Chat, Auth, Sync logic
 │   └── style.css              # 🖌️ CSS
 ├── 📁 doc/                    # 📚 Tài liệu dự án
 │   ├── 00.AGENT_ARCHITECTURE_MAP.md  # Bản đồ kiến trúc
 │   ├── AGENT_HANDOVER_PROTOCOL.md    # Protocol cho AI Agent
 │   ├── DEPLOYMENT_GUIDE.md           # Hướng dẫn triển khai & cấu hình
 │   └── ...                           # Các tài liệu khác
 ├── .env                       # 🔑 Chìa khoá (KHÔNG commit)
 ├── docker-compose.yml         # 🐳 OpenSearch
 ├── Dockerfile
 ├── requirements.txt
 ├── test_rag_pipeline.py       # 🧪 Test toàn bộ pipeline
 ├── test_graph_smoke.py        # 🧪 Test kết nối Graph API
 ├── test_modular_architecture.py
 ├── test_chat.py
 ├── test_ocr.py
 └── test_dce_pipeline.py
 ```
 ---
 ## 3. Lịch Sử Các Lỗi Khét Tiếng & Cách Xử Lý (Known Gotchas)
 1. **Lỗi 401 Unauthorized khi tải file từ SharePoint:**
   - *Nguyên nhân:* Microsoft chặn download trực tiếp bằng `@microsoft.graph.downloadUrl` nếu dùng App-Only Token.
-   - *Giải pháp:* Dùng endpoint `.../items/{item_id}/content` kèm Bearer Token (Đã cài đặt trong `graph_client.py`).
+   - *Giải pháp:* Dùng endpoint `.../items/{item_id}/content` kèm Bearer Token.
 2. **Lỗi 500 Internal Server Error từ Llama.cpp VLM:**
-   - *Nguyên nhân:* Bức ảnh ném vào VLM có độ phân giải quá cao (Matrix 2.0) làm tràn Context Window (ví dụ: Token ảnh > 4096).
+   - *Nguyên nhân:* Ảnh có độ phân giải quá cao (Matrix 2.0) làm tràn Context Window.
-   - *Giải pháp:* Hạ `Matrix` xuống `1.2`, hoặc khởi chạy Server Llama.cpp với `-c 8192`. Bắt buộc phải có file `--mmproj`.
+   - *Giải pháp:* Hạ `Matrix` xuống `1.2`, hoặc khởi chạy Server với `-c 8192`.
 3. **Lỗi Rụng dấu / Ảo giác của VietOCR:**
-   - *Nguyên nhân:* PaddleOCR bắt khung quá khít, làm cụt phần đuôi của các chữ tiếng Việt có dấu. Mô hình `vgg_seq2seq` tự nội suy ra từ tiếng Anh linh tinh.
+   - *Nguyên nhân:* PaddleOCR bắt khung quá khít, mô hình `vgg_seq2seq` nội suy sai.
-   - *Giải pháp triệt để:* Đã loại bỏ hoàn toàn VietOCR, chuyển sang dùng VLM (Vintern-3B).
+   - *Giải pháp triệt để:* Đã loại bỏ hoàn toàn VietOCR, chuyển sang VLM (Vintern-3B).
 4. **Lỗi UTF-8 Surrogate (\udcc3) trong Terminal WSL:**
-   - *Hiện tượng:* Câu hỏi đầu tiên đúng, nhưng từ câu thứ 2 bị lỗi mã hóa khi dùng `input()`.
+   - *Giải pháp:* Dùng `sys.stdin.buffer.readline()` cho CLI. Web API (FastAPI) không bị ảnh hưởng.
-   - *Nguyên nhân:* Do sự không đồng nhất giữa `sys.stdin` và bộ đệm Terminal sau khi in lượng lớn dữ liệu từ LLM.
+
-   - *Giải pháp:* Sử dụng `sys.stdin.buffer.readline()` để đọc dữ liệu thô (Bytes) và tự decode bằng UTF-8. Đây là giải pháp cho môi trường CLI, khi lên Web API (FastAPI) sẽ không bị ảnh hưởng.
+5. **Lỗi Link SharePoint không ổn định (Bug #101):**
   - *Nguyên nhân:* Delta Query không trả về `webUrl` và `@microsoft.graph.downloadUrl`.
   - *Giải pháp:* Thêm `get_item_details()` vào `graph_client.py`, `base_provider.py`, `sharepoint_provider.py`.
 6. **Lỗi Chunks trùng lặp khi chạy lại pipeline:**
   - *Hiện tượng:* Mỗi lần chạy `test_rag_pipeline.py`, chunks mới được thêm chồng lên chunks cũ (cùng file).
   - *Nguyên nhân:* `chunk_id` dùng UUID ngẫu nhiên, không có bước xóa cũ.
   - *Giải pháp:* `VectorStore.delete_by_file_id(file_id)` gọi trước `embed_and_index()`.
 7. **Lỗi DCE download PDF 401 Unauthorized:**
   - *Hiện tượng:* DCE không phân loại được PDF vì download file bị 401.
   - *Nguyên nhân:* DCE dùng httpx trực tiếp với `@microsoft.graph.downloadUrl` (không có Bearer Token).
   - *Giải pháp:* Truyền `provider` (BaseStorageProvider) vào DCE constructor, dùng `provider.download_file()` thay vì httpx.
 8. **Lỗi DCE download 404 (items/None/content):**
   - *Hiện tượng:* DCE download PDF bị 404 vì URL có `items/None/content`.
   - *Nguyên nhân:* `ingestion_output.json` dùng key `item_id` nhưng `download_file()` cần `id`.
   - *Giải pháp:* DCE tự chuẩn hóa `item_id` → `id` khi thiếu.
 9. **Lỗi OpenSearch hostname không resolve khi chạy ngoài Docker:**
   - *Hiện tượng:* `ConnectionError: Failed to resolve 'opensearch'`.
   - *Nguyên nhân:* Config `.env` có `opensearch_host=opensearch` (Docker hostname).
   - *Giải pháp:* `VectorStore` và `SearchRetriever` tự detect: nếu host là "opensearch" và ENV != "docker" → đổi sang "localhost".
 10. **Lỗi k-NN query format sai cho OpenSearch 2.x:**
    - *Hiện tượng:* `Unknown key for a START_OBJECT in [knn]`.
    - *Nguyên nhân:* Đặt `knn` ở top level thay vì trong `query`.
    - *Giải pháp:* Đặt `knn` bên trong `query` object.
 ---
-## 4. Nhiệm Vụ Tiếp Theo (Dành cho Lập Trình Viên/AI Agent)
+## 4. Nhiệm Vụ Tiếp Theo (Phase 9 - Production Ready)
- [ ] **Phase 7:** Bọc thành API Backend bằng FastAPI.
+
 ### Đã hoàn thành ✅
 - [x] Ingestion: SharePoint Provider + Delta Query + Pagination
 - [x] DCE: Document Classification Engine (phân loại file theo extension + PDF inspection)
 - [x] PDF Inspection: Detect text layer, classify TEXT_PDF / SCAN_PDF / DRAWING_PDF
 - [x] Conditional OCR: Chỉ OCR SCAN_PDF, TEXT_PDF extract trực tiếp, skip DRAWING/UNSUPPORTED
 - [x] Extraction: VLM OCR (Vintern-3B qua LAN)
 - [x] Chunking: Semantic Markdown Chunker
 - [x] Indexing: OpenSearch k-NN HNSW + vietnamese-sbert
 - [x] Search: Semantic Retriever
 - [x] RAG Chat: LLM Factory (Gemini/Groq/Local)
 - [x] API: FastAPI Backend (/chat, /health, /sync, /sync/status)
 - [x] Frontend: Glassmorphism UI
 - [x] Bug fixes: SharePoint links, Chunk dedup
 - [x] Refactor: SyncEngine provider-agnostic
 - [x] Logging: Structured logging utility (`core/logging.py`)
 - [x] Permission: ACL extraction từ SharePoint + filter search theo user
 - [x] Auth UI: Simple email login + SSO Azure AD + user context cho API calls
 - [x] DOCX Text Extraction: python-docx (paragraphs + tables)
 - [x] XLSX Text Extraction: openpyxl (sheets + cells)
 ### Chưa triển khai (Phase 9 - Production Ready)
 #### Ưu tiên trung bình
 - [ ] **Cấu hình Azure AD cho SSO:** Thêm Redirect URI `http://localhost:8000/auth/callback` và bật "ID tokens" trong App Registration.
 #### Ưu tiên thấp
 - [ ] **Monitoring Dashboard:** Health metrics, ingestion status, OCR success rate.
 - [ ] **Multi-tenant:** Hỗ trợ nhiều SharePoint site/tenant.
 ---
 ## 5. Tiêu chuẩn Lập trình & Môi trường (Coding Standards)
 ### A. Quản lý Mã hóa (Encoding)
- **Quy tắc vàng:** Luôn sử dụng `encoding='utf-8'` trong mọi lệnh `open()`. Tuyệt đối không dựa dẫm vào encoding mặc định của hệ điều hành.
+- **Quy tắc vàng:** Luôn sử dụng `encoding='utf-8'` trong mọi lệnh `open()`.
- **Môi trường:** Hệ thống được thiết kế để chạy trong môi trường UTF-8. Trong Docker hoặc WSL, luôn đảm bảo biến môi trường `PYTHONIOENCODING=utf-8` được thiết lập. Điều này giúp hệ thống tương thích 100% với các ký tự Tiếng Việt từ LLM mà không cần hack code.
+- **Môi trường:** `PYTHONIOENCODING=utf-8` trong Docker/WSL.
 ### B. Mẫu Provider (Provider Pattern)
- Mọi kết nối tới dịch vụ bên thứ ba (Storage, LLM) phải thông qua Interface/BaseClass để đảm bảo tính "Cắm rút" (Pluggable).
+- Mọi kết nối tới dịch vụ bên thứ ba (Storage, LLM) phải thông qua Interface/BaseClass.
 - `BaseStorageProvider` cho Storage, `BaseLLMProvider` cho LLM.
 ### C. Quy tắc an toàn
 - Không commit `.env`, không hardcode secrets.
 - Không thay đổi kiến trúc đã chốt trong `doc/14.Project-Bridge-Context-for-New-Chat.md` mà không có lý do kỹ thuật rõ ràng.
--- a/doc/AGENT_HANDOVER_PROTOCOL.md
+++ b/doc/AGENT_HANDOVER_PROTOCOL.md
@@ -0,0 +1,98 @@
 # 🤖 AGENT HANDOVER PROTOCOL (Dành cho AI Agent)
 > **QUAN TRỌNG:** Nếu bạn là AI Agent mới, hãy đọc file này kết hợp với `doc/00.AGENT_ARCHITECTURE_MAP.md` trước khi viết bất kỳ dòng code nào.
 ## 1. Tóm tắt "Bộ nhớ" Dự án (Memory Snapshot)
 Dự án này là một hệ thống **Enterprise RAG** (Retrieval-Augmented Generation) với các đặc điểm kỹ thuật:
 - **Distributed VLM OCR:** Dùng máy chủ LAN (`10.202.50.3:8080`) chạy `Vintern-3B` để trích xuất Markdown từ PDF.
 - **Modular Provider Pattern:** Tách biệt Storage (SharePoint) và LLM (Gemini, Groq, Local).
 - **Semantic Indexing:** Dùng `vietnamese-sbert` (local) tạo vector 768 chiều, lưu vào OpenSearch k-NN HNSW.
 - **FastAPI Backend:** API tại port 8000. Endpoint: `/health`, `/chat`.
 - **Glassmorphism UI:** Giao diện web tại `frontend/`, gọi `http://localhost:8000/chat`.
 **Pipeline hiện tại (ĐÃ HOẠT ĐỘNG):**
 ```
 SharePoint → Ingestion → VLM OCR → Chunking → OpenSearch → Search → RAG Chat → FastAPI → Frontend
 ```
 ## 2. Trạng thái triển khai
 ### ✅ Đã hoàn thành
 | Module | File | Mô tả |
 |--------|------|-------|
 | Ingestion | `ingestion/providers/sharepoint_provider.py` | Delta Query + Pagination + get_item_details |
 | Sync Engine | `ingestion/sync.py` | Provider-agnostic, nhận BaseStorageProvider |
 | DCE | `extraction/dce.py` | Document Classification Engine (phân loại file) |
 | PDF Inspector | `extraction/pdf_inspector.py` | TEXT_PDF / SCAN_PDF / DRAWING_PDF / AMBIGUOUS_PDF |
 | Magic Numbers | `extraction/magic_numbers.py` | Header byte validation |
 | OCR | `extraction/ocr_service.py` | VLM Client (Vintern-3B qua LAN) |
 | Chunking | `chunking/markdown_chunker.py` | Semantic Markdown rules + page tracking |
 | Indexing | `indexing/vector_store.py` | OpenSearch k-NN + delete_by_file_id dedup |
 | Search | `search/retriever.py` | Semantic k-NN vector search |
 | RAG Chat | `chat/rag_engine.py` | Search → Context → LLM |
 | LLM Factory | `chat/llm_factory.py` | Gemini / Groq / Local |
 | API | `api/main.py` | FastAPI port 8000 |
 | Frontend | `frontend/` | Glassmorphism UI (HTML/CSS/JS) |
 | Bug fixes | Nhiều file | SharePoint links (Bug #101), Chunk dedup |
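 ### ❌ Chưa triển khai (Phase 8)
 > **Lưu ý:** Danh sách dưới đây có thể đã lỗi thời: `extraction/text_extractor.py` (DOCX/XLSX), đăng nhập email/SSO và nút đồng bộ trên frontend đã có trong mã nguồn. Hãy đối chiếu với checklist trong `doc/00.AGENT_ARCHITECTURE_MAP.md` trước khi triển khai lại.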
 - **DOCX Text Extraction:** Trích xuất text từ DOCX không cần OCR
 - **XLSX Text Extraction:** Trích xuất header + key columns từ Excel
 - **Permission Enforcement:** ACL filtering theo user/group
 - **Authentication UI:** OAuth2 login
 - **Ingestion API:** Trigger sync từ frontend
 - **Logging & Audit:** Structured logging
 ## 3. Hướng dẫn dành cho AI Agent tiếp theo
 1.  **Luôn kiểm tra `.env`:** Toàn bộ cấu hình nằm ở đây. Không bao giờ hardcode.
 2.  **Sử dụng `core/config.py`:** Cửa ngõ duy nhất để truy cập cài đặt.
 3.  **UTF-8:** Mọi I/O phải có `encoding='utf-8'`. Đặt `export PYTHONIOENCODING=utf-8`.
 4.  **Cập nhật tài liệu:** Khi hoàn thành Phase hoặc thay đổi kiến trúc, BẮT BUỘC cập nhật file này và `00.AGENT_ARCHITECTURE_MAP.md`.
 5.  **Đọc `doc/14.Project-Bridge-Context-for-New-Chat.md`:** Đây là "hợp đồng kiến trúc" - không thay đổi các quyết định đã chốt.
 ## 4. Cách cập nhật Tài liệu (Protocol for Updates)
 - **Bước 1:** Cập nhật trạng thái trong `doc/00.AGENT_ARCHITECTURE_MAP.md` (đánh dấu ✅ vào checkbox).
 - **Bước 2:** Nếu phát hiện lỗi mới, ghi lại vào mục **"Lịch sử các lỗi khét tiếng"** kèm giải pháp.
 - **Bước 3:** Cập nhật mục **Trạng thái triển khai** trong file này.
 ## 5. Lệnh chạy nhanh (Quick Start)
 ```bash
 # Khởi động OpenSearch
 docker-compose up -d opensearch
 # Chạy Backend (FastAPI port 8000)
 python3 api/main.py
 # Mở Frontend
 # Mở frontend/index.html trong trình duyệt (hoặc dùng Live Server)
 # Nạp dữ liệu từ SharePoint → OCR → Chunk → Index
 python3 test_rag_pipeline.py
 ```
 ## 6. Kiểm tra nhanh (Verification)
 ```bash
 # 1. Kiểm tra cú pháp Python
 python3 -m py_compile ingestion/graph_client.py
 python3 -m py_compile ingestion/providers/sharepoint_provider.py
 python3 -m py_compile ingestion/sync.py
 python3 -m py_compile indexing/vector_store.py
 python3 -m py_compile api/main.py
 python3 -m py_compile test_rag_pipeline.py
 # 2. Test kết nối Graph API
 python3 test_graph_smoke.py
 # 3. Test toàn bộ pipeline (cần OpenSearch + VLM server)
 python3 test_rag_pipeline.py
 # 4. Kiểm tra metadata
 cat ingestion_output.json | python3 -m json.tool | grep -E '"web_url"|"download_url"'
 # 5. Test API endpoint
 curl http://localhost:8000/health
 curl -X POST http://localhost:8000/chat -H "Content-Type: application/json" -d '{"query":"test"}'
 ```
 ---
 *Chúc may mắn, Agent đồng nghiệp! Pipeline RAG đã hoạt động. Tiếp theo: DCE, Permission, Hardening.*
--- a/doc/DEPLOYMENT_GUIDE.md
+++ b/doc/DEPLOYMENT_GUIDE.md
@@ -0,0 +1,366 @@
 # 🚀 Hướng dẫn Triển khai & Cấu hình Hệ thống
 > Tài liệu này hướng dẫn cấu hình manual, biến môi trường, và các lưu ý khi triển khai từ PoC lên Production.
 ---
 ## 1. Cấu hình Azure AD App Registration (MANUAL)
 ### 1.1 Tạo App Registration (nếu chưa có)
 1. Vào **Azure Portal** → **Azure Active Directory** → **App registrations** → **New registration**
 2. Điền thông tin:
   - **Name:** `VibeCode-RAG-PoC` (hoặc tên tuỳ chọn)
   - **Supported account types:** `Single tenant` (chỉ tenant công ty)
   - **Redirect URI:** Để trống, sẽ thêm sau
 3. Bấm **Register**
 4. Ghi lại:
   - **Application (client) ID** → dùng cho `CLIENT_ID`
   - **Directory (tenant) ID** → dùng cho `TENANT_ID`
 ### 1.2 Tạo Client Secret
 1. App Registration → **Certificates & secrets** → **New client secret**
 2. Điền Description: `RAG PoC Secret`
 3. Chọn thời hạn: `24 months` (hoặc tuỳ nhu cầu)
 4. Bấm **Add**
 5. **Copy ngay giá trị Secret** → dùng cho `CLIENT_SECRET` (chỉ hiện 1 lần)
 ### 1.3 Cấp quyền Application Permissions
 1. App Registration → **API permissions** → **Add a permission**
 2. Chọn **Microsoft Graph** → **Application permissions**
 3. Tìm và tích:
   - `Sites.Read.All` (đọc SharePoint sites)
   - `Files.Read.All` (đọc files trong drives)
 4. Bấm **Add permissions**
 5. **Quan trọng:** Bấm **Grant admin consent for [tenant]** → Confirm
 ### 1.4 Cấu hình Redirect URI cho SSO (khi cần login)
 1. App Registration → **Authentication** → **Add a platform** → **Web**
 2. Nhập Redirect URI:
   - **PoC (localhost):** `http://localhost:8000/auth/callback`
   - **Production:** `https://your-domain.com/auth/callback`
 3. Tích chọn: ✅ **ID tokens** (implicit grant)
 4. Bấm **Save**
 ### 1.5 Kiểm tra Token Claims
 Sau khi cấu hình xong, decode token JWT và kiểm tra:
 ```text
 aud   : https://graph.microsoft.com
 appid : <client-id-của-bạn>
 idtyp : app
 roles :
  - Sites.Read.All
  - Files.Read.All
 ```
 Nếu token **không có `roles`** → quyền chưa đúng, kiểm tra lại bước 1.3.
 ---
 ## 2. Biến môi trường (.env)
 ### 2.1 File mẫu
 ```env
 # ===== Azure AD / Microsoft Graph =====
 TENANT_ID=your-tenant-id-guid
 CLIENT_ID=your-client-id-guid
 CLIENT_SECRET=your-client-secret-value
 # ===== SharePoint =====
 # Site path để ingestion (đổi thành site SharePoint của bạn)
 # Format: hostname:/sites/site-name
 # Ví dụ: 285pdg.sharepoint.com:/sites/poc_system
 # ===== OpenSearch =====
 OPENSEARCH_HOST=opensearch        # Docker: "opensearch", Local: "localhost"
 OPENSEARCH_PORT=9200
 OPENSEARCH_USER=admin
 OPENSEARCH_PASS=admin
 # ===== VLM OCR Server (Vintern-3B) =====
 VLM_ENDPOINT=http://10.202.50.3:8080/v1/chat/completions
 VLM_TEMPERATURE=0.1
 VLM_MAX_TOKENS=2000
 VLM_TIMEOUT=120.0
 # ===== Chat LLM =====
 LLM_PROVIDER=gemini              # Options: gemini, groq, local
 GEMINI_API_KEY=your-gemini-api-key
 GROQ_API_KEY=your-groq-api-key
 GROQ_MODEL=llama-3.3-70b-versatile
 LOCAL_LLM_ENDPOINT=http://10.202.50.3:8081/v1/chat/completions
 # ===== General =====
 LOG_LEVEL=INFO
 ENVIRONMENT=development           # development hoặc production
 ```
 ### 2.2 Giải thích từng biến
 | Biến | Bắt buộc | Mô tả |
 |------|----------|-------|
 | `TENANT_ID` | ✅ | Azure AD Tenant ID |
 | `CLIENT_ID` | ✅ | Azure AD App Registration Client ID |
 | `CLIENT_SECRET` | ✅ | Azure AD App Registration Client Secret |
 | `OPENSEARCH_HOST` | ✅ | Hostname OpenSearch |
 | `OPENSEARCH_PORT` | ✅ | Port OpenSearch (mặc định 9200) |
 | `VLM_ENDPOINT` | ✅ | URL server VLM OCR (Vintern-3B) |
 | `LLM_PROVIDER` | ✅ | LLM provider: `gemini`, `groq`, hoặc `local` |
 | `GEMINI_API_KEY` | Nếu dùng Gemini | API key từ Google AI Studio |
 | `GROQ_API_KEY` | Nếu dùng Groq | API key từ Groq Console |
 | `ENVIRONMENT` | ✅ | `development` hoặc `production` |
 ---
 ## 3. Kết nối SharePoint khác
 ### 3.1 Thay đổi Site Path
 Chỉnh sửa trong file `ingestion/providers/sharepoint_provider.py`:
 ```python
 # Dòng 14 - Thay đổi hostname và site_path
 def __init__(self, hostname: str = "your-company.sharepoint.com", site_path: str = "/sites/your-site-name"):
 ```
 Hoặc gọi từ bên ngoài:
 ```python
 provider = SharePointProvider(
    hostname="your-company.sharepoint.com",
    site_path="/sites/your-site-name"
 )
 ```
 ### 3.2 Kiểm tra quyền truy cập
 App Registration phải có quyền trên site mới:
 1. Nếu dùng **Application Permissions** (`Sites.Read.All`) → tự động truy cập mọi site
 2. Nếu muốn giới hạn site cụ thể → dùng **SharePoint App-Only Policy** (nâng cao)
 ### 3.3 Xoá dữ liệu cũ
 Khi đổi SharePoint site, cần xoá index cũ:
 ```bash
 curl -X DELETE -u admin:admin "http://localhost:9200/poc_sharepoint_docs"
 ```
 Sau đó chạy lại `python3 test_rag_pipeline.py` để nạp dữ liệu mới.
 ---
 ## 4. Triển khai Production
 ### 4.1 Yêu cầu hạ tầng
 | Component | Yêu cầu tối thiểu | Khuyến nghị |
 |-----------|-------------------|-------------|
 | Server chính | 4 CPU, 8GB RAM | 8 CPU, 16GB RAM |
 | OpenSearch | 2 CPU, 4GB RAM | 4 CPU, 8GB RAM |
 | VLM OCR Server | GPU 8GB VRAM | GPU 16GB VRAM |
 | Domain | Có SSL certificate | Let's Encrypt |
 ### 4.2 Docker Compose Production
 ```yaml
 # docker-compose.prod.yml
 version: '3.8'
 services:
  opensearch:
    image: opensearchproject/opensearch:2.11.1
    environment:
      - discovery.type=single-node
      - OPENSEARCH_INITIAL_ADMIN_PASSWORD=StrongPassword123!
    ports:
      - "9200:9200"
    volumes:
      - opensearch-data:/usr/share/opensearch/data
  rag-api:
    build: .
    ports:
      - "8000:8000"
    env_file: .env
    environment:
      - ENVIRONMENT=production
      - OPENSEARCH_HOST=opensearch
    depends_on:
      - opensearch
  nginx:
    image: nginx:alpine
    ports:
      - "80:80"
      - "443:443"
    volumes:
      - ./nginx.conf:/etc/nginx/nginx.conf
      - ./certs:/etc/nginx/certs
      - ./frontend:/usr/share/nginx/html
    depends_on:
      - rag-api
 volumes:
  opensearch-data:
 ```
 ### 4.3 Nginx config mẫu
 ```nginx
 # nginx.conf
 events {
    worker_connections 1024;
 }
 http {
    server {
        listen 443 ssl;
        server_name your-domain.com;
        ssl_certificate /etc/nginx/certs/fullchain.pem;
        ssl_certificate_key /etc/nginx/certs/privkey.pem;
        # Frontend
        location / {
            root /usr/share/nginx/html;
            index index.html;
            try_files $uri $uri/ /index.html;
        }
        # API
        location /api/ {
            proxy_pass http://rag-api:8000/;
            proxy_set_header Host $host;
            proxy_set_header X-Real-IP $remote_addr;
            proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
            proxy_set_header X-Forwarded-Proto $scheme;
        }
    }
    # Redirect HTTP → HTTPS
    server {
        listen 80;
        server_name your-domain.com;
        return 301 https://$host$request_uri;
    }
 }
 ```
 ### 4.4 Thay đổi khi Production
 1. **Frontend:** Sửa `API_BASE` trong `app.js`:
   ```javascript
   const API_BASE = '/api';  // Thay vì 'http://localhost:8000'
   ```
 2. **SSO Redirect URI:** Cập nhật trong Azure AD:
   ```
   https://your-domain.com/auth/callback
   ```
 3. **.env:**
   ```env
   OPENSEARCH_HOST=opensearch
   ENVIRONMENT=production
   ```
 4. **OpenSearch password:** Đổi password mặc định:
   ```env
   OPENSEARCH_USER=admin
   OPENSEARCH_PASS=YourStrongPassword123!
   ```
 5. **CORS:** Giới hạn origins trong `api/main.py`:
   ```python
   allow_origins=["https://your-domain.com"]
   ```
 ---
 ## 5. Kiểm tra sau triển khai
 ### 5.1 Kiểm tra kết nối
 ```bash
 # 1. OpenSearch
 curl -u admin:admin http://localhost:9200/_cluster/health
 # 2. API
 curl http://localhost:8000/health
 # 3. Frontend
 curl -I http://localhost/
 ```
 ### 5.2 Kiểm tra SSO
 1. Mở `https://your-domain.com`
 2. Bấm "Đăng nhập Microsoft SSO"
 3. Đăng nhập bằng tài khoản Microsoft 365
 4. Kiểm tra user info hiển thị đúng
 ### 5.3 Kiểm tra Pipeline
 ```bash
 # Trigger sync
 curl -X POST http://localhost:8000/sync
 # Kiểm tra sync status
 curl http://localhost:8000/sync/status
 # Kiểm tra chunks trong OpenSearch
 curl -u admin:admin http://localhost:9200/poc_sharepoint_docs/_count
 ```
 ### 5.4 Kiểm tra ACL
 ```bash
 # Test search với user có quyền
 curl -X POST http://localhost:8000/chat \
  -H "Content-Type: application/json" \
  -H "X-User-Email: user@yourcompany.com" \
  -H "X-User-Role: user" \
  -d '{"query": "test"}'
 # Test search với admin (bypass ACL)
 curl -X POST http://localhost:8000/chat \
  -H "Content-Type: application/json" \
  -H "X-User-Role: admin" \
  -d '{"query": "test"}'
 ```
 ---
 ## 6. Xử lý lỗi thường gặp
 | Lỗi | Nguyên nhân | Giải pháp |
 |-----|-------------|-----------|
 | SSO redirect_uri_mismatch | Redirect URI chưa đúng | Kiểm tra URI trong Azure AD khớp với callback URL |
 | Token không có `roles` | App dùng Delegated thay vì Application permissions | Đổi sang Application permissions + Grant admin consent |
 | OpenSearch connection refused | Chưa khởi động Docker | `docker-compose up -d opensearch` |
 | VLM OCR timeout | Server VLM quá tải hoặc offline | Kiểm tra `VLM_ENDPOINT` có truy cập được |
 | Search trả 0 kết quả | Chưa nạp dữ liệu hoặc sai index name | Chạy `python3 test_rag_pipeline.py` |
 ---
 ## 7. Checklist trước khi Production
 - [ ] Azure AD App Registration đã cấu hình đúng permissions
 - [ ] Client Secret còn hạn sử dụng
 - [ ] Redirect URI đã thêm cho production domain
 - [ ] OpenSearch đã đổi password mặc định
 - [ ] SSL certificate đã cài đặt
 - [ ] CORS đã giới hạn origins
 - [ ] `.env` đã cấu hình cho production
 - [ ] Docker Compose production đã test
 - [ ] Backup strategy cho OpenSearch data
 - [ ] Monitoring (CPU, RAM, disk) đã setup
 ---
 *Tài liệu này cần được cập nhật khi có thay đổi về hạ tầng hoặc cấu hình.*
--- a/extraction/dce.py
+++ b/extraction/dce.py
@@ -1,20 +1,35 @@
 import os
 import httpx
 import logging
 from typing import Optional
 from core.models import IngestedDocument, DocumentClassificationResult, DocumentType, ProcessingPolicy, PdfType
 from extraction.magic_numbers import MagicNumberValidator
 from extraction.pdf_inspector import PDFInspector
 logger = logging.getLogger("DCE")
 class DocumentClassificationEngine:
    """
    Document Classification Engine (DCE).
    Phân loại file trước khi quyết định OCR / MarkItDown / Skip.
    """
    def __init__(self, provider=None):
        """
        Args:
            provider: BaseStorageProvider instance (optional). Nếu có, dùng để download file.
        """
    def __init__(self):
        self.pdf_inspector = PDFInspector()
        self.provider = provider
-    def classify(self, document: IngestedDocument) -> DocumentClassificationResult:
+    def classify(self, document: IngestedDocument, target_item: dict = None) -> DocumentClassificationResult:
        """
        Phân loại tài liệu.
        Args:
            document: IngestedDocument từ ingestion output
            target_item: Original item dict từ provider (dùng để download qua provider)
        """
        logger.info(f"Classifying document: {document.name} (ID: {document.item_id})")
        ext = os.path.splitext(document.name)[1].lower()
@@ -23,31 +38,9 @@ class DocumentClassificationEngine:
        policy = ProcessingPolicy.UNSUPPORTED
        reason = "Initial state"
-        # 1. Magic Number Validation
+        # 1. Routing Rules
        if document.download_url:
            header_bytes = MagicNumberValidator.fetch_header_bytes(document.download_url)
            is_valid, detected_type, sig_desc = MagicNumberValidator.validate_from_bytes(header_bytes)
            if is_valid:
                logger.info(f"Magic Number match: {sig_desc}")
            else:
                logger.warning(f"Could not verify magic number for {document.name}. Trusting extension fallback.")
        # 2. Routing Rules
        if ext == ".pdf":
-            pdf_type = PdfType.SCAN_PDF # Simulated default
+            pdf_type = self._classify_pdf(document, target_item)
            if document.download_url:
                logger.info("Downloading PDF into memory for PyMuPDF inspection...")
                try:
                    with httpx.Client() as client:
                        resp = client.get(document.download_url)
                        resp.raise_for_status()
                        pdf_bytes = resp.content
                    pdf_type = self.pdf_inspector.inspect_pdf_from_bytes(pdf_bytes)
                except Exception as e:
                    logger.error(f"Failed to download/inspect PDF: {e}")
                    pdf_type = PdfType.SCAN_PDF
            else:
                logger.warning("No download_url available for PDF. Defaulting to SCAN_PDF.")
            if pdf_type == PdfType.TEXT_PDF:
                doc_type = DocumentType.TEXTUAL_DOCUMENT
@@ -60,7 +53,7 @@ class DocumentClassificationEngine:
            elif pdf_type == PdfType.AMBIGUOUS_PDF:
                doc_type = DocumentType.UNKNOWN
                policy = ProcessingPolicy.REQUIRES_REVIEW
-                reason = "Kích thước PDF lớn bất thường (khổ A3/A2 hoặc DPI cao), cần con người xác nhận là bản Scan hay Bản vẽ"
+                reason = "PDF size lớn bất thường (A3/A2 hoặc DPI cao), cần con người xác nhận"
            else:
                doc_type = DocumentType.TEXTUAL_DOCUMENT
                policy = ProcessingPolicy.REQUIRES_OCR
@@ -81,6 +74,11 @@ class DocumentClassificationEngine:
            policy = ProcessingPolicy.METADATA_ONLY
            reason = "Native CAD drawing format"
        elif ext in [".pptx", ".ppt"]:
            doc_type = DocumentType.PRESENTATION
            policy = ProcessingPolicy.SKIP_OCR
            reason = "Presentation document format"
        else:
            doc_type = DocumentType.BINARY
            policy = ProcessingPolicy.UNSUPPORTED
@@ -97,3 +95,46 @@ class DocumentClassificationEngine:
        logger.info(f"Result -> Type: {doc_type.value}, Policy: {policy.value}, Reason: {reason}")
        return result
    def _classify_pdf(self, document: IngestedDocument, target_item: dict = None) -> PdfType:
        """Download a PDF and return the PdfType reported by the PDF inspector.

        Args:
            document: Document whose name is used in log messages and which is
                passed on to ``_download_pdf``.
            target_item: Original provider item dict, forwarded to ``_download_pdf``.

        Returns:
            ``PdfType.SCAN_PDF`` when no bytes could be downloaded; otherwise
            whatever ``pdf_inspector.inspect_pdf_from_bytes`` returns.
        """
        payload = self._download_pdf(document, target_item)
        if payload:
            # The magic-number check is advisory: its outcome is only logged and
            # inspection proceeds either way. Only the first 256 bytes are checked.
            matched, _detected, signature = MagicNumberValidator.validate_from_bytes(payload[:256])
            if not matched:
                logger.warning(f"Magic number mismatch for {document.name}. Continuing with inspection.")
            else:
                logger.info(f"Magic Number match: {signature}")
            return self.pdf_inspector.inspect_pdf_from_bytes(payload)
        logger.warning(f"Cannot download PDF {document.name}. Defaulting to SCAN_PDF.")
        return PdfType.SCAN_PDF
    def _download_pdf(self, document: IngestedDocument, target_item: dict = None) -> Optional[bytes]:
        """Download the document bytes, preferring the provider over plain httpx.

        Strategy:
          1. If both a provider and ``target_item`` are available, call
             ``provider.download_file(target_item)``.
          2. If that is unavailable, raises, or yields no data, GET
             ``document.download_url`` with httpx.

        Args:
            document: Document whose ``download_url`` is used for the fallback.
            target_item: Original item dict from the provider. If it has
                ``item_id`` but no ``id``, a copy with ``id`` filled in is used.

        Returns:
            The downloaded bytes, or ``None`` when every strategy failed or was
            unavailable.
        """
        # Strategy 1: provider download (preferred).
        if self.provider and target_item:
            # Normalize: make sure an 'id' field exists without mutating the caller's dict.
            if "id" not in target_item and "item_id" in target_item:
                target_item = {**target_item, "id": target_item["item_id"]}
            try:
                content = self.provider.download_file(target_item)
            except Exception as e:
                logger.warning(f"Provider download failed: {e}. Falling back to httpx.")
            else:
                if content:
                    return content
                # A provider that reports failure by returning None/empty bytes
                # must not short-circuit the fallback.
                logger.warning("Provider download returned no data. Falling back to httpx.")
        # Strategy 2: direct httpx request against download_url.
        if document.download_url:
            try:
                with httpx.Client(follow_redirects=True, timeout=60.0) as client:
                    resp = client.get(document.download_url)
                    resp.raise_for_status()
                    return resp.content
            except Exception as e:
                logger.error(f"httpx download failed: {e}")
        return None
--- a/extraction/text_extractor.py
+++ b/extraction/text_extractor.py
@@ -0,0 +1,96 @@
 import logging
 from typing import List, Optional
 from core.models import OCRPageResult
 logger = logging.getLogger("TextExtractor")
 class TextExtractor:
    """
    Extract text from document formats that do not need OCR:
    - DOCX (python-docx)
    - XLSX (openpyxl)
    - TXT/MD/CSV (decoded directly)

    Every extractor returns a list of OCRPageResult with confidence 1.0 and,
    on failure, logs the error and returns an empty list instead of raising.
    """
    @staticmethod
    def extract_from_docx(file_bytes: bytes) -> List[OCRPageResult]:
        """Extract the text of a DOCX file as a single page.

        Non-empty paragraphs come first, followed by one line per non-empty
        table row (cells joined with " | "); entries are separated by blank lines.

        Args:
            file_bytes: Raw content of the .docx file.

        Returns:
            A one-element list (page=1), or an empty list when the document has
            no readable text, python-docx is missing, or parsing fails.
        """
        try:
            # Imported lazily so the module still loads without python-docx.
            from docx import Document
            import io
            doc = Document(io.BytesIO(file_bytes))
            paragraphs = [text for text in (para.text.strip() for para in doc.paragraphs) if text]
            # Table text is appended after all body paragraphs.
            for table in doc.tables:
                for row in table.rows:
                    cell_texts = [text for text in (cell.text.strip() for cell in row.cells) if text]
                    if cell_texts:
                        paragraphs.append(" | ".join(cell_texts))
            if not paragraphs:
                logger.warning("DOCX file is empty or has no readable text.")
                return []
            return [OCRPageResult(page=1, text="\n\n".join(paragraphs), confidence=1.0)]
        except ImportError:
            logger.error("python-docx not installed. Run: pip install python-docx")
            return []
        except Exception as e:
            logger.error(f"Failed to extract text from DOCX: {e}")
            return []
    @staticmethod
    def extract_from_xlsx(file_bytes: bytes) -> List[OCRPageResult]:
        """Extract the text of an XLSX workbook, one page per non-empty sheet.

        Each page starts with a "[Sheet: <name>]" line followed by one line per
        non-empty row (non-empty cells joined with " | "). ``page`` is the
        1-based sheet index, so skipped empty sheets leave gaps in numbering.

        Args:
            file_bytes: Raw content of the .xlsx file.

        Returns:
            One OCRPageResult per sheet that has data; an empty list when there
            is none, openpyxl is missing, or parsing fails.
        """
        try:
            # Imported lazily so the module still loads without openpyxl.
            from openpyxl import load_workbook
            import io
            wb = load_workbook(io.BytesIO(file_bytes), read_only=True, data_only=True)
            results = []
            try:
                for sheet_idx, sheet_name in enumerate(wb.sheetnames, 1):
                    ws = wb[sheet_name]
                    rows = []
                    for row in ws.iter_rows(values_only=True):
                        cells = [text for text in (str(c).strip() for c in row if c is not None) if text]
                        if cells:
                            rows.append(" | ".join(cells))
                    if rows:
                        sheet_text = f"[Sheet: {sheet_name}]\n" + "\n".join(rows)
                        results.append(OCRPageResult(page=sheet_idx, text=sheet_text, confidence=1.0))
            finally:
                # Close the workbook even when reading a sheet fails.
                wb.close()
            if not results:
                logger.warning("XLSX file is empty or has no readable data.")
            return results
        except ImportError:
            logger.error("openpyxl not installed. Run: pip install openpyxl")
            return []
        except Exception as e:
            logger.error(f"Failed to extract text from XLSX: {e}")
            return []
    @staticmethod
    def extract_from_text(file_bytes: bytes) -> List[OCRPageResult]:
        """Extract text from a plain-text file (TXT, MD, CSV) as a single page.

        Bytes are decoded as UTF-8; undecodable bytes become U+FFFD rather than
        failing. A leading UTF-8 byte-order mark is dropped so it neither leaks
        into the text nor makes a BOM-only file look non-empty.

        Args:
            file_bytes: Raw content of the text file.

        Returns:
            A one-element list (page=1), or an empty list when the file holds
            only whitespace or extraction fails.
        """
        try:
            # "utf-8-sig" behaves like "utf-8" but strips a leading BOM.
            text = file_bytes.decode("utf-8-sig", errors="replace").strip()
            if not text:
                return []
            return [OCRPageResult(page=1, text=text, confidence=1.0)]
        except Exception as e:
            logger.error(f"Failed to extract text from text file: {e}")
            return []
--- a/frontend/app.js
+++ b/frontend/app.js
@@ -0,0 +1,291 @@
 // API Base URL
 const API_BASE = 'http://localhost:8000';
 // DOM Elements
 const loginScreen = document.getElementById('login-screen');
 const appContainer = document.getElementById('app-container');
 const loginForm = document.getElementById('login-form');
 const loginEmail = document.getElementById('login-email');
 const chatWindow = document.getElementById('chat-window');
 const userInput = document.getElementById('user-input');
 const sendBtn = document.getElementById('send-btn');
 const sourcePanel = document.getElementById('source-panel');
 const sourceList = document.getElementById('source-list');
 const closePanel = document.getElementById('close-panel');
 const clearChatBtn = document.getElementById('clear-chat');
 const userName = document.getElementById('user-name');
 const userRole = document.getElementById('user-role');
 const logoutBtn = document.getElementById('logout-btn');
 const syncBtn = document.getElementById('sync-btn');
 const syncStatus = document.getElementById('sync-status');
 const ssoBtn = document.getElementById('sso-btn');
 let chatHistory = [];
 let currentUser = null;
 // ====== AUTH ======
 function checkLogin() {
    // Restore the signed-in user on page load, from either the SSO callback
    // (?user=<json> in the URL) or a previous session saved in localStorage.
    // NOTE(review): both sources are fully client-controlled; the backend must not
    // trust the email/role derived from them without its own verification.
    const isUserObject = (value) => typeof value === 'object' && value !== null;
    // URLSearchParams.get() already percent-decodes once. Try the value as-is first
    // and only decode again when the payload was double-encoded.
    const parseSsoUser = (raw) => {
        try {
            return JSON.parse(raw);
        } catch (e) {
            return JSON.parse(decodeURIComponent(raw));
        }
    };
    const params = new URLSearchParams(window.location.search);
    const userData = params.get('user');
    if (userData) {
        try {
            const ssoUser = parseSsoUser(userData);
            if (!isUserObject(ssoUser)) throw new Error('SSO user payload is not an object');
            currentUser = ssoUser;
            localStorage.setItem('vibecode_user', JSON.stringify(currentUser));
            window.history.replaceState({}, '', '/'); // Drop the query param from the address bar
            showApp();
            return;
        } catch (e) {
            console.error('Parse SSO user data failed:', e);
        }
    }
    // Fall back to the user saved in localStorage.
    const saved = localStorage.getItem('vibecode_user');
    if (saved) {
        try {
            const savedUser = JSON.parse(saved);
            if (!isUserObject(savedUser)) throw new Error('Saved user is not an object');
            currentUser = savedUser;
        } catch (e) {
            // A corrupted entry must not break page init: discard it and stay on the login screen.
            console.error('Parse saved user failed:', e);
            localStorage.removeItem('vibecode_user');
            return;
        }
        showApp();
    }
 }
 function showApp() {
    // Swap the login screen for the main UI, then show who is signed in.
    loginScreen.style.display = 'none';
    appContainer.style.display = 'flex';
    const { display_name: displayName, role } = currentUser;
    userName.textContent = displayName;
    userRole.textContent = role;
 }
 function showLogin() {
    // Forget the signed-in user (in memory and in localStorage) and show the login screen.
    localStorage.removeItem('vibecode_user');
    currentUser = null;
    appContainer.style.display = 'none';
    loginScreen.style.display = 'flex';
 }
 loginForm.onsubmit = async (event) => {
    // Email login: POST the address to /auth/login-email and, on success,
    // keep the returned user object in memory and in localStorage.
    event.preventDefault();
    const email = loginEmail.value.trim();
    if (email === '') return;
    const request = {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({ email })
    };
    try {
        const response = await fetch(`${API_BASE}/auth/login-email`, request);
        if (response.ok) {
            currentUser = await response.json();
            localStorage.setItem('vibecode_user', JSON.stringify(currentUser));
            showApp();
        } else {
            // Show the server-provided reason when there is one.
            const failure = await response.json();
            alert(failure.detail || 'Đăng nhập thất bại');
        }
    } catch (error) {
        console.error('Login error:', error);
        alert('Không thể kết nối tới server. Vui lòng đảm bảo Backend đang chạy.');
    }
 };
 // Logout: drop the in-memory chat, clear the chat window and return to the login screen.
 logoutBtn.onclick = () => {
    chatHistory = [];
    chatWindow.innerHTML = '';
    showLogin();
 };
 // SSO login: navigate the browser to the backend's /auth/login endpoint.
 ssoBtn.onclick = () => {
    window.location.href = `${API_BASE}/auth/login`;
 };
 // ====== CHAT ======
 // Auto-grow the input box: reset the height first so it can also shrink,
 // then size it to fit the current content.
 userInput.addEventListener('input', () => {
    userInput.style.height = 'auto';
    userInput.style.height = (userInput.scrollHeight) + 'px';
 });
 async function sendMessage() {
    // Send the current input to POST /chat and render the answer with its sources.
    const question = userInput.value.trim();
    if (!question) return;
    appendMessage('user', question);
    userInput.value = '';
    userInput.style.height = 'auto';
    // Placeholder bubble; replaced below by the answer or by an error message.
    const pendingId = appendMessage('ai', '<i class="fas fa-spinner fa-spin"></i> AI đang phân tích dữ liệu...');
    try {
        const requestHeaders = { 'Content-Type': 'application/json' };
        if (currentUser) {
            // Identity headers sent along with every chat request.
            requestHeaders['X-User-Email'] = currentUser.email;
            requestHeaders['X-User-Role'] = currentUser.role;
        }
        const payload = JSON.stringify({ query: question, history: chatHistory });
        const response = await fetch(`${API_BASE}/chat`, {
            method: 'POST',
            headers: requestHeaders,
            body: payload
        });
        if (!response.ok) {
            throw new Error(`Server error: ${response.status}`);
        }
        const { answer, sources } = await response.json();
        updateMessage(pendingId, answer, sources);
        chatHistory.push(
            { role: 'user', content: question },
            { role: 'assistant', content: answer }
        );
        // Only the 6 most recent history entries are kept for the next request.
        if (chatHistory.length > 6) chatHistory = chatHistory.slice(-6);
    } catch (error) {
        console.error("Chat error:", error);
        updateMessage(pendingId, "⚠️ Lỗi kết nối tới máy chủ AI. Vui lòng đảm bảo Backend đang chạy tại port 8000.");
    }
 }
 function appendMessage(role, text) {
    // Append a chat bubble to the chat window and return its id, which
    // updateMessage() uses to find the element `msg-<id>` again.
    //
    // Date.now() alone is not unique: sendMessage() appends the user bubble and the
    // loading bubble back to back, usually within the same millisecond. Both then
    // shared one element id and the answer overwrote the user's message.
    // A per-function counter makes every id distinct.
    appendMessage.seq = (appendMessage.seq || 0) + 1;
    const id = `${Date.now()}-${appendMessage.seq}`;
    const msgDiv = document.createElement('div');
    msgDiv.className = `message ${role}`;
    msgDiv.id = `msg-${id}`;
    const icon = role === 'ai' ? 'robot' : 'user';
    msgDiv.innerHTML = `
        <div class="avatar"><i class="fas fa-${icon}"></i></div>
        <div class="content"></div>
    `;
    const contentDiv = msgDiv.querySelector('.content');
    if (role === 'ai') {
        // AI bubbles created here are app-authored literals that may contain markup
        // (for example the spinner icon), so they are inserted as HTML.
        contentDiv.innerHTML = text;
    } else {
        // User input is untrusted: render it as plain text so typed HTML is not executed.
        contentDiv.textContent = text;
    }
    chatWindow.appendChild(msgDiv);
    chatWindow.scrollTop = chatWindow.scrollHeight;
    return id;
 }
 function updateMessage(id, text, sources = []) {
    // Replace the content of the bubble created by appendMessage() with `text`
    // and, when sources are given, add one clickable citation tag per source.
    const msgDiv = document.getElementById(`msg-${id}`);
    if (!msgDiv) return;
    const contentDiv = msgDiv.querySelector('.content');
    // The answer comes from the server, so it is treated as untrusted: it is added
    // as text nodes separated by <br> instead of being assigned to innerHTML.
    // A missing answer (undefined/null) renders as an empty bubble instead of throwing.
    const safeText = text == null ? '' : String(text);
    contentDiv.textContent = '';
    safeText.split('\n').forEach((line, lineIdx) => {
        if (lineIdx > 0) contentDiv.appendChild(document.createElement('br'));
        contentDiv.appendChild(document.createTextNode(line));
    });
    if (sources && sources.length > 0) {
        const tagDiv = document.createElement('div');
        tagDiv.style.marginTop = '15px';
        tagDiv.style.display = 'flex';
        tagDiv.style.gap = '8px';
        tagDiv.style.flexWrap = 'wrap';
        sources.forEach((_src, idx) => {
            const tag = document.createElement('span');
            tag.className = 'citation-tag';
            tag.innerHTML = `<i class="fas fa-file-pdf"></i> Nguồn ${idx + 1}`;
            // Every tag opens the same panel listing all sources of this answer.
            tag.onclick = (e) => {
                e.stopPropagation();
                showSources(sources);
            };
            tagDiv.appendChild(tag);
        });
        contentDiv.appendChild(tagDiv);
    }
    chatWindow.scrollTop = chatWindow.scrollHeight;
 }
 function showSources(sources) {
    // Fill the side panel with one card per source (file name, page, optional
    // SharePoint and download links) and open the panel.
    // File names and URLs come from the server, so they are inserted as text nodes
    // and validated hrefs rather than interpolated into innerHTML.
    const buildLink = (href, color, iconClass, label) => {
        // Only http(s) targets are linked; this rejects javascript: and similar schemes.
        let protocol;
        try {
            protocol = new URL(href, window.location.href).protocol;
        } catch (e) {
            return null;
        }
        if (protocol !== 'http:' && protocol !== 'https:') return null;
        const link = document.createElement('a');
        link.href = href;
        link.target = '_blank';
        // Prevent the opened page from reaching back through window.opener.
        link.rel = 'noopener noreferrer';
        link.style.cssText = `color: ${color}; font-size: 11px; text-decoration: none; display: block; margin-top: 5px;`;
        link.innerHTML = `<i class="fas ${iconClass}"></i> `;
        link.appendChild(document.createTextNode(label));
        return link;
    };
    sourceList.innerHTML = '';
    sources.forEach(src => {
        const item = document.createElement('div');
        item.className = 'source-item';
        const title = document.createElement('h4');
        title.innerHTML = '<i class="fas fa-file-alt"></i> ';
        title.appendChild(document.createTextNode(`${src.file_name}`));
        item.appendChild(title);
        const location = document.createElement('p');
        location.innerHTML = '<strong>Vị trí:</strong> ';
        location.appendChild(document.createTextNode(`Trang ${src.page}`));
        item.appendChild(location);
        if (src.url) {
            const viewLink = buildLink(src.url, '#06b6d4', 'fa-external-link-alt', 'Xem trên SharePoint');
            if (viewLink) item.appendChild(viewLink);
        }
        if (src.download_url) {
            const downloadLink = buildLink(src.download_url, '#10b981', 'fa-download', 'Tải xuống');
            if (downloadLink) item.appendChild(downloadLink);
        }
        sourceList.appendChild(item);
    });
    sourcePanel.classList.add('active');
 }
 // Event Listeners
 // Close button hides the source panel.
 closePanel.onclick = () => sourcePanel.classList.remove('active');
 sendBtn.onclick = sendMessage;
 // Enter sends the message; Shift+Enter keeps the default behavior (new line).
 userInput.onkeydown = (e) => {
    if (e.key === 'Enter' && !e.shiftKey) {
        e.preventDefault();
        sendMessage();
    }
 };
 // Clear chat: empty the window and the in-memory history, then post a confirmation bubble.
 clearChatBtn.onclick = () => {
    chatWindow.innerHTML = '';
    chatHistory = [];
    appendMessage('ai', 'Lịch sử chat đã được làm sạch. Tôi có thể giúp gì tiếp cho bạn?');
 };
 // ====== SYNC ======
 async function triggerSync() {
    // Ask the backend to start a sync (POST /sync) and follow its progress.
    // The button is disabled here and re-enabled by pollSyncStatus() once the
    // sync has finished, or below when the request itself fails.
    syncBtn.disabled = true;
    syncStatus.style.display = 'flex';
    syncStatus.className = 'sync-status';
    const statusText = syncStatus.querySelector('.sync-text');
    statusText.textContent = 'Đang đồng bộ...';
    try {
        const response = await fetch(`${API_BASE}/sync`, { method: 'POST' });
        const data = await response.json();
        statusText.textContent = data.status === 'already_running'
            ? 'Đồng bộ đang chạy...'
            : 'Đang xử lý...';
        // Poll in both cases. If a sync was already running and nothing polled,
        // the button would stay disabled and the status text would never update.
        pollSyncStatus();
    } catch (error) {
        syncStatus.className = 'sync-status error';
        statusText.textContent = 'Lỗi kết nối server';
        syncBtn.disabled = false;
    }
 }
 async function pollSyncStatus() {
    // Read GET /sync/status; while a sync is running, show progress and poll again
    // in 2 s. When it is done, show the totals, re-enable the button and hide the
    // status line after 5 s.
    const setStatusText = (message) => {
        syncStatus.querySelector('.sync-text').textContent = message;
    };
    try {
        const response = await fetch(`${API_BASE}/sync/status`);
        const status = await response.json();
        if (!status.running) {
            syncStatus.className = 'sync-status done';
            setStatusText(`Xong! ${status.processed} file đã nạp, ${status.skipped} bỏ qua`);
            syncBtn.disabled = false;
            setTimeout(() => { syncStatus.style.display = 'none'; }, 5000);
            return;
        }
        const handled = status.processed + status.skipped;
        setStatusText(`Đang xử lý... (${handled} file)`);
        setTimeout(pollSyncStatus, 2000);
    } catch (error) {
        syncStatus.className = 'sync-status error';
        setStatusText('Lỗi kiểm tra trạng thái');
        syncBtn.disabled = false;
    }
 }
 syncBtn.onclick = triggerSync;
 // Init
 checkLogin();
--- a/frontend/index.html
+++ b/frontend/index.html
@@ -0,0 +1,119 @@
 <!DOCTYPE html>
 <html lang="vi">
 <head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>AI Knowledge Hub | Enterprise RAG</title>
    <link rel="stylesheet" href="style.css">
    <link rel="preconnect" href="https://fonts.googleapis.com">
    <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
    <link href="https://fonts.googleapis.com/css2?family=Outfit:wght@300;400;600;700&display=swap" rel="stylesheet">
    <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.4.0/css/all.min.css">
 </head>
 <body>
    <!-- Login Screen -->
    <div class="login-screen" id="login-screen">
        <div class="login-card">
            <div class="login-logo">
                <div class="logo-icon"><i class="fas fa-brain"></i></div>
                <h1>VibeCode AI</h1>
                <p>Enterprise Knowledge Hub</p>
            </div>
            <form id="login-form">
                <div class="input-group">
                    <i class="fas fa-envelope"></i>
                    <input type="email" id="login-email" placeholder="Nhập email của bạn" required>
                </div>
                <button type="submit" class="login-btn">
                    <i class="fas fa-sign-in-alt"></i> Đăng nhập
                </button>
            </form>
            <div class="login-divider"><span>hoặc</span></div>
            <button class="sso-btn" id="sso-btn">
                <i class="fab fa-microsoft"></i> Đăng nhập Microsoft SSO
            </button>
            <p class="login-hint">Dùng tài khoản Microsoft 365 của công ty.</p>
        </div>
    </div>
    <!-- Main App (hidden until login) -->
    <div class="app-container" id="app-container" style="display: none;">
        <!-- Sidebar -->
        <aside class="sidebar">
            <div class="logo-area">
                <div class="logo-icon"><i class="fas fa-brain"></i></div>
                <h1>VibeCode AI</h1>
            </div>
            <nav class="side-nav">
                <div class="nav-item active"><i class="fas fa-comments"></i> <span>Hỏi đáp RAG</span></div>
                <button class="sync-btn" id="sync-btn"><i class="fas fa-sync-alt"></i> Đồng bộ SharePoint</button>
            </nav>
            <div class="sync-status" id="sync-status" style="display: none;">
                <div class="sync-spinner"><i class="fas fa-spinner fa-spin"></i></div>
                <span class="sync-text">Đang đồng bộ...</span>
            </div>
            <div class="system-status">
                <div class="status-dot online"></div>
                <div class="status-text">OpenSearch: Online</div>
            </div>
            <div class="user-info" id="user-info">
                <div class="user-avatar"><i class="fas fa-user"></i></div>
                <div class="user-details">
                    <span class="user-name" id="user-name">-</span>
                    <span class="user-role" id="user-role">-</span>
                </div>
                <button id="logout-btn" title="Đăng xuất"><i class="fas fa-sign-out-alt"></i></button>
            </div>
        </aside>
        <!-- Main Chat Area -->
        <main class="chat-main">
            <header class="chat-header">
                <div class="header-info">
                    <h2>SharePoint Intelligence</h2>
                    <p>Hỏi đáp dựa trên cơ sở dữ liệu nội bộ của bạn</p>
                </div>
                <div class="header-actions">
                    <button id="clear-chat" title="Xoá lịch sử"><i class="fas fa-trash-alt"></i></button>
                </div>
            </header>
            <div class="messages-container" id="chat-window">
                <!-- Chào mừng -->
                <div class="message ai greeting">
                    <div class="avatar"><i class="fas fa-robot"></i></div>
                    <div class="content">
                        <p>Xin chào! Tôi là trợ lý tri thức của bạn. Tôi đã sẵn sàng trả lời các câu hỏi dựa trên tài liệu từ SharePoint. Bạn muốn tìm hiểu điều gì hôm nay?</p>
                    </div>
                </div>
            </div>
            <!-- Input Area -->
            <footer class="chat-footer">
                <div class="input-wrapper">
                    <!-- The placeholder alone is not an accessible name, so the field is labelled explicitly. -->
                    <textarea id="user-input" placeholder="Nhập câu hỏi của bạn tại đây..." aria-label="Câu hỏi của bạn" rows="1"></textarea>
                    <!-- Icon-only button: title/aria-label give it a name for tooltips and screen readers. -->
                    <button type="button" id="send-btn" title="Gửi câu hỏi" aria-label="Gửi câu hỏi"><i class="fas fa-paper-plane" aria-hidden="true"></i></button>
                </div>
                <p class="disclaimer">AI có thể đưa ra câu trả lời chưa chính xác. Vui lòng kiểm tra lại nguồn trích dẫn.</p>
            </footer>
        </main>
        <!-- Source Sidebar (Hidden by default) -->
        <section class="source-panel" id="source-panel">
            <div class="panel-header">
                <h3><i class="fas fa-book-open" aria-hidden="true"></i> Nguồn trích dẫn</h3>
                <!-- Icon-only button: title/aria-label give it a name for tooltips and screen readers. -->
                <button type="button" id="close-panel" title="Đóng" aria-label="Đóng bảng nguồn trích dẫn"><i class="fas fa-times" aria-hidden="true"></i></button>
            </div>
            <div class="source-list" id="source-list">
                <!-- Citation sources are rendered here at runtime -->
            </div>
        </section>
    </div>
    <script src="app.js"></script>
 </body>
 </html>
--- a/frontend/style.css
+++ b/frontend/style.css
@@ -0,0 +1,591 @@
 :root {
    --bg-dark: #0f172a;
    --glass-bg: rgba(255, 255, 255, 0.05);
    --glass-border: rgba(255, 255, 255, 0.1);
    --primary: #06b6d4;
    --primary-glow: rgba(6, 182, 212, 0.3);
    --secondary: #8b5cf6;
    --text-main: #f8fafc;
    --text-muted: #94a3b8;
    --ai-bubble: rgba(30, 41, 59, 0.7);
    --user-bubble: linear-gradient(135deg, #06b6d4, #3b82f6);
 }
 * {
    margin: 0;
    padding: 0;
    box-sizing: border-box;
    font-family: 'Outfit', sans-serif;
 }
 body {
    background-color: var(--bg-dark);
    /* Mesh Gradient cực kỳ Premium bằng CSS thuần */
    background-image: 
        radial-gradient(at 0% 0%, rgba(6, 182, 212, 0.15) 0px, transparent 50%),
        radial-gradient(at 100% 0%, rgba(139, 92, 246, 0.15) 0px, transparent 50%),
        radial-gradient(at 100% 100%, rgba(6, 182, 212, 0.1) 0px, transparent 50%),
        radial-gradient(at 0% 100%, rgba(139, 92, 246, 0.1) 0px, transparent 50%);
    background-size: cover;
    height: 100vh;
    display: flex;
    justify-content: center;
    align-items: center;
    color: var(--text-main);
    overflow: hidden;
 }
 .app-container {
    width: 95vw;
    height: 90vh;
    background: var(--glass-bg);
    backdrop-filter: blur(20px);
    -webkit-backdrop-filter: blur(20px);
    border: 1px solid var(--glass-border);
    border-radius: 24px;
    display: flex;
    box-shadow: 0 25px 50px -12px rgba(0, 0, 0, 0.5);
    overflow: hidden;
    position: relative;
 }
 /* Sidebar Styling */
 .sidebar {
    width: 260px;
    background: rgba(15, 23, 42, 0.4);
    border-right: 1px solid var(--glass-border);
    padding: 30px 20px;
    display: flex;
    flex-direction: column;
 }
 .logo-area {
    display: flex;
    align-items: center;
    gap: 12px;
    margin-bottom: 50px;
 }
 .logo-icon {
    width: 40px;
    height: 40px;
    background: var(--user-bubble);
    border-radius: 12px;
    display: flex;
    justify-content: center;
    align-items: center;
    font-size: 20px;
    box-shadow: 0 0 20px var(--primary-glow);
 }
 .logo-area h1 {
    font-size: 20px;
    font-weight: 700;
    letter-spacing: 0.5px;
 }
 .side-nav {
    flex: 1;
 }
 .nav-item {
    padding: 14px 18px;
    border-radius: 12px;
    display: flex;
    align-items: center;
    gap: 15px;
    cursor: pointer;
    transition: 0.3s;
    color: var(--text-muted);
    margin-bottom: 8px;
 }
 .nav-item:hover, .nav-item.active {
    background: rgba(255, 255, 255, 0.08);
    color: var(--text-main);
 }
 .nav-item.active {
    border-left: 4px solid var(--primary);
 }
 .system-status {
    padding-top: 20px;
    border-top: 1px solid var(--glass-border);
    display: flex;
    align-items: center;
    gap: 10px;
    font-size: 13px;
    color: var(--text-muted);
 }
 .status-dot {
    width: 8px;
    height: 8px;
    border-radius: 50%;
 }
 .status-dot.online {
    background: #10b981;
    box-shadow: 0 0 10px #10b981;
 }
 /* Main Chat Area */
 .chat-main {
    flex: 1;
    display: flex;
    flex-direction: column;
    position: relative;
 }
 .chat-header {
    padding: 25px 40px;
    border-bottom: 1px solid var(--glass-border);
    display: flex;
    justify-content: space-between;
    align-items: center;
 }
 .header-info h2 {
    font-size: 18px;
 }
 .header-info p {
    font-size: 13px;
    color: var(--text-muted);
 }
 .header-actions button {
    background: transparent;
    border: none;
    color: var(--text-muted);
    font-size: 18px;
    cursor: pointer;
    transition: 0.3s;
 }
 .header-actions button:hover {
    color: #ef4444;
 }
 .messages-container {
    flex: 1;
    padding: 40px;
    overflow-y: auto;
    display: flex;
    flex-direction: column;
    gap: 25px;
    scrollbar-width: thin;
    scrollbar-color: var(--glass-border) transparent;
 }
 .messages-container::-webkit-scrollbar {
    width: 6px;
 }
 .messages-container::-webkit-scrollbar-thumb {
    background-color: var(--glass-border);
    border-radius: 10px;
 }
 .message {
    display: flex;
    gap: 18px;
    max-width: 85%;
    animation: fadeIn 0.4s ease-out;
 }
@keyframes fadeIn {
    from { opacity: 0; transform: translateY(10px); }
    to { opacity: 1; transform: translateY(0); }
 }
 .message.user {
    align-self: flex-end;
    flex-direction: row-reverse;
 }
 .avatar {
    width: 40px;
    height: 40px;
    border-radius: 12px;
    display: flex;
    justify-content: center;
    align-items: center;
    flex-shrink: 0;
 }
 .ai .avatar { background: rgba(255, 255, 255, 0.1); color: var(--primary); border: 1px solid var(--glass-border); }
 .user .avatar { background: var(--user-bubble); color: white; }
 .content {
    background: var(--ai-bubble);
    padding: 16px 20px;
    border-radius: 18px;
    border-bottom-left-radius: 4px;
    line-height: 1.6;
    font-size: 15px;
    border: 1px solid var(--glass-border);
 }
 .user .content {
    background: var(--user-bubble);
    border-bottom-left-radius: 18px;
    border-bottom-right-radius: 4px;
    border: none;
 }
 .citation-tag {
    display: inline-block;
    margin-top: 10px;
    padding: 4px 10px;
    background: rgba(6, 182, 212, 0.1);
    border: 1px solid rgba(6, 182, 212, 0.2);
    border-radius: 8px;
    font-size: 12px;
    color: var(--primary);
    cursor: pointer;
    transition: 0.3s;
 }
 .citation-tag:hover {
    background: rgba(6, 182, 212, 0.2);
 }
 /* Footer Input Area */
 .chat-footer {
    padding: 30px 40px;
 }
 .input-wrapper {
    background: rgba(255, 255, 255, 0.05);
    border: 1px solid var(--glass-border);
    border-radius: 16px;
    padding: 10px 15px;
    display: flex;
    align-items: center;
    gap: 15px;
    transition: 0.3s;
 }
 .input-wrapper:focus-within {
    border-color: var(--primary);
    box-shadow: 0 0 15px var(--primary-glow);
 }
 textarea {
    flex: 1;
    background: transparent;
    border: none;
    color: var(--text-main);
    resize: none;
    outline: none;
    padding: 10px 5px;
    max-height: 150px;
 }
 #send-btn {
    width: 45px;
    height: 45px;
    background: var(--user-bubble);
    border: none;
    border-radius: 12px;
    color: white;
    cursor: pointer;
    transition: 0.3s;
 }
 #send-btn:hover {
    transform: scale(1.05);
    box-shadow: 0 0 15px var(--primary-glow);
 }
 .disclaimer {
    text-align: center;
    font-size: 11px;
    color: var(--text-muted);
    margin-top: 12px;
 }
 /* Source Panel */
 .source-panel {
    /* Parked just outside the right edge; `.source-panel.active` slides it in. */
    position: absolute;
    right: -350px;
    top: 0;
    width: 350px;
    height: 100%;
    background: rgba(15, 23, 42, 0.95);
    backdrop-filter: blur(25px);
    /* Prefixed twin, matching how .app-container declares its blur. */
    -webkit-backdrop-filter: blur(25px);
    border-left: 1px solid var(--glass-border);
    transition: 0.4s cubic-bezier(0.4, 0, 0.2, 1);
    z-index: 10;
    padding: 30px;
    /* The panel has a fixed height inside a clipped container, so a long
       citation list must scroll inside the panel instead of being cut off. */
    overflow-y: auto;
 }
 .source-panel.active {
    right: 0;
 }
 .panel-header {
    display: flex;
    justify-content: space-between;
    align-items: center;
    margin-bottom: 30px;
 }
 .source-item {
    background: rgba(255, 255, 255, 0.05);
    border: 1px solid var(--glass-border);
    border-radius: 12px;
    padding: 15px;
    margin-bottom: 15px;
 }
 .source-item h4 { font-size: 14px; margin-bottom: 5px; color: var(--primary); }
 .source-item p { font-size: 12px; color: var(--text-muted); line-height: 1.4; }
 /* Login Screen */
 .login-screen {
    position: fixed;
    top: 0;
    left: 0;
    width: 100vw;
    height: 100vh;
    display: flex;
    justify-content: center;
    align-items: center;
    z-index: 100;
 }
 .login-card {
    background: var(--glass-bg);
    backdrop-filter: blur(30px);
    /* Prefixed twin, matching how .app-container declares its blur. */
    -webkit-backdrop-filter: blur(30px);
    border: 1px solid var(--glass-border);
    border-radius: 24px;
    padding: 50px 40px;
    width: 400px;
    /* Keep the card inside the viewport on screens narrower than 400px. */
    max-width: 90vw;
    text-align: center;
    box-shadow: 0 25px 50px -12px rgba(0, 0, 0, 0.5);
 }
 .login-logo {
    margin-bottom: 40px;
 }
 .login-logo .logo-icon {
    width: 60px;
    height: 60px;
    margin: 0 auto 20px;
    font-size: 28px;
 }
 .login-logo h1 {
    font-size: 24px;
    margin-bottom: 8px;
 }
 .login-logo p {
    color: var(--text-muted);
    font-size: 14px;
 }
 .input-group {
    display: flex;
    align-items: center;
    gap: 12px;
    background: rgba(255, 255, 255, 0.05);
    border: 1px solid var(--glass-border);
    border-radius: 12px;
    padding: 14px 18px;
    margin-bottom: 20px;
    transition: 0.3s;
 }
 .input-group:focus-within {
    border-color: var(--primary);
    box-shadow: 0 0 15px var(--primary-glow);
 }
 .input-group i {
    color: var(--text-muted);
    font-size: 16px;
 }
 .input-group input {
    flex: 1;
    background: transparent;
    border: none;
    outline: none;
    color: var(--text-main);
    font-size: 15px;
 }
 .login-btn {
    width: 100%;
    padding: 14px;
    background: var(--user-bubble);
    border: none;
    border-radius: 12px;
    color: white;
    font-size: 16px;
    font-weight: 600;
    cursor: pointer;
    transition: 0.3s;
 }
 .login-btn:hover {
    transform: scale(1.02);
    box-shadow: 0 0 20px var(--primary-glow);
 }
 .login-hint {
    margin-top: 20px;
    font-size: 12px;
    color: var(--text-muted);
 }
 .login-divider {
    display: flex;
    align-items: center;
    gap: 15px;
    margin: 20px 0;
    color: var(--text-muted);
    font-size: 13px;
 }
 .login-divider::before,
 .login-divider::after {
    content: '';
    flex: 1;
    height: 1px;
    background: var(--glass-border);
 }
 .sso-btn {
    width: 100%;
    padding: 14px;
    background: rgba(255, 255, 255, 0.08);
    border: 1px solid var(--glass-border);
    border-radius: 12px;
    color: var(--text-main);
    font-size: 15px;
    font-weight: 600;
    cursor: pointer;
    transition: 0.3s;
    display: flex;
    align-items: center;
    justify-content: center;
    gap: 10px;
 }
 .sso-btn:hover {
    background: rgba(255, 255, 255, 0.15);
    border-color: var(--primary);
 }
 /* User Info */
 .user-info {
    padding-top: 20px;
    border-top: 1px solid var(--glass-border);
    display: flex;
    align-items: center;
    gap: 12px;
    font-size: 13px;
    color: var(--text-muted);
 }
 .user-avatar {
    width: 36px;
    height: 36px;
    background: rgba(255, 255, 255, 0.1);
    border-radius: 10px;
    display: flex;
    align-items: center;
    justify-content: center;
    color: var(--primary);
 }
 .user-details {
    flex: 1;
    display: flex;
    flex-direction: column;
 }
 .user-name {
    color: var(--text-main);
    font-weight: 600;
    font-size: 13px;
 }
 .user-role {
    font-size: 11px;
    text-transform: uppercase;
    letter-spacing: 0.5px;
 }
 #logout-btn {
    background: transparent;
    border: none;
    color: var(--text-muted);
    cursor: pointer;
    padding: 8px;
    border-radius: 8px;
    transition: 0.3s;
 }
 #logout-btn:hover {
    color: #ef4444;
    background: rgba(239, 68, 68, 0.1);
 }
 /* Sync Button */
 .sync-btn {
    width: 100%;
    padding: 12px 18px;
    margin-top: 10px;
    background: rgba(6, 182, 212, 0.1);
    border: 1px solid rgba(6, 182, 212, 0.2);
    border-radius: 12px;
    color: var(--primary);
    font-size: 13px;
    cursor: pointer;
    transition: 0.3s;
    display: flex;
    align-items: center;
    gap: 10px;
 }
 .sync-btn:hover {
    background: rgba(6, 182, 212, 0.2);
 }
 .sync-btn:disabled {
    opacity: 0.5;
    cursor: not-allowed;
 }
 /* Sync Status */
 .sync-status {
    padding: 12px;
    margin-top: 10px;
    background: rgba(139, 92, 246, 0.1);
    border: 1px solid rgba(139, 92, 246, 0.2);
    border-radius: 10px;
    display: flex;
    align-items: center;
    gap: 10px;
    font-size: 12px;
    color: var(--secondary);
 }
 .sync-status.done {
    background: rgba(16, 185, 129, 0.1);
    border-color: rgba(16, 185, 129, 0.2);
    color: #10b981;
 }
 .sync-status.error {
    background: rgba(239, 68, 68, 0.1);
    border-color: rgba(239, 68, 68, 0.2);
    color: #ef4444;
 }
--- a/indexing/vector_store.py
+++ b/indexing/vector_store.py
@@ -1,4 +1,5 @@
 import logging
 import os
 from typing import List
 from opensearchpy import OpenSearch, RequestsHttpConnection
 from core.models import DocumentChunk
@@ -10,9 +11,12 @@ class VectorStore:
    def __init__(self, index_name: str = "sharepoint_docs"):
        self.index_name = index_name
-        # Kết nối tới OpenSearch Cluster
+        host = settings.opensearch_host
        if host == "opensearch" and os.environ.get("ENV") != "docker":
            host = "localhost"
        self.client = OpenSearch(
-            hosts=[{'host': settings.opensearch_host, 'port': settings.opensearch_port}],
+            hosts=[{'host': host, 'port': settings.opensearch_port}],
            http_auth=(settings.opensearch_user, settings.opensearch_pass),
            use_ssl=False,
            verify_certs=False,
@@ -64,6 +68,7 @@ class VectorStore:
                        "page_from": { "type": "integer" },
                        "page_to": { "type": "integer" },
                        "source_url": { "type": "keyword" },
                        "download_url": { "type": "keyword" },
                        "permissions": { "type": "keyword" }
                    }
                }
@@ -71,6 +76,23 @@ class VectorStore:
            self.client.indices.create(index=self.index_name, body=mapping)
            logger.info(f"Đã tạo OpenSearch Index: {self.index_name}")
    def delete_by_file_id(self, file_id: str):
        """Delete all previously indexed chunks of one file before it is re-ingested.

        Runs a delete-by-query on ``self.index_name`` with a ``term`` filter on
        ``file_id``.

        Args:
            file_id: Value matched against the ``file_id`` field of stored chunks.

        Returns:
            The ``deleted`` count reported by the client (0 when the key is
            absent), or 0 when the request raises.
        """
        try:
            result = self.client.delete_by_query(
                index=self.index_name,
                body={"query": {"term": {"file_id": file_id}}},
            )
            removed = result.get("deleted", 0)
            if removed > 0:
                logger.info(f"Đã xóa {removed} chunks cũ của file_id={file_id}")
            return removed
        except Exception as e:
            # Best effort: a failed cleanup (e.g. the index does not exist yet)
            # is logged and reported as "nothing deleted" rather than raised.
            logger.warning(f"Không thể xóa chunks cũ (có thể index chưa tồn tại): {e}")
            return 0
    def embed_and_index(self, chunks: List[DocumentChunk]):
        """Biến đổi Text thành Vector và lưu vào Database"""
        if not chunks:
--- a/ingestion/graph_client.py
+++ b/ingestion/graph_client.py
@@ -132,3 +132,13 @@ class GraphClient:
        else:
            url = f"{self.base_url}/drives/{drive_id}/root/delta"
        return self._make_get_request(url)
    def get_item_details(self, drive_id: str, item_id: str):
        """Fetch a single drive item via ``GET /drives/{driveId}/items/{itemId}``.

        Args:
            drive_id: Identifier of the drive that holds the item.
            item_id: Identifier of the item inside that drive.

        Returns:
            Whatever ``_make_get_request`` returns for the item endpoint; the
            response is expected to carry ``webUrl`` and the download URL.
        """
        endpoint = "{base}/drives/{drive}/items/{item}".format(
            base=self.base_url, drive=drive_id, item=item_id
        )
        return self._make_get_request(endpoint)
    def get_item_permissions(self, drive_id: str, item_id: str):
        """List an item's permissions via ``GET /drives/{driveId}/items/{itemId}/permissions``.

        Args:
            drive_id: Identifier of the drive that holds the item.
            item_id: Identifier of the item inside that drive.

        Returns:
            Whatever ``_make_get_request`` returns for the permissions endpoint.
        """
        endpoint = "{base}/drives/{drive}/items/{item}/permissions".format(
            base=self.base_url, drive=drive_id, item=item_id
        )
        return self._make_get_request(endpoint)
--- a/ingestion/providers/base_provider.py
+++ b/ingestion/providers/base_provider.py
@@ -34,3 +34,29 @@ class BaseStorageProvider(ABC):
            bytes: The raw file content.
        """
        pass
    @abstractmethod
    def get_item_details(self, item_id: str) -> Dict:
        """Return the full details of one item, including its web and download links.

        Concrete providers must override this method.

        Args:
            item_id (str): Item identifier as produced by ``fetch_changes``.

        Returns:
            Dict: The item's details, with its links.
        """
    @abstractmethod
    def get_item_permissions(self, item_id: str) -> List[str]:
        """Return the identifiers of the users/groups allowed to access an item.

        Concrete providers must override this method.

        Args:
            item_id (str): Item identifier as produced by ``fetch_changes``.

        Returns:
            List[str]: User/group emails or IDs. ``["*"]`` means that everyone
            can access the item.
        """
--- a/ingestion/providers/sharepoint_provider.py
+++ b/ingestion/providers/sharepoint_provider.py
@@ -81,6 +81,62 @@ class SharePointProvider(BaseStorageProvider):
        return standardized_items, new_state
    def get_item_details(self, item_id: str) -> Dict:
        """Fetch one item from the Graph client and reduce it to a flat dict.

        Args:
            item_id: Identifier of the item inside ``self.drive_id``.

        Returns:
            Dict with the keys ``id``, ``name``, ``web_url``, ``download_url``,
            ``size`` and ``last_modified``; a key is ``None`` when the Graph
            payload lacks the corresponding property.

        Raises:
            Exception: Any failure of the lookup is logged and re-raised.
        """
        # (output key, property name in the Graph payload), in output order.
        graph_fields = (
            ("id", "id"),
            ("name", "name"),
            ("web_url", "webUrl"),
            ("download_url", "@microsoft.graph.downloadUrl"),
            ("size", "size"),
            ("last_modified", "lastModifiedDateTime"),
        )
        try:
            raw_item = self.graph.get_item_details(self.drive_id, item_id)
            return {target: raw_item.get(source) for target, source in graph_fields}
        except Exception as e:
            logger.error(f"Failed to get item details for {item_id}: {e}")
            raise e
    def get_item_permissions(self, item_id: str) -> List[str]:
        """Collect the principals that were granted access to a drive item.

        Every permission entry returned by the Graph client is inspected for:

        * the user in ``grantedTo`` and in ``grantedToV2``;
        * the users in ``grantedToIdentitiesV2`` (only when ``grantedTo`` is
          empty, as before);
        * the site group in ``grantedToV2``, recorded as ``group:<displayName>``.

        A user is recorded by lower-cased email when one is present, otherwise
        by id. Properties that are missing or JSON ``null`` are treated as
        empty instead of aborting the whole lookup.

        Args:
            item_id: Identifier of the item inside ``self.drive_id``.

        Returns:
            Sorted list of principal identifiers, or ``["*"]`` when none were
            found or the lookup failed.
        """

        def principal_of(identity_set):
            """Return the lower-cased email, else the id, of the set's user, else None."""
            user = (identity_set or {}).get("user") or {}
            email = user.get("email")
            if email:
                return email.lower()
            return user.get("id") or None

        try:
            response = self.graph.get_item_permissions(self.drive_id, item_id)
            permissions = set()
            for perm in response.get("value") or []:
                # `or {}` also covers keys that are present but null, which a
                # plain .get(key, {}) default does not.
                granted = perm.get("grantedTo") or {}
                granted_v2 = perm.get("grantedToV2") or {}
                identity_sets = [granted, granted_v2]
                if not granted:
                    identity_sets.extend(perm.get("grantedToIdentitiesV2") or [])
                for identity_set in identity_sets:
                    principal = principal_of(identity_set)
                    if principal:
                        permissions.add(principal)
                site_group = granted_v2.get("siteGroup") or {}
                if site_group.get("displayName"):
                    permissions.add(f"group:{site_group['displayName']}")
            # Sorted so that the stored ACL does not depend on set ordering.
            return sorted(permissions) if permissions else ["*"]
        except Exception as e:
            # NOTE(review): this fallback is fail-open -- "*" is the marker for
            # "everyone can access". Confirm that is the intended default when
            # the permission lookup itself fails.
            logger.warning(f"Failed to get permissions for {item_id}: {e}. Defaulting to ['*']")
            return ["*"]
    def download_file(self, target_item: Dict) -> bytes:
        """
        Download file content from SharePoint.
--- a/ingestion/sync.py
+++ b/ingestion/sync.py
@@ -3,145 +3,88 @@ import json
 import logging
 from typing import List, Dict, Any
 # Ensure we can import from the root module if run directly
 import sys
 sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-from ingestion.graph_client import GraphClient
+from ingestion.providers.base_provider import BaseStorageProvider
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger("IngestionSync")
 class SharePointSync:
    def __init__(self, hostname: str, site_path: str):
        self.graph_client = GraphClient()
        self.hostname = hostname
        self.site_path = site_path
        self.state_file = "delta_state.json"
        self.output_file = "ingestion_output.json"
-    def _load_delta_link(self) -> str:
+class SyncEngine:
-        """Load delta link from local state file."""
+    """
    Bộ điều phối đồng bộ không phụ thuộc vào nhà cung cấp cụ thể.
    Nhận vào bất kỳ BaseStorageProvider nào (SharePoint, Google Drive, NAS, ...).
    """
    def __init__(self, provider: BaseStorageProvider, state_file: str = "delta_state.json", output_file: str = "ingestion_output.json"):
        self.provider = provider
        self.state_file = state_file
        self.output_file = output_file
    def _load_sync_state(self) -> Dict:
        """Load sync state từ local file."""
        if os.path.exists(self.state_file):
            with open(self.state_file, "r", encoding="utf-8") as f:
-                data = json.load(f)
+                return json.load(f)
-                return data.get("delta_link")
+        return {}
        return None
-    def _save_delta_link(self, delta_link: str):
+    def _save_sync_state(self, state: Dict):
-        """Save delta link to local state file for next incremental sync."""
+        """Save sync state ra local file."""
        with open(self.state_file, "w", encoding="utf-8") as f:
-            json.dump({"delta_link": delta_link}, f, indent=2)
+            json.dump(state, f, indent=2, ensure_ascii=False)
    def _extract_metadata(self, item: Dict[Any, Any], site_id: str, drive_id: str) -> Dict[str, Any]:
        """Convert Graph API item payload to our target schema."""
        download_url = item.get("@microsoft.graph.downloadUrl")
        if not download_url and "folder" not in item and "deleted" not in item:
            try:
                # Delta query might not return downloadUrl, so fetch it directly
                item_id = item.get("id")
                url = f"https://graph.microsoft.com/v1.0/drives/{drive_id}/items/{item_id}"
                full_item = self.graph_client._make_get_request(url)
                download_url = full_item.get("@microsoft.graph.downloadUrl")
            except Exception as e:
                logger.error(f"Failed to fetch download_url for {item.get('name')}: {e}")
        return {
            "site_id": site_id,
            "drive_id": drive_id,
            "item_id": item.get("id"),
            "name": item.get("name"),
            "web_url": item.get("webUrl"),
            "download_url": download_url,
            "mime_type": item.get("file", {}).get("mimeType") if "file" in item else None,
            "parent_path": item.get("parentReference", {}).get("path"),
            "is_folder": "folder" in item,
            "size": item.get("size"),
            "last_modified": item.get("lastModifiedDateTime"),
            "created": item.get("createdDateTime"),
            "eTag": item.get("eTag"),
            "cTag": item.get("cTag"),
            "deleted": "deleted" in item
        }
    def _upsert_to_local_db(self, new_items: List[Dict[str, Any]]):
-        """Simulate upsert into a database by writing to a JSON file."""
+        """Lưu kết quả vào local JSON (mô phỏng DB)."""
        db = {}
        if os.path.exists(self.output_file):
            with open(self.output_file, "r", encoding="utf-8") as f:
                try:
                    existing = json.load(f)
                    for item in existing:
-                        db[item["item_id"]] = item
+                        db[item["id"]] = item
                except json.JSONDecodeError:
                    pass
        for item in new_items:
-            if item.get("deleted"):
+            item_id = item.get("id")
-                # If deleted, we mark it as deleted in our db (or we could remove it)
+            if item.get("is_deleted"):
-                if item["item_id"] in db:
+                if item_id in db:
-                    db[item["item_id"]]["deleted"] = True
+                    db[item_id]["is_deleted"] = True
                else:
-                    # It's deleted but we didn't have it anyway
+                    db[item_id] = item
                    db[item["item_id"]] = item
            else:
-                db[item["item_id"]] = item
+                db[item_id] = item
        final_list = list(db.values())
        with open(self.output_file, "w", encoding="utf-8") as f:
            json.dump(final_list, f, indent=2, ensure_ascii=False)
-        logger.info(f"Local database updated. Total items currently stored: {len(final_list)}")
+        logger.info(f"Local database updated. Total items: {len(final_list)}")
    def run_sync(self):
-        logger.info("=== STARTING SHAREPOINT SYNC ===")
+        """Chạy đồng bộ: fetch changes từ provider -> lưu local."""
        logger.info("=== STARTING SYNC ===")
-        # 1. & 2. Resolve Site and Drive
+        sync_state = self._load_sync_state()
-        logger.info(f"Resolving site: {self.hostname}:{self.site_path}")
+        if sync_state:
-        site_info = self.graph_client.get_site_by_path(self.hostname, self.site_path)
+            logger.info("Found existing sync state. Performing INCREMENTAL sync.")
        site_id = site_info["id"]
        logger.info(f"Resolving drive for site: {site_id}")
        drive_info = self.graph_client.get_drive(site_id)
        drive_id = drive_info["id"]
        # 3. Delta Query setup
        delta_link = self._load_delta_link()
        if delta_link:
            logger.info("Found existing delta_link. Performing INCREMENTAL sync.")
        else:
-            logger.info("No delta_link found. Performing FULL sync.")
+            logger.info("No sync state found. Performing FULL sync.")
-        items_collected = []
+        items, new_state = self.provider.fetch_changes(sync_state)
-        current_url = delta_link
+        logger.info(f"Provider returned {len(items)} change(s).")
-        # Loop over pagination
+        if items:
-        while True:
+            self._upsert_to_local_db(items)
            response = self.graph_client.delta_query(drive_id, current_url)
            values = response.get("value", [])
            items_collected.extend(values)
-            if "@odata.nextLink" in response:
+        self._save_sync_state(new_state)
-                current_url = response["@odata.nextLink"]
+        logger.info("Sync state saved.")
                logger.info("Fetching next page of delta results...")
            elif "@odata.deltaLink" in response:
                new_delta_link = response["@odata.deltaLink"]
                self._save_delta_link(new_delta_link)
                logger.info("Reached end of delta changes. Saved new delta_link.")
                break
            else:
                logger.warning("No nextLink or deltaLink found in response! Breaking loop.")
                break
        logger.info(f"Delta query returned {len(items_collected)} change(s).")
        # 4. Extract metadata and save
        if items_collected:
            processed_items = [self._extract_metadata(item, site_id, drive_id) for item in items_collected]
            self._upsert_to_local_db(processed_items)
        else:
            logger.info("No items to process.")
 if __name__ == "__main__":
-    sync = SharePointSync("285pdg.sharepoint.com", "/sites/poc_system")
+    from ingestion.providers.sharepoint_provider import SharePointProvider
-    sync.run_sync()
+
    provider = SharePointProvider()
    engine = SyncEngine(provider)
    engine.run_sync()
--- a/search/retriever.py
+++ b/search/retriever.py
@@ -1,40 +1,49 @@
 import logging
-from typing import List
+import os
 from typing import List, Optional
 from opensearchpy import OpenSearch, RequestsHttpConnection
 from core.config import settings
 from core.models import DocumentChunk
 logger = logging.getLogger("Retriever")
 class SearchRetriever:
    def __init__(self, index_name: str = "poc_sharepoint_docs"):
        self.index_name = index_name
-        # Kết nối OpenSearch
+        host = settings.opensearch_host
        if host == "opensearch" and os.environ.get("ENV") != "docker":
            host = "localhost"
        self.client = OpenSearch(
-            hosts=[{'host': settings.opensearch_host, 'port': settings.opensearch_port}],
+            hosts=[{'host': host, 'port': settings.opensearch_port}],
            http_auth=(settings.opensearch_user, settings.opensearch_pass),
            use_ssl=False,
            verify_certs=False,
            connection_class=RequestsHttpConnection
        )
-        # Load Local Embedding Model (để biến câu hỏi thành vector cùng không gian với dữ liệu)
+        logger.info("Loading Embedding Model for Retriever...")
        logger.info("Đang nạp Embedding Model cho Retriever...")
        from sentence_transformers import SentenceTransformer
        self.embedder = SentenceTransformer('keepitreal/vietnamese-sbert')
-    def retrieve(self, query: str, top_k: int = 5) -> List[DocumentChunk]:
+    def retrieve(self, query: str, top_k: int = 5, user_email: Optional[str] = None, is_admin: bool = False) -> List[DocumentChunk]:
        """
-        Tìm kiếm ngữ nghĩa (Semantic Search) dựa trên Vector k-NN
+        Tìm kiếm ngữ nghĩa với ACL filtering.
-        """
+        
-        logger.info(f"Đang tìm kiếm ngữ nghĩa cho câu hỏi: '{query}'")
+        Args:
            query: Câu hỏi của user
            top_k: Số kết quả tối đa
            user_email: Email user để filter quyền.
            is_admin: True = bypass ACL, thấy tất cả.
        """
        logger.info(f"Search: '{query[:80]}' (user={user_email or 'none'}, admin={is_admin})")
        # 1. Chuyển câu hỏi thành Vector
        query_vector = self.embedder.encode(query).tolist()
-        # 2. Xây dựng k-NN Query cho OpenSearch
+        # Admin hoặc không có user_email → không filter
-        # Ta có thể kết hợp Hybrid Search (Vector + Text) ở đây nếu muốn
+        if is_admin or not user_email:
            search_query = {
                "size": top_k,
                "query": {
@@ -46,6 +55,30 @@ class SearchRetriever:
                    }
                }
            }
        else:
            # User thường → filter theo permissions
            search_query = {
                "size": top_k,
                "query": {
                    "bool": {
                        "must": [
                            {
                                "knn": {
                                    "embedding": {
                                        "vector": query_vector,
                                        "k": top_k * 2
                                    }
                                }
                            }
                        ],
                        "should": [
                            {"term": {"permissions": "*"}},
                            {"term": {"permissions": user_email.lower()}}
                        ],
                        "minimum_should_match": 1
                    }
                }
            }
        try:
            response = self.client.search(
@@ -58,13 +91,12 @@ class SearchRetriever:
            for hit in hits:
                source = hit["_source"]
                # Chuyển từ JSON sang DocumentChunk model
                chunk = DocumentChunk(**source)
                results.append(chunk)
-            logger.info(f"Tìm thấy {len(results)} đoạn văn phù hợp nhất.")
+            logger.info(f"Found {len(results)} chunks")
            return results
        except Exception as e:
-            logger.error(f"Lỗi khi truy vấn OpenSearch: {e}")
+            logger.error(f"OpenSearch query error: {e}")
            return []
--- a/test_dce_pipeline.py
+++ b/test_dce_pipeline.py
@@ -7,6 +7,7 @@ sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
 from core.models import IngestedDocument
 from extraction.dce import DocumentClassificationEngine
 from ingestion.providers.sharepoint_provider import SharePointProvider
 logging.basicConfig(level=logging.INFO, format="%(levelname)s:%(name)s:%(message)s")
@@ -20,19 +21,21 @@ def main():
    with open("ingestion_output.json", "r", encoding="utf-8") as f:
        items = json.load(f)
-    dce = DocumentClassificationEngine()
+    # Khởi tạo provider để download file qua Graph API auth
    provider = SharePointProvider()
    dce = DocumentClassificationEngine(provider=provider)
    print(f"Loaded {len(items)} items from ingestion_output.json\n")
    for item in items:
        if item.get("is_folder"):
-            continue # DCE only processes files
+            continue
        doc = IngestedDocument(**item)
        print(f"\n--- Processing: {doc.name} ---")
-        result = dce.classify(doc)
+        result = dce.classify(doc, target_item=item)
-        print(f">> Policy: {result.processing_policy.value} | Reason: {result.reason}")
+        print(f">> Type: {result.doc_type.value} | Policy: {result.processing_policy.value} | Reason: {result.reason}")
 if __name__ == "__main__":
    main()
--- a/test_rag_pipeline.py
+++ b/test_rag_pipeline.py
@@ -2,77 +2,187 @@ import logging
 import sys
 from core.config import settings
 from core.models import IngestedDocument, ProcessingPolicy
 from ingestion.providers.sharepoint_provider import SharePointProvider
 from extraction.dce import DocumentClassificationEngine
 from extraction.ocr_service import OCRService
 from extraction.text_extractor import TextExtractor
 from chunking.markdown_chunker import MarkdownChunker
 from indexing.vector_store import VectorStore
 logging.basicConfig(level=logging.INFO, format="%(levelname)s:%(name)s:%(message)s")
 logger = logging.getLogger("RAGPipeline")
 def run_pipeline():
    logger.info("=== BẮT ĐẦU TEST TOÀN BỘ ĐƯỜNG ỐNG RAG ===")
-    # Ép buộc dùng localhost cho OpenSearch khi chạy trực tiếp trên WSL
+def extract_text_from_pdf_bytes(pdf_bytes: bytes) -> str:
    """Trích xuất text trực tiếp từ PDF có text layer (không cần OCR)."""
    try:
        import fitz
        doc = fitz.open(stream=pdf_bytes, filetype="pdf")
        texts = []
        for page in doc:
            texts.append(page.get_text())
        return "\n\n".join(texts)
    except Exception as e:
        logger.error(f"Failed to extract text from PDF: {e}")
        return ""
 def run_pipeline():
    logger.info("=== BẮT ĐẦU TEST TOÀN BỘ ĐƯỜNG ỐNG RAG (với DCE) ===")
    if settings.opensearch_host == "opensearch":
        settings.opensearch_host = "localhost"
-    # 1. Tầng Ingestion
+    # 1. INGESTION
    logger.info("\n--- BƯỚC 1: Lấy file từ SharePoint ---")
    provider = SharePointProvider()
    items, _ = provider.fetch_changes({})
-    target_item = None
+    if not items:
-    for item in items:
+        logger.error("Không có file nào trên SharePoint!")
        if item.get("name", "").lower().endswith(".pdf"):
            target_item = item
            break
    if not target_item:
        logger.error("Không tìm thấy file PDF nào trên SharePoint để test!")
        sys.exit(1)
-    logger.info(f"Đã chọn file: {target_item['name']}. Đang tải...")
+    logger.info(f"Đã lấy {len(items)} items từ SharePoint.")
    pdf_bytes = provider.download_file(target_item)
    logger.info(f"Tải thành công {len(pdf_bytes)} bytes.")
-    # 2. Tầng Extraction (VLM)
+    # 2. DCE + PROCESSING
-    logger.info("\n--- BƯỚC 2: OCR / VLM Trích xuất Markdown ---")
+    dce = DocumentClassificationEngine(provider=provider)
    ocr = OCRService()
    pages = ocr.process_pdf_bytes(pdf_bytes)
    if not pages:
        logger.error("VLM không trích xuất được nội dung nào!")
        sys.exit(1)
    logger.info(f"VLM đã trích xuất thành công {len(pages)} trang.")
    # 3. Tầng Chunking
    logger.info("\n--- BƯỚC 3: Băm nhỏ văn bản (Semantic Chunking) ---")
    chunker = MarkdownChunker(max_chunk_size=1000, overlap=100)
    # Tạo metadata giả lập để lưu vào Chunk
    metadata = {
        "item_id": target_item["id"],
        "name": target_item["name"],
        "web_url": "https://285pdg.sharepoint.com/...",
        "site_id": settings.sharepoint_site_id
    }
    chunks = chunker.chunk_document(pages, metadata)
    logger.info(f"Đã băm thành {len(chunks)} chunks độc lập.")
    if chunks:
        logger.info(f"Ví dụ Chunk đầu tiên:\n[ID: {chunks[0].chunk_id}] {chunks[0].text[:150]}...")
    # 4. Tầng Vector Database (OpenSearch)
    logger.info("\n--- BƯỚC 4: Mã hóa Vector & Indexing ---")
    try:
        vector_db = VectorStore(index_name="poc_sharepoint_docs")
        vector_db.embed_and_index(chunks)
        logger.info("🎉 CHÚC MỪNG! DỮ LIỆU ĐÃ NẰM TRONG OPENSEARCH SẴN SÀNG ĐỂ CHAT!")
    except Exception as e:
-        logger.error(f"LỖI trong quá trình Embedding / Indexing: {e}")
+        logger.error(f"Không kết nối được OpenSearch: {e}")
-        logger.warning("Gợi ý: Hãy chắc chắn Docker OpenSearch đang chạy trên cổng 9200!")
+        sys.exit(1)
    processed_count = 0
    skipped_count = 0
    for item in items:
        if item.get("is_folder") or item.get("is_deleted"):
            continue
        name = item.get("name", "")
        item_id = item.get("id", "")
        # Tạo IngestedDocument cho DCE
        item_details = provider.get_item_details(item_id)
        permissions = provider.get_item_permissions(item_id)
        doc = IngestedDocument(
            site_id=settings.sharepoint_site_id,
            drive_id="",
            item_id=item_id,
            name=name,
            web_url=item_details.get("web_url", ""),
            download_url=item_details.get("download_url"),
            is_folder=False,
            size=item.get("size", 0),
        )
        # DCE PHÂN LOẠI
        logger.info(f"\n--- DCE: {name} ---")
        classification = dce.classify(doc, target_item=item)
        logger.info(f"   → {classification.doc_type.value} | {classification.processing_policy.value} | {classification.reason}")
        # XỬ LÝ THEO POLICY
        if classification.processing_policy == ProcessingPolicy.UNSUPPORTED:
            logger.info(f"   ⏭ BỎ QUA: {name} (unsupported)")
            skipped_count += 1
            continue
        if classification.processing_policy == ProcessingPolicy.METADATA_ONLY:
            logger.info(f"   ⏭ BỎ QUA: {name} (metadata-only, không index text)")
            skipped_count += 1
            continue
        if classification.processing_policy == ProcessingPolicy.REQUIRES_REVIEW:
            logger.info(f"   ⏭ BỎ QUA: {name} (cần review thủ công)")
            skipped_count += 1
            continue
        # DOWNLOAD FILE
        logger.info(f"   📥 Đang tải {name}...")
        try:
            file_bytes = provider.download_file(item)
        except Exception as e:
            logger.error(f"   ❌ Lỗi tải {name}: {e}")
            skipped_count += 1
            continue
        if not file_bytes:
            logger.error(f"   ❌ File rỗng: {name}")
            skipped_count += 1
            continue
        # EXTRACTION
        pages = []
        ext = name.lower().rsplit(".", 1)[-1] if "." in name else ""
        if classification.processing_policy == ProcessingPolicy.SKIP_OCR:
            if ext == "pdf":
                # TEXT_PDF: trích xuất text trực tiếp, không OCR
                logger.info(f"   📄 TEXT_PDF: Trích xuất text trực tiếp (không OCR)...")
                text = extract_text_from_pdf_bytes(file_bytes)
                if text.strip():
                    from core.models import OCRPageResult
                    pages = [OCRPageResult(page=1, text=text, confidence=1.0)]
                else:
                    logger.warning(f"   ⚠️ Không trích xuất được text từ {name}")
            elif ext in ("docx", "doc"):
                logger.info(f"   📄 DOCX: Trích xuất text bằng python-docx...")
                pages = TextExtractor.extract_from_docx(file_bytes)
            elif ext in ("xlsx", "xls"):
                logger.info(f"   📄 XLSX: Trích xuất dữ liệu bằng openpyxl...")
                pages = TextExtractor.extract_from_xlsx(file_bytes)
            elif ext in ("txt", "md", "csv"):
                logger.info(f"   📄 {ext.upper()}: Đọc text trực tiếp...")
                pages = TextExtractor.extract_from_text(file_bytes)
            else:
                logger.info(f"   📄 {classification.doc_type.value}: Chưa hỗ trợ extract text, bỏ qua.")
                skipped_count += 1
                continue
        elif classification.processing_policy == ProcessingPolicy.REQUIRES_OCR:
            # SCAN_PDF: dùng VLM OCR
            logger.info(f"   👁️ SCAN_PDF: Đang OCR qua VLM...")
            pages = ocr.process_pdf_bytes(file_bytes)
        if not pages:
            logger.warning(f"   ⚠️ Không có nội dung để index: {name}")
            skipped_count += 1
            continue
        # CHUNKING
        logger.info(f"   ✂️ Đang chunk ({len(pages)} trang)...")
        metadata = {
            "item_id": item_id,
            "name": name,
            "web_url": item_details.get("web_url"),
            "download_url": item_details.get("download_url"),
            "site_id": settings.sharepoint_site_id,
            "permissions": permissions
        }
        chunks = chunker.chunk_document(pages, metadata)
        if not chunks:
            logger.warning(f"   ⚠️ Không có chunks: {name}")
            skipped_count += 1
            continue
        # INDEXING
        logger.info(f"   📦 Đang index {len(chunks)} chunks vào OpenSearch...")
        vector_db.delete_by_file_id(item_id)
        vector_db.embed_and_index(chunks)
        processed_count += 1
        logger.info(f"   ✅ HOÀN TẤT: {name} → {len(chunks)} chunks")
    # SUMMARY
    logger.info("\n" + "=" * 60)
    logger.info(f"📊 TỔNG KẾT: {processed_count} file đã xử lý, {skipped_count} file bỏ qua")
    logger.info("=" * 60)
 if __name__ == "__main__":
    run_pipeline()