diff --git a/api/main.py b/api/main.py
index 8fadce4..306e3b7 100644
--- a/api/main.py
+++ b/api/main.py
@@ -1,17 +1,29 @@
 import logging
 import sys
 import os
+import secrets
 from enum import Enum
 from typing import List, Optional, Dict, Any
-from fastapi import FastAPI, HTTPException, BackgroundTasks, status
+from fastapi import FastAPI, HTTPException, BackgroundTasks, Request, status
+from fastapi.responses import RedirectResponse
+from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel, Field, validator
 import uvicorn
+import msal
 
 # Đảm bảo đường dẫn module
 sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 
 from chat.rag_engine import RAGEngine
 from core.config import settings
+from core.models import IngestedDocument, ProcessingPolicy
+from ingestion.providers.sharepoint_provider import SharePointProvider
+from ingestion.sync import SyncEngine
+from extraction.dce import DocumentClassificationEngine
+from extraction.ocr_service import OCRService
+from extraction.text_extractor import TextExtractor
+from chunking.markdown_chunker import MarkdownChunker
+from indexing.vector_store import VectorStore
 
 # --- Cấu hình Logging chuyên nghiệp ---
 logging.basicConfig(
@@ -29,8 +41,30 @@ app = FastAPI(
     redoc_url="/redoc"
 )
 
+# Thêm cấu hình CORS để Frontend có thể gọi API
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"], # Cho phép tất cả nguồn (hợp lý cho bản PoC)
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+
 # --- Singleton Engine Instance ---
 rag_engine = None
+sync_status = {"running": False, "last_run": None, "processed": 0, "skipped": 0, "errors": []}
+
+# --- Azure AD SSO Config ---
+REDIRECT_URI = "http://localhost:8000/auth/callback"
+AUTHORITY = f"https://login.microsoftonline.com/{settings.tenant_id}"
+SCOPE = ["User.Read"]
+
+def _build_msal_app():
+    return msal.ConfidentialClientApplication(
+        settings.client_id,
+        authority=AUTHORITY,
+        client_credential=settings.client_secret,
+    )
 
 @app.on_event("startup")
 async def startup_event():
@@ -89,6 +123,18 @@ class ChatResponse(BaseModel):
     sources: List[SourceCitation] = Field(default_factory=list, description="Danh sách các nguồn trích dẫn từ tài liệu")
     context_used: Optional[str] = Field(None, description="Ngữ cảnh thực tế đã được trích xuất từ VectorDB (Dùng cho Debug/UI)")
 
+class SyncResponse(BaseModel):
+    status: str
+    message: str
+
+class LoginRequest(BaseModel):
+    email: str = Field(..., description="Email người dùng")
+
+class LoginResponse(BaseModel):
+    email: str
+    display_name: str
+    role: str
+
 # --- ENDPOINTS ---
 
 @app.get("/health", tags=["System"])
@@ -103,11 +149,82 @@ async def health_check():
         }
     }
 
+@app.get("/auth/login", tags=["Auth"])
+async def sso_login():
+    """
+    Redirect sang Azure AD login page.
+    Dùng chung App Registration với SharePoint ingestion.
+    """
+    msal_app = _build_msal_app()
+    auth_url = msal_app.get_authorization_request_url(
+        SCOPE,
+        redirect_uri=REDIRECT_URI,
+        state=secrets.token_hex(16)
+    )
+    return RedirectResponse(url=auth_url)
+
+@app.get("/auth/callback", tags=["Auth"])
+async def sso_callback(request: Request):
+    """
+    Azure AD redirect về đây với authorization code.
+    Đổi code lấy token, lấy thông tin user, redirect về frontend.
+    """
+    code = request.query_params.get("code")
+    if not code:
+        raise HTTPException(status_code=400, detail="Missing authorization code")
+    
+    msal_app = _build_msal_app()
+    result = msal_app.acquire_token_by_authorization_code(
+        code,
+        scopes=SCOPE,
+        redirect_uri=REDIRECT_URI
+    )
+    
+    if "error" in result:
+        logger.error(f"SSO error: {result.get('error_description', result.get('error'))}")
+        raise HTTPException(status_code=401, detail="Authentication failed")
+    
+    # Lấy thông tin user từ token
+    id_token_claims = result.get("id_token_claims", {})
+    email = id_token_claims.get("preferred_username", id_token_claims.get("email", ""))
+    name = id_token_claims.get("name", email.split("@")[0])
+    oid = id_token_claims.get("oid", "")
+    
+    # Xác định role
+    role = "admin" if "admin" in email.lower() else "user"
+    
+    logger.info(f"SSO login: {email} (role={role})")
+    
+    # Redirect về frontend với user info
+    import json
+    import urllib.parse
+    user_data = json.dumps({"email": email, "display_name": name, "role": role})
+    encoded = urllib.parse.quote(user_data)
+    return RedirectResponse(url=f"http://localhost:8000?user={encoded}")
+
+@app.post("/auth/login-email", response_model=LoginResponse, tags=["Auth"])
+async def login_email_endpoint(request: LoginRequest):
+    """
+    Đăng nhập bằng email (fallback khi không dùng SSO).
+    """
+    email = request.email.strip().lower()
+    if not email or "@" not in email:
+        raise HTTPException(status_code=400, detail="Email không hợp lệ.")
+    
+    local_part = email.split("@")[0]
+    display_name = local_part.replace(".", " ").title()
+    role = "admin" if "admin" in email else "user"
+    
+    logger.info(f"Email login: {email} (role={role})")
+    
+    return LoginResponse(email=email, display_name=display_name, role=role)
+
 @app.post("/chat", response_model=ChatResponse, tags=["RAG"], status_code=status.HTTP_200_OK)
-async def chat_endpoint(request: ChatRequest):
+async def chat_endpoint(request: ChatRequest, http_request: Request):
     """
     Điểm cuối xử lý hội thoại RAG.
-    Hệ thống sẽ tự động trích xuất ngữ cảnh từ OpenSearch và sử dụng Provider đã cấu hình để trả lời.
+    Header 'X-User-Email' (optional): Email user để filter quyền.
+    Header 'X-User-Role' (optional): "admin" = bypass ACL.
     """
     if not rag_engine:
         raise HTTPException(
@@ -116,11 +233,14 @@ async def chat_endpoint(request: ChatRequest):
         )
     
     try:
-        # Chuyển đổi ChatHistoryItem sang format dict cho RAGEngine
+        user_email = http_request.headers.get("X-User-Email")
+        user_role = http_request.headers.get("X-User-Role", "user")
+        is_admin = user_role == "admin" or not user_email
+        
         history_data = [item.dict() for item in request.history]
         
-        logger.info(f"Xử lý truy vấn: {request.query[:50]}...")
-        result = rag_engine.chat(request.query, history=history_data)
+        logger.info(f"Chat query: {request.query[:50]} (user={user_email or 'none'}, role={user_role})")
+        result = rag_engine.chat(request.query, history=history_data, user_email=user_email, is_admin=is_admin)
         
         return ChatResponse(
             answer=result["answer"],
@@ -134,5 +254,136 @@ async def chat_endpoint(request: ChatRequest):
             detail="Đã xảy ra lỗi nội bộ trong quá trình xử lý ngôn ngữ."
         )
 
+
+def extract_text_from_pdf_bytes(pdf_bytes: bytes) -> str:
+    """Trích xuất text trực tiếp từ PDF có text layer."""
+    try:
+        import fitz
+        doc = fitz.open(stream=pdf_bytes, filetype="pdf")
+        return "\n\n".join(page.get_text() for page in doc)
+    except Exception:
+        return ""
+
+
+def run_sync_background():
+    """Chạy đồng bộ SharePoint → DCE → OCR/Extract → Chunk → Index."""
+    global sync_status
+    sync_status = {"running": True, "last_run": None, "processed": 0, "skipped": 0, "errors": []}
+    
+    try:
+        provider = SharePointProvider()
+        dce = DocumentClassificationEngine(provider=provider)
+        ocr = OCRService()
+        chunker = MarkdownChunker(max_chunk_size=1000, overlap=100)
+        vector_db = VectorStore(index_name="poc_sharepoint_docs")
+        
+        items, _ = provider.fetch_changes({})
+        logger.info(f"Sync: Found {len(items)} items from SharePoint")
+        
+        for item in items:
+            if item.get("is_folder") or item.get("is_deleted"):
+                continue
+            
+            name = item.get("name", "")
+            item_id = item.get("id", "")
+            
+            item_details = provider.get_item_details(item_id)
+            permissions = provider.get_item_permissions(item_id)
+            doc = IngestedDocument(
+                site_id=settings.sharepoint_site_id,
+                drive_id="",
+                item_id=item_id,
+                name=name,
+                web_url=item_details.get("web_url", ""),
+                download_url=item_details.get("download_url"),
+                is_folder=False,
+                size=item.get("size", 0),
+            )
+            
+            classification = dce.classify(doc, target_item=item)
+            
+            if classification.processing_policy in (ProcessingPolicy.UNSUPPORTED, ProcessingPolicy.METADATA_ONLY, ProcessingPolicy.REQUIRES_REVIEW):
+                sync_status["skipped"] += 1
+                continue
+            
+            try:
+                file_bytes = provider.download_file(item)
+            except Exception as e:
+                sync_status["errors"].append(f"{name}: download failed")
+                continue
+            
+            if not file_bytes:
+                sync_status["errors"].append(f"{name}: empty file")
+                continue
+            
+            pages = []
+            ext = name.lower().rsplit(".", 1)[-1] if "." in name else ""
+            
+            if classification.processing_policy == ProcessingPolicy.SKIP_OCR:
+                if ext == "pdf":
+                    text = extract_text_from_pdf_bytes(file_bytes)
+                    if text.strip():
+                        from core.models import OCRPageResult
+                        pages = [OCRPageResult(page=1, text=text, confidence=1.0)]
+                elif ext in ("docx", "doc"):
+                    pages = TextExtractor.extract_from_docx(file_bytes)
+                elif ext in ("xlsx", "xls"):
+                    pages = TextExtractor.extract_from_xlsx(file_bytes)
+                elif ext in ("txt", "md", "csv"):
+                    pages = TextExtractor.extract_from_text(file_bytes)
+            elif classification.processing_policy == ProcessingPolicy.REQUIRES_OCR:
+                pages = ocr.process_pdf_bytes(file_bytes)
+            
+            if not pages:
+                sync_status["skipped"] += 1
+                continue
+            
+            metadata = {
+                "item_id": item_id,
+                "name": name,
+                "web_url": item_details.get("web_url"),
+                "download_url": item_details.get("download_url"),
+                "site_id": settings.sharepoint_site_id,
+                "permissions": permissions
+            }
+            chunks = chunker.chunk_document(pages, metadata)
+            
+            if chunks:
+                vector_db.delete_by_file_id(item_id)
+                vector_db.embed_and_index(chunks)
+                sync_status["processed"] += 1
+                logger.info(f"Sync: Indexed {name} → {len(chunks)} chunks")
+            else:
+                sync_status["skipped"] += 1
+        
+        sync_status["last_run"] = "completed"
+        logger.info(f"Sync completed: {sync_status['processed']} processed, {sync_status['skipped']} skipped")
+        
+    except Exception as e:
+        sync_status["last_run"] = "failed"
+        sync_status["errors"].append(str(e))
+        logger.error(f"Sync failed: {e}")
+    finally:
+        sync_status["running"] = False
+
+
+@app.post("/sync", response_model=SyncResponse, tags=["Ingestion"])
+async def sync_endpoint(background_tasks: BackgroundTasks):
+    """
+    Trigger đồng bộ dữ liệu từ SharePoint.
+    Chạy trong background, trả về trạng thái ngay lập tức.
+    """
+    if sync_status["running"]:
+        return SyncResponse(status="already_running", message="Đồng bộ đang chạy, vui lòng đợi.")
+    
+    background_tasks.add_task(run_sync_background)
+    return SyncResponse(status="started", message="Đồng bộ đã bắt đầu trong background.")
+
+
+@app.get("/sync/status", tags=["Ingestion"])
+async def sync_status_endpoint():
+    """Kiểm tra trạng thái đồng bộ."""
+    return sync_status
+
 if __name__ == "__main__":
     uvicorn.run("main:app", host="0.0.0.0", port=8000, reload=True)
diff --git a/chat/rag_engine.py b/chat/rag_engine.py
index 5171498..dff20c1 100644
--- a/chat/rag_engine.py
+++ b/chat/rag_engine.py
@@ -5,37 +5,46 @@ from .llm_factory import LLMFactory
 
 logger = logging.getLogger("RAGEngine")
 
+
 class RAGEngine:
     def __init__(self):
         self.retriever = SearchRetriever()
         self.llm = LLMFactory.get_provider()
-        logger.info(f"RAG Engine đã sẵn sàng với LLM Provider: {type(self.llm).__name__}")
+        logger.info(f"RAG Engine ready with LLM Provider: {type(self.llm).__name__}")
 
-    def chat(self, user_query: str, history: List[Dict[str, str]] = None) -> Dict:
+    def chat(self, user_query: str, history: List[Dict[str, str]] = None, user_email: str = None, is_admin: bool = False) -> Dict:
         """
-        Quy trình RAG hoàn chỉnh: Search -> Augment -> Generate
+        Quy trình RAG: Search -> Augment -> Generate
+        
+        Args:
+            user_query: Câu hỏi
+            history: Lịch sử chat
+            user_email: Email user để filter quyền
+            is_admin: True = bypass ACL
         """
-        # 1. RETRIEVAL: Tìm kiếm ngữ cảnh liên quan
-        relevant_chunks = self.retriever.retrieve(user_query, top_k=5)
+        logger.info(f"Search query: {user_query[:100]} (user={user_email or 'none'}, admin={is_admin})")
+        relevant_chunks = self.retriever.retrieve(user_query, top_k=5, user_email=user_email, is_admin=is_admin)
         
         if not relevant_chunks:
             context_text = "Không tìm thấy thông tin liên quan trong cơ sở dữ liệu nội bộ."
+            logger.info("Search result: 0 chunks found")
         else:
-            # Gộp text từ các chunks lại thành 1 khối context
             context_text = "\n---\n".join([
                 f"[Nguồn: {c.file_name}, Trang: {c.page_from}]\nNội dung: {c.text}" 
                 for c in relevant_chunks
             ])
+            logger.info(f"Search result: {len(relevant_chunks)} chunks from {len(set(c.file_name for c in relevant_chunks))} files")
 
-        # 2. GENERATION: Gửi sang LLM để trả lời
-        logger.info("Đang yêu cầu LLM tổng hợp câu trả lời...")
+        # 2. GENERATION
+        logger.info("Requesting LLM to generate answer...")
         answer = self.llm.generate_response(
             prompt=user_query,
             context=context_text,
             history=history
         )
+        logger.info(f"LLM response length: {len(answer)} chars")
 
-        # 3. Trả về kết quả kèm theo nguồn trích dẫn (Citations)
+        # 3. Return with citations
         return {
             "answer": answer,
             "context_used": context_text,
@@ -43,7 +52,8 @@ class RAGEngine:
                 {
                     "file_name": c.file_name,
                     "page": c.page_from,
-                    "url": c.source_url
+                    "url": c.source_url,
+                    "download_url": c.download_url
                 } for c in relevant_chunks
             ]
         }
diff --git a/chunking/markdown_chunker.py b/chunking/markdown_chunker.py
index 6fa3912..69a861a 100644
--- a/chunking/markdown_chunker.py
+++ b/chunking/markdown_chunker.py
@@ -82,8 +82,9 @@ class MarkdownChunker:
                     page_from=p_from,
                     page_to=p_to,
                     source_url=metadata.get("web_url", ""),
+                    download_url=metadata.get("download_url", ""),
                     site_id=metadata.get("site_id", ""),
-                    permissions=["*"] # TODO: Sẽ gán quyền thật từ SharePoint
+                    permissions=metadata.get("permissions", ["*"])
                 ))
                 
                 # Bắt đầu chunk mới với một chút Overlap (chối gối)
@@ -109,8 +110,9 @@ class MarkdownChunker:
                 page_from=p_from,
                 page_to=p_to,
                 source_url=metadata.get("web_url", ""),
+                download_url=metadata.get("download_url", ""),
                 site_id=metadata.get("site_id", ""),
-                permissions=["*"]
+                permissions=metadata.get("permissions", ["*"])
             ))
             
         logger.info(f"Chunked document {metadata.get('name')} into {len(chunks)} chunks.")
diff --git a/core/logging.py b/core/logging.py
new file mode 100644
index 0000000..5c1f31b
--- /dev/null
+++ b/core/logging.py
@@ -0,0 +1,62 @@
+import logging
+import json
+import sys
+from datetime import datetime
+from typing import Optional
+
+
+class StructuredFormatter(logging.Formatter):
+    """JSON structured log formatter for production."""
+    
+    def format(self, record):
+        log_entry = {
+            "timestamp": datetime.utcnow().isoformat() + "Z",
+            "level": record.levelname,
+            "logger": record.name,
+            "message": record.getMessage(),
+        }
+        
+        if record.exc_info and record.exc_info[0]:
+            log_entry["exception"] = self.formatException(record.exc_info)
+        
+        if hasattr(record, "extra_data"):
+            log_entry["data"] = record.extra_data
+        
+        return json.dumps(log_entry, ensure_ascii=False)
+
+
+class HumanFormatter(logging.Formatter):
+    """Human-readable log formatter for development."""
+    
+    def format(self, record):
+        return f"{datetime.now().strftime('%H:%M:%S')} [{record.levelname}] {record.name}: {record.getMessage()}"
+
+
+def setup_logging(level: str = "INFO", structured: bool = False):
+    """
+    Setup logging cho toàn bộ ứng dụng.
+    
+    Args:
+        level: Log level (DEBUG, INFO, WARNING, ERROR)
+        structured: True = JSON format (production), False = human readable (development)
+    """
+    root_logger = logging.getLogger()
+    root_logger.setLevel(getattr(logging, level.upper(), logging.INFO))
+    
+    # Xóa existing handlers
+    root_logger.handlers.clear()
+    
+    handler = logging.StreamHandler(sys.stdout)
+    handler.setFormatter(StructuredFormatter() if structured else HumanFormatter())
+    root_logger.addHandler(handler)
+
+
+def log_event(logger: logging.Logger, level: str, message: str, **kwargs):
+    """
+    Ghi log với structured data.
+    
+    Usage:
+        log_event(logger, "info", "File processed", file_name="test.pdf", pages=5)
+    """
+    extra = {"extra_data": kwargs} if kwargs else {}
+    getattr(logger, level.lower(), logger.info)(message, extra=extra)
diff --git a/core/models.py b/core/models.py
index 088be41..58a2c46 100644
--- a/core/models.py
+++ b/core/models.py
@@ -64,6 +64,7 @@ class DocumentChunk(BaseModel):
     embedding: Optional[list[float]] = None
     page_from: int
     page_to: int
-    source_url: str
+    source_url: Optional[str] = None
+    download_url: Optional[str] = None
     permissions: list[str] = []
     site_id: str = ""
diff --git a/doc/00.AGENT_ARCHITECTURE_MAP.md b/doc/00.AGENT_ARCHITECTURE_MAP.md
index 762a6d1..018eb0b 100644
--- a/doc/00.AGENT_ARCHITECTURE_MAP.md
+++ b/doc/00.AGENT_ARCHITECTURE_MAP.md
@@ -1,88 +1,202 @@
 # 🧭 AGENT ARCHITECTURE MAP (LIVING DOCUMENT)
 *Đây là tài liệu dẫn đường dành riêng cho các AI Agent tương lai và lập trình viên bảo trì. Không quét toàn bộ code, hãy đọc file này trước.*
 
-**Lần cập nhật cuối:** Phase 6 (Hoàn thiện Semantic Chunking & Vector Indexing)
-**Trạng thái Dự án:** Đã hoàn thành Ingestion, Extraction, Chunking & Indexing. Chuẩn bị bước vào Phase 7 (RAG Search & Chat API).
+**Lần cập nhật cuối:** Phase 8 Complete (DCE, Text Extraction, ACL, SSO, Logging)
+**Trạng thái Dự án:** Phase 8 hoàn thành. Sẵn sàng cho Phase 9 (Production Ready).
 
 ---
 
 ## 1. Bản Đồ Kiến Trúc Lõi (Core Architecture Patterns)
 
+### Pipeline hiện tại (ĐÃ HOẠT ĐỘNG)
+```
+SharePoint → Ingestion → DCE → [OCR/Extract/Skip] → Chunking → OpenSearch → Search → RAG Chat → FastAPI → Frontend
+```
+
 ### A. Tầng Ingestion (Thu thập dữ liệu) - Mẫu Modular Provider Pattern
 - **Mục tiêu:** Tách biệt lõi hệ thống khỏi nền tảng lưu trữ (SharePoint, Google Drive, v.v.).
-- **Interface gốc:** `ingestion/providers/base_provider.py` (Bắt buộc phải implement `fetch_changes` và `download_file`).
-- **Implement hiện tại:** `ingestion/providers/sharepoint_provider.py`. Nó bọc lại `GraphClient` và tự động xử lý thuật toán phân trang (pagination) để lấy dữ liệu Delta.
-- **Nếu cần thêm nguồn dữ liệu mới (ví dụ: NAS, Google Drive):** Chỉ cần tạo một class mới kế thừa `BaseStorageProvider`. Lõi hệ thống không cần biết về API của nguồn đó.
+- **Interface:** `ingestion/providers/base_provider.py` (`fetch_changes`, `download_file`, `get_item_details`, `get_item_permissions`).
+- **Implement hiện tại:** `ingestion/providers/sharepoint_provider.py`. Bọc lại `GraphClient`, tự động xử lý pagination Delta Query.
+- **Sync Engine:** `ingestion/sync.py` → `SyncEngine` nhận `BaseStorageProvider` qua constructor, provider-agnostic.
+- **Nếu cần thêm nguồn dữ liệu mới:** Chỉ cần tạo class mới kế thừa `BaseStorageProvider`.
 
 ### B. Tầng Extraction (Xử lý chữ & Ảnh) - Mẫu Distributed VLM Pattern
-- **Lịch sử:** Đã từng dùng PaddleOCR + VietOCR nhưng gặp lỗi "Rụng dấu" và "Ảo giác" do cắt ảnh sai.
-- **Kiến trúc hiện tại:** Hệ thống đóng vai trò như một **VLM Client**.
-- **Cách hoạt động:** `extraction/ocr_service.py` render file PDF thành ảnh (DPI=86), nén Base64 và bắn POST Request sang một Server LLM khác trong mạng LAN (chạy `llama.cpp` với model `Vintern-3B`).
-- **Lợi ích:** Giải phóng hoàn toàn RAM cho máy chủ RAG, loại bỏ các thư viện AI nặng nền (Torch, Paddle). Lấy được Markdown nguyên bản, không gãy vỡ layout bảng biểu.
+- **Lịch sử:** Đã từng dùng PaddleOCR + VietOCR nhưng gặp lỗi "Rụng dấu" và "Ảo giác". Đã loại bỏ hoàn toàn.
+- **Kiến trúc hiện tại:** Hệ thống đóng vai trò **VLM Client**.
+- **Cách hoạt động:** `extraction/ocr_service.py` render PDF thành ảnh (Matrix=1.2), nén Base64, POST sang server LAN (`10.202.50.3:8080`) chạy `llama.cpp` + `Vintern-3B`.
+- **Lợi ích:** Giải phóng RAM cho máy chủ RAG, lấy được Markdown nguyên bản.
 
 ### C. Tầng Chunking & Vector DB (Semantic Indexing)
-- **Chunking:** `chunking/markdown_chunker.py` chia nhỏ văn bản bằng Markdown Rules (nhận biết Header `#`, duy trì overlap chống đứt gãy ngữ cảnh), tự động theo dõi `page_from`, `page_to` chuẩn xác.
-- **Embedding:** Dùng thư viện `sentence-transformers` với model `keepitreal/vietnamese-sbert` chạy Local/Offline. Tạo ra Vector 768 chiều chuyên biệt cho Tiếng Việt.
-- **Database:** `indexing/vector_store.py` cấu hình OpenSearch với thuật toán `k-NN HNSW`. Index mặc định là `poc_sharepoint_docs` hoặc `sharepoint_docs`.
+- **Chunking:** `chunking/markdown_chunker.py` chia nhỏ bằng Markdown Rules (Header `#`, overlap), theo dõi `page_from`, `page_to`.
+- **Embedding:** `sentence-transformers` với model `keepitreal/vietnamese-sbert` (local, 768 chiều).
+- **Database:** `indexing/vector_store.py` → OpenSearch `k-NN HNSW`. Index: `poc_sharepoint_docs`.
+- **Dedup:** `VectorStore.delete_by_file_id()` xóa chunks cũ trước khi nạp lại.
 
-### D. Tầng Cấu hình (Decoupled Configuration)
-- Toàn bộ thông số hệ thống, đặc biệt là IP máy chủ VLM, Token của SharePoint đều nằm trong `.env`.
-- Mã nguồn load cấu hình thông qua `core/config.py`.
+### D. Tầng Search & RAG Chat
+- **Retriever:** `search/retriever.py` → Semantic Search (k-NN vector) trên OpenSearch.
+- **RAG Engine:** `chat/rag_engine.py` → Search → Augment Context → LLM Generate.
+- **LLM Factory:** `chat/llm_factory.py` → Hỗ trợ Gemini, Groq, Local (config trong `.env`).
+
+### E. Tầng API & Frontend
+- **Backend:** `api/main.py` → FastAPI tại port 8000. Endpoint: `/health`, `/auth/login` (SSO), `/auth/callback`, `/auth/login-email`, `/chat`, `/sync`, `/sync/status`.
+- **Frontend:** `frontend/` → Glassmorphism UI với SSO login + email fallback + sync button. Gọi `http://localhost:8000`.
+
+### F. Tầng Cấu hình (Decoupled Configuration)
+- Toàn bộ thông số trong `.env`. Load qua `core/config.py`.
 - **Tuyệt đối KHÔNG hardcode URL, Token hay Password trong code.**
 
 ---
 
-## 2. Bản Đồ File & Thư Mục Quan Trọng
+## 2. Bản Đồ File & Thư Mục Hoàn Chỉnh
 
 ```text
 📁 poc_system/
 ├── 📁 core/
-│   ├── config.py         # ⚙️ Trái tim cấu hình (Load từ .env)
-│   └── models.py         # 🧩 Định nghĩa Data Classes (OCRPageResult, v.v.)
+│   ├── config.py              # ⚙️ Trái tim cấu hình (Load từ .env)
+│   ├── models.py              # 🧩 Data Classes (OCRPageResult, DocumentChunk, IngestedDocument)
+│   └── logging.py             # 📝 Structured logging (JSON/human formatter)
 ├── 📁 ingestion/
-│   ├── sync.py           # 🔄 Bộ điều phối đồng bộ (Đang chuẩn bị ghép với BaseStorageProvider)
-│   ├── graph_client.py   # 🌐 Microsoft Graph API Client (Bọc Auth)
-│   └── 📁 providers/     # 🔌 Nơi chứa các plugin kết nối dữ liệu
-│       ├── base_provider.py
+│   ├── sync.py                # 🔄 SyncEngine (Provider-agnostic)
+│   ├── graph_client.py        # 🌐 Microsoft Graph API Client
+│   └── 📁 providers/
+│       ├── base_provider.py   # 🔌 Interface: fetch_changes, download_file, get_item_details
 │       └── sharepoint_provider.py
 ├── 📁 extraction/
-│   └── ocr_service.py    # 👁️ VLM Client (Chuyển ảnh -> Text Markdown qua LAN)
-├── .env                  # 🔑 Chìa khoá và địa chỉ mạng (KHÔNG commit file này)
-└── test_modular_architecture.py # 🧪 Script kiểm tra nhanh kết nối các module
+│   ├── dce.py               # 🏷️ Document Classification Engine (phân loại trước khi xử lý)
+│   ├── pdf_inspector.py     # 🔎 PDF Inspection (TEXT_PDF / SCAN_PDF / DRAWING_PDF)
+│   ├── magic_numbers.py     # 🔢 Magic Number validation (chống giả extension)
+│   ├── text_extractor.py    # 📄 Text extraction: DOCX (python-docx), XLSX (openpyxl), TXT
+│   └── ocr_service.py       # 👁️ VLM Client (PDF → Markdown qua LAN)
+├── 📁 chunking/
+│   └── markdown_chunker.py    # ✂️ Semantic Chunking theo Markdown rules
+├── 📁 indexing/
+│   └── vector_store.py        # 📦 OpenSearch k-NN Index + Embedding
+├── 📁 search/
+│   └── retriever.py           # 🔍 Semantic Search (k-NN vector)
+├── 📁 chat/
+│   ├── rag_engine.py          # 🤖 RAG: Search → Context → LLM
+│   ├── llm_factory.py         # 🏭 Factory: Gemini / Groq / Local
+│   └── 📁 llm_providers/
+│       ├── base_llm.py
+│       ├── gemini_llm.py
+│       ├── groq_llm.py
+│       └── local_llm.py
+├── 📁 api/
+│   └── main.py                # 🚀 FastAPI Backend (port 8000)
+├── 📁 frontend/
+│   ├── index.html             # 🎨 Glassmorphism UI (Login + Chat + Sync)
+│   ├── app.js                 # 💬 Chat, Auth, Sync logic
+│   └── style.css              # 🖌️ CSS
+├── 📁 doc/                    # 📚 Tài liệu dự án
+│   ├── 00.AGENT_ARCHITECTURE_MAP.md  # Bản đồ kiến trúc
+│   ├── AGENT_HANDOVER_PROTOCOL.md    # Protocol cho AI Agent
+│   ├── DEPLOYMENT_GUIDE.md           # Hướng dẫn triển khai & cấu hình
+│   └── ...                           # Các tài liệu khác
+├── .env                       # 🔑 Chìa khoá (KHÔNG commit)
+├── docker-compose.yml         # 🐳 OpenSearch
+├── Dockerfile
+├── requirements.txt
+├── test_rag_pipeline.py       # 🧪 Test toàn bộ pipeline
+├── test_graph_smoke.py        # 🧪 Test kết nối Graph API
+├── test_modular_architecture.py
+├── test_chat.py
+├── test_ocr.py
+└── test_dce_pipeline.py
 ```
 
 ---
 
 ## 3. Lịch Sử Các Lỗi Khét Tiếng & Cách Xử Lý (Known Gotchas)
+
 1. **Lỗi 401 Unauthorized khi tải file từ SharePoint:**
    - *Nguyên nhân:* Microsoft chặn download trực tiếp bằng `@microsoft.graph.downloadUrl` nếu dùng App-Only Token.
-   - *Giải pháp:* Dùng endpoint `.../items/{item_id}/content` kèm Bearer Token (Đã cài đặt trong `graph_client.py`).
+   - *Giải pháp:* Dùng endpoint `.../items/{item_id}/content` kèm Bearer Token.
 
 2. **Lỗi 500 Internal Server Error từ Llama.cpp VLM:**
-   - *Nguyên nhân:* Bức ảnh ném vào VLM có độ phân giải quá cao (Matrix 2.0) làm tràn Context Window (ví dụ: Token ảnh > 4096).
-   - *Giải pháp:* Hạ `Matrix` xuống `1.2`, hoặc khởi chạy Server Llama.cpp với `-c 8192`. Bắt buộc phải có file `--mmproj`.
+   - *Nguyên nhân:* Ảnh có độ phân giải quá cao (Matrix 2.0) làm tràn Context Window.
+   - *Giải pháp:* Hạ `Matrix` xuống `1.2`, hoặc khởi chạy Server với `-c 8192`.
 
 3. **Lỗi Rụng dấu / Ảo giác của VietOCR:**
-   - *Nguyên nhân:* PaddleOCR bắt khung quá khít, làm cụt phần đuôi của các chữ tiếng Việt có dấu. Mô hình `vgg_seq2seq` tự nội suy ra từ tiếng Anh linh tinh.
-   - *Giải pháp triệt để:* Đã loại bỏ hoàn toàn VietOCR, chuyển sang dùng VLM (Vintern-3B).
+   - *Nguyên nhân:* PaddleOCR bắt khung quá khít, mô hình `vgg_seq2seq` nội suy sai.
+   - *Giải pháp triệt để:* Đã loại bỏ hoàn toàn VietOCR, chuyển sang VLM (Vintern-3B).
 
 4. **Lỗi UTF-8 Surrogate (\udcc3) trong Terminal WSL:**
-   - *Hiện tượng:* Câu hỏi đầu tiên đúng, nhưng từ câu thứ 2 bị lỗi mã hóa khi dùng `input()`.
-   - *Nguyên nhân:* Do sự không đồng nhất giữa `sys.stdin` và bộ đệm Terminal sau khi in lượng lớn dữ liệu từ LLM.
-   - *Giải pháp:* Sử dụng `sys.stdin.buffer.readline()` để đọc dữ liệu thô (Bytes) và tự decode bằng UTF-8. Đây là giải pháp cho môi trường CLI, khi lên Web API (FastAPI) sẽ không bị ảnh hưởng.
+   - *Giải pháp:* Dùng `sys.stdin.buffer.readline()` cho CLI. Web API (FastAPI) không bị ảnh hưởng.
+
+5. **Lỗi Link SharePoint không ổn định (Bug #101):**
+   - *Nguyên nhân:* Delta Query không trả về `webUrl` và `@microsoft.graph.downloadUrl`.
+   - *Giải pháp:* Thêm `get_item_details()` vào `graph_client.py`, `base_provider.py`, `sharepoint_provider.py`.
+
+6. **Lỗi Chunks trùng lặp khi chạy lại pipeline:**
+   - *Hiện tượng:* Mỗi lần chạy `test_rag_pipeline.py`, chunks mới được thêm chồng lên chunks cũ (cùng file).
+   - *Nguyên nhân:* `chunk_id` dùng UUID ngẫu nhiên, không có bước xóa cũ.
+   - *Giải pháp:* `VectorStore.delete_by_file_id(file_id)` gọi trước `embed_and_index()`.
+
+7. **Lỗi DCE download PDF 401 Unauthorized:**
+   - *Hiện tượng:* DCE không phân loại được PDF vì download file bị 401.
+   - *Nguyên nhân:* DCE dùng httpx trực tiếp với `@microsoft.graph.downloadUrl` (không có Bearer Token).
+   - *Giải pháp:* Truyền `provider` (BaseStorageProvider) vào DCE constructor, dùng `provider.download_file()` thay vì httpx.
+
+8. **Lỗi DCE download 404 (items/None/content):**
+   - *Hiện tượng:* DCE download PDF bị 404 vì URL có `items/None/content`.
+   - *Nguyên nhân:* `ingestion_output.json` dùng key `item_id` nhưng `download_file()` cần `id`.
+   - *Giải pháp:* DCE tự chuẩn hóa `item_id` → `id` khi thiếu.
+
+9. **Lỗi OpenSearch hostname không resolve khi chạy ngoài Docker:**
+   - *Hiện tượng:* `ConnectionError: Failed to resolve 'opensearch'`.
+   - *Nguyên nhân:* Config `.env` có `opensearch_host=opensearch` (Docker hostname).
+   - *Giải pháp:* `VectorStore` và `SearchRetriever` tự detect: nếu host là "opensearch" và ENV != "docker" → đổi sang "localhost".
+
+10. **Lỗi k-NN query format sai cho OpenSearch 2.x:**
+    - *Hiện tượng:* `Unknown key for a START_OBJECT in [knn]`.
+    - *Nguyên nhân:* Đặt `knn` ở top level thay vì trong `query`.
+    - *Giải pháp:* Đặt `knn` bên trong `query` object.
 
 ---
 
-## 4. Nhiệm Vụ Tiếp Theo (Dành cho Lập Trình Viên/AI Agent)
-- [ ] **Phase 7:** Bọc thành API Backend bằng FastAPI.
+## 4. Nhiệm Vụ Tiếp Theo (Phase 9 - Production Ready)
+
+### Đã hoàn thành ✅
+- [x] Ingestion: SharePoint Provider + Delta Query + Pagination
+- [x] DCE: Document Classification Engine (phân loại file theo extension + PDF inspection)
+- [x] PDF Inspection: Detect text layer, classify TEXT_PDF / SCAN_PDF / DRAWING_PDF
+- [x] Conditional OCR: Chỉ OCR SCAN_PDF, TEXT_PDF extract trực tiếp, skip DRAWING/UNSUPPORTED
+- [x] Extraction: VLM OCR (Vintern-3B qua LAN)
+- [x] Chunking: Semantic Markdown Chunker
+- [x] Indexing: OpenSearch k-NN HNSW + vietnamese-sbert
+- [x] Search: Semantic Retriever
+- [x] RAG Chat: LLM Factory (Gemini/Groq/Local)
+- [x] API: FastAPI Backend (/chat, /health, /sync, /sync/status)
+- [x] Frontend: Glassmorphism UI
+- [x] Bug fixes: SharePoint links, Chunk dedup
+- [x] Refactor: SyncEngine provider-agnostic
+- [x] Logging: Structured logging utility (`core/logging.py`)
+- [x] Permission: ACL extraction từ SharePoint + filter search theo user
+- [x] Auth UI: Simple email login + SSO Azure AD + user context cho API calls
+- [x] DOCX Text Extraction: python-docx (paragraphs + tables)
+- [x] XLSX Text Extraction: openpyxl (sheets + cells)
+
+### Chưa triển khai (Phase 9 - Production Ready)
+
+#### Ưu tiên trung bình
+- [ ] **Cấu hình Azure AD cho SSO:** Thêm Redirect URI `http://localhost:8000/auth/callback` và bật "ID tokens" trong App Registration.
+
+#### Ưu tiên thấp
+- [ ] **Monitoring Dashboard:** Health metrics, ingestion status, OCR success rate.
+- [ ] **Multi-tenant:** Hỗ trợ nhiều SharePoint site/tenant.
 
 ---
 
 ## 5. Tiêu chuẩn Lập trình & Môi trường (Coding Standards)
 
 ### A. Quản lý Mã hóa (Encoding)
-- **Quy tắc vàng:** Luôn sử dụng `encoding='utf-8'` trong mọi lệnh `open()`. Tuyệt đối không dựa dẫm vào encoding mặc định của hệ điều hành.
-- **Môi trường:** Hệ thống được thiết kế để chạy trong môi trường UTF-8. Trong Docker hoặc WSL, luôn đảm bảo biến môi trường `PYTHONIOENCODING=utf-8` được thiết lập. Điều này giúp hệ thống tương thích 100% với các ký tự Tiếng Việt từ LLM mà không cần hack code.
+- **Quy tắc vàng:** Luôn sử dụng `encoding='utf-8'` trong mọi lệnh `open()`.
+- **Môi trường:** `PYTHONIOENCODING=utf-8` trong Docker/WSL.
 
 ### B. Mẫu Provider (Provider Pattern)
-- Mọi kết nối tới dịch vụ bên thứ ba (Storage, LLM) phải thông qua Interface/BaseClass để đảm bảo tính "Cắm rút" (Pluggable).
+- Mọi kết nối tới dịch vụ bên thứ ba (Storage, LLM) phải thông qua Interface/BaseClass.
+- `BaseStorageProvider` cho Storage, `BaseLLMProvider` cho LLM.
+
+### C. Quy tắc an toàn
+- Không commit `.env`, không hardcode secrets.
+- Không thay đổi kiến trúc đã chốt trong `doc/14.Project-Bridge-Context-for-New-Chat.md` mà không có lý do kỹ thuật rõ ràng.
diff --git a/doc/AGENT_HANDOVER_PROTOCOL.md b/doc/AGENT_HANDOVER_PROTOCOL.md
new file mode 100644
index 0000000..632d605
--- /dev/null
+++ b/doc/AGENT_HANDOVER_PROTOCOL.md
@@ -0,0 +1,98 @@
+# 🤖 AGENT HANDOVER PROTOCOL (Dành cho AI Agent)
+
+> **QUAN TRỌNG:** Nếu bạn là AI Agent mới, hãy đọc file này kết hợp với `doc/00.AGENT_ARCHITECTURE_MAP.md` trước khi viết bất kỳ dòng code nào.
+
+## 1. Tóm tắt "Bộ nhớ" Dự án (Memory Snapshot)
+Dự án này là một hệ thống **Enterprise RAG** (Retrieval-Augmented Generation) với các đặc điểm kỹ thuật:
+- **Distributed VLM OCR:** Dùng máy chủ LAN (`10.202.50.3:8080`) chạy `Vintern-3B` để trích xuất Markdown từ PDF.
+- **Modular Provider Pattern:** Tách biệt Storage (SharePoint) và LLM (Gemini, Groq, Local).
+- **Semantic Indexing:** Dùng `vietnamese-sbert` (local) tạo vector 768 chiều, lưu vào OpenSearch k-NN HNSW.
+- **FastAPI Backend:** API tại port 8000. Endpoint: `/health`, `/chat`.
+- **Glassmorphism UI:** Giao diện web tại `frontend/`, gọi `http://localhost:8000/chat`.
+
+**Pipeline hiện tại (ĐÃ HOẠT ĐỘNG):**
+```
+SharePoint → Ingestion → VLM OCR → Chunking → OpenSearch → Search → RAG Chat → FastAPI → Frontend
+```
+
+## 2. Trạng thái triển khai
+
+### ✅ Đã hoàn thành
+| Module | File | Mô tả |
+|--------|------|-------|
+| Ingestion | `ingestion/providers/sharepoint_provider.py` | Delta Query + Pagination + get_item_details |
+| Sync Engine | `ingestion/sync.py` | Provider-agnostic, nhận BaseStorageProvider |
+| DCE | `extraction/dce.py` | Document Classification Engine (phân loại file) |
+| PDF Inspector | `extraction/pdf_inspector.py` | TEXT_PDF / SCAN_PDF / DRAWING_PDF / AMBIGUOUS_PDF |
+| Magic Numbers | `extraction/magic_numbers.py` | Header byte validation |
+| OCR | `extraction/ocr_service.py` | VLM Client (Vintern-3B qua LAN) |
+| Chunking | `chunking/markdown_chunker.py` | Semantic Markdown rules + page tracking |
+| Indexing | `indexing/vector_store.py` | OpenSearch k-NN + delete_by_file_id dedup |
+| Search | `search/retriever.py` | Semantic k-NN vector search |
+| RAG Chat | `chat/rag_engine.py` | Search → Context → LLM |
+| LLM Factory | `chat/llm_factory.py` | Gemini / Groq / Local |
+| API | `api/main.py` | FastAPI port 8000 |
+| Frontend | `frontend/` | Glassmorphism UI (HTML/CSS/JS) |
+| Bug fixes | Nhiều file | SharePoint links (Bug #101), Chunk dedup |
+
+### ❌ Chưa triển khai (Phase 8)
+- **DOCX Text Extraction:** Trích xuất text từ DOCX không cần OCR
+- **XLSX Text Extraction:** Trích xuất header + key columns từ Excel
+- **Permission Enforcement:** ACL filtering theo user/group
+- **Authentication UI:** OAuth2 login
+- **Ingestion API:** Trigger sync từ frontend
+- **Logging & Audit:** Structured logging
+
+## 3. Hướng dẫn dành cho AI Agent tiếp theo
+1.  **Luôn kiểm tra `.env`:** Toàn bộ cấu hình nằm ở đây. Không bao giờ hardcode.
+2.  **Sử dụng `core/config.py`:** Cửa ngõ duy nhất để truy cập cài đặt.
+3.  **UTF-8:** Mọi I/O phải có `encoding='utf-8'`. Đặt `export PYTHONIOENCODING=utf-8`.
+4.  **Cập nhật tài liệu:** Khi hoàn thành Phase hoặc thay đổi kiến trúc, BẮT BUỘC cập nhật file này và `00.AGENT_ARCHITECTURE_MAP.md`.
+5.  **Đọc `doc/14.Project-Bridge-Context-for-New-Chat.md`:** Đây là "hợp đồng kiến trúc" - không thay đổi các quyết định đã chốt.
+
+## 4. Cách cập nhật Tài liệu (Protocol for Updates)
+- **Bước 1:** Cập nhật trạng thái trong `doc/00.AGENT_ARCHITECTURE_MAP.md` (đánh dấu ✅ vào checkbox).
+- **Bước 2:** Nếu phát hiện lỗi mới, ghi lại vào mục **"Lịch sử các lỗi khét tiếng"** kèm giải pháp.
+- **Bước 3:** Cập nhật mục **Trạng thái triển khai** trong file này.
+
+## 5. Lệnh chạy nhanh (Quick Start)
+```bash
+# Khởi động OpenSearch
+docker-compose up -d opensearch
+
+# Chạy Backend (FastAPI port 8000)
+python3 api/main.py
+
+# Mở Frontend
+# Mở frontend/index.html trong trình duyệt (hoặc dùng Live Server)
+
+# Nạp dữ liệu từ SharePoint → OCR → Chunk → Index
+python3 test_rag_pipeline.py
+```
+
+## 6. Kiểm tra nhanh (Verification)
+```bash
+# 1. Kiểm tra cú pháp Python
+python3 -m py_compile ingestion/graph_client.py
+python3 -m py_compile ingestion/providers/sharepoint_provider.py
+python3 -m py_compile ingestion/sync.py
+python3 -m py_compile indexing/vector_store.py
+python3 -m py_compile api/main.py
+python3 -m py_compile test_rag_pipeline.py
+
+# 2. Test kết nối Graph API
+python3 test_graph_smoke.py
+
+# 3. Test toàn bộ pipeline (cần OpenSearch + VLM server)
+python3 test_rag_pipeline.py
+
+# 4. Kiểm tra metadata
+cat ingestion_output.json | python3 -m json.tool | grep -E '"web_url"|"download_url"'
+
+# 5. Test API endpoint
+curl http://localhost:8000/health
+curl -X POST http://localhost:8000/chat -H "Content-Type: application/json" -d '{"query":"test"}'
+```
+
+---
+*Chúc may mắn, Agent đồng nghiệp! Pipeline RAG đã hoạt động. Tiếp theo: DCE, Permission, Hardening.*
diff --git a/doc/DEPLOYMENT_GUIDE.md b/doc/DEPLOYMENT_GUIDE.md
new file mode 100644
index 0000000..f156c22
--- /dev/null
+++ b/doc/DEPLOYMENT_GUIDE.md
@@ -0,0 +1,366 @@
+# 🚀 Hướng dẫn Triển khai & Cấu hình Hệ thống
+
+> Tài liệu này hướng dẫn cấu hình manual, biến môi trường, và các lưu ý khi triển khai từ PoC lên Production.
+
+---
+
+## 1. Cấu hình Azure AD App Registration (MANUAL)
+
+### 1.1 Tạo App Registration (nếu chưa có)
+
+1. Vào **Azure Portal** → **Azure Active Directory** → **App registrations** → **New registration**
+2. Điền thông tin:
+   - **Name:** `VibeCode-RAG-PoC` (hoặc tên tuỳ chọn)
+   - **Supported account types:** `Single tenant` (chỉ tenant công ty)
+   - **Redirect URI:** Để trống, sẽ thêm sau
+3. Bấm **Register**
+4. Ghi lại:
+   - **Application (client) ID** → dùng cho `CLIENT_ID`
+   - **Directory (tenant) ID** → dùng cho `TENANT_ID`
+
+### 1.2 Tạo Client Secret
+
+1. App Registration → **Certificates & secrets** → **New client secret**
+2. Điền Description: `RAG PoC Secret`
+3. Chọn thời hạn: `24 months` (hoặc tuỳ nhu cầu)
+4. Bấm **Add**
+5. **Copy ngay giá trị Secret** → dùng cho `CLIENT_SECRET` (chỉ hiện 1 lần)
+
+### 1.3 Cấp quyền Application Permissions
+
+1. App Registration → **API permissions** → **Add a permission**
+2. Chọn **Microsoft Graph** → **Application permissions**
+3. Tìm và tích:
+   - `Sites.Read.All` (đọc SharePoint sites)
+   - `Files.Read.All` (đọc files trong drives)
+4. Bấm **Add permissions**
+5. **Quan trọng:** Bấm **Grant admin consent for [tenant]** → Confirm
+
+### 1.4 Cấu hình Redirect URI cho SSO (khi cần login)
+
+1. App Registration → **Authentication** → **Add a platform** → **Web**
+2. Nhập Redirect URI:
+   - **PoC (localhost):** `http://localhost:8000/auth/callback`
+   - **Production:** `https://your-domain.com/auth/callback`
+3. Tích chọn: ✅ **ID tokens** (implicit grant)
+4. Bấm **Save**
+
+### 1.5 Kiểm tra Token Claims
+
+Sau khi cấu hình xong, decode token JWT và kiểm tra:
+
+```text
+aud   : https://graph.microsoft.com
+appid : <client-id-của-bạn>
+idtyp : app
+roles :
+  - Sites.Read.All
+  - Files.Read.All
+```
+
+Nếu token **không có `roles`** → quyền chưa đúng, kiểm tra lại bước 1.3.
+
+---
+
+## 2. Biến môi trường (.env)
+
+### 2.1 File mẫu
+
+```env
+# ===== Azure AD / Microsoft Graph =====
+TENANT_ID=your-tenant-id-guid
+CLIENT_ID=your-client-id-guid
+CLIENT_SECRET=your-client-secret-value
+
+# ===== SharePoint =====
+# Site path để ingestion (đổi thành site SharePoint của bạn)
+# Format: hostname:/sites/site-name
+# Ví dụ: 285pdg.sharepoint.com:/sites/poc_system
+
+# ===== OpenSearch =====
+OPENSEARCH_HOST=opensearch        # Docker: "opensearch", Local: "localhost"
+OPENSEARCH_PORT=9200
+OPENSEARCH_USER=admin
+OPENSEARCH_PASS=admin
+
+# ===== VLM OCR Server (Vintern-3B) =====
+VLM_ENDPOINT=http://10.202.50.3:8080/v1/chat/completions
+VLM_TEMPERATURE=0.1
+VLM_MAX_TOKENS=2000
+VLM_TIMEOUT=120.0
+
+# ===== Chat LLM =====
+LLM_PROVIDER=gemini              # Options: gemini, groq, local
+GEMINI_API_KEY=your-gemini-api-key
+GROQ_API_KEY=your-groq-api-key
+GROQ_MODEL=llama-3.3-70b-versatile
+LOCAL_LLM_ENDPOINT=http://10.202.50.3:8081/v1/chat/completions
+
+# ===== General =====
+LOG_LEVEL=INFO
+ENVIRONMENT=development           # development hoặc production
+```
+
+### 2.2 Giải thích từng biến
+
+| Biến | Bắt buộc | Mô tả |
+|------|----------|-------|
+| `TENANT_ID` | ✅ | Azure AD Tenant ID |
+| `CLIENT_ID` | ✅ | Azure AD App Registration Client ID |
+| `CLIENT_SECRET` | ✅ | Azure AD App Registration Client Secret |
+| `OPENSEARCH_HOST` | ✅ | Hostname OpenSearch |
+| `OPENSEARCH_PORT` | ✅ | Port OpenSearch (mặc định 9200) |
+| `VLM_ENDPOINT` | ✅ | URL server VLM OCR (Vintern-3B) |
+| `LLM_PROVIDER` | ✅ | LLM provider: `gemini`, `groq`, hoặc `local` |
+| `GEMINI_API_KEY` | Nếu dùng Gemini | API key từ Google AI Studio |
+| `GROQ_API_KEY` | Nếu dùng Groq | API key từ Groq Console |
+| `ENVIRONMENT` | ✅ | `development` hoặc `production` |
+
+---
+
+## 3. Kết nối SharePoint khác
+
+### 3.1 Thay đổi Site Path
+
+Chỉnh sửa trong file `ingestion/providers/sharepoint_provider.py`:
+
+```python
+# Dòng 14 - Thay đổi hostname và site_path
+def __init__(self, hostname: str = "your-company.sharepoint.com", site_path: str = "/sites/your-site-name"):
+```
+
+Hoặc gọi từ bên ngoài:
+
+```python
+provider = SharePointProvider(
+    hostname="your-company.sharepoint.com",
+    site_path="/sites/your-site-name"
+)
+```
+
+### 3.2 Kiểm tra quyền truy cập
+
+App Registration phải có quyền trên site mới:
+1. Nếu dùng **Application Permissions** (`Sites.Read.All`) → tự động truy cập mọi site
+2. Nếu muốn giới hạn site cụ thể → dùng **SharePoint App-Only Policy** (nâng cao)
+
+### 3.3 Xoá dữ liệu cũ
+
+Khi đổi SharePoint site, cần xoá index cũ:
+
+```bash
+curl -X DELETE -u admin:admin "http://localhost:9200/poc_sharepoint_docs"
+```
+
+Sau đó chạy lại `python3 test_rag_pipeline.py` để nạp dữ liệu mới.
+
+---
+
+## 4. Triển khai Production
+
+### 4.1 Yêu cầu hạ tầng
+
+| Component | Yêu cầu tối thiểu | Khuyến nghị |
+|-----------|-------------------|-------------|
+| Server chính | 4 CPU, 8GB RAM | 8 CPU, 16GB RAM |
+| OpenSearch | 2 CPU, 4GB RAM | 4 CPU, 8GB RAM |
+| VLM OCR Server | GPU 8GB VRAM | GPU 16GB VRAM |
+| Domain | Có SSL certificate | Let's Encrypt |
+
+### 4.2 Docker Compose Production
+
+```yaml
+# docker-compose.prod.yml
+version: '3.8'
+services:
+  opensearch:
+    image: opensearchproject/opensearch:2.11.1
+    environment:
+      - discovery.type=single-node
+      - OPENSEARCH_INITIAL_ADMIN_PASSWORD=StrongPassword123!
+    ports:
+      - "9200:9200"
+    volumes:
+      - opensearch-data:/usr/share/opensearch/data
+
+  rag-api:
+    build: .
+    ports:
+      - "8000:8000"
+    env_file: .env
+    environment:
+      - ENVIRONMENT=production
+      - OPENSEARCH_HOST=opensearch
+    depends_on:
+      - opensearch
+
+  nginx:
+    image: nginx:alpine
+    ports:
+      - "443:443"
+    volumes:
+      - ./nginx.conf:/etc/nginx/nginx.conf
+      - ./certs:/etc/nginx/certs
+      - ./frontend:/usr/share/nginx/html
+    depends_on:
+      - rag-api
+
+volumes:
+  opensearch-data:
+```
+
+### 4.3 Nginx config mẫu
+
+```nginx
+# nginx.conf
+events {
+    worker_connections 1024;
+}
+
+http {
+    server {
+        listen 443 ssl;
+        server_name your-domain.com;
+
+        ssl_certificate /etc/nginx/certs/fullchain.pem;
+        ssl_certificate_key /etc/nginx/certs/privkey.pem;
+
+        # Frontend
+        location / {
+            root /usr/share/nginx/html;
+            index index.html;
+            try_files $uri $uri/ /index.html;
+        }
+
+        # API
+        location /api/ {
+            proxy_pass http://rag-api:8000/;
+            proxy_set_header Host $host;
+            proxy_set_header X-Real-IP $remote_addr;
+            proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+            proxy_set_header X-Forwarded-Proto $scheme;
+        }
+    }
+
+    # Redirect HTTP → HTTPS
+    server {
+        listen 80;
+        server_name your-domain.com;
+        return 301 https://$host$request_uri;
+    }
+}
+```
+
+### 4.4 Thay đổi khi Production
+
+1. **Frontend:** Sửa `API_BASE` trong `app.js`:
+   ```javascript
+   const API_BASE = '/api';  // Thay vì 'http://localhost:8000'
+   ```
+
+2. **SSO Redirect URI:** Cập nhật trong Azure AD:
+   ```
+   https://your-domain.com/auth/callback
+   ```
+
+3. **.env:**
+   ```env
+   OPENSEARCH_HOST=opensearch
+   ENVIRONMENT=production
+   ```
+
+4. **OpenSearch password:** Đổi password mặc định:
+   ```env
+   OPENSEARCH_USER=admin
+   OPENSEARCH_PASS=YourStrongPassword123!
+   ```
+
+5. **CORS:** Giới hạn origins trong `api/main.py`:
+   ```python
+   allow_origins=["https://your-domain.com"]
+   ```
+
+---
+
+## 5. Kiểm tra sau triển khai
+
+### 5.1 Kiểm tra kết nối
+
+```bash
+# 1. OpenSearch
+curl -u admin:admin http://localhost:9200/_cluster/health
+
+# 2. API
+curl http://localhost:8000/health
+
+# 3. Frontend
+curl -I http://localhost/
+```
+
+### 5.2 Kiểm tra SSO
+
+1. Mở `https://your-domain.com`
+2. Bấm "Đăng nhập Microsoft SSO"
+3. Đăng nhập bằng tài khoản Microsoft 365
+4. Kiểm tra user info hiển thị đúng
+
+### 5.3 Kiểm tra Pipeline
+
+```bash
+# Trigger sync
+curl -X POST http://localhost:8000/sync
+
+# Kiểm tra sync status
+curl http://localhost:8000/sync/status
+
+# Kiểm tra chunks trong OpenSearch
+curl -u admin:admin http://localhost:9200/poc_sharepoint_docs/_count
+```
+
+### 5.4 Kiểm tra ACL
+
+```bash
+# Test search với user có quyền
+curl -X POST http://localhost:8000/chat \
+  -H "Content-Type: application/json" \
+  -H "X-User-Email: user@yourcompany.com" \
+  -H "X-User-Role: user" \
+  -d '{"query": "test"}'
+
+# Test search với admin (bypass ACL)
+curl -X POST http://localhost:8000/chat \
+  -H "Content-Type: application/json" \
+  -H "X-User-Role: admin" \
+  -d '{"query": "test"}'
+```
+
+---
+
+## 6. Xử lý lỗi thường gặp
+
+| Lỗi | Nguyên nhân | Giải pháp |
+|-----|-------------|-----------|
+| SSO redirect_uri_mismatch | Redirect URI chưa đúng | Kiểm tra URI trong Azure AD khớp với callback URL |
+| Token không có `roles` | App dùng Delegated thay vì Application permissions | Đổi sang Application permissions + Grant admin consent |
+| OpenSearch connection refused | Chưa khởi động Docker | `docker-compose up -d opensearch` |
+| VLM OCR timeout | Server VLM quá tải hoặc offline | Kiểm tra `VLM_ENDPOINT` có truy cập được |
+| Search trả 0 kết quả | Chưa nạp dữ liệu hoặc sai index name | Chạy `python3 test_rag_pipeline.py` |
+
+---
+
+## 7. Checklist trước khi Production
+
+- [ ] Azure AD App Registration đã cấu hình đúng permissions
+- [ ] Client Secret còn hạn sử dụng
+- [ ] Redirect URI đã thêm cho production domain
+- [ ] OpenSearch đã đổi password mặc định
+- [ ] SSL certificate đã cài đặt
+- [ ] CORS đã giới hạn origins
+- [ ] `.env` đã cấu hình cho production
+- [ ] Docker Compose production đã test
+- [ ] Backup strategy cho OpenSearch data
+- [ ] Monitoring (CPU, RAM, disk) đã setup
+
+---
+
+*Tài liệu này cần được cập nhật khi có thay đổi về hạ trúc hoặc cấu hình.*
diff --git a/extraction/dce.py b/extraction/dce.py
index 0b681fb..ca508de 100644
--- a/extraction/dce.py
+++ b/extraction/dce.py
@@ -1,20 +1,35 @@
 import os
 import httpx
 import logging
+from typing import Optional
 from core.models import IngestedDocument, DocumentClassificationResult, DocumentType, ProcessingPolicy, PdfType
 from extraction.magic_numbers import MagicNumberValidator
 from extraction.pdf_inspector import PDFInspector
 
 logger = logging.getLogger("DCE")
 
+
 class DocumentClassificationEngine:
     """
     Document Classification Engine (DCE).
+    Phân loại file trước khi quyết định OCR / MarkItDown / Skip.
     """
-    def __init__(self):
+    def __init__(self, provider=None):
+        """
+        Args:
+            provider: BaseStorageProvider instance (optional). Nếu có, dùng để download file.
+        """
         self.pdf_inspector = PDFInspector()
+        self.provider = provider
 
-    def classify(self, document: IngestedDocument) -> DocumentClassificationResult:
+    def classify(self, document: IngestedDocument, target_item: dict = None) -> DocumentClassificationResult:
+        """
+        Phân loại tài liệu.
+        
+        Args:
+            document: IngestedDocument từ ingestion output
+            target_item: Original item dict từ provider (dùng để download qua provider)
+        """
         logger.info(f"Classifying document: {document.name} (ID: {document.item_id})")
         
         ext = os.path.splitext(document.name)[1].lower()
@@ -23,31 +38,9 @@ class DocumentClassificationEngine:
         policy = ProcessingPolicy.UNSUPPORTED
         reason = "Initial state"
         
-        # 1. Magic Number Validation
-        if document.download_url:
-            header_bytes = MagicNumberValidator.fetch_header_bytes(document.download_url)
-            is_valid, detected_type, sig_desc = MagicNumberValidator.validate_from_bytes(header_bytes)
-            if is_valid:
-                logger.info(f"Magic Number match: {sig_desc}")
-            else:
-                logger.warning(f"Could not verify magic number for {document.name}. Trusting extension fallback.")
-
-        # 2. Routing Rules
+        # 1. Routing Rules
         if ext == ".pdf":
-            pdf_type = PdfType.SCAN_PDF # Simulated default
-            if document.download_url:
-                logger.info("Downloading PDF into memory for PyMuPDF inspection...")
-                try:
-                    with httpx.Client() as client:
-                        resp = client.get(document.download_url)
-                        resp.raise_for_status()
-                        pdf_bytes = resp.content
-                    pdf_type = self.pdf_inspector.inspect_pdf_from_bytes(pdf_bytes)
-                except Exception as e:
-                    logger.error(f"Failed to download/inspect PDF: {e}")
-                    pdf_type = PdfType.SCAN_PDF
-            else:
-                logger.warning("No download_url available for PDF. Defaulting to SCAN_PDF.")
+            pdf_type = self._classify_pdf(document, target_item)
             
             if pdf_type == PdfType.TEXT_PDF:
                 doc_type = DocumentType.TEXTUAL_DOCUMENT
@@ -60,7 +53,7 @@ class DocumentClassificationEngine:
             elif pdf_type == PdfType.AMBIGUOUS_PDF:
                 doc_type = DocumentType.UNKNOWN
                 policy = ProcessingPolicy.REQUIRES_REVIEW
-                reason = "Kích thước PDF lớn bất thường (khổ A3/A2 hoặc DPI cao), cần con người xác nhận là bản Scan hay Bản vẽ"
+                reason = "PDF size lớn bất thường (A3/A2 hoặc DPI cao), cần con người xác nhận"
             else:
                 doc_type = DocumentType.TEXTUAL_DOCUMENT
                 policy = ProcessingPolicy.REQUIRES_OCR
@@ -81,6 +74,11 @@ class DocumentClassificationEngine:
             policy = ProcessingPolicy.METADATA_ONLY
             reason = "Native CAD drawing format"
             
+        elif ext in [".pptx", ".ppt"]:
+            doc_type = DocumentType.PRESENTATION
+            policy = ProcessingPolicy.SKIP_OCR
+            reason = "Presentation document format"
+            
         else:
             doc_type = DocumentType.BINARY
             policy = ProcessingPolicy.UNSUPPORTED
@@ -97,3 +95,46 @@ class DocumentClassificationEngine:
         
         logger.info(f"Result -> Type: {doc_type.value}, Policy: {policy.value}, Reason: {reason}")
         return result
+
+    def _classify_pdf(self, document: IngestedDocument, target_item: dict = None) -> PdfType:
+        """Phân loại PDF thành TEXT_PDF, SCAN_PDF, DRAWING_PDF, AMBIGUOUS_PDF."""
+        pdf_bytes = self._download_pdf(document, target_item)
+        
+        if not pdf_bytes:
+            logger.warning(f"Cannot download PDF {document.name}. Defaulting to SCAN_PDF.")
+            return PdfType.SCAN_PDF
+        
+        # Magic Number validation
+        header = pdf_bytes[:256]
+        is_valid, detected_type, sig_desc = MagicNumberValidator.validate_from_bytes(header)
+        if is_valid:
+            logger.info(f"Magic Number match: {sig_desc}")
+        else:
+            logger.warning(f"Magic number mismatch for {document.name}. Continuing with inspection.")
+        
+        # PDF Inspection
+        return self.pdf_inspector.inspect_pdf_from_bytes(pdf_bytes)
+
+    def _download_pdf(self, document: IngestedDocument, target_item: dict = None) -> Optional[bytes]:
+        """Download PDF bytes. Ưu tiên dùng provider, fallback sang httpx."""
+        # Cách 1: Dùng provider (ưu tiên, đúng auth)
+        if self.provider and target_item:
+            # Chuẩn hóa: đảm bảo có field 'id' (ingestion_output có thể dùng 'item_id')
+            if "id" not in target_item and "item_id" in target_item:
+                target_item = {**target_item, "id": target_item["item_id"]}
+            try:
+                return self.provider.download_file(target_item)
+            except Exception as e:
+                logger.warning(f"Provider download failed: {e}. Falling back to httpx.")
+        
+        # Cách 2: Dùng httpx trực tiếp với download_url
+        if document.download_url:
+            try:
+                with httpx.Client(follow_redirects=True, timeout=60.0) as client:
+                    resp = client.get(document.download_url)
+                    resp.raise_for_status()
+                    return resp.content
+            except Exception as e:
+                logger.error(f"httpx download failed: {e}")
+        
+        return None
diff --git a/extraction/text_extractor.py b/extraction/text_extractor.py
new file mode 100644
index 0000000..8563107
--- /dev/null
+++ b/extraction/text_extractor.py
@@ -0,0 +1,96 @@
+import logging
+from typing import List, Optional
+from core.models import OCRPageResult
+
+logger = logging.getLogger("TextExtractor")
+
+
+class TextExtractor:
+    """
+    Trích xuất text từ các định dạng tài liệu không cần OCR:
+    - DOCX (python-docx)
+    - XLSX (openpyxl)
+    - TXT/MD (đọc trực tiếp)
+    """
+
+    @staticmethod
+    def extract_from_docx(file_bytes: bytes) -> List[OCRPageResult]:
+        """Trích xuất text từ DOCX, giữ cấu trúc đoạn văn."""
+        try:
+            from docx import Document
+            import io
+
+            doc = Document(io.BytesIO(file_bytes))
+            paragraphs = []
+            for para in doc.paragraphs:
+                text = para.text.strip()
+                if text:
+                    paragraphs.append(text)
+
+            # Cũng trích xuất text từ bảng
+            for table in doc.tables:
+                for row in table.rows:
+                    row_text = " | ".join(cell.text.strip() for cell in row.cells if cell.text.strip())
+                    if row_text:
+                        paragraphs.append(row_text)
+
+            full_text = "\n\n".join(paragraphs)
+            if not full_text.strip():
+                logger.warning("DOCX file is empty or has no readable text.")
+                return []
+
+            return [OCRPageResult(page=1, text=full_text, confidence=1.0)]
+
+        except ImportError:
+            logger.error("python-docx not installed. Run: pip install python-docx")
+            return []
+        except Exception as e:
+            logger.error(f"Failed to extract text from DOCX: {e}")
+            return []
+
+    @staticmethod
+    def extract_from_xlsx(file_bytes: bytes) -> List[OCRPageResult]:
+        """Trích xuất text từ XLSX (header + mỗi sheet là 1 page)."""
+        try:
+            from openpyxl import load_workbook
+            import io
+
+            wb = load_workbook(io.BytesIO(file_bytes), read_only=True, data_only=True)
+            results = []
+
+            for sheet_idx, sheet_name in enumerate(wb.sheetnames, 1):
+                ws = wb[sheet_name]
+                rows = []
+                for row in ws.iter_rows(values_only=True):
+                    cells = [str(c).strip() for c in row if c is not None and str(c).strip()]
+                    if cells:
+                        rows.append(" | ".join(cells))
+
+                if rows:
+                    sheet_text = f"[Sheet: {sheet_name}]\n" + "\n".join(rows)
+                    results.append(OCRPageResult(page=sheet_idx, text=sheet_text, confidence=1.0))
+
+            wb.close()
+
+            if not results:
+                logger.warning("XLSX file is empty or has no readable data.")
+            return results
+
+        except ImportError:
+            logger.error("openpyxl not installed. Run: pip install openpyxl")
+            return []
+        except Exception as e:
+            logger.error(f"Failed to extract text from XLSX: {e}")
+            return []
+
+    @staticmethod
+    def extract_from_text(file_bytes: bytes) -> List[OCRPageResult]:
+        """Trích xuất text từ file text thuần (TXT, MD, CSV)."""
+        try:
+            text = file_bytes.decode("utf-8", errors="replace").strip()
+            if not text:
+                return []
+            return [OCRPageResult(page=1, text=text, confidence=1.0)]
+        except Exception as e:
+            logger.error(f"Failed to extract text from text file: {e}")
+            return []
diff --git a/frontend/app.js b/frontend/app.js
new file mode 100644
index 0000000..8a81ef6
--- /dev/null
+++ b/frontend/app.js
@@ -0,0 +1,291 @@
+// API Base URL
+const API_BASE = 'http://localhost:8000';
+
+// DOM Elements
+const loginScreen = document.getElementById('login-screen');
+const appContainer = document.getElementById('app-container');
+const loginForm = document.getElementById('login-form');
+const loginEmail = document.getElementById('login-email');
+const chatWindow = document.getElementById('chat-window');
+const userInput = document.getElementById('user-input');
+const sendBtn = document.getElementById('send-btn');
+const sourcePanel = document.getElementById('source-panel');
+const sourceList = document.getElementById('source-list');
+const closePanel = document.getElementById('close-panel');
+const clearChatBtn = document.getElementById('clear-chat');
+const userName = document.getElementById('user-name');
+const userRole = document.getElementById('user-role');
+const logoutBtn = document.getElementById('logout-btn');
+const syncBtn = document.getElementById('sync-btn');
+const syncStatus = document.getElementById('sync-status');
+const ssoBtn = document.getElementById('sso-btn');
+
+let chatHistory = [];
+let currentUser = null;
+
+// ====== AUTH ======
+
+function checkLogin() {
+    // Kiểm tra SSO callback (user data trong URL)
+    const params = new URLSearchParams(window.location.search);
+    const userData = params.get('user');
+    if (userData) {
+        try {
+            currentUser = JSON.parse(decodeURIComponent(userData));
+            localStorage.setItem('vibecode_user', JSON.stringify(currentUser));
+            window.history.replaceState({}, '', '/'); // Xóa query param
+            showApp();
+            return;
+        } catch (e) {
+            console.error('Parse SSO user data failed:', e);
+        }
+    }
+    
+    // Kiểm tra localStorage
+    const saved = localStorage.getItem('vibecode_user');
+    if (saved) {
+        currentUser = JSON.parse(saved);
+        showApp();
+    }
+}
+
+function showApp() {
+    loginScreen.style.display = 'none';
+    appContainer.style.display = 'flex';
+    userName.textContent = currentUser.display_name;
+    userRole.textContent = currentUser.role;
+}
+
+function showLogin() {
+    loginScreen.style.display = 'flex';
+    appContainer.style.display = 'none';
+    currentUser = null;
+    localStorage.removeItem('vibecode_user');
+}
+
+loginForm.onsubmit = async (e) => {
+    e.preventDefault();
+    const email = loginEmail.value.trim();
+    if (!email) return;
+
+    try {
+        const response = await fetch(`${API_BASE}/auth/login-email`, {
+            method: 'POST',
+            headers: { 'Content-Type': 'application/json' },
+            body: JSON.stringify({ email })
+        });
+
+        if (!response.ok) {
+            const err = await response.json();
+            alert(err.detail || 'Đăng nhập thất bại');
+            return;
+        }
+
+        currentUser = await response.json();
+        localStorage.setItem('vibecode_user', JSON.stringify(currentUser));
+        showApp();
+    } catch (error) {
+        console.error('Login error:', error);
+        alert('Không thể kết nối tới server. Vui lòng đảm bảo Backend đang chạy.');
+    }
+};
+
+logoutBtn.onclick = () => {
+    chatHistory = [];
+    chatWindow.innerHTML = '';
+    showLogin();
+};
+
+// SSO Login
+ssoBtn.onclick = () => {
+    window.location.href = `${API_BASE}/auth/login`;
+};
+
+// ====== CHAT ======
+
+// Tự động giãn nở ô nhập liệu
+userInput.addEventListener('input', () => {
+    userInput.style.height = 'auto';
+    userInput.style.height = (userInput.scrollHeight) + 'px';
+});
+
+async function sendMessage() {
+    const text = userInput.value.trim();
+    if (!text) return;
+
+    appendMessage('user', text);
+    userInput.value = '';
+    userInput.style.height = 'auto';
+
+    const loadingId = appendMessage('ai', '<i class="fas fa-spinner fa-spin"></i> AI đang phân tích dữ liệu...');
+
+    try {
+        const headers = {
+            'Content-Type': 'application/json'
+        };
+        if (currentUser) {
+            headers['X-User-Email'] = currentUser.email;
+            headers['X-User-Role'] = currentUser.role;
+        }
+
+        const response = await fetch(`${API_BASE}/chat`, {
+            method: 'POST',
+            headers: headers,
+            body: JSON.stringify({ 
+                query: text, 
+                history: chatHistory 
+            })
+        });
+
+        if (!response.ok) {
+            throw new Error(`Server error: ${response.status}`);
+        }
+
+        const data = await response.json();
+        updateMessage(loadingId, data.answer, data.sources);
+        
+        chatHistory.push({ role: 'user', content: text });
+        chatHistory.push({ role: 'assistant', content: data.answer });
+        if (chatHistory.length > 6) chatHistory = chatHistory.slice(-6);
+
+    } catch (error) {
+        console.error("Chat error:", error);
+        updateMessage(loadingId, "⚠️ Lỗi kết nối tới máy chủ AI. Vui lòng đảm bảo Backend đang chạy tại port 8000.");
+    }
+}
+
+function appendMessage(role, text) {
+    const id = Date.now();
+    const msgDiv = document.createElement('div');
+    msgDiv.className = `message ${role}`;
+    msgDiv.id = `msg-${id}`;
+    
+    const icon = role === 'ai' ? 'robot' : 'user';
+    
+    msgDiv.innerHTML = `
+        <div class="avatar"><i class="fas fa-${icon}"></i></div>
+        <div class="content">${text}</div>
+    `;
+    
+    chatWindow.appendChild(msgDiv);
+    chatWindow.scrollTop = chatWindow.scrollHeight;
+    return id;
+}
+
+function updateMessage(id, text, sources = []) {
+    const msgDiv = document.getElementById(`msg-${id}`);
+    if (!msgDiv) return;
+    
+    const contentDiv = msgDiv.querySelector('.content');
+    contentDiv.innerHTML = text.replace(/\n/g, '<br>');
+
+    if (sources && sources.length > 0) {
+        const tagDiv = document.createElement('div');
+        tagDiv.style.marginTop = '15px';
+        tagDiv.style.display = 'flex';
+        tagDiv.style.gap = '8px';
+        tagDiv.style.flexWrap = 'wrap';
+
+        sources.forEach((src, idx) => {
+            const tag = document.createElement('span');
+            tag.className = 'citation-tag';
+            tag.innerHTML = `<i class="fas fa-file-pdf"></i> Nguồn ${idx + 1}`;
+            tag.onclick = (e) => {
+                e.stopPropagation();
+                showSources(sources);
+            };
+            tagDiv.appendChild(tag);
+        });
+        contentDiv.appendChild(tagDiv);
+    }
+    chatWindow.scrollTop = chatWindow.scrollHeight;
+}
+
+function showSources(sources) {
+    sourceList.innerHTML = '';
+    sources.forEach(src => {
+        const item = document.createElement('div');
+        item.className = 'source-item';
+        item.innerHTML = `
+            <h4><i class="fas fa-file-alt"></i> ${src.file_name}</h4>
+            <p><strong>Vị trí:</strong> Trang ${src.page}</p>
+            ${src.url ? `<a href="${src.url}" target="_blank" style="color: #06b6d4; font-size: 11px; text-decoration: none; display: block; margin-top: 5px;">
+                <i class="fas fa-external-link-alt"></i> Xem trên SharePoint
+            </a>` : ''}
+            ${src.download_url ? `<a href="${src.download_url}" target="_blank" style="color: #10b981; font-size: 11px; text-decoration: none; display: block; margin-top: 5px;">
+                <i class="fas fa-download"></i> Tải xuống
+            </a>` : ''}
+        `;
+        sourceList.appendChild(item);
+    });
+    sourcePanel.classList.add('active');
+}
+
+// Event Listeners
+closePanel.onclick = () => sourcePanel.classList.remove('active');
+sendBtn.onclick = sendMessage;
+userInput.onkeydown = (e) => {
+    if (e.key === 'Enter' && !e.shiftKey) {
+        e.preventDefault();
+        sendMessage();
+    }
+};
+
+clearChatBtn.onclick = () => {
+    chatWindow.innerHTML = '';
+    chatHistory = [];
+    appendMessage('ai', 'Lịch sử chat đã được làm sạch. Tôi có thể giúp gì tiếp cho bạn?');
+};
+
+// ====== SYNC ======
+
+async function triggerSync() {
+    syncBtn.disabled = true;
+    syncStatus.style.display = 'flex';
+    syncStatus.className = 'sync-status';
+    syncStatus.querySelector('.sync-text').textContent = 'Đang đồng bộ...';
+
+    try {
+        const response = await fetch(`${API_BASE}/sync`, { method: 'POST' });
+        const data = await response.json();
+
+        if (data.status === 'already_running') {
+            syncStatus.querySelector('.sync-text').textContent = 'Đồng bộ đang chạy...';
+        } else {
+            syncStatus.querySelector('.sync-text').textContent = 'Đang xử lý...';
+            pollSyncStatus();
+        }
+    } catch (error) {
+        syncStatus.className = 'sync-status error';
+        syncStatus.querySelector('.sync-text').textContent = 'Lỗi kết nối server';
+        syncBtn.disabled = false;
+    }
+}
+
+async function pollSyncStatus() {
+    try {
+        const response = await fetch(`${API_BASE}/sync/status`);
+        const data = await response.json();
+
+        if (data.running) {
+            const count = data.processed + data.skipped;
+            syncStatus.querySelector('.sync-text').textContent = `Đang xử lý... (${count} file)`;
+            setTimeout(pollSyncStatus, 2000);
+        } else {
+            syncStatus.className = 'sync-status done';
+            syncStatus.querySelector('.sync-text').textContent = 
+                `Xong! ${data.processed} file đã nạp, ${data.skipped} bỏ qua`;
+            syncBtn.disabled = false;
+            setTimeout(() => { syncStatus.style.display = 'none'; }, 5000);
+        }
+    } catch (error) {
+        syncStatus.className = 'sync-status error';
+        syncStatus.querySelector('.sync-text').textContent = 'Lỗi kiểm tra trạng thái';
+        syncBtn.disabled = false;
+    }
+}
+
+syncBtn.onclick = triggerSync;
+
+// Init
+checkLogin();
diff --git a/frontend/index.html b/frontend/index.html
new file mode 100644
index 0000000..4d73d9e
--- /dev/null
+++ b/frontend/index.html
@@ -0,0 +1,119 @@
+<!DOCTYPE html>
+<html lang="vi">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>AI Knowledge Hub | Enterprise RAG</title>
+    <link rel="stylesheet" href="style.css">
+    <link rel="preconnect" href="https://fonts.googleapis.com">
+    <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+    <link href="https://fonts.googleapis.com/css2?family=Outfit:wght@300;400;600;700&display=swap" rel="stylesheet">
+    <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.4.0/css/all.min.css">
+</head>
+<body>
+    <!-- Login Screen -->
+    <div class="login-screen" id="login-screen">
+        <div class="login-card">
+            <div class="login-logo">
+                <div class="logo-icon"><i class="fas fa-brain"></i></div>
+                <h1>VibeCode AI</h1>
+                <p>Enterprise Knowledge Hub</p>
+            </div>
+            <form id="login-form">
+                <div class="input-group">
+                    <i class="fas fa-envelope"></i>
+                    <input type="email" id="login-email" placeholder="Nhập email của bạn" required>
+                </div>
+                <button type="submit" class="login-btn">
+                    <i class="fas fa-sign-in-alt"></i> Đăng nhập
+                </button>
+            </form>
+            <div class="login-divider"><span>hoặc</span></div>
+            <button class="sso-btn" id="sso-btn">
+                <i class="fab fa-microsoft"></i> Đăng nhập Microsoft SSO
+            </button>
+            <p class="login-hint">Dùng tài khoản Microsoft 365 của công ty.</p>
+        </div>
+    </div>
+
+    <!-- Main App (hidden until login) -->
+    <div class="app-container" id="app-container" style="display: none;">
+        <!-- Sidebar -->
+        <aside class="sidebar">
+            <div class="logo-area">
+                <div class="logo-icon"><i class="fas fa-brain"></i></div>
+                <h1>VibeCode AI</h1>
+            </div>
+            
+            <nav class="side-nav">
+                <div class="nav-item active"><i class="fas fa-comments"></i> <span>Hỏi đáp RAG</span></div>
+                <button class="sync-btn" id="sync-btn"><i class="fas fa-sync-alt"></i> Đồng bộ SharePoint</button>
+            </nav>
+
+            <div class="sync-status" id="sync-status" style="display: none;">
+                <div class="sync-spinner"><i class="fas fa-spinner fa-spin"></i></div>
+                <span class="sync-text">Đang đồng bộ...</span>
+            </div>
+
+            <div class="system-status">
+                <div class="status-dot online"></div>
+                <div class="status-text">OpenSearch: Online</div>
+            </div>
+            
+            <div class="user-info" id="user-info">
+                <div class="user-avatar"><i class="fas fa-user"></i></div>
+                <div class="user-details">
+                    <span class="user-name" id="user-name">-</span>
+                    <span class="user-role" id="user-role">-</span>
+                </div>
+                <button id="logout-btn" title="Đăng xuất"><i class="fas fa-sign-out-alt"></i></button>
+            </div>
+        </aside>
+
+        <!-- Main Chat Area -->
+        <main class="chat-main">
+            <header class="chat-header">
+                <div class="header-info">
+                    <h2>SharePoint Intelligence</h2>
+                    <p>Hỏi đáp dựa trên cơ sở dữ liệu nội bộ của bạn</p>
+                </div>
+                <div class="header-actions">
+                    <button id="clear-chat" title="Xoá lịch sử"><i class="fas fa-trash-alt"></i></button>
+                </div>
+            </header>
+
+            <div class="messages-container" id="chat-window">
+                <!-- Chào mừng -->
+                <div class="message ai greeting">
+                    <div class="avatar"><i class="fas fa-robot"></i></div>
+                    <div class="content">
+                        <p>Xin chào! Tôi là trợ lý tri thức của bạn. Tôi đã sẵn sàng trả lời các câu hỏi dựa trên tài liệu từ SharePoint. Bạn muốn tìm hiểu điều gì hôm nay?</p>
+                    </div>
+                </div>
+            </div>
+
+            <!-- Input Area -->
+            <footer class="chat-footer">
+                <div class="input-wrapper">
+                    <textarea id="user-input" placeholder="Nhập câu hỏi của bạn tại đây..." rows="1"></textarea>
+                    <button id="send-btn"><i class="fas fa-paper-plane"></i></button>
+                </div>
+                <p class="disclaimer">AI có thể đưa ra câu trả lời chưa chính xác. Vui lòng kiểm tra lại nguồn trích dẫn.</p>
+            </footer>
+        </main>
+
+        <!-- Source Sidebar (Hidden by default) -->
+        <section class="source-panel" id="source-panel">
+            <div class="panel-header">
+                <h3><i class="fas fa-book-open"></i> Nguồn trích dẫn</h3>
+                <button id="close-panel"><i class="fas fa-times"></i></button>
+            </div>
+            <div class="source-list" id="source-list">
+                <!-- Nguồn sẽ được render ở đây -->
+            </div>
+        </section>
+    </div>
+
+    <script src="app.js"></script>
+</body>
+</html>
diff --git a/frontend/style.css b/frontend/style.css
new file mode 100644
index 0000000..a673e07
--- /dev/null
+++ b/frontend/style.css
@@ -0,0 +1,591 @@
+:root {
+    --bg-dark: #0f172a;
+    --glass-bg: rgba(255, 255, 255, 0.05);
+    --glass-border: rgba(255, 255, 255, 0.1);
+    --primary: #06b6d4;
+    --primary-glow: rgba(6, 182, 212, 0.3);
+    --secondary: #8b5cf6;
+    --text-main: #f8fafc;
+    --text-muted: #94a3b8;
+    --ai-bubble: rgba(30, 41, 59, 0.7);
+    --user-bubble: linear-gradient(135deg, #06b6d4, #3b82f6);
+}
+
+* {
+    margin: 0;
+    padding: 0;
+    box-sizing: border-box;
+    font-family: 'Outfit', sans-serif;
+}
+
+body {
+    background-color: var(--bg-dark);
+    /* Mesh Gradient cực kỳ Premium bằng CSS thuần */
+    background-image: 
+        radial-gradient(at 0% 0%, rgba(6, 182, 212, 0.15) 0px, transparent 50%),
+        radial-gradient(at 100% 0%, rgba(139, 92, 246, 0.15) 0px, transparent 50%),
+        radial-gradient(at 100% 100%, rgba(6, 182, 212, 0.1) 0px, transparent 50%),
+        radial-gradient(at 0% 100%, rgba(139, 92, 246, 0.1) 0px, transparent 50%);
+    background-size: cover;
+    height: 100vh;
+    display: flex;
+    justify-content: center;
+    align-items: center;
+    color: var(--text-main);
+    overflow: hidden;
+}
+
+.app-container {
+    width: 95vw;
+    height: 90vh;
+    background: var(--glass-bg);
+    backdrop-filter: blur(20px);
+    -webkit-backdrop-filter: blur(20px);
+    border: 1px solid var(--glass-border);
+    border-radius: 24px;
+    display: flex;
+    box-shadow: 0 25px 50px -12px rgba(0, 0, 0, 0.5);
+    overflow: hidden;
+    position: relative;
+}
+
+/* Sidebar Styling */
+.sidebar {
+    width: 260px;
+    background: rgba(15, 23, 42, 0.4);
+    border-right: 1px solid var(--glass-border);
+    padding: 30px 20px;
+    display: flex;
+    flex-direction: column;
+}
+
+.logo-area {
+    display: flex;
+    align-items: center;
+    gap: 12px;
+    margin-bottom: 50px;
+}
+
+.logo-icon {
+    width: 40px;
+    height: 40px;
+    background: var(--user-bubble);
+    border-radius: 12px;
+    display: flex;
+    justify-content: center;
+    align-items: center;
+    font-size: 20px;
+    box-shadow: 0 0 20px var(--primary-glow);
+}
+
+.logo-area h1 {
+    font-size: 20px;
+    font-weight: 700;
+    letter-spacing: 0.5px;
+}
+
+.side-nav {
+    flex: 1;
+}
+
+.nav-item {
+    padding: 14px 18px;
+    border-radius: 12px;
+    display: flex;
+    align-items: center;
+    gap: 15px;
+    cursor: pointer;
+    transition: 0.3s;
+    color: var(--text-muted);
+    margin-bottom: 8px;
+}
+
+.nav-item:hover, .nav-item.active {
+    background: rgba(255, 255, 255, 0.08);
+    color: var(--text-main);
+}
+
+.nav-item.active {
+    border-left: 4px solid var(--primary);
+}
+
+.system-status {
+    padding-top: 20px;
+    border-top: 1px solid var(--glass-border);
+    display: flex;
+    align-items: center;
+    gap: 10px;
+    font-size: 13px;
+    color: var(--text-muted);
+}
+
+.status-dot {
+    width: 8px;
+    height: 8px;
+    border-radius: 50%;
+}
+
+.status-dot.online {
+    background: #10b981;
+    box-shadow: 0 0 10px #10b981;
+}
+
+/* Main Chat Area */
+.chat-main {
+    flex: 1;
+    display: flex;
+    flex-direction: column;
+    position: relative;
+}
+
+.chat-header {
+    padding: 25px 40px;
+    border-bottom: 1px solid var(--glass-border);
+    display: flex;
+    justify-content: space-between;
+    align-items: center;
+}
+
+.header-info h2 {
+    font-size: 18px;
+}
+
+.header-info p {
+    font-size: 13px;
+    color: var(--text-muted);
+}
+
+.header-actions button {
+    background: transparent;
+    border: none;
+    color: var(--text-muted);
+    font-size: 18px;
+    cursor: pointer;
+    transition: 0.3s;
+}
+
+.header-actions button:hover {
+    color: #ef4444;
+}
+
+.messages-container {
+    flex: 1;
+    padding: 40px;
+    overflow-y: auto;
+    display: flex;
+    flex-direction: column;
+    gap: 25px;
+    scrollbar-width: thin;
+    scrollbar-color: var(--glass-border) transparent;
+}
+
+.messages-container::-webkit-scrollbar {
+    width: 6px;
+}
+
+.messages-container::-webkit-scrollbar-thumb {
+    background-color: var(--glass-border);
+    border-radius: 10px;
+}
+
+.message {
+    display: flex;
+    gap: 18px;
+    max-width: 85%;
+    animation: fadeIn 0.4s ease-out;
+}
+
+@keyframes fadeIn {
+    from { opacity: 0; transform: translateY(10px); }
+    to { opacity: 1; transform: translateY(0); }
+}
+
+.message.user {
+    align-self: flex-end;
+    flex-direction: row-reverse;
+}
+
+.avatar {
+    width: 40px;
+    height: 40px;
+    border-radius: 12px;
+    display: flex;
+    justify-content: center;
+    align-items: center;
+    flex-shrink: 0;
+}
+
+.ai .avatar { background: rgba(255, 255, 255, 0.1); color: var(--primary); border: 1px solid var(--glass-border); }
+.user .avatar { background: var(--user-bubble); color: white; }
+
+.content {
+    background: var(--ai-bubble);
+    padding: 16px 20px;
+    border-radius: 18px;
+    border-bottom-left-radius: 4px;
+    line-height: 1.6;
+    font-size: 15px;
+    border: 1px solid var(--glass-border);
+}
+
+.user .content {
+    background: var(--user-bubble);
+    border-bottom-left-radius: 18px;
+    border-bottom-right-radius: 4px;
+    border: none;
+}
+
+.citation-tag {
+    display: inline-block;
+    margin-top: 10px;
+    padding: 4px 10px;
+    background: rgba(6, 182, 212, 0.1);
+    border: 1px solid rgba(6, 182, 212, 0.2);
+    border-radius: 8px;
+    font-size: 12px;
+    color: var(--primary);
+    cursor: pointer;
+    transition: 0.3s;
+}
+
+.citation-tag:hover {
+    background: rgba(6, 182, 212, 0.2);
+}
+
+/* Footer Input Area */
+.chat-footer {
+    padding: 30px 40px;
+}
+
+.input-wrapper {
+    background: rgba(255, 255, 255, 0.05);
+    border: 1px solid var(--glass-border);
+    border-radius: 16px;
+    padding: 10px 15px;
+    display: flex;
+    align-items: center;
+    gap: 15px;
+    transition: 0.3s;
+}
+
+.input-wrapper:focus-within {
+    border-color: var(--primary);
+    box-shadow: 0 0 15px var(--primary-glow);
+}
+
+textarea {
+    flex: 1;
+    background: transparent;
+    border: none;
+    color: var(--text-main);
+    resize: none;
+    outline: none;
+    padding: 10px 5px;
+    max-height: 150px;
+}
+
+#send-btn {
+    width: 45px;
+    height: 45px;
+    background: var(--user-bubble);
+    border: none;
+    border-radius: 12px;
+    color: white;
+    cursor: pointer;
+    transition: 0.3s;
+}
+
+#send-btn:hover {
+    transform: scale(1.05);
+    box-shadow: 0 0 15px var(--primary-glow);
+}
+
+.disclaimer {
+    text-align: center;
+    font-size: 11px;
+    color: var(--text-muted);
+    margin-top: 12px;
+}
+
+/* Source Panel */
+.source-panel {
+    position: absolute;
+    right: -350px;
+    top: 0;
+    width: 350px;
+    height: 100%;
+    background: rgba(15, 23, 42, 0.95);
+    backdrop-filter: blur(25px);
+    border-left: 1px solid var(--glass-border);
+    transition: 0.4s cubic-bezier(0.4, 0, 0.2, 1);
+    z-index: 10;
+    padding: 30px;
+}
+
+.source-panel.active {
+    right: 0;
+}
+
+.panel-header {
+    display: flex;
+    justify-content: space-between;
+    align-items: center;
+    margin-bottom: 30px;
+}
+
+.source-item {
+    background: rgba(255, 255, 255, 0.05);
+    border: 1px solid var(--glass-border);
+    border-radius: 12px;
+    padding: 15px;
+    margin-bottom: 15px;
+}
+
+.source-item h4 { font-size: 14px; margin-bottom: 5px; color: var(--primary); }
+.source-item p { font-size: 12px; color: var(--text-muted); line-height: 1.4; }
+
+/* Login Screen */
+.login-screen {
+    position: fixed;
+    top: 0;
+    left: 0;
+    width: 100vw;
+    height: 100vh;
+    display: flex;
+    justify-content: center;
+    align-items: center;
+    z-index: 100;
+}
+
+.login-card {
+    background: var(--glass-bg);
+    backdrop-filter: blur(30px);
+    border: 1px solid var(--glass-border);
+    border-radius: 24px;
+    padding: 50px 40px;
+    width: 400px;
+    text-align: center;
+    box-shadow: 0 25px 50px -12px rgba(0, 0, 0, 0.5);
+}
+
+.login-logo {
+    margin-bottom: 40px;
+}
+
+.login-logo .logo-icon {
+    width: 60px;
+    height: 60px;
+    margin: 0 auto 20px;
+    font-size: 28px;
+}
+
+.login-logo h1 {
+    font-size: 24px;
+    margin-bottom: 8px;
+}
+
+.login-logo p {
+    color: var(--text-muted);
+    font-size: 14px;
+}
+
+.input-group {
+    display: flex;
+    align-items: center;
+    gap: 12px;
+    background: rgba(255, 255, 255, 0.05);
+    border: 1px solid var(--glass-border);
+    border-radius: 12px;
+    padding: 14px 18px;
+    margin-bottom: 20px;
+    transition: 0.3s;
+}
+
+.input-group:focus-within {
+    border-color: var(--primary);
+    box-shadow: 0 0 15px var(--primary-glow);
+}
+
+.input-group i {
+    color: var(--text-muted);
+    font-size: 16px;
+}
+
+.input-group input {
+    flex: 1;
+    background: transparent;
+    border: none;
+    outline: none;
+    color: var(--text-main);
+    font-size: 15px;
+}
+
+.login-btn {
+    width: 100%;
+    padding: 14px;
+    background: var(--user-bubble);
+    border: none;
+    border-radius: 12px;
+    color: white;
+    font-size: 16px;
+    font-weight: 600;
+    cursor: pointer;
+    transition: 0.3s;
+}
+
+.login-btn:hover {
+    transform: scale(1.02);
+    box-shadow: 0 0 20px var(--primary-glow);
+}
+
+.login-hint {
+    margin-top: 20px;
+    font-size: 12px;
+    color: var(--text-muted);
+}
+
+.login-divider {
+    display: flex;
+    align-items: center;
+    gap: 15px;
+    margin: 20px 0;
+    color: var(--text-muted);
+    font-size: 13px;
+}
+
+.login-divider::before,
+.login-divider::after {
+    content: '';
+    flex: 1;
+    height: 1px;
+    background: var(--glass-border);
+}
+
+.sso-btn {
+    width: 100%;
+    padding: 14px;
+    background: rgba(255, 255, 255, 0.08);
+    border: 1px solid var(--glass-border);
+    border-radius: 12px;
+    color: var(--text-main);
+    font-size: 15px;
+    font-weight: 600;
+    cursor: pointer;
+    transition: 0.3s;
+    display: flex;
+    align-items: center;
+    justify-content: center;
+    gap: 10px;
+}
+
+.sso-btn:hover {
+    background: rgba(255, 255, 255, 0.15);
+    border-color: var(--primary);
+}
+
+/* User Info */
+.user-info {
+    padding-top: 20px;
+    border-top: 1px solid var(--glass-border);
+    display: flex;
+    align-items: center;
+    gap: 12px;
+    font-size: 13px;
+    color: var(--text-muted);
+}
+
+.user-avatar {
+    width: 36px;
+    height: 36px;
+    background: rgba(255, 255, 255, 0.1);
+    border-radius: 10px;
+    display: flex;
+    align-items: center;
+    justify-content: center;
+    color: var(--primary);
+}
+
+.user-details {
+    flex: 1;
+    display: flex;
+    flex-direction: column;
+}
+
+.user-name {
+    color: var(--text-main);
+    font-weight: 600;
+    font-size: 13px;
+}
+
+.user-role {
+    font-size: 11px;
+    text-transform: uppercase;
+    letter-spacing: 0.5px;
+}
+
+#logout-btn {
+    background: transparent;
+    border: none;
+    color: var(--text-muted);
+    cursor: pointer;
+    padding: 8px;
+    border-radius: 8px;
+    transition: 0.3s;
+}
+
+#logout-btn:hover {
+    color: #ef4444;
+    background: rgba(239, 68, 68, 0.1);
+}
+
+/* Sync Button */
+.sync-btn {
+    width: 100%;
+    padding: 12px 18px;
+    margin-top: 10px;
+    background: rgba(6, 182, 212, 0.1);
+    border: 1px solid rgba(6, 182, 212, 0.2);
+    border-radius: 12px;
+    color: var(--primary);
+    font-size: 13px;
+    cursor: pointer;
+    transition: 0.3s;
+    display: flex;
+    align-items: center;
+    gap: 10px;
+}
+
+.sync-btn:hover {
+    background: rgba(6, 182, 212, 0.2);
+}
+
+.sync-btn:disabled {
+    opacity: 0.5;
+    cursor: not-allowed;
+}
+
+/* Sync Status */
+.sync-status {
+    padding: 12px;
+    margin-top: 10px;
+    background: rgba(139, 92, 246, 0.1);
+    border: 1px solid rgba(139, 92, 246, 0.2);
+    border-radius: 10px;
+    display: flex;
+    align-items: center;
+    gap: 10px;
+    font-size: 12px;
+    color: var(--secondary);
+}
+
+.sync-status.done {
+    background: rgba(16, 185, 129, 0.1);
+    border-color: rgba(16, 185, 129, 0.2);
+    color: #10b981;
+}
+
+.sync-status.error {
+    background: rgba(239, 68, 68, 0.1);
+    border-color: rgba(239, 68, 68, 0.2);
+    color: #ef4444;
+}
diff --git a/indexing/vector_store.py b/indexing/vector_store.py
index c8573c4..b3b0a03 100644
--- a/indexing/vector_store.py
+++ b/indexing/vector_store.py
@@ -1,4 +1,5 @@
 import logging
+import os
 from typing import List
 from opensearchpy import OpenSearch, RequestsHttpConnection
 from core.models import DocumentChunk
@@ -10,9 +11,12 @@ class VectorStore:
     def __init__(self, index_name: str = "sharepoint_docs"):
         self.index_name = index_name
         
-        # Kết nối tới OpenSearch Cluster
+        host = settings.opensearch_host
+        if host == "opensearch" and os.environ.get("ENV") != "docker":
+            host = "localhost"
+        
         self.client = OpenSearch(
-            hosts=[{'host': settings.opensearch_host, 'port': settings.opensearch_port}],
+            hosts=[{'host': host, 'port': settings.opensearch_port}],
             http_auth=(settings.opensearch_user, settings.opensearch_pass),
             use_ssl=False,
             verify_certs=False,
@@ -64,6 +68,7 @@ class VectorStore:
                         "page_from": { "type": "integer" },
                         "page_to": { "type": "integer" },
                         "source_url": { "type": "keyword" },
+                        "download_url": { "type": "keyword" },
                         "permissions": { "type": "keyword" }
                     }
                 }
@@ -71,6 +76,23 @@ class VectorStore:
             self.client.indices.create(index=self.index_name, body=mapping)
             logger.info(f"Đã tạo OpenSearch Index: {self.index_name}")
 
+    def delete_by_file_id(self, file_id: str):
+        """Xóa tất cả chunks cũ của một file trước khi nạp lại."""
+        query = {
+            "query": {
+                "term": { "file_id": file_id }
+            }
+        }
+        try:
+            response = self.client.delete_by_query(index=self.index_name, body=query)
+            deleted = response.get("deleted", 0)
+            if deleted > 0:
+                logger.info(f"Đã xóa {deleted} chunks cũ của file_id={file_id}")
+            return deleted
+        except Exception as e:
+            logger.warning(f"Không thể xóa chunks cũ (có thể index chưa tồn tại): {e}")
+            return 0
+
     def embed_and_index(self, chunks: List[DocumentChunk]):
         """Biến đổi Text thành Vector và lưu vào Database"""
         if not chunks:
diff --git a/ingestion/graph_client.py b/ingestion/graph_client.py
index 3564080..325dbad 100755
--- a/ingestion/graph_client.py
+++ b/ingestion/graph_client.py
@@ -132,3 +132,13 @@ class GraphClient:
         else:
             url = f"{self.base_url}/drives/{drive_id}/root/delta"
         return self._make_get_request(url)
+
+    def get_item_details(self, drive_id: str, item_id: str):
+        """GET /drives/{driveId}/items/{itemId} - Lấy thông tin chi tiết bao gồm webUrl và downloadUrl."""
+        url = f"{self.base_url}/drives/{drive_id}/items/{item_id}"
+        return self._make_get_request(url)
+
+    def get_item_permissions(self, drive_id: str, item_id: str):
+        """GET /drives/{driveId}/items/{itemId}/permissions - Lấy danh sách quyền truy cập."""
+        url = f"{self.base_url}/drives/{drive_id}/items/{item_id}/permissions"
+        return self._make_get_request(url)
diff --git a/ingestion/providers/base_provider.py b/ingestion/providers/base_provider.py
index 5824c45..c8502f2 100644
--- a/ingestion/providers/base_provider.py
+++ b/ingestion/providers/base_provider.py
@@ -34,3 +34,29 @@ class BaseStorageProvider(ABC):
             bytes: The raw file content.
         """
         pass
+
+    @abstractmethod
+    def get_item_details(self, item_id: str) -> Dict:
+        """
+        Get full item details including webUrl and downloadUrl.
+        
+        Args:
+            item_id (str): The item ID from fetch_changes.
+            
+        Returns:
+            Dict: Full item details with links.
+        """
+        pass
+
+    @abstractmethod
+    def get_item_permissions(self, item_id: str) -> List[str]:
+        """
+        Get permissions for an item. Returns list of user/group emails or IDs.
+        
+        Args:
+            item_id (str): The item ID from fetch_changes.
+            
+        Returns:
+            List[str]: List of user/group identifiers. ["*"] means everyone can access.
+        """
+        pass
diff --git a/ingestion/providers/sharepoint_provider.py b/ingestion/providers/sharepoint_provider.py
index abf43a2..587f0cd 100644
--- a/ingestion/providers/sharepoint_provider.py
+++ b/ingestion/providers/sharepoint_provider.py
@@ -81,6 +81,62 @@ class SharePointProvider(BaseStorageProvider):
         
         return standardized_items, new_state
 
+    def get_item_details(self, item_id: str) -> Dict:
+        """
+        Get full item details including webUrl and downloadUrl.
+        """
+        try:
+            item = self.graph.get_item_details(self.drive_id, item_id)
+            return {
+                "id": item.get("id"),
+                "name": item.get("name"),
+                "web_url": item.get("webUrl"),
+                "download_url": item.get("@microsoft.graph.downloadUrl"),
+                "size": item.get("size"),
+                "last_modified": item.get("lastModifiedDateTime"),
+            }
+        except Exception as e:
+            logger.error(f"Failed to get item details for {item_id}: {e}")
+            raise e
+
+    def get_item_permissions(self, item_id: str) -> List[str]:
+        """
+        Get permissions for an item. Returns list of user/group emails or IDs.
+        """
+        try:
+            response = self.graph.get_item_permissions(self.drive_id, item_id)
+            permissions = set()
+            
+            for perm in response.get("value", []):
+                # Lấy grantedTo hoặc grantedToIdentities
+                granted = perm.get("grantedTo", {})
+                if not granted:
+                    identities = perm.get("grantedToIdentitiesV2", [])
+                    for identity in identities:
+                        user = identity.get("user", {})
+                        if user.get("email"):
+                            permissions.add(user["email"].lower())
+                        elif user.get("id"):
+                            permissions.add(user["id"])
+                
+                user = granted.get("user", {})
+                if user.get("email"):
+                    permissions.add(user["email"].lower())
+                elif user.get("id"):
+                    permissions.add(user["id"])
+                
+                # Nếu có grantedToV2 (site group)
+                granted_v2 = perm.get("grantedToV2", {})
+                site_group = granted_v2.get("siteGroup", {})
+                if site_group.get("displayName"):
+                    permissions.add(f"group:{site_group['displayName']}")
+            
+            return list(permissions) if permissions else ["*"]
+            
+        except Exception as e:
+            logger.warning(f"Failed to get permissions for {item_id}: {e}. Defaulting to ['*']")
+            return ["*"]
+
     def download_file(self, target_item: Dict) -> bytes:
         """
         Download file content from SharePoint.
diff --git a/ingestion/sync.py b/ingestion/sync.py
index f17556d..8544972 100644
--- a/ingestion/sync.py
+++ b/ingestion/sync.py
@@ -3,145 +3,88 @@ import json
 import logging
 from typing import List, Dict, Any
 
-# Ensure we can import from the root module if run directly
 import sys
 sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 
-from ingestion.graph_client import GraphClient
+from ingestion.providers.base_provider import BaseStorageProvider
 
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger("IngestionSync")
 
-class SharePointSync:
-    def __init__(self, hostname: str, site_path: str):
-        self.graph_client = GraphClient()
-        self.hostname = hostname
-        self.site_path = site_path
-        self.state_file = "delta_state.json"
-        self.output_file = "ingestion_output.json"
-        
-    def _load_delta_link(self) -> str:
-        """Load delta link from local state file."""
+
+class SyncEngine:
+    """
+    Bộ điều phối đồng bộ không phụ thuộc vào nhà cung cấp cụ thể.
+    Nhận vào bất kỳ BaseStorageProvider nào (SharePoint, Google Drive, NAS, ...).
+    """
+    def __init__(self, provider: BaseStorageProvider, state_file: str = "delta_state.json", output_file: str = "ingestion_output.json"):
+        self.provider = provider
+        self.state_file = state_file
+        self.output_file = output_file
+
+    def _load_sync_state(self) -> Dict:
+        """Load sync state từ local file."""
         if os.path.exists(self.state_file):
             with open(self.state_file, "r", encoding="utf-8") as f:
-                data = json.load(f)
-                return data.get("delta_link")
-        return None
-        
-    def _save_delta_link(self, delta_link: str):
-        """Save delta link to local state file for next incremental sync."""
-        with open(self.state_file, "w", encoding="utf-8") as f:
-            json.dump({"delta_link": delta_link}, f, indent=2)
-            
-    def _extract_metadata(self, item: Dict[Any, Any], site_id: str, drive_id: str) -> Dict[str, Any]:
-        """Convert Graph API item payload to our target schema."""
-        download_url = item.get("@microsoft.graph.downloadUrl")
-        if not download_url and "folder" not in item and "deleted" not in item:
-            try:
-                # Delta query might not return downloadUrl, so fetch it directly
-                item_id = item.get("id")
-                url = f"https://graph.microsoft.com/v1.0/drives/{drive_id}/items/{item_id}"
-                full_item = self.graph_client._make_get_request(url)
-                download_url = full_item.get("@microsoft.graph.downloadUrl")
-            except Exception as e:
-                logger.error(f"Failed to fetch download_url for {item.get('name')}: {e}")
+                return json.load(f)
+        return {}
 
-        return {
-            "site_id": site_id,
-            "drive_id": drive_id,
-            "item_id": item.get("id"),
-            "name": item.get("name"),
-            "web_url": item.get("webUrl"),
-            "download_url": download_url,
-            "mime_type": item.get("file", {}).get("mimeType") if "file" in item else None,
-            "parent_path": item.get("parentReference", {}).get("path"),
-            "is_folder": "folder" in item,
-            "size": item.get("size"),
-            "last_modified": item.get("lastModifiedDateTime"),
-            "created": item.get("createdDateTime"),
-            "eTag": item.get("eTag"),
-            "cTag": item.get("cTag"),
-            "deleted": "deleted" in item
-        }
+    def _save_sync_state(self, state: Dict):
+        """Save sync state ra local file."""
+        with open(self.state_file, "w", encoding="utf-8") as f:
+            json.dump(state, f, indent=2, ensure_ascii=False)
 
     def _upsert_to_local_db(self, new_items: List[Dict[str, Any]]):
-        """Simulate upsert into a database by writing to a JSON file."""
+        """Lưu kết quả vào local JSON (mô phỏng DB)."""
         db = {}
         if os.path.exists(self.output_file):
             with open(self.output_file, "r", encoding="utf-8") as f:
                 try:
                     existing = json.load(f)
                     for item in existing:
-                        db[item["item_id"]] = item
+                        db[item["id"]] = item
                 except json.JSONDecodeError:
                     pass
-                    
+
         for item in new_items:
-            if item.get("deleted"):
-                # If deleted, we mark it as deleted in our db (or we could remove it)
-                if item["item_id"] in db:
-                    db[item["item_id"]]["deleted"] = True
+            item_id = item.get("id")
+            if item.get("is_deleted"):
+                if item_id in db:
+                    db[item_id]["is_deleted"] = True
                 else:
-                    # It's deleted but we didn't have it anyway
-                    db[item["item_id"]] = item
+                    db[item_id] = item
             else:
-                db[item["item_id"]] = item
-                
+                db[item_id] = item
+
         final_list = list(db.values())
         with open(self.output_file, "w", encoding="utf-8") as f:
             json.dump(final_list, f, indent=2, ensure_ascii=False)
-            
-        logger.info(f"Local database updated. Total items currently stored: {len(final_list)}")
+
+        logger.info(f"Local database updated. Total items: {len(final_list)}")
 
     def run_sync(self):
-        logger.info("=== STARTING SHAREPOINT SYNC ===")
-        
-        # 1. & 2. Resolve Site and Drive
-        logger.info(f"Resolving site: {self.hostname}:{self.site_path}")
-        site_info = self.graph_client.get_site_by_path(self.hostname, self.site_path)
-        site_id = site_info["id"]
-        
-        logger.info(f"Resolving drive for site: {site_id}")
-        drive_info = self.graph_client.get_drive(site_id)
-        drive_id = drive_info["id"]
-        
-        # 3. Delta Query setup
-        delta_link = self._load_delta_link()
-        if delta_link:
-            logger.info("Found existing delta_link. Performing INCREMENTAL sync.")
+        """Chạy đồng bộ: fetch changes từ provider -> lưu local."""
+        logger.info("=== STARTING SYNC ===")
+
+        sync_state = self._load_sync_state()
+        if sync_state:
+            logger.info("Found existing sync state. Performing INCREMENTAL sync.")
         else:
-            logger.info("No delta_link found. Performing FULL sync.")
-            
-        items_collected = []
-        current_url = delta_link
-        
-        # Loop over pagination
-        while True:
-            response = self.graph_client.delta_query(drive_id, current_url)
-            values = response.get("value", [])
-            items_collected.extend(values)
-            
-            if "@odata.nextLink" in response:
-                current_url = response["@odata.nextLink"]
-                logger.info("Fetching next page of delta results...")
-            elif "@odata.deltaLink" in response:
-                new_delta_link = response["@odata.deltaLink"]
-                self._save_delta_link(new_delta_link)
-                logger.info("Reached end of delta changes. Saved new delta_link.")
-                break
-            else:
-                logger.warning("No nextLink or deltaLink found in response! Breaking loop.")
-                break
-                
-        logger.info(f"Delta query returned {len(items_collected)} change(s).")
-        
-        # 4. Extract metadata and save
-        if items_collected:
-            processed_items = [self._extract_metadata(item, site_id, drive_id) for item in items_collected]
-            self._upsert_to_local_db(processed_items)
-        else:
-            logger.info("No items to process.")
+            logger.info("No sync state found. Performing FULL sync.")
+
+        items, new_state = self.provider.fetch_changes(sync_state)
+        logger.info(f"Provider returned {len(items)} change(s).")
+
+        if items:
+            self._upsert_to_local_db(items)
+
+        self._save_sync_state(new_state)
+        logger.info("Sync state saved.")
+
 
 if __name__ == "__main__":
-    sync = SharePointSync("285pdg.sharepoint.com", "/sites/poc_system")
-    sync.run_sync()
+    from ingestion.providers.sharepoint_provider import SharePointProvider
+
+    provider = SharePointProvider()
+    engine = SyncEngine(provider)
+    engine.run_sync()
diff --git a/search/retriever.py b/search/retriever.py
index b49873f..418f51d 100644
--- a/search/retriever.py
+++ b/search/retriever.py
@@ -1,51 +1,84 @@
 import logging
-from typing import List
+import os
+from typing import List, Optional
 from opensearchpy import OpenSearch, RequestsHttpConnection
 from core.config import settings
 from core.models import DocumentChunk
 
 logger = logging.getLogger("Retriever")
 
+
 class SearchRetriever:
     def __init__(self, index_name: str = "poc_sharepoint_docs"):
         self.index_name = index_name
         
-        # Kết nối OpenSearch
+        host = settings.opensearch_host
+        if host == "opensearch" and os.environ.get("ENV") != "docker":
+            host = "localhost"
+        
         self.client = OpenSearch(
-            hosts=[{'host': settings.opensearch_host, 'port': settings.opensearch_port}],
+            hosts=[{'host': host, 'port': settings.opensearch_port}],
             http_auth=(settings.opensearch_user, settings.opensearch_pass),
             use_ssl=False,
             verify_certs=False,
             connection_class=RequestsHttpConnection
         )
         
-        # Load Local Embedding Model (để biến câu hỏi thành vector cùng không gian với dữ liệu)
-        logger.info("Đang nạp Embedding Model cho Retriever...")
+        logger.info("Loading Embedding Model for Retriever...")
         from sentence_transformers import SentenceTransformer
         self.embedder = SentenceTransformer('keepitreal/vietnamese-sbert')
 
-    def retrieve(self, query: str, top_k: int = 5) -> List[DocumentChunk]:
+    def retrieve(self, query: str, top_k: int = 5, user_email: Optional[str] = None, is_admin: bool = False) -> List[DocumentChunk]:
         """
-        Tìm kiếm ngữ nghĩa (Semantic Search) dựa trên Vector k-NN
-        """
-        logger.info(f"Đang tìm kiếm ngữ nghĩa cho câu hỏi: '{query}'")
+        Tìm kiếm ngữ nghĩa với ACL filtering.
+        
+        Args:
+            query: Câu hỏi của user
+            top_k: Số kết quả tối đa
+            user_email: Email user để filter quyền.
+            is_admin: True = bypass ACL, thấy tất cả.
+        """
+        logger.info(f"Search: '{query[:80]}' (user={user_email or 'none'}, admin={is_admin})")
         
-        # 1. Chuyển câu hỏi thành Vector
         query_vector = self.embedder.encode(query).tolist()
         
-        # 2. Xây dựng k-NN Query cho OpenSearch
-        # Ta có thể kết hợp Hybrid Search (Vector + Text) ở đây nếu muốn
-        search_query = {
-            "size": top_k,
-            "query": {
-                "knn": {
-                    "embedding": {
-                        "vector": query_vector,
-                        "k": top_k
+        # Admin hoặc không có user_email → không filter
+        if is_admin or not user_email:
+            search_query = {
+                "size": top_k,
+                "query": {
+                    "knn": {
+                        "embedding": {
+                            "vector": query_vector,
+                            "k": top_k
+                        }
+                    }
+                }
+            }
+        else:
+            # User thường → filter theo permissions
+            search_query = {
+                "size": top_k,
+                "query": {
+                    "bool": {
+                        "must": [
+                            {
+                                "knn": {
+                                    "embedding": {
+                                        "vector": query_vector,
+                                        "k": top_k * 2
+                                    }
+                                }
+                            }
+                        ],
+                        "should": [
+                            {"term": {"permissions": "*"}},
+                            {"term": {"permissions": user_email.lower()}}
+                        ],
+                        "minimum_should_match": 1
                     }
                 }
             }
-        }
         
         try:
             response = self.client.search(
@@ -58,13 +91,12 @@ class SearchRetriever:
             
             for hit in hits:
                 source = hit["_source"]
-                # Chuyển từ JSON sang DocumentChunk model
                 chunk = DocumentChunk(**source)
                 results.append(chunk)
                 
-            logger.info(f"Tìm thấy {len(results)} đoạn văn phù hợp nhất.")
+            logger.info(f"Found {len(results)} chunks")
             return results
             
         except Exception as e:
-            logger.error(f"Lỗi khi truy vấn OpenSearch: {e}")
+            logger.error(f"OpenSearch query error: {e}")
             return []
diff --git a/test_dce_pipeline.py b/test_dce_pipeline.py
index 8c01b4a..03f0272 100644
--- a/test_dce_pipeline.py
+++ b/test_dce_pipeline.py
@@ -7,6 +7,7 @@ sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
 
 from core.models import IngestedDocument
 from extraction.dce import DocumentClassificationEngine
+from ingestion.providers.sharepoint_provider import SharePointProvider
 
 logging.basicConfig(level=logging.INFO, format="%(levelname)s:%(name)s:%(message)s")
 
@@ -19,20 +20,22 @@ def main():
         
     with open("ingestion_output.json", "r", encoding="utf-8") as f:
         items = json.load(f)
-        
-    dce = DocumentClassificationEngine()
+    
+    # Khởi tạo provider để download file qua Graph API auth
+    provider = SharePointProvider()
+    dce = DocumentClassificationEngine(provider=provider)
     
     print(f"Loaded {len(items)} items from ingestion_output.json\n")
     
     for item in items:
         if item.get("is_folder"):
-            continue # DCE only processes files
+            continue
             
         doc = IngestedDocument(**item)
         
         print(f"\n--- Processing: {doc.name} ---")
-        result = dce.classify(doc)
-        print(f">> Policy: {result.processing_policy.value} | Reason: {result.reason}")
+        result = dce.classify(doc, target_item=item)
+        print(f">> Type: {result.doc_type.value} | Policy: {result.processing_policy.value} | Reason: {result.reason}")
 
 if __name__ == "__main__":
     main()
diff --git a/test_rag_pipeline.py b/test_rag_pipeline.py
index ad2d767..1f4d71c 100644
--- a/test_rag_pipeline.py
+++ b/test_rag_pipeline.py
@@ -2,77 +2,187 @@ import logging
 import sys
 
 from core.config import settings
+from core.models import IngestedDocument, ProcessingPolicy
 from ingestion.providers.sharepoint_provider import SharePointProvider
+from extraction.dce import DocumentClassificationEngine
 from extraction.ocr_service import OCRService
+from extraction.text_extractor import TextExtractor
 from chunking.markdown_chunker import MarkdownChunker
 from indexing.vector_store import VectorStore
 
 logging.basicConfig(level=logging.INFO, format="%(levelname)s:%(name)s:%(message)s")
 logger = logging.getLogger("RAGPipeline")
 
+
+def extract_text_from_pdf_bytes(pdf_bytes: bytes) -> str:
+    """Trích xuất text trực tiếp từ PDF có text layer (không cần OCR)."""
+    try:
+        import fitz
+        doc = fitz.open(stream=pdf_bytes, filetype="pdf")
+        texts = []
+        for page in doc:
+            texts.append(page.get_text())
+        return "\n\n".join(texts)
+    except Exception as e:
+        logger.error(f"Failed to extract text from PDF: {e}")
+        return ""
+
+
 def run_pipeline():
-    logger.info("=== BẮT ĐẦU TEST TOÀN BỘ ĐƯỜNG ỐNG RAG ===")
+    logger.info("=== BẮT ĐẦU TEST TOÀN BỘ ĐƯỜNG ỐNG RAG (với DCE) ===")
     
-    # Ép buộc dùng localhost cho OpenSearch khi chạy trực tiếp trên WSL
     if settings.opensearch_host == "opensearch":
         settings.opensearch_host = "localhost"
 
-    # 1. Tầng Ingestion
+    # 1. INGESTION
     logger.info("\n--- BƯỚC 1: Lấy file từ SharePoint ---")
     provider = SharePointProvider()
     items, _ = provider.fetch_changes({})
     
-    target_item = None
-    for item in items:
-        if item.get("name", "").lower().endswith(".pdf"):
-            target_item = item
-            break
-            
-    if not target_item:
-        logger.error("Không tìm thấy file PDF nào trên SharePoint để test!")
+    if not items:
+        logger.error("Không có file nào trên SharePoint!")
         sys.exit(1)
-        
-    logger.info(f"Đã chọn file: {target_item['name']}. Đang tải...")
-    pdf_bytes = provider.download_file(target_item)
-    logger.info(f"Tải thành công {len(pdf_bytes)} bytes.")
-
-    # 2. Tầng Extraction (VLM)
-    logger.info("\n--- BƯỚC 2: OCR / VLM Trích xuất Markdown ---")
-    ocr = OCRService()
-    pages = ocr.process_pdf_bytes(pdf_bytes)
     
-    if not pages:
-        logger.error("VLM không trích xuất được nội dung nào!")
-        sys.exit(1)
-        
-    logger.info(f"VLM đã trích xuất thành công {len(pages)} trang.")
+    logger.info(f"Đã lấy {len(items)} items từ SharePoint.")
 
-    # 3. Tầng Chunking
-    logger.info("\n--- BƯỚC 3: Băm nhỏ văn bản (Semantic Chunking) ---")
+    # 2. DCE + PROCESSING
+    dce = DocumentClassificationEngine(provider=provider)
+    ocr = OCRService()
     chunker = MarkdownChunker(max_chunk_size=1000, overlap=100)
     
-    # Tạo metadata giả lập để lưu vào Chunk
-    metadata = {
-        "item_id": target_item["id"],
-        "name": target_item["name"],
-        "web_url": "https://285pdg.sharepoint.com/...",
-        "site_id": settings.sharepoint_site_id
-    }
-    
-    chunks = chunker.chunk_document(pages, metadata)
-    logger.info(f"Đã băm thành {len(chunks)} chunks độc lập.")
-    if chunks:
-        logger.info(f"Ví dụ Chunk đầu tiên:\n[ID: {chunks[0].chunk_id}] {chunks[0].text[:150]}...")
-
-    # 4. Tầng Vector Database (OpenSearch)
-    logger.info("\n--- BƯỚC 4: Mã hóa Vector & Indexing ---")
     try:
         vector_db = VectorStore(index_name="poc_sharepoint_docs")
-        vector_db.embed_and_index(chunks)
-        logger.info("🎉 CHÚC MỪNG! DỮ LIỆU ĐÃ NẰM TRONG OPENSEARCH SẴN SÀNG ĐỂ CHAT!")
     except Exception as e:
-        logger.error(f"LỖI trong quá trình Embedding / Indexing: {e}")
-        logger.warning("Gợi ý: Hãy chắc chắn Docker OpenSearch đang chạy trên cổng 9200!")
+        logger.error(f"Không kết nối được OpenSearch: {e}")
+        sys.exit(1)
+
+    processed_count = 0
+    skipped_count = 0
+
+    for item in items:
+        if item.get("is_folder") or item.get("is_deleted"):
+            continue
+        
+        name = item.get("name", "")
+        item_id = item.get("id", "")
+        
+        # Tạo IngestedDocument cho DCE
+        item_details = provider.get_item_details(item_id)
+        permissions = provider.get_item_permissions(item_id)
+        doc = IngestedDocument(
+            site_id=settings.sharepoint_site_id,
+            drive_id="",
+            item_id=item_id,
+            name=name,
+            web_url=item_details.get("web_url", ""),
+            download_url=item_details.get("download_url"),
+            is_folder=False,
+            size=item.get("size", 0),
+        )
+        
+        # DCE PHÂN LOẠI
+        logger.info(f"\n--- DCE: {name} ---")
+        classification = dce.classify(doc, target_item=item)
+        logger.info(f"   → {classification.doc_type.value} | {classification.processing_policy.value} | {classification.reason}")
+        
+        # XỬ LÝ THEO POLICY
+        if classification.processing_policy == ProcessingPolicy.UNSUPPORTED:
+            logger.info(f"   ⏭ BỎ QUA: {name} (unsupported)")
+            skipped_count += 1
+            continue
+        
+        if classification.processing_policy == ProcessingPolicy.METADATA_ONLY:
+            logger.info(f"   ⏭ BỎ QUA: {name} (metadata-only, không index text)")
+            skipped_count += 1
+            continue
+        
+        if classification.processing_policy == ProcessingPolicy.REQUIRES_REVIEW:
+            logger.info(f"   ⏭ BỎ QUA: {name} (cần review thủ công)")
+            skipped_count += 1
+            continue
+        
+        # DOWNLOAD FILE
+        logger.info(f"   📥 Đang tải {name}...")
+        try:
+            file_bytes = provider.download_file(item)
+        except Exception as e:
+            logger.error(f"   ❌ Lỗi tải {name}: {e}")
+            skipped_count += 1
+            continue
+        
+        if not file_bytes:
+            logger.error(f"   ❌ File rỗng: {name}")
+            skipped_count += 1
+            continue
+        
+        # EXTRACTION
+        pages = []
+        ext = name.lower().rsplit(".", 1)[-1] if "." in name else ""
+        
+        if classification.processing_policy == ProcessingPolicy.SKIP_OCR:
+            if ext == "pdf":
+                # TEXT_PDF: trích xuất text trực tiếp, không OCR
+                logger.info(f"   📄 TEXT_PDF: Trích xuất text trực tiếp (không OCR)...")
+                text = extract_text_from_pdf_bytes(file_bytes)
+                if text.strip():
+                    from core.models import OCRPageResult
+                    pages = [OCRPageResult(page=1, text=text, confidence=1.0)]
+                else:
+                    logger.warning(f"   ⚠️ Không trích xuất được text từ {name}")
+            elif ext in ("docx", "doc"):
+                logger.info(f"   📄 DOCX: Trích xuất text bằng python-docx...")
+                pages = TextExtractor.extract_from_docx(file_bytes)
+            elif ext in ("xlsx", "xls"):
+                logger.info(f"   📄 XLSX: Trích xuất dữ liệu bằng openpyxl...")
+                pages = TextExtractor.extract_from_xlsx(file_bytes)
+            elif ext in ("txt", "md", "csv"):
+                logger.info(f"   📄 {ext.upper()}: Đọc text trực tiếp...")
+                pages = TextExtractor.extract_from_text(file_bytes)
+            else:
+                logger.info(f"   📄 {classification.doc_type.value}: Chưa hỗ trợ extract text, bỏ qua.")
+                skipped_count += 1
+                continue
+        
+        elif classification.processing_policy == ProcessingPolicy.REQUIRES_OCR:
+            # SCAN_PDF: dùng VLM OCR
+            logger.info(f"   👁️ SCAN_PDF: Đang OCR qua VLM...")
+            pages = ocr.process_pdf_bytes(file_bytes)
+        
+        if not pages:
+            logger.warning(f"   ⚠️ Không có nội dung để index: {name}")
+            skipped_count += 1
+            continue
+        
+        # CHUNKING
+        logger.info(f"   ✂️ Đang chunk ({len(pages)} trang)...")
+        metadata = {
+            "item_id": item_id,
+            "name": name,
+            "web_url": item_details.get("web_url"),
+            "download_url": item_details.get("download_url"),
+            "site_id": settings.sharepoint_site_id,
+            "permissions": permissions
+        }
+        chunks = chunker.chunk_document(pages, metadata)
+        
+        if not chunks:
+            logger.warning(f"   ⚠️ Không có chunks: {name}")
+            skipped_count += 1
+            continue
+        
+        # INDEXING
+        logger.info(f"   📦 Đang index {len(chunks)} chunks vào OpenSearch...")
+        vector_db.delete_by_file_id(item_id)
+        vector_db.embed_and_index(chunks)
+        
+        processed_count += 1
+        logger.info(f"   ✅ HOÀN TẤT: {name} → {len(chunks)} chunks")
+
+    # SUMMARY
+    logger.info("\n" + "=" * 60)
+    logger.info(f"📊 TỔNG KẾT: {processed_count} file đã xử lý, {skipped_count} file bỏ qua")
+    logger.info("=" * 60)
+
 
 if __name__ == "__main__":
     run_pipeline()