"""FastAPI entry point for the Enterprise SharePoint RAG API.

Wires together application setup (logging, CORS, Azure AD SSO settings),
the startup hook that builds the shared RAG engine, and the request and
response schemas used by the HTTP endpoints.
"""

import logging
import os
import secrets
import sys
from enum import Enum
from typing import Any, Dict, List, Optional

import msal
import uvicorn
from fastapi import BackgroundTasks, FastAPI, HTTPException, Request, status
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import RedirectResponse
from pydantic import BaseModel, Field, validator

# Put the parent of this file's directory on sys.path so the project
# packages imported below resolve. This must run BEFORE those imports.
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from chat.rag_engine import RAGEngine
from core.config import settings
from core.models import IngestedDocument, ProcessingPolicy
from ingestion.providers.sharepoint_provider import SharePointProvider
from ingestion.sync import SyncEngine
from extraction.dce import DocumentClassificationEngine
from extraction.ocr_service import OCRService
from extraction.text_extractor import TextExtractor
from chunking.markdown_chunker import MarkdownChunker
from indexing.vector_store import VectorStore

# --- Logging configuration ---
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)
logger = logging.getLogger("RAG_API")

app = FastAPI(
    title="Enterprise SharePoint RAG API",
    description="Hệ thống hỏi đáp AI nội bộ sử dụng kiến trúc Modular Providers và Distributed VLM.",
    version="1.1.0",
    docs_url="/docs",
    redoc_url="/redoc",
)

# CORS so that the frontend can call the API.
# NOTE(review): a wildcard origin combined with allow_credentials=True is
# only acceptable for a PoC; restrict allow_origins before production use.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# --- Singleton engine instance ---
# Stays None until startup_event succeeds; endpoints treat None as "offline".
rag_engine: Optional[RAGEngine] = None

# Progress and outcome of the most recent SharePoint sync run.
sync_status = {
    "running": False,
    "last_run": None,
    "processed": 0,
    "skipped": 0,
    "errors": [],
}

# --- Azure AD SSO config ---
# NOTE(review): the redirect URI is hard-coded to localhost — confirm it
# matches the App Registration in every environment this is deployed to.
REDIRECT_URI = "http://localhost:8000/auth/callback"
AUTHORITY = f"https://login.microsoftonline.com/{settings.tenant_id}"
SCOPE = ["User.Read"]


def _build_msal_app():
    """Return a new MSAL confidential client built from the app settings."""
    return msal.ConfidentialClientApplication(
        settings.client_id,
        authority=AUTHORITY,
        client_credential=settings.client_secret,
    )


@app.on_event("startup")
async def startup_event():
    """Create the shared RAG engine when the application starts.

    A failure is logged (with traceback) but not re-raised, so the server
    still comes up; ``rag_engine`` then stays ``None``.
    """
    global rag_engine
    try:
        logger.info(f"Đang khởi tạo RAG Engine với Provider: {settings.llm_provider}")

        # Outside docker the "opensearch" service hostname does not resolve,
        # so fall back to localhost for local development.
        if settings.opensearch_host == "opensearch" and os.environ.get("ENV") != "docker":
            settings.opensearch_host = "localhost"

        rag_engine = RAGEngine()

        # Ready banner with the local access URLs.
        port = 8000
        logger.info("=" * 60)
        logger.info("🚀 RAG ENGINE ĐÃ SẴN SÀNG PHỤC VỤ!")
        logger.info(f"🔗 API Endpoint: http://localhost:{port}")
        logger.info(f"📖 Swagger UI: http://localhost:{port}/docs")
        logger.info(f"📊 Health Check: http://localhost:{port}/health")
        logger.info("=" * 60)
    except Exception as e:
        # Deliberately best-effort: keep the process alive and report offline.
        logger.critical(f"❌ THẤT BẠI khi khởi động RAG Engine: {e}", exc_info=True)


# --- SCHEMAS ---
class ChatRole(str, Enum):
    """Allowed sender roles for a chat history entry."""

    user = "user"
    assistant = "assistant"
    system = "system"


class ChatHistoryItem(BaseModel):
    """One earlier message of the conversation."""

    role: ChatRole = Field(..., description="Vai trò của người gửi (user/assistant)")
    content: str = Field(..., min_length=1, description="Nội dung tin nhắn")


class ChatRequest(BaseModel):
    """Request body of the chat endpoint: the question plus prior messages."""

    query: str = Field(
        ...,
        min_length=2,
        max_length=1000,
        example="Quy trình bảo trì thiết bị là gì?",
        description="Câu hỏi của người dùng",
    )
    history: List[ChatHistoryItem] = Field(
        default_factory=list,
        description="Lịch sử cuộc trò chuyện để duy trì ngữ cảnh",
    )


class SourceCitation(BaseModel):
    """A document reference attached to an answer."""

    file_name: str
    page: int
    url: Optional[str] = None


class ChatResponse(BaseModel):
    """Response body of the chat endpoint."""

    answer: str = Field(..., description="Câu trả lời từ AI")
    sources: List[SourceCitation] = Field(
        default_factory=list,
        description="Danh sách các nguồn trích dẫn từ tài liệu",
    )
    context_used: Optional[str] = Field(
        None,
        description="Ngữ cảnh thực tế đã được trích xuất từ VectorDB (Dùng cho Debug/UI)",
    )


class SyncResponse(BaseModel):
    """Response body returned when a sync is triggered."""

    status: str
    message: str


class LoginRequest(BaseModel):
    """Request body of the e-mail login endpoint."""

    email: str = Field(..., description="Email người dùng")
class LoginResponse(BaseModel):
    """Response body of the e-mail login endpoint."""

    email: str
    display_name: str
    role: str


# --- ENDPOINTS ---
@app.get("/health", tags=["System"])
async def health_check():
    """Report whether the RAG engine is initialised, plus key settings."""
    return {
        "status": "online" if rag_engine else "offline",
        "engine_ready": rag_engine is not None,
        "config": {
            "provider": settings.llm_provider,
            "opensearch_host": settings.opensearch_host,
        },
    }


@app.get("/auth/login", tags=["Auth"])
async def sso_login():
    """Redirect the browser to the Azure AD login page.

    Uses the same App Registration as the SharePoint ingestion.
    """
    msal_app = _build_msal_app()
    # NOTE(review): the state value is generated but never stored, so the
    # callback cannot verify it (no CSRF protection on the auth code flow).
    auth_url = msal_app.get_authorization_request_url(
        SCOPE,
        redirect_uri=REDIRECT_URI,
        state=secrets.token_hex(16),
    )
    return RedirectResponse(url=auth_url)


@app.get("/auth/callback", tags=["Auth"])
async def sso_callback(request: Request):
    """Handle the Azure AD redirect that carries the authorization code.

    Exchanges the code for a token, reads the user's identity from the
    ID-token claims, and redirects to the frontend with the user info in
    the ``user`` query parameter.

    Raises:
        HTTPException: 400 when the code is missing, 401 when the token
            exchange reports an error.
    """
    # Local imports keep this block independent of the file's import header.
    import json
    import urllib.parse

    code = request.query_params.get("code")
    if not code:
        raise HTTPException(status_code=400, detail="Missing authorization code")

    msal_app = _build_msal_app()
    result = msal_app.acquire_token_by_authorization_code(
        code,
        scopes=SCOPE,
        redirect_uri=REDIRECT_URI,
    )

    if "error" in result:
        logger.error(f"SSO error: {result.get('error_description', result.get('error'))}")
        raise HTTPException(status_code=401, detail="Authentication failed")

    # Identity comes from the ID-token claims. `or` also falls through on
    # empty or None claim values, not only on missing keys.
    id_token_claims = result.get("id_token_claims") or {}
    email = id_token_claims.get("preferred_username") or id_token_claims.get("email") or ""
    name = id_token_claims.get("name") or email.split("@")[0]

    # NOTE(review): role is derived from a substring of the e-mail address;
    # any account whose address contains "admin" is treated as an admin.
    role = "admin" if "admin" in email.lower() else "user"

    logger.info(f"SSO login: {email} (role={role})")

    # NOTE(review): user info (including role) travels unsigned in the query
    # string, so the frontend cannot tell a forged value from a real one.
    user_data = json.dumps({"email": email, "display_name": name, "role": role})
    encoded = urllib.parse.quote(user_data)
    return RedirectResponse(url=f"http://localhost:8000?user={encoded}")


@app.post("/auth/login-email", response_model=LoginResponse, tags=["Auth"])
async def login_email_endpoint(request: LoginRequest):
    """Log in with an e-mail address (fallback when SSO is not used).

    The address is only checked for shape; no credential is verified.

    Raises:
        HTTPException: 400 when the address is empty or has no "@".
    """
    email = request.email.strip().lower()
    if not email or "@" not in email:
        raise HTTPException(status_code=400, detail="Email không hợp lệ.")

    local_part = email.split("@")[0]
    display_name = local_part.replace(".", " ").title()
    # NOTE(review): same substring-based role rule as the SSO callback.
    role = "admin" if "admin" in email else "user"

    logger.info(f"Email login: {email} (role={role})")
    return LoginResponse(email=email, display_name=display_name, role=role)


@app.post("/chat", response_model=ChatResponse, tags=["RAG"], status_code=status.HTTP_200_OK)
async def chat_endpoint(request: ChatRequest, http_request: Request):
    """Handle one RAG chat turn.

    Header 'X-User-Email' (optional): user e-mail used to filter by permission.
    Header 'X-User-Role' (optional): "admin" bypasses the ACL filter.

    Raises:
        HTTPException: 503 when the engine is not initialised, 500 when the
            engine call fails.
    """
    if not rag_engine:
        raise HTTPException(
            status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
            detail="Hệ thống RAG đang khởi động hoặc gặp sự cố kết nối Database.",
        )

    try:
        user_email = http_request.headers.get("X-User-Email")
        user_role = http_request.headers.get("X-User-Role", "user")
        # NOTE(review): both headers are client-supplied and unauthenticated,
        # and a request with no e-mail is treated as admin (ACL bypassed).
        # Kept as-is for PoC compatibility; tighten before production.
        is_admin = user_role == "admin" or not user_email

        history_data = [item.dict() for item in request.history]
        logger.info(f"Chat query: {request.query[:50]} (user={user_email or 'none'}, role={user_role})")

        result = rag_engine.chat(
            request.query,
            history=history_data,
            user_email=user_email,
            is_admin=is_admin,
        )

        return ChatResponse(
            answer=result["answer"],
            sources=result["sources"],
            context_used=result.get("context_used"),
        )
    except Exception as e:
        # Log the traceback server-side; the client only gets a generic message.
        logger.exception(f"Lỗi thực thi RAG: {str(e)}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail="Đã xảy ra lỗi nội bộ trong quá trình xử lý ngôn ngữ.",
        ) from e


def extract_text_from_pdf_bytes(pdf_bytes: bytes) -> str:
    """Extract text directly from a PDF that has a text layer.

    Returns the text of all pages joined by blank lines, or "" when the
    PDF cannot be opened or read (the failure is logged).
    """
    try:
        import fitz

        # The context manager closes the document even if a page fails.
        with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
            return "\n\n".join(page.get_text() for page in doc)
    except Exception as exc:
        # Best-effort: the caller treats "" as "nothing extracted".
        logger.warning("PDF text extraction failed: %s", exc)
        return ""


# Classification outcomes for which the document content is not indexed.
_NON_INDEXED_POLICIES = (
    ProcessingPolicy.UNSUPPORTED,
    ProcessingPolicy.METADATA_ONLY,
    ProcessingPolicy.REQUIRES_REVIEW,
)


def _file_extension(name: str) -> str:
    """Return the lower-cased extension of *name* without the dot, or ""."""
    return name.lower().rsplit(".", 1)[-1] if "." in name else ""


def _extract_pages(policy, ext, file_bytes, ocr):
    """Return extracted pages for one file, chosen by policy and extension.

    Returns an empty list when nothing applies or nothing was extracted.
    """
    if policy == ProcessingPolicy.SKIP_OCR:
        if ext == "pdf":
            text = extract_text_from_pdf_bytes(file_bytes)
            if text.strip():
                from core.models import OCRPageResult

                # The whole text layer is stored as a single page.
                return [OCRPageResult(page=1, text=text, confidence=1.0)]
            return []
        if ext in ("docx", "doc"):
            return TextExtractor.extract_from_docx(file_bytes)
        if ext in ("xlsx", "xls"):
            return TextExtractor.extract_from_xlsx(file_bytes)
        if ext in ("txt", "md", "csv"):
            return TextExtractor.extract_from_text(file_bytes)
        return []
    if policy == ProcessingPolicy.REQUIRES_OCR:
        return ocr.process_pdf_bytes(file_bytes)
    return []


def run_sync_background():
    """Run the sync pipeline: SharePoint -> DCE -> OCR/extract -> chunk -> index.

    Progress is published through the module-level ``sync_status`` dict.
    Failures to download a file are recorded per item; any other exception
    aborts the run and marks it as "failed".
    """
    global sync_status
    sync_status = {"running": True, "last_run": None, "processed": 0, "skipped": 0, "errors": []}

    try:
        provider = SharePointProvider()
        dce = DocumentClassificationEngine(provider=provider)
        ocr = OCRService()
        chunker = MarkdownChunker(max_chunk_size=1000, overlap=100)
        vector_db = VectorStore(index_name="poc_sharepoint_docs")

        items, _ = provider.fetch_changes({})
        logger.info(f"Sync: Found {len(items)} items from SharePoint")

        for item in items:
            if item.get("is_folder") or item.get("is_deleted"):
                continue

            name = item.get("name", "")
            item_id = item.get("id", "")
            item_details = provider.get_item_details(item_id)

            doc = IngestedDocument(
                site_id=settings.sharepoint_site_id,
                drive_id="",
                item_id=item_id,
                name=name,
                web_url=item_details.get("web_url", ""),
                download_url=item_details.get("download_url"),
                is_folder=False,
                size=item.get("size", 0),
            )

            classification = dce.classify(doc, target_item=item)
            policy = classification.processing_policy
            if policy in _NON_INDEXED_POLICIES:
                sync_status["skipped"] += 1
                continue

            try:
                file_bytes = provider.download_file(item)
            except Exception as e:
                # Keep the cause in the log; the status entry stays short.
                logger.warning("Sync: download failed for %s: %s", name, e)
                sync_status["errors"].append(f"{name}: download failed")
                continue

            if not file_bytes:
                sync_status["errors"].append(f"{name}: empty file")
                continue

            pages = _extract_pages(policy, _file_extension(name), file_bytes, ocr)
            if not pages:
                sync_status["skipped"] += 1
                continue

            # Permissions are fetched only for documents that will actually
            # be indexed, which avoids a provider call per skipped item.
            permissions = provider.get_item_permissions(item_id)
            metadata = {
                "item_id": item_id,
                "name": name,
                "web_url": item_details.get("web_url"),
                "download_url": item_details.get("download_url"),
                "site_id": settings.sharepoint_site_id,
                "permissions": permissions,
            }

            chunks = chunker.chunk_document(pages, metadata)
            if chunks:
                # Replace any chunks previously indexed for this file.
                vector_db.delete_by_file_id(item_id)
                vector_db.embed_and_index(chunks)
                sync_status["processed"] += 1
                logger.info(f"Sync: Indexed {name} → {len(chunks)} chunks")
            else:
                sync_status["skipped"] += 1

        sync_status["last_run"] = "completed"
        logger.info(f"Sync completed: {sync_status['processed']} processed, {sync_status['skipped']} skipped")

    except Exception as e:
        sync_status["last_run"] = "failed"
        sync_status["errors"].append(str(e))
        logger.error(f"Sync failed: {e}")
    finally:
        sync_status["running"] = False


@app.post("/sync", response_model=SyncResponse, tags=["Ingestion"])
async def sync_endpoint(background_tasks: BackgroundTasks):
    """Trigger a SharePoint sync.

    The sync runs in the background; this endpoint returns immediately.
    """
    if sync_status["running"]:
        return SyncResponse(status="already_running", message="Đồng bộ đang chạy, vui lòng đợi.")

    # Mark the sync as running BEFORE scheduling it. The background task
    # only starts after this response is sent, so without this a second
    # request arriving in between would start a duplicate sync.
    sync_status["running"] = True
    background_tasks.add_task(run_sync_background)
    return SyncResponse(status="started", message="Đồng bộ đã bắt đầu trong background.")


@app.get("/sync/status", tags=["Ingestion"])
async def sync_status_endpoint():
    """Return the current sync status dict."""
    return sync_status


if __name__ == "__main__":
    uvicorn.run("main:app", host="0.0.0.0", port=8000, reload=True)