Phase 7: Hoàn thiện Modular RAG Backend với FastAPI và Đa LLM Provider

2026-05-08 07:30:30 +00:00
commit 26d1298cf6
51 changed files with 5360 additions and 0 deletions
--- a/core/config.py
+++ b/core/config.py
@@ -0,0 +1,40 @@
+import os
+from pydantic_settings import BaseSettings, SettingsConfigDict
+
+class Settings(BaseSettings):  # Typed app configuration; values come from the environment and the ".env" file named in model_config
+    # Azure AD / Microsoft Graph
+    tenant_id: str = ""  # the three Azure AD fields all default to the empty string
+    client_id: str = ""
+    client_secret: str = ""
+    
+    # VLM (Vision-Language Model) Configuration
+    VLM_ENDPOINT: str = "http://10.202.50.3:8080/v1/chat/completions"  # default is a hard-coded private (10.x) address
+    VLM_TEMPERATURE: float = 0.1
+    VLM_MAX_TOKENS: int = 2000
+    VLM_TIMEOUT: float = 120.0  # presumably seconds — TODO confirm against the HTTP client that consumes it
+    
+    # SharePoint
+    sharepoint_site_id: str = ""
+    sharepoint_drive_id: str = ""
+
+    # OpenSearch
+    opensearch_host: str = "localhost"
+    opensearch_port: int = 9200
+    opensearch_user: str = "admin"
+    opensearch_pass: str = "admin"  # NOTE(review): user and password both default to "admin" — override via environment outside local development
+
+    # Chat LLM Config
+    llm_provider: str = "gemini"  # presumably selects which provider below is used — TODO confirm the accepted values with the consumer
+    gemini_api_key: str = ""
+    groq_api_key: str = ""
+    groq_model: str = "llama-3.3-70b-versatile"
+    local_llm_endpoint: str = "http://10.202.50.3:8081/v1/chat/completions"  # same private host as the VLM_ENDPOINT default, port 8081
+    openai_api_key: str = ""
+
+    # General Settings
+    log_level: str = "INFO"
+    environment: str = "development"
+
+    model_config = SettingsConfigDict(env_file=".env", env_file_encoding="utf-8", extra="ignore")  # read ".env" as UTF-8; extra="ignore" skips keys not declared above
+
+settings = Settings()  # module-level instance, constructed (and validated) when this module is first imported
--- a/core/models.py
+++ b/core/models.py
@@ -0,0 +1,69 @@
+from pydantic import BaseModel
+from typing import Optional
+from enum import Enum
+
+class DocumentType(str, Enum):  # str-mixin enum: each member is also a str equal to its lower-case value
+    TEXTUAL_DOCUMENT = "textual_document"
+    SPREADSHEET = "spreadsheet"
+    PRESENTATION = "presentation"
+    DRAWING = "drawing"
+    BINARY = "binary"
+    UNKNOWN = "unknown"  # presumably the fallback when no other type matches — TODO confirm with the classifier
+
+class ProcessingPolicy(str, Enum):  # str-mixin enum naming how a classified document is handled downstream
+    REQUIRES_OCR = "requires_ocr"          # Needs OCR (e.g., SCAN_PDF)
+    SKIP_OCR = "skip_ocr"                  # No OCR needed, pure text extraction (e.g., TEXT_PDF, DOCX)
+    METADATA_ONLY = "metadata_only"        # Search by metadata only, no text extraction (e.g., CAD, DRAWING_PDF)
+    REQUIRES_REVIEW = "requires_review"    # Doubtful cases (e.g., AMBIGUOUS_PDF)
+    UNSUPPORTED = "unsupported"            # Ignored
+
+class PdfType(str, Enum):  # str-mixin enum; values are upper-case, unlike DocumentType / ProcessingPolicy
+    TEXT_PDF = "TEXT_PDF"
+    SCAN_PDF = "SCAN_PDF"
+    DRAWING_PDF = "DRAWING_PDF"
+    AMBIGUOUS_PDF = "AMBIGUOUS_PDF"
+    NOT_PDF = "NOT_PDF"  # presumably returned for inputs that are not PDFs at all — TODO confirm with the inspector
+
+class IngestedDocument(BaseModel):  # fields without a default are required at construction
+    """Data contract: Output of Ingestion -> Input for Document Classification Engine (DCE)"""
+    site_id: str
+    drive_id: str
+    item_id: str
+    name: str
+    web_url: str
+    download_url: Optional[str] = None  # optional; None when not provided
+    mime_type: Optional[str] = None  # optional; None when not provided
+    parent_path: Optional[str] = None  # optional; None when not provided
+    is_folder: bool
+    size: int  # presumably bytes — TODO confirm against the ingestion source
+    last_modified: Optional[str] = None  # carried as a plain string; not parsed into a datetime by this model
+
+class DocumentClassificationResult(BaseModel):  # every field is required (none has a default)
+    """Data contract: Output of DCE -> Input for Inspection & Routing"""
+    item_id: str  # presumably the IngestedDocument.item_id this result describes — TODO confirm
+    doc_type: DocumentType
+    processing_policy: ProcessingPolicy
+    file_extension: str  # leading-dot / letter-case convention is not established here — TODO confirm
+    is_supported: bool
+    reason: str  # free-form text; format is not constrained by this model
+
+class OCRPageResult(BaseModel):  # page, text and confidence are required; the paddle_* fields are optional
+    """Data contract: Output of OCR Service -> Input for Normalization / RAG"""
+    page: int  # 0- vs 1-based numbering is not established here — TODO confirm
+    text: str
+    confidence: float  # scale (0-1 vs 0-100) is not established here — TODO confirm
+    paddle_text: str = ""  # presumably PaddleOCR output kept alongside `text` — TODO confirm
+    paddle_confidence: float = 0.0  # defaults to 0.0 when no paddle result is supplied
+
+class DocumentChunk(BaseModel):  # embedding, permissions and site_id are optional; all other fields are required
+    """Data contract: Output of Chunking -> Input for Embedding & Indexing"""
+    chunk_id: str
+    file_id: str
+    file_name: str
+    text: str
+    embedding: Optional[list[float]] = None  # None until a vector is attached — presumably by the embedding stage; TODO confirm
+    page_from: int  # with page_to, the page span of the chunk; inclusivity / base index not established here — TODO confirm
+    page_to: int
+    source_url: str
+    permissions: list[str] = []  # a mutable default is safe here: pydantic copies field defaults per instance
+    site_id: str = ""