Phase 7: Hoàn thiện Modular RAG Backend với FastAPI và Đa LLM Provider
This commit is contained in:
40
core/config.py
Executable file
40
core/config.py
Executable file
@@ -0,0 +1,40 @@
|
||||
import os
|
||||
from pydantic_settings import BaseSettings, SettingsConfigDict
|
||||
|
||||
class Settings(BaseSettings):
    """Application configuration resolved from the environment and ``.env``.

    Every attribute below is a setting with a built-in default. Values are
    overridden by environment variables or by entries in the ``.env`` file
    named in ``model_config``; keys that do not correspond to a declared
    field are ignored (``extra="ignore"``) rather than rejected.

    NOTE(review): the ``VLM_*`` fields use UPPER_CASE attribute names while
    every other field is snake_case. The names are kept as-is because
    callers access them as attributes; pydantic-settings matches environment
    variable names case-insensitively by default, so env-var lookups are not
    affected by the mixed casing.
    """

    # Azure AD / Microsoft Graph
    tenant_id: str = ""
    client_id: str = ""
    client_secret: str = ""

    # VLM (Vision-Language Model) Configuration
    # NOTE(review): the default endpoint is a hard-coded private address;
    # it will only resolve inside the network it was written for.
    VLM_ENDPOINT: str = "http://10.202.50.3:8080/v1/chat/completions"
    VLM_TEMPERATURE: float = 0.1
    VLM_MAX_TOKENS: int = 2000
    VLM_TIMEOUT: float = 120.0  # presumably seconds -- confirm against the client

    # SharePoint
    sharepoint_site_id: str = ""
    sharepoint_drive_id: str = ""

    # OpenSearch
    opensearch_host: str = "localhost"
    opensearch_port: int = 9200
    # NOTE(review): well-known default credentials. Acceptable for a local
    # development cluster only; override via the environment everywhere else.
    opensearch_user: str = "admin"
    opensearch_pass: str = "admin"

    # Chat LLM Config
    llm_provider: str = "gemini"
    gemini_api_key: str = ""
    groq_api_key: str = ""
    groq_model: str = "llama-3.3-70b-versatile"
    # NOTE(review): hard-coded private address, same caveat as VLM_ENDPOINT.
    local_llm_endpoint: str = "http://10.202.50.3:8081/v1/chat/completions"
    openai_api_key: str = ""

    # General Settings
    log_level: str = "INFO"
    environment: str = "development"

    model_config = SettingsConfigDict(
        env_file=".env",
        env_file_encoding="utf-8",
        extra="ignore",
    )
|
||||
|
||||
# Shared module-level instance. It is created once, when this module is first
# imported, so the environment / ".env" values are resolved at import time.
settings = Settings()
|
||||
69
core/models.py
Normal file
69
core/models.py
Normal file
@@ -0,0 +1,69 @@
|
||||
from pydantic import BaseModel
|
||||
from typing import Optional
|
||||
from enum import Enum
|
||||
|
||||
class DocumentType(str, Enum):
    """Coarse category assigned to a document.

    Members are ``str`` subclasses, so each one compares equal to its plain
    string value and can be looked up from that value with
    ``DocumentType("spreadsheet")``.
    """

    TEXTUAL_DOCUMENT = "textual_document"
    SPREADSHEET = "spreadsheet"
    PRESENTATION = "presentation"
    DRAWING = "drawing"
    BINARY = "binary"
    UNKNOWN = "unknown"
|
||||
|
||||
class ProcessingPolicy(str, Enum):
    """How a classified document should be handled by later pipeline stages.

    Members are ``str`` subclasses, so each one compares equal to its plain
    string value.
    """

    # Needs OCR (e.g., SCAN_PDF)
    REQUIRES_OCR = "requires_ocr"
    # No OCR needed, pure text extraction (e.g., TEXT_PDF, DOCX)
    SKIP_OCR = "skip_ocr"
    # Search by metadata only, no text extraction (e.g., CAD, DRAWING_PDF)
    METADATA_ONLY = "metadata_only"
    # Doubtful cases (e.g., ambiguous PDF)
    REQUIRES_REVIEW = "requires_review"
    # Ignored
    UNSUPPORTED = "unsupported"
|
||||
|
||||
class PdfType(str, Enum):
    """Label describing what kind of PDF (if any) a file was detected as.

    Each member's value is identical to its name. Members are ``str``
    subclasses, so they compare equal to that plain string.
    """

    TEXT_PDF = "TEXT_PDF"
    SCAN_PDF = "SCAN_PDF"
    DRAWING_PDF = "DRAWING_PDF"
    AMBIGUOUS_PDF = "AMBIGUOUS_PDF"
    NOT_PDF = "NOT_PDF"
|
||||
|
||||
class IngestedDocument(BaseModel):
    """Data contract: Output of Ingestion -> Input for Document Classification Engine (DCE).

    Fields without a default are required when constructing the model; the
    ``Optional`` fields default to ``None`` and may be omitted.
    """

    # Identifiers locating the item.
    # presumably SharePoint / Microsoft Graph IDs -- confirm against the ingestion code
    site_id: str
    drive_id: str
    item_id: str

    # Descriptive metadata.
    name: str
    web_url: str
    download_url: Optional[str] = None
    mime_type: Optional[str] = None
    parent_path: Optional[str] = None
    is_folder: bool
    size: int  # unit is not stated here; presumably bytes -- confirm
    # Kept as a string; the expected timestamp format is not enforced here.
    last_modified: Optional[str] = None
|
||||
|
||||
class DocumentClassificationResult(BaseModel):
    """Data contract: Output of DCE -> Input for Inspection & Routing.

    All fields are required.
    """

    item_id: str
    doc_type: DocumentType
    processing_policy: ProcessingPolicy
    # Whether the extension includes a leading dot is not fixed by this model.
    file_extension: str
    is_supported: bool
    # Free-text explanation accompanying the classification.
    reason: str
|
||||
|
||||
class OCRPageResult(BaseModel):
    """Data contract: Output of OCR Service -> Input for Normalization / RAG.

    ``page``, ``text`` and ``confidence`` are required; the ``paddle_*``
    fields are optional and default to an empty string / ``0.0``.
    """

    page: int  # whether numbering is 0- or 1-based is not fixed here -- confirm
    text: str
    confidence: float  # value range is not enforced by this model
    # NOTE(review): presumably the raw PaddleOCR text and score kept alongside
    # the final ``text`` / ``confidence`` -- confirm against the OCR service.
    paddle_text: str = ""
    paddle_confidence: float = 0.0
|
||||
|
||||
class DocumentChunk(BaseModel):
    """Data contract: Output of Chunking -> Input for Embedding & Indexing.

    ``embedding``, ``permissions`` and ``site_id`` have defaults and may be
    omitted at construction time; every other field is required.
    """

    chunk_id: str
    file_id: str
    file_name: str
    text: str
    # None until a vector has been attached to the chunk.
    embedding: Optional[list[float]] = None
    # Page span covered by this chunk (inclusivity / base not fixed here).
    page_from: int
    page_to: int
    source_url: str
    # A literal [] default is safe in a pydantic model: pydantic copies
    # mutable defaults for each instance, so the list is not shared the way
    # a mutable function-argument default would be.
    permissions: list[str] = []
    site_id: str = ""
|
||||
Reference in New Issue
Block a user