Phase 7: Hoàn thiện Modular RAG Backend với FastAPI và Đa LLM Provider
This commit is contained in:
69
core/models.py
Normal file
69
core/models.py
Normal file
@@ -0,0 +1,69 @@
|
||||
from pydantic import BaseModel
|
||||
from typing import Optional
|
||||
from enum import Enum
|
||||
|
||||
class DocumentType(str, Enum):
    """Closed set of document categories.

    Members also subclass ``str``, so each one compares equal to its
    string value and can be looked up from it, e.g.
    ``DocumentType("spreadsheet")``.
    """

    TEXTUAL_DOCUMENT = "textual_document"
    SPREADSHEET = "spreadsheet"
    PRESENTATION = "presentation"
    DRAWING = "drawing"
    BINARY = "binary"
    UNKNOWN = "unknown"
|
||||
|
||||
class ProcessingPolicy(str, Enum):
    """How a classified document should be handled downstream.

    Members also subclass ``str``, so each one compares equal to its
    string value.
    """

    # Needs OCR (e.g., SCAN_PDF)
    REQUIRES_OCR = "requires_ocr"
    # No OCR needed, pure text extraction (e.g., TEXT_PDF, DOCX)
    SKIP_OCR = "skip_ocr"
    # Search by metadata only, no text extraction (e.g., CAD, DRAWING_PDF)
    METADATA_ONLY = "metadata_only"
    # Doubtful cases (e.g., ambiguous PDF)
    REQUIRES_REVIEW = "requires_review"
    # Ignored
    UNSUPPORTED = "unsupported"
|
||||
|
||||
class PdfType(str, Enum):
    """Kinds of PDF content, plus a marker for non-PDF files.

    Unlike the other enums in this module, each value is the upper-case
    member name itself (``PdfType.TEXT_PDF == "TEXT_PDF"``).
    """

    TEXT_PDF = "TEXT_PDF"
    SCAN_PDF = "SCAN_PDF"
    DRAWING_PDF = "DRAWING_PDF"
    AMBIGUOUS_PDF = "AMBIGUOUS_PDF"
    NOT_PDF = "NOT_PDF"
|
||||
|
||||
class IngestedDocument(BaseModel):
    """Data contract: Output of Ingestion -> Input for Document Classification Engine (DCE)"""

    # Required identifiers and location of the item.
    # NOTE(review): the site/drive/item naming looks like a SharePoint or
    # Graph-style source - confirm against the ingestion code.
    site_id: str
    drive_id: str
    item_id: str
    name: str
    web_url: str

    # Optional metadata; each defaults to None when not supplied.
    download_url: Optional[str] = None
    mime_type: Optional[str] = None
    parent_path: Optional[str] = None

    # Required flags and size.
    # NOTE(review): size is presumably in bytes - confirm.
    is_folder: bool
    size: int

    # Kept as a string, not a datetime; the expected format is not visible here.
    last_modified: Optional[str] = None
|
||||
|
||||
class DocumentClassificationResult(BaseModel):
    """Data contract: Output of DCE -> Input for Inspection & Routing"""

    # Identifier of the classified item.
    # NOTE(review): presumably matches IngestedDocument.item_id - confirm.
    item_id: str

    # Classification outcome, expressed with this module's enums.
    doc_type: DocumentType
    processing_policy: ProcessingPolicy

    # NOTE(review): whether the extension includes the leading dot is not
    # visible here - confirm against the producer.
    file_extension: str
    is_supported: bool

    # Free-text reason for the classification; required, no default.
    reason: str
|
||||
|
||||
class OCRPageResult(BaseModel):
    """Data contract: Output of OCR Service -> Input for Normalization / RAG"""

    # NOTE(review): whether page numbering is 0- or 1-based is not visible here.
    page: int
    text: str
    # NOTE(review): confidence scale (e.g. 0-1 vs 0-100) is not visible here.
    confidence: float

    # Secondary result; defaults to empty text and 0.0 confidence when absent.
    # NOTE(review): presumably produced by a PaddleOCR pass - confirm.
    paddle_text: str = ""
    paddle_confidence: float = 0.0
|
||||
|
||||
class DocumentChunk(BaseModel):
    """Data contract: Output of Chunking -> Input for Embedding & Indexing"""

    # Identity of the chunk and of the file it was taken from.
    chunk_id: str
    file_id: str
    file_name: str

    # Chunk content.
    text: str
    # None until an embedding vector is attached.
    embedding: Optional[list[float]] = None

    # Page span the chunk covers.
    # NOTE(review): inclusive/exclusive bounds are not visible here.
    page_from: int
    page_to: int

    source_url: str

    # Pydantic copies mutable defaults per instance, so this list literal
    # is not shared between models the way a plain class attribute would be.
    permissions: list[str] = []
    site_id: str = ""
|
||||
Reference in New Issue
Block a user