Files
poc_system/core/models.py
2026-05-09 10:31:28 +00:00

71 lines
2.2 KiB
Python

from pydantic import BaseModel
from typing import Optional
from enum import Enum
class DocumentType(str, Enum):
    # Closed set of document categories.
    #
    # The ``str`` mix-in makes every member a genuine string instance, so a
    # member compares equal to its lowercase value (e.g. ``"spreadsheet"``)
    # and can be looked up by value: ``DocumentType("spreadsheet")``.

    TEXTUAL_DOCUMENT = "textual_document"
    SPREADSHEET = "spreadsheet"
    PRESENTATION = "presentation"
    DRAWING = "drawing"
    BINARY = "binary"
    UNKNOWN = "unknown"
class ProcessingPolicy(str, Enum):
    # How a classified document should be handled downstream.
    # Members are strings (``str`` mix-in) equal to their lowercase values.

    # OCR is needed (e.g., SCAN_PDF).
    REQUIRES_OCR = "requires_ocr"

    # No OCR needed; pure text extraction (e.g., TEXT_PDF, DOCX).
    SKIP_OCR = "skip_ocr"

    # Searchable by metadata only, no text extraction (e.g., CAD, DRAWING_PDF).
    METADATA_ONLY = "metadata_only"

    # Doubtful cases (e.g., an ambiguous PDF).
    REQUIRES_REVIEW = "requires_review"

    # Ignored.
    UNSUPPORTED = "unsupported"
class PdfType(str, Enum):
    # PDF sub-categories. Unlike the other enums in this module, each value
    # is the upper-case member name itself.
    # NOTE(review): presumably produced by a PDF inspection step, with
    # NOT_PDF covering non-PDF inputs -- confirm against the caller.

    TEXT_PDF = "TEXT_PDF"
    SCAN_PDF = "SCAN_PDF"
    DRAWING_PDF = "DRAWING_PDF"
    AMBIGUOUS_PDF = "AMBIGUOUS_PDF"
    NOT_PDF = "NOT_PDF"
class IngestedDocument(BaseModel):
    """Data contract: Output of Ingestion -> Input for Document Classification Engine (DCE)"""

    # --- Required string fields (no default) ---
    site_id: str
    drive_id: str
    item_id: str
    name: str
    web_url: str

    # --- Optional string fields; each defaults to None when omitted ---
    download_url: Optional[str] = None
    mime_type: Optional[str] = None
    parent_path: Optional[str] = None

    # --- Required item attributes ---
    is_folder: bool
    size: int  # NOTE(review): presumably a byte count -- confirm with the producer

    # Kept as a plain string; the timestamp format is not fixed by this model.
    last_modified: Optional[str] = None
class DocumentClassificationResult(BaseModel):
    """Data contract: Output of DCE -> Input for Inspection & Routing"""

    # All fields are required; none has a default.

    # NOTE(review): presumably matches IngestedDocument.item_id -- confirm.
    item_id: str

    # Classification outcome, expressed with this module's enums.
    doc_type: DocumentType
    processing_policy: ProcessingPolicy

    file_extension: str
    is_supported: bool

    # Free-text explanation of the classification decision.
    reason: str
class OCRPageResult(BaseModel):
    """Data contract: Output of OCR Service -> Input for Normalization / RAG"""

    # --- Required per-page result ---
    page: int  # NOTE(review): 0- vs 1-based numbering is not fixed here -- confirm
    text: str
    confidence: float  # no range is enforced by this model

    # --- Optional secondary result; empty / zero when not supplied ---
    # NOTE(review): the "paddle_" prefix presumably refers to a PaddleOCR
    # pass -- confirm against the OCR service.
    paddle_text: str = ""
    paddle_confidence: float = 0.0
class DocumentChunk(BaseModel):
    """Data contract: Output of Chunking -> Input for Embedding & Indexing"""

    # --- Required identity and content ---
    chunk_id: str
    file_id: str
    file_name: str
    text: str

    # Vector for the chunk text; None until one is attached.
    embedding: Optional[list[float]] = None

    # --- Required page span of the chunk ---
    # NOTE(review): presumably inclusive page numbers -- confirm with the chunker.
    page_from: int
    page_to: int

    # --- Optional provenance links ---
    source_url: Optional[str] = None
    download_url: Optional[str] = None

    # NOTE(review): a mutable literal default; this relies on pydantic copying
    # field defaults per instance rather than sharing one list -- confirm for
    # the pydantic version in use.
    permissions: list[str] = []

    # Empty string when the originating site is not supplied.
    site_id: str = ""