from enum import Enum
from typing import Optional

from pydantic import BaseModel


# Coarse document categories. Members subclass ``str``, so each member
# compares equal to (and serializes as) its lowercase string value.
class DocumentType(str, Enum):
    TEXTUAL_DOCUMENT = "textual_document"
    SPREADSHEET = "spreadsheet"
    PRESENTATION = "presentation"
    DRAWING = "drawing"
    BINARY = "binary"
    UNKNOWN = "unknown"


# How a document should be handled downstream of classification.
class ProcessingPolicy(str, Enum):
    # Needs OCR (e.g., SCAN_PDF)
    REQUIRES_OCR = "requires_ocr"
    # No OCR needed, pure text extraction (e.g., TEXT_PDF, DOCX)
    SKIP_OCR = "skip_ocr"
    # Search by metadata only, no text extraction (e.g., CAD, DRAWING_PDF)
    METADATA_ONLY = "metadata_only"
    # Doubtful cases (e.g. Ambiguous PDF)
    REQUIRES_REVIEW = "requires_review"
    # Ignored
    UNSUPPORTED = "unsupported"


# PDF sub-classification labels. Unlike the enums above, each value is the
# upper-case member name itself.
class PdfType(str, Enum):
    TEXT_PDF = "TEXT_PDF"
    SCAN_PDF = "SCAN_PDF"
    DRAWING_PDF = "DRAWING_PDF"
    AMBIGUOUS_PDF = "AMBIGUOUS_PDF"
    NOT_PDF = "NOT_PDF"


class IngestedDocument(BaseModel):
    """Data contract: Output of Ingestion -> Input for Document Classification Engine (DCE)"""

    # Required identifiers and descriptive fields.
    site_id: str
    drive_id: str
    item_id: str
    name: str
    web_url: str

    # Optional fields; each falls back to None when omitted.
    download_url: Optional[str] = None
    mime_type: Optional[str] = None
    parent_path: Optional[str] = None

    # Required item attributes.
    is_folder: bool
    size: int  # NOTE(review): unit is not stated here — presumably bytes, confirm

    # Kept as a plain string; no datetime parsing is declared on this model.
    last_modified: Optional[str] = None


class DocumentClassificationResult(BaseModel):
    """Data contract: Output of DCE -> Input for Inspection & Routing"""

    # All fields are required; none declares a default.
    item_id: str
    doc_type: DocumentType
    processing_policy: ProcessingPolicy
    file_extension: str
    is_supported: bool
    reason: str


class OCRPageResult(BaseModel):
    """Data contract: Output of OCR Service -> Input for Normalization / RAG"""

    # Required per-page result.
    page: int
    text: str
    confidence: float  # NOTE(review): value range is not constrained here — confirm

    # Secondary result; defaults to empty text and 0.0 confidence when omitted.
    # NOTE(review): presumably produced by a PaddleOCR pass — confirm with the
    # OCR service.
    paddle_text: str = ""
    paddle_confidence: float = 0.0


class DocumentChunk(BaseModel):
    """Data contract: Output of Chunking -> Input for Embedding & Indexing"""

    # Required chunk identity and content.
    chunk_id: str
    file_id: str
    file_name: str
    text: str

    # Defaults to None when no embedding vector is supplied.
    embedding: Optional[list[float]] = None

    # Required page bounds.
    # NOTE(review): inclusive/exclusive semantics are not stated here — confirm.
    page_from: int
    page_to: int

    # Optional links back to the source item.
    source_url: Optional[str] = None
    download_url: Optional[str] = None

    # Pydantic copies mutable defaults for each instance, so this list literal
    # is not shared between DocumentChunk objects.
    permissions: list[str] = []
    site_id: str = ""