# poc_system/core/models.py

from pydantic import BaseModel
from typing import Optional
from enum import Enum

class DocumentType(str, Enum):
    """String-valued categories that can be assigned to a document.

    Used as the type of ``DocumentClassificationResult.doc_type``. Because the
    enum also subclasses ``str``, every member compares equal to its plain
    string value (e.g. ``DocumentType.BINARY == "binary"``).
    """
    # NOTE(review): which file extensions / MIME types map to each category is
    # decided outside this module — confirm against the classifier.
    TEXTUAL_DOCUMENT = "textual_document"
    SPREADSHEET = "spreadsheet"
    PRESENTATION = "presentation"
    DRAWING = "drawing"
    BINARY = "binary"
    UNKNOWN = "unknown"  # presumably the fallback when nothing else matches — confirm

class ProcessingPolicy(str, Enum):
    """String-valued policies describing how a classified document is handled.

    Used as the type of ``DocumentClassificationResult.processing_policy``.
    The per-member notes cite ``PdfType`` members and file kinds as examples
    only.
    """
    REQUIRES_OCR = "requires_ocr"          # Needs OCR (e.g., SCAN_PDF)
    SKIP_OCR = "skip_ocr"                  # No OCR needed; pure text extraction (e.g., TEXT_PDF, DOCX)
    METADATA_ONLY = "metadata_only"        # Searchable by metadata only; no text extraction (e.g., CAD, DRAWING_PDF)
    REQUIRES_REVIEW = "requires_review"    # Doubtful cases (e.g., an ambiguous PDF)
    UNSUPPORTED = "unsupported"            # Ignored

class PdfType(str, Enum):
    """String-valued labels for the kind of PDF a file is.

    Unlike the other enums in this module, the values are upper-case and
    identical to the member names. The comments on ``ProcessingPolicy`` cite
    several of these members as examples for individual policies.
    """
    TEXT_PDF = "TEXT_PDF"
    SCAN_PDF = "SCAN_PDF"
    DRAWING_PDF = "DRAWING_PDF"
    AMBIGUOUS_PDF = "AMBIGUOUS_PDF"
    NOT_PDF = "NOT_PDF"  # presumably for inputs that are not PDFs at all — confirm

class IngestedDocument(BaseModel):
    """Data contract: Output of Ingestion -> Input for Document Classification Engine (DCE).

    Describes a single ingested item. Fields without a default are required
    when constructing the model; ``Optional`` fields default to ``None``.
    """
    # Required identifiers.
    # NOTE(review): the site/drive/item naming looks like SharePoint /
    # Microsoft Graph identifiers — confirm against the ingestion code.
    site_id: str
    drive_id: str
    item_id: str
    name: str
    web_url: str
    # Optional metadata; each is None when not supplied.
    download_url: Optional[str] = None
    mime_type: Optional[str] = None
    parent_path: Optional[str] = None
    # Required flags / measurements.
    is_folder: bool
    size: int  # presumably a size in bytes — TODO confirm
    # Kept as a plain string; the timestamp format is not defined here.
    last_modified: Optional[str] = None

class DocumentClassificationResult(BaseModel):
    """Data contract: Output of DCE -> Input for Inspection & Routing.

    Every field is required; none has a default value.
    """
    # presumably the same identifier as IngestedDocument.item_id — confirm
    item_id: str
    doc_type: DocumentType
    processing_policy: ProcessingPolicy
    # NOTE(review): whether this includes the leading dot or is lower-cased
    # is not specified here — verify against the producer.
    file_extension: str
    is_supported: bool
    reason: str  # free-form text; presumably explains the classification — confirm

class OCRPageResult(BaseModel):
    """Data contract: Output of OCR Service -> Input for Normalization / RAG.

    ``page``, ``text`` and ``confidence`` are required. The ``paddle_*``
    fields are optional and default to an empty string and ``0.0``.
    """
    page: int  # NOTE(review): 0- or 1-based indexing is not specified here — confirm
    text: str
    confidence: float  # value range is not enforced by this model
    # NOTE(review): presumably the output of a separate PaddleOCR pass kept
    # alongside the primary result — confirm with the OCR service.
    paddle_text: str = ""
    paddle_confidence: float = 0.0

class DocumentChunk(BaseModel):
    """Data contract: Output of Chunking -> Input for Embedding & Indexing.

    ``chunk_id``, ``file_id``, ``file_name``, ``text``, ``page_from`` and
    ``page_to`` are required; the remaining fields have defaults.
    """
    chunk_id: str
    file_id: str
    file_name: str
    text: str
    # None by default; presumably filled in by the embedding step — confirm.
    embedding: Optional[list[float]] = None
    # Page span of the chunk.
    # NOTE(review): inclusivity and 0-/1-based indexing are not specified here.
    page_from: int
    page_to: int
    source_url: Optional[str] = None
    download_url: Optional[str] = None
    # NOTE(review): pydantic copies mutable defaults per instance, so this
    # list is not shared between models; Field(default_factory=list) would
    # make that explicit.
    permissions: list[str] = []
    site_id: str = ""