"""Shared data contracts (enums and pydantic models) exchanged between pipeline stages."""
from pydantic import BaseModel
|
|
from typing import Optional
|
|
from enum import Enum
|
|
|
|
class DocumentType(str, Enum):
    """Coarse category assigned to a document.

    Members mix in ``str``, so each one compares equal to its lowercase
    string value and can be looked up from that value, e.g.
    ``DocumentType("spreadsheet")``.
    """

    TEXTUAL_DOCUMENT = "textual_document"
    SPREADSHEET = "spreadsheet"
    PRESENTATION = "presentation"
    DRAWING = "drawing"
    BINARY = "binary"
    # Fallback member for anything that does not match a category above.
    UNKNOWN = "unknown"
|
|
|
|
class ProcessingPolicy(str, Enum):
    """How a classified document should be handled downstream.

    Members mix in ``str``, so each one compares equal to its lowercase
    string value.
    """

    # Needs OCR (e.g., SCAN_PDF).
    REQUIRES_OCR = "requires_ocr"
    # No OCR needed, pure text extraction (e.g., TEXT_PDF, DOCX).
    SKIP_OCR = "skip_ocr"
    # Search by metadata only, no text extraction (e.g., CAD, DRAWING_PDF).
    METADATA_ONLY = "metadata_only"
    # Doubtful cases (e.g., ambiguous PDF).
    REQUIRES_REVIEW = "requires_review"
    # Ignored.
    UNSUPPORTED = "unsupported"
|
|
|
|
class PdfType(str, Enum):
    """PDF sub-classification.

    Unlike the other enums in this file, each member's value is the
    upper-case member name itself. Members mix in ``str``, so they compare
    equal to that string.
    """

    TEXT_PDF = "TEXT_PDF"
    SCAN_PDF = "SCAN_PDF"
    DRAWING_PDF = "DRAWING_PDF"
    AMBIGUOUS_PDF = "AMBIGUOUS_PDF"
    # Marker for inputs that are not PDFs at all.
    NOT_PDF = "NOT_PDF"
|
|
|
|
class IngestedDocument(BaseModel):
    """Data contract: Output of Ingestion -> Input for Document Classification Engine (DCE)"""

    # --- Required fields: validation fails if any of these is missing. ---
    site_id: str
    drive_id: str
    item_id: str
    name: str
    web_url: str

    # --- Optional metadata: defaults to None when not supplied. ---
    download_url: Optional[str] = None
    mime_type: Optional[str] = None
    parent_path: Optional[str] = None

    # --- Required flags / sizes. ---
    is_folder: bool
    # NOTE(review): unit is not stated in this model (presumably bytes) — confirm.
    size: int

    # Kept as a plain string; this model does not parse or constrain the
    # timestamp format.
    last_modified: Optional[str] = None
|
|
|
|
class DocumentClassificationResult(BaseModel):
    """Data contract: Output of DCE -> Input for Inspection & Routing"""

    # All fields are required; none has a default.

    # NOTE(review): presumably matches IngestedDocument.item_id — confirm
    # against the producer.
    item_id: str
    doc_type: DocumentType
    processing_policy: ProcessingPolicy
    # NOTE(review): whether the leading dot is included is not fixed by this
    # model — confirm against the producer.
    file_extension: str
    is_supported: bool
    # Free-text explanation accompanying the classification.
    reason: str
|
|
|
|
class OCRPageResult(BaseModel):
    """Data contract: Output of OCR Service -> Input for Normalization / RAG"""

    # --- Required fields. ---
    # NOTE(review): 0- vs 1-based page numbering is not fixed here — confirm.
    page: int
    text: str
    # NOTE(review): value range (e.g. 0-1 vs 0-100) is not constrained by
    # this model — confirm against the OCR service.
    confidence: float

    # --- Optional secondary result; defaults to empty text / 0.0. ---
    # NOTE(review): presumably the output of a separate "paddle" OCR engine —
    # confirm against the OCR service.
    paddle_text: str = ""
    paddle_confidence: float = 0.0
|
|
|
|
class DocumentChunk(BaseModel):
    """Data contract: Output of Chunking -> Input for Embedding & Indexing"""

    # --- Required identification and content. ---
    chunk_id: str
    file_id: str
    file_name: str
    text: str

    # None until an embedding vector has been attached to the chunk.
    embedding: Optional[list[float]] = None

    # --- Required provenance. ---
    # NOTE(review): whether the page range is inclusive and 0- or 1-based is
    # not fixed by this model — confirm against the chunker.
    page_from: int
    page_to: int
    source_url: str

    # A literal [] default is intentional: pydantic copies field defaults for
    # each instance, so the list is not shared between models the way a
    # mutable function default would be.
    permissions: list[str] = []
    # Empty string when the originating site is not supplied.
    site_id: str = ""
|