Phase 7: Hoàn thiện Modular RAG Backend với FastAPI và Đa LLM Provider

This commit is contained in:
2026-05-08 07:30:30 +00:00
commit 26d1298cf6
51 changed files with 5360 additions and 0 deletions

View File

@@ -0,0 +1,39 @@
from typing import Dict, Any, Tuple
import httpx
import logging
from core.models import IngestedDocument, DocumentClassificationResult, DocumentType, ProcessingPolicy
# Module-level logger shared by the code in this file, registered under the
# name "DCE" (presumably the component's acronym -- confirm with the owner).
logger = logging.getLogger("DCE")
class MagicNumberValidator:
    """Validate file types using magic numbers (leading file signatures).

    ``SIGNATURES`` maps a signature prefix to a ``(DocumentType, description)``
    pair. Matching is a plain prefix test performed in the mapping's
    insertion order.
    """

    SIGNATURES: Dict[bytes, Tuple[DocumentType, str]] = {
        b"%PDF-": (DocumentType.TEXTUAL_DOCUMENT, "PDF Document"),
        # ZIP container signature, shared with Office Open XML files, so the
        # concrete document type needs a further check.
        b"PK\x03\x04": (DocumentType.UNKNOWN, "ZIP Archive / Office Open XML"),
        b"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1": (DocumentType.UNKNOWN, "Legacy Office Document"),
        # CAD magic numbers; add further ones here if needed.
        b"AC10": (DocumentType.DRAWING, "AutoCAD Drawing (DWG)"),
    }

    @classmethod
    def validate_from_bytes(cls, header_bytes: bytes) -> Tuple[bool, DocumentType, str]:
        """Check whether ``header_bytes`` starts with a known signature.

        Args:
            header_bytes: Leading bytes of the file to classify.

        Returns:
            ``(True, doc_type, description)`` for the first signature in
            ``SIGNATURES`` that prefixes ``header_bytes``; otherwise
            ``(False, DocumentType.UNKNOWN, "Unknown Signature")``. Empty
            input never matches.
        """
        for sig, (doc_type, desc) in cls.SIGNATURES.items():
            if header_bytes.startswith(sig):
                return True, doc_type, desc
        return False, DocumentType.UNKNOWN, "Unknown Signature"

    @classmethod
    def fetch_header_bytes(cls, download_url: str, num_bytes: int = 256) -> bytes:
        """Fetch at most the first ``num_bytes`` bytes of a remote file.

        An HTTP ``Range`` header asks the server for just the leading bytes.
        The response is streamed and reading stops once ``num_bytes`` have
        arrived, so a server that ignores ``Range`` and sends the whole file
        does not cause the whole file to be downloaded.

        Args:
            download_url: URL of the file to inspect.
            num_bytes: Maximum number of leading bytes to return.

        Returns:
            Up to ``num_bytes`` leading bytes of the response body. ``b""``
            when ``num_bytes`` is not positive or when the request fails for
            any reason (the failure is logged, not raised).
        """
        if num_bytes <= 0:
            # "bytes=0--1" would be a malformed Range header; nothing to fetch.
            return b""

        headers = {"Range": f"bytes=0-{num_bytes - 1}"}
        collected = bytearray()
        try:
            with httpx.Client() as client:
                # Stream instead of client.get(): get() buffers the complete
                # body, which is the entire file if Range is not honoured.
                with client.stream("GET", download_url, headers=headers) as response:
                    response.raise_for_status()
                    for chunk in response.iter_bytes():
                        collected.extend(chunk)
                        if len(collected) >= num_bytes:
                            break
        except Exception as e:
            # Deliberate best-effort boundary: callers get an empty header
            # (which matches no signature) rather than an exception.
            logger.error("Failed to fetch header bytes: %s", e)
            return b""
        # A chunk may overshoot the requested size; never return more than asked.
        return bytes(collected[:num_bytes])