import os
import sys
import json
import logging

# Insert this script's own directory at the front of sys.path before the
# project imports below run (presumably so those packages resolve when the
# script is launched directly -- confirm against the repo layout).
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))

from core.models import IngestedDocument
from extraction.dce import DocumentClassificationEngine
from ingestion.providers.sharepoint_provider import SharePointProvider

logging.basicConfig(level=logging.INFO, format="%(levelname)s:%(name)s:%(message)s")


def main():
    """Run the document classification engine over previously ingested items.

    Reads ``ingestion_output.json`` from the current working directory,
    skips every entry whose ``is_folder`` value is truthy, builds an
    ``IngestedDocument`` from each remaining entry and prints the
    classification result returned by the engine.

    If the input file does not exist, a hint is printed and the function
    returns without doing anything else.
    """
    print("=== STARTING DCE PIPELINE TEST ===")

    input_path = "ingestion_output.json"
    if not os.path.exists(input_path):
        print("File ingestion_output.json not found! Please run ingestion sync first.")
        return

    with open(input_path, "r", encoding="utf-8") as handle:
        items = json.load(handle)

    # Initialize the provider so files can be downloaded via Graph API auth.
    engine = DocumentClassificationEngine(provider=SharePointProvider())

    print(f"Loaded {len(items)} items from ingestion_output.json\n")

    # Lazy filter: each entry is checked just before it is processed,
    # so folders are skipped in the same order as they appear in the input.
    file_entries = (entry for entry in items if not entry.get("is_folder"))

    for entry in file_entries:
        doc = IngestedDocument(**entry)
        print(f"\n--- Processing: {doc.name} ---")

        # The raw entry is passed alongside the model instance as target_item.
        result = engine.classify(doc, target_item=entry)
        print(f">> Type: {result.doc_type.value} | Policy: {result.processing_policy.value} | Reason: {result.reason}")


if __name__ == "__main__":
    main()