Files
poc_system/test_dce_pipeline.py
2026-05-09 10:31:28 +00:00

42 lines
1.3 KiB
Python

import os
import sys
import json
import logging
# Put the directory containing this file at the front of the import search
# path. This must run before the project imports below; presumably `core`,
# `extraction` and `ingestion` are packages located next to this script —
# confirm against the repository layout.
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from core.models import IngestedDocument
from extraction.dce import DocumentClassificationEngine
from ingestion.providers.sharepoint_provider import SharePointProvider
# Configure root logging at INFO so records are emitted as LEVEL:logger:message.
logging.basicConfig(level=logging.INFO, format="%(levelname)s:%(name)s:%(message)s")
def main():
    """Classify every previously ingested, non-folder item and print the result.

    Reads ``ingestion_output.json`` from the current working directory,
    skips entries whose ``is_folder`` value is truthy, builds an
    ``IngestedDocument`` from each remaining entry and prints the document
    type, processing policy and reason reported by the classification engine.

    Returns:
        None. When the input file does not exist, a hint is printed and the
        function returns before any provider or engine is created.
    """
    input_path = "ingestion_output.json"

    print("=== STARTING DCE PIPELINE TEST ===")

    # Open the file directly instead of testing os.path.exists() first: the
    # file could disappear between the check and the open, and the exception
    # already tells us exactly what we need to know.
    try:
        with open(input_path, "r", encoding="utf-8") as f:
            items = json.load(f)
    except FileNotFoundError:
        print("File ingestion_output.json not found! Please run ingestion sync first.")
        return

    # Initialize the provider so files can be downloaded using Graph API auth.
    provider = SharePointProvider()
    dce = DocumentClassificationEngine(provider=provider)

    print(f"Loaded {len(items)} items from ingestion_output.json\n")

    for item in items:
        # Folder entries are skipped; only the remaining entries are classified.
        if item.get("is_folder"):
            continue

        doc = IngestedDocument(**item)
        print(f"\n--- Processing: {doc.name} ---")

        # The raw dict is handed over alongside the model object as target_item.
        result = dce.classify(doc, target_item=item)
        print(f">> Type: {result.doc_type.value} | Policy: {result.processing_policy.value} | Reason: {result.reason}")
# Run the pipeline test only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()