Files
poc_system/ingestion/sync.py
2026-05-09 10:31:28 +00:00

91 lines
3.1 KiB
Python

import os
import json
import logging
from typing import List, Dict, Any
import sys
# Put the grandparent directory of this file (the directory that contains the
# ``ingestion`` package) at the front of sys.path. This must run before the
# ``ingestion.*`` import below; presumably it exists so the file can be run
# directly as a script -- confirm before removing.
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from ingestion.providers.base_provider import BaseStorageProvider
# NOTE(review): this configures the root logger as an import-time side effect,
# so merely importing this module sets up INFO-level logging for the process
# (unless the root logger already has handlers).
logging.basicConfig(level=logging.INFO)
# Module-wide logger used by SyncEngine for progress messages.
logger = logging.getLogger("IngestionSync")
class SyncEngine:
    """Provider-agnostic synchronization orchestrator.

    Accepts any ``BaseStorageProvider`` implementation (SharePoint, Google
    Drive, NAS, ...), asks it for the changes since the last saved sync state,
    merges those changes into a local JSON file that stands in for a database,
    and finally persists the new sync state.
    """

    def __init__(
        self,
        provider: "BaseStorageProvider",
        state_file: str = "delta_state.json",
        output_file: str = "ingestion_output.json",
    ):
        """Store the provider and the paths of the two local JSON files.

        Args:
            provider: Object exposing ``fetch_changes(state)`` that returns an
                ``(items, new_state)`` pair.
            state_file: Path of the JSON file holding the provider sync state.
            output_file: Path of the JSON file holding the merged items.
        """
        self.provider = provider
        self.state_file = state_file
        self.output_file = output_file

    @staticmethod
    def _write_json_atomic(path: str, payload: Any) -> None:
        """Serialize *payload* to *path* without ever exposing a partial file.

        The data is written to a sibling temporary file and then moved over the
        target with ``os.replace``, so a crash or serialization error mid-write
        leaves the previous contents of *path* intact.
        """
        tmp_path = f"{path}.tmp"
        try:
            with open(tmp_path, "w", encoding="utf-8") as f:
                json.dump(payload, f, indent=2, ensure_ascii=False)
            os.replace(tmp_path, path)
        except Exception:
            # Best-effort cleanup of the half-written temp file; the original
            # error is what the caller needs to see.
            try:
                os.remove(tmp_path)
            except OSError:
                pass
            raise

    def _load_sync_state(self) -> Dict:
        """Load the sync state from the local state file.

        Returns:
            The decoded state, or an empty dict when the state file does not
            exist or is not valid JSON. An empty state makes ``run_sync``
            perform a full sync.
        """
        try:
            with open(self.state_file, "r", encoding="utf-8") as f:
                return json.load(f)
        except FileNotFoundError:
            return {}
        except json.JSONDecodeError:
            # Same policy as for a corrupt output file: recover instead of
            # failing every run until someone deletes the file by hand.
            logger.warning(
                "Sync state file %s is not valid JSON; falling back to a full sync.",
                self.state_file,
            )
            return {}

    def _save_sync_state(self, state: Dict):
        """Persist the sync state to the local state file (atomically)."""
        self._write_json_atomic(self.state_file, state)

    def _upsert_to_local_db(self, new_items: List[Dict[str, Any]]):
        """Merge *new_items* into the local JSON file (a simulated database).

        Records are keyed by their ``"id"``. A change flagged ``"is_deleted"``
        marks the already-stored record as deleted (keeping its other fields);
        if no such record is stored, the change itself is stored. Any other
        change replaces the stored record.

        Records without an ``"id"`` cannot be addressed and are skipped with a
        warning, both when reading the existing file and when applying
        *new_items*.
        """
        try:
            with open(self.output_file, "r", encoding="utf-8") as f:
                existing = json.load(f)
        except FileNotFoundError:
            existing = []
        except json.JSONDecodeError:
            logger.warning(
                "Output file %s is not valid JSON; rebuilding it from the incoming changes.",
                self.output_file,
            )
            existing = []

        if not isinstance(existing, list):
            logger.warning(
                "Output file %s does not contain a JSON list; ignoring its contents.",
                self.output_file,
            )
            existing = []

        db: Dict[Any, Dict[str, Any]] = {}
        for record in existing:
            if isinstance(record, dict) and record.get("id") is not None:
                db[record["id"]] = record
            else:
                logger.warning("Dropping stored record without an 'id': %r", record)

        for item in new_items:
            item_id = item.get("id")
            if item_id is None:
                # Storing it under key None would write a record that has no
                # "id" field and break the next load of the output file.
                logger.warning("Skipping change without an 'id': %r", item)
                continue
            if item.get("is_deleted") and item_id in db:
                db[item_id]["is_deleted"] = True
            else:
                db[item_id] = item

        final_list = list(db.values())
        self._write_json_atomic(self.output_file, final_list)
        logger.info("Local database updated. Total items: %d", len(final_list))

    def run_sync(self):
        """Run one sync cycle: fetch changes from the provider and store them.

        A non-empty saved state triggers an incremental sync, otherwise a full
        sync; either way the saved state is handed to the provider as-is.
        """
        logger.info("=== STARTING SYNC ===")
        sync_state = self._load_sync_state()

        if sync_state:
            logger.info("Found existing sync state. Performing INCREMENTAL sync.")
        else:
            logger.info("No sync state found. Performing FULL sync.")

        items, new_state = self.provider.fetch_changes(sync_state)
        logger.info("Provider returned %d change(s).", len(items))

        if items:
            self._upsert_to_local_db(items)

        # The state is saved only after the items are stored, so a failure in
        # the upsert leaves the old state in place and the same changes are
        # fetched again on the next run.
        self._save_sync_state(new_state)
        logger.info("Sync state saved.")
if __name__ == "__main__":
    # Script entry point: the SharePoint provider is imported only when this
    # file is executed directly, then a single sync cycle is run with the
    # default state/output file names.
    from ingestion.providers.sharepoint_provider import SharePointProvider

    SyncEngine(SharePointProvider()).run_sync()