import json
import logging
import os
import sys
from typing import Any, Dict, List, Optional

# Ensure we can import from the root module if run directly
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from ingestion.graph_client import GraphClient

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("IngestionSync")


class SharePointSync:
    """Sync drive-item metadata of one SharePoint site into a local JSON file.

    The sync is driven by a Graph delta query. The delta link returned by the
    last completed run is kept in ``state_file`` so that the next run only
    asks for changes; extracted item metadata is merged into ``output_file``.
    Both files are paths relative to the current working directory.
    """

    def __init__(self, hostname: str, site_path: str):
        """Create a sync for the site at ``hostname`` + ``site_path``."""
        self.graph_client = GraphClient()
        self.hostname = hostname
        self.site_path = site_path
        self.state_file = "delta_state.json"
        self.output_file = "ingestion_output.json"

    @staticmethod
    def _write_json_atomically(
        path: str, payload: Any, *, ensure_ascii: bool = True
    ) -> None:
        """Write ``payload`` as JSON to ``path`` via a temp file + rename.

        Writing to ``<path>.tmp`` first and then swapping it in with
        ``os.replace`` means an interrupted run leaves the previous file
        intact instead of a truncated, unparseable one.
        """
        tmp_path = f"{path}.tmp"
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(payload, f, indent=2, ensure_ascii=ensure_ascii)
        os.replace(tmp_path, path)

    def _load_delta_link(self) -> Optional[str]:
        """Load delta link from local state file.

        Returns:
            The stored delta link, or ``None`` when the state file is
            missing, unreadable as JSON, not a JSON object, or holds an
            empty link. ``None`` makes the caller perform a full sync.
        """
        try:
            with open(self.state_file, "r", encoding="utf-8") as f:
                data = json.load(f)
        except FileNotFoundError:
            return None
        except json.JSONDecodeError as exc:
            logger.warning(
                "State file %s is not valid JSON (%s); falling back to a full sync.",
                self.state_file,
                exc,
            )
            return None

        if not isinstance(data, dict):
            logger.warning(
                "State file %s has an unexpected structure; falling back to a full sync.",
                self.state_file,
            )
            return None
        # Treat an empty string like a missing link.
        return data.get("delta_link") or None

    def _save_delta_link(self, delta_link: str):
        """Save delta link to local state file for next incremental sync."""
        self._write_json_atomically(self.state_file, {"delta_link": delta_link})

    def _extract_metadata(
        self, item: Dict[str, Any], site_id: str, drive_id: str
    ) -> Dict[str, Any]:
        """Convert Graph API item payload to our target schema.

        Args:
            item: One entry of a delta response's ``value`` list.
            site_id: Id of the site the item belongs to (copied to the output).
            drive_id: Id of the drive the item belongs to; also used to look
                the item up again when the payload has no download URL.

        Returns:
            A flat dict with the fields stored in the local database.
            ``download_url`` stays ``None`` for folders, deleted items, and
            files whose follow-up lookup failed.
        """
        download_url = item.get("@microsoft.graph.downloadUrl")
        is_folder = "folder" in item
        is_deleted = "deleted" in item

        if not download_url and not is_folder and not is_deleted:
            try:
                # Delta query might not return downloadUrl, so fetch it directly
                item_id = item.get("id")
                url = f"https://graph.microsoft.com/v1.0/drives/{drive_id}/items/{item_id}"
                full_item = self.graph_client._make_get_request(url)
                download_url = full_item.get("@microsoft.graph.downloadUrl")
            except Exception as e:
                # Best effort: a missing download URL must not abort the sync.
                logger.error(f"Failed to fetch download_url for {item.get('name')}: {e}")

        # `or {}` also covers a facet that is present but null.
        file_facet = item.get("file") or {}
        parent_reference = item.get("parentReference") or {}

        return {
            "site_id": site_id,
            "drive_id": drive_id,
            "item_id": item.get("id"),
            "name": item.get("name"),
            "web_url": item.get("webUrl"),
            "download_url": download_url,
            "mime_type": file_facet.get("mimeType"),
            "parent_path": parent_reference.get("path"),
            "is_folder": is_folder,
            "size": item.get("size"),
            "last_modified": item.get("lastModifiedDateTime"),
            "created": item.get("createdDateTime"),
            "eTag": item.get("eTag"),
            "cTag": item.get("cTag"),
            "deleted": is_deleted,
        }

    def _upsert_to_local_db(self, new_items: List[Dict[str, Any]]):
        """Simulate upsert into a database by writing to a JSON file.

        Existing records are loaded from ``output_file`` and keyed by
        ``item_id``. Each new item replaces the record with the same id,
        except that a deleted item whose id is already stored only flips
        that record's ``deleted`` flag, keeping its other fields.
        """
        try:
            with open(self.output_file, "r", encoding="utf-8") as f:
                existing = json.load(f)
        except FileNotFoundError:
            existing = []
        except json.JSONDecodeError as exc:
            # Starting empty is only complete after a full sync, so say so
            # loudly instead of silently discarding the stored records.
            logger.warning(
                "Output file %s is not valid JSON (%s); starting from an empty "
                "database. Delete %s to force a full resync.",
                self.output_file,
                exc,
                self.state_file,
            )
            existing = []

        db = {record["item_id"]: record for record in existing}

        for item in new_items:
            item_id = item["item_id"]
            if item.get("deleted") and item_id in db:
                # Keep the stored metadata and only mark it as deleted.
                db[item_id]["deleted"] = True
            else:
                # New or changed item, or a deletion of an item we never had.
                db[item_id] = item

        final_list = list(db.values())
        self._write_json_atomically(self.output_file, final_list, ensure_ascii=False)

        logger.info(f"Local database updated. Total items currently stored: {len(final_list)}")

    def run_sync(self):
        """Run one sync pass: resolve the drive, pull changes, persist them.

        The new delta link is saved only after the collected changes have
        been written to the local database, so a failure while extracting
        or storing them makes the next run fetch the same changes again
        rather than skip them.
        """
        logger.info("=== STARTING SHAREPOINT SYNC ===")

        # 1. & 2. Resolve Site and Drive
        logger.info(f"Resolving site: {self.hostname}:{self.site_path}")
        site_info = self.graph_client.get_site_by_path(self.hostname, self.site_path)
        site_id = site_info["id"]

        logger.info(f"Resolving drive for site: {site_id}")
        drive_info = self.graph_client.get_drive(site_id)
        drive_id = drive_info["id"]

        # 3. Delta Query setup
        delta_link = self._load_delta_link()
        if delta_link:
            logger.info("Found existing delta_link. Performing INCREMENTAL sync.")
        else:
            logger.info("No delta_link found. Performing FULL sync.")

        items_collected = []
        current_url = delta_link
        new_delta_link = None

        # Loop over pagination
        while True:
            response = self.graph_client.delta_query(drive_id, current_url)
            items_collected.extend(response.get("value", []))

            next_link = response.get("@odata.nextLink")
            if next_link:
                current_url = next_link
                logger.info("Fetching next page of delta results...")
                continue

            new_delta_link = response.get("@odata.deltaLink")
            if new_delta_link:
                logger.info("Reached end of delta changes.")
            else:
                logger.warning("No nextLink or deltaLink found in response! Breaking loop.")
            break

        logger.info(f"Delta query returned {len(items_collected)} change(s).")

        # 4. Extract metadata and save
        if items_collected:
            processed_items = [
                self._extract_metadata(item, site_id, drive_id)
                for item in items_collected
            ]
            self._upsert_to_local_db(processed_items)
        else:
            logger.info("No items to process.")

        # 5. Advance the sync token only once the changes are safely stored.
        if new_delta_link:
            self._save_delta_link(new_delta_link)
            logger.info("Saved new delta_link.")


if __name__ == "__main__":
    sync = SharePointSync("285pdg.sharepoint.com", "/sites/poc_system")
    sync.run_sync()