# Listing header (file-viewer residue): 148 lines, 6.1 KiB, Python
import json
import logging
import os
from typing import Any, Dict, List, Optional

# Put the parent of this file's directory first on sys.path so that the
# `ingestion` package resolves even when this file is executed directly.
import sys

_module_dir = os.path.dirname(os.path.abspath(__file__))
sys.path.insert(0, os.path.dirname(_module_dir))

from ingestion.graph_client import GraphClient

# Root logging is configured at import time; this module logs under the
# "IngestionSync" logger name.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("IngestionSync")


class SharePointSync:
    """Mirror the item metadata of a SharePoint site's drive into a local JSON file.

    The sync resolves the site and drive through ``GraphClient``, pages through
    the drive's delta query, converts every returned item to a flat record and
    upserts those records into ``output_file``.  The delta link returned by the
    last page is stored in ``state_file`` so the next run only asks for changes.
    """

    def __init__(self, hostname: str, site_path: str):
        """Create a sync for the site at ``hostname`` / ``site_path``.

        Args:
            hostname: SharePoint host name passed to ``get_site_by_path``.
            site_path: Site path passed to ``get_site_by_path``.
        """
        self.graph_client = GraphClient()
        self.hostname = hostname
        self.site_path = site_path
        # Both files are relative paths, i.e. resolved against the current
        # working directory of the process.
        self.state_file = "delta_state.json"
        self.output_file = "ingestion_output.json"

    @staticmethod
    def _write_json_atomic(path: str, payload: Any, **dump_kwargs: Any) -> None:
        """Serialize ``payload`` to ``path`` without exposing a half-written file.

        The data is written to ``<path>.tmp`` first and then moved over the
        target with ``os.replace``, so an interruption during the write leaves
        the previous content of ``path`` intact.
        """
        tmp_path = f"{path}.tmp"
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(payload, f, **dump_kwargs)
        os.replace(tmp_path, path)

    def _load_delta_link(self) -> Optional[str]:
        """Load the delta link from the local state file.

        Returns:
            The stored delta link, or ``None`` when the state file is missing,
            is not valid JSON, or does not contain a JSON object.  ``None``
            makes ``run_sync`` fall back to a full sync.
        """
        try:
            with open(self.state_file, "r", encoding="utf-8") as f:
                data = json.load(f)
        except FileNotFoundError:
            return None
        except json.JSONDecodeError as e:
            # A damaged state file must not block syncing forever; a full
            # sync re-creates it.
            logger.warning(f"State file {self.state_file} is not valid JSON ({e}); ignoring it.")
            return None

        if not isinstance(data, dict):
            logger.warning(f"State file {self.state_file} has an unexpected layout; ignoring it.")
            return None
        return data.get("delta_link")

    def _save_delta_link(self, delta_link: str) -> None:
        """Save the delta link to the local state file for the next incremental sync."""
        self._write_json_atomic(self.state_file, {"delta_link": delta_link}, indent=2)

    def _extract_metadata(self, item: Dict[Any, Any], site_id: str, drive_id: str) -> Dict[str, Any]:
        """Convert a Graph API item payload to the target record schema.

        Args:
            item: One entry of a delta response's ``value`` list.
            site_id: Id of the site the drive belongs to (copied into the record).
            drive_id: Id of the drive (copied into the record and used for the
                download-URL lookup).

        Returns:
            A flat dict with the keys ``site_id``, ``drive_id``, ``item_id``,
            ``name``, ``web_url``, ``download_url``, ``mime_type``,
            ``parent_path``, ``is_folder``, ``size``, ``last_modified``,
            ``created``, ``eTag``, ``cTag`` and ``deleted``.  Fields missing
            from ``item`` become ``None``.
        """
        item_id = item.get("id")
        download_url = item.get("@microsoft.graph.downloadUrl")

        # Delta query might not return downloadUrl, so fetch the item directly.
        # Folders and deleted items are skipped, and so are items without an
        # id (there is nothing to look up).
        is_live_file = "folder" not in item and "deleted" not in item
        if not download_url and is_live_file and item_id:
            try:
                url = f"https://graph.microsoft.com/v1.0/drives/{drive_id}/items/{item_id}"
                full_item = self.graph_client._make_get_request(url)
                download_url = full_item.get("@microsoft.graph.downloadUrl")
            except Exception as e:
                # Best effort: a failed lookup only leaves download_url empty,
                # it must not abort the whole sync.
                logger.error(f"Failed to fetch download_url for {item.get('name')}: {e}")

        # `or {}` also covers facets that are present but null.
        file_facet = item.get("file") or {}
        parent_ref = item.get("parentReference") or {}

        return {
            "site_id": site_id,
            "drive_id": drive_id,
            "item_id": item_id,
            "name": item.get("name"),
            "web_url": item.get("webUrl"),
            "download_url": download_url,
            "mime_type": file_facet.get("mimeType"),
            "parent_path": parent_ref.get("path"),
            "is_folder": "folder" in item,
            "size": item.get("size"),
            "last_modified": item.get("lastModifiedDateTime"),
            "created": item.get("createdDateTime"),
            "eTag": item.get("eTag"),
            "cTag": item.get("cTag"),
            "deleted": "deleted" in item,
        }

    def _upsert_to_local_db(self, new_items: List[Dict[str, Any]]) -> None:
        """Simulate an upsert into a database by rewriting the JSON output file.

        Existing records are keyed by ``item_id``.  A new record replaces the
        stored one, except that a deleted record whose id is already stored
        only flags the stored record with ``deleted = True`` so its last known
        metadata is kept.

        Args:
            new_items: Records as produced by ``_extract_metadata``.
        """
        try:
            with open(self.output_file, "r", encoding="utf-8") as f:
                existing = json.load(f)
        except FileNotFoundError:
            existing = []
        except json.JSONDecodeError as e:
            # The store is rebuilt from the incoming items; say so instead of
            # silently discarding whatever the file contained.
            logger.warning(
                f"Output file {self.output_file} is not valid JSON ({e}); starting from an empty store."
            )
            existing = []

        db: Dict[Any, Dict[str, Any]] = {record["item_id"]: record for record in existing}

        for item in new_items:
            item_id = item["item_id"]
            if item.get("deleted") and item_id in db:
                # Keep the stored metadata, just mark it as deleted.
                db[item_id]["deleted"] = True
            else:
                # New/changed item, or a deletion of something never stored.
                db[item_id] = item

        final_list = list(db.values())
        self._write_json_atomic(self.output_file, final_list, indent=2, ensure_ascii=False)

        logger.info(f"Local database updated. Total items currently stored: {len(final_list)}")

    def run_sync(self) -> None:
        """Run one sync pass: resolve site and drive, fetch changes, store them.

        A stored delta link makes this an incremental sync, otherwise all
        items are fetched.  The new delta link is persisted only after the
        collected changes have been written to the output file, so a failure
        while processing does not skip those changes on the next run.
        """
        logger.info("=== STARTING SHAREPOINT SYNC ===")

        # 1. & 2. Resolve Site and Drive
        logger.info(f"Resolving site: {self.hostname}:{self.site_path}")
        site_info = self.graph_client.get_site_by_path(self.hostname, self.site_path)
        site_id = site_info["id"]

        logger.info(f"Resolving drive for site: {site_id}")
        drive_info = self.graph_client.get_drive(site_id)
        drive_id = drive_info["id"]

        # 3. Delta Query setup
        delta_link = self._load_delta_link()
        if delta_link:
            logger.info("Found existing delta_link. Performing INCREMENTAL sync.")
        else:
            logger.info("No delta_link found. Performing FULL sync.")

        items_collected: List[Dict[Any, Any]] = []
        current_url = delta_link
        new_delta_link = None

        # Loop over pagination: follow nextLink until a deltaLink ends the set.
        while True:
            response = self.graph_client.delta_query(drive_id, current_url)
            items_collected.extend(response.get("value", []))

            if "@odata.nextLink" in response:
                current_url = response["@odata.nextLink"]
                logger.info("Fetching next page of delta results...")
            elif "@odata.deltaLink" in response:
                new_delta_link = response["@odata.deltaLink"]
                logger.info("Reached end of delta changes.")
                break
            else:
                logger.warning("No nextLink or deltaLink found in response! Breaking loop.")
                break

        logger.info(f"Delta query returned {len(items_collected)} change(s).")

        # 4. Extract metadata and save
        if items_collected:
            processed_items = [self._extract_metadata(item, site_id, drive_id) for item in items_collected]
            self._upsert_to_local_db(processed_items)
        else:
            logger.info("No items to process.")

        # 5. Only now advance the sync position.
        if new_delta_link:
            self._save_delta_link(new_delta_link)
            logger.info("Saved new delta_link for the next incremental sync.")


if __name__ == "__main__":
    # Script entry point: run one sync pass against the hard-coded host and site path.
    SharePointSync(
        hostname="285pdg.sharepoint.com",
        site_path="/sites/poc_system",
    ).run_sync()