Xử lý SSO

This commit is contained in:
2026-05-09 10:31:28 +00:00
parent 9d04e7484c
commit f937d1a98e
21 changed files with 2515 additions and 271 deletions

View File

@@ -132,3 +132,13 @@ class GraphClient:
else:
url = f"{self.base_url}/drives/{drive_id}/root/delta"
return self._make_get_request(url)
def get_item_details(self, drive_id: str, item_id: str):
    """Fetch one drive item via ``GET /drives/{driveId}/items/{itemId}``.

    Args:
        drive_id: Identifier of the drive that holds the item.
        item_id: Identifier of the item to look up.

    Returns:
        Whatever ``_make_get_request`` returns for the item URL; the
        original note says it carries the item's webUrl and downloadUrl.
    """
    return self._make_get_request(
        f"{self.base_url}/drives/{drive_id}/items/{item_id}"
    )
def get_item_permissions(self, drive_id: str, item_id: str):
    """List an item's permissions via ``GET /drives/{driveId}/items/{itemId}/permissions``.

    Args:
        drive_id: Identifier of the drive that holds the item.
        item_id: Identifier of the item whose access list is requested.

    Returns:
        Whatever ``_make_get_request`` returns for the permissions URL.
    """
    return self._make_get_request(
        f"{self.base_url}/drives/{drive_id}/items/{item_id}/permissions"
    )

View File

@@ -34,3 +34,29 @@ class BaseStorageProvider(ABC):
bytes: The raw file content.
"""
pass
@abstractmethod
def get_item_details(self, item_id: str) -> Dict:
    """
    Return the complete detail record for a single item, links included.

    Args:
        item_id (str): Identifier of the item, as reported by fetch_changes.

    Returns:
        Dict: The item's details, including its webUrl and downloadUrl.
    """
@abstractmethod
def get_item_permissions(self, item_id: str) -> List[str]:
    """
    Return who may access an item, as user/group emails or IDs.

    Args:
        item_id (str): Identifier of the item, as reported by fetch_changes.

    Returns:
        List[str]: User/group identifiers; the single entry ["*"] means
        everyone can access the item.
    """

View File

@@ -81,6 +81,62 @@ class SharePointProvider(BaseStorageProvider):
return standardized_items, new_state
def get_item_details(self, item_id: str) -> Dict:
    """
    Look up one item on this provider's drive and return a trimmed record.

    Args:
        item_id (str): Identifier of the item to fetch.

    Returns:
        Dict: Keys ``id``, ``name``, ``web_url``, ``download_url``, ``size``
        and ``last_modified``; a key is None when the Graph payload lacks
        the corresponding field.

    Raises:
        Exception: Any failure is logged and then re-raised to the caller.
    """
    # (output key, Graph payload key) pairs, in output order.
    field_map = (
        ("id", "id"),
        ("name", "name"),
        ("web_url", "webUrl"),
        ("download_url", "@microsoft.graph.downloadUrl"),
        ("size", "size"),
        ("last_modified", "lastModifiedDateTime"),
    )
    try:
        payload = self.graph.get_item_details(self.drive_id, item_id)
        return {out_key: payload.get(src_key) for out_key, src_key in field_map}
    except Exception as e:
        logger.error(f"Failed to get item details for {item_id}: {e}")
        raise e
def get_item_permissions(self, item_id: str) -> List[str]:
    """
    Collect the identities that hold a permission on an item.

    For every permission entry the user in ``grantedTo`` is used; when that
    field is missing or empty, the users in ``grantedToIdentitiesV2`` are
    used instead. A user is represented by its lower-cased email, or by its
    id when no email is present. A site group found under ``grantedToV2`` is
    added as ``group:<displayName>``.

    Fields that are present but null are treated like missing fields, so a
    single null in the payload no longer aborts the whole lookup.

    Args:
        item_id (str): Identifier of the item whose permissions are read.

    Returns:
        List[str]: Sorted, de-duplicated identifiers. ``["*"]`` is returned
        when no identifier was found or when the lookup failed.
    """

    def _user_key(identity_set):
        """Return the lower-cased email, else the id, of the set's user (or None)."""
        user = (identity_set or {}).get("user") or {}
        if user.get("email"):
            return user["email"].lower()
        return user.get("id") or None

    try:
        response = self.graph.get_item_permissions(self.drive_id, item_id)
        permissions = set()
        for perm in response.get("value") or []:
            # Prefer grantedTo; fall back to grantedToIdentitiesV2 only when
            # grantedTo is absent, null or empty.
            granted = perm.get("grantedTo") or {}
            if granted:
                identity_sets = [granted]
            else:
                identity_sets = perm.get("grantedToIdentitiesV2") or []
            for identity_set in identity_sets:
                key = _user_key(identity_set)
                if key:
                    permissions.add(key)

            # A site group, when present, is reported under grantedToV2.
            # NOTE(review): a user listed only under grantedToV2 is not
            # collected here — confirm whether that is intended.
            site_group = (perm.get("grantedToV2") or {}).get("siteGroup") or {}
            if site_group.get("displayName"):
                permissions.add(f"group:{site_group['displayName']}")

        # Sorted so the result is stable across runs (set order is not).
        return sorted(permissions, key=str) if permissions else ["*"]
    except Exception as e:
        # NOTE(review): this is fail-open — per the provider contract ["*"]
        # means everyone can access. Confirm that is the desired default
        # when the permission lookup itself fails.
        logger.warning(f"Failed to get permissions for {item_id}: {e}. Defaulting to ['*']")
        return ["*"]
def download_file(self, target_item: Dict) -> bytes:
"""
Download file content from SharePoint.

View File

@@ -3,145 +3,88 @@ import json
import logging
from typing import List, Dict, Any
# Ensure we can import from the root module if run directly
import sys
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from ingestion.graph_client import GraphClient
from ingestion.providers.base_provider import BaseStorageProvider
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("IngestionSync")
# Earlier SharePoint-specific sync driver: it builds its own GraphClient and
# uses fixed names for the state file and the output file.
# NOTE(review): this hunk appears to show removed and added lines side by side
# without +/- markers, so this class looks like the version being replaced by
# SyncEngine — confirm against the actual diff.
class SharePointSync:
def __init__(self, hostname: str, site_path: str):
self.graph_client = GraphClient()
self.hostname = hostname
self.site_path = site_path
# Local JSON files: one for the saved delta link, one for the synced items.
self.state_file = "delta_state.json"
self.output_file = "ingestion_output.json"
# NOTE(review): no body follows this signature in the view; the body seems to
# be the shared lines shown under _load_sync_state — confirm.
def _load_delta_link(self) -> str:
"""Load delta link from local state file."""
class SyncEngine:
    """
    Sync orchestrator that does not depend on any specific provider.

    Accepts any BaseStorageProvider (SharePoint, Google Drive, NAS, ...).
    """

    def __init__(
        self,
        provider: BaseStorageProvider,
        state_file: str = "delta_state.json",
        output_file: str = "ingestion_output.json",
    ):
        """Store the provider and the paths of the state and output files.

        Args:
            provider: Storage provider the engine fetches changes from.
            state_file: Path of the JSON file holding the sync state.
            output_file: Path of the JSON file holding the synced items.
        """
        self.provider = provider
        self.state_file = state_file
        self.output_file = output_file
def _load_sync_state(self) -> Dict:
"""Load the sync state from the local file."""
# Only read the state file when it already exists.
if os.path.exists(self.state_file):
with open(self.state_file, "r", encoding="utf-8") as f:
# NOTE(review): the three lines below read like the earlier delta-link loader
# (they return a single "delta_link" value or None, not a Dict). The
# Dict-returning lines (`return json.load(f)` / `return {}`) appear further
# down in this hunk — confirm which lines belong to this method.
data = json.load(f)
return data.get("delta_link")
return None
def _save_delta_link(self, delta_link: str):
    """Persist the delta link in the state file so the next run can sync incrementally.

    Args:
        delta_link: Delta link to store under the ``delta_link`` key.
    """
    state = {"delta_link": delta_link}
    with open(self.state_file, mode="w", encoding="utf-8") as handle:
        json.dump(state, handle, indent=2)
# Maps one Graph API item payload onto the flat record stored locally, tagging
# it with the given site_id and drive_id.
def _extract_metadata(self, item: Dict[Any, Any], site_id: str, drive_id: str) -> Dict[str, Any]:
"""Convert Graph API item payload to our target schema."""
download_url = item.get("@microsoft.graph.downloadUrl")
# Only look the URL up separately for entries that are neither folders nor deletions.
if not download_url and "folder" not in item and "deleted" not in item:
try:
# Delta query might not return downloadUrl, so fetch it directly
item_id = item.get("id")
url = f"https://graph.microsoft.com/v1.0/drives/{drive_id}/items/{item_id}"
full_item = self.graph_client._make_get_request(url)
download_url = full_item.get("@microsoft.graph.downloadUrl")
except Exception as e:
# Best effort: the failure is logged and download_url keeps its falsy value.
logger.error(f"Failed to fetch download_url for {item.get('name')}: {e}")
# NOTE(review): the next two lines look like they belong to the state loader
# (_load_sync_state) rather than to this method — confirm against the diff.
return json.load(f)
return {}
return {
"site_id": site_id,
"drive_id": drive_id,
"item_id": item.get("id"),
"name": item.get("name"),
"web_url": item.get("webUrl"),
"download_url": download_url,
# mimeType is only read when the payload has a "file" facet.
"mime_type": item.get("file", {}).get("mimeType") if "file" in item else None,
"parent_path": item.get("parentReference", {}).get("path"),
"is_folder": "folder" in item,
"size": item.get("size"),
"last_modified": item.get("lastModifiedDateTime"),
"created": item.get("createdDateTime"),
"eTag": item.get("eTag"),
"cTag": item.get("cTag"),
# Presence of the "deleted" key marks the entry as a deletion.
"deleted": "deleted" in item
}
def _save_sync_state(self, state: Dict):
    """Write the sync state to the local state file without risking the old one.

    The state is serialized before any file is opened, written to a
    temporary sibling file and then moved over the real state file. A state
    that cannot be serialized, or a write that fails part-way, therefore
    leaves the previously saved state untouched instead of truncating it.

    Args:
        state: JSON-serializable sync state to persist.

    Raises:
        TypeError: If ``state`` contains a value json cannot serialize.
        OSError: If the temporary file cannot be written or moved.
    """
    # Serialize up front: a failure here must not touch the existing file.
    payload = json.dumps(state, indent=2, ensure_ascii=False)
    tmp_path = f"{self.state_file}.tmp"
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            f.write(payload)
        # os.replace swaps the file in one step, overwriting any old state.
        os.replace(tmp_path, self.state_file)
    except OSError:
        # Do not leave a half-written temporary file behind.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise
# Merges new_items into the JSON file at self.output_file, keyed by item id.
# NOTE(review): several statements below appear twice, once keyed by "item_id"
# / "deleted" and once by "id" / "is_deleted"; this looks like the old and new
# versions of the method shown together — confirm against the actual diff.
def _upsert_to_local_db(self, new_items: List[Dict[str, Any]]):
"""Simulate upsert into a database by writing to a JSON file."""
# The string below says, in English: "Save the results to a local JSON file (simulating a DB)."
"""Lưu kết quả vào local JSON (mô phỏng DB)."""
db = {}
# Load what was stored by earlier runs, if the output file exists.
if os.path.exists(self.output_file):
with open(self.output_file, "r", encoding="utf-8") as f:
try:
existing = json.load(f)
for item in existing:
db[item["item_id"]] = item
db[item["id"]] = item
except json.JSONDecodeError:
# An unreadable output file is ignored; the merge starts from an empty db.
pass
for item in new_items:
if item.get("deleted"):
# If deleted, we mark it as deleted in our db (or we could remove it)
if item["item_id"] in db:
db[item["item_id"]]["deleted"] = True
item_id = item.get("id")
if item.get("is_deleted"):
if item_id in db:
db[item_id]["is_deleted"] = True
else:
# It's deleted but we didn't have it anyway
db[item["item_id"]] = item
db[item_id] = item
else:
db[item["item_id"]] = item
db[item_id] = item
# Write the merged records back as a JSON list.
final_list = list(db.values())
with open(self.output_file, "w", encoding="utf-8") as f:
json.dump(final_list, f, indent=2, ensure_ascii=False)
logger.info(f"Local database updated. Total items currently stored: {len(final_list)}")
logger.info(f"Local database updated. Total items: {len(final_list)}")
# Entry point of a sync run.
# NOTE(review): two flows are visible below. One resolves a site and drive
# through self.graph_client and pages through delta_query results; the other
# asks self.provider.fetch_changes for changes and saves the returned state.
# They look like the old and new versions of this method shown together —
# confirm against the actual diff.
def run_sync(self):
logger.info("=== STARTING SHAREPOINT SYNC ===")
# 1. & 2. Resolve Site and Drive
logger.info(f"Resolving site: {self.hostname}:{self.site_path}")
site_info = self.graph_client.get_site_by_path(self.hostname, self.site_path)
site_id = site_info["id"]
logger.info(f"Resolving drive for site: {site_id}")
drive_info = self.graph_client.get_drive(site_id)
drive_id = drive_info["id"]
# 3. Delta Query setup
delta_link = self._load_delta_link()
if delta_link:
logger.info("Found existing delta_link. Performing INCREMENTAL sync.")
# The string below says, in English: "Run the sync: fetch changes from the provider -> save locally."
"""Chạy đồng bộ: fetch changes từ provider -> lưu local."""
logger.info("=== STARTING SYNC ===")
sync_state = self._load_sync_state()
if sync_state:
logger.info("Found existing sync state. Performing INCREMENTAL sync.")
else:
logger.info("No delta_link found. Performing FULL sync.")
items_collected = []
current_url = delta_link
# Loop over pagination
while True:
response = self.graph_client.delta_query(drive_id, current_url)
values = response.get("value", [])
items_collected.extend(values)
# A nextLink means more pages; a deltaLink ends the run and is saved for next time.
if "@odata.nextLink" in response:
current_url = response["@odata.nextLink"]
logger.info("Fetching next page of delta results...")
elif "@odata.deltaLink" in response:
new_delta_link = response["@odata.deltaLink"]
self._save_delta_link(new_delta_link)
logger.info("Reached end of delta changes. Saved new delta_link.")
break
else:
logger.warning("No nextLink or deltaLink found in response! Breaking loop.")
break
logger.info(f"Delta query returned {len(items_collected)} change(s).")
# 4. Extract metadata and save
if items_collected:
processed_items = [self._extract_metadata(item, site_id, drive_id) for item in items_collected]
self._upsert_to_local_db(processed_items)
else:
logger.info("No items to process.")
logger.info("No sync state found. Performing FULL sync.")
# The provider returns the changed items together with the state to persist.
items, new_state = self.provider.fetch_changes(sync_state)
logger.info(f"Provider returned {len(items)} change(s).")
if items:
self._upsert_to_local_db(items)
self._save_sync_state(new_state)
logger.info("Sync state saved.")
# Script entry point: runs one sync when the module is executed directly.
# NOTE(review): both a SharePointSync invocation and a SyncEngine invocation
# are visible; they look like the old and new entry points shown together —
# confirm against the actual diff.
if __name__ == "__main__":
sync = SharePointSync("285pdg.sharepoint.com", "/sites/poc_system")
sync.run_sync()
# The provider is imported here, inside the guard, rather than at the top of the file.
from ingestion.providers.sharepoint_provider import SharePointProvider
provider = SharePointProvider()
engine = SyncEngine(provider)
engine.run_sync()