Phase 7: Hoàn thiện Modular RAG Backend với FastAPI và Đa LLM Provider

This commit is contained in:
2026-05-08 07:30:30 +00:00
commit 26d1298cf6
51 changed files with 5360 additions and 0 deletions

View File

@@ -0,0 +1,99 @@
import logging
from typing import Dict, List, Tuple
from .base_provider import BaseStorageProvider
from ingestion.graph_client import GraphClient
from core.config import settings
# Module-level logger named after this module; used by SharePointProvider below.
logger = logging.getLogger(__name__)
class SharePointProvider(BaseStorageProvider):
    """
    Storage Provider implementation for Microsoft SharePoint using Graph API.

    On construction the provider resolves the SharePoint site and then its
    drive through ``GraphClient`` and keeps the resulting drive ID. All later
    calls (``fetch_changes``, ``download_file``) operate on that single drive.
    """

    # NOTE(review): the default hostname/site_path are tenant-specific values
    # baked into the signature; consider sourcing them from configuration.
    def __init__(self, hostname: str = "285pdg.sharepoint.com", site_path: str = "/sites/poc_system"):
        """
        Resolve the target site and drive and store the drive ID.

        Args:
            hostname: SharePoint hostname passed to ``get_site_by_path``.
            site_path: Server-relative site path passed to ``get_site_by_path``.

        Raises:
            KeyError: If the site or drive response carries no ``"id"`` key.
        """
        self.graph = GraphClient()

        logger.info(f"Resolving site: {hostname}:{site_path}")
        site_info = self.graph.get_site_by_path(hostname, site_path)
        site_id = site_info["id"]

        logger.info(f"Resolving drive for site: {site_id}")
        drive_info = self.graph.get_drive(site_id)
        self.drive_id = drive_info["id"]

        logger.info(f"Initialized SharePoint Provider for Drive ID: {self.drive_id}")

    def fetch_changes(self, sync_state: Dict) -> Tuple[List[Dict], Dict]:
        """
        Fetch all delta changes from SharePoint, handling pagination internally.

        Args:
            sync_state: Persisted sync state. The ``"sharepoint_delta_link"``
                entry, when present, is the URL the crawl resumes from. A
                missing entry (or an empty/``None`` state) starts a full delta
                crawl of the drive root.

        Returns:
            A tuple ``(standardized_items, new_state)``. ``standardized_items``
            is a list of dicts with the keys ``id``, ``name``, ``is_deleted``,
            ``is_folder``, ``provider``, ``last_modified``, ``size`` and
            ``raw_data`` (the untouched Graph item). ``new_state`` is a shallow
            copy of ``sync_state`` with ``"sharepoint_delta_link"`` updated;
            the input dict is not mutated.
        """
        # Tolerate a missing state object so a first run does not crash.
        sync_state = sync_state or {}

        # Resume from the stored delta link, or start from the drive root.
        current_url = (
            sync_state.get("sharepoint_delta_link")
            or f"{self.graph.base_url}/drives/{self.drive_id}/root/delta"
        )

        items_collected: List[Dict] = []

        # Follow pagination until the service hands back a deltaLink.
        while True:
            response = self.graph._make_get_request(current_url)
            items_collected.extend(response.get("value", []))

            # Truthiness (not mere key presence) guards against an empty
            # nextLink restarting the crawl from scratch and looping forever.
            next_link = response.get("@odata.nextLink")
            if next_link:
                current_url = next_link
                logger.info("Fetching next page of SharePoint delta results...")
                continue

            if "@odata.deltaLink" in response:
                new_delta_link = response["@odata.deltaLink"]
                logger.info("Reached end of SharePoint delta changes.")
            else:
                # Unexpected response shape: keep the URL we just requested so
                # the next sync resumes from this point.
                logger.warning("No nextLink or deltaLink found in response! Breaking loop.")
                new_delta_link = current_url
            break

        # Standardize output for the ingestion pipeline.
        standardized_items = []
        for item in items_collected:
            # Skip the drive root. Graph marks it with a top-level "root"
            # facet; the nested "folder.root" form is still honoured for
            # backward compatibility with the previous check.
            folder_facet = item.get("folder") or {}
            if "root" in item or "root" in folder_facet:
                continue

            standardized_items.append(
                {
                    "id": item.get("id"),
                    "name": item.get("name"),
                    "is_deleted": "deleted" in item,
                    "is_folder": "folder" in item,
                    "provider": "sharepoint",
                    "last_modified": item.get("lastModifiedDateTime"),
                    "size": item.get("size"),
                    "raw_data": item,
                }
            )

        new_state = sync_state.copy()
        new_state["sharepoint_delta_link"] = new_delta_link
        return standardized_items, new_state

    def download_file(self, target_item: Dict) -> bytes:
        """
        Download file content from SharePoint.

        Args:
            target_item: A standardized item as produced by ``fetch_changes``;
                only its ``"id"`` (and ``"name"``, for logging) are read.

        Returns:
            Whatever ``GraphClient._download_file`` returns for the item's
            ``/content`` URL (annotated as ``bytes``).

        Raises:
            ValueError: If ``target_item`` has no usable ``"id"``.
            Exception: Any error from the Graph client is logged and re-raised.
        """
        try:
            item_id = target_item.get("id")
            if not item_id:
                # Fail fast instead of requesting ".../items/None/content".
                raise ValueError("target_item has no 'id'; cannot build download URL")

            # Call the /content endpoint directly through the Graph API to
            # avoid 401 errors.
            url = f"https://graph.microsoft.com/v1.0/drives/{self.drive_id}/items/{item_id}/content"
            return self.graph._download_file(url)
        except Exception as e:
            logger.error(f"SharePoint download_file failed for {target_item.get('name')}: {e}")
            # Bare raise keeps the original traceback intact.
            raise