Phase 7: Hoàn thiện Modular RAG Backend với FastAPI và Đa LLM Provider

This commit is contained in:
2026-05-08 07:30:30 +00:00
commit 26d1298cf6
51 changed files with 5360 additions and 0 deletions

134
ingestion/graph_client.py Executable file
View File

@@ -0,0 +1,134 @@
import httpx
import base64
import json
import logging
from azure.identity import ClientSecretCredential
from core.config import settings
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("GraphClient")
class GraphClient:
"""
Microsoft Graph API Client using app-only authentication.
"""
def __init__(self):
self.tenant_id = settings.tenant_id
self.client_id = settings.client_id
self.client_secret = settings.client_secret
self.credential = ClientSecretCredential(
tenant_id=self.tenant_id,
client_id=self.client_id,
client_secret=self.client_secret
)
self.scopes = ["https://graph.microsoft.com/.default"]
self.base_url = "https://graph.microsoft.com/v1.0"
self._token = None
def decode_jwt_payload(self, token: str) -> dict:
parts = token.split('.')
if len(parts) != 3:
raise ValueError("Invalid JWT token format")
payload_b64 = parts[1]
payload_b64 += "=" * ((4 - len(payload_b64) % 4) % 4)
return json.loads(base64.urlsafe_b64decode(payload_b64))
def validate_required_roles(self, roles: list):
if "Sites.Read.All" not in roles:
raise PermissionError("FATAL: Token is missing 'Sites.Read.All' role. Stop immediately.")
if "Files.Read.All" not in roles:
logger.warning("WARNING: Token is missing 'Files.Read.All' role. Drive/delta steps will fail.")
raise PermissionError("FATAL: Token is missing 'Files.Read.All' role. Stop immediately.")
def get_access_token(self) -> str:
if not self._token:
token_response = self.credential.get_token(*self.scopes)
self._token = token_response.token
payload = self.decode_jwt_payload(self._token)
aud = payload.get("aud")
appid = payload.get("appid")
idtyp = payload.get("idtyp")
roles = payload.get("roles", [])
logger.info(f"Token decoded -> aud: {aud}, appid: {appid}, idtyp: {idtyp}, roles: {roles}")
self.validate_required_roles(roles)
return self._token
def _get_headers(self) -> dict:
token = self.get_access_token()
return {
"Authorization": f"Bearer {token}",
"Accept": "application/json"
}
def _make_get_request(self, url: str) -> dict:
logger.info(f"GET Request to: {url}")
headers = self._get_headers()
try:
response = httpx.get(url, headers=headers)
logger.info(f"Response Status: {response.status_code}")
response.raise_for_status()
return response.json()
except httpx.HTTPStatusError as e:
logger.error(f"HTTP Error: {e.response.status_code}")
logger.error(f"Response Body: {e.response.text}")
raise e
except Exception as e:
logger.error(f"Error making request: {str(e)}")
raise e
def _download_file(self, url: str) -> bytes:
logger.info(f"GET Request (Download) to: {url}")
headers = self._get_headers()
try:
# Follow redirects is True by default in httpx.Client, but httpx.get() might need follow_redirects=True
with httpx.Client(follow_redirects=True, timeout=60.0) as client:
response = client.get(url, headers=headers)
logger.info(f"Response Status: {response.status_code}")
response.raise_for_status()
return response.content
except httpx.HTTPStatusError as e:
logger.error(f"HTTP Error: {e.response.status_code}")
logger.error(f"Response Body: {e.response.text}")
raise e
except Exception as e:
logger.error(f"Error making download request: {str(e)}")
raise e
def get_site_by_hostname(self, hostname: str):
"""GET /sites/{hostname}"""
url = f"{self.base_url}/sites/{hostname}"
return self._make_get_request(url)
def get_site_by_path(self, hostname: str, server_relative_path: str):
"""GET /sites/{hostname}:/{server-relative-path}"""
url = f"{self.base_url}/sites/{hostname}:{server_relative_path}"
return self._make_get_request(url)
def get_drive(self, site_id: str):
"""GET /sites/{siteId}/drive"""
url = f"{self.base_url}/sites/{site_id}/drive"
return self._make_get_request(url)
def get_drive_root_children(self, site_id: str):
"""GET /sites/{siteId}/drive/root/children"""
url = f"{self.base_url}/sites/{site_id}/drive/root/children"
return self._make_get_request(url)
def get_drive_root_delta(self, site_id: str):
"""GET /sites/{siteId}/drive/root/delta"""
url = f"{self.base_url}/sites/{site_id}/drive/root/delta"
return self._make_get_request(url)
def delta_query(self, drive_id: str, delta_link: str = None):
"""Perform a delta query on a drive."""
if delta_link:
url = delta_link
else:
url = f"{self.base_url}/drives/{drive_id}/root/delta"
return self._make_get_request(url)

View File

@@ -0,0 +1,36 @@
from abc import ABC, abstractmethod
from typing import Dict, List, Tuple
class BaseStorageProvider(ABC):
"""
Abstract Base Class for all Document Storage Providers (SharePoint, Google Drive, Local, NAS, etc.)
Any new storage source must implement these methods to be seamlessly integrated into the ingestion pipeline.
"""
@abstractmethod
def fetch_changes(self, sync_state: Dict) -> Tuple[List[Dict], Dict]:
"""
Fetch incremental changes (new, updated, or deleted files).
Args:
sync_state (Dict): The last known synchronization state/token.
Returns:
Tuple[List[Dict], Dict]:
- A list of standardized item dictionaries.
- The new sync state to be saved for the next run.
"""
pass
@abstractmethod
def download_file(self, target_item: Dict) -> bytes:
"""
Download the raw file bytes for a given item.
Args:
target_item (Dict): The standardized item dictionary returned by fetch_changes.
Returns:
bytes: The raw file content.
"""
pass

View File

@@ -0,0 +1,99 @@
import logging
from typing import Dict, List, Tuple
from .base_provider import BaseStorageProvider
from ingestion.graph_client import GraphClient
from core.config import settings
logger = logging.getLogger(__name__)
class SharePointProvider(BaseStorageProvider):
"""
Storage Provider implementation for Microsoft SharePoint using Graph API.
"""
def __init__(self, hostname: str = "285pdg.sharepoint.com", site_path: str = "/sites/poc_system"):
self.graph = GraphClient()
logger.info(f"Resolving site: {hostname}:{site_path}")
site_info = self.graph.get_site_by_path(hostname, site_path)
site_id = site_info["id"]
logger.info(f"Resolving drive for site: {site_id}")
drive_info = self.graph.get_drive(site_id)
self.drive_id = drive_info["id"]
logger.info(f"Initialized SharePoint Provider for Drive ID: {self.drive_id}")
def fetch_changes(self, sync_state: Dict) -> Tuple[List[Dict], Dict]:
"""
Fetch all delta changes from SharePoint, handling pagination internally.
"""
delta_link = sync_state.get("sharepoint_delta_link")
items_collected = []
current_url = delta_link
# Loop over pagination
while True:
# We need to construct the URL manually or let graph_client do it
if not current_url:
current_url = f"{self.graph.base_url}/drives/{self.drive_id}/root/delta"
response = self.graph._make_get_request(current_url)
values = response.get("value", [])
items_collected.extend(values)
if "@odata.nextLink" in response:
current_url = response["@odata.nextLink"]
logger.info("Fetching next page of SharePoint delta results...")
elif "@odata.deltaLink" in response:
new_delta_link = response["@odata.deltaLink"]
logger.info("Reached end of SharePoint delta changes.")
break
else:
logger.warning("No nextLink or deltaLink found in response! Breaking loop.")
new_delta_link = current_url
break
# Standardize output for the ingestion pipeline
standardized_items = []
for item in items_collected:
# Bỏ qua root drive
if "folder" in item and "root" in item.get("folder", {}):
continue
is_deleted = "deleted" in item
std_item = {
"id": item.get("id"),
"name": item.get("name"),
"is_deleted": is_deleted,
"is_folder": "folder" in item,
"provider": "sharepoint",
"last_modified": item.get("lastModifiedDateTime"),
"size": item.get("size"),
"raw_data": item
}
standardized_items.append(std_item)
new_state = sync_state.copy()
new_state["sharepoint_delta_link"] = new_delta_link
return standardized_items, new_state
def download_file(self, target_item: Dict) -> bytes:
"""
Download file content from SharePoint.
"""
try:
raw_data = target_item.get("raw_data", {})
item_id = target_item.get("id")
# Gọi thẳng endpoint /content qua Graph API để tránh lỗi 401
url = f"https://graph.microsoft.com/v1.0/drives/{self.drive_id}/items/{item_id}/content"
file_bytes = self.graph._download_file(url)
return file_bytes
except Exception as e:
logger.error(f"SharePoint download_file failed for {target_item.get('name')}: {e}")
raise e

147
ingestion/sync.py Normal file
View File

@@ -0,0 +1,147 @@
import os
import json
import logging
from typing import List, Dict, Any
# Ensure we can import from the root module if run directly
import sys
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from ingestion.graph_client import GraphClient
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("IngestionSync")
class SharePointSync:
def __init__(self, hostname: str, site_path: str):
self.graph_client = GraphClient()
self.hostname = hostname
self.site_path = site_path
self.state_file = "delta_state.json"
self.output_file = "ingestion_output.json"
def _load_delta_link(self) -> str:
"""Load delta link from local state file."""
if os.path.exists(self.state_file):
with open(self.state_file, "r", encoding="utf-8") as f:
data = json.load(f)
return data.get("delta_link")
return None
def _save_delta_link(self, delta_link: str):
"""Save delta link to local state file for next incremental sync."""
with open(self.state_file, "w", encoding="utf-8") as f:
json.dump({"delta_link": delta_link}, f, indent=2)
def _extract_metadata(self, item: Dict[Any, Any], site_id: str, drive_id: str) -> Dict[str, Any]:
"""Convert Graph API item payload to our target schema."""
download_url = item.get("@microsoft.graph.downloadUrl")
if not download_url and "folder" not in item and "deleted" not in item:
try:
# Delta query might not return downloadUrl, so fetch it directly
item_id = item.get("id")
url = f"https://graph.microsoft.com/v1.0/drives/{drive_id}/items/{item_id}"
full_item = self.graph_client._make_get_request(url)
download_url = full_item.get("@microsoft.graph.downloadUrl")
except Exception as e:
logger.error(f"Failed to fetch download_url for {item.get('name')}: {e}")
return {
"site_id": site_id,
"drive_id": drive_id,
"item_id": item.get("id"),
"name": item.get("name"),
"web_url": item.get("webUrl"),
"download_url": download_url,
"mime_type": item.get("file", {}).get("mimeType") if "file" in item else None,
"parent_path": item.get("parentReference", {}).get("path"),
"is_folder": "folder" in item,
"size": item.get("size"),
"last_modified": item.get("lastModifiedDateTime"),
"created": item.get("createdDateTime"),
"eTag": item.get("eTag"),
"cTag": item.get("cTag"),
"deleted": "deleted" in item
}
def _upsert_to_local_db(self, new_items: List[Dict[str, Any]]):
"""Simulate upsert into a database by writing to a JSON file."""
db = {}
if os.path.exists(self.output_file):
with open(self.output_file, "r", encoding="utf-8") as f:
try:
existing = json.load(f)
for item in existing:
db[item["item_id"]] = item
except json.JSONDecodeError:
pass
for item in new_items:
if item.get("deleted"):
# If deleted, we mark it as deleted in our db (or we could remove it)
if item["item_id"] in db:
db[item["item_id"]]["deleted"] = True
else:
# It's deleted but we didn't have it anyway
db[item["item_id"]] = item
else:
db[item["item_id"]] = item
final_list = list(db.values())
with open(self.output_file, "w", encoding="utf-8") as f:
json.dump(final_list, f, indent=2, ensure_ascii=False)
logger.info(f"Local database updated. Total items currently stored: {len(final_list)}")
def run_sync(self):
logger.info("=== STARTING SHAREPOINT SYNC ===")
# 1. & 2. Resolve Site and Drive
logger.info(f"Resolving site: {self.hostname}:{self.site_path}")
site_info = self.graph_client.get_site_by_path(self.hostname, self.site_path)
site_id = site_info["id"]
logger.info(f"Resolving drive for site: {site_id}")
drive_info = self.graph_client.get_drive(site_id)
drive_id = drive_info["id"]
# 3. Delta Query setup
delta_link = self._load_delta_link()
if delta_link:
logger.info("Found existing delta_link. Performing INCREMENTAL sync.")
else:
logger.info("No delta_link found. Performing FULL sync.")
items_collected = []
current_url = delta_link
# Loop over pagination
while True:
response = self.graph_client.delta_query(drive_id, current_url)
values = response.get("value", [])
items_collected.extend(values)
if "@odata.nextLink" in response:
current_url = response["@odata.nextLink"]
logger.info("Fetching next page of delta results...")
elif "@odata.deltaLink" in response:
new_delta_link = response["@odata.deltaLink"]
self._save_delta_link(new_delta_link)
logger.info("Reached end of delta changes. Saved new delta_link.")
break
else:
logger.warning("No nextLink or deltaLink found in response! Breaking loop.")
break
logger.info(f"Delta query returned {len(items_collected)} change(s).")
# 4. Extract metadata and save
if items_collected:
processed_items = [self._extract_metadata(item, site_id, drive_id) for item in items_collected]
self._upsert_to_local_db(processed_items)
else:
logger.info("No items to process.")
if __name__ == "__main__":
sync = SharePointSync("285pdg.sharepoint.com", "/sites/poc_system")
sync.run_sync()