Phase 7: Hoàn thiện Modular RAG Backend với FastAPI và Đa LLM Provider

This commit is contained in:
2026-05-08 07:30:30 +00:00
commit 26d1298cf6
51 changed files with 5360 additions and 0 deletions

View File

@@ -0,0 +1,36 @@
from abc import ABC, abstractmethod
from typing import Dict, List, Tuple
class BaseStorageProvider(ABC):
    """
    Common interface for every document storage backend
    (SharePoint, Google Drive, Local, NAS, etc.).

    A concrete provider must override both abstract methods below before it
    can be instantiated and used by the ingestion pipeline.
    """

    @abstractmethod
    def fetch_changes(self, sync_state: Dict) -> Tuple[List[Dict], Dict]:
        """
        Return the incremental changes (new, updated, or deleted files)
        since the last synchronization.

        Args:
            sync_state (Dict): The last known synchronization state/token.

        Returns:
            Tuple[List[Dict], Dict]: A pair made of
                - the list of standardized item dictionaries, and
                - the new sync state to persist for the next run.
        """

    @abstractmethod
    def download_file(self, target_item: Dict) -> bytes:
        """
        Return the raw bytes of the file described by ``target_item``.

        Args:
            target_item (Dict): A standardized item dictionary, as produced
                by ``fetch_changes``.

        Returns:
            bytes: The raw file content.
        """

View File

@@ -0,0 +1,99 @@
import logging
from typing import Dict, List, Tuple
from .base_provider import BaseStorageProvider
from ingestion.graph_client import GraphClient
from core.config import settings
logger = logging.getLogger(__name__)
class SharePointProvider(BaseStorageProvider):
    """
    Storage Provider implementation for Microsoft SharePoint using Graph API.

    The constructor resolves the site and its drive through ``GraphClient``
    and keeps the drive ID in ``self.drive_id``; both ``fetch_changes`` and
    ``download_file`` build their request URLs from that ID.
    """

    def __init__(self, hostname: str = "285pdg.sharepoint.com", site_path: str = "/sites/poc_system"):
        """
        Resolve the SharePoint site and drive that this provider reads from.

        Args:
            hostname (str): SharePoint host name passed to
                ``GraphClient.get_site_by_path``.
            site_path (str): Site path passed to
                ``GraphClient.get_site_by_path``.

        Raises:
            KeyError: If the site or drive lookup result has no ``"id"`` key.
        """
        self.graph = GraphClient()

        logger.info(f"Resolving site: {hostname}:{site_path}")
        site_info = self.graph.get_site_by_path(hostname, site_path)
        site_id = site_info["id"]

        logger.info(f"Resolving drive for site: {site_id}")
        drive_info = self.graph.get_drive(site_id)
        self.drive_id = drive_info["id"]

        logger.info(f"Initialized SharePoint Provider for Drive ID: {self.drive_id}")

    @staticmethod
    def _is_drive_root(item: Dict) -> bool:
        """Tell whether a raw delta item is the top-most item of the drive."""
        # Microsoft Graph marks the drive root with a top-level ``root`` facet.
        # The nested ``folder.root`` lookup is kept only for backward
        # compatibility with the previous check.
        return "root" in item or "root" in (item.get("folder") or {})

    @staticmethod
    def _standardize_item(item: Dict) -> Dict:
        """Convert a raw Graph delta item into the pipeline's item dictionary."""
        return {
            "id": item.get("id"),
            "name": item.get("name"),
            "is_deleted": "deleted" in item,
            "is_folder": "folder" in item,
            "provider": "sharepoint",
            "last_modified": item.get("lastModifiedDateTime"),
            "size": item.get("size"),
            "raw_data": item,
        }

    def fetch_changes(self, sync_state: Dict) -> Tuple[List[Dict], Dict]:
        """
        Fetch all delta changes from SharePoint, handling pagination internally.

        Args:
            sync_state (Dict): The last known synchronization state. The
                ``"sharepoint_delta_link"`` entry, when present and non-empty,
                is used as the starting URL; otherwise the drive's root delta
                endpoint is queried. ``None`` is treated as an empty state.

        Returns:
            Tuple[List[Dict], Dict]:
                - The standardized items (the drive root itself is skipped).
                - A copy of ``sync_state`` with ``"sharepoint_delta_link"``
                  set to the link to use on the next run.
        """
        previous_state = sync_state or {}

        # Resume from the stored delta link, or start a fresh delta query.
        current_url = (
            previous_state.get("sharepoint_delta_link")
            or f"{self.graph.base_url}/drives/{self.drive_id}/root/delta"
        )

        items_collected = []

        # Loop over pagination
        while True:
            response = self.graph._make_get_request(current_url)
            items_collected.extend(response.get("value", []))

            # Links are tested by truthiness so that a present-but-empty link
            # cannot send the loop back to an empty URL.
            next_link = response.get("@odata.nextLink")
            final_link = response.get("@odata.deltaLink")

            if next_link:
                current_url = next_link
                logger.info("Fetching next page of SharePoint delta results...")
            elif final_link:
                new_delta_link = final_link
                logger.info("Reached end of SharePoint delta changes.")
                break
            else:
                logger.warning("No nextLink or deltaLink found in response! Breaking loop.")
                # No terminating link: remember the last requested URL so the
                # next run starts again from this page.
                new_delta_link = current_url
                break

        # Standardize output for the ingestion pipeline, skipping the drive root.
        standardized_items = [
            self._standardize_item(item)
            for item in items_collected
            if not self._is_drive_root(item)
        ]

        new_state = dict(previous_state)
        new_state["sharepoint_delta_link"] = new_delta_link

        return standardized_items, new_state

    def download_file(self, target_item: Dict) -> bytes:
        """
        Download file content from SharePoint.

        Args:
            target_item (Dict): The standardized item dictionary returned by
                ``fetch_changes``; its ``"id"`` entry selects the file.

        Returns:
            bytes: The raw file content returned by the Graph client.

        Raises:
            ValueError: If ``target_item`` has no ``"id"``.
            Exception: Any error raised by the Graph client is logged and
                re-raised unchanged.
        """
        try:
            item_id = target_item.get("id")
            if not item_id:
                raise ValueError("target_item has no 'id'; cannot build the download URL")

            # Call the /content endpoint directly through the Graph API to
            # avoid 401 errors. The URL is built from the client's base URL,
            # like the delta query in fetch_changes.
            # NOTE(review): assumes base_url already includes the API version
            # (e.g. ".../v1.0") — confirm against GraphClient.
            url = f"{self.graph.base_url}/drives/{self.drive_id}/items/{item_id}/content"
            return self.graph._download_file(url)
        except Exception as e:
            logger.error(f"SharePoint download_file failed for {target_item.get('name')}: {e}")
            # Bare raise keeps the original traceback intact.
            raise