Phase 7: Hoàn thiện Modular RAG Backend với FastAPI và Đa LLM Provider

2026-05-08 07:30:30 +00:00
commit 26d1298cf6
51 changed files with 5360 additions and 0 deletions
--- a/ingestion/providers/base_provider.py
+++ b/ingestion/providers/base_provider.py
@@ -0,0 +1,36 @@
+from abc import ABC, abstractmethod
+from typing import Dict, List, Tuple
+
+class BaseStorageProvider(ABC):
+    """
+    Common contract for every document storage backend (SharePoint, Google Drive, Local, NAS, ...).
+    A storage source joins the ingestion pipeline by subclassing this and implementing both methods.
+    """
+
+    @abstractmethod
+    def fetch_changes(self, sync_state: Dict) -> Tuple[List[Dict], Dict]:
+        """
+        Return what changed since the last run: new, updated, or deleted files.
+
+        Args:
+            sync_state (Dict): Synchronization state/token saved by the previous run.
+
+        Returns:
+            Tuple[List[Dict], Dict]:
+                - The changed items as standardized dictionaries.
+                - The sync state to persist for the next run.
+        """
+        ...
+
+    @abstractmethod
+    def download_file(self, target_item: Dict) -> bytes:
+        """
+        Return the raw bytes of one item.
+
+        Args:
+            target_item (Dict): A standardized item dictionary produced by fetch_changes.
+
+        Returns:
+            bytes: The file content, unmodified.
+        """
+        ...
--- a/ingestion/providers/sharepoint_provider.py
+++ b/ingestion/providers/sharepoint_provider.py
@@ -0,0 +1,99 @@
+import logging
+from typing import Dict, List, Tuple
+from .base_provider import BaseStorageProvider
+from ingestion.graph_client import GraphClient
+
+from core.config import settings
+
+logger = logging.getLogger(__name__)
+
+class SharePointProvider(BaseStorageProvider):
+    """
+    Storage provider for Microsoft SharePoint document libraries, backed by the Graph API.
+    The site and its drive are resolved once, in the constructor.
+    """
+
+    def __init__(self, hostname: str = "285pdg.sharepoint.com", site_path: str = "/sites/poc_system"):
+        """Resolve the site at hostname/site_path and its drive through the Graph client."""
+        self.graph = GraphClient()
+
+        logger.info(f"Resolving site: {hostname}:{site_path}")
+        site_id = self.graph.get_site_by_path(hostname, site_path)["id"]
+
+        logger.info(f"Resolving drive for site: {site_id}")
+        self.drive_id = self.graph.get_drive(site_id)["id"]
+
+        logger.info(f"Initialized SharePoint Provider for Drive ID: {self.drive_id}")
+
+    def fetch_changes(self, sync_state: Dict) -> Tuple[List[Dict], Dict]:
+        """
+        Fetch all delta changes from SharePoint, following pagination internally.
+
+        Args:
+            sync_state (Dict): Previous state; "sharepoint_delta_link" is read from it.
+                None or a state without that key starts from the drive's root delta URL.
+
+        Returns:
+            Tuple[List[Dict], Dict]: Standardized items (the drive root is skipped) and
+                a copy of the state with "sharepoint_delta_link" updated.
+        """
+        sync_state = sync_state or {}  # tolerate a missing state on the very first run
+        current_url = sync_state.get("sharepoint_delta_link")
+        if not current_url:
+            current_url = f"{self.graph.base_url}/drives/{self.drive_id}/root/delta"
+
+        items_collected = []
+        while True:
+            response = self.graph._make_get_request(current_url)
+            items_collected.extend(response.get("value", []))
+
+            if "@odata.nextLink" in response:
+                current_url = response["@odata.nextLink"]
+                logger.info("Fetching next page of SharePoint delta results...")
+            elif "@odata.deltaLink" in response:
+                new_delta_link = response["@odata.deltaLink"]
+                logger.info("Reached end of SharePoint delta changes.")
+                break
+            else:
+                logger.warning("No nextLink or deltaLink found in response! Breaking loop.")
+                new_delta_link = current_url  # next run resumes from the last requested page
+                break
+
+        standardized_items = []
+        for item in items_collected:
+            # Graph marks the drive root with a top-level "root" facet, not one nested in
+            # "folder"; the nested check is kept so anything skipped before stays skipped.
+            if "root" in item or "root" in (item.get("folder") or {}):
+                continue
+            standardized_items.append({
+                "id": item.get("id"),
+                "name": item.get("name"),
+                "is_deleted": "deleted" in item,
+                "is_folder": "folder" in item,
+                "provider": "sharepoint",
+                "last_modified": item.get("lastModifiedDateTime"),
+                "size": item.get("size"),
+                "raw_data": item,
+            })
+
+        new_state = sync_state.copy()
+        new_state["sharepoint_delta_link"] = new_delta_link
+        return standardized_items, new_state
+
+    def download_file(self, target_item: Dict) -> bytes:
+        """
+        Download and return an item's raw bytes through the drive's /content endpoint.
+
+        Raises:
+            ValueError: If target_item has no "id". Every failure is logged and re-raised.
+        """
+        try:
+            item_id = target_item.get("id")
+            if not item_id:
+                raise ValueError("target_item has no 'id'; cannot build the content URL")
+            # Call the /content endpoint directly through the Graph API to avoid 401 errors.
+            url = f"{self.graph.base_url}/drives/{self.drive_id}/items/{item_id}/content"
+            return self.graph._download_file(url)
+        except Exception as e:
+            logger.error(f"SharePoint download_file failed for {target_item.get('name')}: {e}")
+            raise