# poc_system/chat/rag_engine.py

import logging
from typing import Dict, List, Optional

from search.retriever import SearchRetriever

from .llm_factory import LLMFactory

# Module-level logger shared by everything in this file. It is registered under
# the fixed name "RAGEngine" (not the module's __name__), so handlers/filters
# must target that name.
logger = logging.getLogger("RAGEngine")


class RAGEngine:
    """Retrieval-augmented chat engine: search -> augment -> generate.

    The engine asks a retriever for chunks relevant to the user's question,
    formats them into a single context string, and hands the question, the
    context and the chat history to an LLM provider.
    """

    # Context text given to the LLM when retrieval yields nothing. This is a
    # runtime string consumed by the model; do not translate or reword it.
    _NO_CONTEXT_MESSAGE = "Không tìm thấy thông tin liên quan trong cơ sở dữ liệu nội bộ."
    # Separator placed between formatted chunks in the context string.
    _CHUNK_SEPARATOR = "\n---\n"
    # Number of chunks requested from the retriever unless the caller overrides it.
    _DEFAULT_TOP_K = 5

    def __init__(self, retriever=None, llm=None):
        """Create the engine.

        Args:
            retriever: Object exposing ``retrieve(query, top_k=, user_email=,
                is_admin=)``. When omitted, a ``SearchRetriever()`` is built.
            llm: Object exposing ``generate_response(prompt=, context=,
                history=)``. When omitted, ``LLMFactory.get_provider()`` is used.
        """
        # Collaborators are injectable so callers (and tests) can supply their
        # own; the no-argument call keeps the previous behaviour.
        self.retriever = SearchRetriever() if retriever is None else retriever
        self.llm = LLMFactory.get_provider() if llm is None else llm
        logger.info("RAG Engine ready with LLM Provider: %s", type(self.llm).__name__)

    def chat(
        self,
        user_query: str,
        history: Optional[List[Dict[str, str]]] = None,
        user_email: Optional[str] = None,
        is_admin: bool = False,
        top_k: int = _DEFAULT_TOP_K,
    ) -> Dict:
        """Answer ``user_query`` using the RAG flow: search -> augment -> generate.

        Args:
            user_query: The user's question. Only its first 100 characters are
                written to the log.
            history: Chat history, passed through unchanged to the LLM provider
                (``None`` when there is no history).
            user_email: User e-mail forwarded to the retriever, intended for
                permission filtering of the search results.
            is_admin: Forwarded to the retriever; ``True`` is meant to bypass
                the access-control filter.
            top_k: Maximum number of chunks requested from the retriever.

        Returns:
            A dict with the keys ``"answer"`` (the LLM output),
            ``"context_used"`` (the exact context string given to the LLM) and
            ``"sources"`` (one dict per retrieved chunk with ``file_name``,
            ``page``, ``url`` and ``download_url``).
        """
        # 1. RETRIEVAL
        logger.info(
            "Search query: %s (user=%s, admin=%s)",
            user_query[:100],
            user_email or 'none',
            is_admin,
        )
        # Normalise a falsy result (notably None) to an empty list: the
        # "nothing found" branch below accepts any falsy value, but the
        # citation list at the end iterates the result and would raise
        # TypeError on None.
        relevant_chunks = self.retriever.retrieve(
            user_query, top_k=top_k, user_email=user_email, is_admin=is_admin
        ) or []

        if not relevant_chunks:
            context_text = self._NO_CONTEXT_MESSAGE
            logger.info("Search result: 0 chunks found")
        else:
            # Each chunk is labelled with its source file and page so the
            # model can cite where a statement came from.
            context_text = self._CHUNK_SEPARATOR.join(
                f"[Nguồn: {c.file_name}, Trang: {c.page_from}]\nNội dung: {c.text}"
                for c in relevant_chunks
            )
            logger.info(
                "Search result: %d chunks from %d files",
                len(relevant_chunks),
                len({c.file_name for c in relevant_chunks}),
            )

        # 2. GENERATION
        logger.info("Requesting LLM to generate answer...")
        answer = self.llm.generate_response(
            prompt=user_query,
            context=context_text,
            history=history,
        )
        logger.info("LLM response length: %d chars", len(answer))

        # 3. Return with citations
        return {
            "answer": answer,
            "context_used": context_text,
            "sources": [
                {
                    "file_name": c.file_name,
                    "page": c.page_from,
                    "url": c.source_url,
                    "download_url": c.download_url,
                }
                for c in relevant_chunks
            ],
        }