# poc_system/extraction/text_extractor.py

import logging
from typing import List, Optional
from core.models import OCRPageResult

# Module-level logger, registered under the fixed name "TextExtractor"
# (not the module's __name__); the extractors below report warnings and
# errors through it.
logger = logging.getLogger("TextExtractor")


class TextExtractor:
    """Extract text from document formats that do not need OCR.

    Supported inputs:
    - DOCX (via python-docx)
    - XLSX (via openpyxl)
    - TXT/MD/CSV (decoded directly)

    Every extractor takes the raw file bytes and returns a list of
    ``OCRPageResult`` built with ``confidence=1.0``. Failures are logged
    and reported as an empty list instead of being raised.
    """

    @staticmethod
    def _docx_row_text(row) -> str:
        """Join the non-empty cell texts of one DOCX table row with " | ".

        python-docx lists a merged cell once per grid column it spans, so
        consecutive entries that wrap the same underlying cell element are
        collapsed to avoid repeating the merged text.
        """
        texts = []
        previous = None
        for cell in row.cells:
            # `_tc` is the XML element behind the cell proxy; fall back to
            # the proxy itself if the attribute is ever unavailable.
            element = getattr(cell, "_tc", cell)
            if element is previous:
                continue
            previous = element
            text = cell.text.strip()
            if text:
                texts.append(text)
        return " | ".join(texts)

    @staticmethod
    def extract_from_docx(file_bytes: bytes) -> List[OCRPageResult]:
        """Extract text from a DOCX file, keeping paragraph structure.

        Args:
            file_bytes: Raw content of the .docx file.

        Returns:
            A single-element list holding one page (``page=1``) whose text
            is every non-empty paragraph followed by every non-empty table
            row, separated by blank lines. Returns ``[]`` when the document
            has no readable text, when python-docx is missing, or when
            parsing fails (the reason is logged).
        """
        try:
            import io

            from docx import Document

            doc = Document(io.BytesIO(file_bytes))

            stripped = [para.text.strip() for para in doc.paragraphs]
            blocks = [text for text in stripped if text]

            # Table rows are appended after all body paragraphs, so their
            # original position inside the document is not preserved.
            for table in doc.tables:
                for row in table.rows:
                    row_text = TextExtractor._docx_row_text(row)
                    if row_text:
                        blocks.append(row_text)

            full_text = "\n\n".join(blocks)
            if not full_text.strip():
                logger.warning("DOCX file is empty or has no readable text.")
                return []

            return [OCRPageResult(page=1, text=full_text, confidence=1.0)]

        except ImportError:
            logger.error("python-docx not installed. Run: pip install python-docx")
            return []
        except Exception as e:
            logger.error("Failed to extract text from DOCX: %s", e)
            return []

    @staticmethod
    def extract_from_xlsx(file_bytes: bytes) -> List[OCRPageResult]:
        """Extract text from an XLSX workbook, one page per sheet.

        Args:
            file_bytes: Raw content of the .xlsx file.

        Returns:
            One ``OCRPageResult`` per sheet that contains data. ``page`` is
            the 1-based position of the sheet in the workbook, so sheets
            without data leave gaps in the numbering. Each page text starts
            with a ``[Sheet: <name>]`` header followed by one line per
            non-empty row, cells joined with " | ". Returns ``[]`` when no
            sheet has data, when openpyxl is missing, or when parsing fails
            (the reason is logged).
        """
        try:
            import io

            from openpyxl import load_workbook

            # data_only=True reads cached formula results rather than the
            # formula strings themselves.
            wb = load_workbook(io.BytesIO(file_bytes), read_only=True, data_only=True)
            try:
                results = []
                for sheet_idx, sheet_name in enumerate(wb.sheetnames, 1):
                    ws = wb[sheet_name]
                    # Chart sheets have no cell grid; skip them instead of
                    # letting one abort extraction of the whole workbook.
                    if not hasattr(ws, "iter_rows"):
                        continue

                    lines = []
                    for row in ws.iter_rows(values_only=True):
                        values = (str(c).strip() for c in row if c is not None)
                        cells = [value for value in values if value]
                        if cells:
                            lines.append(" | ".join(cells))

                    if lines:
                        sheet_text = f"[Sheet: {sheet_name}]\n" + "\n".join(lines)
                        results.append(
                            OCRPageResult(page=sheet_idx, text=sheet_text, confidence=1.0)
                        )
            finally:
                # Close the workbook even when reading a sheet fails.
                wb.close()

            if not results:
                logger.warning("XLSX file is empty or has no readable data.")
            return results

        except ImportError:
            logger.error("openpyxl not installed. Run: pip install openpyxl")
            return []
        except Exception as e:
            logger.error("Failed to extract text from XLSX: %s", e)
            return []

    @staticmethod
    def extract_from_text(file_bytes: bytes) -> List[OCRPageResult]:
        """Extract text from a plain-text file (TXT, MD, CSV).

        Args:
            file_bytes: Raw file content, decoded as UTF-8. Undecodable
                bytes are replaced rather than raising.

        Returns:
            A single-element list holding one page (``page=1``) with the
            stripped text, or ``[]`` when the file contains no text or
            decoding fails unexpectedly (the reason is logged).
        """
        try:
            # "utf-8-sig" drops a leading byte-order mark; plain "utf-8"
            # keeps it as U+FEFF, which str.strip() does not remove.
            text = file_bytes.decode("utf-8-sig", errors="replace").strip()
            if not text:
                return []
            return [OCRPageResult(page=1, text=text, confidence=1.0)]
        except Exception as e:
            logger.error("Failed to extract text from text file: %s", e)
            return []