import io
import logging
from typing import List, Optional

from core.models import OCRPageResult

logger = logging.getLogger("TextExtractor")


class TextExtractor:
    """Extract text from document formats that do not need OCR.

    Supported inputs:
    - DOCX (via python-docx)
    - XLSX (via openpyxl)
    - TXT/MD/CSV (decoded directly)

    All extractors are best-effort: failures are logged and an empty list
    is returned instead of raising. Every produced page carries
    ``confidence=1.0`` because the text is read, not recognised.
    """

    @staticmethod
    def extract_from_docx(file_bytes: bytes) -> List[OCRPageResult]:
        """Extract text from a DOCX file, keeping paragraph structure.

        Body paragraphs come first, followed by one line per non-empty
        table row (cells joined with ``" | "``). Blocks are separated by a
        blank line and returned as a single page numbered 1.

        Args:
            file_bytes: Raw content of the ``.docx`` file.

        Returns:
            A one-element list holding the whole document, or an empty
            list when the document has no text, python-docx is missing,
            or parsing fails.
        """
        try:
            # Imported lazily so the module still loads without python-docx.
            from docx import Document

            doc = Document(io.BytesIO(file_bytes))

            stripped = (para.text.strip() for para in doc.paragraphs)
            blocks = [text for text in stripped if text]

            # Table text is not part of doc.paragraphs; collect it as well.
            for table in doc.tables:
                for row in table.rows:
                    cell_texts = (cell.text.strip() for cell in row.cells)
                    row_text = " | ".join(t for t in cell_texts if t)
                    if row_text:
                        blocks.append(row_text)

            # Every collected block is non-empty, so an empty list means
            # the document has no readable text at all.
            if not blocks:
                logger.warning("DOCX file is empty or has no readable text.")
                return []

            full_text = "\n\n".join(blocks)
            return [OCRPageResult(page=1, text=full_text, confidence=1.0)]
        except ImportError:
            logger.error(
                "python-docx not installed. Run: pip install python-docx"
            )
            return []
        except Exception as e:
            logger.error("Failed to extract text from DOCX: %s", e)
            return []

    @staticmethod
    def extract_from_xlsx(file_bytes: bytes) -> List[OCRPageResult]:
        """Extract text from an XLSX file, one page per non-empty sheet.

        Each page starts with a ``[Sheet: <name>]`` header line followed by
        one line per non-empty row (cell values joined with ``" | "``).
        ``page`` is the sheet's 1-based position in the workbook, so
        skipped sheets leave gaps in the numbering.

        Args:
            file_bytes: Raw content of the ``.xlsx`` file.

        Returns:
            One result per sheet that contains data, or an empty list when
            the workbook has no data, openpyxl is missing, or parsing
            fails.
        """
        try:
            # Imported lazily so the module still loads without openpyxl.
            from openpyxl import load_workbook

            wb = load_workbook(
                io.BytesIO(file_bytes), read_only=True, data_only=True
            )
            results = []
            try:
                for sheet_idx, sheet_name in enumerate(wb.sheetnames, 1):
                    ws = wb[sheet_name]

                    # Sheets without a cell grid (e.g. openpyxl chart
                    # sheets) expose no iter_rows(); skip them rather than
                    # failing the whole workbook.
                    iter_rows = getattr(ws, "iter_rows", None)
                    if iter_rows is None:
                        continue

                    rows = []
                    for row in iter_rows(values_only=True):
                        texts = (str(c).strip() for c in row if c is not None)
                        cells = [t for t in texts if t]
                        if cells:
                            rows.append(" | ".join(cells))

                    if rows:
                        sheet_text = f"[Sheet: {sheet_name}]\n" + "\n".join(rows)
                        results.append(
                            OCRPageResult(
                                page=sheet_idx, text=sheet_text, confidence=1.0
                            )
                        )
            finally:
                # Release the workbook even when a sheet fails to parse.
                wb.close()

            if not results:
                logger.warning("XLSX file is empty or has no readable data.")
            return results
        except ImportError:
            logger.error("openpyxl not installed. Run: pip install openpyxl")
            return []
        except Exception as e:
            logger.error("Failed to extract text from XLSX: %s", e)
            return []

    @staticmethod
    def extract_from_text(file_bytes: bytes) -> List[OCRPageResult]:
        """Extract text from a plain-text file (TXT, MD, CSV).

        The bytes are decoded as UTF-8; undecodable bytes become U+FFFD
        and a leading byte-order mark is dropped.

        Args:
            file_bytes: Raw content of the text file.

        Returns:
            A one-element list with the stripped text as page 1, or an
            empty list when the file holds no text or decoding fails.
        """
        try:
            # "utf-8-sig" removes a leading BOM, which str.strip() would
            # otherwise keep as an invisible U+FEFF character.
            text = file_bytes.decode("utf-8-sig", errors="replace").strip()
            if not text:
                return []
            return [OCRPageResult(page=1, text=text, confidence=1.0)]
        except Exception as e:
            logger.error("Failed to extract text from text file: %s", e)
            return []