Files
poc_system/extraction/text_extractor.py
2026-05-09 10:31:28 +00:00

97 lines
3.4 KiB
Python

import logging
from typing import List, Optional
from core.models import OCRPageResult
logger = logging.getLogger("TextExtractor")
class TextExtractor:
    """
    Extract text from document formats that do not need OCR:
    - DOCX (python-docx)
    - XLSX (openpyxl)
    - TXT/MD (read directly)

    Every extractor is best-effort: on a missing dependency or any parsing
    failure it logs the problem and returns an empty list instead of raising.
    """

    @staticmethod
    def extract_from_docx(file_bytes: bytes) -> List[OCRPageResult]:
        """Extract text from a DOCX file, preserving paragraph structure.

        Args:
            file_bytes: Raw content of the DOCX file.

        Returns:
            A single-element list holding one page (page=1, confidence=1.0)
            whose text is every non-empty paragraph followed by every
            non-empty table row, separated by blank lines. An empty list is
            returned when there is no readable text, python-docx is not
            installed, or parsing fails.
        """
        try:
            # Imported lazily so a missing optional dependency is reported
            # through the ImportError handler below.
            from docx import Document
            import io

            doc = Document(io.BytesIO(file_bytes))

            stripped_paragraphs = (para.text.strip() for para in doc.paragraphs)
            paragraphs = [text for text in stripped_paragraphs if text]

            # Also extract text from tables. Table rows are appended after all
            # body paragraphs, so their original position is not preserved.
            for table in doc.tables:
                for row in table.rows:
                    cell_texts = (cell.text.strip() for cell in row.cells)
                    row_text = " | ".join(text for text in cell_texts if text)
                    if row_text:
                        paragraphs.append(row_text)

            # Every collected entry is non-empty, so an empty list means the
            # document has no readable text at all.
            if not paragraphs:
                logger.warning("DOCX file is empty or has no readable text.")
                return []

            full_text = "\n\n".join(paragraphs)
            return [OCRPageResult(page=1, text=full_text, confidence=1.0)]
        except ImportError:
            logger.error("python-docx not installed. Run: pip install python-docx")
            return []
        except Exception as e:
            logger.error("Failed to extract text from DOCX: %s", e)
            return []

    @staticmethod
    def extract_from_xlsx(file_bytes: bytes) -> List[OCRPageResult]:
        """Extract text from an XLSX file (header + one page per sheet).

        Args:
            file_bytes: Raw content of the XLSX file.

        Returns:
            One page per sheet that contains data. The page number is the
            sheet's 1-based position in the workbook, so sheets without data
            leave gaps in the numbering. Each page starts with a
            "[Sheet: <name>]" header followed by one line per non-empty row,
            cells joined with " | ". An empty list is returned when there is
            no data, openpyxl is not installed, or parsing fails.
        """
        try:
            # Imported lazily so a missing optional dependency is reported
            # through the ImportError handler below.
            from openpyxl import load_workbook
            import io

            wb = load_workbook(io.BytesIO(file_bytes), read_only=True, data_only=True)
            results = []
            try:
                for sheet_idx, sheet_name in enumerate(wb.sheetnames, 1):
                    ws = wb[sheet_name]
                    rows = []
                    for row in ws.iter_rows(values_only=True):
                        cell_texts = (str(c).strip() for c in row if c is not None)
                        cells = [text for text in cell_texts if text]
                        if cells:
                            rows.append(" | ".join(cells))
                    if rows:
                        sheet_text = f"[Sheet: {sheet_name}]\n" + "\n".join(rows)
                        results.append(
                            OCRPageResult(page=sheet_idx, text=sheet_text, confidence=1.0)
                        )
            finally:
                # Read-only workbooks must be closed explicitly; do it even
                # when reading a sheet fails part-way through.
                wb.close()

            if not results:
                logger.warning("XLSX file is empty or has no readable data.")
            return results
        except ImportError:
            logger.error("openpyxl not installed. Run: pip install openpyxl")
            return []
        except Exception as e:
            logger.error("Failed to extract text from XLSX: %s", e)
            return []

    @staticmethod
    def extract_from_text(file_bytes: bytes) -> List[OCRPageResult]:
        """Extract text from a plain text file (TXT, MD, CSV).

        Args:
            file_bytes: Raw content of the file, decoded as UTF-8. Invalid
                byte sequences are replaced rather than raising.

        Returns:
            A single-element list holding one page (page=1, confidence=1.0)
            with the stripped text, or an empty list when the file contains
            no text or decoding fails.
        """
        try:
            # "utf-8-sig" drops a leading byte-order mark, which str.strip()
            # would otherwise keep because U+FEFF is not whitespace.
            text = file_bytes.decode("utf-8-sig", errors="replace").strip()
            if not text:
                return []
            return [OCRPageResult(page=1, text=text, confidence=1.0)]
        except Exception as e:
            logger.error("Failed to extract text from text file: %s", e)
            return []