# Source listing metadata: 97 lines, 3.4 KiB, Python
import logging
|
|
from typing import List, Optional
|
|
from core.models import OCRPageResult
|
|
|
|
# Module-level logger shared by every extractor in this file; it is registered
# under the fixed name "TextExtractor" rather than the module's __name__.
logger = logging.getLogger("TextExtractor")
|
|
|
|
|
|
class TextExtractor:
    """Extract text from document formats that do not need OCR.

    Supported formats:
    - DOCX (python-docx)
    - XLSX (openpyxl)
    - TXT/MD (decoded directly)

    Every extractor is best-effort: a failure is logged and reported to the
    caller as an empty list instead of being raised.
    """

    @staticmethod
    def extract_from_docx(file_bytes: bytes) -> List[OCRPageResult]:
        """Extract text from a DOCX file, keeping paragraph boundaries.

        Body paragraphs are collected first, then one line per table row with
        the non-empty cells joined by " | ". All pieces are joined with a
        blank line and returned as a single page.

        Args:
            file_bytes: Raw content of the .docx file.

        Returns:
            A one-element list holding page 1 with confidence 1.0, or an
            empty list when the document has no text, python-docx is not
            installed, or parsing fails.
        """
        try:
            # Imported lazily so that a missing optional dependency is
            # reported through the log instead of breaking module import.
            import io

            from docx import Document
        except ImportError:
            logger.error("python-docx not installed. Run: pip install python-docx")
            return []

        try:
            doc = Document(io.BytesIO(file_bytes))

            stripped_paragraphs = (para.text.strip() for para in doc.paragraphs)
            blocks = [text for text in stripped_paragraphs if text]

            # Table text is appended after all body paragraphs, one line per row.
            # NOTE(review): python-docx may report a merged cell once per
            # spanned column, which would repeat its text in the row line --
            # confirm whether de-duplication is wanted.
            for table in doc.tables:
                for row in table.rows:
                    cell_texts = (cell.text.strip() for cell in row.cells)
                    row_text = " | ".join(text for text in cell_texts if text)
                    if row_text:
                        blocks.append(row_text)

            # Every collected block is non-empty, so "no blocks" is exactly
            # the "no readable text" case.
            if not blocks:
                logger.warning("DOCX file is empty or has no readable text.")
                return []

            full_text = "\n\n".join(blocks)
            return [OCRPageResult(page=1, text=full_text, confidence=1.0)]
        except Exception as exc:
            # logger.exception keeps the traceback, which a plain error
            # message would lose.
            logger.exception("Failed to extract text from DOCX: %s", exc)
            return []

    @staticmethod
    def extract_from_xlsx(file_bytes: bytes) -> List[OCRPageResult]:
        """Extract text from an XLSX workbook, one page per non-empty sheet.

        Each page starts with a "[Sheet: <name>]" header line followed by one
        line per non-empty row, with the non-empty cells joined by " | ".
        The page number is the 1-based position of the sheet in the workbook,
        so numbers are not contiguous when empty sheets are skipped.

        Args:
            file_bytes: Raw content of the .xlsx file.

        Returns:
            One result per sheet that contains data (confidence 1.0), or an
            empty list when the workbook has no data, openpyxl is not
            installed, or parsing fails.
        """
        try:
            # Imported lazily so that a missing optional dependency is
            # reported through the log instead of breaking module import.
            import io

            from openpyxl import load_workbook
        except ImportError:
            logger.error("openpyxl not installed. Run: pip install openpyxl")
            return []

        try:
            wb = load_workbook(io.BytesIO(file_bytes), read_only=True, data_only=True)
            try:
                results = []
                for sheet_idx, sheet_name in enumerate(wb.sheetnames, 1):
                    ws = wb[sheet_name]
                    rows = []
                    for row in ws.iter_rows(values_only=True):
                        cell_texts = (str(value).strip() for value in row if value is not None)
                        cells = [text for text in cell_texts if text]
                        if cells:
                            rows.append(" | ".join(cells))

                    if rows:
                        sheet_text = f"[Sheet: {sheet_name}]\n" + "\n".join(rows)
                        results.append(
                            OCRPageResult(page=sheet_idx, text=sheet_text, confidence=1.0)
                        )
            finally:
                # Close the workbook even when reading a sheet fails, so the
                # read-only workbook does not stay open.
                wb.close()
        except Exception as exc:
            logger.exception("Failed to extract text from XLSX: %s", exc)
            return []

        if not results:
            logger.warning("XLSX file is empty or has no readable data.")
        return results

    @staticmethod
    def extract_from_text(file_bytes: bytes) -> List[OCRPageResult]:
        """Extract text from a plain-text file (TXT, MD, CSV).

        The bytes are decoded as UTF-8; undecodable bytes become U+FFFD
        instead of raising, and a leading byte-order mark is dropped so it
        neither leaks into the text nor makes an empty file look non-empty.

        Args:
            file_bytes: Raw content of the text file.

        Returns:
            A one-element list holding page 1 with confidence 1.0, or an
            empty list when the content is empty or whitespace only, or when
            extraction fails.
        """
        try:
            # "utf-8-sig" behaves like "utf-8" but also strips a leading BOM,
            # which str.strip() would not remove.
            text = file_bytes.decode("utf-8-sig", errors="replace").strip()
            if not text:
                return []
            return [OCRPageResult(page=1, text=text, confidence=1.0)]
        except Exception as exc:
            logger.exception("Failed to extract text from text file: %s", exc)
            return []
|