"""Document parsing service.

Extracts plain text from PDF, Word, Excel and plain-text documents.
"""

import os
import tempfile
from typing import Callable, Optional

import httpx


class DocumentParser:
    """Extract plain text from documents, dispatching on the file extension."""

    @staticmethod
    def _extension_of(file_name: str) -> str:
        """Return the lower-cased text after the last "." of *file_name*.

        Returns an empty string when the name contains no ".".
        """
        return file_name.rsplit(".", 1)[-1].lower() if "." in file_name else ""

    @staticmethod
    def _select_parser(ext: str) -> Callable[[str], str]:
        """Return the parser function registered for extension *ext*.

        Args:
            ext: Lower-cased extension without the leading dot.

        Raises:
            ValueError: If no parser handles *ext*.
        """
        if ext == "pdf":
            return DocumentParser._parse_pdf
        # NOTE(review): python-docx / openpyxl target the OOXML formats
        # (.docx / .xlsx); legacy binary .doc / .xls files will presumably be
        # rejected by those libraries -- confirm whether that is intended.
        if ext in ("doc", "docx"):
            return DocumentParser._parse_docx
        if ext in ("xls", "xlsx"):
            return DocumentParser._parse_xlsx
        if ext == "txt":
            return DocumentParser._parse_txt
        raise ValueError(f"不支持的文件格式: {ext}")

    @staticmethod
    async def download_and_parse(document_url: str, document_name: str) -> str:
        """Download a document and parse it into plain text.

        Args:
            document_url: URL of the document (TOS).
            document_name: Original file name; its extension selects the parser.

        Returns:
            The extracted plain text.

        Raises:
            ValueError: If the extension of *document_name* is not supported.
                Raised before any network traffic happens.
            httpx.HTTPStatusError: If the server answers with an error status.
        """
        ext = DocumentParser._extension_of(document_name)
        # Fail fast: reject unsupported formats before downloading anything.
        # This also guarantees the temp-file suffix below is one of a few
        # known-safe values rather than arbitrary caller-supplied text.
        DocumentParser._select_parser(ext)

        tmp_path: Optional[str] = None
        try:
            async with httpx.AsyncClient(timeout=60.0) as client:
                resp = await client.get(document_url)
                resp.raise_for_status()

            with tempfile.NamedTemporaryFile(delete=False, suffix=f".{ext}") as tmp:
                # Record the path *before* writing so the file is still
                # removed in ``finally`` if the write itself fails.
                tmp_path = tmp.name
                tmp.write(resp.content)

            # NOTE(review): parse_file is synchronous and runs on the event
            # loop here; large documents will block other coroutines.
            return DocumentParser.parse_file(tmp_path, document_name)
        finally:
            if tmp_path is not None:
                try:
                    os.unlink(tmp_path)
                except FileNotFoundError:
                    pass

    @staticmethod
    def parse_file(file_path: str, file_name: str) -> str:
        """Parse a local file into plain text, choosing the parser by extension.

        Args:
            file_path: Path of the file on local disk.
            file_name: Original file name; only its extension is inspected.

        Returns:
            The extracted plain text.

        Raises:
            ValueError: If the extension of *file_name* is not supported.
        """
        parser = DocumentParser._select_parser(DocumentParser._extension_of(file_name))
        return parser(file_path)

    @staticmethod
    def _parse_pdf(path: str) -> str:
        """Extract text from a PDF with pdfplumber, one chunk per page."""
        import pdfplumber

        with pdfplumber.open(path) as pdf:
            page_texts = [page.extract_text() for page in pdf.pages]
        # Pages for which extract_text() returns nothing are skipped.
        return "\n".join(text for text in page_texts if text)

    @staticmethod
    def _parse_docx(path: str) -> str:
        """Extract text from a Word document with python-docx.

        Non-blank paragraphs come first, followed by table rows; the non-blank
        cells of each row are joined with tabs.
        """
        from docx import Document

        doc = Document(path)
        texts = [para.text for para in doc.paragraphs if para.text.strip()]

        # Table content is not part of doc.paragraphs, so collect it as well.
        for table in doc.tables:
            for row in table.rows:
                cells = (cell.text.strip() for cell in row.cells)
                row_text = "\t".join(cell for cell in cells if cell)
                if row_text:
                    texts.append(row_text)

        return "\n".join(texts)

    @staticmethod
    def _parse_xlsx(path: str) -> str:
        """Extract text from an Excel workbook with openpyxl (all sheets joined).

        Each row becomes one line: its non-``None`` cell values are converted
        with ``str`` and joined with tabs. Rows that end up blank are skipped.
        """
        from openpyxl import load_workbook

        wb = load_workbook(path, read_only=True, data_only=True)
        try:
            texts = []
            for sheet in wb.worksheets:
                for row in sheet.iter_rows(values_only=True):
                    row_text = "\t".join(str(cell) for cell in row if cell is not None)
                    if row_text.strip():
                        texts.append(row_text)
        finally:
            # Close even when iteration fails, so the file handle is released
            # before the caller tries to delete the file.
            wb.close()

        return "\n".join(texts)

    @staticmethod
    def _parse_txt(path: str) -> str:
        """Read a UTF-8 text file, dropping a leading byte-order mark if present."""
        # "utf-8-sig" decodes plain UTF-8 identically but strips a leading BOM,
        # which would otherwise surface as "\ufeff" at the start of the text.
        with open(path, "r", encoding="utf-8-sig") as f:
            return f.read()