- 新增 PlatformRule 模型 (draft/active/inactive 状态流转) - 新增文档解析服务 (PDF/Word/Excel → 纯文本) - 新增 4 个 API: 解析/确认/查询/删除平台规则 - 脚本审核优先从 DB 读取 active 规则,硬编码兜底 - 视频审核合并平台规则违禁词到检测列表 - Alembic 迁移 006: platform_rules 表 Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
120 lines
3.6 KiB
Python
120 lines
3.6 KiB
Python
"""
|
|
Document parsing service.

Extracts plain text from PDF/Word/Excel documents.
|
|
"""
|
|
import os
|
|
import tempfile
|
|
from typing import Optional
|
|
|
|
import httpx
|
|
|
|
|
|
class DocumentParser:
    """Extract plain text from PDF, Word, Excel and plain-text documents."""

    # Extensions that ``parse_file`` knows how to dispatch. Kept in one place
    # so ``download_and_parse`` can reject a file before downloading it.
    _SUPPORTED_EXTENSIONS = frozenset({"pdf", "doc", "docx", "xls", "xlsx", "txt"})

    @staticmethod
    def _extension_of(file_name: str) -> str:
        """Return the lower-cased extension of ``file_name`` without the dot.

        Returns an empty string when the name contains no dot.
        """
        return file_name.rsplit(".", 1)[-1].lower() if "." in file_name else ""

    @staticmethod
    async def download_and_parse(document_url: str, document_name: str) -> str:
        """Download a document and convert it to plain text.

        Args:
            document_url: URL the document is fetched from with an HTTP GET.
            document_name: Original file name; only its extension is used, to
                pick the parser and the temp-file suffix.

        Returns:
            The extracted plain text.

        Raises:
            ValueError: If the extension of ``document_name`` is unsupported.
                This is checked before any network access.
            httpx.HTTPStatusError: If the download returns an error status.
        """
        ext = DocumentParser._extension_of(document_name)
        if ext not in DocumentParser._SUPPORTED_EXTENSIONS:
            # Fail fast: same error parse_file would raise, but without
            # downloading a file that can never be parsed.
            raise ValueError(f"不支持的文件格式: {ext}")

        tmp_path: Optional[str] = None
        try:
            async with httpx.AsyncClient(timeout=60.0) as client:
                resp = await client.get(document_url)
                resp.raise_for_status()

            with tempfile.NamedTemporaryFile(delete=False, suffix=f".{ext}") as tmp:
                # Record the path before writing so the ``finally`` clause can
                # still remove the file if the write itself fails.
                tmp_path = tmp.name
                tmp.write(resp.content)

            # NOTE(review): parse_file is synchronous, so parsing runs on the
            # event loop thread; consider offloading if documents are large.
            return DocumentParser.parse_file(tmp_path, document_name)
        finally:
            if tmp_path and os.path.exists(tmp_path):
                os.unlink(tmp_path)

    @staticmethod
    def parse_file(file_path: str, file_name: str) -> str:
        """Pick a parser from the extension of ``file_name`` and return plain text.

        Args:
            file_path: Path of the local file to read.
            file_name: Original file name; its extension selects the parser.

        Returns:
            The extracted plain text.

        Raises:
            ValueError: If the extension is not one of the supported formats.
        """
        ext = DocumentParser._extension_of(file_name)

        if ext == "pdf":
            return DocumentParser._parse_pdf(file_path)
        # NOTE(review): legacy .doc / .xls are routed to python-docx / openpyxl,
        # which target the OOXML formats - confirm such inputs actually parse.
        if ext in ("doc", "docx"):
            return DocumentParser._parse_docx(file_path)
        if ext in ("xls", "xlsx"):
            return DocumentParser._parse_xlsx(file_path)
        if ext == "txt":
            return DocumentParser._parse_txt(file_path)
        raise ValueError(f"不支持的文件格式: {ext}")

    @staticmethod
    def _parse_pdf(path: str) -> str:
        """Extract text from a PDF with pdfplumber, one page per line group."""
        import pdfplumber

        texts = []
        with pdfplumber.open(path) as pdf:
            for page in pdf.pages:
                text = page.extract_text()
                # Pages without extractable text are skipped.
                if text:
                    texts.append(text)
        return "\n".join(texts)

    @staticmethod
    def _parse_docx(path: str) -> str:
        """Extract text from a Word document with python-docx.

        Non-blank paragraphs come first, followed by table rows; the cells of
        a row are joined with tabs and empty cells are dropped.
        """
        from docx import Document

        doc = Document(path)
        texts = [para.text for para in doc.paragraphs if para.text.strip()]

        # Table content is appended after all paragraphs.
        for table in doc.tables:
            for row in table.rows:
                row_text = "\t".join(
                    cell.text.strip() for cell in row.cells if cell.text.strip()
                )
                if row_text:
                    texts.append(row_text)
        return "\n".join(texts)

    @staticmethod
    def _parse_xlsx(path: str) -> str:
        """Extract text from an Excel workbook with openpyxl (all sheets concatenated).

        Each row becomes one line with non-empty cells joined by tabs.
        """
        from openpyxl import load_workbook

        wb = load_workbook(path, read_only=True, data_only=True)
        try:
            texts = []
            for sheet in wb.worksheets:
                for row in sheet.iter_rows(values_only=True):
                    row_text = "\t".join(str(cell) for cell in row if cell is not None)
                    if row_text.strip():
                        texts.append(row_text)
        finally:
            # Close even when reading a row raises, so the workbook's file
            # handle is not left open.
            wb.close()
        return "\n".join(texts)

    @staticmethod
    def _parse_txt(path: str) -> str:
        """Read a UTF-8 text file, dropping a leading byte-order mark if present."""
        # "utf-8-sig" decodes exactly like "utf-8" but strips a leading BOM,
        # so the returned text never starts with "\ufeff".
        with open(path, "r", encoding="utf-8-sig") as f:
            return f.read()
|