Your Name fed361b9b3 feat: 平台规则从硬编码改为品牌方上传文档 + AI 解析
- 新增 PlatformRule 模型 (draft/active/inactive 状态流转)
- 新增文档解析服务 (PDF/Word/Excel → 纯文本)
- 新增 4 个 API: 解析/确认/查询/删除平台规则
- 脚本审核优先从 DB 读取 active 规则,硬编码兜底
- 视频审核合并平台规则违禁词到检测列表
- Alembic 迁移 006: platform_rules 表

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-10 13:23:11 +08:00

120 lines
3.6 KiB
Python

"""
文档解析服务
从 PDF/Word/Excel 文档中提取纯文本
"""
import os
import tempfile
from typing import Optional
import httpx
class DocumentParser:
    """Extract plain text from documents (PDF / Word / Excel / TXT).

    All methods are static; the class is used purely as a namespace.
    """

    # Single source of truth for supported formats: lower-case extension
    # (without the dot) -> name of the static method that parses it.
    # NOTE(review): legacy binary ".doc" / ".xls" files are routed to the
    # docx / xlsx parsers exactly as before; confirm python-docx / openpyxl
    # can actually read them, otherwise those inputs fail inside the library.
    _PARSERS = {
        "pdf": "_parse_pdf",
        "doc": "_parse_docx",
        "docx": "_parse_docx",
        "xls": "_parse_xlsx",
        "xlsx": "_parse_xlsx",
        "txt": "_parse_txt",
    }

    @staticmethod
    def _extension(file_name: str) -> str:
        """Return the lower-cased text after the last dot, or "" if no dot."""
        _, dot, tail = file_name.rpartition(".")
        return tail.lower() if dot else ""

    @staticmethod
    async def download_and_parse(document_url: str, document_name: str) -> str:
        """Download a document and parse it into plain text.

        Args:
            document_url: URL of the document (TOS).
            document_name: Original file name; only its extension is used,
                to select the parser.

        Returns:
            The extracted plain text.

        Raises:
            ValueError: If the extension of ``document_name`` is unsupported.
                Raised before any network request is made.
            httpx.HTTPStatusError: If the download returns an error status.
        """
        import asyncio

        # Validate first: avoids downloading a file we cannot parse, and
        # guarantees that the extension interpolated into the temp-file
        # suffix below is one of our own known-safe constants rather than
        # arbitrary caller-supplied text.
        ext = DocumentParser._extension(document_name)
        if ext not in DocumentParser._PARSERS:
            raise ValueError(f"不支持的文件格式: {ext}")

        async with httpx.AsyncClient(timeout=60.0) as client:
            resp = await client.get(document_url)
            resp.raise_for_status()

        tmp_path: Optional[str] = None
        try:
            with tempfile.NamedTemporaryFile(delete=False, suffix=f".{ext}") as tmp:
                # Record the path before writing so that a failed write
                # still gets cleaned up in the ``finally`` below.
                tmp_path = tmp.name
                tmp.write(resp.content)

            # The parsers are synchronous; run them in the default executor
            # so the event loop is not blocked while a document is parsed.
            loop = asyncio.get_running_loop()
            return await loop.run_in_executor(
                None, DocumentParser.parse_file, tmp_path, document_name
            )
        finally:
            if tmp_path and os.path.exists(tmp_path):
                os.unlink(tmp_path)

    @staticmethod
    def parse_file(file_path: str, file_name: str) -> str:
        """Pick a parser from the file name's extension and return plain text.

        Args:
            file_path: Path of the local file to read.
            file_name: Original file name; its extension selects the parser.

        Returns:
            The extracted plain text.

        Raises:
            ValueError: If the extension is not one of the supported formats.
        """
        ext = DocumentParser._extension(file_name)
        parser_name = DocumentParser._PARSERS.get(ext)
        if parser_name is None:
            raise ValueError(f"不支持的文件格式: {ext}")
        return getattr(DocumentParser, parser_name)(file_path)

    @staticmethod
    def _parse_pdf(path: str) -> str:
        """Extract PDF text with pdfplumber, one chunk per non-empty page."""
        import pdfplumber

        with pdfplumber.open(path) as pdf:
            page_texts = [page.extract_text() for page in pdf.pages]
        # Pages that yield no text (None or "") are skipped.
        return "\n".join(text for text in page_texts if text)

    @staticmethod
    def _parse_docx(path: str) -> str:
        """Extract Word text with python-docx: paragraphs first, then tables."""
        from docx import Document

        doc = Document(path)
        lines = [para.text for para in doc.paragraphs if para.text.strip()]

        # Table content follows the paragraphs; each row becomes one
        # tab-separated line built from its non-empty cells.
        for table in doc.tables:
            for row in table.rows:
                cell_texts = (cell.text.strip() for cell in row.cells)
                row_text = "\t".join(text for text in cell_texts if text)
                if row_text:
                    lines.append(row_text)
        return "\n".join(lines)

    @staticmethod
    def _parse_xlsx(path: str) -> str:
        """Extract Excel text with openpyxl, concatenating all sheets."""
        from openpyxl import load_workbook

        wb = load_workbook(path, read_only=True, data_only=True)
        lines = []
        try:
            for sheet in wb.worksheets:
                for row in sheet.iter_rows(values_only=True):
                    row_text = "\t".join(
                        str(cell) for cell in row if cell is not None
                    )
                    if row_text.strip():
                        lines.append(row_text)
        finally:
            # Close even when iteration fails, so the workbook's file
            # handle is not leaked.
            wb.close()
        return "\n".join(lines)

    @staticmethod
    def _parse_txt(path: str) -> str:
        """Read a UTF-8 text file, dropping a leading byte-order mark if any."""
        # "utf-8-sig" decodes plain UTF-8 identically but strips a leading
        # BOM, which would otherwise show up as "\ufeff" in the text.
        with open(path, "r", encoding="utf-8-sig") as f:
            return f.read()