Your Name 4c9b2f1263 feat: Brief附件/项目平台/规则AI解析/消息中心修复 + 项目创建通知
- Brief 支持代理商附件上传 (迁移 007)
- 项目新增 platform 字段 (迁移 008),前端创建/展示平台信息
- 修复 AI 规则解析:处理中文引号导致 JSON 解析失败的问题
- 修复消息中心崩溃:补全后端消息类型映射 + fallback 保护
- 项目创建时自动发送消息通知
- .gitignore 排除 backend/data/ 数据库文件

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-10 19:00:03 +08:00

252 lines
8.4 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
文档解析服务
从 PDF/Word/Excel 文档中提取纯文本
"""
import asyncio
import contextlib
import logging
import os
import tempfile
from typing import Optional

import httpx
logger = logging.getLogger(__name__)


class DocumentParser:
    """Extract plain text (or page images) from PDF/Word/Excel/TXT documents.

    Every method is static; the class only serves as a namespace.
    """

    # Extension -> name of the static method that parses it. This is the single
    # source of truth for both dispatch and up-front format validation.
    # NOTE(review): legacy "doc"/"xls" are routed to the python-docx/openpyxl
    # parsers exactly as before -- confirm those libraries can read them.
    _PARSER_METHODS = {
        "pdf": "_parse_pdf",
        "doc": "_parse_docx",
        "docx": "_parse_docx",
        "xls": "_parse_xlsx",
        "xlsx": "_parse_xlsx",
        "txt": "_parse_txt",
    }

    # Below this many characters of PyMuPDF output, pdfplumber is tried as well.
    _MIN_PDF_TEXT_CHARS = 100

    # A PDF with fewer non-whitespace characters than this is treated as scanned.
    _IMAGE_PDF_MAX_TEXT_CHARS = 200

    @staticmethod
    def _get_extension(file_name: str) -> str:
        """Return the lower-cased extension of *file_name* (no dot), or "" if none."""
        return file_name.rsplit(".", 1)[-1].lower() if "." in file_name else ""

    @staticmethod
    @contextlib.contextmanager
    def _temp_file(content: bytes, suffix: str):
        """Write *content* to a named temp file, yield its path, then delete it.

        The path is recorded before writing so the file is removed even when
        the write itself fails.
        """
        tmp_path = None
        try:
            with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
                tmp_path = tmp.name
                tmp.write(content)
            yield tmp_path
        finally:
            if tmp_path is not None:
                with contextlib.suppress(FileNotFoundError):
                    os.unlink(tmp_path)

    @staticmethod
    async def _download(document_url: str) -> bytes:
        """Download a document: TOS SDK first, pre-signed HTTP URL as fallback."""
        content = await DocumentParser._download_via_tos_sdk(document_url)
        if content is None:
            content = await DocumentParser._download_via_signed_url(document_url)
        return content

    @staticmethod
    async def download_and_parse(document_url: str, document_name: str) -> str:
        """Download a document and return its plain text.

        The TOS SDK is tried first; if it yields nothing the file is fetched
        over HTTP through a pre-signed URL.

        Args:
            document_url: Document URL (TOS).
            document_name: Original file name, used to pick the parser.

        Returns:
            The extracted plain text.

        Raises:
            ValueError: If the extension of *document_name* is not supported.
                This is checked before downloading anything.
        """
        ext = DocumentParser._get_extension(document_name)
        if ext not in DocumentParser._PARSER_METHODS:
            raise ValueError(f"不支持的文件格式: {ext}")

        content = await DocumentParser._download(document_url)
        # NOTE(review): parsing runs inline on the event loop; it is left there
        # because PyMuPDF may not be safe to call from worker threads -- confirm
        # before offloading.
        with DocumentParser._temp_file(content, suffix=f".{ext}") as tmp_path:
            return DocumentParser.parse_file(tmp_path, document_name)

    @staticmethod
    async def download_and_get_images(document_url: str, document_name: str) -> Optional[list[str]]:
        """Download a PDF and render its pages as base64 PNG strings.

        Intended for image-only (scanned) PDFs that need visual analysis.

        Returns:
            A list of base64-encoded PNG pages, or None when the file is not a
            PDF or the PDF already contains enough extractable text.
        """
        if DocumentParser._get_extension(document_name) != "pdf":
            return None

        content = await DocumentParser._download(document_url)
        with DocumentParser._temp_file(content, suffix=".pdf") as tmp_path:
            if DocumentParser.is_image_pdf(tmp_path):
                return DocumentParser.pdf_to_images_base64(tmp_path)
            return None

    @staticmethod
    async def _download_via_tos_sdk(document_url: str) -> Optional[bytes]:
        """Download the object directly through the TOS SDK.

        Best-effort: returns None (so the caller can fall back to a pre-signed
        URL) when credentials are missing, the URL yields no usable object key,
        the SDK is not importable, or the download fails.
        """
        try:
            from app.config import settings
            from app.services.oss import parse_file_key_from_url
            import tos as tos_sdk

            if not settings.TOS_ACCESS_KEY_ID or not settings.TOS_SECRET_ACCESS_KEY:
                return None

            file_key = parse_file_key_from_url(document_url)
            # A key equal to the input presumably means the URL could not be
            # parsed into an object key -- verify against parse_file_key_from_url.
            if not file_key or file_key == document_url:
                return None

            region = settings.TOS_REGION
            endpoint = settings.TOS_ENDPOINT or f"tos-cn-{region}.volces.com"

            def _fetch() -> bytes:
                client = tos_sdk.TosClientV2(
                    ak=settings.TOS_ACCESS_KEY_ID,
                    sk=settings.TOS_SECRET_ACCESS_KEY,
                    endpoint=f"https://{endpoint}",
                    region=region,
                )
                resp = client.get_object(bucket=settings.TOS_BUCKET_NAME, key=file_key)
                return resp.read()

            # The SDK call is synchronous; run it in a worker thread so the
            # event loop is not blocked for the duration of the download.
            return await asyncio.to_thread(_fetch)
        except ImportError:
            logger.debug("TOS SDK not importable; falling back to pre-signed URL download")
            return None
        except Exception:
            logger.warning(
                "TOS SDK download failed; falling back to pre-signed URL download",
                exc_info=True,
            )
            return None

    @staticmethod
    async def _download_via_signed_url(document_url: str) -> bytes:
        """Generate a pre-signed URL (valid 300 s) and download it over HTTP.

        Raises:
            httpx.HTTPStatusError: If the response status is an error.
        """
        from app.services.oss import generate_presigned_url, parse_file_key_from_url

        file_key = parse_file_key_from_url(document_url)
        signed_url = generate_presigned_url(file_key, expire_seconds=300)
        async with httpx.AsyncClient(timeout=60.0) as client:
            resp = await client.get(signed_url)
            resp.raise_for_status()
            return resp.content

    @staticmethod
    def parse_file(file_path: str, file_name: str) -> str:
        """Pick a parser from the extension of *file_name* and return plain text.

        Args:
            file_path: Path of the local file to read.
            file_name: Original file name; only its extension is used.

        Returns:
            The extracted plain text.

        Raises:
            ValueError: If the extension is not supported.
        """
        ext = DocumentParser._get_extension(file_name)
        method_name = DocumentParser._PARSER_METHODS.get(ext)
        if method_name is None:
            raise ValueError(f"不支持的文件格式: {ext}")
        return getattr(DocumentParser, method_name)(file_path)

    @staticmethod
    def _parse_pdf(path: str) -> str:
        """Extract PDF text with PyMuPDF, trying pdfplumber when little is found."""
        import fitz

        doc = fitz.open(path)
        try:
            page_texts = [page.get_text() for page in doc]
        finally:
            doc.close()
        result = "\n".join(t.strip() for t in page_texts if t and t.strip())

        # Very short output: give pdfplumber a chance and keep whichever
        # extraction produced more text. The fallback is strictly best-effort.
        if len(result.strip()) < DocumentParser._MIN_PDF_TEXT_CHARS:
            try:
                import pdfplumber

                with pdfplumber.open(path) as pdf:
                    extracted = [page.extract_text() for page in pdf.pages]
                fallback = "\n".join(t for t in extracted if t)
                if len(fallback.strip()) > len(result.strip()):
                    result = fallback
            except ImportError:
                logger.debug("pdfplumber not importable; keeping PyMuPDF output")
            except Exception:
                logger.warning("pdfplumber fallback failed for %s", path, exc_info=True)
        return result

    @staticmethod
    def pdf_to_images_base64(path: str, max_pages: int = 5, dpi: int = 150) -> list[str]:
        """Render the first pages of a PDF to PNG and return them base64-encoded.

        Args:
            path: Path of the local PDF file.
            max_pages: Maximum number of pages to render.
            dpi: Render resolution; the scale factor is ``dpi / 72``.

        Returns:
            One base64 string per rendered page, in page order.
        """
        import base64

        import fitz

        # The scale matrix is identical for every page, so build it once.
        zoom = dpi / 72
        matrix = fitz.Matrix(zoom, zoom)

        images = []
        doc = fitz.open(path)
        try:
            for index, page in enumerate(doc):
                if index >= max_pages:
                    break
                png_bytes = page.get_pixmap(matrix=matrix).tobytes("png")
                images.append(base64.b64encode(png_bytes).decode())
        finally:
            doc.close()
        return images

    @staticmethod
    def is_image_pdf(path: str) -> bool:
        """Return True when the PDF has fewer than 200 non-whitespace characters.

        Such a file is treated as a scanned / image-only PDF.
        """
        import fitz

        threshold = DocumentParser._IMAGE_PDF_MAX_TEXT_CHARS
        char_count = 0
        doc = fitz.open(path)
        try:
            for page in doc:
                # split() drops all whitespace, so this counts visible characters.
                char_count += sum(len(token) for token in page.get_text().split())
                if char_count >= threshold:
                    # Enough text already; no need to read the remaining pages.
                    return False
        finally:
            doc.close()
        return True

    @staticmethod
    def _parse_docx(path: str) -> str:
        """Extract Word text with python-docx: paragraphs first, then tables."""
        from docx import Document

        doc = Document(path)
        texts = [para.text for para in doc.paragraphs if para.text.strip()]

        # Tables are appended after the paragraphs, one tab-separated line per
        # row, skipping empty cells and empty rows.
        for table in doc.tables:
            for row in table.rows:
                row_text = "\t".join(
                    cell.text.strip() for cell in row.cells if cell.text.strip()
                )
                if row_text:
                    texts.append(row_text)
        return "\n".join(texts)

    @staticmethod
    def _parse_xlsx(path: str) -> str:
        """Extract Excel text with openpyxl, concatenating all sheets.

        Each row becomes one tab-separated line; empty cells and blank rows are
        skipped. ``data_only=True`` is passed to ``load_workbook``.
        """
        from openpyxl import load_workbook

        wb = load_workbook(path, read_only=True, data_only=True)
        texts = []
        try:
            for sheet in wb.worksheets:
                for row in sheet.iter_rows(values_only=True):
                    row_text = "\t".join(str(cell) for cell in row if cell is not None)
                    if row_text.strip():
                        texts.append(row_text)
        finally:
            wb.close()
        return "\n".join(texts)

    @staticmethod
    def _parse_txt(path: str) -> str:
        """Read a plain-text file.

        Decodes as UTF-8 (dropping a leading BOM if present) and falls back to
        GB18030 when the bytes are not valid UTF-8.

        Raises:
            UnicodeDecodeError: If the file is valid in neither encoding.
        """
        try:
            with open(path, "r", encoding="utf-8-sig") as f:
                return f.read()
        except UnicodeDecodeError:
            with open(path, "r", encoding="gb18030") as f:
                return f.read()