feat: 平台规则从硬编码改为品牌方上传文档 + AI 解析

- 新增 PlatformRule 模型 (draft/active/inactive 状态流转)
- 新增文档解析服务 (PDF/Word/Excel → 纯文本)
- 新增 4 个 API: 解析/确认/查询/删除平台规则
- 脚本审核优先从 DB 读取 active 规则,硬编码兜底
- 视频审核合并平台规则违禁词到检测列表
- Alembic 迁移 006: platform_rules 表

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Your Name 2026-02-10 13:23:11 +08:00
parent a2f6f82e15
commit fed361b9b3
10 changed files with 790 additions and 34 deletions

View File

@ -0,0 +1,38 @@
"""添加平台规则表
Revision ID: 006
Revises: 005
Create Date: 2026-02-10
"""
from typing import Sequence, Union
from alembic import op
import sqlalchemy as sa
from sqlalchemy.dialects import postgresql
# revision identifiers, used by Alembic.
revision: str = '006'
down_revision: Union[str, None] = '005'
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None
def upgrade() -> None:
    """Create the platform_rules table (brand-uploaded docs + AI-parsed rules)."""
    op.create_table(
        'platform_rules',
        sa.Column('id', sa.String(64), primary_key=True),
        sa.Column('tenant_id', sa.String(64), sa.ForeignKey('tenants.id', ondelete='CASCADE'), nullable=False, index=True),
        sa.Column('brand_id', sa.String(64), nullable=False, index=True),
        sa.Column('platform', sa.String(50), nullable=False, index=True),
        sa.Column('document_url', sa.String(2048), nullable=False),
        sa.Column('document_name', sa.String(512), nullable=False),
        # JSONB on PostgreSQL, generic JSON elsewhere
        sa.Column('parsed_rules', sa.JSON().with_variant(postgresql.JSONB, 'postgresql'), nullable=True),
        # NOTE: use server_default, not `default=` — a client-side `default`
        # has no effect in DDL, so raw SQL inserts would get NULL status.
        sa.Column('status', sa.String(20), nullable=False, server_default='draft', index=True),
        sa.Column('created_at', sa.DateTime(timezone=True), server_default=sa.func.now(), nullable=False),
        sa.Column('updated_at', sa.DateTime(timezone=True), server_default=sa.func.now(), nullable=False),
    )


def downgrade() -> None:
    """Drop the platform_rules table."""
    op.drop_table('platform_rules')

View File

@ -2,8 +2,10 @@
规则管理 API
违禁词库白名单竞品库平台规则
"""
import json
import logging
import uuid
from fastapi import APIRouter, Depends, Header, HTTPException, status
from fastapi import APIRouter, Depends, Header, HTTPException, Query, status
from pydantic import BaseModel, Field
from typing import Optional
from sqlalchemy import select, and_
@ -11,7 +13,19 @@ from sqlalchemy.ext.asyncio import AsyncSession
from app.database import get_db
from app.models.tenant import Tenant
from app.models.rule import ForbiddenWord, WhitelistItem, Competitor
from app.models.rule import ForbiddenWord, WhitelistItem, Competitor, PlatformRule, RuleStatus
from app.schemas.rules import (
PlatformRuleParseRequest,
PlatformRuleParseResponse,
PlatformRuleConfirmRequest,
PlatformRuleResponse as PlatformRuleDBResponse,
PlatformRuleListResponse as PlatformRuleDBListResponse,
ParsedRulesData,
)
from app.services.document_parser import DocumentParser
from app.services.ai_service import AIServiceFactory
logger = logging.getLogger(__name__)
router = APIRouter(prefix="/rules", tags=["rules"])
@ -468,6 +482,289 @@ async def validate_rules(request: RuleValidateRequest) -> RuleValidateResponse:
return RuleValidateResponse(conflicts=conflicts)
# ==================== 品牌方平台规则(文档上传 + AI 解析) ====================
def _format_platform_rule(rule: PlatformRule) -> PlatformRuleDBResponse:
    """Convert a PlatformRule ORM row into its API response schema."""
    # Timestamps are serialized as ISO-8601 strings; empty string when unset.
    created = rule.created_at.isoformat() if rule.created_at else ""
    updated = rule.updated_at.isoformat() if rule.updated_at else ""
    # parsed_rules may be NULL in the DB — fall back to an empty payload.
    payload = rule.parsed_rules or {}
    return PlatformRuleDBResponse(
        id=rule.id,
        platform=rule.platform,
        brand_id=rule.brand_id,
        document_url=rule.document_url,
        document_name=rule.document_name,
        parsed_rules=ParsedRulesData(**payload),
        status=rule.status,
        created_at=created,
        updated_at=updated,
    )
@router.post(
    "/platform-rules/parse",
    response_model=PlatformRuleParseResponse,
    status_code=status.HTTP_201_CREATED,
)
async def parse_platform_rule_document(
    request: PlatformRuleParseRequest,
    x_tenant_id: str = Header(..., alias="X-Tenant-ID"),
    db: AsyncSession = Depends(get_db),
) -> PlatformRuleParseResponse:
    """
    Upload a document and parse platform rules from it with AI.

    Flow:
    1. download the document
    2. extract plain text
    3. AI-parse it into structured rules
    4. persist to DB (status=draft)
    5. return the parse result for the brand to confirm

    Raises:
        HTTPException 400: download/parse failure or empty document.
    """
    await _ensure_tenant_exists(x_tenant_id, db)

    # 1. Download and parse the document. Chain the original exception
    #    (`from e`) so server-side tracebacks keep the root cause.
    try:
        document_text = await DocumentParser.download_and_parse(
            request.document_url, request.document_name,
        )
    except ValueError as e:
        # Known parse errors (e.g. unsupported format) carry a user-facing message.
        raise HTTPException(status_code=400, detail=str(e)) from e
    except Exception as e:
        logger.error("文档解析失败: %s", e)
        raise HTTPException(status_code=400, detail=f"文档下载或解析失败: {e}") from e

    if not document_text.strip():
        raise HTTPException(status_code=400, detail="文档内容为空,无法解析")

    # 2. AI parse (degrades to an empty structure on AI failure).
    parsed_rules = await _ai_parse_platform_rules(x_tenant_id, request.platform, document_text, db)

    # 3. Persist as draft, pending brand confirmation.
    rule_id = f"pr-{uuid.uuid4().hex[:8]}"
    rule = PlatformRule(
        id=rule_id,
        tenant_id=x_tenant_id,
        brand_id=request.brand_id,
        platform=request.platform,
        document_url=request.document_url,
        document_name=request.document_name,
        parsed_rules=parsed_rules,
        status=RuleStatus.DRAFT.value,
    )
    db.add(rule)
    await db.flush()

    return PlatformRuleParseResponse(
        id=rule.id,
        platform=rule.platform,
        brand_id=rule.brand_id,
        document_url=rule.document_url,
        document_name=rule.document_name,
        parsed_rules=ParsedRulesData(**parsed_rules),
        status=rule.status,
    )
@router.put(
    "/platform-rules/{rule_id}/confirm",
    response_model=PlatformRuleDBResponse,
)
async def confirm_platform_rule(
    rule_id: str,
    request: PlatformRuleConfirmRequest,
    x_tenant_id: str = Header(..., alias="X-Tenant-ID"),
    db: AsyncSession = Depends(get_db),
) -> PlatformRuleDBResponse:
    """
    Confirm (and optionally edit) a parsed platform rule.

    Promotes the rule to ``active``; any other ``active`` rule for the same
    (tenant_id, brand_id, platform) is demoted to ``inactive``.
    """
    found = await db.execute(
        select(PlatformRule).where(
            and_(
                PlatformRule.id == rule_id,
                PlatformRule.tenant_id == x_tenant_id,
            )
        )
    )
    target = found.scalar_one_or_none()
    if target is None:
        raise HTTPException(status_code=404, detail=f"规则不存在: {rule_id}")

    # Demote every other active rule under the same (tenant, brand, platform).
    siblings = await db.execute(
        select(PlatformRule).where(
            and_(
                PlatformRule.tenant_id == x_tenant_id,
                PlatformRule.brand_id == target.brand_id,
                PlatformRule.platform == target.platform,
                PlatformRule.status == RuleStatus.ACTIVE.value,
                PlatformRule.id != rule_id,
            )
        )
    )
    for active_rule in siblings.scalars().all():
        active_rule.status = RuleStatus.INACTIVE.value

    # Apply the (possibly brand-edited) rules and activate.
    target.parsed_rules = request.parsed_rules.model_dump()
    target.status = RuleStatus.ACTIVE.value
    await db.flush()
    return _format_platform_rule(target)
@router.get(
    "/platform-rules",
    response_model=PlatformRuleDBListResponse,
)
async def list_brand_platform_rules(
    brand_id: Optional[str] = Query(None),
    platform: Optional[str] = Query(None),
    rule_status: Optional[str] = Query(None, alias="status"),
    x_tenant_id: str = Header(..., alias="X-Tenant-ID"),
    db: AsyncSession = Depends(get_db),
) -> PlatformRuleDBListResponse:
    """List the tenant's platform rules, optionally filtered by brand/platform/status."""
    # Collect filters first, then build a single statement.
    conditions = [PlatformRule.tenant_id == x_tenant_id]
    if brand_id:
        conditions.append(PlatformRule.brand_id == brand_id)
    if platform:
        conditions.append(PlatformRule.platform == platform)
    if rule_status:
        conditions.append(PlatformRule.status == rule_status)

    stmt = (
        select(PlatformRule)
        .where(and_(*conditions))
        .order_by(PlatformRule.created_at.desc())
    )
    found = (await db.execute(stmt)).scalars().all()
    return PlatformRuleDBListResponse(
        items=[_format_platform_rule(r) for r in found],
        total=len(found),
    )
@router.delete(
    "/platform-rules/{rule_id}",
    status_code=status.HTTP_204_NO_CONTENT,
)
async def delete_platform_rule(
    rule_id: str,
    x_tenant_id: str = Header(..., alias="X-Tenant-ID"),
    db: AsyncSession = Depends(get_db),
):
    """Delete a platform rule owned by the calling tenant (404 if absent)."""
    stmt = select(PlatformRule).where(
        and_(
            PlatformRule.id == rule_id,
            PlatformRule.tenant_id == x_tenant_id,
        )
    )
    target = (await db.execute(stmt)).scalar_one_or_none()
    if target is None:
        raise HTTPException(status_code=404, detail=f"规则不存在: {rule_id}")
    await db.delete(target)
    await db.flush()
async def _ai_parse_platform_rules(
    tenant_id: str,
    platform: str,
    document_text: str,
    db: AsyncSession,
) -> dict:
    """
    Parse document text into structured platform rules with AI.

    Returns the parsed-rules dict; on any AI failure (no client, bad JSON,
    call error) falls back to an empty structure so the brand can edit by hand.
    """
    try:
        ai_client = await AIServiceFactory.get_client(tenant_id, db)
        if not ai_client:
            logger.warning(f"租户 {tenant_id} 未配置 AI 服务,返回空规则")
            return _empty_parsed_rules()
        config = await AIServiceFactory.get_config(tenant_id, db)
        if not config:
            return _empty_parsed_rules()
        text_model = config.models.get("text", "gpt-4o")

        # Truncate overly long text to stay within the model's token limit.
        max_chars = 15000
        if len(document_text) > max_chars:
            document_text = document_text[:max_chars] + "\n...(文档内容已截断)"

        prompt = f"""你是平台广告合规规则分析专家。请从以下 {platform} 平台规则文档中提取结构化规则。
文档内容
{document_text}
请以 JSON 格式返回不要包含其他内容
{{
  "forbidden_words": ["违禁词1", "违禁词2"],
  "restricted_words": [{{"word": "xx", "condition": "使用条件", "suggestion": "替换建议"}}],
  "duration": {{"min_seconds": 7, "max_seconds": null}},
  "content_requirements": ["必须展示产品正面", "需要口播品牌名"],
  "other_rules": [{{"rule": "规则名称", "description": "详细说明"}}]
}}
注意
- forbidden_words: 明确禁止使用的词语
- restricted_words: 有条件限制的词语
- duration: 视频时长要求如果文档未提及则为 null
- content_requirements: 内容上的硬性要求
- other_rules: 不属于以上分类的其他规则
- 如果某项没有提取到内容使用空数组或 null"""

        response = await ai_client.chat_completion(
            messages=[{"role": "user", "content": prompt}],
            model=text_model,
            temperature=0.2,
            max_tokens=2000,
        )

        # Strip an optional Markdown code fence around the JSON. The previous
        # split("\n", 1)[1] / rsplit("\n", 1)[0] approach crashed (IndexError)
        # or left the trailing fence in place when the fence had no newline.
        content = response.content.strip()
        if content.startswith("```"):
            first_nl = content.find("\n")
            # Drop the opening fence line (e.g. ```json); if the whole reply
            # is a single line, just drop the three backticks.
            content = content[first_nl + 1:] if first_nl != -1 else content[3:]
        if content.endswith("```"):
            content = content[:-3].rstrip()

        parsed = json.loads(content)
        if not isinstance(parsed, dict):
            # The model returned valid JSON but not an object — degrade safely.
            logger.warning("AI 返回 JSON 非对象降级为空规则")
            return _empty_parsed_rules()

        # Normalize: fill in any missing fields with safe defaults.
        return {
            "forbidden_words": parsed.get("forbidden_words", []),
            "restricted_words": parsed.get("restricted_words", []),
            "duration": parsed.get("duration"),
            "content_requirements": parsed.get("content_requirements", []),
            "other_rules": parsed.get("other_rules", []),
        }
    except json.JSONDecodeError:
        logger.warning("AI 返回内容非 JSON降级为空规则")
        return _empty_parsed_rules()
    except Exception as e:
        logger.error(f"AI 解析平台规则失败: {e}")
        return _empty_parsed_rules()
def _empty_parsed_rules() -> dict:
"""返回空的解析规则结构"""
return {
"forbidden_words": [],
"restricted_words": [],
"duration": None,
"content_requirements": [],
"other_rules": [],
}
# ==================== 辅助函数(供其他模块调用) ====================
async def get_whitelist_for_brand(
@ -533,3 +830,31 @@ async def get_forbidden_words_for_tenant(
}
for w in words
]
async def get_active_platform_rules(
    tenant_id: str,
    brand_id: str,
    platform: str,
    db: AsyncSession,
) -> Optional[dict]:
    """
    Fetch the brand's active platform rules for the given platform.

    Returns:
        The ``parsed_rules`` dict, or None when no active rule exists.
    """
    stmt = (
        select(PlatformRule)
        .where(
            and_(
                PlatformRule.tenant_id == tenant_id,
                PlatformRule.brand_id == brand_id,
                PlatformRule.platform == platform,
                PlatformRule.status == RuleStatus.ACTIVE.value,
            )
        )
        # Defensive: if concurrent confirms ever leave more than one active
        # row, prefer the most recently updated one instead of letting
        # scalar_one_or_none() raise MultipleResultsFound.
        .order_by(PlatformRule.updated_at.desc())
        .limit(1)
    )
    result = await db.execute(stmt)
    rule = result.scalars().first()
    if rule is None:
        return None
    return rule.parsed_rules

View File

@ -15,11 +15,14 @@ from app.schemas.review import (
RiskLevel,
Position,
SoftRiskWarning,
SoftRiskAction,
)
from app.api.rules import (
get_whitelist_for_brand,
get_other_brands_whitelist_terms,
get_forbidden_words_for_tenant,
get_active_platform_rules,
_platform_rules,
)
from app.services.soft_risk import evaluate_soft_risk
from app.services.ai_service import AIServiceFactory
@ -175,19 +178,88 @@ async def review_script(
position=Position(start=content.find(term), end=content.find(term) + len(term)),
))
# 3A. 平台规则违禁词(优先从 DB 读取,硬编码兜底)
already_checked = set(ABSOLUTE_WORDS + [w["word"] for w in tenant_forbidden_words])
platform_forbidden_words: list[str] = []
# 优先从 DB 获取品牌方上传的 active 平台规则
db_platform_rules = await get_active_platform_rules(
x_tenant_id, request.brand_id, request.platform.value, db,
)
if db_platform_rules:
platform_forbidden_words = db_platform_rules.get("forbidden_words", [])
else:
# 兜底:从硬编码 _platform_rules 读取
platform_rule = _platform_rules.get(request.platform.value)
if platform_rule:
for rule in platform_rule.get("rules", []):
if rule.get("type") == "forbidden_word":
platform_forbidden_words.extend(rule.get("words", []))
for word in platform_forbidden_words:
if word in already_checked or word in whitelist:
continue
start = 0
while True:
pos = content.find(word, start)
if pos == -1:
break
if not _is_ad_context(content, word):
start = pos + 1
continue
violations.append(Violation(
type=ViolationType.FORBIDDEN_WORD,
content=word,
severity=RiskLevel.MEDIUM,
suggestion=f"违反{request.platform.value}平台规则,建议删除:{word}",
position=Position(start=pos, end=pos + len(word)),
))
start = pos + 1
# 3B. Brief 黑名单词
if request.blacklist_words:
for item in request.blacklist_words:
word = item.get("word", "")
reason = item.get("reason", "")
if not word or word in whitelist:
continue
start_pos = 0
while True:
pos = content.find(word, start_pos)
if pos == -1:
break
suggestion = f"Brief 黑名单词:{word}"
if reason:
suggestion += f"{reason}"
violations.append(Violation(
type=ViolationType.FORBIDDEN_WORD,
content=word,
severity=RiskLevel.HIGH,
suggestion=suggestion,
position=Position(start=pos, end=pos + len(word)),
))
start_pos = pos + 1
# 4. 检查遗漏卖点
missing_points: list[str] | None = None
if request.required_points:
missing = _check_selling_point_coverage(content, request.required_points)
missing_points = missing if missing else []
# 5. 可选AI 深度分析
ai_violations = await _ai_deep_analysis(x_tenant_id, content, db)
# 5. 可选AI 深度分析(返回 violations + warnings
ai_violations, ai_warnings = await _ai_deep_analysis(x_tenant_id, content, db)
if ai_violations:
violations.extend(ai_violations)
# 6. 计算分数
score = 100 - len(violations) * 25
# 6. 计算分数(按严重程度加权)
score = 100
for v in violations:
if v.severity == RiskLevel.HIGH:
score -= 25
elif v.severity == RiskLevel.MEDIUM:
score -= 15
else:
score -= 5
if missing_points:
score -= len(missing_points) * 5
score = max(0, score)
@ -209,6 +281,19 @@ async def review_script(
if request.soft_risk_context:
soft_warnings = evaluate_soft_risk(request.soft_risk_context)
# 合并 AI 产出的 soft_warnings
if ai_warnings:
soft_warnings.extend(ai_warnings)
# 遗漏卖点也加入 soft_warnings
if missing_points:
soft_warnings.append(SoftRiskWarning(
code="missing_selling_points",
message=f"遗漏 {len(missing_points)} 个卖点:{', '.join(missing_points)}",
action_required=SoftRiskAction.NOTE,
blocking=False,
))
return ScriptReviewResponse(
score=score,
summary=summary,
@ -222,26 +307,27 @@ async def _ai_deep_analysis(
tenant_id: str,
content: str,
db: AsyncSession,
) -> list[Violation]:
) -> tuple[list[Violation], list[SoftRiskWarning]]:
"""
使用 AI 进行深度分析
返回 (violations, soft_warnings)
AI 分析失败时返回空列表降级到规则检测
"""
try:
# 获取 AI 客户端
ai_client = await AIServiceFactory.get_client(tenant_id, db)
if not ai_client:
return []
return [], []
# 获取模型配置
config = await AIServiceFactory.get_config(tenant_id, db)
if not config:
return []
return [], []
text_model = config.models.get("text", "gpt-4o")
# 构建分析提示
# 构建分析提示(两类输出)
analysis_prompt = f"""作为广告合规审核专家,请分析以下广告脚本内容,检测潜在的合规风险:
脚本内容
@ -253,12 +339,17 @@ async def _ai_deep_analysis(
3. 是否存在夸大描述
4. 是否存在可能违反广告法的其他内容
如果发现问题请以 JSON 数组格式返回每项包含
请以 JSON 数组返回每项包含
- category: "violation"硬性违规明确违法/违规 "warning"软性提醒需人工判断
- type: 违规类型 (forbidden_word/efficacy_claim/brand_safety)
- content: 违规内容
- content: 问题内容
- severity: 严重程度 (high/medium/low)
- suggestion: 修改建议
分类标准
- violation: 违禁词功效宣称品牌安全等明确违规
- warning: 夸大描述易误解表述潜在风险
如果未发现问题返回空数组 []
请只返回 JSON 数组不要包含其他内容"""
@ -283,7 +374,10 @@ async def _ai_deep_analysis(
ai_results = json.loads(response_content)
violations = []
warnings = []
for item in ai_results:
category = item.get("category", "violation") # 默认当硬性违规(安全兜底)
violation_type = item.get("type", "forbidden_word")
if violation_type == "forbidden_word":
vtype = ViolationType.FORBIDDEN_WORD
@ -300,19 +394,28 @@ async def _ai_deep_analysis(
else:
slevel = RiskLevel.MEDIUM
violations.append(Violation(
type=vtype,
content=item.get("content", ""),
severity=slevel,
suggestion=item.get("suggestion", "建议修改"),
))
if category == "warning":
# 软性提醒 → SoftRiskWarning
warnings.append(SoftRiskWarning(
code="ai_warning",
message=f"{item.get('content', '')}: {item.get('suggestion', '建议修改')}",
action_required=SoftRiskAction.NOTE,
blocking=False,
context={"type": violation_type, "severity": severity},
))
else:
# 硬性违规 → Violation
violations.append(Violation(
type=vtype,
content=item.get("content", ""),
severity=slevel,
suggestion=item.get("suggestion", "建议修改"),
))
return violations
return violations, warnings
except json.JSONDecodeError:
# JSON 解析失败,返回空列表
return []
return [], []
except Exception:
# AI 调用失败,降级到规则检测
return []
return [], []

View File

@ -10,7 +10,7 @@ from app.models.task import Task, TaskStage, TaskStatus
from app.models.brief import Brief
from app.models.ai_config import AIConfig
from app.models.review import ReviewTask, Platform
from app.models.rule import ForbiddenWord, WhitelistItem, Competitor
from app.models.rule import ForbiddenWord, WhitelistItem, Competitor, PlatformRule, RuleStatus
from app.models.audit_log import AuditLog
from app.models.message import Message
# 保留 Tenant 兼容旧代码,但新代码应使用 Brand
@ -44,6 +44,8 @@ __all__ = [
"ForbiddenWord",
"WhitelistItem",
"Competitor",
"PlatformRule",
"RuleStatus",
# 审计日志
"AuditLog",
# 消息

View File

@ -1,7 +1,8 @@
"""
规则模型
违禁词白名单竞品
违禁词白名单竞品平台规则
"""
import enum
from typing import TYPE_CHECKING, Optional
from sqlalchemy import String, Text, ForeignKey
from app.models.types import JSONType
@ -13,6 +14,13 @@ if TYPE_CHECKING:
from app.models.tenant import Tenant
class RuleStatus(str, enum.Enum):
    """Lifecycle status of a brand-uploaded platform rule (draft → active → inactive)."""
    DRAFT = "draft"        # AI parsing finished, awaiting brand confirmation
    ACTIVE = "active"      # confirmed by the brand, currently enforced
    INACTIVE = "inactive"  # superseded or disabled
class ForbiddenWord(Base, TimestampMixin):
"""违禁词表"""
__tablename__ = "forbidden_words"
@ -83,3 +91,36 @@ class Competitor(Base, TimestampMixin):
def __repr__(self) -> str:
return f"<Competitor(name={self.name}, brand_id={self.brand_id})>"
class PlatformRule(Base, TimestampMixin):
    """Platform rules table — brand uploads a document, AI parses it into rules."""
    __tablename__ = "platform_rules"
    id: Mapped[str] = mapped_column(String(64), primary_key=True)
    # Owning tenant; rows are removed when the tenant is deleted (CASCADE).
    tenant_id: Mapped[str] = mapped_column(
        String(64),
        ForeignKey("tenants.id", ondelete="CASCADE"),
        nullable=False,
        index=True,
    )
    brand_id: Mapped[str] = mapped_column(String(64), nullable=False, index=True)
    platform: Mapped[str] = mapped_column(String(50), nullable=False, index=True)
    # Source document (URL in object storage + original file name)
    document_url: Mapped[str] = mapped_column(String(2048), nullable=False)
    document_name: Mapped[str] = mapped_column(String(512), nullable=False)
    # AI parse result (JSON; shape mirrors the ParsedRulesData schema)
    parsed_rules: Mapped[Optional[dict]] = mapped_column(JSONType, nullable=True)
    # Lifecycle status (see RuleStatus); new rows start as draft
    status: Mapped[str] = mapped_column(
        String(20), nullable=False, default=RuleStatus.DRAFT.value, index=True,
    )
    # Relationships
    tenant: Mapped["Tenant"] = relationship("Tenant", back_populates="platform_rules")
    def __repr__(self) -> str:
        return f"<PlatformRule(id={self.id}, platform={self.platform}, status={self.status})>"

View File

@ -10,7 +10,7 @@ from app.models.base import Base, TimestampMixin
if TYPE_CHECKING:
from app.models.ai_config import AIConfig
from app.models.review import ReviewTask
from app.models.rule import ForbiddenWord, WhitelistItem, Competitor
from app.models.rule import ForbiddenWord, WhitelistItem, Competitor, PlatformRule
class Tenant(Base, TimestampMixin):
@ -48,5 +48,11 @@ class Tenant(Base, TimestampMixin):
back_populates="tenant",
lazy="selectin",
)
platform_rules: Mapped[list["PlatformRule"]] = relationship(
"PlatformRule",
back_populates="tenant",
lazy="selectin",
)
def __repr__(self) -> str:
return f"<Tenant(id={self.id}, name={self.name})>"

View File

@ -0,0 +1,69 @@
"""
平台规则相关 Schema
"""
from typing import Optional
from pydantic import BaseModel, Field
class PlatformRuleParseRequest(BaseModel):
    """Request body: upload a rule document and have it parsed by AI."""
    document_url: str = Field(..., description="TOS 上传后的文件 URL")
    document_name: str = Field(..., description="原始文件名(用于判断格式)")
    platform: str = Field(..., description="目标平台 (douyin/xiaohongshu/bilibili/kuaishou)")
    brand_id: str = Field(..., description="品牌 ID")
class ParsedRulesData(BaseModel):
    """Structured rules extracted by AI from an uploaded platform document."""
    forbidden_words: list[str] = Field(default_factory=list, description="违禁词列表")
    restricted_words: list[dict] = Field(
        default_factory=list,
        description="限制词 [{word, condition, suggestion}]",
    )
    duration: Optional[dict] = Field(
        None,
        description="时长要求 {min_seconds, max_seconds}",
    )
    content_requirements: list[str] = Field(
        default_factory=list,
        description="内容要求(如'必须展示产品'",
    )
    other_rules: list[dict] = Field(
        default_factory=list,
        description="其他规则 [{rule, description}]",
    )
class PlatformRuleParseResponse(BaseModel):
    """Parse result returned right after upload (rule is in draft status)."""
    id: str
    platform: str
    brand_id: str
    document_url: str
    document_name: str
    parsed_rules: ParsedRulesData
    status: str
class PlatformRuleConfirmRequest(BaseModel):
    """Request body: confirm (and optionally edit) the parsed rules."""
    parsed_rules: ParsedRulesData = Field(..., description="品牌方可能修改过的规则")
class PlatformRuleResponse(BaseModel):
    """Full platform-rule representation, including timestamps."""
    id: str
    platform: str
    brand_id: str
    document_url: str
    document_name: str
    parsed_rules: ParsedRulesData
    status: str
    # ISO-8601 strings ("" when the timestamp is unset)
    created_at: str
    updated_at: str
class PlatformRuleListResponse(BaseModel):
    """Paged-less list response for platform rules."""
    items: list[PlatformRuleResponse]
    total: int

View File

@ -0,0 +1,119 @@
"""
文档解析服务
PDF/Word/Excel 文档中提取纯文本
"""
import os
import tempfile
from typing import Optional
import httpx
class DocumentParser:
    """Extract plain text from uploaded rule documents (PDF/Word/Excel/TXT)."""

    @staticmethod
    async def download_and_parse(document_url: str, document_name: str) -> str:
        """
        Download a document and extract its plain text.

        Args:
            document_url: document URL (object storage / TOS)
            document_name: original file name, used to pick the parser

        Returns:
            The extracted plain text.

        Raises:
            ValueError: unsupported file format.
            httpx.HTTPStatusError: download failed.
        """
        tmp_path: Optional[str] = None
        try:
            async with httpx.AsyncClient(timeout=60.0) as client:
                resp = await client.get(document_url)
                resp.raise_for_status()

            ext = document_name.rsplit(".", 1)[-1].lower() if "." in document_name else ""
            with tempfile.NamedTemporaryFile(delete=False, suffix=f".{ext}") as tmp:
                tmp.write(resp.content)
                tmp_path = tmp.name

            return DocumentParser.parse_file(tmp_path, document_name)
        finally:
            # Always clean up the temp file, even when parsing raises.
            if tmp_path and os.path.exists(tmp_path):
                os.unlink(tmp_path)

    @staticmethod
    def parse_file(file_path: str, file_name: str) -> str:
        """
        Dispatch to a parser by file extension and return the plain text.

        Args:
            file_path: local file path
            file_name: original file name (determines the format)

        Raises:
            ValueError: unsupported or legacy binary formats. Legacy .doc/.xls
                are rejected with a clear message — python-docx and openpyxl
                only understand the OOXML .docx/.xlsx containers, so handing
                them a legacy file would fail with an opaque library error.
        """
        ext = file_name.rsplit(".", 1)[-1].lower() if "." in file_name else ""
        if ext == "pdf":
            return DocumentParser._parse_pdf(file_path)
        if ext == "docx":
            return DocumentParser._parse_docx(file_path)
        if ext == "doc":
            raise ValueError("不支持旧版 .doc 格式,请另存为 .docx 后重新上传")
        if ext == "xlsx":
            return DocumentParser._parse_xlsx(file_path)
        if ext == "xls":
            raise ValueError("不支持旧版 .xls 格式,请另存为 .xlsx 后重新上传")
        if ext == "txt":
            return DocumentParser._parse_txt(file_path)
        raise ValueError(f"不支持的文件格式: {ext}")

    @staticmethod
    def _parse_pdf(path: str) -> str:
        """Extract PDF text with pdfplumber (pages joined by newlines)."""
        import pdfplumber
        texts = []
        with pdfplumber.open(path) as pdf:
            for page in pdf.pages:
                text = page.extract_text()
                if text:
                    texts.append(text)
        return "\n".join(texts)

    @staticmethod
    def _parse_docx(path: str) -> str:
        """Extract Word (.docx) text with python-docx, including table cells."""
        from docx import Document
        doc = Document(path)
        texts = []
        for para in doc.paragraphs:
            if para.text.strip():
                texts.append(para.text)
        # Tables are not part of doc.paragraphs — extract them separately.
        for table in doc.tables:
            for row in table.rows:
                row_text = "\t".join(cell.text.strip() for cell in row.cells if cell.text.strip())
                if row_text:
                    texts.append(row_text)
        return "\n".join(texts)

    @staticmethod
    def _parse_xlsx(path: str) -> str:
        """Extract Excel (.xlsx) text with openpyxl (all sheets concatenated)."""
        from openpyxl import load_workbook
        wb = load_workbook(path, read_only=True, data_only=True)
        texts = []
        for sheet in wb.worksheets:
            for row in sheet.iter_rows(values_only=True):
                row_text = "\t".join(str(cell) for cell in row if cell is not None)
                if row_text.strip():
                    texts.append(row_text)
        wb.close()
        return "\n".join(texts)

    @staticmethod
    def _parse_txt(path: str) -> str:
        """Read a plain-text file; try UTF-8 first, fall back to GBK.

        Brand-side rule documents are often exported from Chinese Office
        tooling, where legacy .txt files are GBK-encoded rather than UTF-8.
        """
        try:
            with open(path, "r", encoding="utf-8") as f:
                return f.read()
        except UnicodeDecodeError:
            # errors="replace" so a mixed/corrupt file still yields usable text.
            with open(path, "r", encoding="gbk", errors="replace") as f:
                return f.read()

View File

@ -14,7 +14,7 @@ from sqlalchemy.orm import sessionmaker
from app.config import settings
from app.models.review import ReviewTask, TaskStatus as DBTaskStatus
from app.models.rule import ForbiddenWord, Competitor
from app.models.rule import ForbiddenWord, Competitor, PlatformRule, RuleStatus
from app.models.ai_config import AIConfig
from app.services.video_download import VideoDownloadService, DownloadResult
from app.services.keyframe import KeyFrameExtractor, ExtractionResult
@ -81,6 +81,7 @@ async def complete_review(
summary: str,
violations: list[dict],
status: DBTaskStatus = DBTaskStatus.COMPLETED,
soft_warnings: Optional[list[dict]] = None,
):
"""完成审核"""
result = await db.execute(
@ -94,6 +95,8 @@ async def complete_review(
task.score = score
task.summary = summary
task.violations = violations
if soft_warnings is not None:
task.soft_warnings = soft_warnings
task.completed_at = datetime.now(timezone.utc)
await db.commit()
@ -153,6 +156,24 @@ async def get_competitors(db: AsyncSession, tenant_id: str, brand_id: str) -> li
return [row[0] for row in result.fetchall()]
async def get_platform_forbidden_words(
    db: AsyncSession, tenant_id: str, brand_id: str, platform: str,
) -> list[str]:
    """Return forbidden words from the brand's active platform rule in the DB.

    Returns an empty list when no active rule exists or when the rule
    carries no forbidden words.
    """
    stmt = (
        select(PlatformRule)
        .where(
            PlatformRule.tenant_id == tenant_id,
            PlatformRule.brand_id == brand_id,
            PlatformRule.platform == platform,
            PlatformRule.status == RuleStatus.ACTIVE.value,
        )
        # Tolerate duplicate active rows (e.g. from concurrent confirms):
        # take the most recently updated instead of raising MultipleResultsFound.
        .order_by(PlatformRule.updated_at.desc())
        .limit(1)
    )
    result = await db.execute(stmt)
    rule = result.scalars().first()
    if rule is None or not rule.parsed_rules:
        return []
    # `or []` guards against an explicit null stored under the key.
    return rule.parsed_rules.get("forbidden_words", []) or []
async def process_video_review(
review_id: str,
tenant_id: str,
@ -199,6 +220,13 @@ async def process_video_review(
# 获取规则
forbidden_words = await get_forbidden_words(db, tenant_id)
# 合并平台规则中的违禁词
platform_fw = await get_platform_forbidden_words(db, tenant_id, brand_id, platform)
existing_set = set(forbidden_words)
for w in platform_fw:
if w not in existing_set:
forbidden_words.append(w)
existing_set.add(w)
competitors = await get_competitors(db, tenant_id, brand_id)
# 初始化 AI 服务
@ -281,16 +309,37 @@ async def process_video_review(
)
all_violations.extend(subtitle_violations)
# 6. 计算分数和生成报告
# 6. 分流 violations / soft_warnings
await update_review_progress(db, review_id, 90, "生成报告")
score = review_service.calculate_score(all_violations)
if not all_violations:
hard_violations = []
soft_warnings_data = []
for v in all_violations:
v_type = v.get("type", "")
if v_type in ("forbidden_word", "efficacy_claim", "competitor_logo", "brand_safety"):
hard_violations.append(v)
elif v_type in ("duration_short", "mention_missing"):
soft_warnings_data.append({
"code": f"video_{v_type}",
"message": v.get("content", ""),
"action_required": "note",
"blocking": False,
"context": {"suggestion": v.get("suggestion", "")},
})
else:
hard_violations.append(v) # 默认当硬性违规
# 计算分数(仅硬性违规影响分数)
score = review_service.calculate_score(hard_violations)
if not hard_violations:
summary = "视频内容合规,未发现违规项"
if soft_warnings_data:
summary += f"{len(soft_warnings_data)} 条提醒)"
else:
high_count = sum(1 for v in all_violations if v.get("risk_level") == "high")
medium_count = sum(1 for v in all_violations if v.get("risk_level") == "medium")
summary = f"发现 {len(all_violations)} 处违规"
high_count = sum(1 for v in hard_violations if v.get("risk_level") == "high")
summary = f"发现 {len(hard_violations)} 处违规"
if high_count > 0:
summary += f"{high_count} 处高风险)"
@ -300,7 +349,8 @@ async def process_video_review(
review_id,
score=score,
summary=summary,
violations=all_violations,
violations=hard_violations,
soft_warnings=soft_warnings_data if soft_warnings_data else None,
)
except Exception as e:

View File

@ -21,6 +21,9 @@ dependencies = [
"openai>=1.12.0",
"cachetools>=5.3.0",
"sse-starlette>=2.0.0",
"pdfplumber>=0.10.0",
"python-docx>=1.1.0",
"openpyxl>=3.1.0",
]
[project.optional-dependencies]