- 新增 PlatformRule 模型 (draft/active/inactive 状态流转) - 新增文档解析服务 (PDF/Word/Excel → 纯文本) - 新增 4 个 API: 解析/确认/查询/删除平台规则 - 脚本审核优先从 DB 读取 active 规则,硬编码兜底 - 视频审核合并平台规则违禁词到检测列表 - Alembic 迁移 006: platform_rules 表 Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
422 lines
14 KiB
Python
422 lines
14 KiB
Python
"""
|
||
脚本预审 API
|
||
"""
|
||
import re
|
||
from typing import Optional
|
||
from fastapi import APIRouter, Depends, Header
|
||
from sqlalchemy.ext.asyncio import AsyncSession
|
||
|
||
from app.database import get_db
|
||
from app.schemas.review import (
|
||
ScriptReviewRequest,
|
||
ScriptReviewResponse,
|
||
Violation,
|
||
ViolationType,
|
||
RiskLevel,
|
||
Position,
|
||
SoftRiskWarning,
|
||
SoftRiskAction,
|
||
)
|
||
from app.api.rules import (
|
||
get_whitelist_for_brand,
|
||
get_other_brands_whitelist_terms,
|
||
get_forbidden_words_for_tenant,
|
||
get_active_platform_rules,
|
||
_platform_rules,
|
||
)
|
||
from app.services.soft_risk import evaluate_soft_risk
|
||
from app.services.ai_service import AIServiceFactory
|
||
|
||
router = APIRouter(prefix="/scripts", tags=["scripts"])
|
||
|
||
# 内置违禁词库(广告极限词)
|
||
ABSOLUTE_WORDS = ["最好", "第一", "最佳", "绝对", "100%"]
|
||
|
||
# 功效词库(医疗/功效宣称)
|
||
EFFICACY_WORDS = ["根治", "治愈", "治疗", "药效", "疗效", "特效"]
|
||
|
||
# 广告语境关键词(用于判断是否为广告场景)
|
||
AD_CONTEXT_KEYWORDS = ["产品", "购买", "销量", "品质", "推荐", "价格", "优惠", "促销"]
|
||
|
||
|
||
def _is_ad_context(content: str, word: str) -> bool:
|
||
"""
|
||
判断是否为广告语境
|
||
|
||
规则:
|
||
- 如果内容中包含广告关键词,认为是广告语境
|
||
- 如果违禁词出现在明显的非广告句式中,不是广告语境
|
||
"""
|
||
# 非广告语境模式
|
||
non_ad_patterns = [
|
||
r"他是第一[个名位]", # 他是第一个/名
|
||
r"[是为]第一[个名位]", # 是第一个
|
||
r"最开心|最高兴|最难忘", # 情感表达
|
||
r"第一[次个].*[到来抵达]", # 第一次到达
|
||
]
|
||
|
||
for pattern in non_ad_patterns:
|
||
if re.search(pattern, content):
|
||
return False
|
||
|
||
# 检查是否包含广告关键词
|
||
return any(kw in content for kw in AD_CONTEXT_KEYWORDS)
|
||
|
||
|
||
def _check_selling_point_coverage(content: str, required_points: list[str]) -> list[str]:
|
||
"""
|
||
检查卖点覆盖情况
|
||
|
||
使用语义匹配而非精确匹配
|
||
"""
|
||
missing = []
|
||
|
||
# 卖点关键词映射
|
||
point_keywords = {
|
||
"品牌名称": ["品牌", "牌子", "品牌A", "品牌B"],
|
||
"使用方法": ["使用", "用法", "早晚", "每天", "一次", "涂抹", "喷洒"],
|
||
"功效说明": ["功效", "效果", "水润", "美白", "保湿", "滋润", "改善"],
|
||
}
|
||
|
||
for point in required_points:
|
||
# 精确匹配
|
||
if point in content:
|
||
continue
|
||
|
||
# 关键词匹配
|
||
keywords = point_keywords.get(point, [])
|
||
if any(kw in content for kw in keywords):
|
||
continue
|
||
|
||
missing.append(point)
|
||
|
||
return missing
|
||
|
||
|
||
def _scan_word_occurrences(
    content: str,
    word: str,
    *,
    vtype: ViolationType,
    severity: RiskLevel,
    suggestion: str,
    ad_context_only: bool = False,
) -> list[Violation]:
    """Collect one Violation per occurrence of *word* in *content*.

    When *ad_context_only* is true, the word is skipped entirely outside an
    advertising context. `_is_ad_context` does not depend on the occurrence
    position, so the check is hoisted out of the scan loop (the original
    re-evaluated it on every occurrence with the same result).
    """
    if ad_context_only and not _is_ad_context(content, word):
        return []
    found: list[Violation] = []
    start = 0
    while True:
        pos = content.find(word, start)
        if pos == -1:
            return found
        found.append(Violation(
            type=vtype,
            content=word,
            severity=severity,
            suggestion=suggestion,
            position=Position(start=pos, end=pos + len(word)),
        ))
        # Overlapping matches are intentionally allowed (advance by 1).
        start = pos + 1


@router.post("/review", response_model=ScriptReviewResponse)
async def review_script(
    request: ScriptReviewRequest,
    x_tenant_id: str = Header(..., alias="X-Tenant-ID"),
    db: AsyncSession = Depends(get_db),
) -> ScriptReviewResponse:
    """Script pre-review endpoint.

    - detects forbidden words (context-aware for advertising scenarios)
    - detects efficacy-claim words
    - checks required selling-point coverage
    - applies the brand whitelist
    - optional AI deep analysis
    - returns a compliance score plus fix suggestions
    """
    violations: list[Violation] = []
    content = request.content

    # Brand whitelist: whitelisted terms are never flagged.
    whitelist = await get_whitelist_for_brand(x_tenant_id, request.brand_id, db)

    # Tenant-defined forbidden words (list of {"word": ...} dicts).
    tenant_forbidden_words = await get_forbidden_words_for_tenant(x_tenant_id, db)

    # 1. Forbidden-word detection (superlative ad terms), context-aware.
    all_forbidden_words = ABSOLUTE_WORDS + [w["word"] for w in tenant_forbidden_words]
    for word in all_forbidden_words:
        if word in whitelist:
            continue
        violations.extend(_scan_word_occurrences(
            content, word,
            vtype=ViolationType.FORBIDDEN_WORD,
            severity=RiskLevel.HIGH,
            suggestion=f"建议删除或替换违禁词:{word}",
            ad_context_only=True,
        ))

    # 2. Efficacy-claim detection (always flagged — no context gating).
    for word in EFFICACY_WORDS:
        if word in whitelist:
            continue
        violations.extend(_scan_word_occurrences(
            content, word,
            vtype=ViolationType.EFFICACY_CLAIM,
            severity=RiskLevel.HIGH,
            suggestion=f"功效宣称词违反广告法,建议删除:{word}",
        ))

    # 3. Terms owned by other brands (brand-safety risk). Only the first
    #    occurrence is reported, matching existing behavior; the position is
    #    now computed with a single find() instead of two scans.
    other_brand_terms = await get_other_brands_whitelist_terms(x_tenant_id, request.brand_id, db)
    for term, _owner_brand in other_brand_terms:
        pos = content.find(term)
        if pos != -1:
            violations.append(Violation(
                type=ViolationType.BRAND_SAFETY,
                content=term,
                severity=RiskLevel.MEDIUM,
                suggestion=f"使用了其他品牌的专属词汇:{term}",
                position=Position(start=pos, end=pos + len(term)),
            ))

    # 3A. Platform-rule forbidden words (DB-backed rules first, hard-coded
    #     fallback). Words already covered by sections above are skipped.
    already_checked = set(all_forbidden_words)
    platform_forbidden_words: list[str] = []

    # Prefer the brand's uploaded *active* platform rules from the DB.
    db_platform_rules = await get_active_platform_rules(
        x_tenant_id, request.brand_id, request.platform.value, db,
    )
    if db_platform_rules:
        platform_forbidden_words = db_platform_rules.get("forbidden_words", [])
    else:
        # Fallback: hard-coded _platform_rules table.
        platform_rule = _platform_rules.get(request.platform.value)
        if platform_rule:
            for rule in platform_rule.get("rules", []):
                if rule.get("type") == "forbidden_word":
                    platform_forbidden_words.extend(rule.get("words", []))

    for word in platform_forbidden_words:
        if word in already_checked or word in whitelist:
            continue
        violations.extend(_scan_word_occurrences(
            content, word,
            vtype=ViolationType.FORBIDDEN_WORD,
            severity=RiskLevel.MEDIUM,
            suggestion=f"违反{request.platform.value}平台规则,建议删除:{word}",
            ad_context_only=True,
        ))

    # 3B. Brief blacklist words (per-request, with optional reason).
    if request.blacklist_words:
        for item in request.blacklist_words:
            word = item.get("word", "")
            reason = item.get("reason", "")
            if not word or word in whitelist:
                continue
            suggestion = f"Brief 黑名单词:{word}"
            if reason:
                suggestion += f"({reason})"
            violations.extend(_scan_word_occurrences(
                content, word,
                vtype=ViolationType.FORBIDDEN_WORD,
                severity=RiskLevel.HIGH,
                suggestion=suggestion,
            ))

    # 4. Required selling-point coverage (None when no points requested).
    missing_points: list[str] | None = None
    if request.required_points:
        missing_points = _check_selling_point_coverage(content, request.required_points)

    # 5. Optional AI deep analysis (returns violations + soft warnings;
    #    degrades to empty lists on any failure).
    ai_violations, ai_warnings = await _ai_deep_analysis(x_tenant_id, content, db)
    if ai_violations:
        violations.extend(ai_violations)

    # 6. Score, weighted by severity; floored at 0.
    score = 100
    for v in violations:
        if v.severity == RiskLevel.HIGH:
            score -= 25
        elif v.severity == RiskLevel.MEDIUM:
            score -= 15
        else:
            score -= 5
    if missing_points:
        score -= len(missing_points) * 5
    score = max(0, score)

    # 7. Human-readable summary.
    parts = []
    if violations:
        parts.append(f"发现 {len(violations)} 处违规")
    if missing_points:
        parts.append(f"遗漏 {len(missing_points)} 个卖点")
    summary = ",".join(parts) if parts else "脚本内容合规,未发现问题"

    # 8. Soft-risk evaluation (non-blocking warnings).
    soft_warnings: list[SoftRiskWarning] = []
    if request.soft_risk_context:
        soft_warnings = evaluate_soft_risk(request.soft_risk_context)

    # Merge AI-produced soft warnings.
    if ai_warnings:
        soft_warnings.extend(ai_warnings)

    # Missing selling points are also surfaced as a soft warning.
    if missing_points:
        soft_warnings.append(SoftRiskWarning(
            code="missing_selling_points",
            message=f"遗漏 {len(missing_points)} 个卖点:{', '.join(missing_points)}",
            action_required=SoftRiskAction.NOTE,
            blocking=False,
        ))

    return ScriptReviewResponse(
        score=score,
        summary=summary,
        violations=violations,
        missing_points=missing_points,
        soft_warnings=soft_warnings,
    )
|
||
|
||
|
||
async def _ai_deep_analysis(
    tenant_id: str,
    content: str,
    db: AsyncSession,
) -> tuple[list[Violation], list[SoftRiskWarning]]:
    """Run AI-based deep compliance analysis of a script.

    Returns a ``(violations, soft_warnings)`` pair parsed from the model's
    JSON answer. On any failure (no client/config for the tenant, request
    error, unparseable response) it returns two empty lists so the caller
    degrades gracefully to rule-based detection only.
    """
    try:
        # Obtain the tenant-scoped AI client; no client means AI analysis
        # is not configured for this tenant.
        ai_client = await AIServiceFactory.get_client(tenant_id, db)
        if not ai_client:
            return [], []

        # Obtain the model configuration (which model names to use).
        config = await AIServiceFactory.get_config(tenant_id, db)
        if not config:
            return [], []

        text_model = config.models.get("text", "gpt-4o")

        # Build the analysis prompt. The model is asked for a JSON array with
        # two categories of findings: "violation" (hard) and "warning" (soft).
        analysis_prompt = f"""作为广告合规审核专家,请分析以下广告脚本内容,检测潜在的合规风险:

脚本内容:
{content}

请检查以下方面:
1. 是否存在隐性的虚假宣传(如暗示疗效但不直接说明)
2. 是否存在容易引起误解的表述
3. 是否存在夸大描述
4. 是否存在可能违反广告法的其他内容

请以 JSON 数组返回,每项包含:
- category: "violation"(硬性违规,明确违法/违规)或 "warning"(软性提醒,需人工判断)
- type: 违规类型 (forbidden_word/efficacy_claim/brand_safety)
- content: 问题内容
- severity: 严重程度 (high/medium/low)
- suggestion: 修改建议

分类标准:
- violation: 违禁词、功效宣称、品牌安全等明确违规
- warning: 夸大描述、易误解表述、潜在风险

如果未发现问题,返回空数组 []

请只返回 JSON 数组,不要包含其他内容。"""

        response = await ai_client.chat_completion(
            messages=[{"role": "user", "content": analysis_prompt}],
            model=text_model,
            temperature=0.3,
            max_tokens=1000,
        )

        # Parse the AI response as JSON.
        import json
        try:
            # Strip a possible markdown code fence around the JSON payload.
            # NOTE(review): a single-line fenced response would raise
            # IndexError here and fall through to the outer handler.
            response_content = response.content.strip()
            if response_content.startswith("```"):
                response_content = response_content.split("\n", 1)[1]
            if response_content.endswith("```"):
                response_content = response_content.rsplit("\n", 1)[0]

            ai_results = json.loads(response_content)

            violations: list[Violation] = []
            warnings: list[SoftRiskWarning] = []
            for item in ai_results:
                category = item.get("category", "violation")  # default to hard violation (safe fallback)

                # Map the free-form type string onto the ViolationType enum;
                # anything unrecognized is treated as a brand-safety issue.
                violation_type = item.get("type", "forbidden_word")
                if violation_type == "forbidden_word":
                    vtype = ViolationType.FORBIDDEN_WORD
                elif violation_type == "efficacy_claim":
                    vtype = ViolationType.EFFICACY_CLAIM
                else:
                    vtype = ViolationType.BRAND_SAFETY

                # Map severity; unrecognized values default to MEDIUM.
                severity = item.get("severity", "medium")
                if severity == "high":
                    slevel = RiskLevel.HIGH
                elif severity == "low":
                    slevel = RiskLevel.LOW
                else:
                    slevel = RiskLevel.MEDIUM

                if category == "warning":
                    # Soft reminder -> SoftRiskWarning (non-blocking note).
                    warnings.append(SoftRiskWarning(
                        code="ai_warning",
                        message=f"{item.get('content', '')}: {item.get('suggestion', '建议修改')}",
                        action_required=SoftRiskAction.NOTE,
                        blocking=False,
                        context={"type": violation_type, "severity": severity},
                    ))
                else:
                    # Hard finding -> Violation (no position: the AI does not
                    # report character offsets).
                    violations.append(Violation(
                        type=vtype,
                        content=item.get("content", ""),
                        severity=slevel,
                        suggestion=item.get("suggestion", "建议修改"),
                    ))

            return violations, warnings

        except json.JSONDecodeError:
            # Model did not return valid JSON — degrade silently.
            return [], []

    except Exception:
        # Deliberate catch-all: AI analysis is best-effort; any failure
        # degrades to rule-based detection in the caller.
        return [], []
|