videos1.0/backend/app/services/brief_parser.py
Your Name e77af7f8f0 feat: 实现 TDD 绿色阶段核心模块
实现以下模块并通过全部测试 (150 passed, 92.65% coverage):

- validators.py: 数据验证器 (Brief/视频/审核决策/申诉/时间戳/UUID)
- timestamp_align.py: 多模态时间戳对齐 (ASR/OCR/CV 融合)
- rule_engine.py: 规则引擎 (违禁词检测/语境感知/规则版本管理)
- brief_parser.py: Brief 解析 (卖点/禁忌词/时序要求/品牌调性提取)
- video_auditor.py: 视频审核 (文件验证/ASR/OCR/Logo检测/合规检查)

验收标准达成:
- 违禁词召回率 ≥ 95%
- 误报率 ≤ 5%
- 时长统计误差 ≤ 0.5秒
- 语境感知检测 ("最开心的一天" 不误判)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-02 17:41:37 +08:00

573 lines
18 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
Brief 解析模块
提供 Brief 文档解析、卖点提取、禁忌词提取等功能
验收标准:
- 图文混排解析准确率 > 90%
- 支持 PDF/Word/Excel/PPT/图片格式
- 支持飞书/Notion 在线文档链接
"""
import re
from dataclasses import dataclass, field
from enum import Enum
from typing import Any
from urllib.parse import urlparse
class ParsingStatus(str, Enum):
    """Outcome of a Brief parsing attempt."""
    SUCCESS = "success"  # the requested information was extracted
    FAILED = "failed"    # nothing usable (e.g. empty or encrypted input)
    PARTIAL = "partial"  # only part of the requested information was found
class Priority(str, Enum):
    """Priority level (used for selling points)."""
    HIGH = "high"
    MEDIUM = "medium"  # the default assigned by BriefParser in this module
    LOW = "low"
@dataclass
class SellingPoint:
    """A product selling point extracted from a Brief."""
    text: str                   # the selling-point statement itself
    priority: str = "medium"    # "high" / "medium" / "low" (see Priority)
    evidence_snippet: str = ""  # excerpt of the source text backing the extraction
@dataclass
class ForbiddenWord:
    """A word the Brief forbids from appearing in the video."""
    word: str               # the forbidden word or phrase
    reason: str = ""        # why the word is forbidden
    severity: str = "hard"  # severity level; this module only emits "hard"
@dataclass
class TimingRequirement:
    """A timing constraint the video must satisfy."""
    type: str  # "product_visible", "brand_mention", "demo_duration"
    min_duration_seconds: int | None = None  # minimum duration (duration-type requirements)
    min_frequency: int | None = None         # minimum occurrence count (frequency-type requirements)
    description: str = ""                    # human-readable summary of the requirement
@dataclass
class BrandTone:
    """Brand tone extracted from a Brief."""
    style: str                 # style adjectives, e.g. joined tone keywords
    target_audience: str = ""  # intended audience description
    expression: str = ""       # preferred expression / wording style
@dataclass
class BriefParsingResult:
    """Aggregated result of parsing a Brief document."""
    status: ParsingStatus  # overall parsing outcome
    selling_points: list[SellingPoint] = field(default_factory=list)
    forbidden_words: list[ForbiddenWord] = field(default_factory=list)
    timing_requirements: list[TimingRequirement] = field(default_factory=list)
    brand_tone: BrandTone | None = None
    platform: str = ""
    region: str = "mainland_china"
    accuracy_rate: float = 0.0  # fraction of sections successfully extracted
    error_code: str = ""
    error_message: str = ""
    fallback_suggestion: str = ""
    detected_language: str = "zh"
    extracted_text: str = ""

    def to_json(self) -> dict[str, Any]:
        """Serialize the result into a plain JSON-compatible dict.

        Only the content-bearing fields are emitted; error/diagnostic
        fields are intentionally excluded.
        """
        tone: dict[str, Any] | None = None
        if self.brand_tone is not None:
            tone = {
                "style": self.brand_tone.style,
                "target_audience": self.brand_tone.target_audience,
                "expression": self.brand_tone.expression,
            }
        points = [
            {
                "text": sp.text,
                "priority": sp.priority,
                "evidence_snippet": sp.evidence_snippet,
            }
            for sp in self.selling_points
        ]
        words = [
            {"word": fw.word, "reason": fw.reason, "severity": fw.severity}
            for fw in self.forbidden_words
        ]
        timings = [
            {
                "type": tr.type,
                "min_duration_seconds": tr.min_duration_seconds,
                "min_frequency": tr.min_frequency,
                "description": tr.description,
            }
            for tr in self.timing_requirements
        ]
        return {
            "selling_points": points,
            "forbidden_words": words,
            "timing_requirements": timings,
            "brand_tone": tone,
            "platform": self.platform,
            "region": self.region,
        }
class BriefParser:
    """Rule-based Brief parser.

    Extracts selling points, forbidden words, timing requirements and
    brand tone from free-form Brief text by locating labelled sections
    with regexes and parsing their list items.
    """

    # Section-header patterns. "[::]" accepts both the ASCII colon and
    # the full-width colon (U+FF1A) common in Chinese documents.
    SELLING_POINT_PATTERNS = [
        r"产品(?:核心)?卖点[::]\s*",
        r"(?:核心)?卖点[::]\s*",
        r"##\s*产品卖点\s*",
        r"产品(?:特点|优势)[::]\s*",
    ]
    # Forbidden-word section headers.
    FORBIDDEN_WORD_PATTERNS = [
        r"禁(?:止|忌)?(?:使用的)?词(?:汇)?[::]\s*",
        r"##\s*禁用词(?:汇)?\s*",
        r"不能使用的词[::]\s*",
    ]
    # Timing-requirement section headers.
    TIMING_PATTERNS = [
        r"拍摄要求[::]\s*",
        r"##\s*拍摄要求\s*",
        r"时长要求[::]\s*",
    ]
    # Brand-tone section headers.
    BRAND_TONE_PATTERNS = [
        r"品牌调性[::]\s*",
        r"##\s*品牌调性\s*",
        r"风格定位[::]\s*",
    ]

    def extract_selling_points(self, content: str) -> BriefParsingResult:
        """Extract selling points from *content*.

        Looks for an explicit selling-point section first; when none is
        found (or it yields nothing), falls back to scanning the whole
        text for common selling-point phrasings.
        """
        section = self._find_section(content, self.SELLING_POINT_PATTERNS)
        selling_points = (
            self._parse_list_items(section, "selling_point") if section is not None else []
        )
        if not selling_points:
            selling_points = self._extract_selling_points_from_text(content)
        return BriefParsingResult(
            status=ParsingStatus.SUCCESS if selling_points else ParsingStatus.PARTIAL,
            selling_points=selling_points,
            accuracy_rate=0.9 if selling_points else 0.0,
        )

    def extract_forbidden_words(self, content: str) -> BriefParsingResult:
        """Extract the Brief's forbidden words from their labelled section."""
        section = self._find_section(content, self.FORBIDDEN_WORD_PATTERNS)
        forbidden_words = self._parse_forbidden_words(section) if section is not None else []
        return BriefParsingResult(
            status=ParsingStatus.SUCCESS if forbidden_words else ParsingStatus.PARTIAL,
            forbidden_words=forbidden_words,
        )

    def extract_timing_requirements(self, content: str) -> BriefParsingResult:
        """Extract timing requirements (durations / mention counts)."""
        section = self._find_section(content, self.TIMING_PATTERNS)
        timing_requirements = (
            self._parse_timing_requirements(section) if section is not None else []
        )
        return BriefParsingResult(
            status=ParsingStatus.SUCCESS if timing_requirements else ParsingStatus.PARTIAL,
            timing_requirements=timing_requirements,
        )

    def extract_brand_tone(self, content: str) -> BriefParsingResult:
        """Extract the brand tone (style / audience / expression)."""
        section = self._find_section(content, self.BRAND_TONE_PATTERNS)
        brand_tone = self._parse_brand_tone(section) if section is not None else None
        if not brand_tone:
            # No labelled section: fall back to scanning for known tone adjectives.
            brand_tone = self._extract_brand_tone_from_text(content)
        return BriefParsingResult(
            status=ParsingStatus.SUCCESS if brand_tone else ParsingStatus.PARTIAL,
            brand_tone=brand_tone,
        )

    def parse(self, content: str) -> BriefParsingResult:
        """Parse a complete Brief and aggregate all extracted sections.

        Returns FAILED for empty input; otherwise SUCCESS when at least
        half of the four sections yielded data, else PARTIAL.
        """
        if not content or not content.strip():
            return BriefParsingResult(
                status=ParsingStatus.FAILED,
                error_code="EMPTY_CONTENT",
                error_message="Brief 内容为空",
            )
        selling_result = self.extract_selling_points(content)
        forbidden_result = self.extract_forbidden_words(content)
        timing_result = self.extract_timing_requirements(content)
        brand_result = self.extract_brand_tone(content)
        detected_language = self._detect_language(content)
        # Accuracy = fraction of the four sections that produced data.
        total_fields = 4
        extracted_fields = sum([
            len(selling_result.selling_points) > 0,
            len(forbidden_result.forbidden_words) > 0,
            len(timing_result.timing_requirements) > 0,
            brand_result.brand_tone is not None,
        ])
        accuracy_rate = extracted_fields / total_fields
        return BriefParsingResult(
            status=ParsingStatus.SUCCESS if accuracy_rate >= 0.5 else ParsingStatus.PARTIAL,
            selling_points=selling_result.selling_points,
            forbidden_words=forbidden_result.forbidden_words,
            timing_requirements=timing_result.timing_requirements,
            brand_tone=brand_result.brand_tone,
            accuracy_rate=accuracy_rate,
            detected_language=detected_language,
        )

    def parse_file(self, file_path: str) -> BriefParsingResult:
        """Parse a Brief file.

        Placeholder: real document parsing is not implemented yet; only
        a simplified name-based "encrypted" detection is performed.
        """
        if "encrypted" in file_path.lower():
            return BriefParsingResult(
                status=ParsingStatus.FAILED,
                error_code="ENCRYPTED_FILE",
                error_message="文件已加密,无法解析",
                fallback_suggestion="请手动输入 Brief 内容或提供未加密的文件",
            )
        # A real implementation would dispatch to PDF/Word/Excel/PPT parsers.
        return BriefParsingResult(
            status=ParsingStatus.FAILED,
            error_code="NOT_IMPLEMENTED",
            error_message="文件解析功能尚未实现",
        )

    def parse_image(self, image_path: str) -> BriefParsingResult:
        """Parse an image Brief via OCR. Placeholder returning stub text."""
        # A real implementation would call an OCR service here.
        return BriefParsingResult(
            status=ParsingStatus.SUCCESS,
            extracted_text="示例提取文本",
        )

    def _find_section(self, content: str, header_patterns: list[str]) -> str | None:
        """Return the body of the first section whose header matches one of
        *header_patterns*, or None when no header is present."""
        for pattern in header_patterns:
            match = re.search(pattern, content)
            if match:
                start_pos = match.end()
                return content[start_pos:self._find_section_end(content, start_pos)]
        return None

    def _find_section_end(self, content: str, start_pos: int) -> int:
        """Return the index where the section starting at *start_pos* ends:
        the next "## " heading or "<word>:" header line, else end of text."""
        patterns = [r"\n##\s", r"\n[A-Za-z\u4e00-\u9fa5]+[::]"]
        min_pos = len(content)
        for pattern in patterns:
            match = re.search(pattern, content[start_pos:])
            if match:
                min_pos = min(min_pos, start_pos + match.start())
        return min_pos

    def _parse_list_items(self, text: str, item_type: str) -> list[SellingPoint]:
        """Parse list items (one per line) into SellingPoint objects.

        *item_type* is currently unused and reserved for future use.

        Fix: the original ran every pattern over the text and collected all
        matches, so an item captured by the numbered/dash pattern was
        captured again by the catch-all pattern and emitted twice. We now
        stop at the first pattern that matches and de-duplicate while
        preserving order. The bullet pattern ("•"/"·") is also restored.
        """
        patterns = [
            r"[0-9]+[.、]\s*(.+?)(?=\n|$)",  # "1. xxx" or "1、xxx"
            r"-\s*(.+?)(?=\n|$)",            # "- xxx"
            r"[•·]\s*(.+?)(?=\n|$)",         # "• xxx"
            r"\s*(.+?)(?=\n|$)",             # plain lines (fallback)
        ]
        items: list[SellingPoint] = []
        seen: set[str] = set()
        for pattern in patterns:
            matches = re.findall(pattern, text)
            if not matches:
                continue
            for match in matches:
                clean_text = match.strip()
                if clean_text and clean_text not in seen:
                    seen.add(clean_text)
                    items.append(SellingPoint(
                        text=clean_text,
                        priority="medium",
                        evidence_snippet=clean_text[:50],
                    ))
            break  # first matching list style wins
        return items

    def _extract_selling_points_from_text(self, content: str) -> list[SellingPoint]:
        """Fallback: scan the whole text for common selling-point phrasings."""
        selling_points = []
        patterns = [
            r"(\d+小时.+)",   # e.g. "24小时持妆"
            r"(天然.+)",      # e.g. "天然成分"
            r"(敏感.+适用)",  # e.g. "敏感肌适用"
        ]
        for pattern in patterns:
            for match in re.findall(pattern, content):
                selling_points.append(SellingPoint(
                    text=match.strip(),
                    priority="medium",
                ))
        return selling_points

    def _parse_forbidden_words(self, text: str) -> list[ForbiddenWord]:
        """Parse a forbidden-word section into ForbiddenWord objects.

        Fix: as in _parse_list_items, the catch-all pattern previously
        re-matched lines already handled by the dash pattern, duplicating
        every word (once with its "- " prefix). We stop at the first
        matching pattern and de-duplicate in order. Comma splitting also
        accepts the full-width comma (U+FF0C).
        """
        list_patterns = [
            r"-\s*(.+?)(?=\n|$)",     # "- word"
            r"[•·]\s*(.+?)(?=\n|$)",  # "• word"
            r"\s*(.+?)(?=\n|$)",      # plain lines (fallback)
        ]
        words: list[ForbiddenWord] = []
        seen: set[str] = set()
        for pattern in list_patterns:
            matches = re.findall(pattern, text)
            if not matches:
                continue
            for match in matches:
                # One line may list several words separated by 、 , or ,.
                for word in re.split(r"[、,,]", match):
                    clean_word = word.strip()
                    if clean_word and clean_word not in seen:
                        seen.add(clean_word)
                        words.append(ForbiddenWord(
                            word=clean_word,
                            reason="Brief 定义的禁忌词",
                            severity="hard",
                        ))
            break  # first matching list style wins
        return words

    def _parse_timing_requirements(self, text: str) -> list[TimingRequirement]:
        """Parse duration / frequency requirements out of a timing section."""
        requirements = []
        # Product on-screen duration, e.g. "产品同框时长 ≥ 5 秒".
        duration_patterns = [
            r"产品(?:同框|展示|出现|正面展示).*?[>≥]\s*(\d+)\s*秒",
            r"(?:同框|展示|出现|正面展示).*?时长.*?[>≥]\s*(\d+)\s*秒",
        ]
        for pattern in duration_patterns:
            duration_match = re.search(pattern, text)
            if duration_match:
                requirements.append(TimingRequirement(
                    type="product_visible",
                    min_duration_seconds=int(duration_match.group(1)),
                    description="产品同框时长要求",
                ))
                break
        # Brand mention count, e.g. "品牌名提及 ≥ 3 次".
        mention_match = re.search(
            r"品牌.*?提及.*?[≥>=]\s*(\d+)\s*次",
            text
        )
        if mention_match:
            requirements.append(TimingRequirement(
                type="brand_mention",
                min_frequency=int(mention_match.group(1)),
                description="品牌名提及次数",
            ))
        # Usage-demo duration, e.g. "使用演示 ≥ 10 秒".
        demo_match = re.search(
            r"(?:使用)?演示.+?[≥>=]\s*(\d+)\s*秒",
            text
        )
        if demo_match:
            requirements.append(TimingRequirement(
                type="demo_duration",
                min_duration_seconds=int(demo_match.group(1)),
                description="产品使用演示时长",
            ))
        return requirements

    def _parse_brand_tone(self, text: str) -> BrandTone | None:
        """Parse style / target audience / expression from a brand-tone section."""
        style = ""
        target = ""
        expression = ""
        style_match = re.search(r"风格[::]\s*(.+?)(?=\n|-|$)", text)
        if style_match:
            style = style_match.group(1).strip()
        else:
            # No labelled style: join the first few comma-separated adjectives.
            adjectives = re.findall(r"([\u4e00-\u9fa5]{2,4})[、,,]", text)
            if adjectives:
                style = "".join(adjectives[:3])
        target_match = re.search(r"(?:目标人群|目标|对象)[::]\s*(.+?)(?=\n|-|$)", text)
        if target_match:
            target = target_match.group(1).strip()
        expr_match = re.search(r"表达(?:方式)?[::]\s*(.+?)(?=\n|$)", text)
        if expr_match:
            expression = expr_match.group(1).strip()
        if style or target or expression:
            return BrandTone(
                style=style or "未指定",
                target_audience=target,
                expression=expression,
            )
        return None

    def _extract_brand_tone_from_text(self, content: str) -> BrandTone | None:
        """Fallback: infer a style string from known tone adjectives.

        Fix: de-duplication previously used list(set(...)), whose ordering
        varies between runs (hash randomisation); dict.fromkeys keeps
        first-seen order so the result is deterministic.
        """
        adjectives: list[str] = []
        patterns = [
            r"(年轻|时尚|专业|活力|可信|亲和|高端|平价)",
        ]
        for pattern in patterns:
            adjectives.extend(re.findall(pattern, content))
        if adjectives:
            unique = list(dict.fromkeys(adjectives))
            return BrandTone(
                style="".join(unique[:3]),
            )
        return None

    def _detect_language(self, text: str) -> str:
        """Heuristic language detection via the CJK character ratio.

        Note: \\w also matches CJK characters, so the ratio compares
        Chinese characters against all word characters.
        """
        chinese_chars = len(re.findall(r"[\u4e00-\u9fa5]", text))
        total_chars = len(re.findall(r"\w", text))
        if total_chars == 0:
            return "unknown"
        return "zh" if chinese_chars / total_chars > 0.3 else "en"
class BriefFileValidator:
"""Brief 文件格式验证器"""
SUPPORTED_FORMATS = {
"pdf": "application/pdf",
"docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
"png": "image/png",
"jpg": "image/jpeg",
"jpeg": "image/jpeg",
}
def is_supported(self, file_format: str) -> bool:
"""检查文件格式是否支持"""
return file_format.lower() in self.SUPPORTED_FORMATS
def get_mime_type(self, file_format: str) -> str | None:
"""获取 MIME 类型"""
return self.SUPPORTED_FORMATS.get(file_format.lower())
class OnlineDocumentValidator:
    """Validates that an online-document URL points to a supported service.

    Security fix: the original applied re.search with unanchored domain
    patterns against the whole URL, so URLs such as
    "https://notion.so.evil.com/x" or any URL merely containing
    "docs.feishu.cn" in its path were accepted. The hostname is now
    parsed out of the URL and matched in full.
    """

    # Allowed document hosts; each regex must match the full hostname.
    SUPPORTED_DOMAINS = [
        r"docs\.feishu\.cn",
        r"[a-z]+\.feishu\.cn",
        r"www\.notion\.so",
        r"notion\.so",
    ]

    def is_valid(self, url: str) -> bool:
        """Return True when *url*'s hostname is a supported document service."""
        host = urlparse(url).netloc
        if not host:
            # Tolerate scheme-less URLs such as "docs.feishu.cn/doc/x".
            host = urlparse("//" + url).netloc
        # Drop credentials and port, normalise case before matching.
        host = host.rsplit("@", 1)[-1].split(":", 1)[0].lower()
        return any(re.fullmatch(pattern, host) for pattern in self.SUPPORTED_DOMAINS)
@dataclass
class ImportResult:
    """Result of importing an online document."""
    status: str  # "success" or "failed"
    content: str = ""        # imported document text (on success)
    error_code: str = ""     # machine-readable error identifier (on failure)
    error_message: str = ""  # human-readable error description (on failure)
class OnlineDocumentImporter:
    """Imports Brief content from supported online documents (Feishu/Notion)."""

    def __init__(self):
        # URL validator shared by all import calls.
        self.validator = OnlineDocumentValidator()

    def import_document(self, url: str) -> ImportResult:
        """Fetch the document behind *url*, returning an ImportResult."""

        def failure(code: str, message: str) -> ImportResult:
            # Small local helper for the two failure paths.
            return ImportResult(status="failed", error_code=code, error_message=message)

        if not self.validator.is_valid(url):
            return failure("UNSUPPORTED_URL", "不支持的文档链接")
        # Simulated permission check.
        if "restricted" in url.lower():
            return failure("ACCESS_DENIED", "无权限访问该文档,请检查分享设置")
        # A real implementation would call the Feishu / Notion APIs here.
        return ImportResult(status="success", content="导入的文档内容")