""" Brief 解析模块 提供 Brief 文档解析、卖点提取、禁忌词提取等功能 验收标准: - 图文混排解析准确率 > 90% - 支持 PDF/Word/Excel/PPT/图片格式 - 支持飞书/Notion 在线文档链接 """ import re from dataclasses import dataclass, field from typing import Any from enum import Enum class ParsingStatus(str, Enum): """解析状态""" SUCCESS = "success" FAILED = "failed" PARTIAL = "partial" class Priority(str, Enum): """优先级""" HIGH = "high" MEDIUM = "medium" LOW = "low" @dataclass class SellingPoint: """卖点""" text: str priority: str = "medium" evidence_snippet: str = "" @dataclass class ForbiddenWord: """禁忌词""" word: str reason: str = "" severity: str = "hard" @dataclass class TimingRequirement: """时序要求""" type: str # "product_visible", "brand_mention", "demo_duration" min_duration_seconds: int | None = None min_frequency: int | None = None description: str = "" @dataclass class BrandTone: """品牌调性""" style: str target_audience: str = "" expression: str = "" @dataclass class BriefParsingResult: """Brief 解析结果""" status: ParsingStatus selling_points: list[SellingPoint] = field(default_factory=list) forbidden_words: list[ForbiddenWord] = field(default_factory=list) timing_requirements: list[TimingRequirement] = field(default_factory=list) brand_tone: BrandTone | None = None platform: str = "" region: str = "mainland_china" accuracy_rate: float = 0.0 error_code: str = "" error_message: str = "" fallback_suggestion: str = "" detected_language: str = "zh" extracted_text: str = "" def to_json(self) -> dict[str, Any]: """转换为 JSON 格式""" return { "selling_points": [ {"text": sp.text, "priority": sp.priority, "evidence_snippet": sp.evidence_snippet} for sp in self.selling_points ], "forbidden_words": [ {"word": fw.word, "reason": fw.reason, "severity": fw.severity} for fw in self.forbidden_words ], "timing_requirements": [ { "type": tr.type, "min_duration_seconds": tr.min_duration_seconds, "min_frequency": tr.min_frequency, "description": tr.description, } for tr in self.timing_requirements ], "brand_tone": { "style": self.brand_tone.style, "target_audience": self.brand_tone.target_audience, "expression": self.brand_tone.expression, } if self.brand_tone else None, "platform": self.platform, "region": self.region, } class BriefParser: """Brief 解析器""" # 卖点关键词模式 SELLING_POINT_PATTERNS = [ r"产品(?:核心)?卖点[::]\s*", r"(?:核心)?卖点[::]\s*", r"##\s*产品卖点\s*", r"产品(?:特点|优势)[::]\s*", ] # 禁忌词关键词模式 FORBIDDEN_WORD_PATTERNS = [ r"禁(?:止|忌)?(?:使用的)?词(?:汇)?[::]\s*", r"##\s*禁用词(?:汇)?\s*", r"不能使用的词[::]\s*", ] # 时序要求关键词模式 TIMING_PATTERNS = [ r"拍摄要求[::]\s*", r"##\s*拍摄要求\s*", r"时长要求[::]\s*", ] # 品牌调性关键词模式 BRAND_TONE_PATTERNS = [ r"品牌调性[::]\s*", r"##\s*品牌调性\s*", r"风格定位[::]\s*", ] def extract_selling_points(self, content: str) -> BriefParsingResult: """提取卖点""" selling_points = [] # 查找卖点部分 for pattern in self.SELLING_POINT_PATTERNS: match = re.search(pattern, content) if match: # 提取卖点部分的文本 start_pos = match.end() # 查找下一个部分或结束 end_pos = self._find_section_end(content, start_pos) section_text = content[start_pos:end_pos] # 解析列表项 selling_points.extend(self._parse_list_items(section_text, "selling_point")) break # 如果没找到明确的卖点部分,尝试从整个文本中提取 if not selling_points: selling_points = self._extract_selling_points_from_text(content) return BriefParsingResult( status=ParsingStatus.SUCCESS if selling_points else ParsingStatus.PARTIAL, selling_points=selling_points, accuracy_rate=0.9 if selling_points else 0.0, ) def extract_forbidden_words(self, content: str) -> BriefParsingResult: """提取禁忌词""" forbidden_words = [] for pattern in self.FORBIDDEN_WORD_PATTERNS: match = re.search(pattern, content) if match: start_pos = match.end() end_pos = self._find_section_end(content, start_pos) section_text = content[start_pos:end_pos] # 解析禁忌词列表 forbidden_words.extend(self._parse_forbidden_words(section_text)) break return BriefParsingResult( status=ParsingStatus.SUCCESS if forbidden_words else ParsingStatus.PARTIAL, forbidden_words=forbidden_words, ) def extract_timing_requirements(self, content: str) -> BriefParsingResult: """提取时序要求""" timing_requirements = [] for pattern in self.TIMING_PATTERNS: match = re.search(pattern, content) if match: start_pos = match.end() end_pos = self._find_section_end(content, start_pos) section_text = content[start_pos:end_pos] # 解析时序要求 timing_requirements.extend(self._parse_timing_requirements(section_text)) break return BriefParsingResult( status=ParsingStatus.SUCCESS if timing_requirements else ParsingStatus.PARTIAL, timing_requirements=timing_requirements, ) def extract_brand_tone(self, content: str) -> BriefParsingResult: """提取品牌调性""" brand_tone = None for pattern in self.BRAND_TONE_PATTERNS: match = re.search(pattern, content) if match: start_pos = match.end() end_pos = self._find_section_end(content, start_pos) section_text = content[start_pos:end_pos] # 解析品牌调性 brand_tone = self._parse_brand_tone(section_text) break # 如果没找到明确的品牌调性部分,尝试提取 if not brand_tone: brand_tone = self._extract_brand_tone_from_text(content) return BriefParsingResult( status=ParsingStatus.SUCCESS if brand_tone else ParsingStatus.PARTIAL, brand_tone=brand_tone, ) def parse(self, content: str) -> BriefParsingResult: """解析完整 Brief""" if not content or not content.strip(): return BriefParsingResult( status=ParsingStatus.FAILED, error_code="EMPTY_CONTENT", error_message="Brief 内容为空", ) # 提取各部分 selling_result = self.extract_selling_points(content) forbidden_result = self.extract_forbidden_words(content) timing_result = self.extract_timing_requirements(content) brand_result = self.extract_brand_tone(content) # 检测语言 detected_language = self._detect_language(content) # 计算准确率(基于提取的字段数) total_fields = 4 extracted_fields = sum([ len(selling_result.selling_points) > 0, len(forbidden_result.forbidden_words) > 0, len(timing_result.timing_requirements) > 0, brand_result.brand_tone is not None, ]) accuracy_rate = extracted_fields / total_fields return BriefParsingResult( status=ParsingStatus.SUCCESS if accuracy_rate >= 0.5 else ParsingStatus.PARTIAL, selling_points=selling_result.selling_points, forbidden_words=forbidden_result.forbidden_words, timing_requirements=timing_result.timing_requirements, brand_tone=brand_result.brand_tone, accuracy_rate=accuracy_rate, detected_language=detected_language, ) def parse_file(self, file_path: str) -> BriefParsingResult: """解析 Brief 文件""" # 检测是否加密(简化实现) if "encrypted" in file_path.lower(): return BriefParsingResult( status=ParsingStatus.FAILED, error_code="ENCRYPTED_FILE", error_message="文件已加密,无法解析", fallback_suggestion="请手动输入 Brief 内容或提供未加密的文件", ) # 实际实现需要调用文件解析库 return BriefParsingResult( status=ParsingStatus.FAILED, error_code="NOT_IMPLEMENTED", error_message="文件解析功能尚未实现", ) def parse_image(self, image_path: str) -> BriefParsingResult: """解析图片 Brief (OCR)""" # 实际实现需要调用 OCR 服务 return BriefParsingResult( status=ParsingStatus.SUCCESS, extracted_text="示例提取文本", ) def _find_section_end(self, content: str, start_pos: int) -> int: """查找部分结束位置""" # 查找下一个标题或结束 patterns = [r"\n##\s", r"\n[A-Za-z\u4e00-\u9fa5]+[::]"] min_pos = len(content) for pattern in patterns: match = re.search(pattern, content[start_pos:]) if match: pos = start_pos + match.start() if pos < min_pos: min_pos = pos return min_pos def _parse_list_items(self, text: str, item_type: str) -> list[SellingPoint]: """解析列表项""" items = [] # 匹配数字列表、减号列表等 patterns = [ r"[0-9]+[.、]\s*(.+?)(?=\n|$)", # 1. xxx 或 1、xxx r"-\s*(.+?)(?=\n|$)", # - xxx r"•\s*(.+?)(?=\n|$)", # • xxx ] for pattern in patterns: matches = re.findall(pattern, text) for match in matches: clean_text = match.strip() if clean_text: items.append(SellingPoint( text=clean_text, priority="medium", evidence_snippet=clean_text[:50], )) return items def _extract_selling_points_from_text(self, content: str) -> list[SellingPoint]: """从文本中提取卖点""" # 简化实现:查找常见卖点模式 selling_points = [] patterns = [ r"(\d+小时.+)", # 24小时持妆 r"(天然.+)", # 天然成分 r"(敏感.+适用)", # 敏感肌适用 ] for pattern in patterns: matches = re.findall(pattern, content) for match in matches: selling_points.append(SellingPoint( text=match.strip(), priority="medium", )) return selling_points def _parse_forbidden_words(self, text: str) -> list[ForbiddenWord]: """解析禁忌词列表""" words = [] # 处理列表项 list_patterns = [ r"-\s*(.+?)(?=\n|$)", r"•\s*(.+?)(?=\n|$)", ] for pattern in list_patterns: matches = re.findall(pattern, text) for match in matches: # 处理逗号分隔的多个词 for word in re.split(r"[、,,]", match): clean_word = word.strip() if clean_word: words.append(ForbiddenWord( word=clean_word, reason="Brief 定义的禁忌词", severity="hard", )) return words def _parse_timing_requirements(self, text: str) -> list[TimingRequirement]: """解析时序要求""" requirements = [] # 产品时长要求 - 支持多种表达方式 duration_patterns = [ r"产品(?:同框|展示|出现|正面展示).*?[>≥]\s*(\d+)\s*秒", r"(?:同框|展示|出现|正面展示).*?时长.*?[>≥]\s*(\d+)\s*秒", ] for pattern in duration_patterns: duration_match = re.search(pattern, text) if duration_match: requirements.append(TimingRequirement( type="product_visible", min_duration_seconds=int(duration_match.group(1)), description="产品同框时长要求", )) break # 品牌提及频次 mention_match = re.search( r"品牌.*?提及.*?[≥>=]\s*(\d+)\s*次", text ) if mention_match: requirements.append(TimingRequirement( type="brand_mention", min_frequency=int(mention_match.group(1)), description="品牌名提及次数", )) # 演示时长 demo_match = re.search( r"(?:使用)?演示.+?[≥>=]\s*(\d+)\s*秒", text ) if demo_match: requirements.append(TimingRequirement( type="demo_duration", min_duration_seconds=int(demo_match.group(1)), description="产品使用演示时长", )) return requirements def _parse_brand_tone(self, text: str) -> BrandTone | None: """解析品牌调性""" style = "" target = "" expression = "" # 提取风格 style_match = re.search(r"风格[::]\s*(.+?)(?=\n|-|$)", text) if style_match: style = style_match.group(1).strip() else: # 直接提取形容词 adjectives = re.findall(r"([\u4e00-\u9fa5]{2,4})[、,,]", text) if adjectives: style = "、".join(adjectives[:3]) # 提取目标人群 target_match = re.search(r"(?:目标人群|目标|对象)[::]\s*(.+?)(?=\n|-|$)", text) if target_match: target = target_match.group(1).strip() # 提取表达方式 expr_match = re.search(r"表达(?:方式)?[::]\s*(.+?)(?=\n|$)", text) if expr_match: expression = expr_match.group(1).strip() if style or target or expression: return BrandTone( style=style or "未指定", target_audience=target, expression=expression, ) return None def _extract_brand_tone_from_text(self, content: str) -> BrandTone | None: """从文本中提取品牌调性""" # 查找形容词组合 adjectives = [] patterns = [ r"(年轻|时尚|专业|活力|可信|亲和|高端|平价)", ] for pattern in patterns: matches = re.findall(pattern, content) adjectives.extend(matches) if adjectives: return BrandTone( style="、".join(list(set(adjectives))[:3]), ) return None def _detect_language(self, text: str) -> str: """检测文本语言""" # 简化实现:通过字符比例判断 chinese_chars = len(re.findall(r"[\u4e00-\u9fa5]", text)) total_chars = len(re.findall(r"\w", text)) if total_chars == 0: return "unknown" if chinese_chars / total_chars > 0.3: return "zh" else: return "en" class BriefFileValidator: """Brief 文件格式验证器""" SUPPORTED_FORMATS = { "pdf": "application/pdf", "docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation", "png": "image/png", "jpg": "image/jpeg", "jpeg": "image/jpeg", } def is_supported(self, file_format: str) -> bool: """检查文件格式是否支持""" return file_format.lower() in self.SUPPORTED_FORMATS def get_mime_type(self, file_format: str) -> str | None: """获取 MIME 类型""" return self.SUPPORTED_FORMATS.get(file_format.lower()) class OnlineDocumentValidator: """在线文档 URL 验证器""" SUPPORTED_DOMAINS = [ r"docs\.feishu\.cn", r"[a-z]+\.feishu\.cn", r"www\.notion\.so", r"notion\.so", ] def is_valid(self, url: str) -> bool: """验证在线文档 URL 是否支持""" for domain_pattern in self.SUPPORTED_DOMAINS: if re.search(domain_pattern, url): return True return False @dataclass class ImportResult: """导入结果""" status: str # "success", "failed" content: str = "" error_code: str = "" error_message: str = "" class OnlineDocumentImporter: """在线文档导入器""" def __init__(self): self.validator = OnlineDocumentValidator() def import_document(self, url: str) -> ImportResult: """导入在线文档""" if not self.validator.is_valid(url): return ImportResult( status="failed", error_code="UNSUPPORTED_URL", error_message="不支持的文档链接", ) # 模拟权限检查 if "restricted" in url.lower(): return ImportResult( status="failed", error_code="ACCESS_DENIED", error_message="无权限访问该文档,请检查分享设置", ) # 实际实现需要调用飞书/Notion API return ImportResult( status="success", content="导入的文档内容", )