From 86839a873fb401b6d2c13c28811ab7b1b9d45768 Mon Sep 17 00:00:00 2001 From: wangshaoqing Date: Wed, 6 May 2026 17:10:25 +0800 Subject: [PATCH] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E6=8A=96=E9=9F=B3=E6=8E=A8?= =?UTF-8?q?=E8=8D=90=E6=B5=81=E6=8A=93=E5=8F=96=E5=AE=9E=E7=8E=B0=E8=AE=A1?= =?UTF-8?q?=E5=88=92?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ...26-05-06-douyin-recommendation-crawling.md | 584 ++++++++++++++++++ 1 file changed, 584 insertions(+) create mode 100644 docs/superpowers/plans/2026-05-06-douyin-recommendation-crawling.md diff --git a/docs/superpowers/plans/2026-05-06-douyin-recommendation-crawling.md b/docs/superpowers/plans/2026-05-06-douyin-recommendation-crawling.md new file mode 100644 index 0000000..d49c06f --- /dev/null +++ b/docs/superpowers/plans/2026-05-06-douyin-recommendation-crawling.md @@ -0,0 +1,584 @@ +# 抖音推荐流视频抓取实现计划 + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** 扩展 Douyin.py 支持抓取抖音推荐流(For You页面)视频,记录博主信息,支持滚动加载最多50条 + +**Architecture:** 新建 `collect_recommendations()` 函数处理推荐流,复用现有的下载和工具函数。通过 `parse_target_input()` 扩展识别推荐流URL。 + +**Tech Stack:** Python 3, DrissionPage, requests, unittest + +--- + +## 文件结构 + +| 文件 | 操作 | 说明 | +|------|------|------| +| `Douyin.py` | 修改 | 添加推荐流识别、解析、抓取逻辑 | +| `test_douyin.py` | 修改 | 添加推荐流相关测试 | + +--- + +## Task 1: 推荐流URL识别 + +**Files:** +- Modify: `Douyin.py:17-19`(添加正则表达式) +- Test: `test_douyin.py`(添加测试) + +- [ ] **Step 1: 编写失败测试** + +```python +def test_is_recommendation_url_accepts_douyin_homepage(self) -> None: + module = importlib.import_module("Douyin") + self.assertTrue(module.is_recommendation_url("https://www.douyin.com/")) + self.assertTrue(module.is_recommendation_url("https://www.douyin.com")) + self.assertTrue(module.is_recommendation_url("https://www.douyin.com/?from=web")) + self.assertFalse(module.is_recommendation_url("https://www.douyin.com/user/xxx")) + self.assertFalse(module.is_recommendation_url("https://www.douyin.com/video/123")) +``` + +- [ ] **Step 2: 运行测试确认失败** + +Run: `python3 -m pytest test_douyin.py::DouyinModuleTests::test_is_recommendation_url_accepts_douyin_homepage -v` +Expected: FAIL with "module has no attribute 'is_recommendation_url'" + +- [ ] **Step 3: 实现最小代码** + +在 `Douyin.py` 中添加: +```python +RECOMMENDATION_URL_PATTERN = re.compile(r"^https?://www\.douyin\.com/?(?:\?.*)?$") + +def is_recommendation_url(value: str) -> bool: + return bool(RECOMMENDATION_URL_PATTERN.match(value.strip())) +``` + +- [ ] **Step 4: 运行测试确认通过** + +Run: `python3 -m pytest test_douyin.py::DouyinModuleTests::test_is_recommendation_url_accepts_douyin_homepage -v` +Expected: PASS + +- [ ] **Step 5: 提交** + +```bash +git add Douyin.py test_douyin.py +git commit -m "feat: add recommendation URL recognition" +``` + +--- + +## Task 2: 扩展目标解析支持推荐流 + +**Files:** +- Modify: `Douyin.py:52-68`(修改 `parse_target_input`) +- Test: `test_douyin.py`(添加测试) + +- [ ] **Step 1: 编写失败测试** + +```python +def test_parse_target_input_classifies_recommendation_url(self) -> None: + module = importlib.import_module("Douyin") + target = module.parse_target_input("https://www.douyin.com/", source="manual") + self.assertEqual(target.kind, "recommendation") + self.assertEqual(target.value, "https://www.douyin.com/") + self.assertEqual(target.source, "manual") +``` + +- [ ] **Step 2: 运行测试确认失败** + +Run: `python3 -m pytest test_douyin.py::DouyinModuleTests::test_parse_target_input_classifies_recommendation_url -v` +Expected: FAIL with "不支持的目标" + +- [ ] **Step 3: 修改 `parse_target_input`** + +```python +def parse_target_input(value: str, source: str) -> ResolvedTarget: + normalized = value.strip() + if is_recommendation_url(normalized): + return ResolvedTarget(kind="recommendation", value=normalized, source=source) + if is_creator_url(normalized): + return ResolvedTarget(kind="creator", value=normalized, source=source) + # ... 其余保持不变 +``` + +- [ ] **Step 4: 运行测试确认通过** + +Run: `python3 -m pytest test_douyin.py::DouyinModuleTests::test_parse_target_input_classifies_recommendation_url -v` +Expected: PASS + +- [ ] **Step 5: 提交** + +```bash +git add Douyin.py test_douyin.py +git commit -m "feat: extend target parsing to support recommendation URLs" +``` + +--- + +## Task 3: 增强数据解析提取博主信息 + +**Files:** +- Modify: `Douyin.py:140-170`(修改 `parse_aweme_items`) +- Test: `test_douyin.py`(添加测试) + +- [ ] **Step 1: 编写失败测试** + +```python +def test_parse_aweme_items_extracts_author_info(self) -> None: + module = importlib.import_module("Douyin") + payload = { + "aweme_list": [ + { + "aweme_id": "7619989983668240802", + "desc": "测试视频", + "author": { + "nickname": "测试博主", + "uid": "123456789" + }, + "video": { + "play_addr": { + "url_list": ["https://v26-web.douyinvod.com/example/video.mp4"] + } + }, + } + ] + } + items = module.parse_aweme_items(payload) + self.assertEqual(len(items), 1) + self.assertEqual(items[0]["author_name"], "测试博主") + self.assertEqual(items[0]["author_id"], "123456789") +``` + +- [ ] **Step 2: 运行测试确认失败** + +Run: `python3 -m pytest test_douyin.py::DouyinModuleTests::test_parse_aweme_items_extracts_author_info -v` +Expected: FAIL with KeyError or missing author_name + +- [ ] **Step 3: 修改 `parse_aweme_items`** + +```python +def parse_aweme_items(body: Any) -> list[dict[str, str]]: + # ... 现有代码 ... + + for aweme in aweme_list: + # ... 现有视频提取代码 ... + + author = aweme.get("author") or {} + author_name = str(author.get("nickname") or "").strip() or "unknown" + author_id = str(author.get("uid") or "").strip() or "unknown" + + items.append( + { + "title": title, + "video_id": video_id, + "video_url": choose_video_url([str(url) for url in url_list]), + "author_name": author_name, + "author_id": author_id, + } + ) + + return items +``` + +- [ ] **Step 4: 运行测试确认通过** + +Run: `python3 -m pytest test_douyin.py::DouyinModuleTests::test_parse_aweme_items_extracts_author_info -v` +Expected: PASS + +- [ ] **Step 5: 提交** + +```bash +git add Douyin.py test_douyin.py +git commit -m "feat: extract author info from aweme items" +``` + +--- + +## Task 4: 支持带博主信息的文件名构建 + +**Files:** +- Modify: `Douyin.py:102-104`(修改 `build_output_path`) +- Test: `test_douyin.py`(添加测试) + +- [ ] **Step 1: 编写失败测试** + +```python +def test_build_output_path_with_author_uses_bracket_format(self) -> None: + module = importlib.import_module("Douyin") + output_path = module.build_output_path( + title="测试标题", + video_id="123456", + author_name="测试博主" + ) + self.assertEqual(output_path.as_posix(), "video/[测试博主]测试标题-123456.mp4") + +def test_build_output_path_without_author_uses_original_format(self) -> None: + module = importlib.import_module("Douyin") + output_path = module.build_output_path("测试标题", "123456") + self.assertEqual(output_path.as_posix(), "video/测试标题-123456.mp4") +``` + +- [ ] **Step 2: 运行测试确认失败** + +Run: `python3 -m pytest test_douyin.py::DouyinModuleTests::test_build_output_path_with_author_uses_bracket_format -v` +Expected: FAIL with unexpected keyword argument 'author_name' + +- [ ] **Step 3: 修改 `build_output_path`** + +```python +def build_output_path( + title: str, + video_id: str, + output_dir: Path = Path("video"), + author_name: str | None = None, +) -> Path: + safe_title = sanitize_filename(title, fallback="untitled") + if author_name: + safe_author = sanitize_filename(author_name, fallback="unknown") + filename = f"[{safe_author}]{safe_title}-{video_id}.mp4" + else: + filename = f"{safe_title}-{video_id}.mp4" + return output_dir / filename +``` + +- [ ] **Step 4: 运行测试确认通过** + +Run: `python3 -m pytest test_douyin.py::DouyinModuleTests::test_build_output_path_with_author_uses_bracket_format test_douyin.py::DouyinModuleTests::test_build_output_path_without_author_uses_original_format -v` +Expected: PASS + +- [ ] **Step 5: 提交** + +```bash +git add Douyin.py test_douyin.py +git commit -m "feat: support author prefix in output filename" +``` + +--- + +## Task 5: 实现 `collect_recommendations()` 函数 + +**Files:** +- Modify: `Douyin.py`(添加新函数) +- Test: `test_douyin.py`(添加测试) + +- [ ] **Step 1: 编写失败测试** + +```python +def test_collect_recommendations_downloads_videos_with_author_prefix(self) -> None: + module = importlib.import_module("Douyin") + packet = FakePacket( + { + "aweme_list": [ + { + "aweme_id": "7619989983668240802", + "desc": "推荐视频1", + "author": {"nickname": "博主A", "uid": "111"}, + "video": { + "play_addr": { + "url_list": ["https://v26-web.douyinvod.com/example/video1.mp4"] + } + }, + } + ] + } + ) + page = FakeRuntimePage("https://www.douyin.com/", packet) + + with mock.patch.object(module, "import_runtime_dependencies", return_value=(object(), object(), object())): + with mock.patch.object(module, "create_page", return_value=page): + with mock.patch.object(module, "download_video") as mocked_download: + downloaded = module.collect_recommendations( + max_videos=50, + timeout=10, + output_dir=module.Path("video"), + browser_port=None, + ) + + self.assertEqual(downloaded, 1) + # 验证文件名包含博主前缀 + call_kwargs = mocked_download.call_args[1] + self.assertIn("[博主A]", str(call_kwargs["output_path"])) +``` + +- [ ] **Step 2: 运行测试确认失败** + +Run: `python3 -m pytest test_douyin.py::DouyinModuleTests::test_collect_recommendations_downloads_videos_with_author_prefix -v` +Expected: FAIL with "module has no attribute 'collect_recommendations'" + +- [ ] **Step 3: 实现 `collect_recommendations`** + +```python +def collect_recommendations( + max_videos: int, + timeout: int, + output_dir: Path, + browser_port: int | None, +) -> int: + requests_module, chromium_page_cls, chromium_options_cls = import_runtime_dependencies() + headers = build_headers("https://www.douyin.com/") + if browser_port is not None: + ensure_browser_debug_port_ready(browser_port) + page = create_page(chromium_page_cls, chromium_options_cls, browser_port) + page.listen.start(LISTEN_TARGET) + + print("[INFO] 正在打开抖音推荐流。若出现登录或验证码,请先在浏览器窗口里完成。") + page.get("https://www.douyin.com/") + time.sleep(3) + + downloaded = 0 + seen_ids: set[str] = set() + consecutive_empty = 0 + max_consecutive_empty = 3 + + while downloaded < max_videos: + packet = wait_for_aweme_packet(page, timeout=timeout) + if packet is None: + consecutive_empty += 1 + if consecutive_empty >= max_consecutive_empty: + print("[INFO] 连续多次未获取到新数据,结束抓取。") + break + scroll_to_next_page(page) + continue + + try: + payload = extract_aweme_payload(packet.response) + items = parse_aweme_items(payload) + except Exception as exc: + print(f"[WARN] 解析接口数据失败: {exc}") + consecutive_empty += 1 + if consecutive_empty >= max_consecutive_empty: + break + scroll_to_next_page(page) + continue + + if not items: + consecutive_empty += 1 + if consecutive_empty >= max_consecutive_empty: + break + scroll_to_next_page(page) + continue + + consecutive_empty = 0 + new_items_in_batch = 0 + + for item in items: + if item["video_id"] in seen_ids: + continue + + if downloaded >= max_videos: + break + + seen_ids.add(item["video_id"]) + output_path = build_output_path( + title=item["title"], + video_id=item["video_id"], + output_dir=output_dir, + author_name=item.get("author_name"), + ) + + try: + download_video( + requests_module=requests_module, + headers=headers, + video_url=item["video_url"], + output_path=output_path, + ) + except Exception as exc: + print(f"[WARN] 下载失败 {item['video_id']}: {exc}") + continue + + downloaded += 1 + new_items_in_batch += 1 + print(f"[OK] 已保存: {output_path}") + + if new_items_in_batch == 0: + consecutive_empty += 1 + if consecutive_empty >= max_consecutive_empty: + break + + scroll_to_next_page(page) + + return downloaded +``` + +- [ ] **Step 4: 运行测试确认通过** + +Run: `python3 -m pytest test_douyin.py::DouyinModuleTests::test_collect_recommendations_downloads_videos_with_author_prefix -v` +Expected: PASS + +- [ ] **Step 5: 提交** + +```bash +git add Douyin.py test_douyin.py +git commit -m "feat: implement collect_recommendations() for For You page" +``` + +--- + +## Task 6: 添加 `--max-videos` 命令行参数 + +**Files:** +- Modify: `Douyin.py:295-305`(修改 `build_parser`) +- Modify: `Douyin.py:310-350`(修改 `main`) +- Test: `test_douyin.py`(添加测试) + +- [ ] **Step 1: 编写失败测试** + +```python +def test_build_parser_has_max_videos_argument(self) -> None: + module = importlib.import_module("Douyin") + args = module.build_parser().parse_args(["--max-videos", "30"]) + self.assertEqual(args.max_videos, 30) + +def test_main_dispatches_recommendation_flow_for_recommendation_url(self) -> None: + module = importlib.import_module("Douyin") + stdout = io.StringIO() + recommendation_target = module.ResolvedTarget( + kind="recommendation", + value="https://www.douyin.com/", + source="current-page", + ) + with redirect_stdout(stdout): + with mock.patch.object(module, "resolve_cli_target", return_value=recommendation_target): + with mock.patch.object(module, "collect_recommendations", return_value=5) as mocked_collect: + exit_code = module.main([]) + self.assertEqual(exit_code, 0) + mocked_collect.assert_called_once_with( + max_videos=50, + timeout=10, + output_dir=module.Path("video"), + browser_port=9223, + ) +``` + +- [ ] **Step 2: 运行测试确认失败** + +Run: `python3 -m pytest test_douyin.py::DouyinModuleTests::test_build_parser_has_max_videos_argument -v` +Expected: FAIL with "unrecognized arguments: --max-videos" + +- [ ] **Step 3: 修改 `build_parser` 和 `main`** + +```python +def build_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser(description="附着抖音登录浏览器并下载当前页面或指定目标的视频") + parser.add_argument( + "target", + nargs="?", + default=None, + help="可选:博主主页 URL、单视频 URL 或 aweme_id;不传则读取当前浏览器页面", + ) + parser.add_argument("--pages", type=int, default=1, help="创作者抓取最多处理多少页;默认 1") + parser.add_argument("--timeout", type=int, default=10, help="单次等待接口响应秒数,默认 10") + parser.add_argument( + "--output-dir", + default="video", + help="视频输出目录,默认 video", + ) + parser.add_argument( + "--browser-port", + type=int, + default=DEFAULT_BROWSER_PORT, + help="附着到已启动 Chrome 的调试端口,默认 9223", + ) + parser.add_argument( + "--max-videos", + type=int, + default=50, + help="推荐流最大抓取数量,默认 50", + ) + return parser + + +def main(argv: list[str] | None = None) -> int: + parser = build_parser() + args = parser.parse_args(argv) + + if args.pages <= 0: + parser.error("--pages 必须大于 0") + if args.timeout <= 0: + parser.error("--timeout 必须大于 0") + if args.browser_port is not None and args.browser_port <= 0: + parser.error("--browser-port 必须大于 0") + if args.max_videos <= 0: + parser.error("--max-videos 必须大于 0") + + try: + target = resolve_cli_target(args.target, browser_port=args.browser_port) + if target.kind == "creator": + total = collect_videos( + user_url=target.value, + max_pages=args.pages, + timeout=args.timeout, + output_dir=Path(args.output_dir), + browser_port=args.browser_port, + auto_scroll=args.pages > 1, + ) + elif target.kind == "recommendation": + total = collect_recommendations( + max_videos=args.max_videos, + timeout=args.timeout, + output_dir=Path(args.output_dir), + browser_port=args.browser_port, + ) + elif target.kind == "single-video": + total = collect_single_video( + target=target, + timeout=args.timeout, + output_dir=Path(args.output_dir), + browser_port=args.browser_port, + ) + else: + raise RuntimeError(f"不支持的目标类型: {target.kind}") + except RuntimeError as exc: + print(f"[ERROR] {exc}") + return 1 + except KeyboardInterrupt: + print("\n[INFO] 用户中断。") + return 130 + + print(f"[INFO] 处理结束,共下载 {total} 个视频。") + return 0 +``` + +- [ ] **Step 4: 运行测试确认通过** + +Run: `python3 -m pytest test_douyin.py::DouyinModuleTests::test_build_parser_has_max_videos_argument test_douyin.py::DouyinModuleTests::test_main_dispatches_recommendation_flow_for_recommendation_url -v` +Expected: PASS + +- [ ] **Step 5: 提交** + +```bash +git add Douyin.py test_douyin.py +git commit -m "feat: add --max-videos argument and wire recommendation flow in main" +``` + +--- + +## Task 7: 运行全部测试并验证 + +- [ ] **Step 1: 运行全部测试** + +Run: `python3 -m pytest test_douyin.py -v` +Expected: 所有测试通过 + +- [ ] **Step 2: 运行主脚本帮助确认** + +Run: `python3 Douyin.py --help` +Expected: 显示包含 `--max-videos` 的帮助信息 + +- [ ] **Step 3: 提交** + +```bash +git add -A +git commit -m "test: verify all tests pass for recommendation crawling feature" +``` + +--- + +## 完成标准 + +1. ✅ `Douyin.py` 支持识别 `https://www.douyin.com/` 为推荐流目标 +2. ✅ `collect_recommendations()` 函数实现滚动加载、最多50条、去重 +3. ✅ 视频文件名包含博主昵称:`[博主名]标题-aweme_id.mp4` +4. ✅ `--max-videos` 命令行参数可用 +5. ✅ 所有现有测试继续通过 +6. ✅ 新增测试覆盖推荐流功能