"""
Listen to Douyin's post-list API through DrissionPage and download videos in bulk.

Example:
    python3 Douyin.py "https://www.douyin.com/user/xxx?from_tab_name=main"

Dependencies:
    pip install requests DrissionPage
"""

from __future__ import annotations

import argparse
import json
import re
import socket
import sys
import time
from pathlib import Path
from typing import Any

# Creator page opened when no URL is given on the command line.
DEFAULT_USER_URL = (
    "https://www.douyin.com/user/"
    "MS4wLjABAAAAx7--dRYA0mPwhwvxNJ-35i6sB8d1Kv4Sj1WmugquqiHK19QYlB18Ikx6cECT1RVO"
    "?from_tab_name=main"
)
# URL fragment handed to the DrissionPage listener to select post-list responses.
LISTEN_TARGET = "web/aweme/post/"
# Characters replaced with "_" when a title is turned into a file name.
INVALID_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|\r\n\t]')


def sanitize_filename(value: str, fallback: str = "untitled") -> str:
    """Return *value* with path-hostile characters replaced by underscores.

    Leading/trailing spaces, dots and underscores are stripped afterwards;
    *fallback* is returned when nothing is left.
    """
    cleaned = INVALID_FILENAME_CHARS.sub("_", value).strip(" ._")
    return cleaned or fallback


def choose_video_url(url_list: list[str]) -> str:
    """Pick the download URL: the first ``douyinvod.com`` entry, else the first entry.

    Raises:
        ValueError: if *url_list* is empty.
    """
    if not url_list:
        raise ValueError("url_list 为空,无法选择视频地址。")
    return next((url for url in url_list if "douyinvod.com" in url), url_list[0])


def build_output_path(
    title: str,
    video_id: str,
    output_dir: Path = Path("video"),
    max_title_bytes: int = 200,
) -> Path:
    """Build ``<output_dir>/<safe title>-<video_id>.mp4``.

    The sanitized title is capped at *max_title_bytes* UTF-8 bytes so that the
    final file name stays below the 255-byte limit most file systems impose;
    without the cap a long caption makes the write fail.
    """
    safe_title = sanitize_filename(title, fallback="untitled")
    encoded = safe_title.encode("utf-8")
    if len(encoded) > max_title_bytes:
        # errors="ignore" drops a multi-byte character cut in half by the slice.
        truncated = encoded[:max(max_title_bytes, 0)].decode("utf-8", errors="ignore")
        safe_title = truncated.strip(" ._") or "untitled"
    return output_dir / f"{safe_title}-{video_id}.mp4"


def build_browser_address(browser_port: int | None) -> str | None:
    """Return ``127.0.0.1:<port>`` for *browser_port*, or ``None`` when no port is given."""
    if browser_port is None:
        return None
    return f"127.0.0.1:{browser_port}"


def ensure_browser_debug_port_ready(browser_port: int) -> None:
    """Verify that something accepts TCP connections on the local debug port.

    Raises:
        RuntimeError: if the port number is outside 1-65535 or the connection
            attempt fails.
    """
    # An out-of-range port can surface as OverflowError instead of OSError,
    # which would bypass the RuntimeError handling in main().
    if not 0 < browser_port <= 65535:
        raise RuntimeError(f"调试端口 {browser_port} 超出有效范围(1-65535)。")

    try:
        with socket.create_connection(("127.0.0.1", browser_port), timeout=2):
            return
    except OSError as exc:
        raise RuntimeError(
            "无法连接到已启动的 Chrome 调试端口。"
            f"请先运行 `./.venv/bin/python login_douyin.py --browser-port {browser_port}`,"
            "并确认 Chrome 仍在运行且端口一致。"
        ) from exc


def extract_aweme_payload(response: Any) -> dict[str, Any]:
    """Return the JSON object carried by a captured response.

    ``response.body`` is used when it is already a dict; otherwise a non-blank
    string in ``response.raw_body`` is parsed as JSON.

    Raises:
        ValueError: if neither attribute yields a dict (``json.JSONDecodeError``
            from malformed JSON is a ``ValueError`` subclass).
    """
    body = getattr(response, "body", None)
    if isinstance(body, dict):
        return body

    raw_body = getattr(response, "raw_body", None)
    if isinstance(raw_body, str) and raw_body.strip():
        payload = json.loads(raw_body)
        if isinstance(payload, dict):
            return payload

    raise ValueError("响应体不是可解析的 JSON 字典。")


def parse_aweme_items(body: Any) -> list[dict[str, str]]:
    """Extract ``title`` / ``video_id`` / ``video_url`` records from a post-list payload.

    Entries that are not dicts, have no usable ``video.play_addr.url_list`` or
    no ``aweme_id`` are skipped, so one malformed entry cannot discard the
    rest of the page.

    Raises:
        ValueError: if *body* is not a dict or has no ``aweme_list`` list.
    """
    if not isinstance(body, dict):
        raise ValueError("接口响应不是字典,无法解析。")

    aweme_list = body.get("aweme_list")
    if not isinstance(aweme_list, list):
        raise ValueError("接口响应中缺少 aweme_list。")

    items: list[dict[str, str]] = []
    for aweme in aweme_list:
        if not isinstance(aweme, dict):
            continue

        # Type-check every level instead of assuming dicts all the way down.
        video = aweme.get("video")
        play_addr = video.get("play_addr") if isinstance(video, dict) else None
        url_list = play_addr.get("url_list") if isinstance(play_addr, dict) else None
        if not isinstance(url_list, list) or not url_list:
            continue

        video_id = str(aweme.get("aweme_id") or "").strip()
        if not video_id:
            continue

        title = str(aweme.get("desc") or "").strip() or "untitled"
        items.append(
            {
                "title": title,
                "video_id": video_id,
                "video_url": choose_video_url([str(url) for url in url_list]),
            }
        )

    return items
def build_headers(referer: str) -> dict[str, str]:
    """Return the HTTP headers sent with every video download request."""
    return {
        "referer": referer,
        "user-agent": (
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/135.0.0.0 Safari/537.36"
        ),
    }


def import_runtime_dependencies() -> tuple[Any, Any, Any]:
    """Import the optional third-party packages lazily.

    Returns:
        ``(requests, ChromiumPage, ChromiumOptions)``.

    Raises:
        SystemExit: with an install hint when a package is missing.
    """
    try:
        import requests
    except ModuleNotFoundError as exc:
        raise SystemExit(
            "缺少 requests,请先执行: python3 -m pip install requests"
        ) from exc

    try:
        from DrissionPage import ChromiumPage
        from DrissionPage import ChromiumOptions
    except ModuleNotFoundError as exc:
        raise SystemExit(
            "缺少 DrissionPage,请先执行: python3 -m pip install DrissionPage"
        ) from exc

    return requests, ChromiumPage, ChromiumOptions


def create_page(chromium_page_cls: Any, chromium_options_cls: Any, browser_port: int | None) -> Any:
    """Create the browser page object.

    Without a port the page class is instantiated with no arguments; with a
    port, an options object is pointed at ``127.0.0.1:<port>`` and marked
    ``existing_only(True)`` before being passed to the page class.
    """
    browser_address = build_browser_address(browser_port)
    if browser_address is None:
        return chromium_page_cls()

    options = chromium_options_cls().set_address(browser_address).existing_only(True)
    return chromium_page_cls(options)


def wait_for_aweme_packet(page: Any, timeout: int) -> Any | None:
    """Wait for the next captured packet; return ``None`` when none arrives.

    A warning is printed and ``None`` returned both when ``page.listen.wait``
    raises and when it returns ``False``. DrissionPage documents ``False`` as
    its timeout result, and callers here test for ``None``.
    """
    try:
        packet = page.listen.wait(timeout=timeout)
    except Exception as exc:  # best-effort boundary: any listener failure just skips the page
        print(f"[WARN] 等待接口数据超时或失败: {exc}")
        return None

    if packet is None or packet is False:
        print(f"[WARN] 等待接口数据超时或失败: {timeout} 秒内未捕获到数据包")
        return None
    return packet


def scroll_to_next_page(page: Any) -> None:
    """Scroll to the bottom of the document, then pause two seconds."""
    page.run_js("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(2)


def download_video(
    requests_module: Any,
    headers: dict[str, str],
    video_url: str,
    output_path: Path,
) -> None:
    """Download *video_url* to *output_path*.

    The body is streamed in chunks to a sibling ``<name>.part`` file and
    renamed into place only after the transfer completes, so a video is never
    held fully in memory and a failed download never leaves a truncated
    ``.mp4`` behind.

    Raises:
        Exception: whatever ``requests_module.get`` / ``raise_for_status`` or
            the file system raise; the partial file is removed first.
    """
    chunk_bytes = 1024 * 1024
    partial_path = output_path.with_name(output_path.name + ".part")

    with requests_module.get(video_url, headers=headers, timeout=60, stream=True) as response:
        response.raise_for_status()
        output_path.parent.mkdir(parents=True, exist_ok=True)
        try:
            with partial_path.open("wb") as handle:
                for chunk in response.iter_content(chunk_size=chunk_bytes):
                    if chunk:  # skip empty chunks
                        handle.write(chunk)
            partial_path.replace(output_path)
        except BaseException:
            # Also covers Ctrl-C: never leave a half-written file behind.
            partial_path.unlink(missing_ok=True)
            raise
def collect_videos(
    user_url: str,
    max_pages: int,
    timeout: int,
    output_dir: Path,
    browser_port: int | None,
) -> int:
    """Open *user_url*, capture up to *max_pages* post-list responses and download their videos.

    Each video id is attempted once per run. Pages that time out or fail to
    parse, and individual failed downloads, are reported and skipped.

    Returns:
        The number of videos saved.

    Raises:
        RuntimeError: from the debug-port check when *browser_port* is given
            but unreachable.
    """
    requests_module, chromium_page_cls, chromium_options_cls = import_runtime_dependencies()
    headers = build_headers(user_url)
    if browser_port is not None:
        ensure_browser_debug_port_ready(browser_port)
    page = create_page(chromium_page_cls, chromium_options_cls, browser_port)
    # Start listening before navigation so the first page's response is captured.
    page.listen.start(LISTEN_TARGET)

    print("[INFO] 正在打开抖音主页。若出现登录或验证码,请先在浏览器窗口里完成。")
    page.get(user_url)
    time.sleep(3)

    downloaded = 0
    seen_ids: set[str] = set()

    for page_number in range(1, max_pages + 1):
        print(f"[INFO] 正在处理第 {page_number} 页")
        packet = wait_for_aweme_packet(page, timeout=timeout)
        if packet is False:
            # DrissionPage's listener returns False on timeout; treat it as
            # "no packet" instead of failing later on `packet.response`.
            print("[WARN] 等待接口数据超时。")
            packet = None
        if packet is None:
            scroll_to_next_page(page)
            continue

        try:
            payload = extract_aweme_payload(packet.response)
            items = parse_aweme_items(payload)
        except Exception as exc:  # one unreadable page must not abort the whole run
            print(f"[WARN] 解析接口数据失败: {exc}")
            scroll_to_next_page(page)
            continue

        if not items:
            print("[WARN] 这一页没有解析到视频。")

        for item in items:
            video_id = item["video_id"]
            if video_id in seen_ids:
                continue

            # Recorded before the attempt: a failed id is not retried this run.
            seen_ids.add(video_id)
            output_path = build_output_path(
                title=item["title"],
                video_id=video_id,
                output_dir=output_dir,
            )

            try:
                download_video(
                    requests_module=requests_module,
                    headers=headers,
                    video_url=item["video_url"],
                    output_path=output_path,
                )
            except Exception as exc:  # keep going with the remaining videos
                print(f"[WARN] 下载失败 {item['video_id']}: {exc}")
                continue

            downloaded += 1
            print(f"[OK] 已保存: {output_path}")

        scroll_to_next_page(page)

    return downloaded


def build_parser() -> argparse.ArgumentParser:
    """Build the command-line parser for the crawler."""
    parser = argparse.ArgumentParser(description="监听抖音作品接口并下载视频")
    parser.add_argument("user_url", nargs="?", default=DEFAULT_USER_URL, help="抖音博主主页 URL")
    parser.add_argument("--pages", type=int, default=10, help="最多抓取多少页,默认 10")
    parser.add_argument("--timeout", type=int, default=10, help="单次等待接口响应秒数,默认 10")
    parser.add_argument(
        "--output-dir",
        default="video",
        help="视频输出目录,默认 video",
    )
    parser.add_argument(
        "--browser-port",
        type=int,
        default=None,
        help="附着到已启动 Chrome 的调试端口,例如 9223;不传则由 DrissionPage 新开浏览器",
    )
    return parser


def main(argv: list[str] | None = None) -> int:
    """Command-line entry point.

    Returns:
        ``0`` on success, ``1`` when ``collect_videos`` raises ``RuntimeError``,
        ``130`` on Ctrl-C. Invalid arguments exit through ``parser.error``.
    """
    parser = build_parser()
    args = parser.parse_args(argv)

    if args.pages <= 0:
        parser.error("--pages 必须大于 0")
    if args.timeout <= 0:
        parser.error("--timeout 必须大于 0")
    if args.browser_port is not None:
        if args.browser_port <= 0:
            parser.error("--browser-port 必须大于 0")
        # Ports above 65535 make the socket layer raise OverflowError, which
        # the RuntimeError handler below would not catch.
        if args.browser_port > 65535:
            parser.error("--browser-port 必须不大于 65535")

    try:
        total = collect_videos(
            user_url=args.user_url,
            max_pages=args.pages,
            timeout=args.timeout,
            output_dir=Path(args.output_dir),
            browser_port=args.browser_port,
        )
    except RuntimeError as exc:
        print(f"[ERROR] {exc}")
        return 1
    except KeyboardInterrupt:
        print("\n[INFO] 用户中断。")
        return 130

    print(f"[INFO] 处理结束,共下载 {total} 个视频。")
    return 0
parser = build_parser() + args = parser.parse_args(argv) + + if args.pages <= 0: + parser.error("--pages 必须大于 0") + if args.timeout <= 0: + parser.error("--timeout 必须大于 0") + if args.browser_port is not None and args.browser_port <= 0: + parser.error("--browser-port 必须大于 0") + + try: + total = collect_videos( + user_url=args.user_url, + max_pages=args.pages, + timeout=args.timeout, + output_dir=Path(args.output_dir), + browser_port=args.browser_port, + ) + except RuntimeError as exc: + print(f"[ERROR] {exc}") + return 1 + except KeyboardInterrupt: + print("\n[INFO] 用户中断。") + return 130 + + print(f"[INFO] 处理结束,共下载 {total} 个视频。") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/docs/superpowers/plans/2026-04-17-douyin-login-entry.md b/docs/superpowers/plans/2026-04-17-douyin-login-entry.md new file mode 100644 index 0000000..003ab81 --- /dev/null +++ b/docs/superpowers/plans/2026-04-17-douyin-login-entry.md @@ -0,0 +1,198 @@ +# Douyin Login Entry Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Add a dedicated browser-login launcher and a clearer attach-port check so the Douyin crawler has a stable two-step workflow: login first, crawl second. + +**Architecture:** Keep browser-launch responsibilities in a new `login_douyin.py` script and keep crawl responsibilities in `Douyin.py`. Add a small socket-based port readiness check before attaching to Chrome, and cover the new behavior with unit tests before implementing production code. 
+ +**Tech Stack:** Python 3, `argparse`, `pathlib`, `subprocess`, `socket`, `unittest` + +--- + +### Task 1: Write failing tests for the new login launcher + +**Files:** +- Create: `login_douyin.py` +- Create: `test_login_douyin.py` + +- [ ] **Step 1: Write the failing test** + +```python +def test_build_login_command_uses_expected_chrome_arguments(self) -> None: + module = importlib.import_module("login_douyin") + command = module.build_login_command( + chrome_path="/Applications/Google Chrome.app/Contents/MacOS/Google Chrome", + profile_dir=Path("/tmp/douyin-profile"), + browser_port=9223, + user_url="https://www.douyin.com/user/example", + ) + self.assertEqual( + command, + [ + "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome", + "--user-data-dir=/tmp/douyin-profile", + "--remote-debugging-port=9223", + "https://www.douyin.com/user/example", + ], + ) +``` + +- [ ] **Step 2: Run test to verify it fails** + +Run: `./.venv/bin/python -m unittest test_login_douyin.py -v` +Expected: FAIL because `login_douyin.py` does not exist yet. + +- [ ] **Step 3: Write minimal implementation** + +Create `login_douyin.py` with: + +- `DEFAULT_CHROME_PATH` +- `DEFAULT_BROWSER_PORT = 9223` +- `DEFAULT_PROFILE_DIR` +- `build_login_command(...)` + +- [ ] **Step 4: Run test to verify it passes** + +Run: `./.venv/bin/python -m unittest test_login_douyin.py -v` +Expected: PASS for the command-building test. + +- [ ] **Step 5: Commit** + +Not applicable here because the workspace is not a git repository. 
+ +### Task 2: Add tests and implementation for launcher validation and user guidance + +**Files:** +- Modify: `login_douyin.py` +- Modify: `test_login_douyin.py` + +- [ ] **Step 1: Write the failing tests** + +Add tests for: + +- parser defaults use `9223` +- `main()` creates the profile dir +- `main()` prints the follow-up crawl command +- `main()` returns non-zero with a readable message when the Chrome path does not exist + +- [ ] **Step 2: Run tests to verify they fail** + +Run: `./.venv/bin/python -m unittest test_login_douyin.py -v` +Expected: FAIL because validation and guidance behavior is not implemented yet. + +- [ ] **Step 3: Write minimal implementation** + +Add to `login_douyin.py`: + +- `build_parser()` +- `launch_browser(...)` +- `main(...)` +- readable `SystemExit`/stderr-style messaging through printed output and return codes + +- [ ] **Step 4: Run tests to verify they pass** + +Run: `./.venv/bin/python -m unittest test_login_douyin.py -v` +Expected: PASS + +- [ ] **Step 5: Commit** + +Not applicable here because the workspace is not a git repository. + +### Task 3: Write failing tests for attach-port readiness in the crawler + +**Files:** +- Modify: `Douyin.py` +- Modify: `test_douyin.py` + +- [ ] **Step 1: Write the failing tests** + +Add tests for: + +- `ensure_browser_debug_port_ready()` returns successfully when a temporary local server is listening +- `ensure_browser_debug_port_ready()` raises a readable `RuntimeError` when the port is unavailable + +- [ ] **Step 2: Run tests to verify they fail** + +Run: `./.venv/bin/python -m unittest test_douyin.py -v` +Expected: FAIL because the function does not exist yet. 
+ +- [ ] **Step 3: Write minimal implementation** + +Add to `Douyin.py`: + +- socket-based readiness helper +- call it in `collect_videos()` before `create_page(...)` when `browser_port` is provided + +- [ ] **Step 4: Run tests to verify they pass** + +Run: `./.venv/bin/python -m unittest test_douyin.py -v` +Expected: PASS + +- [ ] **Step 5: Commit** + +Not applicable here because the workspace is not a git repository. + +### Task 4: Update usage documentation + +**Files:** +- Modify: `抖音爬取视频.md` + +- [ ] **Step 1: Write the failing doc expectation** + +Define the required doc updates: + +- explicit step 1 command for `login_douyin.py` +- explicit step 2 command for `Douyin.py --browser-port 9223` +- short note that login state is kept in the dedicated profile dir + +- [ ] **Step 2: Verify current doc is incomplete** + +Run: `rg -n "login_douyin.py|--browser-port 9223" 抖音爬取视频.md` +Expected: no matches or incomplete guidance + +- [ ] **Step 3: Write minimal documentation update** + +Append a short “推荐流程” section to `抖音爬取视频.md`. + +- [ ] **Step 4: Verify the doc contains the new commands** + +Run: `rg -n "login_douyin.py|--browser-port 9223" 抖音爬取视频.md` +Expected: matches for both commands + +- [ ] **Step 5: Commit** + +Not applicable here because the workspace is not a git repository. 
+ +### Task 5: Run full verification + +**Files:** +- Modify: `Douyin.py` +- Modify: `login_douyin.py` +- Modify: `test_douyin.py` +- Modify: `test_login_douyin.py` +- Modify: `抖音爬取视频.md` + +- [ ] **Step 1: Run the full unit test suite** + +Run: `./.venv/bin/python -m unittest test_douyin.py test_login_douyin.py -v` +Expected: all tests pass + +- [ ] **Step 2: Run the login launcher manually** + +Run: `./.venv/bin/python login_douyin.py --browser-port 9223` +Expected: visible Chrome launches and prints the next crawl command + +- [ ] **Step 3: Run the crawler against the logged-in browser** + +Run: `./.venv/bin/python Douyin.py --pages 1 --timeout 20 --browser-port 9223` +Expected: videos are downloaded to `video/` + +- [ ] **Step 4: Review changed files for scope drift** + +Run: `rg --files` +Expected: only the planned files changed or were added + +- [ ] **Step 5: Commit** + +Not applicable here because the workspace is not a git repository. diff --git a/docs/superpowers/specs/2026-04-17-douyin-login-entry-design.md b/docs/superpowers/specs/2026-04-17-douyin-login-entry-design.md new file mode 100644 index 0000000..5448018 --- /dev/null +++ b/docs/superpowers/specs/2026-04-17-douyin-login-entry-design.md @@ -0,0 +1,191 @@ +# Douyin Login Entry Design + +## Goal + +将当前“手动先开浏览器登录,再让 `Douyin.py` 附着到调试端口抓取”的隐式流程,固化为稳定、明确、可复用的两步式命令行入口。 + +## Current Context + +- 现有抓取实现位于 `Douyin.py`。 +- `Douyin.py` 已支持通过 `--browser-port` 附着到已启动的 Chrome 调试端口。 +- 本次实测已经证明:用户先在可见 Chrome 中登录抖音并通过验证码后,`Douyin.py --browser-port 9223` 可以成功抓到 `web/aweme/post/` 接口并下载视频。 +- 当前缺少一个明确的“登录准备入口”,导致可操作性依赖人工记忆和临时命令。 + +## Requirements + +### Functional + +1. 提供一个独立脚本,用于启动可见 Chrome,并固定: + - 调试端口,默认 `9223` + - 用户数据目录,默认使用一个项目约定路径 + - 打开的初始 URL,默认指向现有抖音博主页 +2. 登录脚本只负责“打开浏览器并提示用户手动登录”,不负责抓取。 +3. `Douyin.py` 继续负责抓取,并保持“附着已有浏览器”的职责边界。 +4. 当 `Douyin.py` 指定了 `--browser-port` 但端口不可连通时,应给出清晰错误,提示先运行登录脚本。 +5. 文档应给出最短可执行流程: + - 第一步:启动浏览器并登录 + - 第二步:运行抓取命令 + +### Non-Functional + +1. 不改变现有抓包、解析、下载的主逻辑。 +2. 
保持现有命令参数兼容。 +3. 入口职责清晰,便于排查“登录问题”和“抓取问题”。 +4. 新增行为应具备可自动化测试的核心单元。 + +## Chosen Approach + +采用双脚本方案: + +- 新增 `login_douyin.py` + - 负责启动可见 Chrome + - 固定 remote debugging port + - 固定 profile 目录 + - 打开目标用户主页 + - 输出明确提示,引导用户完成手动登录和验证码 +- 保留 `Douyin.py` + - 继续承担附着浏览器、监听接口、下载视频的职责 + - 增强附着前检查与报错信息 + +## Rejected Alternatives + +### Alternative 1: 将“启动浏览器”直接并入 `Douyin.py` + +不采用。原因: + +- 会让 `Douyin.py` 同时承担登录准备和抓取职责。 +- 错误定位会变差,用户更难区分是登录失败还是抓取失败。 +- 未来若需要“先登录、稍后再抓”,这种合并入口不灵活。 + +### Alternative 2: 只写 shell 脚本串联所有步骤 + +不采用。原因: + +- 逻辑容易散落在 shell 中,测试性差。 +- 浏览器启动参数、等待逻辑和抓取命令耦合度高。 +- 后续若要扩展默认参数或跨平台兼容,shell 方案维护成本更高。 + +## Proposed CLI UX + +### Step 1: 启动登录浏览器 + +```bash +./.venv/bin/python login_douyin.py +``` + +默认行为: + +- 启动可见 Chrome +- 调试端口为 `9223` +- profile 目录为项目约定的本地路径 +- 打开默认的抖音主页 URL +- 输出“请在浏览器中完成登录/验证码,然后再运行抓取命令” + +可选扩展参数: + +- `--browser-port` +- `--profile-dir` +- `--user-url` +- `--chrome-path` + +### Step 2: 运行抓取 + +```bash +./.venv/bin/python Douyin.py --pages 1 --browser-port 9223 +``` + +## Design Details + +### `login_douyin.py` + +建议拆分为可测试的小函数: + +- `build_login_command(...)` + - 输入 Chrome 路径、profile 目录、端口、URL + - 输出适合 `subprocess.Popen(...)` 的参数列表 +- `launch_browser(...)` + - 调用 `subprocess.Popen(...)` +- `build_parser()` + - 定义 CLI 参数 +- `main()` + - 解析参数 + - 启动浏览器 + - 打印下一步指引 + +### `Douyin.py` + +新增一个显式的端口检查函数,例如: + +- `ensure_browser_debug_port_ready(browser_port: int) -> None` + +行为: + +- 仅当用户传入 `--browser-port` 时执行 +- 尝试连接 `127.0.0.1:` +- 若失败,抛出清晰错误,提示: + - 先启动 `login_douyin.py` + - 确认 Chrome 仍在运行 + - 确认端口与抓取命令一致 + +## Error Handling + +### 登录脚本 + +- Chrome 可执行文件不存在:直接报错并退出。 +- 浏览器启动失败:输出异常原因并返回非零退出码。 +- profile 目录不存在:自动创建。 + +### 抓取脚本 + +- 指定 `--browser-port` 但端口不可达:立即失败,不进入抓取流程。 +- 登录未完成导致页面异常:保留现有抓包等待与警告逻辑。 + +## Testing Strategy + +### Unit Tests + +新增或扩展 `test_douyin.py`,覆盖: + +1. `build_login_command()` 生成的命令参数正确。 +2. 默认调试地址仍为 `127.0.0.1:`。 +3. `ensure_browser_debug_port_ready()` 在端口不可达时抛出可读错误。 +4. 
`ensure_browser_debug_port_ready()` 在端口可达时正常返回。 + +如测试边界过大,可新增 `test_login_douyin.py`。 + +### Manual Verification + +1. 运行 `./.venv/bin/python login_douyin.py` +2. 在打开的 Chrome 中登录抖音并通过验证码 +3. 运行 `./.venv/bin/python Douyin.py --pages 1 --browser-port 9223` +4. 确认 `video/` 下生成新的 mp4 文件 + +## Implementation Boundaries + +本次只做以下改动: + +- 新增登录入口脚本 +- 为抓取入口补充附着前端口检查 +- 更新测试 +- 更新使用文档 + +本次不做以下改动: + +- 不重写抓取主流程 +- 不改成单命令自动等待登录 +- 不引入 Playwright 作为正式运行时依赖 +- 不增加下载调度、断点续传或批量任务管理 + +## Risks + +1. 本机 Chrome 路径可能与预设不同,因此需要保留 `--chrome-path` 覆盖能力。 +2. profile 目录固定后,用户可能重复复用登录态,这是预期行为,但文档需说明。 +3. 若目标端口被其他进程占用,登录脚本需要给出可诊断的失败信息或允许端口覆盖。 + +## Success Criteria + +满足以下条件即视为完成: + +1. 用户可以通过固定命令启动登录浏览器。 +2. 用户登录完成后,可通过固定命令让 `Douyin.py` 成功附着并抓取。 +3. 当浏览器未启动或端口错误时,抓取脚本会给出明确提示,而不是模糊失败。 diff --git a/externaldocs/2026-04-17-douyin-targeted-crawling-requirements.md b/externaldocs/2026-04-17-douyin-targeted-crawling-requirements.md new file mode 100644 index 0000000..f864233 --- /dev/null +++ b/externaldocs/2026-04-17-douyin-targeted-crawling-requirements.md @@ -0,0 +1,215 @@ +# Douyin Targeted Crawling Requirements + +## Goal + +在现有“登录浏览器后附着抓取”的基础上,扩展为支持更明确的目标选择能力,使系统不仅能抓默认博主主页,还能: + +- 指定某个博主主页进行抓取 +- 直接抓当前浏览器里正在查看的博主主页 +- 指定某个单独视频进行抓取 + +本需求文档只定义需求、范围、交互、错误处理和 TDD 约束,不直接定义实现细节代码。 + +## Current Behavior + +当前系统具备以下行为: + +- 通过 `login_douyin.py` 启动可见 Chrome,并开启调试端口 +- 通过 `Douyin.py` 附着到该浏览器 +- 打开某个博主主页 URL +- 监听抖音作品列表接口 `web/aweme/post/` +- 从接口返回的 `aweme_list` 中提取视频地址并下载 + +当前默认目标是一个硬编码博主主页,但也支持在命令行传入另一个博主主页 URL。 + +## Target Modes + +新版本必须同时支持以下三种目标模式: + +### 1. `creator-url` + +用户显式传入某个博主主页 URL,系统以该博主主页为目标进行抓取。 + +### 2. `current-creator` + +系统直接读取当前已附着浏览器正在查看的页面。如果当前页面是博主主页,则以该页面为目标进行抓取。 + +### 3. 
`single-video` + +用户传入单个视频链接或 `aweme_id`,系统仅下载这一条视频,不执行博主作品列表抓取。 + +## Scope Rules + +### Creator Targets + +当目标是博主时,默认只抓“当前页面中已加载、当前可见范围对应的作品”。 + +这意味着: + +- 不默认自动抓完整个博主全部作品 +- 不默认自动多页翻完所有历史内容 +- 不自动替用户点击筛选器或改动页面状态 +- 如果用户已经在页面里手动做了筛选、切换或滚动,则抓取结果以当前页面已加载状态为准 + +### Single Video Target + +当目标是单视频时: + +- 若输入是视频 URL,系统需要先解析出对应作品标识 +- 若输入是 `aweme_id`,系统直接按单作品逻辑抓取 +- 最终只下载一条视频 + +## Recommended User Experience + +保留现有两步模式,不改成自动登录的一体化入口: + +### Step 1 + +先启动登录浏览器: + +```bash +./.venv/bin/python login_douyin.py +``` + +### Step 2 + +登录完成后,再运行抓取命令。 + +未来命令行接口应支持显式目标模式,例如: + +```bash +./.venv/bin/python Douyin.py --mode creator-url --target "https://www.douyin.com/user/..." +./.venv/bin/python Douyin.py --mode current-creator +./.venv/bin/python Douyin.py --mode single-video --target "https://www.douyin.com/video/..." +./.venv/bin/python Douyin.py --mode single-video --target "7619989983668240802" +``` + +上面只是推荐交互形态,具体参数名可在实现设计阶段微调,但必须满足以下原则: + +- 模式必须显式可区分 +- “当前浏览器页面”与“传入 URL”不能混淆 +- 单视频目标与博主目标不能混淆 + +## Functional Requirements + +### Requirement A: Explicit Creator URL Crawling + +系统必须允许用户通过博主主页 URL 指定抓取目标。 + +完成条件: + +- 系统接受有效博主主页 URL +- 浏览器打开或切换到该 URL +- 系统只抓当前页面已加载的作品 + +### Requirement B: Current Browser Creator Crawling + +系统必须允许用户不手输目标 URL,而是直接抓当前浏览器页面对应的博主主页。 + +完成条件: + +- 系统能读取当前浏览器页面 URL +- 若当前页面是博主主页,则正常抓取 +- 若当前页面不是博主主页,则明确报错并退出 + +### Requirement C: Single Video Download + +系统必须允许用户通过单个视频链接或 `aweme_id` 只下载一个视频。 + +完成条件: + +- 支持视频 URL 输入 +- 支持 `aweme_id` 输入 +- 最终只落地一个视频文件 + +### Requirement D: Visible-Only Creator Scope + +当目标是博主时,系统默认只处理当前页面已经加载出来的作品。 + +完成条件: + +- 不自动继续滚动抓到所有历史内容 +- 抓取范围受当前页面加载状态约束 +- 用户先手动筛选、滚动、切换后,再执行抓取时,系统按当前页面状态工作 + +## Error Handling Requirements + +系统必须提供明确错误,不允许模糊失败。 + +### Current Creator Errors + +- 当前页面不是博主主页:报错并退出 +- 当前页面虽然像博主页,但未加载出可用作品数据:提示用户先完成页面操作后重试 + +### Single Video Errors + +- 输入既不是合法视频 URL,也不是合法 `aweme_id`:报错并退出 +- 视频标识无法解析:报错并退出 + +### Browser Attachment Errors + +- 调试端口不可用:提示先运行登录脚本并确认浏览器仍在运行 + +### Creator URL Errors 
+ +- 传入 URL 不是受支持的抖音博主主页:报错并退出 + +## Non-Goals + +本次需求明确不包含以下内容: + +- 任意网页抓取 +- 非抖音站点抓取 +- 自动替用户点击页面筛选器 +- 自动抓完整个博主全部历史作品 +- 自动搜索博主 +- 自动在抖音站内执行复杂导航流程 + +## Terminology + +### `aweme` + +抖音接口中的作品对象,可以理解为一条内容或一个视频作品实体。 + +### `aweme_id` + +抖音作品的唯一标识。 + +### `current visible videos` + +指当前页面已经加载出来,并能够通过当前页面对应接口响应获得的作品集合,而不是博主的全量历史作品。 + +## TDD Requirements + +本需求后续实现必须使用 TDD。 + +### Mandatory Process + +1. 先写失败测试 +2. 先验证测试是因为功能未实现而失败 +3. 再写最小实现让测试通过 +4. 最后再做必要重构 + +### Required Test Areas + +至少覆盖以下测试: + +- `creator-url` 模式下,合法博主主页 URL 能被识别并生成正确抓取目标 +- `current-creator` 模式下,当前页面是博主主页时可抓取 +- `current-creator` 模式下,当前页面不是博主主页时明确报错 +- `single-video` 模式支持视频 URL +- `single-video` 模式支持 `aweme_id` +- 创作者抓取默认只处理当前已加载内容,不自动继续翻页 +- 目标模式错误时的报错路径 +- 浏览器端口不可用时的报错路径 + +## Acceptance Criteria + +需求完成后,应满足以下验收标准: + +1. 用户可以显式指定博主主页 URL 抓取 +2. 用户可以直接抓当前浏览器中的博主主页 +3. 用户可以指定单个视频 URL 或 `aweme_id` 下载单条视频 +4. 当目标是博主时,默认只抓当前页面已加载作品 +5. 关键失败场景都有明确报错 +6. 实现过程遵循 TDD,并有对应自动化测试覆盖 diff --git a/image-1.png b/image-1.png new file mode 100644 index 0000000..197440b Binary files /dev/null and b/image-1.png differ diff --git a/image-2.png b/image-2.png new file mode 100644 index 0000000..76ace3a Binary files /dev/null and b/image-2.png differ diff --git a/image.png b/image.png new file mode 100644 index 0000000..030ab6c Binary files /dev/null and b/image.png differ diff --git a/login_douyin.py b/login_douyin.py new file mode 100644 index 0000000..b42b7a2 --- /dev/null +++ b/login_douyin.py @@ -0,0 +1,122 @@ +from __future__ import annotations + +import argparse +import socket +import subprocess +import sys +import time +from pathlib import Path + +from Douyin import DEFAULT_USER_URL + +DEFAULT_CHROME_PATH = "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome" +DEFAULT_BROWSER_PORT = 9223 +DEFAULT_PROFILE_DIR = Path(".douyin-chrome-profile") + + +def derive_chrome_app_path(chrome_path: str) -> str: + marker = ".app/" + if marker not in chrome_path: + return chrome_path + prefix, _ = 
def derive_chrome_app_path(chrome_path: str) -> str:
    """Return the ``.app`` bundle path containing *chrome_path*.

    Everything after the first ``.app/`` is dropped; a path without that
    marker is returned unchanged.
    """
    prefix, separator, _ = chrome_path.partition(".app/")
    if not separator:
        return chrome_path
    return f"{prefix}.app"


def build_login_command(
    chrome_path: str,
    profile_dir: Path,
    browser_port: int,
    user_url: str,
) -> list[str]:
    """Build the ``open -na <bundle> --args ...`` argument list that starts Chrome.

    Chrome receives the profile directory, the remote-debugging port and the
    URL to open.
    """
    app_path = derive_chrome_app_path(chrome_path)
    return [
        "open",
        "-na",
        app_path,
        "--args",
        f"--user-data-dir={profile_dir}",
        f"--remote-debugging-port={browser_port}",
        user_url,
    ]


def build_parser() -> argparse.ArgumentParser:
    """Build the command-line parser for the login launcher."""
    parser = argparse.ArgumentParser(description="启动可见 Chrome,供抖音手动登录后附着抓取")
    parser.add_argument("--chrome-path", default=DEFAULT_CHROME_PATH, help="Chrome 可执行文件路径")
    parser.add_argument(
        "--profile-dir",
        default=str(DEFAULT_PROFILE_DIR),
        help="Chrome 用户数据目录,默认复用项目内固定目录",
    )
    parser.add_argument(
        "--browser-port",
        type=int,
        default=DEFAULT_BROWSER_PORT,
        help="Chrome 调试端口,默认 9223",
    )
    parser.add_argument("--user-url", default=DEFAULT_USER_URL, help="启动后打开的抖音主页 URL")
    return parser


def launch_browser(command: list[str]) -> subprocess.Popen[bytes]:
    """Start *command* without waiting for it and return the process handle.

    The process is opened without text mode, so its streams are byte streams.
    """
    return subprocess.Popen(command)


def wait_for_browser_debug_port(
    browser_port: int,
    timeout_seconds: float = 15.0,
    interval_seconds: float = 0.25,
) -> None:
    """Poll ``127.0.0.1:<browser_port>`` until a TCP connection succeeds.

    Raises:
        RuntimeError: if no connection succeeds before *timeout_seconds* elapse.
    """
    deadline = time.monotonic() + timeout_seconds
    while time.monotonic() < deadline:
        try:
            with socket.create_connection(("127.0.0.1", browser_port), timeout=1):
                return
        except OSError:
            # Not listening yet; retry after a short pause.
            time.sleep(interval_seconds)

    raise RuntimeError(
        f"Chrome 已启动命令,但调试端口 {browser_port} 在限定时间内未就绪。"
    )


def main(argv: list[str] | None = None) -> int:
    """Launch Chrome for manual login and print the follow-up crawl command.

    Returns:
        ``0`` once the debug port accepts connections; ``1`` when the Chrome
        path is missing, the launch fails, or the port never becomes ready.
        Invalid arguments exit through ``parser.error``.
    """
    parser = build_parser()
    args = parser.parse_args(argv)

    if args.browser_port <= 0:
        parser.error("--browser-port 必须大于 0")
    # Ports above 65535 make the socket layer raise OverflowError, which the
    # polling loop's `except OSError` would not catch.
    if args.browser_port > 65535:
        parser.error("--browser-port 必须不大于 65535")

    chrome_path = Path(args.chrome_path)
    if not chrome_path.exists():
        print(f"[ERROR] Chrome 可执行文件不存在: {chrome_path}")
        return 1

    # Resolved so the path handed to Chrome does not depend on its working directory.
    profile_dir = Path(args.profile_dir).resolve()
    profile_dir.mkdir(parents=True, exist_ok=True)
    command = build_login_command(
        chrome_path=str(chrome_path),
        profile_dir=profile_dir,
        browser_port=args.browser_port,
        user_url=args.user_url,
    )

    try:
        launch_browser(command)
    except OSError as exc:
        print(f"[ERROR] 启动 Chrome 失败: {exc}")
        return 1

    try:
        wait_for_browser_debug_port(args.browser_port)
    except RuntimeError as exc:
        print(f"[ERROR] {exc}")
        return 1

    print("[INFO] Chrome 已启动。请在打开的浏览器中完成抖音登录和验证码。")
    print(f"[INFO] 登录完成后执行: ./.venv/bin/python Douyin.py --browser-port {args.browser_port}")
    return 0
import importlib
import unittest
from unittest import mock


class FakeResponse:
    """Minimal stand-in exposing the two attributes ``extract_aweme_payload`` reads."""

    def __init__(self, body, raw_body):
        self.body = body
        self.raw_body = raw_body


class DouyinModuleTests(unittest.TestCase):
    """Unit tests for the pure helpers in ``Douyin.py``."""

    def test_module_can_import_without_optional_runtime_dependencies(self) -> None:
        module = importlib.import_module("Douyin")
        self.assertIsNotNone(module)

    def test_sanitize_filename_removes_invalid_characters(self) -> None:
        module = importlib.import_module("Douyin")
        # The input must contain "<f>": the expected value has "_f_", which
        # only results from replacing "<" and ">" around the "f".
        self.assertEqual(
            module.sanitize_filename('a/b:c*?d"e<f>g|h\n'),
            "a_b_c__d_e_f_g_h",
        )

    def test_choose_video_url_prefers_douyinvod_link(self) -> None:
        module = importlib.import_module("Douyin")
        urls = [
            "https://www.douyin.com/aweme/v1/play/?video_id=123",
            "https://v11-weba.douyinvod.com/example/video.mp4",
            "https://v26-web.douyinvod.com/example/video.mp4",
        ]
        self.assertEqual(
            module.choose_video_url(urls),
            "https://v11-weba.douyinvod.com/example/video.mp4",
        )

    def test_build_output_path_uses_video_directory(self) -> None:
        module = importlib.import_module("Douyin")
        output_path = module.build_output_path("测试标题", "123456")
        self.assertEqual(output_path.as_posix(), "video/测试标题-123456.mp4")

    def test_extract_aweme_payload_uses_dict_body(self) -> None:
        module = importlib.import_module("Douyin")
        response = FakeResponse({"aweme_list": []}, "")
        self.assertEqual(module.extract_aweme_payload(response), {"aweme_list": []})

    def test_extract_aweme_payload_falls_back_to_raw_json(self) -> None:
        module = importlib.import_module("Douyin")
        response = FakeResponse("", '{"aweme_list": [{"aweme_id": "1"}]}')
        self.assertEqual(
            module.extract_aweme_payload(response),
            {"aweme_list": [{"aweme_id": "1"}]},
        )

    def test_build_browser_address_from_port(self) -> None:
        module = importlib.import_module("Douyin")
        self.assertEqual(module.build_browser_address(9223), "127.0.0.1:9223")
        self.assertIsNone(module.build_browser_address(None))

    def test_ensure_browser_debug_port_ready_accepts_open_port(self) -> None:
        module = importlib.import_module("Douyin")
        # Context-manager double standing in for the connected socket.
        connection = mock.MagicMock()
        connection.__enter__.return_value = connection
        connection.__exit__.return_value = False
        with mock.patch.object(module.socket, "create_connection", return_value=connection) as mocked_connect:
            module.ensure_browser_debug_port_ready(9223)
        mocked_connect.assert_called_once()

    def test_ensure_browser_debug_port_ready_rejects_closed_port(self) -> None:
        module = importlib.import_module("Douyin")
        with mock.patch.object(module.socket, "create_connection", side_effect=OSError("boom")):
            with self.assertRaisesRegex(RuntimeError, "login_douyin.py"):
                module.ensure_browser_debug_port_ready(9223)
import importlib
import io
import tempfile
import unittest
from contextlib import redirect_stdout
from pathlib import Path
from unittest import mock


class LoginDouyinModuleTests(unittest.TestCase):
    """Unit tests for the ``login_douyin.py`` launcher."""

    @staticmethod
    def _make_fake_chrome(temp_dir: str) -> Path:
        """Create an empty file laid out like a Chrome binary inside an ``.app`` bundle.

        ``main()`` only checks that the path exists, so this keeps the tests
        independent of a real Chrome installation.
        """
        fake_chrome = Path(temp_dir) / "Fake Chrome.app" / "Contents" / "MacOS" / "Fake Chrome"
        fake_chrome.parent.mkdir(parents=True)
        fake_chrome.touch()
        return fake_chrome

    def test_build_login_command_uses_expected_chrome_arguments(self) -> None:
        module = importlib.import_module("login_douyin")
        command = module.build_login_command(
            chrome_path="/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
            profile_dir=Path("/tmp/douyin-profile"),
            browser_port=9223,
            user_url="https://www.douyin.com/user/example",
        )
        self.assertEqual(
            command,
            [
                "open",
                "-na",
                "/Applications/Google Chrome.app",
                "--args",
                "--user-data-dir=/tmp/douyin-profile",
                "--remote-debugging-port=9223",
                "https://www.douyin.com/user/example",
            ],
        )

    def test_build_parser_uses_expected_defaults(self) -> None:
        module = importlib.import_module("login_douyin")
        args = module.build_parser().parse_args([])
        self.assertEqual(args.browser_port, 9223)
        self.assertEqual(args.chrome_path, module.DEFAULT_CHROME_PATH)
        self.assertEqual(args.user_url, module.DEFAULT_USER_URL)

    def test_main_creates_profile_dir_and_prints_next_step(self) -> None:
        module = importlib.import_module("login_douyin")
        with tempfile.TemporaryDirectory() as temp_dir:
            profile_dir = Path(temp_dir) / "profile"
            fake_chrome = self._make_fake_chrome(temp_dir)
            stdout = io.StringIO()
            with redirect_stdout(stdout):
                with mock.patch.object(module, "launch_browser") as mocked_launch:
                    with mock.patch.object(module, "wait_for_browser_debug_port") as mocked_wait:
                        exit_code = module.main(
                            [
                                "--chrome-path",
                                str(fake_chrome),
                                "--profile-dir",
                                str(profile_dir),
                                "--browser-port",
                                "9333",
                            ]
                        )
            self.assertEqual(exit_code, 0)
            self.assertTrue(profile_dir.exists())
            mocked_launch.assert_called_once()
            mocked_wait.assert_called_once_with(9333)
            self.assertIn("9333", stdout.getvalue())
            self.assertIn("./.venv/bin/python Douyin.py --browser-port 9333", stdout.getvalue())

    def test_main_returns_error_when_chrome_path_missing(self) -> None:
        module = importlib.import_module("login_douyin")
        stdout = io.StringIO()
        with redirect_stdout(stdout):
            exit_code = module.main(["--chrome-path", "/tmp/does-not-exist-chrome"])
        self.assertEqual(exit_code, 1)
        self.assertIn("Chrome", stdout.getvalue())
        self.assertIn("不存在", stdout.getvalue())

    def test_main_returns_error_when_debug_port_never_becomes_ready(self) -> None:
        module = importlib.import_module("login_douyin")
        with tempfile.TemporaryDirectory() as temp_dir:
            profile_dir = Path(temp_dir) / "profile"
            fake_chrome = self._make_fake_chrome(temp_dir)
            stdout = io.StringIO()
            with redirect_stdout(stdout):
                with mock.patch.object(module, "launch_browser"):
                    with mock.patch.object(
                        module,
                        "wait_for_browser_debug_port",
                        side_effect=RuntimeError("端口未就绪"),
                    ):
                        exit_code = module.main(
                            [
                                "--chrome-path",
                                str(fake_chrome),
                                "--profile-dir",
                                str(profile_dir),
                            ]
                        )
            self.assertEqual(exit_code, 1)
            self.assertIn("端口未就绪", stdout.getvalue())
self.assertIn("不存在", stdout.getvalue()) + + def test_main_returns_error_when_debug_port_never_becomes_ready(self) -> None: + module = importlib.import_module("login_douyin") + with tempfile.TemporaryDirectory() as temp_dir: + profile_dir = Path(temp_dir) / "profile" + stdout = io.StringIO() + with redirect_stdout(stdout): + with mock.patch.object(module, "launch_browser"): + with mock.patch.object( + module, + "wait_for_browser_debug_port", + side_effect=RuntimeError("端口未就绪"), + ): + exit_code = module.main( + [ + "--chrome-path", + "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome", + "--profile-dir", + str(profile_dir), + ] + ) + self.assertEqual(exit_code, 1) + self.assertIn("端口未就绪", stdout.getvalue()) + + +if __name__ == "__main__": + unittest.main() diff --git a/抖音爬取视频.md b/抖音爬取视频.md new file mode 100644 index 0000000..ed7b49d --- /dev/null +++ b/抖音爬取视频.md @@ -0,0 +1,45 @@ +打开一个博主主页,f12打开控制台: +鼠标挪到一个视频上面会开始播放,然后出现一个mp4媒体: +![alt text](image.png) +![alt text](image-1.png) +复制这个媒体的rc +NjxpM2hkZWUzNTszNGlpOkBpajdrOHY5cmxqOjMzNGkzM0A1MF41MC8wNmMxMjM2YWAuYSNrbmVqMmRrYWNhLS1kLTBzcw== +粘贴到搜索栏,然后command+R刷新页面,然后搜索就找到了链接: +![alt text](image-2.png) +一般是三条 +``` +"url_list": [ + "https://v26-web.douyinvod.com/71be40af2c168460799af3a778572914/69e1f23b/video/tos/cn/tos-cn-ve-15/oERIAIyg72xQTAKefNevmg5PYSzGBCLVGxxBM0/?a=6383&ch=10010&cr=3&dr=0&lr=all&cd=0%7C0%7C0%7C3&cv=1&br=2657&bt=2657&cs=0&ds=4&ft=pEaFx4hZffPdOW~-N12NvAq-antLjrKiGgZnRkaEkVUpUjVhWL6&mime_type=video_mp4&qs=0&rc=ZDZnZjwzNTgzaWk5ZzloNUBpajdrOHY5cmxqOjMzNGkzM0AxNDIvY2FeNmMxXjQ0MWFiYSNrbmVqMmRrYWNhLS1kLTBzcw%3D%3D&btag=80000e00010000&cquery=100z_100o_101r_100B_100x&dy_q=1776404465&feature_id=37f92ebd2877ae8e7eba995d406c5150&l=202604171341058889DECB07230D0B9B8B", + 
"https://v11-weba.douyinvod.com/0cc3b59178e5b0065b19ddb2587010c1/69e1f23b/video/tos/cn/tos-cn-ve-15/oERIAIyg72xQTAKefNevmg5PYSzGBCLVGxxBM0/?a=6383&ch=10010&cr=3&dr=0&lr=all&cd=0%7C0%7C0%7C3&cv=1&br=2657&bt=2657&cs=0&ds=4&ft=pEaFx4hZffPdOW~-N12NvAq-antLjrKiGgZnRkaEkVUpUjVhWL6&mime_type=video_mp4&qs=0&rc=ZDZnZjwzNTgzaWk5ZzloNUBpajdrOHY5cmxqOjMzNGkzM0AxNDIvY2FeNmMxXjQ0MWFiYSNrbmVqMmRrYWNhLS1kLTBzcw%3D%3D&btag=80000e00010000&cquery=100o_101r_100B_100x_100z&dy_q=1776404465&feature_id=37f92ebd2877ae8e7eba995d406c5150&l=202604171341058889DECB07230D0B9B8B", + "https://www.douyin.com/aweme/v1/play/?video_id=v0300fg10000d7fgdinog65sm5hhq4ng&line=0&file_id=e2de467d76af4ff095a2d0f92c5d734a&sign=66b2ab7aba34fd66cf343c7ea1aa9994&is_play_url=1&source=PackSourceEnum_PUBLISH" + ], +``` +复制这个中间的v11的到浏览器就有了: +``` +https://v11-weba.douyinvod.com/0cc3b59178e5b0065b19ddb2587010c1/69e1f23b/video/tos/cn/tos-cn-ve-15/oERIAIyg72xQTAKefNevmg5PYSzGBCLVGxxBM0/?a=6383&ch=10010&cr=3&dr=0&lr=all&cd=0%7C0%7C0%7C3&cv=1&br=2657&bt=2657&cs=0&ds=4&ft=pEaFx4hZffPdOW~-N12NvAq-antLjrKiGgZnRkaEkVUpUjVhWL6&mime_type=video_mp4&qs=0&rc=ZDZnZjwzNTgzaWk5ZzloNUBpajdrOHY5cmxqOjMzNGkzM0AxNDIvY2FeNmMxXjQ0MWFiYSNrbmVqMmRrYWNhLS1kLTBzcw%3D%3D&btag=80000e00010000&cquery=100o_101r_100B_100x_100z&dy_q=1776404465&feature_id=37f92ebd2877ae8e7eba995d406c5150&l=202604171341058889DECB07230D0B9B8B +发现可以直接打开video的页面并且可以下载 +``` + +## 推荐流程 + +把“手动登录”和“附着抓取”分成两步走更稳: + +### 1. 先启动登录浏览器 + +```bash +./.venv/bin/python login_douyin.py +``` + +- 默认会启动一个可见的 Chrome +- 默认调试端口是 `9223` +- 登录态会保存在项目目录下的 `.douyin-chrome-profile/` + +在这个浏览器里手动完成抖音登录和验证码。 + +### 2. 再附着已登录浏览器开始抓取 + +```bash +./.venv/bin/python Douyin.py --pages 1 --browser-port 9223 +``` + +如果你改了登录脚本里的端口,抓取时也要传同一个 `--browser-port`。