feat: initial douyin crawler

This commit is contained in:
wangshaoqing 2026-04-17 16:55:11 +08:00
commit d910d6f6b9
13 changed files with 1274 additions and 0 deletions

8
.gitignore vendored Normal file
View File

@ -0,0 +1,8 @@
.DS_Store
.douyin-chrome-profile/
.venv/
__pycache__/
*.pyc
video/
video_p2/
.pytest_cache/

21
.vscode/settings.json vendored Normal file
View File

@ -0,0 +1,21 @@
{
"files.exclude": {
"**/.venv": true,
"**/__pycache__": true,
"video_p2": true
},
"files.watcherExclude": {
"**/.venv/**": true,
"**/__pycache__/**": true,
"video/**": true,
"video_p2/**": true
},
"search.exclude": {
"**/.venv": true,
"**/__pycache__": true,
"video": true,
"video_p2": true
},
"terminal.integrated.enablePersistentSessions": false,
"terminal.integrated.persistentSessionReviveProcess": "never"
}

300
Douyin.py Normal file
View File

@ -0,0 +1,300 @@
"""
使用 DrissionPage 监听抖音作品列表接口并批量下载视频
运行示例:
python3 Douyin.py "https://www.douyin.com/user/xxx?from_tab_name=main"
依赖:
pip install requests DrissionPage
"""
from __future__ import annotations
import argparse
import json
import re
import socket
import sys
import time
from pathlib import Path
from typing import Any
# Creator profile URL used when no ``user_url`` is given on the command line.
DEFAULT_USER_URL = (
"https://www.douyin.com/user/"
"MS4wLjABAAAAx7--dRYA0mPwhwvxNJ-35i6sB8d1Kv4Sj1WmugquqiHK19QYlB18Ikx6cECT1RVO"
"?from_tab_name=main"
)
# Target string handed to ``page.listen.start`` to select the post-list API traffic.
LISTEN_TARGET = "web/aweme/post/"
# Characters that are replaced with "_" before a title is used in a file name.
INVALID_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|\r\n\t]')


def sanitize_filename(value: str, fallback: str = "untitled", max_bytes: int = 200) -> str:
    """Turn an arbitrary title into a string that is safe to use in a file name.

    Every character matched by ``INVALID_FILENAME_CHARS`` becomes ``_`` and
    leading/trailing spaces, dots and underscores are stripped.  The result is
    additionally capped at ``max_bytes`` UTF-8 bytes so that a long video
    description cannot push the final file name past the per-component length
    limit most filesystems enforce (commonly 255 bytes).

    Args:
        value: Raw title text.
        fallback: Returned when nothing usable is left after cleaning.
        max_bytes: Upper bound for the UTF-8 size of the result; values below
            zero are treated as zero.

    Returns:
        The cleaned (and possibly truncated) name, or ``fallback``.
    """
    cleaned = INVALID_FILENAME_CHARS.sub("_", value).strip(" ._")
    # "surrogatepass" keeps lone surrogates (possible in JSON-decoded text)
    # from raising while we only measure the encoded size.
    encoded = cleaned.encode("utf-8", errors="surrogatepass")
    limit = max(max_bytes, 0)
    if len(encoded) > limit:
        # Cut on the byte level, then drop a possibly half-cut trailing
        # character instead of failing to decode it.
        cleaned = encoded[:limit].decode("utf-8", errors="ignore").strip(" ._")
    return cleaned or fallback
def choose_video_url(url_list: list[str]) -> str:
    """Pick the download URL to use from a play-address candidate list.

    The first candidate containing ``douyinvod.com`` wins; when none matches,
    the first entry of the list is used.

    Raises:
        ValueError: if ``url_list`` is empty.
    """
    preferred = next(
        (candidate for candidate in url_list if "douyinvod.com" in candidate),
        None,
    )
    if preferred is not None:
        return preferred
    if not url_list:
        raise ValueError("url_list 为空，无法选择视频地址。")
    return url_list[0]
def build_output_path(title: str, video_id: str, output_dir: Path = Path("video")) -> Path:
    """Return ``<output_dir>/<sanitized title>-<video_id>.mp4``.

    The title is passed through ``sanitize_filename`` (fallback ``untitled``);
    ``video_id`` is used verbatim.
    """
    stem = sanitize_filename(title, fallback="untitled")
    file_name = f"{stem}-{video_id}.mp4"
    return output_dir.joinpath(file_name)
def build_browser_address(browser_port: int | None) -> str | None:
    """Format a loopback ``host:port`` address for the given debug port.

    Returns ``None`` unchanged when no port was supplied.
    """
    return None if browser_port is None else f"127.0.0.1:{browser_port}"
def ensure_browser_debug_port_ready(browser_port: int) -> None:
    """Check that a TCP connection to ``127.0.0.1:browser_port`` can be opened.

    The probe connection (2 second timeout) is closed again immediately.

    Raises:
        RuntimeError: if the connection attempt fails with ``OSError``; the
            message tells the user to start ``login_douyin.py`` first.
    """
    endpoint = ("127.0.0.1", browser_port)
    try:
        probe = socket.create_connection(endpoint, timeout=2)
        with probe:
            return
    except OSError as connect_error:
        raise RuntimeError(
            "无法连接到已启动的 Chrome 调试端口。"
            f"请先运行 `./.venv/bin/python login_douyin.py --browser-port {browser_port}`"
            "并确认 Chrome 仍在运行且端口一致。"
        ) from connect_error
def extract_aweme_payload(response: Any) -> dict[str, Any]:
    """Return the JSON object carried by a captured response.

    ``response.body`` is used directly when it is already a dict.  Otherwise
    a non-blank string in ``response.raw_body`` is parsed with ``json.loads``.

    Raises:
        ValueError: if neither attribute yields a dict (this includes JSON
            decoding errors raised by ``json.loads``).
    """
    parsed_body = getattr(response, "body", None)
    if isinstance(parsed_body, dict):
        return parsed_body

    text = getattr(response, "raw_body", None)
    has_text = isinstance(text, str) and bool(text.strip())
    decoded = json.loads(text) if has_text else None
    if not isinstance(decoded, dict):
        raise ValueError("响应体不是可解析的 JSON 字典。")
    return decoded
def parse_aweme_items(body: Any) -> list[dict[str, str]]:
    """Extract downloadable videos from a post-list API payload.

    Each returned dict has the keys ``title`` (from ``desc``, defaulting to
    ``untitled``), ``video_id`` (from ``aweme_id``) and ``video_url`` (chosen
    by ``choose_video_url`` from ``video.play_addr.url_list``).

    Entries that are malformed — not a dict, ``video``/``play_addr`` not a
    dict, ``url_list`` not a list or without any non-empty string, or a
    missing ``aweme_id`` — are skipped individually, so one bad entry does
    not discard the rest of the page.

    Raises:
        ValueError: if ``body`` is not a dict or has no list under
            ``aweme_list``.
    """
    if not isinstance(body, dict):
        raise ValueError("接口响应不是字典，无法解析。")
    aweme_list = body.get("aweme_list")
    if not isinstance(aweme_list, list):
        raise ValueError("接口响应中缺少 aweme_list。")

    items: list[dict[str, str]] = []
    for aweme in aweme_list:
        if not isinstance(aweme, dict):
            continue
        # Walk the nested structure defensively: any level may be missing or
        # carry an unexpected type.
        video = aweme.get("video")
        play_addr = video.get("play_addr") if isinstance(video, dict) else None
        url_list = play_addr.get("url_list") if isinstance(play_addr, dict) else None
        if not isinstance(url_list, list):
            continue
        # Keep real URLs only; str(None) would otherwise become "None".
        urls = [url for url in url_list if isinstance(url, str) and url]
        if not urls:
            continue
        video_id = str(aweme.get("aweme_id") or "").strip()
        if not video_id:
            continue
        title = str(aweme.get("desc") or "").strip() or "untitled"
        items.append(
            {
                "title": title,
                "video_id": video_id,
                "video_url": choose_video_url(urls),
            }
        )
    return items
def build_headers(referer: str) -> dict[str, str]:
    """Build the HTTP headers sent with each video download request.

    The result carries the given ``referer`` and a fixed desktop Chrome
    ``user-agent`` string.
    """
    headers = {"referer": referer}
    headers["user-agent"] = (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/135.0.0.0 Safari/537.36"
    )
    return headers
def import_runtime_dependencies() -> tuple[Any, Any, Any]:
    """Import the third-party packages lazily and return them.

    Importing here rather than at module level keeps the module importable
    when the packages are not installed.

    Returns:
        ``(requests, ChromiumPage, ChromiumOptions)``.

    Raises:
        SystemExit: with an install hint when ``requests`` or ``DrissionPage``
            cannot be found.
    """
    try:
        import requests as requests_module
    except ModuleNotFoundError as missing:
        hint = "缺少 requests请先执行: python3 -m pip install requests"
        raise SystemExit(hint) from missing

    try:
        from DrissionPage import ChromiumPage, ChromiumOptions
    except ModuleNotFoundError as missing:
        hint = "缺少 DrissionPage请先执行: python3 -m pip install DrissionPage"
        raise SystemExit(hint) from missing

    return requests_module, ChromiumPage, ChromiumOptions
def create_page(chromium_page_cls: Any, chromium_options_cls: Any, browser_port: int | None) -> Any:
    """Instantiate the browser page object.

    Without a port the page class is called with no arguments.  With a port,
    an options object is configured with the loopback address and
    ``existing_only(True)`` and passed to the page class.
    """
    address = build_browser_address(browser_port)
    if address is not None:
        attach_options = chromium_options_cls()
        attach_options = attach_options.set_address(address)
        attach_options = attach_options.existing_only(True)
        return chromium_page_cls(attach_options)
    return chromium_page_cls()
def wait_for_aweme_packet(page: Any, timeout: int) -> Any | None:
    """Wait for the next captured packet and return it, or ``None``.

    Args:
        page: Object exposing ``listen.wait(timeout=...)``.
        timeout: Seconds to wait for a packet.

    Returns:
        The captured packet, or ``None`` when waiting raised or produced no
        packet.  A warning is printed in both failure cases.
    """
    try:
        packet = page.listen.wait(timeout=timeout)
    except Exception as exc:
        print(f"[WARN] 等待接口数据超时或失败: {exc}")
        return None
    # The listener reports a timeout by returning False rather than raising;
    # normalise that so callers can rely on an ``is None`` check instead of
    # failing later on ``False.response``.
    if packet is None or packet is False:
        print(f"[WARN] 等待接口数据超时或失败: {timeout} 秒内未捕获到接口响应")
        return None
    return packet
def scroll_to_next_page(page: Any) -> None:
    """Scroll the page to the bottom of the document, then pause two seconds."""
    jump_to_bottom = "window.scrollTo(0, document.body.scrollHeight);"
    page.run_js(jump_to_bottom)
    time.sleep(2)
def download_video(
    requests_module: Any,
    headers: dict[str, str],
    video_url: str,
    output_path: Path,
) -> None:
    """Download ``video_url`` to ``output_path``.

    The body is streamed in chunks instead of being held in memory as a
    whole, and it is first written to a sibling ``<name>.part`` file that is
    renamed onto ``output_path`` only after the transfer finished.  A failed
    or interrupted download therefore never leaves a truncated ``.mp4``.

    Args:
        requests_module: The ``requests`` module (or a compatible object
            whose ``get`` returns a context-manager response).
        headers: HTTP headers for the request.
        video_url: Source URL.
        output_path: Final destination; parent directories are created.

    Raises:
        Whatever ``get``, ``raise_for_status`` or the file operations raise.
    """
    partial_path = output_path.with_name(output_path.name + ".part")
    try:
        with requests_module.get(
            video_url, headers=headers, timeout=60, stream=True
        ) as response:
            response.raise_for_status()
            output_path.parent.mkdir(parents=True, exist_ok=True)
            with partial_path.open("wb") as handle:
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    if chunk:  # skip keep-alive chunks
                        handle.write(chunk)
        partial_path.replace(output_path)
    finally:
        # After a successful rename the partial file no longer exists; after
        # a failure this removes the incomplete data.
        if partial_path.exists():
            partial_path.unlink()
def collect_videos(
    user_url: str,
    max_pages: int,
    timeout: int,
    output_dir: Path,
    browser_port: int | None,
) -> int:
    """Open a creator page, capture post-list responses and download videos.

    For each of up to ``max_pages`` rounds the function waits for one
    captured packet, parses it, downloads every video id not handled before
    and scrolls to the bottom of the page to trigger the next batch.  Rounds
    with no packet or an unparsable packet only scroll.

    Args:
        user_url: Creator profile URL; also sent as the download referer.
        max_pages: Number of wait/scroll rounds.
        timeout: Seconds to wait for a packet in each round.
        output_dir: Directory the videos are written to.
        browser_port: Debug port of an already running Chrome, or ``None``.

    Returns:
        Number of videos downloaded successfully.

    Raises:
        RuntimeError: from the port check when ``browser_port`` is given but
            not reachable.
    """
    requests_module, chromium_page_cls, chromium_options_cls = import_runtime_dependencies()
    headers = build_headers(user_url)
    if browser_port is not None:
        ensure_browser_debug_port_ready(browser_port)

    page = create_page(chromium_page_cls, chromium_options_cls, browser_port)
    page.listen.start(LISTEN_TARGET)
    print("[INFO] 正在打开抖音主页。若出现登录或验证码，请先在浏览器窗口里完成。")
    page.get(user_url)
    time.sleep(3)

    saved_count = 0
    handled_ids: set[str] = set()
    page_number = 0
    while page_number < max_pages:
        page_number += 1
        print(f"[INFO] 正在处理第 {page_number}")

        packet = wait_for_aweme_packet(page, timeout=timeout)
        items = None  # stays None when this round produced nothing usable
        if packet is not None:
            try:
                items = parse_aweme_items(extract_aweme_payload(packet.response))
            except Exception as exc:
                print(f"[WARN] 解析接口数据失败: {exc}")
        if items is None:
            scroll_to_next_page(page)
            continue

        if not items:
            print("[WARN] 这一页没有解析到视频。")
        for item in items:
            if item["video_id"] in handled_ids:
                continue
            # Marked as handled before downloading, so a failed download is
            # not retried when the same id shows up again.
            handled_ids.add(item["video_id"])
            output_path = build_output_path(
                title=item["title"],
                video_id=item["video_id"],
                output_dir=output_dir,
            )
            try:
                download_video(
                    requests_module=requests_module,
                    headers=headers,
                    video_url=item["video_url"],
                    output_path=output_path,
                )
            except Exception as exc:
                print(f"[WARN] 下载失败 {item['video_id']}: {exc}")
            else:
                saved_count += 1
                print(f"[OK] 已保存: {output_path}")
        scroll_to_next_page(page)
    return saved_count
def build_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for the crawler.

    Defines the optional positional ``user_url`` plus ``--pages``,
    ``--timeout``, ``--output-dir`` and ``--browser-port``.
    """
    parser = argparse.ArgumentParser(description="监听抖音作品接口并下载视频")
    # (flags, keyword arguments) pairs, registered in this order.
    argument_specs = (
        (
            ("user_url",),
            {"nargs": "?", "default": DEFAULT_USER_URL, "help": "抖音博主主页 URL"},
        ),
        (
            ("--pages",),
            {"type": int, "default": 10, "help": "最多抓取多少页，默认 10"},
        ),
        (
            ("--timeout",),
            {"type": int, "default": 10, "help": "单次等待接口响应秒数，默认 10"},
        ),
        (
            ("--output-dir",),
            {"default": "video", "help": "视频输出目录，默认 video"},
        ),
        (
            ("--browser-port",),
            {
                "type": int,
                "default": None,
                "help": "附着到已启动 Chrome 的调试端口，例如 9223不传则由 DrissionPage 新开浏览器",
            },
        ),
    )
    for flags, options in argument_specs:
        parser.add_argument(*flags, **options)
    return parser
def main(argv: list[str] | None = None) -> int:
    """Command-line entry point of the crawler.

    Parses and validates the arguments, runs ``collect_videos`` and maps the
    outcome to a process exit code.

    Args:
        argv: Argument list to parse; ``None`` lets ``argparse`` read
            ``sys.argv``.

    Returns:
        ``0`` on success, ``1`` when ``collect_videos`` raised
        ``RuntimeError``, ``130`` after ``KeyboardInterrupt``.  Invalid
        arguments terminate through ``parser.error``.
    """
    parser = build_parser()
    args = parser.parse_args(argv)
    if args.pages <= 0:
        parser.error("--pages 必须大于 0")
    if args.timeout <= 0:
        parser.error("--timeout 必须大于 0")
    if args.browser_port is not None:
        if args.browser_port <= 0:
            parser.error("--browser-port 必须大于 0")
        # TCP ports are 16-bit; reject larger values here instead of letting
        # the socket layer fail (or wrap around) later.
        if args.browser_port > 65535:
            parser.error("--browser-port 不能大于 65535")
    try:
        total = collect_videos(
            user_url=args.user_url,
            max_pages=args.pages,
            timeout=args.timeout,
            output_dir=Path(args.output_dir),
            browser_port=args.browser_port,
        )
    except RuntimeError as exc:
        print(f"[ERROR] {exc}")
        return 1
    except KeyboardInterrupt:
        print("\n[INFO] 用户中断。")
        return 130
    print(f"[INFO] 处理结束，共下载 {total} 个视频。")
    return 0


if __name__ == "__main__":
    sys.exit(main())

View File

@ -0,0 +1,198 @@
# Douyin Login Entry Implementation Plan
> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
**Goal:** Add a dedicated browser-login launcher and a clearer attach-port check so the Douyin crawler has a stable two-step workflow: login first, crawl second.
**Architecture:** Keep browser-launch responsibilities in a new `login_douyin.py` script and keep crawl responsibilities in `Douyin.py`. Add a small socket-based port readiness check before attaching to Chrome, and cover the new behavior with unit tests before implementing production code.
**Tech Stack:** Python 3, `argparse`, `pathlib`, `subprocess`, `socket`, `unittest`
---
### Task 1: Write failing tests for the new login launcher
**Files:**
- Create: `login_douyin.py`
- Create: `test_login_douyin.py`
- [ ] **Step 1: Write the failing test**
```python
def test_build_login_command_uses_expected_chrome_arguments(self) -> None:
module = importlib.import_module("login_douyin")
command = module.build_login_command(
chrome_path="/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
profile_dir=Path("/tmp/douyin-profile"),
browser_port=9223,
user_url="https://www.douyin.com/user/example",
)
self.assertEqual(
command,
[
"/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
"--user-data-dir=/tmp/douyin-profile",
"--remote-debugging-port=9223",
"https://www.douyin.com/user/example",
],
)
```
- [ ] **Step 2: Run test to verify it fails**
Run: `./.venv/bin/python -m unittest test_login_douyin.py -v`
Expected: FAIL because `login_douyin.py` does not exist yet.
- [ ] **Step 3: Write minimal implementation**
Create `login_douyin.py` with:
- `DEFAULT_CHROME_PATH`
- `DEFAULT_BROWSER_PORT = 9223`
- `DEFAULT_PROFILE_DIR`
- `build_login_command(...)`
- [ ] **Step 4: Run test to verify it passes**
Run: `./.venv/bin/python -m unittest test_login_douyin.py -v`
Expected: PASS for the command-building test.
- [ ] **Step 5: Commit**
Not applicable here because the workspace is not a git repository.
### Task 2: Add tests and implementation for launcher validation and user guidance
**Files:**
- Modify: `login_douyin.py`
- Modify: `test_login_douyin.py`
- [ ] **Step 1: Write the failing tests**
Add tests for:
- parser defaults use `9223`
- `main()` creates the profile dir
- `main()` prints the follow-up crawl command
- `main()` returns non-zero with a readable message when the Chrome path does not exist
- [ ] **Step 2: Run tests to verify they fail**
Run: `./.venv/bin/python -m unittest test_login_douyin.py -v`
Expected: FAIL because validation and guidance behavior is not implemented yet.
- [ ] **Step 3: Write minimal implementation**
Add to `login_douyin.py`:
- `build_parser()`
- `launch_browser(...)`
- `main(...)`
- readable `SystemExit`/stderr-style messaging through printed output and return codes
- [ ] **Step 4: Run tests to verify they pass**
Run: `./.venv/bin/python -m unittest test_login_douyin.py -v`
Expected: PASS
- [ ] **Step 5: Commit**
Not applicable here because the workspace is not a git repository.
### Task 3: Write failing tests for attach-port readiness in the crawler
**Files:**
- Modify: `Douyin.py`
- Modify: `test_douyin.py`
- [ ] **Step 1: Write the failing tests**
Add tests for:
- `ensure_browser_debug_port_ready()` returns successfully when a temporary local server is listening
- `ensure_browser_debug_port_ready()` raises a readable `RuntimeError` when the port is unavailable
- [ ] **Step 2: Run tests to verify they fail**
Run: `./.venv/bin/python -m unittest test_douyin.py -v`
Expected: FAIL because the function does not exist yet.
- [ ] **Step 3: Write minimal implementation**
Add to `Douyin.py`:
- socket-based readiness helper
- call it in `collect_videos()` before `create_page(...)` when `browser_port` is provided
- [ ] **Step 4: Run tests to verify they pass**
Run: `./.venv/bin/python -m unittest test_douyin.py -v`
Expected: PASS
- [ ] **Step 5: Commit**
Not applicable here because the workspace is not a git repository.
### Task 4: Update usage documentation
**Files:**
- Modify: `抖音爬取视频.md`
- [ ] **Step 1: Write the failing doc expectation**
Define the required doc updates:
- explicit step 1 command for `login_douyin.py`
- explicit step 2 command for `Douyin.py --browser-port 9223`
- short note that login state is kept in the dedicated profile dir
- [ ] **Step 2: Verify current doc is incomplete**
Run: `rg -n "login_douyin.py|--browser-port 9223" 抖音爬取视频.md`
Expected: no matches or incomplete guidance
- [ ] **Step 3: Write minimal documentation update**
Append a short “推荐流程” section to `抖音爬取视频.md`.
- [ ] **Step 4: Verify the doc contains the new commands**
Run: `rg -n "login_douyin.py|--browser-port 9223" 抖音爬取视频.md`
Expected: matches for both commands
- [ ] **Step 5: Commit**
Not applicable here because the workspace is not a git repository.
### Task 5: Run full verification
**Files:**
- Modify: `Douyin.py`
- Modify: `login_douyin.py`
- Modify: `test_douyin.py`
- Modify: `test_login_douyin.py`
- Modify: `抖音爬取视频.md`
- [ ] **Step 1: Run the full unit test suite**
Run: `./.venv/bin/python -m unittest test_douyin.py test_login_douyin.py -v`
Expected: all tests pass
- [ ] **Step 2: Run the login launcher manually**
Run: `./.venv/bin/python login_douyin.py --browser-port 9223`
Expected: visible Chrome launches and prints the next crawl command
- [ ] **Step 3: Run the crawler against the logged-in browser**
Run: `./.venv/bin/python Douyin.py --pages 1 --timeout 20 --browser-port 9223`
Expected: videos are downloaded to `video/`
- [ ] **Step 4: Review changed files for scope drift**
Run: `rg --files`
Expected: only the planned files changed or were added
- [ ] **Step 5: Commit**
Not applicable here because the workspace is not a git repository.

View File

@ -0,0 +1,191 @@
# Douyin Login Entry Design
## Goal
将当前“手动先开浏览器登录,再让 `Douyin.py` 附着到调试端口抓取”的隐式流程,固化为稳定、明确、可复用的两步式命令行入口。
## Current Context
- 现有抓取实现位于 `Douyin.py`
- `Douyin.py` 已支持通过 `--browser-port` 附着到已启动的 Chrome 调试端口。
- 本次实测已经证明:用户先在可见 Chrome 中登录抖音并通过验证码后,`Douyin.py --browser-port 9223` 可以成功抓到 `web/aweme/post/` 接口并下载视频。
- 当前缺少一个明确的“登录准备入口”,导致可操作性依赖人工记忆和临时命令。
## Requirements
### Functional
1. 提供一个独立脚本，用于启动可见 Chrome，并固定：
- 调试端口,默认 `9223`
- 用户数据目录,默认使用一个项目约定路径
- 打开的初始 URL，默认指向现有抖音博主页
2. 登录脚本只负责“打开浏览器并提示用户手动登录”,不负责抓取。
3. `Douyin.py` 继续负责抓取,并保持“附着已有浏览器”的职责边界。
4. 当 `Douyin.py` 指定了 `--browser-port` 但端口不可连通时,应给出清晰错误,提示先运行登录脚本。
5. 文档应给出最短可执行流程:
- 第一步:启动浏览器并登录
- 第二步:运行抓取命令
### Non-Functional
1. 不改变现有抓包、解析、下载的主逻辑。
2. 保持现有命令参数兼容。
3. 入口职责清晰,便于排查“登录问题”和“抓取问题”。
4. 新增行为应具备可自动化测试的核心单元。
## Chosen Approach
采用双脚本方案:
- 新增 `login_douyin.py`
- 负责启动可见 Chrome
- 固定 remote debugging port
- 固定 profile 目录
- 打开目标用户主页
- 输出明确提示,引导用户完成手动登录和验证码
- 保留 `Douyin.py`
- 继续承担附着浏览器、监听接口、下载视频的职责
- 增强附着前检查与报错信息
## Rejected Alternatives
### Alternative 1: 将“启动浏览器”直接并入 `Douyin.py`
不采用。原因:
- 会让 `Douyin.py` 同时承担登录准备和抓取职责。
- 错误定位会变差,用户更难区分是登录失败还是抓取失败。
- 未来若需要“先登录、稍后再抓”,这种合并入口不灵活。
### Alternative 2: 只写 shell 脚本串联所有步骤
不采用。原因:
- 逻辑容易散落在 shell 中,测试性差。
- 浏览器启动参数、等待逻辑和抓取命令耦合度高。
- 后续若要扩展默认参数或跨平台兼容，shell 方案维护成本更高。
## Proposed CLI UX
### Step 1: 启动登录浏览器
```bash
./.venv/bin/python login_douyin.py
```
默认行为:
- 启动可见 Chrome
- 调试端口为 `9223`
- profile 目录为项目约定的本地路径
- 打开默认的抖音主页 URL
- 输出“请在浏览器中完成登录/验证码,然后再运行抓取命令”
可选扩展参数:
- `--browser-port`
- `--profile-dir`
- `--user-url`
- `--chrome-path`
### Step 2: 运行抓取
```bash
./.venv/bin/python Douyin.py --pages 1 --browser-port 9223
```
## Design Details
### `login_douyin.py`
建议拆分为可测试的小函数:
- `build_login_command(...)`
- 输入 Chrome 路径、profile 目录、端口、URL
- 输出适合 `subprocess.Popen(...)` 的参数列表
- `launch_browser(...)`
- 调用 `subprocess.Popen(...)`
- `build_parser()`
- 定义 CLI 参数
- `main()`
- 解析参数
- 启动浏览器
- 打印下一步指引
### `Douyin.py`
新增一个显式的端口检查函数,例如:
- `ensure_browser_debug_port_ready(browser_port: int) -> None`
行为:
- 仅当用户传入 `--browser-port` 时执行
- 尝试连接 `127.0.0.1:<port>`
- 若失败,抛出清晰错误,提示:
- 先启动 `login_douyin.py`
- 确认 Chrome 仍在运行
- 确认端口与抓取命令一致
## Error Handling
### 登录脚本
- Chrome 可执行文件不存在:直接报错并退出。
- 浏览器启动失败:输出异常原因并返回非零退出码。
- profile 目录不存在:自动创建。
### 抓取脚本
- 指定 `--browser-port` 但端口不可达:立即失败,不进入抓取流程。
- 登录未完成导致页面异常:保留现有抓包等待与警告逻辑。
## Testing Strategy
### Unit Tests
新增或扩展 `test_douyin.py`,覆盖:
1. `build_login_command()` 生成的命令参数正确。
2. 默认调试地址仍为 `127.0.0.1:<port>`
3. `ensure_browser_debug_port_ready()` 在端口不可达时抛出可读错误。
4. `ensure_browser_debug_port_ready()` 在端口可达时正常返回。
如测试边界过大,可新增 `test_login_douyin.py`
### Manual Verification
1. 运行 `./.venv/bin/python login_douyin.py`
2. 在打开的 Chrome 中登录抖音并通过验证码
3. 运行 `./.venv/bin/python Douyin.py --pages 1 --browser-port 9223`
4. 确认 `video/` 下生成新的 mp4 文件
## Implementation Boundaries
本次只做以下改动:
- 新增登录入口脚本
- 为抓取入口补充附着前端口检查
- 更新测试
- 更新使用文档
本次不做以下改动:
- 不重写抓取主流程
- 不改成单命令自动等待登录
- 不引入 Playwright 作为正式运行时依赖
- 不增加下载调度、断点续传或批量任务管理
## Risks
1. 本机 Chrome 路径可能与预设不同,因此需要保留 `--chrome-path` 覆盖能力。
2. profile 目录固定后,用户可能重复复用登录态,这是预期行为,但文档需说明。
3. 若目标端口被其他进程占用,登录脚本需要给出可诊断的失败信息或允许端口覆盖。
## Success Criteria
满足以下条件即视为完成:
1. 用户可以通过固定命令启动登录浏览器。
2. 用户登录完成后,可通过固定命令让 `Douyin.py` 成功附着并抓取。
3. 当浏览器未启动或端口错误时,抓取脚本会给出明确提示,而不是模糊失败。

View File

@ -0,0 +1,215 @@
# Douyin Targeted Crawling Requirements
## Goal
在现有“登录浏览器后附着抓取”的基础上,扩展为支持更明确的目标选择能力,使系统不仅能抓默认博主主页,还能:
- 指定某个博主主页进行抓取
- 直接抓当前浏览器里正在查看的博主主页
- 指定某个单独视频进行抓取
本需求文档只定义需求、范围、交互、错误处理和 TDD 约束,不直接定义实现细节代码。
## Current Behavior
当前系统具备以下行为:
- 通过 `login_douyin.py` 启动可见 Chrome，并开启调试端口
- 通过 `Douyin.py` 附着到该浏览器
- 打开某个博主主页 URL
- 监听抖音作品列表接口 `web/aweme/post/`
- 从接口返回的 `aweme_list` 中提取视频地址并下载
当前默认目标是一个硬编码博主主页,但也支持在命令行传入另一个博主主页 URL。
## Target Modes
新版本必须同时支持以下三种目标模式:
### 1. `creator-url`
用户显式传入某个博主主页 URL，系统以该博主主页为目标进行抓取。
### 2. `current-creator`
系统直接读取当前已附着浏览器正在查看的页面。如果当前页面是博主主页,则以该页面为目标进行抓取。
### 3. `single-video`
用户传入单个视频链接或 `aweme_id`,系统仅下载这一条视频,不执行博主作品列表抓取。
## Scope Rules
### Creator Targets
当目标是博主时,默认只抓“当前页面中已加载、当前可见范围对应的作品”。
这意味着:
- 不默认自动抓完整个博主全部作品
- 不默认自动多页翻完所有历史内容
- 不自动替用户点击筛选器或改动页面状态
- 如果用户已经在页面里手动做了筛选、切换或滚动,则抓取结果以当前页面已加载状态为准
### Single Video Target
当目标是单视频时:
- 若输入是视频 URL，系统需要先解析出对应作品标识
- 若输入是 `aweme_id`,系统直接按单作品逻辑抓取
- 最终只下载一条视频
## Recommended User Experience
保留现有两步模式,不改成自动登录的一体化入口:
### Step 1
先启动登录浏览器:
```bash
./.venv/bin/python login_douyin.py
```
### Step 2
登录完成后,再运行抓取命令。
未来命令行接口应支持显式目标模式,例如:
```bash
./.venv/bin/python Douyin.py --mode creator-url --target "https://www.douyin.com/user/..."
./.venv/bin/python Douyin.py --mode current-creator
./.venv/bin/python Douyin.py --mode single-video --target "https://www.douyin.com/video/..."
./.venv/bin/python Douyin.py --mode single-video --target "7619989983668240802"
```
上面只是推荐交互形态,具体参数名可在实现设计阶段微调,但必须满足以下原则:
- 模式必须显式可区分
- “当前浏览器页面”与“传入 URL”不能混淆
- 单视频目标与博主目标不能混淆
## Functional Requirements
### Requirement A: Explicit Creator URL Crawling
系统必须允许用户通过博主主页 URL 指定抓取目标。
完成条件:
- 系统接受有效博主主页 URL
- 浏览器打开或切换到该 URL
- 系统只抓当前页面已加载的作品
### Requirement B: Current Browser Creator Crawling
系统必须允许用户不手输目标 URL，而是直接抓当前浏览器页面对应的博主主页。
完成条件:
- 系统能读取当前浏览器页面 URL
- 若当前页面是博主主页,则正常抓取
- 若当前页面不是博主主页,则明确报错并退出
### Requirement C: Single Video Download
系统必须允许用户通过单个视频链接或 `aweme_id` 只下载一个视频。
完成条件:
- 支持视频 URL 输入
- 支持 `aweme_id` 输入
- 最终只落地一个视频文件
### Requirement D: Visible-Only Creator Scope
当目标是博主时,系统默认只处理当前页面已经加载出来的作品。
完成条件:
- 不自动继续滚动抓到所有历史内容
- 抓取范围受当前页面加载状态约束
- 用户先手动筛选、滚动、切换后,再执行抓取时,系统按当前页面状态工作
## Error Handling Requirements
系统必须提供明确错误,不允许模糊失败。
### Current Creator Errors
- 当前页面不是博主主页:报错并退出
- 当前页面虽然像博主页,但未加载出可用作品数据:提示用户先完成页面操作后重试
### Single Video Errors
- 输入既不是合法视频 URL，也不是合法 `aweme_id`：报错并退出
- 视频标识无法解析:报错并退出
### Browser Attachment Errors
- 调试端口不可用:提示先运行登录脚本并确认浏览器仍在运行
### Creator URL Errors
- 传入 URL 不是受支持的抖音博主主页:报错并退出
## Non-Goals
本次需求明确不包含以下内容:
- 任意网页抓取
- 非抖音站点抓取
- 自动替用户点击页面筛选器
- 自动抓完整个博主全部历史作品
- 自动搜索博主
- 自动在抖音站内执行复杂导航流程
## Terminology
### `aweme`
抖音接口中的作品对象,可以理解为一条内容或一个视频作品实体。
### `aweme_id`
抖音作品的唯一标识。
### `current visible videos`
指当前页面已经加载出来,并能够通过当前页面对应接口响应获得的作品集合,而不是博主的全量历史作品。
## TDD Requirements
本需求后续实现必须使用 TDD。
### Mandatory Process
1. 先写失败测试
2. 先验证测试是因为功能未实现而失败
3. 再写最小实现让测试通过
4. 最后再做必要重构
### Required Test Areas
至少覆盖以下测试:
- `creator-url` 模式下,合法博主主页 URL 能被识别并生成正确抓取目标
- `current-creator` 模式下,当前页面是博主主页时可抓取
- `current-creator` 模式下,当前页面不是博主主页时明确报错
- `single-video` 模式支持视频 URL
- `single-video` 模式支持 `aweme_id`
- 创作者抓取默认只处理当前已加载内容,不自动继续翻页
- 目标模式错误时的报错路径
- 浏览器端口不可用时的报错路径
## Acceptance Criteria
需求完成后,应满足以下验收标准:
1. 用户可以显式指定博主主页 URL 抓取
2. 用户可以直接抓当前浏览器中的博主主页
3. 用户可以指定单个视频 URL 或 `aweme_id` 下载单条视频
4. 当目标是博主时,默认只抓当前页面已加载作品
5. 关键失败场景都有明确报错
6. 实现过程遵循 TDD，并有对应自动化测试覆盖

BIN
image-1.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 460 KiB

BIN
image-2.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.9 MiB

BIN
image.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 2.2 MiB

122
login_douyin.py Normal file
View File

@ -0,0 +1,122 @@
from __future__ import annotations
import argparse
import socket
import subprocess
import sys
import time
from pathlib import Path
from Douyin import DEFAULT_USER_URL
# Chrome executable used when --chrome-path is not given (macOS install location).
DEFAULT_CHROME_PATH = "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"
# Remote-debugging port used when --browser-port is not given.
DEFAULT_BROWSER_PORT = 9223
# Chrome user-data directory used when --profile-dir is not given (relative path).
DEFAULT_PROFILE_DIR = Path(".douyin-chrome-profile")
def derive_chrome_app_path(chrome_path: str) -> str:
    """Reduce a path inside an ``.app`` bundle to the bundle path itself.

    Everything after the first ``.app/`` is dropped.  Paths that do not
    contain ``.app/`` are returned unchanged.
    """
    bundle_prefix, separator, _inside_bundle = chrome_path.partition(".app/")
    if not separator:
        return chrome_path
    return f"{bundle_prefix}.app"
def build_login_command(
    chrome_path: str,
    profile_dir: Path,
    browser_port: int,
    user_url: str,
) -> list[str]:
    """Assemble the ``open -na <app> --args ...`` command that starts Chrome.

    The app bundle is derived from ``chrome_path``; Chrome receives the
    user-data directory, the remote-debugging port and the URL to open.
    """
    command = ["open", "-na", derive_chrome_app_path(chrome_path), "--args"]
    command.append(f"--user-data-dir={profile_dir}")
    command.append(f"--remote-debugging-port={browser_port}")
    command.append(user_url)
    return command
def build_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for the login launcher.

    Defines ``--chrome-path``, ``--profile-dir``, ``--browser-port`` and
    ``--user-url``, each defaulting to the matching module constant.
    """
    parser = argparse.ArgumentParser(description="启动可见 Chrome供抖音手动登录后附着抓取")
    # (flag, keyword arguments) pairs, registered in this order.
    option_specs = (
        ("--chrome-path", {"default": DEFAULT_CHROME_PATH, "help": "Chrome 可执行文件路径"}),
        (
            "--profile-dir",
            {
                "default": str(DEFAULT_PROFILE_DIR),
                "help": "Chrome 用户数据目录，默认复用项目内固定目录",
            },
        ),
        (
            "--browser-port",
            {
                "type": int,
                "default": DEFAULT_BROWSER_PORT,
                "help": "Chrome 调试端口，默认 9223",
            },
        ),
        ("--user-url", {"default": DEFAULT_USER_URL, "help": "启动后打开的抖音主页 URL"}),
    )
    for flag, options in option_specs:
        parser.add_argument(flag, **options)
    return parser
def launch_browser(command: list[str]) -> subprocess.Popen[str]:
    """Start ``command`` as a child process and return its ``Popen`` handle.

    The call does not wait for the process to finish.
    """
    process = subprocess.Popen(command)
    return process
def wait_for_browser_debug_port(
    browser_port: int,
    timeout_seconds: float = 15.0,
    interval_seconds: float = 0.25,
) -> None:
    """Poll ``127.0.0.1:browser_port`` until a TCP connection succeeds.

    Args:
        browser_port: Port to probe.
        timeout_seconds: Overall time budget for polling.
        interval_seconds: Pause after each failed attempt.

    Raises:
        RuntimeError: if no connection succeeded before the budget ran out.
    """
    endpoint = ("127.0.0.1", browser_port)
    give_up_at = time.monotonic() + timeout_seconds
    while True:
        if not time.monotonic() < give_up_at:
            raise RuntimeError(
                f"Chrome 已启动命令，但调试端口 {browser_port} 在限定时间内未就绪。"
            )
        try:
            with socket.create_connection(endpoint, timeout=1):
                return
        except OSError:
            time.sleep(interval_seconds)
def main(argv: list[str] | None = None) -> int:
    """Command-line entry point of the login launcher.

    Validates the arguments, makes sure the profile directory exists, starts
    Chrome with remote debugging enabled, waits for the debug port and prints
    the follow-up crawl command.

    Args:
        argv: Argument list to parse; ``None`` lets ``argparse`` read
            ``sys.argv``.

    Returns:
        ``0`` when the debug port became reachable, ``1`` when the Chrome
        path does not exist, launching failed or the port never became
        ready.  Invalid arguments terminate through ``parser.error``.
    """
    parser = build_parser()
    args = parser.parse_args(argv)
    if args.browser_port <= 0:
        parser.error("--browser-port 必须大于 0")
    # TCP ports are 16-bit; a larger value can never be a usable debug port.
    if args.browser_port > 65535:
        parser.error("--browser-port 不能大于 65535")

    chrome_path = Path(args.chrome_path)
    if not chrome_path.exists():
        print(f"[ERROR] Chrome 可执行文件不存在: {chrome_path}")
        return 1

    # Resolved to an absolute path before it is handed to Chrome.
    profile_dir = Path(args.profile_dir).resolve()
    profile_dir.mkdir(parents=True, exist_ok=True)
    command = build_login_command(
        chrome_path=str(chrome_path),
        profile_dir=profile_dir,
        browser_port=args.browser_port,
        user_url=args.user_url,
    )
    try:
        launch_browser(command)
    except OSError as exc:
        print(f"[ERROR] 启动 Chrome 失败: {exc}")
        return 1
    try:
        wait_for_browser_debug_port(args.browser_port)
    except RuntimeError as exc:
        print(f"[ERROR] {exc}")
        return 1
    print("[INFO] Chrome 已启动。请在打开的浏览器中完成抖音登录和验证码。")
    print(f"[INFO] 登录完成后执行: ./.venv/bin/python Douyin.py --browser-port {args.browser_port}")
    return 0


if __name__ == "__main__":
    sys.exit(main())

76
test_douyin.py Normal file
View File

@ -0,0 +1,76 @@
import importlib
import unittest
from unittest import mock
class FakeResponse:
    """Minimal stand-in for a captured response with ``body`` and ``raw_body``."""

    def __init__(self, body, raw_body):
        self.body, self.raw_body = body, raw_body
class DouyinModuleTests(unittest.TestCase):
    """Unit tests for the helper functions of the ``Douyin`` module."""

    def setUp(self) -> None:
        # Imported by name for every test; an import failure fails the test.
        self.douyin = importlib.import_module("Douyin")

    def test_module_can_import_without_optional_runtime_dependencies(self) -> None:
        self.assertIsNotNone(self.douyin)

    def test_sanitize_filename_removes_invalid_characters(self) -> None:
        cleaned = self.douyin.sanitize_filename('a/b:c*?d"e<f>g|h\n')
        self.assertEqual(cleaned, "a_b_c__d_e_f_g_h")

    def test_choose_video_url_prefers_douyinvod_link(self) -> None:
        candidates = [
            "https://www.douyin.com/aweme/v1/play/?video_id=123",
            "https://v11-weba.douyinvod.com/example/video.mp4",
            "https://v26-web.douyinvod.com/example/video.mp4",
        ]
        chosen = self.douyin.choose_video_url(candidates)
        self.assertEqual(chosen, "https://v11-weba.douyinvod.com/example/video.mp4")

    def test_build_output_path_uses_video_directory(self) -> None:
        target = self.douyin.build_output_path("测试标题", "123456")
        self.assertEqual(target.as_posix(), "video/测试标题-123456.mp4")

    def test_extract_aweme_payload_uses_dict_body(self) -> None:
        captured = FakeResponse({"aweme_list": []}, "")
        self.assertEqual(self.douyin.extract_aweme_payload(captured), {"aweme_list": []})

    def test_extract_aweme_payload_falls_back_to_raw_json(self) -> None:
        captured = FakeResponse("", '{"aweme_list": [{"aweme_id": "1"}]}')
        self.assertEqual(
            self.douyin.extract_aweme_payload(captured),
            {"aweme_list": [{"aweme_id": "1"}]},
        )

    def test_build_browser_address_from_port(self) -> None:
        self.assertEqual(self.douyin.build_browser_address(9223), "127.0.0.1:9223")
        self.assertIsNone(self.douyin.build_browser_address(None))

    def test_ensure_browser_debug_port_ready_accepts_open_port(self) -> None:
        # A mock usable as a context manager stands in for the socket.
        fake_socket = mock.MagicMock()
        fake_socket.__enter__.return_value = fake_socket
        fake_socket.__exit__.return_value = False
        patcher = mock.patch.object(
            self.douyin.socket, "create_connection", return_value=fake_socket
        )
        with patcher as mocked_connect:
            self.douyin.ensure_browser_debug_port_ready(9223)
        mocked_connect.assert_called_once()

    def test_ensure_browser_debug_port_ready_rejects_closed_port(self) -> None:
        patcher = mock.patch.object(
            self.douyin.socket, "create_connection", side_effect=OSError("boom")
        )
        with patcher, self.assertRaisesRegex(RuntimeError, "login_douyin.py"):
            self.douyin.ensure_browser_debug_port_ready(9223)


if __name__ == "__main__":
    unittest.main()

98
test_login_douyin.py Normal file
View File

@ -0,0 +1,98 @@
import importlib
import io
import tempfile
import unittest
from contextlib import redirect_stdout
from pathlib import Path
from unittest import mock
class LoginDouyinModuleTests(unittest.TestCase):
    """Unit tests for the ``login_douyin`` launcher module."""

    CHROME_BINARY = "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"

    def setUp(self) -> None:
        # Imported by name for every test; an import failure fails the test.
        self.launcher = importlib.import_module("login_douyin")

    def test_build_login_command_uses_expected_chrome_arguments(self) -> None:
        command = self.launcher.build_login_command(
            chrome_path=self.CHROME_BINARY,
            profile_dir=Path("/tmp/douyin-profile"),
            browser_port=9223,
            user_url="https://www.douyin.com/user/example",
        )
        expected = [
            "open",
            "-na",
            "/Applications/Google Chrome.app",
            "--args",
            "--user-data-dir=/tmp/douyin-profile",
            "--remote-debugging-port=9223",
            "https://www.douyin.com/user/example",
        ]
        self.assertEqual(command, expected)

    def test_build_parser_uses_expected_defaults(self) -> None:
        defaults = self.launcher.build_parser().parse_args([])
        self.assertEqual(defaults.browser_port, 9223)
        self.assertEqual(defaults.chrome_path, self.launcher.DEFAULT_CHROME_PATH)
        self.assertEqual(defaults.user_url, self.launcher.DEFAULT_USER_URL)

    def test_main_creates_profile_dir_and_prints_next_step(self) -> None:
        with tempfile.TemporaryDirectory() as temp_dir:
            profile_dir = Path(temp_dir) / "profile"
            captured = io.StringIO()
            cli_args = [
                "--chrome-path",
                self.CHROME_BINARY,
                "--profile-dir",
                str(profile_dir),
                "--browser-port",
                "9333",
            ]
            with redirect_stdout(captured), mock.patch.object(
                self.launcher, "launch_browser"
            ) as mocked_launch, mock.patch.object(
                self.launcher, "wait_for_browser_debug_port"
            ) as mocked_wait:
                exit_code = self.launcher.main(cli_args)
            self.assertEqual(exit_code, 0)
            self.assertTrue(profile_dir.exists())
            mocked_launch.assert_called_once()
            mocked_wait.assert_called_once_with(9333)
            output = captured.getvalue()
            self.assertIn("9333", output)
            self.assertIn("./.venv/bin/python Douyin.py --browser-port 9333", output)

    def test_main_returns_error_when_chrome_path_missing(self) -> None:
        captured = io.StringIO()
        with redirect_stdout(captured):
            exit_code = self.launcher.main(["--chrome-path", "/tmp/does-not-exist-chrome"])
        self.assertEqual(exit_code, 1)
        output = captured.getvalue()
        self.assertIn("Chrome", output)
        self.assertIn("不存在", output)

    def test_main_returns_error_when_debug_port_never_becomes_ready(self) -> None:
        with tempfile.TemporaryDirectory() as temp_dir:
            profile_dir = Path(temp_dir) / "profile"
            captured = io.StringIO()
            cli_args = [
                "--chrome-path",
                self.CHROME_BINARY,
                "--profile-dir",
                str(profile_dir),
            ]
            with redirect_stdout(captured), mock.patch.object(
                self.launcher, "launch_browser"
            ), mock.patch.object(
                self.launcher,
                "wait_for_browser_debug_port",
                side_effect=RuntimeError("端口未就绪"),
            ):
                exit_code = self.launcher.main(cli_args)
            self.assertEqual(exit_code, 1)
            self.assertIn("端口未就绪", captured.getvalue())


if __name__ == "__main__":
    unittest.main()

45
抖音爬取视频.md Normal file
View File

@ -0,0 +1,45 @@
打开一个博主主页，按 F12 打开控制台。
把鼠标挪到一个视频上面，视频会开始播放，然后会出现一个 mp4 媒体请求。
![alt text](image.png)
![alt text](image-1.png)
复制这个媒体请求里的 `rc` 值：
NjxpM2hkZWUzNTszNGlpOkBpajdrOHY5cmxqOjMzNGkzM0A1MF41MC8wNmMxMjM2YWAuYSNrbmVqMmRrYWNhLS1kLTBzcw==
把它粘贴到搜索栏，然后按 Command+R 刷新页面，再搜索就能找到对应的链接。
![alt text](image-2.png)
一般是三条
```
"url_list": [
"https://v26-web.douyinvod.com/71be40af2c168460799af3a778572914/69e1f23b/video/tos/cn/tos-cn-ve-15/oERIAIyg72xQTAKefNevmg5PYSzGBCLVGxxBM0/?a=6383&ch=10010&cr=3&dr=0&lr=all&cd=0%7C0%7C0%7C3&cv=1&br=2657&bt=2657&cs=0&ds=4&ft=pEaFx4hZffPdOW~-N12NvAq-antLjrKiGgZnRkaEkVUpUjVhWL6&mime_type=video_mp4&qs=0&rc=ZDZnZjwzNTgzaWk5ZzloNUBpajdrOHY5cmxqOjMzNGkzM0AxNDIvY2FeNmMxXjQ0MWFiYSNrbmVqMmRrYWNhLS1kLTBzcw%3D%3D&btag=80000e00010000&cquery=100z_100o_101r_100B_100x&dy_q=1776404465&feature_id=37f92ebd2877ae8e7eba995d406c5150&l=202604171341058889DECB07230D0B9B8B",
"https://v11-weba.douyinvod.com/0cc3b59178e5b0065b19ddb2587010c1/69e1f23b/video/tos/cn/tos-cn-ve-15/oERIAIyg72xQTAKefNevmg5PYSzGBCLVGxxBM0/?a=6383&ch=10010&cr=3&dr=0&lr=all&cd=0%7C0%7C0%7C3&cv=1&br=2657&bt=2657&cs=0&ds=4&ft=pEaFx4hZffPdOW~-N12NvAq-antLjrKiGgZnRkaEkVUpUjVhWL6&mime_type=video_mp4&qs=0&rc=ZDZnZjwzNTgzaWk5ZzloNUBpajdrOHY5cmxqOjMzNGkzM0AxNDIvY2FeNmMxXjQ0MWFiYSNrbmVqMmRrYWNhLS1kLTBzcw%3D%3D&btag=80000e00010000&cquery=100o_101r_100B_100x_100z&dy_q=1776404465&feature_id=37f92ebd2877ae8e7eba995d406c5150&l=202604171341058889DECB07230D0B9B8B",
"https://www.douyin.com/aweme/v1/play/?video_id=v0300fg10000d7fgdinog65sm5hhq4ng&line=0&file_id=e2de467d76af4ff095a2d0f92c5d734a&sign=66b2ab7aba34fd66cf343c7ea1aa9994&is_play_url=1&source=PackSourceEnum_PUBLISH"
],
```
复制中间这条 v11 开头的链接，粘贴到浏览器里打开即可：
```
https://v11-weba.douyinvod.com/0cc3b59178e5b0065b19ddb2587010c1/69e1f23b/video/tos/cn/tos-cn-ve-15/oERIAIyg72xQTAKefNevmg5PYSzGBCLVGxxBM0/?a=6383&ch=10010&cr=3&dr=0&lr=all&cd=0%7C0%7C0%7C3&cv=1&br=2657&bt=2657&cs=0&ds=4&ft=pEaFx4hZffPdOW~-N12NvAq-antLjrKiGgZnRkaEkVUpUjVhWL6&mime_type=video_mp4&qs=0&rc=ZDZnZjwzNTgzaWk5ZzloNUBpajdrOHY5cmxqOjMzNGkzM0AxNDIvY2FeNmMxXjQ0MWFiYSNrbmVqMmRrYWNhLS1kLTBzcw%3D%3D&btag=80000e00010000&cquery=100o_101r_100B_100x_100z&dy_q=1776404465&feature_id=37f92ebd2877ae8e7eba995d406c5150&l=202604171341058889DECB07230D0B9B8B
```
发现可以直接打开 video 的页面，并且可以下载。
## 推荐流程
把“手动登录”和“附着抓取”分成两步走更稳:
### 1. 先启动登录浏览器
```bash
./.venv/bin/python login_douyin.py
```
- 默认会启动一个可见的 Chrome
- 默认调试端口是 `9223`
- 登录态会保存在项目目录下的 `.douyin-chrome-profile/`
在这个浏览器里手动完成抖音登录和验证码。
### 2. 再附着已登录浏览器开始抓取
```bash
./.venv/bin/python Douyin.py --pages 1 --browser-port 9223
```
如果你改了登录脚本里的端口,抓取时也要传同一个 `--browser-port`