feat: initial douyin crawler
This commit is contained in:
commit
d910d6f6b9
8
.gitignore
vendored
Normal file
8
.gitignore
vendored
Normal file
@ -0,0 +1,8 @@
|
||||
.DS_Store
|
||||
.douyin-chrome-profile/
|
||||
.venv/
|
||||
__pycache__/
|
||||
*.pyc
|
||||
video/
|
||||
video_p2/
|
||||
.pytest_cache/
|
||||
21
.vscode/settings.json
vendored
Normal file
21
.vscode/settings.json
vendored
Normal file
@ -0,0 +1,21 @@
|
||||
{
|
||||
"files.exclude": {
|
||||
"**/.venv": true,
|
||||
"**/__pycache__": true,
|
||||
"video_p2": true
|
||||
},
|
||||
"files.watcherExclude": {
|
||||
"**/.venv/**": true,
|
||||
"**/__pycache__/**": true,
|
||||
"video/**": true,
|
||||
"video_p2/**": true
|
||||
},
|
||||
"search.exclude": {
|
||||
"**/.venv": true,
|
||||
"**/__pycache__": true,
|
||||
"video": true,
|
||||
"video_p2": true
|
||||
},
|
||||
"terminal.integrated.enablePersistentSessions": false,
|
||||
"terminal.integrated.persistentSessionReviveProcess": "never"
|
||||
}
|
||||
300
Douyin.py
Normal file
300
Douyin.py
Normal file
@ -0,0 +1,300 @@
|
||||
"""
|
||||
使用 DrissionPage 监听抖音作品列表接口,并批量下载视频。
|
||||
|
||||
运行示例:
|
||||
python3 Douyin.py "https://www.douyin.com/user/xxx?from_tab_name=main"
|
||||
|
||||
依赖:
|
||||
pip install requests DrissionPage
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import re
|
||||
import socket
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
# Path segment that follows "/user/" in the default creator profile URL.
_DEFAULT_USER_SEGMENT = (
    "MS4wLjABAAAAx7--dRYA0mPwhwvxNJ-35i6sB8d1Kv4Sj1WmugquqiHK19QYlB18Ikx6cECT1RVO"
)

# Creator profile used when no URL is supplied on the command line.
DEFAULT_USER_URL = f"https://www.douyin.com/user/{_DEFAULT_USER_SEGMENT}?from_tab_name=main"

# Request filter handed to page.listen.start() by collect_videos().
LISTEN_TARGET = "web/aweme/post/"
|
||||
# Characters that must not appear in a file name: path separators,
# Windows-reserved punctuation and every ASCII control character
# (NUL, CR, LF, TAB, DEL, ...).
INVALID_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|\x00-\x1f\x7f]')


def sanitize_filename(value: str, fallback: str = "untitled", max_bytes: int = 200) -> str:
    """Turn arbitrary text into a name that is safe to use as a file name.

    Every character matched by ``INVALID_FILENAME_CHARS`` becomes ``_``, then
    leading/trailing spaces, dots and underscores are stripped.  The result is
    additionally capped at ``max_bytes`` UTF-8 bytes, because common file
    systems reject names longer than about 255 bytes and video descriptions
    are free-form text of arbitrary length.

    Args:
        value: Raw text, e.g. a video description.
        fallback: Returned when nothing usable is left after cleaning.
        max_bytes: Upper bound for the UTF-8 size of the cleaned name.  The
            default leaves room for the ``-<video_id>.mp4`` suffix appended
            by ``build_output_path``.

    Returns:
        The cleaned name, or ``fallback`` if the cleaned name is empty.
    """
    cleaned = INVALID_FILENAME_CHARS.sub("_", value).strip(" ._")

    # errors="ignore" keeps a stray lone surrogate from raising here.
    encoded = cleaned.encode("utf-8", errors="ignore")
    limit = max(max_bytes, 0)
    if len(encoded) > limit:
        # Cut on the byte level; errors="ignore" silently drops a multi-byte
        # character that the cut split in half.  Strip again because the cut
        # may expose a trailing space, dot or underscore.
        cleaned = encoded[:limit].decode("utf-8", errors="ignore").strip(" ._")

    return cleaned or fallback
|
||||
|
||||
|
||||
def choose_video_url(url_list: list[str]) -> str:
    """Pick the download address to use from a list of candidate URLs.

    The first URL containing ``douyinvod.com`` wins; if none matches, the
    first URL of the list is used.

    Raises:
        ValueError: If ``url_list`` is empty.
    """
    cdn_url = next((candidate for candidate in url_list if "douyinvod.com" in candidate), None)
    if cdn_url is not None:
        return cdn_url
    if not url_list:
        raise ValueError("url_list 为空,无法选择视频地址。")
    return url_list[0]
|
||||
|
||||
|
||||
def build_output_path(title: str, video_id: str, output_dir: Path = Path("video")) -> Path:
    """Return ``<output_dir>/<sanitized title>-<video_id>.mp4``.

    The title is passed through ``sanitize_filename`` (falling back to
    ``"untitled"``); ``video_id`` is appended unchanged.
    """
    file_name = "{}-{}.mp4".format(sanitize_filename(title, fallback="untitled"), video_id)
    return output_dir / file_name
|
||||
|
||||
|
||||
def build_browser_address(browser_port: int | None) -> str | None:
    """Return ``"127.0.0.1:<port>"`` for a port, or ``None`` when no port is given."""
    return None if browser_port is None else f"127.0.0.1:{browser_port}"
|
||||
|
||||
|
||||
def ensure_browser_debug_port_ready(browser_port: int) -> None:
    """Verify that something accepts TCP connections on ``127.0.0.1:<browser_port>``.

    A connection is opened with a 2-second timeout and closed again right away.

    Raises:
        RuntimeError: If the connection attempt fails with ``OSError``; the
            message tells the user to start ``login_douyin.py`` first and the
            original error is chained as ``__cause__``.
    """
    endpoint = ("127.0.0.1", browser_port)
    try:
        with socket.create_connection(endpoint, timeout=2):
            pass  # being able to connect is all we need to know
    except OSError as exc:
        hint = (
            "无法连接到已启动的 Chrome 调试端口。"
            f"请先运行 `./.venv/bin/python login_douyin.py --browser-port {browser_port}`,"
            "并确认 Chrome 仍在运行且端口一致。"
        )
        raise RuntimeError(hint) from exc
|
||||
|
||||
|
||||
def extract_aweme_payload(response: Any) -> dict[str, Any]:
    """Return the JSON object carried by a captured response.

    ``response.body`` is used when it is already a dict.  Otherwise
    ``response.raw_body`` is parsed with ``json.loads`` provided it is a
    non-blank string.  Missing attributes are treated as ``None``.

    Raises:
        ValueError: If neither attribute yields a dict (this includes the
            ``json.JSONDecodeError`` raised for malformed JSON text).
    """
    candidate = getattr(response, "body", None)
    if not isinstance(candidate, dict):
        raw_text = getattr(response, "raw_body", None)
        has_text = isinstance(raw_text, str) and bool(raw_text.strip())
        candidate = json.loads(raw_text) if has_text else None

    if not isinstance(candidate, dict):
        raise ValueError("响应体不是可解析的 JSON 字典。")
    return candidate
|
||||
|
||||
|
||||
def parse_aweme_items(body: Any) -> list[dict[str, str]]:
    """Convert a post-list payload into download descriptors.

    Entries of ``body["aweme_list"]`` are skipped when they are not dicts,
    when ``video.play_addr.url_list`` is missing or empty, or when
    ``aweme_id`` is missing or blank.

    Returns:
        One dict per usable entry with the keys ``title`` (stripped ``desc``,
        ``"untitled"`` if empty), ``video_id`` and ``video_url`` (selected by
        ``choose_video_url``).

    Raises:
        ValueError: If ``body`` is not a dict or has no ``aweme_list`` list.
    """
    if not isinstance(body, dict):
        raise ValueError("接口响应不是字典,无法解析。")

    raw_entries = body.get("aweme_list")
    if not isinstance(raw_entries, list):
        raise ValueError("接口响应中缺少 aweme_list。")

    parsed: list[dict[str, str]] = []
    for entry in raw_entries:
        if not isinstance(entry, dict):
            continue

        # Each level may be absent or null in the payload, hence the "or {}".
        video_info = entry.get("video") or {}
        address_info = video_info.get("play_addr") or {}
        candidates = address_info.get("url_list") or []
        identifier = str(entry.get("aweme_id") or "").strip()
        if not (candidates and identifier):
            continue

        caption = str(entry.get("desc") or "").strip()
        parsed.append(
            dict(
                title=caption if caption else "untitled",
                video_id=identifier,
                video_url=choose_video_url([str(candidate) for candidate in candidates]),
            )
        )

    return parsed
|
||||
|
||||
|
||||
def build_headers(referer: str) -> dict[str, str]:
    """Return the HTTP headers sent with each video download.

    The mapping contains the given ``referer`` and a fixed desktop-Chrome
    ``user-agent`` string.
    """
    headers = {"referer": referer}
    headers["user-agent"] = (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/135.0.0.0 Safari/537.36"
    )
    return headers
|
||||
|
||||
|
||||
def import_runtime_dependencies() -> tuple[Any, Any, Any]:
    """Import the third-party packages lazily, at crawl time.

    Importing here rather than at module level keeps this module importable
    when ``requests`` or ``DrissionPage`` is not installed.

    Returns:
        ``(requests, ChromiumPage, ChromiumOptions)``.

    Raises:
        SystemExit: With an installation hint if either package is missing.
    """
    try:
        import requests
    except ModuleNotFoundError as exc:
        raise SystemExit("缺少 requests,请先执行: python3 -m pip install requests") from exc

    try:
        from DrissionPage import ChromiumOptions, ChromiumPage
    except ModuleNotFoundError as exc:
        raise SystemExit("缺少 DrissionPage,请先执行: python3 -m pip install DrissionPage") from exc

    return requests, ChromiumPage, ChromiumOptions
|
||||
|
||||
|
||||
def create_page(chromium_page_cls: Any, chromium_options_cls: Any, browser_port: int | None) -> Any:
    """Instantiate the browser page object.

    Without a port the page class is called with no arguments.  With a port,
    an options object is configured with ``set_address("127.0.0.1:<port>")``
    and ``existing_only(True)`` and passed to the page class.
    """
    address = build_browser_address(browser_port)
    if address is not None:
        options = chromium_options_cls()
        options = options.set_address(address)
        options = options.existing_only(True)
        return chromium_page_cls(options)
    return chromium_page_cls()
|
||||
|
||||
|
||||
def wait_for_aweme_packet(page: Any, timeout: int) -> Any | None:
    """Wait for the next captured packet and return it, or ``None`` on failure.

    Args:
        page: Object exposing ``listen.wait(timeout=...)``.
        timeout: Maximum number of seconds to wait.

    Returns:
        The captured packet, or ``None`` when the wait timed out or raised.
        A warning is printed in both failure cases.
    """
    try:
        packet = page.listen.wait(timeout=timeout)
    except Exception as exc:
        # Best effort: one failed wait must not abort the whole crawl.
        print(f"[WARN] 等待接口数据超时或失败: {exc}")
        return None

    # DrissionPage signals a timeout by returning False instead of raising.
    # Normalise it to None so callers that test "is None" do not go on to
    # dereference a bool.
    if packet is None or packet is False:
        print(f"[WARN] 等待接口数据超时或失败: {timeout} 秒内未捕获到匹配的请求")
        return None
    return packet
|
||||
|
||||
|
||||
def scroll_to_next_page(page: Any) -> None:
    """Scroll the page to the bottom of the document, then pause for two seconds."""
    jump_to_bottom = "window.scrollTo(0, document.body.scrollHeight);"
    page.run_js(jump_to_bottom)
    time.sleep(2)
|
||||
|
||||
|
||||
def download_video(
    requests_module: Any,
    headers: dict[str, str],
    video_url: str,
    output_path: Path,
) -> None:
    """Download ``video_url`` to ``output_path`` without buffering it in memory.

    The body is streamed in 1 MiB chunks into ``<output_path>.part`` and moved
    to ``output_path`` only after the transfer completed, so a failed or
    interrupted download never leaves a truncated ``.mp4`` behind.

    Args:
        requests_module: The ``requests`` module (or a compatible object
            providing ``get``).
        headers: HTTP headers to send.
        video_url: Address of the video file.
        output_path: Final location; parent directories are created as needed.

    Raises:
        Exception: Whatever ``get``, ``raise_for_status``, the stream or the
            file system raises; the partial file is removed before the error
            propagates.
    """
    partial_path = output_path.with_name(output_path.name + ".part")
    try:
        # "with" closes the response (releasing the connection) on every path.
        with requests_module.get(video_url, headers=headers, timeout=60, stream=True) as response:
            response.raise_for_status()
            output_path.parent.mkdir(parents=True, exist_ok=True)
            with partial_path.open("wb") as handle:
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    if chunk:  # skip empty chunks
                        handle.write(chunk)
        partial_path.replace(output_path)
    finally:
        # After a successful replace() the partial file is gone and this is a no-op.
        if partial_path.exists():
            partial_path.unlink()
|
||||
|
||||
|
||||
def _download_new_items(
    items: list[dict[str, str]],
    seen_ids: set[str],
    requests_module: Any,
    headers: dict[str, str],
    output_dir: Path,
) -> int:
    """Download every item whose id is not in ``seen_ids``; return how many were saved.

    Ids are added to ``seen_ids`` before the download starts, so a failed
    download is not retried when the same id shows up again.
    """
    saved = 0
    for item in items:
        video_id = item["video_id"]
        if video_id in seen_ids:
            continue
        seen_ids.add(video_id)

        target = build_output_path(
            title=item["title"],
            video_id=video_id,
            output_dir=output_dir,
        )
        try:
            download_video(
                requests_module=requests_module,
                headers=headers,
                video_url=item["video_url"],
                output_path=target,
            )
        except Exception as exc:
            # One broken video must not stop the rest of the page.
            print(f"[WARN] 下载失败 {video_id}: {exc}")
            continue

        saved += 1
        print(f"[OK] 已保存: {target}")
    return saved


def collect_videos(
    user_url: str,
    max_pages: int,
    timeout: int,
    output_dir: Path,
    browser_port: int | None,
) -> int:
    """Open a creator page, capture post-list responses and download the videos.

    Steps: import the runtime dependencies, check the debugging port when one
    is given, create the page, start listening for ``LISTEN_TARGET`` and open
    ``user_url``.  Then, for each of ``max_pages`` rounds: wait for a packet,
    parse it, download the videos not seen before and scroll to the bottom.
    A round whose wait or parse fails only scrolls.

    Args:
        user_url: Creator profile URL; also sent as the download referer.
        max_pages: Number of wait/scroll rounds.
        timeout: Seconds to wait for a packet in each round.
        output_dir: Directory that receives the ``.mp4`` files.
        browser_port: Debugging port of a running Chrome to attach to, or
            ``None`` to let the page class start its own browser.

    Returns:
        The number of videos saved.

    Raises:
        RuntimeError: If ``browser_port`` is given but not reachable.
        SystemExit: If a runtime dependency is missing.
    """
    requests_module, chromium_page_cls, chromium_options_cls = import_runtime_dependencies()
    headers = build_headers(user_url)
    if browser_port is not None:
        ensure_browser_debug_port_ready(browser_port)
    page = create_page(chromium_page_cls, chromium_options_cls, browser_port)
    # Start listening before navigating so the first response is captured too.
    page.listen.start(LISTEN_TARGET)

    print("[INFO] 正在打开抖音主页。若出现登录或验证码,请先在浏览器窗口里完成。")
    page.get(user_url)
    time.sleep(3)

    total = 0
    handled_ids: set[str] = set()

    for page_number in range(1, max_pages + 1):
        print(f"[INFO] 正在处理第 {page_number} 页")
        packet = wait_for_aweme_packet(page, timeout=timeout)

        items = None
        if packet is not None:
            try:
                items = parse_aweme_items(extract_aweme_payload(packet.response))
            except Exception as exc:
                print(f"[WARN] 解析接口数据失败: {exc}")

        if items is not None:
            if not items:
                print("[WARN] 这一页没有解析到视频。")
            total += _download_new_items(
                items=items,
                seen_ids=handled_ids,
                requests_module=requests_module,
                headers=headers,
                output_dir=output_dir,
            )

        # Every round ends with a scroll, whether or not data arrived.
        scroll_to_next_page(page)

    return total
|
||||
|
||||
|
||||
def build_parser() -> argparse.ArgumentParser:
    """Build the crawler's command-line parser.

    Defines the optional positional ``user_url`` plus ``--pages``,
    ``--timeout``, ``--output-dir`` and ``--browser-port``.
    """
    cli = argparse.ArgumentParser(description="监听抖音作品接口并下载视频")

    # (flag, keyword arguments) pairs, registered in this order.
    argument_specs = [
        ("user_url", dict(nargs="?", default=DEFAULT_USER_URL, help="抖音博主主页 URL")),
        ("--pages", dict(type=int, default=10, help="最多抓取多少页,默认 10")),
        ("--timeout", dict(type=int, default=10, help="单次等待接口响应秒数,默认 10")),
        ("--output-dir", dict(default="video", help="视频输出目录,默认 video")),
        (
            "--browser-port",
            dict(
                type=int,
                default=None,
                help="附着到已启动 Chrome 的调试端口,例如 9223;不传则由 DrissionPage 新开浏览器",
            ),
        ),
    ]
    for flag, options in argument_specs:
        cli.add_argument(flag, **options)
    return cli
|
||||
|
||||
|
||||
def main(argv: list[str] | None = None) -> int:
    """Run the crawler from the command line.

    Args:
        argv: Argument list; ``None`` makes argparse read ``sys.argv``.

    Returns:
        ``0`` on success, ``1`` when ``collect_videos`` raises
        ``RuntimeError``, ``130`` when the user interrupts with Ctrl-C.
        Invalid arguments exit through ``parser.error``.
    """
    parser = build_parser()
    args = parser.parse_args(argv)

    if args.pages <= 0:
        parser.error("--pages 必须大于 0")
    if args.timeout <= 0:
        parser.error("--timeout 必须大于 0")
    # TCP ports are 16-bit.  Reject out-of-range values here with a clear
    # message instead of letting them reach the socket layer.
    if args.browser_port is not None and not 0 < args.browser_port <= 65535:
        parser.error("--browser-port 必须在 1 到 65535 之间")

    try:
        total = collect_videos(
            user_url=args.user_url,
            max_pages=args.pages,
            timeout=args.timeout,
            output_dir=Path(args.output_dir),
            browser_port=args.browser_port,
        )
    except RuntimeError as exc:
        print(f"[ERROR] {exc}")
        return 1
    except KeyboardInterrupt:
        print("\n[INFO] 用户中断。")
        return 130

    print(f"[INFO] 处理结束,共下载 {total} 个视频。")
    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main())
|
||||
198
docs/superpowers/plans/2026-04-17-douyin-login-entry.md
Normal file
198
docs/superpowers/plans/2026-04-17-douyin-login-entry.md
Normal file
@ -0,0 +1,198 @@
|
||||
# Douyin Login Entry Implementation Plan
|
||||
|
||||
> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
|
||||
|
||||
**Goal:** Add a dedicated browser-login launcher and a clearer attach-port check so the Douyin crawler has a stable two-step workflow: login first, crawl second.
|
||||
|
||||
**Architecture:** Keep browser-launch responsibilities in a new `login_douyin.py` script and keep crawl responsibilities in `Douyin.py`. Add a small socket-based port readiness check before attaching to Chrome, and cover the new behavior with unit tests before implementing production code.
|
||||
|
||||
**Tech Stack:** Python 3, `argparse`, `pathlib`, `subprocess`, `socket`, `unittest`
|
||||
|
||||
---
|
||||
|
||||
### Task 1: Write failing tests for the new login launcher
|
||||
|
||||
**Files:**
|
||||
- Create: `login_douyin.py`
|
||||
- Create: `test_login_douyin.py`
|
||||
|
||||
- [ ] **Step 1: Write the failing test**
|
||||
|
||||
```python
|
||||
def test_build_login_command_uses_expected_chrome_arguments(self) -> None:
|
||||
module = importlib.import_module("login_douyin")
|
||||
command = module.build_login_command(
|
||||
chrome_path="/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
|
||||
profile_dir=Path("/tmp/douyin-profile"),
|
||||
browser_port=9223,
|
||||
user_url="https://www.douyin.com/user/example",
|
||||
)
|
||||
self.assertEqual(
|
||||
command,
|
||||
[
|
||||
"/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
|
||||
"--user-data-dir=/tmp/douyin-profile",
|
||||
"--remote-debugging-port=9223",
|
||||
"https://www.douyin.com/user/example",
|
||||
],
|
||||
)
|
||||
```
|
||||
|
||||
- [ ] **Step 2: Run test to verify it fails**
|
||||
|
||||
Run: `./.venv/bin/python -m unittest test_login_douyin.py -v`
|
||||
Expected: FAIL because `login_douyin.py` does not exist yet.
|
||||
|
||||
- [ ] **Step 3: Write minimal implementation**
|
||||
|
||||
Create `login_douyin.py` with:
|
||||
|
||||
- `DEFAULT_CHROME_PATH`
|
||||
- `DEFAULT_BROWSER_PORT = 9223`
|
||||
- `DEFAULT_PROFILE_DIR`
|
||||
- `build_login_command(...)`
|
||||
|
||||
- [ ] **Step 4: Run test to verify it passes**
|
||||
|
||||
Run: `./.venv/bin/python -m unittest test_login_douyin.py -v`
|
||||
Expected: PASS for the command-building test.
|
||||
|
||||
- [ ] **Step 5: Commit**
|
||||
|
||||
Not applicable here because the workspace is not a git repository.
|
||||
|
||||
### Task 2: Add tests and implementation for launcher validation and user guidance
|
||||
|
||||
**Files:**
|
||||
- Modify: `login_douyin.py`
|
||||
- Modify: `test_login_douyin.py`
|
||||
|
||||
- [ ] **Step 1: Write the failing tests**
|
||||
|
||||
Add tests for:
|
||||
|
||||
- parser defaults use `9223`
|
||||
- `main()` creates the profile dir
|
||||
- `main()` prints the follow-up crawl command
|
||||
- `main()` returns non-zero with a readable message when the Chrome path does not exist
|
||||
|
||||
- [ ] **Step 2: Run tests to verify they fail**
|
||||
|
||||
Run: `./.venv/bin/python -m unittest test_login_douyin.py -v`
|
||||
Expected: FAIL because validation and guidance behavior is not implemented yet.
|
||||
|
||||
- [ ] **Step 3: Write minimal implementation**
|
||||
|
||||
Add to `login_douyin.py`:
|
||||
|
||||
- `build_parser()`
|
||||
- `launch_browser(...)`
|
||||
- `main(...)`
|
||||
- readable error handling: print a clear `[ERROR] ...` message and return a non-zero exit code (or exit with a readable `SystemExit` message) instead of letting a traceback escape
|
||||
|
||||
- [ ] **Step 4: Run tests to verify they pass**
|
||||
|
||||
Run: `./.venv/bin/python -m unittest test_login_douyin.py -v`
|
||||
Expected: PASS
|
||||
|
||||
- [ ] **Step 5: Commit**
|
||||
|
||||
Not applicable here because the workspace is not a git repository.
|
||||
|
||||
### Task 3: Write failing tests for attach-port readiness in the crawler
|
||||
|
||||
**Files:**
|
||||
- Modify: `Douyin.py`
|
||||
- Modify: `test_douyin.py`
|
||||
|
||||
- [ ] **Step 1: Write the failing tests**
|
||||
|
||||
Add tests for:
|
||||
|
||||
- `ensure_browser_debug_port_ready()` returns successfully when a temporary local server is listening
|
||||
- `ensure_browser_debug_port_ready()` raises a readable `RuntimeError` when the port is unavailable
|
||||
|
||||
- [ ] **Step 2: Run tests to verify they fail**
|
||||
|
||||
Run: `./.venv/bin/python -m unittest test_douyin.py -v`
|
||||
Expected: FAIL because the function does not exist yet.
|
||||
|
||||
- [ ] **Step 3: Write minimal implementation**
|
||||
|
||||
Add to `Douyin.py`:
|
||||
|
||||
- socket-based readiness helper
|
||||
- call it in `collect_videos()` before `create_page(...)` when `browser_port` is provided
|
||||
|
||||
- [ ] **Step 4: Run tests to verify they pass**
|
||||
|
||||
Run: `./.venv/bin/python -m unittest test_douyin.py -v`
|
||||
Expected: PASS
|
||||
|
||||
- [ ] **Step 5: Commit**
|
||||
|
||||
Not applicable here because the workspace is not a git repository.
|
||||
|
||||
### Task 4: Update usage documentation
|
||||
|
||||
**Files:**
|
||||
- Modify: `抖音爬取视频.md`
|
||||
|
||||
- [ ] **Step 1: Write the failing doc expectation**
|
||||
|
||||
Define the required doc updates:
|
||||
|
||||
- explicit step 1 command for `login_douyin.py`
|
||||
- explicit step 2 command for `Douyin.py --browser-port 9223`
|
||||
- short note that login state is kept in the dedicated profile dir
|
||||
|
||||
- [ ] **Step 2: Verify current doc is incomplete**
|
||||
|
||||
Run: `rg -n "login_douyin.py|--browser-port 9223" 抖音爬取视频.md`
|
||||
Expected: no matches or incomplete guidance
|
||||
|
||||
- [ ] **Step 3: Write minimal documentation update**
|
||||
|
||||
Append a short “推荐流程” section to `抖音爬取视频.md`.
|
||||
|
||||
- [ ] **Step 4: Verify the doc contains the new commands**
|
||||
|
||||
Run: `rg -n "login_douyin.py|--browser-port 9223" 抖音爬取视频.md`
|
||||
Expected: matches for both commands
|
||||
|
||||
- [ ] **Step 5: Commit**
|
||||
|
||||
Not applicable here because the workspace is not a git repository.
|
||||
|
||||
### Task 5: Run full verification
|
||||
|
||||
**Files:**
|
||||
- Modify: `Douyin.py`
|
||||
- Modify: `login_douyin.py`
|
||||
- Modify: `test_douyin.py`
|
||||
- Modify: `test_login_douyin.py`
|
||||
- Modify: `抖音爬取视频.md`
|
||||
|
||||
- [ ] **Step 1: Run the full unit test suite**
|
||||
|
||||
Run: `./.venv/bin/python -m unittest test_douyin.py test_login_douyin.py -v`
|
||||
Expected: all tests pass
|
||||
|
||||
- [ ] **Step 2: Run the login launcher manually**
|
||||
|
||||
Run: `./.venv/bin/python login_douyin.py --browser-port 9223`
|
||||
Expected: visible Chrome launches and prints the next crawl command
|
||||
|
||||
- [ ] **Step 3: Run the crawler against the logged-in browser**
|
||||
|
||||
Run: `./.venv/bin/python Douyin.py --pages 1 --timeout 20 --browser-port 9223`
|
||||
Expected: videos are downloaded to `video/`
|
||||
|
||||
- [ ] **Step 4: Review changed files for scope drift**
|
||||
|
||||
Run: `rg --files`
|
||||
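Expected: the listing contains the planned files and no unexpected new ones (`rg --files` lists files rather than changes, so compare it against the file list in this plan)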
|
||||
|
||||
- [ ] **Step 5: Commit**
|
||||
|
||||
Not applicable here because the workspace is not a git repository.
|
||||
191
docs/superpowers/specs/2026-04-17-douyin-login-entry-design.md
Normal file
191
docs/superpowers/specs/2026-04-17-douyin-login-entry-design.md
Normal file
@ -0,0 +1,191 @@
|
||||
# Douyin Login Entry Design
|
||||
|
||||
## Goal
|
||||
|
||||
将当前“手动先开浏览器登录,再让 `Douyin.py` 附着到调试端口抓取”的隐式流程,固化为稳定、明确、可复用的两步式命令行入口。
|
||||
|
||||
## Current Context
|
||||
|
||||
- 现有抓取实现位于 `Douyin.py`。
|
||||
- `Douyin.py` 已支持通过 `--browser-port` 附着到已启动的 Chrome 调试端口。
|
||||
- 本次实测已经证明:用户先在可见 Chrome 中登录抖音并通过验证码后,`Douyin.py --browser-port 9223` 可以成功抓到 `web/aweme/post/` 接口并下载视频。
|
||||
- 当前缺少一个明确的“登录准备入口”,导致可操作性依赖人工记忆和临时命令。
|
||||
|
||||
## Requirements
|
||||
|
||||
### Functional
|
||||
|
||||
1. 提供一个独立脚本,用于启动可见 Chrome,并固定:
|
||||
- 调试端口,默认 `9223`
|
||||
- 用户数据目录,默认使用一个项目约定路径
|
||||
- 打开的初始 URL,默认指向现有抖音博主页
|
||||
2. 登录脚本只负责“打开浏览器并提示用户手动登录”,不负责抓取。
|
||||
3. `Douyin.py` 继续负责抓取,并保持“附着已有浏览器”的职责边界。
|
||||
4. 当 `Douyin.py` 指定了 `--browser-port` 但端口不可连通时,应给出清晰错误,提示先运行登录脚本。
|
||||
5. 文档应给出最短可执行流程:
|
||||
- 第一步:启动浏览器并登录
|
||||
- 第二步:运行抓取命令
|
||||
|
||||
### Non-Functional
|
||||
|
||||
1. 不改变现有抓包、解析、下载的主逻辑。
|
||||
2. 保持现有命令参数兼容。
|
||||
3. 入口职责清晰,便于排查“登录问题”和“抓取问题”。
|
||||
4. 新增行为应具备可自动化测试的核心单元。
|
||||
|
||||
## Chosen Approach
|
||||
|
||||
采用双脚本方案:
|
||||
|
||||
- 新增 `login_douyin.py`
|
||||
- 负责启动可见 Chrome
|
||||
- 固定 remote debugging port
|
||||
- 固定 profile 目录
|
||||
- 打开目标用户主页
|
||||
- 输出明确提示,引导用户完成手动登录和验证码
|
||||
- 保留 `Douyin.py`
|
||||
- 继续承担附着浏览器、监听接口、下载视频的职责
|
||||
- 增强附着前检查与报错信息
|
||||
|
||||
## Rejected Alternatives
|
||||
|
||||
### Alternative 1: 将“启动浏览器”直接并入 `Douyin.py`
|
||||
|
||||
不采用。原因:
|
||||
|
||||
- 会让 `Douyin.py` 同时承担登录准备和抓取职责。
|
||||
- 错误定位会变差,用户更难区分是登录失败还是抓取失败。
|
||||
- 未来若需要“先登录、稍后再抓”,这种合并入口不灵活。
|
||||
|
||||
### Alternative 2: 只写 shell 脚本串联所有步骤
|
||||
|
||||
不采用。原因:
|
||||
|
||||
- 逻辑容易散落在 shell 中,测试性差。
|
||||
- 浏览器启动参数、等待逻辑和抓取命令耦合度高。
|
||||
- 后续若要扩展默认参数或跨平台兼容,shell 方案维护成本更高。
|
||||
|
||||
## Proposed CLI UX
|
||||
|
||||
### Step 1: 启动登录浏览器
|
||||
|
||||
```bash
|
||||
./.venv/bin/python login_douyin.py
|
||||
```
|
||||
|
||||
默认行为:
|
||||
|
||||
- 启动可见 Chrome
|
||||
- 调试端口为 `9223`
|
||||
- profile 目录为项目约定的本地路径
|
||||
- 打开默认的抖音主页 URL
|
||||
- 输出“请在浏览器中完成登录/验证码,然后再运行抓取命令”
|
||||
|
||||
可选扩展参数:
|
||||
|
||||
- `--browser-port`
|
||||
- `--profile-dir`
|
||||
- `--user-url`
|
||||
- `--chrome-path`
|
||||
|
||||
### Step 2: 运行抓取
|
||||
|
||||
```bash
|
||||
./.venv/bin/python Douyin.py --pages 1 --browser-port 9223
|
||||
```
|
||||
|
||||
## Design Details
|
||||
|
||||
### `login_douyin.py`
|
||||
|
||||
建议拆分为可测试的小函数:
|
||||
|
||||
- `build_login_command(...)`
|
||||
- 输入 Chrome 路径、profile 目录、端口、URL
|
||||
- 输出适合 `subprocess.Popen(...)` 的参数列表
|
||||
- `launch_browser(...)`
|
||||
- 调用 `subprocess.Popen(...)`
|
||||
- `build_parser()`
|
||||
- 定义 CLI 参数
|
||||
- `main()`
|
||||
- 解析参数
|
||||
- 启动浏览器
|
||||
- 打印下一步指引
|
||||
|
||||
### `Douyin.py`
|
||||
|
||||
新增一个显式的端口检查函数,例如:
|
||||
|
||||
- `ensure_browser_debug_port_ready(browser_port: int) -> None`
|
||||
|
||||
行为:
|
||||
|
||||
- 仅当用户传入 `--browser-port` 时执行
|
||||
- 尝试连接 `127.0.0.1:<port>`
|
||||
- 若失败,抛出清晰错误,提示:
|
||||
- 先启动 `login_douyin.py`
|
||||
- 确认 Chrome 仍在运行
|
||||
- 确认端口与抓取命令一致
|
||||
|
||||
## Error Handling
|
||||
|
||||
### 登录脚本
|
||||
|
||||
- Chrome 可执行文件不存在:直接报错并退出。
|
||||
- 浏览器启动失败:输出异常原因并返回非零退出码。
|
||||
- profile 目录不存在:自动创建。
|
||||
|
||||
### 抓取脚本
|
||||
|
||||
- 指定 `--browser-port` 但端口不可达:立即失败,不进入抓取流程。
|
||||
- 登录未完成导致页面异常:保留现有抓包等待与警告逻辑。
|
||||
|
||||
## Testing Strategy
|
||||
|
||||
### Unit Tests
|
||||
|
||||
新增或扩展 `test_douyin.py`,覆盖:
|
||||
|
||||
1. `build_login_command()` 生成的命令参数正确。
|
||||
2. 默认调试地址仍为 `127.0.0.1:<port>`。
|
||||
3. `ensure_browser_debug_port_ready()` 在端口不可达时抛出可读错误。
|
||||
4. `ensure_browser_debug_port_ready()` 在端口可达时正常返回。
|
||||
|
||||
如测试边界过大,可新增 `test_login_douyin.py`。
|
||||
|
||||
### Manual Verification
|
||||
|
||||
1. 运行 `./.venv/bin/python login_douyin.py`
|
||||
2. 在打开的 Chrome 中登录抖音并通过验证码
|
||||
3. 运行 `./.venv/bin/python Douyin.py --pages 1 --browser-port 9223`
|
||||
4. 确认 `video/` 下生成新的 mp4 文件
|
||||
|
||||
## Implementation Boundaries
|
||||
|
||||
本次只做以下改动:
|
||||
|
||||
- 新增登录入口脚本
|
||||
- 为抓取入口补充附着前端口检查
|
||||
- 更新测试
|
||||
- 更新使用文档
|
||||
|
||||
本次不做以下改动:
|
||||
|
||||
- 不重写抓取主流程
|
||||
- 不改成单命令自动等待登录
|
||||
- 不引入 Playwright 作为正式运行时依赖
|
||||
- 不增加下载调度、断点续传或批量任务管理
|
||||
|
||||
## Risks
|
||||
|
||||
1. 本机 Chrome 路径可能与预设不同,因此需要保留 `--chrome-path` 覆盖能力。
|
||||
2. profile 目录固定后,用户可能重复复用登录态,这是预期行为,但文档需说明。
|
||||
3. 若目标端口被其他进程占用,登录脚本需要给出可诊断的失败信息或允许端口覆盖。
|
||||
|
||||
## Success Criteria
|
||||
|
||||
满足以下条件即视为完成:
|
||||
|
||||
1. 用户可以通过固定命令启动登录浏览器。
|
||||
2. 用户登录完成后,可通过固定命令让 `Douyin.py` 成功附着并抓取。
|
||||
3. 当浏览器未启动或端口错误时,抓取脚本会给出明确提示,而不是模糊失败。
|
||||
215
externaldocs/2026-04-17-douyin-targeted-crawling-requirements.md
Normal file
215
externaldocs/2026-04-17-douyin-targeted-crawling-requirements.md
Normal file
@ -0,0 +1,215 @@
|
||||
# Douyin Targeted Crawling Requirements
|
||||
|
||||
## Goal
|
||||
|
||||
在现有“登录浏览器后附着抓取”的基础上,扩展为支持更明确的目标选择能力,使系统不仅能抓默认博主主页,还能:
|
||||
|
||||
- 指定某个博主主页进行抓取
|
||||
- 直接抓当前浏览器里正在查看的博主主页
|
||||
- 指定某个单独视频进行抓取
|
||||
|
||||
本需求文档只定义需求、范围、交互、错误处理和 TDD 约束,不直接定义实现细节代码。
|
||||
|
||||
## Current Behavior
|
||||
|
||||
当前系统具备以下行为:
|
||||
|
||||
- 通过 `login_douyin.py` 启动可见 Chrome,并开启调试端口
|
||||
- 通过 `Douyin.py` 附着到该浏览器
|
||||
- 打开某个博主主页 URL
|
||||
- 监听抖音作品列表接口 `web/aweme/post/`
|
||||
- 从接口返回的 `aweme_list` 中提取视频地址并下载
|
||||
|
||||
当前默认目标是一个硬编码博主主页,但也支持在命令行传入另一个博主主页 URL。
|
||||
|
||||
## Target Modes
|
||||
|
||||
新版本必须同时支持以下三种目标模式:
|
||||
|
||||
### 1. `creator-url`
|
||||
|
||||
用户显式传入某个博主主页 URL,系统以该博主主页为目标进行抓取。
|
||||
|
||||
### 2. `current-creator`
|
||||
|
||||
系统直接读取当前已附着浏览器正在查看的页面。如果当前页面是博主主页,则以该页面为目标进行抓取。
|
||||
|
||||
### 3. `single-video`
|
||||
|
||||
用户传入单个视频链接或 `aweme_id`,系统仅下载这一条视频,不执行博主作品列表抓取。
|
||||
|
||||
## Scope Rules
|
||||
|
||||
### Creator Targets
|
||||
|
||||
当目标是博主时,默认只抓“当前页面中已加载、当前可见范围对应的作品”。
|
||||
|
||||
这意味着:
|
||||
|
||||
- 不默认自动抓完整个博主全部作品
|
||||
- 不默认自动多页翻完所有历史内容
|
||||
- 不自动替用户点击筛选器或改动页面状态
|
||||
- 如果用户已经在页面里手动做了筛选、切换或滚动,则抓取结果以当前页面已加载状态为准
|
||||
|
||||
### Single Video Target
|
||||
|
||||
当目标是单视频时:
|
||||
|
||||
- 若输入是视频 URL,系统需要先解析出对应作品标识
|
||||
- 若输入是 `aweme_id`,系统直接按单作品逻辑抓取
|
||||
- 最终只下载一条视频
|
||||
|
||||
## Recommended User Experience
|
||||
|
||||
保留现有两步模式,不改成自动登录的一体化入口:
|
||||
|
||||
### Step 1
|
||||
|
||||
先启动登录浏览器:
|
||||
|
||||
```bash
|
||||
./.venv/bin/python login_douyin.py
|
||||
```
|
||||
|
||||
### Step 2
|
||||
|
||||
登录完成后,再运行抓取命令。
|
||||
|
||||
未来命令行接口应支持显式目标模式,例如:
|
||||
|
||||
```bash
|
||||
./.venv/bin/python Douyin.py --mode creator-url --target "https://www.douyin.com/user/..."
|
||||
./.venv/bin/python Douyin.py --mode current-creator
|
||||
./.venv/bin/python Douyin.py --mode single-video --target "https://www.douyin.com/video/..."
|
||||
./.venv/bin/python Douyin.py --mode single-video --target "7619989983668240802"
|
||||
```
|
||||
|
||||
上面只是推荐交互形态,具体参数名可在实现设计阶段微调,但必须满足以下原则:
|
||||
|
||||
- 模式必须显式可区分
|
||||
- “当前浏览器页面”与“传入 URL”不能混淆
|
||||
- 单视频目标与博主目标不能混淆
|
||||
|
||||
## Functional Requirements
|
||||
|
||||
### Requirement A: Explicit Creator URL Crawling
|
||||
|
||||
系统必须允许用户通过博主主页 URL 指定抓取目标。
|
||||
|
||||
完成条件:
|
||||
|
||||
- 系统接受有效博主主页 URL
|
||||
- 浏览器打开或切换到该 URL
|
||||
- 系统只抓当前页面已加载的作品
|
||||
|
||||
### Requirement B: Current Browser Creator Crawling
|
||||
|
||||
系统必须允许用户不手输目标 URL,而是直接抓当前浏览器页面对应的博主主页。
|
||||
|
||||
完成条件:
|
||||
|
||||
- 系统能读取当前浏览器页面 URL
|
||||
- 若当前页面是博主主页,则正常抓取
|
||||
- 若当前页面不是博主主页,则明确报错并退出
|
||||
|
||||
### Requirement C: Single Video Download
|
||||
|
||||
系统必须允许用户通过单个视频链接或 `aweme_id` 只下载一个视频。
|
||||
|
||||
完成条件:
|
||||
|
||||
- 支持视频 URL 输入
|
||||
- 支持 `aweme_id` 输入
|
||||
- 最终只落地一个视频文件
|
||||
|
||||
### Requirement D: Visible-Only Creator Scope
|
||||
|
||||
当目标是博主时,系统默认只处理当前页面已经加载出来的作品。
|
||||
|
||||
完成条件:
|
||||
|
||||
- 不自动继续滚动抓到所有历史内容
|
||||
- 抓取范围受当前页面加载状态约束
|
||||
- 用户先手动筛选、滚动、切换后,再执行抓取时,系统按当前页面状态工作
|
||||
|
||||
## Error Handling Requirements
|
||||
|
||||
系统必须提供明确错误,不允许模糊失败。
|
||||
|
||||
### Current Creator Errors
|
||||
|
||||
- 当前页面不是博主主页:报错并退出
|
||||
- 当前页面虽然像博主页,但未加载出可用作品数据:提示用户先完成页面操作后重试
|
||||
|
||||
### Single Video Errors
|
||||
|
||||
- 输入既不是合法视频 URL,也不是合法 `aweme_id`:报错并退出
|
||||
- 视频标识无法解析:报错并退出
|
||||
|
||||
### Browser Attachment Errors
|
||||
|
||||
- 调试端口不可用:提示先运行登录脚本并确认浏览器仍在运行
|
||||
|
||||
### Creator URL Errors
|
||||
|
||||
- 传入 URL 不是受支持的抖音博主主页:报错并退出
|
||||
|
||||
## Non-Goals
|
||||
|
||||
本次需求明确不包含以下内容:
|
||||
|
||||
- 任意网页抓取
|
||||
- 非抖音站点抓取
|
||||
- 自动替用户点击页面筛选器
|
||||
- 自动抓完整个博主全部历史作品
|
||||
- 自动搜索博主
|
||||
- 自动在抖音站内执行复杂导航流程
|
||||
|
||||
## Terminology
|
||||
|
||||
### `aweme`
|
||||
|
||||
抖音接口中的作品对象,可以理解为一条内容或一个视频作品实体。
|
||||
|
||||
### `aweme_id`
|
||||
|
||||
抖音作品的唯一标识。
|
||||
|
||||
### `current visible videos`
|
||||
|
||||
指当前页面已经加载出来,并能够通过当前页面对应接口响应获得的作品集合,而不是博主的全量历史作品。
|
||||
|
||||
## TDD Requirements
|
||||
|
||||
本需求后续实现必须使用 TDD。
|
||||
|
||||
### Mandatory Process
|
||||
|
||||
1. 先写失败测试
|
||||
2. 先验证测试是因为功能未实现而失败
|
||||
3. 再写最小实现让测试通过
|
||||
4. 最后再做必要重构
|
||||
|
||||
### Required Test Areas
|
||||
|
||||
至少覆盖以下测试:
|
||||
|
||||
- `creator-url` 模式下,合法博主主页 URL 能被识别并生成正确抓取目标
|
||||
- `current-creator` 模式下,当前页面是博主主页时可抓取
|
||||
- `current-creator` 模式下,当前页面不是博主主页时明确报错
|
||||
- `single-video` 模式支持视频 URL
|
||||
- `single-video` 模式支持 `aweme_id`
|
||||
- 创作者抓取默认只处理当前已加载内容,不自动继续翻页
|
||||
- 目标模式错误时的报错路径
|
||||
- 浏览器端口不可用时的报错路径
|
||||
|
||||
## Acceptance Criteria
|
||||
|
||||
需求完成后,应满足以下验收标准:
|
||||
|
||||
1. 用户可以显式指定博主主页 URL 抓取
|
||||
2. 用户可以直接抓当前浏览器中的博主主页
|
||||
3. 用户可以指定单个视频 URL 或 `aweme_id` 下载单条视频
|
||||
4. 当目标是博主时,默认只抓当前页面已加载作品
|
||||
5. 关键失败场景都有明确报错
|
||||
6. 实现过程遵循 TDD,并有对应自动化测试覆盖
|
||||
BIN
image-1.png
Normal file
BIN
image-1.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 460 KiB |
BIN
image-2.png
Normal file
BIN
image-2.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 1.9 MiB |
122
login_douyin.py
Normal file
122
login_douyin.py
Normal file
@ -0,0 +1,122 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import socket
|
||||
import subprocess
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
from Douyin import DEFAULT_USER_URL
|
||||
|
||||
# Default for --browser-port.
DEFAULT_BROWSER_PORT = 9223

# Default for --chrome-path.
DEFAULT_CHROME_PATH = "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"

# Default for --profile-dir; relative, main() resolves it to an absolute path.
DEFAULT_PROFILE_DIR = Path(".douyin-chrome-profile")
|
||||
|
||||
|
||||
def derive_chrome_app_path(chrome_path: str) -> str:
    """Return the ``*.app`` bundle path that contains ``chrome_path``.

    Everything after the first ``".app/"`` is cut off.  A path without that
    marker is returned unchanged.
    """
    bundle_prefix, marker, _inside_bundle = chrome_path.partition(".app/")
    if not marker:
        return chrome_path
    return bundle_prefix + ".app"
|
||||
|
||||
|
||||
def build_login_command(
    chrome_path: str,
    profile_dir: Path,
    browser_port: int,
    user_url: str,
) -> list[str]:
    """Assemble the ``open -na <app> --args ...`` command that starts Chrome.

    Chrome receives ``--user-data-dir=<profile_dir>``,
    ``--remote-debugging-port=<browser_port>`` and ``user_url``.  The
    application is the bundle returned by ``derive_chrome_app_path``.
    """
    launcher = ["open", "-na", derive_chrome_app_path(chrome_path), "--args"]
    chrome_arguments = [
        f"--user-data-dir={profile_dir}",
        f"--remote-debugging-port={browser_port}",
        user_url,
    ]
    return launcher + chrome_arguments
|
||||
|
||||
|
||||
def build_parser() -> argparse.ArgumentParser:
    """Build the login launcher's command-line parser.

    Defines ``--chrome-path``, ``--profile-dir``, ``--browser-port`` and
    ``--user-url``, each defaulting to the matching module-level constant.
    """
    cli = argparse.ArgumentParser(description="启动可见 Chrome,供抖音手动登录后附着抓取")

    # (flag, keyword arguments) pairs, registered in this order.
    option_specs = [
        ("--chrome-path", dict(default=DEFAULT_CHROME_PATH, help="Chrome 可执行文件路径")),
        (
            "--profile-dir",
            dict(
                default=str(DEFAULT_PROFILE_DIR),
                help="Chrome 用户数据目录,默认复用项目内固定目录",
            ),
        ),
        (
            "--browser-port",
            dict(type=int, default=DEFAULT_BROWSER_PORT, help="Chrome 调试端口,默认 9223"),
        ),
        ("--user-url", dict(default=DEFAULT_USER_URL, help="启动后打开的抖音主页 URL")),
    ]
    for flag, options in option_specs:
        cli.add_argument(flag, **options)
    return cli
|
||||
|
||||
|
||||
def launch_browser(command: list[str]) -> subprocess.Popen[str]:
    """Start ``command`` as a child process and return its handle without waiting."""
    process = subprocess.Popen(command)
    return process
|
||||
|
||||
|
||||
def wait_for_browser_debug_port(
    browser_port: int,
    timeout_seconds: float = 15.0,
    interval_seconds: float = 0.25,
) -> None:
    """Poll ``127.0.0.1:<browser_port>`` until a TCP connection succeeds.

    Args:
        browser_port: Port to probe.
        timeout_seconds: How long to keep trying, measured on the monotonic clock.
        interval_seconds: Pause after each failed attempt.

    Raises:
        RuntimeError: If no attempt succeeded before the deadline.
    """
    address = ("127.0.0.1", browser_port)
    give_up_at = time.monotonic() + timeout_seconds

    while time.monotonic() < give_up_at:
        try:
            with socket.create_connection(address, timeout=1):
                return  # the port accepts connections
        except OSError:
            time.sleep(interval_seconds)

    raise RuntimeError(f"Chrome 已启动命令,但调试端口 {browser_port} 在限定时间内未就绪。")
|
||||
|
||||
|
||||
def main(argv: list[str] | None = None) -> int:
    """Start Chrome for a manual Douyin login and print the follow-up command.

    Args:
        argv: Argument list; ``None`` makes argparse read ``sys.argv``.

    Returns:
        ``0`` once the debugging port accepts connections; ``1`` if the
        Chrome path does not exist, the launch raises ``OSError`` or the port
        does not come up in time.  Invalid arguments exit through
        ``parser.error``.
    """
    parser = build_parser()
    args = parser.parse_args(argv)

    # TCP ports are 16-bit.  Reject out-of-range values before launching
    # Chrome and then waiting for a port that can never come up.
    if not 0 < args.browser_port <= 65535:
        parser.error("--browser-port 必须在 1 到 65535 之间")

    chrome_path = Path(args.chrome_path)
    if not chrome_path.exists():
        print(f"[ERROR] Chrome 可执行文件不存在: {chrome_path}")
        return 1

    # Resolve to an absolute path before handing it to Chrome.
    profile_dir = Path(args.profile_dir).resolve()
    profile_dir.mkdir(parents=True, exist_ok=True)
    command = build_login_command(
        chrome_path=str(chrome_path),
        profile_dir=profile_dir,
        browser_port=args.browser_port,
        user_url=args.user_url,
    )

    try:
        launch_browser(command)
    except OSError as exc:
        print(f"[ERROR] 启动 Chrome 失败: {exc}")
        return 1

    try:
        wait_for_browser_debug_port(args.browser_port)
    except RuntimeError as exc:
        print(f"[ERROR] {exc}")
        return 1

    print("[INFO] Chrome 已启动。请在打开的浏览器中完成抖音登录和验证码。")
    print(f"[INFO] 登录完成后执行: ./.venv/bin/python Douyin.py --browser-port {args.browser_port}")
    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main())
|
||||
76
test_douyin.py
Normal file
76
test_douyin.py
Normal file
@ -0,0 +1,76 @@
|
||||
import importlib
|
||||
import unittest
|
||||
from unittest import mock
|
||||
|
||||
|
||||
class FakeResponse:
    """Minimal stand-in for a captured response: just ``body`` and ``raw_body``."""

    def __init__(self, body, raw_body):
        self.body, self.raw_body = body, raw_body
|
||||
|
||||
|
||||
class DouyinModuleTests(unittest.TestCase):
    """Unit tests for the helper functions exposed by the ``Douyin`` module."""

    @staticmethod
    def _douyin():
        """Import ``Douyin`` lazily so every test resolves it at run time."""
        return importlib.import_module("Douyin")

    def test_module_can_import_without_optional_runtime_dependencies(self) -> None:
        self.assertIsNotNone(self._douyin())

    def test_sanitize_filename_removes_invalid_characters(self) -> None:
        raw_name = 'a/b:c*?d"e<f>g|h\n'
        cleaned = self._douyin().sanitize_filename(raw_name)
        self.assertEqual(cleaned, "a_b_c__d_e_f_g_h")

    def test_choose_video_url_prefers_douyinvod_link(self) -> None:
        play_api_url = "https://www.douyin.com/aweme/v1/play/?video_id=123"
        v11_url = "https://v11-weba.douyinvod.com/example/video.mp4"
        v26_url = "https://v26-web.douyinvod.com/example/video.mp4"
        chosen = self._douyin().choose_video_url([play_api_url, v11_url, v26_url])
        self.assertEqual(chosen, v11_url)

    def test_build_output_path_uses_video_directory(self) -> None:
        target = self._douyin().build_output_path("测试标题", "123456")
        self.assertEqual(target.as_posix(), "video/测试标题-123456.mp4")

    def test_extract_aweme_payload_uses_dict_body(self) -> None:
        # A dict body is expected to come back unchanged.
        fake = FakeResponse({"aweme_list": []}, "")
        payload = self._douyin().extract_aweme_payload(fake)
        self.assertEqual(payload, {"aweme_list": []})

    def test_extract_aweme_payload_falls_back_to_raw_json(self) -> None:
        # With an empty body, the raw JSON text is the only source of data.
        fake = FakeResponse("", '{"aweme_list": [{"aweme_id": "1"}]}')
        payload = self._douyin().extract_aweme_payload(fake)
        self.assertEqual(payload, {"aweme_list": [{"aweme_id": "1"}]})

    def test_build_browser_address_from_port(self) -> None:
        douyin = self._douyin()
        self.assertEqual(douyin.build_browser_address(9223), "127.0.0.1:9223")
        self.assertIsNone(douyin.build_browser_address(None))

    def test_ensure_browser_debug_port_ready_accepts_open_port(self) -> None:
        douyin = self._douyin()
        # The fake connection must work as a context manager that returns itself.
        fake_conn = mock.MagicMock()
        fake_conn.__enter__.return_value = fake_conn
        fake_conn.__exit__.return_value = False
        patcher = mock.patch.object(
            douyin.socket, "create_connection", return_value=fake_conn
        )
        with patcher as mocked_connect:
            douyin.ensure_browser_debug_port_ready(9223)
        mocked_connect.assert_called_once()

    def test_ensure_browser_debug_port_ready_rejects_closed_port(self) -> None:
        douyin = self._douyin()
        refuse = mock.patch.object(
            douyin.socket, "create_connection", side_effect=OSError("boom")
        )
        # The error message must mention login_douyin.py.
        with refuse, self.assertRaisesRegex(RuntimeError, "login_douyin.py"):
            douyin.ensure_browser_debug_port_ready(9223)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Allow running this test module directly as a script.
    unittest.main()
|
||||
98
test_login_douyin.py
Normal file
98
test_login_douyin.py
Normal file
@ -0,0 +1,98 @@
|
||||
import importlib
|
||||
import io
|
||||
import tempfile
|
||||
import unittest
|
||||
from contextlib import redirect_stdout
|
||||
from pathlib import Path
|
||||
from unittest import mock
|
||||
|
||||
|
||||
class LoginDouyinModuleTests(unittest.TestCase):
    """Unit tests for the ``login_douyin`` launcher script.

    The tests that drive ``main`` create their own fake Chrome executable in
    a temporary directory, so they do not depend on Chrome being installed at
    a fixed location on the host.
    """

    @staticmethod
    def _make_fake_chrome(root: Path) -> Path:
        """Create an empty file laid out like a macOS Chrome bundle under *root*.

        ``main`` only checks that the path exists (the launch itself is
        mocked), so an empty file is enough. The ``.app/Contents/MacOS``
        layout mirrors the real path so that ``build_login_command`` receives
        the same path shape it gets in production.
        """
        executable = root / "Google Chrome.app" / "Contents" / "MacOS" / "Google Chrome"
        executable.parent.mkdir(parents=True, exist_ok=True)
        executable.touch()
        return executable

    def test_build_login_command_uses_expected_chrome_arguments(self) -> None:
        module = importlib.import_module("login_douyin")
        command = module.build_login_command(
            chrome_path="/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
            profile_dir=Path("/tmp/douyin-profile"),
            browser_port=9223,
            user_url="https://www.douyin.com/user/example",
        )
        self.assertEqual(
            command,
            [
                "open",
                "-na",
                "/Applications/Google Chrome.app",
                "--args",
                "--user-data-dir=/tmp/douyin-profile",
                "--remote-debugging-port=9223",
                "https://www.douyin.com/user/example",
            ],
        )

    def test_build_parser_uses_expected_defaults(self) -> None:
        module = importlib.import_module("login_douyin")
        args = module.build_parser().parse_args([])
        self.assertEqual(args.browser_port, 9223)
        self.assertEqual(args.chrome_path, module.DEFAULT_CHROME_PATH)
        self.assertEqual(args.user_url, module.DEFAULT_USER_URL)

    def test_main_creates_profile_dir_and_prints_next_step(self) -> None:
        module = importlib.import_module("login_douyin")
        with tempfile.TemporaryDirectory() as temp_dir:
            chrome_path = self._make_fake_chrome(Path(temp_dir))
            profile_dir = Path(temp_dir) / "profile"
            stdout = io.StringIO()
            with redirect_stdout(stdout):
                with mock.patch.object(module, "launch_browser") as mocked_launch:
                    with mock.patch.object(module, "wait_for_browser_debug_port") as mocked_wait:
                        exit_code = module.main(
                            [
                                "--chrome-path",
                                str(chrome_path),
                                "--profile-dir",
                                str(profile_dir),
                                "--browser-port",
                                "9333",
                            ]
                        )
            # Assert while the temporary directory still exists.
            self.assertEqual(exit_code, 0)
            self.assertTrue(profile_dir.exists())
            mocked_launch.assert_called_once()
            mocked_wait.assert_called_once_with(9333)
            self.assertIn("9333", stdout.getvalue())
            self.assertIn("./.venv/bin/python Douyin.py --browser-port 9333", stdout.getvalue())

    def test_main_returns_error_when_chrome_path_missing(self) -> None:
        module = importlib.import_module("login_douyin")
        with tempfile.TemporaryDirectory() as temp_dir:
            # A path inside a fresh temp dir is guaranteed not to exist.
            missing_chrome = Path(temp_dir) / "does-not-exist-chrome"
            stdout = io.StringIO()
            with redirect_stdout(stdout):
                exit_code = module.main(["--chrome-path", str(missing_chrome)])
        self.assertEqual(exit_code, 1)
        self.assertIn("Chrome", stdout.getvalue())
        self.assertIn("不存在", stdout.getvalue())

    def test_main_returns_error_when_debug_port_never_becomes_ready(self) -> None:
        module = importlib.import_module("login_douyin")
        with tempfile.TemporaryDirectory() as temp_dir:
            chrome_path = self._make_fake_chrome(Path(temp_dir))
            profile_dir = Path(temp_dir) / "profile"
            stdout = io.StringIO()
            with redirect_stdout(stdout):
                with mock.patch.object(module, "launch_browser"):
                    with mock.patch.object(
                        module,
                        "wait_for_browser_debug_port",
                        side_effect=RuntimeError("端口未就绪"),
                    ):
                        exit_code = module.main(
                            [
                                "--chrome-path",
                                str(chrome_path),
                                "--profile-dir",
                                str(profile_dir),
                            ]
                        )
        self.assertEqual(exit_code, 1)
        self.assertIn("端口未就绪", stdout.getvalue())
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Allow running this test module directly as a script.
    unittest.main()
|
||||
45
抖音爬取视频.md
Normal file
45
抖音爬取视频.md
Normal file
@ -0,0 +1,45 @@
|
||||
打开一个博主主页,按 F12 打开开发者工具(控制台):
|
||||
把鼠标移到某个视频上,视频会开始播放,此时会出现一个 mp4 媒体请求:
|
||||

|
||||

|
||||
复制这个媒体链接中的 `rc` 参数值:
|
||||
NjxpM2hkZWUzNTszNGlpOkBpajdrOHY5cmxqOjMzNGkzM0A1MF41MC8wNmMxMjM2YWAuYSNrbmVqMmRrYWNhLS1kLTBzcw==
|
||||
把它粘贴到搜索栏,按 Command+R 刷新页面后再搜索,就能找到对应的链接:
|
||||

|
||||
一般是三条
|
||||
```
|
||||
"url_list": [
|
||||
"https://v26-web.douyinvod.com/71be40af2c168460799af3a778572914/69e1f23b/video/tos/cn/tos-cn-ve-15/oERIAIyg72xQTAKefNevmg5PYSzGBCLVGxxBM0/?a=6383&ch=10010&cr=3&dr=0&lr=all&cd=0%7C0%7C0%7C3&cv=1&br=2657&bt=2657&cs=0&ds=4&ft=pEaFx4hZffPdOW~-N12NvAq-antLjrKiGgZnRkaEkVUpUjVhWL6&mime_type=video_mp4&qs=0&rc=ZDZnZjwzNTgzaWk5ZzloNUBpajdrOHY5cmxqOjMzNGkzM0AxNDIvY2FeNmMxXjQ0MWFiYSNrbmVqMmRrYWNhLS1kLTBzcw%3D%3D&btag=80000e00010000&cquery=100z_100o_101r_100B_100x&dy_q=1776404465&feature_id=37f92ebd2877ae8e7eba995d406c5150&l=202604171341058889DECB07230D0B9B8B",
|
||||
"https://v11-weba.douyinvod.com/0cc3b59178e5b0065b19ddb2587010c1/69e1f23b/video/tos/cn/tos-cn-ve-15/oERIAIyg72xQTAKefNevmg5PYSzGBCLVGxxBM0/?a=6383&ch=10010&cr=3&dr=0&lr=all&cd=0%7C0%7C0%7C3&cv=1&br=2657&bt=2657&cs=0&ds=4&ft=pEaFx4hZffPdOW~-N12NvAq-antLjrKiGgZnRkaEkVUpUjVhWL6&mime_type=video_mp4&qs=0&rc=ZDZnZjwzNTgzaWk5ZzloNUBpajdrOHY5cmxqOjMzNGkzM0AxNDIvY2FeNmMxXjQ0MWFiYSNrbmVqMmRrYWNhLS1kLTBzcw%3D%3D&btag=80000e00010000&cquery=100o_101r_100B_100x_100z&dy_q=1776404465&feature_id=37f92ebd2877ae8e7eba995d406c5150&l=202604171341058889DECB07230D0B9B8B",
|
||||
"https://www.douyin.com/aweme/v1/play/?video_id=v0300fg10000d7fgdinog65sm5hhq4ng&line=0&file_id=e2de467d76af4ff095a2d0f92c5d734a&sign=66b2ab7aba34fd66cf343c7ea1aa9994&is_play_url=1&source=PackSourceEnum_PUBLISH"
|
||||
],
|
||||
```
|
||||
复制中间那条 v11 开头的链接,粘贴到浏览器地址栏即可打开:
|
||||
```
|
||||
https://v11-weba.douyinvod.com/0cc3b59178e5b0065b19ddb2587010c1/69e1f23b/video/tos/cn/tos-cn-ve-15/oERIAIyg72xQTAKefNevmg5PYSzGBCLVGxxBM0/?a=6383&ch=10010&cr=3&dr=0&lr=all&cd=0%7C0%7C0%7C3&cv=1&br=2657&bt=2657&cs=0&ds=4&ft=pEaFx4hZffPdOW~-N12NvAq-antLjrKiGgZnRkaEkVUpUjVhWL6&mime_type=video_mp4&qs=0&rc=ZDZnZjwzNTgzaWk5ZzloNUBpajdrOHY5cmxqOjMzNGkzM0AxNDIvY2FeNmMxXjQ0MWFiYSNrbmVqMmRrYWNhLS1kLTBzcw%3D%3D&btag=80000e00010000&cquery=100o_101r_100B_100x_100z&dy_q=1776404465&feature_id=37f92ebd2877ae8e7eba995d406c5150&l=202604171341058889DECB07230D0B9B8B
|
||||
```
|
||||
发现可以直接打开 video 的页面,并且可以下载。
|
||||
|
||||
## 推荐流程
|
||||
|
||||
把“手动登录”和“附着抓取”分成两步走更稳:
|
||||
|
||||
### 1. 先启动登录浏览器
|
||||
|
||||
```bash
|
||||
./.venv/bin/python login_douyin.py
|
||||
```
|
||||
|
||||
- 默认会启动一个可见的 Chrome
|
||||
- 默认调试端口是 `9223`
|
||||
- 登录态会保存在项目目录下的 `.douyin-chrome-profile/`
|
||||
|
||||
在这个浏览器里手动完成抖音登录和验证码。
|
||||
|
||||
### 2. 再附着已登录浏览器开始抓取
|
||||
|
||||
```bash
|
||||
./.venv/bin/python Douyin.py --pages 1 --browser-port 9223
|
||||
```
|
||||
|
||||
如果你改了登录脚本里的端口,抓取时也要传同一个 `--browser-port`。
|
||||
Loading…
x
Reference in New Issue
Block a user