Compare commits
9 Commits
321bbff1c1
...
4c33f40289
| Author | SHA1 | Date | |
|---|---|---|---|
| 4c33f40289 | |||
| 340293deba | |||
| 5ba771f882 | |||
| 96f96c2295 | |||
| c56c54d35d | |||
| f7374d2088 | |||
| 86839a873f | |||
| ec1ff6322c | |||
| 4b14586a91 |
136
Douyin.py
136
Douyin.py
@ -29,6 +29,7 @@ DEFAULT_BROWSER_PORT = 9223
|
|||||||
LISTEN_TARGET = "web/aweme/post/"
|
LISTEN_TARGET = "web/aweme/post/"
|
||||||
SINGLE_VIDEO_LISTEN_TARGET = "web/aweme/detail/"
|
SINGLE_VIDEO_LISTEN_TARGET = "web/aweme/detail/"
|
||||||
INVALID_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|\r\n\t]')
|
INVALID_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|\r\n\t]')
|
||||||
|
RECOMMENDATION_URL_PATTERN = re.compile(r"^https?://www\.douyin\.com/?(?:\?.*)?$")
|
||||||
CREATOR_URL_PATTERN = re.compile(r"^https?://www\.douyin\.com/user/[^/?#]+(?:\?.*)?$")
|
CREATOR_URL_PATTERN = re.compile(r"^https?://www\.douyin\.com/user/[^/?#]+(?:\?.*)?$")
|
||||||
VIDEO_URL_PATTERN = re.compile(r"^https?://www\.douyin\.com/video/(?P<aweme_id>\d+)(?:[/?#].*)?$")
|
VIDEO_URL_PATTERN = re.compile(r"^https?://www\.douyin\.com/video/(?P<aweme_id>\d+)(?:[/?#].*)?$")
|
||||||
AWEME_ID_PATTERN = re.compile(r"^\d{5,}$")
|
AWEME_ID_PATTERN = re.compile(r"^\d{5,}$")
|
||||||
@ -47,6 +48,10 @@ def sanitize_filename(value: str, fallback: str = "untitled") -> str:
|
|||||||
return cleaned or fallback
|
return cleaned or fallback
|
||||||
|
|
||||||
|
|
||||||
|
def is_recommendation_url(value: str) -> bool:
    """Return True when ``value`` is a Douyin homepage (recommendation feed) URL.

    Leading and trailing whitespace is ignored; the stripped text is matched
    against ``RECOMMENDATION_URL_PATTERN``.
    """
    candidate = value.strip()
    return RECOMMENDATION_URL_PATTERN.match(candidate) is not None
|
||||||
|
|
||||||
|
|
||||||
def is_creator_url(value: str) -> bool:
    """Return True when ``value`` is a Douyin creator profile URL.

    Leading and trailing whitespace is ignored; the stripped text is matched
    against ``CREATOR_URL_PATTERN``.
    """
    candidate = value.strip()
    return CREATOR_URL_PATTERN.match(candidate) is not None
|
||||||
|
|
||||||
@ -72,6 +77,8 @@ def build_video_page_url(aweme_id: str) -> str:
|
|||||||
|
|
||||||
def parse_target_input(value: str, source: str) -> ResolvedTarget:
|
def parse_target_input(value: str, source: str) -> ResolvedTarget:
|
||||||
normalized = value.strip()
|
normalized = value.strip()
|
||||||
|
if is_recommendation_url(normalized):
|
||||||
|
return ResolvedTarget(kind="recommendation", value=normalized, source=source)
|
||||||
if is_creator_url(normalized):
|
if is_creator_url(normalized):
|
||||||
return ResolvedTarget(kind="creator", value=normalized, source=source)
|
return ResolvedTarget(kind="creator", value=normalized, source=source)
|
||||||
if is_video_url(normalized):
|
if is_video_url(normalized):
|
||||||
@ -131,9 +138,19 @@ def choose_video_url(url_list: list[str]) -> str:
|
|||||||
raise ValueError("url_list 为空,无法选择视频地址。")
|
raise ValueError("url_list 为空,无法选择视频地址。")
|
||||||
|
|
||||||
|
|
||||||
def build_output_path(
    title: str,
    video_id: str,
    output_dir: Path = Path("video"),
    author_name: str | None = None,
) -> Path:
    """Build the destination ``.mp4`` path for a downloaded video.

    Args:
        title: Video title; sanitized, falling back to ``"untitled"``.
        video_id: Identifier appended after the title.
        output_dir: Directory the file is placed in.
        author_name: Optional creator name. When truthy it is sanitized
            (falling back to ``"unknown"``) and prepended in square brackets.

    Returns:
        ``output_dir / "<title>-<video_id>.mp4"``, or
        ``output_dir / "[<author>]<title>-<video_id>.mp4"`` when an author
        name is supplied.
    """
    safe_title = sanitize_filename(title, fallback="untitled")
    base_name = f"{safe_title}-{video_id}.mp4"
    if not author_name:
        # No author given (None or empty string): keep the original naming scheme.
        return output_dir / base_name
    safe_author = sanitize_filename(author_name, fallback="unknown")
    return output_dir / f"[{safe_author}]{base_name}"
|
||||||
|
|
||||||
|
|
||||||
def build_browser_address(browser_port: int | None) -> str | None:
|
def build_browser_address(browser_port: int | None) -> str | None:
|
||||||
@ -192,11 +209,18 @@ def parse_aweme_items(body: Any) -> list[dict[str, str]]:
|
|||||||
continue
|
continue
|
||||||
|
|
||||||
title = str(aweme.get("desc") or "").strip() or "untitled"
|
title = str(aweme.get("desc") or "").strip() or "untitled"
|
||||||
|
|
||||||
|
author = aweme.get("author") or {}
|
||||||
|
author_name = str(author.get("nickname") or "").strip() or "unknown"
|
||||||
|
author_id = str(author.get("uid") or "").strip() or "unknown"
|
||||||
|
|
||||||
items.append(
|
items.append(
|
||||||
{
|
{
|
||||||
"title": title,
|
"title": title,
|
||||||
"video_id": video_id,
|
"video_id": video_id,
|
||||||
"video_url": choose_video_url([str(url) for url in url_list]),
|
"video_url": choose_video_url([str(url) for url in url_list]),
|
||||||
|
"author_name": author_name,
|
||||||
|
"author_id": author_id,
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -363,6 +387,99 @@ def collect_videos(
|
|||||||
return downloaded
|
return downloaded
|
||||||
|
|
||||||
|
|
||||||
|
def collect_recommendations(
    max_videos: int,
    timeout: int,
    output_dir: Path,
    browser_port: int | None,
) -> int:
    """Download videos from the Douyin homepage recommendation feed.

    Starts a network listener on ``LISTEN_TARGET``, opens
    ``https://www.douyin.com/`` and then repeatedly waits for a packet,
    parses it, downloads every not-yet-seen item and scrolls for more.

    The loop ends when ``max_videos`` files have been downloaded, or after
    three consecutive rounds that produced no new download (listener
    timeout, unparsable payload, empty list, only duplicates, or only
    failed downloads).

    Args:
        max_videos: Upper bound on the number of videos to download.
        timeout: Seconds to wait for each intercepted response.
        output_dir: Directory the videos are written to.
        browser_port: Debug port of an already running browser, or ``None``.

    Returns:
        The number of videos actually downloaded.
    """
    requests_module, chromium_page_cls, chromium_options_cls = import_runtime_dependencies()
    headers = build_headers("https://www.douyin.com/")
    if browser_port is not None:
        ensure_browser_debug_port_ready(browser_port)
    page = create_page(chromium_page_cls, chromium_options_cls, browser_port)
    # The listener is started before navigation so the first feed response
    # is not missed.
    # NOTE(review): LISTEN_TARGET is the constant also used for creator pages;
    # confirm the homepage feed really hits the same endpoint.
    page.listen.start(LISTEN_TARGET)

    print("[INFO] 正在打开抖音推荐流。若出现登录或验证码,请先在浏览器窗口里完成。")
    page.get("https://www.douyin.com/")
    time.sleep(3)

    downloaded = 0
    seen_ids: set[str] = set()
    consecutive_empty = 0
    max_consecutive_empty = 3

    while downloaded < max_videos:
        items: list[dict[str, str]] = []
        packet = wait_for_aweme_packet(page, timeout=timeout)
        if packet is not None:
            try:
                items = parse_aweme_items(extract_aweme_payload(packet.response))
            except Exception as exc:
                # Best effort: a bad payload counts as an empty round.
                print(f"[WARN] 解析接口数据失败: {exc}")

        new_items_in_batch = 0
        for item in items:
            if item["video_id"] in seen_ids:
                continue
            if downloaded >= max_videos:
                break

            # Marked as seen before downloading, so a failing item is not
            # retried when the feed repeats it.
            seen_ids.add(item["video_id"])
            output_path = build_output_path(
                title=item["title"],
                video_id=item["video_id"],
                output_dir=output_dir,
                author_name=item.get("author_name"),
            )

            try:
                download_video(
                    requests_module=requests_module,
                    headers=headers,
                    video_url=item["video_url"],
                    output_path=output_path,
                )
            except Exception as exc:
                print(f"[WARN] 下载失败 {item['video_id']}: {exc}")
                continue

            downloaded += 1
            new_items_in_batch += 1
            print(f"[OK] 已保存: {output_path}")

        if new_items_in_batch:
            # Only real progress resets the counter. Resetting it on any
            # non-empty packet made the stall limit unreachable for batches
            # of duplicates or failed downloads.
            consecutive_empty = 0
        else:
            consecutive_empty += 1
            if consecutive_empty >= max_consecutive_empty:
                print("[INFO] 连续多次未获取到新数据,结束抓取。")
                break

        scroll_to_next_page(page)

    return downloaded
|
||||||
|
|
||||||
|
|
||||||
def collect_single_video(
|
def collect_single_video(
|
||||||
target: ResolvedTarget,
|
target: ResolvedTarget,
|
||||||
timeout: int,
|
timeout: int,
|
||||||
@ -430,6 +547,12 @@ def build_parser() -> argparse.ArgumentParser:
|
|||||||
default=DEFAULT_BROWSER_PORT,
|
default=DEFAULT_BROWSER_PORT,
|
||||||
help="附着到已启动 Chrome 的调试端口,默认 9223",
|
help="附着到已启动 Chrome 的调试端口,默认 9223",
|
||||||
)
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--max-videos",
|
||||||
|
type=int,
|
||||||
|
default=50,
|
||||||
|
help="推荐流最大抓取数量,默认 50",
|
||||||
|
)
|
||||||
return parser
|
return parser
|
||||||
|
|
||||||
|
|
||||||
@ -443,6 +566,8 @@ def main(argv: list[str] | None = None) -> int:
|
|||||||
parser.error("--timeout 必须大于 0")
|
parser.error("--timeout 必须大于 0")
|
||||||
if args.browser_port is not None and args.browser_port <= 0:
|
if args.browser_port is not None and args.browser_port <= 0:
|
||||||
parser.error("--browser-port 必须大于 0")
|
parser.error("--browser-port 必须大于 0")
|
||||||
|
if args.max_videos <= 0:
|
||||||
|
parser.error("--max-videos 必须大于 0")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
target = resolve_cli_target(args.target, browser_port=args.browser_port)
|
target = resolve_cli_target(args.target, browser_port=args.browser_port)
|
||||||
@ -455,6 +580,13 @@ def main(argv: list[str] | None = None) -> int:
|
|||||||
browser_port=args.browser_port,
|
browser_port=args.browser_port,
|
||||||
auto_scroll=args.pages > 1,
|
auto_scroll=args.pages > 1,
|
||||||
)
|
)
|
||||||
|
elif target.kind == "recommendation":
|
||||||
|
total = collect_recommendations(
|
||||||
|
max_videos=args.max_videos,
|
||||||
|
timeout=args.timeout,
|
||||||
|
output_dir=Path(args.output_dir),
|
||||||
|
browser_port=args.browser_port,
|
||||||
|
)
|
||||||
elif target.kind == "single-video":
|
elif target.kind == "single-video":
|
||||||
total = collect_single_video(
|
total = collect_single_video(
|
||||||
target=target,
|
target=target,
|
||||||
|
|||||||
@ -0,0 +1,584 @@
|
|||||||
|
# 抖音推荐流视频抓取实现计划
|
||||||
|
|
||||||
|
> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
|
||||||
|
|
||||||
|
**Goal:** 扩展 Douyin.py,支持抓取抖音推荐流(For You 页面)视频并记录博主信息;支持滚动加载,默认最多抓取 50 条(可通过 `--max-videos` 调整)
|
||||||
|
|
||||||
|
**Architecture:** 新建 `collect_recommendations()` 函数处理推荐流,复用现有的下载和工具函数。通过 `parse_target_input()` 扩展识别推荐流URL。
|
||||||
|
|
||||||
|
**Tech Stack:** Python 3, DrissionPage, requests, unittest(测试用例以 unittest 风格编写,通过 pytest 运行)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 文件结构
|
||||||
|
|
||||||
|
| 文件 | 操作 | 说明 |
|
||||||
|
|------|------|------|
|
||||||
|
| `Douyin.py` | 修改 | 添加推荐流识别、解析、抓取逻辑 |
|
||||||
|
| `test_douyin.py` | 修改 | 添加推荐流相关测试 |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Task 1: 推荐流URL识别
|
||||||
|
|
||||||
|
**Files:**
|
||||||
|
- Modify: `Douyin.py:17-19`(添加正则表达式)
|
||||||
|
- Test: `test_douyin.py`(添加测试)
|
||||||
|
|
||||||
|
- [ ] **Step 1: 编写失败测试**
|
||||||
|
|
||||||
|
```python
|
||||||
|
def test_is_recommendation_url_accepts_douyin_homepage(self) -> None:
|
||||||
|
module = importlib.import_module("Douyin")
|
||||||
|
self.assertTrue(module.is_recommendation_url("https://www.douyin.com/"))
|
||||||
|
self.assertTrue(module.is_recommendation_url("https://www.douyin.com"))
|
||||||
|
self.assertTrue(module.is_recommendation_url("https://www.douyin.com/?from=web"))
|
||||||
|
self.assertFalse(module.is_recommendation_url("https://www.douyin.com/user/xxx"))
|
||||||
|
self.assertFalse(module.is_recommendation_url("https://www.douyin.com/video/123"))
|
||||||
|
```
|
||||||
|
|
||||||
|
- [ ] **Step 2: 运行测试确认失败**
|
||||||
|
|
||||||
|
Run: `python3 -m pytest test_douyin.py::DouyinModuleTests::test_is_recommendation_url_accepts_douyin_homepage -v`
|
||||||
|
Expected: FAIL with `AttributeError: module 'Douyin' has no attribute 'is_recommendation_url'`
|
||||||
|
|
||||||
|
- [ ] **Step 3: 实现最小代码**
|
||||||
|
|
||||||
|
在 `Douyin.py` 中添加:
|
||||||
|
```python
|
||||||
|
RECOMMENDATION_URL_PATTERN = re.compile(r"^https?://www\.douyin\.com/?(?:\?.*)?$")
|
||||||
|
|
||||||
|
def is_recommendation_url(value: str) -> bool:
|
||||||
|
return bool(RECOMMENDATION_URL_PATTERN.match(value.strip()))
|
||||||
|
```
|
||||||
|
|
||||||
|
- [ ] **Step 4: 运行测试确认通过**
|
||||||
|
|
||||||
|
Run: `python3 -m pytest test_douyin.py::DouyinModuleTests::test_is_recommendation_url_accepts_douyin_homepage -v`
|
||||||
|
Expected: PASS
|
||||||
|
|
||||||
|
- [ ] **Step 5: 提交**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
git add Douyin.py test_douyin.py
|
||||||
|
git commit -m "feat: add recommendation URL recognition"
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Task 2: 扩展目标解析支持推荐流
|
||||||
|
|
||||||
|
**Files:**
|
||||||
|
- Modify: `Douyin.py:52-68`(修改 `parse_target_input`)
|
||||||
|
- Test: `test_douyin.py`(添加测试)
|
||||||
|
|
||||||
|
- [ ] **Step 1: 编写失败测试**
|
||||||
|
|
||||||
|
```python
|
||||||
|
def test_parse_target_input_classifies_recommendation_url(self) -> None:
|
||||||
|
module = importlib.import_module("Douyin")
|
||||||
|
target = module.parse_target_input("https://www.douyin.com/", source="manual")
|
||||||
|
self.assertEqual(target.kind, "recommendation")
|
||||||
|
self.assertEqual(target.value, "https://www.douyin.com/")
|
||||||
|
self.assertEqual(target.source, "manual")
|
||||||
|
```
|
||||||
|
|
||||||
|
- [ ] **Step 2: 运行测试确认失败**
|
||||||
|
|
||||||
|
Run: `python3 -m pytest test_douyin.py::DouyinModuleTests::test_parse_target_input_classifies_recommendation_url -v`
|
||||||
|
Expected: FAIL with "不支持的目标"
|
||||||
|
|
||||||
|
- [ ] **Step 3: 修改 `parse_target_input`**
|
||||||
|
|
||||||
|
```python
|
||||||
|
def parse_target_input(value: str, source: str) -> ResolvedTarget:
|
||||||
|
normalized = value.strip()
|
||||||
|
if is_recommendation_url(normalized):
|
||||||
|
return ResolvedTarget(kind="recommendation", value=normalized, source=source)
|
||||||
|
if is_creator_url(normalized):
|
||||||
|
return ResolvedTarget(kind="creator", value=normalized, source=source)
|
||||||
|
# ... 其余保持不变
|
||||||
|
```
|
||||||
|
|
||||||
|
- [ ] **Step 4: 运行测试确认通过**
|
||||||
|
|
||||||
|
Run: `python3 -m pytest test_douyin.py::DouyinModuleTests::test_parse_target_input_classifies_recommendation_url -v`
|
||||||
|
Expected: PASS
|
||||||
|
|
||||||
|
- [ ] **Step 5: 提交**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
git add Douyin.py test_douyin.py
|
||||||
|
git commit -m "feat: extend target parsing to support recommendation URLs"
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Task 3: 增强数据解析提取博主信息
|
||||||
|
|
||||||
|
**Files:**
|
||||||
|
- Modify: `Douyin.py:140-170`(修改 `parse_aweme_items`)
|
||||||
|
- Test: `test_douyin.py`(添加测试)
|
||||||
|
|
||||||
|
- [ ] **Step 1: 编写失败测试**
|
||||||
|
|
||||||
|
```python
|
||||||
|
def test_parse_aweme_items_extracts_author_info(self) -> None:
|
||||||
|
module = importlib.import_module("Douyin")
|
||||||
|
payload = {
|
||||||
|
"aweme_list": [
|
||||||
|
{
|
||||||
|
"aweme_id": "7619989983668240802",
|
||||||
|
"desc": "测试视频",
|
||||||
|
"author": {
|
||||||
|
"nickname": "测试博主",
|
||||||
|
"uid": "123456789"
|
||||||
|
},
|
||||||
|
"video": {
|
||||||
|
"play_addr": {
|
||||||
|
"url_list": ["https://v26-web.douyinvod.com/example/video.mp4"]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
items = module.parse_aweme_items(payload)
|
||||||
|
self.assertEqual(len(items), 1)
|
||||||
|
self.assertEqual(items[0]["author_name"], "测试博主")
|
||||||
|
self.assertEqual(items[0]["author_id"], "123456789")
|
||||||
|
```
|
||||||
|
|
||||||
|
- [ ] **Step 2: 运行测试确认失败**
|
||||||
|
|
||||||
|
Run: `python3 -m pytest test_douyin.py::DouyinModuleTests::test_parse_aweme_items_extracts_author_info -v`
|
||||||
|
Expected: FAIL with KeyError or missing author_name
|
||||||
|
|
||||||
|
- [ ] **Step 3: 修改 `parse_aweme_items`**
|
||||||
|
|
||||||
|
```python
|
||||||
|
def parse_aweme_items(body: Any) -> list[dict[str, str]]:
|
||||||
|
# ... 现有代码 ...
|
||||||
|
|
||||||
|
for aweme in aweme_list:
|
||||||
|
# ... 现有视频提取代码 ...
|
||||||
|
|
||||||
|
author = aweme.get("author") or {}
|
||||||
|
author_name = str(author.get("nickname") or "").strip() or "unknown"
|
||||||
|
author_id = str(author.get("uid") or "").strip() or "unknown"
|
||||||
|
|
||||||
|
items.append(
|
||||||
|
{
|
||||||
|
"title": title,
|
||||||
|
"video_id": video_id,
|
||||||
|
"video_url": choose_video_url([str(url) for url in url_list]),
|
||||||
|
"author_name": author_name,
|
||||||
|
"author_id": author_id,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
return items
|
||||||
|
```
|
||||||
|
|
||||||
|
- [ ] **Step 4: 运行测试确认通过**
|
||||||
|
|
||||||
|
Run: `python3 -m pytest test_douyin.py::DouyinModuleTests::test_parse_aweme_items_extracts_author_info -v`
|
||||||
|
Expected: PASS
|
||||||
|
|
||||||
|
- [ ] **Step 5: 提交**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
git add Douyin.py test_douyin.py
|
||||||
|
git commit -m "feat: extract author info from aweme items"
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Task 4: 支持带博主信息的文件名构建
|
||||||
|
|
||||||
|
**Files:**
|
||||||
|
- Modify: `Douyin.py:102-104`(修改 `build_output_path`)
|
||||||
|
- Test: `test_douyin.py`(添加测试)
|
||||||
|
|
||||||
|
- [ ] **Step 1: 编写失败测试**
|
||||||
|
|
||||||
|
```python
|
||||||
|
def test_build_output_path_with_author_uses_bracket_format(self) -> None:
|
||||||
|
module = importlib.import_module("Douyin")
|
||||||
|
output_path = module.build_output_path(
|
||||||
|
title="测试标题",
|
||||||
|
video_id="123456",
|
||||||
|
author_name="测试博主"
|
||||||
|
)
|
||||||
|
self.assertEqual(output_path.as_posix(), "video/[测试博主]测试标题-123456.mp4")
|
||||||
|
|
||||||
|
def test_build_output_path_without_author_uses_original_format(self) -> None:
|
||||||
|
module = importlib.import_module("Douyin")
|
||||||
|
output_path = module.build_output_path("测试标题", "123456")
|
||||||
|
self.assertEqual(output_path.as_posix(), "video/测试标题-123456.mp4")
|
||||||
|
```
|
||||||
|
|
||||||
|
- [ ] **Step 2: 运行测试确认失败**
|
||||||
|
|
||||||
|
Run: `python3 -m pytest test_douyin.py::DouyinModuleTests::test_build_output_path_with_author_uses_bracket_format -v`
|
||||||
|
Expected: FAIL with unexpected keyword argument 'author_name'
|
||||||
|
|
||||||
|
- [ ] **Step 3: 修改 `build_output_path`**
|
||||||
|
|
||||||
|
```python
|
||||||
|
def build_output_path(
|
||||||
|
title: str,
|
||||||
|
video_id: str,
|
||||||
|
output_dir: Path = Path("video"),
|
||||||
|
author_name: str | None = None,
|
||||||
|
) -> Path:
|
||||||
|
safe_title = sanitize_filename(title, fallback="untitled")
|
||||||
|
if author_name:
|
||||||
|
safe_author = sanitize_filename(author_name, fallback="unknown")
|
||||||
|
filename = f"[{safe_author}]{safe_title}-{video_id}.mp4"
|
||||||
|
else:
|
||||||
|
filename = f"{safe_title}-{video_id}.mp4"
|
||||||
|
return output_dir / filename
|
||||||
|
```
|
||||||
|
|
||||||
|
- [ ] **Step 4: 运行测试确认通过**
|
||||||
|
|
||||||
|
Run: `python3 -m pytest test_douyin.py::DouyinModuleTests::test_build_output_path_with_author_uses_bracket_format test_douyin.py::DouyinModuleTests::test_build_output_path_without_author_uses_original_format -v`
|
||||||
|
Expected: PASS
|
||||||
|
|
||||||
|
- [ ] **Step 5: 提交**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
git add Douyin.py test_douyin.py
|
||||||
|
git commit -m "feat: support author prefix in output filename"
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Task 5: 实现 `collect_recommendations()` 函数
|
||||||
|
|
||||||
|
**Files:**
|
||||||
|
- Modify: `Douyin.py`(添加新函数)
|
||||||
|
- Test: `test_douyin.py`(添加测试)
|
||||||
|
|
||||||
|
- [ ] **Step 1: 编写失败测试**
|
||||||
|
|
||||||
|
```python
|
||||||
|
def test_collect_recommendations_downloads_videos_with_author_prefix(self) -> None:
|
||||||
|
module = importlib.import_module("Douyin")
|
||||||
|
packet = FakePacket(
|
||||||
|
{
|
||||||
|
"aweme_list": [
|
||||||
|
{
|
||||||
|
"aweme_id": "7619989983668240802",
|
||||||
|
"desc": "推荐视频1",
|
||||||
|
"author": {"nickname": "博主A", "uid": "111"},
|
||||||
|
"video": {
|
||||||
|
"play_addr": {
|
||||||
|
"url_list": ["https://v26-web.douyinvod.com/example/video1.mp4"]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
)
|
||||||
|
page = FakeRuntimePage("https://www.douyin.com/", packet)
|
||||||
|
|
||||||
|
with mock.patch.object(module, "import_runtime_dependencies", return_value=(object(), object(), object())):
|
||||||
|
with mock.patch.object(module, "create_page", return_value=page):
|
||||||
|
with mock.patch.object(module, "download_video") as mocked_download:
|
||||||
|
downloaded = module.collect_recommendations(
|
||||||
|
max_videos=50,
|
||||||
|
timeout=10,
|
||||||
|
output_dir=module.Path("video"),
|
||||||
|
browser_port=None,
|
||||||
|
)
|
||||||
|
|
||||||
|
self.assertEqual(downloaded, 1)
|
||||||
|
# 验证文件名包含博主前缀
|
||||||
|
call_kwargs = mocked_download.call_args[1]
|
||||||
|
self.assertIn("[博主A]", str(call_kwargs["output_path"]))
|
||||||
|
```
|
||||||
|
|
||||||
|
- [ ] **Step 2: 运行测试确认失败**
|
||||||
|
|
||||||
|
Run: `python3 -m pytest test_douyin.py::DouyinModuleTests::test_collect_recommendations_downloads_videos_with_author_prefix -v`
|
||||||
|
Expected: FAIL with `AttributeError: module 'Douyin' has no attribute 'collect_recommendations'`
|
||||||
|
|
||||||
|
- [ ] **Step 3: 实现 `collect_recommendations`**
|
||||||
|
|
||||||
|
```python
|
||||||
|
def collect_recommendations(
    max_videos: int,
    timeout: int,
    output_dir: Path,
    browser_port: int | None,
) -> int:
    """Download videos from the Douyin homepage recommendation feed.

    Starts a network listener on ``LISTEN_TARGET``, opens
    ``https://www.douyin.com/`` and then repeatedly waits for a packet,
    parses it, downloads every not-yet-seen item and scrolls for more.

    The loop ends when ``max_videos`` files have been downloaded, or after
    three consecutive rounds that produced no new download (listener
    timeout, unparsable payload, empty list, only duplicates, or only
    failed downloads).

    Args:
        max_videos: Upper bound on the number of videos to download.
        timeout: Seconds to wait for each intercepted response.
        output_dir: Directory the videos are written to.
        browser_port: Debug port of an already running browser, or ``None``.

    Returns:
        The number of videos actually downloaded.
    """
    requests_module, chromium_page_cls, chromium_options_cls = import_runtime_dependencies()
    headers = build_headers("https://www.douyin.com/")
    if browser_port is not None:
        ensure_browser_debug_port_ready(browser_port)
    page = create_page(chromium_page_cls, chromium_options_cls, browser_port)
    # The listener is started before navigation so the first feed response
    # is not missed.
    page.listen.start(LISTEN_TARGET)

    print("[INFO] 正在打开抖音推荐流。若出现登录或验证码,请先在浏览器窗口里完成。")
    page.get("https://www.douyin.com/")
    time.sleep(3)

    downloaded = 0
    seen_ids: set[str] = set()
    consecutive_empty = 0
    max_consecutive_empty = 3

    while downloaded < max_videos:
        items: list[dict[str, str]] = []
        packet = wait_for_aweme_packet(page, timeout=timeout)
        if packet is not None:
            try:
                items = parse_aweme_items(extract_aweme_payload(packet.response))
            except Exception as exc:
                # Best effort: a bad payload counts as an empty round.
                print(f"[WARN] 解析接口数据失败: {exc}")

        new_items_in_batch = 0
        for item in items:
            if item["video_id"] in seen_ids:
                continue
            if downloaded >= max_videos:
                break

            # Marked as seen before downloading, so a failing item is not
            # retried when the feed repeats it.
            seen_ids.add(item["video_id"])
            output_path = build_output_path(
                title=item["title"],
                video_id=item["video_id"],
                output_dir=output_dir,
                author_name=item.get("author_name"),
            )

            try:
                download_video(
                    requests_module=requests_module,
                    headers=headers,
                    video_url=item["video_url"],
                    output_path=output_path,
                )
            except Exception as exc:
                print(f"[WARN] 下载失败 {item['video_id']}: {exc}")
                continue

            downloaded += 1
            new_items_in_batch += 1
            print(f"[OK] 已保存: {output_path}")

        if new_items_in_batch:
            # Only real progress resets the counter. Resetting it on any
            # non-empty packet made the stall limit unreachable for batches
            # of duplicates or failed downloads.
            consecutive_empty = 0
        else:
            consecutive_empty += 1
            if consecutive_empty >= max_consecutive_empty:
                print("[INFO] 连续多次未获取到新数据,结束抓取。")
                break

        scroll_to_next_page(page)

    return downloaded
|
||||||
|
```
|
||||||
|
|
||||||
|
- [ ] **Step 4: 运行测试确认通过**
|
||||||
|
|
||||||
|
Run: `python3 -m pytest test_douyin.py::DouyinModuleTests::test_collect_recommendations_downloads_videos_with_author_prefix -v`
|
||||||
|
Expected: PASS
|
||||||
|
|
||||||
|
- [ ] **Step 5: 提交**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
git add Douyin.py test_douyin.py
|
||||||
|
git commit -m "feat: implement collect_recommendations() for For You page"
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Task 6: 添加 `--max-videos` 命令行参数
|
||||||
|
|
||||||
|
**Files:**
|
||||||
|
- Modify: `Douyin.py:295-305`(修改 `build_parser`)
|
||||||
|
- Modify: `Douyin.py:310-350`(修改 `main`)
|
||||||
|
- Test: `test_douyin.py`(添加测试)
|
||||||
|
|
||||||
|
- [ ] **Step 1: 编写失败测试**
|
||||||
|
|
||||||
|
```python
|
||||||
|
def test_build_parser_has_max_videos_argument(self) -> None:
|
||||||
|
module = importlib.import_module("Douyin")
|
||||||
|
args = module.build_parser().parse_args(["--max-videos", "30"])
|
||||||
|
self.assertEqual(args.max_videos, 30)
|
||||||
|
|
||||||
|
def test_main_dispatches_recommendation_flow_for_recommendation_url(self) -> None:
|
||||||
|
module = importlib.import_module("Douyin")
|
||||||
|
stdout = io.StringIO()
|
||||||
|
recommendation_target = module.ResolvedTarget(
|
||||||
|
kind="recommendation",
|
||||||
|
value="https://www.douyin.com/",
|
||||||
|
source="current-page",
|
||||||
|
)
|
||||||
|
with redirect_stdout(stdout):
|
||||||
|
with mock.patch.object(module, "resolve_cli_target", return_value=recommendation_target):
|
||||||
|
with mock.patch.object(module, "collect_recommendations", return_value=5) as mocked_collect:
|
||||||
|
exit_code = module.main([])
|
||||||
|
self.assertEqual(exit_code, 0)
|
||||||
|
mocked_collect.assert_called_once_with(
|
||||||
|
max_videos=50,
|
||||||
|
timeout=10,
|
||||||
|
output_dir=module.Path("video"),
|
||||||
|
browser_port=9223,
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
- [ ] **Step 2: 运行测试确认失败**
|
||||||
|
|
||||||
|
Run: `python3 -m pytest test_douyin.py::DouyinModuleTests::test_build_parser_has_max_videos_argument -v`
|
||||||
|
Expected: FAIL with "unrecognized arguments: --max-videos"
|
||||||
|
|
||||||
|
- [ ] **Step 3: 修改 `build_parser` 和 `main`**
|
||||||
|
|
||||||
|
```python
|
||||||
|
def build_parser() -> argparse.ArgumentParser:
|
||||||
|
parser = argparse.ArgumentParser(description="附着抖音登录浏览器并下载当前页面或指定目标的视频")
|
||||||
|
parser.add_argument(
|
||||||
|
"target",
|
||||||
|
nargs="?",
|
||||||
|
default=None,
|
||||||
|
help="可选:博主主页 URL、单视频 URL 或 aweme_id;不传则读取当前浏览器页面",
|
||||||
|
)
|
||||||
|
parser.add_argument("--pages", type=int, default=1, help="创作者抓取最多处理多少页;默认 1")
|
||||||
|
parser.add_argument("--timeout", type=int, default=10, help="单次等待接口响应秒数,默认 10")
|
||||||
|
parser.add_argument(
|
||||||
|
"--output-dir",
|
||||||
|
default="video",
|
||||||
|
help="视频输出目录,默认 video",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--browser-port",
|
||||||
|
type=int,
|
||||||
|
default=DEFAULT_BROWSER_PORT,
|
||||||
|
help="附着到已启动 Chrome 的调试端口,默认 9223",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--max-videos",
|
||||||
|
type=int,
|
||||||
|
default=50,
|
||||||
|
help="推荐流最大抓取数量,默认 50",
|
||||||
|
)
|
||||||
|
return parser
|
||||||
|
|
||||||
|
|
||||||
|
def main(argv: list[str] | None = None) -> int:
|
||||||
|
parser = build_parser()
|
||||||
|
args = parser.parse_args(argv)
|
||||||
|
|
||||||
|
if args.pages <= 0:
|
||||||
|
parser.error("--pages 必须大于 0")
|
||||||
|
if args.timeout <= 0:
|
||||||
|
parser.error("--timeout 必须大于 0")
|
||||||
|
if args.browser_port is not None and args.browser_port <= 0:
|
||||||
|
parser.error("--browser-port 必须大于 0")
|
||||||
|
if args.max_videos <= 0:
|
||||||
|
parser.error("--max-videos 必须大于 0")
|
||||||
|
|
||||||
|
try:
|
||||||
|
target = resolve_cli_target(args.target, browser_port=args.browser_port)
|
||||||
|
if target.kind == "creator":
|
||||||
|
total = collect_videos(
|
||||||
|
user_url=target.value,
|
||||||
|
max_pages=args.pages,
|
||||||
|
timeout=args.timeout,
|
||||||
|
output_dir=Path(args.output_dir),
|
||||||
|
browser_port=args.browser_port,
|
||||||
|
auto_scroll=args.pages > 1,
|
||||||
|
)
|
||||||
|
elif target.kind == "recommendation":
|
||||||
|
total = collect_recommendations(
|
||||||
|
max_videos=args.max_videos,
|
||||||
|
timeout=args.timeout,
|
||||||
|
output_dir=Path(args.output_dir),
|
||||||
|
browser_port=args.browser_port,
|
||||||
|
)
|
||||||
|
elif target.kind == "single-video":
|
||||||
|
total = collect_single_video(
|
||||||
|
target=target,
|
||||||
|
timeout=args.timeout,
|
||||||
|
output_dir=Path(args.output_dir),
|
||||||
|
browser_port=args.browser_port,
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
raise RuntimeError(f"不支持的目标类型: {target.kind}")
|
||||||
|
except RuntimeError as exc:
|
||||||
|
print(f"[ERROR] {exc}")
|
||||||
|
return 1
|
||||||
|
except KeyboardInterrupt:
|
||||||
|
print("\n[INFO] 用户中断。")
|
||||||
|
return 130
|
||||||
|
|
||||||
|
print(f"[INFO] 处理结束,共下载 {total} 个视频。")
|
||||||
|
return 0
|
||||||
|
```
|
||||||
|
|
||||||
|
- [ ] **Step 4: 运行测试确认通过**
|
||||||
|
|
||||||
|
Run: `python3 -m pytest test_douyin.py::DouyinModuleTests::test_build_parser_has_max_videos_argument test_douyin.py::DouyinModuleTests::test_main_dispatches_recommendation_flow_for_recommendation_url -v`
|
||||||
|
Expected: PASS
|
||||||
|
|
||||||
|
- [ ] **Step 5: 提交**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
git add Douyin.py test_douyin.py
|
||||||
|
git commit -m "feat: add --max-videos argument and wire recommendation flow in main"
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Task 7: 运行全部测试并验证
|
||||||
|
|
||||||
|
- [ ] **Step 1: 运行全部测试**
|
||||||
|
|
||||||
|
Run: `python3 -m pytest test_douyin.py -v`
|
||||||
|
Expected: 所有测试通过
|
||||||
|
|
||||||
|
- [ ] **Step 2: 运行主脚本帮助确认**
|
||||||
|
|
||||||
|
Run: `python3 Douyin.py --help`
|
||||||
|
Expected: 显示包含 `--max-videos` 的帮助信息
|
||||||
|
|
||||||
|
- [ ] **Step 3: 提交**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
git add -A
|
||||||
|
git commit -m "test: verify all tests pass for recommendation crawling feature"
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 完成标准
|
||||||
|
|
||||||
|
1. ✅ `Douyin.py` 支持识别 `https://www.douyin.com/` 为推荐流目标
|
||||||
|
2. ✅ `collect_recommendations()` 函数实现滚动加载、最多50条、去重
|
||||||
|
3. ✅ 视频文件名包含博主昵称:`[博主名]标题-aweme_id.mp4`
|
||||||
|
4. ✅ `--max-videos` 命令行参数可用
|
||||||
|
5. ✅ 所有现有测试继续通过
|
||||||
|
6. ✅ 新增测试覆盖推荐流功能
|
||||||
@ -0,0 +1,165 @@
|
|||||||
|
# 抖音推荐流视频抓取设计文档
|
||||||
|
|
||||||
|
## 背景
|
||||||
|
|
||||||
|
当前系统支持抓取指定博主主页或单视频。现需扩展支持抓取抖音推荐流(For You页面)当前可见的视频。
|
||||||
|
|
||||||
|
## 目标
|
||||||
|
|
||||||
|
- 支持抓取抖音推荐流(`https://www.douyin.com/`)的视频
|
||||||
|
- 记录每个视频对应的博主信息
|
||||||
|
- 支持滚动加载,默认最多抓取50条(可通过 `--max-videos` 调整)
|
||||||
|
- 视频统一保存到 `video/` 目录
|
||||||
|
- 保持现有两步式工作流不变
|
||||||
|
|
||||||
|
## 方案选择
|
||||||
|
|
||||||
|
采用**方案B:新建推荐流专用抓取函数**
|
||||||
|
|
||||||
|
理由:
|
||||||
|
- 逻辑清晰,推荐流和博主页完全解耦
|
||||||
|
- 便于后续分别维护
|
||||||
|
- 工作量适中,可快速实现验证
|
||||||
|
|
||||||
|
## 详细设计
|
||||||
|
|
||||||
|
### 1. 目标识别扩展
|
||||||
|
|
||||||
|
新增推荐流URL识别模式:
|
||||||
|
|
||||||
|
```python
|
||||||
|
RECOMMENDATION_URL_PATTERN = re.compile(
|
||||||
|
r"^https?://www\.douyin\.com/?(?:\?.*)?$"
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
目标类型扩展为三种:
|
||||||
|
- `creator` - 博主主页(现有)
|
||||||
|
- `single-video` - 单视频(现有)
|
||||||
|
- `recommendation` - 推荐流(新增)
|
||||||
|
|
||||||
|
### 2. 核心流程
|
||||||
|
|
||||||
|
```
|
||||||
|
用户执行 ./.venv/bin/python Douyin.py
|
||||||
|
↓
|
||||||
|
读取当前浏览器页面URL
|
||||||
|
↓
|
||||||
|
判断页面类型:
|
||||||
|
- 推荐流 → 执行 collect_recommendations()
|
||||||
|
- 博主页 → 执行 collect_videos()(现有)
|
||||||
|
- 单视频 → 执行 collect_single_video()(现有)
|
||||||
|
- 其他 → 报错提示
|
||||||
|
```
|
||||||
|
|
||||||
|
### 3. collect_recommendations() 函数
|
||||||
|
|
||||||
|
**参数:**
|
||||||
|
- `max_videos`: 最大抓取数量(默认50)
|
||||||
|
- `timeout`: 单次等待接口响应秒数(默认10)
|
||||||
|
- `output_dir`: 输出目录(默认 `video/`)
|
||||||
|
- `browser_port`: 浏览器调试端口(默认9223)
|
||||||
|
|
||||||
|
**行为:**
|
||||||
|
1. 通过 `page.get("https://www.douyin.com/")` 打开推荐流页面(复用现有页面打开逻辑,不切换标签页)
|
||||||
|
2. 启动监听,目标接口:`web/aweme/post/`(推荐流与博主页共用此接口)
|
||||||
|
3. 循环直到收集够 `max_videos` 条或无法继续加载:
|
||||||
|
- 等待接口响应
|
||||||
|
- 解析视频列表,提取:标题、视频ID、视频URL、博主信息
|
||||||
|
- 过滤已下载(按 `video_id` 去重,使用 `seen_ids: set[str]` 集合)
|
||||||
|
- 下载视频
|
||||||
|
- 向下滚动加载更多
|
||||||
|
4. 返回实际下载数量
|
||||||
|
|
||||||
|
### 4. 数据解析增强
|
||||||
|
|
||||||
|
新增博主信息提取字段:
|
||||||
|
|
||||||
|
```python
|
||||||
|
{
|
||||||
|
"title": "视频标题",
|
||||||
|
"video_id": "aweme_id",
|
||||||
|
"video_url": "下载链接",
|
||||||
|
"author_name": "博主昵称",
|
||||||
|
"author_id": "博主ID",
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 5. 文件名格式
|
||||||
|
|
||||||
|
**推荐流视频:**
|
||||||
|
```
|
||||||
|
[博主昵称]视频标题-aweme_id.mp4
|
||||||
|
```
|
||||||
|
示例:`[张三]搞笑视频-7619989983668240802.mp4`
|
||||||
|
|
||||||
|
**博主页视频(保持现有):**
|
||||||
|
```
|
||||||
|
视频标题-aweme_id.mp4
|
||||||
|
```
|
||||||
|
|
||||||
|
### 6. 命令行参数
|
||||||
|
|
||||||
|
**新增参数:**
|
||||||
|
```bash
|
||||||
|
--max-videos 50 # 推荐流最大抓取数量(默认50)
|
||||||
|
```
|
||||||
|
|
||||||
|
**使用示例:**
|
||||||
|
```bash
|
||||||
|
# 零参数,自动识别当前页面
|
||||||
|
./.venv/bin/python Douyin.py
|
||||||
|
|
||||||
|
# 自定义抓取数量(仅对推荐流有效)
|
||||||
|
./.venv/bin/python Douyin.py --max-videos 30
|
||||||
|
|
||||||
|
# 目标为博主页或单视频页时,--max-videos 会被忽略(该参数仅对推荐流生效)
|
||||||
|
./.venv/bin/python Douyin.py "https://www.douyin.com/user/xxx"
|
||||||
|
```
|
||||||
|
|
||||||
|
### 7. 错误处理
|
||||||
|
|
||||||
|
- **推荐流页面未加载数据**:提示用户先滚动加载内容
|
||||||
|
- **滚动后无新数据**:正常结束,返回已下载数量
|
||||||
|
- **达到最大数量**:正常结束
|
||||||
|
- **其他错误**:复用现有错误处理机制
|
||||||
|
|
||||||
|
### 8. 测试覆盖(TDD)
|
||||||
|
|
||||||
|
必须覆盖以下测试场景:
|
||||||
|
|
||||||
|
- [ ] 推荐流URL识别测试
|
||||||
|
- [ ] 推荐流页面解析测试(模拟API响应含博主信息)
|
||||||
|
- [ ] 滚动加载逻辑测试
|
||||||
|
- [ ] 最大数量限制测试
|
||||||
|
- [ ] 文件名构建测试(含博主名)
|
||||||
|
- [ ] 博主信息提取测试
|
||||||
|
- [ ] 去重逻辑测试
|
||||||
|
|
||||||
|
## 非目标
|
||||||
|
|
||||||
|
- 自动登录抖音
|
||||||
|
- 自动过验证码
|
||||||
|
- 抓取非推荐流页面(如话题页、搜索页)
|
||||||
|
- 自动筛选视频内容
|
||||||
|
- 默认情况下抓取超过50条视频(如需更多,需用户通过 `--max-videos` 手动调整参数)
|
||||||
|
|
||||||
|
## 验收标准
|
||||||
|
|
||||||
|
1. 用户可以在推荐流页面执行 `./.venv/bin/python Douyin.py` 抓取视频
|
||||||
|
2. 系统能自动识别当前页面是推荐流
|
||||||
|
3. 支持滚动加载,默认最多抓取50条(可通过 `--max-videos` 调整)
|
||||||
|
4. 文件名包含博主昵称
|
||||||
|
5. 所有测试通过
|
||||||
|
6. 关键失败场景有明确报错
|
||||||
|
|
||||||
|
## 实现步骤
|
||||||
|
|
||||||
|
1. 编写测试(TDD)
|
||||||
|
2. 实现推荐流URL识别
|
||||||
|
3. 实现 `collect_recommendations()` 函数
|
||||||
|
4. 增强数据解析(提取博主信息)
|
||||||
|
5. 修改文件名构建逻辑
|
||||||
|
6. 更新命令行参数
|
||||||
|
7. 运行全部测试
|
||||||
|
8. 提交代码
|
||||||
115
test_douyin.py
115
test_douyin.py
@ -31,12 +31,16 @@ class FakeListener:
|
|||||||
def __init__(self, packet):
|
def __init__(self, packet):
|
||||||
self.packet = packet
|
self.packet = packet
|
||||||
self.started_targets = []
|
self.started_targets = []
|
||||||
|
self.call_count = 0
|
||||||
|
|
||||||
def start(self, target):
|
def start(self, target):
|
||||||
self.started_targets.append(target)
|
self.started_targets.append(target)
|
||||||
|
|
||||||
def wait(self, timeout):
|
def wait(self, timeout):
|
||||||
return self.packet
|
self.call_count += 1
|
||||||
|
if self.call_count == 1:
|
||||||
|
return self.packet
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
class FakeRuntimePage:
|
class FakeRuntimePage:
|
||||||
@ -82,6 +86,15 @@ class DouyinModuleTests(unittest.TestCase):
|
|||||||
output_path = module.build_output_path("测试标题", "123456")
|
output_path = module.build_output_path("测试标题", "123456")
|
||||||
self.assertEqual(output_path.as_posix(), "video/测试标题-123456.mp4")
|
self.assertEqual(output_path.as_posix(), "video/测试标题-123456.mp4")
|
||||||
|
|
||||||
|
def test_build_output_path_with_author_uses_bracket_format(self) -> None:
|
||||||
|
module = importlib.import_module("Douyin")
|
||||||
|
output_path = module.build_output_path(
|
||||||
|
title="测试标题",
|
||||||
|
video_id="123456",
|
||||||
|
author_name="测试博主"
|
||||||
|
)
|
||||||
|
self.assertEqual(output_path.as_posix(), "video/[测试博主]测试标题-123456.mp4")
|
||||||
|
|
||||||
def test_extract_aweme_payload_uses_dict_body(self) -> None:
|
def test_extract_aweme_payload_uses_dict_body(self) -> None:
|
||||||
module = importlib.import_module("Douyin")
|
module = importlib.import_module("Douyin")
|
||||||
response = FakeResponse({"aweme_list": []}, "")
|
response = FakeResponse({"aweme_list": []}, "")
|
||||||
@ -115,6 +128,14 @@ class DouyinModuleTests(unittest.TestCase):
|
|||||||
with self.assertRaisesRegex(RuntimeError, "login_douyin.py"):
|
with self.assertRaisesRegex(RuntimeError, "login_douyin.py"):
|
||||||
module.ensure_browser_debug_port_ready(9223)
|
module.ensure_browser_debug_port_ready(9223)
|
||||||
|
|
||||||
|
def test_is_recommendation_url_accepts_douyin_homepage(self) -> None:
|
||||||
|
module = importlib.import_module("Douyin")
|
||||||
|
self.assertTrue(module.is_recommendation_url("https://www.douyin.com/"))
|
||||||
|
self.assertTrue(module.is_recommendation_url("https://www.douyin.com"))
|
||||||
|
self.assertTrue(module.is_recommendation_url("https://www.douyin.com/?from=web"))
|
||||||
|
self.assertFalse(module.is_recommendation_url("https://www.douyin.com/user/xxx"))
|
||||||
|
self.assertFalse(module.is_recommendation_url("https://www.douyin.com/video/123"))
|
||||||
|
|
||||||
def test_is_creator_url_accepts_supported_douyin_creator_url(self) -> None:
|
def test_is_creator_url_accepts_supported_douyin_creator_url(self) -> None:
|
||||||
module = importlib.import_module("Douyin")
|
module = importlib.import_module("Douyin")
|
||||||
self.assertTrue(
|
self.assertTrue(
|
||||||
@ -136,6 +157,13 @@ class DouyinModuleTests(unittest.TestCase):
|
|||||||
self.assertTrue(module.is_aweme_id("7619989983668240802"))
|
self.assertTrue(module.is_aweme_id("7619989983668240802"))
|
||||||
self.assertFalse(module.is_aweme_id("not-an-aweme-id"))
|
self.assertFalse(module.is_aweme_id("not-an-aweme-id"))
|
||||||
|
|
||||||
|
def test_parse_target_input_classifies_recommendation_url(self) -> None:
|
||||||
|
module = importlib.import_module("Douyin")
|
||||||
|
target = module.parse_target_input("https://www.douyin.com/", source="manual")
|
||||||
|
self.assertEqual(target.kind, "recommendation")
|
||||||
|
self.assertEqual(target.value, "https://www.douyin.com/")
|
||||||
|
self.assertEqual(target.source, "manual")
|
||||||
|
|
||||||
def test_parse_target_input_classifies_creator_url(self) -> None:
|
def test_parse_target_input_classifies_creator_url(self) -> None:
|
||||||
module = importlib.import_module("Douyin")
|
module = importlib.import_module("Douyin")
|
||||||
target = module.parse_target_input(
|
target = module.parse_target_input(
|
||||||
@ -239,6 +267,30 @@ class DouyinModuleTests(unittest.TestCase):
|
|||||||
browser_port=None,
|
browser_port=None,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def test_parse_aweme_items_extracts_author_info(self) -> None:
|
||||||
|
module = importlib.import_module("Douyin")
|
||||||
|
payload = {
|
||||||
|
"aweme_list": [
|
||||||
|
{
|
||||||
|
"aweme_id": "7619989983668240802",
|
||||||
|
"desc": "测试视频",
|
||||||
|
"author": {
|
||||||
|
"nickname": "测试博主",
|
||||||
|
"uid": "123456789"
|
||||||
|
},
|
||||||
|
"video": {
|
||||||
|
"play_addr": {
|
||||||
|
"url_list": ["https://v26-web.douyinvod.com/example/video.mp4"]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
items = module.parse_aweme_items(payload)
|
||||||
|
self.assertEqual(len(items), 1)
|
||||||
|
self.assertEqual(items[0]["author_name"], "测试博主")
|
||||||
|
self.assertEqual(items[0]["author_id"], "123456789")
|
||||||
|
|
||||||
def test_build_video_page_url_uses_aweme_id(self) -> None:
|
def test_build_video_page_url_uses_aweme_id(self) -> None:
|
||||||
module = importlib.import_module("Douyin")
|
module = importlib.import_module("Douyin")
|
||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
@ -246,6 +298,42 @@ class DouyinModuleTests(unittest.TestCase):
|
|||||||
"https://www.douyin.com/video/7619989983668240802",
|
"https://www.douyin.com/video/7619989983668240802",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def test_collect_recommendations_downloads_videos_with_author_prefix(self) -> None:
|
||||||
|
module = importlib.import_module("Douyin")
|
||||||
|
packet = FakePacket(
|
||||||
|
{
|
||||||
|
"aweme_list": [
|
||||||
|
{
|
||||||
|
"aweme_id": "7619989983668240802",
|
||||||
|
"desc": "推荐视频1",
|
||||||
|
"author": {"nickname": "博主A", "uid": "111"},
|
||||||
|
"video": {
|
||||||
|
"play_addr": {
|
||||||
|
"url_list": ["https://v26-web.douyinvod.com/example/video1.mp4"]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
)
|
||||||
|
page = FakeRuntimePage("https://www.douyin.com/", packet)
|
||||||
|
|
||||||
|
with mock.patch.object(module, "import_runtime_dependencies", return_value=(object(), object(), object())):
|
||||||
|
with mock.patch.object(module, "create_page", return_value=page):
|
||||||
|
with mock.patch.object(module, "download_video") as mocked_download:
|
||||||
|
with mock.patch.object(module, "scroll_to_next_page"):
|
||||||
|
downloaded = module.collect_recommendations(
|
||||||
|
max_videos=50,
|
||||||
|
timeout=10,
|
||||||
|
output_dir=module.Path("video"),
|
||||||
|
browser_port=None,
|
||||||
|
)
|
||||||
|
|
||||||
|
self.assertEqual(downloaded, 1)
|
||||||
|
# 验证文件名包含博主前缀
|
||||||
|
call_kwargs = mocked_download.call_args[1]
|
||||||
|
self.assertIn("[博主A]", str(call_kwargs["output_path"]))
|
||||||
|
|
||||||
def test_collect_single_video_downloads_exactly_one_file_for_video_url_target(self) -> None:
|
def test_collect_single_video_downloads_exactly_one_file_for_video_url_target(self) -> None:
|
||||||
module = importlib.import_module("Douyin")
|
module = importlib.import_module("Douyin")
|
||||||
packet = FakePacket(
|
packet = FakePacket(
|
||||||
@ -316,6 +404,11 @@ class DouyinModuleTests(unittest.TestCase):
|
|||||||
self.assertEqual(page.visited_urls, ["https://www.douyin.com/video/7619989983668240802"])
|
self.assertEqual(page.visited_urls, ["https://www.douyin.com/video/7619989983668240802"])
|
||||||
mocked_download.assert_called_once()
|
mocked_download.assert_called_once()
|
||||||
|
|
||||||
|
def test_build_parser_has_max_videos_argument(self) -> None:
|
||||||
|
module = importlib.import_module("Douyin")
|
||||||
|
args = module.build_parser().parse_args(["--max-videos", "30"])
|
||||||
|
self.assertEqual(args.max_videos, 30)
|
||||||
|
|
||||||
def test_build_parser_defaults_to_zero_argument_current_page_flow(self) -> None:
|
def test_build_parser_defaults_to_zero_argument_current_page_flow(self) -> None:
|
||||||
module = importlib.import_module("Douyin")
|
module = importlib.import_module("Douyin")
|
||||||
args = module.build_parser().parse_args([])
|
args = module.build_parser().parse_args([])
|
||||||
@ -331,6 +424,26 @@ class DouyinModuleTests(unittest.TestCase):
|
|||||||
self.assertEqual(target.aweme_id, "7619989983668240802")
|
self.assertEqual(target.aweme_id, "7619989983668240802")
|
||||||
mocked_imports.assert_not_called()
|
mocked_imports.assert_not_called()
|
||||||
|
|
||||||
|
def test_main_dispatches_recommendation_flow_for_recommendation_url(self) -> None:
|
||||||
|
module = importlib.import_module("Douyin")
|
||||||
|
stdout = io.StringIO()
|
||||||
|
recommendation_target = module.ResolvedTarget(
|
||||||
|
kind="recommendation",
|
||||||
|
value="https://www.douyin.com/",
|
||||||
|
source="current-page",
|
||||||
|
)
|
||||||
|
with redirect_stdout(stdout):
|
||||||
|
with mock.patch.object(module, "resolve_cli_target", return_value=recommendation_target):
|
||||||
|
with mock.patch.object(module, "collect_recommendations", return_value=5) as mocked_collect:
|
||||||
|
exit_code = module.main([])
|
||||||
|
self.assertEqual(exit_code, 0)
|
||||||
|
mocked_collect.assert_called_once_with(
|
||||||
|
max_videos=50,
|
||||||
|
timeout=10,
|
||||||
|
output_dir=module.Path("video"),
|
||||||
|
browser_port=9223,
|
||||||
|
)
|
||||||
|
|
||||||
def test_main_without_target_dispatches_current_page_creator_flow(self) -> None:
|
def test_main_without_target_dispatches_current_page_creator_flow(self) -> None:
|
||||||
module = importlib.import_module("Douyin")
|
module = importlib.import_module("Douyin")
|
||||||
stdout = io.StringIO()
|
stdout = io.StringIO()
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user