From 321bbff1c155f96ec2fd4994069a9a233680ded8 Mon Sep 17 00:00:00 2001 From: wangshaoqing Date: Wed, 6 May 2026 16:39:55 +0800 Subject: [PATCH] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E5=AD=A6=E4=B9=A0=E6=95=99?= =?UTF-8?q?=E7=A8=8B=E7=9B=AE=E5=BD=95(learning/)=E5=8F=8A=E5=AF=B9?= =?UTF-8?q?=E5=BA=94=E5=8D=95=E5=85=83=E6=B5=8B=E8=AF=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 新增 DrissionPage 基础教程(01-05) - 新增 Playwright 基础教程(01-05) - 新增网络基础教程(01-05) - 新增 test_learning_examples.py 单元测试 - 更新 .gitignore 忽略 learning/*/output/ 目录 --- .gitignore | 1 + learning/README.md | 21 ++ learning/drissionpage_basics/01_open_page.py | 32 ++ .../drissionpage_basics/02_attach_browser.py | 49 +++ learning/drissionpage_basics/03_listen_api.py | 76 +++++ .../drissionpage_basics/04_parse_aweme.py | 104 ++++++ .../drissionpage_basics/05_download_video.py | 103 ++++++ learning/drissionpage_basics/README.md | 121 +++++++ .../01_open_devtools_check_xhr.md | 58 ++++ .../network_basics/02_find_aweme_list_api.md | 53 +++ .../03_read_headers_and_cookies.md | 62 ++++ .../network_basics/04_analyze_pagination.md | 58 ++++ .../05_copy_as_curl_and_replay.md | 60 ++++ learning/network_basics/README.md | 60 ++++ learning/playwright_basics/01_open_page.py | 36 ++ .../02_persistent_context.py | 56 ++++ .../playwright_basics/03_wait_and_locate.py | 43 +++ .../playwright_basics/04_listen_response.py | 81 +++++ .../playwright_basics/05_download_video.py | 87 +++++ learning/playwright_basics/README.md | 119 +++++++ test_learning_examples.py | 307 ++++++++++++++++++ 21 files changed, 1587 insertions(+) create mode 100644 learning/README.md create mode 100644 learning/drissionpage_basics/01_open_page.py create mode 100644 learning/drissionpage_basics/02_attach_browser.py create mode 100644 learning/drissionpage_basics/03_listen_api.py create mode 100644 learning/drissionpage_basics/04_parse_aweme.py create mode 100644 learning/drissionpage_basics/05_download_video.py 
create mode 100644 learning/drissionpage_basics/README.md create mode 100644 learning/network_basics/01_open_devtools_check_xhr.md create mode 100644 learning/network_basics/02_find_aweme_list_api.md create mode 100644 learning/network_basics/03_read_headers_and_cookies.md create mode 100644 learning/network_basics/04_analyze_pagination.md create mode 100644 learning/network_basics/05_copy_as_curl_and_replay.md create mode 100644 learning/network_basics/README.md create mode 100644 learning/playwright_basics/01_open_page.py create mode 100644 learning/playwright_basics/02_persistent_context.py create mode 100644 learning/playwright_basics/03_wait_and_locate.py create mode 100644 learning/playwright_basics/04_listen_response.py create mode 100644 learning/playwright_basics/05_download_video.py create mode 100644 learning/playwright_basics/README.md create mode 100644 test_learning_examples.py diff --git a/.gitignore b/.gitignore index 1958682..4e5669a 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,4 @@ __pycache__/ video/ video_p2/ .pytest_cache/ +learning/*/output/ diff --git a/learning/README.md b/learning/README.md new file mode 100644 index 0000000..e3a7384 --- /dev/null +++ b/learning/README.md @@ -0,0 +1,21 @@ +# Learning Index + +这里现在拆成三条学习线。 + +- `drissionpage_basics/` + 当前项目第一阶段练习,围绕 `DrissionPage`、Chrome 调试端口、接口监听和视频下载。 +- `playwright_basics/` + 第二阶段练习,改用 `Playwright Python`,学习另一套浏览器自动化和响应监听方式。 +- `network_basics/` + 第三阶段练习,重点练 DevTools 抓包、接口识别、请求头分析、分页分析和请求重放。 + +## 建议顺序 + +如果你已经把 `drissionpage_basics/` 学完了,就直接进入: + +1. [playwright_basics/README.md](playwright_basics/README.md) +2. [network_basics/README.md](network_basics/README.md) + +如果你后面想回顾第一阶段,再看: + +1. 
def main() -> None:
    """Open ``DEFAULT_URL`` with DrissionPage and print where the page landed.

    DrissionPage is imported inside the function rather than at module
    level. After printing the URL and title, the script blocks on
    ``input()`` so the window stays up until the user presses Enter.
    """
    from DrissionPage import ChromiumPage

    tab = ChromiumPage()
    tab.get(DEFAULT_URL)

    landed_url = tab.url
    print(f"[INFO] 已打开 URL: {landed_url}")
    page_title = tab.title
    print(f"[INFO] 页面标题: {page_title}")

    input("按回车退出...")
def build_browser_address(browser_port: int) -> str:
    """Return the ``host:port`` address of a local debug browser on *browser_port*."""
    host = "127.0.0.1"
    return f"{host}:{browser_port}"


def main() -> None:
    """Attach to the browser already listening on ``DEFAULT_BROWSER_PORT``.

    The debug port is checked with ``ensure_browser_debug_port_ready``
    first. The options are marked ``existing_only(True)`` before the page
    object is created, and the attached tab's URL and title are printed so
    the user can confirm it is the browser they logged in with.
    """
    from DrissionPage import ChromiumOptions
    from DrissionPage import ChromiumPage

    address = build_browser_address(DEFAULT_BROWSER_PORT)
    ensure_browser_debug_port_ready(DEFAULT_BROWSER_PORT)

    attach_options = ChromiumOptions()
    attach_options = attach_options.set_address(address)
    attach_options = attach_options.existing_only(True)
    tab = ChromiumPage(attach_options)

    print(f"[INFO] 已附着浏览器: {address}")
    print(f"[INFO] 当前 URL: {tab.url}")
    print(f"[INFO] 页面标题: {tab.title}")
def main() -> None:
    """Attach to the debug browser, listen for the post-list API and describe the packet.

    The listener is started before navigating to ``DEFAULT_USER_URL``.
    The script waits up to 10 seconds for one matching packet, then prints
    the types of the packet, its response and the response body. For a dict
    body it also prints the first keys and the size of ``aweme_list``;
    otherwise it prints the first 300 characters of ``raw_body``.
    """
    from DrissionPage import ChromiumOptions
    from DrissionPage import ChromiumPage

    ensure_browser_debug_port_ready(DEFAULT_BROWSER_PORT)
    options = ChromiumOptions().set_address(build_browser_address(DEFAULT_BROWSER_PORT)).existing_only(True)
    page = ChromiumPage(options)

    page.listen.start(LISTEN_TARGET)
    page.get(DEFAULT_USER_URL)
    print(f"[INFO] 当前 URL: {page.url}")
    print("[INFO] 已开始监听作品接口。如果当前页不是博主主页,请在浏览器里切到博主主页并轻微滚动。")

    packet = page.listen.wait(timeout=10)
    # DrissionPage signals a timeout with False, not None, so test for
    # falsiness. An `is None` check would let False through and crash on
    # `packet.response` below.
    if not packet:
        print("[WARN] 10 秒内没有监听到接口包。请切到博主主页并滚动后重试。")
        return

    response = packet.response
    body = getattr(response, "body", None)
    print(f"[INFO] packet 类型: {type(packet).__name__}")
    print(f"[INFO] response 类型: {type(response).__name__}")
    print(f"[INFO] response.body 类型: {type(body).__name__}")

    if isinstance(body, dict):
        print(f"[INFO] body keys: {list(body.keys())[:10]}")
        aweme_list = body.get("aweme_list")
        if isinstance(aweme_list, list):
            print(f"[INFO] aweme_list 数量: {len(aweme_list)}")
            # Only inspect the first entry when it really is a mapping.
            if aweme_list and isinstance(aweme_list[0], dict):
                print(f"[INFO] 第一条 aweme keys: {list(aweme_list[0].keys())[:10]}")
    else:
        raw_body = getattr(response, "raw_body", "")
        print(f"[INFO] raw_body 前 300 个字符: {str(raw_body)[:300]}")
def choose_video_url(url_list: list[str]) -> str:
    """Pick the preferred playback URL from *url_list*.

    The first URL containing ``douyinvod.com`` wins. If none matches, the
    first entry is returned.

    Raises:
        ValueError: If *url_list* is empty.
    """
    if not url_list:
        raise ValueError("url_list 为空,无法选择视频地址。")
    return next((url for url in url_list if "douyinvod.com" in url), url_list[0])


def parse_aweme_items(body: dict[str, Any]) -> list[dict[str, str]]:
    """Extract ``title`` / ``video_id`` / ``video_url`` from an ``aweme_list`` payload.

    Malformed entries are skipped rather than raising. An entry is dropped
    when it is not a dict, has no ``aweme_id``, or does not carry a
    ``video.play_addr.url_list`` list with at least one non-empty URL.
    A missing or blank ``desc`` becomes the title ``"untitled"``.

    Args:
        body: Decoded JSON response that should contain ``aweme_list``.

    Returns:
        One dict per usable video, in payload order.

    Raises:
        ValueError: If ``body["aweme_list"]`` is missing or not a list.
    """
    aweme_list = body.get("aweme_list")
    if not isinstance(aweme_list, list):
        raise ValueError("body 里缺少 aweme_list。")

    items: list[dict[str, str]] = []
    for aweme in aweme_list:
        if not isinstance(aweme, dict):
            continue

        video_id = str(aweme.get("aweme_id") or "").strip()
        if not video_id:
            continue

        title = str(aweme.get("desc") or "").strip() or "untitled"

        # Every level of the nested structure is type-checked so that a
        # truthy non-dict value (e.g. a string) skips the item instead of
        # raising AttributeError on `.get`.
        video = aweme.get("video")
        if not isinstance(video, dict):
            continue
        play_addr = video.get("play_addr")
        if not isinstance(play_addr, dict):
            continue
        url_list = play_addr.get("url_list")
        if not isinstance(url_list, list):
            continue

        # Drop None / empty entries so they are never stringified into a
        # bogus URL such as "None".
        candidates = [str(url) for url in url_list if url]
        if not candidates:
            continue

        items.append(
            {
                "title": title,
                "video_id": video_id,
                "video_url": choose_video_url(candidates),
            }
        )

    return items
def build_output_path(title: str, video_id: str, output_dir: Path = OUTPUT_DIR) -> Path:
    """Return ``<output_dir>/<sanitized title>-<video_id>.mp4``.

    The title is passed through ``sanitize_filename`` with the fallback
    ``practice-video``.
    """
    filename = f"{sanitize_filename(title, fallback='practice-video')}-{video_id}.mp4"
    return output_dir / filename


def attach_to_browser(browser_port: int = DEFAULT_BROWSER_PORT):
    """Attach to the browser on ``127.0.0.1:<browser_port>`` and return its page.

    The debug port is verified with ``ensure_browser_debug_port_ready``
    before the page object is created, and the options are marked
    ``existing_only(True)``.
    """
    from DrissionPage import ChromiumOptions
    from DrissionPage import ChromiumPage

    ensure_browser_debug_port_ready(browser_port)
    debug_address = f"127.0.0.1:{browser_port}"
    attach_options = ChromiumOptions().set_address(debug_address).existing_only(True)
    return ChromiumPage(attach_options)


def download_first_real_video(
    page,
    requests_module,
    output_dir: Path = OUTPUT_DIR,
    timeout: int = TIMEOUT_SECONDS,
) -> Path:
    """Reload the current page, capture the post-list API and download its first video.

    Args:
        page: Attached DrissionPage page whose current URL is re-opened.
        requests_module: Object forwarded to ``download_video`` as
            ``requests_module``.
        output_dir: Directory the ``.mp4`` path is built under.
        timeout: Seconds to wait for the listened packet.

    Returns:
        The path handed to ``download_video``.

    Raises:
        RuntimeError: If no packet with a ``response`` arrives in time, or
            the payload contains no downloadable item.
    """
    # The listener is armed first, then the same URL is re-opened.
    profile_url = page.url
    page.listen.start(LISTEN_TARGET)
    page.get(profile_url)

    packet = page.listen.wait(timeout=timeout)
    if not (packet and hasattr(packet, "response")):
        raise RuntimeError("当前页面没有监听到作品接口,请先切到博主主页并滚动页面后重试。")

    videos = parse_aweme_items(extract_aweme_payload(packet.response))
    if not videos:
        raise RuntimeError("当前页面没有解析到可下载视频,请先确认页面已加载出作品。")

    first = videos[0]
    target = build_output_path(
        title=first["title"],
        video_id=first["video_id"],
        output_dir=output_dir,
    )
    download_video(
        requests_module=requests_module,
        headers=build_headers(page.url),
        video_url=first["video_url"],
        output_path=target,
    )
    return target


def main() -> None:
    """Attach to the debug browser and download the first video of the current page."""
    import requests

    tab = attach_to_browser()
    print(f"[INFO] 当前页面: {tab.title}")
    print(f"[INFO] 当前 URL: {tab.url}")
    print("[INFO] 正在监听当前页面的作品接口,必要时请在浏览器中轻微滚动一下页面。")

    saved_to = download_first_real_video(page=tab, requests_module=requests)
    print(f"[OK] 已下载到: {saved_to}")
`05_download_video.py` + +## 开始前先准备 + +在项目根目录执行: + +```bash +python3 -m venv .venv +source .venv/bin/activate +pip install requests DrissionPage +``` + +如果你要练附着浏览器,先单独启动登录浏览器: + +```bash +./.venv/bin/python login_douyin.py --browser-port 9223 +``` + +## 练习原则 + +- 不要复制主脚本全部代码 +- 每次只补一个文件 +- 跑起来后先 `print()` 看对象和数据结构 +- 先写最小可运行版本,再做整理 +- 如果报错,优先看报错里提到的对象类型和字段名 + +## 每个文件你应该关注什么 + +### `01_open_page.py` + +- 学会创建 `ChromiumPage` +- 学会 `page.get(url)` +- 学会确认页面是否真的打开 + +运行: + +```bash +./.venv/bin/python learning/drissionpage_basics/01_open_page.py +``` + +### `02_attach_browser.py` + +- 学会通过 `127.0.0.1:9223` 附着到已启动 Chrome +- 学会验证附着的是不是你刚才登录的浏览器 + +运行: + +```bash +./.venv/bin/python learning/drissionpage_basics/02_attach_browser.py +``` + +### `03_listen_api.py` + +- 学会 `page.listen.start(...)` +- 学会 `page.listen.wait(timeout=...)` +- 学会观察返回包的 `response.body` 和 `response.raw_body` + +运行: + +```bash +./.venv/bin/python learning/drissionpage_basics/03_listen_api.py +``` + +### `04_parse_aweme.py` + +- 不连浏览器,只拿示例数据练解析 +- 学会从 `aweme_list` 提取标题、视频 id、视频 url + +运行: + +```bash +./.venv/bin/python learning/drissionpage_basics/04_parse_aweme.py +``` + +### `05_download_video.py` + +- 学会用 `requests` 下载一个 mp4 +- 学会写入本地文件 +- 学会验证文件是否真的下载成功 + +运行: + +```bash +./.venv/bin/python learning/drissionpage_basics/05_download_video.py +``` + +## 推荐你的手动练法 + +1. 先补 `01`,只做到“能打开页面” +2. 再补 `02`,只做到“能附着到浏览器” +3. 再补 `03`,只做到“能打印监听结果” +4. 再补 `04`,只做到“能把字段提出来” +5. 
最后补 `05`,把一个视频链接下载到 `learning/drissionpage_basics/output/` + +## 练完后再回看主脚本 + +等你把这 5 个文件都手写过一遍,再回头看这些文件会更清楚: + +- [Douyin.py](/Users/wangshaoqing/Desktop/MiaoSi/Study/douyin-crawler-poc/Douyin.py) +- [login_douyin.py](/Users/wangshaoqing/Desktop/MiaoSi/Study/douyin-crawler-poc/login_douyin.py) + +重点看: + +- `create_page` +- `page.listen.start` +- `page.listen.wait` +- `extract_aweme_payload` +- `parse_aweme_items` +- `download_video` diff --git a/learning/network_basics/01_open_devtools_check_xhr.md b/learning/network_basics/01_open_devtools_check_xhr.md new file mode 100644 index 0000000..e9459e1 --- /dev/null +++ b/learning/network_basics/01_open_devtools_check_xhr.md @@ -0,0 +1,58 @@ +# 练习 01: 打开 DevTools,只看 Fetch/XHR + +## 目标 + +1. 学会打开 Chrome DevTools +2. 学会切到 `Network` +3. 学会只看 `Fetch/XHR` +4. 学会观察“页面动作”和“请求出现”之间的关系 + +## 你要动手做什么 + +1. 打开一个抖音博主主页 +2. 按 `F12` 或右键打开开发者工具 +3. 切到 `Network` +4. 勾选或点击只看 `Fetch/XHR` +5. 清空已有请求记录 +6. 手动滚动页面一次 +7. 观察滚动之后新增了哪些请求 + +## 你要重点看什么 + +- 有没有请求是在你滚动之后立刻出现的 +- 哪些请求名字重复出现 +- 哪些请求的响应大小明显更大 +- 哪些请求看起来像“作品列表”而不是埋点或日志 + +## 本题不要急着做什么 + +- 不要一开始就复制所有请求 +- 不要先看一堆 `Headers` +- 不要马上写代码 + +## 本题完成标准 + +你能手写回答这 3 个问题就算完成: + +1. 滚动页面之后,`Fetch/XHR` 里有没有新请求出现? +2. 哪一个请求最像“返回作品列表”的接口? +3. 你为什么怀疑是它? + +## 建议记录模板 + +```text +页面: +抖音博主主页 + +触发动作: +手动向下滚动一次 + +看到的现象: +新增了几个 Fetch/XHR 请求 + +最可疑的请求: +写出它的名字或 URL 片段 + +怀疑原因: +比如“滚动之后立刻出现,且响应较大” +``` diff --git a/learning/network_basics/02_find_aweme_list_api.md b/learning/network_basics/02_find_aweme_list_api.md new file mode 100644 index 0000000..3237c01 --- /dev/null +++ b/learning/network_basics/02_find_aweme_list_api.md @@ -0,0 +1,53 @@ +# 练习 02: 找到真正返回 `aweme_list` 的接口 + +## 目标 + +1. 从多个请求里筛出真正有用的那个 +2. 学会看 `Preview` 和 `Response` +3. 确认返回视频列表的接口里确实有 `aweme_list` +4. 和当前项目的监听目标建立对应关系 + +## 你要动手做什么 + +1. 回到上一题找到的可疑请求 +2. 点开这个请求 +3. 先看 `Preview` +4. 再看 `Response` +5. 搜索 `aweme_list` +6. 
如果没有,就换下一个可疑请求继续查 + +## 你要重点看什么 + +- 最外层是不是 JSON +- 是否出现 `aweme_list` +- `aweme_list` 是列表还是别的结构 +- 列表里的每一项是不是一个视频对象 + +## 对照项目 + +对照 [Douyin.py](/Users/wangshaoqing/Desktop/MiaoSi/Study/douyin-crawler-poc/Douyin.py) 里的这些位置: + +- `LISTEN_TARGET` +- `extract_aweme_payload` +- `parse_aweme_items` + +你要尝试回答: + +- 代码里为什么监听的是这个接口片段? +- 它和你现在在 DevTools 里看到的请求是不是同一类? + +## 本题完成标准 + +你能手写写出下面这些内容就算完成: + +```text +目标接口 URL: + +为什么确认是它: + +响应最外层 keys: + +aweme_list 的类型: + +aweme_list[0] 里我先看到的关键字段: +``` diff --git a/learning/network_basics/03_read_headers_and_cookies.md b/learning/network_basics/03_read_headers_and_cookies.md new file mode 100644 index 0000000..9763115 --- /dev/null +++ b/learning/network_basics/03_read_headers_and_cookies.md @@ -0,0 +1,62 @@ +# 练习 03: 看懂 Headers、Cookies、Referer + +## 目标 + +1. 学会区分请求头和响应头 +2. 学会找 `cookie` +3. 学会看 `referer` +4. 理解为什么很多接口离开浏览器后不一定还能直接请求成功 + +## 你要动手做什么 + +1. 打开你上一题确认过的目标接口 +2. 切到 `Headers` +3. 先看 `General` +4. 再看 `Request Headers` +5. 再看 `Response Headers` +6. 如果有 `Cookies` 面板,也进去看一眼 + +## 你要重点看什么 + +### `Request Headers` + +- `cookie` +- `referer` +- `user-agent` +- `accept` + +### `Response Headers` + +- `content-type` +- 和缓存相关的字段 +- 有没有看起来和鉴权、跨域有关的字段 + +## 你现在要建立的概念 + +- `cookie` 往往代表登录态或会话状态 +- `referer` 往往告诉服务端“这个请求是从哪个页面发出来的” +- `user-agent` 往往是最基础的浏览器身份 + +## 本题不要钻太深的点 + +- 先别研究所有 cookie 都是什么意思 +- 先别碰签名算法 +- 先做“看得见、说得清” + +## 本题完成标准 + +你能写出下面这些内容就算完成: + +```text +这个请求方法: + +这个请求完整 URL: + +我看到的关键请求头: +- cookie +- referer +- user-agent + +我当前的判断: +如果离开浏览器单独发,最可能缺什么 +``` diff --git a/learning/network_basics/04_analyze_pagination.md b/learning/network_basics/04_analyze_pagination.md new file mode 100644 index 0000000..bba297a --- /dev/null +++ b/learning/network_basics/04_analyze_pagination.md @@ -0,0 +1,58 @@ +# 练习 04: 分析分页参数和翻页规律 + +## 目标 + +1. 学会找分页参数 +2. 学会比较第一页和第二页请求的差异 +3. 学会判断“翻页”是靠哪个字段驱动的 +4. 理解为什么脚本批量抓取时要关注这些参数 + +## 你要动手做什么 + +1. 在 `Network` 里保留第一页和继续滚动后的下一次请求 +2. 点开两次同类接口 +3. 对比它们的 URL 和 Query 参数 +4. 
把不同的字段逐个记下来 + +## 你要重点看什么 + +- `max_cursor` +- `offset` +- `count` +- `cursor` +- `has_more` + +不一定就是这些名字,但你要去找“看起来像翻页状态”的字段。 + +## 建议比较方式 + +你可以直接手工写一个对照表: + +```text +第一页请求: +url = ... +query = ... + +第二页请求: +url = ... +query = ... + +发生变化的字段: +- xxx +- xxx +``` + +## 对照项目 + +虽然当前这个项目主要靠浏览器监听接口并解析结果,但你现在要开始建立这种意识: + +- 如果以后不用浏览器监听,而是直接发 HTTP 请求 +- 那分页参数就是你必须手动控制的内容 + +## 本题完成标准 + +你能回答下面这些问题就算完成: + +1. 连续两次作品列表请求里,哪个参数变了? +2. 哪个参数最像“分页游标”? +3. 响应里有没有提示“还有没有下一页”? diff --git a/learning/network_basics/05_copy_as_curl_and_replay.md b/learning/network_basics/05_copy_as_curl_and_replay.md new file mode 100644 index 0000000..e03183a --- /dev/null +++ b/learning/network_basics/05_copy_as_curl_and_replay.md @@ -0,0 +1,60 @@ +# 练习 05: Copy as cURL,并尝试重放请求 + +## 目标 + +1. 学会把浏览器里的请求复制出来 +2. 学会用 `curl` 或 Postman 试着重放 +3. 验证这个请求离开浏览器后还能不能发通 +4. 知道如果失败,下一步该怀疑什么 + +## 你要动手做什么 + +1. 在目标请求上右键 +2. 选择 `Copy` +3. 选择 `Copy as cURL` +4. 粘贴到终端里 +5. 先不要改内容,直接试一次 +6. 看返回结果是不是你在浏览器里看到的同类 JSON + +## 你要观察什么 + +- 返回的是不是 JSON +- 是否还包含 `aweme_list` +- 是否报未登录、权限不足、签名错误、参数缺失 + +## 如果重放失败,优先怀疑什么 + +1. `cookie` 失效或缺失 +2. `referer` 不对 +3. 有动态参数 +4. 有签名参数 +5. 请求上下文必须来自真实浏览器环境 + +## 本题最关键的认识 + +你不是为了“这一次一定重放成功”而学。 + +你是为了建立判断力: + +- 哪些接口可以脱离浏览器单独请求 +- 哪些接口必须依赖浏览器环境 +- 哪些失败是因为少了基础请求头 +- 哪些失败说明还有更深的反爬机制 + +## 本题完成标准 + +你能写出下面这些内容就算完成: + +```text +我复制出来的请求: +是 / 不是 浏览器里那个目标接口 + +重放结果: +成功 / 失败 + +如果失败: +我最先怀疑的 3 个原因是什么 + +如果成功: +说明这个接口对浏览器环境的依赖程度如何 +``` diff --git a/learning/network_basics/README.md b/learning/network_basics/README.md new file mode 100644 index 0000000..8d84ce8 --- /dev/null +++ b/learning/network_basics/README.md @@ -0,0 +1,60 @@ +# Network Basics + +这个目录是你的第三阶段练习。 + +这里不再重点练“怎么控制浏览器”,而是练“怎么用浏览器开发者工具看懂页面背后的接口”。 + +目标是把这条链路练熟: + +1. 打开页面 +2. 打开 `DevTools` +3. 只看 `Network` +4. 找到真正返回数据的接口 +5. 看懂请求头、响应体、分页参数 +6. 尝试把请求复制出来单独重放 + +## 建议学习顺序 + +1. `01_open_devtools_check_xhr.md` +2. `02_find_aweme_list_api.md` +3. `03_read_headers_and_cookies.md` +4. `04_analyze_pagination.md` +5. 
`05_copy_as_curl_and_replay.md` + +## 开始前先准备 + +建议你先准备好这几样东西: + +- Chrome 或 Chromium +- 一个已经能正常打开抖音页面的环境 +- 项目里的 [Douyin.py](/Users/wangshaoqing/Desktop/MiaoSi/Study/douyin-crawler-poc/Douyin.py) +- 你自己的学习笔记 + +## 练习原则 + +- 每次只分析一个页面动作 +- 先手工看懂,再考虑写代码 +- 不要同时盯很多请求,先锁定一个最可疑的 +- 看到字段时先记层级,不要急着背名字 +- 每次练完都要写出“我这次确认了什么” + +## 这组练习要解决什么问题 + +练完之后,你应该能回答这些问题: + +- 这个页面的数据是 HTML 自带的,还是接口后加载的? +- 哪个请求才是真正返回作品列表的? +- 你要的 `aweme_id`、标题、视频地址分别在哪一层? +- 翻页靠什么参数推动? +- 如果浏览器监听不到数据,你下一步该去 DevTools 看什么? + +## 和当前项目怎么对照 + +练这组时,重点对照这些文件: + +- [Douyin.py](/Users/wangshaoqing/Desktop/MiaoSi/Study/douyin-crawler-poc/Douyin.py) +- [learning/drissionpage_basics/03_listen_api.py](/Users/wangshaoqing/Desktop/MiaoSi/Study/douyin-crawler-poc/learning/drissionpage_basics/03_listen_api.py) +- [learning/drissionpage_basics/04_parse_aweme.py](/Users/wangshaoqing/Desktop/MiaoSi/Study/douyin-crawler-poc/learning/drissionpage_basics/04_parse_aweme.py) +- [learning/playwright_basics/04_listen_response.py](/Users/wangshaoqing/Desktop/MiaoSi/Study/douyin-crawler-poc/learning/playwright_basics/04_listen_response.py) + +你会慢慢发现,浏览器自动化只是“拿数据”的一种方式,真正的关键是你能不能识别出正确的接口。 diff --git a/learning/playwright_basics/01_open_page.py b/learning/playwright_basics/01_open_page.py new file mode 100644 index 0000000..300b3fe --- /dev/null +++ b/learning/playwright_basics/01_open_page.py @@ -0,0 +1,36 @@ +""" +练习 01: 用 Playwright 打开一个网页。 + +目标: +1. 自己导入 `sync_playwright` +2. 启动一个可见的 Chromium 浏览器 +3. 新建一个页面并打开 URL +4. 
def get_or_create_page(context):
    """Return the first page of *context*, opening a new one if it has none.

    A context without a ``pages`` attribute, or with an empty one, gets a
    fresh page from ``context.new_page()``.
    """
    has_open_page = bool(getattr(context, "pages", None))
    if not has_open_page:
        return context.new_page()
    return context.pages[0]
def main() -> None:
    """Open ``DEFAULT_URL`` in a visible Chromium and print two located texts.

    The heading is located by role and the first paragraph with a CSS
    locator. Each locator is waited on before its text is read. The script
    then blocks on ``input()`` and closes the browser.
    """
    from playwright.sync_api import sync_playwright

    def wait_then_read(locator):
        # Wait for the element first, then return its text content.
        locator.wait_for()
        return locator.text_content()

    with sync_playwright() as pw:
        chromium = pw.chromium.launch(headless=False)
        tab = chromium.new_page()
        tab.goto(DEFAULT_URL)

        heading_text = wait_then_read(tab.get_by_role("heading", name="Example Domain"))
        print(f"[INFO] 标题文本: {heading_text}")

        paragraph_text = wait_then_read(tab.locator("p").first)
        print(f"[INFO] 第一段文本: {paragraph_text}")

        input("按回车退出...")
        chromium.close()
def is_target_response_url(url: str) -> bool:
    """Return True when *url* contains the ``LISTEN_TARGET`` fragment."""
    contains_target = LISTEN_TARGET in url
    return contains_target


def try_read_json_payload(response):
    """Return ``response.json()`` when it is a dict, otherwise ``None``.

    Any exception raised while reading the body is treated as "no usable
    payload" and also yields ``None``.
    """
    try:
        decoded = response.json()
    except Exception:
        return None
    if isinstance(decoded, dict):
        return decoded
    return None


def main() -> None:
    """Open ``DEFAULT_USER_URL`` in the persistent profile and log matching responses.

    A ``response`` handler prints the status and URL of every response whose
    URL matches ``LISTEN_TARGET``, followed by the first JSON keys when the
    body decodes to a dict.

    NOTE(review): Playwright's sync API presumably only dispatches events
    while a Playwright call is running, so responses triggered by manual
    scrolling during the blocking ``input()`` may not be reported until
    the next Playwright call — confirm against the Playwright docs.
    """
    from playwright.sync_api import sync_playwright

    def on_response(response) -> None:
        if is_target_response_url(response.url):
            print(f"[INFO] 命中目标响应: {response.status} {response.url}")
            payload = try_read_json_payload(response)
            if payload is not None:
                print(f"[INFO] JSON keys: {list(payload.keys())[:10]}")
            else:
                print("[WARN] 这个响应不是可直接读取的 JSON 字典。")

    with sync_playwright() as pw:
        context = pw.chromium.launch_persistent_context(
            user_data_dir=str(USER_DATA_DIR),
            headless=False,
        )
        # Reuse the first tab of the persistent context when one exists.
        if context.pages:
            tab = context.pages[0]
        else:
            tab = context.new_page()

        tab.on("response", on_response)
        tab.goto(DEFAULT_USER_URL)
        print(f"[INFO] 当前 URL: {tab.url}")
        print("[INFO] 已开始监听响应。请在页面中滚动一下,观察是否出现目标接口。")
        input("观察输出后按回车退出...")
        context.close()
def build_output_path(title: str, video_id: str, output_dir: Path = OUTPUT_DIR) -> Path:
    """Return ``<output_dir>/<sanitized title>-<video_id>.mp4``.

    The title is passed through ``sanitize_filename`` with the fallback
    ``playwright-video``.
    """
    filename = f"{sanitize_filename(title, fallback='playwright-video')}-{video_id}.mp4"
    return output_dir / filename


def extract_first_item_from_payload(payload) -> dict[str, str]:
    """Return the first item ``parse_aweme_items`` yields for *payload*.

    Raises:
        RuntimeError: If the payload contains no downloadable item.
    """
    parsed = parse_aweme_items(payload)
    if parsed:
        return parsed[0]
    raise RuntimeError("当前接口里没有可下载视频,请先确认页面已加载出作品。")


def main() -> None:
    """Capture the post-list API with Playwright and download its first video.

    The profile page is opened in the persistent context, then reloaded
    inside ``expect_response`` (20 s timeout) so the matching response can
    be captured. The first parsed item is downloaded via ``download_video``.
    """
    import requests
    from playwright.sync_api import sync_playwright

    def is_post_list_response(response) -> bool:
        # Predicate handed to expect_response: match on the URL fragment.
        return LISTEN_TARGET in response.url

    with sync_playwright() as pw:
        context = pw.chromium.launch_persistent_context(
            user_data_dir=str(USER_DATA_DIR),
            headless=False,
        )
        if context.pages:
            tab = context.pages[0]
        else:
            tab = context.new_page()
        tab.goto(DEFAULT_USER_URL)
        print("[INFO] 页面已打开。请在必要时滚动一下,等待作品接口出现。")

        with tab.expect_response(is_post_list_response, timeout=20000) as captured:
            tab.reload()

        video = extract_first_item_from_payload(captured.value.json())
        saved_to = build_output_path(
            title=video["title"],
            video_id=video["video_id"],
            output_dir=OUTPUT_DIR,
        )
        download_video(
            requests_module=requests,
            headers=build_headers(tab.url),
            video_url=video["video_url"],
            output_path=saved_to,
        )
        print(f"[OK] 已下载到: {saved_to}")
        context.close()
import importlib.util
import subprocess
import sys
import unittest
from pathlib import Path
from unittest import mock


# Repository root. This test file sits at the top level of the repository, so
# its own directory is the root. Deriving it from __file__ (rather than a
# hard-coded absolute path) lets the suite run from any checkout location.
ROOT = Path(__file__).resolve().parent


def load_module(module_name: str, relative_path: str):
    """Load and execute a Python file under ROOT as a fresh module object.

    The learning scripts have names such as ``04_parse_aweme.py`` that start
    with a digit, so they cannot be imported with a normal ``import``
    statement; they are loaded by file path instead.

    Args:
        module_name: Name given to the created module object.
        relative_path: Path of the file, relative to ROOT.

    Returns:
        The executed module.

    Raises:
        RuntimeError: if no import spec/loader can be built for the path.
    """
    file_path = ROOT / relative_path
    spec = importlib.util.spec_from_file_location(module_name, file_path)
    if spec is None or spec.loader is None:
        raise RuntimeError(f"无法加载模块: {relative_path}")
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module


class FakePacketResponse:
    """Test double for the ``response`` attribute of a captured packet."""

    def __init__(self, body):
        self.body = body
        # Always empty in this fake.
        self.raw_body = ""


class FakePacket:
    """Test double for a captured packet; wraps *body* in a FakePacketResponse."""

    def __init__(self, body):
        self.response = FakePacketResponse(body)
class FakeListener:
    """Test double for ``page.listen``: records targets, returns a canned packet."""

    def __init__(self, packet):
        self.packet = packet
        self.targets = []

    def start(self, target):
        # Remember every target so tests can assert on what was listened for.
        self.targets.append(target)

    def wait(self, timeout):
        # The timeout is ignored; the canned packet is returned immediately.
        return self.packet


class FakePage:
    """Test double for a page object exposing ``url``, ``listen`` and ``get``."""

    def __init__(self, url: str, packet):
        self.url = url
        self.listen = FakeListener(packet)
        self.visited_urls = []

    def get(self, url: str):
        # Record the navigation and update the current URL.
        self.visited_urls.append(url)
        self.url = url


class LearningParseAwemeTests(unittest.TestCase):
    """Tests for learning/drissionpage_basics/04_parse_aweme.py."""

    def test_choose_video_url_prefers_douyinvod_link(self) -> None:
        module = load_module("learning04", "learning/drissionpage_basics/04_parse_aweme.py")
        self.assertEqual(
            module.choose_video_url(
                [
                    "https://example.com/play/first",
                    "https://v11-weba.douyinvod.com/example/first.mp4",
                ]
            ),
            "https://v11-weba.douyinvod.com/example/first.mp4",
        )

    def test_parse_aweme_items_extracts_expected_fields(self) -> None:
        module = load_module("learning04", "learning/drissionpage_basics/04_parse_aweme.py")
        items = module.parse_aweme_items(module.SAMPLE_PAYLOAD)
        self.assertEqual(
            items,
            [
                {
                    "title": "第一个示例视频",
                    "video_id": "7500000000000000001",
                    "video_url": "https://v11-weba.douyinvod.com/example/first.mp4",
                },
                {
                    "title": "第二个示例视频",
                    "video_id": "7500000000000000002",
                    "video_url": "https://v26-web.douyinvod.com/example/second.mp4",
                },
            ],
        )


class LearningDownloadVideoTests(unittest.TestCase):
    """Tests for learning/drissionpage_basics/05_download_video.py."""

    def test_build_output_path_uses_learning_output_directory(self) -> None:
        module = load_module("learning05", "learning/drissionpage_basics/05_download_video.py")
        output_path = module.build_output_path("示例标题", "123456")
        self.assertEqual(
            output_path.as_posix(),
            "learning/drissionpage_basics/output/示例标题-123456.mp4",
        )

    def test_download_first_real_video_uses_first_item_from_packet(self) -> None:
        module = load_module("learning05", "learning/drissionpage_basics/05_download_video.py")
        packet = FakePacket(
            {
                "aweme_list": [
                    {
                        "aweme_id": "7619989983668240802",
                        "desc": "单条练习视频",
                        "video": {
                            "play_addr": {
                                "url_list": ["https://v26-web.douyinvod.com/example/single.mp4"]
                            }
                        },
                    }
                ]
            }
        )
        page = FakePage(
            "https://www.douyin.com/user/MS4wLjABAAAAexample?from_tab_name=main",
            packet,
        )
        # Opaque stand-in: the test only checks that it is passed through.
        requests_module = object()
        # Patch the real downloader so no network or file I/O happens.
        with mock.patch.object(module, "download_video") as mocked_download:
            output_path = module.download_first_real_video(
                page=page,
                requests_module=requests_module,
                output_dir=module.Path("learning/drissionpage_basics/output"),
                timeout=15,
            )
        self.assertEqual(page.listen.targets, [module.LISTEN_TARGET])
        self.assertEqual(
            page.visited_urls,
            ["https://www.douyin.com/user/MS4wLjABAAAAexample?from_tab_name=main"],
        )
        self.assertEqual(
            output_path.as_posix(),
            "learning/drissionpage_basics/output/单条练习视频-7619989983668240802.mp4",
        )
        mocked_download.assert_called_once_with(
            requests_module=requests_module,
            headers=mock.ANY,
            video_url="https://v26-web.douyinvod.com/example/single.mp4",
            output_path=output_path,
        )

    def test_download_first_real_video_raises_readable_error_when_listener_returns_false(self) -> None:
        module = load_module("learning05", "learning/drissionpage_basics/05_download_video.py")
        # The fake listener returns False from wait() instead of a packet.
        page = FakePage(
            "https://www.douyin.com/user/MS4wLjABAAAAexample?from_tab_name=main",
            False,
        )
        with self.assertRaisesRegex(RuntimeError, "没有监听到作品接口"):
            module.download_first_real_video(
                page=page,
                requests_module=object(),
                output_dir=module.Path("learning/drissionpage_basics/output"),
                timeout=15,
            )


class LearningScriptImportTests(unittest.TestCase):
    """Layout checks and import smoke tests for the learning scripts."""

    def test_learning_directory_layout_has_both_tracks(self) -> None:
        expected_paths = [
            ROOT / "learning" / "README.md",
            ROOT / "learning" / "drissionpage_basics" / "README.md",
            ROOT / "learning" / "playwright_basics" / "README.md",
            ROOT / "learning" / "network_basics" / "README.md",
            ROOT / "learning" / "network_basics" / "01_open_devtools_check_xhr.md",
            ROOT / "learning" / "network_basics" / "02_find_aweme_list_api.md",
            ROOT / "learning" / "network_basics" / "03_read_headers_and_cookies.md",
            ROOT / "learning" / "network_basics" / "04_analyze_pagination.md",
            ROOT / "learning" / "network_basics" / "05_copy_as_curl_and_replay.md",
            ROOT / "learning" / "playwright_basics" / "01_open_page.py",
            ROOT / "learning" / "playwright_basics" / "02_persistent_context.py",
            ROOT / "learning" / "playwright_basics" / "03_wait_and_locate.py",
            ROOT / "learning" / "playwright_basics" / "04_listen_response.py",
            ROOT / "learning" / "playwright_basics" / "05_download_video.py",
        ]
        for path in expected_paths:
            # subTest reports every missing file instead of stopping at the
            # first failing assertion.
            with self.subTest(path=path):
                self.assertTrue(path.exists(), msg=f"缺少学习文件: {path}")

    def assert_script_can_import_project_root(self, relative_path: str) -> None:
        """Run the script in a child interpreter and require a zero exit code.

        The child removes '' and ROOT from sys.path before running the script,
        so the script's project imports must work without that help. The
        script runs under run_name 'not_main', so its ``__main__`` guard does
        not fire.
        """
        script_path = ROOT / relative_path
        command = (
            "import runpy, sys; "
            f"root = {str(ROOT)!r}; "
            "sys.path = [p for p in sys.path if p not in ('', root)]; "
            f"runpy.run_path({str(script_path)!r}, run_name='not_main')"
        )
        result = subprocess.run(
            [sys.executable, "-c", command],
            cwd=ROOT,
            capture_output=True,
            text=True,
        )
        self.assertEqual(result.returncode, 0, msg=result.stderr)

    def test_attach_browser_script_can_import_project_modules_when_run_from_learning(self) -> None:
        self.assert_script_can_import_project_root("learning/drissionpage_basics/02_attach_browser.py")

    def test_listen_api_script_can_import_project_modules_when_run_from_learning(self) -> None:
        self.assert_script_can_import_project_root("learning/drissionpage_basics/03_listen_api.py")

    def test_download_video_script_can_import_project_modules_when_run_from_learning(self) -> None:
        self.assert_script_can_import_project_root("learning/drissionpage_basics/05_download_video.py")

    def test_playwright_persistent_context_script_can_import_project_modules(self) -> None:
        self.assert_script_can_import_project_root("learning/playwright_basics/02_persistent_context.py")

    def test_playwright_listen_response_script_can_import_project_modules(self) -> None:
        self.assert_script_can_import_project_root("learning/playwright_basics/04_listen_response.py")

    def test_playwright_download_video_script_can_import_project_modules(self) -> None:
        self.assert_script_can_import_project_root("learning/playwright_basics/05_download_video.py")


class FakePlaywrightPage:
    """Test double for a Playwright page; only carries a ``url``."""

    def __init__(self, url: str = "https://example.com/"):
        self.url = url


class FakePlaywrightContext:
    """Test double for a Playwright browser context with a ``pages`` list."""

    def __init__(self, pages):
        self.pages = pages
        # Pages created through new_page(), kept separately so tests can tell
        # whether a new page was opened.
        self.created_pages = []

    def new_page(self):
        page = FakePlaywrightPage("https://created.example.com/")
        self.created_pages.append(page)
        self.pages.append(page)
        return page


class FakePlaywrightResponse:
    """Test double for a Playwright response with ``url``, ``status`` and ``json()``."""

    def __init__(self, url: str, payload=None, status: int = 200):
        self.url = url
        self.status = status
        self._payload = payload

    def json(self):
        # An Exception instance as payload simulates a JSON decoding failure.
        if isinstance(self._payload, Exception):
            raise self._payload
        return self._payload


class PlaywrightLearningHelperTests(unittest.TestCase):
    """Tests for the pure helpers in learning/playwright_basics/."""

    def test_persistent_context_reuses_first_existing_page(self) -> None:
        module = load_module("playwright02", "learning/playwright_basics/02_persistent_context.py")
        page = FakePlaywrightPage("https://existing.example.com/")
        context = FakePlaywrightContext([page])
        self.assertIs(module.get_or_create_page(context), page)
        self.assertEqual(context.created_pages, [])

    def test_persistent_context_creates_page_when_context_is_empty(self) -> None:
        module = load_module("playwright02", "learning/playwright_basics/02_persistent_context.py")
        context = FakePlaywrightContext([])
        page = module.get_or_create_page(context)
        self.assertEqual(page.url, "https://created.example.com/")
        self.assertEqual(len(context.created_pages), 1)

    def test_listen_response_target_matching_uses_url_substring(self) -> None:
        module = load_module("playwright04", "learning/playwright_basics/04_listen_response.py")
        self.assertTrue(module.is_target_response_url("https://www.douyin.com/aweme/v1/web/aweme/post/"))
        self.assertFalse(module.is_target_response_url("https://www.example.com/api"))

    def test_try_read_json_payload_returns_none_on_json_error(self) -> None:
        module = load_module("playwright04", "learning/playwright_basics/04_listen_response.py")
        response = FakePlaywrightResponse(
            "https://www.douyin.com/aweme/v1/web/aweme/post/",
            payload=ValueError("bad json"),
        )
        self.assertIsNone(module.try_read_json_payload(response))

    def test_build_output_path_uses_playwright_output_directory(self) -> None:
        module = load_module("playwright05", "learning/playwright_basics/05_download_video.py")
        output_path = module.build_output_path("示例标题", "123456")
        self.assertEqual(
            output_path.as_posix(),
            "learning/playwright_basics/output/示例标题-123456.mp4",
        )

    def test_extract_first_item_from_payload_uses_existing_parser(self) -> None:
        module = load_module("playwright05", "learning/playwright_basics/05_download_video.py")
        item = module.extract_first_item_from_payload(
            {
                "aweme_list": [
                    {
                        "aweme_id": "7619989983668240802",
                        "desc": "Playwright 示例",
                        "video": {
                            "play_addr": {
                                "url_list": ["https://v26-web.douyinvod.com/example/single.mp4"]
                            }
                        },
                    }
                ]
            }
        )
        self.assertEqual(
            item,
            {
                "title": "Playwright 示例",
                "video_id": "7619989983668240802",
                "video_url": "https://v26-web.douyinvod.com/example/single.mp4",
            },
        )


if __name__ == "__main__":
    unittest.main()