douyin-crawler-poc/test_learning_examples.py
wangshaoqing 321bbff1c1 添加学习教程目录(learning/)及对应单元测试
- 新增 DrissionPage 基础教程(01-05)
- 新增 Playwright 基础教程(01-05)
- 新增网络基础教程(01-05)
- 新增 test_learning_examples.py 单元测试
- 更新 .gitignore 忽略 learning/*/output/ 目录
2026-05-06 16:39:55 +08:00

308 lines
12 KiB
Python

import importlib.util
import subprocess
import sys
import unittest
from pathlib import Path
from unittest import mock
# Project root directory. It is derived from this file's own location (the
# test file sits directly inside the project directory) so the suite works
# from any checkout rather than only at one developer's absolute path.
ROOT = Path(__file__).resolve().parent
def load_module(module_name: str, relative_path: str):
    """Execute a source file located under ``ROOT`` and return it as a module.

    Args:
        module_name: Name given to the resulting module object.
        relative_path: Path of the source file, relative to ``ROOT``.

    Returns:
        The freshly executed module. This function does not register it in
        ``sys.modules``; every call executes the file again.

    Raises:
        RuntimeError: If no import spec, or no loader, can be built for the
            file.
    """
    spec = importlib.util.spec_from_file_location(module_name, ROOT / relative_path)
    loader = None if spec is None else spec.loader
    if loader is None:
        raise RuntimeError(f"无法加载模块: {relative_path}")
    loaded = importlib.util.module_from_spec(spec)
    loader.exec_module(loaded)
    return loaded
class FakePacketResponse:
    """Test double for the response object attached to a captured packet."""

    def __init__(self, body):
        # Only ``body`` carries data; ``raw_body`` is always an empty string.
        self.raw_body = ""
        self.body = body
class FakePacket:
    """Test double for a captured packet; exposes only ``response``."""

    def __init__(self, body):
        # Wrap the body so callers can read ``packet.response.body``.
        response = FakePacketResponse(body)
        self.response = response
class FakeListener:
    """Test double for a page listener that replays one canned packet."""

    def __init__(self, packet):
        # Every target passed to ``start`` is recorded here, in call order.
        self.targets = []
        self.packet = packet

    def start(self, target):
        """Record ``target`` instead of starting any real listening."""
        self.targets.append(target)

    def wait(self, timeout):
        """Return the canned packet immediately; ``timeout`` is ignored."""
        return self.packet
class FakePage:
    """Minimal page double: tracks its URL and exposes a fake ``listen``."""

    def __init__(self, url: str, packet):
        self.visited_urls = []
        self.listen = FakeListener(packet)
        self.url = url

    def get(self, url: str):
        """Pretend to navigate: update the current URL and log the visit."""
        self.url = url
        self.visited_urls.append(url)
class LearningParseAwemeTests(unittest.TestCase):
    """Tests for the parsing helpers in the ``04_parse_aweme.py`` tutorial."""

    # Tutorial script under test, relative to the project root.
    _SCRIPT = "learning/drissionpage_basics/04_parse_aweme.py"

    def test_choose_video_url_prefers_douyinvod_link(self) -> None:
        """A douyinvod URL wins even when another URL is listed first."""
        parser = load_module("learning04", self._SCRIPT)
        other_url = "https://example.com/play/first"
        vod_url = "https://v11-weba.douyinvod.com/example/first.mp4"

        chosen = parser.choose_video_url([other_url, vod_url])

        self.assertEqual(chosen, vod_url)

    def test_parse_aweme_items_extracts_expected_fields(self) -> None:
        """The bundled sample payload parses into title/id/url records."""
        parser = load_module("learning04", self._SCRIPT)
        expected = [
            {
                "title": "第一个示例视频",
                "video_id": "7500000000000000001",
                "video_url": "https://v11-weba.douyinvod.com/example/first.mp4",
            },
            {
                "title": "第二个示例视频",
                "video_id": "7500000000000000002",
                "video_url": "https://v26-web.douyinvod.com/example/second.mp4",
            },
        ]

        parsed = parser.parse_aweme_items(parser.SAMPLE_PAYLOAD)

        self.assertEqual(parsed, expected)
class LearningDownloadVideoTests(unittest.TestCase):
    """Tests for the ``05_download_video.py`` DrissionPage tutorial script."""

    # Tutorial script under test, relative to the project root.
    _SCRIPT = "learning/drissionpage_basics/05_download_video.py"
    # Profile URL the fake page starts on in the listener-driven tests.
    _PROFILE_URL = "https://www.douyin.com/user/MS4wLjABAAAAexample?from_tab_name=main"

    def test_build_output_path_uses_learning_output_directory(self) -> None:
        """The output path is ``<output dir>/<title>-<video id>.mp4``."""
        downloader = load_module("learning05", self._SCRIPT)

        built = downloader.build_output_path("示例标题", "123456")

        self.assertEqual(
            built.as_posix(),
            "learning/drissionpage_basics/output/示例标题-123456.mp4",
        )

    def test_download_first_real_video_uses_first_item_from_packet(self) -> None:
        """The first aweme in the captured packet is the one downloaded."""
        downloader = load_module("learning05", self._SCRIPT)
        video_url = "https://v26-web.douyinvod.com/example/single.mp4"
        aweme = {
            "aweme_id": "7619989983668240802",
            "desc": "单条练习视频",
            "video": {"play_addr": {"url_list": [video_url]}},
        }
        page = FakePage(self._PROFILE_URL, FakePacket({"aweme_list": [aweme]}))
        fake_requests = object()

        # Patch the real download so nothing touches the network or disk.
        with mock.patch.object(downloader, "download_video") as download_mock:
            saved_to = downloader.download_first_real_video(
                page=page,
                requests_module=fake_requests,
                output_dir=downloader.Path("learning/drissionpage_basics/output"),
                timeout=15,
            )

        # The listener is armed with the module's target and the page is
        # navigated once, to the URL it was already on.
        self.assertEqual(page.listen.targets, [downloader.LISTEN_TARGET])
        self.assertEqual(
            page.visited_urls,
            ["https://www.douyin.com/user/MS4wLjABAAAAexample?from_tab_name=main"],
        )
        self.assertEqual(
            saved_to.as_posix(),
            "learning/drissionpage_basics/output/单条练习视频-7619989983668240802.mp4",
        )
        download_mock.assert_called_once_with(
            requests_module=fake_requests,
            headers=mock.ANY,
            video_url="https://v26-web.douyinvod.com/example/single.mp4",
            output_path=saved_to,
        )

    def test_download_first_real_video_raises_readable_error_when_listener_returns_false(self) -> None:
        """A listener result of ``False`` surfaces as a readable RuntimeError."""
        downloader = load_module("learning05", self._SCRIPT)
        # ``False`` as the packet makes the fake listener's ``wait`` return False.
        page = FakePage(self._PROFILE_URL, False)

        with self.assertRaisesRegex(RuntimeError, "没有监听到作品接口"):
            downloader.download_first_real_video(
                page=page,
                requests_module=object(),
                output_dir=downloader.Path("learning/drissionpage_basics/output"),
                timeout=15,
            )
class LearningScriptImportTests(unittest.TestCase):
    """Checks the tutorial file layout and that each script imports cleanly."""

    # Upper bound, in seconds, for importing one script in a child
    # interpreter. Without it a script that blocks at import time would hang
    # the whole test run.
    SCRIPT_IMPORT_TIMEOUT = 120

    def test_learning_directory_layout_has_both_tracks(self) -> None:
        """Every expected tutorial file exists under ``learning/``."""
        learning = ROOT / "learning"
        expected_paths = [
            learning / "README.md",
            learning / "drissionpage_basics" / "README.md",
            learning / "playwright_basics" / "README.md",
            learning / "network_basics" / "README.md",
            learning / "network_basics" / "01_open_devtools_check_xhr.md",
            learning / "network_basics" / "02_find_aweme_list_api.md",
            learning / "network_basics" / "03_read_headers_and_cookies.md",
            learning / "network_basics" / "04_analyze_pagination.md",
            learning / "network_basics" / "05_copy_as_curl_and_replay.md",
            learning / "playwright_basics" / "01_open_page.py",
            learning / "playwright_basics" / "02_persistent_context.py",
            learning / "playwright_basics" / "03_wait_and_locate.py",
            learning / "playwright_basics" / "04_listen_response.py",
            learning / "playwright_basics" / "05_download_video.py",
        ]
        for path in expected_paths:
            # One sub-test per file so a single run reports every missing
            # path instead of stopping at the first one.
            with self.subTest(path=path):
                self.assertTrue(path.exists(), msg=f"缺少学习文件: {path}")

    def assert_script_can_import_project_root(self, relative_path: str) -> None:
        """Run a tutorial script in a child interpreter and require exit code 0.

        The child first removes ``''`` and the project root from ``sys.path``,
        so the script only succeeds if it arranges its own access to project
        modules. It is executed with ``run_name='not_main'``, which keeps any
        ``if __name__ == "__main__"`` guard in the script inactive.

        Args:
            relative_path: Script path relative to ``ROOT``.
        """
        script_path = ROOT / relative_path
        command = (
            "import runpy, sys; "
            f"root = {str(ROOT)!r}; "
            "sys.path = [p for p in sys.path if p not in ('', root)]; "
            f"runpy.run_path({str(script_path)!r}, run_name='not_main')"
        )
        try:
            result = subprocess.run(
                [sys.executable, "-c", command],
                cwd=ROOT,
                capture_output=True,
                text=True,
                timeout=self.SCRIPT_IMPORT_TIMEOUT,
            )
        except subprocess.TimeoutExpired:
            self.fail(
                f"importing {relative_path} did not finish within "
                f"{self.SCRIPT_IMPORT_TIMEOUT} seconds"
            )
        self.assertEqual(result.returncode, 0, msg=result.stderr)

    def test_attach_browser_script_can_import_project_modules_when_run_from_learning(self) -> None:
        self.assert_script_can_import_project_root("learning/drissionpage_basics/02_attach_browser.py")

    def test_listen_api_script_can_import_project_modules_when_run_from_learning(self) -> None:
        self.assert_script_can_import_project_root("learning/drissionpage_basics/03_listen_api.py")

    def test_download_video_script_can_import_project_modules_when_run_from_learning(self) -> None:
        self.assert_script_can_import_project_root("learning/drissionpage_basics/05_download_video.py")

    def test_playwright_persistent_context_script_can_import_project_modules(self) -> None:
        self.assert_script_can_import_project_root("learning/playwright_basics/02_persistent_context.py")

    def test_playwright_listen_response_script_can_import_project_modules(self) -> None:
        self.assert_script_can_import_project_root("learning/playwright_basics/04_listen_response.py")

    def test_playwright_download_video_script_can_import_project_modules(self) -> None:
        self.assert_script_can_import_project_root("learning/playwright_basics/05_download_video.py")
class FakePlaywrightPage:
    """Test double for a Playwright page; carries nothing but a ``url``."""

    def __init__(self, url: str = "https://example.com/"):
        self.url = url
class FakePlaywrightContext:
    """Test double for a browser context holding a list of pages."""

    def __init__(self, pages):
        # Pages made through ``new_page`` are tracked separately so tests can
        # tell reuse apart from creation.
        self.created_pages = []
        # The caller's list is stored as-is (not copied); ``new_page``
        # appends to this same list object.
        self.pages = pages

    def new_page(self):
        """Create a fake page, record it in both lists, and return it."""
        created = FakePlaywrightPage("https://created.example.com/")
        for bucket in (self.created_pages, self.pages):
            bucket.append(created)
        return created
class FakePlaywrightResponse:
    """Test double for a network response with a URL, status and JSON body."""

    def __init__(self, url: str, payload=None, status: int = 200):
        self._payload = payload
        self.status = status
        self.url = url

    def json(self):
        """Return the stored payload, or raise it if it is an exception.

        Passing an ``Exception`` instance as ``payload`` simulates a body
        that fails to decode.
        """
        payload = self._payload
        if not isinstance(payload, Exception):
            return payload
        raise payload
class PlaywrightLearningHelperTests(unittest.TestCase):
    """Tests for the helper functions in the Playwright tutorial scripts."""

    # Tutorial scripts under test, relative to the project root.
    _CONTEXT_SCRIPT = "learning/playwright_basics/02_persistent_context.py"
    _LISTEN_SCRIPT = "learning/playwright_basics/04_listen_response.py"
    _DOWNLOAD_SCRIPT = "learning/playwright_basics/05_download_video.py"

    def test_persistent_context_reuses_first_existing_page(self) -> None:
        """An already-open page is returned and no new page is created."""
        script = load_module("playwright02", self._CONTEXT_SCRIPT)
        existing = FakePlaywrightPage("https://existing.example.com/")
        context = FakePlaywrightContext([existing])

        returned = script.get_or_create_page(context)

        self.assertIs(returned, existing)
        self.assertEqual(context.created_pages, [])

    def test_persistent_context_creates_page_when_context_is_empty(self) -> None:
        """With no open pages, exactly one page is created and returned."""
        script = load_module("playwright02", self._CONTEXT_SCRIPT)
        context = FakePlaywrightContext([])

        returned = script.get_or_create_page(context)

        self.assertEqual(returned.url, "https://created.example.com/")
        self.assertEqual(len(context.created_pages), 1)

    def test_listen_response_target_matching_uses_url_substring(self) -> None:
        """The aweme post URL matches the target; an unrelated URL does not."""
        script = load_module("playwright04", self._LISTEN_SCRIPT)
        matching_url = "https://www.douyin.com/aweme/v1/web/aweme/post/"
        unrelated_url = "https://www.example.com/api"

        self.assertTrue(script.is_target_response_url(matching_url))
        self.assertFalse(script.is_target_response_url(unrelated_url))

    def test_try_read_json_payload_returns_none_on_json_error(self) -> None:
        """A response whose ``json()`` raises ValueError yields ``None``."""
        script = load_module("playwright04", self._LISTEN_SCRIPT)
        broken = FakePlaywrightResponse(
            "https://www.douyin.com/aweme/v1/web/aweme/post/",
            payload=ValueError("bad json"),
        )

        self.assertIsNone(script.try_read_json_payload(broken))

    def test_build_output_path_uses_playwright_output_directory(self) -> None:
        """The output path is ``<output dir>/<title>-<video id>.mp4``."""
        script = load_module("playwright05", self._DOWNLOAD_SCRIPT)

        built = script.build_output_path("示例标题", "123456")

        self.assertEqual(
            built.as_posix(),
            "learning/playwright_basics/output/示例标题-123456.mp4",
        )

    def test_extract_first_item_from_payload_uses_existing_parser(self) -> None:
        """The first aweme of a payload becomes a title/id/url record."""
        script = load_module("playwright05", self._DOWNLOAD_SCRIPT)
        video_url = "https://v26-web.douyinvod.com/example/single.mp4"
        payload = {
            "aweme_list": [
                {
                    "aweme_id": "7619989983668240802",
                    "desc": "Playwright 示例",
                    "video": {"play_addr": {"url_list": [video_url]}},
                }
            ]
        }
        expected = {
            "title": "Playwright 示例",
            "video_id": "7619989983668240802",
            "video_url": "https://v26-web.douyinvod.com/example/single.mp4",
        }

        extracted = script.extract_first_item_from_payload(payload)

        self.assertEqual(extracted, expected)
# Run the whole suite when this file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()