- 新增 DrissionPage 基础教程(01-05)
- 新增 Playwright 基础教程(01-05)
- 新增网络基础教程(01-05)
- 新增 test_learning_examples.py 单元测试
- 更新 .gitignore 忽略 learning/*/output/ 目录
308 lines
12 KiB
Python
308 lines
12 KiB
Python
import importlib.util
|
|
import subprocess
|
|
import sys
|
|
import unittest
|
|
from pathlib import Path
|
|
from unittest import mock
|
|
|
|
|
|
def _locate_project_root(fallback: Path) -> Path:
    """Return the checkout directory that contains the ``learning`` lessons.

    Walks upward from this file looking for a directory that holds
    ``learning/drissionpage_basics`` (a path the tests in this module rely
    on).  When no ancestor matches, or ``__file__`` is unavailable,
    ``fallback`` is returned unchanged so the previous behaviour is kept.
    """
    try:
        here = Path(__file__).resolve()
    except NameError:
        # The source was executed without an associated file name.
        return fallback
    for candidate in here.parents:
        if (candidate / "learning" / "drissionpage_basics").is_dir():
            return candidate
    return fallback


# Project root used to resolve every lesson path in this test module.  It is
# discovered relative to this file so the suite is not tied to one developer's
# machine; the original absolute path remains as the last-resort fallback.
ROOT = _locate_project_root(
    Path("/Users/wangshaoqing/Desktop/MiaoSi/Study/douyin-crawler-poc")
)
|
|
|
|
|
|
def load_module(module_name: str, relative_path: str):
    """Import a source file located under ``ROOT`` and return the module.

    Args:
        module_name: Name given to the module spec that is created.
        relative_path: Location of the source file, relative to ``ROOT``.

    Returns:
        The freshly executed module object.

    Raises:
        RuntimeError: If no spec, or no loader, can be built for the file.
    """
    spec = importlib.util.spec_from_file_location(module_name, ROOT / relative_path)
    loader = None if spec is None else spec.loader
    if loader is None:
        raise RuntimeError(f"无法加载模块: {relative_path}")

    loaded = importlib.util.module_from_spec(spec)
    loader.exec_module(loaded)
    return loaded
|
|
|
|
|
|
class FakePacketResponse:
    """Test double for the ``response`` attribute of a captured packet."""

    def __init__(self, body):
        # ``body`` is stored as given; ``raw_body`` always starts as an
        # empty string.
        self.body, self.raw_body = body, ""
|
|
|
|
|
|
class FakePacket:
    """Test double for a captured packet; exposes only ``response``."""

    def __init__(self, body):
        # Wrap the payload so callers can reach it via ``packet.response.body``.
        response = FakePacketResponse(body)
        self.response = response
|
|
|
|
|
|
class FakeListener:
    """Test double for a page listener that always yields a canned packet."""

    def __init__(self, packet):
        # Every target passed to ``start`` is recorded here, in call order.
        self.packet = packet
        self.targets = list()

    def start(self, target):
        """Record ``target`` as one of the listened-for targets."""
        self.targets += [target]

    def wait(self, timeout):
        """Return the canned packet; ``timeout`` is accepted but not used."""
        canned = self.packet
        return canned
|
|
|
|
|
|
class FakePage:
    """Test double for a browser page with a listener and navigation log."""

    def __init__(self, url: str, packet):
        self.url = url
        # The listener hands back ``packet`` from ``wait``.
        self.listen = FakeListener(packet)
        # URLs passed to ``get``, in call order.
        self.visited_urls = list()

    def get(self, url: str):
        """Pretend to navigate: log ``url`` and make it the current URL."""
        self.visited_urls += [url]
        self.url = url
|
|
|
|
|
|
class LearningParseAwemeTests(unittest.TestCase):
    """Exercise the parsing helpers exposed by the ``04_parse_aweme`` lesson."""

    def test_choose_video_url_prefers_douyinvod_link(self) -> None:
        # Given a generic link followed by a douyinvod link, the douyinvod
        # link must be the one selected.
        lesson = load_module("learning04", "learning/drissionpage_basics/04_parse_aweme.py")
        wanted = "https://v11-weba.douyinvod.com/example/first.mp4"
        candidates = ["https://example.com/play/first", wanted]

        self.assertEqual(lesson.choose_video_url(candidates), wanted)

    def test_parse_aweme_items_extracts_expected_fields(self) -> None:
        # The lesson's bundled sample payload must parse into exactly these
        # two records, in this order.
        lesson = load_module("learning04", "learning/drissionpage_basics/04_parse_aweme.py")
        first_record = {
            "title": "第一个示例视频",
            "video_id": "7500000000000000001",
            "video_url": "https://v11-weba.douyinvod.com/example/first.mp4",
        }
        second_record = {
            "title": "第二个示例视频",
            "video_id": "7500000000000000002",
            "video_url": "https://v26-web.douyinvod.com/example/second.mp4",
        }

        parsed = lesson.parse_aweme_items(lesson.SAMPLE_PAYLOAD)

        self.assertEqual(parsed, [first_record, second_record])
|
|
|
|
|
|
class LearningDownloadVideoTests(unittest.TestCase):
    """Cover path building and the listen-then-download flow of lesson 05."""

    # Shared fixtures: lesson script location, the profile page the fake
    # browser starts on, and the directory downloads are expected to land in.
    _SCRIPT = "learning/drissionpage_basics/05_download_video.py"
    _PROFILE_URL = "https://www.douyin.com/user/MS4wLjABAAAAexample?from_tab_name=main"
    _OUTPUT_DIR = "learning/drissionpage_basics/output"

    def test_build_output_path_uses_learning_output_directory(self) -> None:
        lesson = load_module("learning05", self._SCRIPT)

        built = lesson.build_output_path("示例标题", "123456")

        self.assertEqual(
            built.as_posix(),
            "learning/drissionpage_basics/output/示例标题-123456.mp4",
        )

    def test_download_first_real_video_uses_first_item_from_packet(self) -> None:
        lesson = load_module("learning05", self._SCRIPT)
        video_url = "https://v26-web.douyinvod.com/example/single.mp4"
        aweme = {
            "aweme_id": "7619989983668240802",
            "desc": "单条练习视频",
            "video": {"play_addr": {"url_list": [video_url]}},
        }
        page = FakePage(self._PROFILE_URL, FakePacket({"aweme_list": [aweme]}))
        fake_requests = object()

        # Replace the real downloader so nothing touches the network or disk.
        with mock.patch.object(lesson, "download_video") as download_spy:
            saved_to = lesson.download_first_real_video(
                page=page,
                requests_module=fake_requests,
                output_dir=lesson.Path(self._OUTPUT_DIR),
                timeout=15,
            )

        self.assertEqual(page.listen.targets, [lesson.LISTEN_TARGET])
        self.assertEqual(page.visited_urls, [self._PROFILE_URL])
        self.assertEqual(
            saved_to.as_posix(),
            "learning/drissionpage_basics/output/单条练习视频-7619989983668240802.mp4",
        )
        download_spy.assert_called_once_with(
            requests_module=fake_requests,
            headers=mock.ANY,
            video_url=video_url,
            output_path=saved_to,
        )

    def test_download_first_real_video_raises_readable_error_when_listener_returns_false(self) -> None:
        lesson = load_module("learning05", self._SCRIPT)
        # The fake listener's ``wait`` returns ``False`` for this page.
        page = FakePage(self._PROFILE_URL, False)

        with self.assertRaisesRegex(RuntimeError, "没有监听到作品接口"):
            lesson.download_first_real_video(
                page=page,
                requests_module=object(),
                output_dir=lesson.Path(self._OUTPUT_DIR),
                timeout=15,
            )
|
|
|
|
|
|
class LearningScriptImportTests(unittest.TestCase):
    """Verify the lesson file layout and that lesson scripts import cleanly."""

    def test_learning_directory_layout_has_both_tracks(self) -> None:
        # Every file the learning tree is expected to contain, relative to ROOT.
        expected_files = (
            "learning/README.md",
            "learning/drissionpage_basics/README.md",
            "learning/playwright_basics/README.md",
            "learning/network_basics/README.md",
            "learning/network_basics/01_open_devtools_check_xhr.md",
            "learning/network_basics/02_find_aweme_list_api.md",
            "learning/network_basics/03_read_headers_and_cookies.md",
            "learning/network_basics/04_analyze_pagination.md",
            "learning/network_basics/05_copy_as_curl_and_replay.md",
            "learning/playwright_basics/01_open_page.py",
            "learning/playwright_basics/02_persistent_context.py",
            "learning/playwright_basics/03_wait_and_locate.py",
            "learning/playwright_basics/04_listen_response.py",
            "learning/playwright_basics/05_download_video.py",
        )
        for relative in expected_files:
            path = ROOT / relative
            # subTest keeps the loop going after a miss, so one run reports
            # every absent file instead of stopping at the first.
            with self.subTest(path=relative):
                self.assertTrue(path.exists(), msg=f"缺少学习文件: {path}")

    def assert_script_can_import_project_root(self, relative_path: str) -> None:
        """Run a lesson script in a child interpreter and require exit code 0.

        The child process first removes ``''`` and the project root from
        ``sys.path``, then executes the script through ``runpy.run_path``
        with ``run_name='not_main'`` (so an ``if __name__ == "__main__"``
        guard in the script is not triggered).  The child's stderr is used
        as the failure message.

        Args:
            relative_path: Script location relative to ``ROOT``.
        """
        script_path = ROOT / relative_path
        command = (
            "import runpy, sys; "
            f"root = {str(ROOT)!r}; "
            "sys.path = [p for p in sys.path if p not in ('', root)]; "
            f"runpy.run_path({str(script_path)!r}, run_name='not_main')"
        )
        result = subprocess.run(
            [sys.executable, "-c", command],
            cwd=ROOT,
            capture_output=True,
            text=True,
        )
        self.assertEqual(result.returncode, 0, msg=result.stderr)

    def test_attach_browser_script_can_import_project_modules_when_run_from_learning(self) -> None:
        self.assert_script_can_import_project_root("learning/drissionpage_basics/02_attach_browser.py")

    def test_listen_api_script_can_import_project_modules_when_run_from_learning(self) -> None:
        self.assert_script_can_import_project_root("learning/drissionpage_basics/03_listen_api.py")

    def test_download_video_script_can_import_project_modules_when_run_from_learning(self) -> None:
        self.assert_script_can_import_project_root("learning/drissionpage_basics/05_download_video.py")

    def test_playwright_persistent_context_script_can_import_project_modules(self) -> None:
        self.assert_script_can_import_project_root("learning/playwright_basics/02_persistent_context.py")

    def test_playwright_listen_response_script_can_import_project_modules(self) -> None:
        self.assert_script_can_import_project_root("learning/playwright_basics/04_listen_response.py")

    def test_playwright_download_video_script_can_import_project_modules(self) -> None:
        self.assert_script_can_import_project_root("learning/playwright_basics/05_download_video.py")
|
|
|
|
|
|
class FakePlaywrightPage:
    """Test double for a Playwright page; carries nothing but a URL."""

    def __init__(self, url: str = "https://example.com/"):
        # Expose the address under the ``url`` attribute.
        setattr(self, "url", url)
|
|
|
|
|
|
class FakePlaywrightContext:
    """Test double for a Playwright browser context."""

    def __init__(self, pages):
        # ``pages`` is kept by reference (not copied); ``created_pages``
        # tracks only pages produced through ``new_page``.
        self.pages = pages
        self.created_pages = list()

    def new_page(self):
        """Create a fake page, register it in both lists, and return it."""
        created = FakePlaywrightPage("https://created.example.com/")
        for registry in (self.created_pages, self.pages):
            registry.append(created)
        return created
|
|
|
|
|
|
class FakePlaywrightResponse:
    """Test double for a Playwright response with a canned JSON result."""

    def __init__(self, url: str, payload=None, status: int = 200):
        self.url, self.status = url, status
        # Either the value ``json`` should return, or an exception instance
        # that ``json`` should raise.
        self._payload = payload

    def json(self):
        """Return the canned payload, or raise it if it is an ``Exception``."""
        payload = self._payload
        if not isinstance(payload, Exception):
            return payload
        raise payload
|
|
|
|
|
|
class PlaywrightLearningHelperTests(unittest.TestCase):
    """Exercise the helper functions of the Playwright lesson scripts."""

    def test_persistent_context_reuses_first_existing_page(self) -> None:
        lesson = load_module("playwright02", "learning/playwright_basics/02_persistent_context.py")
        existing = FakePlaywrightPage("https://existing.example.com/")
        context = FakePlaywrightContext([existing])

        chosen = lesson.get_or_create_page(context)

        self.assertIs(chosen, existing)
        # Nothing new may be created when a page already exists.
        self.assertEqual(context.created_pages, [])

    def test_persistent_context_creates_page_when_context_is_empty(self) -> None:
        lesson = load_module("playwright02", "learning/playwright_basics/02_persistent_context.py")
        context = FakePlaywrightContext([])

        created = lesson.get_or_create_page(context)

        self.assertEqual(created.url, "https://created.example.com/")
        self.assertEqual(len(context.created_pages), 1)

    def test_listen_response_target_matching_uses_url_substring(self) -> None:
        lesson = load_module("playwright04", "learning/playwright_basics/04_listen_response.py")
        matching_url = "https://www.douyin.com/aweme/v1/web/aweme/post/"
        unrelated_url = "https://www.example.com/api"

        self.assertTrue(lesson.is_target_response_url(matching_url))
        self.assertFalse(lesson.is_target_response_url(unrelated_url))

    def test_try_read_json_payload_returns_none_on_json_error(self) -> None:
        lesson = load_module("playwright04", "learning/playwright_basics/04_listen_response.py")
        # The fake raises this ValueError from ``json()``.
        broken = FakePlaywrightResponse(
            "https://www.douyin.com/aweme/v1/web/aweme/post/",
            payload=ValueError("bad json"),
        )

        self.assertIsNone(lesson.try_read_json_payload(broken))

    def test_build_output_path_uses_playwright_output_directory(self) -> None:
        lesson = load_module("playwright05", "learning/playwright_basics/05_download_video.py")

        built = lesson.build_output_path("示例标题", "123456")

        self.assertEqual(
            built.as_posix(),
            "learning/playwright_basics/output/示例标题-123456.mp4",
        )

    def test_extract_first_item_from_payload_uses_existing_parser(self) -> None:
        lesson = load_module("playwright05", "learning/playwright_basics/05_download_video.py")
        video_url = "https://v26-web.douyinvod.com/example/single.mp4"
        payload = {
            "aweme_list": [
                {
                    "aweme_id": "7619989983668240802",
                    "desc": "Playwright 示例",
                    "video": {"play_addr": {"url_list": [video_url]}},
                }
            ]
        }

        extracted = lesson.extract_first_item_from_payload(payload)

        self.assertEqual(
            extracted,
            {
                "title": "Playwright 示例",
                "video_id": "7619989983668240802",
                "video_url": video_url,
            },
        )
|
|
|
|
|
|
# Allow running this file directly as a script, in addition to via a test runner.
if __name__ == "__main__":
    unittest.main()
|