- 新增 DrissionPage 基础教程(01-05) - 新增 Playwright 基础教程(01-05) - 新增网络基础教程(01-05) - 新增 test_learning_examples.py 单元测试 - 更新 .gitignore 忽略 learning/*/output/ 目录
88 lines
2.7 KiB
Python
88 lines
2.7 KiB
Python
"""
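Exercise 05: download a video with requests, using API data captured by Playwright.

Goals:
1. Use Playwright to find the target API response
2. Extract the first downloadable video from the JSON
3. Write the mp4 to local disk with `requests`
4. Print the final file path

Tips:
- The browser's job is to "get the API data behind the page"
- `requests`' job is to "download the actual mp4"
- Get a single video working first; don't start with batch downloads

Run:
    ./.venv/bin/python learning/playwright_basics/05_download_video.py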
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
# Three levels up from this file. This block must run before the
# `from Douyin import ...` lines below so that the import can be resolved
# from PROJECT_ROOT regardless of the current working directory.
PROJECT_ROOT = Path(__file__).resolve().parent.parent.parent
# Prepend only once so repeated imports of this module do not stack
# duplicate entries onto sys.path.
if not sys.path.count(str(PROJECT_ROOT)):
    sys.path.insert(0, str(PROJECT_ROOT))
|
|
|
|
from Douyin import build_headers
|
|
from Douyin import DEFAULT_USER_URL
|
|
from Douyin import download_video
|
|
from Douyin import parse_aweme_items
|
|
from Douyin import sanitize_filename
|
|
|
|
# Substring looked for in response URLs to pick out the response main() waits for.
LISTEN_TARGET = "web/aweme/post/"
# Browser profile directory handed to launch_persistent_context; lives under PROJECT_ROOT.
USER_DATA_DIR = PROJECT_ROOT.joinpath(".playwright-douyin-profile")
# Relative path: it resolves against the current working directory, which
# matches the run command in the module docstring (started from the project root).
OUTPUT_DIR = Path("learning/playwright_basics/output")
|
|
|
|
|
|
def build_output_path(title: str, video_id: str, output_dir: Path = OUTPUT_DIR) -> Path:
    """Return the ``.mp4`` path inside *output_dir* for one video.

    The file name has the form ``<stem>-<video_id>.mp4``, where the stem is
    whatever ``sanitize_filename`` returns for *title* (called with
    ``"playwright-video"`` as its ``fallback`` argument).

    This only builds the path; nothing is created on disk.
    """
    stem = sanitize_filename(title, fallback="playwright-video")
    file_name = f"{stem}-{video_id}.mp4"
    return output_dir / file_name
|
|
|
|
|
|
def extract_first_item_from_payload(payload) -> dict[str, str]:
    """Return the first item that ``parse_aweme_items`` extracts from *payload*.

    Raises:
        RuntimeError: if ``parse_aweme_items`` yields nothing for this payload.
    """
    candidates = parse_aweme_items(payload)
    if candidates:
        return candidates[0]
    raise RuntimeError("当前接口里没有可下载视频,请先确认页面已加载出作品。")
|
|
|
|
|
|
def main() -> None:
    """Capture the target API response with Playwright and download its first video.

    Steps:
      1. Launch a headed Chromium persistent context on ``USER_DATA_DIR`` and
         open ``DEFAULT_USER_URL``.
      2. Reload the page while waiting (up to 20 s) for a response whose URL
         contains ``LISTEN_TARGET``.
      3. Take the first item parsed from that response's JSON body.
      4. Pass its ``video_url`` to ``download_video`` along with the
         ``requests`` module and headers built from the current page URL.
      5. Print the path that was written.

    The browser context is closed on every exit path, including failures.

    Raises:
        RuntimeError: if the captured payload contains no items
            (raised by ``extract_first_item_from_payload``).
        Exception: anything raised by Playwright (for example when no matching
            response arrives in time) or by ``download_video`` propagates
            unchanged.
    """
    # Imported here rather than at module level, so importing this module for
    # its helper functions does not require requests or Playwright.
    import requests
    from playwright.sync_api import sync_playwright

    with sync_playwright() as playwright:
        context = playwright.chromium.launch_persistent_context(
            user_data_dir=str(USER_DATA_DIR),
            headless=False,
        )
        try:
            # A persistent context may already have a page open; reuse it if so.
            page = context.pages[0] if context.pages else context.new_page()
            page.goto(DEFAULT_USER_URL)
            print("[INFO] 页面已打开。请在必要时滚动一下,等待作品接口出现。")

            # The listener is registered before the reload, so the request
            # triggered by the reload is the one that gets captured.
            with page.expect_response(
                lambda response: LISTEN_TARGET in response.url,
                timeout=20000,
            ) as response_info:
                page.reload()

            response = response_info.value
            payload = response.json()
            first_item = extract_first_item_from_payload(payload)
            output_path = build_output_path(
                title=first_item["title"],
                video_id=first_item["video_id"],
                output_dir=OUTPUT_DIR,
            )
            # download_video is defined elsewhere and may or may not create the
            # target directory; make sure it exists instead of relying on that.
            output_path.parent.mkdir(parents=True, exist_ok=True)

            headers = build_headers(page.url)
            download_video(
                requests_module=requests,
                headers=headers,
                video_url=first_item["video_url"],
                output_path=output_path,
            )
            print(f"[OK] 已下载到: {output_path}")
        finally:
            # Always close the persistent context explicitly, even when a step
            # above raised, instead of leaving cleanup to the interpreter exit.
            context.close()


if __name__ == "__main__":
    main()
|