Compare commits

..

No commits in common. "452f14da69560751fe7b7303c3aafdc6962800e4" and "46499446b2ee59a6ba58b584f034d9eb7f2b18da" have entirely different histories.

4 changed files with 7 additions and 98 deletions

View File

@ -12,7 +12,6 @@ from __future__ import annotations
import argparse
import json
import random
import re
import socket
import sys
@ -140,40 +139,6 @@ def choose_video_url(url_list: list[str]) -> str:
raise ValueError("url_list 为空,无法选择视频地址。")
def extract_url_list_from_play_addr(play_addr: Any) -> list[str]:
    """Return the usable URLs stored under ``play_addr["url_list"]``.

    Args:
        play_addr: A play-address mapping that may carry a ``url_list``
            key. Any value that is not a dict is treated as "no address".

    Returns:
        The non-blank string entries of ``url_list``, in their original
        order and with surrounding whitespace removed. An empty list is
        returned when ``play_addr`` is not a dict, when ``url_list`` is
        missing or not a list, or when no entry qualifies.
    """
    if not isinstance(play_addr, dict):
        return []

    url_list = play_addr.get("url_list")
    if not isinstance(url_list, list):
        return []

    # Only genuine strings qualify: converting arbitrary entries with str()
    # would turn a null entry into the bogus URL "None".
    stripped = (url.strip() for url in url_list if isinstance(url, str))
    return [url for url in stripped if url]
def extract_video_url_list(video: Any) -> list[str]:
    """Return the first non-empty URL list found in a video mapping.

    Candidates are tried in this order: the top-level ``play_addr``,
    ``play_addr_h264`` and ``play_addr_lowbr`` entries, then the
    ``play_addr`` of each dict entry in the ``bit_rate`` list.

    Args:
        video: The video mapping to inspect. Any non-dict value yields
            an empty result.

    Returns:
        The URL list of the first candidate that has one, or an empty
        list when no candidate provides any URL.
    """
    if not isinstance(video, dict):
        return []

    # Top-level addresses take priority over the per-bit-rate ones.
    candidates = [
        video.get(key)
        for key in ("play_addr", "play_addr_h264", "play_addr_lowbr")
    ]

    variants = video.get("bit_rate") or []
    if isinstance(variants, list):
        candidates.extend(
            variant.get("play_addr")
            for variant in variants
            if isinstance(variant, dict)
        )

    for candidate in candidates:
        urls = extract_url_list_from_play_addr(candidate)
        if urls:
            return urls
    return []
def build_output_path(
title: str,
video_id: str,
@ -235,7 +200,8 @@ def parse_aweme_items(body: Any) -> list[dict[str, str]]:
continue
video = aweme.get("video") or {}
url_list = extract_video_url_list(video)
play_addr = video.get("play_addr") or {}
url_list = play_addr.get("url_list") or []
if not url_list:
continue
@ -253,7 +219,7 @@ def parse_aweme_items(body: Any) -> list[dict[str, str]]:
{
"title": title,
"video_id": video_id,
"video_url": choose_video_url(url_list),
"video_url": choose_video_url([str(url) for url in url_list]),
"author_name": author_name,
"author_id": author_id,
}
@ -330,14 +296,6 @@ def scroll_to_next_page(page: Any) -> None:
time.sleep(2)
def human_like_scroll(
    page: Any,
    *,
    min_distance: int = 300,
    max_distance: int = 800,
    min_pause: float = 1.5,
    max_pause: float = 4.0,
) -> None:
    """Scroll the page by a random distance, then pause for a random time.

    Mimics a human scrolling: both the scroll distance and the pause
    that follows are drawn at random on every call.

    Args:
        page: Object exposing ``run_js(script)``; the scroll is issued as
            a ``window.scrollBy`` script.
        min_distance: Smallest scroll distance passed to ``scrollBy``
            (inclusive).
        max_distance: Largest scroll distance passed to ``scrollBy``
            (inclusive).
        min_pause: Lower bound, in seconds, of the pause after scrolling.
        max_pause: Upper bound, in seconds, of the pause after scrolling.

    Raises:
        ValueError: If ``min_distance`` is greater than ``max_distance``
            (raised by ``random.randint``).
    """
    scroll_distance = random.randint(min_distance, max_distance)
    page.run_js(f"window.scrollBy(0, {scroll_distance});")
    # The pause is randomised as well so consecutive scrolls are not evenly spaced.
    time.sleep(random.uniform(min_pause, max_pause))
def download_video(
requests_module: Any,
headers: dict[str, str],
@ -459,7 +417,7 @@ def collect_recommendations(
if consecutive_empty >= max_consecutive_empty:
print("[INFO] 连续多次未获取到新数据,结束抓取。")
break
human_like_scroll(page)
scroll_to_next_page(page)
continue
try:
@ -470,14 +428,14 @@ def collect_recommendations(
consecutive_empty += 1
if consecutive_empty >= max_consecutive_empty:
break
human_like_scroll(page)
scroll_to_next_page(page)
continue
if not items:
consecutive_empty += 1
if consecutive_empty >= max_consecutive_empty:
break
human_like_scroll(page)
scroll_to_next_page(page)
continue
consecutive_empty = 0
@ -518,7 +476,7 @@ def collect_recommendations(
if consecutive_empty >= max_consecutive_empty:
break
human_like_scroll(page)
scroll_to_next_page(page)
return downloaded

View File

@ -8,7 +8,6 @@ import time
from pathlib import Path
# Default URL for recommendations: the Douyin home page.
DEFAULT_RECOMMENDATION_URL = "https://www.douyin.com/"
# Default user URL; an alias of the recommendation URL above.
DEFAULT_USER_URL = DEFAULT_RECOMMENDATION_URL
# Default Chrome executable location (a macOS application-bundle path).
DEFAULT_CHROME_PATH = "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"
# Default browser port — presumably the remote-debugging port; TODO confirm against the caller.
DEFAULT_BROWSER_PORT = 9223

View File

@ -54,9 +54,6 @@ class FakeRuntimePage:
self.url = url
def run_js(self, script):
    """Accept scroll scripts silently and reject every other script.

    Both ``window.scrollTo`` and ``window.scrollBy`` scripts are allowed,
    so the older ``scroll_to_next_page`` and the newer
    ``human_like_scroll`` helpers can both run against this fake page.

    Raises:
        AssertionError: If ``script`` contains neither scroll call.
    """
    scroll_markers = ("window.scrollTo", "window.scrollBy")
    is_scroll_script = any(marker in script for marker in scroll_markers)
    if not is_scroll_script:
        raise AssertionError(f"unexpected scroll script: {script}")
    return None
@ -294,49 +291,6 @@ class DouyinModuleTests(unittest.TestCase):
self.assertEqual(items[0]["author_name"], "测试博主")
self.assertEqual(items[0]["author_id"], "123456789")
def test_parse_aweme_items_uses_play_addr_h264_when_play_addr_is_missing(self) -> None:
    """Check that ``play_addr_h264`` is used when ``play_addr`` is absent."""
    douyin = importlib.import_module("Douyin")

    # The video carries only the h264 address; there is no "play_addr" key.
    video = {
        "play_addr_h264": {
            "url_list": ["https://v26-web.douyinvod.com/example/h264.mp4"]
        }
    }
    aweme = {
        "aweme_id": "7619989983668240802",
        "desc": "推荐视频",
        "video": video,
    }

    parsed = douyin.parse_aweme_items({"aweme_list": [aweme]})

    self.assertEqual(len(parsed), 1)
    self.assertEqual(
        parsed[0]["video_url"],
        "https://v26-web.douyinvod.com/example/h264.mp4",
    )
def test_parse_aweme_items_uses_bit_rate_play_addr_when_top_level_addresses_are_missing(self) -> None:
    """Check that a ``bit_rate`` entry's ``play_addr`` is used as a fallback."""
    douyin = importlib.import_module("Douyin")

    # No top-level play address at all: the only URL lives inside "bit_rate".
    bit_rate_entry = {
        "format": "mp4",
        "play_addr": {
            "url_list": ["https://v11-weba.douyinvod.com/example/bitrate.mp4"]
        },
    }
    aweme = {
        "aweme_id": "7619989983668240802",
        "desc": "推荐视频",
        "video": {"bit_rate": [bit_rate_entry]},
    }

    parsed = douyin.parse_aweme_items({"aweme_list": [aweme]})

    self.assertEqual(len(parsed), 1)
    self.assertEqual(
        parsed[0]["video_url"],
        "https://v11-weba.douyinvod.com/example/bitrate.mp4",
    )
def test_build_video_page_url_uses_aweme_id(self) -> None:
module = importlib.import_module("Douyin")
self.assertEqual(

View File

@ -299,8 +299,6 @@ class PlaywrightLearningHelperTests(unittest.TestCase):
"title": "Playwright 示例",
"video_id": "7619989983668240802",
"video_url": "https://v26-web.douyinvod.com/example/single.mp4",
"author_name": "unknown",
"author_id": "unknown",
},
)