Compare commits
2 Commits
46499446b2
...
452f14da69
| Author | SHA1 | Date | |
|---|---|---|---|
| 452f14da69 | |||
| 4fb4131217 |
56
Douyin.py
56
Douyin.py
@ -12,6 +12,7 @@ from __future__ import annotations
|
|||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
import json
|
import json
|
||||||
|
import random
|
||||||
import re
|
import re
|
||||||
import socket
|
import socket
|
||||||
import sys
|
import sys
|
||||||
@ -139,6 +140,40 @@ def choose_video_url(url_list: list[str]) -> str:
|
|||||||
raise ValueError("url_list 为空,无法选择视频地址。")
|
raise ValueError("url_list 为空,无法选择视频地址。")
|
||||||
|
|
||||||
|
|
||||||
|
def extract_url_list_from_play_addr(play_addr: Any) -> list[str]:
    """Return the usable URLs stored under ``play_addr["url_list"]``.

    Args:
        play_addr: A ``play_addr``-style mapping. Any value that is not a
            dict yields an empty list.

    Returns:
        The entries of ``url_list`` converted to ``str`` with surrounding
        whitespace removed. ``None`` entries and entries that are blank after
        stripping are dropped. An empty list is returned when ``url_list`` is
        missing, falsy, or not a list.
    """
    if not isinstance(play_addr, dict):
        return []

    url_list = play_addr.get("url_list") or []
    if not isinstance(url_list, list):
        return []

    # A null entry would otherwise be stringified to the literal "None" and
    # slip past the blank check, so it is filtered out before conversion.
    stripped_urls = (str(url).strip() for url in url_list if url is not None)
    return [url for url in stripped_urls if url]
|
||||||
|
|
||||||
|
|
||||||
|
def extract_video_url_list(video: Any) -> list[str]:
    """Return the first non-empty URL list found in a ``video`` mapping.

    The top-level address keys ``play_addr``, ``play_addr_h264`` and
    ``play_addr_lowbr`` are tried in that order. If none of them yields any
    URL, the ``play_addr`` of each dict entry in ``video["bit_rate"]`` is
    tried in list order.

    Args:
        video: The ``video`` mapping of an item. Any value that is not a
            dict yields an empty list.

    Returns:
        The first non-empty URL list found, or an empty list when no
        candidate provides any URL.
    """
    if not isinstance(video, dict):
        return []

    # Top-level addresses take priority over the per-bit-rate ones.
    top_level_keys = ("play_addr", "play_addr_h264", "play_addr_lowbr")
    for play_addr in (video.get(key) for key in top_level_keys):
        candidate = extract_url_list_from_play_addr(play_addr)
        if candidate:
            return candidate

    variants = video.get("bit_rate") or []
    if isinstance(variants, list):
        for variant in variants:
            # Entries that are not dicts cannot carry a play_addr; skip them.
            if isinstance(variant, dict):
                candidate = extract_url_list_from_play_addr(variant.get("play_addr"))
                if candidate:
                    return candidate

    return []
|
||||||
|
|
||||||
|
|
||||||
def build_output_path(
|
def build_output_path(
|
||||||
title: str,
|
title: str,
|
||||||
video_id: str,
|
video_id: str,
|
||||||
@ -200,8 +235,7 @@ def parse_aweme_items(body: Any) -> list[dict[str, str]]:
|
|||||||
continue
|
continue
|
||||||
|
|
||||||
video = aweme.get("video") or {}
|
video = aweme.get("video") or {}
|
||||||
play_addr = video.get("play_addr") or {}
|
url_list = extract_video_url_list(video)
|
||||||
url_list = play_addr.get("url_list") or []
|
|
||||||
if not url_list:
|
if not url_list:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
@ -219,7 +253,7 @@ def parse_aweme_items(body: Any) -> list[dict[str, str]]:
|
|||||||
{
|
{
|
||||||
"title": title,
|
"title": title,
|
||||||
"video_id": video_id,
|
"video_id": video_id,
|
||||||
"video_url": choose_video_url([str(url) for url in url_list]),
|
"video_url": choose_video_url(url_list),
|
||||||
"author_name": author_name,
|
"author_name": author_name,
|
||||||
"author_id": author_id,
|
"author_id": author_id,
|
||||||
}
|
}
|
||||||
@ -296,6 +330,14 @@ def scroll_to_next_page(page: Any) -> None:
|
|||||||
time.sleep(2)
|
time.sleep(2)
|
||||||
|
|
||||||
|
|
||||||
|
def human_like_scroll(
    page: Any,
    *,
    min_distance: int = 300,
    max_distance: int = 800,
    min_pause: float = 1.5,
    max_pause: float = 4.0,
) -> None:
    """Simulate human scrolling: a random scroll distance, then a random pause.

    Args:
        page: Object exposing ``run_js(script)``; it receives a
            ``window.scrollBy(0, <distance>);`` script.
        min_distance: Smallest scroll distance that may be drawn (inclusive).
        max_distance: Largest scroll distance that may be drawn (inclusive).
        min_pause: Lower bound, in seconds, of the pause after scrolling.
        max_pause: Upper bound, in seconds, of the pause after scrolling.

    Raises:
        ValueError: If ``min_distance`` is greater than ``max_distance``
            (raised by ``random.randint``).
    """
    scroll_distance = random.randint(min_distance, max_distance)
    page.run_js(f"window.scrollBy(0, {scroll_distance});")

    # The pause length is randomized as well, so consecutive scrolls are not
    # evenly spaced in time.
    time.sleep(random.uniform(min_pause, max_pause))
|
||||||
|
|
||||||
|
|
||||||
def download_video(
|
def download_video(
|
||||||
requests_module: Any,
|
requests_module: Any,
|
||||||
headers: dict[str, str],
|
headers: dict[str, str],
|
||||||
@ -417,7 +459,7 @@ def collect_recommendations(
|
|||||||
if consecutive_empty >= max_consecutive_empty:
|
if consecutive_empty >= max_consecutive_empty:
|
||||||
print("[INFO] 连续多次未获取到新数据,结束抓取。")
|
print("[INFO] 连续多次未获取到新数据,结束抓取。")
|
||||||
break
|
break
|
||||||
scroll_to_next_page(page)
|
human_like_scroll(page)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
try:
|
try:
|
||||||
@ -428,14 +470,14 @@ def collect_recommendations(
|
|||||||
consecutive_empty += 1
|
consecutive_empty += 1
|
||||||
if consecutive_empty >= max_consecutive_empty:
|
if consecutive_empty >= max_consecutive_empty:
|
||||||
break
|
break
|
||||||
scroll_to_next_page(page)
|
human_like_scroll(page)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if not items:
|
if not items:
|
||||||
consecutive_empty += 1
|
consecutive_empty += 1
|
||||||
if consecutive_empty >= max_consecutive_empty:
|
if consecutive_empty >= max_consecutive_empty:
|
||||||
break
|
break
|
||||||
scroll_to_next_page(page)
|
human_like_scroll(page)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
consecutive_empty = 0
|
consecutive_empty = 0
|
||||||
@ -476,7 +518,7 @@ def collect_recommendations(
|
|||||||
if consecutive_empty >= max_consecutive_empty:
|
if consecutive_empty >= max_consecutive_empty:
|
||||||
break
|
break
|
||||||
|
|
||||||
scroll_to_next_page(page)
|
human_like_scroll(page)
|
||||||
|
|
||||||
return downloaded
|
return downloaded
|
||||||
|
|
||||||
|
|||||||
@ -8,6 +8,7 @@ import time
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
DEFAULT_RECOMMENDATION_URL = "https://www.douyin.com/"
|
DEFAULT_RECOMMENDATION_URL = "https://www.douyin.com/"
|
||||||
|
DEFAULT_USER_URL = DEFAULT_RECOMMENDATION_URL
|
||||||
|
|
||||||
DEFAULT_CHROME_PATH = "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"
|
DEFAULT_CHROME_PATH = "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"
|
||||||
DEFAULT_BROWSER_PORT = 9223
|
DEFAULT_BROWSER_PORT = 9223
|
||||||
|
|||||||
@ -54,6 +54,9 @@ class FakeRuntimePage:
|
|||||||
self.url = url
|
self.url = url
|
||||||
|
|
||||||
def run_js(self, script):
    # Scroll scripts are accepted silently so that both the old
    # scroll_to_next_page and the new human_like_scroll can drive this fake;
    # any other script is reported as a test failure.
    accepted_markers = ("window.scrollTo", "window.scrollBy")
    if not any(marker in script for marker in accepted_markers):
        raise AssertionError(f"unexpected scroll script: {script}")
    return None
|
||||||
|
|
||||||
|
|
||||||
@ -291,6 +294,49 @@ class DouyinModuleTests(unittest.TestCase):
|
|||||||
self.assertEqual(items[0]["author_name"], "测试博主")
|
self.assertEqual(items[0]["author_name"], "测试博主")
|
||||||
self.assertEqual(items[0]["author_id"], "123456789")
|
self.assertEqual(items[0]["author_id"], "123456789")
|
||||||
|
|
||||||
|
def test_parse_aweme_items_uses_play_addr_h264_when_play_addr_is_missing(self) -> None:
    # The video carries only play_addr_h264 (no play_addr); the parsed item is
    # expected to expose that address's URL as its video_url.
    expected_url = "https://v26-web.douyinvod.com/example/h264.mp4"
    aweme = {
        "aweme_id": "7619989983668240802",
        "desc": "推荐视频",
        "video": {"play_addr_h264": {"url_list": [expected_url]}},
    }
    douyin = importlib.import_module("Douyin")

    parsed = douyin.parse_aweme_items({"aweme_list": [aweme]})

    self.assertEqual(len(parsed), 1)
    self.assertEqual(parsed[0]["video_url"], expected_url)
|
||||||
|
|
||||||
|
def test_parse_aweme_items_uses_bit_rate_play_addr_when_top_level_addresses_are_missing(self) -> None:
    # The video has no top-level play address; the parsed item is expected to
    # take its video_url from the play_addr of the bit_rate entry.
    expected_url = "https://v11-weba.douyinvod.com/example/bitrate.mp4"
    bit_rate_entry = {
        "format": "mp4",
        "play_addr": {"url_list": [expected_url]},
    }
    aweme = {
        "aweme_id": "7619989983668240802",
        "desc": "推荐视频",
        "video": {"bit_rate": [bit_rate_entry]},
    }
    douyin = importlib.import_module("Douyin")

    parsed = douyin.parse_aweme_items({"aweme_list": [aweme]})

    self.assertEqual(len(parsed), 1)
    self.assertEqual(parsed[0]["video_url"], expected_url)
|
||||||
|
|
||||||
def test_build_video_page_url_uses_aweme_id(self) -> None:
|
def test_build_video_page_url_uses_aweme_id(self) -> None:
|
||||||
module = importlib.import_module("Douyin")
|
module = importlib.import_module("Douyin")
|
||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
|
|||||||
@ -299,6 +299,8 @@ class PlaywrightLearningHelperTests(unittest.TestCase):
|
|||||||
"title": "Playwright 示例",
|
"title": "Playwright 示例",
|
||||||
"video_id": "7619989983668240802",
|
"video_id": "7619989983668240802",
|
||||||
"video_url": "https://v26-web.douyinvod.com/example/single.mp4",
|
"video_url": "https://v26-web.douyinvod.com/example/single.mp4",
|
||||||
|
"author_name": "unknown",
|
||||||
|
"author_id": "unknown",
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user