feat: add search result video crawling

This commit is contained in:
wangshaoqing 2026-05-26 16:18:44 +08:00
parent cc1109628f
commit ca5fe9634a
2 changed files with 215 additions and 20 deletions

142
Douyin.py
View File

@ -20,6 +20,7 @@ import time
from dataclasses import dataclass from dataclasses import dataclass
from pathlib import Path from pathlib import Path
from typing import Any from typing import Any
from urllib.parse import quote
DEFAULT_USER_URL = ( DEFAULT_USER_URL = (
"https://www.douyin.com/user/" "https://www.douyin.com/user/"
@ -30,6 +31,7 @@ DEFAULT_BROWSER_PORT = 9223
LISTEN_TARGET = "web/aweme/post/" LISTEN_TARGET = "web/aweme/post/"
RECOMMENDATION_LISTEN_TARGET = "aweme/v2/web/module/feed/" RECOMMENDATION_LISTEN_TARGET = "aweme/v2/web/module/feed/"
SINGLE_VIDEO_LISTEN_TARGET = "web/aweme/detail/" SINGLE_VIDEO_LISTEN_TARGET = "web/aweme/detail/"
SEARCH_LISTEN_TARGET = "aweme/v1/web/general/search/single"
MAX_FILENAME_BYTES = 240 MAX_FILENAME_BYTES = 240
INVALID_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|\r\n\t]') INVALID_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|\r\n\t]')
RECOMMENDATION_URL_PATTERN = re.compile(r"^https?://www\.douyin\.com/?(?:jingxuan)?(?:\?.*)?$") RECOMMENDATION_URL_PATTERN = re.compile(r"^https?://www\.douyin\.com/?(?:jingxuan)?(?:\?.*)?$")
@ -115,6 +117,10 @@ def build_video_page_url(aweme_id: str) -> str:
return f"https://www.douyin.com/video/{aweme_id}" return f"https://www.douyin.com/video/{aweme_id}"
def build_search_page_url(keyword: str) -> str:
    """Return the Douyin general-search page URL for ``keyword``.

    The keyword is embedded as a single URL path segment, so every reserved
    character is percent-encoded -- including ``/``, which ``quote`` leaves
    untouched by default and which would otherwise split the segment.
    """
    encoded_keyword = quote(keyword, safe="")
    return f"https://www.douyin.com/search/{encoded_keyword}?type=general"
def parse_target_input(value: str, source: str) -> ResolvedTarget: def parse_target_input(value: str, source: str) -> ResolvedTarget:
normalized = value.strip() normalized = value.strip()
if is_recommendation_url(normalized): if is_recommendation_url(normalized):
@ -325,6 +331,25 @@ def parse_single_aweme_item(body: Any) -> dict[str, str]:
raise ValueError("接口响应中缺少可下载的单视频数据。") raise ValueError("接口响应中缺少可下载的单视频数据。")
def parse_search_items(body: Any) -> list[dict[str, str]]:
    """Extract downloadable video items from a search API response body.

    ``body`` must be a dict whose ``"data"`` value is a list. Every dict entry
    in that list that carries a dict under ``"aweme_info"`` contributes that
    dict; all other entries are ignored. The collected dicts are handed to
    ``parse_aweme_items`` wrapped as ``{"aweme_list": [...]}``.

    Raises:
        ValueError: if ``body`` is not a dict or ``body["data"]`` is not a list.
    """
    if not isinstance(body, dict):
        raise ValueError("接口响应不是字典,无法解析。")

    entries = body.get("data")
    if not isinstance(entries, list):
        raise ValueError("搜索接口响应中缺少 data。")

    aweme_infos = [
        entry["aweme_info"]
        for entry in entries
        if isinstance(entry, dict) and isinstance(entry.get("aweme_info"), dict)
    ]
    return parse_aweme_items({"aweme_list": aweme_infos})
def build_headers(referer: str) -> dict[str, str]: def build_headers(referer: str) -> dict[str, str]:
return { return {
"referer": referer, "referer": referer,
@ -659,6 +684,108 @@ def collect_recommendations(
return downloaded return downloaded
def collect_search_results(
    keyword: str,
    max_videos: int,
    timeout: int,
    output_dir: Path,
    browser_port: int | None,
    scroll_settings: ScrollSettings | None = None,
) -> int:
    """Download videos listed on the Douyin search results page for ``keyword``.

    Opens the search page in a browser page, listens for responses matching
    ``SEARCH_LISTEN_TARGET``, parses each captured response with
    ``parse_search_items`` and downloads every video not seen before, scrolling
    between rounds to trigger further requests.

    The loop stops when any of the following holds:

    * ``max_videos`` videos have been saved;
    * ``settings.max_runtime`` is positive and that many seconds have elapsed;
    * six consecutive rounds produced nothing new (no packet, an unparsable
      packet, an empty result list, or a batch from which no video was saved).

    Args:
        keyword: Search keyword used to build the search page URL.
        max_videos: Upper bound on the number of videos to save.
        timeout: Passed through to ``wait_for_aweme_packet``.
        output_dir: Directory passed to ``build_output_path``.
        browser_port: Debug port of an existing browser; when not ``None`` it
            is checked with ``ensure_browser_debug_port_ready`` first.
        scroll_settings: Scrolling/runtime configuration; a default
            ``ScrollSettings()`` is used when omitted.

    Returns:
        The number of videos successfully saved.
    """
    requests_module, chromium_page_cls, chromium_options_cls = import_runtime_dependencies()
    search_url = build_search_page_url(keyword)
    headers = build_headers(search_url)

    if browser_port is not None:
        ensure_browser_debug_port_ready(browser_port)
    page = create_page(chromium_page_cls, chromium_options_cls, browser_port)
    # The listener is started before navigation, as in the original flow.
    page.listen.start(SEARCH_LISTEN_TARGET)
    print(f"[INFO] 正在打开抖音搜索页:{keyword}。若出现登录或验证码,请先在浏览器窗口里完成。")
    page.get(search_url)
    time.sleep(3)

    downloaded = 0
    seen_ids: set[str] = set()
    consecutive_empty = 0
    max_consecutive_empty = 6
    settings = scroll_settings or ScrollSettings()
    started_at = time.monotonic()

    while downloaded < max_videos:
        if settings.max_runtime > 0 and time.monotonic() - started_at >= settings.max_runtime:
            print("[INFO] 已达到最大运行时间,结束抓取。")
            break

        packet = wait_for_aweme_packet(page, timeout=timeout)
        items = None
        if packet is not None:
            try:
                items = parse_search_items(extract_aweme_payload(packet.response))
            except Exception as exc:
                # Best effort: a malformed packet counts as an empty round
                # instead of aborting the whole crawl.
                print(f"[WARN] 解析搜索接口数据失败: {exc}")

        if not items:
            # No packet, unparsable packet, or a packet without any video.
            consecutive_empty += 1
            if consecutive_empty >= max_consecutive_empty:
                if packet is None:
                    print("[INFO] 连续多次未获取到新搜索数据,结束抓取。")
                break
            human_like_scroll(page, settings=settings)
            continue

        new_items_in_batch = 0
        for item in items:
            if item["video_id"] in seen_ids:
                continue
            if downloaded >= max_videos:
                break
            # Marked as seen before downloading, so a failed download is not
            # retried when the same video shows up in a later batch.
            seen_ids.add(item["video_id"])
            output_path = build_output_path(
                title=item["title"],
                video_id=item["video_id"],
                output_dir=output_dir,
                author_name=item.get("author_name"),
            )
            try:
                download_video(
                    requests_module=requests_module,
                    headers=headers,
                    video_url=item["video_url"],
                    output_path=output_path,
                )
            except Exception as exc:
                print(f"[WARN] 下载失败 {item['video_id']}: {exc}")
                continue
            downloaded += 1
            new_items_in_batch += 1
            print(f"[OK] 已保存: {output_path}")

        if new_items_in_batch:
            # Only real progress resets the idle counter. Resetting it for
            # every non-empty batch made the limit below unreachable when the
            # endpoint kept returning already-seen or undownloadable videos.
            consecutive_empty = 0
        else:
            consecutive_empty += 1
            if consecutive_empty >= max_consecutive_empty:
                print("[INFO] 连续多次未获取到新搜索数据,结束抓取。")
                break
        human_like_scroll(page, settings=settings)

    return downloaded
def collect_single_video( def collect_single_video(
target: ResolvedTarget, target: ResolvedTarget,
timeout: int, timeout: int,
@ -732,6 +859,11 @@ def build_parser() -> argparse.ArgumentParser:
default=50, default=50,
help="推荐流最大抓取数量,默认 50", help="推荐流最大抓取数量,默认 50",
) )
parser.add_argument(
"--search-keyword",
default=None,
help="搜索关键词;提供后抓取搜索结果页视频",
)
parser.add_argument( parser.add_argument(
"--scroll-mode", "--scroll-mode",
choices=["human"], choices=["human"],
@ -795,6 +927,16 @@ def main(argv: list[str] | None = None) -> int:
) )
try: try:
if args.search_keyword:
total = collect_search_results(
keyword=args.search_keyword,
max_videos=args.max_videos,
timeout=args.timeout,
output_dir=Path(args.output_dir),
browser_port=args.browser_port,
scroll_settings=scroll_settings,
)
else:
target = resolve_cli_target(args.target, browser_port=args.browser_port) target = resolve_cli_target(args.target, browser_port=args.browser_port)
if target.kind == "creator": if target.kind == "creator":
total = collect_videos( total = collect_videos(

View File

@ -442,6 +442,38 @@ class DouyinModuleTests(unittest.TestCase):
"https://www.douyin.com/video/7619989983668240802", "https://www.douyin.com/video/7619989983668240802",
) )
def test_build_search_page_url_encodes_keyword(self) -> None:
    """A non-ASCII keyword is percent-encoded in the search page URL."""
    douyin = importlib.import_module("Douyin")
    expected_url = "https://www.douyin.com/search/%E7%8C%AB%E5%92%AA?type=general"
    actual_url = douyin.build_search_page_url("猫咪")
    self.assertEqual(actual_url, expected_url)
def test_parse_search_items_extracts_aweme_info(self) -> None:
    """``parse_search_items`` unwraps ``aweme_info`` from each search entry."""
    douyin = importlib.import_module("Douyin")
    aweme_info = {
        "aweme_id": "7319795133048769829",
        "desc": "猫咪视频",
        "author": {"nickname": "奶芙芙", "uid": "75478174642"},
        "video": {
            "play_addr_lowbr": {
                "url_list": ["https://v26-web.douyinvod.com/example/search.mp4"]
            }
        },
    }
    search_response = {"data": [{"type": 1, "aweme_info": aweme_info}]}

    parsed = douyin.parse_search_items(search_response)

    self.assertEqual(len(parsed), 1)
    first = parsed[0]
    self.assertEqual(first["video_id"], "7319795133048769829")
    self.assertEqual(first["author_name"], "奶芙芙")
    self.assertEqual(first["video_url"], "https://v26-web.douyinvod.com/example/search.mp4")
def test_collect_recommendations_downloads_videos_with_author_prefix(self) -> None: def test_collect_recommendations_downloads_videos_with_author_prefix(self) -> None:
module = importlib.import_module("Douyin") module = importlib.import_module("Douyin")
packet = FakePacket( packet = FakePacket(
@ -575,6 +607,27 @@ class DouyinModuleTests(unittest.TestCase):
self.assertEqual(args.reverse_scroll_probability, 0.4) self.assertEqual(args.reverse_scroll_probability, 0.4)
self.assertEqual(args.max_runtime, 600) self.assertEqual(args.max_runtime, 600)
def test_build_parser_has_search_keyword_argument(self) -> None:
    """The CLI parser accepts ``--search-keyword`` and stores its value."""
    douyin = importlib.import_module("Douyin")
    parser = douyin.build_parser()
    parsed = parser.parse_args(["--search-keyword", "猫咪"])
    self.assertEqual(parsed.search_keyword, "猫咪")
def test_main_dispatches_search_flow_for_search_keyword(self) -> None:
    """``main`` routes ``--search-keyword`` to ``collect_search_results``."""
    douyin = importlib.import_module("Douyin")
    captured_stdout = io.StringIO()
    collect_patch = mock.patch.object(douyin, "collect_search_results", return_value=7)

    # Silence CLI output and stub the crawler so no browser is launched.
    with redirect_stdout(captured_stdout), collect_patch as mocked_collect:
        exit_code = douyin.main(["--search-keyword", "猫咪"])

    self.assertEqual(exit_code, 0)
    expected_kwargs = {
        "keyword": "猫咪",
        "max_videos": 50,
        "timeout": 10,
        "output_dir": douyin.Path("video"),
        "browser_port": 9223,
        "scroll_settings": douyin.ScrollSettings(),
    }
    mocked_collect.assert_called_once_with(**expected_kwargs)
def test_build_parser_defaults_to_zero_argument_current_page_flow(self) -> None: def test_build_parser_defaults_to_zero_argument_current_page_flow(self) -> None:
module = importlib.import_module("Douyin") module = importlib.import_module("Douyin")
args = module.build_parser().parse_args([]) args = module.build_parser().parse_args([])