Add XHS explore batch download

This commit is contained in:
wangshaoqing 2026-05-27 15:29:23 +08:00
parent 16f262ada1
commit 12c2009950
3 changed files with 122 additions and 0 deletions

1
.gitignore vendored
View File

@@ -5,3 +5,4 @@ __pycache__/
.xhs-chrome-profile/
video/
video_bad_*/
video_good_*/

93
XHS.py
View File

@@ -10,6 +10,7 @@ import time
from dataclasses import dataclass
from pathlib import Path
from typing import Any
from urllib.parse import urljoin
# URL of the Xiaohongshu "explore" page.
DEFAULT_EXPLORE_URL = "https://www.xiaohongshu.com/explore"
# NOTE(review): presumably the debugging port of the Chrome instance this
# script attaches to — confirm against where the constant is used.
DEFAULT_BROWSER_PORT = 9224
@@ -368,6 +369,21 @@ def collect_videos(
consecutive_empty = 0
max_consecutive_empty = 6
if not use_current_page:
downloaded += collect_videos_from_explore_cards(
page=page,
requests_module=requests_module,
output_dir=output_dir,
max_videos=max_videos,
start_url=start_url,
seen_urls=seen_urls,
seen_files=seen_files,
)
if downloaded >= max_videos:
return downloaded
page.get(start_url)
time.sleep(2)
while downloaded < max_videos and consecutive_empty < max_consecutive_empty:
packet = wait_for_feed_packet(page, timeout=timeout)
if packet is None:
@@ -429,6 +445,55 @@ def collect_videos(
return downloaded
def collect_videos_from_explore_cards(
    page: Any,
    requests_module: Any,
    output_dir: Path,
    max_videos: int,
    start_url: str,
    seen_urls: set[str],
    seen_files: set[Path],
) -> int:
    """Open note links found on the current page and download their videos.

    Up to ``max_videos * 8`` note URLs are gathered from the page that is
    loaded when this function is called. Each note is opened in ``page``,
    given two seconds to load, and its HTML is scanned for video candidates.

    Args:
        page: Browser page object; must provide ``get(url)`` and
            ``run_js(script)``.
        requests_module: HTTP module handed through to ``download_video``.
        output_dir: Directory passed to ``build_output_path``.
        max_videos: Stop once this many videos were saved by this call.
        start_url: URL passed to ``build_headers`` for every download.
        seen_urls: Video URLs already handled; updated in place. A URL is
            recorded as seen before the output-file check, so it is never
            retried within the same run even when nothing was downloaded.
        seen_files: Output paths already written; updated in place after
            each successful download.

    Returns:
        The number of videos saved by this call.

    Note:
        The function does not navigate back; ``page`` is left on the last
        note that was opened.
    """
    saved = 0
    for note_url in collect_note_urls_from_page(page, limit=max_videos * 8):
        if saved >= max_videos:
            break

        page.get(note_url)
        time.sleep(2)

        # Same evaluation order as a nested call: page HTML first, note id second.
        page_html = page.run_js("return document.documentElement.outerHTML")
        note_id = extract_note_id_from_url(note_url)
        candidates = group_video_candidates(
            extract_video_candidates_from_html(page_html, video_id=note_id)
        )
        if not candidates:
            continue

        for candidate in candidates:
            if saved >= max_videos:
                break

            video_url = candidate.video_url
            if video_url in seen_urls:
                continue
            seen_urls.add(video_url)

            output_path = build_output_path(candidate, output_dir=output_dir)
            already_on_disk = output_path in seen_files or output_path.exists()
            if already_on_disk:
                continue

            try:
                download_video(
                    requests_module=requests_module,
                    headers=build_headers(start_url),
                    video_url=candidate.video_url,
                    output_path=output_path,
                )
            except Exception as exc:
                # Best effort: one failed download must not abort the batch.
                print(f"[WARN] 下载失败 {candidate.video_id}: {exc}")
            else:
                saved += 1
                seen_files.add(output_path)
                print(f"[OK] 已保存: {output_path}")
    return saved
def extract_note_id_from_url(url: str) -> str:
match = re.search(r"/explore/([^/?#]+)", url)
if match:
@@ -436,6 +501,34 @@ def extract_note_id_from_url(url: str) -> str:
return "current-page"
def normalize_note_urls(urls: list[str]) -> list[str]:
    """Return one absolute note URL per note id, in first-seen order.

    Every input is resolved against ``https://www.xiaohongshu.com``. Inputs
    for which ``extract_note_id_from_url`` yields ``"current-page"`` (no
    note id found) are dropped. For each remaining note id:

    * if any of its URLs contains ``xsec_token=``, the last such URL is kept
      verbatim (after resolution);
    * otherwise a canonical ``/explore/<id>?xsec_source=pc_feed`` URL is
      generated.

    Args:
        urls: Raw ``href`` values, absolute or site-relative.

    Returns:
        The selected URLs, ordered by the first appearance of each note id.
    """
    # dict keeps insertion order and re-assigning a key keeps its position,
    # so the mapping itself records the first-seen order of note ids.
    selected: dict[str, str] = {}
    for raw in urls:
        absolute = urljoin("https://www.xiaohongshu.com", str(raw))
        note_id = extract_note_id_from_url(absolute)
        if note_id == "current-page":
            continue
        if "xsec_token=" in absolute:
            selected[note_id] = absolute
        elif note_id not in selected:
            selected[note_id] = f"https://www.xiaohongshu.com/explore/{note_id}?xsec_source=pc_feed"
    return list(selected.values())
def collect_note_urls_from_page(page: Any, limit: int) -> list[str]:
    """Collect normalized note URLs from the anchors of the loaded page.

    Runs a script in ``page`` that returns the ``href`` of every anchor
    whose ``href`` contains ``/explore/``, then passes the values through
    ``normalize_note_urls``.

    Args:
        page: Browser page object; must provide ``run_js(script)``.
        limit: Upper bound applied as a slice to the normalized list.

    Returns:
        At most ``limit`` normalized URLs, or an empty list when the script
        result is not a list.
    """
    script = """
return Array.from(document.querySelectorAll('a[href*="/explore/"]'))
.map((a) => a.href || a.getAttribute('href') || '')
.filter(Boolean);
"""
    hrefs = page.run_js(script)
    if not hrefs:
        # A falsy script result is treated as "no links".
        hrefs = []
    if isinstance(hrefs, list):
        normalized = normalize_note_urls([str(href) for href in hrefs])
        return normalized[:limit]
    return []
def build_parser() -> argparse.ArgumentParser:
parser = argparse.ArgumentParser(description="附着到已登录小红书 Chrome监听 feed 响应并下载视频")
parser.add_argument("--max-videos", type=int, default=10, help="最多下载视频数量,默认 10")

View File

@@ -245,6 +245,34 @@ class XhsModuleTests(unittest.TestCase):
)
output_path.write_bytes.assert_called_once_with(b"\x00\x00\x00\x18ftypmp42payload")
def test_normalize_note_urls_deduplicates_explore_links(self) -> None:
    # Duplicate note ids collapse to one entry, site-relative links become
    # absolute, and links that are not /explore/ notes are dropped.
    xhs = importlib.import_module("XHS")
    raw_links = [
        "https://www.xiaohongshu.com/explore/abc",
        "https://www.xiaohongshu.com/explore/abc?xsec_token=token",
        "/explore/def?xsec_token=token",
        "https://www.xiaohongshu.com/user/profile/123",
    ]
    expected = [
        "https://www.xiaohongshu.com/explore/abc?xsec_token=token",
        "https://www.xiaohongshu.com/explore/def?xsec_token=token",
    ]

    self.assertEqual(xhs.normalize_note_urls(raw_links), expected)
def test_normalize_note_urls_prefers_xsec_token_url_for_same_note(self) -> None:
    # When the same note appears with and without xsec_token, the token
    # URL is the one that must be returned, unchanged.
    xhs = importlib.import_module("XHS")
    plain_link = "https://www.xiaohongshu.com/explore/abc"
    token_link = "https://www.xiaohongshu.com/explore/abc?xsec_token=token&xsec_source="

    result = xhs.normalize_note_urls([plain_link, token_link])

    self.assertEqual(result, [token_link])
# Run the test suite when this file is executed directly.
if __name__ == "__main__":
    unittest.main()