Add XHS explore batch download

This commit is contained in:
wangshaoqing 2026-05-27 15:29:23 +08:00
parent 16f262ada1
commit 12c2009950
3 changed files with 122 additions and 0 deletions

1
.gitignore vendored
View File

@ -5,3 +5,4 @@ __pycache__/
.xhs-chrome-profile/ .xhs-chrome-profile/
video/ video/
video_bad_*/ video_bad_*/
video_good_*/

93
XHS.py
View File

@ -10,6 +10,7 @@ import time
from dataclasses import dataclass from dataclasses import dataclass
from pathlib import Path from pathlib import Path
from typing import Any from typing import Any
from urllib.parse import urljoin
DEFAULT_EXPLORE_URL = "https://www.xiaohongshu.com/explore" DEFAULT_EXPLORE_URL = "https://www.xiaohongshu.com/explore"
DEFAULT_BROWSER_PORT = 9224 DEFAULT_BROWSER_PORT = 9224
@ -368,6 +369,21 @@ def collect_videos(
consecutive_empty = 0 consecutive_empty = 0
max_consecutive_empty = 6 max_consecutive_empty = 6
if not use_current_page:
downloaded += collect_videos_from_explore_cards(
page=page,
requests_module=requests_module,
output_dir=output_dir,
max_videos=max_videos,
start_url=start_url,
seen_urls=seen_urls,
seen_files=seen_files,
)
if downloaded >= max_videos:
return downloaded
page.get(start_url)
time.sleep(2)
while downloaded < max_videos and consecutive_empty < max_consecutive_empty: while downloaded < max_videos and consecutive_empty < max_consecutive_empty:
packet = wait_for_feed_packet(page, timeout=timeout) packet = wait_for_feed_packet(page, timeout=timeout)
if packet is None: if packet is None:
@ -429,6 +445,55 @@ def collect_videos(
return downloaded return downloaded
def collect_videos_from_explore_cards(
    page: Any,
    requests_module: Any,
    output_dir: Path,
    max_videos: int,
    start_url: str,
    seen_urls: set[str],
    seen_files: set[Path],
) -> int:
    """Visit note links found on the current page and download their videos.

    Note links are gathered once from the page as it is when this function
    is called (at most ``max_videos * 8`` of them). Each note is then opened
    in ``page``, its rendered HTML is scanned for video candidates, and every
    candidate not yet seen is downloaded until ``max_videos`` files have been
    saved.

    ``seen_urls`` and ``seen_files`` are mutated in place: a candidate URL is
    recorded as seen before its download is attempted, and an output path is
    recorded only after the download succeeded. The browser is left on the
    last note that was opened.

    Returns:
        The number of videos saved by this call.
    """
    saved = 0
    for note_url in collect_note_urls_from_page(page, limit=max_videos * 8):
        if saved >= max_videos:
            break

        page.get(note_url)
        # Fixed pause after navigation before the DOM is read.
        time.sleep(2)

        page_html = page.run_js("return document.documentElement.outerHTML")
        note_id = extract_note_id_from_url(note_url)
        candidates = group_video_candidates(
            extract_video_candidates_from_html(page_html, video_id=note_id)
        )
        if not candidates:
            continue

        for candidate in candidates:
            if saved >= max_videos:
                break

            video_url = candidate.video_url
            if video_url in seen_urls:
                continue
            # Recorded before the download attempt, so a failed URL is not
            # tried again through these shared sets.
            seen_urls.add(video_url)

            target = build_output_path(candidate, output_dir=output_dir)
            if target in seen_files or target.exists():
                continue

            try:
                download_video(
                    requests_module=requests_module,
                    headers=build_headers(start_url),
                    video_url=video_url,
                    output_path=target,
                )
            except Exception as exc:
                # Best effort: report the failure and move on to the next candidate.
                print(f"[WARN] 下载失败 {candidate.video_id}: {exc}")
            else:
                saved += 1
                seen_files.add(target)
                print(f"[OK] 已保存: {target}")
    return saved
def extract_note_id_from_url(url: str) -> str: def extract_note_id_from_url(url: str) -> str:
match = re.search(r"/explore/([^/?#]+)", url) match = re.search(r"/explore/([^/?#]+)", url)
if match: if match:
@ -436,6 +501,34 @@ def extract_note_id_from_url(url: str) -> str:
return "current-page" return "current-page"
def normalize_note_urls(urls: list[str]) -> list[str]:
    """Resolve, filter and de-duplicate note links, one URL per note id.

    Each entry is resolved against ``https://www.xiaohongshu.com``. Entries
    for which ``extract_note_id_from_url`` yields the ``"current-page"``
    sentinel are dropped. For every remaining note id exactly one URL is
    returned, in the order the ids were first encountered:

    * the most recent resolved URL containing ``xsec_token=``, if any;
    * otherwise a canonical ``/explore/<id>?xsec_source=pc_feed`` URL.
    """
    # A dict keeps insertion order, so overwriting an existing key later
    # preserves the position of the note's first appearance.
    chosen: dict[str, str] = {}
    for raw in urls:
        absolute = urljoin("https://www.xiaohongshu.com", str(raw))
        note_id = extract_note_id_from_url(absolute)
        if note_id == "current-page":
            continue
        if "xsec_token=" in absolute:
            # A link carrying a token always replaces whatever was stored.
            chosen[note_id] = absolute
        elif note_id not in chosen:
            chosen[note_id] = f"https://www.xiaohongshu.com/explore/{note_id}?xsec_source=pc_feed"
    return list(chosen.values())
def collect_note_urls_from_page(page: Any, limit: int) -> list[str]:
    """Return up to ``limit`` normalized note URLs found in the page's anchors.

    Runs a JavaScript snippet in ``page`` that collects the ``href`` of every
    ``<a>`` element whose href contains ``/explore/``. If the script yields a
    truthy value that is not a list, an empty list is returned. Otherwise the
    values are stringified, passed through ``normalize_note_urls`` and
    truncated with ``[:limit]``.
    """
    js_lines = (
        "",
        "return Array.from(document.querySelectorAll('a[href*=\"/explore/\"]'))",
        ".map((a) => a.href || a.getAttribute('href') || '')",
        ".filter(Boolean);",
        "",
    )
    anchors = page.run_js("\n".join(js_lines))
    if not anchors:
        # Any falsy script result is treated as "no links".
        anchors = []
    if isinstance(anchors, list):
        hrefs = [str(href) for href in anchors]
        return normalize_note_urls(hrefs)[:limit]
    return []
def build_parser() -> argparse.ArgumentParser: def build_parser() -> argparse.ArgumentParser:
parser = argparse.ArgumentParser(description="附着到已登录小红书 Chrome监听 feed 响应并下载视频") parser = argparse.ArgumentParser(description="附着到已登录小红书 Chrome监听 feed 响应并下载视频")
parser.add_argument("--max-videos", type=int, default=10, help="最多下载视频数量,默认 10") parser.add_argument("--max-videos", type=int, default=10, help="最多下载视频数量,默认 10")

View File

@ -245,6 +245,34 @@ class XhsModuleTests(unittest.TestCase):
) )
output_path.write_bytes.assert_called_once_with(b"\x00\x00\x00\x18ftypmp42payload") output_path.write_bytes.assert_called_once_with(b"\x00\x00\x00\x18ftypmp42payload")
def test_normalize_note_urls_deduplicates_explore_links(self) -> None:
    # Two links to the same note collapse into one entry, a relative link is
    # made absolute, and a non-/explore/ link is dropped.
    xhs = importlib.import_module("XHS")
    raw_links = [
        "https://www.xiaohongshu.com/explore/abc",
        "https://www.xiaohongshu.com/explore/abc?xsec_token=token",
        "/explore/def?xsec_token=token",
        "https://www.xiaohongshu.com/user/profile/123",
    ]
    expected = [
        "https://www.xiaohongshu.com/explore/abc?xsec_token=token",
        "https://www.xiaohongshu.com/explore/def?xsec_token=token",
    ]

    self.assertEqual(xhs.normalize_note_urls(raw_links), expected)
def test_normalize_note_urls_prefers_xsec_token_url_for_same_note(self) -> None:
    # When the same note appears with and without an xsec_token, the token
    # variant is returned unchanged.
    xhs = importlib.import_module("XHS")
    token_url = "https://www.xiaohongshu.com/explore/abc?xsec_token=token&xsec_source="
    raw_links = ["https://www.xiaohongshu.com/explore/abc", token_url]

    self.assertEqual(xhs.normalize_note_urls(raw_links), [token_url])
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()