Add XHS explore batch download

2026-05-27 15:29:23 +08:00 · 2026-05-27 15:29:23 +08:00 · 12c2009950
commit 12c2009950
parent 16f262ada1
3 changed files with 122 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -5,3 +5,4 @@ __pycache__/
 .xhs-chrome-profile/
 video/
 video_bad_*/
+video_good_*/
--- a/XHS.py
+++ b/XHS.py
@ -10,6 +10,7 @@ import time
 from dataclasses import dataclass
 from pathlib import Path
 from typing import Any
+from urllib.parse import urljoin

 DEFAULT_EXPLORE_URL = "https://www.xiaohongshu.com/explore"
 DEFAULT_BROWSER_PORT = 9224
@ -368,6 +369,21 @@ def collect_videos(
    consecutive_empty = 0
    max_consecutive_empty = 6

+    if not use_current_page:
+        downloaded += collect_videos_from_explore_cards(
+            page=page,
+            requests_module=requests_module,
+            output_dir=output_dir,
+            max_videos=max_videos,
+            start_url=start_url,
+            seen_urls=seen_urls,
+            seen_files=seen_files,
+        )
+        if downloaded >= max_videos:
+            return downloaded
+        page.get(start_url)
+        time.sleep(2)
+
    while downloaded < max_videos and consecutive_empty < max_consecutive_empty:
        packet = wait_for_feed_packet(page, timeout=timeout)
        if packet is None:
@ -429,6 +445,55 @@ def collect_videos(
    return downloaded


+def collect_videos_from_explore_cards(
+    page: Any,
+    requests_module: Any,
+    output_dir: Path,
+    max_videos: int,
+    start_url: str,
+    seen_urls: set[str],
+    seen_files: set[Path],
+) -> int:
+    """Visit note links found on the current page and download their videos.
+
+    Note URLs are collected from the page (at most ``max_videos * 8`` of
+    them). Each note is opened in ``page``, the document's outer HTML is
+    scanned for video candidates, and candidates are downloaded until
+    ``max_videos`` files have been saved by this call.
+
+    ``seen_urls`` and ``seen_files`` are updated in place. A video URL is
+    added to ``seen_urls`` before its download is attempted, so a later
+    candidate with the same URL is skipped even if that download failed.
+    Download errors are reported with a warning and do not stop the loop.
+
+    Returns:
+        The number of videos saved by this call.
+    """
+    saved = 0
+    for note_url in collect_note_urls_from_page(page, limit=max_videos * 8):
+        if saved >= max_videos:
+            break
+
+        page.get(note_url)
+        # Fixed pause after navigation; presumably lets the note page render
+        # before its HTML is read.
+        time.sleep(2)
+
+        html = page.run_js("return document.documentElement.outerHTML")
+        note_id = extract_note_id_from_url(note_url)
+        candidates = group_video_candidates(
+            extract_video_candidates_from_html(html, video_id=note_id)
+        )
+        if not candidates:
+            continue
+
+        for candidate in candidates:
+            if saved >= max_videos:
+                break
+
+            video_url = candidate.video_url
+            if video_url in seen_urls:
+                continue
+            seen_urls.add(video_url)
+
+            output_path = build_output_path(candidate, output_dir=output_dir)
+            already_present = output_path in seen_files or output_path.exists()
+            if already_present:
+                continue
+
+            try:
+                download_video(
+                    requests_module=requests_module,
+                    headers=build_headers(start_url),
+                    video_url=video_url,
+                    output_path=output_path,
+                )
+            except Exception as exc:
+                print(f"[WARN] 下载失败 {candidate.video_id}: {exc}")
+            else:
+                saved += 1
+                seen_files.add(output_path)
+                print(f"[OK] 已保存: {output_path}")
+    return saved
+
+
 def extract_note_id_from_url(url: str) -> str:
    match = re.search(r"/explore/([^/?#]+)", url)
    if match:
@ -436,6 +501,34 @@ def extract_note_id_from_url(url: str) -> str:
    return "current-page"


+def normalize_note_urls(urls: list[str]) -> list[str]:
+    """Return one absolute URL per note, in order of first appearance.
+
+    Each input is resolved against ``https://www.xiaohongshu.com``. Inputs
+    for which ``extract_note_id_from_url`` yields ``"current-page"`` are
+    dropped. For every remaining note id, the last input URL containing
+    ``xsec_token=`` is used; when no such URL exists, a canonical
+    ``/explore/<id>?xsec_source=pc_feed`` URL is generated instead.
+    """
+    # A dict keeps keys in insertion order, so the first sighting of a note
+    # id fixes its position and later overwrites do not move it.
+    chosen: dict[str, str] = {}
+    for raw in urls:
+        absolute = urljoin("https://www.xiaohongshu.com", str(raw))
+        note_id = extract_note_id_from_url(absolute)
+        if note_id == "current-page":
+            continue
+        if "xsec_token=" in absolute:
+            chosen[note_id] = absolute
+        else:
+            chosen.setdefault(
+                note_id,
+                f"https://www.xiaohongshu.com/explore/{note_id}?xsec_source=pc_feed",
+            )
+    return list(chosen.values())
+
+
+def collect_note_urls_from_page(page: Any, limit: int) -> list[str]:
+    """Read explore links from ``page`` and return up to ``limit`` note URLs.
+
+    Runs a script in the page that gathers the ``href`` of every anchor
+    whose ``href`` contains ``/explore/``. The result is passed through
+    ``normalize_note_urls`` and then sliced to ``limit``. An empty list is
+    returned when the script result is not a list.
+    """
+    script = """
+return Array.from(document.querySelectorAll('a[href*="/explore/"]'))
+  .map((a) => a.href || a.getAttribute('href') || '')
+  .filter(Boolean);
+"""
+    hrefs = page.run_js(script)
+    if not hrefs:
+        # Treat any falsy result (e.g. None) as "no links found".
+        hrefs = []
+    if isinstance(hrefs, list):
+        return normalize_note_urls([str(href) for href in hrefs])[:limit]
+    return []
+
+
 def build_parser() -> argparse.ArgumentParser:
    parser = argparse.ArgumentParser(description="附着到已登录小红书 Chrome，监听 feed 响应并下载视频")
    parser.add_argument("--max-videos", type=int, default=10, help="最多下载视频数量，默认 10")
--- a/test_xhs.py
+++ b/test_xhs.py
@ -245,6 +245,34 @@ class XhsModuleTests(unittest.TestCase):
        )
        output_path.write_bytes.assert_called_once_with(b"\x00\x00\x00\x18ftypmp42payload")

+    def test_normalize_note_urls_deduplicates_explore_links(self) -> None:
+        # Two links to note "abc" collapse into one entry, the relative link
+        # is made absolute, and the profile link (not a note) is dropped.
+        xhs = importlib.import_module("XHS")
+        raw_links = [
+            "https://www.xiaohongshu.com/explore/abc",
+            "https://www.xiaohongshu.com/explore/abc?xsec_token=token",
+            "/explore/def?xsec_token=token",
+            "https://www.xiaohongshu.com/user/profile/123",
+        ]
+        expected = [
+            "https://www.xiaohongshu.com/explore/abc?xsec_token=token",
+            "https://www.xiaohongshu.com/explore/def?xsec_token=token",
+        ]
+        self.assertEqual(xhs.normalize_note_urls(raw_links), expected)
+
+    def test_normalize_note_urls_prefers_xsec_token_url_for_same_note(self) -> None:
+        # When the same note appears with and without a token, the URL that
+        # carries xsec_token must be the one returned, unchanged.
+        xhs = importlib.import_module("XHS")
+        plain_url = "https://www.xiaohongshu.com/explore/abc"
+        token_url = "https://www.xiaohongshu.com/explore/abc?xsec_token=token&xsec_source="
+        result = xhs.normalize_note_urls([plain_url, token_url])
+        self.assertEqual(result, [token_url])
+

 if __name__ == "__main__":
    unittest.main()