# xhs_video_crawler/XHS.py

from __future__ import annotations

import argparse
import json
import re
import socket
import sys
import time
from dataclasses import dataclass
from pathlib import Path
from typing import Any

# Page opened when the crawl starts (default value of --start-url).
DEFAULT_EXPLORE_URL = "https://www.xiaohongshu.com/explore"
# Default Chrome remote-debugging port (default value of --browser-port).
DEFAULT_BROWSER_PORT = 9224
# Default directory that downloaded videos are written to.
DEFAULT_OUTPUT_DIR = Path("video")
# Target passed to page.listen.start() in collect_videos.
LISTEN_TARGET = "feed"
# UTF-8 byte budget for a generated file name (see build_output_path).
MAX_FILENAME_BYTES = 240
# Characters that sanitize_filename replaces with "_".
INVALID_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|\r\n\t]')
# Dict keys whose string (or list-of-string) values are inspected as possible video URLs.
VIDEO_URL_KEYS = {"master_url", "backup_url", "backup_urls", "url"}
# Keys tried, in this order, when looking up a note's title, ID and author name.
TITLE_KEYS = ("display_title", "title", "desc", "description")
ID_KEYS = ("id", "note_id", "noteId", "video_id", "file_id")
AUTHOR_KEYS = ("nickname", "name", "user_name", "userName")


@dataclass(frozen=True)
class VideoCandidate:
    """One candidate video URL together with the note metadata found around it."""

    # Identifier of the note/video; "unknown" when find_nearest_note_context found none.
    video_id: str
    # Note title; "untitled" when none was found.
    title: str
    # The candidate URL, stripped of surrounding whitespace by append_candidate.
    video_url: str
    # Author display name; "unknown" when none was found.
    author_name: str
    # The dict key the URL was stored under (one of VIDEO_URL_KEYS).
    source_key: str


def sanitize_filename(value: str, fallback: str = "untitled") -> str:
    """Turn ``value`` into a string that is safe to use inside a file name.

    Every character matched by ``INVALID_FILENAME_CHARS`` becomes ``_``, then
    leading/trailing spaces, dots and underscores are trimmed.

    Args:
        value: Raw text (for example a title or author name).
        fallback: Returned when nothing is left after cleaning.

    Returns:
        The cleaned text, or ``fallback`` if the cleaned text is empty.
    """
    replaced = INVALID_FILENAME_CHARS.sub("_", value)
    trimmed = replaced.strip(" ._")
    if trimmed:
        return trimmed
    return fallback


def truncate_utf8_bytes(value: str, max_bytes: int) -> str:
    """Shorten ``value`` so its UTF-8 encoding fits in ``max_bytes`` bytes.

    Args:
        value: Text to shorten.
        max_bytes: Maximum encoded size; values below zero behave like zero.

    Returns:
        ``value`` unchanged when it already fits. Otherwise the longest prefix
        of whole characters that fits, with trailing spaces, dots and
        underscores removed.
    """
    encoded = value.encode("utf-8")
    if len(encoded) <= max_bytes:
        return value

    # Cut at the byte limit; errors="ignore" discards a multi-byte character
    # that the cut split in half, so only complete characters survive.
    limit = max(max_bytes, 0)
    clipped = encoded[:limit].decode("utf-8", errors="ignore")
    return clipped.rstrip(" ._")


def looks_like_video_url(value: str) -> bool:
    """Return True when ``value`` is an http(s) URL containing a video marker.

    Surrounding whitespace is ignored. A URL qualifies when it contains any of
    the substrings ``.mp4``, ``sns-video`` or ``xhscdn.com``.
    """
    url = value.strip()
    if not url.startswith(("http://", "https://")):
        return False
    # NOTE(review): "xhscdn.com" matches every URL on that domain, not only
    # video files -- confirm that this breadth is intended.
    markers = (".mp4", "sns-video", "xhscdn.com")
    return any(marker in url for marker in markers)


def first_string_by_keys(value: Any, keys: tuple[str, ...]) -> str | None:
    """Depth-first search of nested dicts/lists for a non-blank string.

    For a dict, its own entries are checked first, trying ``keys`` in the
    given order; only then are its values searched recursively in iteration
    order. Lists are searched element by element. Any other type yields None.

    Args:
        value: Arbitrary decoded JSON-like data.
        keys: Dict keys to look for, in priority order.

    Returns:
        The first matching string with surrounding whitespace stripped, or
        None when no non-blank string is stored under any of ``keys``.
    """
    if isinstance(value, dict):
        for key in keys:
            direct = value.get(key)
            if isinstance(direct, str) and direct.strip():
                return direct.strip()
        children = value.values()
    elif isinstance(value, list):
        children = value
    else:
        return None

    for child in children:
        nested = first_string_by_keys(child, keys)
        if nested:
            return nested
    return None


def find_nearest_note_context(path: tuple[Any, ...]) -> dict[str, str]:
    """Collect ID, title and author for a URL from its enclosing dicts.

    ``path`` lists the containers leading to the URL, outermost first. The
    dicts in it are visited innermost first, and each field keeps the first
    value found. ID and title are searched through the whole subtree of a
    node; the author is only read from a node's own ``user`` or ``author``
    entry when that entry is a dict.

    Returns:
        A dict with the keys ``video_id``, ``title`` and ``author_name``.
        Fields that were not found hold ``"unknown"``, ``"untitled"`` and
        ``"unknown"`` respectively.
    """
    note_id = note_title = note_author = ""

    enclosing_dicts = (node for node in reversed(path) if isinstance(node, dict))
    for node in enclosing_dicts:
        # ``or`` short-circuits, so a field that is already set is not searched again.
        note_id = note_id or first_string_by_keys(node, ID_KEYS) or ""
        note_title = note_title or first_string_by_keys(node, TITLE_KEYS) or ""
        if note_author:
            continue
        owner = node.get("user") or node.get("author")
        if isinstance(owner, dict):
            note_author = first_string_by_keys(owner, AUTHOR_KEYS) or ""

    return {
        "video_id": note_id if note_id else "unknown",
        "title": note_title if note_title else "untitled",
        "author_name": note_author if note_author else "unknown",
    }


def append_candidate(
    candidates: list[VideoCandidate],
    url: str,
    source_key: str,
    path: tuple[Any, ...],
) -> None:
    """Append a ``VideoCandidate`` for ``url`` if it looks like a video URL.

    Args:
        candidates: List that receives the new candidate (mutated in place).
        url: Raw URL string taken from the payload.
        source_key: Dict key the URL was stored under.
        path: Enclosing containers, used to look up ID, title and author.
    """
    if looks_like_video_url(url):
        note = find_nearest_note_context(path)
        candidate = VideoCandidate(
            video_id=note["video_id"],
            title=note["title"],
            video_url=url.strip(),
            author_name=note["author_name"],
            source_key=source_key,
        )
        candidates.append(candidate)


def walk_for_video_candidates(value: Any, path: tuple[Any, ...], candidates: list[VideoCandidate]) -> None:
    """Recursively scan ``value`` and record every video URL candidate.

    Args:
        value: Current node of the decoded payload.
        path: Dicts enclosing ``value``, outermost first. Lists are walked
            through but are not added to the path.
        candidates: Output list, extended in place.
    """
    if isinstance(value, list):
        for element in value:
            walk_for_video_candidates(element, path, candidates)
        return
    if not isinstance(value, dict):
        return

    extended_path = (*path, value)
    for key, child in value.items():
        if key in VIDEO_URL_KEYS:
            # A URL key may hold one string or a list of strings.
            if isinstance(child, str):
                url_values = [child]
            elif isinstance(child, list):
                url_values = child
            else:
                url_values = []
            for url_value in url_values:
                if isinstance(url_value, str):
                    append_candidate(candidates, url_value, key, extended_path)
        # Always descend as well: nested containers may hold further URLs.
        walk_for_video_candidates(child, extended_path, candidates)


def extract_video_candidates(payload: Any) -> list[VideoCandidate]:
    """Return every video URL candidate found anywhere inside ``payload``."""
    found: list[VideoCandidate] = []
    root_path: tuple[Any, ...] = ()
    walk_for_video_candidates(payload, root_path, found)
    return found


def choose_video_candidate(candidates: list[VideoCandidate]) -> VideoCandidate:
    """Pick the preferred candidate according to the key its URL came from.

    Preference order is ``master_url``, ``backup_url``, ``backup_urls``,
    ``url``; any other key ranks last. Ties go to the earliest candidate.

    Raises:
        ValueError: If ``candidates`` is empty.
    """
    if not candidates:
        raise ValueError("没有可用的视频候选地址。")

    rank_by_source = {"master_url": 0, "backup_url": 1, "backup_urls": 2, "url": 3}

    def rank(candidate: VideoCandidate) -> int:
        return rank_by_source.get(candidate.source_key, 99)

    # min() returns the first of several equally ranked items.
    return min(candidates, key=rank)


def group_video_candidates(candidates: list[VideoCandidate]) -> list[VideoCandidate]:
    """Collapse candidates that belong to the same video into one entry each.

    Candidates are grouped by ``video_id``. When no real ID is available the
    URL itself is the grouping key, so unrelated ID-less videos are not merged
    into a single group and silently dropped.

    Args:
        candidates: Candidates in discovery order.

    Returns:
        One candidate per group, chosen by ``choose_video_candidate``, in the
        order in which each group first appeared.
    """
    grouped: dict[str, list[VideoCandidate]] = {}
    for candidate in candidates:
        # find_nearest_note_context reports a missing ID as the placeholder
        # "unknown", which is truthy; treat it like an absent ID.
        has_real_id = bool(candidate.video_id) and candidate.video_id != "unknown"
        key = candidate.video_id if has_real_id else candidate.video_url
        # dicts keep insertion order, so first-seen group order is preserved.
        grouped.setdefault(key, []).append(candidate)
    return [choose_video_candidate(group) for group in grouped.values()]


def build_output_path(candidate: VideoCandidate, output_dir: Path = DEFAULT_OUTPUT_DIR) -> Path:
    """Build the destination path ``<output_dir>/[author]title-id.mp4``.

    Each part is sanitized first. The title is truncated so that the whole
    file name stays within ``MAX_FILENAME_BYTES`` UTF-8 bytes; if the author
    prefix and ID suffix alone leave no room, the prefix is truncated too.
    At least a one-byte budget is always requested for both.

    Args:
        candidate: Video whose metadata names the file.
        output_dir: Directory the file name is joined onto.

    Returns:
        The output path; nothing is created on disk.
    """
    author_part = sanitize_filename(candidate.author_name, fallback="unknown")
    title_part = sanitize_filename(candidate.title, fallback="untitled")
    id_part = sanitize_filename(candidate.video_id, fallback="unknown")

    prefix = f"[{author_part}]"
    suffix = f"-{id_part}.mp4"
    suffix_size = len(suffix.encode("utf-8"))

    title_budget = MAX_FILENAME_BYTES - len(prefix.encode("utf-8")) - suffix_size
    if title_budget < 1:
        # Prefix and suffix already fill the name: shrink the prefix, keeping
        # one byte in reserve for the title, then recompute what is left.
        prefix = truncate_utf8_bytes(prefix, max(1, MAX_FILENAME_BYTES - suffix_size - 1))
        title_budget = MAX_FILENAME_BYTES - len(prefix.encode("utf-8")) - suffix_size

    short_title = truncate_utf8_bytes(title_part, max(1, title_budget))
    return output_dir / f"{prefix}{short_title}{suffix}"


def build_browser_address(browser_port: int | None) -> str | None:
    """Return the loopback ``host:port`` address for ``browser_port``.

    Returns None when ``browser_port`` is None.
    """
    return None if browser_port is None else f"127.0.0.1:{browser_port}"


def ensure_browser_debug_port_ready(browser_port: int) -> None:
    """Check that something accepts TCP connections on the local debug port.

    A connection to ``127.0.0.1:browser_port`` is opened with a 2 second
    timeout and closed again immediately.

    Raises:
        RuntimeError: If the connection attempt fails with ``OSError``; the
            original error is attached as the cause.
    """
    endpoint = ("127.0.0.1", browser_port)
    try:
        probe = socket.create_connection(endpoint, timeout=2)
        probe.close()
    except OSError as exc:
        message = (
            "无法连接到已启动的 Chrome 调试端口。"
            f"请先运行 `./.venv/bin/python login_xhs.py --browser-port {browser_port}`，"
            "并确认 Chrome 仍在运行且端口一致。"
        )
        raise RuntimeError(message) from exc


def build_headers(referer: str) -> dict[str, str]:
    """Return the HTTP headers sent with a video download request.

    Args:
        referer: Value for the ``referer`` header.

    Returns:
        A dict with ``referer`` and a fixed desktop-Chrome ``user-agent``.
    """
    user_agent = (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/135.0.0.0 Safari/537.36"
    )
    headers = {"referer": referer}
    headers["user-agent"] = user_agent
    return headers


def import_runtime_dependencies() -> tuple[Any, Any, Any]:
    """Import the third-party packages lazily, with a friendly failure mode.

    Returns:
        The tuple ``(requests, ChromiumPage, ChromiumOptions)``.

    Raises:
        SystemExit: With an install hint when ``requests`` or ``DrissionPage``
            is not installed.
    """
    try:
        import requests
    except ModuleNotFoundError as error:
        hint = "缺少 requests，请先执行: python3 -m pip install requests"
        raise SystemExit(hint) from error

    try:
        from DrissionPage import ChromiumOptions, ChromiumPage
    except ModuleNotFoundError as error:
        hint = "缺少 DrissionPage，请先执行: python3 -m pip install DrissionPage"
        raise SystemExit(hint) from error

    return requests, ChromiumPage, ChromiumOptions


def create_page(chromium_page_cls: Any, chromium_options_cls: Any, browser_port: int | None) -> Any:
    """Create the browser page object.

    Args:
        chromium_page_cls: Page class to instantiate.
        chromium_options_cls: Options class used to describe the browser to
            attach to.
        browser_port: Debug port of a running browser, or None to construct
            the page with no options.

    Returns:
        The object returned by ``chromium_page_cls``.
    """
    address = build_browser_address(browser_port)
    if address is not None:
        # Configure the options with the address and existing_only(True).
        attach_options = chromium_options_cls().set_address(address).existing_only(True)
        return chromium_page_cls(attach_options)
    return chromium_page_cls()


def extract_feed_payload(response: Any) -> dict[str, Any]:
    """Return the JSON object carried by a captured response.

    ``response.body`` is used when it is already a dict. Otherwise a non-blank
    string in ``response.raw_body`` is parsed as JSON.

    Raises:
        ValueError: If neither attribute yields a dict. Invalid JSON text
            surfaces as ``json.JSONDecodeError``, a ``ValueError`` subclass.
    """
    parsed_body = getattr(response, "body", None)
    if isinstance(parsed_body, dict):
        return parsed_body

    text = getattr(response, "raw_body", None)
    has_text = isinstance(text, str) and bool(text.strip())
    decoded = json.loads(text) if has_text else None
    if not isinstance(decoded, dict):
        raise ValueError("响应体不是可解析的 JSON 字典。")
    return decoded


def download_video(
    requests_module: Any,
    headers: dict[str, str],
    video_url: str,
    output_path: Path,
) -> None:
    """Download ``video_url`` and store it at ``output_path``.

    The data is first written to a sibling ``<name>.part`` file and then
    renamed onto ``output_path``. A failed or interrupted write therefore
    never leaves a truncated file under the final name, which callers would
    otherwise treat as an already-downloaded video.

    Args:
        requests_module: Object providing a ``requests``-style ``get``.
        headers: HTTP headers sent with the request.
        video_url: Address of the video file.
        output_path: Final location; parent directories are created.

    Raises:
        Exception: Whatever ``get``, ``raise_for_status`` or the file system
            operations raise is propagated unchanged.
    """
    response = requests_module.get(video_url, headers=headers, timeout=60)
    response.raise_for_status()
    output_path.parent.mkdir(parents=True, exist_ok=True)

    partial_path = output_path.with_name(f"{output_path.name}.part")
    try:
        partial_path.write_bytes(response.content)
        partial_path.replace(output_path)
    except BaseException:
        # Best-effort cleanup so no stray .part file survives; the original
        # error is always the one that propagates.
        try:
            partial_path.unlink()
        except OSError:
            pass
        raise


def wait_for_feed_packet(page: Any, timeout: int) -> Any | None:
    """Wait for the next captured packet from ``page.listen``.

    Args:
        page: Object whose ``listen.wait(timeout=...)`` delivers packets.
        timeout: Value forwarded to ``listen.wait``.

    Returns:
        The packet, or None when ``wait`` returned a falsy value or raised.
        Exceptions are reported with a warning on stdout, not propagated.
    """
    try:
        return page.listen.wait(timeout=timeout) or None
    except Exception as error:
        print(f"[WARN] 等待 feed 数据超时或失败: {error}")
    return None


def scroll_feed(page: Any, distance: int = 900, pause_seconds: float = 2) -> None:
    """Scroll the page down by running a script in it, then pause.

    The script picks the largest element (by bounding-box area) that is wider
    than 300px, taller than 200px and has more than 20px of hidden vertical
    content. If none qualifies it falls back to the document's scrolling
    element. The chosen target is scrolled with ``scrollBy``.

    Args:
        page: Object exposing ``run_js(script)``.
        distance: Vertical scroll distance in pixels.
        pause_seconds: Time to sleep after scrolling. Defaults to the
            previously hard-coded 2 seconds; zero or less skips the pause.
    """
    script = f"""
const distance = {distance};
const candidates = Array.from(document.querySelectorAll('*'))
  .filter((el) => {{
    const rect = el.getBoundingClientRect();
    return rect.width > 300
      && rect.height > 200
      && el.scrollHeight > el.clientHeight + 20;
  }})
  .sort((a, b) => {{
    const areaA = a.getBoundingClientRect().width * a.getBoundingClientRect().height;
    const areaB = b.getBoundingClientRect().width * b.getBoundingClientRect().height;
    return areaB - areaA;
  }});
const target = candidates[0] || document.scrollingElement || document.documentElement;
target.scrollBy(0, distance);
"""
    page.run_js(script)
    if pause_seconds > 0:
        time.sleep(pause_seconds)


def collect_videos(
    max_videos: int,
    timeout: int,
    output_dir: Path,
    browser_port: int | None,
    start_url: str = DEFAULT_EXPLORE_URL,
) -> int:
    """Listen for feed responses in the browser and download the videos found.

    The loop ends once ``max_videos`` files were saved, or after six
    consecutive rounds that produced nothing new (no packet, an unparsable
    packet, or only URLs already seen in this run).

    Args:
        max_videos: Maximum number of videos to save.
        timeout: Timeout forwarded to each wait for a feed packet.
        output_dir: Directory the videos are written to.
        browser_port: Debug port of the browser to attach to; when None the
            port check is skipped and the page is created without options.
        start_url: Page loaded before listening for feed data.

    Returns:
        The number of videos saved during this call.

    Raises:
        SystemExit: If a required third-party package is missing.
        RuntimeError: If the debug port does not accept connections.
    """
    requests_module, chromium_page_cls, chromium_options_cls = import_runtime_dependencies()
    if browser_port is not None:
        ensure_browser_debug_port_ready(browser_port)
    page = create_page(chromium_page_cls, chromium_options_cls, browser_port)
    # Start listening before the page is loaded.
    page.listen.start(LISTEN_TARGET)

    print("[INFO] 正在打开小红书发现页。若出现登录或验证码，请先在浏览器窗口里完成。")
    page.get(start_url)
    time.sleep(3)

    downloaded = 0
    # URLs already handled in this run, whether or not their download succeeded.
    seen_urls: set[str] = set()
    # Output paths written during this run.
    seen_files: set[Path] = set()
    # Rounds in a row without any fresh candidate; reset on the next fresh one.
    consecutive_empty = 0
    max_consecutive_empty = 6

    while downloaded < max_videos and consecutive_empty < max_consecutive_empty:
        packet = wait_for_feed_packet(page, timeout=timeout)
        if packet is None:
            # Nothing captured: scroll to provoke another feed request.
            consecutive_empty += 1
            scroll_feed(page)
            continue

        try:
            payload = extract_feed_payload(packet.response)
            candidates = group_video_candidates(extract_video_candidates(payload))
        except Exception as exc:
            print(f"[WARN] 解析 feed 数据失败: {exc}")
            consecutive_empty += 1
            scroll_feed(page)
            continue

        fresh_candidates = [candidate for candidate in candidates if candidate.video_url not in seen_urls]
        if not fresh_candidates:
            consecutive_empty += 1
            scroll_feed(page)
            continue

        consecutive_empty = 0
        for candidate in fresh_candidates:
            if downloaded >= max_videos:
                break
            # Marked as seen before downloading, so a failed URL is not retried in this run.
            seen_urls.add(candidate.video_url)
            output_path = build_output_path(candidate, output_dir=output_dir)
            # Skip files saved earlier in this run or already present on disk.
            if output_path in seen_files or output_path.exists():
                continue
            # Use the page's current URL as referer, falling back to start_url.
            headers = build_headers(getattr(page, "url", start_url) or start_url)
            try:
                download_video(
                    requests_module=requests_module,
                    headers=headers,
                    video_url=candidate.video_url,
                    output_path=output_path,
                )
            except Exception as exc:
                # Best effort: report the failure and move on to the next candidate.
                print(f"[WARN] 下载失败 {candidate.video_id}: {exc}")
                continue

            downloaded += 1
            seen_files.add(output_path)
            print(f"[OK] 已保存: {output_path}")

        if downloaded < max_videos:
            scroll_feed(page)

    if downloaded == 0:
        print("[WARN] 没有下载到视频。请确认已登录小红书、页面已加载 feed，并在浏览器中滚动后重试。")
    return downloaded


def build_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for the crawler.

    Options: ``--max-videos``, ``--output-dir``, ``--browser-port``,
    ``--timeout`` and ``--start-url``, each with a default value.
    """
    parser = argparse.ArgumentParser(description="附着到已登录小红书 Chrome，监听 feed 响应并下载视频")

    # (flag, add_argument keyword arguments), registered in this order.
    option_specs = (
        ("--max-videos", {"type": int, "default": 10, "help": "最多下载视频数量，默认 10"}),
        ("--output-dir", {"default": str(DEFAULT_OUTPUT_DIR), "help": "视频保存目录，默认 video"}),
        ("--browser-port", {"type": int, "default": DEFAULT_BROWSER_PORT, "help": "Chrome 调试端口，默认 9224"}),
        ("--timeout", {"type": int, "default": 20, "help": "等待单次 feed 响应的秒数，默认 20"}),
        ("--start-url", {"default": DEFAULT_EXPLORE_URL, "help": "打开或刷新使用的小红书页面"}),
    )
    for flag, settings in option_specs:
        parser.add_argument(flag, **settings)
    return parser


def main(argv: list[str] | None = None) -> int:
    """Parse the command line, run the crawl and report the result.

    Args:
        argv: Argument list to parse; None makes argparse read ``sys.argv``.

    Returns:
        Always 0. Invalid option values end the process through
        ``parser.error`` instead of returning.
    """
    parser = build_parser()
    args = parser.parse_args(argv)
    if args.max_videos <= 0:
        parser.error("--max-videos 必须大于 0")
    if args.browser_port <= 0:
        parser.error("--browser-port 必须大于 0")
    # Reject ports above the TCP range here, as a normal usage error, instead
    # of letting them fail later inside the socket layer.
    if args.browser_port > 65535:
        parser.error("--browser-port 不能大于 65535")
    downloaded = collect_videos(
        max_videos=args.max_videos,
        timeout=args.timeout,
        output_dir=Path(args.output_dir),
        browser_port=args.browser_port,
        start_url=args.start_url,
    )
    print(f"[INFO] 本次共下载 {downloaded} 个视频。")
    return 0


# Script entry point: run main() and use its return value as the exit status.
if __name__ == "__main__":
    sys.exit(main())