From 7944ac1f0c7af92f67b1beb45f93e2fcb7dceb29 Mon Sep 17 00:00:00 2001 From: wangshaoqing Date: Wed, 27 May 2026 19:02:00 +0800 Subject: [PATCH] Save visible XHS comments by default --- XHS.py | 83 ++++++++++++++++++++++++++++++++++++++++++++++++++++- test_xhs.py | 57 ++++++++++++++++++++++++++++++++++++ 2 files changed, 139 insertions(+), 1 deletion(-) diff --git a/XHS.py b/XHS.py index d2432ec..1b54fc0 100644 --- a/XHS.py +++ b/XHS.py @@ -252,7 +252,7 @@ def build_download_metadata_record( record["video_source_key"] = candidate.source_key record["downloaded_path"] = output_path.as_posix() record["downloaded_at"] = current_timestamp() - record["comments"] = comments or [] + record["comments"] = comments if comments is not None else record.get("comments") or [] if comments_error: record["comments_error"] = comments_error return record @@ -741,6 +741,78 @@ def run_human_browse_sequence(page: Any, plan: HumanBrowsePlan) -> None: time.sleep(plan.settle_wait) +def count_visible_comments(page: Any) -> int: + script = """ +const xhsVisibleCommentCount = () => { + return document.querySelectorAll( + '.comments-container .comment-item:not(.comment-item-sub), .comment-item:not(.comment-item-sub)' + ).length; +}; +return xhsVisibleCommentCount(); +""" + try: + count = page.run_js(script) + return int(count or 0) + except Exception: + return 0 + + +def scroll_comment_container(page: Any, distance: int = 720) -> bool: + script = f""" +const xhsScrollCommentContainer = () => {{ + const distance = {int(distance)}; + const preferred = Array.from(document.querySelectorAll( + '.comments-container, .comments-el, .comment-list, .interaction-container, .note-scroller, .note-detail-mask' + )); + const scrollables = Array.from(document.querySelectorAll('*')) + .filter((el) => {{ + const className = String(el.className || '').toLowerCase(); + const rect = el.getBoundingClientRect(); + return el.scrollHeight > el.clientHeight + 20 + && rect.width > 240 + && rect.height > 160 + && /comment|interaction|note|detail|right/.test(className); + }}); + const candidates = preferred.concat(scrollables) + .filter((el, index, list) => el && list.indexOf(el) === index) + .filter((el) => el.scrollHeight > el.clientHeight + 20); + const target = candidates[0] || document.scrollingElement || document.documentElement; + if (!target) return false; + target.scrollBy(0, distance); + target.dispatchEvent(new WheelEvent('wheel', {{deltaY: distance, bubbles: true}})); + return true; +}}; +return xhsScrollCommentContainer(); +""" + try: + return bool(page.run_js(script)) + except Exception: + return False + + +def load_visible_comments( + page: Any, + human_settings: HumanBrowseSettings, + max_comments: int = 20, + timeout: float = 8.0, +) -> bool: + if max_comments <= 0: + return False + + deadline = time.monotonic() + max(0.0, timeout) + wait_seconds = 0.3 + if human_settings.enabled: + wait_seconds = max(0.1, min(1.5, human_settings.max_wait)) + + while True: + if count_visible_comments(page) > 0: + return True + if time.monotonic() >= deadline: + return False + scroll_comment_container(page) + time.sleep(wait_seconds) + + def human_pause(settings: HumanBrowseSettings, random_module: Any = random) -> None: if settings.enabled: time.sleep(random_module.uniform(settings.min_wait, settings.max_wait)) @@ -1236,6 +1308,15 @@ def run_queue_download( human_pause(human_settings) if human_settings.enabled: run_human_browse_sequence(page, create_human_browse_plan(human_settings)) + if max_comments > 0: + loaded_comments = load_visible_comments( + page, + human_settings=human_settings, + max_comments=max_comments, + timeout=min(8.0, max(2.0, float(timeout))), + ) + if loaded_comments: + print("[INFO] 已检测到可见评论,开始保存评论元数据。") page_html = page.run_js("return document.documentElement.outerHTML") metadata = merge_metadata( extract_note_metadata_from_html(page_html, note_id=record.note_id), diff --git a/test_xhs.py b/test_xhs.py index 9af60a3..1d657df 100644 --- a/test_xhs.py +++ b/test_xhs.py @@ -38,6 +38,21 @@ class FakeScrollPage: self.scripts.append(script) +class FakeDelayedCommentPage: + def __init__(self): + self.comment_checks = 0 + self.scroll_scripts = [] + + def run_js(self, script): + if "xhsVisibleCommentCount" in script: + self.comment_checks += 1 + return 1 if self.comment_checks >= 2 else 0 + if "xhsScrollCommentContainer" in script: + self.scroll_scripts.append(script) + return True + return None + + class FakeLinkPage: def __init__(self, links): self.links = links @@ -285,6 +300,35 @@ class XhsModuleTests(unittest.TestCase): self.assertEqual(record["downloaded_at"], "2026-05-27T17:00:00+0800") self.assertEqual(record["comments"], []) + def test_build_download_metadata_record_preserves_metadata_comments(self) -> None: + module = importlib.import_module("XHS") + candidate = module.VideoCandidate( + video_id="note123", + title="视频标题", + video_url="https://sns-video.xhscdn.com/a.mp4", + author_name="作者", + source_key="master_url", + ) + base_metadata = { + "comments": [ + { + "author": "评论用户", + "content": "评论内容", + "liked_count": "9", + "time": "1小时前", + } + ] + } + + record = module.build_download_metadata_record( + base_metadata=base_metadata, + candidate=candidate, + queue_record=module.QueueRecord("note123", "https://www.xiaohongshu.com/explore/note123", "video-channel"), + output_path=Path("video/a.mp4"), + ) + + self.assertEqual(record["comments"], base_metadata["comments"]) + def test_extract_note_metadata_from_page_uses_visible_dom(self) -> None: module = importlib.import_module("XHS") metadata = module.extract_note_metadata_from_page(FakeMetadataPage(), note_id="note123", max_comments=20) @@ -618,6 +662,19 @@ class XhsModuleTests(unittest.TestCase): self.assertIn("scrollBy(0, distance)", page.scripts[0]) mocked_sleep.assert_has_calls([mock.call(2.5), mock.call(1.5), mock.call(3.0)]) + def test_load_visible_comments_scrolls_until_comment_dom_exists(self) -> None: + module = importlib.import_module("XHS") + page = FakeDelayedCommentPage() + settings = module.HumanBrowseSettings(enabled=True, min_wait=0.1, max_wait=0.1) + + with mock.patch.object(module.time, "sleep") as mocked_sleep: + loaded = module.load_visible_comments(page, human_settings=settings, max_comments=20, timeout=1.0) + + self.assertTrue(loaded) + self.assertGreaterEqual(page.comment_checks, 2) + self.assertEqual(len(page.scroll_scripts), 1) + mocked_sleep.assert_called_once_with(0.1) + def test_should_take_long_break_uses_every_n_downloads(self) -> None: module = importlib.import_module("XHS") settings = module.HumanBrowseSettings(long_break_every=4)