Filter search queue to video cards

This commit is contained in:
wangshaoqing 2026-05-27 17:35:30 +08:00
parent f247cb1a3a
commit a43168d5a9
2 changed files with 104 additions and 5 deletions

55
XHS.py
View File

@ -674,7 +674,16 @@ def normalize_note_urls(urls: list[str]) -> list[str]:
return [by_note_id[note_id] for note_id in order]
def collect_note_urls_from_page(page: Any, limit: int) -> list[str]:
def collect_note_urls_from_page(page: Any, limit: int, video_only: bool = False) -> list[str]:
if video_only:
script = """
return Array.from(document.querySelectorAll('section.note-item'))
.filter((section) => section.querySelector('.play-icon, use[href="#play-s"], use[xlink\\\\:href="#play-s"]'))
.flatMap((section) => Array.from(section.querySelectorAll('a[href*="/explore/"], a[href*="/search_result/"]')))
.map((a) => a.href || a.getAttribute('href') || '')
.filter(Boolean);
"""
else:
script = """
return Array.from(document.querySelectorAll('a[href*="/explore/"], a[href*="/search_result/"]'))
.map((a) => a.href || a.getAttribute('href') || '')
@ -686,15 +695,45 @@ return Array.from(document.querySelectorAll('a[href*="/explore/"], a[href*="/sea
return normalize_note_urls([str(url) for url in raw_urls])[:limit]
def wait_for_note_urls_from_page(
    page: Any,
    limit: int,
    timeout: float = 8.0,
    interval: float = 0.5,
    video_only: bool = False,
) -> list[str]:
    """Poll the page for note URLs until some appear or the timeout elapses.

    Args:
        page: Page object forwarded unchanged to ``collect_note_urls_from_page``.
        limit: Maximum number of URLs requested from each collection attempt.
        timeout: Seconds to keep polling, measured with ``time.monotonic``.
        interval: Seconds to sleep between two collection attempts.
        video_only: Forwarded to ``collect_note_urls_from_page`` so that every
            attempt applies the same filter.

    Returns:
        The first non-empty collection result, or the (empty) result of the
        attempt that ran once the deadline had passed.
    """
    deadline = time.monotonic() + timeout
    while True:
        # Exactly one collection per poll: an extra unfiltered call would hit
        # the page twice and its result would be overwritten anyway.
        note_urls = collect_note_urls_from_page(page, limit=limit, video_only=video_only)
        # At least one attempt always runs, even with a zero/negative timeout.
        if note_urls or time.monotonic() >= deadline:
            return note_urls
        time.sleep(interval)
def collect_note_urls_with_browse(
    page: Any,
    limit: int,
    human_settings: HumanBrowseSettings,
    rounds: int = 3,
    video_only: bool = False,
) -> list[str]:
    """Gather note URLs over several collect-then-browse rounds.

    Each round calls ``wait_for_note_urls_from_page`` and keeps the URLs whose
    note id (per ``extract_note_id_from_url``) has not been kept before.
    Between rounds, but not after the final one, a browse sequence built from
    ``human_settings`` is run on the page.

    Args:
        page: Page object handed to the collection and browse helpers.
        limit: Stop and return as soon as this many unique URLs are gathered.
        human_settings: Settings passed to ``create_human_browse_plan``.
        rounds: Number of rounds; values below 1 still run a single round.
        video_only: Forwarded to ``wait_for_note_urls_from_page``.

    Returns:
        Unique note URLs in the order they were first seen.
    """
    total_rounds = max(1, rounds)
    unique_urls: list[str] = []
    known_ids: set[str] = set()

    for attempt in range(total_rounds):
        batch = wait_for_note_urls_from_page(page, limit=limit, video_only=video_only)
        for candidate in batch:
            candidate_id = extract_note_id_from_url(candidate)
            if candidate_id not in known_ids:
                known_ids.add(candidate_id)
                unique_urls.append(candidate)
                # Return immediately once the quota is met; no further browsing.
                if len(unique_urls) >= limit:
                    return unique_urls

        # Browsing after the last round would be wasted: nothing collects again.
        has_next_round = attempt + 1 < total_rounds
        if has_next_round:
            browse_plan = create_human_browse_plan(human_settings)
            run_human_browse_sequence(page, browse_plan)

    return unique_urls
def filter_unvisited_note_urls(urls: list[str], visited_note_ids: set[str]) -> list[str]:
    """Return the URLs whose note id is not in ``visited_note_ids``.

    The note id of each URL is obtained with ``extract_note_id_from_url``;
    input order is preserved.
    """
    remaining: list[str] = []
    for url in urls:
        if extract_note_id_from_url(url) in visited_note_ids:
            continue
        remaining.append(url)
    return remaining
@ -835,7 +874,15 @@ def run_queue_download(
else:
page.get(source_url)
human_pause(human_settings)
note_urls = wait_for_note_urls_from_page(page, limit=max(50, target_videos * 2))
note_limit = max(50, target_videos * 3 if source == "search" else target_videos * 2)
browse_rounds = 8 if source == "search" else 2
note_urls = collect_note_urls_with_browse(
page,
limit=note_limit,
human_settings=human_settings,
rounds=browse_rounds,
video_only=source == "search",
)
records = merge_note_urls_into_queue(records, note_urls, source=source)
save_queue(queue_file, records)
added = len(records) - before_count

View File

@ -59,6 +59,36 @@ class FakeDelayedLinkPage:
return ["https://www.xiaohongshu.com/search_result/abc?xsec_token=token"]
class FakeGrowingLinkPage:
    """Page double that reports one link on the first collection and two afterwards."""

    def __init__(self):
        # Number of run_js calls whose script contained "querySelectorAll".
        self.collect_calls = 0

    def run_js(self, script):
        """Return canned links for collection scripts and None for any other script."""
        is_collect_script = "querySelectorAll" in script
        if not is_collect_script:
            return None

        self.collect_calls += 1
        links = ["https://www.xiaohongshu.com/search_result/one?xsec_token=token1"]
        if self.collect_calls != 1:
            # Every call other than the first also reports the second note.
            links.append("https://www.xiaohongshu.com/search_result/two?xsec_token=token2")
        return links
class FakeVideoOnlyLinkPage:
    """Page double that records each script and answers differently for "play-icon" scripts."""

    def __init__(self):
        # Every script passed to run_js, in call order.
        self.scripts = []

    def run_js(self, script):
        """Record the script; return only the video link when it mentions "play-icon"."""
        self.scripts.append(script)
        links = ["https://www.xiaohongshu.com/search_result/video?xsec_token=video-token"]
        if "play-icon" not in script:
            # Scripts without "play-icon" also get the image note, listed first.
            links.insert(0, "https://www.xiaohongshu.com/search_result/image?xsec_token=image-token")
        return links
class XhsModuleTests(unittest.TestCase):
def test_module_can_import_without_optional_runtime_dependencies(self) -> None:
module = importlib.import_module("XHS")
@ -390,6 +420,13 @@ class XhsModuleTests(unittest.TestCase):
)
self.assertIn('/search_result/', page.scripts[0])
def test_collect_note_urls_from_page_can_filter_video_cards(self) -> None:
    # With video_only=True the first script sent to the page must mention
    # "play-icon", and only the video link should come back.
    xhs = importlib.import_module("XHS")
    fake_page = FakeVideoOnlyLinkPage()
    expected_urls = ["https://www.xiaohongshu.com/search_result/video?xsec_token=video-token"]

    collected = xhs.collect_note_urls_from_page(fake_page, limit=10, video_only=True)

    self.assertEqual(collected, expected_urls)
    first_script = fake_page.scripts[0]
    self.assertIn("play-icon", first_script)
def test_wait_for_note_urls_from_page_polls_until_links_are_rendered(self) -> None:
module = importlib.import_module("XHS")
page = FakeDelayedLinkPage()
@ -398,6 +435,21 @@ class XhsModuleTests(unittest.TestCase):
self.assertEqual(urls, ["https://www.xiaohongshu.com/search_result/abc?xsec_token=token"])
mocked_sleep.assert_called_once_with(0.1)
def test_collect_note_urls_with_browse_accumulates_after_scroll(self) -> None:
    # Two rounds against a page whose link list grows: both links must be
    # returned in first-seen order, with exactly one browse between rounds.
    xhs = importlib.import_module("XHS")
    fake_page = FakeGrowingLinkPage()
    browse_settings = xhs.HumanBrowseSettings(enabled=False)
    expected_urls = [
        "https://www.xiaohongshu.com/search_result/one?xsec_token=token1",
        "https://www.xiaohongshu.com/search_result/two?xsec_token=token2",
    ]

    with mock.patch.object(xhs, "run_human_browse_sequence") as browse_spy:
        collected = xhs.collect_note_urls_with_browse(
            fake_page,
            limit=10,
            human_settings=browse_settings,
            rounds=2,
        )

    self.assertEqual(collected, expected_urls)
    browse_spy.assert_called_once()
def test_filter_unvisited_note_urls_skips_seen_note_ids(self) -> None:
module = importlib.import_module("XHS")
urls = [