Filter search queue to video cards
This commit is contained in:
parent
f247cb1a3a
commit
a43168d5a9
55
XHS.py
55
XHS.py
@ -674,7 +674,16 @@ def normalize_note_urls(urls: list[str]) -> list[str]:
|
||||
return [by_note_id[note_id] for note_id in order]
|
||||
|
||||
|
||||
def collect_note_urls_from_page(page: Any, limit: int) -> list[str]:
|
||||
def collect_note_urls_from_page(page: Any, limit: int, video_only: bool = False) -> list[str]:
|
||||
if video_only:
|
||||
script = """
|
||||
return Array.from(document.querySelectorAll('section.note-item'))
|
||||
.filter((section) => section.querySelector('.play-icon, use[href="#play-s"], use[xlink\\\\:href="#play-s"]'))
|
||||
.flatMap((section) => Array.from(section.querySelectorAll('a[href*="/explore/"], a[href*="/search_result/"]')))
|
||||
.map((a) => a.href || a.getAttribute('href') || '')
|
||||
.filter(Boolean);
|
||||
"""
|
||||
else:
|
||||
script = """
|
||||
return Array.from(document.querySelectorAll('a[href*="/explore/"], a[href*="/search_result/"]'))
|
||||
.map((a) => a.href || a.getAttribute('href') || '')
|
||||
@ -686,15 +695,45 @@ return Array.from(document.querySelectorAll('a[href*="/explore/"], a[href*="/sea
|
||||
return normalize_note_urls([str(url) for url in raw_urls])[:limit]
|
||||
|
||||
|
||||
def wait_for_note_urls_from_page(
    page: Any,
    limit: int,
    timeout: float = 8.0,
    interval: float = 0.5,
    video_only: bool = False,
) -> list[str]:
    """Poll ``page`` for note URLs until some appear or ``timeout`` elapses.

    Args:
        page: Page handle forwarded to ``collect_note_urls_from_page``.
        limit: Maximum number of URLs requested from each collection attempt.
        timeout: Seconds (measured with ``time.monotonic``) to keep polling.
        interval: Seconds to sleep between two collection attempts.
        video_only: Forwarded unchanged to ``collect_note_urls_from_page``.

    Returns:
        The first non-empty collection result, or the last (empty) result
        once the deadline has passed.
    """
    stop_at = time.monotonic() + timeout
    found = collect_note_urls_from_page(page, limit=limit, video_only=video_only)
    # The clock is only consulted when the latest attempt came back empty.
    while not found and time.monotonic() < stop_at:
        time.sleep(interval)
        found = collect_note_urls_from_page(page, limit=limit, video_only=video_only)
    return found
|
||||
|
||||
|
||||
def collect_note_urls_with_browse(
    page: Any,
    limit: int,
    human_settings: HumanBrowseSettings,
    rounds: int = 3,
    video_only: bool = False,
) -> list[str]:
    """Accumulate unique note URLs from ``page`` over several collect/browse rounds.

    Each round polls the page through ``wait_for_note_urls_from_page`` and
    keeps every URL whose note id (per ``extract_note_id_from_url``) has not
    been seen in an earlier round. Between rounds, but never after the last
    one, a human-browse sequence built from ``human_settings`` is run on the
    page (presumably scrolling so more cards get rendered; confirm against
    ``run_human_browse_sequence``).

    Args:
        page: Page handle forwarded to the collector and the browse runner.
        limit: Maximum number of URLs to gather; collection stops on the
            append that reaches it.
        human_settings: Settings used to create the browse plan between rounds.
        rounds: Number of collection rounds; values below 1 behave like 1.
        video_only: Forwarded unchanged to ``wait_for_note_urls_from_page``.

    Returns:
        Unique note URLs in first-seen order, never more than ``limit``
        entries when ``limit`` is positive.
    """
    total_rounds = max(1, rounds)
    collected: list[str] = []
    seen_note_ids: set[str] = set()
    for round_index in range(total_rounds):
        for note_url in wait_for_note_urls_from_page(page, limit=limit, video_only=video_only):
            note_id = extract_note_id_from_url(note_url)
            if note_id in seen_note_ids:
                continue
            seen_note_ids.add(note_id)
            collected.append(note_url)
            # Check per append rather than per round: a later round can add
            # several new notes at once and would otherwise overshoot `limit`.
            if len(collected) >= limit:
                return collected
        if len(collected) >= limit:
            # Only reachable with a non-positive limit and nothing appended;
            # return right away instead of browsing for more.
            return collected
        if round_index < total_rounds - 1:
            run_human_browse_sequence(page, create_human_browse_plan(human_settings))
    return collected
|
||||
|
||||
|
||||
def filter_unvisited_note_urls(urls: list[str], visited_note_ids: set[str]) -> list[str]:
    """Return the URLs whose note id is not in ``visited_note_ids``, in input order."""
    remaining: list[str] = []
    for candidate in urls:
        if extract_note_id_from_url(candidate) in visited_note_ids:
            continue
        remaining.append(candidate)
    return remaining
|
||||
|
||||
@ -835,7 +874,15 @@ def run_queue_download(
|
||||
else:
|
||||
page.get(source_url)
|
||||
human_pause(human_settings)
|
||||
note_urls = wait_for_note_urls_from_page(page, limit=max(50, target_videos * 2))
|
||||
note_limit = max(50, target_videos * 3 if source == "search" else target_videos * 2)
|
||||
browse_rounds = 8 if source == "search" else 2
|
||||
note_urls = collect_note_urls_with_browse(
|
||||
page,
|
||||
limit=note_limit,
|
||||
human_settings=human_settings,
|
||||
rounds=browse_rounds,
|
||||
video_only=source == "search",
|
||||
)
|
||||
records = merge_note_urls_into_queue(records, note_urls, source=source)
|
||||
save_queue(queue_file, records)
|
||||
added = len(records) - before_count
|
||||
|
||||
52
test_xhs.py
52
test_xhs.py
@ -59,6 +59,36 @@ class FakeDelayedLinkPage:
|
||||
return ["https://www.xiaohongshu.com/search_result/abc?xsec_token=token"]
|
||||
|
||||
|
||||
class FakeGrowingLinkPage:
    """Page stub whose link list grows after the first collection call.

    Scripts that do not contain ``querySelectorAll`` are ignored (``None`` is
    returned and the call counter is untouched). The first collecting script
    sees one search-result link; every later one sees two.
    """

    def __init__(self):
        # Number of link-collecting scripts executed so far.
        self.collect_calls = 0

    def run_js(self, script):
        if "querySelectorAll" not in script:
            return None
        self.collect_calls += 1
        first_link = "https://www.xiaohongshu.com/search_result/one?xsec_token=token1"
        second_link = "https://www.xiaohongshu.com/search_result/two?xsec_token=token2"
        return [first_link] if self.collect_calls == 1 else [first_link, second_link]
|
||||
|
||||
|
||||
class FakeVideoOnlyLinkPage:
    """Page stub that records scripts and answers differently for video-filtering ones.

    A script mentioning ``play-icon`` gets only the video link back; any other
    script gets the image link followed by the video link.
    """

    def __init__(self):
        # Every script passed to run_js, in call order.
        self.scripts = []

    def run_js(self, script):
        self.scripts.append(script)
        video_link = "https://www.xiaohongshu.com/search_result/video?xsec_token=video-token"
        if "play-icon" in script:
            return [video_link]
        image_link = "https://www.xiaohongshu.com/search_result/image?xsec_token=image-token"
        return [image_link, video_link]
|
||||
|
||||
|
||||
class XhsModuleTests(unittest.TestCase):
|
||||
def test_module_can_import_without_optional_runtime_dependencies(self) -> None:
|
||||
module = importlib.import_module("XHS")
|
||||
@ -390,6 +420,13 @@ class XhsModuleTests(unittest.TestCase):
|
||||
)
|
||||
self.assertIn('/search_result/', page.scripts[0])
|
||||
|
||||
def test_collect_note_urls_from_page_can_filter_video_cards(self) -> None:
    """With ``video_only=True`` the play-icon script is run and only the video link comes back."""
    xhs = importlib.import_module("XHS")
    fake_page = FakeVideoOnlyLinkPage()
    expected = ["https://www.xiaohongshu.com/search_result/video?xsec_token=video-token"]

    result = xhs.collect_note_urls_from_page(fake_page, limit=10, video_only=True)

    self.assertEqual(result, expected)
    self.assertIn("play-icon", fake_page.scripts[0])
|
||||
|
||||
def test_wait_for_note_urls_from_page_polls_until_links_are_rendered(self) -> None:
|
||||
module = importlib.import_module("XHS")
|
||||
page = FakeDelayedLinkPage()
|
||||
@ -398,6 +435,21 @@ class XhsModuleTests(unittest.TestCase):
|
||||
self.assertEqual(urls, ["https://www.xiaohongshu.com/search_result/abc?xsec_token=token"])
|
||||
mocked_sleep.assert_called_once_with(0.1)
|
||||
|
||||
def test_collect_note_urls_with_browse_accumulates_after_scroll(self) -> None:
    """Two rounds over a growing page yield both links with one browse step in between."""
    xhs = importlib.import_module("XHS")
    fake_page = FakeGrowingLinkPage()
    browse_settings = xhs.HumanBrowseSettings(enabled=False)
    expected = [
        "https://www.xiaohongshu.com/search_result/one?xsec_token=token1",
        "https://www.xiaohongshu.com/search_result/two?xsec_token=token2",
    ]

    # Patch the browse runner so no real page interaction happens between rounds.
    with mock.patch.object(xhs, "run_human_browse_sequence") as browse_spy:
        result = xhs.collect_note_urls_with_browse(
            fake_page,
            limit=10,
            human_settings=browse_settings,
            rounds=2,
        )

    self.assertEqual(result, expected)
    browse_spy.assert_called_once()
|
||||
|
||||
def test_filter_unvisited_note_urls_skips_seen_note_ids(self) -> None:
|
||||
module = importlib.import_module("XHS")
|
||||
urls = [
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user