feat: fetcher: try multiple candidates rather than just the best one

This commit is contained in:
2026-04-03 22:16:49 +02:00
parent 89553a6da6
commit 2f8004581b
8 changed files with 176 additions and 43 deletions
+4
View File
@@ -49,6 +49,10 @@ SCORE_W_SYNCED = 10.0
MIN_CONFIDENCE = 25.0 # below this, candidate is rejected
HIGH_CONFIDENCE = 80.0 # at or above this, stop searching early
# Multi-candidate fetching
MULTI_CANDIDATE_LIMIT = 3 # max candidates to try per search-based fetcher
MULTI_CANDIDATE_DELAY_S = 0.2 # delay between sequential lyric fetches
# Legacy cache rows (no confidence stored) get a base score by sync status
LEGACY_CONFIDENCE_SYNCED = 50.0
LEGACY_CONFIDENCE_UNSYNCED = 40.0
+1 -1
View File
@@ -29,7 +29,7 @@ FetcherMethodType = Literal[
]
# Fetchers within a group run in parallel; groups run sequentially.
# A group that produces any positive result stops the pipeline.
# A group that produces any trusted and synced result stops the pipeline.
_FETCHER_GROUPS: list[list[FetcherMethodType]] = [
["local"],
["cache-search"],
+29 -18
View File
@@ -12,18 +12,20 @@ Search results are filtered by duration when the track has a known length
to avoid returning lyrics for the wrong version of a song.
"""
import asyncio
from typing import Optional
import httpx
from loguru import logger
from .base import BaseFetcher
from .selection import SearchCandidate, select_best
from .selection import SearchCandidate, select_ranked
from ..models import TrackMeta, LyricResult, CacheStatus
from ..lrc import LRCData
from ..config import (
HTTP_TIMEOUT,
TTL_NOT_FOUND,
TTL_NETWORK_ERROR,
MULTI_CANDIDATE_DELAY_S,
NETEASE_SEARCH_URL,
NETEASE_LYRIC_URL,
UA_BROWSER,
@@ -45,10 +47,10 @@ class NeteaseFetcher(BaseFetcher):
async def _search(
self, track: TrackMeta, limit: int = 10
) -> tuple[Optional[int], float]:
) -> list[tuple[int, float]]:
query = f"{track.artist or ''} {track.title or ''}".strip()
if not query:
return None, 0.0
return []
logger.debug(f"Netease: searching for '{query}' (limit={limit})")
@@ -66,23 +68,23 @@ class NeteaseFetcher(BaseFetcher):
logger.error(
f"Netease: search returned non-dict: {type(result).__name__}"
)
return None, 0.0
return []
result_body = result.get("result")
if not isinstance(result_body, dict):
logger.debug("Netease: search 'result' field missing or invalid")
return None, 0.0
return []
songs = result_body.get("songs")
if not isinstance(songs, list) or len(songs) == 0:
logger.debug("Netease: search returned 0 results")
return None, 0.0
return []
logger.debug(f"Netease: search returned {len(songs)} candidates")
candidates = [
SearchCandidate(
item=song.get("id"),
item=song_id,
duration_ms=float(song["dt"])
if isinstance(song.get("dt"), int)
else None,
@@ -92,27 +94,27 @@ class NeteaseFetcher(BaseFetcher):
album=(song.get("al") or {}).get("name"),
)
for song in songs
if isinstance(song, dict) and song.get("id") is not None
if isinstance(song, dict) and isinstance(song_id := song.get("id"), int)
]
best_id, confidence = select_best(
ranked = select_ranked(
candidates,
track.length,
title=track.title,
artist=track.artist,
album=track.album,
)
if best_id is not None:
if ranked:
logger.debug(
f"Netease: selected id={best_id} (confidence={confidence:.0f})"
"Netease: top candidates: "
+ ", ".join(f"id={i} ({c:.0f})" for i, c in ranked)
)
return best_id, confidence
else:
logger.debug("Netease: no suitable candidate found")
return None, 0.0
return ranked
except Exception as e:
logger.error(f"Netease: search failed: {e}")
return None, 0.0
return []
async def _get_lyric(
self, song_id: int, confidence: float = 0.0
@@ -185,9 +187,18 @@ class NeteaseFetcher(BaseFetcher):
return None
logger.info(f"Netease: fetching lyrics for {track.display_name()}")
song_id, confidence = await self._search(track)
if not song_id:
candidates = await self._search(track)
if not candidates:
logger.debug(f"Netease: no match found for {track.display_name()}")
return LyricResult(status=CacheStatus.NOT_FOUND, ttl=TTL_NOT_FOUND)
return await self._get_lyric(song_id, confidence=confidence)
for i, (song_id, confidence) in enumerate(candidates):
if i > 0:
await asyncio.sleep(MULTI_CANDIDATE_DELAY_S)
result = await self._get_lyric(song_id, confidence=confidence)
if result is None or result.status == CacheStatus.NETWORK_ERROR:
return result
if result.status != CacheStatus.NOT_FOUND:
return result
return LyricResult(status=CacheStatus.NOT_FOUND, ttl=TTL_NOT_FOUND)
+28 -17
View File
@@ -11,18 +11,20 @@ The base URL is read from the QQ_MUSIC_API_URL environment variable.
Search → pick best match by duration → fetch LRC lyrics.
"""
import asyncio
from typing import Optional
import httpx
from loguru import logger
from .base import BaseFetcher
from .selection import SearchCandidate, select_best
from .selection import SearchCandidate, select_ranked
from ..models import TrackMeta, LyricResult, CacheStatus
from ..lrc import LRCData
from ..config import (
HTTP_TIMEOUT,
TTL_NOT_FOUND,
TTL_NETWORK_ERROR,
MULTI_CANDIDATE_DELAY_S,
QQ_MUSIC_API_URL,
)
@@ -37,10 +39,10 @@ class QQMusicFetcher(BaseFetcher):
async def _search(
self, track: TrackMeta, limit: int = 10
) -> tuple[Optional[str], float]:
) -> list[tuple[str, float]]:
query = f"{track.artist or ''} {track.title or ''}".strip()
if not query:
return None, 0.0
return []
logger.debug(f"QQMusic: searching for '{query}' (limit={limit})")
@@ -55,18 +57,18 @@ class QQMusicFetcher(BaseFetcher):
if data.get("code") != 0:
logger.error(f"QQMusic: search API error: {data}")
return None, 0.0
return []
songs = data.get("data", {}).get("list", [])
if not songs:
logger.debug("QQMusic: search returned 0 results")
return None, 0.0
return []
logger.debug(f"QQMusic: search returned {len(songs)} candidates")
candidates = [
SearchCandidate(
item=song.get("mid"),
item=mid,
duration_ms=float(song["interval"]) * 1000
if isinstance(song.get("interval"), int)
else None,
@@ -76,27 +78,27 @@ class QQMusicFetcher(BaseFetcher):
album=(song.get("album") or {}).get("name"),
)
for song in songs
if isinstance(song, dict) and song.get("mid") is not None
if isinstance(song, dict) and isinstance(mid := song.get("mid"), str)
]
best_mid, confidence = select_best(
ranked = select_ranked(
candidates,
track.length,
title=track.title,
artist=track.artist,
album=track.album,
)
if best_mid is not None:
if ranked:
logger.debug(
f"QQMusic: selected mid={best_mid} (confidence={confidence:.0f})"
"QQMusic: top candidates: "
+ ", ".join(f"mid={m} ({c:.0f})" for m, c in ranked)
)
return best_mid, confidence
else:
logger.debug("QQMusic: no suitable candidate found")
return None, 0.0
return ranked
except Exception as e:
logger.error(f"QQMusic: search failed: {e}")
return None, 0.0
return []
async def _get_lyric(
self, mid: str, confidence: float = 0.0
@@ -152,9 +154,18 @@ class QQMusicFetcher(BaseFetcher):
return None
logger.info(f"QQMusic: fetching lyrics for {track.display_name()}")
mid, confidence = await self._search(track)
if not mid:
candidates = await self._search(track)
if not candidates:
logger.debug(f"QQMusic: no match found for {track.display_name()}")
return LyricResult(status=CacheStatus.NOT_FOUND, ttl=TTL_NOT_FOUND)
return await self._get_lyric(mid, confidence=confidence)
for i, (mid, confidence) in enumerate(candidates):
if i > 0:
await asyncio.sleep(MULTI_CANDIDATE_DELAY_S)
result = await self._get_lyric(mid, confidence=confidence)
if result is None or result.status == CacheStatus.NETWORK_ERROR:
return result
if result.status != CacheStatus.NOT_FOUND:
return result
return LyricResult(status=CacheStatus.NOT_FOUND, ttl=TTL_NOT_FOUND)
+27
View File
@@ -11,6 +11,7 @@ from typing import Generic, Optional, TypeVar
from ..config import (
DURATION_TOLERANCE_MS,
MULTI_CANDIDATE_LIMIT,
SCORE_W_TITLE as _W_TITLE,
SCORE_W_ARTIST as _W_ARTIST,
SCORE_W_ALBUM as _W_ALBUM,
@@ -143,6 +144,32 @@ def _score_candidate(
return metadata_score + synced_score
def select_ranked(
    candidates: list[SearchCandidate[T]],
    track_length_ms: Optional[int] = None,
    *,
    title: Optional[str] = None,
    artist: Optional[str] = None,
    album: Optional[str] = None,
    min_confidence: float = MIN_CONFIDENCE,
    max_results: int = MULTI_CANDIDATE_LIMIT,
) -> list[tuple[T, float]]:
    """Rank candidates by score and keep the strongest few.

    A candidate is skipped without being scored when both its duration and
    ``track_length_ms`` are known and they differ by more than
    ``DURATION_TOLERANCE_MS``. Every remaining candidate is scored with
    ``_score_candidate``; those scoring below ``min_confidence`` are dropped.

    Returns:
        Up to ``max_results`` ``(item, score)`` pairs, highest score first.
        Empty when no candidate qualifies.
    """

    def duration_mismatch(candidate: SearchCandidate[T]) -> bool:
        # The hard filter only applies when both durations are known.
        if track_length_ms is None or candidate.duration_ms is None:
            return False
        return abs(candidate.duration_ms - track_length_ms) > DURATION_TOLERANCE_MS

    qualified: list[tuple[T, float]] = [
        (candidate.item, score)
        for candidate in candidates
        if not duration_mismatch(candidate)
        and (
            score := _score_candidate(
                candidate, title, artist, album, track_length_ms
            )
        )
        >= min_confidence
    ]
    # sorted() is stable, so equally scored candidates keep their input order.
    return sorted(qualified, key=lambda pair: pair[1], reverse=True)[:max_results]
def select_best(
candidates: list[SearchCandidate[T]],
track_length_ms: Optional[int] = None,
+1 -1
View File
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
[project]
name = "lrx-cli"
version = "0.4.1"
version = "0.4.2"
description = "Fetch line-synced lyrics for your music player."
readme = "README.md"
requires-python = ">=3.13"
+83 -3
View File
@@ -3,6 +3,7 @@ from __future__ import annotations
from lrx_cli.fetchers.selection import (
SearchCandidate,
select_best,
select_ranked,
_score_candidate,
_text_similarity,
MIN_CONFIDENCE,
@@ -407,9 +408,6 @@ def test_netease_without_ref_metadata_rejects_below_confidence() -> None:
assert best is None
# --- Edge cases ---
def test_empty_candidates_returns_none() -> None:
assert select_best([], track_length_ms=5000) == (None, 0.0)
assert select_best([], track_length_ms=None) == (None, 0.0)
@@ -445,3 +443,85 @@ def test_generic_type_preserved() -> None:
dict_candidates = [SearchCandidate(item={"id": 1}, title="x")]
best, _ = select_best(dict_candidates, title="x")
assert best == {"id": 1}
def test_select_ranked_empty_input() -> None:
    """With no candidates to score, the ranking is an empty list."""
    ranked = select_ranked([])
    assert ranked == []
def test_select_ranked_all_below_confidence() -> None:
    """All candidates below threshold → empty list."""
    # The candidate deliberately has no duration: a mismatching duration would
    # be dropped by the duration hard filter before min_confidence is ever
    # consulted, and this test would pass without exercising the threshold.
    candidates = [SearchCandidate(item="x", title="Completely Different")]

    # Sanity check: with the threshold disabled the candidate is returned, so
    # the empty result below can only come from min_confidence.
    unfiltered = select_ranked(
        candidates,
        232000,
        title="My Love",
        artist="Westlife",
        min_confidence=float("-inf"),
    )
    assert [item for item, _ in unfiltered] == ["x"]

    result = select_ranked(
        candidates, 232000, title="My Love", artist="Westlife", min_confidence=90.0
    )
    assert result == []
def test_select_ranked_sorted_descending() -> None:
    """Each ranked score is at least as high as the one that follows it."""
    ranked = select_ranked(
        _netease_candidates(),
        _REF_LENGTH,
        title=_REF_TITLE,
        artist=_REF_ARTIST,
        album=_REF_ALBUM,
    )
    scores = [score for _, score in ranked]
    # At least two entries are needed for the ordering check to mean anything.
    assert len(scores) >= 2
    assert all(earlier >= later for earlier, later in zip(scores, scores[1:]))
def test_select_ranked_respects_max_results() -> None:
    """max_results truncates the ranking to its best entries."""
    candidates = _netease_candidates()
    # Reference ranking with a limit too large to truncate anything.
    full = select_ranked(
        candidates,
        _REF_LENGTH,
        title=_REF_TITLE,
        artist=_REF_ARTIST,
        album=_REF_ALBUM,
        max_results=len(candidates),
    )
    ranked = select_ranked(
        candidates,
        _REF_LENGTH,
        title=_REF_TITLE,
        artist=_REF_ARTIST,
        album=_REF_ALBUM,
        max_results=2,
    )
    # The fixture must yield more than the limit could hide; otherwise an
    # empty ranking would satisfy a plain "<= 2" check.
    assert len(full) >= 2
    assert ranked == full[:2]
def test_select_ranked_consistent_with_select_best() -> None:
    """The top-ranked pair is the same (item, score) that select_best returns."""
    candidates = _netease_candidates()
    ranked = select_ranked(
        candidates,
        _REF_LENGTH,
        title=_REF_TITLE,
        artist=_REF_ARTIST,
        album=_REF_ALBUM,
    )
    best_item, best_score = select_best(
        candidates,
        _REF_LENGTH,
        title=_REF_TITLE,
        artist=_REF_ARTIST,
        album=_REF_ALBUM,
    )
    top = ranked[0]
    assert top == (best_item, best_score)
def test_select_ranked_duration_hard_filter_applies() -> None:
    """Candidates outside the duration tolerance never reach the ranking."""
    ranked = select_ranked(
        _netease_candidates(),
        _REF_LENGTH,
        title=_REF_TITLE,
        artist=_REF_ARTIST,
        album=_REF_ALBUM,
    )
    ranked_ids = {item for item, _ in ranked}
    # 29809886 (dt=262000, diff=30000ms) and 20707713 (dt=241116, diff=9116ms)
    # both exceed DURATION_TOLERANCE_MS=3000 → must not appear
    for rejected_id in (29809886, 20707713):
        assert rejected_id not in ranked_ids
def test_select_ranked_netease_top_is_best_duration_match() -> None:
    """2080607 (diff=59ms) should rank first over 572412968 (diff=1000ms)."""
    top_item, _top_score = select_ranked(
        _netease_candidates(),
        _REF_LENGTH,
        title=_REF_TITLE,
        artist=_REF_ARTIST,
        album=_REF_ALBUM,
    )[0]
    assert top_item == 2080607
Generated
+1 -1
View File
@@ -153,7 +153,7 @@ wheels = [
[[package]]
name = "lrx-cli"
version = "0.4.0"
version = "0.4.1"
source = { editable = "." }
dependencies = [
{ name = "cyclopts" },