feat: fetcher: try multiple candidates rather than just the best one

This commit is contained in:
2026-04-03 22:16:49 +02:00
parent aa13940aa7
commit e3f12c1756
8 changed files with 176 additions and 43 deletions
+4
View File
@@ -49,6 +49,10 @@ SCORE_W_SYNCED = 10.0
MIN_CONFIDENCE = 25.0 # below this, candidate is rejected
HIGH_CONFIDENCE = 80.0 # at or above this, stop searching early
# Multi-candidate fetching
MULTI_CANDIDATE_LIMIT = 3 # max candidates to try per search-based fetcher
MULTI_CANDIDATE_DELAY_S = 0.2 # delay between sequential lyric fetches
# Legacy cache rows (no confidence stored) get a base score by sync status
LEGACY_CONFIDENCE_SYNCED = 50.0
LEGACY_CONFIDENCE_UNSYNCED = 40.0
+1 -1
View File
@@ -29,7 +29,7 @@ FetcherMethodType = Literal[
]
# Fetchers within a group run in parallel; groups run sequentially.
# A group that produces any positive result stops the pipeline.
# A group that produces any trusted and synced result stops the pipeline.
_FETCHER_GROUPS: list[list[FetcherMethodType]] = [
["local"],
["cache-search"],
+30 -19
View File
@@ -12,18 +12,20 @@ Search results are filtered by duration when the track has a known length
to avoid returning lyrics for the wrong version of a song.
"""
import asyncio
from typing import Optional
import httpx
from loguru import logger
from .base import BaseFetcher
from .selection import SearchCandidate, select_best
from .selection import SearchCandidate, select_ranked
from ..models import TrackMeta, LyricResult, CacheStatus
from ..lrc import LRCData
from ..config import (
HTTP_TIMEOUT,
TTL_NOT_FOUND,
TTL_NETWORK_ERROR,
MULTI_CANDIDATE_DELAY_S,
NETEASE_SEARCH_URL,
NETEASE_LYRIC_URL,
UA_BROWSER,
@@ -45,10 +47,10 @@ class NeteaseFetcher(BaseFetcher):
async def _search(
self, track: TrackMeta, limit: int = 10
) -> tuple[Optional[int], float]:
) -> list[tuple[int, float]]:
query = f"{track.artist or ''} {track.title or ''}".strip()
if not query:
return None, 0.0
return []
logger.debug(f"Netease: searching for '{query}' (limit={limit})")
@@ -66,23 +68,23 @@ class NeteaseFetcher(BaseFetcher):
logger.error(
f"Netease: search returned non-dict: {type(result).__name__}"
)
return None, 0.0
return []
result_body = result.get("result")
if not isinstance(result_body, dict):
logger.debug("Netease: search 'result' field missing or invalid")
return None, 0.0
return []
songs = result_body.get("songs")
if not isinstance(songs, list) or len(songs) == 0:
logger.debug("Netease: search returned 0 results")
return None, 0.0
return []
logger.debug(f"Netease: search returned {len(songs)} candidates")
candidates = [
SearchCandidate(
item=song.get("id"),
item=song_id,
duration_ms=float(song["dt"])
if isinstance(song.get("dt"), int)
else None,
@@ -92,27 +94,27 @@ class NeteaseFetcher(BaseFetcher):
album=(song.get("al") or {}).get("name"),
)
for song in songs
if isinstance(song, dict) and song.get("id") is not None
if isinstance(song, dict) and isinstance(song_id := song.get("id"), int)
]
best_id, confidence = select_best(
ranked = select_ranked(
candidates,
track.length,
title=track.title,
artist=track.artist,
album=track.album,
)
if best_id is not None:
if ranked:
logger.debug(
f"Netease: selected id={best_id} (confidence={confidence:.0f})"
"Netease: top candidates: "
+ ", ".join(f"id={i} ({c:.0f})" for i, c in ranked)
)
return best_id, confidence
logger.debug("Netease: no suitable candidate found")
return None, 0.0
else:
logger.debug("Netease: no suitable candidate found")
return ranked
except Exception as e:
logger.error(f"Netease: search failed: {e}")
return None, 0.0
return []
async def _get_lyric(
self, song_id: int, confidence: float = 0.0
@@ -185,9 +187,18 @@ class NeteaseFetcher(BaseFetcher):
return None
logger.info(f"Netease: fetching lyrics for {track.display_name()}")
song_id, confidence = await self._search(track)
if not song_id:
candidates = await self._search(track)
if not candidates:
logger.debug(f"Netease: no match found for {track.display_name()}")
return LyricResult(status=CacheStatus.NOT_FOUND, ttl=TTL_NOT_FOUND)
return await self._get_lyric(song_id, confidence=confidence)
for i, (song_id, confidence) in enumerate(candidates):
if i > 0:
await asyncio.sleep(MULTI_CANDIDATE_DELAY_S)
result = await self._get_lyric(song_id, confidence=confidence)
if result is None or result.status == CacheStatus.NETWORK_ERROR:
return result
if result.status != CacheStatus.NOT_FOUND:
return result
return LyricResult(status=CacheStatus.NOT_FOUND, ttl=TTL_NOT_FOUND)
+29 -18
View File
@@ -11,18 +11,20 @@ The base URL is read from the QQ_MUSIC_API_URL environment variable.
Search → pick best match by duration → fetch LRC lyrics.
"""
import asyncio
from typing import Optional
import httpx
from loguru import logger
from .base import BaseFetcher
from .selection import SearchCandidate, select_best
from .selection import SearchCandidate, select_ranked
from ..models import TrackMeta, LyricResult, CacheStatus
from ..lrc import LRCData
from ..config import (
HTTP_TIMEOUT,
TTL_NOT_FOUND,
TTL_NETWORK_ERROR,
MULTI_CANDIDATE_DELAY_S,
QQ_MUSIC_API_URL,
)
@@ -37,10 +39,10 @@ class QQMusicFetcher(BaseFetcher):
async def _search(
self, track: TrackMeta, limit: int = 10
) -> tuple[Optional[str], float]:
) -> list[tuple[str, float]]:
query = f"{track.artist or ''} {track.title or ''}".strip()
if not query:
return None, 0.0
return []
logger.debug(f"QQMusic: searching for '{query}' (limit={limit})")
@@ -55,18 +57,18 @@ class QQMusicFetcher(BaseFetcher):
if data.get("code") != 0:
logger.error(f"QQMusic: search API error: {data}")
return None, 0.0
return []
songs = data.get("data", {}).get("list", [])
if not songs:
logger.debug("QQMusic: search returned 0 results")
return None, 0.0
return []
logger.debug(f"QQMusic: search returned {len(songs)} candidates")
candidates = [
SearchCandidate(
item=song.get("mid"),
item=mid,
duration_ms=float(song["interval"]) * 1000
if isinstance(song.get("interval"), int)
else None,
@@ -76,27 +78,27 @@ class QQMusicFetcher(BaseFetcher):
album=(song.get("album") or {}).get("name"),
)
for song in songs
if isinstance(song, dict) and song.get("mid") is not None
if isinstance(song, dict) and isinstance(mid := song.get("mid"), str)
]
best_mid, confidence = select_best(
ranked = select_ranked(
candidates,
track.length,
title=track.title,
artist=track.artist,
album=track.album,
)
if best_mid is not None:
if ranked:
logger.debug(
f"QQMusic: selected mid={best_mid} (confidence={confidence:.0f})"
"QQMusic: top candidates: "
+ ", ".join(f"mid={m} ({c:.0f})" for m, c in ranked)
)
return best_mid, confidence
logger.debug("QQMusic: no suitable candidate found")
return None, 0.0
else:
logger.debug("QQMusic: no suitable candidate found")
return ranked
except Exception as e:
logger.error(f"QQMusic: search failed: {e}")
return None, 0.0
return []
async def _get_lyric(
self, mid: str, confidence: float = 0.0
@@ -152,9 +154,18 @@ class QQMusicFetcher(BaseFetcher):
return None
logger.info(f"QQMusic: fetching lyrics for {track.display_name()}")
mid, confidence = await self._search(track)
if not mid:
candidates = await self._search(track)
if not candidates:
logger.debug(f"QQMusic: no match found for {track.display_name()}")
return LyricResult(status=CacheStatus.NOT_FOUND, ttl=TTL_NOT_FOUND)
return await self._get_lyric(mid, confidence=confidence)
for i, (mid, confidence) in enumerate(candidates):
if i > 0:
await asyncio.sleep(MULTI_CANDIDATE_DELAY_S)
result = await self._get_lyric(mid, confidence=confidence)
if result is None or result.status == CacheStatus.NETWORK_ERROR:
return result
if result.status != CacheStatus.NOT_FOUND:
return result
return LyricResult(status=CacheStatus.NOT_FOUND, ttl=TTL_NOT_FOUND)
+27
View File
@@ -11,6 +11,7 @@ from typing import Generic, Optional, TypeVar
from ..config import (
DURATION_TOLERANCE_MS,
MULTI_CANDIDATE_LIMIT,
SCORE_W_TITLE as _W_TITLE,
SCORE_W_ARTIST as _W_ARTIST,
SCORE_W_ALBUM as _W_ALBUM,
@@ -143,6 +144,32 @@ def _score_candidate(
return metadata_score + synced_score
def select_ranked(
    candidates: list[SearchCandidate[T]],
    track_length_ms: Optional[int] = None,
    *,
    title: Optional[str] = None,
    artist: Optional[str] = None,
    album: Optional[str] = None,
    min_confidence: float = MIN_CONFIDENCE,
    max_results: int = MULTI_CANDIDATE_LIMIT,
) -> list[tuple[T, float]]:
    """Rank search candidates and return the best few as (item, score) pairs.

    A candidate is dropped when both its duration and the track length are
    known and they differ by more than DURATION_TOLERANCE_MS. Each remaining
    candidate is scored with _score_candidate and kept only if the score is
    at least min_confidence. The survivors are ordered by score, highest
    first, and at most max_results of them are returned.
    """

    def _duration_mismatch(candidate: SearchCandidate[T]) -> bool:
        # With either duration unknown there is nothing to compare against,
        # so the candidate is never rejected on duration grounds.
        if track_length_ms is None or candidate.duration_ms is None:
            return False
        return abs(candidate.duration_ms - track_length_ms) > DURATION_TOLERANCE_MS

    accepted: list[tuple[T, float]] = []
    for candidate in candidates:
        if _duration_mismatch(candidate):
            continue
        confidence = _score_candidate(
            candidate, title, artist, album, track_length_ms
        )
        if confidence >= min_confidence:
            accepted.append((candidate.item, confidence))

    # sorted() is stable, so equally scored candidates keep their input order.
    by_score = sorted(accepted, key=lambda pair: pair[1], reverse=True)
    return by_score[:max_results]
def select_best(
candidates: list[SearchCandidate[T]],
track_length_ms: Optional[int] = None,