feat: evaluate fetch results with "confidence"
This commit is contained in:
@@ -64,16 +64,26 @@ class CacheSearchFetcher(BaseFetcher):
|
||||
logger.debug(f"Cache-search: no match for {track.display_name()}")
|
||||
return None
|
||||
|
||||
# Pick best: prefer synced, then first available
|
||||
# Pick best by confidence scoring
|
||||
candidates = [
|
||||
SearchCandidate(
|
||||
item=m,
|
||||
duration_ms=float(m["length"]) if m.get("length") else None,
|
||||
is_synced=m.get("status") == CacheStatus.SUCCESS_SYNCED.value,
|
||||
title=m.get("title"),
|
||||
artist=m.get("artist"),
|
||||
album=m.get("album"),
|
||||
)
|
||||
for m in matches
|
||||
if m.get("lyrics")
|
||||
]
|
||||
best = select_best(candidates, track.length)
|
||||
best, confidence = select_best(
|
||||
candidates,
|
||||
track.length,
|
||||
title=track.title,
|
||||
artist=track.artist,
|
||||
album=track.album,
|
||||
)
|
||||
|
||||
if not best:
|
||||
return None
|
||||
@@ -81,10 +91,11 @@ class CacheSearchFetcher(BaseFetcher):
|
||||
status = CacheStatus(best["status"])
|
||||
logger.info(
|
||||
f"Cache-search: fuzzy hit from [{best.get('source')}] "
|
||||
f"album={best.get('album')!r} ({status.value})"
|
||||
f"album={best.get('album')!r} ({status.value}, confidence={confidence:.0f})"
|
||||
)
|
||||
return LyricResult(
|
||||
status=status,
|
||||
lyrics=LRCData(best["lyrics"]),
|
||||
source=self.source_name,
|
||||
confidence=confidence,
|
||||
)
|
||||
|
||||
@@ -126,10 +126,19 @@ class LrclibSearchFetcher(BaseFetcher):
|
||||
else None,
|
||||
is_synced=isinstance(item.get("syncedLyrics"), str)
|
||||
and bool(item["syncedLyrics"].strip()),
|
||||
title=item.get("trackName"),
|
||||
artist=item.get("artistName"),
|
||||
album=item.get("albumName"),
|
||||
)
|
||||
for item in candidates
|
||||
]
|
||||
best = select_best(mapped, track.length)
|
||||
best, confidence = select_best(
|
||||
mapped,
|
||||
track.length,
|
||||
title=track.title,
|
||||
artist=track.artist,
|
||||
album=track.album,
|
||||
)
|
||||
if best is None:
|
||||
logger.debug("LRCLIB-search: no valid candidate found")
|
||||
return LyricResult(status=CacheStatus.NOT_FOUND, ttl=TTL_NOT_FOUND)
|
||||
@@ -139,20 +148,26 @@ class LrclibSearchFetcher(BaseFetcher):
|
||||
|
||||
if isinstance(synced, str) and synced.strip():
|
||||
lyrics = LRCData(synced)
|
||||
logger.info(f"LRCLIB-search: got synced lyrics ({len(lyrics)} lines)")
|
||||
logger.info(
|
||||
f"LRCLIB-search: got synced lyrics ({len(lyrics)} lines, confidence={confidence:.0f})"
|
||||
)
|
||||
return LyricResult(
|
||||
status=CacheStatus.SUCCESS_SYNCED,
|
||||
lyrics=lyrics,
|
||||
source=self.source_name,
|
||||
confidence=confidence,
|
||||
)
|
||||
elif isinstance(unsynced, str) and unsynced.strip():
|
||||
lyrics = LRCData(unsynced)
|
||||
logger.info(f"LRCLIB-search: got unsynced lyrics ({len(lyrics)} lines)")
|
||||
logger.info(
|
||||
f"LRCLIB-search: got unsynced lyrics ({len(lyrics)} lines, confidence={confidence:.0f})"
|
||||
)
|
||||
return LyricResult(
|
||||
status=CacheStatus.SUCCESS_UNSYNCED,
|
||||
lyrics=lyrics,
|
||||
source=self.source_name,
|
||||
ttl=TTL_UNSYNCED,
|
||||
confidence=confidence,
|
||||
)
|
||||
else:
|
||||
logger.debug("LRCLIB-search: best candidate has empty lyrics")
|
||||
|
||||
+34
-15
@@ -43,15 +43,15 @@ class NeteaseFetcher(BaseFetcher):
|
||||
def is_available(self, track: TrackMeta) -> bool:
|
||||
return bool(track.title)
|
||||
|
||||
def _search(self, track: TrackMeta, limit: int = 10) -> Optional[int]:
|
||||
"""Search Netease and return the best-matching song ID.
|
||||
def _search(self, track: TrackMeta, limit: int = 10) -> tuple[Optional[int], float]:
|
||||
"""Search Netease and return the best-matching song ID with confidence.
|
||||
|
||||
When ``track.length`` is available, candidates are ranked by duration
|
||||
difference and only accepted if within ``DURATION_TOLERANCE_MS``.
|
||||
"""
|
||||
query = f"{track.artist or ''} {track.title or ''}".strip()
|
||||
if not query:
|
||||
return None
|
||||
return None, 0.0
|
||||
|
||||
logger.debug(f"Netease: searching for '{query}' (limit={limit})")
|
||||
|
||||
@@ -70,17 +70,17 @@ class NeteaseFetcher(BaseFetcher):
|
||||
logger.error(
|
||||
f"Netease: search returned non-dict: {type(result).__name__}"
|
||||
)
|
||||
return None
|
||||
return None, 0.0
|
||||
|
||||
result_body = result.get("result")
|
||||
if not isinstance(result_body, dict):
|
||||
logger.debug("Netease: search 'result' field missing or invalid")
|
||||
return None
|
||||
return None, 0.0
|
||||
|
||||
songs = result_body.get("songs")
|
||||
if not isinstance(songs, list) or len(songs) == 0:
|
||||
logger.debug("Netease: search returned 0 results")
|
||||
return None
|
||||
return None, 0.0
|
||||
|
||||
logger.debug(f"Netease: search returned {len(songs)} candidates")
|
||||
|
||||
@@ -90,23 +90,37 @@ class NeteaseFetcher(BaseFetcher):
|
||||
duration_ms=float(song["dt"])
|
||||
if isinstance(song.get("dt"), int)
|
||||
else None,
|
||||
title=song.get("name"),
|
||||
artist=", ".join(a.get("name", "") for a in song.get("ar", []))
|
||||
or None,
|
||||
album=(song.get("al") or {}).get("name"),
|
||||
)
|
||||
for song in songs
|
||||
if isinstance(song, dict) and song.get("id") is not None
|
||||
]
|
||||
best_id = select_best(candidates, track.length)
|
||||
best_id, confidence = select_best(
|
||||
candidates,
|
||||
track.length,
|
||||
title=track.title,
|
||||
artist=track.artist,
|
||||
album=track.album,
|
||||
)
|
||||
if best_id is not None:
|
||||
logger.debug(f"Netease: selected id={best_id}")
|
||||
return best_id
|
||||
logger.debug(
|
||||
f"Netease: selected id={best_id} (confidence={confidence:.0f})"
|
||||
)
|
||||
return best_id, confidence
|
||||
|
||||
logger.debug("Netease: no suitable candidate found")
|
||||
return None
|
||||
return None, 0.0
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Netease: search failed: {e}")
|
||||
return None
|
||||
return None, 0.0
|
||||
|
||||
def _get_lyric(self, song_id: int) -> Optional[LyricResult]:
|
||||
def _get_lyric(
|
||||
self, song_id: int, confidence: float = 0.0
|
||||
) -> Optional[LyricResult]:
|
||||
"""Fetch lyrics for a given Netease song ID."""
|
||||
logger.debug(f"Netease: fetching lyrics for song_id={song_id}")
|
||||
|
||||
@@ -158,7 +172,12 @@ class NeteaseFetcher(BaseFetcher):
|
||||
f"Netease: got {status.value} lyrics for song_id={song_id} "
|
||||
f"({len(lrcdata)} lines)"
|
||||
)
|
||||
return LyricResult(status=status, lyrics=lrcdata, source=self.source_name)
|
||||
return LyricResult(
|
||||
status=status,
|
||||
lyrics=lrcdata,
|
||||
source=self.source_name,
|
||||
confidence=confidence,
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Netease: lyric fetch failed for song_id={song_id}: {e}")
|
||||
@@ -174,9 +193,9 @@ class NeteaseFetcher(BaseFetcher):
|
||||
return None
|
||||
|
||||
logger.info(f"Netease: fetching lyrics for {track.display_name()}")
|
||||
song_id = self._search(track)
|
||||
song_id, confidence = self._search(track)
|
||||
if not song_id:
|
||||
logger.debug(f"Netease: no match found for {track.display_name()}")
|
||||
return LyricResult(status=CacheStatus.NOT_FOUND, ttl=TTL_NOT_FOUND)
|
||||
|
||||
return self._get_lyric(song_id)
|
||||
return self._get_lyric(song_id, confidence=confidence)
|
||||
|
||||
+31
-14
@@ -35,11 +35,11 @@ class QQMusicFetcher(BaseFetcher):
|
||||
def is_available(self, track: TrackMeta) -> bool:
|
||||
return bool(track.title) and bool(QQ_MUSIC_API_URL)
|
||||
|
||||
def _search(self, track: TrackMeta, limit: int = 10) -> Optional[str]:
|
||||
"""Search QQ Music and return the best-matching song MID."""
|
||||
def _search(self, track: TrackMeta, limit: int = 10) -> tuple[Optional[str], float]:
|
||||
"""Search QQ Music and return the best-matching song MID with confidence."""
|
||||
query = f"{track.artist or ''} {track.title or ''}".strip()
|
||||
if not query:
|
||||
return None
|
||||
return None, 0.0
|
||||
|
||||
logger.debug(f"QQMusic: searching for '{query}' (limit={limit})")
|
||||
|
||||
@@ -54,12 +54,12 @@ class QQMusicFetcher(BaseFetcher):
|
||||
|
||||
if data.get("code") != 0:
|
||||
logger.error(f"QQMusic: search API error: {data}")
|
||||
return None
|
||||
return None, 0.0
|
||||
|
||||
songs = data.get("data", {}).get("list", [])
|
||||
if not songs:
|
||||
logger.debug("QQMusic: search returned 0 results")
|
||||
return None
|
||||
return None, 0.0
|
||||
|
||||
logger.debug(f"QQMusic: search returned {len(songs)} candidates")
|
||||
|
||||
@@ -69,23 +69,35 @@ class QQMusicFetcher(BaseFetcher):
|
||||
duration_ms=float(song["interval"]) * 1000
|
||||
if isinstance(song.get("interval"), int)
|
||||
else None,
|
||||
title=song.get("name"),
|
||||
artist=", ".join(s.get("name", "") for s in song.get("singer", []))
|
||||
or None,
|
||||
album=(song.get("album") or {}).get("name"),
|
||||
)
|
||||
for song in songs
|
||||
if isinstance(song, dict) and song.get("mid") is not None
|
||||
]
|
||||
best_mid = select_best(candidates, track.length)
|
||||
best_mid, confidence = select_best(
|
||||
candidates,
|
||||
track.length,
|
||||
title=track.title,
|
||||
artist=track.artist,
|
||||
album=track.album,
|
||||
)
|
||||
if best_mid is not None:
|
||||
logger.debug(f"QQMusic: selected mid={best_mid}")
|
||||
return best_mid
|
||||
logger.debug(
|
||||
f"QQMusic: selected mid={best_mid} (confidence={confidence:.0f})"
|
||||
)
|
||||
return best_mid, confidence
|
||||
|
||||
logger.debug("QQMusic: no suitable candidate found")
|
||||
return None
|
||||
return None, 0.0
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"QQMusic: search failed: {e}")
|
||||
return None
|
||||
return None, 0.0
|
||||
|
||||
def _get_lyric(self, mid: str) -> Optional[LyricResult]:
|
||||
def _get_lyric(self, mid: str, confidence: float = 0.0) -> Optional[LyricResult]:
|
||||
"""Fetch lyrics for a given QQ Music song MID."""
|
||||
logger.debug(f"QQMusic: fetching lyrics for mid={mid}")
|
||||
|
||||
@@ -115,7 +127,12 @@ class QQMusicFetcher(BaseFetcher):
|
||||
f"QQMusic: got {status.value} lyrics for mid={mid} "
|
||||
f"({len(lrcdata)} lines)"
|
||||
)
|
||||
return LyricResult(status=status, lyrics=lrcdata, source=self.source_name)
|
||||
return LyricResult(
|
||||
status=status,
|
||||
lyrics=lrcdata,
|
||||
source=self.source_name,
|
||||
confidence=confidence,
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"QQMusic: lyric fetch failed for mid={mid}: {e}")
|
||||
@@ -135,9 +152,9 @@ class QQMusicFetcher(BaseFetcher):
|
||||
return None
|
||||
|
||||
logger.info(f"QQMusic: fetching lyrics for {track.display_name()}")
|
||||
mid = self._search(track)
|
||||
mid, confidence = self._search(track)
|
||||
if not mid:
|
||||
logger.debug(f"QQMusic: no match found for {track.display_name()}")
|
||||
return LyricResult(status=CacheStatus.NOT_FOUND, ttl=TTL_NOT_FOUND)
|
||||
|
||||
return self._get_lyric(mid)
|
||||
return self._get_lyric(mid, confidence=confidence)
|
||||
|
||||
+132
-32
@@ -2,13 +2,23 @@
|
||||
Shared candidate-selection logic for search-based fetchers.
|
||||
|
||||
Each fetcher maps its API-specific results to SearchCandidate, then calls
|
||||
select_best() which handles duration filtering and synced preference uniformly.
|
||||
select_best() which scores candidates by metadata similarity, duration
|
||||
proximity, and sync status.
|
||||
"""
|
||||
|
||||
from dataclasses import dataclass
|
||||
from typing import Generic, Optional, TypeVar
|
||||
|
||||
from ..config import DURATION_TOLERANCE_MS
|
||||
from ..config import (
|
||||
DURATION_TOLERANCE_MS,
|
||||
SCORE_W_TITLE as _W_TITLE,
|
||||
SCORE_W_ARTIST as _W_ARTIST,
|
||||
SCORE_W_ALBUM as _W_ALBUM,
|
||||
SCORE_W_DURATION as _W_DURATION,
|
||||
SCORE_W_SYNCED as _W_SYNCED,
|
||||
MIN_CONFIDENCE,
|
||||
)
|
||||
from ..normalize import normalize_for_match, normalize_artist
|
||||
|
||||
T = TypeVar("T")
|
||||
|
||||
@@ -21,48 +31,138 @@ class SearchCandidate(Generic[T]):
|
||||
item: The original API-specific object (dict, ID, etc.)
|
||||
duration_ms: Track duration in milliseconds, or None if unknown.
|
||||
is_synced: Whether this candidate is known to have synced lyrics.
|
||||
title: Candidate track title for similarity scoring.
|
||||
artist: Candidate artist name for similarity scoring.
|
||||
album: Candidate album name for similarity scoring.
|
||||
"""
|
||||
|
||||
item: T
|
||||
duration_ms: Optional[float] = None
|
||||
is_synced: bool = False
|
||||
title: Optional[str] = None
|
||||
artist: Optional[str] = None
|
||||
album: Optional[str] = None
|
||||
|
||||
|
||||
def _text_similarity(a: str, b: str) -> float:
|
||||
"""Compare two normalized strings. Returns 0.0-1.0."""
|
||||
if a == b:
|
||||
return 1.0
|
||||
if not a or not b:
|
||||
return 0.0
|
||||
# Containment: one is a substring of the other (e.g. "My Love" vs "My Love (Album Version)")
|
||||
if a in b or b in a:
|
||||
return min(len(a), len(b)) / max(len(a), len(b))
|
||||
return 0.0
|
||||
|
||||
|
||||
def _score_candidate(
    c: SearchCandidate[T],
    ref_title: Optional[str],
    ref_artist: Optional[str],
    ref_album: Optional[str],
    ref_length_ms: Optional[int],
) -> float:
    """Score how well candidate *c* matches the reference track metadata.

    The result is the sum of two independent parts:

    1. **Metadata score** - a weighted similarity over title, artist, album
       and duration. A field that is ``None`` on both sides is left out
       entirely (neutral). A field present on only one side adds its weight
       to the denominator but nothing to the numerator, which penalises the
       asymmetric absence. The ratio is then rescaled to the combined
       metadata weight, so excluded fields do not lower the reachable
       maximum.

    2. **Synced bonus** - ``_W_SYNCED`` is added when ``c.is_synced`` is
       true, regardless of the metadata score.

    The field weights are the ``_W_*`` constants imported from the config
    module. NOTE(review): the previous docstring listed them as 40 / 30 /
    10 / 10 with a 10-point synced bonus (0-100 overall) - confirm against
    the config values.

    Args:
        c: Candidate to score.
        ref_title: Title of the track being looked up, or None.
        ref_artist: Artist of the track being looked up, or None.
        ref_album: Album of the track being looked up, or None.
        ref_length_ms: Length of the track being looked up, or None.

    Returns:
        Metadata score plus synced bonus.
    """
    raw = 0.0
    available_weight = 0.0

    # Title
    if ref_title is not None or c.title is not None:
        available_weight += _W_TITLE
        if ref_title is not None and c.title is not None:
            raw += _W_TITLE * _text_similarity(
                normalize_for_match(ref_title), normalize_for_match(c.title)
            )
    # else both None -> excluded

    # Artist: an exact match after artist-specific normalization earns the
    # full weight; otherwise fall back to generic text similarity.
    if ref_artist is not None or c.artist is not None:
        available_weight += _W_ARTIST
        if ref_artist is not None and c.artist is not None:
            if normalize_artist(ref_artist) == normalize_artist(c.artist):
                raw += _W_ARTIST
            else:
                raw += _W_ARTIST * _text_similarity(
                    normalize_for_match(ref_artist), normalize_for_match(c.artist)
                )

    # Album
    if ref_album is not None or c.album is not None:
        available_weight += _W_ALBUM
        if ref_album is not None and c.album is not None:
            raw += _W_ALBUM * _text_similarity(
                normalize_for_match(ref_album), normalize_for_match(c.album)
            )

    # Duration: credit falls off linearly from full weight at an exact match
    # to zero at the tolerance boundary; beyond the tolerance it earns nothing.
    if ref_length_ms is not None or c.duration_ms is not None:
        available_weight += _W_DURATION
        if ref_length_ms is not None and c.duration_ms is not None:
            diff = abs(c.duration_ms - ref_length_ms)
            if diff <= DURATION_TOLERANCE_MS:
                if DURATION_TOLERANCE_MS > 0:
                    raw += _W_DURATION * (1.0 - diff / DURATION_TOLERANCE_MS)
                else:
                    # A zero tolerance admits only an exact match; give it
                    # full credit instead of evaluating 0 / 0.
                    raw += _W_DURATION

    # Rescale the metadata ratio to the combined metadata weight.
    max_metadata = _W_TITLE + _W_ARTIST + _W_ALBUM + _W_DURATION
    if available_weight > 0:
        metadata_score = (raw / available_weight) * max_metadata
    else:
        # No comparable fields at all - only the synced bonus matters.
        metadata_score = 0.0

    # Synced bonus, independent of metadata.
    synced_score = _W_SYNCED if c.is_synced else 0.0

    return metadata_score + synced_score
|
||||
|
||||
|
||||
def select_best(
    candidates: list[SearchCandidate[T]],
    track_length_ms: Optional[int] = None,
    *,
    title: Optional[str] = None,
    artist: Optional[str] = None,
    album: Optional[str] = None,
    min_confidence: float = MIN_CONFIDENCE,
) -> tuple[Optional[T], float]:
    """Choose the highest-scoring candidate and report its confidence.

    Every candidate is scored with ``_score_candidate`` against the given
    reference metadata. The first candidate with the highest score wins.

    Args:
        candidates: Candidates to rank.
        track_length_ms: Reference track length, or None if unknown.
        title: Reference track title, or None if unknown.
        artist: Reference artist, or None if unknown.
        album: Reference album, or None if unknown.
        min_confidence: Lowest score at which a candidate is accepted.

    Returns:
        ``(item, score)``. ``item`` is None when *candidates* is empty
        (score 0.0) or when the best score is below *min_confidence*; in the
        latter case the best score is still returned.
    """
    if not candidates:
        return None, 0.0

    top_item: Optional[T] = None
    top_score = -1.0
    for candidate in candidates:
        score = _score_candidate(candidate, title, artist, album, track_length_ms)
        # Strict comparison: on a tie the earliest candidate is kept.
        if score > top_score:
            top_item, top_score = candidate.item, score

    accepted = None if top_score < min_confidence else top_item
    return accepted, top_score
|
||||
|
||||
Reference in New Issue
Block a user