feat: evaluate fetch results with "confidence"

This commit is contained in:
2026-04-02 04:26:19 +02:00
parent e2f45f80f6
commit 7ebf51b78d
15 changed files with 836 additions and 238 deletions
-3
View File
@@ -1,3 +0,0 @@
{
"python-envs.defaultEnvManager": "ms-python.python:venv"
}
+63 -57
View File
@@ -4,56 +4,21 @@ Date: 2026-03-25 10:18:03
Description: SQLite-based lyric cache with per-source storage and TTL expiration
"""
import re
import sqlite3
import hashlib
import time
import unicodedata
from typing import Optional
from loguru import logger
from .lrc import LRCData
from .config import DURATION_TOLERANCE_MS
from .models import TrackMeta, LyricResult, CacheStatus
# Punctuation to strip for fuzzy matching (ASCII + fullwidth + CJK brackets/symbols)
_PUNCT_RE = re.compile(
r"[~!@#$%^&*()_+\-=\[\]{}|;:'\",.<>?/\\`"
r"~!@#$%^&*()_+-=【】{}|;:'",。<>?/\`"
r"「」『』《》〈〉〔〕·•‥…—–]"
from .normalize import normalize_for_match as _normalize_for_match
from .normalize import normalize_artist as _normalize_artist
from .config import (
DURATION_TOLERANCE_MS,
LEGACY_CONFIDENCE_SYNCED,
LEGACY_CONFIDENCE_UNSYNCED,
)
_SPACE_RE = re.compile(r"\s+")
# feat./ft./featuring and everything after (case-insensitive, word boundary)
_FEAT_RE = re.compile(r"\s*(?:\bfeat\.?\b|\bft\.?\b|\bfeaturing\b).*", re.IGNORECASE)
# Multi-artist separators: /, &, ×, x (surrounded by spaces), ;, 、, vs.
_ARTIST_SEP_RE = re.compile(r"\s*(?:[/&;×、]|\bvs\.?\b|\bx\b)\s*", re.IGNORECASE)
def _normalize_for_match(s: str) -> str:
    """Normalize a string for fuzzy comparison.

    Lowercases, NFKC-normalizes (fullwidth → halfwidth), strips punctuation,
    and collapses whitespace.
    """
    # NFKC folds fullwidth characters to their halfwidth forms before
    # lowercasing, so "ＡＢＣ" and "abc" compare equal.
    s = unicodedata.normalize("NFKC", s).lower()
    # Drop a trailing "feat./ft./featuring ..." clause entirely.
    s = _FEAT_RE.sub("", s)
    # Punctuation becomes a space (not removed) so word boundaries survive.
    s = _PUNCT_RE.sub(" ", s)
    # Collapse whitespace runs and trim the ends.
    s = _SPACE_RE.sub(" ", s).strip()
    return s
def _normalize_artist(s: str) -> str:
    """Normalize an artist string: split by separators, normalize each, sort.

    Splits first (on /, &, ;, ×, 、, vs., x), then strips feat./ft./featuring
    from each part individually, so 'A feat. C / B' → ['a', 'b'] not just ['a'].
    """
    # Fold fullwidth → halfwidth and lowercase before splitting so the
    # separator regex sees canonical characters.
    s = unicodedata.normalize("NFKC", s).lower()
    parts = _ARTIST_SEP_RE.split(s)
    # Keep only parts with content left after removing a feat-tail; the set
    # dedupes, and sorting makes the result order-independent.
    normed = sorted(
        {_normalize_for_match(p) for p in parts if _FEAT_RE.sub("", p).strip()}
    )
    # NUL is an unambiguous joiner; when no part survives the filter, fall
    # back to normalizing the whole (already lowercased) string.
    return "\0".join(normed) if normed else _normalize_for_match(s)
from .models import TrackMeta, LyricResult, CacheStatus
def _generate_key(track: TrackMeta, source: str) -> str:
@@ -110,10 +75,12 @@ class CacheEngine:
length INTEGER
)
""")
# Migration: add length column if missing
# Migrations
cols = {r[1] for r in conn.execute("PRAGMA table_info(cache)").fetchall()}
if "length" not in cols:
conn.execute("ALTER TABLE cache ADD COLUMN length INTEGER")
if "confidence" not in cols:
conn.execute("ALTER TABLE cache ADD COLUMN confidence REAL")
conn.commit()
# Read
@@ -130,7 +97,7 @@ class CacheEngine:
with sqlite3.connect(self.db_path) as conn:
row = conn.execute(
"SELECT status, lyrics, source, expires_at, length FROM cache WHERE key = ?",
"SELECT status, lyrics, source, expires_at, length, confidence FROM cache WHERE key = ?",
(key,),
).fetchone()
@@ -138,7 +105,7 @@ class CacheEngine:
logger.debug(f"Cache miss: {source} / {track.display_name()}")
return None
status_str, lyrics, src, expires_at, cached_length = row
status_str, lyrics, src, expires_at, cached_length, confidence = row
# Check TTL expiration
if expires_at and expires_at < int(time.time()):
@@ -160,15 +127,27 @@ class CacheEngine:
f"Cache hit: {source} / {track.display_name()} "
f"[{status_str}, ttl={remaining}s]"
)
status = CacheStatus(status_str)
if confidence is None and status in (
CacheStatus.SUCCESS_SYNCED,
CacheStatus.SUCCESS_UNSYNCED,
):
confidence = (
LEGACY_CONFIDENCE_SYNCED
if status == CacheStatus.SUCCESS_SYNCED
else LEGACY_CONFIDENCE_UNSYNCED
)
return LyricResult(
status=CacheStatus(status_str),
status=status,
lyrics=LRCData(lyrics) if lyrics else None,
source=src,
ttl=remaining,
confidence=confidence,
)
def get_best(self, track: TrackMeta, sources: list[str]) -> Optional[LyricResult]:
"""Return the best cached result across *sources* (synced > unsynced).
"""Return the best cached result across *sources* by confidence.
Skips negative statuses (NOT_FOUND, NETWORK_ERROR) — those are only
consulted per-source to avoid redundant fetches.
@@ -178,9 +157,19 @@ class CacheEngine:
cached = self.get(track, src)
if not cached:
continue
if cached.status == CacheStatus.SUCCESS_SYNCED:
return cached # Can't do better
if cached.status == CacheStatus.SUCCESS_UNSYNCED and best is None:
if cached.status not in (
CacheStatus.SUCCESS_SYNCED,
CacheStatus.SUCCESS_UNSYNCED,
):
continue
if best is None:
best = cached
else:
cached_conf = (
cached.confidence if cached.confidence is not None else 100.0
)
best_conf = best.confidence if best.confidence is not None else 100.0
if cached_conf > best_conf:
best = cached
return best
@@ -207,8 +196,8 @@ class CacheEngine:
conn.execute(
"""INSERT OR REPLACE INTO cache
(key, source, status, lyrics, created_at, expires_at,
artist, title, album, length)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""",
artist, title, album, length, confidence)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""",
(
key,
source,
@@ -220,6 +209,7 @@ class CacheEngine:
track.title,
track.album,
track.length,
result.confidence,
),
)
conn.commit()
@@ -288,7 +278,7 @@ class CacheEngine:
"""Find the best positive (synced/unsynced) cache entry for *track*.
Uses exact metadata match (artist + title + album) across all sources.
Returns synced if available, otherwise unsynced, or None.
Returns the highest-confidence entry, or None.
"""
conditions, params = self._track_where(track)
if not conditions:
@@ -306,19 +296,34 @@ class CacheEngine:
with sqlite3.connect(self.db_path) as conn:
conn.row_factory = sqlite3.Row
rows = conn.execute(
f"SELECT status, lyrics, source FROM cache WHERE {where} "
"ORDER BY CASE status WHEN ? THEN 0 ELSE 1 END LIMIT 1",
params + [CacheStatus.SUCCESS_SYNCED.value],
f"SELECT status, lyrics, source, confidence FROM cache WHERE {where} "
"ORDER BY COALESCE(confidence, "
" CASE status WHEN ? THEN ? ELSE ? END"
") DESC, created_at DESC LIMIT 1",
params
+ [
CacheStatus.SUCCESS_SYNCED.value,
LEGACY_CONFIDENCE_SYNCED,
LEGACY_CONFIDENCE_UNSYNCED,
],
).fetchall()
if not rows:
return None
row = dict(rows[0])
confidence = row["confidence"]
if confidence is None:
confidence = (
LEGACY_CONFIDENCE_SYNCED
if row["status"] == CacheStatus.SUCCESS_SYNCED.value
else LEGACY_CONFIDENCE_UNSYNCED
)
return LyricResult(
status=CacheStatus(row["status"]),
lyrics=LRCData(row["lyrics"]) if row["lyrics"] else None,
source="cache-search",
confidence=confidence,
)
# Fuzzy search
@@ -384,7 +389,8 @@ class CacheEngine:
scored.sort(
key=lambda x: (
x[0],
x[1].get("status") != CacheStatus.SUCCESS_SYNCED.value,
-(x[1].get("confidence") or 0),
-(x[1].get("created_at") or 0),
)
)
matches = [m for _, m in scored]
+3
View File
@@ -428,6 +428,7 @@ def _print_cache_row(row: dict, indent: str = "") -> None:
created = row.get("created_at", 0)
expires = row.get("expires_at")
lyrics = row.get("lyrics", "")
confidence = row.get("confidence")
name = f"{artist} - {title}" if artist and title else row.get("key", "?")
print(f"{indent}[{source}] {name}")
@@ -450,6 +451,8 @@ def _print_cache_row(row: dict, indent: str = "") -> None:
if lyrics:
line_count = len(lyrics.splitlines())
print(f"{indent} Lyrics : {line_count} lines")
if confidence is not None:
print(f"{indent} Confidence: {confidence:.0f}")
def run():
+15
View File
@@ -38,6 +38,21 @@ TTL_NETWORK_ERROR = 3600 # 1 hour
# Search
DURATION_TOLERANCE_MS = 3000 # max duration mismatch for search matching
# Confidence scoring weights (sum to 100)
SCORE_W_TITLE = 40.0
SCORE_W_ARTIST = 30.0
SCORE_W_ALBUM = 10.0
SCORE_W_DURATION = 10.0
SCORE_W_SYNCED = 10.0
# Confidence thresholds
MIN_CONFIDENCE = 25.0 # below this, candidate is rejected
HIGH_CONFIDENCE = 80.0 # at or above this, stop searching early
# Legacy cache rows (no confidence stored) get a base score by sync status
LEGACY_CONFIDENCE_SYNCED = 50.0
LEGACY_CONFIDENCE_UNSYNCED = 40.0
# Spotify related
SPOTIFY_TOKEN_URL = "https://open.spotify.com/api/token"
SPOTIFY_LYRICS_URL = "https://spclient.wg.spotify.com/color-lyrics/v2/track/"
+57 -24
View File
@@ -9,7 +9,7 @@ Fetch pipeline:
1. Check cache for each source in the fallback sequence
2. For sources without a valid cache hit, call the fetcher
3. Cache every result (success, not-found, or error) per source
4. Return the best result (synced > unsynced > None)
4. Return the best result by confidence (highest wins)
"""
from typing import Optional
@@ -19,7 +19,13 @@ from .fetchers import FetcherMethodType, create_fetchers
from .fetchers.base import BaseFetcher
from .cache import CacheEngine
from .lrc import LRCData
from .config import TTL_SYNCED, TTL_UNSYNCED, TTL_NOT_FOUND, TTL_NETWORK_ERROR
from .config import (
TTL_SYNCED,
TTL_UNSYNCED,
TTL_NOT_FOUND,
TTL_NETWORK_ERROR,
HIGH_CONFIDENCE,
)
from .models import TrackMeta, LyricResult, CacheStatus
from .enrichers import enrich_track
@@ -33,6 +39,18 @@ _STATUS_TTL: dict[CacheStatus, Optional[int]] = {
}
def _is_better(new: LyricResult, old: LyricResult) -> bool:
"""Compare two results by confidence only.
Synced/unsynced preference is already baked into the confidence score
(synced bonus in scoring weights), so we don't need a separate tier.
None confidence = trusted = 100.
"""
new_conf = new.confidence if new.confidence is not None else 100.0
old_conf = old.confidence if old.confidence is not None else 100.0
return new_conf > old_conf
class LrcManager:
"""Main entry point for fetching lyrics with caching."""
@@ -72,7 +90,7 @@ class LrcManager:
- Cache miss or unsynced → call fetcher, then cache the result
After all sources are tried, returns the best result found
(synced > unsynced > None).
(highest confidence wins).
"""
track = enrich_track(track)
logger.info(f"Fetching lyrics for: {track.display_name()}")
@@ -81,7 +99,7 @@ class LrcManager:
if not sequence:
return None
# Best result seen so far (synced wins over unsynced)
# Best result seen so far (highest confidence wins)
best_result: Optional[LyricResult] = None
for fetcher in sequence:
@@ -91,17 +109,7 @@ class LrcManager:
if not bypass_cache and not fetcher.self_cached:
cached = self.cache.get(track, source)
if cached:
if cached.status == CacheStatus.SUCCESS_SYNCED:
logger.info(f"[{source}] cache hit: synced lyrics")
return cached
elif cached.status == CacheStatus.SUCCESS_UNSYNCED:
logger.debug(
f"[{source}] cache hit: unsynced lyrics (continuing)"
)
if best_result is None:
best_result = cached
continue # Try next source for synced
elif cached.status in (
if cached.status in (
CacheStatus.NOT_FOUND,
CacheStatus.NETWORK_ERROR,
):
@@ -109,6 +117,23 @@ class LrcManager:
f"[{source}] cache hit: {cached.status.value}, skipping"
)
continue
# Positive cache hit — apply the same confidence evaluation
# as fresh fetches so that low-confidence cached results
# don't block better results from later fetchers.
is_trusted = (
cached.confidence is None
or cached.confidence >= HIGH_CONFIDENCE
)
logger.info(
f"[{source}] cache hit: {cached.status.value}"
f" (confidence={'trusted' if cached.confidence is None else f'{cached.confidence:.0f}'})"
)
if cached.status == CacheStatus.SUCCESS_SYNCED and is_trusted:
return cached
if best_result is None or _is_better(cached, best_result):
best_result = cached
continue
elif not fetcher.self_cached:
logger.debug(f"[{source}] cache bypassed")
@@ -126,20 +151,28 @@ class LrcManager:
self.cache.set(track, source, result, ttl_seconds=ttl)
# Evaluate result
if result.status == CacheStatus.SUCCESS_SYNCED:
logger.info(f"[{source}] got synced lyrics")
if result.status in (
CacheStatus.SUCCESS_SYNCED,
CacheStatus.SUCCESS_UNSYNCED,
):
is_trusted = (
result.confidence is None or result.confidence >= HIGH_CONFIDENCE
)
logger.info(
f"[{source}] got {result.status.value} lyrics"
f" (confidence={'trusted' if result.confidence is None else f'{result.confidence:.0f}'})"
)
# Trusted synced → return immediately
if result.status == CacheStatus.SUCCESS_SYNCED and is_trusted:
return result
if result.status == CacheStatus.SUCCESS_UNSYNCED:
logger.debug(f"[{source}] got unsynced lyrics (continuing)")
if best_result is None:
# Track best result by confidence
if best_result is None or _is_better(result, best_result):
best_result = result
# NOT_FOUND / NETWORK_ERROR: already cached, try next
# Return best available
if best_result:
# Normalize unsynced lyrics: set all timestamps to [00:00.00]
if (
best_result.status == CacheStatus.SUCCESS_UNSYNCED
and best_result.lyrics
@@ -149,10 +182,10 @@ class LrcManager:
lyrics=best_result.lyrics.normalize_unsynced(),
source=best_result.source,
ttl=best_result.ttl,
confidence=best_result.confidence,
)
logger.info(
f"Returning unsynced lyrics from {best_result.source} "
f"(no synced source found)"
f"Returning {best_result.status.value} lyrics from {best_result.source}"
)
else:
logger.info(f"No lyrics found for {track.display_name()}")
+14 -3
View File
@@ -64,16 +64,26 @@ class CacheSearchFetcher(BaseFetcher):
logger.debug(f"Cache-search: no match for {track.display_name()}")
return None
# Pick best: prefer synced, then first available
# Pick best by confidence scoring
candidates = [
SearchCandidate(
item=m,
duration_ms=float(m["length"]) if m.get("length") else None,
is_synced=m.get("status") == CacheStatus.SUCCESS_SYNCED.value,
title=m.get("title"),
artist=m.get("artist"),
album=m.get("album"),
)
for m in matches
if m.get("lyrics")
]
best = select_best(candidates, track.length)
best, confidence = select_best(
candidates,
track.length,
title=track.title,
artist=track.artist,
album=track.album,
)
if not best:
return None
@@ -81,10 +91,11 @@ class CacheSearchFetcher(BaseFetcher):
status = CacheStatus(best["status"])
logger.info(
f"Cache-search: fuzzy hit from [{best.get('source')}] "
f"album={best.get('album')!r} ({status.value})"
f"album={best.get('album')!r} ({status.value}, confidence={confidence:.0f})"
)
return LyricResult(
status=status,
lyrics=LRCData(best["lyrics"]),
source=self.source_name,
confidence=confidence,
)
+18 -3
View File
@@ -126,10 +126,19 @@ class LrclibSearchFetcher(BaseFetcher):
else None,
is_synced=isinstance(item.get("syncedLyrics"), str)
and bool(item["syncedLyrics"].strip()),
title=item.get("trackName"),
artist=item.get("artistName"),
album=item.get("albumName"),
)
for item in candidates
]
best = select_best(mapped, track.length)
best, confidence = select_best(
mapped,
track.length,
title=track.title,
artist=track.artist,
album=track.album,
)
if best is None:
logger.debug("LRCLIB-search: no valid candidate found")
return LyricResult(status=CacheStatus.NOT_FOUND, ttl=TTL_NOT_FOUND)
@@ -139,20 +148,26 @@ class LrclibSearchFetcher(BaseFetcher):
if isinstance(synced, str) and synced.strip():
lyrics = LRCData(synced)
logger.info(f"LRCLIB-search: got synced lyrics ({len(lyrics)} lines)")
logger.info(
f"LRCLIB-search: got synced lyrics ({len(lyrics)} lines, confidence={confidence:.0f})"
)
return LyricResult(
status=CacheStatus.SUCCESS_SYNCED,
lyrics=lyrics,
source=self.source_name,
confidence=confidence,
)
elif isinstance(unsynced, str) and unsynced.strip():
lyrics = LRCData(unsynced)
logger.info(f"LRCLIB-search: got unsynced lyrics ({len(lyrics)} lines)")
logger.info(
f"LRCLIB-search: got unsynced lyrics ({len(lyrics)} lines, confidence={confidence:.0f})"
)
return LyricResult(
status=CacheStatus.SUCCESS_UNSYNCED,
lyrics=lyrics,
source=self.source_name,
ttl=TTL_UNSYNCED,
confidence=confidence,
)
else:
logger.debug("LRCLIB-search: best candidate has empty lyrics")
+34 -15
View File
@@ -43,15 +43,15 @@ class NeteaseFetcher(BaseFetcher):
def is_available(self, track: TrackMeta) -> bool:
return bool(track.title)
def _search(self, track: TrackMeta, limit: int = 10) -> Optional[int]:
"""Search Netease and return the best-matching song ID.
def _search(self, track: TrackMeta, limit: int = 10) -> tuple[Optional[int], float]:
"""Search Netease and return the best-matching song ID with confidence.
When ``track.length`` is available, candidates are ranked by duration
difference and only accepted if within ``DURATION_TOLERANCE_MS``.
"""
query = f"{track.artist or ''} {track.title or ''}".strip()
if not query:
return None
return None, 0.0
logger.debug(f"Netease: searching for '{query}' (limit={limit})")
@@ -70,17 +70,17 @@ class NeteaseFetcher(BaseFetcher):
logger.error(
f"Netease: search returned non-dict: {type(result).__name__}"
)
return None
return None, 0.0
result_body = result.get("result")
if not isinstance(result_body, dict):
logger.debug("Netease: search 'result' field missing or invalid")
return None
return None, 0.0
songs = result_body.get("songs")
if not isinstance(songs, list) or len(songs) == 0:
logger.debug("Netease: search returned 0 results")
return None
return None, 0.0
logger.debug(f"Netease: search returned {len(songs)} candidates")
@@ -90,23 +90,37 @@ class NeteaseFetcher(BaseFetcher):
duration_ms=float(song["dt"])
if isinstance(song.get("dt"), int)
else None,
title=song.get("name"),
artist=", ".join(a.get("name", "") for a in song.get("ar", []))
or None,
album=(song.get("al") or {}).get("name"),
)
for song in songs
if isinstance(song, dict) and song.get("id") is not None
]
best_id = select_best(candidates, track.length)
best_id, confidence = select_best(
candidates,
track.length,
title=track.title,
artist=track.artist,
album=track.album,
)
if best_id is not None:
logger.debug(f"Netease: selected id={best_id}")
return best_id
logger.debug(
f"Netease: selected id={best_id} (confidence={confidence:.0f})"
)
return best_id, confidence
logger.debug("Netease: no suitable candidate found")
return None
return None, 0.0
except Exception as e:
logger.error(f"Netease: search failed: {e}")
return None
return None, 0.0
def _get_lyric(self, song_id: int) -> Optional[LyricResult]:
def _get_lyric(
self, song_id: int, confidence: float = 0.0
) -> Optional[LyricResult]:
"""Fetch lyrics for a given Netease song ID."""
logger.debug(f"Netease: fetching lyrics for song_id={song_id}")
@@ -158,7 +172,12 @@ class NeteaseFetcher(BaseFetcher):
f"Netease: got {status.value} lyrics for song_id={song_id} "
f"({len(lrcdata)} lines)"
)
return LyricResult(status=status, lyrics=lrcdata, source=self.source_name)
return LyricResult(
status=status,
lyrics=lrcdata,
source=self.source_name,
confidence=confidence,
)
except Exception as e:
logger.error(f"Netease: lyric fetch failed for song_id={song_id}: {e}")
@@ -174,9 +193,9 @@ class NeteaseFetcher(BaseFetcher):
return None
logger.info(f"Netease: fetching lyrics for {track.display_name()}")
song_id = self._search(track)
song_id, confidence = self._search(track)
if not song_id:
logger.debug(f"Netease: no match found for {track.display_name()}")
return LyricResult(status=CacheStatus.NOT_FOUND, ttl=TTL_NOT_FOUND)
return self._get_lyric(song_id)
return self._get_lyric(song_id, confidence=confidence)
+31 -14
View File
@@ -35,11 +35,11 @@ class QQMusicFetcher(BaseFetcher):
def is_available(self, track: TrackMeta) -> bool:
return bool(track.title) and bool(QQ_MUSIC_API_URL)
def _search(self, track: TrackMeta, limit: int = 10) -> Optional[str]:
"""Search QQ Music and return the best-matching song MID."""
def _search(self, track: TrackMeta, limit: int = 10) -> tuple[Optional[str], float]:
"""Search QQ Music and return the best-matching song MID with confidence."""
query = f"{track.artist or ''} {track.title or ''}".strip()
if not query:
return None
return None, 0.0
logger.debug(f"QQMusic: searching for '{query}' (limit={limit})")
@@ -54,12 +54,12 @@ class QQMusicFetcher(BaseFetcher):
if data.get("code") != 0:
logger.error(f"QQMusic: search API error: {data}")
return None
return None, 0.0
songs = data.get("data", {}).get("list", [])
if not songs:
logger.debug("QQMusic: search returned 0 results")
return None
return None, 0.0
logger.debug(f"QQMusic: search returned {len(songs)} candidates")
@@ -69,23 +69,35 @@ class QQMusicFetcher(BaseFetcher):
duration_ms=float(song["interval"]) * 1000
if isinstance(song.get("interval"), int)
else None,
title=song.get("name"),
artist=", ".join(s.get("name", "") for s in song.get("singer", []))
or None,
album=(song.get("album") or {}).get("name"),
)
for song in songs
if isinstance(song, dict) and song.get("mid") is not None
]
best_mid = select_best(candidates, track.length)
best_mid, confidence = select_best(
candidates,
track.length,
title=track.title,
artist=track.artist,
album=track.album,
)
if best_mid is not None:
logger.debug(f"QQMusic: selected mid={best_mid}")
return best_mid
logger.debug(
f"QQMusic: selected mid={best_mid} (confidence={confidence:.0f})"
)
return best_mid, confidence
logger.debug("QQMusic: no suitable candidate found")
return None
return None, 0.0
except Exception as e:
logger.error(f"QQMusic: search failed: {e}")
return None
return None, 0.0
def _get_lyric(self, mid: str) -> Optional[LyricResult]:
def _get_lyric(self, mid: str, confidence: float = 0.0) -> Optional[LyricResult]:
"""Fetch lyrics for a given QQ Music song MID."""
logger.debug(f"QQMusic: fetching lyrics for mid={mid}")
@@ -115,7 +127,12 @@ class QQMusicFetcher(BaseFetcher):
f"QQMusic: got {status.value} lyrics for mid={mid} "
f"({len(lrcdata)} lines)"
)
return LyricResult(status=status, lyrics=lrcdata, source=self.source_name)
return LyricResult(
status=status,
lyrics=lrcdata,
source=self.source_name,
confidence=confidence,
)
except Exception as e:
logger.error(f"QQMusic: lyric fetch failed for mid={mid}: {e}")
@@ -135,9 +152,9 @@ class QQMusicFetcher(BaseFetcher):
return None
logger.info(f"QQMusic: fetching lyrics for {track.display_name()}")
mid = self._search(track)
mid, confidence = self._search(track)
if not mid:
logger.debug(f"QQMusic: no match found for {track.display_name()}")
return LyricResult(status=CacheStatus.NOT_FOUND, ttl=TTL_NOT_FOUND)
return self._get_lyric(mid)
return self._get_lyric(mid, confidence=confidence)
+131 -31
View File
@@ -2,13 +2,23 @@
Shared candidate-selection logic for search-based fetchers.
Each fetcher maps its API-specific results to SearchCandidate, then calls
select_best() which handles duration filtering and synced preference uniformly.
select_best() which scores candidates by metadata similarity, duration
proximity, and sync status.
"""
from dataclasses import dataclass
from typing import Generic, Optional, TypeVar
from ..config import DURATION_TOLERANCE_MS
from ..config import (
DURATION_TOLERANCE_MS,
SCORE_W_TITLE as _W_TITLE,
SCORE_W_ARTIST as _W_ARTIST,
SCORE_W_ALBUM as _W_ALBUM,
SCORE_W_DURATION as _W_DURATION,
SCORE_W_SYNCED as _W_SYNCED,
MIN_CONFIDENCE,
)
from ..normalize import normalize_for_match, normalize_artist
T = TypeVar("T")
@@ -21,48 +31,138 @@ class SearchCandidate(Generic[T]):
item: The original API-specific object (dict, ID, etc.)
duration_ms: Track duration in milliseconds, or None if unknown.
is_synced: Whether this candidate is known to have synced lyrics.
title: Candidate track title for similarity scoring.
artist: Candidate artist name for similarity scoring.
album: Candidate album name for similarity scoring.
"""
item: T
duration_ms: Optional[float] = None
is_synced: bool = False
title: Optional[str] = None
artist: Optional[str] = None
album: Optional[str] = None
def _text_similarity(a: str, b: str) -> float:
"""Compare two normalized strings. Returns 0.0-1.0."""
if a == b:
return 1.0
if not a or not b:
return 0.0
# Containment: one is a substring of the other (e.g. "My Love" vs "My Love (Album Version)")
if a in b or b in a:
return min(len(a), len(b)) / max(len(a), len(b))
return 0.0
def _score_candidate(
    c: SearchCandidate[T],
    ref_title: Optional[str],
    ref_artist: Optional[str],
    ref_album: Optional[str],
    ref_length_ms: Optional[int],
) -> float:
    """Score a candidate from 0-100 based on metadata match quality.

    The score has two independent parts:

    1. **Metadata score** — computed only over fields present on at least
       one side, then rescaled to fill the 0-90 range so missing fields
       don't inflate the score.  A field absent on *both* sides is neutral
       (excluded entirely); a field present on exactly one side earns 0
       while its weight still enters the denominator, penalizing the
       asymmetric absence.
    2. **Synced bonus** — a flat 10 pts, applied independently.

    Field weights (before rescaling): title 40, artist 30, album 10,
    duration 10.
    """
    earned = 0.0
    considered = 0.0

    # Title: plain normalized-text similarity.
    if not (ref_title is None and c.title is None):
        considered += _W_TITLE
        if ref_title is not None and c.title is not None:
            earned += _W_TITLE * _text_similarity(
                normalize_for_match(ref_title), normalize_for_match(c.title)
            )

    # Artist: an exact match on the split/sorted artist form wins outright;
    # otherwise fall back to plain text similarity.
    if not (ref_artist is None and c.artist is None):
        considered += _W_ARTIST
        if ref_artist is not None and c.artist is not None:
            if normalize_artist(ref_artist) == normalize_artist(c.artist):
                earned += _W_ARTIST
            else:
                earned += _W_ARTIST * _text_similarity(
                    normalize_for_match(ref_artist),
                    normalize_for_match(c.artist),
                )

    # Album: plain normalized-text similarity.
    if not (ref_album is None and c.album is None):
        considered += _W_ALBUM
        if ref_album is not None and c.album is not None:
            earned += _W_ALBUM * _text_similarity(
                normalize_for_match(ref_album), normalize_for_match(c.album)
            )

    # Duration: linear falloff within tolerance; beyond tolerance earns 0.
    if not (ref_length_ms is None and c.duration_ms is None):
        considered += _W_DURATION
        if ref_length_ms is not None and c.duration_ms is not None:
            delta = abs(c.duration_ms - ref_length_ms)
            if delta <= DURATION_TOLERANCE_MS:
                earned += _W_DURATION * (1.0 - delta / DURATION_TOLERANCE_MS)

    # Rescale the metadata part so whatever fields were comparable span
    # the full 0-90 range.
    metadata_cap = _W_TITLE + _W_ARTIST + _W_ALBUM + _W_DURATION  # 90
    if considered > 0:
        metadata_score = (earned / considered) * metadata_cap
    else:
        # No comparable fields at all — only the synced bonus can apply.
        metadata_score = 0.0

    # Synced bonus is orthogonal to metadata quality.
    return metadata_score + (_W_SYNCED if c.is_synced else 0.0)
def select_best(
candidates: list[SearchCandidate[T]],
track_length_ms: Optional[int] = None,
tolerance_ms: float = DURATION_TOLERANCE_MS,
) -> Optional[T]:
"""Pick the best candidate by duration proximity and sync preference.
*,
title: Optional[str] = None,
artist: Optional[str] = None,
album: Optional[str] = None,
min_confidence: float = MIN_CONFIDENCE,
) -> tuple[Optional[T], float]:
"""Pick the best candidate by confidence scoring.
When track_length_ms is available:
- Filter by tolerance_ms
- Pick closest duration, prefer synced at equal distance
When track_length_ms is unavailable:
- Pick first synced candidate, or first overall
Returns (item, score). Item is None if no candidate scores above min_confidence.
"""
if track_length_ms is not None:
best: Optional[SearchCandidate[T]] = None
best_diff = float("inf")
if not candidates:
return None, 0.0
best_item: Optional[T] = None
best_score = -1.0
for c in candidates:
if c.duration_ms is None:
continue
diff = abs(c.duration_ms - track_length_ms)
if diff > tolerance_ms:
continue
if diff < best_diff or (
diff == best_diff
and c.is_synced
and (best is None or not best.is_synced)
):
best_diff = diff
best = c
s = _score_candidate(c, title, artist, album, track_length_ms)
if s > best_score:
best_score = s
best_item = c.item
return best.item if best is not None else None
if best_score < min_confidence:
return None, best_score
# No duration — prefer synced, fallback to first
for c in candidates:
if c.is_synced:
return c.item
return candidates[0].item if candidates else None
return best_item, best_score
+3
View File
@@ -62,3 +62,6 @@ class LyricResult:
lyrics: Optional[LRCData] = None
source: Optional[str] = None # Which fetcher produced this result
ttl: Optional[int] = None # Hint for cache TTL (seconds)
confidence: Optional[float] = (
None # 0-100 selection confidence (None = exact/trusted)
)
+47
View File
@@ -0,0 +1,47 @@
"""
Shared text normalization utilities for fuzzy matching.
Used by cache key generation, cache search, and candidate selection scoring.
"""
import re
import unicodedata
# Punctuation to strip for fuzzy matching (ASCII + fullwidth + CJK brackets/symbols)
_PUNCT_RE = re.compile(
r"[~!@#$%^&*()_+\-=\[\]{}|;:'\",.<>?/\\`"
r"~!@#$%^&*()_+-=【】{}|;:'",。<>?/\`"
r"「」『』《》〈〉〔〕·•‥…—–]"
)
_SPACE_RE = re.compile(r"\s+")
# feat./ft./featuring and everything after (case-insensitive, word boundary)
_FEAT_RE = re.compile(r"\s*(?:\bfeat\.?\b|\bft\.?\b|\bfeaturing\b).*", re.IGNORECASE)
# Multi-artist separators: /, &, ×, x (surrounded by spaces), ;, 、, vs.
_ARTIST_SEP_RE = re.compile(r"\s*(?:[/&;×、]|\bvs\.?\b|\bx\b)\s*", re.IGNORECASE)
def normalize_for_match(s: str) -> str:
    """Canonicalize *s* for fuzzy comparison.

    Applies NFKC (fullwidth → halfwidth) and lowercasing, drops any
    feat./ft./featuring tail, replaces punctuation with spaces, and
    finally collapses whitespace runs.
    """
    text = unicodedata.normalize("NFKC", s).lower()
    text = _FEAT_RE.sub("", text)
    # Punctuation turns into a space (not deleted) so word boundaries
    # are preserved for the containment check downstream.
    text = _PUNCT_RE.sub(" ", text)
    return _SPACE_RE.sub(" ", text).strip()
def normalize_artist(s: str) -> str:
    """Normalize a (possibly multi-) artist string into a canonical key.

    The string is split on separators (/, &, ;, ×, 、, vs., x) *first*,
    and feat./ft./featuring is stripped per part, so 'A feat. C / B'
    yields ['a', 'b'] rather than just ['a'].  Surviving parts are
    normalized, deduplicated, sorted, and NUL-joined so that ordering
    differences between sources don't matter.
    """
    lowered = unicodedata.normalize("NFKC", s).lower()
    surviving: set[str] = set()
    for part in _ARTIST_SEP_RE.split(lowered):
        # Skip parts that are empty once the feat-tail is removed.
        if _FEAT_RE.sub("", part).strip():
            surviving.add(normalize_for_match(part))
    if not surviving:
        # Nothing survived the split/filter — normalize the whole string.
        return normalize_for_match(lowered)
    return "\0".join(sorted(surviving))
+4 -21
View File
@@ -8,11 +8,10 @@ import pytest
from lrx_cli.cache import (
CacheEngine,
_generate_key,
_normalize_artist,
_normalize_for_match,
)
from lrx_cli.config import DURATION_TOLERANCE_MS
from lrx_cli.models import CacheStatus, LyricResult, TrackMeta
from lrx_cli.lrc import LRCData
def _track(
@@ -39,7 +38,7 @@ def _result(
lyrics: str | None,
source: str,
) -> LyricResult:
return LyricResult(status=status, lyrics=lyrics, source=source)
return LyricResult(status=status, lyrics=LRCData(lyrics), source=source)
@pytest.fixture
@@ -48,22 +47,6 @@ def cache_db(tmp_path: Path) -> CacheEngine:
return CacheEngine(str(db_path))
def test_normalize_for_match_covers_nfkc_punct_feat_and_whitespace() -> None:
text = " feat. SOMEONE "
normalized = _normalize_for_match(text)
assert normalized == "test"
def test_normalize_artist_splits_separators_and_sorts_parts() -> None:
artist = "B / A feat. C; D vs. E × F 、 G"
normalized = _normalize_artist(artist)
assert normalized == "a\0b\0d\0e\0f\0g"
def test_generate_key_uses_spotify_trackid_and_url_fallback() -> None:
spotify_track = _track(
trackid="abc123", artist=None, title=None, album=None, length=None
@@ -157,7 +140,7 @@ def test_get_backfills_missing_length_when_track_provides_it(
assert row[0] == 200000
def test_get_best_prefers_synced_over_unsynced_and_negative(
def test_get_best_prefers_higher_confidence_and_skips_negative(
cache_db: CacheEngine,
) -> None:
track = _track()
@@ -314,7 +297,7 @@ def test_search_by_meta_fuzzy_rules_and_duration_sorting(cache_db: CacheEngine)
sources = [r["source"] for r in rows]
assert "negative" not in sources
assert "far-len" not in sources
# Sorted by duration diff, then synced before unsynced for equal diff.
# Sorted by duration diff, then confidence for equal diff.
assert sources[0] == "seed"
assert sources[1] == "close-synced"
assert sources[2] == "close-unsynced"
+19
View File
@@ -0,0 +1,19 @@
from __future__ import annotations
from lrx_cli.normalize import normalize_for_match, normalize_artist
def test_normalize_for_match_covers_nfkc_punct_feat_and_whitespace() -> None:
    # NOTE(review): the expected value "test" implies the input literal
    # contains characters (likely fullwidth, e.g. "ＴＥＳＴ") that appear
    # to have been lost in this rendering — confirm against the repo file.
    text = " feat. SOMEONE "
    normalized = normalize_for_match(text)
    assert normalized == "test"
def test_normalize_artist_splits_separators_and_sorts_parts() -> None:
    # Every separator style (/, ;, vs., ×, 、) splits the string, the
    # feat-tail is dropped per part, and the survivors come back sorted
    # and NUL-joined.
    raw = "B / A feat. C; D vs. E × F 、 G"
    assert normalize_artist(raw) == "a\0b\0d\0e\0f\0g"
+395 -65
View File
@@ -1,92 +1,422 @@
from __future__ import annotations
from lrx_cli.fetchers.selection import SearchCandidate, select_best
from lrx_cli.fetchers.selection import (
SearchCandidate,
select_best,
_score_candidate,
_text_similarity,
MIN_CONFIDENCE,
)
def test_picks_closest_duration_within_tolerance() -> None:
candidates = [
SearchCandidate(item="far", duration_ms=10000.0),
SearchCandidate(item="close", duration_ms=5100.0),
SearchCandidate(item="exact", duration_ms=5000.0),
def test_text_similarity_exact() -> None:
    """Identical strings score a full 1.0."""
    score = _text_similarity("my love", "my love")
    assert score == 1.0
def test_text_similarity_empty() -> None:
    """An empty string on either side yields 0.0."""
    for left, right in (("", "anything"), ("anything", "")):
        assert _text_similarity(left, right) == 0.0
def test_text_similarity_no_overlap() -> None:
    """Strings with nothing in common score 0.0."""
    similarity = _text_similarity("hello", "world")
    assert similarity == 0.0
def test_text_similarity_containment() -> None:
    """A contained substring scores by length ratio, strictly between 0 and 1."""
    short = "my love"
    longer = "my love album version"
    ratio = _text_similarity(short, longer)
    assert 0.0 < ratio < 1.0
    # Containment score is defined as len(short) / len(longer).
    assert ratio == len(short) / len(longer)
def test_score_perfect_match() -> None:
    """Exact title/artist/album, matching duration, and synced hit the 100 cap."""
    candidate = SearchCandidate(
        item="x",
        duration_ms=232000.0,
        is_synced=True,
        title="My Love",
        artist="Westlife",
        album="Coast To Coast",
    )
    result = _score_candidate(candidate, "My Love", "Westlife", "Coast To Coast", 232000)
    assert result == 100.0
def test_score_no_metadata_match() -> None:
    """A candidate whose metadata disagrees everywhere stays below the threshold."""
    mismatched = SearchCandidate(
        item="x",
        duration_ms=192000.0,
        is_synced=True,
        title="Let My Love Be Your Pillow (Live)",
        artist="Ronnie Milsap",
        album="The Essential Ronnie Milsap",
    )
    result = _score_candidate(mismatched, "My Love", "Westlife", "Coast To Coast", 232000)
    assert result < MIN_CONFIDENCE
def test_score_missing_both_sides_neutral() -> None:
    """With no comparable fields at all, only the synced bonus (10) remains."""
    bare = SearchCandidate(item="x", is_synced=True)
    # Metadata contributes 0 when neither side has any field; synced adds 10.
    assert _score_candidate(bare, None, None, None, None) == 10.0
def test_score_missing_one_side_gives_zero_for_field() -> None:
    """A field present only on the reference counts in the weight but scores zero."""
    # Title (weight 40) is in play but the candidate lacks it: raw 0, rescaled 0.
    # Only the synced bonus of 10 survives.
    titleless = SearchCandidate(item="x", title=None, is_synced=True)
    assert _score_candidate(titleless, "My Love", None, None, None) == 10.0
def test_score_synced_bonus() -> None:
    """Toggling is_synced moves the score by exactly 10 points."""
    unsynced_score = _score_candidate(
        SearchCandidate(item="x", title="My Love", is_synced=False),
        "My Love", None, None, None,
    )
    synced_score = _score_candidate(
        SearchCandidate(item="x", title="My Love", is_synced=True),
        "My Love", None, None, None,
    )
    assert synced_score - unsynced_score == 10.0
def test_score_duration_linear_decay() -> None:
    """Duration-only scoring decays linearly: 90 at exact, 45 at half tolerance, 0 at the edge."""
    ref_len = 232000

    def duration_only_score(offset_ms: float) -> float:
        # Only duration is comparable, so its score is rescaled to fill 0-90.
        c = SearchCandidate(item="x", duration_ms=232000.0 + offset_ms)
        return _score_candidate(c, None, None, None, ref_len)

    assert duration_only_score(0.0) == 90.0
    assert duration_only_score(1500.0) == 45.0
    assert duration_only_score(3000.0) == 0.0
def test_score_case_insensitive_title() -> None:
    """Reference-title casing must not change the score."""
    lower_titled = SearchCandidate(item="x", title="my love")
    scores = {
        _score_candidate(lower_titled, ref_title, None, None, None)
        for ref_title in ("My Love", "my love")
    }
    # Both casings collapse to a single score value.
    assert len(scores) == 1
def test_score_artist_normalization() -> None:
    """A 'feat.' tail on the candidate artist must not break the artist match."""
    featured = SearchCandidate(item="x", artist="Westlife feat. Someone")
    # normalize_artist strips the feat. tail, so both sides become "westlife"
    # and the full artist weight (30) applies with all other fields None.
    assert _score_candidate(featured, None, "Westlife", None, None) >= 30.0
# Reference track: Westlife - My Love, album Coast To Coast, ~232s
# Shared expected-metadata constants for the fixture-driven selection tests below.
_REF_TITLE = "My Love"
_REF_ARTIST = "Westlife"
_REF_ALBUM = "Coast To Coast"
_REF_LENGTH = 232000 # ms
def _lrclib_candidates() -> list[SearchCandidate[dict]]:
    """Fixtures from real LRCLIB search results.

    NOTE(review): stale diff residue from removed duration-only tests had been
    interleaved into this function's body; only the fixture data and its
    SearchCandidate mapping belong here.
    """
    raw = [
        {
            "trackName": "My Love",
            "artistName": "Westlife",
            "albumName": "null",
            "duration": 232.0,
            "synced": True,
        },
        {
            "trackName": "My Love",
            "artistName": "Westlife",
            "albumName": "null",
            "duration": 180.0,
            "synced": True,
        },
        {
            "trackName": "My love",
            "artistName": "Westlife",
            "albumName": "moments",
            "duration": 235.327,
            "synced": True,
        },
        {
            "trackName": "My Love",
            "artistName": "Westlife",
            "albumName": "Unbreakable",
            "duration": 233.026,
            "synced": True,
        },
        {
            "trackName": "My Love",
            "artistName": "Westlife",
            "albumName": "Coast To Coast",
            "duration": 231.847,
            "synced": True,
        },
        {
            "trackName": "Hello My Love",
            "artistName": "Westlife",
            "albumName": "Spectrum",
            "duration": 216.0,
            "synced": True,
        },
        {
            "trackName": "My Love",
            "artistName": "Westlife",
            "albumName": "Hitzone 13",
            "duration": 231.0,
            "synced": True,
        },
    ]
    return [
        SearchCandidate(
            item=r,
            duration_ms=r["duration"] * 1000,
            is_synced=r["synced"],
            title=r["trackName"],
            artist=r["artistName"],
            album=r["albumName"],
        )
        for r in raw
    ]
def _lrclib_noisy_candidates() -> list[SearchCandidate[dict]]:
    """Fixtures from LRCLIB title-only search — lots of wrong artists.

    NOTE(review): stale diff residue from removed legacy tests had been
    interleaved into this function's body; only the fixture data and its
    SearchCandidate mapping belong here.
    """
    raw = [
        {
            "trackName": "Let My Love Be Your Pillow (Live)",
            "artistName": "Ronnie Milsap",
            "albumName": "The Essential Ronnie Milsap",
            "duration": 192.0,
            "synced": True,
        },
        {
            "trackName": "My Love",
            "artistName": "Little Texas",
            "albumName": "Big Time",
            "duration": 248.0,
            "synced": True,
        },
        {
            "trackName": "My Love (Album Version)",
            "artistName": "Little Texas",
            "albumName": "Greatest Hits",
            "duration": 248.0,
            "synced": True,
        },
        {
            "trackName": "My Love - Digitally Remastered '89",
            "artistName": "Sonny James",
            "albumName": "Capitol Collectors Series",
            "duration": 169.0,
            "synced": False,
        },
        {
            "trackName": "My Love",
            "artistName": "Westlife",
            "albumName": "Coast To Coast",
            "duration": 231.847,
            "synced": True,
        },
    ]
    return [
        SearchCandidate(
            item=r,
            duration_ms=r["duration"] * 1000,
            is_synced=r["synced"],
            title=r["trackName"],
            artist=r["artistName"],
            album=r["albumName"],
        )
        for r in raw
    ]
def _netease_candidates() -> list[SearchCandidate[int]]:
    """Fixtures from real Netease search results.

    NOTE(review): stale diff residue (tests asserting the removed non-tuple
    select_best API) had been interleaved into this function's body; only the
    fixture data and its SearchCandidate mapping belong here.
    """
    raw = [
        {
            "id": 2080607,
            "name": "My Love",
            "artist": "Westlife",
            "album": "Unbreakable, Vol. 1 - The Greatest Hits",
            "dt": 231941,
        },
        {
            "id": 2080749,
            "name": "My Love (Radio Edit)",
            "artist": "Westlife",
            "album": "World Of Our Own - No. 1 Hits Plus (EP)",
            "dt": 232920,
        },
        {
            "id": 29809886,
            "name": "My Love (Live)",
            "artist": "Westlife",
            "album": "The Farewell Tour: Live at Croke Park",
            "dt": 262000,
        },
        {
            "id": 572412968,
            "name": "My Love",
            "artist": "Westlife",
            "album": "Pure... Love",
            "dt": 231000,
        },
        {
            "id": 20707713,
            "name": "You Raise Me Up",
            "artist": "Westlife",
            "album": "You Raise Me Up",
            "dt": 241116,
        },
    ]
    return [
        SearchCandidate(
            item=r["id"],
            duration_ms=float(r["dt"]),
            title=r["name"],
            artist=r["artist"],
            album=r["album"],
        )
        for r in raw
    ]
def test_lrclib_picks_exact_album_match() -> None:
    """With full metadata, should pick the Coast To Coast entry.

    NOTE(review): the stale legacy test that followed this one asserted the
    removed non-tuple select_best contract (== "first") and has been dropped.
    """
    candidates = _lrclib_candidates()
    best, score = select_best(
        candidates,
        _REF_LENGTH,
        title=_REF_TITLE,
        artist=_REF_ARTIST,
        album=_REF_ALBUM,
    )
    assert best is not None
    assert best["albumName"] == "Coast To Coast"
    # A confident pick must clear the acceptance threshold.
    assert score >= MIN_CONFIDENCE
def test_lrclib_rejects_wrong_title() -> None:
    """A near-title like 'Hello My Love' must lose to the real 'My Love' entries."""
    best, _score = select_best(
        _lrclib_candidates(),
        _REF_LENGTH,
        title=_REF_TITLE,
        artist=_REF_ARTIST,
        album=_REF_ALBUM,
    )
    assert best is not None
    assert best["trackName"] != "Hello My Love"
def test_lrclib_noisy_picks_westlife() -> None:
    """Artist matching should pull the Westlife entry out of noisy title-only hits."""
    best, _score = select_best(
        _lrclib_noisy_candidates(),
        _REF_LENGTH,
        title=_REF_TITLE,
        artist=_REF_ARTIST,
        album=_REF_ALBUM,
    )
    assert best is not None
    assert best["artistName"] == "Westlife"
def test_lrclib_noisy_rejects_all_without_ref_artist() -> None:
    """Without ref artist, wrong-artist candidates may still win, but right title should rank higher."""
    candidates = _lrclib_noisy_candidates()
    best, _ = select_best(
        candidates,
        _REF_LENGTH,
        title=_REF_TITLE,
    )
    # Should pick a "My Love" over "Let My Love Be Your Pillow".
    assert best is not None
    # startswith already covers exact equality, so the old
    # `"My Love" == x or x.startswith("My Love")` was redundant.
    assert best["trackName"].startswith("My Love")
def test_netease_picks_closest_duration() -> None:
    """Among comparable Westlife entries, the smallest duration diff wins."""
    best, _score = select_best(
        _netease_candidates(),
        _REF_LENGTH,
        title=_REF_TITLE,
        artist=_REF_ARTIST,
        album=_REF_ALBUM,
    )
    # Track 2080607 runs 231941 ms — only 59 ms off the 232000 ms reference.
    assert best == 2080607
def test_netease_rejects_wrong_title() -> None:
    """A same-artist but wrong-title track ('You Raise Me Up') must not win."""
    best, _score = select_best(
        _netease_candidates(),
        _REF_LENGTH,
        title=_REF_TITLE,
        artist=_REF_ARTIST,
    )
    assert best != 20707713
def test_netease_without_ref_metadata_rejects_below_confidence() -> None:
    """Without any ref metadata, candidates with one-sided fields score low and get rejected."""
    best, _score = select_best(_netease_candidates(), _REF_LENGTH)
    # Ref has None for title/artist/album, so those fields contribute 0;
    # duration alone (max 10) stays below MIN_CONFIDENCE (25).
    assert best is None
# --- Edge cases ---
def test_empty_candidates_returns_none() -> None:
    """An empty candidate list yields (None, 0.0) regardless of track length.

    NOTE(review): diff residue left old-API asserts (`is None`) and two stale
    single-candidate tests interleaved here; only the tuple-API asserts remain.
    """
    assert select_best([], track_length_ms=5000) == (None, 0.0)
    assert select_best([], track_length_ms=None) == (None, 0.0)
def test_all_below_min_confidence_returns_none() -> None:
    """A raised min_confidence threshold rejects even the only candidate."""
    lone = SearchCandidate(
        item="x",
        title="Completely Different Song",
        artist="Unknown Artist",
        album="Unknown Album",
        duration_ms=999999.0,
    )
    best, _score = select_best(
        [lone],
        232000,
        title="My Love",
        artist="Westlife",
        album="Coast To Coast",
        min_confidence=90.0,
    )
    assert best is None
def test_generic_type_preserved() -> None:
    """select_best returns the same type as SearchCandidate.item.

    NOTE(review): old non-tuple-API lines were interleaved with the new ones in
    the diff rendering; this is the reconstructed tuple-API version.
    """
    int_candidates = [SearchCandidate(item=42, duration_ms=5000.0, title="x")]
    best_int, _ = select_best(int_candidates, 5000, title="x")
    assert best_int == 42
    dict_candidates = [SearchCandidate(item={"id": 1}, title="x")]
    best_dict, _ = select_best(dict_candidates, title="x")
    assert best_dict == {"id": 1}