From 2df167e31d4bfaafec65a8bca039d26a0ad73549 Mon Sep 17 00:00:00 2001 From: Uyanide Date: Thu, 2 Apr 2026 04:26:19 +0200 Subject: [PATCH] feat: evaluate fetch results with "confidence" --- .vscode/settings.json | 3 - lrx_cli/cache.py | 120 ++++---- lrx_cli/cli.py | 3 + lrx_cli/config.py | 15 + lrx_cli/core.py | 83 ++++-- lrx_cli/fetchers/cache_search.py | 17 +- lrx_cli/fetchers/lrclib_search.py | 21 +- lrx_cli/fetchers/netease.py | 49 +++- lrx_cli/fetchers/qqmusic.py | 45 ++- lrx_cli/fetchers/selection.py | 164 ++++++++--- lrx_cli/models.py | 3 + lrx_cli/normalize.py | 47 +++ tests/test_cache.py | 25 +- tests/test_normalize.py | 19 ++ tests/test_selection.py | 460 +++++++++++++++++++++++++----- 15 files changed, 836 insertions(+), 238 deletions(-) delete mode 100644 .vscode/settings.json create mode 100644 lrx_cli/normalize.py create mode 100644 tests/test_normalize.py diff --git a/.vscode/settings.json b/.vscode/settings.json deleted file mode 100644 index 6760576..0000000 --- a/.vscode/settings.json +++ /dev/null @@ -1,3 +0,0 @@ -{ - "python-envs.defaultEnvManager": "ms-python.python:venv" -} diff --git a/lrx_cli/cache.py b/lrx_cli/cache.py index 4471855..c5b2b1e 100644 --- a/lrx_cli/cache.py +++ b/lrx_cli/cache.py @@ -4,56 +4,21 @@ Date: 2026-03-25 10:18:03 Description: SQLite-based lyric cache with per-source storage and TTL expiration """ -import re import sqlite3 import hashlib import time -import unicodedata from typing import Optional from loguru import logger from .lrc import LRCData -from .config import DURATION_TOLERANCE_MS -from .models import TrackMeta, LyricResult, CacheStatus - -# Punctuation to strip for fuzzy matching (ASCII + fullwidth + CJK brackets/symbols) -_PUNCT_RE = re.compile( - r"[~!@#$%^&*()_+\-=\[\]{}|;:'\",.<>?/\\`" - r"~!@#$%^&*()_+-=【】{}|;:'",。<>?/\`" - r"「」『』《》〈〉〔〕·•‥…—–]" +from .normalize import normalize_for_match as _normalize_for_match +from .normalize import normalize_artist as _normalize_artist +from .config import ( + DURATION_TOLERANCE_MS, + LEGACY_CONFIDENCE_SYNCED, + LEGACY_CONFIDENCE_UNSYNCED, ) -_SPACE_RE = re.compile(r"\s+") -# feat./ft./featuring and everything after (case-insensitive, word boundary) -_FEAT_RE = re.compile(r"\s*(?:\bfeat\.?\b|\bft\.?\b|\bfeaturing\b).*", re.IGNORECASE) -# Multi-artist separators: /, &, ×, x (surrounded by spaces), ;, 、, vs. -_ARTIST_SEP_RE = re.compile(r"\s*(?:[/&;×、]|\bvs\.?\b|\bx\b)\s*", re.IGNORECASE) - - -def _normalize_for_match(s: str) -> str: - """Normalize a string for fuzzy comparison. - - Lowercases, NFKC-normalizes (fullwidth → halfwidth), strips punctuation, - and collapses whitespace. - """ - s = unicodedata.normalize("NFKC", s).lower() - s = _FEAT_RE.sub("", s) - s = _PUNCT_RE.sub(" ", s) - s = _SPACE_RE.sub(" ", s).strip() - return s - - -def _normalize_artist(s: str) -> str: - """Normalize an artist string: split by separators, normalize each, sort. - - Splits first (on /, &, ;, ×, 、, vs., x), then strips feat./ft./featuring - from each part individually, so 'A feat. C / B' → ['a', 'b'] not just ['a']. - """ - s = unicodedata.normalize("NFKC", s).lower() - parts = _ARTIST_SEP_RE.split(s) - normed = sorted( - {_normalize_for_match(p) for p in parts if _FEAT_RE.sub("", p).strip()} - ) - return "\0".join(normed) if normed else _normalize_for_match(s) +from .models import TrackMeta, LyricResult, CacheStatus def _generate_key(track: TrackMeta, source: str) -> str: @@ -110,10 +75,12 @@ class CacheEngine: length INTEGER ) """) - # Migration: add length column if missing + # Migrations cols = {r[1] for r in conn.execute("PRAGMA table_info(cache)").fetchall()} if "length" not in cols: conn.execute("ALTER TABLE cache ADD COLUMN length INTEGER") + if "confidence" not in cols: + conn.execute("ALTER TABLE cache ADD COLUMN confidence REAL") conn.commit() # Read @@ -130,7 +97,7 @@ class CacheEngine: with sqlite3.connect(self.db_path) as conn: row = conn.execute( - "SELECT status, lyrics, source, expires_at, length FROM cache WHERE key = ?", + "SELECT status, lyrics, source, expires_at, length, confidence FROM cache WHERE key = ?", (key,), ).fetchone() @@ -138,7 +105,7 @@ class CacheEngine: logger.debug(f"Cache miss: {source} / {track.display_name()}") return None - status_str, lyrics, src, expires_at, cached_length = row + status_str, lyrics, src, expires_at, cached_length, confidence = row # Check TTL expiration if expires_at and expires_at < int(time.time()): @@ -160,15 +127,27 @@ class CacheEngine: f"Cache hit: {source} / {track.display_name()} " f"[{status_str}, ttl={remaining}s]" ) + status = CacheStatus(status_str) + if confidence is None and status in ( + CacheStatus.SUCCESS_SYNCED, + CacheStatus.SUCCESS_UNSYNCED, + ): + confidence = ( + LEGACY_CONFIDENCE_SYNCED + if status == CacheStatus.SUCCESS_SYNCED + else LEGACY_CONFIDENCE_UNSYNCED + ) + return LyricResult( - status=CacheStatus(status_str), + status=status, lyrics=LRCData(lyrics) if lyrics else None, source=src, ttl=remaining, + confidence=confidence, ) def get_best(self, track: TrackMeta, sources: list[str]) -> Optional[LyricResult]: - """Return the best cached result across *sources* (synced > unsynced). + """Return the best cached result across *sources* by confidence. Skips negative statuses (NOT_FOUND, NETWORK_ERROR) — those are only consulted per-source to avoid redundant fetches. @@ -178,10 +157,20 @@ class CacheEngine: cached = self.get(track, src) if not cached: continue - if cached.status == CacheStatus.SUCCESS_SYNCED: - return cached # Can't do better - if cached.status == CacheStatus.SUCCESS_UNSYNCED and best is None: + if cached.status not in ( + CacheStatus.SUCCESS_SYNCED, + CacheStatus.SUCCESS_UNSYNCED, + ): + continue + if best is None: best = cached + else: + cached_conf = ( + cached.confidence if cached.confidence is not None else 100.0 + ) + best_conf = best.confidence if best.confidence is not None else 100.0 + if cached_conf > best_conf: + best = cached return best # Write @@ -207,8 +196,8 @@ class CacheEngine: conn.execute( """INSERT OR REPLACE INTO cache (key, source, status, lyrics, created_at, expires_at, - artist, title, album, length) - VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""", + artist, title, album, length, confidence) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""", ( key, source, @@ -220,6 +209,7 @@ class CacheEngine: track.title, track.album, track.length, + result.confidence, ), ) conn.commit() @@ -288,7 +278,7 @@ class CacheEngine: """Find the best positive (synced/unsynced) cache entry for *track*. Uses exact metadata match (artist + title + album) across all sources. - Returns synced if available, otherwise unsynced, or None. + Returns the highest-confidence entry, or None. """ conditions, params = self._track_where(track) if not conditions: @@ -306,19 +296,34 @@ class CacheEngine: with sqlite3.connect(self.db_path) as conn: conn.row_factory = sqlite3.Row rows = conn.execute( - f"SELECT status, lyrics, source FROM cache WHERE {where} " - "ORDER BY CASE status WHEN ? THEN 0 ELSE 1 END LIMIT 1", - params + [CacheStatus.SUCCESS_SYNCED.value], + f"SELECT status, lyrics, source, confidence FROM cache WHERE {where} " + "ORDER BY COALESCE(confidence, " + " CASE status WHEN ? THEN ? ELSE ? END" + ") DESC, created_at DESC LIMIT 1", + params + + [ + CacheStatus.SUCCESS_SYNCED.value, + LEGACY_CONFIDENCE_SYNCED, + LEGACY_CONFIDENCE_UNSYNCED, + ], ).fetchall() if not rows: return None row = dict(rows[0]) + confidence = row["confidence"] + if confidence is None: + confidence = ( + LEGACY_CONFIDENCE_SYNCED + if row["status"] == CacheStatus.SUCCESS_SYNCED.value + else LEGACY_CONFIDENCE_UNSYNCED + ) return LyricResult( status=CacheStatus(row["status"]), lyrics=LRCData(row["lyrics"]) if row["lyrics"] else None, source="cache-search", + confidence=confidence, ) # Fuzzy search @@ -384,7 +389,8 @@ class CacheEngine: scored.sort( key=lambda x: ( x[0], - x[1].get("status") != CacheStatus.SUCCESS_SYNCED.value, + -(x[1].get("confidence") or 0), + -(x[1].get("created_at") or 0), ) ) matches = [m for _, m in scored] diff --git a/lrx_cli/cli.py b/lrx_cli/cli.py index 3063c07..8568ee0 100644 --- a/lrx_cli/cli.py +++ b/lrx_cli/cli.py @@ -428,6 +428,7 @@ def _print_cache_row(row: dict, indent: str = "") -> None: created = row.get("created_at", 0) expires = row.get("expires_at") lyrics = row.get("lyrics", "") + confidence = row.get("confidence") name = f"{artist} - {title}" if artist and title else row.get("key", "?") print(f"{indent}[{source}] {name}") @@ -450,6 +451,8 @@ def _print_cache_row(row: dict, indent: str = "") -> None: if lyrics: line_count = len(lyrics.splitlines()) print(f"{indent} Lyrics : {line_count} lines") + if confidence is not None: + print(f"{indent} Confidence: {confidence:.0f}") def run(): diff --git a/lrx_cli/config.py b/lrx_cli/config.py index 7706ed8..f5d62f2 100644 --- a/lrx_cli/config.py +++ b/lrx_cli/config.py @@ -38,6 +38,21 @@ TTL_NETWORK_ERROR = 3600 # 1 hour # Search DURATION_TOLERANCE_MS = 3000 # max duration mismatch for search matching +# Confidence scoring weights (sum to 100) +SCORE_W_TITLE = 40.0 +SCORE_W_ARTIST = 30.0 +SCORE_W_ALBUM = 10.0 +SCORE_W_DURATION = 10.0 +SCORE_W_SYNCED = 10.0 + +# Confidence thresholds +MIN_CONFIDENCE = 25.0 # below this, candidate is rejected +HIGH_CONFIDENCE = 80.0 # at or above this, stop searching early + +# Legacy cache rows (no confidence stored) get a base score by sync status +LEGACY_CONFIDENCE_SYNCED = 50.0 +LEGACY_CONFIDENCE_UNSYNCED = 40.0 + # Spotify related SPOTIFY_TOKEN_URL = "https://open.spotify.com/api/token" SPOTIFY_LYRICS_URL = "https://spclient.wg.spotify.com/color-lyrics/v2/track/" diff --git a/lrx_cli/core.py b/lrx_cli/core.py index 8ecbc1f..6ef5182 100644 --- a/lrx_cli/core.py +++ b/lrx_cli/core.py @@ -9,7 +9,7 @@ Fetch pipeline: 1. Check cache for each source in the fallback sequence 2. For sources without a valid cache hit, call the fetcher 3. Cache every result (success, not-found, or error) per source - 4. Return the best result (synced > unsynced > None) + 4. Return the best result by confidence (highest wins) """ from typing import Optional @@ -19,7 +19,13 @@ from .fetchers import FetcherMethodType, create_fetchers from .fetchers.base import BaseFetcher from .cache import CacheEngine from .lrc import LRCData -from .config import TTL_SYNCED, TTL_UNSYNCED, TTL_NOT_FOUND, TTL_NETWORK_ERROR +from .config import ( + TTL_SYNCED, + TTL_UNSYNCED, + TTL_NOT_FOUND, + TTL_NETWORK_ERROR, + HIGH_CONFIDENCE, +) from .models import TrackMeta, LyricResult, CacheStatus from .enrichers import enrich_track @@ -33,6 +39,18 @@ _STATUS_TTL: dict[CacheStatus, Optional[int]] = { } +def _is_better(new: LyricResult, old: LyricResult) -> bool: + """Compare two results by confidence only. + + Synced/unsynced preference is already baked into the confidence score + (synced bonus in scoring weights), so we don't need a separate tier. + None confidence = trusted = 100. + """ + new_conf = new.confidence if new.confidence is not None else 100.0 + old_conf = old.confidence if old.confidence is not None else 100.0 + return new_conf > old_conf + + class LrcManager: """Main entry point for fetching lyrics with caching.""" @@ -72,7 +90,7 @@ class LrcManager: - Cache miss or unsynced → call fetcher, then cache the result After all sources are tried, returns the best result found - (synced > unsynced > None). + (highest confidence wins). """ track = enrich_track(track) logger.info(f"Fetching lyrics for: {track.display_name()}") @@ -81,7 +99,7 @@ class LrcManager: if not sequence: return None - # Best result seen so far (synced wins over unsynced) + # Best result seen so far (highest confidence wins) best_result: Optional[LyricResult] = None for fetcher in sequence: @@ -91,17 +109,7 @@ class LrcManager: if not bypass_cache and not fetcher.self_cached: cached = self.cache.get(track, source) if cached: - if cached.status == CacheStatus.SUCCESS_SYNCED: - logger.info(f"[{source}] cache hit: synced lyrics") - return cached - elif cached.status == CacheStatus.SUCCESS_UNSYNCED: - logger.debug( - f"[{source}] cache hit: unsynced lyrics (continuing)" - ) - if best_result is None: - best_result = cached - continue # Try next source for synced - elif cached.status in ( + if cached.status in ( CacheStatus.NOT_FOUND, CacheStatus.NETWORK_ERROR, ): @@ -109,6 +117,23 @@ class LrcManager: f"[{source}] cache hit: {cached.status.value}, skipping" ) continue + + # Positive cache hit — apply the same confidence evaluation + # as fresh fetches so that low-confidence cached results + # don't block better results from later fetchers. + is_trusted = ( + cached.confidence is None + or cached.confidence >= HIGH_CONFIDENCE + ) + logger.info( + f"[{source}] cache hit: {cached.status.value}" + f" (confidence={'trusted' if cached.confidence is None else f'{cached.confidence:.0f}'})" + ) + if cached.status == CacheStatus.SUCCESS_SYNCED and is_trusted: + return cached + if best_result is None or _is_better(cached, best_result): + best_result = cached + continue elif not fetcher.self_cached: logger.debug(f"[{source}] cache bypassed") @@ -126,20 +151,28 @@ class LrcManager: self.cache.set(track, source, result, ttl_seconds=ttl) # Evaluate result - if result.status == CacheStatus.SUCCESS_SYNCED: - logger.info(f"[{source}] got synced lyrics") - return result - - if result.status == CacheStatus.SUCCESS_UNSYNCED: - logger.debug(f"[{source}] got unsynced lyrics (continuing)") - if best_result is None: + if result.status in ( + CacheStatus.SUCCESS_SYNCED, + CacheStatus.SUCCESS_UNSYNCED, + ): + is_trusted = ( + result.confidence is None or result.confidence >= HIGH_CONFIDENCE + ) + logger.info( + f"[{source}] got {result.status.value} lyrics" + f" (confidence={'trusted' if result.confidence is None else f'{result.confidence:.0f}'})" + ) + # Trusted synced → return immediately + if result.status == CacheStatus.SUCCESS_SYNCED and is_trusted: + return result + # Track best result by confidence + if best_result is None or _is_better(result, best_result): best_result = result # NOT_FOUND / NETWORK_ERROR: already cached, try next # Return best available if best_result: - # Normalize unsynced lyrics: set all timestamps to [00:00.00] if ( best_result.status == CacheStatus.SUCCESS_UNSYNCED and best_result.lyrics @@ -149,10 +182,10 @@ class LrcManager: lyrics=best_result.lyrics.normalize_unsynced(), source=best_result.source, ttl=best_result.ttl, + confidence=best_result.confidence, ) logger.info( - f"Returning unsynced lyrics from {best_result.source} " - f"(no synced source found)" + f"Returning {best_result.status.value} lyrics from {best_result.source}" ) else: logger.info(f"No lyrics found for {track.display_name()}") diff --git a/lrx_cli/fetchers/cache_search.py b/lrx_cli/fetchers/cache_search.py index 0b4c318..119ab3a 100644 --- a/lrx_cli/fetchers/cache_search.py +++ b/lrx_cli/fetchers/cache_search.py @@ -64,16 +64,26 @@ class CacheSearchFetcher(BaseFetcher): logger.debug(f"Cache-search: no match for {track.display_name()}") return None - # Pick best: prefer synced, then first available + # Pick best by confidence scoring candidates = [ SearchCandidate( item=m, + duration_ms=float(m["length"]) if m.get("length") else None, is_synced=m.get("status") == CacheStatus.SUCCESS_SYNCED.value, + title=m.get("title"), + artist=m.get("artist"), + album=m.get("album"), ) for m in matches if m.get("lyrics") ] - best = select_best(candidates, track.length) + best, confidence = select_best( + candidates, + track.length, + title=track.title, + artist=track.artist, + album=track.album, + ) if not best: return None @@ -81,10 +91,11 @@ class CacheSearchFetcher(BaseFetcher): status = CacheStatus(best["status"]) logger.info( f"Cache-search: fuzzy hit from [{best.get('source')}] " - f"album={best.get('album')!r} ({status.value})" + f"album={best.get('album')!r} ({status.value}, confidence={confidence:.0f})" ) return LyricResult( status=status, lyrics=LRCData(best["lyrics"]), source=self.source_name, + confidence=confidence, ) diff --git a/lrx_cli/fetchers/lrclib_search.py b/lrx_cli/fetchers/lrclib_search.py index 3bbc372..a09820d 100644 --- a/lrx_cli/fetchers/lrclib_search.py +++ b/lrx_cli/fetchers/lrclib_search.py @@ -126,10 +126,19 @@ class LrclibSearchFetcher(BaseFetcher): else None, is_synced=isinstance(item.get("syncedLyrics"), str) and bool(item["syncedLyrics"].strip()), + title=item.get("trackName"), + artist=item.get("artistName"), + album=item.get("albumName"), ) for item in candidates ] - best = select_best(mapped, track.length) + best, confidence = select_best( + mapped, + track.length, + title=track.title, + artist=track.artist, + album=track.album, + ) if best is None: logger.debug("LRCLIB-search: no valid candidate found") return LyricResult(status=CacheStatus.NOT_FOUND, ttl=TTL_NOT_FOUND) @@ -139,20 +148,26 @@ class LrclibSearchFetcher(BaseFetcher): if isinstance(synced, str) and synced.strip(): lyrics = LRCData(synced) - logger.info(f"LRCLIB-search: got synced lyrics ({len(lyrics)} lines)") + logger.info( + f"LRCLIB-search: got synced lyrics ({len(lyrics)} lines, confidence={confidence:.0f})" + ) return LyricResult( status=CacheStatus.SUCCESS_SYNCED, lyrics=lyrics, source=self.source_name, + confidence=confidence, ) elif isinstance(unsynced, str) and unsynced.strip(): lyrics = LRCData(unsynced) - logger.info(f"LRCLIB-search: got unsynced lyrics ({len(lyrics)} lines)") + logger.info( + f"LRCLIB-search: got unsynced lyrics ({len(lyrics)} lines, confidence={confidence:.0f})" + ) return LyricResult( status=CacheStatus.SUCCESS_UNSYNCED, lyrics=lyrics, source=self.source_name, ttl=TTL_UNSYNCED, + confidence=confidence, ) else: logger.debug("LRCLIB-search: best candidate has empty lyrics") diff --git a/lrx_cli/fetchers/netease.py b/lrx_cli/fetchers/netease.py index 428fd5b..d054a3f 100644 --- a/lrx_cli/fetchers/netease.py +++ b/lrx_cli/fetchers/netease.py @@ -43,15 +43,15 @@ class NeteaseFetcher(BaseFetcher): def is_available(self, track: TrackMeta) -> bool: return bool(track.title) - def _search(self, track: TrackMeta, limit: int = 10) -> Optional[int]: - """Search Netease and return the best-matching song ID. + def _search(self, track: TrackMeta, limit: int = 10) -> tuple[Optional[int], float]: + """Search Netease and return the best-matching song ID with confidence. When ``track.length`` is available, candidates are ranked by duration difference and only accepted if within ``DURATION_TOLERANCE_MS``. """ query = f"{track.artist or ''} {track.title or ''}".strip() if not query: - return None + return None, 0.0 logger.debug(f"Netease: searching for '{query}' (limit={limit})") @@ -70,17 +70,17 @@ class NeteaseFetcher(BaseFetcher): logger.error( f"Netease: search returned non-dict: {type(result).__name__}" ) - return None + return None, 0.0 result_body = result.get("result") if not isinstance(result_body, dict): logger.debug("Netease: search 'result' field missing or invalid") - return None + return None, 0.0 songs = result_body.get("songs") if not isinstance(songs, list) or len(songs) == 0: logger.debug("Netease: search returned 0 results") - return None + return None, 0.0 logger.debug(f"Netease: search returned {len(songs)} candidates") @@ -90,23 +90,37 @@ class NeteaseFetcher(BaseFetcher): duration_ms=float(song["dt"]) if isinstance(song.get("dt"), int) else None, + title=song.get("name"), + artist=", ".join(a.get("name", "") for a in song.get("ar", [])) + or None, + album=(song.get("al") or {}).get("name"), ) for song in songs if isinstance(song, dict) and song.get("id") is not None ] - best_id = select_best(candidates, track.length) + best_id, confidence = select_best( + candidates, + track.length, + title=track.title, + artist=track.artist, + album=track.album, + ) if best_id is not None: - logger.debug(f"Netease: selected id={best_id}") - return best_id + logger.debug( + f"Netease: selected id={best_id} (confidence={confidence:.0f})" + ) + return best_id, confidence logger.debug("Netease: no suitable candidate found") - return None + return None, 0.0 except Exception as e: logger.error(f"Netease: search failed: {e}") - return None + return None, 0.0 - def _get_lyric(self, song_id: int) -> Optional[LyricResult]: + def _get_lyric( + self, song_id: int, confidence: float = 0.0 + ) -> Optional[LyricResult]: """Fetch lyrics for a given Netease song ID.""" logger.debug(f"Netease: fetching lyrics for song_id={song_id}") @@ -158,7 +172,12 @@ class NeteaseFetcher(BaseFetcher): f"Netease: got {status.value} lyrics for song_id={song_id} " f"({len(lrcdata)} lines)" ) - return LyricResult(status=status, lyrics=lrcdata, source=self.source_name) + return LyricResult( + status=status, + lyrics=lrcdata, + source=self.source_name, + confidence=confidence, + ) except Exception as e: logger.error(f"Netease: lyric fetch failed for song_id={song_id}: {e}") @@ -174,9 +193,9 @@ class NeteaseFetcher(BaseFetcher): return None logger.info(f"Netease: fetching lyrics for {track.display_name()}") - song_id = self._search(track) + song_id, confidence = self._search(track) if not song_id: logger.debug(f"Netease: no match found for {track.display_name()}") return LyricResult(status=CacheStatus.NOT_FOUND, ttl=TTL_NOT_FOUND) - return self._get_lyric(song_id) + return self._get_lyric(song_id, confidence=confidence) diff --git a/lrx_cli/fetchers/qqmusic.py b/lrx_cli/fetchers/qqmusic.py index b38ed78..dfaef78 100644 --- a/lrx_cli/fetchers/qqmusic.py +++ b/lrx_cli/fetchers/qqmusic.py @@ -35,11 +35,11 @@ class QQMusicFetcher(BaseFetcher): def is_available(self, track: TrackMeta) -> bool: return bool(track.title) and bool(QQ_MUSIC_API_URL) - def _search(self, track: TrackMeta, limit: int = 10) -> Optional[str]: - """Search QQ Music and return the best-matching song MID.""" + def _search(self, track: TrackMeta, limit: int = 10) -> tuple[Optional[str], float]: + """Search QQ Music and return the best-matching song MID with confidence.""" query = f"{track.artist or ''} {track.title or ''}".strip() if not query: - return None + return None, 0.0 logger.debug(f"QQMusic: searching for '{query}' (limit={limit})") @@ -54,12 +54,12 @@ class QQMusicFetcher(BaseFetcher): if data.get("code") != 0: logger.error(f"QQMusic: search API error: {data}") - return None + return None, 0.0 songs = data.get("data", {}).get("list", []) if not songs: logger.debug("QQMusic: search returned 0 results") - return None + return None, 0.0 logger.debug(f"QQMusic: search returned {len(songs)} candidates") @@ -69,23 +69,35 @@ class QQMusicFetcher(BaseFetcher): duration_ms=float(song["interval"]) * 1000 if isinstance(song.get("interval"), int) else None, + title=song.get("name"), + artist=", ".join(s.get("name", "") for s in song.get("singer", [])) + or None, + album=(song.get("album") or {}).get("name"), ) for song in songs if isinstance(song, dict) and song.get("mid") is not None ] - best_mid = select_best(candidates, track.length) + best_mid, confidence = select_best( + candidates, + track.length, + title=track.title, + artist=track.artist, + album=track.album, + ) if best_mid is not None: - logger.debug(f"QQMusic: selected mid={best_mid}") - return best_mid + logger.debug( + f"QQMusic: selected mid={best_mid} (confidence={confidence:.0f})" + ) + return best_mid, confidence logger.debug("QQMusic: no suitable candidate found") - return None + return None, 0.0 except Exception as e: logger.error(f"QQMusic: search failed: {e}") - return None + return None, 0.0 - def _get_lyric(self, mid: str) -> Optional[LyricResult]: + def _get_lyric(self, mid: str, confidence: float = 0.0) -> Optional[LyricResult]: """Fetch lyrics for a given QQ Music song MID.""" logger.debug(f"QQMusic: fetching lyrics for mid={mid}") @@ -115,7 +127,12 @@ class QQMusicFetcher(BaseFetcher): f"QQMusic: got {status.value} lyrics for mid={mid} " f"({len(lrcdata)} lines)" ) - return LyricResult(status=status, lyrics=lrcdata, source=self.source_name) + return LyricResult( + status=status, + lyrics=lrcdata, + source=self.source_name, + confidence=confidence, + ) except Exception as e: logger.error(f"QQMusic: lyric fetch failed for mid={mid}: {e}") @@ -135,9 +152,9 @@ class QQMusicFetcher(BaseFetcher): return None logger.info(f"QQMusic: fetching lyrics for {track.display_name()}") - mid = self._search(track) + mid, confidence = self._search(track) if not mid: logger.debug(f"QQMusic: no match found for {track.display_name()}") return LyricResult(status=CacheStatus.NOT_FOUND, ttl=TTL_NOT_FOUND) - return self._get_lyric(mid) + return self._get_lyric(mid, confidence=confidence) diff --git a/lrx_cli/fetchers/selection.py b/lrx_cli/fetchers/selection.py index 1753ae4..cb2f76d 100644 --- a/lrx_cli/fetchers/selection.py +++ b/lrx_cli/fetchers/selection.py @@ -2,13 +2,23 @@ Shared candidate-selection logic for search-based fetchers. Each fetcher maps its API-specific results to SearchCandidate, then calls -select_best() which handles duration filtering and synced preference uniformly. +select_best() which scores candidates by metadata similarity, duration +proximity, and sync status. """ from dataclasses import dataclass from typing import Generic, Optional, TypeVar -from ..config import DURATION_TOLERANCE_MS +from ..config import ( + DURATION_TOLERANCE_MS, + SCORE_W_TITLE as _W_TITLE, + SCORE_W_ARTIST as _W_ARTIST, + SCORE_W_ALBUM as _W_ALBUM, + SCORE_W_DURATION as _W_DURATION, + SCORE_W_SYNCED as _W_SYNCED, + MIN_CONFIDENCE, +) +from ..normalize import normalize_for_match, normalize_artist T = TypeVar("T") @@ -21,48 +31,138 @@ class SearchCandidate(Generic[T]): item: The original API-specific object (dict, ID, etc.) duration_ms: Track duration in milliseconds, or None if unknown. is_synced: Whether this candidate is known to have synced lyrics. + title: Candidate track title for similarity scoring. + artist: Candidate artist name for similarity scoring. + album: Candidate album name for similarity scoring. """ item: T duration_ms: Optional[float] = None is_synced: bool = False + title: Optional[str] = None + artist: Optional[str] = None + album: Optional[str] = None + + +def _text_similarity(a: str, b: str) -> float: + """Compare two normalized strings. Returns 0.0-1.0.""" + if a == b: + return 1.0 + if not a or not b: + return 0.0 + # Containment: one is a substring of the other (e.g. "My Love" vs "My Love (Album Version)") + if a in b or b in a: + return min(len(a), len(b)) / max(len(a), len(b)) + return 0.0 + + +def _score_candidate( + c: SearchCandidate[T], + ref_title: Optional[str], + ref_artist: Optional[str], + ref_album: Optional[str], + ref_length_ms: Optional[int], +) -> float: + """Score a candidate from 0-100 based on metadata match quality. + + Scoring works in two tiers: + + 1. **Metadata score** — computed from fields available on *both* sides, + then rescaled to fill the 0-90 range so that missing fields don't + inflate the score. Fields missing on both sides are simply excluded + from the calculation (neutral). Fields present on only one side + contribute 0 to the numerator but their weight still counts in the + denominator (penalty for asymmetric absence). + + 2. **Synced bonus** — a flat 10 pts, always applied independently. + + Field weights (before rescaling): + - Title: 40 + - Artist: 30 + - Album: 10 + - Duration: 10 + """ + raw = 0.0 + available_weight = 0.0 + + # Title + if ref_title is not None or c.title is not None: + available_weight += _W_TITLE + if ref_title is not None and c.title is not None: + raw += _W_TITLE * _text_similarity( + normalize_for_match(ref_title), normalize_for_match(c.title) + ) + # else both None → excluded + + # Artist + if ref_artist is not None or c.artist is not None: + available_weight += _W_ARTIST + if ref_artist is not None and c.artist is not None: + na = normalize_artist(ref_artist) + nb = normalize_artist(c.artist) + if na == nb: + raw += _W_ARTIST + else: + raw += _W_ARTIST * _text_similarity( + normalize_for_match(ref_artist), normalize_for_match(c.artist) + ) + + # Album + if ref_album is not None or c.album is not None: + available_weight += _W_ALBUM + if ref_album is not None and c.album is not None: + raw += _W_ALBUM * _text_similarity( + normalize_for_match(ref_album), normalize_for_match(c.album) + ) + + # Duration + if ref_length_ms is not None or c.duration_ms is not None: + available_weight += _W_DURATION + if ref_length_ms is not None and c.duration_ms is not None: + diff = abs(c.duration_ms - ref_length_ms) + if diff <= DURATION_TOLERANCE_MS: + raw += _W_DURATION * (1.0 - diff / DURATION_TOLERANCE_MS) + + # Rescale metadata to 0-90 range + _MAX_METADATA = _W_TITLE + _W_ARTIST + _W_ALBUM + _W_DURATION # 90 + if available_weight > 0: + metadata_score = (raw / available_weight) * _MAX_METADATA + else: + # No comparable fields at all — only synced bonus matters + metadata_score = 0.0 + + # Synced bonus (always 10 pts, independent of metadata) + synced_score = _W_SYNCED if c.is_synced else 0.0 + + return metadata_score + synced_score def select_best( candidates: list[SearchCandidate[T]], track_length_ms: Optional[int] = None, - tolerance_ms: float = DURATION_TOLERANCE_MS, -) -> Optional[T]: - """Pick the best candidate by duration proximity and sync preference. + *, + title: Optional[str] = None, + artist: Optional[str] = None, + album: Optional[str] = None, + min_confidence: float = MIN_CONFIDENCE, +) -> tuple[Optional[T], float]: + """Pick the best candidate by confidence scoring. - When track_length_ms is available: - - Filter by tolerance_ms - - Pick closest duration, prefer synced at equal distance - When track_length_ms is unavailable: - - Pick first synced candidate, or first overall + Returns (item, score). Item is None if no candidate scores above min_confidence. """ - if track_length_ms is not None: - best: Optional[SearchCandidate[T]] = None - best_diff = float("inf") + if not candidates: + return None, 0.0 - for c in candidates: - if c.duration_ms is None: - continue - diff = abs(c.duration_ms - track_length_ms) - if diff > tolerance_ms: - continue - if diff < best_diff or ( - diff == best_diff - and c.is_synced - and (best is None or not best.is_synced) - ): - best_diff = diff - best = c + best_item: Optional[T] = None + best_score = -1.0 - return best.item if best is not None else None - - # No duration — prefer synced, fallback to first for c in candidates: - if c.is_synced: - return c.item - return candidates[0].item if candidates else None + s = _score_candidate(c, title, artist, album, track_length_ms) + if s > best_score: + best_score = s + best_item = c.item + + if best_score < min_confidence: + return None, best_score + + return best_item, best_score diff --git a/lrx_cli/models.py b/lrx_cli/models.py index 5597d0f..b172144 100644 --- a/lrx_cli/models.py +++ b/lrx_cli/models.py @@ -62,3 +62,6 @@ class LyricResult: lyrics: Optional[LRCData] = None source: Optional[str] = None # Which fetcher produced this result ttl: Optional[int] = None # Hint for cache TTL (seconds) + confidence: Optional[float] = ( + None # 0-100 selection confidence (None = exact/trusted) + ) diff --git a/lrx_cli/normalize.py b/lrx_cli/normalize.py new file mode 100644 index 0000000..941e3d9 --- /dev/null +++ b/lrx_cli/normalize.py @@ -0,0 +1,47 @@ +""" +Shared text normalization utilities for fuzzy matching. + +Used by cache key generation, cache search, and candidate selection scoring. +""" + +import re +import unicodedata + +# Punctuation to strip for fuzzy matching (ASCII + fullwidth + CJK brackets/symbols) +_PUNCT_RE = re.compile( + r"[~!@#$%^&*()_+\-=\[\]{}|;:'\",.<>?/\\`" + r"~!@#$%^&*()_+-=【】{}|;:'",。<>?/\`" + r"「」『』《》〈〉〔〕·•‥…—–]" +) +_SPACE_RE = re.compile(r"\s+") +# feat./ft./featuring and everything after (case-insensitive, word boundary) +_FEAT_RE = re.compile(r"\s*(?:\bfeat\.?\b|\bft\.?\b|\bfeaturing\b).*", re.IGNORECASE) +# Multi-artist separators: /, &, ×, x (surrounded by spaces), ;, 、, vs. +_ARTIST_SEP_RE = re.compile(r"\s*(?:[/&;×、]|\bvs\.?\b|\bx\b)\s*", re.IGNORECASE) + + +def normalize_for_match(s: str) -> str: + """Normalize a string for fuzzy comparison. + + Lowercases, NFKC-normalizes (fullwidth → halfwidth), strips punctuation, + and collapses whitespace. + """ + s = unicodedata.normalize("NFKC", s).lower() + s = _FEAT_RE.sub("", s) + s = _PUNCT_RE.sub(" ", s) + s = _SPACE_RE.sub(" ", s).strip() + return s + + +def normalize_artist(s: str) -> str: + """Normalize an artist string: split by separators, normalize each, sort. + + Splits first (on /, &, ;, ×, 、, vs., x), then strips feat./ft./featuring + from each part individually, so 'A feat. C / B' → ['a', 'b'] not just ['a']. + """ + s = unicodedata.normalize("NFKC", s).lower() + parts = _ARTIST_SEP_RE.split(s) + normed = sorted( + {normalize_for_match(p) for p in parts if _FEAT_RE.sub("", p).strip()} + ) + return "\0".join(normed) if normed else normalize_for_match(s) diff --git a/tests/test_cache.py b/tests/test_cache.py index de591ad..ce6bc89 100644 --- a/tests/test_cache.py +++ b/tests/test_cache.py @@ -8,11 +8,10 @@ import pytest from lrx_cli.cache import ( CacheEngine, _generate_key, - _normalize_artist, - _normalize_for_match, ) from lrx_cli.config import DURATION_TOLERANCE_MS from lrx_cli.models import CacheStatus, LyricResult, TrackMeta +from lrx_cli.lrc import LRCData def _track( @@ -39,7 +38,7 @@ def _result( lyrics: str | None, source: str, ) -> LyricResult: - return LyricResult(status=status, lyrics=lyrics, source=source) + return LyricResult(status=status, lyrics=LRCData(lyrics), source=source) @pytest.fixture @@ -48,22 +47,6 @@ def cache_db(tmp_path: Path) -> CacheEngine: return CacheEngine(str(db_path)) -def test_normalize_for_match_covers_nfkc_punct_feat_and_whitespace() -> None: - text = " Test! feat. SOMEONE " - - normalized = _normalize_for_match(text) - - assert normalized == "test" - - -def test_normalize_artist_splits_separators_and_sorts_parts() -> None: - artist = "B / A feat. C; D vs. E × F 、 G" - - normalized = _normalize_artist(artist) - - assert normalized == "a\0b\0d\0e\0f\0g" - - def test_generate_key_uses_spotify_trackid_and_url_fallback() -> None: spotify_track = _track( trackid="abc123", artist=None, title=None, album=None, length=None @@ -157,7 +140,7 @@ def test_get_backfills_missing_length_when_track_provides_it( assert row[0] == 200000 -def test_get_best_prefers_synced_over_unsynced_and_negative( +def test_get_best_prefers_higher_confidence_and_skips_negative( cache_db: CacheEngine, ) -> None: track = _track() @@ -314,7 +297,7 @@ def test_search_by_meta_fuzzy_rules_and_duration_sorting(cache_db: CacheEngine) sources = [r["source"] for r in rows] assert "negative" not in sources assert "far-len" not in sources - # Sorted by duration diff, then synced before unsynced for equal diff. + # Sorted by duration diff, then confidence for equal diff. assert sources[0] == "seed" assert sources[1] == "close-synced" assert sources[2] == "close-unsynced" diff --git a/tests/test_normalize.py b/tests/test_normalize.py new file mode 100644 index 0000000..ff2f985 --- /dev/null +++ b/tests/test_normalize.py @@ -0,0 +1,19 @@ +from __future__ import annotations + +from lrx_cli.normalize import normalize_for_match, normalize_artist + + +def test_normalize_for_match_covers_nfkc_punct_feat_and_whitespace() -> None: + text = " Test! feat. SOMEONE " + + normalized = normalize_for_match(text) + + assert normalized == "test" + + +def test_normalize_artist_splits_separators_and_sorts_parts() -> None: + artist = "B / A feat. C; D vs. E × F 、 G" + + normalized = normalize_artist(artist) + + assert normalized == "a\0b\0d\0e\0f\0g" diff --git a/tests/test_selection.py b/tests/test_selection.py index 4109c11..632a14b 100644 --- a/tests/test_selection.py +++ b/tests/test_selection.py @@ -1,92 +1,422 @@ from __future__ import annotations -from lrx_cli.fetchers.selection import SearchCandidate, select_best +from lrx_cli.fetchers.selection import ( + SearchCandidate, + select_best, + _score_candidate, + _text_similarity, + MIN_CONFIDENCE, +) -def test_picks_closest_duration_within_tolerance() -> None: - candidates = [ - SearchCandidate(item="far", duration_ms=10000.0), - SearchCandidate(item="close", duration_ms=5100.0), - SearchCandidate(item="exact", duration_ms=5000.0), +def test_text_similarity_exact() -> None: + assert _text_similarity("my love", "my love") == 1.0 + + +def test_text_similarity_empty() -> None: + assert _text_similarity("", "anything") == 0.0 + assert _text_similarity("anything", "") == 0.0 + + +def test_text_similarity_no_overlap() -> None: + assert _text_similarity("hello", "world") == 0.0 + + +def test_text_similarity_containment() -> None: + # "my love" is contained in "my love album version" + score = _text_similarity("my love", "my love album version") + assert 0.0 < score < 1.0 + assert score == len("my love") / len("my love album version") + + +def test_score_perfect_match() -> None: + """Exact metadata + close duration + synced = 100.""" + c = SearchCandidate( + item="x", + duration_ms=232000.0, + is_synced=True, + title="My Love", + artist="Westlife", + album="Coast To Coast", + ) + score = _score_candidate(c, "My Love", "Westlife", "Coast To Coast", 232000) + assert score == 100.0 + + +def test_score_no_metadata_match() -> None: + """Completely wrong metadata should score very low.""" + c = SearchCandidate( + item="x", + duration_ms=192000.0, + is_synced=True, + title="Let My Love Be Your Pillow (Live)", + artist="Ronnie Milsap", + album="The Essential Ronnie Milsap", + ) + score = _score_candidate(c, "My Love", "Westlife", "Coast To Coast", 232000) + assert score < MIN_CONFIDENCE + + +def test_score_missing_both_sides_neutral() -> None: + """If neither ref nor candidate has any field, only synced bonus applies.""" + c = SearchCandidate(item="x", is_synced=True) + score = _score_candidate(c, None, None, None, None) + # No comparable fields → metadata = 0, synced = 10 + assert score == 10.0 + + +def test_score_missing_one_side_gives_zero_for_field() -> None: + """If ref has title but candidate doesn't, title gets 0 and weight still counts.""" + c = SearchCandidate(item="x", title=None, is_synced=True) + # Only title is in play (weight=40), candidate missing → raw=0, rescaled=0, + synced=10 + score = _score_candidate(c, "My Love", None, None, None) + assert score == 10.0 + + +def test_score_synced_bonus() -> None: + """Synced adds 10 points.""" + base = SearchCandidate(item="x", title="My Love", is_synced=False) + synced = SearchCandidate(item="x", title="My Love", is_synced=True) + diff = _score_candidate(synced, "My Love", None, None, None) - _score_candidate( + base, "My Love", None, None, None + ) + assert diff == 10.0 + + +def test_score_duration_linear_decay() -> None: + """Duration score decays linearly; ratios between exact/half/edge are preserved.""" + exact = SearchCandidate(item="x", duration_ms=232000.0) + score_exact = _score_candidate(exact, None, None, None, 232000) + + half_tol = SearchCandidate(item="x", duration_ms=232000.0 + 1500.0) + score_half = _score_candidate(half_tol, None, None, None, 232000) + + at_tol = SearchCandidate(item="x", duration_ms=232000.0 + 3000.0) + score_edge = _score_candidate(at_tol, None, None, None, 232000) + + # Only duration is comparable → rescaled to fill 0-90 + # exact=90, half=45, edge=0 + assert score_exact == 90.0 + assert score_half == 45.0 + assert score_edge == 0.0 + + +def test_score_case_insensitive_title() -> None: + c = SearchCandidate(item="x", title="my love") + s1 = _score_candidate(c, "My Love", None, None, None) + s2 = _score_candidate(c, "my love", None, None, None) + assert s1 == s2 + + +def test_score_artist_normalization() -> None: + """'Westlife feat. Someone' should still match 'Westlife'.""" + c = SearchCandidate(item="x", artist="Westlife feat. Someone") + # normalize_artist strips feat. → both become "westlife" + score = _score_candidate(c, None, "Westlife", None, None) + assert score >= 30.0 # full artist weight (30) when both None on other fields + + +# Reference track: Westlife - My Love, album Coast To Coast, ~232s +_REF_TITLE = "My Love" +_REF_ARTIST = "Westlife" +_REF_ALBUM = "Coast To Coast" +_REF_LENGTH = 232000 # ms + + +def _lrclib_candidates() -> list[SearchCandidate[dict]]: + """Fixtures from real LRCLIB search results.""" + raw = [ + { + "trackName": "My Love", + "artistName": "Westlife", + "albumName": "null", + "duration": 232.0, + "synced": True, + }, + { + "trackName": "My Love", + "artistName": "Westlife", + "albumName": "null", + "duration": 180.0, + "synced": True, + }, + { + "trackName": "My love", + "artistName": "Westlife", + "albumName": "moments", + "duration": 235.327, + "synced": True, + }, + { + "trackName": "My Love", + "artistName": "Westlife", + "albumName": "Unbreakable", + "duration": 233.026, + "synced": True, + }, + { + "trackName": "My Love", + "artistName": "Westlife", + "albumName": "Coast To Coast", + "duration": 231.847, + "synced": True, + }, + { + "trackName": "Hello My Love", + "artistName": "Westlife", + "albumName": "Spectrum", + "duration": 216.0, + "synced": True, + }, + { + "trackName": "My Love", + "artistName": "Westlife", + "albumName": "Hitzone 13", + "duration": 231.0, + "synced": True, + }, ] - assert select_best(candidates, track_length_ms=5000) == "exact" - - -def test_filters_out_candidates_beyond_tolerance() -> None: - candidates = [ - SearchCandidate(item="too_far", duration_ms=100000.0), + return [ + SearchCandidate( + item=r, + duration_ms=r["duration"] * 1000, + is_synced=r["synced"], + title=r["trackName"], + artist=r["artistName"], + album=r["albumName"], + ) + for r in raw ] - assert select_best(candidates, track_length_ms=5000, tolerance_ms=2000) is None -def test_prefers_synced_at_equal_duration() -> None: - candidates = [ - SearchCandidate(item="unsynced", duration_ms=5000.0, is_synced=False), - SearchCandidate(item="synced", duration_ms=5000.0, is_synced=True), +def _lrclib_noisy_candidates() -> list[SearchCandidate[dict]]: + """Fixtures from LRCLIB title-only search — lots of wrong artists.""" + raw = [ + { + "trackName": "Let My Love Be Your Pillow (Live)", + "artistName": "Ronnie Milsap", + "albumName": "The Essential Ronnie Milsap", + "duration": 192.0, + "synced": True, + }, + { + "trackName": "My Love", + "artistName": "Little Texas", + "albumName": "Big Time", + "duration": 248.0, + "synced": True, + }, + { + "trackName": "My Love (Album Version)", + "artistName": "Little Texas", + "albumName": "Greatest Hits", + "duration": 248.0, + "synced": True, + }, + { + "trackName": "My Love - Digitally Remastered '89", + "artistName": "Sonny James", + "albumName": "Capitol Collectors Series", + "duration": 169.0, + "synced": False, + }, + { + "trackName": "My Love", + "artistName": "Westlife", + "albumName": "Coast To Coast", + "duration": 231.847, + "synced": True, + }, ] - assert select_best(candidates, track_length_ms=5000) == "synced" - - -def test_closer_duration_wins_over_synced() -> None: - candidates = [ - SearchCandidate(item="synced_far", duration_ms=6000.0, is_synced=True), - SearchCandidate(item="unsynced_close", duration_ms=5001.0, is_synced=False), + return [ + SearchCandidate( + item=r, + duration_ms=r["duration"] * 1000, + is_synced=r["synced"], + title=r["trackName"], + artist=r["artistName"], + album=r["albumName"], + ) + for r in raw ] - assert select_best(candidates, track_length_ms=5000) == "unsynced_close" -def test_skips_candidates_without_duration_when_track_length_given() -> None: - candidates = [ - SearchCandidate(item="no_dur", duration_ms=None), - SearchCandidate(item="has_dur", duration_ms=5000.0), +def _netease_candidates() -> list[SearchCandidate[int]]: + """Fixtures from real Netease search results.""" + raw = [ + { + "id": 2080607, + "name": "My Love", + "artist": "Westlife", + "album": "Unbreakable, Vol. 1 - The Greatest Hits", + "dt": 231941, + }, + { + "id": 2080749, + "name": "My Love (Radio Edit)", + "artist": "Westlife", + "album": "World Of Our Own - No. 1 Hits Plus (EP)", + "dt": 232920, + }, + { + "id": 29809886, + "name": "My Love (Live)", + "artist": "Westlife", + "album": "The Farewell Tour: Live at Croke Park", + "dt": 262000, + }, + { + "id": 572412968, + "name": "My Love", + "artist": "Westlife", + "album": "Pure... Love", + "dt": 231000, + }, + { + "id": 20707713, + "name": "You Raise Me Up", + "artist": "Westlife", + "album": "You Raise Me Up", + "dt": 241116, + }, ] - assert select_best(candidates, track_length_ms=5000) == "has_dur" - - -def test_returns_none_when_all_lack_duration_and_track_length_given() -> None: - candidates = [ - SearchCandidate(item="a", duration_ms=None), - SearchCandidate(item="b", duration_ms=None), + return [ + SearchCandidate( + item=r["id"], + duration_ms=float(r["dt"]), + title=r["name"], + artist=r["artist"], + album=r["album"], + ) + for r in raw ] - assert select_best(candidates, track_length_ms=5000) is None -def test_prefers_synced_when_no_track_length() -> None: - candidates = [ - SearchCandidate(item="unsynced", is_synced=False), - SearchCandidate(item="synced", is_synced=True), - ] - assert select_best(candidates, track_length_ms=None) == "synced" +def test_lrclib_picks_exact_album_match() -> None: + """With full metadata, should pick the Coast To Coast entry.""" + candidates = _lrclib_candidates() + best, score = select_best( + candidates, + _REF_LENGTH, + title=_REF_TITLE, + artist=_REF_ARTIST, + album=_REF_ALBUM, + ) + assert best is not None + assert best["albumName"] == "Coast To Coast" + assert score >= MIN_CONFIDENCE -def test_falls_back_to_first_when_none_synced() -> None: - candidates = [ - SearchCandidate(item="first"), - SearchCandidate(item="second"), - ] - assert select_best(candidates, track_length_ms=None) == "first" +def test_lrclib_rejects_wrong_title() -> None: + """'Hello My Love' should not beat 'My Love' entries.""" + candidates = _lrclib_candidates() + best, _ = select_best( + candidates, + _REF_LENGTH, + title=_REF_TITLE, + artist=_REF_ARTIST, + album=_REF_ALBUM, + ) + assert best is not None + assert best["trackName"] != "Hello My Love" + + +def test_lrclib_noisy_picks_westlife() -> None: + """In noisy title-only results, artist matching should filter to Westlife.""" + candidates = _lrclib_noisy_candidates() + best, _ = select_best( + candidates, + _REF_LENGTH, + title=_REF_TITLE, + artist=_REF_ARTIST, + album=_REF_ALBUM, + ) + assert best is not None + assert best["artistName"] == "Westlife" + + +def test_lrclib_noisy_rejects_all_without_ref_artist() -> None: + """Without ref artist, wrong-artist candidates may still win, but right title should rank higher.""" + candidates = _lrclib_noisy_candidates() + best, _ = select_best( + candidates, + _REF_LENGTH, + title=_REF_TITLE, + ) + # Should pick a "My Love" over "Let My Love Be Your Pillow" + assert best is not None + assert "My Love" == best["trackName"] or best["trackName"].startswith("My Love") + + +def test_netease_picks_closest_duration() -> None: + candidates = _netease_candidates() + best, _ = select_best( + candidates, + _REF_LENGTH, + title=_REF_TITLE, + artist=_REF_ARTIST, + album=_REF_ALBUM, + ) + # 2080607 has dt=231941 (diff=59ms), closest to 232000 + assert best == 2080607 + + +def test_netease_rejects_wrong_title() -> None: + """'You Raise Me Up' should not be selected.""" + candidates = _netease_candidates() + best, _ = select_best( + candidates, + _REF_LENGTH, + title=_REF_TITLE, + artist=_REF_ARTIST, + ) + assert best != 20707713 + + +def test_netease_without_ref_metadata_rejects_below_confidence() -> None: + """Without any ref metadata, candidates with one-sided fields score low and get rejected.""" + candidates = _netease_candidates() + best, _ = select_best(candidates, _REF_LENGTH) + # Candidates have title/artist/album but ref has None for all → 0 for text fields + # Only duration (max 10) contributes → below MIN_CONFIDENCE (25) + assert best is None + + +# --- Edge cases --- def test_empty_candidates_returns_none() -> None: - assert select_best([], track_length_ms=5000) is None - assert select_best([], track_length_ms=None) is None + assert select_best([], track_length_ms=5000) == (None, 0.0) + assert select_best([], track_length_ms=None) == (None, 0.0) -def test_single_candidate_within_tolerance() -> None: - candidates = [SearchCandidate(item="only", duration_ms=5000.0)] - assert select_best(candidates, track_length_ms=5000) == "only" - - -def test_single_candidate_beyond_tolerance() -> None: - candidates = [SearchCandidate(item="only", duration_ms=99999.0)] - assert select_best(candidates, track_length_ms=5000, tolerance_ms=1000) is None +def test_all_below_min_confidence_returns_none() -> None: + """If all candidates score below threshold, return None.""" + candidates = [ + SearchCandidate( + item="x", + title="Completely Different Song", + artist="Unknown Artist", + album="Unknown Album", + duration_ms=999999.0, + ), + ] + result, _ = select_best( + candidates, + 232000, + title="My Love", + artist="Westlife", + album="Coast To Coast", + min_confidence=90.0, + ) + assert result is None def test_generic_type_preserved() -> None: - """select_best returns the same type as SearchCandidate.item.""" - int_candidates = [SearchCandidate(item=42, duration_ms=5000.0)] - assert select_best(int_candidates, track_length_ms=5000) == 42 + int_candidates = [SearchCandidate(item=42, duration_ms=5000.0, title="x")] + best, _ = select_best(int_candidates, 5000, title="x") + assert best == 42 - dict_candidates = [SearchCandidate(item={"id": 1}, duration_ms=5000.0)] - result = select_best(dict_candidates, track_length_ms=5000) - assert result == {"id": 1} + dict_candidates = [SearchCandidate(item={"id": 1}, title="x")] + best, _ = select_best(dict_candidates, title="x") + assert best == {"id": 1}