feat: evaluate fetch results with "confidence"

This commit is contained in:
2026-04-02 04:26:19 +02:00
parent 9aaf4d8aed
commit 2df167e31d
15 changed files with 836 additions and 238 deletions
+63 -57
View File
@@ -4,56 +4,21 @@ Date: 2026-03-25 10:18:03
Description: SQLite-based lyric cache with per-source storage and TTL expiration
"""
import re
import sqlite3
import hashlib
import time
import unicodedata
from typing import Optional
from loguru import logger
from .lrc import LRCData
from .config import DURATION_TOLERANCE_MS
from .models import TrackMeta, LyricResult, CacheStatus
# Punctuation to strip for fuzzy matching (ASCII + fullwidth + CJK brackets/symbols)
_PUNCT_RE = re.compile(
r"[~!@#$%^&*()_+\-=\[\]{}|;:'\",.<>?/\\`"
r"~!@#$%^&*()_+-=【】{}|;:'",。<>?/\`"
r"「」『』《》〈〉〔〕·•‥…—–]"
from .normalize import normalize_for_match as _normalize_for_match
from .normalize import normalize_artist as _normalize_artist
from .config import (
DURATION_TOLERANCE_MS,
LEGACY_CONFIDENCE_SYNCED,
LEGACY_CONFIDENCE_UNSYNCED,
)
_SPACE_RE = re.compile(r"\s+")
# feat./ft./featuring and everything after (case-insensitive, word boundary)
_FEAT_RE = re.compile(r"\s*(?:\bfeat\.?\b|\bft\.?\b|\bfeaturing\b).*", re.IGNORECASE)
# Multi-artist separators: /, &, ×, x (surrounded by spaces), ;, 、, vs.
_ARTIST_SEP_RE = re.compile(r"\s*(?:[/&;×、]|\bvs\.?\b|\bx\b)\s*", re.IGNORECASE)
def _normalize_for_match(s: str) -> str:
    """Canonicalize *s* for fuzzy comparison.

    NFKC-folds (fullwidth → halfwidth) and lowercases the text, cuts any
    trailing feat./ft./featuring clause, replaces punctuation with spaces,
    and collapses whitespace runs into single spaces.
    """
    folded = unicodedata.normalize("NFKC", s).lower()
    without_feat = _FEAT_RE.sub("", folded)
    spaced_out = _PUNCT_RE.sub(" ", without_feat)
    return _SPACE_RE.sub(" ", spaced_out).strip()
def _normalize_artist(s: str) -> str:
    """Produce an order-insensitive canonical form of an artist credit.

    The string is NFKC-folded and lowercased, then split on multi-artist
    separators (/, &, ;, ×, 、, vs., x). Each part is normalized on its own
    after dropping its feat./ft./featuring clause — so 'A feat. C / B'
    yields ['a', 'b'], not just ['a']. Unique parts are sorted and joined
    with NUL so the result is stable regardless of listing order.
    """
    folded = unicodedata.normalize("NFKC", s).lower()
    keep = set()
    for part in _ARTIST_SEP_RE.split(folded):
        # Ignore parts that are empty once the feat.-clause is stripped.
        if _FEAT_RE.sub("", part).strip():
            keep.add(_normalize_for_match(part))
    if not keep:
        # Nothing survived splitting — normalize the whole string instead.
        return _normalize_for_match(folded)
    return "\0".join(sorted(keep))
from .models import TrackMeta, LyricResult, CacheStatus
def _generate_key(track: TrackMeta, source: str) -> str:
@@ -110,10 +75,12 @@ class CacheEngine:
length INTEGER
)
""")
# Migration: add length column if missing
# Migrations
cols = {r[1] for r in conn.execute("PRAGMA table_info(cache)").fetchall()}
if "length" not in cols:
conn.execute("ALTER TABLE cache ADD COLUMN length INTEGER")
if "confidence" not in cols:
conn.execute("ALTER TABLE cache ADD COLUMN confidence REAL")
conn.commit()
# Read
@@ -130,7 +97,7 @@ class CacheEngine:
with sqlite3.connect(self.db_path) as conn:
row = conn.execute(
"SELECT status, lyrics, source, expires_at, length FROM cache WHERE key = ?",
"SELECT status, lyrics, source, expires_at, length, confidence FROM cache WHERE key = ?",
(key,),
).fetchone()
@@ -138,7 +105,7 @@ class CacheEngine:
logger.debug(f"Cache miss: {source} / {track.display_name()}")
return None
status_str, lyrics, src, expires_at, cached_length = row
status_str, lyrics, src, expires_at, cached_length, confidence = row
# Check TTL expiration
if expires_at and expires_at < int(time.time()):
@@ -160,15 +127,27 @@ class CacheEngine:
f"Cache hit: {source} / {track.display_name()} "
f"[{status_str}, ttl={remaining}s]"
)
status = CacheStatus(status_str)
if confidence is None and status in (
CacheStatus.SUCCESS_SYNCED,
CacheStatus.SUCCESS_UNSYNCED,
):
confidence = (
LEGACY_CONFIDENCE_SYNCED
if status == CacheStatus.SUCCESS_SYNCED
else LEGACY_CONFIDENCE_UNSYNCED
)
return LyricResult(
status=CacheStatus(status_str),
status=status,
lyrics=LRCData(lyrics) if lyrics else None,
source=src,
ttl=remaining,
confidence=confidence,
)
def get_best(self, track: TrackMeta, sources: list[str]) -> Optional[LyricResult]:
"""Return the best cached result across *sources* (synced > unsynced).
"""Return the best cached result across *sources* by confidence.
Skips negative statuses (NOT_FOUND, NETWORK_ERROR) — those are only
consulted per-source to avoid redundant fetches.
@@ -178,10 +157,20 @@ class CacheEngine:
cached = self.get(track, src)
if not cached:
continue
if cached.status == CacheStatus.SUCCESS_SYNCED:
return cached # Can't do better
if cached.status == CacheStatus.SUCCESS_UNSYNCED and best is None:
if cached.status not in (
CacheStatus.SUCCESS_SYNCED,
CacheStatus.SUCCESS_UNSYNCED,
):
continue
if best is None:
best = cached
else:
cached_conf = (
cached.confidence if cached.confidence is not None else 100.0
)
best_conf = best.confidence if best.confidence is not None else 100.0
if cached_conf > best_conf:
best = cached
return best
# Write
@@ -207,8 +196,8 @@ class CacheEngine:
conn.execute(
"""INSERT OR REPLACE INTO cache
(key, source, status, lyrics, created_at, expires_at,
artist, title, album, length)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""",
artist, title, album, length, confidence)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""",
(
key,
source,
@@ -220,6 +209,7 @@ class CacheEngine:
track.title,
track.album,
track.length,
result.confidence,
),
)
conn.commit()
@@ -288,7 +278,7 @@ class CacheEngine:
"""Find the best positive (synced/unsynced) cache entry for *track*.
Uses exact metadata match (artist + title + album) across all sources.
Returns synced if available, otherwise unsynced, or None.
Returns the highest-confidence entry, or None.
"""
conditions, params = self._track_where(track)
if not conditions:
@@ -306,19 +296,34 @@ class CacheEngine:
with sqlite3.connect(self.db_path) as conn:
conn.row_factory = sqlite3.Row
rows = conn.execute(
f"SELECT status, lyrics, source FROM cache WHERE {where} "
"ORDER BY CASE status WHEN ? THEN 0 ELSE 1 END LIMIT 1",
params + [CacheStatus.SUCCESS_SYNCED.value],
f"SELECT status, lyrics, source, confidence FROM cache WHERE {where} "
"ORDER BY COALESCE(confidence, "
" CASE status WHEN ? THEN ? ELSE ? END"
") DESC, created_at DESC LIMIT 1",
params
+ [
CacheStatus.SUCCESS_SYNCED.value,
LEGACY_CONFIDENCE_SYNCED,
LEGACY_CONFIDENCE_UNSYNCED,
],
).fetchall()
if not rows:
return None
row = dict(rows[0])
confidence = row["confidence"]
if confidence is None:
confidence = (
LEGACY_CONFIDENCE_SYNCED
if row["status"] == CacheStatus.SUCCESS_SYNCED.value
else LEGACY_CONFIDENCE_UNSYNCED
)
return LyricResult(
status=CacheStatus(row["status"]),
lyrics=LRCData(row["lyrics"]) if row["lyrics"] else None,
source="cache-search",
confidence=confidence,
)
# Fuzzy search
@@ -384,7 +389,8 @@ class CacheEngine:
scored.sort(
key=lambda x: (
x[0],
x[1].get("status") != CacheStatus.SUCCESS_SYNCED.value,
-(x[1].get("confidence") or 0),
-(x[1].get("created_at") or 0),
)
)
matches = [m for _, m in scored]
+3
View File
@@ -428,6 +428,7 @@ def _print_cache_row(row: dict, indent: str = "") -> None:
created = row.get("created_at", 0)
expires = row.get("expires_at")
lyrics = row.get("lyrics", "")
confidence = row.get("confidence")
name = f"{artist} - {title}" if artist and title else row.get("key", "?")
print(f"{indent}[{source}] {name}")
@@ -450,6 +451,8 @@ def _print_cache_row(row: dict, indent: str = "") -> None:
if lyrics:
line_count = len(lyrics.splitlines())
print(f"{indent} Lyrics : {line_count} lines")
if confidence is not None:
print(f"{indent} Confidence: {confidence:.0f}")
def run():
+15
View File
@@ -38,6 +38,21 @@ TTL_NETWORK_ERROR = 3600 # 1 hour
# Search
DURATION_TOLERANCE_MS = 3000 # max duration mismatch for search matching
# Confidence scoring weights (sum to 100)
SCORE_W_TITLE = 40.0
SCORE_W_ARTIST = 30.0
SCORE_W_ALBUM = 10.0
SCORE_W_DURATION = 10.0
SCORE_W_SYNCED = 10.0
# Confidence thresholds
MIN_CONFIDENCE = 25.0 # below this, candidate is rejected
HIGH_CONFIDENCE = 80.0 # at or above this, stop searching early
# Legacy cache rows (no confidence stored) get a base score by sync status
LEGACY_CONFIDENCE_SYNCED = 50.0
LEGACY_CONFIDENCE_UNSYNCED = 40.0
# Spotify related
SPOTIFY_TOKEN_URL = "https://open.spotify.com/api/token"
SPOTIFY_LYRICS_URL = "https://spclient.wg.spotify.com/color-lyrics/v2/track/"
+58 -25
View File
@@ -9,7 +9,7 @@ Fetch pipeline:
1. Check cache for each source in the fallback sequence
2. For sources without a valid cache hit, call the fetcher
3. Cache every result (success, not-found, or error) per source
4. Return the best result (synced > unsynced > None)
4. Return the best result by confidence (highest wins)
"""
from typing import Optional
@@ -19,7 +19,13 @@ from .fetchers import FetcherMethodType, create_fetchers
from .fetchers.base import BaseFetcher
from .cache import CacheEngine
from .lrc import LRCData
from .config import TTL_SYNCED, TTL_UNSYNCED, TTL_NOT_FOUND, TTL_NETWORK_ERROR
from .config import (
TTL_SYNCED,
TTL_UNSYNCED,
TTL_NOT_FOUND,
TTL_NETWORK_ERROR,
HIGH_CONFIDENCE,
)
from .models import TrackMeta, LyricResult, CacheStatus
from .enrichers import enrich_track
@@ -33,6 +39,18 @@ _STATUS_TTL: dict[CacheStatus, Optional[int]] = {
}
def _is_better(new: LyricResult, old: LyricResult) -> bool:
    """Return True when *new* strictly beats *old* on confidence.

    Synced/unsynced preference needs no separate tier here: the confidence
    score already carries a synced bonus from the scoring weights. A result
    with no confidence recorded is an exact/trusted match and counts as 100.
    """
    def effective(result: LyricResult) -> float:
        # None confidence = trusted = perfect score.
        if result.confidence is None:
            return 100.0
        return result.confidence

    return effective(new) > effective(old)
class LrcManager:
"""Main entry point for fetching lyrics with caching."""
@@ -72,7 +90,7 @@ class LrcManager:
- Cache miss or unsynced → call fetcher, then cache the result
After all sources are tried, returns the best result found
(synced > unsynced > None).
(highest confidence wins).
"""
track = enrich_track(track)
logger.info(f"Fetching lyrics for: {track.display_name()}")
@@ -81,7 +99,7 @@ class LrcManager:
if not sequence:
return None
# Best result seen so far (synced wins over unsynced)
# Best result seen so far (highest confidence wins)
best_result: Optional[LyricResult] = None
for fetcher in sequence:
@@ -91,17 +109,7 @@ class LrcManager:
if not bypass_cache and not fetcher.self_cached:
cached = self.cache.get(track, source)
if cached:
if cached.status == CacheStatus.SUCCESS_SYNCED:
logger.info(f"[{source}] cache hit: synced lyrics")
return cached
elif cached.status == CacheStatus.SUCCESS_UNSYNCED:
logger.debug(
f"[{source}] cache hit: unsynced lyrics (continuing)"
)
if best_result is None:
best_result = cached
continue # Try next source for synced
elif cached.status in (
if cached.status in (
CacheStatus.NOT_FOUND,
CacheStatus.NETWORK_ERROR,
):
@@ -109,6 +117,23 @@ class LrcManager:
f"[{source}] cache hit: {cached.status.value}, skipping"
)
continue
# Positive cache hit — apply the same confidence evaluation
# as fresh fetches so that low-confidence cached results
# don't block better results from later fetchers.
is_trusted = (
cached.confidence is None
or cached.confidence >= HIGH_CONFIDENCE
)
logger.info(
f"[{source}] cache hit: {cached.status.value}"
f" (confidence={'trusted' if cached.confidence is None else f'{cached.confidence:.0f}'})"
)
if cached.status == CacheStatus.SUCCESS_SYNCED and is_trusted:
return cached
if best_result is None or _is_better(cached, best_result):
best_result = cached
continue
elif not fetcher.self_cached:
logger.debug(f"[{source}] cache bypassed")
@@ -126,20 +151,28 @@ class LrcManager:
self.cache.set(track, source, result, ttl_seconds=ttl)
# Evaluate result
if result.status == CacheStatus.SUCCESS_SYNCED:
logger.info(f"[{source}] got synced lyrics")
return result
if result.status == CacheStatus.SUCCESS_UNSYNCED:
logger.debug(f"[{source}] got unsynced lyrics (continuing)")
if best_result is None:
if result.status in (
CacheStatus.SUCCESS_SYNCED,
CacheStatus.SUCCESS_UNSYNCED,
):
is_trusted = (
result.confidence is None or result.confidence >= HIGH_CONFIDENCE
)
logger.info(
f"[{source}] got {result.status.value} lyrics"
f" (confidence={'trusted' if result.confidence is None else f'{result.confidence:.0f}'})"
)
# Trusted synced → return immediately
if result.status == CacheStatus.SUCCESS_SYNCED and is_trusted:
return result
# Track best result by confidence
if best_result is None or _is_better(result, best_result):
best_result = result
# NOT_FOUND / NETWORK_ERROR: already cached, try next
# Return best available
if best_result:
# Normalize unsynced lyrics: set all timestamps to [00:00.00]
if (
best_result.status == CacheStatus.SUCCESS_UNSYNCED
and best_result.lyrics
@@ -149,10 +182,10 @@ class LrcManager:
lyrics=best_result.lyrics.normalize_unsynced(),
source=best_result.source,
ttl=best_result.ttl,
confidence=best_result.confidence,
)
logger.info(
f"Returning unsynced lyrics from {best_result.source} "
f"(no synced source found)"
f"Returning {best_result.status.value} lyrics from {best_result.source}"
)
else:
logger.info(f"No lyrics found for {track.display_name()}")
+14 -3
View File
@@ -64,16 +64,26 @@ class CacheSearchFetcher(BaseFetcher):
logger.debug(f"Cache-search: no match for {track.display_name()}")
return None
# Pick best: prefer synced, then first available
# Pick best by confidence scoring
candidates = [
SearchCandidate(
item=m,
duration_ms=float(m["length"]) if m.get("length") else None,
is_synced=m.get("status") == CacheStatus.SUCCESS_SYNCED.value,
title=m.get("title"),
artist=m.get("artist"),
album=m.get("album"),
)
for m in matches
if m.get("lyrics")
]
best = select_best(candidates, track.length)
best, confidence = select_best(
candidates,
track.length,
title=track.title,
artist=track.artist,
album=track.album,
)
if not best:
return None
@@ -81,10 +91,11 @@ class CacheSearchFetcher(BaseFetcher):
status = CacheStatus(best["status"])
logger.info(
f"Cache-search: fuzzy hit from [{best.get('source')}] "
f"album={best.get('album')!r} ({status.value})"
f"album={best.get('album')!r} ({status.value}, confidence={confidence:.0f})"
)
return LyricResult(
status=status,
lyrics=LRCData(best["lyrics"]),
source=self.source_name,
confidence=confidence,
)
+18 -3
View File
@@ -126,10 +126,19 @@ class LrclibSearchFetcher(BaseFetcher):
else None,
is_synced=isinstance(item.get("syncedLyrics"), str)
and bool(item["syncedLyrics"].strip()),
title=item.get("trackName"),
artist=item.get("artistName"),
album=item.get("albumName"),
)
for item in candidates
]
best = select_best(mapped, track.length)
best, confidence = select_best(
mapped,
track.length,
title=track.title,
artist=track.artist,
album=track.album,
)
if best is None:
logger.debug("LRCLIB-search: no valid candidate found")
return LyricResult(status=CacheStatus.NOT_FOUND, ttl=TTL_NOT_FOUND)
@@ -139,20 +148,26 @@ class LrclibSearchFetcher(BaseFetcher):
if isinstance(synced, str) and synced.strip():
lyrics = LRCData(synced)
logger.info(f"LRCLIB-search: got synced lyrics ({len(lyrics)} lines)")
logger.info(
f"LRCLIB-search: got synced lyrics ({len(lyrics)} lines, confidence={confidence:.0f})"
)
return LyricResult(
status=CacheStatus.SUCCESS_SYNCED,
lyrics=lyrics,
source=self.source_name,
confidence=confidence,
)
elif isinstance(unsynced, str) and unsynced.strip():
lyrics = LRCData(unsynced)
logger.info(f"LRCLIB-search: got unsynced lyrics ({len(lyrics)} lines)")
logger.info(
f"LRCLIB-search: got unsynced lyrics ({len(lyrics)} lines, confidence={confidence:.0f})"
)
return LyricResult(
status=CacheStatus.SUCCESS_UNSYNCED,
lyrics=lyrics,
source=self.source_name,
ttl=TTL_UNSYNCED,
confidence=confidence,
)
else:
logger.debug("LRCLIB-search: best candidate has empty lyrics")
+34 -15
View File
@@ -43,15 +43,15 @@ class NeteaseFetcher(BaseFetcher):
def is_available(self, track: TrackMeta) -> bool:
return bool(track.title)
def _search(self, track: TrackMeta, limit: int = 10) -> Optional[int]:
"""Search Netease and return the best-matching song ID.
def _search(self, track: TrackMeta, limit: int = 10) -> tuple[Optional[int], float]:
"""Search Netease and return the best-matching song ID with confidence.
When ``track.length`` is available, candidates are ranked by duration
difference and only accepted if within ``DURATION_TOLERANCE_MS``.
"""
query = f"{track.artist or ''} {track.title or ''}".strip()
if not query:
return None
return None, 0.0
logger.debug(f"Netease: searching for '{query}' (limit={limit})")
@@ -70,17 +70,17 @@ class NeteaseFetcher(BaseFetcher):
logger.error(
f"Netease: search returned non-dict: {type(result).__name__}"
)
return None
return None, 0.0
result_body = result.get("result")
if not isinstance(result_body, dict):
logger.debug("Netease: search 'result' field missing or invalid")
return None
return None, 0.0
songs = result_body.get("songs")
if not isinstance(songs, list) or len(songs) == 0:
logger.debug("Netease: search returned 0 results")
return None
return None, 0.0
logger.debug(f"Netease: search returned {len(songs)} candidates")
@@ -90,23 +90,37 @@ class NeteaseFetcher(BaseFetcher):
duration_ms=float(song["dt"])
if isinstance(song.get("dt"), int)
else None,
title=song.get("name"),
artist=", ".join(a.get("name", "") for a in song.get("ar", []))
or None,
album=(song.get("al") or {}).get("name"),
)
for song in songs
if isinstance(song, dict) and song.get("id") is not None
]
best_id = select_best(candidates, track.length)
best_id, confidence = select_best(
candidates,
track.length,
title=track.title,
artist=track.artist,
album=track.album,
)
if best_id is not None:
logger.debug(f"Netease: selected id={best_id}")
return best_id
logger.debug(
f"Netease: selected id={best_id} (confidence={confidence:.0f})"
)
return best_id, confidence
logger.debug("Netease: no suitable candidate found")
return None
return None, 0.0
except Exception as e:
logger.error(f"Netease: search failed: {e}")
return None
return None, 0.0
def _get_lyric(self, song_id: int) -> Optional[LyricResult]:
def _get_lyric(
self, song_id: int, confidence: float = 0.0
) -> Optional[LyricResult]:
"""Fetch lyrics for a given Netease song ID."""
logger.debug(f"Netease: fetching lyrics for song_id={song_id}")
@@ -158,7 +172,12 @@ class NeteaseFetcher(BaseFetcher):
f"Netease: got {status.value} lyrics for song_id={song_id} "
f"({len(lrcdata)} lines)"
)
return LyricResult(status=status, lyrics=lrcdata, source=self.source_name)
return LyricResult(
status=status,
lyrics=lrcdata,
source=self.source_name,
confidence=confidence,
)
except Exception as e:
logger.error(f"Netease: lyric fetch failed for song_id={song_id}: {e}")
@@ -174,9 +193,9 @@ class NeteaseFetcher(BaseFetcher):
return None
logger.info(f"Netease: fetching lyrics for {track.display_name()}")
song_id = self._search(track)
song_id, confidence = self._search(track)
if not song_id:
logger.debug(f"Netease: no match found for {track.display_name()}")
return LyricResult(status=CacheStatus.NOT_FOUND, ttl=TTL_NOT_FOUND)
return self._get_lyric(song_id)
return self._get_lyric(song_id, confidence=confidence)
+31 -14
View File
@@ -35,11 +35,11 @@ class QQMusicFetcher(BaseFetcher):
def is_available(self, track: TrackMeta) -> bool:
return bool(track.title) and bool(QQ_MUSIC_API_URL)
def _search(self, track: TrackMeta, limit: int = 10) -> Optional[str]:
"""Search QQ Music and return the best-matching song MID."""
def _search(self, track: TrackMeta, limit: int = 10) -> tuple[Optional[str], float]:
"""Search QQ Music and return the best-matching song MID with confidence."""
query = f"{track.artist or ''} {track.title or ''}".strip()
if not query:
return None
return None, 0.0
logger.debug(f"QQMusic: searching for '{query}' (limit={limit})")
@@ -54,12 +54,12 @@ class QQMusicFetcher(BaseFetcher):
if data.get("code") != 0:
logger.error(f"QQMusic: search API error: {data}")
return None
return None, 0.0
songs = data.get("data", {}).get("list", [])
if not songs:
logger.debug("QQMusic: search returned 0 results")
return None
return None, 0.0
logger.debug(f"QQMusic: search returned {len(songs)} candidates")
@@ -69,23 +69,35 @@ class QQMusicFetcher(BaseFetcher):
duration_ms=float(song["interval"]) * 1000
if isinstance(song.get("interval"), int)
else None,
title=song.get("name"),
artist=", ".join(s.get("name", "") for s in song.get("singer", []))
or None,
album=(song.get("album") or {}).get("name"),
)
for song in songs
if isinstance(song, dict) and song.get("mid") is not None
]
best_mid = select_best(candidates, track.length)
best_mid, confidence = select_best(
candidates,
track.length,
title=track.title,
artist=track.artist,
album=track.album,
)
if best_mid is not None:
logger.debug(f"QQMusic: selected mid={best_mid}")
return best_mid
logger.debug(
f"QQMusic: selected mid={best_mid} (confidence={confidence:.0f})"
)
return best_mid, confidence
logger.debug("QQMusic: no suitable candidate found")
return None
return None, 0.0
except Exception as e:
logger.error(f"QQMusic: search failed: {e}")
return None
return None, 0.0
def _get_lyric(self, mid: str) -> Optional[LyricResult]:
def _get_lyric(self, mid: str, confidence: float = 0.0) -> Optional[LyricResult]:
"""Fetch lyrics for a given QQ Music song MID."""
logger.debug(f"QQMusic: fetching lyrics for mid={mid}")
@@ -115,7 +127,12 @@ class QQMusicFetcher(BaseFetcher):
f"QQMusic: got {status.value} lyrics for mid={mid} "
f"({len(lrcdata)} lines)"
)
return LyricResult(status=status, lyrics=lrcdata, source=self.source_name)
return LyricResult(
status=status,
lyrics=lrcdata,
source=self.source_name,
confidence=confidence,
)
except Exception as e:
logger.error(f"QQMusic: lyric fetch failed for mid={mid}: {e}")
@@ -135,9 +152,9 @@ class QQMusicFetcher(BaseFetcher):
return None
logger.info(f"QQMusic: fetching lyrics for {track.display_name()}")
mid = self._search(track)
mid, confidence = self._search(track)
if not mid:
logger.debug(f"QQMusic: no match found for {track.display_name()}")
return LyricResult(status=CacheStatus.NOT_FOUND, ttl=TTL_NOT_FOUND)
return self._get_lyric(mid)
return self._get_lyric(mid, confidence=confidence)
+132 -32
View File
@@ -2,13 +2,23 @@
Shared candidate-selection logic for search-based fetchers.
Each fetcher maps its API-specific results to SearchCandidate, then calls
select_best() which handles duration filtering and synced preference uniformly.
select_best() which scores candidates by metadata similarity, duration
proximity, and sync status.
"""
from dataclasses import dataclass
from typing import Generic, Optional, TypeVar
from ..config import DURATION_TOLERANCE_MS
from ..config import (
DURATION_TOLERANCE_MS,
SCORE_W_TITLE as _W_TITLE,
SCORE_W_ARTIST as _W_ARTIST,
SCORE_W_ALBUM as _W_ALBUM,
SCORE_W_DURATION as _W_DURATION,
SCORE_W_SYNCED as _W_SYNCED,
MIN_CONFIDENCE,
)
from ..normalize import normalize_for_match, normalize_artist
T = TypeVar("T")
@@ -21,48 +31,138 @@ class SearchCandidate(Generic[T]):
item: The original API-specific object (dict, ID, etc.)
duration_ms: Track duration in milliseconds, or None if unknown.
is_synced: Whether this candidate is known to have synced lyrics.
title: Candidate track title for similarity scoring.
artist: Candidate artist name for similarity scoring.
album: Candidate album name for similarity scoring.
"""
item: T
duration_ms: Optional[float] = None
is_synced: bool = False
title: Optional[str] = None
artist: Optional[str] = None
album: Optional[str] = None
def _text_similarity(a: str, b: str) -> float:
"""Compare two normalized strings. Returns 0.0-1.0."""
if a == b:
return 1.0
if not a or not b:
return 0.0
# Containment: one is a substring of the other (e.g. "My Love" vs "My Love (Album Version)")
if a in b or b in a:
return min(len(a), len(b)) / max(len(a), len(b))
return 0.0
def _score_candidate(
    c: SearchCandidate[T],
    ref_title: Optional[str],
    ref_artist: Optional[str],
    ref_album: Optional[str],
    ref_length_ms: Optional[int],
) -> float:
    """Score a candidate from 0-100 based on metadata match quality.

    Scoring works in two tiers:

    1. **Metadata score** — computed from fields available on *both* sides,
       then rescaled to fill the 0-90 range so that missing fields don't
       inflate the score. Fields missing on both sides are simply excluded
       from the calculation (neutral). Fields present on only one side
       contribute 0 to the numerator but their weight still counts in the
       denominator (penalty for asymmetric absence).
    2. **Synced bonus** — a flat 10 pts, always applied independently.

    Field weights (before rescaling):
    - Title: 40
    - Artist: 30
    - Album: 10
    - Duration: 10

    Args:
        c: Candidate mapped from an API-specific search result.
        ref_title: Reference track title (None = unknown).
        ref_artist: Reference artist credit (None = unknown).
        ref_album: Reference album name (None = unknown).
        ref_length_ms: Reference duration in milliseconds (None = unknown).

    Returns:
        A confidence score in the 0-100 range.
    """
    raw = 0.0                # weighted points actually earned
    available_weight = 0.0   # total weight of fields counted in the denominator

    # Title: fuzzy similarity of the normalized strings.
    if ref_title is not None or c.title is not None:
        available_weight += _W_TITLE
        if ref_title is not None and c.title is not None:
            raw += _W_TITLE * _text_similarity(
                normalize_for_match(ref_title), normalize_for_match(c.title)
            )
        # If only one side has a title, the weight counts but no points are
        # earned (asymmetric-absence penalty); both-None is excluded above.

    # Artist: exact match on the order-insensitive normalized form gets full
    # weight; otherwise fall back to plain fuzzy similarity.
    if ref_artist is not None or c.artist is not None:
        available_weight += _W_ARTIST
        if ref_artist is not None and c.artist is not None:
            na = normalize_artist(ref_artist)
            nb = normalize_artist(c.artist)
            if na == nb:
                raw += _W_ARTIST
            else:
                raw += _W_ARTIST * _text_similarity(
                    normalize_for_match(ref_artist), normalize_for_match(c.artist)
                )

    # Album: fuzzy similarity, same pattern as title.
    if ref_album is not None or c.album is not None:
        available_weight += _W_ALBUM
        if ref_album is not None and c.album is not None:
            raw += _W_ALBUM * _text_similarity(
                normalize_for_match(ref_album), normalize_for_match(c.album)
            )

    # Duration: linear falloff within the tolerance window, zero beyond it.
    if ref_length_ms is not None or c.duration_ms is not None:
        available_weight += _W_DURATION
        if ref_length_ms is not None and c.duration_ms is not None:
            diff = abs(c.duration_ms - ref_length_ms)
            if diff <= DURATION_TOLERANCE_MS:
                raw += _W_DURATION * (1.0 - diff / DURATION_TOLERANCE_MS)

    # Rescale metadata to 0-90 range
    _MAX_METADATA = _W_TITLE + _W_ARTIST + _W_ALBUM + _W_DURATION  # 90
    if available_weight > 0:
        metadata_score = (raw / available_weight) * _MAX_METADATA
    else:
        # No comparable fields at all — only synced bonus matters
        metadata_score = 0.0

    # Synced bonus (always 10 pts, independent of metadata)
    synced_score = _W_SYNCED if c.is_synced else 0.0
    return metadata_score + synced_score
def select_best(
candidates: list[SearchCandidate[T]],
track_length_ms: Optional[int] = None,
tolerance_ms: float = DURATION_TOLERANCE_MS,
) -> Optional[T]:
"""Pick the best candidate by duration proximity and sync preference.
*,
title: Optional[str] = None,
artist: Optional[str] = None,
album: Optional[str] = None,
min_confidence: float = MIN_CONFIDENCE,
) -> tuple[Optional[T], float]:
"""Pick the best candidate by confidence scoring.
When track_length_ms is available:
- Filter by tolerance_ms
- Pick closest duration, prefer synced at equal distance
When track_length_ms is unavailable:
- Pick first synced candidate, or first overall
Returns (item, score). Item is None if no candidate scores above min_confidence.
"""
if track_length_ms is not None:
best: Optional[SearchCandidate[T]] = None
best_diff = float("inf")
if not candidates:
return None, 0.0
for c in candidates:
if c.duration_ms is None:
continue
diff = abs(c.duration_ms - track_length_ms)
if diff > tolerance_ms:
continue
if diff < best_diff or (
diff == best_diff
and c.is_synced
and (best is None or not best.is_synced)
):
best_diff = diff
best = c
best_item: Optional[T] = None
best_score = -1.0
return best.item if best is not None else None
# No duration — prefer synced, fallback to first
for c in candidates:
if c.is_synced:
return c.item
return candidates[0].item if candidates else None
s = _score_candidate(c, title, artist, album, track_length_ms)
if s > best_score:
best_score = s
best_item = c.item
if best_score < min_confidence:
return None, best_score
return best_item, best_score
+3
View File
@@ -62,3 +62,6 @@ class LyricResult:
lyrics: Optional[LRCData] = None
source: Optional[str] = None # Which fetcher produced this result
ttl: Optional[int] = None # Hint for cache TTL (seconds)
confidence: Optional[float] = (
None # 0-100 selection confidence (None = exact/trusted)
)
+47
View File
@@ -0,0 +1,47 @@
"""
Shared text normalization utilities for fuzzy matching.
Used by cache key generation, cache search, and candidate selection scoring.
"""
import re
import unicodedata
# Punctuation to strip for fuzzy matching (ASCII + fullwidth + CJK brackets/symbols)
_PUNCT_RE = re.compile(
r"[~!@#$%^&*()_+\-=\[\]{}|;:'\",.<>?/\\`"
r"~!@#$%^&*()_+-=【】{}|;:'",。<>?/\`"
r"「」『』《》〈〉〔〕·•‥…—–]"
)
_SPACE_RE = re.compile(r"\s+")
# feat./ft./featuring and everything after (case-insensitive, word boundary)
_FEAT_RE = re.compile(r"\s*(?:\bfeat\.?\b|\bft\.?\b|\bfeaturing\b).*", re.IGNORECASE)
# Multi-artist separators: /, &, ×, x (surrounded by spaces), ;, 、, vs.
_ARTIST_SEP_RE = re.compile(r"\s*(?:[/&;×、]|\bvs\.?\b|\bx\b)\s*", re.IGNORECASE)
def normalize_for_match(s: str) -> str:
    """Return the canonical lowercase form of *s* for fuzzy matching.

    Pipeline: NFKC fold (fullwidth → halfwidth) plus lowercase, strip the
    feat./ft./featuring tail, turn punctuation into spaces, then collapse
    whitespace runs and trim the ends.
    """
    text = unicodedata.normalize("NFKC", s).lower()
    for pattern, replacement in ((_FEAT_RE, ""), (_PUNCT_RE, " ")):
        text = pattern.sub(replacement, text)
    return _SPACE_RE.sub(" ", text).strip()
def normalize_artist(s: str) -> str:
    """Canonicalize an artist credit so listing order doesn't matter.

    The credit is NFKC-folded and lowercased, split on multi-artist
    separators (/, &, ;, ×, 、, vs., x), and every part is normalized
    individually after stripping its feat./ft./featuring clause — so
    'A feat. C / B' reduces to the parts ['a', 'b'], not just ['a'].
    Unique parts are sorted and NUL-joined; when nothing survives the
    split, the whole folded string is normalized as a fallback.
    """
    folded = unicodedata.normalize("NFKC", s).lower()
    unique_parts = {
        normalize_for_match(part)
        for part in _ARTIST_SEP_RE.split(folded)
        if _FEAT_RE.sub("", part).strip()
    }
    if unique_parts:
        return "\0".join(sorted(unique_parts))
    return normalize_for_match(folded)