feat: evaluate fetch results with "confidence"
This commit is contained in:
Vendored
-3
@@ -1,3 +0,0 @@
|
|||||||
{
|
|
||||||
"python-envs.defaultEnvManager": "ms-python.python:venv"
|
|
||||||
}
|
|
||||||
+63
-57
@@ -4,56 +4,21 @@ Date: 2026-03-25 10:18:03
|
|||||||
Description: SQLite-based lyric cache with per-source storage and TTL expiration
|
Description: SQLite-based lyric cache with per-source storage and TTL expiration
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import re
|
|
||||||
import sqlite3
|
import sqlite3
|
||||||
import hashlib
|
import hashlib
|
||||||
import time
|
import time
|
||||||
import unicodedata
|
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
from loguru import logger
|
from loguru import logger
|
||||||
|
|
||||||
from .lrc import LRCData
|
from .lrc import LRCData
|
||||||
from .config import DURATION_TOLERANCE_MS
|
from .normalize import normalize_for_match as _normalize_for_match
|
||||||
from .models import TrackMeta, LyricResult, CacheStatus
|
from .normalize import normalize_artist as _normalize_artist
|
||||||
|
from .config import (
|
||||||
# Punctuation to strip for fuzzy matching (ASCII + fullwidth + CJK brackets/symbols)
|
DURATION_TOLERANCE_MS,
|
||||||
_PUNCT_RE = re.compile(
|
LEGACY_CONFIDENCE_SYNCED,
|
||||||
r"[~!@#$%^&*()_+\-=\[\]{}|;:'\",.<>?/\\`"
|
LEGACY_CONFIDENCE_UNSYNCED,
|
||||||
r"~!@#$%^&*()_+-=【】{}|;:'",。<>?/\`"
|
|
||||||
r"「」『』《》〈〉〔〕·•‥…—–]"
|
|
||||||
)
|
)
|
||||||
_SPACE_RE = re.compile(r"\s+")
|
from .models import TrackMeta, LyricResult, CacheStatus
|
||||||
# feat./ft./featuring and everything after (case-insensitive, word boundary)
|
|
||||||
_FEAT_RE = re.compile(r"\s*(?:\bfeat\.?\b|\bft\.?\b|\bfeaturing\b).*", re.IGNORECASE)
|
|
||||||
# Multi-artist separators: /, &, ×, x (surrounded by spaces), ;, 、, vs.
|
|
||||||
_ARTIST_SEP_RE = re.compile(r"\s*(?:[/&;×、]|\bvs\.?\b|\bx\b)\s*", re.IGNORECASE)
|
|
||||||
|
|
||||||
|
|
||||||
def _normalize_for_match(s: str) -> str:
|
|
||||||
"""Normalize a string for fuzzy comparison.
|
|
||||||
|
|
||||||
Lowercases, NFKC-normalizes (fullwidth → halfwidth), strips punctuation,
|
|
||||||
and collapses whitespace.
|
|
||||||
"""
|
|
||||||
s = unicodedata.normalize("NFKC", s).lower()
|
|
||||||
s = _FEAT_RE.sub("", s)
|
|
||||||
s = _PUNCT_RE.sub(" ", s)
|
|
||||||
s = _SPACE_RE.sub(" ", s).strip()
|
|
||||||
return s
|
|
||||||
|
|
||||||
|
|
||||||
def _normalize_artist(s: str) -> str:
|
|
||||||
"""Normalize an artist string: split by separators, normalize each, sort.
|
|
||||||
|
|
||||||
Splits first (on /, &, ;, ×, 、, vs., x), then strips feat./ft./featuring
|
|
||||||
from each part individually, so 'A feat. C / B' → ['a', 'b'] not just ['a'].
|
|
||||||
"""
|
|
||||||
s = unicodedata.normalize("NFKC", s).lower()
|
|
||||||
parts = _ARTIST_SEP_RE.split(s)
|
|
||||||
normed = sorted(
|
|
||||||
{_normalize_for_match(p) for p in parts if _FEAT_RE.sub("", p).strip()}
|
|
||||||
)
|
|
||||||
return "\0".join(normed) if normed else _normalize_for_match(s)
|
|
||||||
|
|
||||||
|
|
||||||
def _generate_key(track: TrackMeta, source: str) -> str:
|
def _generate_key(track: TrackMeta, source: str) -> str:
|
||||||
@@ -110,10 +75,12 @@ class CacheEngine:
|
|||||||
length INTEGER
|
length INTEGER
|
||||||
)
|
)
|
||||||
""")
|
""")
|
||||||
# Migration: add length column if missing
|
# Migrations
|
||||||
cols = {r[1] for r in conn.execute("PRAGMA table_info(cache)").fetchall()}
|
cols = {r[1] for r in conn.execute("PRAGMA table_info(cache)").fetchall()}
|
||||||
if "length" not in cols:
|
if "length" not in cols:
|
||||||
conn.execute("ALTER TABLE cache ADD COLUMN length INTEGER")
|
conn.execute("ALTER TABLE cache ADD COLUMN length INTEGER")
|
||||||
|
if "confidence" not in cols:
|
||||||
|
conn.execute("ALTER TABLE cache ADD COLUMN confidence REAL")
|
||||||
conn.commit()
|
conn.commit()
|
||||||
|
|
||||||
# Read
|
# Read
|
||||||
@@ -130,7 +97,7 @@ class CacheEngine:
|
|||||||
|
|
||||||
with sqlite3.connect(self.db_path) as conn:
|
with sqlite3.connect(self.db_path) as conn:
|
||||||
row = conn.execute(
|
row = conn.execute(
|
||||||
"SELECT status, lyrics, source, expires_at, length FROM cache WHERE key = ?",
|
"SELECT status, lyrics, source, expires_at, length, confidence FROM cache WHERE key = ?",
|
||||||
(key,),
|
(key,),
|
||||||
).fetchone()
|
).fetchone()
|
||||||
|
|
||||||
@@ -138,7 +105,7 @@ class CacheEngine:
|
|||||||
logger.debug(f"Cache miss: {source} / {track.display_name()}")
|
logger.debug(f"Cache miss: {source} / {track.display_name()}")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
status_str, lyrics, src, expires_at, cached_length = row
|
status_str, lyrics, src, expires_at, cached_length, confidence = row
|
||||||
|
|
||||||
# Check TTL expiration
|
# Check TTL expiration
|
||||||
if expires_at and expires_at < int(time.time()):
|
if expires_at and expires_at < int(time.time()):
|
||||||
@@ -160,15 +127,27 @@ class CacheEngine:
|
|||||||
f"Cache hit: {source} / {track.display_name()} "
|
f"Cache hit: {source} / {track.display_name()} "
|
||||||
f"[{status_str}, ttl={remaining}s]"
|
f"[{status_str}, ttl={remaining}s]"
|
||||||
)
|
)
|
||||||
|
status = CacheStatus(status_str)
|
||||||
|
if confidence is None and status in (
|
||||||
|
CacheStatus.SUCCESS_SYNCED,
|
||||||
|
CacheStatus.SUCCESS_UNSYNCED,
|
||||||
|
):
|
||||||
|
confidence = (
|
||||||
|
LEGACY_CONFIDENCE_SYNCED
|
||||||
|
if status == CacheStatus.SUCCESS_SYNCED
|
||||||
|
else LEGACY_CONFIDENCE_UNSYNCED
|
||||||
|
)
|
||||||
|
|
||||||
return LyricResult(
|
return LyricResult(
|
||||||
status=CacheStatus(status_str),
|
status=status,
|
||||||
lyrics=LRCData(lyrics) if lyrics else None,
|
lyrics=LRCData(lyrics) if lyrics else None,
|
||||||
source=src,
|
source=src,
|
||||||
ttl=remaining,
|
ttl=remaining,
|
||||||
|
confidence=confidence,
|
||||||
)
|
)
|
||||||
|
|
||||||
def get_best(self, track: TrackMeta, sources: list[str]) -> Optional[LyricResult]:
|
def get_best(self, track: TrackMeta, sources: list[str]) -> Optional[LyricResult]:
|
||||||
"""Return the best cached result across *sources* (synced > unsynced).
|
"""Return the best cached result across *sources* by confidence.
|
||||||
|
|
||||||
Skips negative statuses (NOT_FOUND, NETWORK_ERROR) — those are only
|
Skips negative statuses (NOT_FOUND, NETWORK_ERROR) — those are only
|
||||||
consulted per-source to avoid redundant fetches.
|
consulted per-source to avoid redundant fetches.
|
||||||
@@ -178,9 +157,19 @@ class CacheEngine:
|
|||||||
cached = self.get(track, src)
|
cached = self.get(track, src)
|
||||||
if not cached:
|
if not cached:
|
||||||
continue
|
continue
|
||||||
if cached.status == CacheStatus.SUCCESS_SYNCED:
|
if cached.status not in (
|
||||||
return cached # Can't do better
|
CacheStatus.SUCCESS_SYNCED,
|
||||||
if cached.status == CacheStatus.SUCCESS_UNSYNCED and best is None:
|
CacheStatus.SUCCESS_UNSYNCED,
|
||||||
|
):
|
||||||
|
continue
|
||||||
|
if best is None:
|
||||||
|
best = cached
|
||||||
|
else:
|
||||||
|
cached_conf = (
|
||||||
|
cached.confidence if cached.confidence is not None else 100.0
|
||||||
|
)
|
||||||
|
best_conf = best.confidence if best.confidence is not None else 100.0
|
||||||
|
if cached_conf > best_conf:
|
||||||
best = cached
|
best = cached
|
||||||
return best
|
return best
|
||||||
|
|
||||||
@@ -207,8 +196,8 @@ class CacheEngine:
|
|||||||
conn.execute(
|
conn.execute(
|
||||||
"""INSERT OR REPLACE INTO cache
|
"""INSERT OR REPLACE INTO cache
|
||||||
(key, source, status, lyrics, created_at, expires_at,
|
(key, source, status, lyrics, created_at, expires_at,
|
||||||
artist, title, album, length)
|
artist, title, album, length, confidence)
|
||||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""",
|
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""",
|
||||||
(
|
(
|
||||||
key,
|
key,
|
||||||
source,
|
source,
|
||||||
@@ -220,6 +209,7 @@ class CacheEngine:
|
|||||||
track.title,
|
track.title,
|
||||||
track.album,
|
track.album,
|
||||||
track.length,
|
track.length,
|
||||||
|
result.confidence,
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
conn.commit()
|
conn.commit()
|
||||||
@@ -288,7 +278,7 @@ class CacheEngine:
|
|||||||
"""Find the best positive (synced/unsynced) cache entry for *track*.
|
"""Find the best positive (synced/unsynced) cache entry for *track*.
|
||||||
|
|
||||||
Uses exact metadata match (artist + title + album) across all sources.
|
Uses exact metadata match (artist + title + album) across all sources.
|
||||||
Returns synced if available, otherwise unsynced, or None.
|
Returns the highest-confidence entry, or None.
|
||||||
"""
|
"""
|
||||||
conditions, params = self._track_where(track)
|
conditions, params = self._track_where(track)
|
||||||
if not conditions:
|
if not conditions:
|
||||||
@@ -306,19 +296,34 @@ class CacheEngine:
|
|||||||
with sqlite3.connect(self.db_path) as conn:
|
with sqlite3.connect(self.db_path) as conn:
|
||||||
conn.row_factory = sqlite3.Row
|
conn.row_factory = sqlite3.Row
|
||||||
rows = conn.execute(
|
rows = conn.execute(
|
||||||
f"SELECT status, lyrics, source FROM cache WHERE {where} "
|
f"SELECT status, lyrics, source, confidence FROM cache WHERE {where} "
|
||||||
"ORDER BY CASE status WHEN ? THEN 0 ELSE 1 END LIMIT 1",
|
"ORDER BY COALESCE(confidence, "
|
||||||
params + [CacheStatus.SUCCESS_SYNCED.value],
|
" CASE status WHEN ? THEN ? ELSE ? END"
|
||||||
|
") DESC, created_at DESC LIMIT 1",
|
||||||
|
params
|
||||||
|
+ [
|
||||||
|
CacheStatus.SUCCESS_SYNCED.value,
|
||||||
|
LEGACY_CONFIDENCE_SYNCED,
|
||||||
|
LEGACY_CONFIDENCE_UNSYNCED,
|
||||||
|
],
|
||||||
).fetchall()
|
).fetchall()
|
||||||
|
|
||||||
if not rows:
|
if not rows:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
row = dict(rows[0])
|
row = dict(rows[0])
|
||||||
|
confidence = row["confidence"]
|
||||||
|
if confidence is None:
|
||||||
|
confidence = (
|
||||||
|
LEGACY_CONFIDENCE_SYNCED
|
||||||
|
if row["status"] == CacheStatus.SUCCESS_SYNCED.value
|
||||||
|
else LEGACY_CONFIDENCE_UNSYNCED
|
||||||
|
)
|
||||||
return LyricResult(
|
return LyricResult(
|
||||||
status=CacheStatus(row["status"]),
|
status=CacheStatus(row["status"]),
|
||||||
lyrics=LRCData(row["lyrics"]) if row["lyrics"] else None,
|
lyrics=LRCData(row["lyrics"]) if row["lyrics"] else None,
|
||||||
source="cache-search",
|
source="cache-search",
|
||||||
|
confidence=confidence,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Fuzzy search
|
# Fuzzy search
|
||||||
@@ -384,7 +389,8 @@ class CacheEngine:
|
|||||||
scored.sort(
|
scored.sort(
|
||||||
key=lambda x: (
|
key=lambda x: (
|
||||||
x[0],
|
x[0],
|
||||||
x[1].get("status") != CacheStatus.SUCCESS_SYNCED.value,
|
-(x[1].get("confidence") or 0),
|
||||||
|
-(x[1].get("created_at") or 0),
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
matches = [m for _, m in scored]
|
matches = [m for _, m in scored]
|
||||||
|
|||||||
@@ -428,6 +428,7 @@ def _print_cache_row(row: dict, indent: str = "") -> None:
|
|||||||
created = row.get("created_at", 0)
|
created = row.get("created_at", 0)
|
||||||
expires = row.get("expires_at")
|
expires = row.get("expires_at")
|
||||||
lyrics = row.get("lyrics", "")
|
lyrics = row.get("lyrics", "")
|
||||||
|
confidence = row.get("confidence")
|
||||||
|
|
||||||
name = f"{artist} - {title}" if artist and title else row.get("key", "?")
|
name = f"{artist} - {title}" if artist and title else row.get("key", "?")
|
||||||
print(f"{indent}[{source}] {name}")
|
print(f"{indent}[{source}] {name}")
|
||||||
@@ -450,6 +451,8 @@ def _print_cache_row(row: dict, indent: str = "") -> None:
|
|||||||
if lyrics:
|
if lyrics:
|
||||||
line_count = len(lyrics.splitlines())
|
line_count = len(lyrics.splitlines())
|
||||||
print(f"{indent} Lyrics : {line_count} lines")
|
print(f"{indent} Lyrics : {line_count} lines")
|
||||||
|
if confidence is not None:
|
||||||
|
print(f"{indent} Confidence: {confidence:.0f}")
|
||||||
|
|
||||||
|
|
||||||
def run():
|
def run():
|
||||||
|
|||||||
@@ -38,6 +38,21 @@ TTL_NETWORK_ERROR = 3600 # 1 hour
|
|||||||
# Search
|
# Search
|
||||||
DURATION_TOLERANCE_MS = 3000 # max duration mismatch for search matching
|
DURATION_TOLERANCE_MS = 3000 # max duration mismatch for search matching
|
||||||
|
|
||||||
|
# Confidence scoring weights (sum to 100)
|
||||||
|
SCORE_W_TITLE = 40.0
|
||||||
|
SCORE_W_ARTIST = 30.0
|
||||||
|
SCORE_W_ALBUM = 10.0
|
||||||
|
SCORE_W_DURATION = 10.0
|
||||||
|
SCORE_W_SYNCED = 10.0
|
||||||
|
|
||||||
|
# Confidence thresholds
|
||||||
|
MIN_CONFIDENCE = 25.0 # below this, candidate is rejected
|
||||||
|
HIGH_CONFIDENCE = 80.0 # at or above this, stop searching early
|
||||||
|
|
||||||
|
# Legacy cache rows (no confidence stored) get a base score by sync status
|
||||||
|
LEGACY_CONFIDENCE_SYNCED = 50.0
|
||||||
|
LEGACY_CONFIDENCE_UNSYNCED = 40.0
|
||||||
|
|
||||||
# Spotify related
|
# Spotify related
|
||||||
SPOTIFY_TOKEN_URL = "https://open.spotify.com/api/token"
|
SPOTIFY_TOKEN_URL = "https://open.spotify.com/api/token"
|
||||||
SPOTIFY_LYRICS_URL = "https://spclient.wg.spotify.com/color-lyrics/v2/track/"
|
SPOTIFY_LYRICS_URL = "https://spclient.wg.spotify.com/color-lyrics/v2/track/"
|
||||||
|
|||||||
+57
-24
@@ -9,7 +9,7 @@ Fetch pipeline:
|
|||||||
1. Check cache for each source in the fallback sequence
|
1. Check cache for each source in the fallback sequence
|
||||||
2. For sources without a valid cache hit, call the fetcher
|
2. For sources without a valid cache hit, call the fetcher
|
||||||
3. Cache every result (success, not-found, or error) per source
|
3. Cache every result (success, not-found, or error) per source
|
||||||
4. Return the best result (synced > unsynced > None)
|
4. Return the best result by confidence (highest wins)
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
@@ -19,7 +19,13 @@ from .fetchers import FetcherMethodType, create_fetchers
|
|||||||
from .fetchers.base import BaseFetcher
|
from .fetchers.base import BaseFetcher
|
||||||
from .cache import CacheEngine
|
from .cache import CacheEngine
|
||||||
from .lrc import LRCData
|
from .lrc import LRCData
|
||||||
from .config import TTL_SYNCED, TTL_UNSYNCED, TTL_NOT_FOUND, TTL_NETWORK_ERROR
|
from .config import (
|
||||||
|
TTL_SYNCED,
|
||||||
|
TTL_UNSYNCED,
|
||||||
|
TTL_NOT_FOUND,
|
||||||
|
TTL_NETWORK_ERROR,
|
||||||
|
HIGH_CONFIDENCE,
|
||||||
|
)
|
||||||
from .models import TrackMeta, LyricResult, CacheStatus
|
from .models import TrackMeta, LyricResult, CacheStatus
|
||||||
from .enrichers import enrich_track
|
from .enrichers import enrich_track
|
||||||
|
|
||||||
@@ -33,6 +39,18 @@ _STATUS_TTL: dict[CacheStatus, Optional[int]] = {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def _is_better(new: LyricResult, old: LyricResult) -> bool:
|
||||||
|
"""Compare two results by confidence only.
|
||||||
|
|
||||||
|
Synced/unsynced preference is already baked into the confidence score
|
||||||
|
(synced bonus in scoring weights), so we don't need a separate tier.
|
||||||
|
None confidence = trusted = 100.
|
||||||
|
"""
|
||||||
|
new_conf = new.confidence if new.confidence is not None else 100.0
|
||||||
|
old_conf = old.confidence if old.confidence is not None else 100.0
|
||||||
|
return new_conf > old_conf
|
||||||
|
|
||||||
|
|
||||||
class LrcManager:
|
class LrcManager:
|
||||||
"""Main entry point for fetching lyrics with caching."""
|
"""Main entry point for fetching lyrics with caching."""
|
||||||
|
|
||||||
@@ -72,7 +90,7 @@ class LrcManager:
|
|||||||
- Cache miss or unsynced → call fetcher, then cache the result
|
- Cache miss or unsynced → call fetcher, then cache the result
|
||||||
|
|
||||||
After all sources are tried, returns the best result found
|
After all sources are tried, returns the best result found
|
||||||
(synced > unsynced > None).
|
(highest confidence wins).
|
||||||
"""
|
"""
|
||||||
track = enrich_track(track)
|
track = enrich_track(track)
|
||||||
logger.info(f"Fetching lyrics for: {track.display_name()}")
|
logger.info(f"Fetching lyrics for: {track.display_name()}")
|
||||||
@@ -81,7 +99,7 @@ class LrcManager:
|
|||||||
if not sequence:
|
if not sequence:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
# Best result seen so far (synced wins over unsynced)
|
# Best result seen so far (highest confidence wins)
|
||||||
best_result: Optional[LyricResult] = None
|
best_result: Optional[LyricResult] = None
|
||||||
|
|
||||||
for fetcher in sequence:
|
for fetcher in sequence:
|
||||||
@@ -91,17 +109,7 @@ class LrcManager:
|
|||||||
if not bypass_cache and not fetcher.self_cached:
|
if not bypass_cache and not fetcher.self_cached:
|
||||||
cached = self.cache.get(track, source)
|
cached = self.cache.get(track, source)
|
||||||
if cached:
|
if cached:
|
||||||
if cached.status == CacheStatus.SUCCESS_SYNCED:
|
if cached.status in (
|
||||||
logger.info(f"[{source}] cache hit: synced lyrics")
|
|
||||||
return cached
|
|
||||||
elif cached.status == CacheStatus.SUCCESS_UNSYNCED:
|
|
||||||
logger.debug(
|
|
||||||
f"[{source}] cache hit: unsynced lyrics (continuing)"
|
|
||||||
)
|
|
||||||
if best_result is None:
|
|
||||||
best_result = cached
|
|
||||||
continue # Try next source for synced
|
|
||||||
elif cached.status in (
|
|
||||||
CacheStatus.NOT_FOUND,
|
CacheStatus.NOT_FOUND,
|
||||||
CacheStatus.NETWORK_ERROR,
|
CacheStatus.NETWORK_ERROR,
|
||||||
):
|
):
|
||||||
@@ -109,6 +117,23 @@ class LrcManager:
|
|||||||
f"[{source}] cache hit: {cached.status.value}, skipping"
|
f"[{source}] cache hit: {cached.status.value}, skipping"
|
||||||
)
|
)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
# Positive cache hit — apply the same confidence evaluation
|
||||||
|
# as fresh fetches so that low-confidence cached results
|
||||||
|
# don't block better results from later fetchers.
|
||||||
|
is_trusted = (
|
||||||
|
cached.confidence is None
|
||||||
|
or cached.confidence >= HIGH_CONFIDENCE
|
||||||
|
)
|
||||||
|
logger.info(
|
||||||
|
f"[{source}] cache hit: {cached.status.value}"
|
||||||
|
f" (confidence={'trusted' if cached.confidence is None else f'{cached.confidence:.0f}'})"
|
||||||
|
)
|
||||||
|
if cached.status == CacheStatus.SUCCESS_SYNCED and is_trusted:
|
||||||
|
return cached
|
||||||
|
if best_result is None or _is_better(cached, best_result):
|
||||||
|
best_result = cached
|
||||||
|
continue
|
||||||
elif not fetcher.self_cached:
|
elif not fetcher.self_cached:
|
||||||
logger.debug(f"[{source}] cache bypassed")
|
logger.debug(f"[{source}] cache bypassed")
|
||||||
|
|
||||||
@@ -126,20 +151,28 @@ class LrcManager:
|
|||||||
self.cache.set(track, source, result, ttl_seconds=ttl)
|
self.cache.set(track, source, result, ttl_seconds=ttl)
|
||||||
|
|
||||||
# Evaluate result
|
# Evaluate result
|
||||||
if result.status == CacheStatus.SUCCESS_SYNCED:
|
if result.status in (
|
||||||
logger.info(f"[{source}] got synced lyrics")
|
CacheStatus.SUCCESS_SYNCED,
|
||||||
|
CacheStatus.SUCCESS_UNSYNCED,
|
||||||
|
):
|
||||||
|
is_trusted = (
|
||||||
|
result.confidence is None or result.confidence >= HIGH_CONFIDENCE
|
||||||
|
)
|
||||||
|
logger.info(
|
||||||
|
f"[{source}] got {result.status.value} lyrics"
|
||||||
|
f" (confidence={'trusted' if result.confidence is None else f'{result.confidence:.0f}'})"
|
||||||
|
)
|
||||||
|
# Trusted synced → return immediately
|
||||||
|
if result.status == CacheStatus.SUCCESS_SYNCED and is_trusted:
|
||||||
return result
|
return result
|
||||||
|
# Track best result by confidence
|
||||||
if result.status == CacheStatus.SUCCESS_UNSYNCED:
|
if best_result is None or _is_better(result, best_result):
|
||||||
logger.debug(f"[{source}] got unsynced lyrics (continuing)")
|
|
||||||
if best_result is None:
|
|
||||||
best_result = result
|
best_result = result
|
||||||
|
|
||||||
# NOT_FOUND / NETWORK_ERROR: already cached, try next
|
# NOT_FOUND / NETWORK_ERROR: already cached, try next
|
||||||
|
|
||||||
# Return best available
|
# Return best available
|
||||||
if best_result:
|
if best_result:
|
||||||
# Normalize unsynced lyrics: set all timestamps to [00:00.00]
|
|
||||||
if (
|
if (
|
||||||
best_result.status == CacheStatus.SUCCESS_UNSYNCED
|
best_result.status == CacheStatus.SUCCESS_UNSYNCED
|
||||||
and best_result.lyrics
|
and best_result.lyrics
|
||||||
@@ -149,10 +182,10 @@ class LrcManager:
|
|||||||
lyrics=best_result.lyrics.normalize_unsynced(),
|
lyrics=best_result.lyrics.normalize_unsynced(),
|
||||||
source=best_result.source,
|
source=best_result.source,
|
||||||
ttl=best_result.ttl,
|
ttl=best_result.ttl,
|
||||||
|
confidence=best_result.confidence,
|
||||||
)
|
)
|
||||||
logger.info(
|
logger.info(
|
||||||
f"Returning unsynced lyrics from {best_result.source} "
|
f"Returning {best_result.status.value} lyrics from {best_result.source}"
|
||||||
f"(no synced source found)"
|
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
logger.info(f"No lyrics found for {track.display_name()}")
|
logger.info(f"No lyrics found for {track.display_name()}")
|
||||||
|
|||||||
@@ -64,16 +64,26 @@ class CacheSearchFetcher(BaseFetcher):
|
|||||||
logger.debug(f"Cache-search: no match for {track.display_name()}")
|
logger.debug(f"Cache-search: no match for {track.display_name()}")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
# Pick best: prefer synced, then first available
|
# Pick best by confidence scoring
|
||||||
candidates = [
|
candidates = [
|
||||||
SearchCandidate(
|
SearchCandidate(
|
||||||
item=m,
|
item=m,
|
||||||
|
duration_ms=float(m["length"]) if m.get("length") else None,
|
||||||
is_synced=m.get("status") == CacheStatus.SUCCESS_SYNCED.value,
|
is_synced=m.get("status") == CacheStatus.SUCCESS_SYNCED.value,
|
||||||
|
title=m.get("title"),
|
||||||
|
artist=m.get("artist"),
|
||||||
|
album=m.get("album"),
|
||||||
)
|
)
|
||||||
for m in matches
|
for m in matches
|
||||||
if m.get("lyrics")
|
if m.get("lyrics")
|
||||||
]
|
]
|
||||||
best = select_best(candidates, track.length)
|
best, confidence = select_best(
|
||||||
|
candidates,
|
||||||
|
track.length,
|
||||||
|
title=track.title,
|
||||||
|
artist=track.artist,
|
||||||
|
album=track.album,
|
||||||
|
)
|
||||||
|
|
||||||
if not best:
|
if not best:
|
||||||
return None
|
return None
|
||||||
@@ -81,10 +91,11 @@ class CacheSearchFetcher(BaseFetcher):
|
|||||||
status = CacheStatus(best["status"])
|
status = CacheStatus(best["status"])
|
||||||
logger.info(
|
logger.info(
|
||||||
f"Cache-search: fuzzy hit from [{best.get('source')}] "
|
f"Cache-search: fuzzy hit from [{best.get('source')}] "
|
||||||
f"album={best.get('album')!r} ({status.value})"
|
f"album={best.get('album')!r} ({status.value}, confidence={confidence:.0f})"
|
||||||
)
|
)
|
||||||
return LyricResult(
|
return LyricResult(
|
||||||
status=status,
|
status=status,
|
||||||
lyrics=LRCData(best["lyrics"]),
|
lyrics=LRCData(best["lyrics"]),
|
||||||
source=self.source_name,
|
source=self.source_name,
|
||||||
|
confidence=confidence,
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -126,10 +126,19 @@ class LrclibSearchFetcher(BaseFetcher):
|
|||||||
else None,
|
else None,
|
||||||
is_synced=isinstance(item.get("syncedLyrics"), str)
|
is_synced=isinstance(item.get("syncedLyrics"), str)
|
||||||
and bool(item["syncedLyrics"].strip()),
|
and bool(item["syncedLyrics"].strip()),
|
||||||
|
title=item.get("trackName"),
|
||||||
|
artist=item.get("artistName"),
|
||||||
|
album=item.get("albumName"),
|
||||||
)
|
)
|
||||||
for item in candidates
|
for item in candidates
|
||||||
]
|
]
|
||||||
best = select_best(mapped, track.length)
|
best, confidence = select_best(
|
||||||
|
mapped,
|
||||||
|
track.length,
|
||||||
|
title=track.title,
|
||||||
|
artist=track.artist,
|
||||||
|
album=track.album,
|
||||||
|
)
|
||||||
if best is None:
|
if best is None:
|
||||||
logger.debug("LRCLIB-search: no valid candidate found")
|
logger.debug("LRCLIB-search: no valid candidate found")
|
||||||
return LyricResult(status=CacheStatus.NOT_FOUND, ttl=TTL_NOT_FOUND)
|
return LyricResult(status=CacheStatus.NOT_FOUND, ttl=TTL_NOT_FOUND)
|
||||||
@@ -139,20 +148,26 @@ class LrclibSearchFetcher(BaseFetcher):
|
|||||||
|
|
||||||
if isinstance(synced, str) and synced.strip():
|
if isinstance(synced, str) and synced.strip():
|
||||||
lyrics = LRCData(synced)
|
lyrics = LRCData(synced)
|
||||||
logger.info(f"LRCLIB-search: got synced lyrics ({len(lyrics)} lines)")
|
logger.info(
|
||||||
|
f"LRCLIB-search: got synced lyrics ({len(lyrics)} lines, confidence={confidence:.0f})"
|
||||||
|
)
|
||||||
return LyricResult(
|
return LyricResult(
|
||||||
status=CacheStatus.SUCCESS_SYNCED,
|
status=CacheStatus.SUCCESS_SYNCED,
|
||||||
lyrics=lyrics,
|
lyrics=lyrics,
|
||||||
source=self.source_name,
|
source=self.source_name,
|
||||||
|
confidence=confidence,
|
||||||
)
|
)
|
||||||
elif isinstance(unsynced, str) and unsynced.strip():
|
elif isinstance(unsynced, str) and unsynced.strip():
|
||||||
lyrics = LRCData(unsynced)
|
lyrics = LRCData(unsynced)
|
||||||
logger.info(f"LRCLIB-search: got unsynced lyrics ({len(lyrics)} lines)")
|
logger.info(
|
||||||
|
f"LRCLIB-search: got unsynced lyrics ({len(lyrics)} lines, confidence={confidence:.0f})"
|
||||||
|
)
|
||||||
return LyricResult(
|
return LyricResult(
|
||||||
status=CacheStatus.SUCCESS_UNSYNCED,
|
status=CacheStatus.SUCCESS_UNSYNCED,
|
||||||
lyrics=lyrics,
|
lyrics=lyrics,
|
||||||
source=self.source_name,
|
source=self.source_name,
|
||||||
ttl=TTL_UNSYNCED,
|
ttl=TTL_UNSYNCED,
|
||||||
|
confidence=confidence,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
logger.debug("LRCLIB-search: best candidate has empty lyrics")
|
logger.debug("LRCLIB-search: best candidate has empty lyrics")
|
||||||
|
|||||||
+34
-15
@@ -43,15 +43,15 @@ class NeteaseFetcher(BaseFetcher):
|
|||||||
def is_available(self, track: TrackMeta) -> bool:
|
def is_available(self, track: TrackMeta) -> bool:
|
||||||
return bool(track.title)
|
return bool(track.title)
|
||||||
|
|
||||||
def _search(self, track: TrackMeta, limit: int = 10) -> Optional[int]:
|
def _search(self, track: TrackMeta, limit: int = 10) -> tuple[Optional[int], float]:
|
||||||
"""Search Netease and return the best-matching song ID.
|
"""Search Netease and return the best-matching song ID with confidence.
|
||||||
|
|
||||||
When ``track.length`` is available, candidates are ranked by duration
|
When ``track.length`` is available, candidates are ranked by duration
|
||||||
difference and only accepted if within ``DURATION_TOLERANCE_MS``.
|
difference and only accepted if within ``DURATION_TOLERANCE_MS``.
|
||||||
"""
|
"""
|
||||||
query = f"{track.artist or ''} {track.title or ''}".strip()
|
query = f"{track.artist or ''} {track.title or ''}".strip()
|
||||||
if not query:
|
if not query:
|
||||||
return None
|
return None, 0.0
|
||||||
|
|
||||||
logger.debug(f"Netease: searching for '{query}' (limit={limit})")
|
logger.debug(f"Netease: searching for '{query}' (limit={limit})")
|
||||||
|
|
||||||
@@ -70,17 +70,17 @@ class NeteaseFetcher(BaseFetcher):
|
|||||||
logger.error(
|
logger.error(
|
||||||
f"Netease: search returned non-dict: {type(result).__name__}"
|
f"Netease: search returned non-dict: {type(result).__name__}"
|
||||||
)
|
)
|
||||||
return None
|
return None, 0.0
|
||||||
|
|
||||||
result_body = result.get("result")
|
result_body = result.get("result")
|
||||||
if not isinstance(result_body, dict):
|
if not isinstance(result_body, dict):
|
||||||
logger.debug("Netease: search 'result' field missing or invalid")
|
logger.debug("Netease: search 'result' field missing or invalid")
|
||||||
return None
|
return None, 0.0
|
||||||
|
|
||||||
songs = result_body.get("songs")
|
songs = result_body.get("songs")
|
||||||
if not isinstance(songs, list) or len(songs) == 0:
|
if not isinstance(songs, list) or len(songs) == 0:
|
||||||
logger.debug("Netease: search returned 0 results")
|
logger.debug("Netease: search returned 0 results")
|
||||||
return None
|
return None, 0.0
|
||||||
|
|
||||||
logger.debug(f"Netease: search returned {len(songs)} candidates")
|
logger.debug(f"Netease: search returned {len(songs)} candidates")
|
||||||
|
|
||||||
@@ -90,23 +90,37 @@ class NeteaseFetcher(BaseFetcher):
|
|||||||
duration_ms=float(song["dt"])
|
duration_ms=float(song["dt"])
|
||||||
if isinstance(song.get("dt"), int)
|
if isinstance(song.get("dt"), int)
|
||||||
else None,
|
else None,
|
||||||
|
title=song.get("name"),
|
||||||
|
artist=", ".join(a.get("name", "") for a in song.get("ar", []))
|
||||||
|
or None,
|
||||||
|
album=(song.get("al") or {}).get("name"),
|
||||||
)
|
)
|
||||||
for song in songs
|
for song in songs
|
||||||
if isinstance(song, dict) and song.get("id") is not None
|
if isinstance(song, dict) and song.get("id") is not None
|
||||||
]
|
]
|
||||||
best_id = select_best(candidates, track.length)
|
best_id, confidence = select_best(
|
||||||
|
candidates,
|
||||||
|
track.length,
|
||||||
|
title=track.title,
|
||||||
|
artist=track.artist,
|
||||||
|
album=track.album,
|
||||||
|
)
|
||||||
if best_id is not None:
|
if best_id is not None:
|
||||||
logger.debug(f"Netease: selected id={best_id}")
|
logger.debug(
|
||||||
return best_id
|
f"Netease: selected id={best_id} (confidence={confidence:.0f})"
|
||||||
|
)
|
||||||
|
return best_id, confidence
|
||||||
|
|
||||||
logger.debug("Netease: no suitable candidate found")
|
logger.debug("Netease: no suitable candidate found")
|
||||||
return None
|
return None, 0.0
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Netease: search failed: {e}")
|
logger.error(f"Netease: search failed: {e}")
|
||||||
return None
|
return None, 0.0
|
||||||
|
|
||||||
def _get_lyric(self, song_id: int) -> Optional[LyricResult]:
|
def _get_lyric(
|
||||||
|
self, song_id: int, confidence: float = 0.0
|
||||||
|
) -> Optional[LyricResult]:
|
||||||
"""Fetch lyrics for a given Netease song ID."""
|
"""Fetch lyrics for a given Netease song ID."""
|
||||||
logger.debug(f"Netease: fetching lyrics for song_id={song_id}")
|
logger.debug(f"Netease: fetching lyrics for song_id={song_id}")
|
||||||
|
|
||||||
@@ -158,7 +172,12 @@ class NeteaseFetcher(BaseFetcher):
|
|||||||
f"Netease: got {status.value} lyrics for song_id={song_id} "
|
f"Netease: got {status.value} lyrics for song_id={song_id} "
|
||||||
f"({len(lrcdata)} lines)"
|
f"({len(lrcdata)} lines)"
|
||||||
)
|
)
|
||||||
return LyricResult(status=status, lyrics=lrcdata, source=self.source_name)
|
return LyricResult(
|
||||||
|
status=status,
|
||||||
|
lyrics=lrcdata,
|
||||||
|
source=self.source_name,
|
||||||
|
confidence=confidence,
|
||||||
|
)
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Netease: lyric fetch failed for song_id={song_id}: {e}")
|
logger.error(f"Netease: lyric fetch failed for song_id={song_id}: {e}")
|
||||||
@@ -174,9 +193,9 @@ class NeteaseFetcher(BaseFetcher):
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
logger.info(f"Netease: fetching lyrics for {track.display_name()}")
|
logger.info(f"Netease: fetching lyrics for {track.display_name()}")
|
||||||
song_id = self._search(track)
|
song_id, confidence = self._search(track)
|
||||||
if not song_id:
|
if not song_id:
|
||||||
logger.debug(f"Netease: no match found for {track.display_name()}")
|
logger.debug(f"Netease: no match found for {track.display_name()}")
|
||||||
return LyricResult(status=CacheStatus.NOT_FOUND, ttl=TTL_NOT_FOUND)
|
return LyricResult(status=CacheStatus.NOT_FOUND, ttl=TTL_NOT_FOUND)
|
||||||
|
|
||||||
return self._get_lyric(song_id)
|
return self._get_lyric(song_id, confidence=confidence)
|
||||||
|
|||||||
+31
-14
@@ -35,11 +35,11 @@ class QQMusicFetcher(BaseFetcher):
|
|||||||
def is_available(self, track: TrackMeta) -> bool:
|
def is_available(self, track: TrackMeta) -> bool:
|
||||||
return bool(track.title) and bool(QQ_MUSIC_API_URL)
|
return bool(track.title) and bool(QQ_MUSIC_API_URL)
|
||||||
|
|
||||||
def _search(self, track: TrackMeta, limit: int = 10) -> Optional[str]:
|
def _search(self, track: TrackMeta, limit: int = 10) -> tuple[Optional[str], float]:
|
||||||
"""Search QQ Music and return the best-matching song MID."""
|
"""Search QQ Music and return the best-matching song MID with confidence."""
|
||||||
query = f"{track.artist or ''} {track.title or ''}".strip()
|
query = f"{track.artist or ''} {track.title or ''}".strip()
|
||||||
if not query:
|
if not query:
|
||||||
return None
|
return None, 0.0
|
||||||
|
|
||||||
logger.debug(f"QQMusic: searching for '{query}' (limit={limit})")
|
logger.debug(f"QQMusic: searching for '{query}' (limit={limit})")
|
||||||
|
|
||||||
@@ -54,12 +54,12 @@ class QQMusicFetcher(BaseFetcher):
|
|||||||
|
|
||||||
if data.get("code") != 0:
|
if data.get("code") != 0:
|
||||||
logger.error(f"QQMusic: search API error: {data}")
|
logger.error(f"QQMusic: search API error: {data}")
|
||||||
return None
|
return None, 0.0
|
||||||
|
|
||||||
songs = data.get("data", {}).get("list", [])
|
songs = data.get("data", {}).get("list", [])
|
||||||
if not songs:
|
if not songs:
|
||||||
logger.debug("QQMusic: search returned 0 results")
|
logger.debug("QQMusic: search returned 0 results")
|
||||||
return None
|
return None, 0.0
|
||||||
|
|
||||||
logger.debug(f"QQMusic: search returned {len(songs)} candidates")
|
logger.debug(f"QQMusic: search returned {len(songs)} candidates")
|
||||||
|
|
||||||
@@ -69,23 +69,35 @@ class QQMusicFetcher(BaseFetcher):
|
|||||||
duration_ms=float(song["interval"]) * 1000
|
duration_ms=float(song["interval"]) * 1000
|
||||||
if isinstance(song.get("interval"), int)
|
if isinstance(song.get("interval"), int)
|
||||||
else None,
|
else None,
|
||||||
|
title=song.get("name"),
|
||||||
|
artist=", ".join(s.get("name", "") for s in song.get("singer", []))
|
||||||
|
or None,
|
||||||
|
album=(song.get("album") or {}).get("name"),
|
||||||
)
|
)
|
||||||
for song in songs
|
for song in songs
|
||||||
if isinstance(song, dict) and song.get("mid") is not None
|
if isinstance(song, dict) and song.get("mid") is not None
|
||||||
]
|
]
|
||||||
best_mid = select_best(candidates, track.length)
|
best_mid, confidence = select_best(
|
||||||
|
candidates,
|
||||||
|
track.length,
|
||||||
|
title=track.title,
|
||||||
|
artist=track.artist,
|
||||||
|
album=track.album,
|
||||||
|
)
|
||||||
if best_mid is not None:
|
if best_mid is not None:
|
||||||
logger.debug(f"QQMusic: selected mid={best_mid}")
|
logger.debug(
|
||||||
return best_mid
|
f"QQMusic: selected mid={best_mid} (confidence={confidence:.0f})"
|
||||||
|
)
|
||||||
|
return best_mid, confidence
|
||||||
|
|
||||||
logger.debug("QQMusic: no suitable candidate found")
|
logger.debug("QQMusic: no suitable candidate found")
|
||||||
return None
|
return None, 0.0
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"QQMusic: search failed: {e}")
|
logger.error(f"QQMusic: search failed: {e}")
|
||||||
return None
|
return None, 0.0
|
||||||
|
|
||||||
def _get_lyric(self, mid: str) -> Optional[LyricResult]:
|
def _get_lyric(self, mid: str, confidence: float = 0.0) -> Optional[LyricResult]:
|
||||||
"""Fetch lyrics for a given QQ Music song MID."""
|
"""Fetch lyrics for a given QQ Music song MID."""
|
||||||
logger.debug(f"QQMusic: fetching lyrics for mid={mid}")
|
logger.debug(f"QQMusic: fetching lyrics for mid={mid}")
|
||||||
|
|
||||||
@@ -115,7 +127,12 @@ class QQMusicFetcher(BaseFetcher):
|
|||||||
f"QQMusic: got {status.value} lyrics for mid={mid} "
|
f"QQMusic: got {status.value} lyrics for mid={mid} "
|
||||||
f"({len(lrcdata)} lines)"
|
f"({len(lrcdata)} lines)"
|
||||||
)
|
)
|
||||||
return LyricResult(status=status, lyrics=lrcdata, source=self.source_name)
|
return LyricResult(
|
||||||
|
status=status,
|
||||||
|
lyrics=lrcdata,
|
||||||
|
source=self.source_name,
|
||||||
|
confidence=confidence,
|
||||||
|
)
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"QQMusic: lyric fetch failed for mid={mid}: {e}")
|
logger.error(f"QQMusic: lyric fetch failed for mid={mid}: {e}")
|
||||||
@@ -135,9 +152,9 @@ class QQMusicFetcher(BaseFetcher):
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
logger.info(f"QQMusic: fetching lyrics for {track.display_name()}")
|
logger.info(f"QQMusic: fetching lyrics for {track.display_name()}")
|
||||||
mid = self._search(track)
|
mid, confidence = self._search(track)
|
||||||
if not mid:
|
if not mid:
|
||||||
logger.debug(f"QQMusic: no match found for {track.display_name()}")
|
logger.debug(f"QQMusic: no match found for {track.display_name()}")
|
||||||
return LyricResult(status=CacheStatus.NOT_FOUND, ttl=TTL_NOT_FOUND)
|
return LyricResult(status=CacheStatus.NOT_FOUND, ttl=TTL_NOT_FOUND)
|
||||||
|
|
||||||
return self._get_lyric(mid)
|
return self._get_lyric(mid, confidence=confidence)
|
||||||
|
|||||||
+131
-31
@@ -2,13 +2,23 @@
|
|||||||
Shared candidate-selection logic for search-based fetchers.
|
Shared candidate-selection logic for search-based fetchers.
|
||||||
|
|
||||||
Each fetcher maps its API-specific results to SearchCandidate, then calls
|
Each fetcher maps its API-specific results to SearchCandidate, then calls
|
||||||
select_best() which handles duration filtering and synced preference uniformly.
|
select_best() which scores candidates by metadata similarity, duration
|
||||||
|
proximity, and sync status.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from typing import Generic, Optional, TypeVar
|
from typing import Generic, Optional, TypeVar
|
||||||
|
|
||||||
from ..config import DURATION_TOLERANCE_MS
|
from ..config import (
|
||||||
|
DURATION_TOLERANCE_MS,
|
||||||
|
SCORE_W_TITLE as _W_TITLE,
|
||||||
|
SCORE_W_ARTIST as _W_ARTIST,
|
||||||
|
SCORE_W_ALBUM as _W_ALBUM,
|
||||||
|
SCORE_W_DURATION as _W_DURATION,
|
||||||
|
SCORE_W_SYNCED as _W_SYNCED,
|
||||||
|
MIN_CONFIDENCE,
|
||||||
|
)
|
||||||
|
from ..normalize import normalize_for_match, normalize_artist
|
||||||
|
|
||||||
T = TypeVar("T")
|
T = TypeVar("T")
|
||||||
|
|
||||||
@@ -21,48 +31,138 @@ class SearchCandidate(Generic[T]):
|
|||||||
item: The original API-specific object (dict, ID, etc.)
|
item: The original API-specific object (dict, ID, etc.)
|
||||||
duration_ms: Track duration in milliseconds, or None if unknown.
|
duration_ms: Track duration in milliseconds, or None if unknown.
|
||||||
is_synced: Whether this candidate is known to have synced lyrics.
|
is_synced: Whether this candidate is known to have synced lyrics.
|
||||||
|
title: Candidate track title for similarity scoring.
|
||||||
|
artist: Candidate artist name for similarity scoring.
|
||||||
|
album: Candidate album name for similarity scoring.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
item: T
|
item: T
|
||||||
duration_ms: Optional[float] = None
|
duration_ms: Optional[float] = None
|
||||||
is_synced: bool = False
|
is_synced: bool = False
|
||||||
|
title: Optional[str] = None
|
||||||
|
artist: Optional[str] = None
|
||||||
|
album: Optional[str] = None
|
||||||
|
|
||||||
|
|
||||||
|
def _text_similarity(a: str, b: str) -> float:
|
||||||
|
"""Compare two normalized strings. Returns 0.0-1.0."""
|
||||||
|
if a == b:
|
||||||
|
return 1.0
|
||||||
|
if not a or not b:
|
||||||
|
return 0.0
|
||||||
|
# Containment: one is a substring of the other (e.g. "My Love" vs "My Love (Album Version)")
|
||||||
|
if a in b or b in a:
|
||||||
|
return min(len(a), len(b)) / max(len(a), len(b))
|
||||||
|
return 0.0
|
||||||
|
|
||||||
|
|
||||||
|
def _score_candidate(
|
||||||
|
c: SearchCandidate[T],
|
||||||
|
ref_title: Optional[str],
|
||||||
|
ref_artist: Optional[str],
|
||||||
|
ref_album: Optional[str],
|
||||||
|
ref_length_ms: Optional[int],
|
||||||
|
) -> float:
|
||||||
|
"""Score a candidate from 0-100 based on metadata match quality.
|
||||||
|
|
||||||
|
Scoring works in two tiers:
|
||||||
|
|
||||||
|
1. **Metadata score** — computed from fields available on *both* sides,
|
||||||
|
then rescaled to fill the 0-90 range so that missing fields don't
|
||||||
|
inflate the score. Fields missing on both sides are simply excluded
|
||||||
|
from the calculation (neutral). Fields present on only one side
|
||||||
|
contribute 0 to the numerator but their weight still counts in the
|
||||||
|
denominator (penalty for asymmetric absence).
|
||||||
|
|
||||||
|
2. **Synced bonus** — a flat 10 pts, always applied independently.
|
||||||
|
|
||||||
|
Field weights (before rescaling):
|
||||||
|
- Title: 40
|
||||||
|
- Artist: 30
|
||||||
|
- Album: 10
|
||||||
|
- Duration: 10
|
||||||
|
"""
|
||||||
|
raw = 0.0
|
||||||
|
available_weight = 0.0
|
||||||
|
|
||||||
|
# Title
|
||||||
|
if ref_title is not None or c.title is not None:
|
||||||
|
available_weight += _W_TITLE
|
||||||
|
if ref_title is not None and c.title is not None:
|
||||||
|
raw += _W_TITLE * _text_similarity(
|
||||||
|
normalize_for_match(ref_title), normalize_for_match(c.title)
|
||||||
|
)
|
||||||
|
# else both None → excluded
|
||||||
|
|
||||||
|
# Artist
|
||||||
|
if ref_artist is not None or c.artist is not None:
|
||||||
|
available_weight += _W_ARTIST
|
||||||
|
if ref_artist is not None and c.artist is not None:
|
||||||
|
na = normalize_artist(ref_artist)
|
||||||
|
nb = normalize_artist(c.artist)
|
||||||
|
if na == nb:
|
||||||
|
raw += _W_ARTIST
|
||||||
|
else:
|
||||||
|
raw += _W_ARTIST * _text_similarity(
|
||||||
|
normalize_for_match(ref_artist), normalize_for_match(c.artist)
|
||||||
|
)
|
||||||
|
|
||||||
|
# Album
|
||||||
|
if ref_album is not None or c.album is not None:
|
||||||
|
available_weight += _W_ALBUM
|
||||||
|
if ref_album is not None and c.album is not None:
|
||||||
|
raw += _W_ALBUM * _text_similarity(
|
||||||
|
normalize_for_match(ref_album), normalize_for_match(c.album)
|
||||||
|
)
|
||||||
|
|
||||||
|
# Duration
|
||||||
|
if ref_length_ms is not None or c.duration_ms is not None:
|
||||||
|
available_weight += _W_DURATION
|
||||||
|
if ref_length_ms is not None and c.duration_ms is not None:
|
||||||
|
diff = abs(c.duration_ms - ref_length_ms)
|
||||||
|
if diff <= DURATION_TOLERANCE_MS:
|
||||||
|
raw += _W_DURATION * (1.0 - diff / DURATION_TOLERANCE_MS)
|
||||||
|
|
||||||
|
# Rescale metadata to 0-90 range
|
||||||
|
_MAX_METADATA = _W_TITLE + _W_ARTIST + _W_ALBUM + _W_DURATION # 90
|
||||||
|
if available_weight > 0:
|
||||||
|
metadata_score = (raw / available_weight) * _MAX_METADATA
|
||||||
|
else:
|
||||||
|
# No comparable fields at all — only synced bonus matters
|
||||||
|
metadata_score = 0.0
|
||||||
|
|
||||||
|
# Synced bonus (always 10 pts, independent of metadata)
|
||||||
|
synced_score = _W_SYNCED if c.is_synced else 0.0
|
||||||
|
|
||||||
|
return metadata_score + synced_score
|
||||||
|
|
||||||
|
|
||||||
def select_best(
|
def select_best(
|
||||||
candidates: list[SearchCandidate[T]],
|
candidates: list[SearchCandidate[T]],
|
||||||
track_length_ms: Optional[int] = None,
|
track_length_ms: Optional[int] = None,
|
||||||
tolerance_ms: float = DURATION_TOLERANCE_MS,
|
*,
|
||||||
) -> Optional[T]:
|
title: Optional[str] = None,
|
||||||
"""Pick the best candidate by duration proximity and sync preference.
|
artist: Optional[str] = None,
|
||||||
|
album: Optional[str] = None,
|
||||||
|
min_confidence: float = MIN_CONFIDENCE,
|
||||||
|
) -> tuple[Optional[T], float]:
|
||||||
|
"""Pick the best candidate by confidence scoring.
|
||||||
|
|
||||||
When track_length_ms is available:
|
Returns (item, score). Item is None if no candidate scores above min_confidence.
|
||||||
- Filter by tolerance_ms
|
|
||||||
- Pick closest duration, prefer synced at equal distance
|
|
||||||
When track_length_ms is unavailable:
|
|
||||||
- Pick first synced candidate, or first overall
|
|
||||||
"""
|
"""
|
||||||
if track_length_ms is not None:
|
if not candidates:
|
||||||
best: Optional[SearchCandidate[T]] = None
|
return None, 0.0
|
||||||
best_diff = float("inf")
|
|
||||||
|
best_item: Optional[T] = None
|
||||||
|
best_score = -1.0
|
||||||
|
|
||||||
for c in candidates:
|
for c in candidates:
|
||||||
if c.duration_ms is None:
|
s = _score_candidate(c, title, artist, album, track_length_ms)
|
||||||
continue
|
if s > best_score:
|
||||||
diff = abs(c.duration_ms - track_length_ms)
|
best_score = s
|
||||||
if diff > tolerance_ms:
|
best_item = c.item
|
||||||
continue
|
|
||||||
if diff < best_diff or (
|
|
||||||
diff == best_diff
|
|
||||||
and c.is_synced
|
|
||||||
and (best is None or not best.is_synced)
|
|
||||||
):
|
|
||||||
best_diff = diff
|
|
||||||
best = c
|
|
||||||
|
|
||||||
return best.item if best is not None else None
|
if best_score < min_confidence:
|
||||||
|
return None, best_score
|
||||||
|
|
||||||
# No duration — prefer synced, fallback to first
|
return best_item, best_score
|
||||||
for c in candidates:
|
|
||||||
if c.is_synced:
|
|
||||||
return c.item
|
|
||||||
return candidates[0].item if candidates else None
|
|
||||||
|
|||||||
@@ -62,3 +62,6 @@ class LyricResult:
|
|||||||
lyrics: Optional[LRCData] = None
|
lyrics: Optional[LRCData] = None
|
||||||
source: Optional[str] = None # Which fetcher produced this result
|
source: Optional[str] = None # Which fetcher produced this result
|
||||||
ttl: Optional[int] = None # Hint for cache TTL (seconds)
|
ttl: Optional[int] = None # Hint for cache TTL (seconds)
|
||||||
|
confidence: Optional[float] = (
|
||||||
|
None # 0-100 selection confidence (None = exact/trusted)
|
||||||
|
)
|
||||||
|
|||||||
@@ -0,0 +1,47 @@
|
|||||||
|
"""
|
||||||
|
Shared text normalization utilities for fuzzy matching.
|
||||||
|
|
||||||
|
Used by cache key generation, cache search, and candidate selection scoring.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import re
|
||||||
|
import unicodedata
|
||||||
|
|
||||||
|
# Punctuation to strip for fuzzy matching (ASCII + fullwidth + CJK brackets/symbols)
|
||||||
|
_PUNCT_RE = re.compile(
|
||||||
|
r"[~!@#$%^&*()_+\-=\[\]{}|;:'\",.<>?/\\`"
|
||||||
|
r"~!@#$%^&*()_+-=【】{}|;:'",。<>?/\`"
|
||||||
|
r"「」『』《》〈〉〔〕·•‥…—–]"
|
||||||
|
)
|
||||||
|
_SPACE_RE = re.compile(r"\s+")
|
||||||
|
# feat./ft./featuring and everything after (case-insensitive, word boundary)
|
||||||
|
_FEAT_RE = re.compile(r"\s*(?:\bfeat\.?\b|\bft\.?\b|\bfeaturing\b).*", re.IGNORECASE)
|
||||||
|
# Multi-artist separators: /, &, ×, x (surrounded by spaces), ;, 、, vs.
|
||||||
|
_ARTIST_SEP_RE = re.compile(r"\s*(?:[/&;×、]|\bvs\.?\b|\bx\b)\s*", re.IGNORECASE)
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_for_match(s: str) -> str:
|
||||||
|
"""Normalize a string for fuzzy comparison.
|
||||||
|
|
||||||
|
Lowercases, NFKC-normalizes (fullwidth → halfwidth), strips punctuation,
|
||||||
|
and collapses whitespace.
|
||||||
|
"""
|
||||||
|
s = unicodedata.normalize("NFKC", s).lower()
|
||||||
|
s = _FEAT_RE.sub("", s)
|
||||||
|
s = _PUNCT_RE.sub(" ", s)
|
||||||
|
s = _SPACE_RE.sub(" ", s).strip()
|
||||||
|
return s
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_artist(s: str) -> str:
|
||||||
|
"""Normalize an artist string: split by separators, normalize each, sort.
|
||||||
|
|
||||||
|
Splits first (on /, &, ;, ×, 、, vs., x), then strips feat./ft./featuring
|
||||||
|
from each part individually, so 'A feat. C / B' → ['a', 'b'] not just ['a'].
|
||||||
|
"""
|
||||||
|
s = unicodedata.normalize("NFKC", s).lower()
|
||||||
|
parts = _ARTIST_SEP_RE.split(s)
|
||||||
|
normed = sorted(
|
||||||
|
{normalize_for_match(p) for p in parts if _FEAT_RE.sub("", p).strip()}
|
||||||
|
)
|
||||||
|
return "\0".join(normed) if normed else normalize_for_match(s)
|
||||||
+4
-21
@@ -8,11 +8,10 @@ import pytest
|
|||||||
from lrx_cli.cache import (
|
from lrx_cli.cache import (
|
||||||
CacheEngine,
|
CacheEngine,
|
||||||
_generate_key,
|
_generate_key,
|
||||||
_normalize_artist,
|
|
||||||
_normalize_for_match,
|
|
||||||
)
|
)
|
||||||
from lrx_cli.config import DURATION_TOLERANCE_MS
|
from lrx_cli.config import DURATION_TOLERANCE_MS
|
||||||
from lrx_cli.models import CacheStatus, LyricResult, TrackMeta
|
from lrx_cli.models import CacheStatus, LyricResult, TrackMeta
|
||||||
|
from lrx_cli.lrc import LRCData
|
||||||
|
|
||||||
|
|
||||||
def _track(
|
def _track(
|
||||||
@@ -39,7 +38,7 @@ def _result(
|
|||||||
lyrics: str | None,
|
lyrics: str | None,
|
||||||
source: str,
|
source: str,
|
||||||
) -> LyricResult:
|
) -> LyricResult:
|
||||||
return LyricResult(status=status, lyrics=lyrics, source=source)
|
return LyricResult(status=status, lyrics=LRCData(lyrics), source=source)
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
@@ -48,22 +47,6 @@ def cache_db(tmp_path: Path) -> CacheEngine:
|
|||||||
return CacheEngine(str(db_path))
|
return CacheEngine(str(db_path))
|
||||||
|
|
||||||
|
|
||||||
def test_normalize_for_match_covers_nfkc_punct_feat_and_whitespace() -> None:
|
|
||||||
text = " Test! feat. SOMEONE "
|
|
||||||
|
|
||||||
normalized = _normalize_for_match(text)
|
|
||||||
|
|
||||||
assert normalized == "test"
|
|
||||||
|
|
||||||
|
|
||||||
def test_normalize_artist_splits_separators_and_sorts_parts() -> None:
|
|
||||||
artist = "B / A feat. C; D vs. E × F 、 G"
|
|
||||||
|
|
||||||
normalized = _normalize_artist(artist)
|
|
||||||
|
|
||||||
assert normalized == "a\0b\0d\0e\0f\0g"
|
|
||||||
|
|
||||||
|
|
||||||
def test_generate_key_uses_spotify_trackid_and_url_fallback() -> None:
|
def test_generate_key_uses_spotify_trackid_and_url_fallback() -> None:
|
||||||
spotify_track = _track(
|
spotify_track = _track(
|
||||||
trackid="abc123", artist=None, title=None, album=None, length=None
|
trackid="abc123", artist=None, title=None, album=None, length=None
|
||||||
@@ -157,7 +140,7 @@ def test_get_backfills_missing_length_when_track_provides_it(
|
|||||||
assert row[0] == 200000
|
assert row[0] == 200000
|
||||||
|
|
||||||
|
|
||||||
def test_get_best_prefers_synced_over_unsynced_and_negative(
|
def test_get_best_prefers_higher_confidence_and_skips_negative(
|
||||||
cache_db: CacheEngine,
|
cache_db: CacheEngine,
|
||||||
) -> None:
|
) -> None:
|
||||||
track = _track()
|
track = _track()
|
||||||
@@ -314,7 +297,7 @@ def test_search_by_meta_fuzzy_rules_and_duration_sorting(cache_db: CacheEngine)
|
|||||||
sources = [r["source"] for r in rows]
|
sources = [r["source"] for r in rows]
|
||||||
assert "negative" not in sources
|
assert "negative" not in sources
|
||||||
assert "far-len" not in sources
|
assert "far-len" not in sources
|
||||||
# Sorted by duration diff, then synced before unsynced for equal diff.
|
# Sorted by duration diff, then confidence for equal diff.
|
||||||
assert sources[0] == "seed"
|
assert sources[0] == "seed"
|
||||||
assert sources[1] == "close-synced"
|
assert sources[1] == "close-synced"
|
||||||
assert sources[2] == "close-unsynced"
|
assert sources[2] == "close-unsynced"
|
||||||
|
|||||||
@@ -0,0 +1,19 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from lrx_cli.normalize import normalize_for_match, normalize_artist
|
||||||
|
|
||||||
|
|
||||||
|
def test_normalize_for_match_covers_nfkc_punct_feat_and_whitespace() -> None:
|
||||||
|
text = " Test! feat. SOMEONE "
|
||||||
|
|
||||||
|
normalized = normalize_for_match(text)
|
||||||
|
|
||||||
|
assert normalized == "test"
|
||||||
|
|
||||||
|
|
||||||
|
def test_normalize_artist_splits_separators_and_sorts_parts() -> None:
|
||||||
|
artist = "B / A feat. C; D vs. E × F 、 G"
|
||||||
|
|
||||||
|
normalized = normalize_artist(artist)
|
||||||
|
|
||||||
|
assert normalized == "a\0b\0d\0e\0f\0g"
|
||||||
+395
-65
@@ -1,92 +1,422 @@
|
|||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
from lrx_cli.fetchers.selection import SearchCandidate, select_best
|
from lrx_cli.fetchers.selection import (
|
||||||
|
SearchCandidate,
|
||||||
|
select_best,
|
||||||
|
_score_candidate,
|
||||||
|
_text_similarity,
|
||||||
|
MIN_CONFIDENCE,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def test_picks_closest_duration_within_tolerance() -> None:
|
def test_text_similarity_exact() -> None:
|
||||||
candidates = [
|
assert _text_similarity("my love", "my love") == 1.0
|
||||||
SearchCandidate(item="far", duration_ms=10000.0),
|
|
||||||
SearchCandidate(item="close", duration_ms=5100.0),
|
|
||||||
SearchCandidate(item="exact", duration_ms=5000.0),
|
def test_text_similarity_empty() -> None:
|
||||||
|
assert _text_similarity("", "anything") == 0.0
|
||||||
|
assert _text_similarity("anything", "") == 0.0
|
||||||
|
|
||||||
|
|
||||||
|
def test_text_similarity_no_overlap() -> None:
|
||||||
|
assert _text_similarity("hello", "world") == 0.0
|
||||||
|
|
||||||
|
|
||||||
|
def test_text_similarity_containment() -> None:
|
||||||
|
# "my love" is contained in "my love album version"
|
||||||
|
score = _text_similarity("my love", "my love album version")
|
||||||
|
assert 0.0 < score < 1.0
|
||||||
|
assert score == len("my love") / len("my love album version")
|
||||||
|
|
||||||
|
|
||||||
|
def test_score_perfect_match() -> None:
|
||||||
|
"""Exact metadata + close duration + synced = 100."""
|
||||||
|
c = SearchCandidate(
|
||||||
|
item="x",
|
||||||
|
duration_ms=232000.0,
|
||||||
|
is_synced=True,
|
||||||
|
title="My Love",
|
||||||
|
artist="Westlife",
|
||||||
|
album="Coast To Coast",
|
||||||
|
)
|
||||||
|
score = _score_candidate(c, "My Love", "Westlife", "Coast To Coast", 232000)
|
||||||
|
assert score == 100.0
|
||||||
|
|
||||||
|
|
||||||
|
def test_score_no_metadata_match() -> None:
|
||||||
|
"""Completely wrong metadata should score very low."""
|
||||||
|
c = SearchCandidate(
|
||||||
|
item="x",
|
||||||
|
duration_ms=192000.0,
|
||||||
|
is_synced=True,
|
||||||
|
title="Let My Love Be Your Pillow (Live)",
|
||||||
|
artist="Ronnie Milsap",
|
||||||
|
album="The Essential Ronnie Milsap",
|
||||||
|
)
|
||||||
|
score = _score_candidate(c, "My Love", "Westlife", "Coast To Coast", 232000)
|
||||||
|
assert score < MIN_CONFIDENCE
|
||||||
|
|
||||||
|
|
||||||
|
def test_score_missing_both_sides_neutral() -> None:
|
||||||
|
"""If neither ref nor candidate has any field, only synced bonus applies."""
|
||||||
|
c = SearchCandidate(item="x", is_synced=True)
|
||||||
|
score = _score_candidate(c, None, None, None, None)
|
||||||
|
# No comparable fields → metadata = 0, synced = 10
|
||||||
|
assert score == 10.0
|
||||||
|
|
||||||
|
|
||||||
|
def test_score_missing_one_side_gives_zero_for_field() -> None:
|
||||||
|
"""If ref has title but candidate doesn't, title gets 0 and weight still counts."""
|
||||||
|
c = SearchCandidate(item="x", title=None, is_synced=True)
|
||||||
|
# Only title is in play (weight=40), candidate missing → raw=0, rescaled=0, + synced=10
|
||||||
|
score = _score_candidate(c, "My Love", None, None, None)
|
||||||
|
assert score == 10.0
|
||||||
|
|
||||||
|
|
||||||
|
def test_score_synced_bonus() -> None:
|
||||||
|
"""Synced adds 10 points."""
|
||||||
|
base = SearchCandidate(item="x", title="My Love", is_synced=False)
|
||||||
|
synced = SearchCandidate(item="x", title="My Love", is_synced=True)
|
||||||
|
diff = _score_candidate(synced, "My Love", None, None, None) - _score_candidate(
|
||||||
|
base, "My Love", None, None, None
|
||||||
|
)
|
||||||
|
assert diff == 10.0
|
||||||
|
|
||||||
|
|
||||||
|
def test_score_duration_linear_decay() -> None:
|
||||||
|
"""Duration score decays linearly; ratios between exact/half/edge are preserved."""
|
||||||
|
exact = SearchCandidate(item="x", duration_ms=232000.0)
|
||||||
|
score_exact = _score_candidate(exact, None, None, None, 232000)
|
||||||
|
|
||||||
|
half_tol = SearchCandidate(item="x", duration_ms=232000.0 + 1500.0)
|
||||||
|
score_half = _score_candidate(half_tol, None, None, None, 232000)
|
||||||
|
|
||||||
|
at_tol = SearchCandidate(item="x", duration_ms=232000.0 + 3000.0)
|
||||||
|
score_edge = _score_candidate(at_tol, None, None, None, 232000)
|
||||||
|
|
||||||
|
# Only duration is comparable → rescaled to fill 0-90
|
||||||
|
# exact=90, half=45, edge=0
|
||||||
|
assert score_exact == 90.0
|
||||||
|
assert score_half == 45.0
|
||||||
|
assert score_edge == 0.0
|
||||||
|
|
||||||
|
|
||||||
|
def test_score_case_insensitive_title() -> None:
|
||||||
|
c = SearchCandidate(item="x", title="my love")
|
||||||
|
s1 = _score_candidate(c, "My Love", None, None, None)
|
||||||
|
s2 = _score_candidate(c, "my love", None, None, None)
|
||||||
|
assert s1 == s2
|
||||||
|
|
||||||
|
|
||||||
|
def test_score_artist_normalization() -> None:
|
||||||
|
"""'Westlife feat. Someone' should still match 'Westlife'."""
|
||||||
|
c = SearchCandidate(item="x", artist="Westlife feat. Someone")
|
||||||
|
# normalize_artist strips feat. → both become "westlife"
|
||||||
|
score = _score_candidate(c, None, "Westlife", None, None)
|
||||||
|
assert score >= 30.0 # full artist weight (30) when both None on other fields
|
||||||
|
|
||||||
|
|
||||||
|
# Reference track: Westlife - My Love, album Coast To Coast, ~232s
|
||||||
|
_REF_TITLE = "My Love"
|
||||||
|
_REF_ARTIST = "Westlife"
|
||||||
|
_REF_ALBUM = "Coast To Coast"
|
||||||
|
_REF_LENGTH = 232000 # ms
|
||||||
|
|
||||||
|
|
||||||
|
def _lrclib_candidates() -> list[SearchCandidate[dict]]:
|
||||||
|
"""Fixtures from real LRCLIB search results."""
|
||||||
|
raw = [
|
||||||
|
{
|
||||||
|
"trackName": "My Love",
|
||||||
|
"artistName": "Westlife",
|
||||||
|
"albumName": "null",
|
||||||
|
"duration": 232.0,
|
||||||
|
"synced": True,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"trackName": "My Love",
|
||||||
|
"artistName": "Westlife",
|
||||||
|
"albumName": "null",
|
||||||
|
"duration": 180.0,
|
||||||
|
"synced": True,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"trackName": "My love",
|
||||||
|
"artistName": "Westlife",
|
||||||
|
"albumName": "moments",
|
||||||
|
"duration": 235.327,
|
||||||
|
"synced": True,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"trackName": "My Love",
|
||||||
|
"artistName": "Westlife",
|
||||||
|
"albumName": "Unbreakable",
|
||||||
|
"duration": 233.026,
|
||||||
|
"synced": True,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"trackName": "My Love",
|
||||||
|
"artistName": "Westlife",
|
||||||
|
"albumName": "Coast To Coast",
|
||||||
|
"duration": 231.847,
|
||||||
|
"synced": True,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"trackName": "Hello My Love",
|
||||||
|
"artistName": "Westlife",
|
||||||
|
"albumName": "Spectrum",
|
||||||
|
"duration": 216.0,
|
||||||
|
"synced": True,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"trackName": "My Love",
|
||||||
|
"artistName": "Westlife",
|
||||||
|
"albumName": "Hitzone 13",
|
||||||
|
"duration": 231.0,
|
||||||
|
"synced": True,
|
||||||
|
},
|
||||||
]
|
]
|
||||||
assert select_best(candidates, track_length_ms=5000) == "exact"
|
return [
|
||||||
|
SearchCandidate(
|
||||||
|
item=r,
|
||||||
def test_filters_out_candidates_beyond_tolerance() -> None:
|
duration_ms=r["duration"] * 1000,
|
||||||
candidates = [
|
is_synced=r["synced"],
|
||||||
SearchCandidate(item="too_far", duration_ms=100000.0),
|
title=r["trackName"],
|
||||||
|
artist=r["artistName"],
|
||||||
|
album=r["albumName"],
|
||||||
|
)
|
||||||
|
for r in raw
|
||||||
]
|
]
|
||||||
assert select_best(candidates, track_length_ms=5000, tolerance_ms=2000) is None
|
|
||||||
|
|
||||||
|
|
||||||
def test_prefers_synced_at_equal_duration() -> None:
|
def _lrclib_noisy_candidates() -> list[SearchCandidate[dict]]:
|
||||||
candidates = [
|
"""Fixtures from LRCLIB title-only search — lots of wrong artists."""
|
||||||
SearchCandidate(item="unsynced", duration_ms=5000.0, is_synced=False),
|
raw = [
|
||||||
SearchCandidate(item="synced", duration_ms=5000.0, is_synced=True),
|
{
|
||||||
|
"trackName": "Let My Love Be Your Pillow (Live)",
|
||||||
|
"artistName": "Ronnie Milsap",
|
||||||
|
"albumName": "The Essential Ronnie Milsap",
|
||||||
|
"duration": 192.0,
|
||||||
|
"synced": True,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"trackName": "My Love",
|
||||||
|
"artistName": "Little Texas",
|
||||||
|
"albumName": "Big Time",
|
||||||
|
"duration": 248.0,
|
||||||
|
"synced": True,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"trackName": "My Love (Album Version)",
|
||||||
|
"artistName": "Little Texas",
|
||||||
|
"albumName": "Greatest Hits",
|
||||||
|
"duration": 248.0,
|
||||||
|
"synced": True,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"trackName": "My Love - Digitally Remastered '89",
|
||||||
|
"artistName": "Sonny James",
|
||||||
|
"albumName": "Capitol Collectors Series",
|
||||||
|
"duration": 169.0,
|
||||||
|
"synced": False,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"trackName": "My Love",
|
||||||
|
"artistName": "Westlife",
|
||||||
|
"albumName": "Coast To Coast",
|
||||||
|
"duration": 231.847,
|
||||||
|
"synced": True,
|
||||||
|
},
|
||||||
]
|
]
|
||||||
assert select_best(candidates, track_length_ms=5000) == "synced"
|
return [
|
||||||
|
SearchCandidate(
|
||||||
|
item=r,
|
||||||
def test_closer_duration_wins_over_synced() -> None:
|
duration_ms=r["duration"] * 1000,
|
||||||
candidates = [
|
is_synced=r["synced"],
|
||||||
SearchCandidate(item="synced_far", duration_ms=6000.0, is_synced=True),
|
title=r["trackName"],
|
||||||
SearchCandidate(item="unsynced_close", duration_ms=5001.0, is_synced=False),
|
artist=r["artistName"],
|
||||||
|
album=r["albumName"],
|
||||||
|
)
|
||||||
|
for r in raw
|
||||||
]
|
]
|
||||||
assert select_best(candidates, track_length_ms=5000) == "unsynced_close"
|
|
||||||
|
|
||||||
|
|
||||||
def test_skips_candidates_without_duration_when_track_length_given() -> None:
|
def _netease_candidates() -> list[SearchCandidate[int]]:
|
||||||
candidates = [
|
"""Fixtures from real Netease search results."""
|
||||||
SearchCandidate(item="no_dur", duration_ms=None),
|
raw = [
|
||||||
SearchCandidate(item="has_dur", duration_ms=5000.0),
|
{
|
||||||
|
"id": 2080607,
|
||||||
|
"name": "My Love",
|
||||||
|
"artist": "Westlife",
|
||||||
|
"album": "Unbreakable, Vol. 1 - The Greatest Hits",
|
||||||
|
"dt": 231941,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 2080749,
|
||||||
|
"name": "My Love (Radio Edit)",
|
||||||
|
"artist": "Westlife",
|
||||||
|
"album": "World Of Our Own - No. 1 Hits Plus (EP)",
|
||||||
|
"dt": 232920,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 29809886,
|
||||||
|
"name": "My Love (Live)",
|
||||||
|
"artist": "Westlife",
|
||||||
|
"album": "The Farewell Tour: Live at Croke Park",
|
||||||
|
"dt": 262000,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 572412968,
|
||||||
|
"name": "My Love",
|
||||||
|
"artist": "Westlife",
|
||||||
|
"album": "Pure... Love",
|
||||||
|
"dt": 231000,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 20707713,
|
||||||
|
"name": "You Raise Me Up",
|
||||||
|
"artist": "Westlife",
|
||||||
|
"album": "You Raise Me Up",
|
||||||
|
"dt": 241116,
|
||||||
|
},
|
||||||
]
|
]
|
||||||
assert select_best(candidates, track_length_ms=5000) == "has_dur"
|
return [
|
||||||
|
SearchCandidate(
|
||||||
|
item=r["id"],
|
||||||
def test_returns_none_when_all_lack_duration_and_track_length_given() -> None:
|
duration_ms=float(r["dt"]),
|
||||||
candidates = [
|
title=r["name"],
|
||||||
SearchCandidate(item="a", duration_ms=None),
|
artist=r["artist"],
|
||||||
SearchCandidate(item="b", duration_ms=None),
|
album=r["album"],
|
||||||
|
)
|
||||||
|
for r in raw
|
||||||
]
|
]
|
||||||
assert select_best(candidates, track_length_ms=5000) is None
|
|
||||||
|
|
||||||
|
|
||||||
def test_prefers_synced_when_no_track_length() -> None:
|
def test_lrclib_picks_exact_album_match() -> None:
|
||||||
candidates = [
|
"""With full metadata, should pick the Coast To Coast entry."""
|
||||||
SearchCandidate(item="unsynced", is_synced=False),
|
candidates = _lrclib_candidates()
|
||||||
SearchCandidate(item="synced", is_synced=True),
|
best, score = select_best(
|
||||||
]
|
candidates,
|
||||||
assert select_best(candidates, track_length_ms=None) == "synced"
|
_REF_LENGTH,
|
||||||
|
title=_REF_TITLE,
|
||||||
|
artist=_REF_ARTIST,
|
||||||
|
album=_REF_ALBUM,
|
||||||
|
)
|
||||||
|
assert best is not None
|
||||||
|
assert best["albumName"] == "Coast To Coast"
|
||||||
|
assert score >= MIN_CONFIDENCE
|
||||||
|
|
||||||
|
|
||||||
def test_falls_back_to_first_when_none_synced() -> None:
|
def test_lrclib_rejects_wrong_title() -> None:
|
||||||
candidates = [
|
"""'Hello My Love' should not beat 'My Love' entries."""
|
||||||
SearchCandidate(item="first"),
|
candidates = _lrclib_candidates()
|
||||||
SearchCandidate(item="second"),
|
best, _ = select_best(
|
||||||
]
|
candidates,
|
||||||
assert select_best(candidates, track_length_ms=None) == "first"
|
_REF_LENGTH,
|
||||||
|
title=_REF_TITLE,
|
||||||
|
artist=_REF_ARTIST,
|
||||||
|
album=_REF_ALBUM,
|
||||||
|
)
|
||||||
|
assert best is not None
|
||||||
|
assert best["trackName"] != "Hello My Love"
|
||||||
|
|
||||||
|
|
||||||
|
def test_lrclib_noisy_picks_westlife() -> None:
|
||||||
|
"""In noisy title-only results, artist matching should filter to Westlife."""
|
||||||
|
candidates = _lrclib_noisy_candidates()
|
||||||
|
best, _ = select_best(
|
||||||
|
candidates,
|
||||||
|
_REF_LENGTH,
|
||||||
|
title=_REF_TITLE,
|
||||||
|
artist=_REF_ARTIST,
|
||||||
|
album=_REF_ALBUM,
|
||||||
|
)
|
||||||
|
assert best is not None
|
||||||
|
assert best["artistName"] == "Westlife"
|
||||||
|
|
||||||
|
|
||||||
|
def test_lrclib_noisy_rejects_all_without_ref_artist() -> None:
|
||||||
|
"""Without ref artist, wrong-artist candidates may still win, but right title should rank higher."""
|
||||||
|
candidates = _lrclib_noisy_candidates()
|
||||||
|
best, _ = select_best(
|
||||||
|
candidates,
|
||||||
|
_REF_LENGTH,
|
||||||
|
title=_REF_TITLE,
|
||||||
|
)
|
||||||
|
# Should pick a "My Love" over "Let My Love Be Your Pillow"
|
||||||
|
assert best is not None
|
||||||
|
assert "My Love" == best["trackName"] or best["trackName"].startswith("My Love")
|
||||||
|
|
||||||
|
|
||||||
|
def test_netease_picks_closest_duration() -> None:
|
||||||
|
candidates = _netease_candidates()
|
||||||
|
best, _ = select_best(
|
||||||
|
candidates,
|
||||||
|
_REF_LENGTH,
|
||||||
|
title=_REF_TITLE,
|
||||||
|
artist=_REF_ARTIST,
|
||||||
|
album=_REF_ALBUM,
|
||||||
|
)
|
||||||
|
# 2080607 has dt=231941 (diff=59ms), closest to 232000
|
||||||
|
assert best == 2080607
|
||||||
|
|
||||||
|
|
||||||
|
def test_netease_rejects_wrong_title() -> None:
|
||||||
|
"""'You Raise Me Up' should not be selected."""
|
||||||
|
candidates = _netease_candidates()
|
||||||
|
best, _ = select_best(
|
||||||
|
candidates,
|
||||||
|
_REF_LENGTH,
|
||||||
|
title=_REF_TITLE,
|
||||||
|
artist=_REF_ARTIST,
|
||||||
|
)
|
||||||
|
assert best != 20707713
|
||||||
|
|
||||||
|
|
||||||
|
def test_netease_without_ref_metadata_rejects_below_confidence() -> None:
|
||||||
|
"""Without any ref metadata, candidates with one-sided fields score low and get rejected."""
|
||||||
|
candidates = _netease_candidates()
|
||||||
|
best, _ = select_best(candidates, _REF_LENGTH)
|
||||||
|
# Candidates have title/artist/album but ref has None for all → 0 for text fields
|
||||||
|
# Only duration (max 10) contributes → below MIN_CONFIDENCE (25)
|
||||||
|
assert best is None
|
||||||
|
|
||||||
|
|
||||||
|
# --- Edge cases ---
|
||||||
|
|
||||||
|
|
||||||
def test_empty_candidates_returns_none() -> None:
|
def test_empty_candidates_returns_none() -> None:
|
||||||
assert select_best([], track_length_ms=5000) is None
|
assert select_best([], track_length_ms=5000) == (None, 0.0)
|
||||||
assert select_best([], track_length_ms=None) is None
|
assert select_best([], track_length_ms=None) == (None, 0.0)
|
||||||
|
|
||||||
|
|
||||||
def test_single_candidate_within_tolerance() -> None:
|
def test_all_below_min_confidence_returns_none() -> None:
|
||||||
candidates = [SearchCandidate(item="only", duration_ms=5000.0)]
|
"""If all candidates score below threshold, return None."""
|
||||||
assert select_best(candidates, track_length_ms=5000) == "only"
|
candidates = [
|
||||||
|
SearchCandidate(
|
||||||
|
item="x",
|
||||||
def test_single_candidate_beyond_tolerance() -> None:
|
title="Completely Different Song",
|
||||||
candidates = [SearchCandidate(item="only", duration_ms=99999.0)]
|
artist="Unknown Artist",
|
||||||
assert select_best(candidates, track_length_ms=5000, tolerance_ms=1000) is None
|
album="Unknown Album",
|
||||||
|
duration_ms=999999.0,
|
||||||
|
),
|
||||||
|
]
|
||||||
|
result, _ = select_best(
|
||||||
|
candidates,
|
||||||
|
232000,
|
||||||
|
title="My Love",
|
||||||
|
artist="Westlife",
|
||||||
|
album="Coast To Coast",
|
||||||
|
min_confidence=90.0,
|
||||||
|
)
|
||||||
|
assert result is None
|
||||||
|
|
||||||
|
|
||||||
def test_generic_type_preserved() -> None:
|
def test_generic_type_preserved() -> None:
|
||||||
"""select_best returns the same type as SearchCandidate.item."""
|
int_candidates = [SearchCandidate(item=42, duration_ms=5000.0, title="x")]
|
||||||
int_candidates = [SearchCandidate(item=42, duration_ms=5000.0)]
|
best, _ = select_best(int_candidates, 5000, title="x")
|
||||||
assert select_best(int_candidates, track_length_ms=5000) == 42
|
assert best == 42
|
||||||
|
|
||||||
dict_candidates = [SearchCandidate(item={"id": 1}, duration_ms=5000.0)]
|
dict_candidates = [SearchCandidate(item={"id": 1}, title="x")]
|
||||||
result = select_best(dict_candidates, track_length_ms=5000)
|
best, _ = select_best(dict_candidates, title="x")
|
||||||
assert result == {"id": 1}
|
assert best == {"id": 1}
|
||||||
|
|||||||
Reference in New Issue
Block a user