feat: implement cache-search fetcher for cross-album fuzzy lookup

This commit is contained in:
2026-03-28 06:21:31 +01:00
parent 8ba9daf968
commit d2b4134c96
3 changed files with 181 additions and 11 deletions
+103 -6
View File
@@ -4,15 +4,33 @@ Date: 2026-03-25 10:18:03
Description: SQLite-based lyric cache with per-source storage and TTL expiration Description: SQLite-based lyric cache with per-source storage and TTL expiration
""" """
import re
import sqlite3 import sqlite3
import hashlib import hashlib
import time import time
import unicodedata
from typing import Optional from typing import Optional
from loguru import logger from loguru import logger
from .config import DB_PATH from .config import DB_PATH, DURATION_TOLERANCE_MS
from .models import TrackMeta, LyricResult, CacheStatus from .models import TrackMeta, LyricResult, CacheStatus
# Punctuation to strip for fuzzy matching (ASCII + common fullwidth)
_PUNCT_RE = re.compile(r"[~!@#$%^&*()_+\-=\[\]{}|;:'\",.<>?/\\`~!@#$%^&*()_+-=【】{}|;:'",。<>?/\`]")
_SPACE_RE = re.compile(r"\s+")
def _normalize_for_match(s: str) -> str:
"""Normalize a string for fuzzy comparison.
Lowercases, NFKC-normalizes (fullwidth → halfwidth), strips punctuation,
and collapses whitespace.
"""
s = unicodedata.normalize("NFKC", s).lower()
s = _PUNCT_RE.sub("", s)
s = _SPACE_RE.sub(" ", s).strip()
return s
def _generate_key(track: TrackMeta, source: str) -> str: def _generate_key(track: TrackMeta, source: str) -> str:
"""Generate a unique cache key from track metadata and source. """Generate a unique cache key from track metadata and source.
@@ -64,9 +82,14 @@ class CacheEngine:
expires_at INTEGER, expires_at INTEGER,
artist TEXT, artist TEXT,
title TEXT, title TEXT,
album TEXT album TEXT,
length INTEGER
) )
""") """)
# Migration: add length column if missing
cols = {r[1] for r in conn.execute("PRAGMA table_info(cache)").fetchall()}
if "length" not in cols:
conn.execute("ALTER TABLE cache ADD COLUMN length INTEGER")
conn.commit() conn.commit()
# Read # Read
@@ -83,7 +106,7 @@ class CacheEngine:
with sqlite3.connect(self.db_path) as conn: with sqlite3.connect(self.db_path) as conn:
row = conn.execute( row = conn.execute(
"SELECT status, lyrics, source, expires_at FROM cache WHERE key = ?", "SELECT status, lyrics, source, expires_at, length FROM cache WHERE key = ?",
(key,), (key,),
).fetchone() ).fetchone()
@@ -91,7 +114,7 @@ class CacheEngine:
logger.debug(f"Cache miss: {source} / {track.display_name()}") logger.debug(f"Cache miss: {source} / {track.display_name()}")
return None return None
status_str, lyrics, src, expires_at = row status_str, lyrics, src, expires_at, cached_length = row
# Check TTL expiration # Check TTL expiration
if expires_at and expires_at < int(time.time()): if expires_at and expires_at < int(time.time()):
@@ -100,6 +123,14 @@ class CacheEngine:
conn.commit() conn.commit()
return None return None
# Backfill length if the cached row is missing it
if cached_length is None and track.length is not None:
conn.execute(
"UPDATE cache SET length = ? WHERE key = ?",
(track.length, key),
)
conn.commit()
remaining = expires_at - int(time.time()) if expires_at else None remaining = expires_at - int(time.time()) if expires_at else None
logger.debug( logger.debug(
f"Cache hit: {source} / {track.display_name()} " f"Cache hit: {source} / {track.display_name()} "
@@ -152,8 +183,8 @@ class CacheEngine:
conn.execute( conn.execute(
"""INSERT OR REPLACE INTO cache """INSERT OR REPLACE INTO cache
(key, source, status, lyrics, created_at, expires_at, (key, source, status, lyrics, created_at, expires_at,
artist, title, album) artist, title, album, length)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)""", VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""",
( (
key, key,
source, source,
@@ -164,6 +195,7 @@ class CacheEngine:
track.artist, track.artist,
track.title, track.title,
track.album, track.album,
track.length,
), ),
) )
conn.commit() conn.commit()
@@ -226,6 +258,71 @@ class CacheEngine:
params.append(track.album) params.append(track.album)
return conditions, params return conditions, params
# Fuzzy search
def search_by_meta(
self,
artist: Optional[str],
title: Optional[str],
length: Optional[int] = None,
) -> list[dict]:
"""Search cache for lyrics matching artist/title with fuzzy normalization.
Ignores album and source. Only returns positive results (synced/unsynced)
that have not expired. When *length* is provided, filters by duration
tolerance and sorts by closest match.
"""
if not title:
return []
now = int(time.time())
with sqlite3.connect(self.db_path) as conn:
conn.row_factory = sqlite3.Row
rows = conn.execute(
"""SELECT * FROM cache
WHERE status IN (?, ?)
AND (expires_at IS NULL OR expires_at > ?)""",
(
CacheStatus.SUCCESS_SYNCED.value,
CacheStatus.SUCCESS_UNSYNCED.value,
now,
),
).fetchall()
norm_title = _normalize_for_match(title)
norm_artist = _normalize_for_match(artist) if artist else None
matches: list[dict] = []
for row in rows:
row_dict = dict(row)
# Title must match
row_title = row_dict.get("title") or ""
if _normalize_for_match(row_title) != norm_title:
continue
# Artist must match if provided
if norm_artist:
row_artist = row_dict.get("artist") or ""
if _normalize_for_match(row_artist) != norm_artist:
continue
matches.append(row_dict)
# Duration filtering
if length is not None and matches:
scored = []
for m in matches:
row_len = m.get("length")
if row_len is not None:
diff = abs(row_len - length)
if diff <= DURATION_TOLERANCE_MS:
scored.append((diff, m))
else:
# No duration info in cache — still a candidate but lower priority
scored.append((DURATION_TOLERANCE_MS, m))
scored.sort(key=lambda x: (x[0], x[1].get("status") != CacheStatus.SUCCESS_SYNCED.value))
matches = [m for _, m in scored]
return matches
# Query / inspect # Query / inspect
def query_track(self, track: TrackMeta) -> list[dict]: def query_track(self, track: TrackMeta) -> list[dict]:
+13 -5
View File
@@ -20,6 +20,7 @@ from .fetchers.lrclib_search import LrclibSearchFetcher
from .fetchers.lrclib import LrclibFetcher from .fetchers.lrclib import LrclibFetcher
from .fetchers.spotify import SpotifyFetcher from .fetchers.spotify import SpotifyFetcher
from .fetchers.local import LocalFetcher from .fetchers.local import LocalFetcher
from .fetchers.cache_search import CacheSearchFetcher
from .fetchers.base import BaseFetcher from .fetchers.base import BaseFetcher
from .cache import CacheEngine from .cache import CacheEngine
from .lrc import LRC_LINE_RE, normalize_tags from .lrc import LRC_LINE_RE, normalize_tags
@@ -59,10 +60,14 @@ _STATUS_TTL: dict[CacheStatus, Optional[int]] = {
class LrcManager: class LrcManager:
"""Main entry point for fetching lyrics with caching.""" """Main entry point for fetching lyrics with caching."""
# Fetchers that manage their own cache logic (skip per-source cache check)
_SELF_CACHED = frozenset({"cache-search"})
def __init__(self) -> None: def __init__(self) -> None:
self.cache = CacheEngine() self.cache = CacheEngine()
self.fetchers: dict[str, BaseFetcher] = { self.fetchers: dict[str, BaseFetcher] = {
"local": LocalFetcher(), "local": LocalFetcher(),
"cache-search": CacheSearchFetcher(self.cache),
"spotify": SpotifyFetcher(), "spotify": SpotifyFetcher(),
"lrclib": LrclibFetcher(), "lrclib": LrclibFetcher(),
"lrclib-search": LrclibSearchFetcher(), "lrclib-search": LrclibSearchFetcher(),
@@ -82,6 +87,8 @@ class LrcManager:
sequence: list[BaseFetcher] = [] sequence: list[BaseFetcher] = []
if track.is_local: if track.is_local:
sequence.append(self.fetchers["local"]) sequence.append(self.fetchers["local"])
if track.title:
sequence.append(self.fetchers["cache-search"])
if track.trackid: if track.trackid:
sequence.append(self.fetchers["spotify"]) sequence.append(self.fetchers["spotify"])
if track.is_complete: if track.is_complete:
@@ -121,8 +128,8 @@ class LrcManager:
for fetcher in sequence: for fetcher in sequence:
source = fetcher.source_name source = fetcher.source_name
# Cache check # Cache check (skip for fetchers that handle their own caching)
if not bypass_cache: if not bypass_cache and source not in self._SELF_CACHED:
cached = self.cache.get(track, source) cached = self.cache.get(track, source)
if cached: if cached:
if cached.status == CacheStatus.SUCCESS_SYNCED: if cached.status == CacheStatus.SUCCESS_SYNCED:
@@ -163,9 +170,10 @@ class LrcManager:
ttl=result.ttl, ttl=result.ttl,
) )
# Cache the normalized result # Cache the normalized result (skip for read-only fetchers)
ttl = result.ttl or _STATUS_TTL.get(result.status, TTL_NOT_FOUND) if source not in self._SELF_CACHED:
self.cache.set(track, source, result, ttl_seconds=ttl) ttl = result.ttl or _STATUS_TTL.get(result.status, TTL_NOT_FOUND)
self.cache.set(track, source, result, ttl_seconds=ttl)
# Evaluate result # Evaluate result
if result.status == CacheStatus.SUCCESS_SYNCED: if result.status == CacheStatus.SUCCESS_SYNCED:
+65
View File
@@ -0,0 +1,65 @@
"""
Author: Uyanide pywang0608@foxmail.com
Date: 2026-03-28 05:57:46
Description: Cache-search fetcher — cross-album fuzzy lookup in the local cache
"""
"""
Searches existing cache entries by artist + title with fuzzy normalization,
ignoring album and source. Useful when the same track appears on different
albums or is played from different players.
"""
from typing import Optional
from loguru import logger
from .base import BaseFetcher
from ..models import TrackMeta, LyricResult, CacheStatus
from ..cache import CacheEngine
class CacheSearchFetcher(BaseFetcher):
    """Fetcher that re-uses previously cached lyrics across albums/sources.

    Performs a fuzzy artist+title lookup in the local cache so the same
    track played from another album or player can reuse an earlier result.
    """

    def __init__(self, cache: CacheEngine) -> None:
        # Shared CacheEngine instance — this fetcher only reads from it.
        self._cache = cache

    @property
    def source_name(self) -> str:
        return "cache-search"

    def fetch(self, track: TrackMeta) -> Optional[LyricResult]:
        """Return the best cached lyric for *track*, or None when absent."""
        if not track.title:
            logger.debug("Cache-search: skipped — no title")
            return None

        candidates = self._cache.search_by_meta(
            artist=track.artist,
            title=track.title,
            length=track.length,
        )
        if not candidates:
            logger.debug(f"Cache-search: no match for {track.display_name()}")
            return None

        # Prefer the first synced entry; otherwise fall back to the first
        # candidate (search_by_meta already orders by duration closeness).
        synced_value = CacheStatus.SUCCESS_SYNCED.value
        best = next(
            (c for c in candidates if c.get("status") == synced_value),
            candidates[0],
        )
        if not best.get("lyrics"):
            return None

        status = CacheStatus(best["status"])
        logger.info(
            f"Cache-search: hit from [{best.get('source')}] "
            f"album={best.get('album')!r} ({status.value})"
        )
        return LyricResult(
            status=status,
            lyrics=best["lyrics"],
            source=self.source_name,
        )