feat: implement cache-search fetcher for cross-album fuzzy lookup

This commit is contained in:
2026-03-28 06:21:31 +01:00
parent 4182229ae2
commit 05d7def249
3 changed files with 181 additions and 11 deletions
+103 -6
View File
@@ -4,15 +4,33 @@ Date: 2026-03-25 10:18:03
Description: SQLite-based lyric cache with per-source storage and TTL expiration
"""
import re
import sqlite3
import hashlib
import time
import unicodedata
from typing import Optional
from loguru import logger
from .config import DB_PATH
from .config import DB_PATH, DURATION_TOLERANCE_MS
from .models import TrackMeta, LyricResult, CacheStatus
# Punctuation to strip for fuzzy matching (ASCII + common fullwidth)
_PUNCT_RE = re.compile(r"[~!@#$%^&*()_+\-=\[\]{}|;:'\",.<>?/\\`~!@#$%^&*()_+-=【】{}|;:'",。<>?/\`]")
_SPACE_RE = re.compile(r"\s+")
def _normalize_for_match(s: str) -> str:
    """Canonicalize *s* for fuzzy comparison.

    Applies NFKC normalization (folding fullwidth forms to halfwidth) and
    lowercasing, strips punctuation via the module-level pattern, and
    collapses whitespace runs into single spaces.
    """
    folded = unicodedata.normalize("NFKC", s).lower()
    without_punct = _PUNCT_RE.sub("", folded)
    return _SPACE_RE.sub(" ", without_punct).strip()
def _generate_key(track: TrackMeta, source: str) -> str:
"""Generate a unique cache key from track metadata and source.
@@ -64,9 +82,14 @@ class CacheEngine:
expires_at INTEGER,
artist TEXT,
title TEXT,
album TEXT
album TEXT,
length INTEGER
)
""")
# Migration: add length column if missing
cols = {r[1] for r in conn.execute("PRAGMA table_info(cache)").fetchall()}
if "length" not in cols:
conn.execute("ALTER TABLE cache ADD COLUMN length INTEGER")
conn.commit()
# Read
@@ -83,7 +106,7 @@ class CacheEngine:
with sqlite3.connect(self.db_path) as conn:
row = conn.execute(
"SELECT status, lyrics, source, expires_at FROM cache WHERE key = ?",
"SELECT status, lyrics, source, expires_at, length FROM cache WHERE key = ?",
(key,),
).fetchone()
@@ -91,7 +114,7 @@ class CacheEngine:
logger.debug(f"Cache miss: {source} / {track.display_name()}")
return None
status_str, lyrics, src, expires_at = row
status_str, lyrics, src, expires_at, cached_length = row
# Check TTL expiration
if expires_at and expires_at < int(time.time()):
@@ -100,6 +123,14 @@ class CacheEngine:
conn.commit()
return None
# Backfill length if the cached row is missing it
if cached_length is None and track.length is not None:
conn.execute(
"UPDATE cache SET length = ? WHERE key = ?",
(track.length, key),
)
conn.commit()
remaining = expires_at - int(time.time()) if expires_at else None
logger.debug(
f"Cache hit: {source} / {track.display_name()} "
@@ -152,8 +183,8 @@ class CacheEngine:
conn.execute(
"""INSERT OR REPLACE INTO cache
(key, source, status, lyrics, created_at, expires_at,
artist, title, album)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)""",
artist, title, album, length)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""",
(
key,
source,
@@ -164,6 +195,7 @@ class CacheEngine:
track.artist,
track.title,
track.album,
track.length,
),
)
conn.commit()
@@ -226,6 +258,71 @@ class CacheEngine:
params.append(track.album)
return conditions, params
# Fuzzy search
def search_by_meta(
    self,
    artist: Optional[str],
    title: Optional[str],
    length: Optional[int] = None,
) -> list[dict]:
    """Search cache for lyrics matching artist/title with fuzzy normalization.

    Ignores album and source. Only returns positive results (synced/unsynced)
    that have not expired. When *length* is provided, filters by duration
    tolerance and sorts by closest match.
    """
    if not title:
        return []

    cutoff = int(time.time())
    with sqlite3.connect(self.db_path) as conn:
        conn.row_factory = sqlite3.Row
        candidate_rows = conn.execute(
            """SELECT * FROM cache
            WHERE status IN (?, ?)
            AND (expires_at IS NULL OR expires_at > ?)""",
            (
                CacheStatus.SUCCESS_SYNCED.value,
                CacheStatus.SUCCESS_UNSYNCED.value,
                cutoff,
            ),
        ).fetchall()

    want_title = _normalize_for_match(title)
    want_artist = _normalize_for_match(artist) if artist else None

    def _is_candidate(entry: dict) -> bool:
        # Title must always match; artist only when the caller supplied one.
        if _normalize_for_match(entry.get("title") or "") != want_title:
            return False
        if want_artist is None:
            return True
        return _normalize_for_match(entry.get("artist") or "") == want_artist

    matches: list[dict] = []
    for raw in candidate_rows:
        entry = dict(raw)
        if _is_candidate(entry):
            matches.append(entry)

    # Duration filtering: rank by |cached length - requested length|;
    # rows lacking a stored length stay in at the lowest priority.
    if length is not None and matches:
        ranked: list[tuple[int, dict]] = []
        for entry in matches:
            cached_len = entry.get("length")
            if cached_len is None:
                ranked.append((DURATION_TOLERANCE_MS, entry))
            else:
                delta = abs(cached_len - length)
                if delta <= DURATION_TOLERANCE_MS:
                    ranked.append((delta, entry))
        # Stable sort: closest duration first; synced beats unsynced on ties.
        ranked.sort(
            key=lambda pair: (
                pair[0],
                pair[1].get("status") != CacheStatus.SUCCESS_SYNCED.value,
            )
        )
        matches = [entry for _, entry in ranked]

    return matches
# Query / inspect
def query_track(self, track: TrackMeta) -> list[dict]:
+13 -5
View File
@@ -20,6 +20,7 @@ from .fetchers.lrclib_search import LrclibSearchFetcher
from .fetchers.lrclib import LrclibFetcher
from .fetchers.spotify import SpotifyFetcher
from .fetchers.local import LocalFetcher
from .fetchers.cache_search import CacheSearchFetcher
from .fetchers.base import BaseFetcher
from .cache import CacheEngine
from .lrc import LRC_LINE_RE, normalize_tags
@@ -59,10 +60,14 @@ _STATUS_TTL: dict[CacheStatus, Optional[int]] = {
class LrcManager:
"""Main entry point for fetching lyrics with caching."""
# Fetchers that manage their own cache logic (skip per-source cache check)
_SELF_CACHED = frozenset({"cache-search"})
def __init__(self) -> None:
self.cache = CacheEngine()
self.fetchers: dict[str, BaseFetcher] = {
"local": LocalFetcher(),
"cache-search": CacheSearchFetcher(self.cache),
"spotify": SpotifyFetcher(),
"lrclib": LrclibFetcher(),
"lrclib-search": LrclibSearchFetcher(),
@@ -82,6 +87,8 @@ class LrcManager:
sequence: list[BaseFetcher] = []
if track.is_local:
sequence.append(self.fetchers["local"])
if track.title:
sequence.append(self.fetchers["cache-search"])
if track.trackid:
sequence.append(self.fetchers["spotify"])
if track.is_complete:
@@ -121,8 +128,8 @@ class LrcManager:
for fetcher in sequence:
source = fetcher.source_name
# Cache check
if not bypass_cache:
# Cache check (skip for fetchers that handle their own caching)
if not bypass_cache and source not in self._SELF_CACHED:
cached = self.cache.get(track, source)
if cached:
if cached.status == CacheStatus.SUCCESS_SYNCED:
@@ -163,9 +170,10 @@ class LrcManager:
ttl=result.ttl,
)
# Cache the normalized result
ttl = result.ttl or _STATUS_TTL.get(result.status, TTL_NOT_FOUND)
self.cache.set(track, source, result, ttl_seconds=ttl)
# Cache the normalized result (skip for read-only fetchers)
if source not in self._SELF_CACHED:
ttl = result.ttl or _STATUS_TTL.get(result.status, TTL_NOT_FOUND)
self.cache.set(track, source, result, ttl_seconds=ttl)
# Evaluate result
if result.status == CacheStatus.SUCCESS_SYNCED:
+65
View File
@@ -0,0 +1,65 @@
"""
Author: Uyanide pywang0608@foxmail.com
Date: 2026-03-28 05:57:46
Description: Cache-search fetcher — cross-album fuzzy lookup in the local cache
"""
"""
Searches existing cache entries by artist + title with fuzzy normalization,
ignoring album and source. Useful when the same track appears on different
albums or is played from different players.
"""
from typing import Optional
from loguru import logger
from .base import BaseFetcher
from ..models import TrackMeta, LyricResult, CacheStatus
from ..cache import CacheEngine
class CacheSearchFetcher(BaseFetcher):
    """Read-only fetcher that reuses lyrics already stored in the cache.

    Delegates to ``CacheEngine.search_by_meta``, which matches by normalized
    artist + title while ignoring album and original source — so a track
    cached from one album/player is found when played from another.
    This fetcher never writes to the cache (the manager skips its results
    when persisting).
    """

    def __init__(self, cache: CacheEngine) -> None:
        # Shared engine instance owned by the manager; used read-only here.
        self._cache = cache

    @property
    def source_name(self) -> str:
        return "cache-search"

    def fetch(self, track: TrackMeta) -> Optional[LyricResult]:
        """Return the best cached cross-album match for *track*, or None.

        Selection: first synced result in the ranked match list, otherwise
        the top-ranked match (``search_by_meta`` sorts candidates by closest
        duration when ``track.length`` is available).
        """
        if not track.title:
            logger.debug("Cache-search: skipped — no title")
            return None

        matches = self._cache.search_by_meta(
            artist=track.artist,
            title=track.title,
            length=track.length,
        )
        if not matches:
            logger.debug(f"Cache-search: no match for {track.display_name()}")
            return None

        # Prefer the first synced hit; otherwise fall back to the TOP-ranked
        # candidate. (Bugfix: the previous loop's post-loop fallback
        # `best = m` selected the LAST element, i.e. the worst-ranked match.)
        best = next(
            (
                m
                for m in matches
                if m.get("status") == CacheStatus.SUCCESS_SYNCED.value
            ),
            matches[0],
        )
        if not best.get("lyrics"):
            return None

        status = CacheStatus(best["status"])
        logger.info(
            f"Cache-search: hit from [{best.get('source')}] "
            f"album={best.get('album')!r} ({status.value})"
        )
        return LyricResult(
            status=status,
            lyrics=best["lyrics"],
            source=self.source_name,
        )