diff --git a/lrcfetch/cache.py b/lrcfetch/cache.py
index db39af4..a8b83ea 100644
--- a/lrcfetch/cache.py
+++ b/lrcfetch/cache.py
@@ -4,15 +4,33 @@
 Date: 2026-03-25 10:18:03
 Description: SQLite-based lyric cache with per-source storage and TTL expiration
 """
+import re
 import sqlite3
 import hashlib
 import time
+import unicodedata
 from typing import Optional
 from loguru import logger
-from .config import DB_PATH
+from .config import DB_PATH, DURATION_TOLERANCE_MS
 from .models import TrackMeta, LyricResult, CacheStatus
+
+# Punctuation to strip (ASCII ranges + CJK marks that NFKC does not fold)
+_PUNCT_RE = re.compile(r"[!-/:-@\[-`{-~【】。、「」『』]")
+_SPACE_RE = re.compile(r"\s+")
+
+
+def _normalize_for_match(s: str) -> str:
+    """Normalize a string for fuzzy comparison.
+
+    Lowercases, NFKC-normalizes (fullwidth → halfwidth), strips punctuation,
+    and collapses whitespace.
+    """
+    s = unicodedata.normalize("NFKC", s).lower()
+    s = _PUNCT_RE.sub("", s)
+    s = _SPACE_RE.sub(" ", s).strip()
+    return s
+
 def _generate_key(track: TrackMeta, source: str) -> str:
     """Generate a unique cache key from track metadata and source.
@@ -64,9 +82,14 @@ class CacheEngine: expires_at INTEGER, artist TEXT, title TEXT, - album TEXT + album TEXT, + length INTEGER ) """) + # Migration: add length column if missing + cols = {r[1] for r in conn.execute("PRAGMA table_info(cache)").fetchall()} + if "length" not in cols: + conn.execute("ALTER TABLE cache ADD COLUMN length INTEGER") conn.commit() # Read @@ -83,7 +106,7 @@ class CacheEngine: with sqlite3.connect(self.db_path) as conn: row = conn.execute( - "SELECT status, lyrics, source, expires_at FROM cache WHERE key = ?", + "SELECT status, lyrics, source, expires_at, length FROM cache WHERE key = ?", (key,), ).fetchone() @@ -91,7 +114,7 @@ class CacheEngine: logger.debug(f"Cache miss: {source} / {track.display_name()}") return None - status_str, lyrics, src, expires_at = row + status_str, lyrics, src, expires_at, cached_length = row # Check TTL expiration if expires_at and expires_at < int(time.time()): @@ -100,6 +123,14 @@ class CacheEngine: conn.commit() return None + # Backfill length if the cached row is missing it + if cached_length is None and track.length is not None: + conn.execute( + "UPDATE cache SET length = ? 
WHERE key = ?", + (track.length, key), + ) + conn.commit() + remaining = expires_at - int(time.time()) if expires_at else None logger.debug( f"Cache hit: {source} / {track.display_name()} " @@ -152,8 +183,8 @@ class CacheEngine: conn.execute( """INSERT OR REPLACE INTO cache (key, source, status, lyrics, created_at, expires_at, - artist, title, album) - VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)""", + artist, title, album, length) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""", ( key, source, @@ -164,6 +195,7 @@ class CacheEngine: track.artist, track.title, track.album, + track.length, ), ) conn.commit() @@ -226,6 +258,71 @@ class CacheEngine: params.append(track.album) return conditions, params + # Fuzzy search + + def search_by_meta( + self, + artist: Optional[str], + title: Optional[str], + length: Optional[int] = None, + ) -> list[dict]: + """Search cache for lyrics matching artist/title with fuzzy normalization. + + Ignores album and source. Only returns positive results (synced/unsynced) + that have not expired. When *length* is provided, filters by duration + tolerance and sorts by closest match. + """ + if not title: + return [] + + now = int(time.time()) + with sqlite3.connect(self.db_path) as conn: + conn.row_factory = sqlite3.Row + rows = conn.execute( + """SELECT * FROM cache + WHERE status IN (?, ?) 
+ AND (expires_at IS NULL OR expires_at > ?)""", + ( + CacheStatus.SUCCESS_SYNCED.value, + CacheStatus.SUCCESS_UNSYNCED.value, + now, + ), + ).fetchall() + + norm_title = _normalize_for_match(title) + norm_artist = _normalize_for_match(artist) if artist else None + + matches: list[dict] = [] + for row in rows: + row_dict = dict(row) + # Title must match + row_title = row_dict.get("title") or "" + if _normalize_for_match(row_title) != norm_title: + continue + # Artist must match if provided + if norm_artist: + row_artist = row_dict.get("artist") or "" + if _normalize_for_match(row_artist) != norm_artist: + continue + matches.append(row_dict) + + # Duration filtering + if length is not None and matches: + scored = [] + for m in matches: + row_len = m.get("length") + if row_len is not None: + diff = abs(row_len - length) + if diff <= DURATION_TOLERANCE_MS: + scored.append((diff, m)) + else: + # No duration info in cache — still a candidate but lower priority + scored.append((DURATION_TOLERANCE_MS, m)) + scored.sort(key=lambda x: (x[0], x[1].get("status") != CacheStatus.SUCCESS_SYNCED.value)) + matches = [m for _, m in scored] + + return matches + # Query / inspect def query_track(self, track: TrackMeta) -> list[dict]: diff --git a/lrcfetch/core.py b/lrcfetch/core.py index 303dfaa..3edfabe 100644 --- a/lrcfetch/core.py +++ b/lrcfetch/core.py @@ -20,6 +20,7 @@ from .fetchers.lrclib_search import LrclibSearchFetcher from .fetchers.lrclib import LrclibFetcher from .fetchers.spotify import SpotifyFetcher from .fetchers.local import LocalFetcher +from .fetchers.cache_search import CacheSearchFetcher from .fetchers.base import BaseFetcher from .cache import CacheEngine from .lrc import LRC_LINE_RE, normalize_tags @@ -59,10 +60,14 @@ _STATUS_TTL: dict[CacheStatus, Optional[int]] = { class LrcManager: """Main entry point for fetching lyrics with caching.""" + # Fetchers that manage their own cache logic (skip per-source cache check) + _SELF_CACHED = frozenset({"cache-search"}) 
+ def __init__(self) -> None: self.cache = CacheEngine() self.fetchers: dict[str, BaseFetcher] = { "local": LocalFetcher(), + "cache-search": CacheSearchFetcher(self.cache), "spotify": SpotifyFetcher(), "lrclib": LrclibFetcher(), "lrclib-search": LrclibSearchFetcher(), @@ -82,6 +87,8 @@ class LrcManager: sequence: list[BaseFetcher] = [] if track.is_local: sequence.append(self.fetchers["local"]) + if track.title: + sequence.append(self.fetchers["cache-search"]) if track.trackid: sequence.append(self.fetchers["spotify"]) if track.is_complete: @@ -121,8 +128,8 @@ class LrcManager: for fetcher in sequence: source = fetcher.source_name - # Cache check - if not bypass_cache: + # Cache check (skip for fetchers that handle their own caching) + if not bypass_cache and source not in self._SELF_CACHED: cached = self.cache.get(track, source) if cached: if cached.status == CacheStatus.SUCCESS_SYNCED: @@ -163,9 +170,10 @@ class LrcManager: ttl=result.ttl, ) - # Cache the normalized result - ttl = result.ttl or _STATUS_TTL.get(result.status, TTL_NOT_FOUND) - self.cache.set(track, source, result, ttl_seconds=ttl) + # Cache the normalized result (skip for read-only fetchers) + if source not in self._SELF_CACHED: + ttl = result.ttl or _STATUS_TTL.get(result.status, TTL_NOT_FOUND) + self.cache.set(track, source, result, ttl_seconds=ttl) # Evaluate result if result.status == CacheStatus.SUCCESS_SYNCED: diff --git a/lrcfetch/fetchers/cache_search.py b/lrcfetch/fetchers/cache_search.py new file mode 100644 index 0000000..14fb0ae --- /dev/null +++ b/lrcfetch/fetchers/cache_search.py @@ -0,0 +1,65 @@ +""" +Author: Uyanide pywang0608@foxmail.com +Date: 2026-03-28 05:57:46 +Description: Cache-search fetcher — cross-album fuzzy lookup in the local cache +""" + +""" +Searches existing cache entries by artist + title with fuzzy normalization, +ignoring album and source. Useful when the same track appears on different +albums or is played from different players. 
+"""
+
+from typing import Optional
+from loguru import logger
+
+from .base import BaseFetcher
+from ..models import TrackMeta, LyricResult, CacheStatus
+from ..cache import CacheEngine
+
+
+class CacheSearchFetcher(BaseFetcher):
+    def __init__(self, cache: CacheEngine) -> None:
+        self._cache = cache
+
+    @property
+    def source_name(self) -> str:
+        return "cache-search"
+
+    def fetch(self, track: TrackMeta) -> Optional[LyricResult]:
+        if not track.title:
+            logger.debug("Cache-search: skipped — no title")
+            return None
+
+        matches = self._cache.search_by_meta(
+            artist=track.artist,
+            title=track.title,
+            length=track.length,
+        )
+
+        if not matches:
+            logger.debug(f"Cache-search: no match for {track.display_name()}")
+            return None
+
+        # Pick best: prefer first synced, else fall back to the top-ranked
+        best = None
+        for m in matches:
+            if m.get("status") == CacheStatus.SUCCESS_SYNCED.value:
+                best = m
+                break
+        if best is None:
+            best = matches[0]
+
+        if not best or not best.get("lyrics"):
+            return None
+
+        status = CacheStatus(best["status"])
+        logger.info(
+            f"Cache-search: hit from [{best.get('source')}] "
+            f"album={best.get('album')!r} ({status.value})"
+        )
+        return LyricResult(
+            status=status,
+            lyrics=best["lyrics"],
+            source=self.source_name,
+        )