feat: implement cache-search fetcher for cross-album fuzzy lookup

This commit is contained in:
2026-03-28 06:21:31 +01:00
parent 4182229ae2
commit 05d7def249
3 changed files with 181 additions and 11 deletions
+103 -6
View File
@@ -4,15 +4,33 @@ Date: 2026-03-25 10:18:03
Description: SQLite-based lyric cache with per-source storage and TTL expiration
"""
import re
import sqlite3
import hashlib
import time
import unicodedata
from typing import Optional
from loguru import logger
from .config import DB_PATH
from .config import DB_PATH, DURATION_TOLERANCE_MS
from .models import TrackMeta, LyricResult, CacheStatus
# Punctuation to strip for fuzzy matching (ASCII + common fullwidth)
_PUNCT_RE = re.compile(r"[~!@#$%^&*()_+\-=\[\]{}|;:'\",.<>?/\\`~!@#$%^&*()_+-=【】{}|;:'",。<>?/\`]")
_SPACE_RE = re.compile(r"\s+")
def _normalize_for_match(s: str) -> str:
    """Canonicalize *s* for fuzzy comparison.

    Applies NFKC normalization (folding fullwidth forms to halfwidth) and
    lowercasing, strips punctuation via the module-level pattern, and
    collapses whitespace runs into single spaces.
    """
    folded = unicodedata.normalize("NFKC", s).lower()
    without_punct = _PUNCT_RE.sub("", folded)
    return _SPACE_RE.sub(" ", without_punct).strip()
def _generate_key(track: TrackMeta, source: str) -> str:
"""Generate a unique cache key from track metadata and source.
@@ -64,9 +82,14 @@ class CacheEngine:
expires_at INTEGER,
artist TEXT,
title TEXT,
album TEXT
album TEXT,
length INTEGER
)
""")
# Migration: add length column if missing
cols = {r[1] for r in conn.execute("PRAGMA table_info(cache)").fetchall()}
if "length" not in cols:
conn.execute("ALTER TABLE cache ADD COLUMN length INTEGER")
conn.commit()
# Read
@@ -83,7 +106,7 @@ class CacheEngine:
with sqlite3.connect(self.db_path) as conn:
row = conn.execute(
"SELECT status, lyrics, source, expires_at FROM cache WHERE key = ?",
"SELECT status, lyrics, source, expires_at, length FROM cache WHERE key = ?",
(key,),
).fetchone()
@@ -91,7 +114,7 @@ class CacheEngine:
logger.debug(f"Cache miss: {source} / {track.display_name()}")
return None
status_str, lyrics, src, expires_at = row
status_str, lyrics, src, expires_at, cached_length = row
# Check TTL expiration
if expires_at and expires_at < int(time.time()):
@@ -100,6 +123,14 @@ class CacheEngine:
conn.commit()
return None
# Backfill length if the cached row is missing it
if cached_length is None and track.length is not None:
conn.execute(
"UPDATE cache SET length = ? WHERE key = ?",
(track.length, key),
)
conn.commit()
remaining = expires_at - int(time.time()) if expires_at else None
logger.debug(
f"Cache hit: {source} / {track.display_name()} "
@@ -152,8 +183,8 @@ class CacheEngine:
conn.execute(
"""INSERT OR REPLACE INTO cache
(key, source, status, lyrics, created_at, expires_at,
artist, title, album)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)""",
artist, title, album, length)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""",
(
key,
source,
@@ -164,6 +195,7 @@ class CacheEngine:
track.artist,
track.title,
track.album,
track.length,
),
)
conn.commit()
@@ -226,6 +258,71 @@ class CacheEngine:
params.append(track.album)
return conditions, params
# Fuzzy search
def search_by_meta(
    self,
    artist: Optional[str],
    title: Optional[str],
    length: Optional[int] = None,
) -> list[dict]:
    """Search cache for lyrics matching artist/title with fuzzy normalization.

    Ignores album and source. Only returns positive results (synced/unsynced)
    that have not expired. When *length* is provided, filters by duration
    tolerance and sorts by closest match.
    """
    if not title:
        return []

    cutoff = int(time.time())
    with sqlite3.connect(self.db_path) as conn:
        conn.row_factory = sqlite3.Row
        candidate_rows = conn.execute(
            """SELECT * FROM cache
            WHERE status IN (?, ?)
            AND (expires_at IS NULL OR expires_at > ?)""",
            (
                CacheStatus.SUCCESS_SYNCED.value,
                CacheStatus.SUCCESS_UNSYNCED.value,
                cutoff,
            ),
        ).fetchall()

    want_title = _normalize_for_match(title)
    want_artist = _normalize_for_match(artist) if artist else None

    def _is_candidate(entry: dict) -> bool:
        # Title must always match; artist only when the caller supplied one.
        if _normalize_for_match(entry.get("title") or "") != want_title:
            return False
        if want_artist is None:
            return True
        return _normalize_for_match(entry.get("artist") or "") == want_artist

    matches: list[dict] = []
    for raw in candidate_rows:
        entry = dict(raw)
        if _is_candidate(entry):
            matches.append(entry)

    # Duration filtering: rank by |cached length - requested length|;
    # rows lacking a stored length stay in at the lowest priority.
    if length is not None and matches:
        ranked: list[tuple[int, dict]] = []
        for entry in matches:
            cached_len = entry.get("length")
            if cached_len is None:
                ranked.append((DURATION_TOLERANCE_MS, entry))
            else:
                delta = abs(cached_len - length)
                if delta <= DURATION_TOLERANCE_MS:
                    ranked.append((delta, entry))
        # Stable sort: closest duration first; synced beats unsynced on ties.
        ranked.sort(
            key=lambda pair: (
                pair[0],
                pair[1].get("status") != CacheStatus.SUCCESS_SYNCED.value,
            )
        )
        matches = [entry for _, entry in ranked]

    return matches
# Query / inspect
def query_track(self, track: TrackMeta) -> list[dict]:
+13 -5
View File
@@ -20,6 +20,7 @@ from .fetchers.lrclib_search import LrclibSearchFetcher
from .fetchers.lrclib import LrclibFetcher
from .fetchers.spotify import SpotifyFetcher
from .fetchers.local import LocalFetcher
from .fetchers.cache_search import CacheSearchFetcher
from .fetchers.base import BaseFetcher
from .cache import CacheEngine
from .lrc import LRC_LINE_RE, normalize_tags
@@ -59,10 +60,14 @@ _STATUS_TTL: dict[CacheStatus, Optional[int]] = {
class LrcManager:
"""Main entry point for fetching lyrics with caching."""
# Fetchers that manage their own cache logic (skip per-source cache check)
_SELF_CACHED = frozenset({"cache-search"})
def __init__(self) -> None:
self.cache = CacheEngine()
self.fetchers: dict[str, BaseFetcher] = {
"local": LocalFetcher(),
"cache-search": CacheSearchFetcher(self.cache),
"spotify": SpotifyFetcher(),
"lrclib": LrclibFetcher(),
"lrclib-search": LrclibSearchFetcher(),
@@ -82,6 +87,8 @@ class LrcManager:
sequence: list[BaseFetcher] = []
if track.is_local:
sequence.append(self.fetchers["local"])
if track.title:
sequence.append(self.fetchers["cache-search"])
if track.trackid:
sequence.append(self.fetchers["spotify"])
if track.is_complete:
@@ -121,8 +128,8 @@ class LrcManager:
for fetcher in sequence:
source = fetcher.source_name
# Cache check
if not bypass_cache:
# Cache check (skip for fetchers that handle their own caching)
if not bypass_cache and source not in self._SELF_CACHED:
cached = self.cache.get(track, source)
if cached:
if cached.status == CacheStatus.SUCCESS_SYNCED:
@@ -163,9 +170,10 @@ class LrcManager:
ttl=result.ttl,
)
# Cache the normalized result
ttl = result.ttl or _STATUS_TTL.get(result.status, TTL_NOT_FOUND)
self.cache.set(track, source, result, ttl_seconds=ttl)
# Cache the normalized result (skip for read-only fetchers)
if source not in self._SELF_CACHED:
ttl = result.ttl or _STATUS_TTL.get(result.status, TTL_NOT_FOUND)
self.cache.set(track, source, result, ttl_seconds=ttl)
# Evaluate result
if result.status == CacheStatus.SUCCESS_SYNCED:
+65
View File
@@ -0,0 +1,65 @@
"""
Author: Uyanide pywang0608@foxmail.com
Date: 2026-03-28 05:57:46
Description: Cache-search fetcher — cross-album fuzzy lookup in the local cache
"""
"""
Searches existing cache entries by artist + title with fuzzy normalization,
ignoring album and source. Useful when the same track appears on different
albums or is played from different players.
"""
from typing import Optional
from loguru import logger
from .base import BaseFetcher
from ..models import TrackMeta, LyricResult, CacheStatus
from ..cache import CacheEngine
class CacheSearchFetcher(BaseFetcher):
    """Read-only fetcher that reuses lyrics already stored in the cache.

    Delegates to ``CacheEngine.search_by_meta``, which matches by normalized
    artist + title while ignoring album and original source — so a track
    cached from one album/player is found when played from another.
    This fetcher never writes to the cache (the manager skips its results
    when persisting).
    """

    def __init__(self, cache: CacheEngine) -> None:
        # Shared engine instance owned by the manager; used read-only here.
        self._cache = cache

    @property
    def source_name(self) -> str:
        return "cache-search"

    def fetch(self, track: TrackMeta) -> Optional[LyricResult]:
        """Return the best cached cross-album match for *track*, or None.

        Selection: first synced result in the ranked match list, otherwise
        the top-ranked match (``search_by_meta`` sorts candidates by closest
        duration when ``track.length`` is available).
        """
        if not track.title:
            logger.debug("Cache-search: skipped — no title")
            return None

        matches = self._cache.search_by_meta(
            artist=track.artist,
            title=track.title,
            length=track.length,
        )
        if not matches:
            logger.debug(f"Cache-search: no match for {track.display_name()}")
            return None

        # Prefer the first synced hit; otherwise fall back to the TOP-ranked
        # candidate. (Bugfix: the previous loop's post-loop fallback
        # `best = m` selected the LAST element, i.e. the worst-ranked match.)
        best = next(
            (
                m
                for m in matches
                if m.get("status") == CacheStatus.SUCCESS_SYNCED.value
            ),
            matches[0],
        )
        if not best.get("lyrics"):
            return None

        status = CacheStatus(best["status"])
        logger.info(
            f"Cache-search: hit from [{best.get('source')}] "
            f"album={best.get('album')!r} ({status.value})"
        )
        return LyricResult(
            status=status,
            lyrics=best["lyrics"],
            source=self.source_name,
        )