feat: implement cache-search fetcher for cross-album fuzzy lookup
+103
-6
@@ -4,15 +4,33 @@ Date: 2026-03-25 10:18:03
 Description: SQLite-based lyric cache with per-source storage and TTL expiration
 """

+import re
 import sqlite3
 import hashlib
 import time
+import unicodedata
 from typing import Optional
 from loguru import logger

-from .config import DB_PATH
+from .config import DB_PATH, DURATION_TOLERANCE_MS
 from .models import TrackMeta, LyricResult, CacheStatus

+# Punctuation to strip for fuzzy matching (ASCII + common fullwidth)
+_PUNCT_RE = re.compile(r"[~!@#$%^&*()_+\-=\[\]{}|;:'\",.<>?/\\`～！＠＃＄％＾＆＊（）＿＋－＝【】｛｝｜；：＇＂，。＜＞？／｀]")
+_SPACE_RE = re.compile(r"\s+")


+def _normalize_for_match(s: str) -> str:
+    """Normalize a string for fuzzy comparison.
+
+    Lowercases, NFKC-normalizes (fullwidth → halfwidth), strips punctuation,
+    and collapses whitespace.
+    """
+    s = unicodedata.normalize("NFKC", s).lower()
+    s = _PUNCT_RE.sub("", s)
+    s = _SPACE_RE.sub(" ", s).strip()
+    return s


 def _generate_key(track: TrackMeta, source: str) -> str:
     """Generate a unique cache key from track metadata and source.
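For reference, a minimal standalone sketch of the normalization behaviour introduced above. The punctuation class here is simplified for illustration; the module itself uses the explicit _PUNCT_RE set.

# Standalone sketch of the fuzzy-normalization behaviour (simplified
# punctuation class for illustration; the real module uses _PUNCT_RE).
import re
import unicodedata

_DEMO_PUNCT_RE = re.compile(r"[^\w\s]")   # strip anything that is not a word char or space
_DEMO_SPACE_RE = re.compile(r"\s+")

def normalize_for_match(s: str) -> str:
    s = unicodedata.normalize("NFKC", s).lower()   # fullwidth -> halfwidth, then lowercase
    s = _DEMO_PUNCT_RE.sub("", s)                  # drop punctuation
    return _DEMO_SPACE_RE.sub(" ", s).strip()      # collapse whitespace

# Fullwidth and ASCII spellings of the same title collapse to the same key,
# which is what lets cache rows written by different players/albums match.
assert normalize_for_match("ＨＥＬＬＯ， Ｗｏｒｌｄ！") == "hello world"
assert normalize_for_match("Hello, World!") == "hello world"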
@@ -64,9 +82,14 @@ class CacheEngine:
                expires_at INTEGER,
                artist TEXT,
                title TEXT,
-               album TEXT
+               album TEXT,
+               length INTEGER
            )
        """)
+       # Migration: add length column if missing
+       cols = {r[1] for r in conn.execute("PRAGMA table_info(cache)").fetchall()}
+       if "length" not in cols:
+           conn.execute("ALTER TABLE cache ADD COLUMN length INTEGER")
        conn.commit()

    # Read
@@ -83,7 +106,7 @@ class CacheEngine:

        with sqlite3.connect(self.db_path) as conn:
            row = conn.execute(
-               "SELECT status, lyrics, source, expires_at FROM cache WHERE key = ?",
+               "SELECT status, lyrics, source, expires_at, length FROM cache WHERE key = ?",
                (key,),
            ).fetchone()

@@ -91,7 +114,7 @@ class CacheEngine:
                logger.debug(f"Cache miss: {source} / {track.display_name()}")
                return None

-           status_str, lyrics, src, expires_at = row
+           status_str, lyrics, src, expires_at, cached_length = row

            # Check TTL expiration
            if expires_at and expires_at < int(time.time()):
@@ -100,6 +123,14 @@ class CacheEngine:
                conn.commit()
                return None

+           # Backfill length if the cached row is missing it
+           if cached_length is None and track.length is not None:
+               conn.execute(
+                   "UPDATE cache SET length = ? WHERE key = ?",
+                   (track.length, key),
+               )
+               conn.commit()

            remaining = expires_at - int(time.time()) if expires_at else None
            logger.debug(
                f"Cache hit: {source} / {track.display_name()} "
@@ -152,8 +183,8 @@ class CacheEngine:
            conn.execute(
                """INSERT OR REPLACE INTO cache
                   (key, source, status, lyrics, created_at, expires_at,
-                   artist, title, album)
-                  VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)""",
+                   artist, title, album, length)
+                  VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""",
                (
                    key,
                    source,
@@ -164,6 +195,7 @@ class CacheEngine:
                    track.artist,
                    track.title,
                    track.album,
+                   track.length,
                ),
            )
            conn.commit()
@@ -226,6 +258,71 @@ class CacheEngine:
            params.append(track.album)
        return conditions, params

+   # Fuzzy search
+
+   def search_by_meta(
+       self,
+       artist: Optional[str],
+       title: Optional[str],
+       length: Optional[int] = None,
+   ) -> list[dict]:
+       """Search cache for lyrics matching artist/title with fuzzy normalization.
+
+       Ignores album and source. Only returns positive results (synced/unsynced)
+       that have not expired. When *length* is provided, filters by duration
+       tolerance and sorts by closest match.
+       """
+       if not title:
+           return []
+
+       now = int(time.time())
+       with sqlite3.connect(self.db_path) as conn:
+           conn.row_factory = sqlite3.Row
+           rows = conn.execute(
+               """SELECT * FROM cache
+                  WHERE status IN (?, ?)
+                  AND (expires_at IS NULL OR expires_at > ?)""",
+               (
+                   CacheStatus.SUCCESS_SYNCED.value,
+                   CacheStatus.SUCCESS_UNSYNCED.value,
+                   now,
+               ),
+           ).fetchall()
+
+       norm_title = _normalize_for_match(title)
+       norm_artist = _normalize_for_match(artist) if artist else None
+
+       matches: list[dict] = []
+       for row in rows:
+           row_dict = dict(row)
+           # Title must match
+           row_title = row_dict.get("title") or ""
+           if _normalize_for_match(row_title) != norm_title:
+               continue
+           # Artist must match if provided
+           if norm_artist:
+               row_artist = row_dict.get("artist") or ""
+               if _normalize_for_match(row_artist) != norm_artist:
+                   continue
+           matches.append(row_dict)
+
+       # Duration filtering
+       if length is not None and matches:
+           scored = []
+           for m in matches:
+               row_len = m.get("length")
+               if row_len is not None:
+                   diff = abs(row_len - length)
+                   if diff <= DURATION_TOLERANCE_MS:
+                       scored.append((diff, m))
+               else:
+                   # No duration info in cache — still a candidate but lower priority
+                   scored.append((DURATION_TOLERANCE_MS, m))
+           scored.sort(key=lambda x: (x[0], x[1].get("status") != CacheStatus.SUCCESS_SYNCED.value))
+           matches = [m for _, m in scored]
+
+       return matches
+
    # Query / inspect

    def query_track(self, track: TrackMeta) -> list[dict]:
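A self-contained sketch of the duration-tolerance ranking that search_by_meta applies. The tolerance value is a placeholder, since DURATION_TOLERANCE_MS is defined in .config and not shown in this diff, and the status strings stand in for the CacheStatus values.

# Self-contained sketch of the duration filter and ordering in search_by_meta.
# DURATION_TOLERANCE_MS below is a hypothetical value; status strings stand in
# for CacheStatus values.
from typing import Optional

DURATION_TOLERANCE_MS = 7000  # assumed: tolerated |row length - track length| in ms

def rank_candidates(matches: list[dict], length: Optional[int]) -> list[dict]:
    """Keep rows within the tolerance, closest first, synced preferred on ties."""
    if length is None or not matches:
        return matches
    scored = []
    for m in matches:
        row_len = m.get("length")
        if row_len is None:
            # Row predates the length column: keep it, but at the lowest priority
            scored.append((DURATION_TOLERANCE_MS, m))
        elif abs(row_len - length) <= DURATION_TOLERANCE_MS:
            scored.append((abs(row_len - length), m))
    scored.sort(key=lambda x: (x[0], x[1].get("status") != "success_synced"))
    return [m for _, m in scored]

rows = [
    {"length": 215_000, "status": "success_unsynced"},
    {"length": 214_500, "status": "success_synced"},
    {"length": None,    "status": "success_synced"},
    {"length": 400_000, "status": "success_synced"},   # outside tolerance, dropped
]
print([(m["length"], m["status"]) for m in rank_candidates(rows, 215_000)])
# [(215000, 'success_unsynced'), (214500, 'success_synced'), (None, 'success_synced')]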
+11
-3
@@ -20,6 +20,7 @@ from .fetchers.lrclib_search import LrclibSearchFetcher
from .fetchers.lrclib import LrclibFetcher
from .fetchers.spotify import SpotifyFetcher
from .fetchers.local import LocalFetcher
+from .fetchers.cache_search import CacheSearchFetcher
from .fetchers.base import BaseFetcher
from .cache import CacheEngine
from .lrc import LRC_LINE_RE, normalize_tags
@@ -59,10 +60,14 @@ _STATUS_TTL: dict[CacheStatus, Optional[int]] = {
class LrcManager:
    """Main entry point for fetching lyrics with caching."""

+   # Fetchers that manage their own cache logic (skip per-source cache check)
+   _SELF_CACHED = frozenset({"cache-search"})
+
    def __init__(self) -> None:
        self.cache = CacheEngine()
        self.fetchers: dict[str, BaseFetcher] = {
            "local": LocalFetcher(),
+           "cache-search": CacheSearchFetcher(self.cache),
            "spotify": SpotifyFetcher(),
            "lrclib": LrclibFetcher(),
            "lrclib-search": LrclibSearchFetcher(),
@@ -82,6 +87,8 @@ class LrcManager:
        sequence: list[BaseFetcher] = []
        if track.is_local:
            sequence.append(self.fetchers["local"])
+       if track.title:
+           sequence.append(self.fetchers["cache-search"])
        if track.trackid:
            sequence.append(self.fetchers["spotify"])
        if track.is_complete:
@@ -121,8 +128,8 @@ class LrcManager:
        for fetcher in sequence:
            source = fetcher.source_name

-           # Cache check
-           if not bypass_cache:
+           # Cache check (skip for fetchers that handle their own caching)
+           if not bypass_cache and source not in self._SELF_CACHED:
                cached = self.cache.get(track, source)
                if cached:
                    if cached.status == CacheStatus.SUCCESS_SYNCED:
@@ -163,7 +170,8 @@ class LrcManager:
                ttl=result.ttl,
            )

-           # Cache the normalized result
+           # Cache the normalized result (skip for read-only fetchers)
+           if source not in self._SELF_CACHED:
                ttl = result.ttl or _STATUS_TTL.get(result.status, TTL_NOT_FOUND)
                self.cache.set(track, source, result, ttl_seconds=ttl)

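For orientation, a hedged sketch of how the new fetcher slots into the lookup order and why it is excluded from the per-source cache check. The class and field names below are stand-ins; only the ordering and the _SELF_CACHED skip mirror the change above.

# Sketch of the resulting fetch order and the _SELF_CACHED skip, with stand-in
# names; only the ordering and the skip condition mirror this commit.
from dataclasses import dataclass
from typing import Optional

@dataclass
class Track:
    is_local: bool = False
    title: Optional[str] = None
    trackid: Optional[str] = None
    is_complete: bool = False

_SELF_CACHED = frozenset({"cache-search"})

def build_sequence(track: Track) -> list[str]:
    """Source names in the order the manager would try them."""
    sequence: list[str] = []
    if track.is_local:
        sequence.append("local")
    if track.title:
        sequence.append("cache-search")   # new: cross-album lookup before any network fetcher
    if track.trackid:
        sequence.append("spotify")
    if track.is_complete:
        sequence.append("lrclib")         # remaining fetchers are elided in this sketch
    return sequence

def should_check_cache(source: str, bypass_cache: bool) -> bool:
    """Per-source cache reads are skipped for fetchers that query the cache themselves."""
    return not bypass_cache and source not in _SELF_CACHED

track = Track(title="Song", trackid="spotify:track:xyz", is_complete=True)
print(build_sequence(track))                      # ['cache-search', 'spotify', 'lrclib']
print(should_check_cache("cache-search", False))  # False; it reads the cache itself
print(should_check_cache("spotify", False))       # True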
@@ -0,0 +1,65 @@
"""
Author: Uyanide pywang0608@foxmail.com
Date: 2026-03-28 05:57:46
Description: Cache-search fetcher — cross-album fuzzy lookup in the local cache
"""

"""
Searches existing cache entries by artist + title with fuzzy normalization,
ignoring album and source. Useful when the same track appears on different
albums or is played from different players.
"""

from typing import Optional
from loguru import logger

from .base import BaseFetcher
from ..models import TrackMeta, LyricResult, CacheStatus
from ..cache import CacheEngine


class CacheSearchFetcher(BaseFetcher):
    def __init__(self, cache: CacheEngine) -> None:
        self._cache = cache

    @property
    def source_name(self) -> str:
        return "cache-search"

    def fetch(self, track: TrackMeta) -> Optional[LyricResult]:
        if not track.title:
            logger.debug("Cache-search: skipped — no title")
            return None

        matches = self._cache.search_by_meta(
            artist=track.artist,
            title=track.title,
            length=track.length,
        )

        if not matches:
            logger.debug(f"Cache-search: no match for {track.display_name()}")
            return None

        # Pick best: prefer synced, then first available
        best = None
        for m in matches:
            if m.get("status") == CacheStatus.SUCCESS_SYNCED.value:
                best = m
                break
            if best is None:
                best = m

        if not best or not best.get("lyrics"):
            return None

        status = CacheStatus(best["status"])
        logger.info(
            f"Cache-search: hit from [{best.get('source')}] "
            f"album={best.get('album')!r} ({status.value})"
        )
        return LyricResult(
            status=status,
            lyrics=best["lyrics"],
            source=self.source_name,
        )
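Finally, a self-contained sketch of the "prefer synced, else first available" selection used by CacheSearchFetcher.fetch; the status strings again stand in for CacheStatus values.

# Self-contained sketch of the "prefer synced, else first available" pick in
# CacheSearchFetcher.fetch; status strings stand in for CacheStatus values.
from typing import Optional

def pick_best(matches: list[dict]) -> Optional[dict]:
    best: Optional[dict] = None
    for m in matches:
        if m.get("status") == "success_synced":
            return m              # first synced match wins outright
        if best is None:
            best = m              # otherwise remember the first candidate seen
    return best

matches = [
    {"status": "success_unsynced", "lyrics": "plain text", "source": "lrclib"},
    {"status": "success_synced", "lyrics": "[00:01.00] line", "source": "spotify"},
]
print(pick_best(matches)["source"])   # spotify; a later synced row beats an earlier unsynced one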