feat: implement cache-search fetcher for cross-album fuzzy lookup
This commit is contained in:
+103
-6
@@ -4,15 +4,33 @@ Date: 2026-03-25 10:18:03
|
|||||||
Description: SQLite-based lyric cache with per-source storage and TTL expiration
|
Description: SQLite-based lyric cache with per-source storage and TTL expiration
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
import re
|
||||||
import sqlite3
|
import sqlite3
|
||||||
import hashlib
|
import hashlib
|
||||||
import time
|
import time
|
||||||
|
import unicodedata
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
from loguru import logger
|
from loguru import logger
|
||||||
|
|
||||||
from .config import DB_PATH
|
from .config import DB_PATH, DURATION_TOLERANCE_MS
|
||||||
from .models import TrackMeta, LyricResult, CacheStatus
|
from .models import TrackMeta, LyricResult, CacheStatus
|
||||||
|
|
||||||
|
# Punctuation to strip for fuzzy matching.
# The pattern is applied only after NFKC normalization, so fullwidth ASCII
# variants have already been folded to their halfwidth forms; only marks
# that NFKC leaves untouched (【 】 。) need to be listed explicitly.
# The hyphen is escaped so it is a literal rather than a range operator
# (an unescaped "+-=" would also swallow digits and ",./:;<").
_PUNCT_RE = re.compile(
    r"""[~!@#$%^&*()_+\-=\[\]{}|;:'",.<>?/\\`【】。]"""
)
_SPACE_RE = re.compile(r"\s+")


def _normalize_for_match(s: str) -> str:
    """Normalize a string for fuzzy comparison.

    Applies NFKC normalization (fullwidth → halfwidth), lowercases, removes
    every character matched by ``_PUNCT_RE``, then collapses runs of
    whitespace to a single space and trims both ends.

    Digits and letters are preserved, so ``"Track 1"`` and ``"Track 2"``
    stay distinct.
    """
    folded = unicodedata.normalize("NFKC", s).lower()
    without_punct = _PUNCT_RE.sub("", folded)
    # Removing punctuation can leave doubled spaces ("a & b" -> "a  b").
    return _SPACE_RE.sub(" ", without_punct).strip()
|
||||||
|
|
||||||
|
|
||||||
def _generate_key(track: TrackMeta, source: str) -> str:
|
def _generate_key(track: TrackMeta, source: str) -> str:
|
||||||
"""Generate a unique cache key from track metadata and source.
|
"""Generate a unique cache key from track metadata and source.
|
||||||
@@ -64,9 +82,14 @@ class CacheEngine:
|
|||||||
expires_at INTEGER,
|
expires_at INTEGER,
|
||||||
artist TEXT,
|
artist TEXT,
|
||||||
title TEXT,
|
title TEXT,
|
||||||
album TEXT
|
album TEXT,
|
||||||
|
length INTEGER
|
||||||
)
|
)
|
||||||
""")
|
""")
|
||||||
|
# Migration: add length column if missing
|
||||||
|
cols = {r[1] for r in conn.execute("PRAGMA table_info(cache)").fetchall()}
|
||||||
|
if "length" not in cols:
|
||||||
|
conn.execute("ALTER TABLE cache ADD COLUMN length INTEGER")
|
||||||
conn.commit()
|
conn.commit()
|
||||||
|
|
||||||
# Read
|
# Read
|
||||||
@@ -83,7 +106,7 @@ class CacheEngine:
|
|||||||
|
|
||||||
with sqlite3.connect(self.db_path) as conn:
|
with sqlite3.connect(self.db_path) as conn:
|
||||||
row = conn.execute(
|
row = conn.execute(
|
||||||
"SELECT status, lyrics, source, expires_at FROM cache WHERE key = ?",
|
"SELECT status, lyrics, source, expires_at, length FROM cache WHERE key = ?",
|
||||||
(key,),
|
(key,),
|
||||||
).fetchone()
|
).fetchone()
|
||||||
|
|
||||||
@@ -91,7 +114,7 @@ class CacheEngine:
|
|||||||
logger.debug(f"Cache miss: {source} / {track.display_name()}")
|
logger.debug(f"Cache miss: {source} / {track.display_name()}")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
status_str, lyrics, src, expires_at = row
|
status_str, lyrics, src, expires_at, cached_length = row
|
||||||
|
|
||||||
# Check TTL expiration
|
# Check TTL expiration
|
||||||
if expires_at and expires_at < int(time.time()):
|
if expires_at and expires_at < int(time.time()):
|
||||||
@@ -100,6 +123,14 @@ class CacheEngine:
|
|||||||
conn.commit()
|
conn.commit()
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
# Backfill length if the cached row is missing it
|
||||||
|
if cached_length is None and track.length is not None:
|
||||||
|
conn.execute(
|
||||||
|
"UPDATE cache SET length = ? WHERE key = ?",
|
||||||
|
(track.length, key),
|
||||||
|
)
|
||||||
|
conn.commit()
|
||||||
|
|
||||||
remaining = expires_at - int(time.time()) if expires_at else None
|
remaining = expires_at - int(time.time()) if expires_at else None
|
||||||
logger.debug(
|
logger.debug(
|
||||||
f"Cache hit: {source} / {track.display_name()} "
|
f"Cache hit: {source} / {track.display_name()} "
|
||||||
@@ -152,8 +183,8 @@ class CacheEngine:
|
|||||||
conn.execute(
|
conn.execute(
|
||||||
"""INSERT OR REPLACE INTO cache
|
"""INSERT OR REPLACE INTO cache
|
||||||
(key, source, status, lyrics, created_at, expires_at,
|
(key, source, status, lyrics, created_at, expires_at,
|
||||||
artist, title, album)
|
artist, title, album, length)
|
||||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)""",
|
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""",
|
||||||
(
|
(
|
||||||
key,
|
key,
|
||||||
source,
|
source,
|
||||||
@@ -164,6 +195,7 @@ class CacheEngine:
|
|||||||
track.artist,
|
track.artist,
|
||||||
track.title,
|
track.title,
|
||||||
track.album,
|
track.album,
|
||||||
|
track.length,
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
conn.commit()
|
conn.commit()
|
||||||
@@ -226,6 +258,71 @@ class CacheEngine:
|
|||||||
params.append(track.album)
|
params.append(track.album)
|
||||||
return conditions, params
|
return conditions, params
|
||||||
|
|
||||||
|
# Fuzzy search
|
||||||
|
|
||||||
|
def search_by_meta(
    self,
    artist: Optional[str],
    title: Optional[str],
    length: Optional[int] = None,
) -> list[dict]:
    """Search cache for lyrics matching artist/title with fuzzy normalization.

    Ignores album and source. Only rows whose status is synced or unsynced
    and that have not expired are considered.

    Args:
        artist: Artist to match after normalization; when falsy, the artist
            is not checked.
        title: Title to match after normalization; required.
        length: Optional track duration. When given, rows whose cached
            length differs by more than ``DURATION_TOLERANCE_MS`` are
            dropped and the rest are ordered by closeness.

    Returns:
        A list of cache rows as plain dicts (one key per table column).
        With *length*, the order is: rows with a known length by ascending
        duration difference, then rows with no cached length; synced
        entries come first among ties. Empty when nothing matches.
    """
    if not title:
        return []

    norm_title = _normalize_for_match(title)
    if not norm_title:
        # A title made only of punctuation/whitespace normalizes to "",
        # which would equal every row with a NULL or punctuation-only
        # title. Report a miss instead of returning unrelated lyrics.
        return []
    norm_artist = _normalize_for_match(artist) if artist else None

    now = int(time.time())
    conn = sqlite3.connect(self.db_path)
    try:
        conn.row_factory = sqlite3.Row
        rows = conn.execute(
            """SELECT * FROM cache
            WHERE status IN (?, ?)
            AND (expires_at IS NULL OR expires_at > ?)""",
            (
                CacheStatus.SUCCESS_SYNCED.value,
                CacheStatus.SUCCESS_UNSYNCED.value,
                now,
            ),
        ).fetchall()
    finally:
        # sqlite3's connection context manager only wraps a transaction;
        # it does not close the connection, so close it explicitly.
        conn.close()

    matches: list[dict] = []
    for row in rows:
        entry = dict(row)
        # Title must match
        if _normalize_for_match(entry.get("title") or "") != norm_title:
            continue
        # Artist must match if provided
        if norm_artist and _normalize_for_match(entry.get("artist") or "") != norm_artist:
            continue
        matches.append(entry)

    # Duration filtering
    if length is not None and matches:
        synced = CacheStatus.SUCCESS_SYNCED.value
        ranked = []
        for entry in matches:
            cached_len = entry.get("length")
            if cached_len is None:
                # No duration info in cache — still a candidate, but it
                # sorts strictly after every row with a known length.
                rank = (1, 0)
            else:
                diff = abs(cached_len - length)
                if diff > DURATION_TOLERANCE_MS:
                    continue
                rank = (0, diff)
            ranked.append((rank + (entry.get("status") != synced,), entry))
        # Sort on the rank tuple only; dicts themselves are not orderable.
        ranked.sort(key=lambda item: item[0])
        matches = [entry for _, entry in ranked]

    return matches
|
||||||
|
|
||||||
# Query / inspect
|
# Query / inspect
|
||||||
|
|
||||||
def query_track(self, track: TrackMeta) -> list[dict]:
|
def query_track(self, track: TrackMeta) -> list[dict]:
|
||||||
|
|||||||
+13
-5
@@ -20,6 +20,7 @@ from .fetchers.lrclib_search import LrclibSearchFetcher
|
|||||||
from .fetchers.lrclib import LrclibFetcher
|
from .fetchers.lrclib import LrclibFetcher
|
||||||
from .fetchers.spotify import SpotifyFetcher
|
from .fetchers.spotify import SpotifyFetcher
|
||||||
from .fetchers.local import LocalFetcher
|
from .fetchers.local import LocalFetcher
|
||||||
|
from .fetchers.cache_search import CacheSearchFetcher
|
||||||
from .fetchers.base import BaseFetcher
|
from .fetchers.base import BaseFetcher
|
||||||
from .cache import CacheEngine
|
from .cache import CacheEngine
|
||||||
from .lrc import LRC_LINE_RE, normalize_tags
|
from .lrc import LRC_LINE_RE, normalize_tags
|
||||||
@@ -59,10 +60,14 @@ _STATUS_TTL: dict[CacheStatus, Optional[int]] = {
|
|||||||
class LrcManager:
|
class LrcManager:
|
||||||
"""Main entry point for fetching lyrics with caching."""
|
"""Main entry point for fetching lyrics with caching."""
|
||||||
|
|
||||||
|
# Fetchers that manage their own cache logic (skip per-source cache check)
|
||||||
|
_SELF_CACHED = frozenset({"cache-search"})
|
||||||
|
|
||||||
def __init__(self) -> None:
|
def __init__(self) -> None:
|
||||||
self.cache = CacheEngine()
|
self.cache = CacheEngine()
|
||||||
self.fetchers: dict[str, BaseFetcher] = {
|
self.fetchers: dict[str, BaseFetcher] = {
|
||||||
"local": LocalFetcher(),
|
"local": LocalFetcher(),
|
||||||
|
"cache-search": CacheSearchFetcher(self.cache),
|
||||||
"spotify": SpotifyFetcher(),
|
"spotify": SpotifyFetcher(),
|
||||||
"lrclib": LrclibFetcher(),
|
"lrclib": LrclibFetcher(),
|
||||||
"lrclib-search": LrclibSearchFetcher(),
|
"lrclib-search": LrclibSearchFetcher(),
|
||||||
@@ -82,6 +87,8 @@ class LrcManager:
|
|||||||
sequence: list[BaseFetcher] = []
|
sequence: list[BaseFetcher] = []
|
||||||
if track.is_local:
|
if track.is_local:
|
||||||
sequence.append(self.fetchers["local"])
|
sequence.append(self.fetchers["local"])
|
||||||
|
if track.title:
|
||||||
|
sequence.append(self.fetchers["cache-search"])
|
||||||
if track.trackid:
|
if track.trackid:
|
||||||
sequence.append(self.fetchers["spotify"])
|
sequence.append(self.fetchers["spotify"])
|
||||||
if track.is_complete:
|
if track.is_complete:
|
||||||
@@ -121,8 +128,8 @@ class LrcManager:
|
|||||||
for fetcher in sequence:
|
for fetcher in sequence:
|
||||||
source = fetcher.source_name
|
source = fetcher.source_name
|
||||||
|
|
||||||
# Cache check
|
# Cache check (skip for fetchers that handle their own caching)
|
||||||
if not bypass_cache:
|
if not bypass_cache and source not in self._SELF_CACHED:
|
||||||
cached = self.cache.get(track, source)
|
cached = self.cache.get(track, source)
|
||||||
if cached:
|
if cached:
|
||||||
if cached.status == CacheStatus.SUCCESS_SYNCED:
|
if cached.status == CacheStatus.SUCCESS_SYNCED:
|
||||||
@@ -163,9 +170,10 @@ class LrcManager:
|
|||||||
ttl=result.ttl,
|
ttl=result.ttl,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Cache the normalized result
|
# Cache the normalized result (skip for read-only fetchers)
|
||||||
ttl = result.ttl or _STATUS_TTL.get(result.status, TTL_NOT_FOUND)
|
if source not in self._SELF_CACHED:
|
||||||
self.cache.set(track, source, result, ttl_seconds=ttl)
|
ttl = result.ttl or _STATUS_TTL.get(result.status, TTL_NOT_FOUND)
|
||||||
|
self.cache.set(track, source, result, ttl_seconds=ttl)
|
||||||
|
|
||||||
# Evaluate result
|
# Evaluate result
|
||||||
if result.status == CacheStatus.SUCCESS_SYNCED:
|
if result.status == CacheStatus.SUCCESS_SYNCED:
|
||||||
|
|||||||
@@ -0,0 +1,65 @@
|
|||||||
|
"""
|
||||||
|
Author: Uyanide pywang0608@foxmail.com
|
||||||
|
Date: 2026-03-28 05:57:46
|
||||||
|
Description: Cache-search fetcher — cross-album fuzzy lookup in the local cache
|
||||||
|
"""
|
||||||
|
|
||||||
|
"""
|
||||||
|
Searches existing cache entries by artist + title with fuzzy normalization,
|
||||||
|
ignoring album and source. Useful when the same track appears on different
|
||||||
|
albums or is played from different players.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from typing import Optional
|
||||||
|
from loguru import logger
|
||||||
|
|
||||||
|
from .base import BaseFetcher
|
||||||
|
from ..models import TrackMeta, LyricResult, CacheStatus
|
||||||
|
from ..cache import CacheEngine
|
||||||
|
|
||||||
|
|
||||||
|
class CacheSearchFetcher(BaseFetcher):
    """Fetcher that reuses lyrics already stored in the local cache.

    Looks entries up by artist + title through ``CacheEngine.search_by_meta``
    and returns the best one re-labelled with this fetcher's source name.
    """

    def __init__(self, cache: CacheEngine) -> None:
        # Shared cache engine used for the lookup; this fetcher only reads.
        self._cache = cache

    @property
    def source_name(self) -> str:
        """Identifier under which this fetcher's results are reported."""
        return "cache-search"

    def fetch(self, track: TrackMeta) -> Optional[LyricResult]:
        """Return cached lyrics matching *track*, or ``None`` on a miss.

        The first synced entry that has lyric text is preferred; otherwise
        the first entry with lyric text is used. Entries with empty lyrics
        are skipped rather than ending the search.
        """
        if not track.title:
            logger.debug("Cache-search: skipped — no title")
            return None

        matches = self._cache.search_by_meta(
            artist=track.artist,
            title=track.title,
            length=track.length,
        )

        # An entry without lyric text cannot be served; drop it up front so
        # it cannot shadow a later entry that does have lyrics.
        usable = [m for m in matches if m.get("lyrics")]
        if not usable:
            logger.debug(f"Cache-search: no match for {track.display_name()}")
            return None

        # Pick best: prefer synced, then first available
        synced_value = CacheStatus.SUCCESS_SYNCED.value
        best = next(
            (m for m in usable if m.get("status") == synced_value),
            usable[0],
        )

        status = CacheStatus(best["status"])
        logger.info(
            f"Cache-search: hit from [{best.get('source')}] "
            f"album={best.get('album')!r} ({status.value})"
        )
        return LyricResult(
            status=status,
            lyrics=best["lyrics"],
            source=self.source_name,
        )
|
||||||
Reference in New Issue
Block a user