feat: enhance fuzzy matching and add artist normalization in cache.py
This commit is contained in:
+25
-4
@@ -15,11 +15,17 @@ from loguru import logger
|
|||||||
from .config import DB_PATH, DURATION_TOLERANCE_MS
|
from .config import DB_PATH, DURATION_TOLERANCE_MS
|
||||||
from .models import TrackMeta, LyricResult, CacheStatus
|
from .models import TrackMeta, LyricResult, CacheStatus
|
||||||
|
|
||||||
# Punctuation to strip for fuzzy matching (ASCII + common fullwidth)
|
# Punctuation to strip for fuzzy matching (ASCII + fullwidth + CJK brackets/symbols)
|
||||||
_PUNCT_RE = re.compile(
|
_PUNCT_RE = re.compile(
|
||||||
r"[~!@#$%^&*()_+\-=\[\]{}|;:'\",.<>?/\\`~!@#$%^&*()_+-=【】{}|;:'",。<>?/\`]"
|
r"[~!@#$%^&*()_+\-=\[\]{}|;:'\",.<>?/\\`"
|
||||||
|
r"~!@#$%^&*()_+-=【】{}|;:'",。<>?/\`"
|
||||||
|
r"「」『』《》〈〉〔〕·•‥…—–]"
|
||||||
)
|
)
|
||||||
_SPACE_RE = re.compile(r"\s+")
|
_SPACE_RE = re.compile(r"\s+")
|
||||||
|
# feat./ft./featuring and everything after (case-insensitive, word boundary)
|
||||||
|
_FEAT_RE = re.compile(r"\s*(?:\bfeat\.?\b|\bft\.?\b|\bfeaturing\b).*", re.IGNORECASE)
|
||||||
|
# Multi-artist separators: /, &, ;, ×, 、, vs., and a standalone x (word-bounded; surrounding whitespace is optional)
|
||||||
|
_ARTIST_SEP_RE = re.compile(r"\s*(?:[/&;×、]|\bvs\.?\b|\bx\b)\s*", re.IGNORECASE)
|
||||||
|
|
||||||
|
|
||||||
def _normalize_for_match(s: str) -> str:
|
def _normalize_for_match(s: str) -> str:
|
||||||
@@ -29,11 +35,26 @@ def _normalize_for_match(s: str) -> str:
|
|||||||
and collapses whitespace.
|
and collapses whitespace.
|
||||||
"""
|
"""
|
||||||
s = unicodedata.normalize("NFKC", s).lower()
|
s = unicodedata.normalize("NFKC", s).lower()
|
||||||
|
s = _FEAT_RE.sub("", s)
|
||||||
s = _PUNCT_RE.sub(" ", s)
|
s = _PUNCT_RE.sub(" ", s)
|
||||||
s = _SPACE_RE.sub(" ", s).strip()
|
s = _SPACE_RE.sub(" ", s).strip()
|
||||||
return s
|
return s
|
||||||
|
|
||||||
|
|
||||||
|
def _normalize_artist(s: str) -> str:
    """Build an order-independent match key for an artist credit.

    The credit is NFKC-folded and lowercased, then cut apart with
    ``_ARTIST_SEP_RE``. Splitting happens before any feat./ft./featuring
    suffix is removed, so a suffix only truncates the piece it sits in:
    'A feat. C / B' yields the pieces 'a' and 'b', not just 'a'.

    Pieces that are blank once their feat. suffix is stripped are skipped.
    The remaining pieces go through ``_normalize_for_match``, are
    de-duplicated, sorted, and joined with NUL characters. If no piece
    remains, the whole folded string is passed to ``_normalize_for_match``
    instead.
    """
    folded = unicodedata.normalize("NFKC", s).lower()

    unique_parts: set[str] = set()
    for piece in _ARTIST_SEP_RE.split(folded):
        if not _FEAT_RE.sub("", piece).strip():
            # Only whitespace or a feat. clause is left: not an artist name.
            continue
        unique_parts.add(_normalize_for_match(piece))

    if not unique_parts:
        return _normalize_for_match(folded)
    return "\0".join(sorted(unique_parts))
|
||||||
|
|
||||||
|
|
||||||
def _generate_key(track: TrackMeta, source: str) -> str:
|
def _generate_key(track: TrackMeta, source: str) -> str:
|
||||||
"""Generate a unique cache key from track metadata and source.
|
"""Generate a unique cache key from track metadata and source.
|
||||||
|
|
||||||
@@ -331,7 +352,7 @@ class CacheEngine:
|
|||||||
).fetchall()
|
).fetchall()
|
||||||
|
|
||||||
norm_title = _normalize_for_match(title)
|
norm_title = _normalize_for_match(title)
|
||||||
norm_artist = _normalize_for_match(artist) if artist else None
|
norm_artist = _normalize_artist(artist) if artist else None
|
||||||
|
|
||||||
matches: list[dict] = []
|
matches: list[dict] = []
|
||||||
for row in rows:
|
for row in rows:
|
||||||
@@ -343,7 +364,7 @@ class CacheEngine:
|
|||||||
# Artist must match if provided
|
# Artist must match if provided
|
||||||
if norm_artist:
|
if norm_artist:
|
||||||
row_artist = row_dict.get("artist") or ""
|
row_artist = row_dict.get("artist") or ""
|
||||||
if _normalize_for_match(row_artist) != norm_artist:
|
if _normalize_artist(row_artist) != norm_artist:
|
||||||
continue
|
continue
|
||||||
matches.append(row_dict)
|
matches.append(row_dict)
|
||||||
|
|
||||||
|
|||||||
@@ -157,6 +157,9 @@ requires-dist = [
|
|||||||
{ name = "typer", specifier = ">=0.24.1" },
|
{ name = "typer", specifier = ">=0.24.1" },
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[package.metadata.requires-dev]
|
||||||
|
dev = []
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "markdown-it-py"
|
name = "markdown-it-py"
|
||||||
version = "4.0.0"
|
version = "4.0.0"
|
||||||
|
|||||||
Reference in New Issue
Block a user