feat: enhance fuzzy matching and add artist normalization in cache.py

2026-03-28 07:35:29 +01:00
parent a74bf885a2
commit cf3fe3d00e
2 changed files with 29 additions and 5 deletions
@@ -15,11 +15,17 @@ from loguru import logger
 from .config import DB_PATH, DURATION_TOLERANCE_MS
 from .models import TrackMeta, LyricResult, CacheStatus

-# Punctuation to strip for fuzzy matching (ASCII + common fullwidth)
+# Punctuation to strip for fuzzy matching (ASCII + fullwidth + CJK brackets/symbols)
 _PUNCT_RE = re.compile(
-    r"[~!@#$%^&*()_+\-=\[\]{}|;:'\",.<>?/\\`～！＠＃＄％＾＆＊（）＿＋－＝【】｛｝｜；：＇＂，。＜＞？／＼｀]"
+    r"[~!@#$%^&*()_+\-=\[\]{}|;:'\",.<>?/\\`"
+    r"～！＠＃＄％＾＆＊（）＿＋－＝【】｛｝｜；：＇＂，。＜＞？／＼｀"
+    r"「」『』《》〈〉〔〕·•‥…—–]"
 )
 _SPACE_RE = re.compile(r"\s+")
+# feat./ft./featuring and everything after (case-insensitive, word boundary)
+_FEAT_RE = re.compile(r"\s*(?:\bfeat\.?\b|\bft\.?\b|\bfeaturing\b).*", re.IGNORECASE)
+# Multi-artist separators: /, &, ×, x (surrounded by spaces), ;, 、, vs.
+_ARTIST_SEP_RE = re.compile(r"\s*(?:[/&;×、]|\bvs\.?\b|\bx\b)\s*", re.IGNORECASE)


 def _normalize_for_match(s: str) -> str:
@@ -29,11 +35,26 @@ def _normalize_for_match(s: str) -> str:
    and collapses whitespace.
    """
    s = unicodedata.normalize("NFKC", s).lower()
-    s = _PUNCT_RE.sub("", s)
+    s = _FEAT_RE.sub("", s)
+    s = _PUNCT_RE.sub(" ", s)
    s = _SPACE_RE.sub(" ", s).strip()
    return s


+def _normalize_artist(s: str) -> str:
+    """Normalize an artist string: split by separators, normalize each, sort.
+
+    Splits first (on /, &, ;, ×, 、, vs., x), then strips feat./ft./featuring
+    from each part individually, so 'A feat. C / B' → ['a', 'b'] not just ['a'].
+    """
+    s = unicodedata.normalize("NFKC", s).lower()
+    parts = _ARTIST_SEP_RE.split(s)
+    normed = sorted(
+        {_normalize_for_match(p) for p in parts if _FEAT_RE.sub("", p).strip()}
+    )
+    return "\0".join(normed) if normed else _normalize_for_match(s)
+
+
 def _generate_key(track: TrackMeta, source: str) -> str:
    """Generate a unique cache key from track metadata and source.

@@ -331,7 +352,7 @@ class CacheEngine:
            ).fetchall()

        norm_title = _normalize_for_match(title)
-        norm_artist = _normalize_for_match(artist) if artist else None
+        norm_artist = _normalize_artist(artist) if artist else None

        matches: list[dict] = []
        for row in rows:
@@ -343,7 +364,7 @@ class CacheEngine:
            # Artist must match if provided
            if norm_artist:
                row_artist = row_dict.get("artist") or ""
-                if _normalize_for_match(row_artist) != norm_artist:
+                if _normalize_artist(row_artist) != norm_artist:
                    continue
            matches.append(row_dict)