48 lines
1.7 KiB
Python
48 lines
1.7 KiB
Python
"""
|
||
Shared text normalization utilities for fuzzy matching.
|
||
|
||
Used by cache key generation, cache search, and candidate selection scoring.
|
||
"""
|
||
|
||
import re
|
||
import unicodedata
|
||
|
||
# Punctuation to strip for fuzzy matching (ASCII + fullwidth + CJK brackets/symbols)
|
||
_PUNCT_RE = re.compile(
|
||
r"[~!@#$%^&*()_+\-=\[\]{}|;:'\",.<>?/\\`"
|
||
r"~!@#$%^&*()_+-=【】{}|;:'",。<>?/\`"
|
||
r"「」『』《》〈〉〔〕·•‥…—–]"
|
||
)
|
||
_SPACE_RE = re.compile(r"\s+")
|
||
# feat./ft./featuring and everything after (case-insensitive, word boundary)
|
||
_FEAT_RE = re.compile(r"\s*(?:\bfeat\.?\b|\bft\.?\b|\bfeaturing\b).*", re.IGNORECASE)
|
||
# Multi-artist separators: /, &, ×, x (surrounded by spaces), ;, 、, vs.
|
||
_ARTIST_SEP_RE = re.compile(r"\s*(?:[/&;×、]|\bvs\.?\b|\bx\b)\s*", re.IGNORECASE)
|
||
|
||
|
||
def normalize_for_match(s: str) -> str:
|
||
"""Normalize a string for fuzzy comparison.
|
||
|
||
Lowercases, NFKC-normalizes (fullwidth → halfwidth), strips punctuation,
|
||
and collapses whitespace.
|
||
"""
|
||
s = unicodedata.normalize("NFKC", s).lower()
|
||
s = _FEAT_RE.sub("", s)
|
||
s = _PUNCT_RE.sub(" ", s)
|
||
s = _SPACE_RE.sub(" ", s).strip()
|
||
return s
|
||
|
||
|
||
def normalize_artist(s: str) -> str:
|
||
"""Normalize an artist string: split by separators, normalize each, sort.
|
||
|
||
Splits first (on /, &, ;, ×, 、, vs., x), then strips feat./ft./featuring
|
||
from each part individually, so 'A feat. C / B' → ['a', 'b'] not just ['a'].
|
||
"""
|
||
s = unicodedata.normalize("NFKC", s).lower()
|
||
parts = _ARTIST_SEP_RE.split(s)
|
||
normed = sorted(
|
||
{normalize_for_match(p) for p in parts if _FEAT_RE.sub("", p).strip()}
|
||
)
|
||
return "\0".join(normed) if normed else normalize_for_match(s)
|