""" Shared candidate-selection logic for search-based fetchers. Each fetcher maps its API-specific results to SearchCandidate, then calls select_best() which scores candidates by metadata similarity, duration proximity, and sync status. """ from dataclasses import dataclass from typing import Generic, Optional, TypeVar from ..config import ( DURATION_TOLERANCE_MS, MULTI_CANDIDATE_LIMIT, SCORE_W_TITLE as _W_TITLE, SCORE_W_ARTIST as _W_ARTIST, SCORE_W_ALBUM as _W_ALBUM, SCORE_W_DURATION as _W_DURATION, SCORE_W_SYNCED as _W_SYNCED, MIN_CONFIDENCE, ) from ..normalize import normalize_for_match, normalize_artist T = TypeVar("T") @dataclass class SearchCandidate(Generic[T]): """A normalized search result for best-match selection. Attributes: item: The original API-specific object (dict, ID, etc.) duration_ms: Track duration in milliseconds, or None if unknown. is_synced: Whether this candidate is known to have synced lyrics. title: Candidate track title for similarity scoring. artist: Candidate artist name for similarity scoring. album: Candidate album name for similarity scoring. """ item: T duration_ms: Optional[float] = None is_synced: bool = False title: Optional[str] = None artist: Optional[str] = None album: Optional[str] = None def _text_similarity(a: str, b: str) -> float: """Compare two normalized strings. Returns 0.0-1.0.""" if a == b: return 1.0 if not a or not b: return 0.0 # Containment: one is a substring of the other (e.g. "My Love" vs "My Love (Album Version)") if a in b or b in a: return min(len(a), len(b)) / max(len(a), len(b)) return 0.0 def _score_candidate( c: SearchCandidate[T], ref_title: Optional[str], ref_artist: Optional[str], ref_album: Optional[str], ref_length_ms: Optional[int], ) -> float: """Score a candidate from 0-100 based on metadata match quality. Scoring works in two tiers: 1. **Metadata score** — computed from fields available on *both* sides, then rescaled to fill the 0-90 range so that missing fields don't inflate the score. Fields missing on both sides are simply excluded from the calculation (neutral). Fields present on only one side contribute 0 to the numerator but their weight still counts in the denominator (penalty for asymmetric absence). 2. **Synced bonus** — a flat 10 pts, always applied independently. Field weights (before rescaling): - Title: 40 - Artist: 30 - Album: 10 - Duration: 10 (only when reference track has duration; hard mismatch is pre-filtered before scoring) """ raw = 0.0 available_weight = 0.0 # Title if ref_title is not None or c.title is not None: available_weight += _W_TITLE if ref_title is not None and c.title is not None: raw += _W_TITLE * _text_similarity( normalize_for_match(ref_title), normalize_for_match(c.title) ) # else both None → excluded # Artist if ref_artist is not None or c.artist is not None: available_weight += _W_ARTIST if ref_artist is not None and c.artist is not None: na = normalize_artist(ref_artist) nb = normalize_artist(c.artist) if na == nb: raw += _W_ARTIST else: raw += _W_ARTIST * _text_similarity( normalize_for_match(ref_artist), normalize_for_match(c.artist) ) # Album if ref_album is not None or c.album is not None: available_weight += _W_ALBUM if ref_album is not None and c.album is not None: raw += _W_ALBUM * _text_similarity( normalize_for_match(ref_album), normalize_for_match(c.album) ) # Duration — only counted when the reference track has duration. # If the candidate also has duration, it contributes positively when matching # (hard mismatch is already filtered upstream in select_best). # If the candidate lacks duration, it contributes 0 to raw but still counts # in available_weight (penalty for missing verifiable info). # If the reference has no duration, duration is excluded entirely (neutral). if ref_length_ms is not None: available_weight += _W_DURATION if c.duration_ms is not None: diff = abs(c.duration_ms - ref_length_ms) if diff <= DURATION_TOLERANCE_MS: raw += _W_DURATION * (1.0 - diff / DURATION_TOLERANCE_MS) # Rescale metadata to 0-90 range _MAX_METADATA = _W_TITLE + _W_ARTIST + _W_ALBUM + _W_DURATION # 90 if available_weight > 0: metadata_score = (raw / available_weight) * _MAX_METADATA else: # No comparable fields at all — only synced bonus matters metadata_score = 0.0 # Synced bonus (always 10 pts, independent of metadata) synced_score = _W_SYNCED if c.is_synced else 0.0 return metadata_score + synced_score def select_ranked( candidates: list[SearchCandidate[T]], track_length_ms: Optional[int] = None, *, title: Optional[str] = None, artist: Optional[str] = None, album: Optional[str] = None, min_confidence: float = MIN_CONFIDENCE, max_results: int = MULTI_CANDIDATE_LIMIT, ) -> list[tuple[T, float]]: """Score candidates and return top max_results above min_confidence, sorted by score descending.""" scored: list[tuple[T, float]] = [] for c in candidates: if ( track_length_ms is not None and c.duration_ms is not None and abs(c.duration_ms - track_length_ms) > DURATION_TOLERANCE_MS ): continue s = _score_candidate(c, title, artist, album, track_length_ms) if s >= min_confidence: scored.append((c.item, s)) scored.sort(key=lambda x: x[1], reverse=True) return scored[:max_results] def select_best( candidates: list[SearchCandidate[T]], track_length_ms: Optional[int] = None, *, title: Optional[str] = None, artist: Optional[str] = None, album: Optional[str] = None, min_confidence: float = MIN_CONFIDENCE, ) -> tuple[Optional[T], float]: """Pick the best candidate by confidence scoring. Returns (item, score). Item is None if no candidate scores above min_confidence. """ if not candidates: return None, 0.0 best_item: Optional[T] = None best_score = -1.0 for c in candidates: # Hard duration filter: both sides have duration but they don't match → skip. if ( track_length_ms is not None and c.duration_ms is not None and abs(c.duration_ms - track_length_ms) > DURATION_TOLERANCE_MS ): continue s = _score_candidate(c, title, artist, album, track_length_ms) if s > best_score: best_score = s best_item = c.item if best_score < min_confidence: return None, best_score return best_item, best_score