chore: switch to src layout
This commit is contained in:
@@ -0,0 +1,211 @@
|
||||
"""
|
||||
Author: Uyanide pywang0608@foxmail.com
|
||||
Date: 2026-04-04 11:32:23
|
||||
Description: Shared candidate-selection logic for search-based fetchers.
|
||||
|
||||
Each fetcher maps its API-specific results to SearchCandidate, then calls
|
||||
select_best() which scores candidates by metadata similarity, duration
|
||||
proximity, and sync status.
|
||||
"""
|
||||
|
||||
from dataclasses import dataclass
|
||||
from typing import Generic, Optional, TypeVar
|
||||
|
||||
from ..config import (
|
||||
DURATION_TOLERANCE_MS,
|
||||
MULTI_CANDIDATE_LIMIT,
|
||||
SCORE_W_TITLE as _W_TITLE,
|
||||
SCORE_W_ARTIST as _W_ARTIST,
|
||||
SCORE_W_ALBUM as _W_ALBUM,
|
||||
SCORE_W_DURATION as _W_DURATION,
|
||||
SCORE_W_SYNCED as _W_SYNCED,
|
||||
MIN_CONFIDENCE,
|
||||
)
|
||||
from ..normalize import normalize_for_match, normalize_artist
|
||||
|
||||
# Fetcher-specific payload type carried through candidate selection
# (the "original API-specific object (dict, ID, etc.)" held by
# SearchCandidate.item and returned from select_best/select_ranked).
T = TypeVar("T")
|
||||
|
||||
|
||||
@dataclass
class SearchCandidate(Generic[T]):
    """A normalized search result for best-match selection.

    Each fetcher maps its API-specific search hits into this shape, then
    passes the list to select_best()/select_ranked() for scoring.

    Attributes:
        item: The original API-specific object (dict, ID, etc.)
        duration_ms: Track duration in milliseconds, or None if unknown.
        is_synced: Whether this candidate is known to have synced lyrics.
        title: Candidate track title for similarity scoring.
        artist: Candidate artist name for similarity scoring.
        album: Candidate album name for similarity scoring.
    """

    # Opaque payload handed back to the caller on selection.
    item: T
    # None = unknown; only a present value participates in the hard
    # duration filter and the duration score (see _score_candidate).
    duration_ms: Optional[float] = None
    # Earns the flat synced bonus (_W_SYNCED) in _score_candidate.
    is_synced: bool = False
    # For the metadata fields, None means "unavailable": excluded from
    # scoring when missing on both sides, penalized when only one side
    # has it (see _score_candidate's two-tier scoring).
    title: Optional[str] = None
    artist: Optional[str] = None
    album: Optional[str] = None
|
||||
|
||||
|
||||
def _text_similarity(a: str, b: str) -> float:
|
||||
"""Compare two normalized strings. Returns 0.0-1.0."""
|
||||
if a == b:
|
||||
return 1.0
|
||||
if not a or not b:
|
||||
return 0.0
|
||||
# Containment: one is a substring of the other (e.g. "My Love" vs "My Love (Album Version)")
|
||||
if a in b or b in a:
|
||||
return min(len(a), len(b)) / max(len(a), len(b))
|
||||
return 0.0
|
||||
|
||||
|
||||
def _score_candidate(
    c: SearchCandidate[T],
    ref_title: Optional[str],
    ref_artist: Optional[str],
    ref_album: Optional[str],
    ref_length_ms: Optional[int],
) -> float:
    """Score a candidate from 0-100 based on metadata match quality.

    Scoring works in two tiers:

    1. **Metadata score** — computed from fields available on *both* sides,
       then rescaled to fill the 0-90 range so that missing fields don't
       inflate the score. Fields missing on both sides are simply excluded
       from the calculation (neutral). Fields present on only one side
       contribute 0 to the numerator but their weight still counts in the
       denominator (penalty for asymmetric absence).

    2. **Synced bonus** — a flat 10 pts, always applied independently.

    Field weights (before rescaling):
    - Title: 40
    - Artist: 30
    - Album: 10
    - Duration: 10 (only when reference track has duration; hard mismatch is
      pre-filtered before scoring)
    """
    earned = 0.0
    weight_in_play = 0.0

    # --- Title ---------------------------------------------------------
    if not (ref_title is None and c.title is None):
        weight_in_play += _W_TITLE
        if ref_title is not None and c.title is not None:
            earned += _W_TITLE * _text_similarity(
                normalize_for_match(ref_title), normalize_for_match(c.title)
            )
        # one-sided presence: weight counted, nothing earned

    # --- Artist --------------------------------------------------------
    if not (ref_artist is None and c.artist is None):
        weight_in_play += _W_ARTIST
        if ref_artist is not None and c.artist is not None:
            # Artist gets a stricter normalizer first; fall back to the
            # generic text similarity when the strict forms differ.
            if normalize_artist(ref_artist) == normalize_artist(c.artist):
                earned += _W_ARTIST
            else:
                earned += _W_ARTIST * _text_similarity(
                    normalize_for_match(ref_artist), normalize_for_match(c.artist)
                )

    # --- Album ---------------------------------------------------------
    if not (ref_album is None and c.album is None):
        weight_in_play += _W_ALBUM
        if ref_album is not None and c.album is not None:
            earned += _W_ALBUM * _text_similarity(
                normalize_for_match(ref_album), normalize_for_match(c.album)
            )

    # --- Duration ------------------------------------------------------
    # Counted only when the reference track has a duration. A candidate
    # with duration earns points linearly as the gap shrinks (hard
    # mismatch is already filtered upstream in select_best); a candidate
    # without duration earns 0 but still pays the weight (penalty for
    # missing verifiable info). No reference duration → field excluded.
    if ref_length_ms is not None:
        weight_in_play += _W_DURATION
        if c.duration_ms is not None:
            delta = abs(c.duration_ms - ref_length_ms)
            if delta <= DURATION_TOLERANCE_MS:
                earned += _W_DURATION * (1.0 - delta / DURATION_TOLERANCE_MS)

    # Rescale the comparable fields to fill the 0-90 metadata range.
    max_metadata = _W_TITLE + _W_ARTIST + _W_ALBUM + _W_DURATION  # 90
    if weight_in_play > 0:
        metadata_score = (earned / weight_in_play) * max_metadata
    else:
        # Nothing comparable at all — only the synced bonus can score.
        metadata_score = 0.0

    # Flat synced bonus, independent of metadata quality.
    return metadata_score + (_W_SYNCED if c.is_synced else 0.0)
|
||||
|
||||
|
||||
def select_ranked(
    candidates: list[SearchCandidate[T]],
    track_length_ms: Optional[int] = None,
    *,
    title: Optional[str] = None,
    artist: Optional[str] = None,
    album: Optional[str] = None,
    min_confidence: float = MIN_CONFIDENCE,
    max_results: int = MULTI_CANDIDATE_LIMIT,
) -> list[tuple[T, float]]:
    """Score candidates and return the top max_results above min_confidence.

    Candidates whose duration is known on both sides but differs from the
    reference by more than DURATION_TOLERANCE_MS are dropped before
    scoring. The result is a list of (item, score) pairs sorted by score
    descending, truncated to max_results.
    """

    def _passes_duration(cand: SearchCandidate[T]) -> bool:
        # Hard filter: applies only when both sides know the duration.
        if track_length_ms is None or cand.duration_ms is None:
            return True
        return abs(cand.duration_ms - track_length_ms) <= DURATION_TOLERANCE_MS

    ranked: list[tuple[T, float]] = []
    for cand in candidates:
        if not _passes_duration(cand):
            continue
        score = _score_candidate(cand, title, artist, album, track_length_ms)
        if score >= min_confidence:
            ranked.append((cand.item, score))

    return sorted(ranked, key=lambda pair: pair[1], reverse=True)[:max_results]
|
||||
|
||||
|
||||
def select_best(
    candidates: list[SearchCandidate[T]],
    track_length_ms: Optional[int] = None,
    *,
    title: Optional[str] = None,
    artist: Optional[str] = None,
    album: Optional[str] = None,
    min_confidence: float = MIN_CONFIDENCE,
) -> tuple[Optional[T], float]:
    """Pick the best candidate by confidence scoring.

    Candidates whose duration is known on both sides but differs from the
    reference by more than DURATION_TOLERANCE_MS are skipped before scoring.

    Returns (item, score). Item is None if no candidate scores above
    min_confidence. The score is the best score seen, or 0.0 when the
    candidate list is empty or every candidate was rejected by the
    duration filter.
    """
    if not candidates:
        return None, 0.0

    best_item: Optional[T] = None
    best_score = 0.0
    # Fix: track "have we scored anything?" explicitly instead of using a
    # -1.0 sentinel for best_score. The sentinel leaked out of the
    # function when every candidate was filtered by duration, returning a
    # negative score inconsistent with both the documented 0-100 range
    # and the 0.0 returned for an empty candidate list.
    scored_any = False

    for c in candidates:
        # Hard duration filter: both sides have duration but they don't match → skip.
        if (
            track_length_ms is not None
            and c.duration_ms is not None
            and abs(c.duration_ms - track_length_ms) > DURATION_TOLERANCE_MS
        ):
            continue

        s = _score_candidate(c, title, artist, album, track_length_ms)
        if not scored_any or s > best_score:
            scored_any = True
            best_score = s
            best_item = c.item

    if best_score < min_confidence:
        return None, best_score

    return best_item, best_score
|
||||
Reference in New Issue
Block a user