chore: switch to src layout
This commit is contained in:
@@ -0,0 +1,211 @@
|
||||
"""
|
||||
Author: Uyanide pywang0608@foxmail.com
|
||||
Date: 2026-04-04 11:32:23
|
||||
Description: Shared candidate-selection logic for search-based fetchers.
|
||||
|
||||
Each fetcher maps its API-specific results to SearchCandidate, then calls
|
||||
select_best() which scores candidates by metadata similarity, duration
|
||||
proximity, and sync status.
|
||||
"""
|
||||
|
||||
from dataclasses import dataclass
|
||||
from typing import Generic, Optional, TypeVar
|
||||
|
||||
from ..config import (
|
||||
DURATION_TOLERANCE_MS,
|
||||
MULTI_CANDIDATE_LIMIT,
|
||||
SCORE_W_TITLE as _W_TITLE,
|
||||
SCORE_W_ARTIST as _W_ARTIST,
|
||||
SCORE_W_ALBUM as _W_ALBUM,
|
||||
SCORE_W_DURATION as _W_DURATION,
|
||||
SCORE_W_SYNCED as _W_SYNCED,
|
||||
MIN_CONFIDENCE,
|
||||
)
|
||||
from ..normalize import normalize_for_match, normalize_artist
|
||||
|
||||
# Fetcher-specific payload type carried through candidate selection
# (the "original API-specific object (dict, ID, etc.)" held by
# SearchCandidate.item and returned from select_best/select_ranked).
T = TypeVar("T")
|
||||
|
||||
|
||||
@dataclass
class SearchCandidate(Generic[T]):
    """A normalized search result for best-match selection.

    Each fetcher maps its API-specific search hits into this shape, then
    passes the list to select_best()/select_ranked() for scoring.

    Attributes:
        item: The original API-specific object (dict, ID, etc.)
        duration_ms: Track duration in milliseconds, or None if unknown.
        is_synced: Whether this candidate is known to have synced lyrics.
        title: Candidate track title for similarity scoring.
        artist: Candidate artist name for similarity scoring.
        album: Candidate album name for similarity scoring.
    """

    # Opaque payload handed back to the caller on selection.
    item: T
    # None = unknown; only a present value participates in the hard
    # duration filter and the duration score (see _score_candidate).
    duration_ms: Optional[float] = None
    # Earns the flat synced bonus (_W_SYNCED) in _score_candidate.
    is_synced: bool = False
    # For the metadata fields, None means "unavailable": excluded from
    # scoring when missing on both sides, penalized when only one side
    # has it (see _score_candidate's two-tier scoring).
    title: Optional[str] = None
    artist: Optional[str] = None
    album: Optional[str] = None
|
||||
|
||||
|
||||
def _text_similarity(a: str, b: str) -> float:
|
||||
"""Compare two normalized strings. Returns 0.0-1.0."""
|
||||
if a == b:
|
||||
return 1.0
|
||||
if not a or not b:
|
||||
return 0.0
|
||||
# Containment: one is a substring of the other (e.g. "My Love" vs "My Love (Album Version)")
|
||||
if a in b or b in a:
|
||||
return min(len(a), len(b)) / max(len(a), len(b))
|
||||
return 0.0
|
||||
|
||||
|
||||
def _score_candidate(
    c: SearchCandidate[T],
    ref_title: Optional[str],
    ref_artist: Optional[str],
    ref_album: Optional[str],
    ref_length_ms: Optional[int],
) -> float:
    """Score a candidate from 0-100 based on metadata match quality.

    Scoring works in two tiers:

    1. **Metadata score** — computed from fields available on *both* sides,
       then rescaled to fill the 0-90 range so that missing fields don't
       inflate the score. Fields missing on both sides are simply excluded
       from the calculation (neutral). Fields present on only one side
       contribute 0 to the numerator but their weight still counts in the
       denominator (penalty for asymmetric absence).

    2. **Synced bonus** — a flat 10 pts, always applied independently.

    Field weights (before rescaling):
    - Title: 40
    - Artist: 30
    - Album: 10
    - Duration: 10 (only when reference track has duration; hard mismatch is
      pre-filtered before scoring)
    """
    earned = 0.0
    weight_in_play = 0.0

    # --- Title ---------------------------------------------------------
    if not (ref_title is None and c.title is None):
        weight_in_play += _W_TITLE
        if ref_title is not None and c.title is not None:
            earned += _W_TITLE * _text_similarity(
                normalize_for_match(ref_title), normalize_for_match(c.title)
            )
        # one-sided presence: weight counted, nothing earned

    # --- Artist --------------------------------------------------------
    if not (ref_artist is None and c.artist is None):
        weight_in_play += _W_ARTIST
        if ref_artist is not None and c.artist is not None:
            # Artist gets a stricter normalizer first; fall back to the
            # generic text similarity when the strict forms differ.
            if normalize_artist(ref_artist) == normalize_artist(c.artist):
                earned += _W_ARTIST
            else:
                earned += _W_ARTIST * _text_similarity(
                    normalize_for_match(ref_artist), normalize_for_match(c.artist)
                )

    # --- Album ---------------------------------------------------------
    if not (ref_album is None and c.album is None):
        weight_in_play += _W_ALBUM
        if ref_album is not None and c.album is not None:
            earned += _W_ALBUM * _text_similarity(
                normalize_for_match(ref_album), normalize_for_match(c.album)
            )

    # --- Duration ------------------------------------------------------
    # Counted only when the reference track has a duration. A candidate
    # with duration earns points linearly as the gap shrinks (hard
    # mismatch is already filtered upstream in select_best); a candidate
    # without duration earns 0 but still pays the weight (penalty for
    # missing verifiable info). No reference duration → field excluded.
    if ref_length_ms is not None:
        weight_in_play += _W_DURATION
        if c.duration_ms is not None:
            delta = abs(c.duration_ms - ref_length_ms)
            if delta <= DURATION_TOLERANCE_MS:
                earned += _W_DURATION * (1.0 - delta / DURATION_TOLERANCE_MS)

    # Rescale the comparable fields to fill the 0-90 metadata range.
    max_metadata = _W_TITLE + _W_ARTIST + _W_ALBUM + _W_DURATION  # 90
    if weight_in_play > 0:
        metadata_score = (earned / weight_in_play) * max_metadata
    else:
        # Nothing comparable at all — only the synced bonus can score.
        metadata_score = 0.0

    # Flat synced bonus, independent of metadata quality.
    return metadata_score + (_W_SYNCED if c.is_synced else 0.0)
|
||||
|
||||
|
||||
def select_ranked(
    candidates: list[SearchCandidate[T]],
    track_length_ms: Optional[int] = None,
    *,
    title: Optional[str] = None,
    artist: Optional[str] = None,
    album: Optional[str] = None,
    min_confidence: float = MIN_CONFIDENCE,
    max_results: int = MULTI_CANDIDATE_LIMIT,
) -> list[tuple[T, float]]:
    """Score candidates and return the top max_results above min_confidence.

    Candidates whose duration is known on both sides but differs from the
    reference by more than DURATION_TOLERANCE_MS are dropped before
    scoring. The result is a list of (item, score) pairs sorted by score
    descending, truncated to max_results.
    """

    def _passes_duration(cand: SearchCandidate[T]) -> bool:
        # Hard filter: applies only when both sides know the duration.
        if track_length_ms is None or cand.duration_ms is None:
            return True
        return abs(cand.duration_ms - track_length_ms) <= DURATION_TOLERANCE_MS

    ranked: list[tuple[T, float]] = []
    for cand in candidates:
        if not _passes_duration(cand):
            continue
        score = _score_candidate(cand, title, artist, album, track_length_ms)
        if score >= min_confidence:
            ranked.append((cand.item, score))

    return sorted(ranked, key=lambda pair: pair[1], reverse=True)[:max_results]
|
||||
|
||||
|
||||
def select_best(
    candidates: list[SearchCandidate[T]],
    track_length_ms: Optional[int] = None,
    *,
    title: Optional[str] = None,
    artist: Optional[str] = None,
    album: Optional[str] = None,
    min_confidence: float = MIN_CONFIDENCE,
) -> tuple[Optional[T], float]:
    """Pick the best candidate by confidence scoring.

    Candidates whose duration is known on both sides but differs from the
    reference by more than DURATION_TOLERANCE_MS are skipped before scoring.

    Returns (item, score). Item is None if no candidate scores above
    min_confidence. The score is the best score seen, or 0.0 when the
    candidate list is empty or every candidate was rejected by the
    duration filter.
    """
    if not candidates:
        return None, 0.0

    best_item: Optional[T] = None
    best_score = 0.0
    # Fix: track "have we scored anything?" explicitly instead of using a
    # -1.0 sentinel for best_score. The sentinel leaked out of the
    # function when every candidate was filtered by duration, returning a
    # negative score inconsistent with both the documented 0-100 range
    # and the 0.0 returned for an empty candidate list.
    scored_any = False

    for c in candidates:
        # Hard duration filter: both sides have duration but they don't match → skip.
        if (
            track_length_ms is not None
            and c.duration_ms is not None
            and abs(c.duration_ms - track_length_ms) > DURATION_TOLERANCE_MS
        ):
            continue

        s = _score_candidate(c, title, artist, album, track_length_ms)
        if not scored_any or s > best_score:
            scored_any = True
            best_score = s
            best_item = c.item

    if best_score < min_confidence:
        return None, best_score

    return best_item, best_score
|
||||
Reference in New Issue
Block a user