# Source file: lrx-cli/tests/test_selection.py (Python, 513 lines, 16 KiB)

from __future__ import annotations
from lrx_cli.fetchers.selection import (
SearchCandidate,
select_best,
select_ranked,
_score_candidate,
_text_similarity,
MIN_CONFIDENCE,
)
def test_text_similarity_exact() -> None:
    """Identical strings must yield a similarity of exactly 1.0."""
    text = "my love"
    assert _text_similarity(text, text) == 1.0
def test_text_similarity_empty() -> None:
    """An empty string on either side must yield a similarity of 0.0."""
    other = "anything"
    assert _text_similarity("", other) == 0.0
    assert _text_similarity(other, "") == 0.0
def test_text_similarity_no_overlap() -> None:
    """Two unrelated words ("hello" / "world") must yield a similarity of 0.0."""
    left, right = "hello", "world"
    assert _text_similarity(left, right) == 0.0
def test_text_similarity_containment() -> None:
    """A string contained in a longer one scores the ratio of their lengths.

    "my love" is a prefix of "my love album version"; the expected score is
    strictly between 0 and 1 and equals len(short) / len(long).
    """
    short, longer = "my love", "my love album version"
    score = _text_similarity(short, longer)
    assert 0.0 < score < 1.0
    assert score == len(short) / len(longer)
def test_score_perfect_match() -> None:
    """Identical title, artist, album and duration must score exactly 100."""
    title, artist, album, length_ms = "My Love", "Westlife", "Coast To Coast", 232000
    candidate = SearchCandidate(
        item="x",
        title=title,
        artist=artist,
        album=album,
        duration_ms=232000.0,
        is_synced=True,
    )
    assert _score_candidate(candidate, title, artist, album, length_ms) == 100.0
def test_score_no_metadata_match() -> None:
    """A candidate with unrelated metadata must score below MIN_CONFIDENCE."""
    unrelated = SearchCandidate(
        item="x",
        title="Let My Love Be Your Pillow (Live)",
        artist="Ronnie Milsap",
        album="The Essential Ronnie Milsap",
        duration_ms=192000.0,
        is_synced=True,
    )
    result = _score_candidate(
        unrelated, "My Love", "Westlife", "Coast To Coast", 232000
    )
    assert result < MIN_CONFIDENCE
def test_score_missing_both_sides_neutral() -> None:
    """With no reference metadata and a bare candidate, the score is exactly 10.

    NOTE(review): the original comment attributed these 10 points to a
    "synced" bonus, yet test_synced_state_does_not_affect_score asserts that
    is_synced does not change the score. Presumably this is a constant
    baseline -- confirm against _score_candidate.
    """
    bare = SearchCandidate(item="x", is_synced=True)
    # Nothing is comparable on either side, so only the fixed 10 points remain.
    assert _score_candidate(bare, None, None, None, None) == 10.0
def test_score_missing_one_side_gives_zero_for_field() -> None:
    """A reference title with no candidate title contributes nothing to the score.

    Only the title is supplied by the reference and the candidate lacks it, so
    the expected total is the same 10.0 obtained when nothing is comparable.

    NOTE(review): the original comment called the remaining 10 points a
    "synced" bonus; test_synced_state_does_not_affect_score suggests it is a
    constant baseline instead -- confirm against _score_candidate.
    """
    untitled = SearchCandidate(item="x", title=None, is_synced=True)
    result = _score_candidate(untitled, "My Love", None, None, None)
    assert result == 10.0
def test_synced_state_does_not_affect_score() -> None:
    """Toggling is_synced on an otherwise identical candidate leaves the score unchanged."""
    ref_title = "My Love"

    def score_for(synced: bool) -> float:
        # Same item/title each time; only the synced flag varies.
        candidate = SearchCandidate(item="x", title="My Love", is_synced=synced)
        return _score_candidate(candidate, ref_title, None, None, None)

    assert score_for(True) - score_for(False) == 0.0
def test_score_duration_linear_decay() -> None:
    """Duration-only scores fall linearly from 100 (exact) to 10 (3000 ms off).

    Only the duration is comparable here, so the expected values are 100 for
    an exact match, 55 at a 1500 ms offset and 10 at a 3000 ms offset -- i.e.
    a 0-90 span on top of a constant 10.
    """
    ref_length_ms = 232000
    # (offset from the reference in ms, expected score)
    expectations = (
        (0.0, 100.0),
        (1500.0, 55.0),
        (3000.0, 10.0),
    )
    for offset_ms, expected in expectations:
        candidate = SearchCandidate(item="x", duration_ms=232000.0 + offset_ms)
        assert _score_candidate(candidate, None, None, None, ref_length_ms) == expected
def test_duration_hard_filter_rejects_all_mismatched() -> None:
    """select_best returns None when every candidate's duration is far from the reference.

    Both candidates match the title and artist exactly; only their durations
    (180 s and 300 s against a 232 s reference) differ.
    """
    mismatched = [
        SearchCandidate(
            item=label, duration_ms=duration, title="My Love", artist="Westlife"
        )
        for label, duration in (("wrong", 180000.0), ("also-wrong", 300000.0))
    ]
    winner, _ = select_best(mismatched, 232000, title="My Love", artist="Westlife")
    assert winner is None
def test_duration_neutral_when_ref_has_no_duration() -> None:
    """Without a reference duration, a candidate's own duration must not change its score."""
    ref_title = "My Love"
    without_duration = SearchCandidate(item="no-dur", title="My Love")
    with_duration = SearchCandidate(
        item="with-dur", title="My Love", duration_ms=232000.0
    )
    # Score both against a reference that carries a title only.
    score_without = _score_candidate(without_duration, ref_title, None, None, None)
    score_with = _score_candidate(with_duration, ref_title, None, None, None)
    assert score_without == score_with
def test_score_case_insensitive_title() -> None:
    """Reference titles differing only in letter case must produce the same score."""
    candidate = SearchCandidate(item="x", title="my love")
    by_reference = [
        _score_candidate(candidate, ref_title, None, None, None)
        for ref_title in ("My Love", "my love")
    ]
    assert by_reference[0] == by_reference[1]
def test_score_artist_normalization() -> None:
    """A candidate artist of 'Westlife feat. Someone' must still match 'Westlife'.

    Only the artist is supplied on both sides and the score must reach at
    least 30. Per the original note, artist normalisation is expected to strip
    the "feat." part so both names compare equal -- not verifiable from this
    file alone.
    """
    featured = SearchCandidate(item="x", artist="Westlife feat. Someone")
    result = _score_candidate(featured, None, "Westlife", None, None)
    # At minimum the full artist weight (30) is expected.
    assert result >= 30.0
# Reference track shared by the fixture-based tests:
# Westlife - "My Love", album "Coast To Coast", roughly 232 seconds long.
_REF_TITLE, _REF_ARTIST, _REF_ALBUM = "My Love", "Westlife", "Coast To Coast"
_REF_LENGTH = 232_000  # milliseconds
def _lrclib_candidates() -> list[SearchCandidate[dict]]:
    """Build candidates from LRCLIB-style records (noted originally as real search results).

    Each record dict becomes the candidate's ``item``; its ``duration`` value
    is multiplied by 1000 to fill ``duration_ms``.
    """
    fields = ("trackName", "artistName", "albumName", "duration", "synced")
    rows = [
        ("My Love", "Westlife", "null", 232.0, True),
        ("My Love", "Westlife", "null", 180.0, True),
        ("My love", "Westlife", "moments", 235.327, True),
        ("My Love", "Westlife", "Unbreakable", 233.026, True),
        ("My Love", "Westlife", "Coast To Coast", 231.847, True),
        ("Hello My Love", "Westlife", "Spectrum", 216.0, True),
        ("My Love", "Westlife", "Hitzone 13", 231.0, True),
    ]
    candidates = []
    for row in rows:
        record = dict(zip(fields, row))
        candidates.append(
            SearchCandidate(
                item=record,
                duration_ms=record["duration"] * 1000,
                is_synced=record["synced"],
                title=record["trackName"],
                artist=record["artistName"],
                album=record["albumName"],
            )
        )
    return candidates
def _lrclib_noisy_candidates() -> list[SearchCandidate[dict]]:
    """Build candidates from an LRCLIB title-only search with many wrong artists.

    Each record dict becomes the candidate's ``item``; its ``duration`` value
    is multiplied by 1000 to fill ``duration_ms``. Only the last record is by
    Westlife.
    """
    fields = ("trackName", "artistName", "albumName", "duration", "synced")
    rows = [
        (
            "Let My Love Be Your Pillow (Live)",
            "Ronnie Milsap",
            "The Essential Ronnie Milsap",
            192.0,
            True,
        ),
        ("My Love", "Little Texas", "Big Time", 248.0, True),
        ("My Love (Album Version)", "Little Texas", "Greatest Hits", 248.0, True),
        (
            "My Love - Digitally Remastered '89",
            "Sonny James",
            "Capitol Collectors Series",
            169.0,
            False,
        ),
        ("My Love", "Westlife", "Coast To Coast", 231.847, True),
    ]
    candidates = []
    for row in rows:
        record = dict(zip(fields, row))
        candidates.append(
            SearchCandidate(
                item=record,
                duration_ms=record["duration"] * 1000,
                is_synced=record["synced"],
                title=record["trackName"],
                artist=record["artistName"],
                album=record["albumName"],
            )
        )
    return candidates
def _netease_candidates() -> list[SearchCandidate[int]]:
    """Build candidates from Netease-style records (noted originally as real search results).

    The candidate's ``item`` is the record's integer ``id``; ``dt`` is
    converted to float and used directly as ``duration_ms``.
    """
    fields = ("id", "name", "artist", "album", "dt")
    rows = [
        (2080607, "My Love", "Westlife", "Unbreakable, Vol. 1 - The Greatest Hits", 231941),
        (
            2080749,
            "My Love (Radio Edit)",
            "Westlife",
            "World Of Our Own - No. 1 Hits Plus (EP)",
            232920,
        ),
        (
            29809886,
            "My Love (Live)",
            "Westlife",
            "The Farewell Tour: Live at Croke Park",
            262000,
        ),
        (572412968, "My Love", "Westlife", "Pure... Love", 231000),
        (20707713, "You Raise Me Up", "Westlife", "You Raise Me Up", 241116),
    ]
    candidates = []
    for row in rows:
        record = dict(zip(fields, row))
        candidates.append(
            SearchCandidate(
                item=record["id"],
                duration_ms=float(record["dt"]),
                title=record["name"],
                artist=record["artist"],
                album=record["album"],
            )
        )
    return candidates
def test_lrclib_picks_exact_album_match() -> None:
    """Given the full reference metadata, the 'Coast To Coast' record must win.

    The winning score must also be at least MIN_CONFIDENCE.
    """
    winner, confidence = select_best(
        _lrclib_candidates(),
        _REF_LENGTH,
        title=_REF_TITLE,
        artist=_REF_ARTIST,
        album=_REF_ALBUM,
    )
    assert winner is not None
    assert winner["albumName"] == "Coast To Coast"
    assert confidence >= MIN_CONFIDENCE
def test_lrclib_noisy_picks_westlife() -> None:
    """Among noisy title-only results, the selected record must be by Westlife."""
    winner, _ = select_best(
        _lrclib_noisy_candidates(),
        _REF_LENGTH,
        title=_REF_TITLE,
        artist=_REF_ARTIST,
        album=_REF_ALBUM,
    )
    assert winner is not None
    assert winner["artistName"] == "Westlife"
def test_lrclib_noisy_rejects_all_without_ref_artist() -> None:
    """With only a reference title, a 'My Love...' track must still be selected.

    NOTE(review): despite the "rejects_all" in the name, this test asserts
    that a candidate IS returned and that its track name starts with
    "My Love" (rather than e.g. "Let My Love Be Your Pillow").
    """
    winner, _ = select_best(
        _lrclib_noisy_candidates(),
        _REF_LENGTH,
        title=_REF_TITLE,
    )
    assert winner is not None
    # An exact "My Love" also satisfies the prefix check.
    assert winner["trackName"].startswith("My Love")
def test_netease_picks_closest_duration() -> None:
    """The Netease record closest in duration to the reference must be selected.

    Id 2080607 has dt=231941, i.e. 59 ms away from the 232000 ms reference.
    """
    expected_id = 2080607
    winner, _ = select_best(
        _netease_candidates(),
        _REF_LENGTH,
        title=_REF_TITLE,
        artist=_REF_ARTIST,
        album=_REF_ALBUM,
    )
    assert winner == expected_id
def test_netease_rejects_wrong_title() -> None:
    """The 'You Raise Me Up' record (id 20707713) must never be the selection."""
    wrong_title_id = 20707713
    winner, _ = select_best(
        _netease_candidates(),
        _REF_LENGTH,
        title=_REF_TITLE,
        artist=_REF_ARTIST,
    )
    assert winner != wrong_title_id
def test_netease_without_ref_metadata_rejects_below_confidence() -> None:
    """With only a reference duration, no Netease candidate may be selected.

    Original rationale (not verifiable from this file): the candidates carry
    title/artist/album while the reference has none, so the text fields score
    zero and the duration contribution alone stays below MIN_CONFIDENCE.
    """
    winner, _ = select_best(_netease_candidates(), _REF_LENGTH)
    assert winner is None
def test_empty_candidates_returns_none() -> None:
    """An empty candidate list yields (None, 0.0) with or without a track length."""
    for track_length in (5000, None):
        assert select_best([], track_length_ms=track_length) == (None, 0.0)
def test_all_below_min_confidence_returns_none() -> None:
    """select_best returns no item when every candidate is under min_confidence.

    Two candidates with unrelated metadata are used:

    * the original one, whose duration (999999 ms) is far from the 232000 ms
      reference, and
    * one whose duration equals the reference.

    test_duration_hard_filter_rejects_all_mismatched shows that candidates
    with far-off durations are rejected even when title and artist match, so
    the first candidate alone would pass this test without the confidence
    threshold ever being consulted. The second candidate has a matching
    duration, so its rejection has to come from min_confidence=90.
    """
    candidates = [
        SearchCandidate(
            item="x",
            title="Completely Different Song",
            artist="Unknown Artist",
            album="Unknown Album",
            duration_ms=999999.0,
        ),
        # Same unrelated metadata, but a duration identical to the reference.
        SearchCandidate(
            item="y",
            title="Completely Different Song",
            artist="Unknown Artist",
            album="Unknown Album",
            duration_ms=232000.0,
        ),
    ]
    result, _ = select_best(
        candidates,
        232000,
        title="My Love",
        artist="Westlife",
        album="Coast To Coast",
        min_confidence=90.0,
    )
    assert result is None
def test_generic_type_preserved() -> None:
    """select_best hands back the candidate's item unchanged, whatever its type."""
    # Integer item, with a duration on both sides.
    winner, _ = select_best(
        [SearchCandidate(item=42, duration_ms=5000.0, title="x")],
        5000,
        title="x",
    )
    assert winner == 42

    # Dict item, with no duration at all.
    winner, _ = select_best(
        [SearchCandidate(item={"id": 1}, title="x")],
        title="x",
    )
    assert winner == {"id": 1}
def test_select_ranked_empty_input() -> None:
    """Ranking an empty candidate list yields an empty list."""
    ranked = select_ranked([])
    assert ranked == []
def test_select_ranked_all_below_confidence() -> None:
    """select_ranked returns an empty list when every candidate is under min_confidence.

    Besides the original candidate (duration 999999 ms, far from the
    232000 ms reference), a second candidate with the same unrelated title
    but a duration equal to the reference is included.
    test_select_ranked_duration_hard_filter_applies shows that out-of-tolerance
    durations are excluded from ranked results, so the first candidate alone
    would pass this test without min_confidence being exercised. The second
    one has a matching duration, so its rejection has to come from
    min_confidence=90.
    """
    candidates = [
        SearchCandidate(item="x", title="Completely Different", duration_ms=999999.0),
        # Same unrelated title, but a duration identical to the reference.
        SearchCandidate(item="y", title="Completely Different", duration_ms=232000.0),
    ]
    result = select_ranked(
        candidates, 232000, title="My Love", artist="Westlife", min_confidence=90.0
    )
    assert result == []
def test_select_ranked_sorted_descending() -> None:
    """Ranked results contain at least two entries, ordered best score first."""
    ranked = select_ranked(
        _netease_candidates(),
        _REF_LENGTH,
        title=_REF_TITLE,
        artist=_REF_ARTIST,
        album=_REF_ALBUM,
    )
    assert len(ranked) >= 2
    observed = [pair[1] for pair in ranked]
    assert observed == sorted(observed, reverse=True)
def test_select_ranked_respects_max_results() -> None:
    """Passing max_results=2 must cap the ranked list at two entries."""
    limit = 2
    ranked = select_ranked(
        _netease_candidates(),
        _REF_LENGTH,
        title=_REF_TITLE,
        artist=_REF_ARTIST,
        album=_REF_ALBUM,
        max_results=2,
    )
    assert len(ranked) <= limit
def test_select_ranked_consistent_with_select_best() -> None:
    """The top entry of select_ranked equals the (item, score) pair from select_best."""
    candidates = _netease_candidates()
    # Both selectors receive exactly the same reference metadata.
    ranked = select_ranked(
        candidates,
        _REF_LENGTH,
        title=_REF_TITLE,
        artist=_REF_ARTIST,
        album=_REF_ALBUM,
    )
    best_item, best_score = select_best(
        candidates,
        _REF_LENGTH,
        title=_REF_TITLE,
        artist=_REF_ARTIST,
        album=_REF_ALBUM,
    )
    assert ranked[0] == (best_item, best_score)
def test_select_ranked_duration_hard_filter_applies() -> None:
    """Candidates whose duration is too far from the reference never appear in the ranking.

    Per the original note the tolerance is DURATION_TOLERANCE_MS=3000; ids
    29809886 (30000 ms off) and 20707713 (9116 ms off) both exceed it.
    """
    ranked = select_ranked(
        _netease_candidates(),
        _REF_LENGTH,
        title=_REF_TITLE,
        artist=_REF_ARTIST,
        album=_REF_ALBUM,
    )
    ranked_ids = [entry[0] for entry in ranked]
    for out_of_tolerance_id in (29809886, 20707713):
        assert out_of_tolerance_id not in ranked_ids
def test_select_ranked_netease_top_is_best_duration_match() -> None:
    """Id 2080607 (59 ms off) must rank first, ahead of 572412968 (1000 ms off)."""
    ranked = select_ranked(
        _netease_candidates(),
        _REF_LENGTH,
        title=_REF_TITLE,
        artist=_REF_ARTIST,
        album=_REF_ALBUM,
    )
    top_item, _ = ranked[0]
    assert top_item == 2080607