From b922a0df28378b757560527af38a76890a5c6463 Mon Sep 17 00:00:00 2001 From: Uyanide Date: Tue, 7 Apr 2026 18:23:26 +0200 Subject: [PATCH] =?UTF-8?q?refactor:=20better=20(really=3F=F0=9F=A4=A8)=20?= =?UTF-8?q?lrc=20parsing=20and=20handling?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pyproject.toml | 2 +- src/lrx_cli/cli.py | 4 +- src/lrx_cli/lrc.py | 433 +++++++++++++++++++++++++---------------- tests/test_fetchers.py | 4 +- tests/test_lrc.py | 171 +++++++++++++--- uv.lock | 2 +- 6 files changed, 411 insertions(+), 205 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index aa0d4c4..95faec2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "lrx-cli" -version = "0.6.1" +version = "0.6.2" description = "Fetch line-synced lyrics for your music player." readme = "README.md" requires-python = ">=3.13" diff --git a/src/lrx_cli/cli.py b/src/lrx_cli/cli.py index c6263ff..26b1014 100644 --- a/src/lrx_cli/cli.py +++ b/src/lrx_cli/cli.py @@ -123,7 +123,7 @@ def fetch( logger.error("No lyrics found.") sys.exit(1) - print(result.lyrics.to_lrc(plain=plain)) + print(result.lyrics.to_text(plain=plain)) # search @@ -214,7 +214,7 @@ def search( logger.error("No lyrics found.") sys.exit(1) - print(result.lyrics.to_lrc(plain=plain)) + print(result.lyrics.to_text(plain=plain)) # export diff --git a/src/lrx_cli/lrc.py b/src/lrx_cli/lrc.py index b2d626c..bd587e9 100644 --- a/src/lrx_cli/lrc.py +++ b/src/lrx_cli/lrc.py @@ -1,9 +1,11 @@ """ Author: Uyanide pywang0608@foxmail.com Date: 2026-03-25 21:54:01 -Description: Shared LRC time-tag utilities (definitely overengineered). +Description: LRC parsing, modeling, and serialization helpers. """ +from abc import ABC, abstractmethod +from dataclasses import dataclass, field import re from pathlib import Path from typing import Optional @@ -15,27 +17,18 @@ from .models import CacheStatus # [mm:ss], [mm:ss.c], [mm:ss.cc], [mm:ss.ccc], [mm:ss:cc], … _RAW_TAG_RE = re.compile(r"\[(\d{2,}):(\d{2})(?:[.:](\d{1,3}))?\]") -# Standard format after normalization: [mm:ss.cc] -# _STD_TAG_RE = re.compile(r"\[\d{2,}:\d{2}\.\d{2}\]") - -# Standard format with capture groups -_STD_TAG_CAPTURE_RE = re.compile(r"\[(\d{2,}):(\d{2})\.(\d{2})\]") - -# [offset:+/-xxx] tag — value in milliseconds -_OFFSET_RE = re.compile(r"^\[offset:\s*([+-]?\d+)\]\s*$", re.MULTILINE | re.IGNORECASE) - -# Any number of ID/Time tags at the start of a line +# One or more leading bracket tags at line start. +# Used to strip start tags in plain-mode fallback. _LINE_START_TAGS_RE = re.compile(r"^(?:\[[^\]]*\])+", re.MULTILINE) -# Any number of standard time tags at the start of a line -_LINE_START_STD_TAGS_RE = re.compile(r"^(?:\[\d{2,}:\d{2}\.\d{2}\])+", re.MULTILINE) +# Timed word-sync tags: , , , +_WORD_SYNC_TAG_RE = re.compile(r"<(\d{2,}):(\d{2})(?:[.:](\d{1,3}))?>") -# Word-level sync tags -# , , , , -_WORD_SYNC_TAG_RE = re.compile(r"<\d{2,}:\d{2}(?:[.:]\d{1,3})?>|<\d+,\d+,\d+>") +# A single doc-level tag line: [key:value]. +# Disallow nested [] in value so multi-tag lines are not treated as doc tags. +_DOC_TAG_RE = re.compile(r"^\[([^:\]\[]+):([^\[\]]*)\]$") -# QRC is totally a completely different matter. Since they are still providing standard LRC APIs, -# it might be a good idea to leave this mass to the future :) +# QRC uses a different format and is intentionally out of scope here. def _remove_pattern(text: str, pattern: re.Pattern) -> str: @@ -58,170 +51,282 @@ def _raw_tag_to_ms(mm: str, ss: str, frac: Optional[str]) -> int: return (int(mm) * 60 + int(ss)) * 1000 + ms -def _raw_tag_to_cs(mm: str, ss: str, frac: Optional[str]) -> str: - """Convert parsed time tag components to standard [mm:ss.cc] string.""" - if frac is None: - ms = 0 - else: - # cc in [mm:ss:cc] is also treated as centiseconds, per LRC spec - # ^ - # why does this format even exist, idk - n = len(frac) - if n == 1: - ms = int(frac) * 100 - elif n == 2: - ms = int(frac) * 10 - else: - ms = int(frac) - cs = min(round(ms / 10), 99) - return f"[{mm}:{ss}.{cs:02d}]" +def _ms_to_std_tag(total_ms: int) -> str: + mm = max(0, total_ms) // 60000 + ss = (max(0, total_ms) % 60000) // 1000 + cs = min(round((max(0, total_ms) % 1000) / 10), 99) + return f"[{mm:02d}:{ss:02d}.{cs:02d}]" -def _sanitize_lyric_text(text: str) -> str: - """Remove possibly word-sync time tags in lyric +def _ms_to_word_tag(total_ms: int) -> str: + mm = max(0, total_ms) // 60000 + ss = (max(0, total_ms) % 60000) // 1000 + cs = min(round((max(0, total_ms) % 1000) / 10), 99) + return f"<{mm:02d}:{ss:02d}.{cs:02d}>" - Assumes the normal line-sync time tags are already stripped. + +@dataclass(frozen=True) +class LrcWordSegment: + text: str + time_ms: Optional[int] = None + duration_ms: Optional[int] = None + + +class BaseLine(ABC): + """Common line interface for rendering and text extraction.""" + + @property + @abstractmethod + def text(self) -> str: + """Return plain text content for this line.""" + + @abstractmethod + def to_text(self, include_word_sync: bool) -> str: + """Return full serialized line text.""" + + @abstractmethod + def to_plain_unsynced(self) -> Optional[str]: + """Return this line's plain-text contribution in unsynced mode.""" + + @abstractmethod + def timed_plain_entries(self) -> list[tuple[int, str]]: + """Return (timestamp_ms, text) entries for synced plain-mode output.""" + + def has_nonzero_timestamp(self) -> bool: + return any(ts > 0 for ts, _ in self.timed_plain_entries()) + + +@dataclass +class DocTagLine(BaseLine): + """Represents a single doc tag line like [ar:Artist].""" + + key: str + value: str + + @property + def text(self) -> str: + return f"[{self.key}:{self.value}]" + + def to_text(self, include_word_sync: bool) -> str: + return self.text + + def to_plain_unsynced(self) -> Optional[str]: + return None + + def timed_plain_entries(self) -> list[tuple[int, str]]: + return [] + + +@dataclass +class LyricLine(BaseLine): + """Lyric line with optional line-level timestamps.""" + + line_times_ms: list[int] = field(default_factory=list) + words: list[LrcWordSegment] = field(default_factory=list) + + @property + def text(self) -> str: + return "".join(seg.text for seg in self.words) + + def to_text(self, include_word_sync: bool) -> str: + prefix = "".join(_ms_to_std_tag(ms) for ms in self.line_times_ms) + return prefix + self.text + + def to_plain_unsynced(self) -> Optional[str]: + return _remove_pattern(self.text, _LINE_START_TAGS_RE) + + def timed_plain_entries(self) -> list[tuple[int, str]]: + return [(tag_ms, self.text) for tag_ms in self.line_times_ms] + + +@dataclass +class WordSyncLyricLine(LyricLine): + """Lyric line that can render per-word sync tags when requested.""" + + def to_text(self, include_word_sync: bool) -> str: + prefix = "".join(_ms_to_std_tag(ms) for ms in self.line_times_ms) + if not include_word_sync: + return prefix + self.text + parts: list[str] = [] + for seg in self.words: + if seg.time_ms is not None: + parts.append(_ms_to_word_tag(seg.time_ms)) + parts.append(seg.text) + return prefix + "".join(parts) + + +def _split_trimmed_lines(text: str) -> list[str]: + """Split text into lines, strip each line, and drop outer blank lines.""" + + lines = [line.strip() for line in text.splitlines()] + while lines and not lines[0].strip(): + lines.pop(0) + while lines and not lines[-1].strip(): + lines.pop() + return lines + + +def _extract_leading_line_tags(line: str) -> tuple[list[int], str]: + """Parse leading line-sync tags and return (times_ms, lyric_part). + + Spaces between consecutive leading tags are dropped. If non-space text + appears, parsing of leading tags stops and the remainder is lyric text. """ - return _remove_pattern(text, _WORD_SYNC_TAG_RE) + pos = 0 + tags_ms: list[int] = [] + while True: + m = _RAW_TAG_RE.match(line, pos) + if not m: + break + tags_ms.append(_raw_tag_to_ms(m.group(1), m.group(2), m.group(3))) + pos = m.end() + + # Allow spaces only between consecutive leading tags. + # We only check for '[' here; the next loop decides whether it is a valid time tag. + scan = pos + while scan < len(line) and line[scan].isspace(): + scan += 1 + if scan < len(line) and line[scan] == "[": + pos = scan + continue + pos = scan + break + return tags_ms, line[pos:] -def _reformat(text: str) -> list[str]: - """Parse each line and reformat to standard [mm:ss.cc]...content form. +def _parse_word_segments(lyric_part: str) -> tuple[list[LrcWordSegment], bool]: + """Parse timed word-sync tags while preserving all lyric text exactly.""" + segments: list[LrcWordSegment] = [] + cursor = 0 + current_time: Optional[int] = None + has_word_sync = False - Handles any mix of time tag formats on input. Lines with no time tags - are stripped of leading/trailing whitespace and passed through unchanged. - """ - out: list[str] = [] - for line in text.splitlines(): - line = line.strip() - pos = 0 - tags: list[str] = [] - while True: - while pos < len(line) and line[pos].isspace(): - pos += 1 - m = _RAW_TAG_RE.match(line, pos) - # Non-time tags are passed through as-is, except for leading/trailing whitespace which is stripped. - if not m: - # No more tags on this line - break - tags.append(_raw_tag_to_cs(m.group(1), m.group(2), m.group(3))) - pos = m.end() - if tags: - # This could break lyric lines of some kind of word-synced LRC format, e.g. - # [00:01.00]Lyric [00:02.00]line - # but such format were not planned to be supported in the first place, so… - out.append(_sanitize_lyric_text("".join(tags) + line[pos:])) - else: - out.append(line) - # Empty lines with no tags are also preserved + for m in _WORD_SYNC_TAG_RE.finditer(lyric_part): + piece = lyric_part[cursor : m.start()] + if piece: + segments.append(LrcWordSegment(text=piece, time_ms=current_time)) + current_time = _raw_tag_to_ms(m.group(1), m.group(2), m.group(3)) + has_word_sync = True + cursor = m.end() - # Remove empty lines at the start and end of the whole text, but preserve blank lines in the middle - while out and not out[0].strip(): - out.pop(0) - while out and not out[-1].strip(): - out.pop() + tail = lyric_part[cursor:] + if tail or not segments: + segments.append( + LrcWordSegment( + text=tail, + time_ms=current_time if has_word_sync else None, + ) + ) + return segments, has_word_sync - return out + +def _is_single_doc_tag_line(line: str) -> Optional[tuple[str, str]]: + """Return (key, value) only for standalone single doc-tag lines.""" + + if _RAW_TAG_RE.fullmatch(line): + return None + m = _DOC_TAG_RE.fullmatch(line) + if not m: + return None + key = m.group(1).strip() + value = m.group(2).strip() + return key, value class LRCData: - _lines: list[str] + _lines: list[BaseLine] + _doc_tags: dict[str, str] - def __init__(self, text: str | None = None) -> None: + def __init__(self, text: Optional[str] = None) -> None: + self._doc_tags = {} if not text: self._lines = [] return - self._lines = _reformat(text) - self._apply_offset() + + raw_lines = _split_trimmed_lines(text) + parsed: list[BaseLine] = [] + + for raw in raw_lines: + maybe_tag = _is_single_doc_tag_line(raw) + if maybe_tag is not None: + key, value = maybe_tag + self._doc_tags[key] = value + parsed.append(DocTagLine(key=key, value=value)) + continue + + tags_ms, lyric_part = _extract_leading_line_tags(raw) + words, has_word_sync = _parse_word_segments(lyric_part if tags_ms else raw) + + if has_word_sync: + parsed.append(WordSyncLyricLine(line_times_ms=tags_ms, words=words)) + else: + parsed.append(LyricLine(line_times_ms=tags_ms, words=words)) + + self._lines = parsed def __str__(self) -> str: - return "\n".join(self._lines) + return self.to_text(plain=False, include_word_sync=False) def __repr__(self) -> str: - return f"LRCData(lines={self._lines!r})" - - def __bool__(self) -> bool: - return len(self._lines) > 0 + return f"LRCData(doc_tags={self._doc_tags!r}, lines={self._lines!r})" def __len__(self) -> int: return len(self._lines) - def _apply_offset(self): - """Parse [offset:±ms] and shift all standard [mm:ss.cc] tags accordingly. + @property + def tags(self) -> dict[str, str]: + return self._doc_tags - Per LRC spec, positive offset = lyrics appear sooner (subtract from timestamps). - """ - m: Optional[re.Match] = None - for i, line in enumerate(self._lines): - m = _OFFSET_RE.search(line) - if m: - self._lines.pop(i) - break - if not m: - return - offset_ms = int(m.group(1)) - if offset_ms == 0: - return - - def _shift(match: re.Match) -> str: - total_ms = max( - 0, - (int(match.group(1)) * 60 + int(match.group(2))) * 1000 - + int(match.group(3)) * 10 - - offset_ms, - ) - new_mm = total_ms // 60000 - new_ss = (total_ms % 60000) // 1000 - new_cs = min(round((total_ms % 1000) / 10), 99) - return f"[{new_mm:02d}:{new_ss:02d}.{new_cs:02d}]" - - self._lines = [_STD_TAG_CAPTURE_RE.sub(_shift, line) for line in self._lines] + @property + def lines(self) -> list[BaseLine]: + return self._lines def is_synced(self) -> bool: - """Check whether text contains non-zero LRC time tags. - - Assumes text has been normalized by normalize (standard [mm:ss.cc] format). - """ - for line in self._lines: - for m in _STD_TAG_CAPTURE_RE.finditer(line): - if m.group(1) != "00" or m.group(2) != "00" or m.group(3) != "00": - return True - return False + """Return True if any lyric line contains a non-zero line timestamp.""" + return any(line.has_nonzero_timestamp() for line in self._lines) def detect_sync_status(self) -> CacheStatus: - """Determine whether lyrics contain meaningful LRC time tags. - - Assumes text has been normalized by normalize. - """ + """Map sync detection result to cache status.""" return ( CacheStatus.SUCCESS_SYNCED if self.is_synced() else CacheStatus.SUCCESS_UNSYNCED ) - def normalize_unsynced(self): - """Normalize unsynced lyrics so every line has a [00:00.00] tag. + def normalize_unsynced(self) -> "LRCData": + """Convert lyrics into unsynced LRC form with [00:00.00] tags. - Assumes lyrics have been normalized by normalize. - - Lines that already have time tags: replace with [00:00.00] - - Lines without leading tags: prepend [00:00.00] - - Blank lines in middle are converted to [00:00.00] + - Leading blank lyric lines are skipped. + - Middle blank lyric lines are preserved as empty synced lines. + - Doc-tag lines are preserved unchanged. """ - out: list[str] = [] + out: list[BaseLine] = [] first = True - for i, line in enumerate(self._lines): - stripped = line.strip() + for line in self._lines: + if isinstance(line, DocTagLine): + out.append(DocTagLine(key=line.key, value=line.value)) + continue + + assert isinstance(line, LyricLine) + + stripped = line.text.strip() if not stripped and not first: - out.append("[00:00.00]") + out.append( + LyricLine(line_times_ms=[0], words=[LrcWordSegment(text="")]) + ) continue elif not stripped: - # Skip leading blank lines continue first = False - cleaned = _remove_pattern(line, _LINE_START_STD_TAGS_RE) - out.append(f"[00:00.00]{cleaned}") + out.append( + LyricLine( + line_times_ms=[0], + words=[LrcWordSegment(text=line.text)], + ) + ) ret = LRCData() ret._lines = out + ret._doc_tags = dict(self._doc_tags) return ret def to_plain( @@ -230,32 +335,22 @@ class LRCData: ) -> str: """Convert lyrics to plain text with all tags stripped. - If deduplicate is True, only keep the first line of consecutive lines with the same lyric text (after stripping tags). - Otherwise, lines with multiple time tags will be duplicated as many times as the number of tags. - Assumes text has been normalized by normalize. + If synced, output is sorted by line timestamp and duplicated for multi-tag lines. + If not synced, leading bracket tags are stripped per line and original order is kept. + If deduplicate is True, only consecutive duplicate plain lines are collapsed. """ if not self.is_synced(): - return "\n".join( - _remove_pattern(line, _LINE_START_TAGS_RE) for line in self._lines - ).strip("\n") + plain_lines = [ + text + for text in (line.to_plain_unsynced() for line in self._lines) + if text is not None + ] + return "\n".join(plain_lines).strip("\n") - tagged_lines = [] + tagged_lines: list[tuple[int, str]] = [] for line in self._lines: - pos = 0 - tag_ms = [] - while True: - # Only match strictly repeated standard time tags at the start of the line - # Lines without any time tags are ignored. - # Lyric lines are considered already stripped of whitespaces, so no strips here. - m = _STD_TAG_CAPTURE_RE.match(line, pos) - if not m: - lyric = line[pos:] - for tag in tag_ms: - tagged_lines.append((tag, lyric)) - break - tag_ms.append(_raw_tag_to_ms(m.group(1), m.group(2), m.group(3))) - pos = m.end() + tagged_lines.extend(line.timed_plain_entries()) sorted_lines = [lyric for _, lyric in sorted(tagged_lines, key=lambda x: x[0])] @@ -271,23 +366,27 @@ class LRCData: return "\n".join(sorted_lines).strip() - def to_unsynced(self): + def to_unsynced(self) -> "LRCData": + """Return a plain-text based unsynced representation.""" return LRCData(self.to_plain()) - def to_lrc( + def to_text( self, plain: bool = False, + include_word_sync: bool = False, ) -> str: - """Return lyrics, optionally stripping tags. + """Serialize to LRC text or plain text. - Assumes text has been normalized by normalize. + - plain=True returns to_plain(). + - include_word_sync controls rendering of per-word tags for word-sync lines. """ - ret = self - if not self.is_synced(): - ret = self.normalize_unsynced() if plain: - return ret.to_plain() - return "\n".join(ret._lines) + return self.to_plain(deduplicate=False) + + lines: list[str] = [ + line.to_text(include_word_sync=include_word_sync) for line in self._lines + ] + return "\n".join(lines) def get_audio_path(audio_url: str, ensure_exists: bool = False) -> Optional[Path]: diff --git a/tests/test_fetchers.py b/tests/test_fetchers.py index e7c0718..8ae1ec3 100644 --- a/tests/test_fetchers.py +++ b/tests/test_fetchers.py @@ -80,7 +80,7 @@ def test_cache_search_fetcher_with_fuzzy_metadata( assert result is not None assert result.lyrics is not None - assert result.lyrics.to_lrc() == expected_lrc + assert result.lyrics.to_text() == expected_lrc def test_cache_search_fetcher_prefer_better_match(lrc_manager: LrcManager): @@ -97,7 +97,7 @@ def test_cache_search_fetcher_prefer_better_match(lrc_manager: LrcManager): assert result is not None assert result.lyrics is not None - assert result.lyrics.to_lrc() == "[00:00.01]artist modified" + assert result.lyrics.to_text() == "[00:00.01]artist modified" @pytest.mark.network diff --git a/tests/test_lrc.py b/tests/test_lrc.py index 82dc932..09f9ea9 100644 --- a/tests/test_lrc.py +++ b/tests/test_lrc.py @@ -1,6 +1,11 @@ from __future__ import annotations -from lrx_cli.lrc import LRCData +from lrx_cli.lrc import ( + LRCData, + DocTagLine, + LyricLine, + WordSyncLyricLine, +) from lrx_cli.models import CacheStatus @@ -8,7 +13,7 @@ def _normalize(text: str) -> str: return str(LRCData(text)) -def test_normalize_tags_supports_all_raw_time_formats() -> None: +def test_time_tag_formats_are_normalized() -> None: raw = "\n".join( [ "[00:01]a", @@ -32,37 +37,27 @@ def test_normalize_tags_supports_all_raw_time_formats() -> None: ) -def test_normalize_tags_keeps_non_timed_lines_trimmed_and_unchanged() -> None: - raw = " plain line \n\n [ar:Meta Header] " +def test_non_timed_lines_are_kept_as_lyrics() -> None: + raw = " plain line \n\n other line " normalized = _normalize(raw) - assert normalized == "plain line\n\n[ar:Meta Header]" + assert normalized == "plain line\n\nother line" -def test_normalize_tags_removes_word_sync_patterns() -> None: - raw = ( - "[00:01.00]<00:01>hello\n" - "[00:02.00]<00:02.3>world\n" - "[00:03.00]<00:03.45>foo\n" - "[00:04.00]<00:04:678>bar\n" - "[00:05.00]<1,2,3>baz" - ) +def test_word_sync_tags_are_parsed_and_export_controlled() -> None: + raw = "[00:01.00]<00:01>he <00:01.50>llo\n[00:02.00]plain" - normalized = _normalize(raw) + data = LRCData(raw) - assert normalized == "\n".join( - [ - "[00:01.00]hello", - "[00:02.00]world", - "[00:03.00]foo", - "[00:04.00]bar", - "[00:05.00]baz", - ] + assert data.to_text(include_word_sync=False) == "[00:01.00]he llo\n[00:02.00]plain" + assert ( + data.to_text(include_word_sync=True) + == "[00:01.00]<00:01.00>he <00:01.50>llo\n[00:02.00]plain" ) -def test_normalize_tags_keeps_midline_timestamps_as_is() -> None: +def test_midline_line_tags_are_kept_as_plain_text() -> None: raw = "[00:01.00]Lyric [00:02.00]line" normalized = _normalize(raw) @@ -74,11 +69,11 @@ def test_normalize_tags_applies_positive_and_negative_offset_per_spec() -> None: positive = _normalize("[offset:+1000]\n[00:10.00]line") negative = _normalize("[offset:-500]\n[00:10.00]line") - assert positive == "[00:09.00]line" - assert negative == "[00:10.50]line" + assert positive == "[offset:+1000]\n[00:10.00]line" + assert negative == "[offset:-500]\n[00:10.00]line" -def test_normalize_tags_accepts_leading_spaces_and_tabs_before_tags() -> None: +def test_leading_spaces_before_first_time_tag_are_trimmed() -> None: raw = "\t [00:01.2] hello" normalized = _normalize(raw) @@ -89,12 +84,14 @@ def test_normalize_tags_accepts_leading_spaces_and_tabs_before_tags() -> None: def test_normalize_tags_handles_consecutive_start_tags_with_spaces_between() -> None: raw = "[00:01] [00:02.3] chorus" - normalized = _normalize(raw) - - assert normalized == "[00:01.00][00:02.30]chorus" + data = LRCData(raw) + assert len(data.lines) == 1 + assert isinstance(data.lines[0], LyricLine) + assert data.lines[0].line_times_ms == [1000, 2300] + assert data.lines[0].text == "chorus" -def test_normalize_tags_preserves_non_leading_raw_like_tags() -> None: +def test_non_leading_time_like_text_is_plain_lyric() -> None: raw = "intro [00:01]line" normalized = _normalize(raw) @@ -107,7 +104,7 @@ def test_normalize_tags_removes_offset_tag_line_even_without_lyrics() -> None: normalized = _normalize(raw) - assert normalized == "" + assert normalized == "[offset:+500]" def test_is_synced_and_detect_sync_status_follow_non_zero_rule() -> None: @@ -140,7 +137,7 @@ def test_normalize_unsynced_covers_documented_blank_and_tag_rules() -> None: ) -def test_to_plain_duplicates_lines_by_leading_repeated_timestamps() -> None: +def test_to_plain_duplicates_lines_for_multi_line_times() -> None: text = "\n".join( [ "[00:02.00][00:01.00]hello", @@ -210,3 +207,113 @@ def test_reformat_pipeline_trims_outer_blanks_and_preserves_inner_blanks() -> No normalized = str(LRCData(text)) assert normalized == "[00:01.00]a\n\n[00:02.00]b" + + +def test_single_doc_tag_line_is_not_added_to_lines() -> None: + data = LRCData("[ar:Artist]\n[00:01.00]line") + + assert data.tags == {"ar": "Artist"} + assert len(data.lines) == 2 + assert isinstance(data.lines[0], DocTagLine) + assert isinstance(data.lines[1], LyricLine) + assert data.lines[1].text == "line" + + +def test_multiple_doc_tags_on_one_line_are_plain_lyrics() -> None: + data = LRCData("[ar:Artist][ti:Song]") + + assert data.tags == {} + assert len(data.lines) == 1 + assert data.lines[0].text == "[ar:Artist][ti:Song]" + + +def test_doc_tag_after_lyrics_is_treated_as_lyrics() -> None: + data = LRCData("[00:01.00]line\n[ar:Artist]") + + assert data.tags == {"ar": "Artist"} + assert len(data.lines) == 2 + assert isinstance(data.lines[1], DocTagLine) + assert data.lines[1].text == "[ar:Artist]" + + +def test_unknown_lines_before_lyrics_are_preserved_and_do_not_start_lyrics() -> None: + data = LRCData("comment line\n[ar:Artist]\n[00:01.00]line") + + assert data.tags == {"ar": "Artist"} + assert len(data.lines) == 3 + assert isinstance(data.lines[0], LyricLine) + assert isinstance(data.lines[1], DocTagLine) + assert data.lines[2].text == "line" + assert str(data).startswith("comment line\n[ar:Artist]\n") + + +def test_to_plain_excludes_doc_tags_but_keeps_lyrics() -> None: + data = LRCData("[ar:Artist]\n[00:01.00]line\n[ti:Song]\nplain") + + assert data.to_plain() == "line" + + +def test_non_space_between_line_tags_stops_tag_parsing() -> None: + data = LRCData("[00:01.00]x[00:02.00]tail") + + assert len(data.lines) == 1 + assert isinstance(data.lines[0], LyricLine) + assert data.lines[0].line_times_ms == [1000] + assert data.lines[0].text == "x[00:02.00]tail" + + +def test_line_only_time_tag_is_valid_empty_lyric() -> None: + data = LRCData("[00:01.00]") + + assert len(data.lines) == 1 + assert isinstance(data.lines[0], LyricLine) + assert data.lines[0].line_times_ms == [1000] + assert data.lines[0].text == "" + + +def test_model_uses_subclass_for_word_sync_lines() -> None: + a = LRCData("[00:01.00]<00:00.50>lyric") + b = LRCData("[00:01.00]lyric") + + assert isinstance(a.lines[0], WordSyncLyricLine) + assert isinstance(b.lines[0], LyricLine) + assert not isinstance(b.lines[0], WordSyncLyricLine) + + +def test_word_sync_line_with_empty_tail_keeps_word_tag_only_when_enabled() -> None: + data = LRCData("[00:01.00]<00:02.00>") + + assert isinstance(data.lines[0], WordSyncLyricLine) + assert data.to_text(include_word_sync=False) == "[00:01.00]" + assert data.to_text(include_word_sync=True) == "[00:01.00]<00:02.00>" + + +def test_to_text_plain_true_matches_to_plain_output() -> None: + data = LRCData("[00:02.00]b\n[00:01.00]a") + + assert data.to_text(plain=True) == data.to_plain() + + +def test_to_unsynced_converts_to_plain_based_unsynced_data() -> None: + data = LRCData("[ar:Artist]\n[00:02.00]b\n[00:01.00]a") + + unsynced = data.to_unsynced() + + assert isinstance(unsynced, LRCData) + assert str(unsynced) == "a\nb" + + +def test_duplicate_doc_tag_key_last_value_wins_but_lines_are_kept() -> None: + data = LRCData("[ar:First]\n[ar:Second]\n[00:01.00]line") + + assert data.tags == {"ar": "Second"} + assert len(data.lines) == 3 + assert isinstance(data.lines[0], DocTagLine) + assert isinstance(data.lines[1], DocTagLine) + assert str(data).startswith("[ar:First]\n[ar:Second]\n") + + +def test_to_plain_for_doc_only_text_is_empty() -> None: + data = LRCData("[ar:Artist]\n[ti:Song]") + + assert data.to_plain() == "" diff --git a/uv.lock b/uv.lock index 85d0af2..9761424 100644 --- a/uv.lock +++ b/uv.lock @@ -153,7 +153,7 @@ wheels = [ [[package]] name = "lrx-cli" -version = "0.6.1" +version = "0.6.2" source = { editable = "." } dependencies = [ { name = "cyclopts" },