refactor: better (really?🤨) lrc parsing and handling

chore: de-markdown-lize comments
2026-04-07 19:33:17 +02:00 · 2026-04-07 19:33:17 +02:00
9 changed files with 418 additions and 212 deletions
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 [project]
 name = "lrx-cli"
-version = "0.6.1"
+version = "0.6.2"
 description = "Fetch line-synced lyrics for your music player."
 readme = "README.md"
 requires-python = ">=3.13"
@@ -29,7 +29,7 @@ _ALL_SLOTS = (SLOT_SYNCED, SLOT_UNSYNCED)
 # Fixed WHERE clause for exact track matching. Column names are hardcoded
-# literals; only the *values* come from user-supplied params — no injection risk.
+# literals; only the values come from user-supplied params — no injection risk.
 _TRACK_WHERE = (
    "(? IS NULL OR artist = ?) AND "
    "(? IS NULL OR title = ?) AND "
@@ -249,7 +249,7 @@ class CacheEngine:
    # Read
    def get_all(self, track: TrackMeta, source: str) -> list[LyricResult]:
-        """Return all non-expired cached slot rows for *track*/*source*."""
+        """Return all non-expired cached slot rows for track/source."""
        try:
            key = _generate_key(track, source)
        except ValueError:
@@ -430,7 +430,7 @@ class CacheEngine:
    def find_best_positive(
        self, track: TrackMeta, status: CacheStatus
    ) -> Optional[LyricResult]:
-        """Find the best positive (synced/unsynced) cache entry for *track*.
+        """Find the best positive (synced/unsynced) cache entry for track.
        Uses exact metadata match (artist + title + album) across all sources.
        Returns the highest-confidence entry, or None.
@@ -488,7 +488,7 @@ class CacheEngine:
        making hard artist filtering unreliable for cross-language queries.
        Ignores artist, album and source. Only returns positive results
-        (synced/unsynced) that have not expired. When *length* is provided,
+        (synced/unsynced) that have not expired. When length is provided,
        filters by duration tolerance and sorts by closest match.
        """
        if not title:
@@ -551,7 +551,7 @@ class CacheEngine:
        confidence: float,
        source: str,
    ) -> int:
-        """Update confidence for a specific source's cache entry matching *track*.
+        """Update confidence for a specific source's cache entry matching track.
        Returns the number of rows updated.
        """
@@ -123,7 +123,7 @@ def fetch(
        logger.error("No lyrics found.")
        sys.exit(1)
-    print(result.lyrics.to_lrc(plain=plain))
+    print(result.lyrics.to_text(plain=plain))
 # search
@@ -214,7 +214,7 @@ def search(
        logger.error("No lyrics found.")
        sys.exit(1)
-    print(result.lyrics.to_lrc(plain=plain))
+    print(result.lyrics.to_text(plain=plain))
 # export
@@ -275,7 +275,7 @@ class LrcManager:
        bypass_cache: bool = False,
        allow_unsynced: bool = False,
    ) -> Optional[LyricResult]:
-        """Fetch lyrics for *track* using the group-based parallel pipeline."""
+        """Fetch lyrics for track using the group-based parallel pipeline."""
        return asyncio.run(
            self._fetch_for_track(
                track,
@@ -1,9 +1,11 @@
 """
 Author: Uyanide pywang0608@foxmail.com
 Date: 2026-03-25 21:54:01
-Description: Shared LRC time-tag utilities (definitely overengineered).
+Description: LRC parsing, modeling, and serialization helpers.
 """
 from abc import ABC, abstractmethod
 from dataclasses import dataclass, field
 import re
 from pathlib import Path
 from typing import Optional
@@ -15,27 +17,18 @@ from .models import CacheStatus
 #   [mm:ss], [mm:ss.c], [mm:ss.cc], [mm:ss.ccc], [mm:ss:cc], …
 _RAW_TAG_RE = re.compile(r"\[(\d{2,}):(\d{2})(?:[.:](\d{1,3}))?\]")
-# Standard format after normalization: [mm:ss.cc]
+# One or more leading bracket tags at line start.
-# _STD_TAG_RE = re.compile(r"\[\d{2,}:\d{2}\.\d{2}\]")
+# Used to strip start tags in plain-mode fallback.
 # Standard format with capture groups
 _STD_TAG_CAPTURE_RE = re.compile(r"\[(\d{2,}):(\d{2})\.(\d{2})\]")
 # [offset:+/-xxx] tag — value in milliseconds
 _OFFSET_RE = re.compile(r"^\[offset:\s*([+-]?\d+)\]\s*$", re.MULTILINE | re.IGNORECASE)
 # Any number of ID/Time tags at the start of a line
 _LINE_START_TAGS_RE = re.compile(r"^(?:\[[^\]]*\])+", re.MULTILINE)
-# Any number of standard time tags at the start of a line
+# Timed word-sync tags: <mm:ss>, <mm:ss.c>, <mm:ss.cc>, <mm:ss:cc>
-_LINE_START_STD_TAGS_RE = re.compile(r"^(?:\[\d{2,}:\d{2}\.\d{2}\])+", re.MULTILINE)
+_WORD_SYNC_TAG_RE = re.compile(r"<(\d{2,}):(\d{2})(?:[.:](\d{1,3}))?>")
-# Word-level sync tags
+# A single doc-level tag line: [key:value].
-#   <mm:ss>, <mm:ss.c>, <mm:ss.cc>, <mm:ss:cc>, <xx,yy,zz>
+# Disallow nested [] in value so multi-tag lines are not treated as doc tags.
-_WORD_SYNC_TAG_RE = re.compile(r"<\d{2,}:\d{2}(?:[.:]\d{1,3})?>|<\d+,\d+,\d+>")
+_DOC_TAG_RE = re.compile(r"^\[([^:\]\[]+):([^\[\]]*)\]$")
-# QRC is totally a completely different matter. Since they are still providing standard LRC APIs,
+# QRC uses a different format and is intentionally out of scope here.
 # It might be a good idea to leave this mess for the future :)
 def _remove_pattern(text: str, pattern: re.Pattern) -> str:
@@ -58,170 +51,282 @@ def _raw_tag_to_ms(mm: str, ss: str, frac: Optional[str]) -> int:
    return (int(mm) * 60 + int(ss)) * 1000 + ms
-def _raw_tag_to_cs(mm: str, ss: str, frac: Optional[str]) -> str:
+def _ms_to_std_tag(total_ms: int) -> str:
-    """Convert parsed time tag components to standard [mm:ss.cc] string."""
+    mm = max(0, total_ms) // 60000
-    if frac is None:
+    ss = (max(0, total_ms) % 60000) // 1000
-        ms = 0
+    cs = min(round((max(0, total_ms) % 1000) / 10), 99)
-    else:
+    return f"[{mm:02d}:{ss:02d}.{cs:02d}]"
        # cc in [mm:ss:cc] is also treated as centiseconds, per LRC spec
        #             ^
        # why does this format even exist, idk
        n = len(frac)
        if n == 1:
            ms = int(frac) * 100
        elif n == 2:
            ms = int(frac) * 10
        else:
            ms = int(frac)
    cs = min(round(ms / 10), 99)
    return f"[{mm}:{ss}.{cs:02d}]"
-def _sanitize_lyric_text(text: str) -> str:
+def _ms_to_word_tag(total_ms: int) -> str:
-    """Remove possibly word-sync time tags in lyric
+    mm = max(0, total_ms) // 60000
    ss = (max(0, total_ms) % 60000) // 1000
    cs = min(round((max(0, total_ms) % 1000) / 10), 99)
    return f"<{mm:02d}:{ss:02d}.{cs:02d}>"
-    Assumes the normal line-sync time tags are already stripped.
+
@dataclass(frozen=True)
 class LrcWordSegment:
    """Immutable piece of lyric text with optional word-level timing."""
    # Literal lyric text carried by this segment.
    text: str
    # Time in milliseconds taken from the word-sync tag that precedes this
    # text; None when no word-sync tag precedes it.
    time_ms: Optional[int] = None
    # Optional duration in milliseconds. NOTE(review): no code visible in this
    # change assigns it, so it appears to always keep its default - confirm.
    duration_ms: Optional[int] = None
 class BaseLine(ABC):
    """Abstract interface shared by every parsed line (rendering and text extraction)."""
    @property
    @abstractmethod
    def text(self) -> str:
        """Plain text content of this line."""
    @abstractmethod
    def to_text(self, include_word_sync: bool) -> str:
        """Fully serialized text of this line."""
    @abstractmethod
    def to_plain_unsynced(self) -> Optional[str]:
        """Plain-text contribution of this line when output is unsynced."""
    @abstractmethod
    def timed_plain_entries(self) -> list[tuple[int, str]]:
        """(timestamp_ms, text) pairs used for synced plain-mode output."""
    def has_nonzero_timestamp(self) -> bool:
        """Return True when at least one timed entry has a timestamp above zero."""
        for stamp_ms, _ in self.timed_plain_entries():
            if stamp_ms > 0:
                return True
        return False
@dataclass
 class DocTagLine(BaseLine):
    """Represents a single doc tag line like [ar:Artist]."""
    key: str
    value: str
    @property
    def text(self) -> str:
        return f"[{self.key}:{self.value}]"
    def to_text(self, include_word_sync: bool) -> str:
        return self.text
    def to_plain_unsynced(self) -> Optional[str]:
        return None
    def timed_plain_entries(self) -> list[tuple[int, str]]:
        return []
@dataclass
 class LyricLine(BaseLine):
    """Lyric line with optional line-level timestamps."""
    line_times_ms: list[int] = field(default_factory=list)
    words: list[LrcWordSegment] = field(default_factory=list)
    @property
    def text(self) -> str:
        return "".join(seg.text for seg in self.words)
    def to_text(self, include_word_sync: bool) -> str:
        prefix = "".join(_ms_to_std_tag(ms) for ms in self.line_times_ms)
        return prefix + self.text
    def to_plain_unsynced(self) -> Optional[str]:
        return _remove_pattern(self.text, _LINE_START_TAGS_RE)
    def timed_plain_entries(self) -> list[tuple[int, str]]:
        return [(tag_ms, self.text) for tag_ms in self.line_times_ms]
@dataclass
 class WordSyncLyricLine(LyricLine):
    """Lyric line that can render per-word sync tags when requested."""
    def to_text(self, include_word_sync: bool) -> str:
        prefix = "".join(_ms_to_std_tag(ms) for ms in self.line_times_ms)
        if not include_word_sync:
            return prefix + self.text
        parts: list[str] = []
        for seg in self.words:
            if seg.time_ms is not None:
                parts.append(_ms_to_word_tag(seg.time_ms))
            parts.append(seg.text)
        return prefix + "".join(parts)
 def _split_trimmed_lines(text: str) -> list[str]:
    """Split text into lines, strip each line, and drop outer blank lines."""
    lines = [line.strip() for line in text.splitlines()]
    while lines and not lines[0].strip():
        lines.pop(0)
    while lines and not lines[-1].strip():
        lines.pop()
    return lines
 def _extract_leading_line_tags(line: str) -> tuple[list[int], str]:
    """Parse leading line-sync tags and return (times_ms, lyric_part).
    Spaces between consecutive leading tags are dropped. If non-space text
    appears, parsing of leading tags stops and the remainder is lyric text.
    """
-    return _remove_pattern(text, _WORD_SYNC_TAG_RE)
+    pos = 0
    tags_ms: list[int] = []
    while True:
        m = _RAW_TAG_RE.match(line, pos)
        if not m:
            break
        tags_ms.append(_raw_tag_to_ms(m.group(1), m.group(2), m.group(3)))
        pos = m.end()
        # Allow spaces only between consecutive leading tags.
        # We only check for '[' here; the next loop decides whether it is a valid time tag.
        scan = pos
        while scan < len(line) and line[scan].isspace():
            scan += 1
        if scan < len(line) and line[scan] == "[":
            pos = scan
            continue
        pos = scan
        break
    return tags_ms, line[pos:]
-def _reformat(text: str) -> list[str]:
+def _parse_word_segments(lyric_part: str) -> tuple[list[LrcWordSegment], bool]:
-    """Parse each line and reformat to standard [mm:ss.cc]...content form.
+    """Parse timed word-sync tags while preserving all lyric text exactly."""
    segments: list[LrcWordSegment] = []
    cursor = 0
    current_time: Optional[int] = None
    has_word_sync = False
-    Handles any mix of time tag formats on input. Lines with no time tags
+    for m in _WORD_SYNC_TAG_RE.finditer(lyric_part):
-    are stripped of leading/trailing whitespace and passed through unchanged.
+        piece = lyric_part[cursor : m.start()]
-    """
+        if piece:
-    out: list[str] = []
+            segments.append(LrcWordSegment(text=piece, time_ms=current_time))
-    for line in text.splitlines():
+        current_time = _raw_tag_to_ms(m.group(1), m.group(2), m.group(3))
-        line = line.strip()
+        has_word_sync = True
-        pos = 0
+        cursor = m.end()
        tags: list[str] = []
        while True:
            while pos < len(line) and line[pos].isspace():
                pos += 1
            m = _RAW_TAG_RE.match(line, pos)
            # Non-time tags are passed through as-is, except for leading/trailing whitespace which is stripped.
            if not m:
                # No more tags on this line
                break
            tags.append(_raw_tag_to_cs(m.group(1), m.group(2), m.group(3)))
            pos = m.end()
        if tags:
            # This could break lyric lines of some kind of word-synced LRC format, e.g.
            #   [00:01.00]Lyric [00:02.00]line
            # but such format were not planned to be supported in the first place, so…
            out.append(_sanitize_lyric_text("".join(tags) + line[pos:]))
        else:
            out.append(line)
            # Empty lines with no tags are also preserved
-    # Remove empty lines at the start and end of the whole text, but preserve blank lines in the middle
+    tail = lyric_part[cursor:]
-    while out and not out[0].strip():
+    if tail or not segments:
-        out.pop(0)
+        segments.append(
-    while out and not out[-1].strip():
+            LrcWordSegment(
-        out.pop()
+                text=tail,
                time_ms=current_time if has_word_sync else None,
            )
        )
    return segments, has_word_sync
-    return out
+
 def _is_single_doc_tag_line(line: str) -> Optional[tuple[str, str]]:
    """Return the stripped (key, value) of a standalone doc-tag line, else None.

    A line that is entirely one time tag is never reported as a doc tag.
    """
    if _RAW_TAG_RE.fullmatch(line) is not None:
        return None
    match = _DOC_TAG_RE.fullmatch(line)
    if match is None:
        return None
    raw_key, raw_value = match.groups()
    return raw_key.strip(), raw_value.strip()
 class LRCData:
-    _lines: list[str]
+    _lines: list[BaseLine]
    _doc_tags: dict[str, str]
-    def __init__(self, text: str | None = None) -> None:
+    def __init__(self, text: Optional[str] = None) -> None:
        self._doc_tags = {}
        if not text:
            self._lines = []
            return
-        self._lines = _reformat(text)
+
-        self._apply_offset()
+        raw_lines = _split_trimmed_lines(text)
        parsed: list[BaseLine] = []
        for raw in raw_lines:
            maybe_tag = _is_single_doc_tag_line(raw)
            if maybe_tag is not None:
                key, value = maybe_tag
                self._doc_tags[key] = value
                parsed.append(DocTagLine(key=key, value=value))
                continue
            tags_ms, lyric_part = _extract_leading_line_tags(raw)
            words, has_word_sync = _parse_word_segments(lyric_part if tags_ms else raw)
            if has_word_sync:
                parsed.append(WordSyncLyricLine(line_times_ms=tags_ms, words=words))
            else:
                parsed.append(LyricLine(line_times_ms=tags_ms, words=words))
        self._lines = parsed
    def __str__(self) -> str:
-        return "\n".join(self._lines)
+        return self.to_text(plain=False, include_word_sync=False)
    def __repr__(self) -> str:
-        return f"LRCData(lines={self._lines!r})"
+        return f"LRCData(doc_tags={self._doc_tags!r}, lines={self._lines!r})"
    def __bool__(self) -> bool:
        return len(self._lines) > 0
    def __len__(self) -> int:
        return len(self._lines)
-    def _apply_offset(self):
+    @property
-        """Parse [offset:±ms] and shift all standard [mm:ss.cc] tags accordingly.
+    def tags(self) -> dict[str, str]:
        return self._doc_tags
-        Per LRC spec, positive offset = lyrics appear sooner (subtract from timestamps).
+    @property
-        """
+    def lines(self) -> list[BaseLine]:
-        m: Optional[re.Match] = None
+        return self._lines
        for i, line in enumerate(self._lines):
            m = _OFFSET_RE.search(line)
            if m:
                self._lines.pop(i)
                break
        if not m:
            return
        offset_ms = int(m.group(1))
        if offset_ms == 0:
            return
        def _shift(match: re.Match) -> str:
            total_ms = max(
                0,
                (int(match.group(1)) * 60 + int(match.group(2))) * 1000
                + int(match.group(3)) * 10
                - offset_ms,
            )
            new_mm = total_ms // 60000
            new_ss = (total_ms % 60000) // 1000
            new_cs = min(round((total_ms % 1000) / 10), 99)
            return f"[{new_mm:02d}:{new_ss:02d}.{new_cs:02d}]"
        self._lines = [_STD_TAG_CAPTURE_RE.sub(_shift, line) for line in self._lines]
    def is_synced(self) -> bool:
-        """Check whether text contains non-zero LRC time tags.
+        """Return True if any lyric line contains a non-zero line timestamp."""
-
+        return any(line.has_nonzero_timestamp() for line in self._lines)
        Assumes text has been normalized by normalize (standard [mm:ss.cc] format).
        """
        for line in self._lines:
            for m in _STD_TAG_CAPTURE_RE.finditer(line):
                if m.group(1) != "00" or m.group(2) != "00" or m.group(3) != "00":
                    return True
        return False
    def detect_sync_status(self) -> CacheStatus:
-        """Determine whether lyrics contain meaningful LRC time tags.
+        """Map sync detection result to cache status."""
        Assumes text has been normalized by normalize.
        """
        return (
            CacheStatus.SUCCESS_SYNCED
            if self.is_synced()
            else CacheStatus.SUCCESS_UNSYNCED
        )
-    def normalize_unsynced(self):
+    def normalize_unsynced(self) -> "LRCData":
-        """Normalize unsynced lyrics so every line has a [00:00.00] tag.
+        """Convert lyrics into unsynced LRC form with [00:00.00] tags.
-        Assumes lyrics have been normalized by normalize.
+        - Leading blank lyric lines are skipped.
-        - Lines that already have time tags: replace with [00:00.00]
+        - Middle blank lyric lines are preserved as empty synced lines.
-        - Lines without leading tags: prepend [00:00.00]
+        - Doc-tag lines are preserved unchanged.
        - Blank lines in middle are converted to [00:00.00]
        """
-        out: list[str] = []
+        out: list[BaseLine] = []
        first = True
-        for i, line in enumerate(self._lines):
+        for line in self._lines:
-            stripped = line.strip()
+            if isinstance(line, DocTagLine):
                out.append(DocTagLine(key=line.key, value=line.value))
                continue
            assert isinstance(line, LyricLine)
            stripped = line.text.strip()
            if not stripped and not first:
-                out.append("[00:00.00]")
+                out.append(
                    LyricLine(line_times_ms=[0], words=[LrcWordSegment(text="")])
                )
                continue
            elif not stripped:
                # Skip leading blank lines
                continue
            first = False
-            cleaned = _remove_pattern(line, _LINE_START_STD_TAGS_RE)
+            out.append(
-            out.append(f"[00:00.00]{cleaned}")
+                LyricLine(
                    line_times_ms=[0],
                    words=[LrcWordSegment(text=line.text)],
                )
            )
        ret = LRCData()
        ret._lines = out
        ret._doc_tags = dict(self._doc_tags)
        return ret
    def to_plain(
@@ -230,32 +335,22 @@ class LRCData:
    ) -> str:
        """Convert lyrics to plain text with all tags stripped.
-        If deduplicate is True, only keep the first line of consecutive lines with the same lyric text (after stripping tags).
+        If synced, output is sorted by line timestamp and duplicated for multi-tag lines.
-        Otherwise, lines with multiple time tags will be duplicated as many times as the number of tags.
+        If not synced, leading bracket tags are stripped per line and original order is kept.
-        Assumes text has been normalized by normalize.
+        If deduplicate is True, only consecutive duplicate plain lines are collapsed.
        """
        if not self.is_synced():
-            return "\n".join(
+            plain_lines = [
-                _remove_pattern(line, _LINE_START_TAGS_RE) for line in self._lines
+                text
-            ).strip("\n")
+                for text in (line.to_plain_unsynced() for line in self._lines)
                if text is not None
            ]
            return "\n".join(plain_lines).strip("\n")
-        tagged_lines = []
+        tagged_lines: list[tuple[int, str]] = []
        for line in self._lines:
-            pos = 0
+            tagged_lines.extend(line.timed_plain_entries())
            tag_ms = []
            while True:
                # Only match strictly repeated standard time tags at the start of the line
                # Lines without any time tags are ignored.
                # Lyric lines are considered already stripped of whitespaces, so no strips here.
                m = _STD_TAG_CAPTURE_RE.match(line, pos)
                if not m:
                    lyric = line[pos:]
                    for tag in tag_ms:
                        tagged_lines.append((tag, lyric))
                    break
                tag_ms.append(_raw_tag_to_ms(m.group(1), m.group(2), m.group(3)))
                pos = m.end()
        sorted_lines = [lyric for _, lyric in sorted(tagged_lines, key=lambda x: x[0])]
@@ -271,23 +366,27 @@ class LRCData:
        return "\n".join(sorted_lines).strip()
-    def to_unsynced(self):
+    def to_unsynced(self) -> "LRCData":
        """Return a plain-text based unsynced representation."""
        return LRCData(self.to_plain())
-    def to_lrc(
+    def to_text(
        self,
        plain: bool = False,
        include_word_sync: bool = False,
    ) -> str:
-        """Return lyrics, optionally stripping tags.
+        """Serialize to LRC text or plain text.
-        Assumes text has been normalized by normalize.
+        - plain=True returns to_plain().
        - include_word_sync controls rendering of per-word tags for word-sync lines.
        """
        ret = self
        if not self.is_synced():
            ret = self.normalize_unsynced()
        if plain:
-            return ret.to_plain()
+            return self.to_plain(deduplicate=False)
-        return "\n".join(ret._lines)
+
        lines: list[str] = [
            line.to_text(include_word_sync=include_word_sync) for line in self._lines
        ]
        return "\n".join(lines)
 def get_audio_path(audio_url: str, ensure_exists: bool = False) -> Optional[Path]:
@@ -21,7 +21,7 @@ def is_better_result(
    *,
    allow_unsynced: bool,
 ) -> bool:
-    """Return True when *new* should rank above *old*.
+    """Return True when new should rank above old.
    Ordering rules (highest first):
    1) Positive statuses always beat negative statuses.
@@ -80,7 +80,7 @@ def test_cache_search_fetcher_with_fuzzy_metadata(
    assert result is not None
    assert result.lyrics is not None
-    assert result.lyrics.to_lrc() == expected_lrc
+    assert result.lyrics.to_text() == expected_lrc
 def test_cache_search_fetcher_prefer_better_match(lrc_manager: LrcManager):
@@ -97,7 +97,7 @@ def test_cache_search_fetcher_prefer_better_match(lrc_manager: LrcManager):
    assert result is not None
    assert result.lyrics is not None
-    assert result.lyrics.to_lrc() == "[00:00.01]artist modified"
+    assert result.lyrics.to_text() == "[00:00.01]artist modified"
@pytest.mark.network
@@ -1,6 +1,11 @@
 from __future__ import annotations
-from lrx_cli.lrc import LRCData
+from lrx_cli.lrc import (
    LRCData,
    DocTagLine,
    LyricLine,
    WordSyncLyricLine,
 )
 from lrx_cli.models import CacheStatus
@@ -8,7 +13,7 @@ def _normalize(text: str) -> str:
    return str(LRCData(text))
-def test_normalize_tags_supports_all_raw_time_formats() -> None:
+def test_time_tag_formats_are_normalized() -> None:
    raw = "\n".join(
        [
            "[00:01]a",
@@ -32,37 +37,27 @@ def test_normalize_tags_supports_all_raw_time_formats() -> None:
    )
-def test_normalize_tags_keeps_non_timed_lines_trimmed_and_unchanged() -> None:
+def test_non_timed_lines_are_kept_as_lyrics() -> None:
-    raw = "  plain line  \n\n  [ar:Meta Header]  "
+    raw = "  plain line  \n\n  other line  "
    normalized = _normalize(raw)
-    assert normalized == "plain line\n\n[ar:Meta Header]"
+    assert normalized == "plain line\n\nother line"
-def test_normalize_tags_removes_word_sync_patterns() -> None:
+def test_word_sync_tags_are_parsed_and_export_controlled() -> None:
-    raw = (
+    raw = "[00:01.00]<00:01>he <00:01.50>llo\n[00:02.00]plain"
        "[00:01.00]<00:01>hello\n"
        "[00:02.00]<00:02.3>world\n"
        "[00:03.00]<00:03.45>foo\n"
        "[00:04.00]<00:04:678>bar\n"
        "[00:05.00]<1,2,3>baz"
    )
-    normalized = _normalize(raw)
+    data = LRCData(raw)
-    assert normalized == "\n".join(
+    assert data.to_text(include_word_sync=False) == "[00:01.00]he llo\n[00:02.00]plain"
-        [
+    assert (
-            "[00:01.00]hello",
+        data.to_text(include_word_sync=True)
-            "[00:02.00]world",
+        == "[00:01.00]<00:01.00>he <00:01.50>llo\n[00:02.00]plain"
            "[00:03.00]foo",
            "[00:04.00]bar",
            "[00:05.00]baz",
        ]
    )
-def test_normalize_tags_keeps_midline_timestamps_as_is() -> None:
+def test_midline_line_tags_are_kept_as_plain_text() -> None:
    raw = "[00:01.00]Lyric [00:02.00]line"
    normalized = _normalize(raw)
@@ -74,11 +69,11 @@ def test_normalize_tags_applies_positive_and_negative_offset_per_spec() -> None:
    positive = _normalize("[offset:+1000]\n[00:10.00]line")
    negative = _normalize("[offset:-500]\n[00:10.00]line")
-    assert positive == "[00:09.00]line"
+    assert positive == "[offset:+1000]\n[00:10.00]line"
-    assert negative == "[00:10.50]line"
+    assert negative == "[offset:-500]\n[00:10.00]line"
-def test_normalize_tags_accepts_leading_spaces_and_tabs_before_tags() -> None:
+def test_leading_spaces_before_first_time_tag_are_trimmed() -> None:
    raw = "\t   [00:01.2] hello"
    normalized = _normalize(raw)
@@ -89,12 +84,14 @@ def test_normalize_tags_accepts_leading_spaces_and_tabs_before_tags() -> None:
 def test_normalize_tags_handles_consecutive_start_tags_with_spaces_between() -> None:
    raw = "[00:01]   [00:02.3]    chorus"
-    normalized = _normalize(raw)
+    data = LRCData(raw)
-
+    assert len(data.lines) == 1
-    assert normalized == "[00:01.00][00:02.30]chorus"
+    assert isinstance(data.lines[0], LyricLine)
    assert data.lines[0].line_times_ms == [1000, 2300]
    assert data.lines[0].text == "chorus"
-def test_normalize_tags_preserves_non_leading_raw_like_tags() -> None:
+def test_non_leading_time_like_text_is_plain_lyric() -> None:
    raw = "intro [00:01]line"
    normalized = _normalize(raw)
@@ -107,7 +104,7 @@ def test_normalize_tags_removes_offset_tag_line_even_without_lyrics() -> None:
    normalized = _normalize(raw)
-    assert normalized == ""
+    assert normalized == "[offset:+500]"
 def test_is_synced_and_detect_sync_status_follow_non_zero_rule() -> None:
@@ -140,7 +137,7 @@ def test_normalize_unsynced_covers_documented_blank_and_tag_rules() -> None:
    )
-def test_to_plain_duplicates_lines_by_leading_repeated_timestamps() -> None:
+def test_to_plain_duplicates_lines_for_multi_line_times() -> None:
    text = "\n".join(
        [
            "[00:02.00][00:01.00]hello",
@@ -210,3 +207,113 @@ def test_reformat_pipeline_trims_outer_blanks_and_preserves_inner_blanks() -> No
    normalized = str(LRCData(text))
    assert normalized == "[00:01.00]a\n\n[00:02.00]b"
 def test_single_doc_tag_line_is_not_added_to_lines() -> None:
    """A standalone doc tag is recorded in tags and kept in lines as a DocTagLine.

    Note: despite the test name, the assertions expect the tag line to stay in lines.
    """
    parsed = LRCData("[ar:Artist]\n[00:01.00]line")
    assert parsed.tags == {"ar": "Artist"}
    assert len(parsed.lines) == 2
    tag_line, lyric_line = parsed.lines
    assert isinstance(tag_line, DocTagLine)
    assert isinstance(lyric_line, LyricLine)
    assert lyric_line.text == "line"
 def test_multiple_doc_tags_on_one_line_are_plain_lyrics() -> None:
    """Two bracket tags sharing one line yield a single lyric line and no doc tags."""
    parsed = LRCData("[ar:Artist][ti:Song]")
    assert parsed.tags == {}
    assert len(parsed.lines) == 1
    only_line = parsed.lines[0]
    assert only_line.text == "[ar:Artist][ti:Song]"
 def test_doc_tag_after_lyrics_is_treated_as_lyrics() -> None:
    """A doc tag following a lyric line is still recorded and kept as a DocTagLine.

    Note: despite the test name, the assertions expect a DocTagLine, not a lyric line.
    """
    parsed = LRCData("[00:01.00]line\n[ar:Artist]")
    assert parsed.tags == {"ar": "Artist"}
    assert len(parsed.lines) == 2
    trailing = parsed.lines[1]
    assert isinstance(trailing, DocTagLine)
    assert trailing.text == "[ar:Artist]"
 def test_unknown_lines_before_lyrics_are_preserved_and_do_not_start_lyrics() -> None:
    """An untagged line ahead of a doc tag is kept as a lyric line and the tag is still parsed."""
    parsed = LRCData("comment line\n[ar:Artist]\n[00:01.00]line")
    assert parsed.tags == {"ar": "Artist"}
    assert len(parsed.lines) == 3
    comment, tag, lyric = parsed.lines
    assert isinstance(comment, LyricLine)
    assert isinstance(tag, DocTagLine)
    assert lyric.text == "line"
    serialized = str(parsed)
    assert serialized.startswith("comment line\n[ar:Artist]\n")
 def test_to_plain_excludes_doc_tags_but_keeps_lyrics() -> None:
    """For this synced input, plain output holds only the timed lyric text."""
    source = "[ar:Artist]\n[00:01.00]line\n[ti:Song]\nplain"
    assert LRCData(source).to_plain() == "line"
 def test_non_space_between_line_tags_stops_tag_parsing() -> None:
    """Text between two time tags ends leading-tag parsing; the second tag stays in the lyric."""
    parsed = LRCData("[00:01.00]x[00:02.00]tail")
    assert len(parsed.lines) == 1
    line = parsed.lines[0]
    assert isinstance(line, LyricLine)
    assert line.line_times_ms == [1000]
    assert line.text == "x[00:02.00]tail"
 def test_line_only_time_tag_is_valid_empty_lyric() -> None:
    """A line made of just one time tag parses as a timed lyric line with empty text."""
    parsed = LRCData("[00:01.00]")
    assert len(parsed.lines) == 1
    line = parsed.lines[0]
    assert isinstance(line, LyricLine)
    assert line.line_times_ms == [1000]
    assert line.text == ""
 def test_model_uses_subclass_for_word_sync_lines() -> None:
    """Lines with word-sync tags become WordSyncLyricLine; others stay plain LyricLine."""
    with_word_tags = LRCData("[00:01.00]<00:00.50>lyric")
    without_word_tags = LRCData("[00:01.00]lyric")
    assert isinstance(with_word_tags.lines[0], WordSyncLyricLine)
    plain_line = without_word_tags.lines[0]
    assert isinstance(plain_line, LyricLine)
    assert not isinstance(plain_line, WordSyncLyricLine)
 def test_word_sync_line_with_empty_tail_keeps_word_tag_only_when_enabled() -> None:
    """A word tag with no text after it is rendered only when include_word_sync is true."""
    parsed = LRCData("[00:01.00]<00:02.00>")
    first_line = parsed.lines[0]
    assert isinstance(first_line, WordSyncLyricLine)
    without_tags = parsed.to_text(include_word_sync=False)
    assert without_tags == "[00:01.00]"
    with_tags = parsed.to_text(include_word_sync=True)
    assert with_tags == "[00:01.00]<00:02.00>"
 def test_to_text_plain_true_matches_to_plain_output() -> None:
    """to_text(plain=True) returns the same string as to_plain() for this input."""
    parsed = LRCData("[00:02.00]b\n[00:01.00]a")
    expected = parsed.to_plain()
    assert parsed.to_text(plain=True) == expected
 def test_to_unsynced_converts_to_plain_based_unsynced_data() -> None:
    """to_unsynced() returns an LRCData whose text is the time-sorted plain lyrics."""
    result = LRCData("[ar:Artist]\n[00:02.00]b\n[00:01.00]a").to_unsynced()
    assert isinstance(result, LRCData)
    assert str(result) == "a\nb"
 def test_duplicate_doc_tag_key_last_value_wins_but_lines_are_kept() -> None:
    """Repeated doc-tag keys keep the last value in tags while both tag lines stay in lines."""
    parsed = LRCData("[ar:First]\n[ar:Second]\n[00:01.00]line")
    assert parsed.tags == {"ar": "Second"}
    assert len(parsed.lines) == 3
    first_tag, second_tag = parsed.lines[0], parsed.lines[1]
    assert isinstance(first_tag, DocTagLine)
    assert isinstance(second_tag, DocTagLine)
    serialized = str(parsed)
    assert serialized.startswith("[ar:First]\n[ar:Second]\n")
 def test_to_plain_for_doc_only_text_is_empty() -> None:
    """Input made only of doc-tag lines produces empty plain output."""
    doc_only = LRCData("[ar:Artist]\n[ti:Song]")
    plain = doc_only.to_plain()
    assert plain == ""
@@ -153,7 +153,7 @@ wheels = [
 [[package]]
 name = "lrx-cli"
-version = "0.6.1"
+version = "0.6.2"
 source = { editable = "." }
 dependencies = [
    { name = "cyclopts" },
Author	SHA1	Message	Date
Uyanide	b922a0df28	refactor: better (really?🤨) lrc parsing and handling	2026-04-07 19:33:17 +02:00
Uyanide	1414066eed	chore: de-markdown-lize comments	2026-04-07 19:33:17 +02:00