refactor: better (really?🤨) lrc parsing and handling

This commit is contained in:
2026-04-07 18:23:26 +02:00
parent 1414066eed
commit b922a0df28
6 changed files with 411 additions and 205 deletions
+1 -1
View File
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
[project]
name = "lrx-cli"
version = "0.6.1"
version = "0.6.2"
description = "Fetch line-synced lyrics for your music player."
readme = "README.md"
requires-python = ">=3.13"
+2 -2
View File
@@ -123,7 +123,7 @@ def fetch(
logger.error("No lyrics found.")
sys.exit(1)
print(result.lyrics.to_lrc(plain=plain))
print(result.lyrics.to_text(plain=plain))
# search
@@ -214,7 +214,7 @@ def search(
logger.error("No lyrics found.")
sys.exit(1)
print(result.lyrics.to_lrc(plain=plain))
print(result.lyrics.to_text(plain=plain))
# export
+266 -167
View File
@@ -1,9 +1,11 @@
"""
Author: Uyanide pywang0608@foxmail.com
Date: 2026-03-25 21:54:01
Description: Shared LRC time-tag utilities (definitely overengineered).
Description: LRC parsing, modeling, and serialization helpers.
"""
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
import re
from pathlib import Path
from typing import Optional
@@ -15,27 +17,18 @@ from .models import CacheStatus
# [mm:ss], [mm:ss.c], [mm:ss.cc], [mm:ss.ccc], [mm:ss:cc], …
_RAW_TAG_RE = re.compile(r"\[(\d{2,}):(\d{2})(?:[.:](\d{1,3}))?\]")
# Standard format after normalization: [mm:ss.cc]
# _STD_TAG_RE = re.compile(r"\[\d{2,}:\d{2}\.\d{2}\]")
# Standard format with capture groups
_STD_TAG_CAPTURE_RE = re.compile(r"\[(\d{2,}):(\d{2})\.(\d{2})\]")
# [offset:+/-xxx] tag — value in milliseconds
_OFFSET_RE = re.compile(r"^\[offset:\s*([+-]?\d+)\]\s*$", re.MULTILINE | re.IGNORECASE)
# Any number of ID/Time tags at the start of a line
# One or more leading bracket tags at line start.
# Used to strip start tags in plain-mode fallback.
_LINE_START_TAGS_RE = re.compile(r"^(?:\[[^\]]*\])+", re.MULTILINE)
# Any number of standard time tags at the start of a line
_LINE_START_STD_TAGS_RE = re.compile(r"^(?:\[\d{2,}:\d{2}\.\d{2}\])+", re.MULTILINE)
# Timed word-sync tags: <mm:ss>, <mm:ss.c>, <mm:ss.cc>, <mm:ss:cc>
_WORD_SYNC_TAG_RE = re.compile(r"<(\d{2,}):(\d{2})(?:[.:](\d{1,3}))?>")
# Word-level sync tags
# <mm:ss>, <mm:ss.c>, <mm:ss.cc>, <mm:ss:cc>, <xx,yy,zz>
_WORD_SYNC_TAG_RE = re.compile(r"<\d{2,}:\d{2}(?:[.:]\d{1,3})?>|<\d+,\d+,\d+>")
# A single doc-level tag line: [key:value].
# Disallow nested [] in value so multi-tag lines are not treated as doc tags.
_DOC_TAG_RE = re.compile(r"^\[([^:\]\[]+):([^\[\]]*)\]$")
# QRC is totally a completely different matter. Since they are still providing standard LRC APIs,
# it might be a good idea to leave this mess to the future :)
# QRC uses a different format and is intentionally out of scope here.
def _remove_pattern(text: str, pattern: re.Pattern) -> str:
@@ -58,170 +51,282 @@ def _raw_tag_to_ms(mm: str, ss: str, frac: Optional[str]) -> int:
return (int(mm) * 60 + int(ss)) * 1000 + ms
def _raw_tag_to_cs(mm: str, ss: str, frac: Optional[str]) -> str:
"""Convert parsed time tag components to standard [mm:ss.cc] string."""
if frac is None:
ms = 0
else:
# cc in [mm:ss:cc] is also treated as centiseconds, per LRC spec
# ^
# why does this format even exist, idk
n = len(frac)
if n == 1:
ms = int(frac) * 100
elif n == 2:
ms = int(frac) * 10
else:
ms = int(frac)
cs = min(round(ms / 10), 99)
return f"[{mm}:{ss}.{cs:02d}]"
def _ms_to_std_tag(total_ms: int) -> str:
mm = max(0, total_ms) // 60000
ss = (max(0, total_ms) % 60000) // 1000
cs = min(round((max(0, total_ms) % 1000) / 10), 99)
return f"[{mm:02d}:{ss:02d}.{cs:02d}]"
def _sanitize_lyric_text(text: str) -> str:
"""Remove possibly word-sync time tags in lyric
def _ms_to_word_tag(total_ms: int) -> str:
mm = max(0, total_ms) // 60000
ss = (max(0, total_ms) % 60000) // 1000
cs = min(round((max(0, total_ms) % 1000) / 10), 99)
return f"<{mm:02d}:{ss:02d}.{cs:02d}>"
Assumes the normal line-sync time tags are already stripped.
@dataclass(frozen=True)
class LrcWordSegment:
text: str
time_ms: Optional[int] = None
duration_ms: Optional[int] = None
class BaseLine(ABC):
"""Common line interface for rendering and text extraction."""
@property
@abstractmethod
def text(self) -> str:
"""Return plain text content for this line."""
@abstractmethod
def to_text(self, include_word_sync: bool) -> str:
"""Return full serialized line text."""
@abstractmethod
def to_plain_unsynced(self) -> Optional[str]:
"""Return this line's plain-text contribution in unsynced mode."""
@abstractmethod
def timed_plain_entries(self) -> list[tuple[int, str]]:
"""Return (timestamp_ms, text) entries for synced plain-mode output."""
def has_nonzero_timestamp(self) -> bool:
return any(ts > 0 for ts, _ in self.timed_plain_entries())
@dataclass
class DocTagLine(BaseLine):
"""Represents a single doc tag line like [ar:Artist]."""
key: str
value: str
@property
def text(self) -> str:
return f"[{self.key}:{self.value}]"
def to_text(self, include_word_sync: bool) -> str:
return self.text
def to_plain_unsynced(self) -> Optional[str]:
return None
def timed_plain_entries(self) -> list[tuple[int, str]]:
return []
@dataclass
class LyricLine(BaseLine):
"""Lyric line with optional line-level timestamps."""
line_times_ms: list[int] = field(default_factory=list)
words: list[LrcWordSegment] = field(default_factory=list)
@property
def text(self) -> str:
return "".join(seg.text for seg in self.words)
def to_text(self, include_word_sync: bool) -> str:
prefix = "".join(_ms_to_std_tag(ms) for ms in self.line_times_ms)
return prefix + self.text
def to_plain_unsynced(self) -> Optional[str]:
return _remove_pattern(self.text, _LINE_START_TAGS_RE)
def timed_plain_entries(self) -> list[tuple[int, str]]:
return [(tag_ms, self.text) for tag_ms in self.line_times_ms]
@dataclass
class WordSyncLyricLine(LyricLine):
"""Lyric line that can render per-word sync tags when requested."""
def to_text(self, include_word_sync: bool) -> str:
prefix = "".join(_ms_to_std_tag(ms) for ms in self.line_times_ms)
if not include_word_sync:
return prefix + self.text
parts: list[str] = []
for seg in self.words:
if seg.time_ms is not None:
parts.append(_ms_to_word_tag(seg.time_ms))
parts.append(seg.text)
return prefix + "".join(parts)
def _split_trimmed_lines(text: str) -> list[str]:
"""Split text into lines, strip each line, and drop outer blank lines."""
lines = [line.strip() for line in text.splitlines()]
while lines and not lines[0].strip():
lines.pop(0)
while lines and not lines[-1].strip():
lines.pop()
return lines
def _extract_leading_line_tags(line: str) -> tuple[list[int], str]:
"""Parse leading line-sync tags and return (times_ms, lyric_part).
Spaces between consecutive leading tags are dropped. If non-space text
appears, parsing of leading tags stops and the remainder is lyric text.
"""
return _remove_pattern(text, _WORD_SYNC_TAG_RE)
pos = 0
tags_ms: list[int] = []
while True:
m = _RAW_TAG_RE.match(line, pos)
if not m:
break
tags_ms.append(_raw_tag_to_ms(m.group(1), m.group(2), m.group(3)))
pos = m.end()
# Allow spaces only between consecutive leading tags.
# We only check for '[' here; the next loop decides whether it is a valid time tag.
scan = pos
while scan < len(line) and line[scan].isspace():
scan += 1
if scan < len(line) and line[scan] == "[":
pos = scan
continue
pos = scan
break
return tags_ms, line[pos:]
def _reformat(text: str) -> list[str]:
"""Parse each line and reformat to standard [mm:ss.cc]...content form.
def _parse_word_segments(lyric_part: str) -> tuple[list[LrcWordSegment], bool]:
"""Parse timed word-sync tags while preserving all lyric text exactly."""
segments: list[LrcWordSegment] = []
cursor = 0
current_time: Optional[int] = None
has_word_sync = False
Handles any mix of time tag formats on input. Lines with no time tags
are stripped of leading/trailing whitespace and passed through unchanged.
"""
out: list[str] = []
for line in text.splitlines():
line = line.strip()
pos = 0
tags: list[str] = []
while True:
while pos < len(line) and line[pos].isspace():
pos += 1
m = _RAW_TAG_RE.match(line, pos)
# Non-time tags are passed through as-is, except for leading/trailing whitespace which is stripped.
if not m:
# No more tags on this line
break
tags.append(_raw_tag_to_cs(m.group(1), m.group(2), m.group(3)))
pos = m.end()
if tags:
# This could break lyric lines of some kind of word-synced LRC format, e.g.
# [00:01.00]Lyric [00:02.00]line
# but such formats were not planned to be supported in the first place, so…
out.append(_sanitize_lyric_text("".join(tags) + line[pos:]))
else:
out.append(line)
# Empty lines with no tags are also preserved
for m in _WORD_SYNC_TAG_RE.finditer(lyric_part):
piece = lyric_part[cursor : m.start()]
if piece:
segments.append(LrcWordSegment(text=piece, time_ms=current_time))
current_time = _raw_tag_to_ms(m.group(1), m.group(2), m.group(3))
has_word_sync = True
cursor = m.end()
# Remove empty lines at the start and end of the whole text, but preserve blank lines in the middle
while out and not out[0].strip():
out.pop(0)
while out and not out[-1].strip():
out.pop()
tail = lyric_part[cursor:]
if tail or not segments:
segments.append(
LrcWordSegment(
text=tail,
time_ms=current_time if has_word_sync else None,
)
)
return segments, has_word_sync
return out
def _is_single_doc_tag_line(line: str) -> Optional[tuple[str, str]]:
"""Return (key, value) only for standalone single doc-tag lines."""
if _RAW_TAG_RE.fullmatch(line):
return None
m = _DOC_TAG_RE.fullmatch(line)
if not m:
return None
key = m.group(1).strip()
value = m.group(2).strip()
return key, value
class LRCData:
_lines: list[str]
_lines: list[BaseLine]
_doc_tags: dict[str, str]
def __init__(self, text: str | None = None) -> None:
def __init__(self, text: Optional[str] = None) -> None:
self._doc_tags = {}
if not text:
self._lines = []
return
self._lines = _reformat(text)
self._apply_offset()
raw_lines = _split_trimmed_lines(text)
parsed: list[BaseLine] = []
for raw in raw_lines:
maybe_tag = _is_single_doc_tag_line(raw)
if maybe_tag is not None:
key, value = maybe_tag
self._doc_tags[key] = value
parsed.append(DocTagLine(key=key, value=value))
continue
tags_ms, lyric_part = _extract_leading_line_tags(raw)
words, has_word_sync = _parse_word_segments(lyric_part if tags_ms else raw)
if has_word_sync:
parsed.append(WordSyncLyricLine(line_times_ms=tags_ms, words=words))
else:
parsed.append(LyricLine(line_times_ms=tags_ms, words=words))
self._lines = parsed
def __str__(self) -> str:
return "\n".join(self._lines)
return self.to_text(plain=False, include_word_sync=False)
def __repr__(self) -> str:
return f"LRCData(lines={self._lines!r})"
def __bool__(self) -> bool:
return len(self._lines) > 0
return f"LRCData(doc_tags={self._doc_tags!r}, lines={self._lines!r})"
def __len__(self) -> int:
return len(self._lines)
def _apply_offset(self):
"""Parse [offset:±ms] and shift all standard [mm:ss.cc] tags accordingly.
@property
def tags(self) -> dict[str, str]:
return self._doc_tags
Per LRC spec, positive offset = lyrics appear sooner (subtract from timestamps).
"""
m: Optional[re.Match] = None
for i, line in enumerate(self._lines):
m = _OFFSET_RE.search(line)
if m:
self._lines.pop(i)
break
if not m:
return
offset_ms = int(m.group(1))
if offset_ms == 0:
return
def _shift(match: re.Match) -> str:
total_ms = max(
0,
(int(match.group(1)) * 60 + int(match.group(2))) * 1000
+ int(match.group(3)) * 10
- offset_ms,
)
new_mm = total_ms // 60000
new_ss = (total_ms % 60000) // 1000
new_cs = min(round((total_ms % 1000) / 10), 99)
return f"[{new_mm:02d}:{new_ss:02d}.{new_cs:02d}]"
self._lines = [_STD_TAG_CAPTURE_RE.sub(_shift, line) for line in self._lines]
@property
def lines(self) -> list[BaseLine]:
return self._lines
def is_synced(self) -> bool:
"""Check whether text contains non-zero LRC time tags.
Assumes text has been normalized by normalize (standard [mm:ss.cc] format).
"""
for line in self._lines:
for m in _STD_TAG_CAPTURE_RE.finditer(line):
if m.group(1) != "00" or m.group(2) != "00" or m.group(3) != "00":
return True
return False
"""Return True if any lyric line contains a non-zero line timestamp."""
return any(line.has_nonzero_timestamp() for line in self._lines)
def detect_sync_status(self) -> CacheStatus:
"""Determine whether lyrics contain meaningful LRC time tags.
Assumes text has been normalized by normalize.
"""
"""Map sync detection result to cache status."""
return (
CacheStatus.SUCCESS_SYNCED
if self.is_synced()
else CacheStatus.SUCCESS_UNSYNCED
)
def normalize_unsynced(self):
"""Normalize unsynced lyrics so every line has a [00:00.00] tag.
def normalize_unsynced(self) -> "LRCData":
"""Convert lyrics into unsynced LRC form with [00:00.00] tags.
Assumes lyrics have been normalized by normalize.
- Lines that already have time tags: replace with [00:00.00]
- Lines without leading tags: prepend [00:00.00]
- Blank lines in middle are converted to [00:00.00]
- Leading blank lyric lines are skipped.
- Middle blank lyric lines are preserved as empty synced lines.
- Doc-tag lines are preserved unchanged.
"""
out: list[str] = []
out: list[BaseLine] = []
first = True
for i, line in enumerate(self._lines):
stripped = line.strip()
for line in self._lines:
if isinstance(line, DocTagLine):
out.append(DocTagLine(key=line.key, value=line.value))
continue
assert isinstance(line, LyricLine)
stripped = line.text.strip()
if not stripped and not first:
out.append("[00:00.00]")
out.append(
LyricLine(line_times_ms=[0], words=[LrcWordSegment(text="")])
)
continue
elif not stripped:
# Skip leading blank lines
continue
first = False
cleaned = _remove_pattern(line, _LINE_START_STD_TAGS_RE)
out.append(f"[00:00.00]{cleaned}")
out.append(
LyricLine(
line_times_ms=[0],
words=[LrcWordSegment(text=line.text)],
)
)
ret = LRCData()
ret._lines = out
ret._doc_tags = dict(self._doc_tags)
return ret
def to_plain(
@@ -230,32 +335,22 @@ class LRCData:
) -> str:
"""Convert lyrics to plain text with all tags stripped.
If deduplicate is True, only keep the first line of consecutive lines with the same lyric text (after stripping tags).
Otherwise, lines with multiple time tags will be duplicated as many times as the number of tags.
Assumes text has been normalized by normalize.
If synced, output is sorted by line timestamp and duplicated for multi-tag lines.
If not synced, leading bracket tags are stripped per line and original order is kept.
If deduplicate is True, only consecutive duplicate plain lines are collapsed.
"""
if not self.is_synced():
return "\n".join(
_remove_pattern(line, _LINE_START_TAGS_RE) for line in self._lines
).strip("\n")
plain_lines = [
text
for text in (line.to_plain_unsynced() for line in self._lines)
if text is not None
]
return "\n".join(plain_lines).strip("\n")
tagged_lines = []
tagged_lines: list[tuple[int, str]] = []
for line in self._lines:
pos = 0
tag_ms = []
while True:
# Only match strictly repeated standard time tags at the start of the line
# Lines without any time tags are ignored.
# Lyric lines are considered already stripped of whitespaces, so no strips here.
m = _STD_TAG_CAPTURE_RE.match(line, pos)
if not m:
lyric = line[pos:]
for tag in tag_ms:
tagged_lines.append((tag, lyric))
break
tag_ms.append(_raw_tag_to_ms(m.group(1), m.group(2), m.group(3)))
pos = m.end()
tagged_lines.extend(line.timed_plain_entries())
sorted_lines = [lyric for _, lyric in sorted(tagged_lines, key=lambda x: x[0])]
@@ -271,23 +366,27 @@ class LRCData:
return "\n".join(sorted_lines).strip()
def to_unsynced(self):
def to_unsynced(self) -> "LRCData":
"""Return a plain-text based unsynced representation."""
return LRCData(self.to_plain())
def to_lrc(
def to_text(
self,
plain: bool = False,
include_word_sync: bool = False,
) -> str:
"""Return lyrics, optionally stripping tags.
"""Serialize to LRC text or plain text.
Assumes text has been normalized by normalize.
- plain=True returns to_plain().
- include_word_sync controls rendering of per-word tags for word-sync lines.
"""
ret = self
if not self.is_synced():
ret = self.normalize_unsynced()
if plain:
return ret.to_plain()
return "\n".join(ret._lines)
return self.to_plain(deduplicate=False)
lines: list[str] = [
line.to_text(include_word_sync=include_word_sync) for line in self._lines
]
return "\n".join(lines)
def get_audio_path(audio_url: str, ensure_exists: bool = False) -> Optional[Path]:
+2 -2
View File
@@ -80,7 +80,7 @@ def test_cache_search_fetcher_with_fuzzy_metadata(
assert result is not None
assert result.lyrics is not None
assert result.lyrics.to_lrc() == expected_lrc
assert result.lyrics.to_text() == expected_lrc
def test_cache_search_fetcher_prefer_better_match(lrc_manager: LrcManager):
@@ -97,7 +97,7 @@ def test_cache_search_fetcher_prefer_better_match(lrc_manager: LrcManager):
assert result is not None
assert result.lyrics is not None
assert result.lyrics.to_lrc() == "[00:00.01]artist modified"
assert result.lyrics.to_text() == "[00:00.01]artist modified"
@pytest.mark.network
+139 -32
View File
@@ -1,6 +1,11 @@
from __future__ import annotations
from lrx_cli.lrc import LRCData
from lrx_cli.lrc import (
LRCData,
DocTagLine,
LyricLine,
WordSyncLyricLine,
)
from lrx_cli.models import CacheStatus
@@ -8,7 +13,7 @@ def _normalize(text: str) -> str:
return str(LRCData(text))
def test_normalize_tags_supports_all_raw_time_formats() -> None:
def test_time_tag_formats_are_normalized() -> None:
raw = "\n".join(
[
"[00:01]a",
@@ -32,37 +37,27 @@ def test_normalize_tags_supports_all_raw_time_formats() -> None:
)
def test_normalize_tags_keeps_non_timed_lines_trimmed_and_unchanged() -> None:
raw = " plain line \n\n [ar:Meta Header] "
def test_non_timed_lines_are_kept_as_lyrics() -> None:
raw = " plain line \n\n other line "
normalized = _normalize(raw)
assert normalized == "plain line\n\n[ar:Meta Header]"
assert normalized == "plain line\n\nother line"
def test_normalize_tags_removes_word_sync_patterns() -> None:
raw = (
"[00:01.00]<00:01>hello\n"
"[00:02.00]<00:02.3>world\n"
"[00:03.00]<00:03.45>foo\n"
"[00:04.00]<00:04:678>bar\n"
"[00:05.00]<1,2,3>baz"
)
def test_word_sync_tags_are_parsed_and_export_controlled() -> None:
raw = "[00:01.00]<00:01>he <00:01.50>llo\n[00:02.00]plain"
normalized = _normalize(raw)
data = LRCData(raw)
assert normalized == "\n".join(
[
"[00:01.00]hello",
"[00:02.00]world",
"[00:03.00]foo",
"[00:04.00]bar",
"[00:05.00]baz",
]
assert data.to_text(include_word_sync=False) == "[00:01.00]he llo\n[00:02.00]plain"
assert (
data.to_text(include_word_sync=True)
== "[00:01.00]<00:01.00>he <00:01.50>llo\n[00:02.00]plain"
)
def test_normalize_tags_keeps_midline_timestamps_as_is() -> None:
def test_midline_line_tags_are_kept_as_plain_text() -> None:
raw = "[00:01.00]Lyric [00:02.00]line"
normalized = _normalize(raw)
@@ -74,11 +69,11 @@ def test_normalize_tags_applies_positive_and_negative_offset_per_spec() -> None:
positive = _normalize("[offset:+1000]\n[00:10.00]line")
negative = _normalize("[offset:-500]\n[00:10.00]line")
assert positive == "[00:09.00]line"
assert negative == "[00:10.50]line"
assert positive == "[offset:+1000]\n[00:10.00]line"
assert negative == "[offset:-500]\n[00:10.00]line"
def test_normalize_tags_accepts_leading_spaces_and_tabs_before_tags() -> None:
def test_leading_spaces_before_first_time_tag_are_trimmed() -> None:
raw = "\t [00:01.2] hello"
normalized = _normalize(raw)
@@ -89,12 +84,14 @@ def test_normalize_tags_accepts_leading_spaces_and_tabs_before_tags() -> None:
def test_normalize_tags_handles_consecutive_start_tags_with_spaces_between() -> None:
raw = "[00:01] [00:02.3] chorus"
normalized = _normalize(raw)
assert normalized == "[00:01.00][00:02.30]chorus"
data = LRCData(raw)
assert len(data.lines) == 1
assert isinstance(data.lines[0], LyricLine)
assert data.lines[0].line_times_ms == [1000, 2300]
assert data.lines[0].text == "chorus"
def test_normalize_tags_preserves_non_leading_raw_like_tags() -> None:
def test_non_leading_time_like_text_is_plain_lyric() -> None:
raw = "intro [00:01]line"
normalized = _normalize(raw)
@@ -107,7 +104,7 @@ def test_normalize_tags_removes_offset_tag_line_even_without_lyrics() -> None:
normalized = _normalize(raw)
assert normalized == ""
assert normalized == "[offset:+500]"
def test_is_synced_and_detect_sync_status_follow_non_zero_rule() -> None:
@@ -140,7 +137,7 @@ def test_normalize_unsynced_covers_documented_blank_and_tag_rules() -> None:
)
def test_to_plain_duplicates_lines_by_leading_repeated_timestamps() -> None:
def test_to_plain_duplicates_lines_for_multi_line_times() -> None:
text = "\n".join(
[
"[00:02.00][00:01.00]hello",
@@ -210,3 +207,113 @@ def test_reformat_pipeline_trims_outer_blanks_and_preserves_inner_blanks() -> No
normalized = str(LRCData(text))
assert normalized == "[00:01.00]a\n\n[00:02.00]b"
def test_single_doc_tag_line_is_not_added_to_lines() -> None:
data = LRCData("[ar:Artist]\n[00:01.00]line")
assert data.tags == {"ar": "Artist"}
assert len(data.lines) == 2
assert isinstance(data.lines[0], DocTagLine)
assert isinstance(data.lines[1], LyricLine)
assert data.lines[1].text == "line"
def test_multiple_doc_tags_on_one_line_are_plain_lyrics() -> None:
data = LRCData("[ar:Artist][ti:Song]")
assert data.tags == {}
assert len(data.lines) == 1
assert data.lines[0].text == "[ar:Artist][ti:Song]"
def test_doc_tag_after_lyrics_is_treated_as_lyrics() -> None:
data = LRCData("[00:01.00]line\n[ar:Artist]")
assert data.tags == {"ar": "Artist"}
assert len(data.lines) == 2
assert isinstance(data.lines[1], DocTagLine)
assert data.lines[1].text == "[ar:Artist]"
def test_unknown_lines_before_lyrics_are_preserved_and_do_not_start_lyrics() -> None:
data = LRCData("comment line\n[ar:Artist]\n[00:01.00]line")
assert data.tags == {"ar": "Artist"}
assert len(data.lines) == 3
assert isinstance(data.lines[0], LyricLine)
assert isinstance(data.lines[1], DocTagLine)
assert data.lines[2].text == "line"
assert str(data).startswith("comment line\n[ar:Artist]\n")
def test_to_plain_excludes_doc_tags_but_keeps_lyrics() -> None:
data = LRCData("[ar:Artist]\n[00:01.00]line\n[ti:Song]\nplain")
assert data.to_plain() == "line"
def test_non_space_between_line_tags_stops_tag_parsing() -> None:
data = LRCData("[00:01.00]x[00:02.00]tail")
assert len(data.lines) == 1
assert isinstance(data.lines[0], LyricLine)
assert data.lines[0].line_times_ms == [1000]
assert data.lines[0].text == "x[00:02.00]tail"
def test_line_only_time_tag_is_valid_empty_lyric() -> None:
data = LRCData("[00:01.00]")
assert len(data.lines) == 1
assert isinstance(data.lines[0], LyricLine)
assert data.lines[0].line_times_ms == [1000]
assert data.lines[0].text == ""
def test_model_uses_subclass_for_word_sync_lines() -> None:
a = LRCData("[00:01.00]<00:00.50>lyric")
b = LRCData("[00:01.00]lyric")
assert isinstance(a.lines[0], WordSyncLyricLine)
assert isinstance(b.lines[0], LyricLine)
assert not isinstance(b.lines[0], WordSyncLyricLine)
def test_word_sync_line_with_empty_tail_keeps_word_tag_only_when_enabled() -> None:
data = LRCData("[00:01.00]<00:02.00>")
assert isinstance(data.lines[0], WordSyncLyricLine)
assert data.to_text(include_word_sync=False) == "[00:01.00]"
assert data.to_text(include_word_sync=True) == "[00:01.00]<00:02.00>"
def test_to_text_plain_true_matches_to_plain_output() -> None:
data = LRCData("[00:02.00]b\n[00:01.00]a")
assert data.to_text(plain=True) == data.to_plain()
def test_to_unsynced_converts_to_plain_based_unsynced_data() -> None:
data = LRCData("[ar:Artist]\n[00:02.00]b\n[00:01.00]a")
unsynced = data.to_unsynced()
assert isinstance(unsynced, LRCData)
assert str(unsynced) == "a\nb"
def test_duplicate_doc_tag_key_last_value_wins_but_lines_are_kept() -> None:
data = LRCData("[ar:First]\n[ar:Second]\n[00:01.00]line")
assert data.tags == {"ar": "Second"}
assert len(data.lines) == 3
assert isinstance(data.lines[0], DocTagLine)
assert isinstance(data.lines[1], DocTagLine)
assert str(data).startswith("[ar:First]\n[ar:Second]\n")
def test_to_plain_for_doc_only_text_is_empty() -> None:
data = LRCData("[ar:Artist]\n[ti:Song]")
assert data.to_plain() == ""
Generated
+1 -1
View File
@@ -153,7 +153,7 @@ wheels = [
[[package]]
name = "lrx-cli"
version = "0.6.1"
version = "0.6.2"
source = { editable = "." }
dependencies = [
{ name = "cyclopts" },