# NOTE(review): reconstructed post-image of the lrc.py module-level tag patterns
# carried by this span of the patch; verify against the applied file.

# Matches one standard [mm:ss.cc] time tag
_STD_TAG_RE = re.compile(r"\[\d{2,}:\d{2}\.\d{2}\]")

# Standard format with capture groups: (minutes, seconds, centiseconds)
_STD_TAG_CAPTURE_RE = re.compile(r"\[(\d{2,}):(\d{2})\.(\d{2})\]")

# [offset:+/-xxx] tag — value in milliseconds
_OFFSET_RE = re.compile(r"^\[offset:\s*([+-]?\d+)\]\s*$", re.MULTILINE | re.IGNORECASE)

# Any number of ID/Time tags at the start of a line
_LINE_START_TAGS_RE = re.compile(r"^(?:\[[^\]]*\])+", re.MULTILINE)

# Any number of standard time tags at the start of a line
_LINE_START_STD_TAGS_RE = re.compile(r"^(?:\[\d{2,}:\d{2}\.\d{2}\])+", re.MULTILINE)

# Word-level sync tags: <mm:ss>, <mm:ss.x>, <mm:ss.xx>, <mm:ss.xxx>, <mm:ss:xxx>,
# plus numeric triplet tags of the form <a,b,c>
_WORD_SYNC_TAG_RE = re.compile(r"<\d{2,}:\d{2}(?:[.:]\d{1,3})?>|<\d+,\d+,\d+>")
Since they are still providing standard LRC APIs, +# it might be a good idea to leave this mass to the future :) + + +def _remove_pattern(text: str, pattern: re.Pattern) -> str: + """Remove all occurrences of pattern from text, then strip leading/trailing whitespace.""" + return pattern.sub("", text).strip() def _raw_tag_to_cs(mm: str, ss: str, frac: Optional[str]) -> str: @@ -50,6 +62,14 @@ def _raw_tag_to_cs(mm: str, ss: str, frac: Optional[str]) -> str: return f"[{mm}:{ss}.{cs:02d}]" +def _sanitize_lyric_text(text: str) -> str: + """Remove possibly word-sync time tags in lyric + + Assumes the normal line-sync time tags are already stripped. + """ + return _remove_pattern(text, _WORD_SYNC_TAG_RE) + + def _reformat(text: str) -> str: """Parse each line and reformat to standard [mm:ss.cc]...content form. @@ -62,7 +82,7 @@ def _reformat(text: str) -> str: pos = 0 tags: list[str] = [] while True: - while pos < len(line) and line[pos] == " ": + while pos < len(line) and line[pos].isspace(): pos += 1 m = _RAW_TAG_RE.match(line, pos) # Non-time tags are passed through as-is, except for leading/trailing whitespace which is stripped. @@ -72,9 +92,10 @@ def _reformat(text: str) -> str: tags.append(_raw_tag_to_cs(m.group(1), m.group(2), m.group(3))) pos = m.end() if tags: - # This could break lyric lines of some kind of word-synced LRC format, + # This could break lyric lines of some kind of word-synced LRC format, e.g. + # [00:01.00]Lyric [00:02.00]line # but such format were not planned to be supported in the first place, so… - out.append("".join(tags) + line[pos:].lstrip()) + out.append(_sanitize_lyric_text("".join(tags) + line[pos:])) else: out.append(line) # Empty lines with no tags are also preserved @@ -117,7 +138,7 @@ def normalize_tags(text: str) -> str: def is_synced(text: str) -> bool: """Check whether text contains non-zero LRC time tags. - Assumes text has been normalized by normalize_tags (standard [mm:ss.cc] format). 
# NOTE(review): reconstructed post-image of these lrc.py functions from the
# patch hunks; line breaks inferred — verify against the applied file.

def is_synced(text: str) -> bool:
    """Check whether text contains non-zero LRC time tags.

    Assumes text has been normalized by normalize (standard [mm:ss.cc] format).
    """
    tags = _STD_TAG_RE.findall(text)
    # Tags that are all [00:00.00] carry no real timing information
    return bool(tags) and any(tag != "[00:00.00]" for tag in tags)


def detect_sync_status(text: str) -> CacheStatus:
    """Determine whether lyrics contain meaningful LRC time tags.

    Assumes text has been normalized by normalize.
    """
    return (
        CacheStatus.SUCCESS_SYNCED if is_synced(text) else CacheStatus.SUCCESS_UNSYNCED
    )


def normalize_unsynced(lyrics: str) -> str:
    """Normalize unsynced lyrics so every line has a [00:00.00] tag.

    Assumes lyrics have been normalized by normalize.
    - Lines that already have time tags: replace with [00:00.00]
    - Lines without leading tags: prepend [00:00.00]
    - Blank lines in middle are converted to [00:00.00]
    """
    out: list[str] = []
    first = True  # still before the first non-blank line?
    for line in lyrics.splitlines():
        stripped = line.strip()
        if not stripped and not first:
            # Interior blank line: keep it as an empty tagged line
            out.append("[00:00.00]")
            continue
        elif not stripped:
            # Skip leading blank lines
            continue
        first = False
        # Drop the run of standard [mm:ss.cc] tags at the start of the line, if any
        cleaned = _remove_pattern(line, _LINE_START_STD_TAGS_RE)
        out.append(f"[00:00.00]{cleaned}")
    return "\n".join(out)


def to_plain(
    text: str,
    deduplicate: bool = False,
) -> str:
    """Convert lyrics to plain text with all tags stripped.

    If deduplicate is True, only keep the first line of consecutive lines with the same lyric text (after stripping tags).
    Otherwise, lines with multiple time tags will be duplicated as many times as the number of tags.
    Assumes text has been normalized by normalize.
    """
    if not is_synced(text):
        # If there are no meaningful time tags, just strip all tags and return
        return _remove_pattern(text, _LINE_START_TAGS_RE)

    lines = []
    for line in text.splitlines():
        pos = 0
        cnt = 0  # number of leading standard time tags on this line
        plain_line = ""
        while True:
            # Only match strictly repeated standard time tags at the start of the line
            # Lines without any time tags are ignored.
            # Lyric lines are considered already stripped of whitespaces, so no strips here.
            m = _STD_TAG_RE.match(line, pos)
            if not m:
                plain_line += line[pos:]
                break
            pos = m.end()
            cnt += 1
        # Also avoid duplicating blank lines
        if deduplicate or not plain_line:
            if cnt > 0:
                lines.append(plain_line)
        else:
            # One copy per leading time tag
            for _ in range(cnt):
                lines.append(plain_line)

    if deduplicate:
        # Remove consecutive duplicates
        deduped_lines = []
        prev_line = None
        for line in lines:
            if line != prev_line:
                deduped_lines.append(line)
            prev_line = line
        lines = deduped_lines

    return "\n".join(lines)
# NOTE(review): reconstructed post-image of the new tests/test_lrc.py added by
# this patch; exact literal spacing in one fixture inferred — verify on apply.
from __future__ import annotations

from lrx_cli.lrc import (
    detect_sync_status,
    is_synced,
    normalize_tags,
    normalize_unsynced,
    to_plain,
)
from lrx_cli.models import CacheStatus


def test_normalize_tags_supports_all_raw_time_formats() -> None:
    # Every raw tag flavour collapses to [mm:ss.cc] centiseconds
    raw = "\n".join(
        [
            "[00:01]a",
            "[00:02.3]b",
            "[00:03.45]c",
            "[00:04.678]d",
            "[00:05:999]e",
        ]
    )

    normalized = normalize_tags(raw)

    assert normalized == "\n".join(
        [
            "[00:01.00]a",
            "[00:02.30]b",
            "[00:03.45]c",
            "[00:04.68]d",
            "[00:05.99]e",
        ]
    )


def test_normalize_tags_keeps_non_timed_lines_trimmed_and_unchanged() -> None:
    raw = " plain line \n\n [ar:Meta Header] "

    normalized = normalize_tags(raw)

    assert normalized == "plain line\n\n[ar:Meta Header]"


def test_normalize_tags_removes_word_sync_patterns() -> None:
    # All supported word-level sync tag shapes are stripped from lyric text
    raw = (
        "[00:01.00]<00:01>hello\n"
        "[00:02.00]<00:02.3>world\n"
        "[00:03.00]<00:03.45>foo\n"
        "[00:04.00]<00:04:678>bar\n"
        "[00:05.00]<1,2,3>baz"
    )

    normalized = normalize_tags(raw)

    assert normalized == "\n".join(
        [
            "[00:01.00]hello",
            "[00:02.00]world",
            "[00:03.00]foo",
            "[00:04.00]bar",
            "[00:05.00]baz",
        ]
    )


def test_normalize_tags_keeps_midline_timestamps_as_is() -> None:
    raw = "[00:01.00]Lyric [00:02.00]line"

    normalized = normalize_tags(raw)

    assert normalized == "[00:01.00]Lyric [00:02.00]line"


def test_normalize_tags_applies_positive_and_negative_offset_per_spec() -> None:
    # Per the LRC offset convention, a positive offset shifts tags earlier
    positive = normalize_tags("[offset:+1000]\n[00:10.00]line")
    negative = normalize_tags("[offset:-500]\n[00:10.00]line")

    assert positive == "[00:09.00]line"
    assert negative == "[00:10.50]line"


def test_normalize_tags_accepts_leading_spaces_and_tabs_before_tags() -> None:
    raw = "\t [00:01.2] hello"

    normalized = normalize_tags(raw)

    assert normalized == "[00:01.20]hello"


def test_normalize_tags_handles_consecutive_start_tags_with_spaces_between() -> None:
    raw = "[00:01] [00:02.3] chorus"

    normalized = normalize_tags(raw)

    assert normalized == "[00:01.00][00:02.30]chorus"


def test_normalize_tags_preserves_non_leading_raw_like_tags() -> None:
    raw = "intro [00:01]line"

    normalized = normalize_tags(raw)

    assert normalized == "intro [00:01]line"


def test_normalize_tags_removes_offset_tag_line_even_without_lyrics() -> None:
    raw = "[offset:+500]"

    normalized = normalize_tags(raw)

    assert normalized == ""


def test_is_synced_and_detect_sync_status_follow_non_zero_rule() -> None:
    # Only a non-[00:00.00] tag counts as "synced"
    plain_text = "just some lyrics\nwithout tags"
    unsynced_text = "[00:00.00]a\n[00:00.00]b"
    synced_text = "[00:00.00]a\n[00:01.00]b"

    assert is_synced(plain_text) is False
    assert detect_sync_status(plain_text) is CacheStatus.SUCCESS_UNSYNCED

    assert is_synced(unsynced_text) is False
    assert detect_sync_status(unsynced_text) is CacheStatus.SUCCESS_UNSYNCED

    assert is_synced(synced_text) is True
    assert detect_sync_status(synced_text) is CacheStatus.SUCCESS_SYNCED


def test_normalize_unsynced_covers_documented_blank_and_tag_rules() -> None:
    # Leading blank dropped; interior blank kept; tags replaced with [00:00.00]
    lyrics = "\n[00:12.34]first\nsecond\n\n[00:00.00]third"

    normalized = normalize_unsynced(lyrics)

    assert normalized == "\n".join(
        [
            "[00:00.00]first",
            "[00:00.00]second",
            "[00:00.00]",
            "[00:00.00]third",
        ]
    )


def test_to_plain_duplicates_lines_by_leading_repeated_timestamps() -> None:
    text = "\n".join(
        [
            "[00:01.00][00:02.00]hello",
            "[00:03.00]world",
            "no-tag-line",
            "[00:00.00]zero-only",
        ]
    )

    plain = to_plain(text)

    # In synced mode, lines with standard tags are kept (including [00:00.00]),
    # while lines without leading standard tags are ignored.
    assert plain == "\n".join(["hello", "hello", "world", "zero-only"])


def test_to_plain_deduplicate_collapses_only_consecutive_equals() -> None:
    text = "\n".join(
        [
            "[00:01.00][00:02.00]hello",
            "[00:03.00]hello",
            "[00:04.00]",
            "[00:05.00]",
            "[00:06.00]world",
            "[00:07.00]hello",
        ]
    )

    plain = to_plain(text, deduplicate=True)

    # Non-adjacent repeats (the final "hello") survive deduplication
    assert plain == "\n".join(["hello", "", "world", "hello"])


def test_to_plain_fallback_for_non_synced_text_strips_start_tags() -> None:
    text = "\n".join(["[ar:Artist]", "[00:00.00]only-zero", "plain line"])

    plain = to_plain(text)

    assert plain == "only-zero\nplain line"
[package.dev-dependencies] dev = [ + { name = "pytest" }, { name = "ruff" }, ] @@ -173,7 +183,10 @@ requires-dist = [ ] [package.metadata.requires-dev] -dev = [{ name = "ruff", specifier = ">=0.15.8" }] +dev = [ + { name = "pytest", specifier = ">=9.0.2" }, + { name = "ruff", specifier = ">=0.15.8" }, +] [[package]] name = "markdown-it-py" @@ -205,6 +218,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b0/7a/620f945b96be1f6ee357d211d5bf74ab1b7fe72a9f1525aafbfe3aee6875/mutagen-1.47.0-py3-none-any.whl", hash = "sha256:edd96f50c5907a9539d8e5bba7245f62c9f520aef333d13392a79a4f70aca719", size = 194391, upload-time = "2023-09-03T16:33:29.955Z" }, ] +[[package]] +name = "packaging" +version = "26.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/65/ee/299d360cdc32edc7d2cf530f3accf79c4fca01e96ffc950d8a52213bd8e4/packaging-26.0.tar.gz", hash = "sha256:00243ae351a257117b6a241061796684b084ed1c516a08c48a3f7e147a9d80b4", size = 143416, upload-time = "2026-01-21T20:50:39.064Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b7/b9/c538f279a4e237a006a2c98387d081e9eb060d203d8ed34467cc0f0b9b53/packaging-26.0-py3-none-any.whl", hash = "sha256:b36f1fef9334a5588b4166f8bcd26a14e521f2b55e6b9de3aaa80d3ff7a37529", size = 74366, upload-time = "2026-01-21T20:50:37.788Z" }, +] + [[package]] name = "platformdirs" version = "4.9.4" @@ -214,6 +236,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/63/d7/97f7e3a6abb67d8080dd406fd4df842c2be0efaf712d1c899c32a075027c/platformdirs-4.9.4-py3-none-any.whl", hash = "sha256:68a9a4619a666ea6439f2ff250c12a853cd1cbd5158d258bd824a7df6be2f868", size = 21216, upload-time = "2026-03-05T18:34:12.172Z" }, ] +[[package]] +name = "pluggy" +version = "1.6.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/f9/e2/3e91f31a7d2b083fe6ef3fa267035b518369d9511ffab804f839851d2779/pluggy-1.6.0.tar.gz", hash 
= "sha256:7dcc130b76258d33b90f61b658791dede3486c3e6bfb003ee5c9bfb396dd22f3", size = 69412, upload-time = "2025-05-15T12:30:07.975Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/54/20/4d324d65cc6d9205fabedc306948156824eb9f0ee1633355a8f7ec5c66bf/pluggy-1.6.0-py3-none-any.whl", hash = "sha256:e920276dd6813095e9377c0bc5566d94c932c33b27a3e3945d8389c374dd4746", size = 20538, upload-time = "2025-05-15T12:30:06.134Z" }, +] + [[package]] name = "pygments" version = "2.19.2" @@ -223,6 +254,22 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/c7/21/705964c7812476f378728bdf590ca4b771ec72385c533964653c68e86bdc/pygments-2.19.2-py3-none-any.whl", hash = "sha256:86540386c03d588bb81d44bc3928634ff26449851e99741617ecb9037ee5ec0b", size = 1225217, upload-time = "2025-06-21T13:39:07.939Z" }, ] +[[package]] +name = "pytest" +version = "9.0.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "colorama", marker = "sys_platform == 'win32'" }, + { name = "iniconfig" }, + { name = "packaging" }, + { name = "pluggy" }, + { name = "pygments" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/d1/db/7ef3487e0fb0049ddb5ce41d3a49c235bf9ad299b6a25d5780a89f19230f/pytest-9.0.2.tar.gz", hash = "sha256:75186651a92bd89611d1d9fc20f0b4345fd827c41ccd5c299a868a05d70edf11", size = 1568901, upload-time = "2025-12-06T21:30:51.014Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/3b/ab/b3226f0bd7cdcf710fbede2b3548584366da3b19b5021e74f5bde2a8fa3f/pytest-9.0.2-py3-none-any.whl", hash = "sha256:711ffd45bf766d5264d487b917733b453d917afd2b0ad65223959f59089f875b", size = 374801, upload-time = "2025-12-06T21:30:49.154Z" }, +] + [[package]] name = "python-dotenv" version = "1.2.2"