feat: better lrc handling

2026-04-01 17:51:28 +02:00
parent cd60d3042c
commit 0b830e176d
4 changed files with 317 additions and 31 deletions
@@ -21,14 +21,26 @@ _STD_TAG_RE = re.compile(r"\[\d{2,}:\d{2}\.\d{2}\]")
 # Standard format with capture groups
 _STD_TAG_CAPTURE_RE = re.compile(r"\[(\d{2,}):(\d{2})\.(\d{2})\]")
 # Matches a standard time tag at the start of a line
 _LRC_LINE_RE = re.compile(r"^\[\d{2,}:\d{2}\.\d{2}\]", re.MULTILINE)
 # [offset:+/-xxx] tag — value in milliseconds
 _OFFSET_RE = re.compile(r"^\[offset:\s*([+-]?\d+)\]\s*$", re.MULTILINE | re.IGNORECASE)
-# Matches any number of tags at the start of a line
+# Any number of ID/Time tags at the start of a line
-_LINE_START_TAGS_RE = re.compile(r"^(?:\[[^\]]*\])+")
+_LINE_START_TAGS_RE = re.compile(r"^(?:\[[^\]]*\])+", re.MULTILINE)
 # Any number of standard time tags at the start of a line
 _LINE_START_STD_TAGS_RE = re.compile(r"^(?:\[\d{2,}:\d{2}\.\d{2}\])+", re.MULTILINE)
 # Word-level sync tags
 #   <mm:ss>, <mm:ss.c>, <mm:ss.cc>, <mm:ss:cc>, <xx,yy,zz>
 _WORD_SYNC_TAG_RE = re.compile(r"<\d{2,}:\d{2}(?:[.:]\d{1,3})?>|<\d+,\d+,\d+>")
 # QRC is a completely different matter. Since they still provide standard LRC APIs,
 # it might be a good idea to leave this mess for the future :)
 def _remove_pattern(text: str, pattern: re.Pattern) -> str:
    """Remove all occurrences of pattern from text, then strip leading/trailing whitespace."""
    return pattern.sub("", text).strip()
 def _raw_tag_to_cs(mm: str, ss: str, frac: Optional[str]) -> str:
@@ -50,6 +62,14 @@ def _raw_tag_to_cs(mm: str, ss: str, frac: Optional[str]) -> str:
    return f"[{mm}:{ss}.{cs:02d}]"
 def _sanitize_lyric_text(text: str) -> str:
    """Remove any word-sync time tags from the lyric text.
    Assumes the normal line-sync time tags are already stripped.
    """
    return _remove_pattern(text, _WORD_SYNC_TAG_RE)
 def _reformat(text: str) -> str:
    """Parse each line and reformat to standard [mm:ss.cc]...content form.
@@ -62,7 +82,7 @@ def _reformat(text: str) -> str:
        pos = 0
        tags: list[str] = []
        while True:
-            while pos < len(line) and line[pos] == " ":
+            while pos < len(line) and line[pos].isspace():
                pos += 1
            m = _RAW_TAG_RE.match(line, pos)
            # Non-time tags are passed through as-is, except for leading/trailing whitespace which is stripped.
@@ -72,9 +92,10 @@ def _reformat(text: str) -> str:
            tags.append(_raw_tag_to_cs(m.group(1), m.group(2), m.group(3)))
            pos = m.end()
        if tags:
-            # This could break lyric lines of some kind of word-synced LRC format,
+            # This could break lyric lines of some kind of word-synced LRC format, e.g.
            #   [00:01.00]Lyric [00:02.00]line
            # but such formats were not planned to be supported in the first place, so…
-            out.append("".join(tags) + line[pos:].lstrip())
+            out.append(_sanitize_lyric_text("".join(tags) + line[pos:]))
        else:
            out.append(line)
            # Empty lines with no tags are also preserved
@@ -117,7 +138,7 @@ def normalize_tags(text: str) -> str:
 def is_synced(text: str) -> bool:
    """Check whether text contains non-zero LRC time tags.
-    Assumes text has been normalized by normalize_tags (standard [mm:ss.cc] format).
+    Assumes text has been normalized by normalize (standard [mm:ss.cc] format).
    """
    tags = _STD_TAG_RE.findall(text)
    return bool(tags) and any(tag != "[00:00.00]" for tag in tags)
@@ -126,7 +147,7 @@ def is_synced(text: str) -> bool:
 def detect_sync_status(text: str) -> CacheStatus:
    """Determine whether lyrics contain meaningful LRC time tags.
-    Assumes text has been normalized by normalize_tags.
+    Assumes text has been normalized by normalize.
    """
    return (
        CacheStatus.SUCCESS_SYNCED if is_synced(text) else CacheStatus.SUCCESS_UNSYNCED
@@ -136,19 +157,23 @@ def detect_sync_status(text: str) -> CacheStatus:
 def normalize_unsynced(lyrics: str) -> str:
    """Normalize unsynced lyrics so every line has a [00:00.00] tag.
    Assumes lyrics have been normalized by normalize.
    - Lines that already have time tags: replace with [00:00.00]
-    - Lines without time tags: prepend [00:00.00]
+    - Lines without leading tags: prepend [00:00.00]
-    - Blank lines are converted to [00:00.00]
+    - Blank lines in the middle are converted to [00:00.00]; leading blank lines are skipped
    """
    out: list[str] = []
    first = True
    for line in lyrics.splitlines():
        stripped = line.strip()
-        if not stripped:
+        if not stripped and not first:
            out.append("[00:00.00]")
            continue
-        cleaned = _LRC_LINE_RE.sub("", stripped)
+        elif not stripped:
-        while _LRC_LINE_RE.match(cleaned):
+            # Skip leading blank lines
-            cleaned = _LRC_LINE_RE.sub("", cleaned)
+            continue
        first = False
        cleaned = _remove_pattern(line, _LINE_START_STD_TAGS_RE)
        out.append(f"[00:00.00]{cleaned}")
    return "\n".join(out)
@@ -183,25 +208,52 @@ def get_sidecar_path(
 def to_plain(
    text: str,
    deduplicate: bool = False,
 ) -> str:
    """Convert lyrics to plain text with all tags stripped.
-    Assumes text has been normalized by normalize_tags.
+    If deduplicate is True, only keep the first line of consecutive lines with the same lyric text (after stripping tags).
    Otherwise, lines with multiple time tags will be duplicated as many times as the number of tags.
    Assumes text has been normalized by normalize.
    """
    if not is_synced(text):
        # If there are no meaningful time tags, just strip all tags and return
        return _remove_pattern(text, _LINE_START_TAGS_RE)
    lines = []
    first = True
    for line in text.splitlines():
-        cleaned = _LINE_START_TAGS_RE.sub("", line).strip()
+        pos = 0
-        # Ignore the leading empty lines that is likely caused by tag lines
+        cnt = 0
-        if not cleaned and not first:
+        plain_line = ""
-            lines.append("")
+        while True:
-        elif cleaned:
+            # Only match strictly repeated standard time tags at the start of the line
-            lines.append(cleaned)
+            # Lines without any time tags are ignored.
-            first = False
+            # Lyric lines are assumed to be already stripped of whitespace, so no stripping is done here.
-    # Remove trailing empty lines that are meaningless
+            m = _STD_TAG_RE.match(line, pos)
-    while lines and not lines[-1]:
+            if not m:
-        lines.pop()
+                plain_line += line[pos:]
                break
            pos = m.end()
            cnt += 1
        # Also avoid duplicating blank lines
        if deduplicate or not plain_line:
            if cnt > 0:
                lines.append(plain_line)
        else:
            for _ in range(cnt):
                lines.append(plain_line)
    if deduplicate:
        # Remove consecutive duplicates
        deduped_lines = []
        prev_line = None
        for line in lines:
            if line != prev_line:
                deduped_lines.append(line)
            prev_line = line
        lines = deduped_lines
    return "\n".join(lines)
@@ -211,7 +263,7 @@ def print_lyrics(
 ) -> None:
    """Print lyrics, optionally stripping tags.
-    Assumes text has been normalized by normalize_tags.
+    Assumes text has been normalized by normalize.
    """
    if plain:
        print(to_plain(text))
@@ -25,4 +25,7 @@ lrx = "lrx_cli.cli:run"
 ignore = ["E402"]
 [dependency-groups]
-dev = ["ruff>=0.15.8"]
+dev = [
    "pytest>=9.0.2",
    "ruff>=0.15.8",
 ]
@@ -0,0 +1,184 @@
 from __future__ import annotations
 from lrx_cli.lrc import (
    detect_sync_status,
    is_synced,
    normalize_tags,
    normalize_unsynced,
    to_plain,
 )
 from lrx_cli.models import CacheStatus
 def test_normalize_tags_supports_all_raw_time_formats() -> None:
    raw = "\n".join(
        [
            "[00:01]a",
            "[00:02.3]b",
            "[00:03.45]c",
            "[00:04.678]d",
            "[00:05:999]e",
        ]
    )
    normalized = normalize_tags(raw)
    assert normalized == "\n".join(
        [
            "[00:01.00]a",
            "[00:02.30]b",
            "[00:03.45]c",
            "[00:04.68]d",
            "[00:05.99]e",
        ]
    )
 def test_normalize_tags_keeps_non_timed_lines_trimmed_and_unchanged() -> None:
    raw = "  plain line  \n\n  [ar:Meta Header]  "
    normalized = normalize_tags(raw)
    assert normalized == "plain line\n\n[ar:Meta Header]"
 def test_normalize_tags_removes_word_sync_patterns() -> None:
    raw = (
        "[00:01.00]<00:01>hello\n"
        "[00:02.00]<00:02.3>world\n"
        "[00:03.00]<00:03.45>foo\n"
        "[00:04.00]<00:04:678>bar\n"
        "[00:05.00]<1,2,3>baz"
    )
    normalized = normalize_tags(raw)
    assert normalized == "\n".join(
        [
            "[00:01.00]hello",
            "[00:02.00]world",
            "[00:03.00]foo",
            "[00:04.00]bar",
            "[00:05.00]baz",
        ]
    )
 def test_normalize_tags_keeps_midline_timestamps_as_is() -> None:
    raw = "[00:01.00]Lyric [00:02.00]line"
    normalized = normalize_tags(raw)
    assert normalized == "[00:01.00]Lyric [00:02.00]line"
 def test_normalize_tags_applies_positive_and_negative_offset_per_spec() -> None:
    positive = normalize_tags("[offset:+1000]\n[00:10.00]line")
    negative = normalize_tags("[offset:-500]\n[00:10.00]line")
    assert positive == "[00:09.00]line"
    assert negative == "[00:10.50]line"
 def test_normalize_tags_accepts_leading_spaces_and_tabs_before_tags() -> None:
    raw = "\t   [00:01.2] hello"
    normalized = normalize_tags(raw)
    assert normalized == "[00:01.20]hello"
 def test_normalize_tags_handles_consecutive_start_tags_with_spaces_between() -> None:
    raw = "[00:01]   [00:02.3]    chorus"
    normalized = normalize_tags(raw)
    assert normalized == "[00:01.00][00:02.30]chorus"
 def test_normalize_tags_preserves_non_leading_raw_like_tags() -> None:
    raw = "intro [00:01]line"
    normalized = normalize_tags(raw)
    assert normalized == "intro [00:01]line"
 def test_normalize_tags_removes_offset_tag_line_even_without_lyrics() -> None:
    raw = "[offset:+500]"
    normalized = normalize_tags(raw)
    assert normalized == ""
 def test_is_synced_and_detect_sync_status_follow_non_zero_rule() -> None:
    plain_text = "just some lyrics\nwithout tags"
    unsynced_text = "[00:00.00]a\n[00:00.00]b"
    synced_text = "[00:00.00]a\n[00:01.00]b"
    assert is_synced(plain_text) is False
    assert detect_sync_status(plain_text) is CacheStatus.SUCCESS_UNSYNCED
    assert is_synced(unsynced_text) is False
    assert detect_sync_status(unsynced_text) is CacheStatus.SUCCESS_UNSYNCED
    assert is_synced(synced_text) is True
    assert detect_sync_status(synced_text) is CacheStatus.SUCCESS_SYNCED
 def test_normalize_unsynced_covers_documented_blank_and_tag_rules() -> None:
    lyrics = "\n[00:12.34]first\nsecond\n\n[00:00.00]third"
    normalized = normalize_unsynced(lyrics)
    assert normalized == "\n".join(
        [
            "[00:00.00]first",
            "[00:00.00]second",
            "[00:00.00]",
            "[00:00.00]third",
        ]
    )
 def test_to_plain_duplicates_lines_by_leading_repeated_timestamps() -> None:
    text = "\n".join(
        [
            "[00:01.00][00:02.00]hello",
            "[00:03.00]world",
            "no-tag-line",
            "[00:00.00]zero-only",
        ]
    )
    plain = to_plain(text)
    # In synced mode, lines with standard tags are kept (including [00:00.00]),
    # while lines without leading standard tags are ignored.
    assert plain == "\n".join(["hello", "hello", "world", "zero-only"])
 def test_to_plain_deduplicate_collapses_only_consecutive_equals() -> None:
    text = "\n".join(
        [
            "[00:01.00][00:02.00]hello",
            "[00:03.00]hello",
            "[00:04.00]",
            "[00:05.00]",
            "[00:06.00]world",
            "[00:07.00]hello",
        ]
    )
    plain = to_plain(text, deduplicate=True)
    assert plain == "\n".join(["hello", "", "world", "hello"])
 def test_to_plain_fallback_for_non_synced_text_strips_start_tags() -> None:
    text = "\n".join(["[ar:Artist]", "[00:00.00]only-zero", "plain line"])
    plain = to_plain(text)
    assert plain == "only-zero\nplain line"
@@ -129,6 +129,15 @@ wheels = [
    { url = "https://files.pythonhosted.org/packages/0e/61/66938bbb5fc52dbdf84594873d5b51fb1f7c7794e9c0f5bd885f30bc507b/idna-3.11-py3-none-any.whl", hash = "sha256:771a87f49d9defaf64091e6e6fe9c18d4833f140bd19464795bc32d966ca37ea", size = 71008, upload-time = "2025-10-12T14:55:18.883Z" },
 ]
 [[package]]
 name = "iniconfig"
 version = "2.3.0"
 source = { registry = "https://pypi.org/simple" }
 sdist = { url = "https://files.pythonhosted.org/packages/72/34/14ca021ce8e5dfedc35312d08ba8bf51fdd999c576889fc2c24cb97f4f10/iniconfig-2.3.0.tar.gz", hash = "sha256:c76315c77db068650d49c5b56314774a7804df16fee4402c1f19d6d15d8c4730", size = 20503, upload-time = "2025-10-18T21:55:43.219Z" }
 wheels = [
    { url = "https://files.pythonhosted.org/packages/cb/b1/3846dd7f199d53cb17f49cba7e651e9ce294d8497c8c150530ed11865bb8/iniconfig-2.3.0-py3-none-any.whl", hash = "sha256:f631c04d2c48c52b84d0d0549c99ff3859c98df65b3101406327ecc7d53fbf12", size = 7484, upload-time = "2025-10-18T21:55:41.639Z" },
 ]
 [[package]]
 name = "loguru"
 version = "0.7.3"
@@ -158,6 +167,7 @@ dependencies = [
 [package.dev-dependencies]
 dev = [
    { name = "pytest" },
    { name = "ruff" },
 ]
@@ -173,7 +183,10 @@ requires-dist = [
 ]
 [package.metadata.requires-dev]
-dev = [{ name = "ruff", specifier = ">=0.15.8" }]
+dev = [
    { name = "pytest", specifier = ">=9.0.2" },
    { name = "ruff", specifier = ">=0.15.8" },
 ]
 [[package]]
 name = "markdown-it-py"
@@ -205,6 +218,15 @@ wheels = [
    { url = "https://files.pythonhosted.org/packages/b0/7a/620f945b96be1f6ee357d211d5bf74ab1b7fe72a9f1525aafbfe3aee6875/mutagen-1.47.0-py3-none-any.whl", hash = "sha256:edd96f50c5907a9539d8e5bba7245f62c9f520aef333d13392a79a4f70aca719", size = 194391, upload-time = "2023-09-03T16:33:29.955Z" },
 ]
 [[package]]
 name = "packaging"
 version = "26.0"
 source = { registry = "https://pypi.org/simple" }
 sdist = { url = "https://files.pythonhosted.org/packages/65/ee/299d360cdc32edc7d2cf530f3accf79c4fca01e96ffc950d8a52213bd8e4/packaging-26.0.tar.gz", hash = "sha256:00243ae351a257117b6a241061796684b084ed1c516a08c48a3f7e147a9d80b4", size = 143416, upload-time = "2026-01-21T20:50:39.064Z" }
 wheels = [
    { url = "https://files.pythonhosted.org/packages/b7/b9/c538f279a4e237a006a2c98387d081e9eb060d203d8ed34467cc0f0b9b53/packaging-26.0-py3-none-any.whl", hash = "sha256:b36f1fef9334a5588b4166f8bcd26a14e521f2b55e6b9de3aaa80d3ff7a37529", size = 74366, upload-time = "2026-01-21T20:50:37.788Z" },
 ]
 [[package]]
 name = "platformdirs"
 version = "4.9.4"
@@ -214,6 +236,15 @@ wheels = [
    { url = "https://files.pythonhosted.org/packages/63/d7/97f7e3a6abb67d8080dd406fd4df842c2be0efaf712d1c899c32a075027c/platformdirs-4.9.4-py3-none-any.whl", hash = "sha256:68a9a4619a666ea6439f2ff250c12a853cd1cbd5158d258bd824a7df6be2f868", size = 21216, upload-time = "2026-03-05T18:34:12.172Z" },
 ]
 [[package]]
 name = "pluggy"
 version = "1.6.0"
 source = { registry = "https://pypi.org/simple" }
 sdist = { url = "https://files.pythonhosted.org/packages/f9/e2/3e91f31a7d2b083fe6ef3fa267035b518369d9511ffab804f839851d2779/pluggy-1.6.0.tar.gz", hash = "sha256:7dcc130b76258d33b90f61b658791dede3486c3e6bfb003ee5c9bfb396dd22f3", size = 69412, upload-time = "2025-05-15T12:30:07.975Z" }
 wheels = [
    { url = "https://files.pythonhosted.org/packages/54/20/4d324d65cc6d9205fabedc306948156824eb9f0ee1633355a8f7ec5c66bf/pluggy-1.6.0-py3-none-any.whl", hash = "sha256:e920276dd6813095e9377c0bc5566d94c932c33b27a3e3945d8389c374dd4746", size = 20538, upload-time = "2025-05-15T12:30:06.134Z" },
 ]
 [[package]]
 name = "pygments"
 version = "2.19.2"
@@ -223,6 +254,22 @@ wheels = [
    { url = "https://files.pythonhosted.org/packages/c7/21/705964c7812476f378728bdf590ca4b771ec72385c533964653c68e86bdc/pygments-2.19.2-py3-none-any.whl", hash = "sha256:86540386c03d588bb81d44bc3928634ff26449851e99741617ecb9037ee5ec0b", size = 1225217, upload-time = "2025-06-21T13:39:07.939Z" },
 ]
 [[package]]
 name = "pytest"
 version = "9.0.2"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
    { name = "colorama", marker = "sys_platform == 'win32'" },
    { name = "iniconfig" },
    { name = "packaging" },
    { name = "pluggy" },
    { name = "pygments" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/d1/db/7ef3487e0fb0049ddb5ce41d3a49c235bf9ad299b6a25d5780a89f19230f/pytest-9.0.2.tar.gz", hash = "sha256:75186651a92bd89611d1d9fc20f0b4345fd827c41ccd5c299a868a05d70edf11", size = 1568901, upload-time = "2025-12-06T21:30:51.014Z" }
 wheels = [
    { url = "https://files.pythonhosted.org/packages/3b/ab/b3226f0bd7cdcf710fbede2b3548584366da3b19b5021e74f5bde2a8fa3f/pytest-9.0.2-py3-none-any.whl", hash = "sha256:711ffd45bf766d5264d487b917733b453d917afd2b0ad65223959f59089f875b", size = 374801, upload-time = "2025-12-06T21:30:49.154Z" },
 ]
 [[package]]
 name = "python-dotenv"
 version = "1.2.2"