# NOTE(review): reconstructed post-image of the lrc.py module-level tag patterns
# carried by this span of the patch; verify against the applied file.

# Matches one standard [mm:ss.cc] time tag
_STD_TAG_RE = re.compile(r"\[\d{2,}:\d{2}\.\d{2}\]")

# Standard format with capture groups: (minutes, seconds, centiseconds)
_STD_TAG_CAPTURE_RE = re.compile(r"\[(\d{2,}):(\d{2})\.(\d{2})\]")

# [offset:+/-xxx] tag — value in milliseconds
_OFFSET_RE = re.compile(r"^\[offset:\s*([+-]?\d+)\]\s*$", re.MULTILINE | re.IGNORECASE)

# Any number of ID/Time tags at the start of a line
_LINE_START_TAGS_RE = re.compile(r"^(?:\[[^\]]*\])+", re.MULTILINE)

# Any number of standard time tags at the start of a line
_LINE_START_STD_TAGS_RE = re.compile(r"^(?:\[\d{2,}:\d{2}\.\d{2}\])+", re.MULTILINE)

# Word-level sync tags: <mm:ss>, <mm:ss.x>, <mm:ss.xx>, <mm:ss.xxx>, <mm:ss:xxx>,
# plus numeric triplet tags of the form <a,b,c>
_WORD_SYNC_TAG_RE = re.compile(r"<\d{2,}:\d{2}(?:[.:]\d{1,3})?>|<\d+,\d+,\d+>")
Since they are still providing standard LRC APIs, +# it might be a good idea to leave this mass to the future :) + + +def _remove_pattern(text: str, pattern: re.Pattern) -> str: + """Remove all occurrences of pattern from text, then strip leading/trailing whitespace.""" + return pattern.sub("", text).strip() def _raw_tag_to_cs(mm: str, ss: str, frac: Optional[str]) -> str: @@ -50,6 +62,14 @@ def _raw_tag_to_cs(mm: str, ss: str, frac: Optional[str]) -> str: return f"[{mm}:{ss}.{cs:02d}]" +def _sanitize_lyric_text(text: str) -> str: + """Remove possibly word-sync time tags in lyric + + Assumes the normal line-sync time tags are already stripped. + """ + return _remove_pattern(text, _WORD_SYNC_TAG_RE) + + def _reformat(text: str) -> str: """Parse each line and reformat to standard [mm:ss.cc]...content form. @@ -62,7 +82,7 @@ def _reformat(text: str) -> str: pos = 0 tags: list[str] = [] while True: - while pos < len(line) and line[pos] == " ": + while pos < len(line) and line[pos].isspace(): pos += 1 m = _RAW_TAG_RE.match(line, pos) # Non-time tags are passed through as-is, except for leading/trailing whitespace which is stripped. @@ -72,9 +92,10 @@ def _reformat(text: str) -> str: tags.append(_raw_tag_to_cs(m.group(1), m.group(2), m.group(3))) pos = m.end() if tags: - # This could break lyric lines of some kind of word-synced LRC format, + # This could break lyric lines of some kind of word-synced LRC format, e.g. + # [00:01.00]Lyric [00:02.00]line # but such format were not planned to be supported in the first place, so… - out.append("".join(tags) + line[pos:].lstrip()) + out.append(_sanitize_lyric_text("".join(tags) + line[pos:])) else: out.append(line) # Empty lines with no tags are also preserved @@ -117,7 +138,7 @@ def normalize_tags(text: str) -> str: def is_synced(text: str) -> bool: """Check whether text contains non-zero LRC time tags. - Assumes text has been normalized by normalize_tags (standard [mm:ss.cc] format). 
# NOTE(review): reconstructed post-image of these lrc.py functions from the
# patch hunks; line breaks inferred — verify against the applied file.

def is_synced(text: str) -> bool:
    """Check whether text contains non-zero LRC time tags.

    Assumes text has been normalized by normalize (standard [mm:ss.cc] format).
    """
    tags = _STD_TAG_RE.findall(text)
    # Tags that are all [00:00.00] carry no real timing information
    return bool(tags) and any(tag != "[00:00.00]" for tag in tags)


def detect_sync_status(text: str) -> CacheStatus:
    """Determine whether lyrics contain meaningful LRC time tags.

    Assumes text has been normalized by normalize.
    """
    return (
        CacheStatus.SUCCESS_SYNCED if is_synced(text) else CacheStatus.SUCCESS_UNSYNCED
    )


def normalize_unsynced(lyrics: str) -> str:
    """Normalize unsynced lyrics so every line has a [00:00.00] tag.

    Assumes lyrics have been normalized by normalize.
    - Lines that already have time tags: replace with [00:00.00]
    - Lines without leading tags: prepend [00:00.00]
    - Blank lines in middle are converted to [00:00.00]
    """
    out: list[str] = []
    first = True  # still before the first non-blank line?
    for line in lyrics.splitlines():
        stripped = line.strip()
        if not stripped and not first:
            # Interior blank line: keep it as an empty tagged line
            out.append("[00:00.00]")
            continue
        elif not stripped:
            # Skip leading blank lines
            continue
        first = False
        # Drop the run of standard [mm:ss.cc] tags at the start of the line, if any
        cleaned = _remove_pattern(line, _LINE_START_STD_TAGS_RE)
        out.append(f"[00:00.00]{cleaned}")
    return "\n".join(out)


def to_plain(
    text: str,
    deduplicate: bool = False,
) -> str:
    """Convert lyrics to plain text with all tags stripped.

    If deduplicate is True, only keep the first line of consecutive lines with the same lyric text (after stripping tags).
    Otherwise, lines with multiple time tags will be duplicated as many times as the number of tags.
    Assumes text has been normalized by normalize.
    """
    if not is_synced(text):
        # If there are no meaningful time tags, just strip all tags and return
        return _remove_pattern(text, _LINE_START_TAGS_RE)

    lines = []
    for line in text.splitlines():
        pos = 0
        cnt = 0  # number of leading standard time tags on this line
        plain_line = ""
        while True:
            # Only match strictly repeated standard time tags at the start of the line
            # Lines without any time tags are ignored.
            # Lyric lines are considered already stripped of whitespaces, so no strips here.
            m = _STD_TAG_RE.match(line, pos)
            if not m:
                plain_line += line[pos:]
                break
            pos = m.end()
            cnt += 1
        # Also avoid duplicating blank lines
        if deduplicate or not plain_line:
            if cnt > 0:
                lines.append(plain_line)
        else:
            # One copy per leading time tag
            for _ in range(cnt):
                lines.append(plain_line)

    if deduplicate:
        # Remove consecutive duplicates
        deduped_lines = []
        prev_line = None
        for line in lines:
            if line != prev_line:
                deduped_lines.append(line)
            prev_line = line
        lines = deduped_lines

    return "\n".join(lines)
# NOTE(review): reconstructed post-image of the new tests/test_lrc.py added by
# this patch; exact literal spacing in one fixture inferred — verify on apply.
from __future__ import annotations

from lrx_cli.lrc import (
    detect_sync_status,
    is_synced,
    normalize_tags,
    normalize_unsynced,
    to_plain,
)
from lrx_cli.models import CacheStatus


def test_normalize_tags_supports_all_raw_time_formats() -> None:
    # Every raw tag flavour collapses to [mm:ss.cc] centiseconds
    raw = "\n".join(
        [
            "[00:01]a",
            "[00:02.3]b",
            "[00:03.45]c",
            "[00:04.678]d",
            "[00:05:999]e",
        ]
    )

    normalized = normalize_tags(raw)

    assert normalized == "\n".join(
        [
            "[00:01.00]a",
            "[00:02.30]b",
            "[00:03.45]c",
            "[00:04.68]d",
            "[00:05.99]e",
        ]
    )


def test_normalize_tags_keeps_non_timed_lines_trimmed_and_unchanged() -> None:
    raw = " plain line \n\n [ar:Meta Header] "

    normalized = normalize_tags(raw)

    assert normalized == "plain line\n\n[ar:Meta Header]"


def test_normalize_tags_removes_word_sync_patterns() -> None:
    # All supported word-level sync tag shapes are stripped from lyric text
    raw = (
        "[00:01.00]<00:01>hello\n"
        "[00:02.00]<00:02.3>world\n"
        "[00:03.00]<00:03.45>foo\n"
        "[00:04.00]<00:04:678>bar\n"
        "[00:05.00]<1,2,3>baz"
    )

    normalized = normalize_tags(raw)

    assert normalized == "\n".join(
        [
            "[00:01.00]hello",
            "[00:02.00]world",
            "[00:03.00]foo",
            "[00:04.00]bar",
            "[00:05.00]baz",
        ]
    )


def test_normalize_tags_keeps_midline_timestamps_as_is() -> None:
    raw = "[00:01.00]Lyric [00:02.00]line"

    normalized = normalize_tags(raw)

    assert normalized == "[00:01.00]Lyric [00:02.00]line"


def test_normalize_tags_applies_positive_and_negative_offset_per_spec() -> None:
    # Per the LRC offset convention, a positive offset shifts tags earlier
    positive = normalize_tags("[offset:+1000]\n[00:10.00]line")
    negative = normalize_tags("[offset:-500]\n[00:10.00]line")

    assert positive == "[00:09.00]line"
    assert negative == "[00:10.50]line"


def test_normalize_tags_accepts_leading_spaces_and_tabs_before_tags() -> None:
    raw = "\t [00:01.2] hello"

    normalized = normalize_tags(raw)

    assert normalized == "[00:01.20]hello"


def test_normalize_tags_handles_consecutive_start_tags_with_spaces_between() -> None:
    raw = "[00:01] [00:02.3] chorus"

    normalized = normalize_tags(raw)

    assert normalized == "[00:01.00][00:02.30]chorus"


def test_normalize_tags_preserves_non_leading_raw_like_tags() -> None:
    raw = "intro [00:01]line"

    normalized = normalize_tags(raw)

    assert normalized == "intro [00:01]line"


def test_normalize_tags_removes_offset_tag_line_even_without_lyrics() -> None:
    raw = "[offset:+500]"

    normalized = normalize_tags(raw)

    assert normalized == ""


def test_is_synced_and_detect_sync_status_follow_non_zero_rule() -> None:
    # Only a non-[00:00.00] tag counts as "synced"
    plain_text = "just some lyrics\nwithout tags"
    unsynced_text = "[00:00.00]a\n[00:00.00]b"
    synced_text = "[00:00.00]a\n[00:01.00]b"

    assert is_synced(plain_text) is False
    assert detect_sync_status(plain_text) is CacheStatus.SUCCESS_UNSYNCED

    assert is_synced(unsynced_text) is False
    assert detect_sync_status(unsynced_text) is CacheStatus.SUCCESS_UNSYNCED

    assert is_synced(synced_text) is True
    assert detect_sync_status(synced_text) is CacheStatus.SUCCESS_SYNCED


def test_normalize_unsynced_covers_documented_blank_and_tag_rules() -> None:
    # Leading blank dropped; interior blank kept; tags replaced with [00:00.00]
    lyrics = "\n[00:12.34]first\nsecond\n\n[00:00.00]third"

    normalized = normalize_unsynced(lyrics)

    assert normalized == "\n".join(
        [
            "[00:00.00]first",
            "[00:00.00]second",
            "[00:00.00]",
            "[00:00.00]third",
        ]
    )


def test_to_plain_duplicates_lines_by_leading_repeated_timestamps() -> None:
    text = "\n".join(
        [
            "[00:01.00][00:02.00]hello",
            "[00:03.00]world",
            "no-tag-line",
            "[00:00.00]zero-only",
        ]
    )

    plain = to_plain(text)

    # In synced mode, lines with standard tags are kept (including [00:00.00]),
    # while lines without leading standard tags are ignored.
    assert plain == "\n".join(["hello", "hello", "world", "zero-only"])


def test_to_plain_deduplicate_collapses_only_consecutive_equals() -> None:
    text = "\n".join(
        [
            "[00:01.00][00:02.00]hello",
            "[00:03.00]hello",
            "[00:04.00]",
            "[00:05.00]",
            "[00:06.00]world",
            "[00:07.00]hello",
        ]
    )

    plain = to_plain(text, deduplicate=True)

    # Non-adjacent repeats (the final "hello") survive deduplication
    assert plain == "\n".join(["hello", "", "world", "hello"])


def test_to_plain_fallback_for_non_synced_text_strips_start_tags() -> None:
    text = "\n".join(["[ar:Artist]", "[00:00.00]only-zero", "plain line"])

    plain = to_plain(text)

    assert plain == "only-zero\nplain line"
[package.dev-dependencies] dev = [ + { name = "pytest" }, { name = "ruff" }, ] @@ -173,7 +183,10 @@ requires-dist = [ ] [package.metadata.requires-dev] -dev = [{ name = "ruff", specifier = ">=0.15.8" }] +dev = [ + { name = "pytest", specifier = ">=9.0.2" }, + { name = "ruff", specifier = ">=0.15.8" }, +] [[package]] name = "markdown-it-py" @@ -205,6 +218,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b0/7a/620f945b96be1f6ee357d211d5bf74ab1b7fe72a9f1525aafbfe3aee6875/mutagen-1.47.0-py3-none-any.whl", hash = "sha256:edd96f50c5907a9539d8e5bba7245f62c9f520aef333d13392a79a4f70aca719", size = 194391, upload-time = "2023-09-03T16:33:29.955Z" }, ] +[[package]] +name = "packaging" +version = "26.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/65/ee/299d360cdc32edc7d2cf530f3accf79c4fca01e96ffc950d8a52213bd8e4/packaging-26.0.tar.gz", hash = "sha256:00243ae351a257117b6a241061796684b084ed1c516a08c48a3f7e147a9d80b4", size = 143416, upload-time = "2026-01-21T20:50:39.064Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b7/b9/c538f279a4e237a006a2c98387d081e9eb060d203d8ed34467cc0f0b9b53/packaging-26.0-py3-none-any.whl", hash = "sha256:b36f1fef9334a5588b4166f8bcd26a14e521f2b55e6b9de3aaa80d3ff7a37529", size = 74366, upload-time = "2026-01-21T20:50:37.788Z" }, +] + [[package]] name = "platformdirs" version = "4.9.4" @@ -214,6 +236,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/63/d7/97f7e3a6abb67d8080dd406fd4df842c2be0efaf712d1c899c32a075027c/platformdirs-4.9.4-py3-none-any.whl", hash = "sha256:68a9a4619a666ea6439f2ff250c12a853cd1cbd5158d258bd824a7df6be2f868", size = 21216, upload-time = "2026-03-05T18:34:12.172Z" }, ] +[[package]] +name = "pluggy" +version = "1.6.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/f9/e2/3e91f31a7d2b083fe6ef3fa267035b518369d9511ffab804f839851d2779/pluggy-1.6.0.tar.gz", hash 
= "sha256:7dcc130b76258d33b90f61b658791dede3486c3e6bfb003ee5c9bfb396dd22f3", size = 69412, upload-time = "2025-05-15T12:30:07.975Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/54/20/4d324d65cc6d9205fabedc306948156824eb9f0ee1633355a8f7ec5c66bf/pluggy-1.6.0-py3-none-any.whl", hash = "sha256:e920276dd6813095e9377c0bc5566d94c932c33b27a3e3945d8389c374dd4746", size = 20538, upload-time = "2025-05-15T12:30:06.134Z" }, +] + [[package]] name = "pygments" version = "2.19.2" @@ -223,6 +254,22 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/c7/21/705964c7812476f378728bdf590ca4b771ec72385c533964653c68e86bdc/pygments-2.19.2-py3-none-any.whl", hash = "sha256:86540386c03d588bb81d44bc3928634ff26449851e99741617ecb9037ee5ec0b", size = 1225217, upload-time = "2025-06-21T13:39:07.939Z" }, ] +[[package]] +name = "pytest" +version = "9.0.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "colorama", marker = "sys_platform == 'win32'" }, + { name = "iniconfig" }, + { name = "packaging" }, + { name = "pluggy" }, + { name = "pygments" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/d1/db/7ef3487e0fb0049ddb5ce41d3a49c235bf9ad299b6a25d5780a89f19230f/pytest-9.0.2.tar.gz", hash = "sha256:75186651a92bd89611d1d9fc20f0b4345fd827c41ccd5c299a868a05d70edf11", size = 1568901, upload-time = "2025-12-06T21:30:51.014Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/3b/ab/b3226f0bd7cdcf710fbede2b3548584366da3b19b5021e74f5bde2a8fa3f/pytest-9.0.2-py3-none-any.whl", hash = "sha256:711ffd45bf766d5264d487b917733b453d917afd2b0ad65223959f59089f875b", size = 374801, upload-time = "2025-12-06T21:30:49.154Z" }, +] + [[package]] name = "python-dotenv" version = "1.2.2"