feat: better lrc handling

This commit is contained in:
2026-04-01 17:51:28 +02:00
parent 39be7275fc
commit 0e9bf29ff4
4 changed files with 317 additions and 31 deletions
+81 -29
View File
@@ -21,14 +21,26 @@ _STD_TAG_RE = re.compile(r"\[\d{2,}:\d{2}\.\d{2}\]")
# Standard format with capture groups
_STD_TAG_CAPTURE_RE = re.compile(r"\[(\d{2,}):(\d{2})\.(\d{2})\]")
# Matches a standard time tag at the start of a line
_LRC_LINE_RE = re.compile(r"^\[\d{2,}:\d{2}\.\d{2}\]", re.MULTILINE)
# [offset:+/-xxx] tag — value in milliseconds
_OFFSET_RE = re.compile(r"^\[offset:\s*([+-]?\d+)\]\s*$", re.MULTILINE | re.IGNORECASE)
# Matches any number of tags at the start of a line
_LINE_START_TAGS_RE = re.compile(r"^(?:\[[^\]]*\])+")
# Any number of ID/Time tags at the start of a line
_LINE_START_TAGS_RE = re.compile(r"^(?:\[[^\]]*\])+", re.MULTILINE)
# Any number of standard time tags at the start of a line
_LINE_START_STD_TAGS_RE = re.compile(r"^(?:\[\d{2,}:\d{2}\.\d{2}\])+", re.MULTILINE)
# Word-level sync tags
# <mm:ss>, <mm:ss.c>, <mm:ss.cc>, <mm:ss:cc>, <xx,yy,zz>
_WORD_SYNC_TAG_RE = re.compile(r"<\d{2,}:\d{2}(?:[.:]\d{1,3})?>|<\d+,\d+,\d+>")
# QRC is an entirely different matter. Since they still provide standard LRC APIs,
# it is probably a good idea to leave this mess for the future :)
def _remove_pattern(text: str, pattern: re.Pattern) -> str:
    """Remove every match of ``pattern`` from ``text`` and trim the result.

    Args:
        text: The string to clean.
        pattern: Compiled regular expression whose matches are deleted.

    Returns:
        ``text`` with all matches removed and leading/trailing whitespace
        stripped. Whitespace inside the string is left untouched.
    """
    # Only the outer edges are stripped; callers relying on per-line
    # trimming must handle that themselves.
    return pattern.sub("", text).strip()
def _raw_tag_to_cs(mm: str, ss: str, frac: Optional[str]) -> str:
@@ -50,6 +62,14 @@ def _raw_tag_to_cs(mm: str, ss: str, frac: Optional[str]) -> str:
return f"[{mm}:{ss}.{cs:02d}]"
def _sanitize_lyric_text(text: str) -> str:
    """Strip word-level sync tags from a lyric line.

    Deletes every match of ``_WORD_SYNC_TAG_RE`` (angle-bracket tags such as
    ``<mm:ss.cc>`` or ``<xx,yy,zz>``) and trims leading/trailing whitespace
    from the result.

    Line-level ``[mm:ss.cc]`` tags use square brackets and are not matched by
    that pattern, so ``text`` may still carry them; they are left unchanged.
    """
    return _remove_pattern(text, _WORD_SYNC_TAG_RE)
def _reformat(text: str) -> str:
"""Parse each line and reformat to standard [mm:ss.cc]...content form.
@@ -62,7 +82,7 @@ def _reformat(text: str) -> str:
pos = 0
tags: list[str] = []
while True:
while pos < len(line) and line[pos] == " ":
while pos < len(line) and line[pos].isspace():
pos += 1
m = _RAW_TAG_RE.match(line, pos)
# Non-time tags are passed through as-is, except for leading/trailing whitespace which is stripped.
@@ -72,9 +92,10 @@ def _reformat(text: str) -> str:
tags.append(_raw_tag_to_cs(m.group(1), m.group(2), m.group(3)))
pos = m.end()
if tags:
# This could break lyric lines of some kind of word-synced LRC format,
# This could break lyric lines of some kind of word-synced LRC format, e.g.
# [00:01.00]Lyric [00:02.00]line
# but such formats were never planned to be supported in the first place, so…
out.append("".join(tags) + line[pos:].lstrip())
out.append(_sanitize_lyric_text("".join(tags) + line[pos:]))
else:
out.append(line)
# Empty lines with no tags are also preserved
@@ -117,7 +138,7 @@ def normalize_tags(text: str) -> str:
def is_synced(text: str) -> bool:
"""Check whether text contains non-zero LRC time tags.
Assumes text has been normalized by normalize_tags (standard [mm:ss.cc] format).
Assumes text has been normalized by normalize (standard [mm:ss.cc] format).
"""
tags = _STD_TAG_RE.findall(text)
return bool(tags) and any(tag != "[00:00.00]" for tag in tags)
@@ -126,7 +147,7 @@ def is_synced(text: str) -> bool:
def detect_sync_status(text: str) -> CacheStatus:
"""Determine whether lyrics contain meaningful LRC time tags.
Assumes text has been normalized by normalize_tags.
Assumes text has been normalized by normalize.
"""
return (
CacheStatus.SUCCESS_SYNCED if is_synced(text) else CacheStatus.SUCCESS_UNSYNCED
@@ -136,19 +157,23 @@ def detect_sync_status(text: str) -> CacheStatus:
def normalize_unsynced(lyrics: str) -> str:
"""Normalize unsynced lyrics so every line has a [00:00.00] tag.
Assumes lyrics have been normalized by normalize.
- Lines that already have time tags: replace with [00:00.00]
- Lines without time tags: prepend [00:00.00]
- Blank lines are converted to [00:00.00]
- Lines without leading tags: prepend [00:00.00]
- Blank lines after the first non-blank line are converted to [00:00.00]; leading blank lines are dropped
"""
out: list[str] = []
first = True
for line in lyrics.splitlines():
stripped = line.strip()
if not stripped:
if not stripped and not first:
out.append("[00:00.00]")
continue
cleaned = _LRC_LINE_RE.sub("", stripped)
while _LRC_LINE_RE.match(cleaned):
cleaned = _LRC_LINE_RE.sub("", cleaned)
elif not stripped:
# Skip leading blank lines
continue
first = False
cleaned = _remove_pattern(line, _LINE_START_STD_TAGS_RE)
out.append(f"[00:00.00]{cleaned}")
return "\n".join(out)
@@ -183,25 +208,52 @@ def get_sidecar_path(
def to_plain(
text: str,
deduplicate: bool = False,
) -> str:
"""Convert lyrics to plain text with all tags stripped.
Assumes text has been normalized by normalize_tags.
If deduplicate is True, only keep the first line of consecutive lines with the same lyric text (after stripping tags).
Otherwise, lines with multiple time tags will be duplicated as many times as the number of tags.
Assumes text has been normalized by normalize.
"""
if not is_synced(text):
# If there are no meaningful time tags, just strip all tags and return
return _remove_pattern(text, _LINE_START_TAGS_RE)
lines = []
first = True
for line in text.splitlines():
cleaned = _LINE_START_TAGS_RE.sub("", line).strip()
# Ignore the leading empty lines that are likely caused by tag lines
if not cleaned and not first:
lines.append("")
elif cleaned:
lines.append(cleaned)
first = False
# Remove trailing empty lines that are meaningless
while lines and not lines[-1]:
lines.pop()
pos = 0
cnt = 0
plain_line = ""
while True:
# Only match strictly repeated standard time tags at the start of the line
# Lines without any time tags are ignored.
# Lyric text is assumed to be already stripped of whitespace, so nothing is stripped here.
m = _STD_TAG_RE.match(line, pos)
if not m:
plain_line += line[pos:]
break
pos = m.end()
cnt += 1
# Also avoid duplicating blank lines
if deduplicate or not plain_line:
if cnt > 0:
lines.append(plain_line)
else:
for _ in range(cnt):
lines.append(plain_line)
if deduplicate:
# Remove consecutive duplicates
deduped_lines = []
prev_line = None
for line in lines:
if line != prev_line:
deduped_lines.append(line)
prev_line = line
lines = deduped_lines
return "\n".join(lines)
@@ -211,7 +263,7 @@ def print_lyrics(
) -> None:
"""Print lyrics, optionally stripping tags.
Assumes text has been normalized by normalize_tags.
Assumes text has been normalized by normalize.
"""
if plain:
print(to_plain(text))