chore: switch to src layout
This commit is contained in:
@@ -0,0 +1,315 @@
|
||||
"""
|
||||
Author: Uyanide pywang0608@foxmail.com
|
||||
Date: 2026-03-25 21:54:01
|
||||
Description: Shared LRC time-tag utilities (definitely overengineered).
|
||||
"""
|
||||
|
||||
import re
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
from urllib.parse import unquote
|
||||
|
||||
from .models import CacheStatus
|
||||
|
||||
# Parses any time tag input format:
|
||||
# [mm:ss], [mm:ss.c], [mm:ss.cc], [mm:ss.ccc], [mm:ss:cc], …
|
||||
_RAW_TAG_RE = re.compile(r"\[(\d{2,}):(\d{2})(?:[.:](\d{1,3}))?\]")
|
||||
|
||||
# Standard format after normalization: [mm:ss.cc]
|
||||
# _STD_TAG_RE = re.compile(r"\[\d{2,}:\d{2}\.\d{2}\]")
|
||||
|
||||
# Standard format with capture groups
|
||||
_STD_TAG_CAPTURE_RE = re.compile(r"\[(\d{2,}):(\d{2})\.(\d{2})\]")
|
||||
|
||||
# [offset:+/-xxx] tag — value in milliseconds
|
||||
_OFFSET_RE = re.compile(r"^\[offset:\s*([+-]?\d+)\]\s*$", re.MULTILINE | re.IGNORECASE)
|
||||
|
||||
# Any number of ID/Time tags at the start of a line
|
||||
_LINE_START_TAGS_RE = re.compile(r"^(?:\[[^\]]*\])+", re.MULTILINE)
|
||||
|
||||
# Any number of standard time tags at the start of a line
|
||||
_LINE_START_STD_TAGS_RE = re.compile(r"^(?:\[\d{2,}:\d{2}\.\d{2}\])+", re.MULTILINE)
|
||||
|
||||
# Word-level sync tags
|
||||
# <mm:ss>, <mm:ss.c>, <mm:ss.cc>, <mm:ss:cc>, <xx,yy,zz>
|
||||
_WORD_SYNC_TAG_RE = re.compile(r"<\d{2,}:\d{2}(?:[.:]\d{1,3})?>|<\d+,\d+,\d+>")
|
||||
|
||||
# QRC is totally a completely different matter. Since they are still providing standard LRC APIs,
|
||||
# it might be a good idea to leave this mass to the future :)
|
||||
|
||||
|
||||
def _remove_pattern(text: str, pattern: re.Pattern) -> str:
|
||||
"""Remove all occurrences of pattern from text, then strip leading/trailing whitespace."""
|
||||
return pattern.sub("", text).strip()
|
||||
|
||||
|
||||
def _raw_tag_to_ms(mm: str, ss: str, frac: Optional[str]) -> int:
|
||||
"""Convert parsed time tag components to total milliseconds."""
|
||||
if frac is None:
|
||||
ms = 0
|
||||
else:
|
||||
n = len(frac)
|
||||
if n == 1:
|
||||
ms = int(frac) * 100
|
||||
elif n == 2:
|
||||
ms = int(frac) * 10
|
||||
else:
|
||||
ms = int(frac)
|
||||
return (int(mm) * 60 + int(ss)) * 1000 + ms
|
||||
|
||||
|
||||
def _raw_tag_to_cs(mm: str, ss: str, frac: Optional[str]) -> str:
|
||||
"""Convert parsed time tag components to standard [mm:ss.cc] string."""
|
||||
if frac is None:
|
||||
ms = 0
|
||||
else:
|
||||
# cc in [mm:ss:cc] is also treated as centiseconds, per LRC spec
|
||||
# ^
|
||||
# why does this format even exist, idk
|
||||
n = len(frac)
|
||||
if n == 1:
|
||||
ms = int(frac) * 100
|
||||
elif n == 2:
|
||||
ms = int(frac) * 10
|
||||
else:
|
||||
ms = int(frac)
|
||||
cs = min(round(ms / 10), 99)
|
||||
return f"[{mm}:{ss}.{cs:02d}]"
|
||||
|
||||
|
||||
def _sanitize_lyric_text(text: str) -> str:
|
||||
"""Remove possibly word-sync time tags in lyric
|
||||
|
||||
Assumes the normal line-sync time tags are already stripped.
|
||||
"""
|
||||
return _remove_pattern(text, _WORD_SYNC_TAG_RE)
|
||||
|
||||
|
||||
def _reformat(text: str) -> list[str]:
|
||||
"""Parse each line and reformat to standard [mm:ss.cc]...content form.
|
||||
|
||||
Handles any mix of time tag formats on input. Lines with no time tags
|
||||
are stripped of leading/trailing whitespace and passed through unchanged.
|
||||
"""
|
||||
out: list[str] = []
|
||||
for line in text.splitlines():
|
||||
line = line.strip()
|
||||
pos = 0
|
||||
tags: list[str] = []
|
||||
while True:
|
||||
while pos < len(line) and line[pos].isspace():
|
||||
pos += 1
|
||||
m = _RAW_TAG_RE.match(line, pos)
|
||||
# Non-time tags are passed through as-is, except for leading/trailing whitespace which is stripped.
|
||||
if not m:
|
||||
# No more tags on this line
|
||||
break
|
||||
tags.append(_raw_tag_to_cs(m.group(1), m.group(2), m.group(3)))
|
||||
pos = m.end()
|
||||
if tags:
|
||||
# This could break lyric lines of some kind of word-synced LRC format, e.g.
|
||||
# [00:01.00]Lyric [00:02.00]line
|
||||
# but such format were not planned to be supported in the first place, so…
|
||||
out.append(_sanitize_lyric_text("".join(tags) + line[pos:]))
|
||||
else:
|
||||
out.append(line)
|
||||
# Empty lines with no tags are also preserved
|
||||
|
||||
# Remove empty lines at the start and end of the whole text, but preserve blank lines in the middle
|
||||
while out and not out[0].strip():
|
||||
out.pop(0)
|
||||
while out and not out[-1].strip():
|
||||
out.pop()
|
||||
|
||||
return out
|
||||
|
||||
|
||||
class LRCData:
|
||||
_lines: list[str]
|
||||
|
||||
def __init__(self, text: str | None = None) -> None:
|
||||
if not text:
|
||||
self._lines = []
|
||||
return
|
||||
self._lines = _reformat(text)
|
||||
self._apply_offset()
|
||||
|
||||
def __str__(self) -> str:
|
||||
return "\n".join(self._lines)
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return f"LRCData(lines={self._lines!r})"
|
||||
|
||||
def __bool__(self) -> bool:
|
||||
return len(self._lines) > 0
|
||||
|
||||
def __len__(self) -> int:
|
||||
return len(self._lines)
|
||||
|
||||
def _apply_offset(self):
|
||||
"""Parse [offset:±ms] and shift all standard [mm:ss.cc] tags accordingly.
|
||||
|
||||
Per LRC spec, positive offset = lyrics appear sooner (subtract from timestamps).
|
||||
"""
|
||||
m: Optional[re.Match] = None
|
||||
for i, line in enumerate(self._lines):
|
||||
m = _OFFSET_RE.search(line)
|
||||
if m:
|
||||
self._lines.pop(i)
|
||||
break
|
||||
if not m:
|
||||
return
|
||||
offset_ms = int(m.group(1))
|
||||
if offset_ms == 0:
|
||||
return
|
||||
|
||||
def _shift(match: re.Match) -> str:
|
||||
total_ms = max(
|
||||
0,
|
||||
(int(match.group(1)) * 60 + int(match.group(2))) * 1000
|
||||
+ int(match.group(3)) * 10
|
||||
- offset_ms,
|
||||
)
|
||||
new_mm = total_ms // 60000
|
||||
new_ss = (total_ms % 60000) // 1000
|
||||
new_cs = min(round((total_ms % 1000) / 10), 99)
|
||||
return f"[{new_mm:02d}:{new_ss:02d}.{new_cs:02d}]"
|
||||
|
||||
self._lines = [_STD_TAG_CAPTURE_RE.sub(_shift, line) for line in self._lines]
|
||||
|
||||
def is_synced(self) -> bool:
|
||||
"""Check whether text contains non-zero LRC time tags.
|
||||
|
||||
Assumes text has been normalized by normalize (standard [mm:ss.cc] format).
|
||||
"""
|
||||
for line in self._lines:
|
||||
for m in _STD_TAG_CAPTURE_RE.finditer(line):
|
||||
if m.group(1) != "00" or m.group(2) != "00" or m.group(3) != "00":
|
||||
return True
|
||||
return False
|
||||
|
||||
def detect_sync_status(self) -> CacheStatus:
|
||||
"""Determine whether lyrics contain meaningful LRC time tags.
|
||||
|
||||
Assumes text has been normalized by normalize.
|
||||
"""
|
||||
return (
|
||||
CacheStatus.SUCCESS_SYNCED
|
||||
if self.is_synced()
|
||||
else CacheStatus.SUCCESS_UNSYNCED
|
||||
)
|
||||
|
||||
def normalize_unsynced(self):
|
||||
"""Normalize unsynced lyrics so every line has a [00:00.00] tag.
|
||||
|
||||
Assumes lyrics have been normalized by normalize.
|
||||
- Lines that already have time tags: replace with [00:00.00]
|
||||
- Lines without leading tags: prepend [00:00.00]
|
||||
- Blank lines in middle are converted to [00:00.00]
|
||||
"""
|
||||
out: list[str] = []
|
||||
first = True
|
||||
for i, line in enumerate(self._lines):
|
||||
stripped = line.strip()
|
||||
if not stripped and not first:
|
||||
out.append("[00:00.00]")
|
||||
continue
|
||||
elif not stripped:
|
||||
# Skip leading blank lines
|
||||
continue
|
||||
first = False
|
||||
cleaned = _remove_pattern(line, _LINE_START_STD_TAGS_RE)
|
||||
out.append(f"[00:00.00]{cleaned}")
|
||||
ret = LRCData()
|
||||
ret._lines = out
|
||||
return ret
|
||||
|
||||
def to_plain(
|
||||
self,
|
||||
deduplicate: bool = False,
|
||||
) -> str:
|
||||
"""Convert lyrics to plain text with all tags stripped.
|
||||
|
||||
If deduplicate is True, only keep the first line of consecutive lines with the same lyric text (after stripping tags).
|
||||
Otherwise, lines with multiple time tags will be duplicated as many times as the number of tags.
|
||||
Assumes text has been normalized by normalize.
|
||||
"""
|
||||
|
||||
if not self.is_synced():
|
||||
return "\n".join(
|
||||
_remove_pattern(line, _LINE_START_TAGS_RE) for line in self._lines
|
||||
).strip("\n")
|
||||
|
||||
tagged_lines = []
|
||||
for line in self._lines:
|
||||
pos = 0
|
||||
tag_ms = []
|
||||
while True:
|
||||
# Only match strictly repeated standard time tags at the start of the line
|
||||
# Lines without any time tags are ignored.
|
||||
# Lyric lines are considered already stripped of whitespaces, so no strips here.
|
||||
m = _STD_TAG_CAPTURE_RE.match(line, pos)
|
||||
if not m:
|
||||
lyric = line[pos:]
|
||||
for tag in tag_ms:
|
||||
tagged_lines.append((tag, lyric))
|
||||
break
|
||||
tag_ms.append(_raw_tag_to_ms(m.group(1), m.group(2), m.group(3)))
|
||||
pos = m.end()
|
||||
|
||||
sorted_lines = [lyric for _, lyric in sorted(tagged_lines, key=lambda x: x[0])]
|
||||
|
||||
if deduplicate:
|
||||
# Remove consecutive duplicates
|
||||
deduped_lines = []
|
||||
prev_line = None
|
||||
for line in sorted_lines:
|
||||
if line != prev_line:
|
||||
deduped_lines.append(line)
|
||||
prev_line = line
|
||||
sorted_lines = deduped_lines
|
||||
|
||||
return "\n".join(sorted_lines).strip()
|
||||
|
||||
def to_lrc(
|
||||
self,
|
||||
plain: bool = False,
|
||||
) -> str:
|
||||
"""Return lyrics, optionally stripping tags.
|
||||
|
||||
Assumes text has been normalized by normalize.
|
||||
"""
|
||||
if plain:
|
||||
return self.to_plain()
|
||||
return "\n".join(self._lines)
|
||||
|
||||
|
||||
def get_audio_path(audio_url: str, ensure_exists: bool = False) -> Optional[Path]:
|
||||
"""Convert file:// URL to Path, return None if invalid or (if ensure_exists) file doesn't exist."""
|
||||
if not audio_url.startswith("file://"):
|
||||
return None
|
||||
file_path = unquote(audio_url.replace("file://", "", 1))
|
||||
path = Path(file_path)
|
||||
if ensure_exists and not path.exists():
|
||||
return None
|
||||
return path
|
||||
|
||||
|
||||
def get_sidecar_path(
|
||||
audio_url: str,
|
||||
ensure_audio_exists: bool = False,
|
||||
ensure_exists: bool = False,
|
||||
extension: str = ".lrc",
|
||||
) -> Optional[Path]:
|
||||
"""Given a file:// URL, return the corresponding .lrc sidecar path.
|
||||
|
||||
If ensure_audio_exists is True, return None if the audio file does not exist.
|
||||
If ensure_exists is True, return None if the .lrc file does not exist.
|
||||
"""
|
||||
audio_path = get_audio_path(audio_url, ensure_exists=ensure_audio_exists)
|
||||
if not audio_path:
|
||||
return None
|
||||
lrc_path = audio_path.with_suffix(extension)
|
||||
if ensure_exists and not lrc_path.exists():
|
||||
return None
|
||||
return lrc_path
|
||||
Reference in New Issue
Block a user