From 03acda4478181690399e3291b7227b8dfd348105 Mon Sep 17 00:00:00 2001 From: Uyanide Date: Tue, 31 Mar 2026 15:35:22 +0200 Subject: [PATCH] feat: better LRC format handling --- lrcfetch/core.py | 35 +------ lrcfetch/fetchers/local.py | 6 +- lrcfetch/fetchers/lrclib.py | 11 ++- lrcfetch/fetchers/lrclib_search.py | 11 ++- lrcfetch/fetchers/netease.py | 8 +- lrcfetch/fetchers/qqmusic.py | 8 +- lrcfetch/fetchers/spotify.py | 3 +- lrcfetch/lrc.py | 152 +++++++++++++++++++---------- pyproject.toml | 2 +- uv.lock | 2 +- 10 files changed, 133 insertions(+), 105 deletions(-) diff --git a/lrcfetch/core.py b/lrcfetch/core.py index de92f9a..e51bd87 100644 --- a/lrcfetch/core.py +++ b/lrcfetch/core.py @@ -18,32 +18,12 @@ from loguru import logger from .fetchers import FetcherMethodType, create_fetchers from .fetchers.base import BaseFetcher from .cache import CacheEngine -from .lrc import LRC_LINE_RE, normalize_tags +from .lrc import normalize_unsynced from .config import TTL_SYNCED, TTL_UNSYNCED, TTL_NOT_FOUND, TTL_NETWORK_ERROR from .models import TrackMeta, LyricResult, CacheStatus from .enrichers import enrich_track -def _normalize_unsynced(lyrics: str) -> str: - """Normalize unsynced lyrics so every line has a [00:00.00] tag. - - - Lines that already have time tags: replace with [00:00.00] - - Lines without time tags: prepend [00:00.00] - - Blank lines are kept as-is - """ - out: list[str] = [] - for line in lyrics.splitlines(): - stripped = line.strip() - if not stripped: - out.append("") - continue - cleaned = LRC_LINE_RE.sub("", stripped) - while LRC_LINE_RE.match(cleaned): - cleaned = LRC_LINE_RE.sub("", cleaned) - out.append(f"[00:00.00]{cleaned}") - return "\n".join(out) - - # Maps CacheStatus to the default TTL used when storing results _STATUS_TTL: dict[CacheStatus, Optional[int]] = { CacheStatus.SUCCESS_SYNCED: TTL_SYNCED, @@ -149,16 +129,7 @@ class LrcManager: logger.debug(f"[{source}] returned None (no result)") continue - # Normalize non-standard time tags [mm:ss:cc] → [mm:ss.cc] - if result.lyrics: - result = LyricResult( - status=result.status, - lyrics=normalize_tags(result.lyrics), - source=result.source, - ttl=result.ttl, - ) - - # Cache the normalized result (skip for self-cached fetchers) + # Cache the result (skip for self-cached fetchers) if not fetcher.self_cached: ttl = result.ttl or _STATUS_TTL.get(result.status, TTL_NOT_FOUND) self.cache.set(track, source, result, ttl_seconds=ttl) @@ -184,7 +155,7 @@ class LrcManager: ): best_result = LyricResult( status=best_result.status, - lyrics=_normalize_unsynced(best_result.lyrics), + lyrics=normalize_unsynced(best_result.lyrics), source=best_result.source, ttl=best_result.ttl, ) diff --git a/lrcfetch/fetchers/local.py b/lrcfetch/fetchers/local.py index e1e46e7..8e2267e 100644 --- a/lrcfetch/fetchers/local.py +++ b/lrcfetch/fetchers/local.py @@ -17,7 +17,7 @@ from mutagen.flac import FLAC from .base import BaseFetcher from ..models import TrackMeta, LyricResult -from ..lrc import detect_sync_status, get_audio_path, get_sidecar_path +from ..lrc import detect_sync_status, normalize_tags, get_audio_path, get_sidecar_path class LocalFetcher(BaseFetcher): @@ -45,6 +45,7 @@ class LocalFetcher(BaseFetcher): with open(lrc_path, "r", encoding="utf-8") as f: content = f.read().strip() if content: + content = normalize_tags(content) status = detect_sync_status(content) logger.info(f"Local: found .lrc sidecar ({status.value})") return LyricResult( @@ -77,11 +78,12 @@ class LocalFetcher(BaseFetcher): break if lyrics: + lyrics = normalize_tags(lyrics.strip()) status = detect_sync_status(lyrics) logger.info(f"Local: found embedded lyrics ({status.value})") return LyricResult( status=status, - lyrics=lyrics.strip(), + lyrics=lyrics, source=f"{self.source_name} (embedded)", ) else: diff --git a/lrcfetch/fetchers/lrclib.py b/lrcfetch/fetchers/lrclib.py index e5e4c3f..71d08d1 100644 --- a/lrcfetch/fetchers/lrclib.py +++ b/lrcfetch/fetchers/lrclib.py @@ -15,6 +15,7 @@ from urllib.parse import urlencode from .base import BaseFetcher from ..models import TrackMeta, LyricResult, CacheStatus +from ..lrc import normalize_tags from ..config import ( HTTP_TIMEOUT, TTL_UNSYNCED, @@ -75,21 +76,23 @@ class LrclibFetcher(BaseFetcher): unsynced = data.get("plainLyrics") if isinstance(synced, str) and synced.strip(): + lyrics = normalize_tags(synced.strip()) logger.info( - f"LRCLIB: got synced lyrics ({len(synced.splitlines())} lines)" + f"LRCLIB: got synced lyrics ({len(lyrics.splitlines())} lines)" ) return LyricResult( status=CacheStatus.SUCCESS_SYNCED, - lyrics=synced.strip(), + lyrics=lyrics, source=self.source_name, ) elif isinstance(unsynced, str) and unsynced.strip(): + lyrics = normalize_tags(unsynced.strip()) logger.info( - f"LRCLIB: got unsynced lyrics ({len(unsynced.splitlines())} lines)" + f"LRCLIB: got unsynced lyrics ({len(lyrics.splitlines())} lines)" ) return LyricResult( status=CacheStatus.SUCCESS_UNSYNCED, - lyrics=unsynced.strip(), + lyrics=lyrics, source=self.source_name, ttl=TTL_UNSYNCED, ) diff --git a/lrcfetch/fetchers/lrclib_search.py b/lrcfetch/fetchers/lrclib_search.py index 83e2c30..3fa9357 100644 --- a/lrcfetch/fetchers/lrclib_search.py +++ b/lrcfetch/fetchers/lrclib_search.py @@ -16,6 +16,7 @@ from urllib.parse import urlencode from .base import BaseFetcher from ..models import TrackMeta, LyricResult, CacheStatus +from ..lrc import normalize_tags from ..config import ( HTTP_TIMEOUT, TTL_UNSYNCED, @@ -78,21 +79,23 @@ class LrclibSearchFetcher(BaseFetcher): unsynced = best.get("plainLyrics") if isinstance(synced, str) and synced.strip(): + lyrics = normalize_tags(synced.strip()) logger.info( - f"LRCLIB-search: got synced lyrics ({len(synced.splitlines())} lines)" + f"LRCLIB-search: got synced lyrics ({len(lyrics.splitlines())} lines)" ) return LyricResult( status=CacheStatus.SUCCESS_SYNCED, - lyrics=synced.strip(), + lyrics=lyrics, source=self.source_name, ) elif isinstance(unsynced, str) and unsynced.strip(): + lyrics = normalize_tags(unsynced.strip()) logger.info( - f"LRCLIB-search: got unsynced lyrics ({len(unsynced.splitlines())} lines)" + f"LRCLIB-search: got unsynced lyrics ({len(lyrics.splitlines())} lines)" ) return LyricResult( status=CacheStatus.SUCCESS_UNSYNCED, - lyrics=unsynced.strip(), + lyrics=lyrics, source=self.source_name, ttl=TTL_UNSYNCED, ) diff --git a/lrcfetch/fetchers/netease.py b/lrcfetch/fetchers/netease.py index 9d1ebba..cee4ab9 100644 --- a/lrcfetch/fetchers/netease.py +++ b/lrcfetch/fetchers/netease.py @@ -18,7 +18,7 @@ from loguru import logger from .base import BaseFetcher from ..models import TrackMeta, LyricResult, CacheStatus -from ..lrc import is_synced +from ..lrc import detect_sync_status, normalize_tags from ..config import ( HTTP_TIMEOUT, TTL_NOT_FOUND, @@ -178,10 +178,8 @@ class NeteaseFetcher(BaseFetcher): return LyricResult(status=CacheStatus.NOT_FOUND, ttl=TTL_NOT_FOUND) # Determine sync status - synced = is_synced(lrc) - status = ( - CacheStatus.SUCCESS_SYNCED if synced else CacheStatus.SUCCESS_UNSYNCED - ) + lrc = normalize_tags(lrc) + status = detect_sync_status(lrc) logger.info( f"Netease: got {status.value} lyrics for song_id={song_id} " f"({len(lrc.splitlines())} lines)" diff --git a/lrcfetch/fetchers/qqmusic.py b/lrcfetch/fetchers/qqmusic.py index d64c01e..061fc7b 100644 --- a/lrcfetch/fetchers/qqmusic.py +++ b/lrcfetch/fetchers/qqmusic.py @@ -17,7 +17,7 @@ from loguru import logger from .base import BaseFetcher from ..models import TrackMeta, LyricResult, CacheStatus -from ..lrc import is_synced +from ..lrc import detect_sync_status, normalize_tags from ..config import ( HTTP_TIMEOUT, TTL_NOT_FOUND, @@ -139,10 +139,8 @@ class QQMusicFetcher(BaseFetcher): logger.debug(f"QQMusic: empty lyrics for mid={mid}") return LyricResult(status=CacheStatus.NOT_FOUND, ttl=TTL_NOT_FOUND) - synced = is_synced(lrc) - status = ( - CacheStatus.SUCCESS_SYNCED if synced else CacheStatus.SUCCESS_UNSYNCED - ) + lrc = normalize_tags(lrc) + status = detect_sync_status(lrc) logger.info( f"QQMusic: got {status.value} lyrics for mid={mid} " f"({len(lrc.splitlines())} lines)" diff --git a/lrcfetch/fetchers/spotify.py b/lrcfetch/fetchers/spotify.py index c0e1886..110f86a 100644 --- a/lrcfetch/fetchers/spotify.py +++ b/lrcfetch/fetchers/spotify.py @@ -28,6 +28,7 @@ from loguru import logger from .base import BaseFetcher from ..models import TrackMeta, LyricResult, CacheStatus +from ..lrc import normalize_tags from ..config import ( HTTP_TIMEOUT, SPOTIFY_APP_VERSION, @@ -354,7 +355,7 @@ class SpotifyFetcher(BaseFetcher): # Unsynced: emit with zero timestamps lrc_lines.append(f"[00:00.00]{words}") - content = "\n".join(lrc_lines) + content = normalize_tags("\n".join(lrc_lines)) status = ( CacheStatus.SUCCESS_SYNCED if is_synced diff --git a/lrcfetch/lrc.py b/lrcfetch/lrc.py index 65deeb6..6913512 100644 --- a/lrcfetch/lrc.py +++ b/lrcfetch/lrc.py @@ -1,7 +1,7 @@ """ Author: Uyanide pywang0608@foxmail.com Date: 2026-03-25 21:54:01 -Description: Shared LRC time-tag utilities +Description: Shared LRC time-tag utilities (definitely overengineered) """ import re @@ -11,93 +11,145 @@ from urllib.parse import unquote from .models import CacheStatus -# Standard format: [mm:ss.cc] or [mm:ss.ccc] -_STANDARD_TAG_RE = re.compile(r"\[\d{2}:\d{2}\.\d{2,3}\]") +# Parses any time tag input format: +# [mm:ss], [mm:ss.c], [mm:ss.cc], [mm:ss.ccc], [mm:ss:cc], … +_RAW_TAG_RE = re.compile(r"\[(\d{2,}):(\d{2})(?:[.:](\d{1,3}))?\]") -# Non-standard format: [mm:ss:cc] (two colons instead of dot) -_COLON_TAG_RE = re.compile(r"\[(\d{2}:\d{2}):(\d{2,3})\]") +# Standard format after normalization: [mm:ss.cc] +_STD_TAG_RE = re.compile(r"\[\d{2,}:\d{2}\.\d{2}\]") -# Matches any LRC time tag (standard or non-standard) at start of line -LRC_LINE_RE = re.compile(r"^\[(\d{2}:\d{2}[.:]\d{2,3})\]", re.MULTILINE) +# Standard format with capture groups +_STD_TAG_CAPTURE_RE = re.compile(r"\[(\d{2,}):(\d{2})\.(\d{2})\]") -# All-zero tags -_ZERO_TAG_RE = re.compile(r"^\[00:00[.:]0{2,3}\]$") +# Matches a standard time tag at the start of a line +_LRC_LINE_RE = re.compile(r"^\[\d{2,}:\d{2}\.\d{2}\]", re.MULTILINE) # [offset:+/-xxx] tag — value in milliseconds _OFFSET_RE = re.compile(r"^\[offset:\s*([+-]?\d+)\]\s*$", re.MULTILINE | re.IGNORECASE) -# Time tag for offset application: captures mm, ss, cc/ccc -_TIME_TAG_RE = re.compile(r"\[(\d{2}):(\d{2})\.(\d{2,3})\]") + +def _raw_tag_to_cs(mm: str, ss: str, frac: Optional[str]) -> str: + """Convert parsed time tag components to standard [mm:ss.cc] string.""" + if frac is None: + ms = 0 + else: + # cc in [mm:ss:cc] is also treated as centiseconds, per LRC spec + # ^ + # why does this format even exist, idk + n = len(frac) + if n == 1: + ms = int(frac) * 100 + elif n == 2: + ms = int(frac) * 10 + else: + ms = int(frac) + cs = min(round(ms / 10), 99) + return f"[{mm}:{ss}.{cs:02d}]" + + +def _reformat(text: str) -> str: + """Parse each line and reformat to standard [mm:ss.cc]...content form. + + Handles any mix of time tag formats on input. Lines with no time tags + are stripped of leading/trailing whitespace and passed through unchanged. + """ + out: list[str] = [] + for line in text.splitlines(): + line = line.strip() + pos = 0 + tags: list[str] = [] + while True: + while pos < len(line) and line[pos] == " ": + pos += 1 + m = _RAW_TAG_RE.match(line, pos) + # Non-time tags are passed through as-is, except for leading/trailing whitespace which is stripped. + if not m: + # No more tags on this line + break + tags.append(_raw_tag_to_cs(m.group(1), m.group(2), m.group(3))) + pos = m.end() + if tags: + # This could break lyric lines of some kind of word-synced LRC format, + # but such format were not planned to be supported in the first place, so… + out.append("".join(tags) + line[pos:].lstrip()) + else: + out.append(line) + # Empty lines with no tags are also preserved + return "\n".join(out) def _apply_offset(text: str) -> str: - """Parse [offset:±ms] tag and shift all time tags accordingly. + """Parse [offset:±ms] and shift all standard [mm:ss.cc] tags accordingly. - Per LRC spec, a positive offset means lyrics appear sooner (subtract - from timestamps), negative means later (add to timestamps). + Per LRC spec, positive offset = lyrics appear sooner (subtract from timestamps). """ m = _OFFSET_RE.search(text) if not m: return text offset_ms = int(m.group(1)) + text = _OFFSET_RE.sub("", text).strip("\n") if offset_ms == 0: - return _OFFSET_RE.sub("", text).strip("\n") - - # Remove the offset tag line - text = _OFFSET_RE.sub("", text) + return text def _shift(match: re.Match) -> str: - mm, ss, cs = int(match.group(1)), int(match.group(2)), match.group(3) - # Normalize centiseconds to milliseconds - if len(cs) == 2: - ms = int(cs) * 10 - fmt_cs = 2 - else: - ms = int(cs) - fmt_cs = 3 - total_ms = (mm * 60 + ss) * 1000 + ms - offset_ms - total_ms = max(0, total_ms) + total_ms = max( + 0, + (int(match.group(1)) * 60 + int(match.group(2))) * 1000 + + int(match.group(3)) * 10 + - offset_ms, + ) new_mm = total_ms // 60000 new_ss = (total_ms % 60000) // 1000 - new_cs = total_ms % 1000 - if fmt_cs == 2: - new_cs = new_cs // 10 - return f"[{new_mm:02d}:{new_ss:02d}.{new_cs:02d}]" - return f"[{new_mm:02d}:{new_ss:02d}.{new_cs:03d}]" + new_cs = min(round((total_ms % 1000) / 10), 99) + return f"[{new_mm:02d}:{new_ss:02d}.{new_cs:02d}]" - return _TIME_TAG_RE.sub(_shift, text) + return _STD_TAG_CAPTURE_RE.sub(_shift, text) def normalize_tags(text: str) -> str: - """Normalize LRC time tags: colon format → dot format, then apply offset.""" - text = _COLON_TAG_RE.sub(r"[\1.\2]", text) - return _apply_offset(text) + """Normalize LRC to standard form: reformat all tags to [mm:ss.cc], then apply offset.""" + return _apply_offset(_reformat(text)) def is_synced(text: str) -> bool: - """Check whether text contains actual LRC time tags with non-zero times. + """Check whether text contains non-zero LRC time tags. - Returns False if no tags exist or all tags are [00:00.00]. - Handles both [mm:ss.cc] and [mm:ss:cc] formats. + Assumes text has been normalized by normalize_tags (standard [mm:ss.cc] format). """ - tags = _STANDARD_TAG_RE.findall(text) - # Also check non-standard format - tags += [f"[{m.group(1)}.{m.group(2)}]" for m in _COLON_TAG_RE.finditer(text)] - if not tags: - return False - for tag in tags: - if not _ZERO_TAG_RE.match(tag): - return True - return False + tags = _STD_TAG_RE.findall(text) + return bool(tags) and any(tag != "[00:00.00]" for tag in tags) def detect_sync_status(text: str) -> CacheStatus: - """Determine whether lyrics contain meaningful LRC time tags.""" + """Determine whether lyrics contain meaningful LRC time tags. + + Assumes text has been normalized by normalize_tags. + """ return ( CacheStatus.SUCCESS_SYNCED if is_synced(text) else CacheStatus.SUCCESS_UNSYNCED ) +def normalize_unsynced(lyrics: str) -> str: + """Normalize unsynced lyrics so every line has a [00:00.00] tag. + + - Lines that already have time tags: replace with [00:00.00] + - Lines without time tags: prepend [00:00.00] + - Blank lines are converted to [00:00.00] + """ + out: list[str] = [] + for line in lyrics.splitlines(): + stripped = line.strip() + if not stripped: + out.append("[00:00.00]") + continue + cleaned = _LRC_LINE_RE.sub("", stripped) + while _LRC_LINE_RE.match(cleaned): + cleaned = _LRC_LINE_RE.sub("", cleaned) + out.append(f"[00:00.00]{cleaned}") + return "\n".join(out) + + def get_audio_path(audio_url: str, ensure_exists: bool = False) -> Optional[Path]: """Convert file:// URL to Path, return None if invalid or (if ensure_exists) file doesn't exist.""" if not audio_url.startswith("file://"): diff --git a/pyproject.toml b/pyproject.toml index 001acdb..f969b98 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "lrcfetch" -version = "0.1.5" +version = "0.1.6" description = "Fetch line-synced lyrics for your music player." readme = "README.md" requires-python = ">=3.13" diff --git a/uv.lock b/uv.lock index 7fa21c8..6431238 100644 --- a/uv.lock +++ b/uv.lock @@ -153,7 +153,7 @@ wheels = [ [[package]] name = "lrcfetch" -version = "0.1.5" +version = "0.1.6" source = { editable = "." } dependencies = [ { name = "cyclopts" },