feat: better LRC format handling

This commit is contained in:
2026-03-31 15:35:22 +02:00
parent f30a51204f
commit 8e0f3c7af5
10 changed files with 133 additions and 105 deletions
+3 -32
View File
@@ -18,32 +18,12 @@ from loguru import logger
from .fetchers import FetcherMethodType, create_fetchers from .fetchers import FetcherMethodType, create_fetchers
from .fetchers.base import BaseFetcher from .fetchers.base import BaseFetcher
from .cache import CacheEngine from .cache import CacheEngine
from .lrc import LRC_LINE_RE, normalize_tags from .lrc import normalize_unsynced
from .config import TTL_SYNCED, TTL_UNSYNCED, TTL_NOT_FOUND, TTL_NETWORK_ERROR from .config import TTL_SYNCED, TTL_UNSYNCED, TTL_NOT_FOUND, TTL_NETWORK_ERROR
from .models import TrackMeta, LyricResult, CacheStatus from .models import TrackMeta, LyricResult, CacheStatus
from .enrichers import enrich_track from .enrichers import enrich_track
def _normalize_unsynced(lyrics: str) -> str:
"""Normalize unsynced lyrics so every line has a [00:00.00] tag.
- Lines that already have time tags: replace with [00:00.00]
- Lines without time tags: prepend [00:00.00]
- Blank lines are kept as-is
"""
out: list[str] = []
for line in lyrics.splitlines():
stripped = line.strip()
if not stripped:
out.append("")
continue
cleaned = LRC_LINE_RE.sub("", stripped)
while LRC_LINE_RE.match(cleaned):
cleaned = LRC_LINE_RE.sub("", cleaned)
out.append(f"[00:00.00]{cleaned}")
return "\n".join(out)
# Maps CacheStatus to the default TTL used when storing results # Maps CacheStatus to the default TTL used when storing results
_STATUS_TTL: dict[CacheStatus, Optional[int]] = { _STATUS_TTL: dict[CacheStatus, Optional[int]] = {
CacheStatus.SUCCESS_SYNCED: TTL_SYNCED, CacheStatus.SUCCESS_SYNCED: TTL_SYNCED,
@@ -149,16 +129,7 @@ class LrcManager:
logger.debug(f"[{source}] returned None (no result)") logger.debug(f"[{source}] returned None (no result)")
continue continue
# Normalize non-standard time tags [mm:ss:cc] → [mm:ss.cc] # Cache the result (skip for self-cached fetchers)
if result.lyrics:
result = LyricResult(
status=result.status,
lyrics=normalize_tags(result.lyrics),
source=result.source,
ttl=result.ttl,
)
# Cache the normalized result (skip for self-cached fetchers)
if not fetcher.self_cached: if not fetcher.self_cached:
ttl = result.ttl or _STATUS_TTL.get(result.status, TTL_NOT_FOUND) ttl = result.ttl or _STATUS_TTL.get(result.status, TTL_NOT_FOUND)
self.cache.set(track, source, result, ttl_seconds=ttl) self.cache.set(track, source, result, ttl_seconds=ttl)
@@ -184,7 +155,7 @@ class LrcManager:
): ):
best_result = LyricResult( best_result = LyricResult(
status=best_result.status, status=best_result.status,
lyrics=_normalize_unsynced(best_result.lyrics), lyrics=normalize_unsynced(best_result.lyrics),
source=best_result.source, source=best_result.source,
ttl=best_result.ttl, ttl=best_result.ttl,
) )
+4 -2
View File
@@ -17,7 +17,7 @@ from mutagen.flac import FLAC
from .base import BaseFetcher from .base import BaseFetcher
from ..models import TrackMeta, LyricResult from ..models import TrackMeta, LyricResult
from ..lrc import detect_sync_status, get_audio_path, get_sidecar_path from ..lrc import detect_sync_status, normalize_tags, get_audio_path, get_sidecar_path
class LocalFetcher(BaseFetcher): class LocalFetcher(BaseFetcher):
@@ -45,6 +45,7 @@ class LocalFetcher(BaseFetcher):
with open(lrc_path, "r", encoding="utf-8") as f: with open(lrc_path, "r", encoding="utf-8") as f:
content = f.read().strip() content = f.read().strip()
if content: if content:
content = normalize_tags(content)
status = detect_sync_status(content) status = detect_sync_status(content)
logger.info(f"Local: found .lrc sidecar ({status.value})") logger.info(f"Local: found .lrc sidecar ({status.value})")
return LyricResult( return LyricResult(
@@ -77,11 +78,12 @@ class LocalFetcher(BaseFetcher):
break break
if lyrics: if lyrics:
lyrics = normalize_tags(lyrics.strip())
status = detect_sync_status(lyrics) status = detect_sync_status(lyrics)
logger.info(f"Local: found embedded lyrics ({status.value})") logger.info(f"Local: found embedded lyrics ({status.value})")
return LyricResult( return LyricResult(
status=status, status=status,
lyrics=lyrics.strip(), lyrics=lyrics,
source=f"{self.source_name} (embedded)", source=f"{self.source_name} (embedded)",
) )
else: else:
+7 -4
View File
@@ -15,6 +15,7 @@ from urllib.parse import urlencode
from .base import BaseFetcher from .base import BaseFetcher
from ..models import TrackMeta, LyricResult, CacheStatus from ..models import TrackMeta, LyricResult, CacheStatus
from ..lrc import normalize_tags
from ..config import ( from ..config import (
HTTP_TIMEOUT, HTTP_TIMEOUT,
TTL_UNSYNCED, TTL_UNSYNCED,
@@ -75,21 +76,23 @@ class LrclibFetcher(BaseFetcher):
unsynced = data.get("plainLyrics") unsynced = data.get("plainLyrics")
if isinstance(synced, str) and synced.strip(): if isinstance(synced, str) and synced.strip():
lyrics = normalize_tags(synced.strip())
logger.info( logger.info(
f"LRCLIB: got synced lyrics ({len(synced.splitlines())} lines)" f"LRCLIB: got synced lyrics ({len(lyrics.splitlines())} lines)"
) )
return LyricResult( return LyricResult(
status=CacheStatus.SUCCESS_SYNCED, status=CacheStatus.SUCCESS_SYNCED,
lyrics=synced.strip(), lyrics=lyrics,
source=self.source_name, source=self.source_name,
) )
elif isinstance(unsynced, str) and unsynced.strip(): elif isinstance(unsynced, str) and unsynced.strip():
lyrics = normalize_tags(unsynced.strip())
logger.info( logger.info(
f"LRCLIB: got unsynced lyrics ({len(unsynced.splitlines())} lines)" f"LRCLIB: got unsynced lyrics ({len(lyrics.splitlines())} lines)"
) )
return LyricResult( return LyricResult(
status=CacheStatus.SUCCESS_UNSYNCED, status=CacheStatus.SUCCESS_UNSYNCED,
lyrics=unsynced.strip(), lyrics=lyrics,
source=self.source_name, source=self.source_name,
ttl=TTL_UNSYNCED, ttl=TTL_UNSYNCED,
) )
+7 -4
View File
@@ -16,6 +16,7 @@ from urllib.parse import urlencode
from .base import BaseFetcher from .base import BaseFetcher
from ..models import TrackMeta, LyricResult, CacheStatus from ..models import TrackMeta, LyricResult, CacheStatus
from ..lrc import normalize_tags
from ..config import ( from ..config import (
HTTP_TIMEOUT, HTTP_TIMEOUT,
TTL_UNSYNCED, TTL_UNSYNCED,
@@ -78,21 +79,23 @@ class LrclibSearchFetcher(BaseFetcher):
unsynced = best.get("plainLyrics") unsynced = best.get("plainLyrics")
if isinstance(synced, str) and synced.strip(): if isinstance(synced, str) and synced.strip():
lyrics = normalize_tags(synced.strip())
logger.info( logger.info(
f"LRCLIB-search: got synced lyrics ({len(synced.splitlines())} lines)" f"LRCLIB-search: got synced lyrics ({len(lyrics.splitlines())} lines)"
) )
return LyricResult( return LyricResult(
status=CacheStatus.SUCCESS_SYNCED, status=CacheStatus.SUCCESS_SYNCED,
lyrics=synced.strip(), lyrics=lyrics,
source=self.source_name, source=self.source_name,
) )
elif isinstance(unsynced, str) and unsynced.strip(): elif isinstance(unsynced, str) and unsynced.strip():
lyrics = normalize_tags(unsynced.strip())
logger.info( logger.info(
f"LRCLIB-search: got unsynced lyrics ({len(unsynced.splitlines())} lines)" f"LRCLIB-search: got unsynced lyrics ({len(lyrics.splitlines())} lines)"
) )
return LyricResult( return LyricResult(
status=CacheStatus.SUCCESS_UNSYNCED, status=CacheStatus.SUCCESS_UNSYNCED,
lyrics=unsynced.strip(), lyrics=lyrics,
source=self.source_name, source=self.source_name,
ttl=TTL_UNSYNCED, ttl=TTL_UNSYNCED,
) )
+3 -5
View File
@@ -18,7 +18,7 @@ from loguru import logger
from .base import BaseFetcher from .base import BaseFetcher
from ..models import TrackMeta, LyricResult, CacheStatus from ..models import TrackMeta, LyricResult, CacheStatus
from ..lrc import is_synced from ..lrc import detect_sync_status, normalize_tags
from ..config import ( from ..config import (
HTTP_TIMEOUT, HTTP_TIMEOUT,
TTL_NOT_FOUND, TTL_NOT_FOUND,
@@ -178,10 +178,8 @@ class NeteaseFetcher(BaseFetcher):
return LyricResult(status=CacheStatus.NOT_FOUND, ttl=TTL_NOT_FOUND) return LyricResult(status=CacheStatus.NOT_FOUND, ttl=TTL_NOT_FOUND)
# Determine sync status # Determine sync status
synced = is_synced(lrc) lrc = normalize_tags(lrc)
status = ( status = detect_sync_status(lrc)
CacheStatus.SUCCESS_SYNCED if synced else CacheStatus.SUCCESS_UNSYNCED
)
logger.info( logger.info(
f"Netease: got {status.value} lyrics for song_id={song_id} " f"Netease: got {status.value} lyrics for song_id={song_id} "
f"({len(lrc.splitlines())} lines)" f"({len(lrc.splitlines())} lines)"
+3 -5
View File
@@ -17,7 +17,7 @@ from loguru import logger
from .base import BaseFetcher from .base import BaseFetcher
from ..models import TrackMeta, LyricResult, CacheStatus from ..models import TrackMeta, LyricResult, CacheStatus
from ..lrc import is_synced from ..lrc import detect_sync_status, normalize_tags
from ..config import ( from ..config import (
HTTP_TIMEOUT, HTTP_TIMEOUT,
TTL_NOT_FOUND, TTL_NOT_FOUND,
@@ -139,10 +139,8 @@ class QQMusicFetcher(BaseFetcher):
logger.debug(f"QQMusic: empty lyrics for mid={mid}") logger.debug(f"QQMusic: empty lyrics for mid={mid}")
return LyricResult(status=CacheStatus.NOT_FOUND, ttl=TTL_NOT_FOUND) return LyricResult(status=CacheStatus.NOT_FOUND, ttl=TTL_NOT_FOUND)
synced = is_synced(lrc) lrc = normalize_tags(lrc)
status = ( status = detect_sync_status(lrc)
CacheStatus.SUCCESS_SYNCED if synced else CacheStatus.SUCCESS_UNSYNCED
)
logger.info( logger.info(
f"QQMusic: got {status.value} lyrics for mid={mid} " f"QQMusic: got {status.value} lyrics for mid={mid} "
f"({len(lrc.splitlines())} lines)" f"({len(lrc.splitlines())} lines)"
+2 -1
View File
@@ -28,6 +28,7 @@ from loguru import logger
from .base import BaseFetcher from .base import BaseFetcher
from ..models import TrackMeta, LyricResult, CacheStatus from ..models import TrackMeta, LyricResult, CacheStatus
from ..lrc import normalize_tags
from ..config import ( from ..config import (
HTTP_TIMEOUT, HTTP_TIMEOUT,
SPOTIFY_APP_VERSION, SPOTIFY_APP_VERSION,
@@ -354,7 +355,7 @@ class SpotifyFetcher(BaseFetcher):
# Unsynced: emit with zero timestamps # Unsynced: emit with zero timestamps
lrc_lines.append(f"[00:00.00]{words}") lrc_lines.append(f"[00:00.00]{words}")
content = "\n".join(lrc_lines) content = normalize_tags("\n".join(lrc_lines))
status = ( status = (
CacheStatus.SUCCESS_SYNCED CacheStatus.SUCCESS_SYNCED
if is_synced if is_synced
+102 -50
View File
@@ -1,7 +1,7 @@
""" """
Author: Uyanide pywang0608@foxmail.com Author: Uyanide pywang0608@foxmail.com
Date: 2026-03-25 21:54:01 Date: 2026-03-25 21:54:01
Description: Shared LRC time-tag utilities Description: Shared LRC time-tag utilities (definitely overengineered)
""" """
import re import re
@@ -11,93 +11,145 @@ from urllib.parse import unquote
from .models import CacheStatus from .models import CacheStatus
# Standard format: [mm:ss.cc] or [mm:ss.ccc] # Parses any time tag input format:
_STANDARD_TAG_RE = re.compile(r"\[\d{2}:\d{2}\.\d{2,3}\]") # [mm:ss], [mm:ss.c], [mm:ss.cc], [mm:ss.ccc], [mm:ss:cc], …
_RAW_TAG_RE = re.compile(r"\[(\d{2,}):(\d{2})(?:[.:](\d{1,3}))?\]")
# Non-standard format: [mm:ss:cc] (two colons instead of dot) # Standard format after normalization: [mm:ss.cc]
_COLON_TAG_RE = re.compile(r"\[(\d{2}:\d{2}):(\d{2,3})\]") _STD_TAG_RE = re.compile(r"\[\d{2,}:\d{2}\.\d{2}\]")
# Matches any LRC time tag (standard or non-standard) at start of line # Standard format with capture groups
LRC_LINE_RE = re.compile(r"^\[(\d{2}:\d{2}[.:]\d{2,3})\]", re.MULTILINE) _STD_TAG_CAPTURE_RE = re.compile(r"\[(\d{2,}):(\d{2})\.(\d{2})\]")
# All-zero tags # Matches a standard time tag at the start of a line
_ZERO_TAG_RE = re.compile(r"^\[00:00[.:]0{2,3}\]$") _LRC_LINE_RE = re.compile(r"^\[\d{2,}:\d{2}\.\d{2}\]", re.MULTILINE)
# [offset:+/-xxx] tag — value in milliseconds # [offset:+/-xxx] tag — value in milliseconds
_OFFSET_RE = re.compile(r"^\[offset:\s*([+-]?\d+)\]\s*$", re.MULTILINE | re.IGNORECASE) _OFFSET_RE = re.compile(r"^\[offset:\s*([+-]?\d+)\]\s*$", re.MULTILINE | re.IGNORECASE)
# Time tag for offset application: captures mm, ss, cc/ccc
_TIME_TAG_RE = re.compile(r"\[(\d{2}):(\d{2})\.(\d{2,3})\]") def _raw_tag_to_cs(mm: str, ss: str, frac: Optional[str]) -> str:
"""Convert parsed time tag components to standard [mm:ss.cc] string."""
if frac is None:
ms = 0
else:
# cc in [mm:ss:cc] is also treated as centiseconds, per LRC spec
# ^
# why does this format even exist, idk
n = len(frac)
if n == 1:
ms = int(frac) * 100
elif n == 2:
ms = int(frac) * 10
else:
ms = int(frac)
cs = min(round(ms / 10), 99)
return f"[{mm}:{ss}.{cs:02d}]"
def _reformat(text: str) -> str:
"""Parse each line and reformat to standard [mm:ss.cc]...content form.
Handles any mix of time tag formats on input. Lines with no time tags
are stripped of leading/trailing whitespace and passed through unchanged.
"""
out: list[str] = []
for line in text.splitlines():
line = line.strip()
pos = 0
tags: list[str] = []
while True:
while pos < len(line) and line[pos] == " ":
pos += 1
m = _RAW_TAG_RE.match(line, pos)
# Non-time tags are passed through as-is, except for leading/trailing whitespace which is stripped.
if not m:
# No more tags on this line
break
tags.append(_raw_tag_to_cs(m.group(1), m.group(2), m.group(3)))
pos = m.end()
if tags:
# This could break lyric lines of some kind of word-synced LRC format,
# but such formats were not planned to be supported in the first place, so…
out.append("".join(tags) + line[pos:].lstrip())
else:
out.append(line)
# Empty lines with no tags are also preserved
return "\n".join(out)
def _apply_offset(text: str) -> str: def _apply_offset(text: str) -> str:
"""Parse [offset:±ms] tag and shift all time tags accordingly. """Parse [offset:±ms] and shift all standard [mm:ss.cc] tags accordingly.
Per LRC spec, a positive offset means lyrics appear sooner (subtract Per LRC spec, positive offset = lyrics appear sooner (subtract from timestamps).
from timestamps), negative means later (add to timestamps).
""" """
m = _OFFSET_RE.search(text) m = _OFFSET_RE.search(text)
if not m: if not m:
return text return text
offset_ms = int(m.group(1)) offset_ms = int(m.group(1))
text = _OFFSET_RE.sub("", text).strip("\n")
if offset_ms == 0: if offset_ms == 0:
return _OFFSET_RE.sub("", text).strip("\n") return text
# Remove the offset tag line
text = _OFFSET_RE.sub("", text)
def _shift(match: re.Match) -> str: def _shift(match: re.Match) -> str:
mm, ss, cs = int(match.group(1)), int(match.group(2)), match.group(3) total_ms = max(
# Normalize centiseconds to milliseconds 0,
if len(cs) == 2: (int(match.group(1)) * 60 + int(match.group(2))) * 1000
ms = int(cs) * 10 + int(match.group(3)) * 10
fmt_cs = 2 - offset_ms,
else: )
ms = int(cs)
fmt_cs = 3
total_ms = (mm * 60 + ss) * 1000 + ms - offset_ms
total_ms = max(0, total_ms)
new_mm = total_ms // 60000 new_mm = total_ms // 60000
new_ss = (total_ms % 60000) // 1000 new_ss = (total_ms % 60000) // 1000
new_cs = total_ms % 1000 new_cs = min(round((total_ms % 1000) / 10), 99)
if fmt_cs == 2: return f"[{new_mm:02d}:{new_ss:02d}.{new_cs:02d}]"
new_cs = new_cs // 10
return f"[{new_mm:02d}:{new_ss:02d}.{new_cs:02d}]"
return f"[{new_mm:02d}:{new_ss:02d}.{new_cs:03d}]"
return _TIME_TAG_RE.sub(_shift, text) return _STD_TAG_CAPTURE_RE.sub(_shift, text)
def normalize_tags(text: str) -> str: def normalize_tags(text: str) -> str:
"""Normalize LRC time tags: colon format → dot format, then apply offset.""" """Normalize LRC to standard form: reformat all tags to [mm:ss.cc], then apply offset."""
text = _COLON_TAG_RE.sub(r"[\1.\2]", text) return _apply_offset(_reformat(text))
return _apply_offset(text)
def is_synced(text: str) -> bool: def is_synced(text: str) -> bool:
"""Check whether text contains actual LRC time tags with non-zero times. """Check whether text contains non-zero LRC time tags.
Returns False if no tags exist or all tags are [00:00.00]. Assumes text has been normalized by normalize_tags (standard [mm:ss.cc] format).
Handles both [mm:ss.cc] and [mm:ss:cc] formats.
""" """
tags = _STANDARD_TAG_RE.findall(text) tags = _STD_TAG_RE.findall(text)
# Also check non-standard format return bool(tags) and any(tag != "[00:00.00]" for tag in tags)
tags += [f"[{m.group(1)}.{m.group(2)}]" for m in _COLON_TAG_RE.finditer(text)]
if not tags:
return False
for tag in tags:
if not _ZERO_TAG_RE.match(tag):
return True
return False
def detect_sync_status(text: str) -> CacheStatus: def detect_sync_status(text: str) -> CacheStatus:
"""Determine whether lyrics contain meaningful LRC time tags.""" """Determine whether lyrics contain meaningful LRC time tags.
Assumes text has been normalized by normalize_tags.
"""
return ( return (
CacheStatus.SUCCESS_SYNCED if is_synced(text) else CacheStatus.SUCCESS_UNSYNCED CacheStatus.SUCCESS_SYNCED if is_synced(text) else CacheStatus.SUCCESS_UNSYNCED
) )
def normalize_unsynced(lyrics: str) -> str:
"""Normalize unsynced lyrics so every line has a [00:00.00] tag.
- Lines that already have time tags: replace with [00:00.00]
- Lines without time tags: prepend [00:00.00]
- Blank lines are converted to [00:00.00]
"""
out: list[str] = []
for line in lyrics.splitlines():
stripped = line.strip()
if not stripped:
out.append("[00:00.00]")
continue
cleaned = _LRC_LINE_RE.sub("", stripped)
while _LRC_LINE_RE.match(cleaned):
cleaned = _LRC_LINE_RE.sub("", cleaned)
out.append(f"[00:00.00]{cleaned}")
return "\n".join(out)
def get_audio_path(audio_url: str, ensure_exists: bool = False) -> Optional[Path]: def get_audio_path(audio_url: str, ensure_exists: bool = False) -> Optional[Path]:
"""Convert file:// URL to Path, return None if invalid or (if ensure_exists) file doesn't exist.""" """Convert file:// URL to Path, return None if invalid or (if ensure_exists) file doesn't exist."""
if not audio_url.startswith("file://"): if not audio_url.startswith("file://"):
+1 -1
View File
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
[project] [project]
name = "lrcfetch" name = "lrcfetch"
version = "0.1.5" version = "0.1.6"
description = "Fetch line-synced lyrics for your music player." description = "Fetch line-synced lyrics for your music player."
readme = "README.md" readme = "README.md"
requires-python = ">=3.13" requires-python = ">=3.13"
Generated
+1 -1
View File
@@ -153,7 +153,7 @@ wheels = [
[[package]] [[package]]
name = "lrcfetch" name = "lrcfetch"
version = "0.1.5" version = "0.1.6"
source = { editable = "." } source = { editable = "." }
dependencies = [ dependencies = [
{ name = "cyclopts" }, { name = "cyclopts" },