From 075853ad5b006a199af07bd50f0475cc54719936 Mon Sep 17 00:00:00 2001 From: Uyanide Date: Wed, 25 Mar 2026 11:16:03 +0100 Subject: [PATCH] fix: normalize time tags in fetched lrc (why [00:17:06]?) --- lrcfetch/core.py | 25 +++++++++++--------- lrcfetch/fetchers/local.py | 25 +++----------------- lrcfetch/fetchers/netease.py | 26 ++------------------ lrcfetch/lrc.py | 46 ++++++++++++++++++++++++++++++++++++ 4 files changed, 65 insertions(+), 57 deletions(-) create mode 100644 lrcfetch/lrc.py diff --git a/lrcfetch/core.py b/lrcfetch/core.py index a6b7f64..ff6575e 100644 --- a/lrcfetch/core.py +++ b/lrcfetch/core.py @@ -7,11 +7,11 @@ Fetch pipeline: 4. Return the best result (synced > unsynced > None) """ -import re from typing import Optional from loguru import logger from lrcfetch.models import TrackMeta, LyricResult, CacheStatus from lrcfetch.config import TTL_SYNCED, TTL_UNSYNCED, TTL_NOT_FOUND, TTL_NETWORK_ERROR +from lrcfetch.lrc import LRC_LINE_RE, normalize_tags from lrcfetch.cache import CacheEngine from lrcfetch.fetchers.base import BaseFetcher from lrcfetch.fetchers.local import LocalFetcher @@ -20,10 +20,6 @@ from lrcfetch.fetchers.lrclib import LrclibFetcher from lrcfetch.fetchers.lrclib_search import LrclibSearchFetcher from lrcfetch.fetchers.netease import NeteaseFetcher -# Matches any LRC time tag at the start of a line: [mm:ss.cc] or [mm:ss.ccc] -_LRC_LINE_RE = re.compile(r"^\[(\d{2}:\d{2}\.\d{2,3})\]", re.MULTILINE) - - def _normalize_unsynced(lyrics: str) -> str: """Normalize unsynced lyrics so every line has a [00:00.00] tag. @@ -37,11 +33,9 @@ def _normalize_unsynced(lyrics: str) -> str: if not stripped: out.append("") continue - # Strip existing time tag(s) from the beginning - cleaned = _LRC_LINE_RE.sub("", stripped) - # Could have multiple tags like [00:12.34][00:56.78]text - while _LRC_LINE_RE.match(cleaned): - cleaned = _LRC_LINE_RE.sub("", cleaned) + cleaned = LRC_LINE_RE.sub("", stripped) + while LRC_LINE_RE.match(cleaned): + cleaned = LRC_LINE_RE.sub("", cleaned) out.append(f"[00:00.00]{cleaned}") return "\n".join(out) @@ -148,7 +142,16 @@ class LrcManager: logger.debug(f"[{source}] returned None (no result)") continue - # Cache the result + # Normalize non-standard time tags [mm:ss:cc] → [mm:ss.cc] + if result.lyrics: + result = LyricResult( + status=result.status, + lyrics=normalize_tags(result.lyrics), + source=result.source, + ttl=result.ttl, + ) + + # Cache the normalized result ttl = result.ttl or _STATUS_TTL.get(result.status, TTL_NOT_FOUND) self.cache.set(track, source, result, ttl_seconds=ttl) diff --git a/lrcfetch/fetchers/local.py b/lrcfetch/fetchers/local.py index fcef53c..6c87ef9 100644 --- a/lrcfetch/fetchers/local.py +++ b/lrcfetch/fetchers/local.py @@ -5,34 +5,15 @@ Priority: 2. Embedded lyrics in audio metadata (FLAC, MP3 USLT/SYLT tags) """ -import re import os from typing import Optional from loguru import logger from lrcfetch.models import TrackMeta, LyricResult, CacheStatus from lrcfetch.fetchers.base import BaseFetcher +from lrcfetch.lrc import detect_sync_status from mutagen._file import File from mutagen.flac import FLAC -# Matches LRC time tags like [00:12.34] or [01:23.456] -_LRC_TIME_TAG_RE = re.compile(r"\[\d{2}:\d{2}\.\d{2,3}\]") -# Matches time tags that are all zeros -_ZERO_TIME_TAG_RE = re.compile(r"^\[00:00\.0{2,3}\]$") - - -def _detect_sync_status(text: str) -> CacheStatus: - """Determine whether lyrics text contains meaningful LRC time tags. - - Returns UNSYNCED if no tags exist or all tags are [00:00.00]. - """ - tags = _LRC_TIME_TAG_RE.findall(text) - if not tags: - return CacheStatus.SUCCESS_UNSYNCED - for tag in tags: - if not _ZERO_TIME_TAG_RE.match(tag): - return CacheStatus.SUCCESS_SYNCED - return CacheStatus.SUCCESS_UNSYNCED - class LocalFetcher(BaseFetcher): @property @@ -58,7 +39,7 @@ class LocalFetcher(BaseFetcher): with open(lrc_path, "r", encoding="utf-8") as f: content = f.read().strip() if content: - status = _detect_sync_status(content) + status = detect_sync_status(content) logger.info(f"Local: found .lrc sidecar ({status.value})") return LyricResult( status=status, lyrics=content, source=self.source_name @@ -83,7 +64,7 @@ class LocalFetcher(BaseFetcher): break if lyrics: - status = _detect_sync_status(lyrics) + status = detect_sync_status(lyrics) logger.info(f"Local: found embedded lyrics ({status.value})") return LyricResult( status=status, diff --git a/lrcfetch/fetchers/netease.py b/lrcfetch/fetchers/netease.py index 6eaa4d0..bae0760 100644 --- a/lrcfetch/fetchers/netease.py +++ b/lrcfetch/fetchers/netease.py @@ -7,12 +7,12 @@ Search results are filtered by duration when the track has a known length to avoid returning lyrics for the wrong version of a song. """ -import re import httpx from typing import Optional from loguru import logger from lrcfetch.models import TrackMeta, LyricResult, CacheStatus from lrcfetch.fetchers.base import BaseFetcher +from lrcfetch.lrc import is_synced from lrcfetch.config import ( HTTP_TIMEOUT, TTL_NOT_FOUND, @@ -23,34 +23,12 @@ from lrcfetch.config import ( UA_BROWSER, ) -# Matches LRC time tags like [00:12.34] or [01:23.456] -_LRC_TIME_TAG_RE = re.compile(r"\[\d{2}:\d{2}\.\d{2,3}\]") -# Matches time tags that are all zeros: [00:00.00] or [00:00.000] -_ZERO_TIME_TAG_RE = re.compile(r"^\[00:00\.0{2,3}\]") - _HEADERS = { "User-Agent": UA_BROWSER, "Referer": "https://music.163.com/", } -def _is_synced_lrc(text: str) -> bool: - """Check whether *text* contains actual LRC time tags with non-zero times. - - Returns False if: - - No time tags at all - - All time tags are [00:00.00] (unsynced disguised as synced) - """ - lines_with_tags = _LRC_TIME_TAG_RE.findall(text) - if not lines_with_tags: - return False - # Check if ALL tags are zero — if so, it's unsynced - for tag in lines_with_tags: - if not _ZERO_TIME_TAG_RE.match(tag): - return True # Found at least one non-zero tag - return False - - class NeteaseFetcher(BaseFetcher): @property def source_name(self) -> str: @@ -186,7 +164,7 @@ class NeteaseFetcher(BaseFetcher): return LyricResult(status=CacheStatus.NOT_FOUND, ttl=TTL_NOT_FOUND) # Determine sync status - synced = _is_synced_lrc(lrc) + synced = is_synced(lrc) status = CacheStatus.SUCCESS_SYNCED if synced else CacheStatus.SUCCESS_UNSYNCED logger.info( f"Netease: got {status.value} lyrics for song_id={song_id} " diff --git a/lrcfetch/lrc.py b/lrcfetch/lrc.py new file mode 100644 index 0000000..578d4b4 --- /dev/null +++ b/lrcfetch/lrc.py @@ -0,0 +1,46 @@ +"""Shared LRC time-tag utilities. + +Handles detection, normalization, and sync-status checks for LRC lyrics. +""" + +import re +from lrcfetch.models import CacheStatus + +# Standard format: [mm:ss.cc] or [mm:ss.ccc] +_STANDARD_TAG_RE = re.compile(r"\[\d{2}:\d{2}\.\d{2,3}\]") + +# Non-standard format: [mm:ss:cc] (two colons instead of dot) +_COLON_TAG_RE = re.compile(r"\[(\d{2}:\d{2}):(\d{2,3})\]") + +# Matches any LRC time tag (standard or non-standard) at start of line +LRC_LINE_RE = re.compile(r"^\[(\d{2}:\d{2}[.:]\d{2,3})\]", re.MULTILINE) + +# All-zero tags +_ZERO_TAG_RE = re.compile(r"^\[00:00[.:]0{2,3}\]$") + + +def normalize_tags(text: str) -> str: + """Convert non-standard time tags [mm:ss:cc] to standard [mm:ss.cc].""" + return _COLON_TAG_RE.sub(r"[\1.\2]", text) + + +def is_synced(text: str) -> bool: + """Check whether text contains actual LRC time tags with non-zero times. + + Returns False if no tags exist or all tags are [00:00.00]. + Handles both [mm:ss.cc] and [mm:ss:cc] formats. + """ + tags = _STANDARD_TAG_RE.findall(text) + # Also check non-standard format + tags += [f"[{m.group(1)}.{m.group(2)}]" for m in _COLON_TAG_RE.finditer(text)] + if not tags: + return False + for tag in tags: + if not _ZERO_TAG_RE.match(tag): + return True + return False + + +def detect_sync_status(text: str) -> CacheStatus: + """Determine whether lyrics contain meaningful LRC time tags.""" + return CacheStatus.SUCCESS_SYNCED if is_synced(text) else CacheStatus.SUCCESS_UNSYNCED