fix: normalize time tags in fetched lrc (why [00:17:06]?)

This commit is contained in:
2026-03-25 11:16:03 +01:00
parent 6e50352934
commit b9fa6c6705
4 changed files with 65 additions and 57 deletions
+14 -11
View File
@@ -7,11 +7,11 @@ Fetch pipeline:
4. Return the best result (synced > unsynced > None) 4. Return the best result (synced > unsynced > None)
""" """
import re
from typing import Optional from typing import Optional
from loguru import logger from loguru import logger
from lrcfetch.models import TrackMeta, LyricResult, CacheStatus from lrcfetch.models import TrackMeta, LyricResult, CacheStatus
from lrcfetch.config import TTL_SYNCED, TTL_UNSYNCED, TTL_NOT_FOUND, TTL_NETWORK_ERROR from lrcfetch.config import TTL_SYNCED, TTL_UNSYNCED, TTL_NOT_FOUND, TTL_NETWORK_ERROR
from lrcfetch.lrc import LRC_LINE_RE, normalize_tags
from lrcfetch.cache import CacheEngine from lrcfetch.cache import CacheEngine
from lrcfetch.fetchers.base import BaseFetcher from lrcfetch.fetchers.base import BaseFetcher
from lrcfetch.fetchers.local import LocalFetcher from lrcfetch.fetchers.local import LocalFetcher
@@ -20,10 +20,6 @@ from lrcfetch.fetchers.lrclib import LrclibFetcher
from lrcfetch.fetchers.lrclib_search import LrclibSearchFetcher from lrcfetch.fetchers.lrclib_search import LrclibSearchFetcher
from lrcfetch.fetchers.netease import NeteaseFetcher from lrcfetch.fetchers.netease import NeteaseFetcher
# Matches one LRC time tag at the start of a line: [mm:ss.cc] or [mm:ss.ccc].
# re.MULTILINE lets ^ match at the start of every line, not only at the start
# of the string. Group 1 captures the timestamp without the brackets.
_LRC_LINE_RE = re.compile(r"^\[(\d{2}:\d{2}\.\d{2,3})\]", re.MULTILINE)
def _normalize_unsynced(lyrics: str) -> str: def _normalize_unsynced(lyrics: str) -> str:
"""Normalize unsynced lyrics so every line has a [00:00.00] tag. """Normalize unsynced lyrics so every line has a [00:00.00] tag.
@@ -37,11 +33,9 @@ def _normalize_unsynced(lyrics: str) -> str:
if not stripped: if not stripped:
out.append("") out.append("")
continue continue
# Strip existing time tag(s) from the beginning cleaned = LRC_LINE_RE.sub("", stripped)
cleaned = _LRC_LINE_RE.sub("", stripped) while LRC_LINE_RE.match(cleaned):
# Could have multiple tags like [00:12.34][00:56.78]text cleaned = LRC_LINE_RE.sub("", cleaned)
while _LRC_LINE_RE.match(cleaned):
cleaned = _LRC_LINE_RE.sub("", cleaned)
out.append(f"[00:00.00]{cleaned}") out.append(f"[00:00.00]{cleaned}")
return "\n".join(out) return "\n".join(out)
@@ -148,7 +142,16 @@ class LrcManager:
logger.debug(f"[{source}] returned None (no result)") logger.debug(f"[{source}] returned None (no result)")
continue continue
# Cache the result # Normalize non-standard time tags [mm:ss:cc] → [mm:ss.cc]
if result.lyrics:
result = LyricResult(
status=result.status,
lyrics=normalize_tags(result.lyrics),
source=result.source,
ttl=result.ttl,
)
# Cache the normalized result
ttl = result.ttl or _STATUS_TTL.get(result.status, TTL_NOT_FOUND) ttl = result.ttl or _STATUS_TTL.get(result.status, TTL_NOT_FOUND)
self.cache.set(track, source, result, ttl_seconds=ttl) self.cache.set(track, source, result, ttl_seconds=ttl)
+3 -22
View File
@@ -5,34 +5,15 @@ Priority:
2. Embedded lyrics in audio metadata (FLAC, MP3 USLT/SYLT tags) 2. Embedded lyrics in audio metadata (FLAC, MP3 USLT/SYLT tags)
""" """
import re
import os import os
from typing import Optional from typing import Optional
from loguru import logger from loguru import logger
from lrcfetch.models import TrackMeta, LyricResult, CacheStatus from lrcfetch.models import TrackMeta, LyricResult, CacheStatus
from lrcfetch.fetchers.base import BaseFetcher from lrcfetch.fetchers.base import BaseFetcher
from lrcfetch.lrc import detect_sync_status
from mutagen._file import File from mutagen._file import File
from mutagen.flac import FLAC from mutagen.flac import FLAC
# Matches LRC time tags like [00:12.34] or [01:23.456] anywhere in the text
# (the pattern is not anchored).
_LRC_TIME_TAG_RE = re.compile(r"\[\d{2}:\d{2}\.\d{2,3}\]")
# Matches a string that is a single all-zero time tag, i.e. [00:00.00] or
# [00:00.000]. Anchored at both ends, so it is meant to be applied to one
# extracted tag at a time rather than to whole lyrics.
_ZERO_TIME_TAG_RE = re.compile(r"^\[00:00\.0{2,3}\]$")
def _detect_sync_status(text: str) -> CacheStatus:
    """Classify lyrics text as synced or unsynced from its LRC time tags.

    Every tag found by ``_LRC_TIME_TAG_RE`` is checked against
    ``_ZERO_TIME_TAG_RE``. The text is SUCCESS_SYNCED as soon as one tag is
    not all zeros; with no tags at all, or only [00:00.00]-style tags, it is
    SUCCESS_UNSYNCED.
    """
    # any() over an empty findall() result is False, which covers the
    # "no tags at all" case without a separate emptiness check.
    has_real_timestamp = any(
        _ZERO_TIME_TAG_RE.match(found) is None
        for found in _LRC_TIME_TAG_RE.findall(text)
    )
    if has_real_timestamp:
        return CacheStatus.SUCCESS_SYNCED
    return CacheStatus.SUCCESS_UNSYNCED
class LocalFetcher(BaseFetcher): class LocalFetcher(BaseFetcher):
@property @property
@@ -58,7 +39,7 @@ class LocalFetcher(BaseFetcher):
with open(lrc_path, "r", encoding="utf-8") as f: with open(lrc_path, "r", encoding="utf-8") as f:
content = f.read().strip() content = f.read().strip()
if content: if content:
status = _detect_sync_status(content) status = detect_sync_status(content)
logger.info(f"Local: found .lrc sidecar ({status.value})") logger.info(f"Local: found .lrc sidecar ({status.value})")
return LyricResult( return LyricResult(
status=status, lyrics=content, source=self.source_name status=status, lyrics=content, source=self.source_name
@@ -83,7 +64,7 @@ class LocalFetcher(BaseFetcher):
break break
if lyrics: if lyrics:
status = _detect_sync_status(lyrics) status = detect_sync_status(lyrics)
logger.info(f"Local: found embedded lyrics ({status.value})") logger.info(f"Local: found embedded lyrics ({status.value})")
return LyricResult( return LyricResult(
status=status, status=status,
+2 -24
View File
@@ -7,12 +7,12 @@ Search results are filtered by duration when the track has a known length
to avoid returning lyrics for the wrong version of a song. to avoid returning lyrics for the wrong version of a song.
""" """
import re
import httpx import httpx
from typing import Optional from typing import Optional
from loguru import logger from loguru import logger
from lrcfetch.models import TrackMeta, LyricResult, CacheStatus from lrcfetch.models import TrackMeta, LyricResult, CacheStatus
from lrcfetch.fetchers.base import BaseFetcher from lrcfetch.fetchers.base import BaseFetcher
from lrcfetch.lrc import is_synced
from lrcfetch.config import ( from lrcfetch.config import (
HTTP_TIMEOUT, HTTP_TIMEOUT,
TTL_NOT_FOUND, TTL_NOT_FOUND,
@@ -23,34 +23,12 @@ from lrcfetch.config import (
UA_BROWSER, UA_BROWSER,
) )
# Matches LRC time tags like [00:12.34] or [01:23.456] anywhere in the text
# (the pattern is not anchored).
_LRC_TIME_TAG_RE = re.compile(r"\[\d{2}:\d{2}\.\d{2,3}\]")
# Matches time tags that are all zeros: [00:00.00] or [00:00.000].
# Anchored at the start only; it is applied to individual extracted tags.
_ZERO_TIME_TAG_RE = re.compile(r"^\[00:00\.0{2,3}\]")
_HEADERS = { _HEADERS = {
"User-Agent": UA_BROWSER, "User-Agent": UA_BROWSER,
"Referer": "https://music.163.com/", "Referer": "https://music.163.com/",
} }
def _is_synced_lrc(text: str) -> bool:
    """Tell whether *text* carries at least one non-zero LRC time tag.

    The result is False when:
    - *text* has no time tags at all, or
    - every tag is all zeros ([00:00.00]), i.e. unsynced lyrics that merely
      look synced.
    """
    # One tag that fails the all-zero pattern is enough to call it synced;
    # an empty findall() result makes any() return False.
    return any(
        _ZERO_TIME_TAG_RE.match(found) is None
        for found in _LRC_TIME_TAG_RE.findall(text)
    )
class NeteaseFetcher(BaseFetcher): class NeteaseFetcher(BaseFetcher):
@property @property
def source_name(self) -> str: def source_name(self) -> str:
@@ -186,7 +164,7 @@ class NeteaseFetcher(BaseFetcher):
return LyricResult(status=CacheStatus.NOT_FOUND, ttl=TTL_NOT_FOUND) return LyricResult(status=CacheStatus.NOT_FOUND, ttl=TTL_NOT_FOUND)
# Determine sync status # Determine sync status
synced = _is_synced_lrc(lrc) synced = is_synced(lrc)
status = CacheStatus.SUCCESS_SYNCED if synced else CacheStatus.SUCCESS_UNSYNCED status = CacheStatus.SUCCESS_SYNCED if synced else CacheStatus.SUCCESS_UNSYNCED
logger.info( logger.info(
f"Netease: got {status.value} lyrics for song_id={song_id} " f"Netease: got {status.value} lyrics for song_id={song_id} "
+46
View File
@@ -0,0 +1,46 @@
"""Shared LRC time-tag utilities.
Handles detection, normalization, and sync-status checks for LRC lyrics.
"""
import re
from lrcfetch.models import CacheStatus
# Standard format: [mm:ss.cc] or [mm:ss.ccc] — two-digit minutes and seconds,
# a dot, then a 2-3 digit fraction. Not anchored: matches anywhere in the text.
_STANDARD_TAG_RE = re.compile(r"\[\d{2}:\d{2}\.\d{2,3}\]")
# Non-standard format: [mm:ss:cc] or [mm:ss:ccc] (a second colon instead of
# the dot). Group 1 captures "mm:ss" and group 2 the fraction digits, so the
# tag can be rebuilt with a dot. Not anchored: matches anywhere in the text.
_COLON_TAG_RE = re.compile(r"\[(\d{2}:\d{2}):(\d{2,3})\]")
# Matches any LRC time tag (standard or non-standard) at start of line.
# re.MULTILINE lets ^ match at the start of every line; group 1 is the
# timestamp without the surrounding brackets.
LRC_LINE_RE = re.compile(r"^\[(\d{2}:\d{2}[.:]\d{2,3})\]", re.MULTILINE)
# All-zero tags such as [00:00.00], [00:00.000] or [00:00:00]. Anchored at
# both ends, so it is meant to be applied to a single extracted tag.
_ZERO_TAG_RE = re.compile(r"^\[00:00[.:]0{2,3}\]$")
def normalize_tags(text: str) -> str:
    """Rewrite colon-separated time tags into the standard dotted form.

    Every ``[mm:ss:cc]`` tag matched by ``_COLON_TAG_RE`` is replaced with
    ``[mm:ss.cc]``; all other text is returned unchanged.
    """
    # \1 is the "mm:ss" part and \2 the fraction digits captured by the pattern.
    dotted_form = r"[\1.\2]"
    normalized = _COLON_TAG_RE.sub(dotted_form, text)
    return normalized
def is_synced(text: str) -> bool:
    """Tell whether *text* carries at least one non-zero LRC time tag.

    Both the standard ``[mm:ss.cc]`` and the non-standard ``[mm:ss:cc]``
    spellings are recognised. The result is False when no tag is present or
    when every tag is all zeros (``[00:00.00]``).
    """
    dotted_tags = _STANDARD_TAG_RE.findall(text)
    # Colon-style tags are rebuilt in dotted form before the zero check.
    colon_tags = [
        f"[{m.group(1)}.{m.group(2)}]" for m in _COLON_TAG_RE.finditer(text)
    ]
    # One tag that is not all zeros is enough; any() over an empty list is
    # False, which covers the "no tags" case.
    return any(
        _ZERO_TAG_RE.match(candidate) is None
        for candidate in [*dotted_tags, *colon_tags]
    )
def detect_sync_status(text: str) -> CacheStatus:
    """Map lyrics text to a cache status based on its LRC time tags.

    Returns ``CacheStatus.SUCCESS_SYNCED`` when ``is_synced(text)`` is true,
    otherwise ``CacheStatus.SUCCESS_UNSYNCED``.
    """
    if is_synced(text):
        return CacheStatus.SUCCESS_SYNCED
    return CacheStatus.SUCCESS_UNSYNCED