fix: normalize time tags in fetched lrc (why [00:17:06]?)

2026-03-25 11:16:03 +01:00
parent 6e50352934
commit b9fa6c6705
4 changed files with 65 additions and 57 deletions
@@ -7,11 +7,11 @@ Fetch pipeline:
  4. Return the best result (synced > unsynced > None)
 """
 import re
 from typing import Optional
 from loguru import logger
 from lrcfetch.models import TrackMeta, LyricResult, CacheStatus
 from lrcfetch.config import TTL_SYNCED, TTL_UNSYNCED, TTL_NOT_FOUND, TTL_NETWORK_ERROR
 from lrcfetch.lrc import LRC_LINE_RE, normalize_tags
 from lrcfetch.cache import CacheEngine
 from lrcfetch.fetchers.base import BaseFetcher
 from lrcfetch.fetchers.local import LocalFetcher
@@ -20,10 +20,6 @@ from lrcfetch.fetchers.lrclib import LrclibFetcher
 from lrcfetch.fetchers.lrclib_search import LrclibSearchFetcher
 from lrcfetch.fetchers.netease import NeteaseFetcher
 # Matches any LRC time tag at the start of a line: [mm:ss.cc] or [mm:ss.ccc]
 _LRC_LINE_RE = re.compile(r"^\[(\d{2}:\d{2}\.\d{2,3})\]", re.MULTILINE)
 def _normalize_unsynced(lyrics: str) -> str:
    """Normalize unsynced lyrics so every line has a [00:00.00] tag.
@@ -37,11 +33,9 @@ def _normalize_unsynced(lyrics: str) -> str:
        if not stripped:
            out.append("")
            continue
-        # Strip existing time tag(s) from the beginning
+        cleaned = LRC_LINE_RE.sub("", stripped)
-        cleaned = _LRC_LINE_RE.sub("", stripped)
+        while LRC_LINE_RE.match(cleaned):
-        # Could have multiple tags like [00:12.34][00:56.78]text
+            cleaned = LRC_LINE_RE.sub("", cleaned)
        while _LRC_LINE_RE.match(cleaned):
            cleaned = _LRC_LINE_RE.sub("", cleaned)
        out.append(f"[00:00.00]{cleaned}")
    return "\n".join(out)
@@ -148,7 +142,16 @@ class LrcManager:
                logger.debug(f"[{source}] returned None (no result)")
                continue
-            # Cache the result
+            # Normalize non-standard time tags [mm:ss:cc] → [mm:ss.cc]
            if result.lyrics:
                result = LyricResult(
                    status=result.status,
                    lyrics=normalize_tags(result.lyrics),
                    source=result.source,
                    ttl=result.ttl,
                )
            # Cache the normalized result
            ttl = result.ttl or _STATUS_TTL.get(result.status, TTL_NOT_FOUND)
            self.cache.set(track, source, result, ttl_seconds=ttl)
@@ -5,34 +5,15 @@ Priority:
  2. Embedded lyrics in audio metadata (FLAC, MP3 USLT/SYLT tags)
 """
 import re
 import os
 from typing import Optional
 from loguru import logger
 from lrcfetch.models import TrackMeta, LyricResult, CacheStatus
 from lrcfetch.fetchers.base import BaseFetcher
 from lrcfetch.lrc import detect_sync_status
 from mutagen._file import File
 from mutagen.flac import FLAC
 # Matches LRC time tags like [00:12.34] or [01:23.456]
 _LRC_TIME_TAG_RE = re.compile(r"\[\d{2}:\d{2}\.\d{2,3}\]")
 # Matches time tags that are all zeros
 _ZERO_TIME_TAG_RE = re.compile(r"^\[00:00\.0{2,3}\]$")
 def _detect_sync_status(text: str) -> CacheStatus:
    """Classify lyrics text as synced or unsynced from its LRC time tags.

    The text counts as synced as soon as one time tag is not all zeros.
    Text without any tag, or whose tags are all [00:00.00]-style, is
    reported as unsynced.
    """
    has_real_timestamp = any(
        _ZERO_TIME_TAG_RE.match(found) is None
        for found in _LRC_TIME_TAG_RE.findall(text)
    )
    if has_real_timestamp:
        return CacheStatus.SUCCESS_SYNCED
    return CacheStatus.SUCCESS_UNSYNCED
 class LocalFetcher(BaseFetcher):
    @property
@@ -58,7 +39,7 @@ class LocalFetcher(BaseFetcher):
                with open(lrc_path, "r", encoding="utf-8") as f:
                    content = f.read().strip()
                if content:
-                    status = _detect_sync_status(content)
+                    status = detect_sync_status(content)
                    logger.info(f"Local: found .lrc sidecar ({status.value})")
                    return LyricResult(
                        status=status, lyrics=content, source=self.source_name
@@ -83,7 +64,7 @@ class LocalFetcher(BaseFetcher):
                            break
                if lyrics:
-                    status = _detect_sync_status(lyrics)
+                    status = detect_sync_status(lyrics)
                    logger.info(f"Local: found embedded lyrics ({status.value})")
                    return LyricResult(
                        status=status,
@@ -7,12 +7,12 @@ Search results are filtered by duration when the track has a known length
 to avoid returning lyrics for the wrong version of a song.
 """
 import re
 import httpx
 from typing import Optional
 from loguru import logger
 from lrcfetch.models import TrackMeta, LyricResult, CacheStatus
 from lrcfetch.fetchers.base import BaseFetcher
 from lrcfetch.lrc import is_synced
 from lrcfetch.config import (
    HTTP_TIMEOUT,
    TTL_NOT_FOUND,
@@ -23,34 +23,12 @@ from lrcfetch.config import (
    UA_BROWSER,
 )
 # Matches LRC time tags like [00:12.34] or [01:23.456]
 _LRC_TIME_TAG_RE = re.compile(r"\[\d{2}:\d{2}\.\d{2,3}\]")
 # Matches time tags that are all zeros: [00:00.00] or [00:00.000]
 _ZERO_TIME_TAG_RE = re.compile(r"^\[00:00\.0{2,3}\]")
 _HEADERS = {
    "User-Agent": UA_BROWSER,
    "Referer": "https://music.163.com/",
 }
 def _is_synced_lrc(text: str) -> bool:
    """Tell whether *text* holds at least one LRC time tag with a non-zero time.

    The answer is False when the text has no time tags, and also when every
    tag is all zeros ([00:00.00]), i.e. unsynced lyrics dressed up as synced.
    """
    # any() over an empty findall() result is False, covering the no-tag case.
    return any(
        _ZERO_TIME_TAG_RE.match(found) is None
        for found in _LRC_TIME_TAG_RE.findall(text)
    )
 class NeteaseFetcher(BaseFetcher):
    @property
    def source_name(self) -> str:
@@ -186,7 +164,7 @@ class NeteaseFetcher(BaseFetcher):
                return LyricResult(status=CacheStatus.NOT_FOUND, ttl=TTL_NOT_FOUND)
            # Determine sync status
-            synced = _is_synced_lrc(lrc)
+            synced = is_synced(lrc)
            status = CacheStatus.SUCCESS_SYNCED if synced else CacheStatus.SUCCESS_UNSYNCED
            logger.info(
                f"Netease: got {status.value} lyrics for song_id={song_id} "
@@ -0,0 +1,46 @@
 """Shared LRC time-tag utilities.
 Handles detection, normalization, and sync-status checks for LRC lyrics.
 """
 import re
 from lrcfetch.models import CacheStatus
 # Standard format: [mm:ss.cc] or [mm:ss.ccc]; unanchored, so it matches
 # a tag anywhere in the text.
 _STANDARD_TAG_RE = re.compile(r"\[\d{2}:\d{2}\.\d{2,3}\]")
 # Non-standard format: [mm:ss:cc] (a colon instead of a dot before the
 # fraction). Group 1 captures "mm:ss", group 2 the 2-3 digit fraction.
 _COLON_TAG_RE = re.compile(r"\[(\d{2}:\d{2}):(\d{2,3})\]")
 # Matches one LRC time tag (standard or non-standard) at the start of a line;
 # re.MULTILINE lets ^ anchor at every line start. Group 1 is the timestamp
 # without the surrounding brackets.
 LRC_LINE_RE = re.compile(r"^\[(\d{2}:\d{2}[.:]\d{2,3})\]", re.MULTILINE)
 # All-zero tags such as [00:00.00], [00:00.000] or [00:00:00]. Anchored at
 # both ends, so it is meant to be applied to a single extracted tag.
 _ZERO_TAG_RE = re.compile(r"^\[00:00[.:]0{2,3}\]$")
 def normalize_tags(text: str) -> str:
    """Rewrite colon-separated time tags into the standard dotted form.

    Every ``[mm:ss:cc]`` occurrence in *text* becomes ``[mm:ss.cc]``; all
    other content is returned unchanged.
    """
    dotted_form = r"[\1.\2]"
    return _COLON_TAG_RE.sub(dotted_form, text)
 def is_synced(text: str) -> bool:
    """Report whether *text* carries at least one non-zero LRC time tag.

    Both the standard ``[mm:ss.cc]`` and the colon-separated ``[mm:ss:cc]``
    spellings are recognised. The result is False when no tag is present
    or when every tag is all zeros (e.g. ``[00:00.00]``).
    """
    standard = _STANDARD_TAG_RE.findall(text)
    # Colon-style tags are rebuilt in dotted form before the zero check.
    colon = [
        f"[{hit.group(1)}.{hit.group(2)}]"
        for hit in _COLON_TAG_RE.finditer(text)
    ]
    return any(_ZERO_TAG_RE.match(tag) is None for tag in standard + colon)
 def detect_sync_status(text: str) -> CacheStatus:
    """Map lyrics text to a cache status based on is_synced().

    Returns CacheStatus.SUCCESS_SYNCED when is_synced(text) is true and
    CacheStatus.SUCCESS_UNSYNCED otherwise.
    """
    if is_synced(text):
        return CacheStatus.SUCCESS_SYNCED
    return CacheStatus.SUCCESS_UNSYNCED