From 03acda4478181690399e3291b7227b8dfd348105 Mon Sep 17 00:00:00 2001
From: Uyanide <pywang0608@foxmail.com>
Date: Tue, 31 Mar 2026 15:35:22 +0200
Subject: [PATCH] feat: better LRC format handling

---
 lrcfetch/core.py                   |  35 +------
 lrcfetch/fetchers/local.py         |   6 +-
 lrcfetch/fetchers/lrclib.py        |  11 ++-
 lrcfetch/fetchers/lrclib_search.py |  11 ++-
 lrcfetch/fetchers/netease.py       |   8 +-
 lrcfetch/fetchers/qqmusic.py       |   8 +-
 lrcfetch/fetchers/spotify.py       |   3 +-
 lrcfetch/lrc.py                    | 152 +++++++++++++++++++----------
 pyproject.toml                     |   2 +-
 uv.lock                            |   2 +-
 10 files changed, 133 insertions(+), 105 deletions(-)

diff --git a/lrcfetch/core.py b/lrcfetch/core.py
index de92f9a..e51bd87 100644
--- a/lrcfetch/core.py
+++ b/lrcfetch/core.py
@@ -18,32 +18,12 @@ from loguru import logger
 from .fetchers import FetcherMethodType, create_fetchers
 from .fetchers.base import BaseFetcher
 from .cache import CacheEngine
-from .lrc import LRC_LINE_RE, normalize_tags
+from .lrc import normalize_unsynced
 from .config import TTL_SYNCED, TTL_UNSYNCED, TTL_NOT_FOUND, TTL_NETWORK_ERROR
 from .models import TrackMeta, LyricResult, CacheStatus
 from .enrichers import enrich_track
 
 
-def _normalize_unsynced(lyrics: str) -> str:
-    """Normalize unsynced lyrics so every line has a [00:00.00] tag.
-
-    - Lines that already have time tags: replace with [00:00.00]
-    - Lines without time tags: prepend [00:00.00]
-    - Blank lines are kept as-is
-    """
-    out: list[str] = []
-    for line in lyrics.splitlines():
-        stripped = line.strip()
-        if not stripped:
-            out.append("")
-            continue
-        cleaned = LRC_LINE_RE.sub("", stripped)
-        while LRC_LINE_RE.match(cleaned):
-            cleaned = LRC_LINE_RE.sub("", cleaned)
-        out.append(f"[00:00.00]{cleaned}")
-    return "\n".join(out)
-
-
 # Maps CacheStatus to the default TTL used when storing results
 _STATUS_TTL: dict[CacheStatus, Optional[int]] = {
     CacheStatus.SUCCESS_SYNCED: TTL_SYNCED,
@@ -149,16 +129,7 @@ class LrcManager:
                 logger.debug(f"[{source}] returned None (no result)")
                 continue
 
-            # Normalize non-standard time tags [mm:ss:cc] → [mm:ss.cc]
-            if result.lyrics:
-                result = LyricResult(
-                    status=result.status,
-                    lyrics=normalize_tags(result.lyrics),
-                    source=result.source,
-                    ttl=result.ttl,
-                )
-
-            # Cache the normalized result (skip for self-cached fetchers)
+            # Cache the result (skip for self-cached fetchers)
             if not fetcher.self_cached:
                 ttl = result.ttl or _STATUS_TTL.get(result.status, TTL_NOT_FOUND)
                 self.cache.set(track, source, result, ttl_seconds=ttl)
@@ -184,7 +155,7 @@ class LrcManager:
             ):
                 best_result = LyricResult(
                     status=best_result.status,
-                    lyrics=_normalize_unsynced(best_result.lyrics),
+                    lyrics=normalize_unsynced(best_result.lyrics),
                     source=best_result.source,
                     ttl=best_result.ttl,
                 )
diff --git a/lrcfetch/fetchers/local.py b/lrcfetch/fetchers/local.py
index e1e46e7..8e2267e 100644
--- a/lrcfetch/fetchers/local.py
+++ b/lrcfetch/fetchers/local.py
@@ -17,7 +17,7 @@ from mutagen.flac import FLAC
 
 from .base import BaseFetcher
 from ..models import TrackMeta, LyricResult
-from ..lrc import detect_sync_status, get_audio_path, get_sidecar_path
+from ..lrc import detect_sync_status, normalize_tags, get_audio_path, get_sidecar_path
 
 
 class LocalFetcher(BaseFetcher):
@@ -45,6 +45,7 @@ class LocalFetcher(BaseFetcher):
                 with open(lrc_path, "r", encoding="utf-8") as f:
                     content = f.read().strip()
                 if content:
+                    content = normalize_tags(content)
                     status = detect_sync_status(content)
                     logger.info(f"Local: found .lrc sidecar ({status.value})")
                     return LyricResult(
@@ -77,11 +78,12 @@ class LocalFetcher(BaseFetcher):
                             break
 
                 if lyrics:
+                    lyrics = normalize_tags(lyrics.strip())
                     status = detect_sync_status(lyrics)
                     logger.info(f"Local: found embedded lyrics ({status.value})")
                     return LyricResult(
                         status=status,
-                        lyrics=lyrics.strip(),
+                        lyrics=lyrics,
                         source=f"{self.source_name} (embedded)",
                     )
                 else:
diff --git a/lrcfetch/fetchers/lrclib.py b/lrcfetch/fetchers/lrclib.py
index e5e4c3f..71d08d1 100644
--- a/lrcfetch/fetchers/lrclib.py
+++ b/lrcfetch/fetchers/lrclib.py
@@ -15,6 +15,7 @@ from urllib.parse import urlencode
 
 from .base import BaseFetcher
 from ..models import TrackMeta, LyricResult, CacheStatus
+from ..lrc import normalize_tags
 from ..config import (
     HTTP_TIMEOUT,
     TTL_UNSYNCED,
@@ -75,21 +76,23 @@ class LrclibFetcher(BaseFetcher):
             unsynced = data.get("plainLyrics")
 
             if isinstance(synced, str) and synced.strip():
+                lyrics = normalize_tags(synced.strip())
                 logger.info(
-                    f"LRCLIB: got synced lyrics ({len(synced.splitlines())} lines)"
+                    f"LRCLIB: got synced lyrics ({len(lyrics.splitlines())} lines)"
                 )
                 return LyricResult(
                     status=CacheStatus.SUCCESS_SYNCED,
-                    lyrics=synced.strip(),
+                    lyrics=lyrics,
                     source=self.source_name,
                 )
             elif isinstance(unsynced, str) and unsynced.strip():
+                lyrics = normalize_tags(unsynced.strip())
                 logger.info(
-                    f"LRCLIB: got unsynced lyrics ({len(unsynced.splitlines())} lines)"
+                    f"LRCLIB: got unsynced lyrics ({len(lyrics.splitlines())} lines)"
                 )
                 return LyricResult(
                     status=CacheStatus.SUCCESS_UNSYNCED,
-                    lyrics=unsynced.strip(),
+                    lyrics=lyrics,
                     source=self.source_name,
                     ttl=TTL_UNSYNCED,
                 )
diff --git a/lrcfetch/fetchers/lrclib_search.py b/lrcfetch/fetchers/lrclib_search.py
index 83e2c30..3fa9357 100644
--- a/lrcfetch/fetchers/lrclib_search.py
+++ b/lrcfetch/fetchers/lrclib_search.py
@@ -16,6 +16,7 @@ from urllib.parse import urlencode
 
 from .base import BaseFetcher
 from ..models import TrackMeta, LyricResult, CacheStatus
+from ..lrc import normalize_tags
 from ..config import (
     HTTP_TIMEOUT,
     TTL_UNSYNCED,
@@ -78,21 +79,23 @@ class LrclibSearchFetcher(BaseFetcher):
             unsynced = best.get("plainLyrics")
 
             if isinstance(synced, str) and synced.strip():
+                lyrics = normalize_tags(synced.strip())
                 logger.info(
-                    f"LRCLIB-search: got synced lyrics ({len(synced.splitlines())} lines)"
+                    f"LRCLIB-search: got synced lyrics ({len(lyrics.splitlines())} lines)"
                 )
                 return LyricResult(
                     status=CacheStatus.SUCCESS_SYNCED,
-                    lyrics=synced.strip(),
+                    lyrics=lyrics,
                     source=self.source_name,
                 )
             elif isinstance(unsynced, str) and unsynced.strip():
+                lyrics = normalize_tags(unsynced.strip())
                 logger.info(
-                    f"LRCLIB-search: got unsynced lyrics ({len(unsynced.splitlines())} lines)"
+                    f"LRCLIB-search: got unsynced lyrics ({len(lyrics.splitlines())} lines)"
                 )
                 return LyricResult(
                     status=CacheStatus.SUCCESS_UNSYNCED,
-                    lyrics=unsynced.strip(),
+                    lyrics=lyrics,
                     source=self.source_name,
                     ttl=TTL_UNSYNCED,
                 )
diff --git a/lrcfetch/fetchers/netease.py b/lrcfetch/fetchers/netease.py
index 9d1ebba..cee4ab9 100644
--- a/lrcfetch/fetchers/netease.py
+++ b/lrcfetch/fetchers/netease.py
@@ -18,7 +18,7 @@ from loguru import logger
 
 from .base import BaseFetcher
 from ..models import TrackMeta, LyricResult, CacheStatus
-from ..lrc import is_synced
+from ..lrc import detect_sync_status, normalize_tags
 from ..config import (
     HTTP_TIMEOUT,
     TTL_NOT_FOUND,
@@ -178,10 +178,8 @@ class NeteaseFetcher(BaseFetcher):
                 return LyricResult(status=CacheStatus.NOT_FOUND, ttl=TTL_NOT_FOUND)
 
             # Determine sync status
-            synced = is_synced(lrc)
-            status = (
-                CacheStatus.SUCCESS_SYNCED if synced else CacheStatus.SUCCESS_UNSYNCED
-            )
+            lrc = normalize_tags(lrc)
+            status = detect_sync_status(lrc)
             logger.info(
                 f"Netease: got {status.value} lyrics for song_id={song_id} "
                 f"({len(lrc.splitlines())} lines)"
diff --git a/lrcfetch/fetchers/qqmusic.py b/lrcfetch/fetchers/qqmusic.py
index d64c01e..061fc7b 100644
--- a/lrcfetch/fetchers/qqmusic.py
+++ b/lrcfetch/fetchers/qqmusic.py
@@ -17,7 +17,7 @@ from loguru import logger
 
 from .base import BaseFetcher
 from ..models import TrackMeta, LyricResult, CacheStatus
-from ..lrc import is_synced
+from ..lrc import detect_sync_status, normalize_tags
 from ..config import (
     HTTP_TIMEOUT,
     TTL_NOT_FOUND,
@@ -139,10 +139,8 @@ class QQMusicFetcher(BaseFetcher):
                 logger.debug(f"QQMusic: empty lyrics for mid={mid}")
                 return LyricResult(status=CacheStatus.NOT_FOUND, ttl=TTL_NOT_FOUND)
 
-            synced = is_synced(lrc)
-            status = (
-                CacheStatus.SUCCESS_SYNCED if synced else CacheStatus.SUCCESS_UNSYNCED
-            )
+            lrc = normalize_tags(lrc)
+            status = detect_sync_status(lrc)
             logger.info(
                 f"QQMusic: got {status.value} lyrics for mid={mid} "
                 f"({len(lrc.splitlines())} lines)"
diff --git a/lrcfetch/fetchers/spotify.py b/lrcfetch/fetchers/spotify.py
index c0e1886..110f86a 100644
--- a/lrcfetch/fetchers/spotify.py
+++ b/lrcfetch/fetchers/spotify.py
@@ -28,6 +28,7 @@ from loguru import logger
 
 from .base import BaseFetcher
 from ..models import TrackMeta, LyricResult, CacheStatus
+from ..lrc import normalize_tags
 from ..config import (
     HTTP_TIMEOUT,
     SPOTIFY_APP_VERSION,
@@ -354,7 +355,7 @@ class SpotifyFetcher(BaseFetcher):
                     # Unsynced: emit with zero timestamps
                     lrc_lines.append(f"[00:00.00]{words}")
 
-            content = "\n".join(lrc_lines)
+            content = normalize_tags("\n".join(lrc_lines))
             status = (
                 CacheStatus.SUCCESS_SYNCED
                 if is_synced
diff --git a/lrcfetch/lrc.py b/lrcfetch/lrc.py
index 65deeb6..6913512 100644
--- a/lrcfetch/lrc.py
+++ b/lrcfetch/lrc.py
@@ -1,7 +1,7 @@
 """
 Author: Uyanide pywang0608@foxmail.com
 Date: 2026-03-25 21:54:01
-Description: Shared LRC time-tag utilities
+Description: Shared LRC time-tag utilities (definitely overengineered)
 """
 
 import re
@@ -11,93 +11,145 @@ from urllib.parse import unquote
 
 from .models import CacheStatus
 
-# Standard format: [mm:ss.cc] or [mm:ss.ccc]
-_STANDARD_TAG_RE = re.compile(r"\[\d{2}:\d{2}\.\d{2,3}\]")
+# Parses any time tag input format:
+#   [mm:ss], [mm:ss.c], [mm:ss.cc], [mm:ss.ccc], [mm:ss:cc], …
+_RAW_TAG_RE = re.compile(r"\[(\d{2,}):(\d{2})(?:[.:](\d{1,3}))?\]")
 
-# Non-standard format: [mm:ss:cc] (two colons instead of dot)
-_COLON_TAG_RE = re.compile(r"\[(\d{2}:\d{2}):(\d{2,3})\]")
+# Standard format after normalization: [mm:ss.cc]
+_STD_TAG_RE = re.compile(r"\[\d{2,}:\d{2}\.\d{2}\]")
 
-# Matches any LRC time tag (standard or non-standard) at start of line
-LRC_LINE_RE = re.compile(r"^\[(\d{2}:\d{2}[.:]\d{2,3})\]", re.MULTILINE)
+# Standard format with capture groups
+_STD_TAG_CAPTURE_RE = re.compile(r"\[(\d{2,}):(\d{2})\.(\d{2})\]")
 
-# All-zero tags
-_ZERO_TAG_RE = re.compile(r"^\[00:00[.:]0{2,3}\]$")
+# Matches a standard time tag at the start of a line
+_LRC_LINE_RE = re.compile(r"^\[\d{2,}:\d{2}\.\d{2}\]", re.MULTILINE)
 
 # [offset:+/-xxx] tag — value in milliseconds
 _OFFSET_RE = re.compile(r"^\[offset:\s*([+-]?\d+)\]\s*$", re.MULTILINE | re.IGNORECASE)
 
-# Time tag for offset application: captures mm, ss, cc/ccc
-_TIME_TAG_RE = re.compile(r"\[(\d{2}):(\d{2})\.(\d{2,3})\]")
+
+def _raw_tag_to_cs(mm: str, ss: str, frac: Optional[str]) -> str:
+    """Convert parsed time tag components to standard [mm:ss.cc] string."""
+    if frac is None:
+        ms = 0
+    else:
+        # cc in [mm:ss:cc] is also treated as centiseconds, per LRC spec
+        #             ^
+        # why does this format even exist, idk
+        n = len(frac)
+        if n == 1:
+            ms = int(frac) * 100
+        elif n == 2:
+            ms = int(frac) * 10
+        else:
+            ms = int(frac)
+    cs = min(round(ms / 10), 99)
+    return f"[{mm}:{ss}.{cs:02d}]"
+
+
+def _reformat(text: str) -> str:
+    """Parse each line and reformat to standard [mm:ss.cc]...content form.
+
+    Handles any mix of time tag formats on input. Lines with no time tags
+    are stripped of leading/trailing whitespace and otherwise passed through unchanged.
+    """
+    out: list[str] = []
+    for line in text.splitlines():
+        line = line.strip()
+        pos = 0
+        tags: list[str] = []
+        while True:
+            while pos < len(line) and line[pos] == " ":
+                pos += 1
+            m = _RAW_TAG_RE.match(line, pos)
+            # A non-time tag (e.g. [offset:...]) ends the scan and stays in the line content as-is; only leading/trailing whitespace is stripped.
+            if not m:
+                # No more tags on this line
+                break
+            tags.append(_raw_tag_to_cs(m.group(1), m.group(2), m.group(3)))
+            pos = m.end()
+        if tags:
+            # This could break lyric lines of some kind of word-synced LRC format,
+            # but such formats were not planned to be supported in the first place, so…
+            out.append("".join(tags) + line[pos:].lstrip())
+        else:
+            out.append(line)
+            # Empty lines with no tags are also preserved
+    return "\n".join(out)
 
 
 def _apply_offset(text: str) -> str:
-    """Parse [offset:±ms] tag and shift all time tags accordingly.
+    """Parse [offset:±ms] and shift all standard [mm:ss.cc] tags accordingly.
 
-    Per LRC spec, a positive offset means lyrics appear sooner (subtract
-    from timestamps), negative means later (add to timestamps).
+    Per LRC spec, positive offset = lyrics appear sooner (subtract from timestamps).
     """
     m = _OFFSET_RE.search(text)
     if not m:
         return text
     offset_ms = int(m.group(1))
+    text = _OFFSET_RE.sub("", text).strip("\n")
     if offset_ms == 0:
-        return _OFFSET_RE.sub("", text).strip("\n")
-
-    # Remove the offset tag line
-    text = _OFFSET_RE.sub("", text)
+        return text
 
     def _shift(match: re.Match) -> str:
-        mm, ss, cs = int(match.group(1)), int(match.group(2)), match.group(3)
-        # Normalize centiseconds to milliseconds
-        if len(cs) == 2:
-            ms = int(cs) * 10
-            fmt_cs = 2
-        else:
-            ms = int(cs)
-            fmt_cs = 3
-        total_ms = (mm * 60 + ss) * 1000 + ms - offset_ms
-        total_ms = max(0, total_ms)
+        total_ms = max(
+            0,
+            (int(match.group(1)) * 60 + int(match.group(2))) * 1000
+            + int(match.group(3)) * 10
+            - offset_ms,
+        )
         new_mm = total_ms // 60000
         new_ss = (total_ms % 60000) // 1000
-        new_cs = total_ms % 1000
-        if fmt_cs == 2:
-            new_cs = new_cs // 10
-            return f"[{new_mm:02d}:{new_ss:02d}.{new_cs:02d}]"
-        return f"[{new_mm:02d}:{new_ss:02d}.{new_cs:03d}]"
+        new_cs = min(round((total_ms % 1000) / 10), 99)
+        return f"[{new_mm:02d}:{new_ss:02d}.{new_cs:02d}]"
 
-    return _TIME_TAG_RE.sub(_shift, text)
+    return _STD_TAG_CAPTURE_RE.sub(_shift, text)
 
 
 def normalize_tags(text: str) -> str:
-    """Normalize LRC time tags: colon format → dot format, then apply offset."""
-    text = _COLON_TAG_RE.sub(r"[\1.\2]", text)
-    return _apply_offset(text)
+    """Normalize LRC to standard form: reformat all tags to [mm:ss.cc], then apply offset."""
+    return _apply_offset(_reformat(text))
 
 
 def is_synced(text: str) -> bool:
-    """Check whether text contains actual LRC time tags with non-zero times.
+    """Check whether text contains non-zero LRC time tags.
 
-    Returns False if no tags exist or all tags are [00:00.00].
-    Handles both [mm:ss.cc] and [mm:ss:cc] formats.
+    Assumes text has been normalized by normalize_tags (standard [mm:ss.cc] format).
     """
-    tags = _STANDARD_TAG_RE.findall(text)
-    # Also check non-standard format
-    tags += [f"[{m.group(1)}.{m.group(2)}]" for m in _COLON_TAG_RE.finditer(text)]
-    if not tags:
-        return False
-    for tag in tags:
-        if not _ZERO_TAG_RE.match(tag):
-            return True
-    return False
+    tags = _STD_TAG_RE.findall(text)
+    return bool(tags) and any(tag != "[00:00.00]" for tag in tags)
 
 
 def detect_sync_status(text: str) -> CacheStatus:
-    """Determine whether lyrics contain meaningful LRC time tags."""
+    """Determine whether lyrics contain meaningful LRC time tags.
+
+    Assumes text has been normalized by normalize_tags.
+    """
     return (
         CacheStatus.SUCCESS_SYNCED if is_synced(text) else CacheStatus.SUCCESS_UNSYNCED
     )
 
 
+def normalize_unsynced(lyrics: str) -> str:
+    """Normalize unsynced lyrics so every line has a [00:00.00] tag.
+
+    - Lines that already have time tags: replace with [00:00.00]
+    - Lines without time tags: prepend [00:00.00]
+    - Blank lines are converted to [00:00.00]
+    """
+    out: list[str] = []
+    for line in lyrics.splitlines():
+        stripped = line.strip()
+        if not stripped:
+            out.append("[00:00.00]")
+            continue
+        cleaned = _LRC_LINE_RE.sub("", stripped)
+        while _LRC_LINE_RE.match(cleaned):
+            cleaned = _LRC_LINE_RE.sub("", cleaned)
+        out.append(f"[00:00.00]{cleaned}")
+    return "\n".join(out)
+
+
 def get_audio_path(audio_url: str, ensure_exists: bool = False) -> Optional[Path]:
     """Convert file:// URL to Path, return None if invalid or (if ensure_exists) file doesn't exist."""
     if not audio_url.startswith("file://"):
diff --git a/pyproject.toml b/pyproject.toml
index 001acdb..f969b98 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "lrcfetch"
-version = "0.1.5"
+version = "0.1.6"
 description = "Fetch line-synced lyrics for your music player."
 readme = "README.md"
 requires-python = ">=3.13"
diff --git a/uv.lock b/uv.lock
index 7fa21c8..6431238 100644
--- a/uv.lock
+++ b/uv.lock
@@ -153,7 +153,7 @@ wheels = [
 
 [[package]]
 name = "lrcfetch"
-version = "0.1.5"
+version = "0.1.6"
 source = { editable = "." }
 dependencies = [
     { name = "cyclopts" },