feat: normalize option

2026-04-08 07:32:25 +02:00
parent 1e0f8e2868
commit 587d5dbe46
5 changed files with 245 additions and 22 deletions
@@ -99,7 +99,17 @@ def fetch(
    plain: Annotated[
        bool,
        cyclopts.Parameter(
-            name="--plain", negative="", help="Output only the raw lyrics without tags."
+            name="--plain",
+            negative="",
+            help="Output only plain lyrics without tags (highest priority over --normalize).",
+        ),
+    ] = False,
+    normalize: Annotated[
+        bool,
+        cyclopts.Parameter(
+            name="--normalize",
+            negative="",
+            help="Output normalized LRC (ignored when --plain is also set).",
        ),
    ] = False,
 ):
@@ -123,7 +133,12 @@ def fetch(
        logger.error("No lyrics found.")
        sys.exit(1)

-    print(result.lyrics.to_text(plain=plain))
+    if plain:
+        print(result.lyrics.to_plain())
+    elif normalize:
+        print(result.lyrics.to_normalized_text())
+    else:
+        print(result.lyrics.to_text())


 # search
@@ -179,7 +194,17 @@ def search(
    plain: Annotated[
        bool,
        cyclopts.Parameter(
-            name="--plain", negative="", help="Output only the raw lyrics without tags."
+            name="--plain",
+            negative="",
+            help="Output only plain lyrics without tags (highest priority over --normalize).",
+        ),
+    ] = False,
+    normalize: Annotated[
+        bool,
+        cyclopts.Parameter(
+            name="--normalize",
+            negative="",
+            help="Output normalized LRC (ignored when --plain is also set).",
        ),
    ] = False,
 ):
@@ -214,7 +239,12 @@ def search(
        logger.error("No lyrics found.")
        sys.exit(1)

-    print(result.lyrics.to_text(plain=plain))
+    if plain:
+        print(result.lyrics.to_plain())
+    elif normalize:
+        print(result.lyrics.to_normalized_text())
+    else:
+        print(result.lyrics.to_text())


 # export
@@ -253,7 +283,17 @@ def export(
    plain: Annotated[
        bool,
        cyclopts.Parameter(
-            name="--plain", negative="", help="Export only the raw lyrics without tags."
+            name="--plain",
+            negative="",
+            help="Export only plain lyrics (.txt, highest priority over --normalize).",
+        ),
+    ] = False,
+    normalize: Annotated[
+        bool,
+        cyclopts.Parameter(
+            name="--normalize",
+            negative="",
+            help="Export normalized LRC output (ignored when --plain is also set).",
        ),
    ] = False,
 ):
@@ -307,8 +347,10 @@ def export(
        with open(output, "w", encoding="utf-8") as f:
            if plain:
                f.write(result.lyrics.to_plain())
+            elif normalize:
+                f.write(result.lyrics.to_normalized_text())
            else:
-                f.write(str(result.lyrics))
+                f.write(result.lyrics.to_text())
        logger.info(f"Exported lyrics to {output}")
    except Exception as e:
        logger.error(f"Failed to write file: {e}")
@@ -233,6 +233,14 @@ def _is_single_doc_tag_line(line: str) -> Optional[tuple[str, str]]:
    return key, value


+def _parse_offset_value(value: str) -> Optional[int]:
+    """Interpret a doc offset tag value as a whole number of milliseconds.
+
+    Surrounding whitespace is ignored. Returns None when the remaining
+    text is not a valid integer literal.
+    """
+    trimmed = value.strip()
+    try:
+        offset_ms = int(trimmed)
+    except ValueError:
+        return None
+    return offset_ms
+
+
 class LRCData:
    _lines: list[BaseLine]
    _doc_tags: dict[str, str]
@@ -265,7 +273,7 @@ class LRCData:
        self._lines = parsed

    def __str__(self) -> str:
-        return self.to_text(plain=False, include_word_sync=False)
+        return self._serialize_lines(self._lines, include_word_sync=True)

    def __repr__(self) -> str:
        return f"LRCData(doc_tags={self._doc_tags!r}, lines={self._lines!r})"
@@ -293,7 +301,7 @@ class LRCData:
            else CacheStatus.SUCCESS_UNSYNCED
        )

-    def normalize_unsynced(self):
+    def normalize_unsynced(self) -> "LRCData":
        """Convert lyrics into unsynced LRC form with [00:00.00] tags.

        - Leading blank lyric lines are skipped.
@@ -329,6 +337,59 @@ class LRCData:
        ret._doc_tags = dict(self._doc_tags)
        return ret

+    def normalize(self) -> "LRCData":
+        """Return a new LRCData normalized for decode/export oriented output.
+
+        Rules:
+        - Doc tags are placed ahead of all lyric lines, keeping their relative
+          order and any duplicates.
+        - Offset doc tags are dropped; every other doc tag is copied unchanged.
+        - Word-sync tags are removed.
+        - Non-empty lyric lines without a time tag become [00:00.00] lyrics.
+        - Empty lyric lines are dropped.
+        - A lyric line carrying several time tags becomes one line per tag.
+        - The offset (ms) is added to each lyric timestamp (clamped at 0) and
+          the lyric lines are sorted by the resulting timestamp.
+        """
+
+        def _is_offset(tag: DocTagLine) -> bool:
+            return tag.key.strip().lower() == "offset"
+
+        # Resolve the offset up front so it applies to every lyric line no
+        # matter where the tag sits; the last parseable offset tag wins.
+        # NOTE(review): the offset is *added* to timestamps here. Some LRC
+        # references describe a positive offset as making lyrics appear
+        # sooner -- confirm the intended sign convention.
+        shift_ms = 0
+        for entry in self._lines:
+            if isinstance(entry, DocTagLine) and _is_offset(entry):
+                candidate = _parse_offset_value(entry.value)
+                if candidate is not None:
+                    shift_ms = candidate
+
+        kept_tags: list[DocTagLine] = []
+        timed_texts: list[tuple[int, str]] = []
+        for entry in self._lines:
+            if isinstance(entry, DocTagLine):
+                if not _is_offset(entry):
+                    kept_tags.append(DocTagLine(key=entry.key, value=entry.value))
+                continue
+
+            assert isinstance(entry, LyricLine)
+
+            text = entry.text
+            if not text.strip():
+                continue
+
+            # Lines without any time tag are treated as starting at 0 ms.
+            stamps = entry.line_times_ms or [0]
+            timed_texts.extend((max(0, stamp + shift_ms), text) for stamp in stamps)
+
+        # Sort on the timestamp only; the stable sort keeps source order for ties.
+        timed_texts.sort(key=lambda pair: pair[0])
+
+        result = LRCData()
+        result._lines = [
+            *kept_tags,
+            *(
+                LyricLine(line_times_ms=[stamp], words=[LrcWordSegment(text=body)])
+                for stamp, body in timed_texts
+            ),
+        ]
+        # Dict form keeps only the last value for a repeated key.
+        result._doc_tags = {tag.key: tag.value for tag in kept_tags}
+        return result
+
    def to_plain(
        self,
        deduplicate: bool = False,
@@ -366,23 +427,32 @@ class LRCData:

        return "\n".join(sorted_lines).strip()

+    @staticmethod
+    def _serialize_lines(lines: list[BaseLine], include_word_sync: bool) -> str:
+        """Render every line via its to_text() and join the results with newlines.
+
+        include_word_sync is forwarded unchanged to each line's to_text().
+        """
+        rendered = [
+            entry.to_text(include_word_sync=include_word_sync) for entry in lines
+        ]
+        return "\n".join(rendered)
+
    def to_text(
        self,
-        plain: bool = False,
        include_word_sync: bool = False,
    ) -> str:
-        """Serialize to LRC text or plain text.
+        """Serialize to non-normalized LRC text.

-        - plain=True returns to_plain().
-        - include_word_sync controls rendering of per-word tags for word-sync lines.
+        - Unsynced lyrics are converted to [00:00.00]-tagged form.
+        - include_word_sync only controls rendering of per-word tags.
+        - This method does not apply normalize() rules.
        """
-        if plain:
-            return self.to_plain(deduplicate=False)
+        res = self if self.is_synced() else self.normalize_unsynced()
+        return self._serialize_lines(res._lines, include_word_sync=include_word_sync)

-        lines: list[str] = [
-            line.to_text(include_word_sync=include_word_sync) for line in self._lines
-        ]
-        return "\n".join(lines)
+    def to_normalized_text(self) -> str:
+        """Return the serialized text of the normalize() result.
+
+        Word-sync tags are never rendered in this output.
+        """
+        return self._serialize_lines(self.normalize()._lines, include_word_sync=False)


 def get_audio_path(audio_url: str, ensure_exists: bool = False) -> Optional[Path]: