文件预览

make_translation_batches.py

查看 YouTube Chinese Subtitle Burn-in 技能包中的文件内容。

文件内容

scripts/make_translation_batches.py

#!/usr/bin/env python3
from __future__ import annotations

import argparse
import json
import sys
from pathlib import Path


def load_cache(path: Path | None) -> set[int]:
    """Return the ids of cached entries that already carry a translation.

    Args:
        path: Location of the translation cache JSON, or ``None`` when no
            cache was supplied. The file is iterated as a sequence of
            mapping entries that are read through their ``id`` and ``zh`` keys.

    Returns:
        The integer ids of all entries whose ``zh`` value is present and not
        blank. An empty set when ``path`` is ``None`` or does not exist.

    Raises:
        KeyError: If a translated entry has no ``id`` key.
    """
    if not path or not path.exists():
        return set()
    data = json.loads(path.read_text(encoding="utf-8"))
    completed: set[int] = set()
    for item in data:
        zh = item.get("zh")
        # JSON null decodes to None, and str(None) is the non-blank string
        # "None"; such an entry has no translation and must stay pending.
        if zh is None or not str(zh).strip():
            continue
        completed.add(int(item["id"]))
    return completed


def cue_id(cue: dict[str, object], fallback: int) -> int:
    """Return the cue's id as an int, using ``fallback`` when ``"id"`` is absent."""
    raw_id = cue["id"] if "id" in cue else fallback
    return int(raw_id)


def make_batches(items: list[dict[str, object]], batch_size: int, max_chars: int) -> list[list[dict[str, object]]]:
    """Group ``items`` into consecutive batches bounded by count and text size.

    Args:
        items: Entries to group; the size of each is the length of its
            ``"en"`` value converted to ``str`` (0 when the key is missing).
        batch_size: A batch that already holds this many items is closed
            before the next item is added.
        max_chars: A batch is closed before adding an item that would push
            its accumulated size above this value.

    Returns:
        The batches in input order. A new batch always accepts its first
        item, so a single item larger than ``max_chars`` forms its own batch.
    """
    batches: list[list[dict[str, object]]] = []
    chars_in_last = 0
    for entry in items:
        width = len(str(entry.get("en", "")))
        # The open batch is always batches[-1]; it can take the entry only if
        # both the count limit and the character limit still hold afterwards.
        has_room = (
            bool(batches)
            and len(batches[-1]) < batch_size
            and chars_in_last + width <= max_chars
        )
        if not has_room:
            batches.append([])
            chars_in_last = 0
        batches[-1].append(entry)
        chars_in_last += width
    return batches


def main() -> int:
    """Write the untranslated cues of a cue JSON file as numbered batch files.

    Parses the command line, loads the cue list from ``cue_json``, removes
    every cue whose id ``load_cache`` reports as already translated, groups
    the remainder with ``make_batches`` and writes each group, together with
    the translation instruction, to ``<out-dir>/batch-NNN.json``.

    Returns:
        0 on success, including the case where nothing is pending.

    Raises:
        SystemExit: Through ``parser.error`` when ``--batch-size`` or
            ``--max-chars`` is not a positive integer.
        KeyError: If a pending cue has no ``en`` key.
    """
    parser = argparse.ArgumentParser(description="Create numbered subtitle batches for agent-model translation.")
    parser.add_argument("cue_json", type=Path)
    parser.add_argument("--out-dir", type=Path, required=True)
    parser.add_argument("--cache", type=Path, help="Existing translation cache JSON")
    parser.add_argument("--batch-size", type=int, default=80)
    parser.add_argument("--max-chars", type=int, default=6000, help="Maximum English characters per batch")
    args = parser.parse_args()

    # Non-positive limits would silently degrade to one cue per batch file,
    # so reject them up front instead of producing a surprising layout.
    if args.batch_size < 1:
        parser.error("--batch-size must be a positive integer")
    if args.max_chars < 1:
        parser.error("--max-chars must be a positive integer")

    cues = json.loads(args.cue_json.read_text(encoding="utf-8"))
    completed = load_cache(args.cache)
    pending: list[dict[str, object]] = []
    # Cues without an explicit id fall back to their 1-based position.
    for position, cue in enumerate(cues, start=1):
        identifier = cue_id(cue, position)
        if identifier not in completed:
            pending.append({"id": identifier, "en": cue["en"]})
    if not pending:
        print("PASS: no pending subtitles")
        return 0

    args.out_dir.mkdir(parents=True, exist_ok=True)
    batches = make_batches(pending, args.batch_size, args.max_chars)
    for batch_number, batch in enumerate(batches, start=1):
        payload = {
            "instruction": (
                "Translate these English video subtitles into natural Simplified Chinese. "
                "Keep every id, return JSON only as an array of {id, zh}. "
                "Use concise on-screen wording. Preserve product names such as Codex, ChatGPT, "
                "Claude, Claude Code, GitHub, OpenAI, Chronicle, Computer Use, MCP."
            ),
            "items": batch,
        }
        output = args.out_dir / f"batch-{batch_number:03d}.json"
        # ensure_ascii=False keeps non-ASCII subtitle text readable in the file.
        output.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8")
        print(f"wrote {output}")
    print(f"PASS: wrote {len(batches)} batch file(s), {len(pending)} pending cue(s)")
    return 0


# Run the CLI only when executed as a script; main()'s return value is
# passed to sys.exit and becomes the process exit status.
if __name__ == "__main__":
    sys.exit(main())