文件预览

make_preview_clip.py

查看 YouTube Chinese Subtitle Burn-in 技能包中的文件内容。

文件内容

scripts/make_preview_clip.py

#!/usr/bin/env python3
from __future__ import annotations

import argparse
import re
import shutil
import subprocess
import sys
import tempfile
from dataclasses import dataclass
from pathlib import Path

from PIL import Image, ImageDraw, ImageFont

from subtitle_style import get_style, style_names

# Matches an SRT timing line; groups 1 and 2 are the start and end stamps (HH:MM:SS,mmm).
SRT_TS = re.compile(r"(\d{2}:\d{2}:\d{2},\d{3})\s+-->\s+(\d{2}:\d{2}:\d{2},\d{3})")
# Matches a "Dialogue:" event line. One comma-separated field is skipped, the next two
# are captured as groups 1 and 2 (passed to parse_ass_time by parse_ass), six more
# fields are skipped, and group 3 captures the rest of the line, commas included.
ASS_TS = re.compile(r"Dialogue:[^,]*,([^,]+),([^,]+),[^,]*,[^,]*,[^,]*,[^,]*,[^,]*,[^,]*,(.*)")
# Matches one brace-delimited span ({...}); clean_ass_text removes every such span.
ASS_OVERRIDE = re.compile(r"\{[^}]*\}")


@dataclass
class Cue:
    """One subtitle cue as produced by the parsers in this file.

    ``parse_srt`` and ``parse_ass`` fill ``start``/``end`` with values in seconds.
    ``parse_srt`` always uses the style ``"Default"``; ``parse_ass`` takes the style
    from the fourth comma-separated field of the Dialogue line.
    """

    # Start of the cue, in seconds.
    start: float
    # End of the cue, in seconds.
    end: float
    # Cue text; may contain "\n" for multi-line cues.
    text: str
    # Style label; compared case-insensitively against "chinese"/"english" prefixes elsewhere.
    style: str = "Default"


def parse_srt_time(value: str) -> float:
    """Convert an SRT timestamp of the form ``HH:MM:SS,mmm`` into seconds.

    Raises:
        ValueError: if the string does not have that shape.
    """
    clock, millis = value.split(",")
    hours, minutes, secs = (int(field) for field in clock.split(":"))
    return hours * 3600 + minutes * 60 + secs + int(millis) / 1000


def parse_ass_time(value: str) -> float:
    """Convert an ASS timestamp (``H:MM:SS.cc``) into seconds.

    Surrounding whitespace is ignored. The digits after the dot are read as an
    integer count of hundredths of a second.

    Raises:
        ValueError: if the string does not have that shape.
    """
    clock, centis = value.strip().split(".")
    hours, minutes, secs = (int(field) for field in clock.split(":"))
    whole_seconds = hours * 3600 + minutes * 60 + secs
    return whole_seconds + int(centis) / 100


def format_srt_time(seconds: float) -> str:
    """Render a duration in seconds as an SRT timestamp ``HH:MM:SS,mmm``.

    Negative input is clamped to zero and the value is rounded to the nearest
    millisecond before being split into fields.
    """
    millis_total = int(round(max(0, seconds) * 1000))
    remaining_seconds, millis = divmod(millis_total, 1000)
    remaining_minutes, secs = divmod(remaining_seconds, 60)
    hours, minutes = divmod(remaining_minutes, 60)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"


def clean_ass_text(text: str) -> str:
    """Strip ``{...}`` override spans and turn ASS line breaks into real newlines.

    Both ``\\N`` and ``\\n`` markers become ``"\\n"``; the result is stripped of
    leading and trailing whitespace.
    """
    cleaned = ASS_OVERRIDE.sub("", text)
    for marker in ("\\N", "\\n"):
        cleaned = cleaned.replace(marker, "\n")
    return cleaned.strip()


def parse_srt(path: Path) -> list[Cue]:
    """Read an SRT file (UTF-8, optional BOM) and return its cues.

    The file is split into blocks on blank lines. Within a block the first line
    containing a timing arrow defines the cue; every non-blank line after it
    becomes the cue text. Blocks without a timing line are skipped. All cues
    get the style ``"Default"``.
    """
    raw = path.read_text(encoding="utf-8-sig").strip()
    cues: list[Cue] = []
    for block in re.split(r"\n\s*\n", raw):
        rows = [row.rstrip() for row in block.splitlines() if row.strip()]
        for position, row in enumerate(rows):
            timing = SRT_TS.search(row)
            if timing is None:
                continue
            begin, finish = (parse_srt_time(stamp) for stamp in timing.groups())
            body = "\n".join(rows[position + 1 :])
            cues.append(Cue(begin, finish, body, "Default"))
            # Only the first timing line in a block counts.
            break
    return cues


def parse_ass(path: Path) -> list[Cue]:
    """Read an ASS file (UTF-8, optional BOM) and return one cue per Dialogue line.

    Lines that do not match ``ASS_TS`` are ignored. The cue style is the fourth
    comma-separated field of the line; the text has override spans removed and
    line-break markers converted by ``clean_ass_text``.
    """
    cues: list[Cue] = []
    for line in path.read_text(encoding="utf-8-sig").splitlines():
        event = ASS_TS.match(line)
        if event is None:
            continue
        begin, finish, body = event.groups()
        fields = line.split(",", 4)
        if line.startswith("Dialogue:") and len(fields) >= 4:
            style = fields[3].strip()
        else:
            style = "Default"
        cues.append(Cue(parse_ass_time(begin), parse_ass_time(finish), clean_ass_text(body), style))
    return cues


def parse_subtitle(path: Path) -> list[Cue]:
    """Parse a subtitle file, choosing the parser from the file suffix.

    ``.srt`` and ``.ass`` are accepted (case-insensitively).

    Raises:
        SystemExit: for any other suffix.
    """
    suffix = path.suffix.lower()
    if suffix not in (".srt", ".ass"):
        raise SystemExit(f"Unsupported subtitle format: {path.suffix}")
    return parse_srt(path) if suffix == ".srt" else parse_ass(path)


def parse_time(value: str | None) -> float | None:
    """Parse a user-supplied time into seconds.

    Accepted forms are plain seconds (``"75"``, ``"75.5"``), ``MM:SS[.fff]`` and
    ``HH:MM:SS[.fff]``. ``None`` is passed through unchanged so an omitted
    command-line option stays "unset".

    Raises:
        SystemExit: with ``Invalid time: <value>`` for anything else, including
            colon-separated input whose fields are not numeric.
    """
    if value is None:
        return None
    value = value.strip()
    if re.fullmatch(r"\d+(\.\d+)?", value):
        return float(value)
    parts = value.split(":")
    try:
        if len(parts) == 3:
            return int(parts[0]) * 3600 + int(parts[1]) * 60 + float(parts[2])
        if len(parts) == 2:
            return int(parts[0]) * 60 + float(parts[1])
    except ValueError:
        # Non-numeric fields (e.g. "1:xx") are a user error, not a crash: report
        # them the same way as any other malformed time.
        raise SystemExit(f"Invalid time: {value}") from None
    raise SystemExit(f"Invalid time: {value}")


def overlapping_cues(cues: list[Cue], start: float, end: float) -> list[Cue]:
    """Return the non-blank cues that intersect the open interval (start, end).

    A cue that merely touches a boundary (ends exactly at ``start`` or begins
    exactly at ``end``) is not included. Input order is preserved.
    """
    hits: list[Cue] = []
    for cue in cues:
        if not (cue.end > start and cue.start < end):
            continue
        if cue.text.strip():
            hits.append(cue)
    return hits


def choose_window(cues: list[Cue], requested_start: float | None, duration: float, max_duration: float, increment: float) -> tuple[float, float, list[Cue]]:
    """Pick a preview window that contains at least one non-blank cue.

    The window starts at ``requested_start`` (0 when it is ``None``) and is first
    tried at ``duration`` seconds. While it overlaps no cue it is widened by
    ``increment`` seconds, as long as it does not exceed ``max_duration``. If no
    width works, the window is moved to two seconds before the first non-blank
    cue (clamped at 0) and the original ``duration`` is used.

    Returns:
        ``(start, duration, selected)`` where ``selected`` are the cues that
        overlap the chosen window. ``selected`` can be empty when every cue is
        blank.

    Raises:
        SystemExit: if ``cues`` is empty.
    """
    if not cues:
        raise SystemExit("No subtitle cues found")
    start = requested_start if requested_start is not None else 0
    current_duration = duration
    while current_duration <= max_duration:
        selected = overlapping_cues(cues, start, start + current_duration)
        if selected:
            return start, current_duration, selected
        if increment <= 0:
            # The window cannot grow, so retrying would loop forever; go
            # straight to the first-cue fallback below.
            break
        current_duration += increment
    first = next((cue for cue in cues if cue.text.strip()), cues[0])
    start = max(0, first.start - 2)
    return start, duration, overlapping_cues(cues, start, start + duration)


def write_shifted_srt(cues: list[Cue], start: float, end: float, output: Path) -> None:
    """Write the cues inside [start, end] to ``output`` as an SRT re-based to t=0.

    Each cue is clipped to the window, blank cues and cues outside the window
    are dropped, and the survivors are numbered from 1. Timestamps are written
    relative to ``start``. The file is UTF-8 encoded.
    """
    entries: list[str] = []
    for cue in cues:
        body = cue.text.strip()
        clipped_start = max(cue.start, start)
        clipped_end = min(cue.end, end)
        if clipped_end <= start or clipped_start >= end or not body:
            continue
        number = len(entries) + 1
        timing = f"{format_srt_time(clipped_start - start)} --> {format_srt_time(clipped_end - start)}"
        # Each entry ends with "\n"; joining with "\n" yields the blank separator line.
        entries.append(f"{number}\n{timing}\n{body}\n")
    output.write_text("\n".join(entries), encoding="utf-8")


def has_filter(name: str) -> bool:
    """Report whether the local ffmpeg build lists a filter called ``name``.

    Runs ``ffmpeg -hide_banner -filters`` and looks for ``name`` in the filter
    name column only (an optional flags token, the name, then an ``in->out``
    pad description). Matching the column rather than any word in the output
    avoids false positives from filter descriptions that merely mention the
    name. Returns ``False`` when ffmpeg exits with a non-zero status.
    """
    result = subprocess.run(["ffmpeg", "-hide_banner", "-filters"], check=False, text=True, capture_output=True)
    if result.returncode != 0:
        return False
    # re.escape keeps names containing regex metacharacters literal.
    listing_row = rf"^\s*(?:\S+\s+)?{re.escape(name)}\s+\S+->\S+"
    return re.search(listing_row, result.stdout, flags=re.MULTILINE) is not None


def find_font() -> str | None:
    """Return the first existing font file from a fixed candidate list, else ``None``.

    Candidates are checked in the order listed.
    """
    for candidate in (
        "/System/Library/Fonts/PingFang.ttc",
        "/System/Library/Fonts/STHeiti Light.ttc",
        "/System/Library/Fonts/Supplemental/Arial Unicode.ttf",
        "/Library/Fonts/Arial Unicode.ttf",
    ):
        if Path(candidate).exists():
            return candidate
    return None


def write_drawtext_files(cues: list[Cue], start: float, end: float, temp_dir: Path, style_profile: str) -> str:
    """Build a comma-chained ``drawtext`` filter string, one filter per visible cue.

    Each cue is clipped to [start, end]; blank cues and cues outside the window
    are skipped. The cue text is written to ``temp_dir/cue_<idx>.txt`` and the
    filter refers to it by bare file name, so ffmpeg has to be run with
    ``temp_dir`` as its working directory. Size, stroke width and bottom margin
    come from the style returned by ``get_style(style_profile)``.

    Returns:
        The filter-graph string to pass to ``-vf``.

    Raises:
        SystemExit: if no cue survives the clipping.
    """
    style = get_style(style_profile)
    font = find_font()
    # Loop-invariant: the same font (if any) is used for every cue.
    font_part = f"fontfile='{font}':" if font else ""
    fontsize = style.zh_size
    filters: list[str] = []
    for idx, cue in enumerate(cues):
        text = cue.text.strip()
        cue_start = max(cue.start, start)
        cue_end = min(cue.end, end)
        if cue_end <= start or cue_start >= end or not text:
            continue
        textfile_name = f"cue_{idx}.txt"
        (temp_dir / textfile_name).write_text(text, encoding="utf-8")
        filters.append(
            "drawtext="
            f"{font_part}"
            f"textfile={textfile_name}:"
            # Subtitle text is literal: without this, drawtext's default text
            # expansion treats '%' and '\' in the cue as syntax and can fail.
            "expansion=none:"
            f"fontcolor=white:fontsize={fontsize}:borderw={style.stroke_width}:bordercolor=black:"
            f"x=(w-text_w)/2:y=h-text_h-{style.zh_margin_v}:"
            f"enable='between(t\\,{cue_start - start:.3f}\\,{cue_end - start:.3f})'"
        )
    if not filters:
        raise SystemExit("No subtitle cues available for drawtext fallback")
    return ",".join(filters)


def load_font(size: int) -> ImageFont.FreeTypeFont | ImageFont.ImageFont:
    """Load the font found by ``find_font`` at ``size``, or Pillow's default font.

    The default font is used both when no candidate font file exists and when
    opening the candidate raises ``OSError``.
    """
    font_path = find_font()
    if not font_path:
        return ImageFont.load_default()
    try:
        return ImageFont.truetype(font_path, size)
    except OSError:
        # Best effort: an unreadable font file should not abort the preview.
        return ImageFont.load_default()


def text_width(draw: ImageDraw.ImageDraw, text: str, font: ImageFont.ImageFont) -> int:
    """Return the width of ``text`` as measured by ``draw.textbbox``.

    The measurement is taken at the origin with a fixed stroke width of 4.
    """
    left, _top, right, _bottom = draw.textbbox((0, 0), text, font=font, stroke_width=4)
    return right - left


def wrap_text(draw: ImageDraw.ImageDraw, text: str, font: ImageFont.ImageFont, max_width: int) -> list[str]:
    """Break ``text`` into lines no wider than ``max_width``, one character at a time.

    Existing line breaks are kept. Within a source line, characters are added
    until the measured width would exceed ``max_width``; the overflowing
    character starts the next line. A single character is never split off on
    its own account, so a line always holds at least one character. Empty
    source lines are dropped; if nothing is produced at all, the original text
    is returned as the only element.
    """
    wrapped: list[str] = []
    for source_line in text.splitlines():
        pending = ""
        for glyph in source_line:
            candidate = pending + glyph
            # Only measure when there is already something on the line.
            if pending and text_width(draw, candidate, font) > max_width:
                wrapped.append(pending)
                pending = glyph
            else:
                pending = candidate
        if pending:
            wrapped.append(pending)
    if wrapped:
        return wrapped
    return [text]


def active_text(cues: list[Cue], start: float, local_time: float) -> str:
    """Return the newline-joined text of every non-blank cue active at a clip time.

    ``local_time`` is relative to the clip; ``start`` is added to obtain the
    time in the source subtitles. A cue is active when that time lies within
    its start and end, both inclusive.
    """
    moment = start + local_time
    pieces: list[str] = []
    for cue in cues:
        body = cue.text.strip()
        if cue.start <= moment <= cue.end and body:
            pieces.append(body)
    return "\n".join(pieces)


def active_cues(cues: list[Cue], start: float, local_time: float) -> list[Cue]:
    """Return the non-blank cues active at a clip time, Chinese-styled cues first.

    A cue is active when ``start + local_time`` lies within its start and end,
    both inclusive. The result is ordered by: styles beginning with "chinese"
    (case-insensitive) first, then lower-cased style name, then cue start.
    """
    moment = start + local_time

    def priority(cue: Cue) -> tuple[int, str, float]:
        label = cue.style.lower()
        return (0 if label.startswith("chinese") else 1, label, cue.start)

    showing = [cue for cue in cues if cue.start <= moment <= cue.end and cue.text.strip()]
    showing.sort(key=priority)
    return showing


def is_bilingual(cues: list[Cue]) -> bool:
    """Tell whether the cues contain both a Chinese-styled and an English-styled cue.

    Styles are compared case-insensitively by prefix ("chinese" / "english").
    """
    found_chinese = False
    found_english = False
    for cue in cues:
        label = cue.style.lower()
        found_chinese |= label.startswith("chinese")
        found_english |= label.startswith("english")
    return found_chinese and found_english


def draw_subtitles(image: Image.Image, cues: list[Cue], style_profile: str) -> None:
    """Draw the given cues onto ``image`` in place, centred above the bottom margin.

    Text is white with a black stroke. Font size, wrap width, stroke width and
    bottom margin are derived from the image size and the style returned by
    ``get_style(style_profile)``.

    When the cues contain both Chinese- and English-styled entries, the Chinese
    text is wrapped to at most two lines and the English text (joined into one
    line) to at most one smaller line beneath it; cues with any other style are
    not drawn in that case. Otherwise all cue texts are stacked and wrapped
    without a line limit. Does nothing when ``cues`` is empty.
    """
    if not cues:
        return
    style = get_style(style_profile)
    canvas = ImageDraw.Draw(image)
    bilingual = is_bilingual(cues)
    main_size = max(28, round(image.height * style.font_scale))
    main_font = load_font(main_size)
    wrap_limit = round(image.width * style.max_width)

    def paint_centered(line, face, stroke, top):
        # Measure with the same stroke that is drawn so the centring accounts for it.
        left, _, right, _ = canvas.textbbox((0, 0), line, font=face, stroke_width=stroke)
        left_edge = (image.width - (right - left)) / 2
        canvas.text((left_edge, top), line, font=face, fill="white", stroke_width=stroke, stroke_fill="black")

    # Each row is (text, font, stroke width, vertical advance after the row).
    if bilingual:
        english_size = max(18, round(main_size * 0.64))
        english_font = load_font(english_size)
        zh_text = "\n".join(cue.text.strip() for cue in cues if cue.style.lower().startswith("chinese"))
        en_text = " ".join(cue.text.strip().replace("\n", " ") for cue in cues if cue.style.lower().startswith("english"))
        zh_lines = wrap_text(canvas, zh_text, main_font, wrap_limit)[:2]
        en_lines = wrap_text(canvas, en_text, english_font, wrap_limit)[:1] if en_text else []
        zh_advance = round(main_size * 1.18)
        en_advance = round(english_size * 1.3)
        en_stroke = max(2, style.stroke_width - 1)
        rows = [(line, main_font, style.stroke_width, zh_advance) for line in zh_lines]
        rows += [(line, english_font, en_stroke, en_advance) for line in en_lines]
    else:
        stacked = "\n".join(cue.text.strip() for cue in cues)
        advance = round(main_size * 1.25)
        rows = [(line, main_font, style.stroke_width, advance) for line in wrap_text(canvas, stacked, main_font, wrap_limit)]

    block_height = sum(row_advance for _, _, _, row_advance in rows)
    top = image.height - block_height - round(image.height * style.bottom_margin)
    for line, face, stroke, row_advance in rows:
        paint_centered(line, face, stroke, top)
        top += row_advance


def render_pil_preview(video: Path, cues: list[Cue], start: float, duration: float, output: Path, temp_dir: Path, style_profile: str) -> None:
    """Render the preview by drawing subtitles onto extracted frames with Pillow.

    Steps:
      1. ffmpeg extracts the window [start, start + duration] of ``video`` as
         PNG frames at 12 fps into ``temp_dir/frames`` (which must not exist yet).
      2. Every frame that has active cues is overwritten with a subtitled copy.
      3. ffmpeg re-encodes the frames to H.264 and muxes in the audio of the
         same window from ``video`` when an audio stream is present.

    Raises:
        SystemExit: if either ffmpeg run fails or no frames were extracted.
    """
    fps = 12
    frames_dir = temp_dir / "frames"
    frames_dir.mkdir()
    frame_pattern = frames_dir / "frame_%06d.png"
    # Both ffmpeg runs read the same window of the source video.
    window_input = ["-ss", f"{start:.3f}", "-t", f"{duration:.3f}", "-i", str(video)]

    extract_cmd = ["ffmpeg", "-y", *window_input, "-vf", f"fps={fps}", str(frame_pattern)]
    extract = subprocess.run(extract_cmd, check=False, text=True, capture_output=True)
    if extract.returncode != 0:
        raise SystemExit(extract.stderr.strip() or "failed to extract preview frames")

    frame_paths = sorted(frames_dir.glob("frame_*.png"))
    if not frame_paths:
        raise SystemExit("no preview frames extracted")

    for index, frame_path in enumerate(frame_paths):
        # Frame N of the extraction is taken to sit at N / fps seconds into the clip.
        current = active_cues(cues, start, index / fps)
        if current:
            image = Image.open(frame_path).convert("RGB")
            draw_subtitles(image, current, style_profile)
            image.save(frame_path)

    encode_cmd = [
        "ffmpeg", "-y",
        "-framerate", str(fps),
        "-i", str(frame_pattern),
        *window_input,
        # Video from the frame sequence, audio (optional) from the source window.
        "-map", "0:v",
        "-map", "1:a?",
        "-shortest",
        "-c:v", "libx264",
        "-pix_fmt", "yuv420p",
        "-c:a", "aac",
        "-movflags", "+faststart",
        str(output),
    ]
    encode = subprocess.run(encode_cmd, check=False, text=True, capture_output=True)
    if encode.returncode != 0:
        raise SystemExit(encode.stderr.strip() or "failed to encode PIL preview")


def main() -> int:
    """Command-line entry point: render a short subtitled preview clip.

    Parses the arguments, loads the subtitle file, picks a window that contains
    subtitles and renders it with the first available method:

    * bilingual cues: Pillow frame rendering (bilingual layout);
    * otherwise the ffmpeg ``subtitles`` filter, if the build has it;
    * otherwise the ffmpeg ``drawtext`` filter, if the build has it;
    * otherwise Pillow frame rendering.

    When the subtitles are bilingual and the style profile was left at
    ``zh-only-default``, ``bilingual-default`` is used instead.

    Returns:
        0 on success, 1 on any reported failure (messages go to stdout).
    """
    parser = argparse.ArgumentParser(description="Create a subtitled preview clip before full burn.")
    parser.add_argument("video", type=Path)
    parser.add_argument("subtitle", type=Path)
    parser.add_argument("--out", type=Path, required=True)
    parser.add_argument("--start", help="Optional preview start, e.g. 00:00:00 or 75.5")
    parser.add_argument("--duration", type=float, default=60)
    parser.add_argument("--max-duration", type=float, default=180)
    parser.add_argument("--increment", type=float, default=30)
    parser.add_argument("--style-profile", choices=style_names(), default="zh-only-default")
    args = parser.parse_args()

    if shutil.which("ffmpeg") is None:
        print("FAIL: ffmpeg is required")
        return 1
    if not args.video.exists():
        print(f"FAIL: missing video {args.video}")
        return 1
    if not args.subtitle.exists():
        print(f"FAIL: missing subtitle {args.subtitle}")
        return 1

    cues = parse_subtitle(args.subtitle)
    bilingual = is_bilingual(cues)
    if bilingual and args.style_profile == "zh-only-default":
        args.style_profile = "bilingual-default"
    start, duration, selected = choose_window(cues, parse_time(args.start), args.duration, args.max_duration, args.increment)
    if not selected:
        print("FAIL: could not find a subtitle-bearing preview window")
        return 1

    args.out.parent.mkdir(parents=True, exist_ok=True)
    # The filter-based ffmpeg run below uses the temp dir as its working
    # directory (the filters reference files there by bare name). Relative
    # paths from the command line would then point into the temp dir, so make
    # them absolute against the caller's working directory first.
    video_path = args.video.resolve()
    output_path = args.out.resolve()

    with tempfile.TemporaryDirectory() as temp_dir:
        temp_path = Path(temp_dir)
        # None means "no ffmpeg text filter chosen": render frames with Pillow.
        video_filter = None
        if bilingual:
            mode = "PIL frame fallback (bilingual layout)"
        elif has_filter("subtitles"):
            write_shifted_srt(selected, start, start + duration, temp_path / "preview.srt")
            video_filter = "subtitles=filename=preview.srt"
            mode = "subtitles"
        elif has_filter("drawtext"):
            video_filter = write_drawtext_files(selected, start, start + duration, temp_path, args.style_profile)
            mode = "drawtext fallback"
        else:
            mode = "PIL frame fallback"

        if video_filter is None:
            render_pil_preview(video_path, selected, start, duration, output_path, temp_path, args.style_profile)
        else:
            result = subprocess.run(
                [
                    "ffmpeg",
                    "-y",
                    "-ss",
                    f"{start:.3f}",
                    "-t",
                    f"{duration:.3f}",
                    "-i",
                    str(video_path),
                    "-vf",
                    video_filter,
                    "-c:v",
                    "libx264",
                    "-c:a",
                    "aac",
                    "-movflags",
                    "+faststart",
                    str(output_path),
                ],
                check=False,
                text=True,
                capture_output=True,
                cwd=temp_dir,
            )
            if result.returncode != 0:
                print(result.stderr.strip())
                return 1

    print(f"PASS: wrote subtitled preview {args.out}")
    print(f"Window: start={start:.3f}s duration={duration:.3f}s subtitle_cues={len(selected)}")
    print(f"Render mode: {mode}")
    return 0


# Run the CLI only when executed as a script; main()'s return value becomes the exit status.
if __name__ == "__main__":
    sys.exit(main())