文件预览

enhance_video.py

查看 Huo15 Img Prompt 技能包中的文件内容。

文件内容

scripts/enhance_video.py

#!/usr/bin/env python3
"""
huo15-img-prompt — T2V 视频提示词增强脚本 v3.1.0

把 enhance_prompt.py 的 88 风格预设 + 一致性锁,扩展到视频维度:
  - 镜头运动(推/拉/摇/移/跟/环绕/手持/无人机...)
  - 节奏(缓慢 / 中速 / 紧张快切)
  - 时长(建议秒数 + 关键帧拆分)
  - 主体动作(自动从描述中抽词,或显式 --action)
  - 模型适配:Sora / Kling 可灵 / Runway Gen-3/Gen-4 / Pika / Luma DreamMachine / 即梦 / Hailuo MiniMax / Wan2.1

调用:
  enhance_video.py "雨夜霓虹街头一只猫漫步" -p 赛博朋克 -m Sora --duration 8
  enhance_video.py "汉服少女转身回眸" -p 汉服写真 -m Kling --motion 慢速跟拍
  enhance_video.py "宇宙飞船穿越星云" -p scifi -m Runway --action "ship accelerates, lens flare"

依赖:
  enhance_prompt.py 同目录(复用其预设 + 意图解析 + 一致性锁)
"""

import sys
import os
import json
import re
import argparse
import hashlib
from typing import Dict, List, Optional, Tuple

sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))

from enhance_prompt import (
    STYLE_PRESETS,
    ALIASES,
    QUALITY_TIERS,
    resolve_preset,
    parse_requirement,
    parse_mix_preset,
    mix_presets,
    sanitize_subject,
    strip_negative_clauses,
    stable_seed,
    list_presets as list_image_presets,
)

# Version string shown by --version and the CLI banners, and embedded in every result dict.
VERSION = "3.1.0"

# ─────────────────────────────────────────────────────────
# Camera movement (Chinese keyword → English cinematography phrase)
# ─────────────────────────────────────────────────────────
# Scanned against free-form text by parse_motion(). Note that several keys are
# substrings of longer keys (e.g. "推" in "推镜", "跟" in "跟拍", "摇" in "横摇").
CAMERA_MOTION: Dict[str, str] = {
    "推": "slow push-in (dolly in)",
    "推镜": "smooth dolly in, gradual close-up",
    "拉": "pull back (dolly out)",
    "拉镜": "slow pull back revealing wider scene",
    "摇": "pan (horizontal)",
    "横摇": "horizontal pan from left to right",
    "竖摇": "vertical tilt up to down",
    "移": "lateral tracking shot",
    "跟": "tracking shot following the subject",
    "跟拍": "smooth tracking shot, subject locked in frame",
    "环绕": "360 orbital shot around the subject",
    "围绕": "360 orbit shot, slow rotation",
    "手持": "handheld camera, slight shake, documentary feel",
    "稳定": "smooth gimbal stabilized, fluid motion",
    "无人机": "aerial drone shot, high-altitude reveal",
    "航拍": "aerial drone descent, cinematic reveal",
    "升": "crane up, vertical rise",
    "降": "crane down, descent",
    "变焦": "zoom in, focal length change",
    "希区柯克": "dolly zoom (vertigo effect)",
    "希区": "dolly zoom (vertigo effect)",
    "鱼眼": "fisheye lens distortion, wide warped perspective",
    "POV": "first-person POV, immersive",
    "POV视角": "first-person POV, immersive",
    "子弹时间": "bullet-time freeze, 360 frozen pan",
    "延时": "time-lapse, accelerated motion",
    "慢动作": "slow motion 120fps, ultra-smooth",
    "快切": "rapid cuts, high-energy montage",
}

# Pacing keyword (Chinese) → English pacing phrase; scanned by parse_pacing().
PACING: Dict[str, str] = {
    "缓慢": "slow steady pacing, contemplative rhythm",
    "舒缓": "slow steady pacing, contemplative rhythm",
    "宁静": "calm, atmospheric, lingering shots",
    "中速": "moderate pacing, balanced cuts",
    "紧张": "tense pacing, building intensity",
    "急促": "fast pacing, urgent cuts",
    "快切": "rapid cuts, high-energy edit",
    "动感": "kinetic energy, dynamic motion",
    "史诗": "epic crescendo, sweeping movement",
}

# Subject-action keywords (Chinese → English), extracted automatically from the
# description by parse_action(). Some keys overlap (e.g. "跑" inside "奔跑") and
# some share the same English phrase (e.g. "舞" / "舞蹈").
ACTION_KEYWORDS: Dict[str, str] = {
    "走": "walking forward",
    "漫步": "walking calmly",
    "奔跑": "running fast",
    "跑": "running",
    "跳": "jumping",
    "飞": "flying through the air",
    "舞": "dancing gracefully",
    "舞蹈": "dancing gracefully",
    "回眸": "turning to look back over shoulder",
    "转身": "turning around",
    "微笑": "smiling softly",
    "战斗": "fighting, dynamic combat motion",
    "挥剑": "swinging a sword",
    "射箭": "drawing and releasing an arrow",
    "骑马": "riding a horse at full gallop",
    "驾驶": "driving forward",
    "穿越": "traveling through, breaking forward",
    "升起": "rising up slowly",
    "落下": "falling down gently",
    "爆炸": "explosion blooming outward",
    "绽放": "blooming open",
    "凝视": "gazing intently into the camera",
    "对视": "locking eyes with the viewer",
    "睁眼": "eyes opening slowly",
    "闭眼": "eyes closing slowly",
    "呼吸": "breathing softly, chest rising and falling",
    "拥抱": "embracing tenderly",
    "牵手": "holding hands",
    "握手": "shaking hands",
}

# ─────────────────────────────────────────────────────────
# Video model specifications
# ─────────────────────────────────────────────────────────
# One entry per supported model name. Fields, as consumed by build_video_prompt():
#   max_duration     - human-readable upper bound, only displayed (never enforced here)
#   default_duration - seconds used when the caller passes no duration
#                      (NOTE: an int, despite the Dict[str, str] annotation below)
#   aspect_default   - aspect ratio used when neither the caller nor the preset gives one
#   tip              - model-specific advice shown in the hint text
#   format           - "tag" selects the comma-separated tag prompt with Pika-style flags;
#                      any other value selects the natural-language sentence prompt
# "通用" is the generic fallback entry.
VIDEO_MODELS: Dict[str, Dict[str, str]] = {
    "Sora": {
        "max_duration": "20s (Sora 2 Pro)",
        "default_duration": 10,
        "aspect_default": "16:9",
        "tip": "支持长自然语言描述。可叠加 'cinematic, IMAX, 35mm film, photorealistic'。一致性强,可复用 character description。",
        "format": "natural",
    },
    "Kling": {
        "max_duration": "10s (1080p Pro)",
        "default_duration": 5,
        "aspect_default": "16:9",
        "tip": "可灵 1.6/2.0:建议提示前置主体,后置镜头/光影。支持首尾帧控制(image-to-video)。",
        "format": "natural",
    },
    "可灵": {
        "max_duration": "10s (1080p Pro)",
        "default_duration": 5,
        "aspect_default": "16:9",
        "tip": "可灵 1.6/2.0:中文提示词支持良好,可加 'cinematic 电影感'。",
        "format": "natural",
    },
    "Runway": {
        "max_duration": "10s (Gen-3 Alpha Turbo)",
        "default_duration": 5,
        "aspect_default": "16:9",
        "tip": "Gen-3 / Gen-4:英文提示效果最佳。支持 Motion Brush 局部运动。CFG ~7。",
        "format": "natural",
    },
    "Pika": {
        "max_duration": "10s (Pika 2.0)",
        "default_duration": 4,
        "aspect_default": "16:9",
        "tip": "Pika:标签式提示,支持 -gs (guidance scale) 和 -motion (1-4)。",
        "format": "tag",
    },
    "Luma": {
        "max_duration": "9s (Dream Machine 1.6)",
        "default_duration": 5,
        "aspect_default": "16:9",
        "tip": "Luma Dream Machine:自然语言 + 关键帧(首尾图)。Loop 模式支持无缝循环。",
        "format": "natural",
    },
    "DreamMachine": {
        "max_duration": "9s",
        "default_duration": 5,
        "aspect_default": "16:9",
        "tip": "Luma Dream Machine:自然语言 + 关键帧。",
        "format": "natural",
    },
    "Hailuo": {
        "max_duration": "10s (MiniMax 02 / S2V-01)",
        "default_duration": 6,
        "aspect_default": "16:9",
        "tip": "海螺 MiniMax 02:中文支持优秀。S2V-01 可指定参考人物。",
        "format": "natural",
    },
    "MiniMax": {
        "max_duration": "10s",
        "default_duration": 6,
        "aspect_default": "16:9",
        "tip": "MiniMax 视频:中英双语,长描述效果好。",
        "format": "natural",
    },
    "即梦": {
        "max_duration": "12s (Seedance 1.0)",
        "default_duration": 5,
        "aspect_default": "16:9",
        "tip": "即梦 / Seedance:抖音生态,支持中文 + 多镜头剧情连贯。",
        "format": "natural",
    },
    "Seedance": {
        "max_duration": "12s",
        "default_duration": 5,
        "aspect_default": "16:9",
        "tip": "Seedance 1.0:多镜头剧情连贯,支持中文。",
        "format": "natural",
    },
    "Wan": {
        "max_duration": "8s (Wan 2.1)",
        "default_duration": 4,
        "aspect_default": "16:9",
        "tip": "通义 Wan 2.1:阿里开源,I2V 支持高分辨率。中英双语提示。",
        "format": "natural",
    },
    "Wan2.1": {
        "max_duration": "8s",
        "default_duration": 4,
        "aspect_default": "16:9",
        "tip": "通义 Wan 2.1:阿里开源 14B / 1.3B 双参数。",
        "format": "natural",
    },
    "通用": {
        "max_duration": "—",
        "default_duration": 5,
        "aspect_default": "16:9",
        "tip": "通用模板:自然语言 + 镜头 + 节奏 + 主体动作。",
        "format": "natural",
    },
}

# Normalised spelling (lower-case, with "-", "_" and spaces removed) → canonical
# VIDEO_MODELS key. Consulted by resolve_video_model() before the direct key match.
# "keling" is the pinyin of 可灵; the older "kelin" spelling is kept so existing
# invocations keep working. "海螺" / "通义" mirror the Chinese names used in the tips.
MODEL_ALIASES: Dict[str, str] = {
    "sora": "Sora",
    "kling": "Kling", "keling": "Kling", "kelin": "Kling", "klingai": "Kling",
    "runway": "Runway", "gen3": "Runway", "gen4": "Runway",
    "pika": "Pika",
    "luma": "Luma", "dreammachine": "Luma",
    "hailuo": "Hailuo", "minimax": "Hailuo", "海螺": "Hailuo",
    "jimeng": "即梦", "seedance": "即梦",
    "wan": "Wan", "wan21": "Wan", "wan2.1": "Wan",
    "tongyi": "Wan", "通义": "Wan",
}


def resolve_video_model(name: str) -> str:
    """Map a user-supplied model name onto a key of VIDEO_MODELS.

    The name is normalised (trimmed, lower-cased, with "-", "_" and spaces
    removed), looked up in MODEL_ALIASES first, and then compared
    case-insensitively against the VIDEO_MODELS keys. Empty or unrecognised
    names resolve to the generic entry "通用".

    NOTE(review): because aliases are consulted first, the VIDEO_MODELS entries
    "DreamMachine", "MiniMax", "Seedance" and "Wan2.1" are redirected to
    "Luma", "Hailuo", "即梦" and "Wan" respectively.
    """
    if not name:
        return "通用"

    normalized = name.strip().lower()
    for separator in ("-", "_", " "):
        normalized = normalized.replace(separator, "")

    try:
        return MODEL_ALIASES[normalized]
    except KeyError:
        pass

    direct = next((m for m in VIDEO_MODELS if m.lower() == normalized), None)
    if direct is not None:
        return direct

    if name in VIDEO_MODELS:
        return name
    return "通用"


# ─────────────────────────────────────────────────────────
# 解析
# ─────────────────────────────────────────────────────────
def parse_motion(text: str, table: Optional[Dict[str, str]] = None) -> str:
    """Translate the most specific camera-movement keyword found in *text*.

    Keywords are tried longest-first so that a specific term such as "推镜" or
    "跟拍" wins over the single-character key it contains ("推", "跟"); with a
    plain first-match scan the longer entries could never be selected. Keys of
    equal length keep their order in the table.

    Args:
        text: Free-form description to scan. Empty/None yields "".
        table: Keyword → English phrase mapping; defaults to CAMERA_MOTION.

    Returns:
        The English phrase of the matching keyword, or "" when nothing matches.
    """
    if not text:
        return ""
    if table is None:
        table = CAMERA_MOTION
    # sorted() is stable, so equally long keywords stay in table order.
    for keyword in sorted(table, key=len, reverse=True):
        if keyword in text:
            return table[keyword]
    return ""


def parse_pacing(text: str) -> str:
    """Return the English pacing phrase for the first PACING keyword found in *text*.

    Keywords are checked in PACING's insertion order; "" is returned when none
    of them occurs in the text.
    """
    return next(
        (phrase for keyword, phrase in PACING.items() if keyword in text),
        "",
    )


def parse_action(text: str, table: Optional[Dict[str, str]] = None, limit: int = 3) -> str:
    """Extract up to *limit* subject actions from *text* as an English phrase list.

    Matching is longest-first and each matched keyword is consumed, so a short
    key embedded in a longer one does not fire a second time (e.g. "奔跑" no
    longer also yields "running" via "跑"). A short key still matches when it
    occurs on its own elsewhere in the text. Results are emitted in table order
    with duplicate English phrases removed.

    Args:
        text: Free-form description to scan. Empty/None yields "".
        table: Keyword → English phrase mapping; defaults to ACTION_KEYWORDS.
        limit: Maximum number of phrases to return.

    Returns:
        The phrases joined with ", ", or "" when no keyword matches.
    """
    if not text:
        return ""
    if table is None:
        table = ACTION_KEYWORDS

    remaining = text
    matched = set()
    for keyword in sorted(table, key=len, reverse=True):
        if keyword in remaining:
            matched.add(keyword)
            # Replace with a separator so the neighbours cannot fuse into a new match.
            remaining = remaining.replace(keyword, "\0")

    actions: List[str] = []
    for keyword, phrase in table.items():
        if keyword in matched and phrase not in actions:
            actions.append(phrase)
    return ", ".join(actions[:limit])


# ─────────────────────────────────────────────────────────
# 关键帧拆分
# ─────────────────────────────────────────────────────────
def keyframe_breakdown(subject: str, motion: str, duration: int) -> List[Dict[str, str]]:
    """Split a clip into keyframes: opening (establish) → mid (action) → closing.

    Clips of 3 seconds or less get a single establishing keyframe. Longer
    clips get three keyframes at 0, duration // 3 and 2 * (duration // 3)
    seconds; the middle one describes *motion*, or a generic action when
    *motion* is empty.
    """
    if duration <= 3:
        return [{"t": "0s", "desc": f"establish shot: {subject}"}]

    step = max(1, duration // 3)
    mid_action = motion or 'subject performs main action'
    beats = (
        ("0s", f"opening: establish {subject} in scene, static composition"),
        (f"{step}s", f"mid: {mid_action}, peak motion"),
        (f"{2*step}s", "closing: settle into resting frame, fade or hold"),
    )
    return [{"t": stamp, "desc": description} for stamp, description in beats]


# ─────────────────────────────────────────────────────────
# 主构建
# ─────────────────────────────────────────────────────────
def _english_override(value: str, translate) -> str:
    """Translate a Chinese override via *translate*; pass anything else through.

    Only values containing non-ASCII characters are translated, so English
    overrides are never rewritten. When *translate* finds no keyword, the
    original value is returned unchanged.
    """
    if value and any(ord(ch) > 127 for ch in value):
        return translate(value) or value
    return value


def build_video_prompt(
    subject: str,
    preset: str,
    model: str = "通用",
    aspect: str = "",
    duration: Optional[int] = None,
    motion: str = "",
    pacing: str = "",
    action: str = "",
    seed: Optional[int] = None,
    quality_tier: str = "pro",
    extra_negatives: str = "",
    mix_secondary: Optional[str] = None,
    mix_ratio: float = 0.6,
) -> Dict:
    """Assemble a text-to-video prompt bundle for *subject*.

    Args:
        subject: Raw user description; also scanned for motion, pacing and
            action keywords when those are not given explicitly.
        preset: Style preset name; unknown names fall back to "电影感".
        model: Target video model name, resolved via resolve_video_model().
        aspect: Aspect ratio; empty means preset aspect, then model default.
        duration: Clip length in seconds; None means the model default.
        motion: Camera-movement override. A Chinese value is translated with
            parse_motion(); an English value is used verbatim.
        pacing: Pacing override, translated with parse_pacing() when Chinese.
        action: Subject-action override, translated with parse_action() when
            Chinese.
        seed: Explicit seed (0 is honoured); None derives a stable seed from
            the cleaned subject and the preset label.
        quality_tier: Key into QUALITY_TIERS; unknown keys use the "pro" tier.
        extra_negatives: Extra negative terms appended to the negative prompt.
        mix_secondary: Optional second preset to blend with *preset*.
        mix_ratio: Weight of the primary preset when blending.

    Returns:
        A dict with the positive/negative prompts, keyframes, hint text, the
        resolved settings and a "consistency_lock" sub-dict.
    """
    preset = resolve_preset(preset) or "电影感"
    if mix_secondary:
        mix_secondary = resolve_preset(mix_secondary) or ""
    model = resolve_video_model(model)
    spec = VIDEO_MODELS[model]

    # Visual lock (reuses the image presets); blend only when the two presets differ.
    if mix_secondary and mix_secondary != preset:
        data = mix_presets(preset, mix_secondary, mix_ratio, model)
        mixed_label = f"{preset}+{mix_secondary}@{mix_ratio:.2f}"
    else:
        data = STYLE_PRESETS[preset]
        mixed_label = ""

    # Duration / aspect ratio
    if duration is None:
        duration = spec["default_duration"]
    if not aspect:
        # `or` (not a .get default) so an empty preset aspect still falls back.
        aspect = data.get("aspect") or spec["aspect_default"]

    # Automatic parsing of the description
    auto = parse_requirement(subject)
    subject_clean = sanitize_subject(strip_negative_clauses(subject))
    if motion:
        motion = _english_override(motion, parse_motion)
    else:
        motion = parse_motion(subject) or "smooth gimbal stabilized, fluid motion"
    if pacing:
        pacing = _english_override(pacing, parse_pacing)
    else:
        pacing = parse_pacing(subject) or "moderate pacing, balanced cuts"
    if action:
        action = _english_override(action, parse_action)
    else:
        action = parse_action(subject)

    # Ambient conditions; .get keeps this consistent with the result dict below.
    time_of_day = auto.get("time_of_day", "")
    weather = auto.get("weather", "")
    season = auto.get("season", "")
    ambient = ", ".join([x for x in (time_of_day, weather, season) if x])

    # Visual-lock fields
    visual_lock = ", ".join([
        x for x in [data["tags"], data.get("camera", ""), data.get("lighting", ""), data.get("palette", "")] if x
    ])

    quality_phrase = QUALITY_TIERS.get(quality_tier, QUALITY_TIERS["pro"])

    seed_key = mixed_label or preset
    # `is not None` so an explicit seed of 0 is not replaced by the derived one.
    seed_value = seed if seed is not None else stable_seed(subject_clean, seed_key)

    # Positive prompt
    if spec["format"] == "tag":  # Pika tag format
        parts = [
            subject_clean,
            motion,
            pacing,
            visual_lock,
            ambient,
            action,
            quality_phrase,
            "cinematic video",
        ]
        positive = ", ".join([p for p in parts if p])
        positive += f" -gs 12 -motion 3 -ar {aspect}"
    else:  # natural-language format
        sentences = [
            f"A {duration}-second video of {subject_clean}.",
            f"Camera movement: {motion}.",
        ]
        if action:
            sentences.append(f"The subject is {action}.")
        sentences.append(f"Pacing: {pacing}.")
        sentences.append(f"Visual style: {visual_lock}.")
        if ambient:
            sentences.append(f"Atmosphere: {ambient}.")
        sentences.append(f"Quality: {quality_phrase}, cinematic, smooth temporal coherence, no flicker, consistent character across frames.")
        positive = " ".join(sentences)

    # Negative prompt
    base_neg = data["neg"]
    video_neg = (
        "flicker, frame drop, motion blur artifacts, jittery camera, "
        "low fps, choppy motion, morphing artifacts, identity drift, "
        "deformed limbs mid-motion, inconsistent character, watermark"
    )
    neg_parts = [base_neg, video_neg, extra_negatives, ", ".join(auto.get("user_negatives", []))]
    negative = ", ".join([x for x in neg_parts if x])

    # Keyframes
    keyframes = keyframe_breakdown(subject_clean, motion, duration)

    hint = (
        f"{model} tips:\n"
        f"  • {spec['tip']}\n"
        f"  • 推荐时长:{duration}s(上限 {spec['max_duration']})\n"
        f"  • 一致性:i2v 模式可固定首帧角色 / 用 image-prompt 保持服装色彩\n"
        f"  • seed: {seed_value}(同一 seed + 同一 prompt 在多数模型可复现)"
    )

    return {
        "version": VERSION,
        "type": "t2v",
        "original": subject,
        "preset": preset,
        "mix_secondary": mix_secondary or "",
        "mix_label": mixed_label,
        "model": model,
        "aspect": aspect,
        "duration_s": duration,
        "max_duration": spec["max_duration"],
        "motion": motion,
        "pacing": pacing,
        "action": action,
        "time_of_day": time_of_day,
        "weather": weather,
        "season": season,
        "seed_suggestion": seed_value,
        "quality_tier": quality_tier,
        "positive": positive,
        "negative": negative,
        "keyframes": keyframes,
        "hint": hint,
        "consistency_lock": {
            "camera": data.get("camera", ""),
            "lighting": data.get("lighting", ""),
            "palette": data.get("palette", ""),
            "aspect": aspect,
            "motion": motion,
        },
    }


def print_video_prompt(r: Dict):
    """Pretty-print a build_video_prompt() result dict to stdout as a framed report.

    Optional rows (action, ambient conditions, empty consistency-lock fields)
    are omitted when their values are empty.
    """
    rule = "═" * 60
    print(f"\n{rule}")
    print(f"🎬 视频提示词(v{r['version']})")
    print(f"📌 原始描述   : {r['original']}")

    # A blended preset is shown by its mix label, a plain one by its name.
    style_label = f"{r['mix_label']} (混合)" if r.get("mix_label") else r["preset"]
    print(f"🎨 风格预设   : {style_label}")

    print(f"🤖 目标模型   : {r['model']}(上限 {r['max_duration']})")
    print(f"📐 画幅       : {r['aspect']}")
    print(f"⏱  时长       : {r['duration_s']}s")
    print(f"🎥 镜头运动   : {r['motion']}")
    print(f"🎵 节奏       : {r['pacing']}")
    if r.get("action"):
        print(f"💪 主体动作   : {r['action']}")

    ambient_values = [
        value
        for value in (r.get(field, "") for field in ("time_of_day", "weather", "season"))
        if value
    ]
    if ambient_values:
        print(f"🌤  环境       : {', '.join(ambient_values)}")

    print(f"⭐ 质量档位   : {r['quality_tier']}")
    print(f"🎲 种子建议   : {r['seed_suggestion']}")
    print(f"\n✅ 正向提示词:\n{r['positive']}")
    print(f"\n❌ 负向提示词:\n{r['negative']}")

    print(f"\n🎞  关键帧拆分:")
    for frame in r["keyframes"]:
        print(f"   {frame['t']:>4s}  {frame['desc']}")

    print(f"\n🔒 一致性锁:")
    for field, value in r["consistency_lock"].items():
        if not value:
            continue
        print(f"   {field:8s}: {value}")

    print(f"\n💡 {r['hint']}")
    print(f"{rule}\n")


def main():
    """CLI entry point: parse arguments, then list presets/models or print a prompt.

    Exits with status 1 when no subject is given or a mixed preset is unknown,
    and with argparse's usage error when --duration is not positive. An
    unknown single preset only produces a warning on stderr, because
    build_video_prompt() falls back to "电影感" for it.
    """
    parser = argparse.ArgumentParser(
        description=f"huo15-img-prompt enhance_video v{VERSION} — T2V 视频提示词增强",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
示例:
  enhance_video.py "雨夜霓虹街头一只猫漫步" -p 赛博朋克 -m Sora --duration 8
  enhance_video.py "汉服少女转身回眸" -p 汉服写真 -m Kling --motion 慢速跟拍
  enhance_video.py "宇宙飞船穿越星云" -p scifi -m Runway --duration 5 --pacing 史诗
  enhance_video.py "山中神女腾云" -p "原神+敦煌壁画" --mix 0.6 -m Hailuo
  enhance_video.py "侠客挥剑" -p 水墨 -m 即梦 --action "spinning sword strike"
""",
    )
    parser.add_argument("subject", nargs="?", help="主体描述")
    parser.add_argument("-p", "--preset", help="风格预设(沿用 88 款图像预设;支持 A+B 混合)")
    parser.add_argument("--mix", type=float, default=0.6, help="主预设权重 0.1-0.9(默认 0.6)")
    parser.add_argument(
        "-m", "--model", default="通用",
        help="视频模型: Sora / Kling / Runway / Pika / Luma / Hailuo / 即梦 / Wan / 通用",
    )
    parser.add_argument("-a", "--aspect", default="", help="画幅 16:9 / 9:16 / 1:1 / 21:9")
    parser.add_argument("--duration", type=int, help="时长(秒),不给走模型默认")
    parser.add_argument("--motion", default="", help="镜头运动覆盖(中/英)")
    parser.add_argument("--pacing", default="", help="节奏覆盖")
    parser.add_argument("--action", default="", help="主体动作覆盖")
    parser.add_argument("--avoid", default="", help="额外负面词")
    parser.add_argument("-t", "--tier", choices=["basic", "pro", "master"], default="pro")
    parser.add_argument("--seed", type=int, help="种子")
    parser.add_argument("-l", "--list", action="store_true", help="列出图像预设(视频沿用)")
    parser.add_argument("--list-models", action="store_true", help="列出视频模型规格")
    parser.add_argument("-j", "--json", action="store_true", help="JSON 输出")
    parser.add_argument("-v", "--version", action="version", version=f"%(prog)s v{VERSION}")
    args = parser.parse_args()

    if args.list:
        list_image_presets()
        return

    if args.list_models:
        print(f"\n🎬 视频模型规格 (v{VERSION})\n" + "─" * 50)
        for name, spec in VIDEO_MODELS.items():
            print(f"\n【{name}】")
            print(f"  上限时长: {spec['max_duration']}")
            print(f"  默认时长: {spec['default_duration']}s")
            print(f"  默认画幅: {spec['aspect_default']}")
            print(f"  说明: {spec['tip']}")
        return

    if not args.subject:
        parser.print_help()
        sys.exit(1)

    # A zero or negative length would otherwise yield e.g. "A 0-second video of ...".
    if args.duration is not None and args.duration <= 0:
        parser.error("--duration 必须为正整数(秒)")

    raw_preset = args.preset or "电影感"
    primary_raw, secondary_raw = parse_mix_preset(raw_preset)
    if secondary_raw:
        primary_resolved = resolve_preset(primary_raw)
        secondary_resolved = resolve_preset(secondary_raw)
        if not primary_resolved or not secondary_resolved:
            unknown = [n for n, r in [(primary_raw, primary_resolved), (secondary_raw, secondary_resolved)] if not r]
            print(f"❌ 未知预设:{', '.join(unknown)}", file=sys.stderr)
            sys.exit(1)
        preset, mix_secondary = primary_resolved, secondary_resolved
    else:
        # build_video_prompt() silently falls back for an unknown preset; say so
        # on stderr so a typo in -p does not go unnoticed (stdout/JSON unaffected).
        if not resolve_preset(primary_raw):
            print(f"⚠️ 未知预设:{primary_raw},已回退到 电影感", file=sys.stderr)
        preset, mix_secondary = primary_raw, None

    result = build_video_prompt(
        args.subject, preset, model=args.model, aspect=args.aspect,
        duration=args.duration, motion=args.motion, pacing=args.pacing,
        action=args.action, seed=args.seed, quality_tier=args.tier,
        extra_negatives=args.avoid, mix_secondary=mix_secondary, mix_ratio=args.mix,
    )

    if args.json:
        print(json.dumps(result, ensure_ascii=False, indent=2))
    else:
        print_video_prompt(result)


if __name__ == "__main__":
    main()