文件预览

vosk_transcribe.py

查看 Bilibili Video Transcriber 技能包中的文件内容。

文件内容

vosk_transcribe.py

#!/usr/bin/env python3
"""Fast speech-to-text transcription with Vosk.

Converts a fixed MP3 file to headerless 16 kHz mono signed 16-bit PCM with
ffmpeg, streams it through a Vosk recognizer and writes the recognised text
to ``output_file``.  When the ``vosk`` package cannot be imported, the
``whisper`` command-line tool is tried instead.  Every failure path ends the
process with exit status 1.
"""
import os
import sys
import json
import wave
import shutil
import subprocess
import traceback

output_dir = "/root/.openclaw/workspace/skills/bilibili-video-transcriber/bilibili_transcripts"
audio_file = os.path.join(output_dir, "raspberry_zero.mp3")
output_file = os.path.join(output_dir, "raspberry_zero_transcript.txt")

# Sample rate used both for the ffmpeg conversion and for the recognizer;
# the two must agree because the PCM file carries no header.
_SAMPLE_RATE = 16000

# Model requested from Vosk by name when no local copy is found.
_DOWNLOAD_MODEL_NAME = "vosk-model-small-cn-0.22"

try:
    from vosk import Model, KaldiRecognizer

    print(f"🎤 使用 Vosk 转写:{audio_file}")
    print("=" * 60)

    # Prefer a locally unpacked small model, then the full-size one.
    model_path = os.path.expanduser("~/.cache/vosk-models/vosk-model-small-cn-0.22")
    if not os.path.exists(model_path):
        print("⬇️  下载 Vosk 中文模型...")
        model_path = os.path.expanduser("~/.cache/vosk-models/vosk-model-cn-0.22")

    if os.path.exists(model_path):
        print(f"📦 加载模型:{model_path}")
        model = Model(model_path)
    else:
        # No local model: ask Vosk for the model by name so that it can
        # fetch it itself, instead of loading a path known not to exist.
        from vosk import SetLogLevel
        SetLogLevel(-1)  # lower Vosk's log level to keep the download quiet
        print("📦 首次运行会自动下载模型...")
        print(f"📦 加载模型:{_DOWNLOAD_MODEL_NAME}")
        model = Model(model_name=_DOWNLOAD_MODEL_NAME)

    # Convert the audio to raw PCM unless a converted copy already exists.
    # The existence check targets the very file ffmpeg produces, so the
    # conversion result is actually reused on later runs.
    raw_file = os.path.splitext(audio_file)[0] + ".wav.raw"
    if not os.path.exists(raw_file):
        print("🔄 转换音频为 WAV 格式...")
        # Write to a temporary name and rename on success, so an aborted
        # ffmpeg run never leaves a truncated file that looks like a cache hit.
        partial_file = raw_file + ".part"
        subprocess.run([
            "ffmpeg", "-y", "-i", audio_file,
            "-ar", str(_SAMPLE_RATE), "-ac", "1",
            "-f", "s16le", "-bitexact",
            partial_file
        ], check=True, capture_output=True)
        os.replace(partial_file, raw_file)

    # Transcribe
    print("🎤 开始转写...")
    recognizer = KaldiRecognizer(model, _SAMPLE_RATE)

    text_parts = []
    with open(raw_file, "rb") as f:
        # Feed the audio in 4000-byte chunks until read() returns b"".
        for data in iter(lambda: f.read(4000), b""):
            if recognizer.AcceptWaveform(data):
                text = json.loads(recognizer.Result()).get("text")
                if text:
                    text_parts.append(text)

    # Collect whatever the recognizer still holds after the last chunk.
    final_text = json.loads(recognizer.FinalResult()).get("text")
    if final_text:
        text_parts.append(final_text)

    # Join the recognised segments
    full_text = " ".join(text_parts)

    # Save the transcript
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(full_text)

    print("✅ 转写完成!")
    print(f"📄 输出文件:{output_file}")
    print(f"\n📝 内容预览:\n{full_text[:500]}...")

except ImportError as e:
    print(f"❌ Vosk 未安装:{e}")
    print("💡 尝试使用 whisper 转写...")

    # Fall back to the whisper command-line tool.
    cmd = [
        "whisper",
        audio_file,
        "--model", "tiny",  # use the smaller model
        "--language", "zh",
        "--output_dir", output_dir,
        "--output_format", "txt"
    ]
    try:
        result = subprocess.run(cmd, capture_output=True, text=True, timeout=900)
    except (OSError, subprocess.SubprocessError) as we:
        # Covers a missing whisper executable and the 900 s timeout.
        print(f"❌ Whisper 也失败了:{we}")
        sys.exit(1)

    if result.returncode != 0:
        print(f"❌ Whisper 转写失败:{result.stderr[:500]}")
        # A failed fallback is a failed run; report it through the exit status.
        sys.exit(1)

    print("✅ Whisper 转写完成!")

    # Mirror whisper's transcript to output_file so both code paths leave the
    # result in the same place.
    # NOTE(review): assumes whisper names the file after the audio file, with
    # or without the original extension depending on version - confirm
    # against the installed whisper.
    audio_name = os.path.basename(audio_file)
    whisper_outputs = [
        os.path.join(output_dir, os.path.splitext(audio_name)[0] + ".txt"),
        os.path.join(output_dir, audio_name + ".txt"),
    ]
    for candidate in whisper_outputs:
        if os.path.isfile(candidate):
            shutil.copyfile(candidate, output_file)
            print(f"📄 输出文件:{output_file}")
            break
except Exception as e:
    # Top-level boundary of the script: report, show the traceback, fail.
    print(f"❌ 错误:{e}")
    traceback.print_exc()
    sys.exit(1)