# File contents: voice-ai-tts.yaml
---
# ============================================================================
# Voice.ai Text-to-Speech Skill
# ============================================================================
# A comprehensive skill for Voice.ai's TTS API with speech generation,
# streaming, and voice management capabilities.
#
# Version: 1.1.4
# Author: Nick Gill (https://github.com/gizmoGremlin)
# License: MIT
# Documentation: https://voice.ai/docs
# ============================================================================
# Skill identity and catalogue metadata: name, human-readable description,
# version, author, icon, category, and search tags.
skill:
  name: voice-ai-tts
  display_name: "Voice.ai Text-to-Speech"
  description: |
    High-quality voice synthesis with streaming audio generation,
    multilingual support (11 languages), and flexible audio formats using the
    Voice.ai Developer API. Features include real-time streaming, WebSocket
    support for low-latency applications, and comprehensive voice management.
  # Quoted so the version is always loaded as a string, never as a number.
  version: "1.1.4"
  author: "Nick Gill (https://github.com/gizmoGremlin)"
  icon: "🎙️"
  category: "audio"
  tags:
  - text-to-speech
  - tts
  - audio-generation
  - speech-synthesis
  - ai-voice
  - streaming
  - websocket
# ============================================================================
# Authentication Configuration
# ============================================================================
# How requests are authenticated: a bearer token sent in the Authorization
# header, with the key supplied through the VOICE_AI_API_KEY environment
# variable.
authentication:
  type: bearer
  header: Authorization
  # The trailing space is significant, so the value must stay quoted.
  prefix: "Bearer "
  description: |
    Voice.ai uses Bearer token authentication. Get your API key from
    https://voice.ai/dashboard and pass it as: Authorization: Bearer <your-api-key>
  environment_variable: VOICE_AI_API_KEY
  required: true
# ============================================================================
# API Configuration
# ============================================================================
# Connection settings shared by every tool: base URL, API version, default
# content type, request timeout, and the advertised rate limit.
api:
  base_url: "https://dev.voice.ai"  # Official Voice.ai production API endpoint
  version: "v1"
  content_type: "application/json"
  # NOTE(review): the unit is not stated in this file; presumably
  # milliseconds — confirm against the consumer.
  timeout: 60000
  rate_limit:
    requests_per_minute: 60
    description: "Rate limits vary by plan. Check your dashboard for details."
# ============================================================================
# Available Models
# ============================================================================
# Catalogue of TTS models. Each entry has an id, a display name, a short
# description, and the language codes it lists as supported.
models:
- id: "voiceai-tts-v1-latest"
  name: "Voice.ai TTS v1 (Latest)"
  description: "Latest English TTS model with highest quality"
  languages: ["en"]
- id: "voiceai-tts-v1-2025-12-19"
  name: "Voice.ai TTS v1 (2025-12-19)"
  description: "Stable English TTS model snapshot"
  languages: ["en"]
- id: "voiceai-tts-multilingual-v1-latest"
  name: "Voice.ai Multilingual v1 (Latest)"
  description: "Latest multilingual TTS model"
  languages: ["en", "es", "fr", "de", "it", "pt", "pl", "ru", "nl", "sv", "ca"]
- id: "voiceai-tts-multilingual-v1-2025-01-14"
  name: "Voice.ai Multilingual v1 (2025-01-14)"
  description: "Stable multilingual TTS model snapshot"
  languages: ["en", "es", "fr", "de", "it", "pt", "pl", "ru", "nl", "sv", "ca"]
# ============================================================================
# Popular Voices
# ============================================================================
# Curated list of voices. Each entry pairs a voice id with a display name,
# a gender label, and a short style description.
voices:
- id: "d1bf0f33-8e0e-4fbf-acf8-45c3c6262513"
  name: "Ellie"
  gender: female
  style: "Youthful, vibrant fashion vlogger"
- id: "f9e6a5eb-a7fd-4525-9e92-75125249c933"
  name: "Oliver"
  gender: male
  style: "Friendly British, conversational"
- id: "4388040c-8812-42f4-a264-f457a6b2b5b9"
  name: "Lilith"
  gender: female
  style: "Soft, feminine"
- id: "dbb271df-db25-4225-abb0-5200ba1426bc"
  name: "Smooth Calm Voice"
  gender: male
  style: "Deep, smooth narrator"
- id: "72d2a864-b236-402e-a166-a838ccc2c273"
  name: "Shadow"
  gender: male
  style: "Deep, distinctive narrator"
- id: "559d3b72-3e79-4f11-9b62-9ec702a6c057"
  name: "Sakura"
  gender: female
  style: "Anime-inspired character"
- id: "ed751d4d-e633-4bb0-8f5e-b5c8ddb04402"
  name: "Zenith"
  gender: male
  style: "Deep, dramatic baritone"
- id: "a931a6af-fb01-42f0-a8c0-bd14bc302bb1"
  name: "Flora"
  gender: female
  style: "High pitch, cheerful"
- id: "bd35e4e6-6283-46b9-86b6-7cfa3dd409b9"
  name: "Commander"
  gender: male
  style: "Deep heroic, commanding"
# ============================================================================
# Audio Formats
# ============================================================================
# Audio format identifiers grouped by family. Every list item is a
# single-key mapping of format id -> human-readable description.
audio_formats:
  basic:
  - mp3: "MP3 at 32kHz (default)"
  - wav: "WAV at 32kHz"
  - pcm: "Raw PCM 16-bit signed little-endian at 32kHz"
  mp3_variants:
  - mp3_22050_32: "MP3 22.05kHz, 32kbps"
  - mp3_24000_48: "MP3 24kHz, 48kbps"
  - mp3_44100_32: "MP3 44.1kHz, 32kbps"
  - mp3_44100_64: "MP3 44.1kHz, 64kbps"
  - mp3_44100_96: "MP3 44.1kHz, 96kbps"
  - mp3_44100_128: "MP3 44.1kHz, 128kbps"
  - mp3_44100_192: "MP3 44.1kHz, 192kbps"
  opus_variants:
  - opus_48000_32: "Opus 48kHz, 32kbps"
  - opus_48000_64: "Opus 48kHz, 64kbps"
  - opus_48000_96: "Opus 48kHz, 96kbps"
  - opus_48000_128: "Opus 48kHz, 128kbps"
  - opus_48000_192: "Opus 48kHz, 192kbps"
  pcm_variants:
  - pcm_8000: "PCM 8kHz"
  - pcm_16000: "PCM 16kHz"
  - pcm_22050: "PCM 22.05kHz"
  - pcm_24000: "PCM 24kHz"
  - pcm_32000: "PCM 32kHz"
  - pcm_44100: "PCM 44.1kHz"
  - pcm_48000: "PCM 48kHz"
  wav_variants:
  - wav_16000: "WAV 16kHz"
  - wav_22050: "WAV 22.05kHz"
  - wav_24000: "WAV 24kHz"
  telephony:
  - alaw_8000: "A-law 8kHz (telephony)"
  - ulaw_8000: "μ-law 8kHz (telephony)"
# ============================================================================
# Tools / Actions
# ============================================================================
# Each entry under `tools` describes one API operation: HTTP method (or
# WEBSOCKET), endpoint path, parameters, response shape, and an example.
tools:
# --------------------------------------------------------------------------
# List Voices
# --------------------------------------------------------------------------
- name: list_voices
  display_name: "List Voices"
  description: |
    Retrieve a list of available voices including public voices and your
    private voices. Returns voice metadata including ID, name, language,
    and visibility status.
  method: GET
  endpoint: "/api/v1/tts/voices"
  # Pagination (limit/offset) and an optional visibility filter.
  parameters:
  - name: limit
    type: integer
    required: false
    default: 10
    description: "Maximum number of voices to return (default: 10)"
  - name: offset
    type: integer
    required: false
    default: 0
    description: "Offset for pagination"
  - name: visibility
    type: string
    required: false
    enum: ["PUBLIC", "PRIVATE"]
    description: "Filter by voice visibility"
  # The response wraps the voice records in a top-level `voices` array.
  response:
    type: object
    properties:
      voices:
        type: array
        items:
          type: object
          properties:
            voice_id:
              type: string
              description: "Unique voice identifier"
            name:
              type: string
              description: "Voice name"
            language:
              type: string
              description: "Voice language (ISO 639-1)"
            visibility:
              type: string
              enum: ["PUBLIC", "PRIVATE"]
            status:
              type: string
              enum: ["PENDING", "PROCESSING", "AVAILABLE", "FAILED"]
  example:
    request:
      method: GET
      url: "https://dev.voice.ai/api/v1/tts/voices?limit=10"
      headers:
        Authorization: "Bearer YOUR_API_KEY"
    response:
      voices:
      - voice_id: "abc123"
        name: "Sarah"
        language: "en"
        visibility: "PUBLIC"
        status: "AVAILABLE"
# --------------------------------------------------------------------------
# Get Voice
# --------------------------------------------------------------------------
- name: get_voice
  display_name: "Get Voice Details"
  description: |
    Retrieve detailed information about a specific voice by its ID.
  method: GET
  endpoint: "/api/v1/tts/voice/{voice_id}"
  parameters:
  # Substituted into the {voice_id} placeholder of the endpoint path.
  - name: voice_id
    type: string
    required: true
    in: path
    description: "The unique voice identifier"
  response:
    type: object
    properties:
      voice_id:
        type: string
      name:
        type: string
      language:
        type: string
      visibility:
        type: string
      status:
        type: string
      created_at:
        type: string
        # NOTE(review): JSON Schema / OpenAPI spell this format `date-time`;
        # confirm which spelling the consumer of this file expects.
        format: datetime
  example:
    request:
      method: GET
      url: "https://dev.voice.ai/api/v1/tts/voice/abc123"
      # Authentication is declared as required, so the example shows the
      # header.
      headers:
        Authorization: "Bearer YOUR_API_KEY"
    response:
      voice_id: "abc123"
      name: "Sarah"
      language: "en"
      visibility: "PUBLIC"
      status: "AVAILABLE"
# --------------------------------------------------------------------------
# Generate Speech
# --------------------------------------------------------------------------
- name: generate_speech
  display_name: "Generate Speech"
  description: |
    Generate speech from text. Returns complete audio file after generation
    is complete. For real-time streaming, use the speech_stream endpoint instead.
  method: POST
  endpoint: "/api/v1/tts/speech"
  # Only `text` is required; every other parameter is optional.
  parameters:
  - name: text
    type: string
    required: true
    description: "The text to convert to speech"
    max_length: 5000
  - name: voice_id
    type: string
    required: false
    description: "Voice ID to use. Omit to use the default built-in voice."
  - name: audio_format
    type: string
    required: false
    default: "mp3"
    enum: ["mp3", "wav", "pcm", "alaw_8000", "ulaw_8000", "mp3_44100_128", "opus_48000_64"]
    description: "Output audio format (32kHz sample rate for basic formats)"
  - name: temperature
    type: number
    required: false
    default: 1.0
    minimum: 0.0
    maximum: 2.0
    description: "Sampling temperature for variation (0.0-2.0)"
  - name: top_p
    type: number
    required: false
    default: 0.8
    minimum: 0.0
    maximum: 1.0
    description: "Nucleus sampling parameter (0.0-1.0)"
  - name: model
    type: string
    required: false
    enum: ["voiceai-tts-v1-latest", "voiceai-tts-multilingual-v1-latest"]
    description: "TTS model. Auto-selected based on language if not specified."
  - name: language
    type: string
    required: false
    default: "en"
    enum: ["en", "es", "fr", "de", "it", "pt", "pl", "ru", "nl", "sv", "ca"]
    description: "Language code (ISO 639-1)"
  response:
    type: binary
    # NOTE(review): audio/mpeg matches the default "mp3" format; presumably
    # the actual content type follows `audio_format` — confirm.
    content_type: "audio/mpeg"
    description: "Audio file in the requested format"
  example:
    request:
      method: POST
      url: "https://dev.voice.ai/api/v1/tts/speech"
      headers:
        Authorization: "Bearer YOUR_API_KEY"
        Content-Type: "application/json"
      body:
        text: "Hello, this is a test of Voice.ai text to speech."
        voice_id: "abc123"
        audio_format: "mp3"
        language: "en"
# --------------------------------------------------------------------------
# Speech Stream
# --------------------------------------------------------------------------
- name: speech_stream
  display_name: "Speech Stream"
  description: |
    Generate speech from text with HTTP chunked streaming. Returns audio
    chunks as they are generated for low-latency playback. Ideal for
    real-time applications.
  method: POST
  endpoint: "/api/v1/tts/speech/stream"
  streaming: true
  # Same parameter names and defaults as generate_speech; only `text` is
  # required.
  parameters:
  - name: text
    type: string
    required: true
    description: "The text to convert to speech"
  - name: voice_id
    type: string
    required: false
    description: "Voice ID to use. Omit for default voice."
  - name: audio_format
    type: string
    required: false
    default: "mp3"
    description: "Output audio format"
  - name: temperature
    type: number
    required: false
    default: 1.0
    minimum: 0.0
    maximum: 2.0
    description: "Sampling temperature for variation (0.0-2.0)"
  - name: top_p
    type: number
    required: false
    default: 0.8
    description: "Nucleus sampling parameter"
  - name: model
    type: string
    required: false
    description: "TTS model to use"
  - name: language
    type: string
    required: false
    default: "en"
    description: "Language code (ISO 639-1)"
  response:
    type: stream
    content_type: ["audio/mpeg", "audio/wav", "audio/pcm"]
    description: "Chunked audio stream via HTTP chunked transfer encoding"
  example:
    request:
      method: POST
      url: "https://dev.voice.ai/api/v1/tts/speech/stream"
      headers:
        Authorization: "Bearer YOUR_API_KEY"
        Content-Type: "application/json"
      body:
        text: "Streaming audio is great for real-time applications."
        audio_format: "mp3"
# --------------------------------------------------------------------------
# Update Voice
# --------------------------------------------------------------------------
- name: update_voice
  display_name: "Update Voice"
  description: |
    Update voice metadata such as name and visibility. Owner-only operation.
  method: PATCH
  endpoint: "/api/v1/tts/voice/{voice_id}"
  parameters:
  # Substituted into the {voice_id} placeholder of the endpoint path.
  - name: voice_id
    type: string
    required: true
    in: path
    description: "The voice ID to update"
  - name: name
    type: string
    required: false
    description: "New name for the voice"
  # Note the request field is `voice_visibility`, while the response below
  # reports the value under `visibility`.
  - name: voice_visibility
    type: string
    required: false
    enum: ["PUBLIC", "PRIVATE"]
    description: "New visibility setting"
  response:
    type: object
    properties:
      voice_id:
        type: string
      name:
        type: string
      visibility:
        type: string
      updated_at:
        type: string
  example:
    request:
      method: PATCH
      url: "https://dev.voice.ai/api/v1/tts/voice/abc123"
      # Authentication is declared as required and the body is JSON, so the
      # example shows both headers.
      headers:
        Authorization: "Bearer YOUR_API_KEY"
        Content-Type: "application/json"
      body:
        name: "Updated Voice Name"
        voice_visibility: "PRIVATE"
# --------------------------------------------------------------------------
# Delete Voice
# --------------------------------------------------------------------------
- name: delete_voice
  display_name: "Delete Voice"
  description: |
    Delete a voice. This is a permanent action and cannot be undone.
    Owner-only operation.
  method: DELETE
  endpoint: "/api/v1/tts/voice/{voice_id}"
  parameters:
  # Substituted into the {voice_id} placeholder of the endpoint path.
  - name: voice_id
    type: string
    required: true
    in: path
    description: "The voice ID to delete"
  response:
    type: object
    properties:
      success:
        type: boolean
      message:
        type: string
  example:
    request:
      method: DELETE
      url: "https://dev.voice.ai/api/v1/tts/voice/abc123"
      # Authentication is declared as required, so the example shows the
      # header.
      headers:
        Authorization: "Bearer YOUR_API_KEY"
    response:
      success: true
      message: "Voice deleted successfully"
# --------------------------------------------------------------------------
# Single Context WebSocket
# --------------------------------------------------------------------------
- name: websocket_single_context
  display_name: "WebSocket (Single Context)"
  description: |
    Real-time bidirectional streaming via WebSocket for single conversation
    context. Ideal for interactive applications requiring ultra-low latency.
    Send text chunks and receive audio chunks in real-time.
  method: WEBSOCKET
  endpoint: "/api/v1/tts/ws"
  # The example below passes these as query-string parameters on the
  # connection URL.
  parameters:
  - name: voice_id
    type: string
    required: false
    description: "Voice ID to use"
  - name: audio_format
    type: string
    required: false
    default: "pcm_16000"
    description: "Audio format for streaming"
  - name: model
    type: string
    required: false
    description: "TTS model to use"
  # Message shapes exchanged once the socket is open.
  messages:
    send:
      type: object
      properties:
        text:
          type: string
          description: "Text chunk to synthesize"
        flush:
          type: boolean
          description: "Force flush buffered audio"
        end:
          type: boolean
          description: "Signal end of stream"
    receive:
      type: binary
      description: "Audio data chunks"
  example:
    url: "wss://dev.voice.ai/api/v1/tts/ws?voice_id=abc123&audio_format=pcm_16000"
    send: '{"text": "Hello, how are you today?"}'
    receive: "<binary audio data>"
# --------------------------------------------------------------------------
# Multi Context WebSocket
# --------------------------------------------------------------------------
- name: websocket_multi_context
  display_name: "WebSocket (Multi Context)"
  description: |
    Real-time bidirectional streaming via WebSocket with support for multiple
    conversation contexts. Allows switching between different voice contexts
    within a single connection for complex applications.
  method: WEBSOCKET
  endpoint: "/api/v1/tts/ws/multi"
  # The example below passes this as a query-string parameter on the
  # connection URL.
  parameters:
  - name: audio_format
    type: string
    required: false
    default: "pcm_16000"
    description: "Audio format for streaming"
  # Unlike the single-context socket, voice_id travels inside each sent
  # message alongside a context_id, and received audio carries context_id.
  messages:
    send:
      type: object
      properties:
        context_id:
          type: string
          description: "Context identifier for multiplexing"
        voice_id:
          type: string
          description: "Voice ID for this context"
        text:
          type: string
          description: "Text chunk to synthesize"
        flush:
          type: boolean
        end:
          type: boolean
    receive:
      type: object
      properties:
        context_id:
          type: string
        audio:
          type: binary
          description: "Audio data for the context"
  example:
    url: "wss://dev.voice.ai/api/v1/tts/ws/multi?audio_format=pcm_16000"
    send: '{"context_id": "conv1", "voice_id": "abc123", "text": "Hello!"}'
# ============================================================================
# Error Codes
# ============================================================================
# HTTP status codes the API may return, each with a short name and
# explanation.
errors:
- code: 401
  name: "Unauthorized"
  description: "Invalid or missing API key"
- code: 402
  name: "Payment Required"
  description: "Insufficient credits or voice slot limit reached"
- code: 403
  name: "Forbidden"
  description: "Insufficient permissions for the requested operation"
- code: 404
  name: "Not Found"
  description: "Voice ID does not exist or is not accessible"
- code: 422
  name: "Validation Error"
  description: "Invalid request parameters"
- code: 429
  name: "Rate Limited"
  description: "Too many requests. Please slow down."
- code: 500
  name: "Internal Server Error"
  description: "Server error. Please try again later."
# ============================================================================
# Code Examples
# ============================================================================
# Copy-paste client snippets, keyed by language and then by operation.
# Each snippet is a literal block scalar, so its own indentation is part of
# the value and must remain valid for that language.
examples:
  python:
    list_voices: |
      import requests

      API_KEY = "your_api_key_here"

      response = requests.get(
          "https://dev.voice.ai/api/v1/tts/voices",
          headers={"Authorization": f"Bearer {API_KEY}"},
          params={"limit": 10}
      )
      # Fail loudly on 4xx/5xx instead of hitting a KeyError below.
      response.raise_for_status()
      voices = response.json()["voices"]
      for voice in voices:
          print(f"{voice['name']} ({voice['voice_id']})")
    generate_speech: |
      import requests

      API_KEY = "your_api_key_here"

      response = requests.post(
          "https://dev.voice.ai/api/v1/tts/speech",
          headers={
              "Authorization": f"Bearer {API_KEY}",
              "Content-Type": "application/json"
          },
          json={
              "text": "Hello, this is Voice.ai text to speech!",
              "voice_id": "your_voice_id",  # optional
              "audio_format": "mp3"
          }
      )
      # Fail loudly on 4xx/5xx instead of saving an error body as audio.
      response.raise_for_status()
      with open("output.mp3", "wb") as f:
          f.write(response.content)
    stream_speech: |
      import requests

      API_KEY = "your_api_key_here"

      response = requests.post(
          "https://dev.voice.ai/api/v1/tts/speech/stream",
          headers={
              "Authorization": f"Bearer {API_KEY}",
              "Content-Type": "application/json"
          },
          json={
              "text": "Streaming audio for real-time playback.",
              "audio_format": "mp3"
          },
          stream=True
      )
      # Fail loudly on 4xx/5xx instead of saving an error body as audio.
      response.raise_for_status()
      with open("stream_output.mp3", "wb") as f:
          for chunk in response.iter_content(chunk_size=1024):
              f.write(chunk)
  curl:
    list_voices: |
      curl -X GET "https://dev.voice.ai/api/v1/tts/voices?limit=10" \
        -H "Authorization: Bearer YOUR_API_KEY"
    generate_speech: |
      curl -X POST "https://dev.voice.ai/api/v1/tts/speech" \
        -H "Authorization: Bearer YOUR_API_KEY" \
        -H "Content-Type: application/json" \
        -d '{"text": "Hello world!", "audio_format": "mp3"}' \
        --output speech.mp3
  typescript:
    generate_speech: |
      const response = await fetch("https://dev.voice.ai/api/v1/tts/speech", {
        method: "POST",
        headers: {
          "Authorization": `Bearer ${API_KEY}`,
          "Content-Type": "application/json"
        },
        body: JSON.stringify({
          text: "Hello from TypeScript!",
          audio_format: "mp3"
        })
      });
      const audioBlob = await response.blob();
      const audioUrl = URL.createObjectURL(audioBlob);
      // Play the audio
      const audio = new Audio(audioUrl);
      audio.play();
# ============================================================================
# Changelog
# ============================================================================
# Release history, newest first. Versions and dates are quoted so they are
# loaded as strings rather than as numbers or date objects.
changelog:
- version: "1.1.4"
  date: "2026-02-16"
  changes:
  - "Declare primary env var in metadata for scanners"
- version: "1.1.3"
  date: "2026-02-16"
  changes:
  - "Remove voice-sample upload tool entry to reduce privacy risk"
- version: "1.1.2"
  date: "2026-02-16"
  changes:
  - "Add SECURITY.md and LICENSE.md for provenance"
  - "Restrict SDK to https only (remove http transport)"
- version: "1.1.1"
  date: "2026-02-16"
  changes:
  - "Packaging metadata improvements for ClawHub import"
- version: "1.1.0"
  date: "2026-02-16"
  changes:
  - "Documented production API endpoint"
  - "Renamed voice personas for IP-safe labeling"
  - "Metadata alignment for required credentials"
- version: "1.0.0"
  date: "2026-01-30"
  changes:
  - "Initial release"
  - "Support for all TTS endpoints"
  - "HTTP and WebSocket streaming"
  - "Multilingual support (11 languages)"
  - "Comprehensive audio format options"