Text to Speech (T2A) WebSocket

Messages

{
  "event": "task_start",
  "model": "speech-2.8-turbo",
  "language_boost": "Chinese",
  "voice_setting": {
    "voice_id": "English_expressive_narrator",
    "speed": 1,
    "vol": 1,
    "pitch": 0
  },
  "pronunciation_dict": {
    "tone": [
      "Omg/Oh my god"
    ]
  },
  "audio_setting": {
    "sample_rate": 32000,
    "bitrate": 128000,
    "format": "mp3",
    "channel": 1
  }
}

{
  "event": "task_continue",
  "text": "Omg(sighs), the real danger is not that computers start thinking like people, but that people start thinking like computers. Computers can only help us with simple tasks."
}

{
  "data": {
    "audio": "xxx"
  },
  "extra_info": {
    "audio_channel": 1,
    "audio_format": "mp3",
    "audio_length": 9914,
    "audio_sample_rate": 32000,
    "audio_size": 157869,
    "bitrate": 128000,
    "invisible_character_ratio": 0,
    "usage_characters": 158,
    "word_count": 158
  },
  "is_final": true,
  "session_id": "301871346491491",
  "trace_id": "04ee3794e2c9e4a6d5f99e77742f06fd",
  "base_resp": {
    "status_code": 0,
    "status_msg": "success"
  }
}

WSS

t2a_v2

Messages

{
  "event": "task_start",
  "model": "speech-2.8-turbo",
  "language_boost": "Chinese",
  "voice_setting": {
    "voice_id": "English_expressive_narrator",
    "speed": 1,
    "vol": 1,
    "pitch": 0
  },
  "pronunciation_dict": {
    "tone": [
      "Omg/Oh my god"
    ]
  },
  "audio_setting": {
    "sample_rate": 32000,
    "bitrate": 128000,
    "format": "mp3",
    "channel": 1
  }
}

{
  "event": "task_continue",
  "text": "Omg(sighs), the real danger is not that computers start thinking like people, but that people start thinking like computers. Computers can only help us with simple tasks."
}

{
  "data": {
    "audio": "xxx"
  },
  "extra_info": {
    "audio_channel": 1,
    "audio_format": "mp3",
    "audio_length": 9914,
    "audio_sample_rate": 32000,
    "audio_size": 157869,
    "bitrate": 128000,
    "invisible_character_ratio": 0,
    "usage_characters": 158,
    "word_count": 158
  },
  "is_final": true,
  "session_id": "301871346491491",
  "trace_id": "04ee3794e2c9e4a6d5f99e77742f06fd",
  "base_resp": {
    "status_code": 0,
    "status_msg": "success"
  }
}

Task Start Event

type:object

Sending the "task_start" event officially begins the speech synthesis task. The task is considered successfully started when the server returns a "task_started" event. Only after receiving this event can you send "task_continue" or "task_finish" events to the server.

Task Continue Event

type:object

After receiving the "task_started" event from the server, the task officially begins. You can send "task_continue" events to provide text for synthesis. Multiple "task_continue" events can be sent sequentially. If no new event is sent within 120 seconds after receiving the last result, the WebSocket connection will automatically close.

Task Finish Event

type:object

When the server receives the task_finish event, it waits for all tasks in the current queue to complete, then closes the WebSocket connection and ends the session.

Connected Success Event

type:object

Notification that the WebSocket connection has been established successfully

Task Started Event

type:object

Notification that T2A task has started

Task Continued Event

type:object

Notification that T2A task is continuing

Task Finished Event

type:object

Notification that T2A task has completed successfully

Task Failed Event

type:object

If the task_failed event is received, it indicates that the task has failed. In this case, the WebSocket connection must be closed, and the error should be handled.

This example streams and plays the returned audio in real time while also saving the complete audio file. Note: To enable real-time audio playback, you must first install the mpv player. The script also requires the `websockets` Python package. Additionally, make sure to set your API key in the environment variable MINIMAX_API_KEY.

import asyncio
import websockets
import json
import ssl
import subprocess
import os

# Speech model name sent in the "task_start" message.
model = "speech-2.8-hd"
# Audio format sent in "audio_setting"; also used as the output file extension.
file_format = "mp3"

class StreamAudioPlayer:
    """Pipe hex-encoded audio chunks into an mpv subprocess for live playback."""

    def __init__(self):
        # Popen handle of the mpv child; stays None until start_mpv() succeeds.
        self.mpv_process = None

    def start_mpv(self):
        """Launch mpv reading audio from its stdin.

        Returns:
            True when the process was spawned, False when mpv is missing or
            could not be started (the reason is printed).
        """
        try:
            self.mpv_process = subprocess.Popen(
                ["mpv", "--no-cache", "--no-terminal", "--", "fd://0"],
                stdin=subprocess.PIPE,
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
            )
            print("MPV player started")
            return True
        except FileNotFoundError:
            print("Error: mpv not found. Please install mpv")
        except Exception as exc:
            print(f"Failed to start mpv: {exc}")
        return False

    def play_audio_chunk(self, hex_audio):
        """Decode a hex string and write the bytes to mpv's stdin.

        Returns:
            True if the chunk was written and flushed; False if no player
            pipe is available or the decode/write failed.
        """
        try:
            proc = self.mpv_process
            if not (proc and proc.stdin):
                return False
            proc.stdin.write(bytes.fromhex(hex_audio))
            proc.stdin.flush()
        except Exception as exc:
            print(f"Play failed: {exc}")
            return False
        return True

    def stop(self):
        """Close mpv's stdin and wait for it to exit, terminating on timeout."""
        proc = self.mpv_process
        if not proc:
            return

        pipe = proc.stdin
        if pipe and not pipe.closed:
            pipe.close()

        # Give mpv up to 20 seconds to exit on its own once its input is closed.
        try:
            proc.wait(timeout=20)
        except subprocess.TimeoutExpired:
            proc.terminate()

async def establish_connection(api_key):
    """Open the T2A WebSocket and wait for the server's greeting.

    Args:
        api_key: API key sent as a Bearer token in the Authorization header.

    Returns:
        The connected WebSocket once a "connected_success" event has been
        received, or None if the connection or handshake failed (the reason
        is printed).
    """
    url = "wss://api.minimax.io/ws/v1/t2a_v2"
    headers = {"Authorization": f"Bearer {api_key}"}

    # Keep the default certificate and hostname verification enabled: the
    # request carries the API key, so it must only go to the verified server.
    ssl_context = ssl.create_default_context()

    ws = None
    try:
        ws = await websockets.connect(url, additional_headers=headers, ssl=ssl_context)
        connected = json.loads(await ws.recv())
        if connected.get("event") == "connected_success":
            print("Connection successful")
            return ws
        print(f"Connection failed: unexpected event {connected.get('event')!r}")
    except Exception as e:
        print(f"Connection failed: {e}")

    # The handshake did not complete; do not leak a half-open socket.
    if ws is not None:
        try:
            await ws.close()
        except Exception as e:
            print(f"Failed to close connection: {e}")
    return None

async def start_task(websocket):
    """Send the "task_start" event and wait for the server's reply.

    Args:
        websocket: Connected WebSocket exposing async ``send`` and ``recv``.

    Returns:
        True if the server answered with a "task_started" event, else False.
    """
    start_msg = {
        "event": "task_start",
        "model": model,
        "voice_setting": {
            "voice_id": "male-qn-qingse",
            "speed": 1,
            "vol": 1,
            "pitch": 0,
            "english_normalization": False,
        },
        "audio_setting": {
            "sample_rate": 32000,
            "bitrate": 128000,
            "format": file_format,
            "channel": 1,
        },
    }
    await websocket.send(json.dumps(start_msg))
    response = json.loads(await websocket.recv())
    return response.get("event") == "task_started"

async def continue_task_with_stream_play(websocket, text, player):
    """Send text for synthesis, play the audio as it arrives, and save it.

    Every received audio chunk is appended to the saved output regardless of
    whether live playback succeeded, so ``output.<file_format>`` is complete
    even if the player has stopped working.

    Args:
        websocket: Connected WebSocket exposing async ``send`` and ``recv``.
        text: Text to synthesize, sent in a "task_continue" event.
        player: Object with ``play_audio_chunk(hex_str)`` and an
            ``mpv_process`` attribute (see StreamAudioPlayer).

    Returns:
        Number of seconds the caller should wait for playback to finish:
        at least 10, or the estimated audio duration plus 5. Returns 10 if
        the task failed or an error interrupted the stream.
    """
    await websocket.send(json.dumps({
        "event": "task_continue",
        "text": text
    }))

    chunk_counter = 1
    # bytearray grows in place; repeated bytes concatenation would be quadratic.
    audio_data = bytearray()

    while True:
        try:
            response = json.loads(await websocket.recv())

            # A failed task never delivers a final chunk, so stop waiting.
            if response.get("event") == "task_failed":
                reason = (response.get("base_resp") or {}).get("status_msg", "unknown error")
                print(f"Error: task failed: {reason}")
                break

            audio = (response.get("data") or {}).get("audio")
            if audio:
                print(f"Playing chunk #{chunk_counter}")
                audio_data += bytes.fromhex(audio)
                # Playback is best-effort; the chunk is kept for the file either way.
                player.play_audio_chunk(audio)
                chunk_counter += 1

            if response.get("is_final"):
                print(f"Audio synthesis completed: {chunk_counter-1} chunks")
                # Closing stdin signals end of stream to the player.
                if player.mpv_process and player.mpv_process.stdin:
                    player.mpv_process.stdin.close()

                # Save audio to file
                with open(f"output.{file_format}", "wb") as f:
                    f.write(audio_data)
                print(f"Audio saved as output.{file_format}")

                # 0.0625 ms per byte corresponds to a 128000 bit/s stream
                # (16000 bytes per second); dividing by 1000 yields seconds.
                estimated_duration = len(audio_data) * 0.0625 / 1000
                return max(estimated_duration + 5, 10)

        except Exception as e:
            print(f"Error: {e}")
            break

    return 10

async def close_connection(websocket):
    """Send "task_finish" and close the WebSocket, best-effort.

    Both steps are attempted independently, so a failure to send the finish
    event does not prevent the socket from being closed. Errors are reported
    but never raised.

    Args:
        websocket: WebSocket exposing async ``send`` and ``close``, or a
            falsy value, in which case nothing is done.
    """
    if not websocket:
        return

    try:
        await websocket.send(json.dumps({"event": "task_finish"}))
    except Exception as e:
        print(f"Failed to send task_finish: {e}")

    try:
        await websocket.close()
    except Exception as e:
        print(f"Failed to close connection: {e}")

async def main():
    """Run the demo: connect, synthesize TEXT, play it live and save it.

    Reads the API key from the MINIMAX_API_KEY environment variable and
    exits early with a message if it is not set. The player and the
    WebSocket are always cleaned up, whichever step fails.
    """
    API_KEY = os.getenv("MINIMAX_API_KEY")
    TEXT = "The real danger is not that computers start thinking like people(sighs), but that people start thinking like computers. Computers can only help us with simple tasks."

    # Without a key the request would be sent with "Bearer None" and rejected.
    if not API_KEY:
        print("Error: MINIMAX_API_KEY environment variable is not set")
        return

    player = StreamAudioPlayer()
    # Bound up front so the cleanup below can tell whether a connection exists.
    ws = None

    try:
        if not player.start_mpv():
            return

        ws = await establish_connection(API_KEY)
        if not ws:
            return

        if not await start_task(ws):
            print("Task startup failed")
            return

        wait_time = await continue_task_with_stream_play(ws, TEXT, player)
        # Give the player time to finish the buffered audio before cleanup.
        await asyncio.sleep(wait_time)

    except Exception as e:
        print(f"Error: {e}")
    finally:
        player.stop()
        if ws is not None:
            await close_connection(ws)

# Run the demo only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    asyncio.run(main())

T2A (HTTP)Create Speech Generation Task

Using the API

Text

Speech

Video

Image

Music

File

Text to Speech (T2A) WebSocket