
Orpheus TTS

An incredibly lifelike speech synthesis model by Canopy Labs.

Model details

View repository

Example usage

Orpheus TTS must generate ~83 tokens per second to keep up with real-time playback (a back-of-envelope throughput check follows the list below). This implementation supports streaming and, on an H100 MIG GPU, can produce:

  • 16 concurrent real-time streams with variable traffic

  • 24 concurrent real-time streams with consistent traffic

  • 128 concurrent non-real-time generations for cost-efficient batching
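
A quick back-of-envelope check of these figures: multiplying the per-stream real-time rate by the number of streams gives the aggregate token throughput the deployment has to sustain. The sketch below is illustrative only; the helper names are our own, and the ~83 tokens/second value is the real-time threshold quoted above.

# Back-of-envelope check of the concurrency figures above (illustrative sketch).
REALTIME_TOKENS_PER_SEC = 83  # tokens Orpheus must emit per second of audio to stay real-time

def required_throughput(concurrent_streams: int) -> int:
    """Aggregate tokens/second needed to keep N streams real-time."""
    return concurrent_streams * REALTIME_TOKENS_PER_SEC

def realtime_factor(observed_tokens_per_sec: float) -> float:
    """Above 1.0, generation outpaces playback; below 1.0, the stream stalls."""
    return observed_tokens_per_sec / REALTIME_TOKENS_PER_SEC

print(required_throughput(16))   # 1328 tokens/s for 16 real-time streams
print(required_throughput(24))   # 1992 tokens/s for 24 real-time streams
# The 128-generation batching case is not bound by the per-stream real-time rate.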

Input
# Requires the requests and pyaudio packages; pyaudio also needs the PortAudio system library.
import requests
import pyaudio
import wave
import time
import os


# ——— Request parameters ———
orpheus_model_id = "" # Paste your model ID here
api_key = os.environ["BASETEN_API_KEY"]
max_tokens = 2000

def stream_audio(text: str, voice: str):
    """
    Stream the audio directly to your speakers
    """
    p = pyaudio.PyAudio()
    stream = p.open(format=pyaudio.paInt16, channels=1, rate=24000, output=True)

    resp = requests.post(
        f"https://model-{orpheus_model_id}.api.baseten.co/environments/production/predict",
        headers={"Authorization": f"Api-Key {api_key}"},
        json={"voice": voice, "prompt": text, "max_tokens": max_tokens},
        stream=True,
    )
    resp.raise_for_status()

    for chunk in resp.iter_content(chunk_size=4096):
        if chunk:
            stream.write(chunk)

    stream.stop_stream()
    stream.close()
    p.terminate()


def save_audio(text: str, voice: str, output_path: str = "output.wav"):
    """
    Save the audio to a WAV file.
    """
    start_time = time.monotonic()

    resp = requests.post(
        f"https://model-{orpheus_model_id}.api.baseten.co/environments/production/predict",
        headers={"Authorization": f"Api-Key {api_key}"},
        json={"voice": voice, "prompt": text, "max_tokens": max_tokens},
        stream=False,
    )
    resp.raise_for_status()

    with wave.open(output_path, "wb") as wf:
        wf.setnchannels(1)
        wf.setsampwidth(2)
        wf.setframerate(24000)

        total_frames = 0
        chunk_counter = 0

        for chunk in resp.iter_content(chunk_size=4096):
            if not chunk:
                continue
            chunk_counter += 1
            frame_count = len(chunk) // (wf.getsampwidth() * wf.getnchannels())
            total_frames += frame_count
            wf.writeframes(chunk)

        duration = total_frames / wf.getframerate()

    end_time = time.monotonic()
    elapsed = end_time - start_time
    print(f"Generated {duration:.2f}s of audio in {chunk_counter} chunks in {elapsed:.2f}s.")


if __name__ == "__main__":
    voice = "tara"

    original_text = """
    Nothing beside remains. Round the decay of that colossal wreck, boundless and bare,
    The lone and level sands stretch far away.
    """

    print("🔊 Streaming live:")
    stream_audio(original_text, voice)

    print("\n💾 Saving to output.wav:")
    save_audio(original_text, voice)

    print("Done!")

When benchmarking latency for high-performance models, network overhead can significantly affect the time-to-first-byte (TTFB), often resulting in inflated latency measurements. This overhead is typically introduced by the cost of establishing new connections.

To mitigate these effects and achieve more accurate, consistent latency metrics, consider implementing the following best practices:

  • Connection Pooling: Reuse established TCP/TLS connections across requests to avoid the cost of reconnecting. This is especially beneficial in environments where models are accessed over HTTP.

  • Warm-up Requests: Send a dummy or low-cost request before measuring latency. This establishes the network handshake, initializes any necessary request paths, and ensures subsequent requests benefit from connection reuse and stabilized performance.

By adopting these techniques, you can minimize variability introduced by network and infrastructure overhead, yielding lower latencies and more reliable performance measurements.
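
As a concrete illustration of the connection pooling point, the first example above can reuse a single requests.Session so the TCP/TLS handshake is paid once and later calls ride the same connection. This is a minimal sketch under the same assumptions as before (your model ID and BASETEN_API_KEY); the synthesize helper is ours, not part of the reference code.

import os
import requests

# Minimal connection-pooling sketch; assumes the same endpoint and payload as the example above.
orpheus_model_id = ""  # Paste your model ID here
url = f"https://model-{orpheus_model_id}.api.baseten.co/environments/production/predict"

# One Session is created up front and reused; requests keeps the underlying
# TCP/TLS connection alive between calls instead of reconnecting each time.
session = requests.Session()
session.headers.update({"Authorization": f"Api-Key {os.environ['BASETEN_API_KEY']}"})

def synthesize(text: str, voice: str = "tara") -> bytes:
    """POST one prompt over the pooled connection and return the raw 16-bit PCM bytes."""
    resp = session.post(url, json={"voice": voice, "prompt": text, "max_tokens": 2000})
    resp.raise_for_status()
    return resp.content

# The first call pays the connection setup; subsequent calls reuse the warm connection.
_ = synthesize("Warm-up request.")
audio = synthesize("Nothing beside remains.")

The benchmark script below applies the warm-up idea with httpx: it issues a health-check request first, then streams the same prompt several times and reports the time to first audio byte and total time for each run.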

import asyncio
import httpx
import os
import time
import wave
import numpy as np

# Note: http2=True below requires the optional HTTP/2 extra for httpx (pip install "httpx[http2]").

API_KEY = os.getenv("BASETEN_API_KEY")
model_id = ""

URL = f"https://model-{model_id}.api.baseten.co/environments/production/predict"

# Audio settings
SAMPLE_RATE = 24000
CHANNELS = 1
SAMPLE_WIDTH = 2  # 16-bit PCM

# Model inputs
VOICE = "tara"
MAX_TOKENS = 2000
PROMPT = "Nothing beside remains. Round the decay of that colossal wreck, boundless and bare, The lone and level sands stretch far away."

headers = {
    "Authorization": f"Api-Key {API_KEY}",
    "Content-Type": "application/json",
    "Connection": "keep-alive",
}

payload = {
    "voice": VOICE,
    "prompt": PROMPT,
    "max_tokens": MAX_TOKENS,
    "repetition_penalty": 1.0,
}

# Warm up the TLS connection
async def warmup_connection():
    async with httpx.AsyncClient(http2=True, timeout=10) as client:
        try:
            start = time.monotonic()
            resp = await client.get(f"https://model-{model_id}.api.baseten.co/health")
            duration = time.monotonic() - start
            print(f"Warm-up request completed in {duration:.4f}s with status {resp.status_code}")
        except Exception as e:
            print(f"Warm-up failed: {e}")

# Function to stream and save audio
async def stream_audio_rest(output_file: str = "output.wav", verbose=True):
    async with httpx.AsyncClient(http2=True, timeout=None) as client:
        start_time = time.monotonic()

        async with client.stream("POST", URL, headers=headers, json=payload) as response:
            response.raise_for_status()

            first_chunk_received = False
            ttfb = -1.0
            audio_chunks = []

            async for chunk in response.aiter_bytes():
                if not first_chunk_received:
                    ttfb = time.monotonic() - start_time
                    if verbose:
                        print(f"Time to first audio byte: {ttfb:.4f}s")
                    first_chunk_received = True
                audio_chunks.append(chunk)

    total_time = time.monotonic() - start_time

    # Save audio
    if output_file and audio_chunks:
        with wave.open(output_file, "wb") as wav_file:
            wav_file.setnchannels(CHANNELS)
            wav_file.setsampwidth(SAMPLE_WIDTH)
            wav_file.setframerate(SAMPLE_RATE)
            wav_file.writeframes(b"".join(audio_chunks))
        if verbose:
            print(f"Audio saved to: {output_file}")

    if verbose:
        print(f"Done in {total_time:.4f}s")

    return ttfb, total_time

# Run the test loop with warm-up
async def main():
    await warmup_connection()

    num_runs = 5
    times_to_first_audio = []
    total_times = []

    for i in range(num_runs):
        print(f"--- Run {i+1}/{num_runs} ---")
        output_filename = f"tts_output_run_{i+1}.wav"
        ttfa, total_time = await stream_audio_rest(output_file=output_filename, verbose=True)
        if ttfa != -1.0:
            times_to_first_audio.append(ttfa)
        total_times.append(total_time)

    print("\n--- Statistics ---")
    if times_to_first_audio:
        print(f"Avg Time to First Audio: {np.mean(times_to_first_audio):.4f}s")
        print(f"Std Dev Time to First Audio: {np.std(times_to_first_audio):.4f}s")
    if total_times:
        print(f"Avg Total Time: {np.mean(total_times):.4f}s")
        print(f"Std Dev Total Time: {np.std(total_times):.4f}s")

if __name__ == "__main__":
    asyncio.run(main())
