
Orpheus TTS

An incredibly lifelike speech synthesis model by Canopy Labs.

Model details

View repository

Example usage

Orpheus TTS must generate ~83 tokens per second to keep up with real-time playback (a back-of-envelope throughput check follows the list below). This implementation supports streaming and, on an H100 MIG GPU, can produce:

  • 16 concurrent real-time streams with variable traffic

  • 24 concurrent real-time streams with consistent traffic

  • 128 concurrent non-real-time generations for cost-efficient batching
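
A quick back-of-envelope check of these figures: multiplying the per-stream real-time rate by the number of streams gives the aggregate token throughput the deployment has to sustain. The sketch below is illustrative only; the helper names are our own, and the ~83 tokens/second value is the real-time threshold quoted above.

# Back-of-envelope check of the concurrency figures above (illustrative sketch).
REALTIME_TOKENS_PER_SEC = 83  # tokens Orpheus must emit per second of audio to stay real-time

def required_throughput(concurrent_streams: int) -> int:
    """Aggregate tokens/second needed to keep N streams real-time."""
    return concurrent_streams * REALTIME_TOKENS_PER_SEC

def realtime_factor(observed_tokens_per_sec: float) -> float:
    """Above 1.0, generation outpaces playback; below 1.0, the stream stalls."""
    return observed_tokens_per_sec / REALTIME_TOKENS_PER_SEC

print(required_throughput(16))   # 1328 tokens/s for 16 real-time streams
print(required_throughput(24))   # 1992 tokens/s for 24 real-time streams
# The 128-generation batching case is not bound by the per-stream real-time rate.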

Input
# Requires the requests and pyaudio packages; pyaudio also needs the PortAudio system library.
import requests
import pyaudio
import wave
import time
import os


# ——— Request parameters ———
orpheus_model_id = "" # Paste your model ID here
api_key = os.environ["BASETEN_API_KEY"]
max_tokens = 2000

def stream_audio(text: str, voice: str):
    """
    Stream the audio directly to your speakers
    """
    p = pyaudio.PyAudio()
    stream = p.open(format=pyaudio.paInt16, channels=1, rate=24000, output=True)

    resp = requests.post(
        f"https://model-{orpheus_model_id}.api.baseten.co/environments/production/predict",
        headers={"Authorization": f"Api-Key {api_key}"},
        json={"voice": voice, "prompt": text, "max_tokens": max_tokens},
        stream=True,
    )
    resp.raise_for_status()

    for chunk in resp.iter_content(chunk_size=4096):
        if chunk:
            stream.write(chunk)

    stream.stop_stream()
    stream.close()
    p.terminate()


def save_audio(text: str, voice: str, output_path: str = "output.wav"):
    """
    Save the audio to a WAV file.
    """
    start_time = time.monotonic()

    resp = requests.post(
        f"https://model-{orpheus_model_id}.api.baseten.co/environments/production/predict",
        headers={"Authorization": f"Api-Key {api_key}"},
        json={"voice": voice, "prompt": text, "max_tokens": max_tokens},
        stream=False,
    )
    resp.raise_for_status()

    with wave.open(output_path, "wb") as wf:
        wf.setnchannels(1)
        wf.setsampwidth(2)
        wf.setframerate(24000)

        total_frames = 0
        chunk_counter = 0

        for chunk in resp.iter_content(chunk_size=4096):
            if not chunk:
                continue
            chunk_counter += 1
            frame_count = len(chunk) // (wf.getsampwidth() * wf.getnchannels())
            total_frames += frame_count
            wf.writeframes(chunk)

        duration = total_frames / wf.getframerate()

    end_time = time.monotonic()
    elapsed = end_time - start_time
    print(f"Generated {duration:.2f}s of audio in {chunk_counter} chunks in {elapsed:.2f}s.")


if __name__ == "__main__":
    voice = "tara"

    original_text = """
    Nothing beside remains. Round the decay of that colossal wreck, boundless and bare,
    The lone and level sands stretch far away.
    """

    print("🔊 Streaming live:")
    stream_audio(original_text, voice)

    print("\n💾 Saving to output.wav:")
    save_audio(original_text, voice)

    print("Done!")

When benchmarking latency for high-performance models, network overhead can significantly affect the time-to-first-byte (TTFB), often resulting in inflated latency measurements. This overhead is typically introduced by the cost of establishing new connections.

To mitigate these effects and achieve more accurate, consistent latency metrics, consider implementing the following best practices:

  • Connection Pooling: Reuse established TCP/TLS connections across requests to avoid the cost of reconnecting. This is especially beneficial in environments where models are accessed over HTTP.

  • Warm-up Requests: Send a dummy or low-cost request before measuring latency. This establishes the network handshake, initializes any necessary request paths, and ensures subsequent requests benefit from connection reuse and stabilized performance.

By adopting these techniques, you can minimize variability introduced by network and infrastructure overhead, yielding lower latencies and more reliable performance measurements.
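
As a concrete illustration of the connection pooling point, the first example above can reuse a single requests.Session so the TCP/TLS handshake is paid once and later calls ride the same connection. This is a minimal sketch under the same assumptions as before (your model ID and BASETEN_API_KEY); the synthesize helper is ours, not part of the reference code.

import os
import requests

# Minimal connection-pooling sketch; assumes the same endpoint and payload as the example above.
orpheus_model_id = ""  # Paste your model ID here
url = f"https://model-{orpheus_model_id}.api.baseten.co/environments/production/predict"

# One Session is created up front and reused; requests keeps the underlying
# TCP/TLS connection alive between calls instead of reconnecting each time.
session = requests.Session()
session.headers.update({"Authorization": f"Api-Key {os.environ['BASETEN_API_KEY']}"})

def synthesize(text: str, voice: str = "tara") -> bytes:
    """POST one prompt over the pooled connection and return the raw 16-bit PCM bytes."""
    resp = session.post(url, json={"voice": voice, "prompt": text, "max_tokens": 2000})
    resp.raise_for_status()
    return resp.content

# The first call pays the connection setup; subsequent calls reuse the warm connection.
_ = synthesize("Warm-up request.")
audio = synthesize("Nothing beside remains.")

The benchmark script below applies the warm-up idea with httpx: it issues a health-check request first, then streams the same prompt several times and reports the time to first audio byte and total time for each run.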

import asyncio
import httpx
import os
import time
import wave
import numpy as np

# Note: http2=True below requires the optional HTTP/2 extra for httpx (pip install "httpx[http2]").

API_KEY = os.getenv("BASETEN_API_KEY")
model_id = ""

URL = f"https://model-{model_id}.api.baseten.co/environments/production/predict"

# Audio settings
SAMPLE_RATE = 24000
CHANNELS = 1
SAMPLE_WIDTH = 2  # 16-bit PCM

# Model inputs
VOICE = "tara"
MAX_TOKENS = 2000
PROMPT = "Nothing beside remains. Round the decay of that colossal wreck, boundless and bare, The lone and level sands stretch far away."

headers = {
    "Authorization": f"Api-Key {API_KEY}",
    "Content-Type": "application/json",
    "Connection": "keep-alive",
}

payload = {
    "voice": VOICE,
    "prompt": PROMPT,
    "max_tokens": MAX_TOKENS,
    "repetition_penalty": 1.0,
}

# Warm up the TLS connection
async def warmup_connection():
    async with httpx.AsyncClient(http2=True, timeout=10) as client:
        try:
            start = time.monotonic()
            resp = await client.get(f"https://model-{model_id}.api.baseten.co/health")
            duration = time.monotonic() - start
            print(f"Warm-up request completed in {duration:.4f}s with status {resp.status_code}")
        except Exception as e:
            print(f"Warm-up failed: {e}")

# Function to stream and save audio
async def stream_audio_rest(output_file: str = "output.wav", verbose=True):
    async with httpx.AsyncClient(http2=True, timeout=None) as client:
        start_time = time.monotonic()

        async with client.stream("POST", URL, headers=headers, json=payload) as response:
            response.raise_for_status()

            first_chunk_received = False
            ttfb = -1.0
            audio_chunks = []

            async for chunk in response.aiter_bytes():
                if not first_chunk_received:
                    ttfb = time.monotonic() - start_time
                    if verbose:
                        print(f"Time to first audio byte: {ttfb:.4f}s")
                    first_chunk_received = True
                audio_chunks.append(chunk)

    total_time = time.monotonic() - start_time

    # Save audio
    if output_file and audio_chunks:
        with wave.open(output_file, "wb") as wav_file:
            wav_file.setnchannels(CHANNELS)
            wav_file.setsampwidth(SAMPLE_WIDTH)
            wav_file.setframerate(SAMPLE_RATE)
            wav_file.writeframes(b"".join(audio_chunks))
        if verbose:
            print(f"Audio saved to: {output_file}")

    if verbose:
        print(f"Done in {total_time:.4f}s")

    return ttfb, total_time

# Run the test loop with warm-up
async def main():
    await warmup_connection()

    num_runs = 5
    times_to_first_audio = []
    total_times = []

    for i in range(num_runs):
        print(f"--- Run {i+1}/{num_runs} ---")
        output_filename = f"tts_output_run_{i+1}.wav"
        ttfa, total_time = await stream_audio_rest(output_file=output_filename, verbose=True)
        if ttfa != -1.0:
            times_to_first_audio.append(ttfa)
        total_times.append(total_time)

    print("\n--- Statistics ---")
    if times_to_first_audio:
        print(f"Avg Time to First Audio: {np.mean(times_to_first_audio):.4f}s")
        print(f"Std Dev Time to First Audio: {np.std(times_to_first_audio):.4f}s")
    if total_times:
        print(f"Avg Total Time: {np.mean(total_times):.4f}s")
        print(f"Std Dev Total Time: {np.std(total_times):.4f}s")

if __name__ == "__main__":
    asyncio.run(main())
