MARS6
MARS6 is a frontier, highly prosodic, edge-first text-to-speech model by CAMB.AI that supports 10 languages and offers voice/prosody cloning capabilities.
Deploy MARS6 behind an API endpoint in seconds.
Example usage
This model requires at least these four inputs:

text: The input text that needs to be spoken
audio_ref: An audio file containing speech from a single speaker
ref_text: What is spoken in audio_ref (may be left null; providing it slightly speeds up inference)
language: The language code for the target language
The model outputs speech in the style of the reference audio. The output is a base64-encoded string, so it must be decoded back into an audio format before it can be played. A typical request payload looks like this:
data = {"text": "The quick brown fox jumps over the lazy dog",
"audio_ref": encoded_str,
"ref_text": prompt_txt,
"language": 'en-us', # Target language, in this case english.
# "top_p": 0.7, # Optionally specify a top_p (default 0.7)
# "temperature": 0.7, # Optionally specify a temperature (default 0.7)
# "chunk_length": 200, # Optional text chunk length for splitting long pieces of input text. Default 200
# "max_new_tokens": 0, # Optional limit on max number of new tokens, default is zero (unlimited)
# "repetition_penalty": 1.5 # Optional rep penalty, default 1.5
}
import base64
import io
import time

import httpx
import librosa
import torch
import torchaudio
import IPython.display as ipd

# Step 1: set endpoint url and api key:
url = "<YOUR PREDICTION ENDPOINT>"
headers = {"Authorization": "Api-Key <YOUR API KEY>"}


# Step 2: pick reference audio to clone, encode it as base64
file_path = "ref_debug.flac"  # any valid audio filepath, ideally between 6s-90s.
wav, sr = librosa.load(file_path, sr=None, mono=True, offset=0, duration=5)  # trims to the first 5 seconds
io_data = io.BytesIO()
torchaudio.save(io_data, torch.from_numpy(wav)[None], sample_rate=sr, format="wav")
io_data.seek(0)
encoded_data = base64.b64encode(io_data.read())
encoded_str = encoded_data.decode("utf-8")
# OPTIONAL: specify the transcript of the reference/prompt
# (slightly speeds up inference, and may make it sound a bit better).
prompt_txt = None  # if unspecified, can be left as None

# Step 3: define other inference settings:
data = {
    "text": "The quick brown fox jumps over the lazy dog",
    "audio_ref": encoded_str,
    "ref_text": prompt_txt,
    "language": "en-us",  # Target language, in this case English.
    # "top_p": 0.7,  # Optionally specify a top_p (default 0.7)
    # "temperature": 0.7,  # Optionally specify a temperature (default 0.7)
    # "chunk_length": 200,  # Optional chunk length for splitting long input text (default 200)
    # "max_new_tokens": 0,  # Optional limit on new tokens; default 0 (unlimited)
    # "repetition_penalty": 1.5  # Optional repetition penalty (default 1.5)
}

# Step 4: send the POST request
# (the first request might be a bit slow, but subsequent requests should be fast)
st = time.time()
response = httpx.post(url, headers=headers, json=data, timeout=120)
et = time.time()

print(f"Runtime: {et - st:.2f} seconds")
# Check the response status code
if response.status_code == 200:
    print("Request successful!")
else:
    print("Request failed with status code", response.status_code, response.content)

# Step 5: decode the base64 output back to audio and play it
wav, sr = torchaudio.load(io.BytesIO(base64.b64decode(response.json()["result"])))
ipd.Audio(wav.numpy()[0], rate=sr)
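
Since the script above already writes a proper WAV container into the in-memory buffer before base64-encoding it, the same payload can be produced without librosa or torchaudio when the reference clip is already a WAV file. A minimal sketch, assuming a hypothetical ref.wav on disk:

import base64

# Hypothetical reference clip, already a mono WAV file (ideally 6s-90s long).
with open("ref.wav", "rb") as f:
    encoded_str = base64.b64encode(f.read()).decode("utf-8")

The trade-off is that you lose the trimming and mono-downmixing control that the librosa.load step provides.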
The endpoint responds with JSON of the form:

{
  "result": "iVBORw0KGgoAAAANSUhEU"
}
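
To keep the synthesized audio instead of only playing it inline, the base64 string can be decoded and written straight to disk. A minimal sketch, assuming the decoded bytes form a WAV container and reusing the response object from the script above ("output.wav" is an arbitrary filename):

import base64

# Decode the base64 result and persist it as a playable file.
audio_bytes = base64.b64decode(response.json()["result"])
with open("output.wav", "wb") as f:
    f.write(audio_bytes)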