large language
NVIDIA Nemotron 3 Ultra
550B hybrid Mamba-Transformer MoE with 55B active params, latent MoE routing, multi-token prediction, and 1M token context
Model details
View repository
Example usage
OpenAI-compatible chat completion. NVIDIA recommends temperature=1.0 and top_p=0.95; toggle reasoning via chat_template_kwargs.enable_thinking.
Input
# Stream a chat completion from NVIDIA Nemotron 3 Ultra through the
# OpenAI-compatible endpoint and print the generated text as it arrives.
from openai import OpenAI

client = OpenAI(
    api_key="BASETEN_API_KEY",  # placeholder value: substitute your own API key
    base_url="https://inference.baseten.co/v1",
)

# NVIDIA recommends temperature=1.0 and top_p=0.95 for all tasks.
# Toggle reasoning on/off via chat_template_kwargs.enable_thinking.
# NOTE(review): this request does not set that flag; with the OpenAI SDK it
# would presumably be passed as
# extra_body={"chat_template_kwargs": {"enable_thinking": ...}} -- confirm
# against the provider documentation before relying on it.
response = client.chat.completions.create(
    model="nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B",
    messages=[
        {
            "role": "user",
            "content": "Tell me a fun fact about hummingbirds.",
        },
    ],
    stream=True,
    stream_options={
        "include_usage": True,
        "continuous_usage_stats": True,
    },
    top_p=0.95,
    max_tokens=256,
    temperature=1,
    presence_penalty=0,
    frequency_penalty=0,
)

for chunk in response:
    # Skip chunks that carry no choices (presumably the usage-only chunks
    # requested through stream_options) and deltas that carry no text.
    if chunk.choices and chunk.choices[0].delta.content is not None:
        print(chunk.choices[0].delta.content, end="", flush=True)

# JSON output
{
  "id": "143",
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "[Model output here]",
        "role": "assistant",
        "audio": null,
        "function_call": null,
        "tool_calls": null
      }
    }
  ],
  "created": 1741224586,
  "model": "",
  "object": "chat.completion",
  "service_tier": null,
  "system_fingerprint": null,
  "usage": {
    "completion_tokens": 145,
    "prompt_tokens": 38,
    "total_tokens": 183,
    "completion_tokens_details": null,
    "prompt_tokens_details": null
  }
}