Simple Voice Generation
Basic text-to-speech example:
// app/api/tts/route.ts
import Cartesia from '@cartesia/cartesia-js';
// POST /api/tts — generates speech for the request body's `text`
// and returns the MP3 bytes. The Cartesia API key stays server-side
// here; clients never see it.
export async function POST(request: Request) {
const { text } = await request.json();
// Reject missing/empty input early instead of burning an API call.
if (!text || typeof text !== 'string') {
return new Response('Missing "text" in request body', { status: 400 });
}
const cartesia = new Cartesia({
apiKey: process.env.CARTESIA_API_KEY,
});
const response = await cartesia.tts.bytes({
model_id: "sonic",
transcript: text,
voice: {
mode: "id",
id: "a0e99841-438c-4a64-b679-ae501e7d6091"
},
output_format: {
container: "mp3",
encoding: "mp3",
sample_rate: 44100
}
});
// 'audio/mpeg' is the IANA-registered MIME type for MP3;
// 'audio/mp3' is non-standard.
return new Response(response.audio, {
headers: { 'Content-Type': 'audio/mpeg' }
});
}
Real-time Streaming
Stream audio in real-time:
'use client';
import { useState } from 'react';
import Cartesia from '@cartesia/cartesia-js';
// Client component: streams TTS audio chunks over SSE and plays them
// back-to-back through the Web Audio API.
export default function VoiceChat() {
const [isPlaying, setIsPlaying] = useState(false);
async function generateAndPlay(text: string) {
setIsPlaying(true);
try {
// NOTE(review): NEXT_PUBLIC_ env vars are bundled into the browser
// build — this key is visible to every visitor. Prefer proxying
// through a server route (as in the /api/tts example).
const cartesia = new Cartesia({
apiKey: process.env.NEXT_PUBLIC_CARTESIA_KEY,
});
const response = await cartesia.tts.sse({
model_id: "sonic",
transcript: text,
voice: { mode: "id", id: voiceId },
output_format: {
container: "raw",
encoding: "pcm_s16le",
sample_rate: 16000
}
});
const audioContext = new AudioContext();
// decodeAudioData cannot parse headerless raw PCM, so convert the
// s16le samples to floats manually; schedule each chunk to start
// when the previous one ends so playback doesn't overlap.
let playAt = audioContext.currentTime;
for await (const chunk of response) {
const samples = new Int16Array(chunk);
const audioBuffer = audioContext.createBuffer(1, samples.length, 16000);
const channel = audioBuffer.getChannelData(0);
for (let i = 0; i < samples.length; i++) {
channel[i] = samples[i] / 32768; // s16le -> [-1, 1)
}
const source = audioContext.createBufferSource();
source.buffer = audioBuffer;
source.connect(audioContext.destination);
playAt = Math.max(playAt, audioContext.currentTime);
source.start(playAt);
playAt += audioBuffer.duration;
}
} finally {
// Always clear the flag, even when generation or playback throws —
// the original left the button permanently disabled on error.
setIsPlaying(false);
}
}
return (
<button
onClick={() => generateAndPlay("Hello from CARTER!")}
disabled={isPlaying}
>
{isPlaying ? 'Playing...' : 'Speak'}
</button>
);
}
WebSocket Chat
Low-latency voice chat:
import Cartesia from '@cartesia/cartesia-js';
// Maintains a persistent Cartesia TTS WebSocket for low-latency
// voice chat; connects lazily on first use.
class VoiceChat {
constructor(apiKey, voiceId) {
this.cartesia = new Cartesia({ apiKey });
this.voiceId = voiceId;
this.ws = null;
}
// Open the socket and wire up the audio/close handlers.
async connect() {
const socket = await this.cartesia.tts.websocket({
model_id: "sonic",
voice: { mode: "id", id: this.voiceId },
output_format: {
container: "raw",
encoding: "pcm_s16le",
sample_rate: 16000
}
});
socket.on('message', (audioChunk) => this.playAudio(audioChunk));
socket.on('close', () => {
console.log('Connection closed');
});
this.ws = socket;
}
// Speak `text`, establishing the connection first if needed.
async say(text) {
if (!this.ws) {
await this.connect();
}
await this.ws.send({ transcript: text, context_id: 'chat-session' });
}
playAudio(chunk) {
// Implement audio playback
const audioContext = new AudioContext();
// ... decode and play chunk
}
async disconnect() {
if (!this.ws) return;
await this.ws.close();
}
}
// Usage
const chat = new VoiceChat(apiKey, voiceId);
await chat.say("Hello!");
Emotional Voice
Control emotions dynamically:
// Generate speech whose delivery matches a named emotion preset,
// returning the raw audio bytes.
async function speakWithEmotion(text, emotion) {
const cartesia = new Cartesia({ apiKey });
// Friendly preset names mapped to Cartesia emotion control tags.
const emotionMap = {
happy: ["positivity:highest", "excitement"],
sad: ["sadness"],
angry: ["anger"],
curious: ["curiosity"],
neutral: []
};
const controls = emotionMap[emotion] || []; // unknown preset -> neutral delivery
const response = await cartesia.tts.bytes({
model_id: "sonic",
transcript: text,
voice: { mode: "id", id: voiceId },
_experimental_voice_controls: { emotion: controls }
});
return response.audio;
}
// Usage
await speakWithEmotion("I'm so excited!", "happy");
await speakWithEmotion("This is unfortunate.", "sad");
Voice Cloning
Clone a custom voice:
import cartesia

client = cartesia.Cartesia(api_key="your-key")


def _read_bytes(path):
    # Load one reference clip as raw bytes.
    with open(path, "rb") as fh:
        return fh.read()


# Read audio samples
samples = [_read_bytes(name) for name in ("sample1.wav", "sample2.wav", "sample3.wav")]

# Create cloned voice
voice = client.voices.create(
    name="My Custom Voice",
    description="Cloned from my samples",
    audio_files=samples,
)
print(f"Voice ID: {voice['id']}")

# Use the cloned voice
output = client.tts.bytes(
    model_id="sonic",
    transcript="Hello in my custom voice!",
    voice_id=voice["id"],
)
Error Handling
Robust error handling example:
// Generate speech with retries: exponential backoff on rate limits (429),
// fail-fast on auth errors (401), up to `maxRetries` attempts total.
// Always either returns audio bytes or throws — never resolves undefined.
async function generateSpeechWithRetry(text, maxRetries = 3) {
let lastError;
for (let attempt = 0; attempt < maxRetries; attempt++) {
try {
const response = await cartesia.tts.bytes({
model_id: "sonic",
transcript: text,
voice: { mode: "id", id: voiceId }
});
return response.audio;
} catch (error) {
console.error(`Attempt ${attempt + 1} failed:`, error);
lastError = error;
if (error.status === 401) {
// Retrying cannot fix a bad key — fail fast.
throw new Error('Invalid API key');
}
if (error.status === 429) {
// Rate limit - exponential backoff
const delay = Math.pow(2, attempt) * 1000;
console.log(`Rate limited. Waiting ${delay}ms...`);
await new Promise(resolve => setTimeout(resolve, delay));
continue;
}
if (attempt === maxRetries - 1) {
throw error;
}
}
}
// BUG FIX: the original fell out of the loop and returned undefined when
// the final attempt was rate-limited; surface the failure to the caller.
throw lastError;
}
Context Management
Maintain conversation context:
// Streams speech for successive utterances under a single Cartesia
// context id so prosody carries across the turns of one conversation.
class ConversationManager {
constructor(cartesia, voiceId) {
this.cartesia = cartesia;
this.voiceId = voiceId;
this.contextId = `conv-${Date.now()}`;
}
// Synthesize `text` as part of the current conversation and play it.
async speak(text) {
const stream = await this.cartesia.tts.sse({
model_id: "sonic",
transcript: text,
voice: { mode: "id", id: this.voiceId },
context_id: this.contextId, // Maintain context
output_format: {
container: "raw",
encoding: "pcm_s16le",
sample_rate: 16000
}
});
// Play chunks strictly in arrival order.
for await (const audioChunk of stream) {
await this.playAudio(audioChunk);
}
}
async playAudio(chunk) {
// Audio playback implementation
}
// Begin a fresh context, e.g. when a new conversation starts.
resetContext() {
this.contextId = `conv-${Date.now()}`;
}
}
// Usage
const conversation = new ConversationManager(cartesia, voiceId);
await conversation.speak("Hello, how are you?");
await conversation.speak("That's great to hear!"); // Uses same context
Complete Chat Application
Full example with UI:
'use client';
import { useState, useRef } from 'react';
import Cartesia from '@cartesia/cartesia-js';
// Full chat UI: user types a message, the app synthesizes and plays it.
export default function CarterClone() {
const [messages, setMessages] = useState([]);
const [input, setInput] = useState('');
const [isGenerating, setIsGenerating] = useState(false);
const cartesiaRef = useRef(null);
// Create the client lazily, once per mount — survives re-renders.
if (!cartesiaRef.current) {
cartesiaRef.current = new Cartesia({
apiKey: process.env.NEXT_PUBLIC_CARTESIA_KEY
});
}
async function handleSend() {
if (!input.trim() || isGenerating) return;
const userMessage = { role: 'user', content: input };
setMessages(prev => [...prev, userMessage]);
setInput('');
setIsGenerating(true);
try {
// Generate speech
const response = await cartesiaRef.current.tts.bytes({
model_id: "sonic",
transcript: input,
voice: { mode: "id", id: "voice-id" },
_experimental_voice_controls: {
emotion: ["curiosity"]
}
});
// Play audio — 'audio/mpeg' is the registered MIME type for MP3.
const url = URL.createObjectURL(
new Blob([response.audio], { type: 'audio/mpeg' })
);
const audio = new Audio();
audio.src = url;
// BUG FIX: the original never revoked the object URL, leaking the
// blob for the lifetime of the page. Release it when playback ends.
audio.onended = () => URL.revokeObjectURL(url);
audio.onerror = () => URL.revokeObjectURL(url);
await audio.play();
const aiMessage = { role: 'assistant', content: input };
setMessages(prev => [...prev, aiMessage]);
} catch (error) {
console.error('Error:', error);
alert('Failed to generate speech');
} finally {
setIsGenerating(false);
}
}
return (
<div className="flex flex-col h-screen">
<div className="flex-1 overflow-y-auto p-4">
{messages.map((msg, i) => (
<div key={i} className={msg.role}>
{msg.content}
</div>
))}
</div>
<div className="p-4 border-t flex gap-2">
<input
value={input}
onChange={(e) => setInput(e.target.value)}
onKeyDown={(e) => e.key === 'Enter' && handleSend()}
placeholder="Type a message..."
className="flex-1 px-4 py-2 border rounded"
disabled={isGenerating}
/>
<button
onClick={handleSend}
disabled={isGenerating || !input.trim()}
className="px-6 py-2 bg-cyan-500 text-white rounded"
>
{isGenerating ? 'Generating...' : 'Send'}
</button>
</div>
</div>
);
}
Resources
Always keep your API keys secure. Never expose them in client-side code or commit them to version control.
