Skip to main content

Simple Voice Generation

Basic text-to-speech example:
// app/api/tts/route.ts
import Cartesia from '@cartesia/cartesia-js';

/** Builds a small JSON error response with the given HTTP status. */
function jsonError(message: string, status: number): Response {
  return new Response(JSON.stringify({ error: message }), {
    status,
    headers: { 'Content-Type': 'application/json' },
  });
}

/**
 * Route handler for `POST`: turns the `text` field of the JSON request body
 * into speech with Cartesia and responds with the generated audio.
 *
 * @param request - Incoming request; its JSON body must be `{ "text": string }`.
 * @returns The audio with `Content-Type: audio/mpeg` on success. Responds with
 *   status 400 when the body is not valid JSON or `text` is missing/blank, and
 *   with status 500 when `CARTESIA_API_KEY` is not configured.
 */
export async function POST(request: Request): Promise<Response> {
  let body: unknown;
  try {
    body = await request.json();
  } catch {
    return jsonError('Request body must be valid JSON', 400);
  }

  // The body is untrusted input: narrow it before using it as a transcript.
  const text =
    typeof body === 'object' && body !== null
      ? (body as { text?: unknown }).text
      : undefined;
  if (typeof text !== 'string' || text.trim() === '') {
    return jsonError('Field "text" must be a non-empty string', 400);
  }

  // Fail with a clear message instead of handing `undefined` to the SDK.
  const apiKey = process.env.CARTESIA_API_KEY;
  if (!apiKey) {
    return jsonError('CARTESIA_API_KEY is not configured', 500);
  }

  const cartesia = new Cartesia({ apiKey });

  const response = await cartesia.tts.bytes({
    model_id: "sonic",
    transcript: text,
    voice: {
      mode: "id",
      id: "a0e99841-438c-4a64-b679-ae501e7d6091"
    },
    // NOTE(review): confirm these MP3 fields against the current Cartesia API
    // reference; they are kept exactly as in the original example.
    output_format: {
      container: "mp3",
      encoding: "mp3",
      sample_rate: 44100
    }
  });

  // `audio/mpeg` is the registered MIME type for MP3 (`audio/mp3` is not).
  return new Response(response.audio, {
    headers: { 'Content-Type': 'audio/mpeg' }
  });
}

Real-time Streaming

Stream audio in real time:
'use client';
import { useState } from 'react';
import Cartesia from '@cartesia/cartesia-js';

/**
 * Button that synthesizes a fixed greeting through Cartesia's SSE endpoint and
 * plays the streamed raw PCM audio with the Web Audio API.
 *
 * Expects `voiceId` to be defined by the surrounding module.
 */
export default function VoiceChat() {
  const [isPlaying, setIsPlaying] = useState(false);

  /**
   * Requests speech for `text` and plays the chunks back-to-back.
   * The button stays disabled until the scheduled audio has finished or an
   * error has been logged.
   */
  async function generateAndPlay(text: string) {
    // Used for both the request and the AudioBuffers so they cannot drift apart.
    const sampleRate = 16000;

    setIsPlaying(true);
    const audioContext = new AudioContext();

    try {
      // NOTE: Next.js inlines NEXT_PUBLIC_* variables into the browser bundle,
      // so this key is visible to every visitor. For production, proxy the
      // request through a server route instead of calling Cartesia from here.
      const cartesia = new Cartesia({
        apiKey: process.env.NEXT_PUBLIC_CARTESIA_KEY,
      });

      const response = await cartesia.tts.sse({
        model_id: "sonic",
        transcript: text,
        voice: { mode: "id", id: voiceId },
        output_format: {
          container: "raw",
          encoding: "pcm_s16le",
          sample_rate: sampleRate
        }
      });

      // Time (on the AudioContext clock) at which the next chunk should start.
      let nextStartTime = audioContext.currentTime;

      for await (const chunk of response) {
        // The requested format is headerless PCM, which decodeAudioData()
        // cannot parse, so the samples are converted by hand.
        // Assumes each chunk is a sample-aligned ArrayBuffer, as the earlier
        // decodeAudioData(chunk) call implied — TODO confirm against the SDK.
        const view = new DataView(chunk);
        const sampleCount = Math.floor(view.byteLength / 2);
        if (sampleCount === 0) continue;

        const audioBuffer = audioContext.createBuffer(1, sampleCount, sampleRate);
        const channel = audioBuffer.getChannelData(0);
        for (let i = 0; i < sampleCount; i++) {
          // Signed 16-bit little-endian -> float in [-1, 1).
          channel[i] = view.getInt16(i * 2, true) / 32768;
        }

        const source = audioContext.createBufferSource();
        source.buffer = audioBuffer;
        source.connect(audioContext.destination);

        // Queue each chunk after the previous one instead of starting them all
        // at once; if the stream fell behind, restart from "now".
        nextStartTime = Math.max(nextStartTime, audioContext.currentTime);
        source.start(nextStartTime);
        nextStartTime += audioBuffer.duration;
      }

      // Keep the "Playing..." state until the queued audio has been heard.
      const remainingMs = (nextStartTime - audioContext.currentTime) * 1000;
      if (remainingMs > 0) {
        await new Promise((resolve) => setTimeout(resolve, remainingMs));
      }
    } catch (error) {
      console.error('Speech playback failed:', error);
    } finally {
      // Always release the audio device and re-enable the button.
      await audioContext.close();
      setIsPlaying(false);
    }
  }

  return (
    <button
      onClick={() => generateAndPlay("Hello from CARTER!")}
      disabled={isPlaying}
    >
      {isPlaying ? 'Playing...' : 'Speak'}
    </button>
  );
}

WebSocket Chat

Low-latency voice chat:
import Cartesia from '@cartesia/cartesia-js';

/**
 * Text-to-speech chat over a Cartesia WebSocket.
 *
 * The socket is opened lazily by `say()`, shared by concurrent callers, and
 * forgotten once it closes so that the next `say()` reconnects.
 */
class VoiceChat {
  /**
   * @param {string} apiKey - API key handed to the Cartesia client.
   * @param {string} voiceId - Voice used for every utterance.
   */
  constructor(apiKey, voiceId) {
    this.cartesia = new Cartesia({ apiKey });
    this.voiceId = voiceId;
    this.ws = null;
    // Promise of a connection attempt that is still in flight, if any.
    this.connecting = null;
    // Created on first playback and reused for every chunk.
    this.audioContext = null;
  }

  /**
   * Opens the WebSocket unless one is already open. Calls made while a
   * connection attempt is in flight wait for that attempt instead of opening
   * a second socket.
   */
  async connect() {
    if (this.ws) return;

    if (!this.connecting) {
      this.connecting = this.openSocket();
    }
    try {
      await this.connecting;
    } finally {
      this.connecting = null;
    }
  }

  /** Creates the socket, wires up its handlers and stores it on `this.ws`. */
  async openSocket() {
    const ws = await this.cartesia.tts.websocket({
      model_id: "sonic",
      voice: { mode: "id", id: this.voiceId },
      output_format: {
        container: "raw",
        encoding: "pcm_s16le",
        sample_rate: 16000
      }
    });

    ws.on('message', (audioChunk) => {
      this.playAudio(audioChunk);
    });

    ws.on('close', () => {
      console.log('Connection closed');
      // Only forget the socket that actually closed; a newer one may exist.
      if (this.ws === ws) {
        this.ws = null;
      }
    });

    this.ws = ws;
  }

  /**
   * Sends `text` for synthesis, connecting first when no socket is open.
   * @param {string} text - Transcript to speak.
   */
  async say(text) {
    if (!this.ws) await this.connect();

    await this.ws.send({
      transcript: text,
      context_id: 'chat-session'
    });
  }

  /**
   * Playback hook called with every audio chunk received on the socket.
   * @param {*} chunk - Chunk as delivered by the socket's 'message' event.
   */
  playAudio(chunk) {
    // Implement audio playback
    // One shared AudioContext: creating a new one per chunk leaks contexts.
    if (!this.audioContext) {
      this.audioContext = new AudioContext();
    }
    const audioContext = this.audioContext;
    // ... decode and play chunk
  }

  /** Closes the socket and the audio context, if they exist. */
  async disconnect() {
    const ws = this.ws;
    // Clear first so a later say() reconnects instead of using a dead socket.
    this.ws = null;
    if (ws) {
      await ws.close();
    }

    const audioContext = this.audioContext;
    this.audioContext = null;
    if (audioContext) {
      await audioContext.close();
    }
  }
}

// Usage
// `apiKey` and `voiceId` must already be defined by the surrounding code.
// No explicit connect() call is needed: say() connects first when no socket is open.
// The bare `await` requires a context where top-level await is allowed (an ES
// module) or an enclosing async function.
const chat = new VoiceChat(apiKey, voiceId);
await chat.say("Hello!");

Emotional Voice

Control emotions dynamically:
/**
 * Synthesizes `text` with the emotion tags associated with `emotion`.
 *
 * Expects `apiKey` and `voiceId` to be defined by the surrounding module.
 *
 * @param {string} text - Transcript to speak.
 * @param {string} emotion - One of "happy", "sad", "angry", "curious",
 *   "neutral"; any other value is treated like "neutral" (no emotion tags).
 * @returns The `audio` property of the SDK response.
 */
async function speakWithEmotion(text, emotion) {
  const cartesia = new Cartesia({ apiKey });

  // NOTE(review): "excitement" is not an emotion name I can confirm from the
  // Cartesia documentation — verify the tag list against the API reference.
  const emotionMap = {
    happy: ["positivity:highest", "excitement"],
    sad: ["sadness"],
    angry: ["anger"],
    curious: ["curiosity"],
    neutral: []
  };

  // Look up own keys only: a plain `emotionMap[emotion]` would resolve names
  // such as "constructor" or "toString" to functions on Object.prototype and
  // send those to the API instead of falling back to an empty list.
  const emotionTags = Object.prototype.hasOwnProperty.call(emotionMap, emotion)
    ? emotionMap[emotion]
    : [];

  const response = await cartesia.tts.bytes({
    model_id: "sonic",
    transcript: text,
    voice: { mode: "id", id: voiceId },
    _experimental_voice_controls: {
      emotion: emotionTags
    }
  });

  return response.audio;
}

// Usage
// Each call resolves with the audio returned by speakWithEmotion(); the result
// is discarded here, so capture it if the audio should be played or stored.
await speakWithEmotion("I'm so excited!", "happy");
await speakWithEmotion("This is unfortunate.", "sad");

Voice Cloning

Clone a custom voice:
import os

import cartesia

# Read the API key from the environment so it is never hard-coded in source or
# committed to version control. Raises KeyError if CARTESIA_API_KEY is unset.
client = cartesia.Cartesia(api_key=os.environ["CARTESIA_API_KEY"])

# Read audio samples
sample_paths = ["sample1.wav", "sample2.wav", "sample3.wav"]
audio_samples = []
for sample_path in sample_paths:
    with open(sample_path, "rb") as sample_file:
        audio_samples.append(sample_file.read())

# Create cloned voice
# NOTE(review): confirm the `voices.create` parameters against the installed
# SDK version; they are kept as in the original example.
voice = client.voices.create(
    name="My Custom Voice",
    description="Cloned from my samples",
    audio_files=audio_samples,
)

print(f"Voice ID: {voice['id']}")

# Use the cloned voice
output = client.tts.bytes(
    model_id="sonic",
    transcript="Hello in my custom voice!",
    voice_id=voice['id'],
)

Error Handling

Robust error handling example:
/**
 * Generates speech for `text`, retrying failed requests.
 *
 * Expects `cartesia` and `voiceId` to be defined by the surrounding module.
 *
 * - HTTP 429: waits with exponential backoff (1s, 2s, 4s, ...) before the
 *   next attempt.
 * - HTTP 401: fails immediately with `Error('Invalid API key')`.
 * - Any other error: retried immediately.
 *
 * @param {string} text - Transcript to speak.
 * @param {number} [maxRetries=3] - Total number of attempts; must be >= 1.
 * @returns The `audio` property of the first successful response.
 * @throws The error of the final attempt once all attempts are used up, or a
 *   RangeError when `maxRetries` allows no attempt at all.
 */
async function generateSpeechWithRetry(text, maxRetries = 3) {
  let lastError;

  for (let attempt = 0; attempt < maxRetries; attempt++) {
    try {
      const response = await cartesia.tts.bytes({
        model_id: "sonic",
        transcript: text,
        voice: { mode: "id", id: voiceId }
      });

      return response.audio;

    } catch (error) {
      lastError = error;
      console.error(`Attempt ${attempt + 1} failed:`, error);

      const status = error == null ? undefined : error.status;

      if (status === 401) {
        // Retrying cannot fix bad credentials.
        throw new Error('Invalid API key');
      }

      if (attempt === maxRetries - 1) {
        // Out of attempts: do not sleep for a retry that will never happen.
        break;
      }

      if (status === 429) {
        // Rate limit - exponential backoff
        const delay = Math.pow(2, attempt) * 1000;
        console.log(`Rate limited. Waiting ${delay}ms...`);
        await new Promise(resolve => setTimeout(resolve, delay));
      }
    }
  }

  // Reaching this point means no attempt succeeded. Reject explicitly rather
  // than resolving with `undefined`, which callers would mistake for audio.
  if (lastError === undefined) {
    throw new RangeError('maxRetries must be at least 1');
  }
  throw lastError;
}

Context Management

Maintain conversation context:
/**
 * Builds a conversation context id of the form `conv-<timestamp>-<random>`.
 * The random suffix keeps ids distinct even when several are created within
 * the same millisecond, which a bare timestamp cannot guarantee.
 */
function createContextId() {
  const randomSuffix = Math.random().toString(36).slice(2, 10);
  return `conv-${Date.now()}-${randomSuffix}`;
}

/**
 * Speaks a sequence of utterances that share one `context_id`.
 */
class ConversationManager {
  /**
   * @param {object} cartesia - Cartesia client; `cartesia.tts.sse` is used.
   * @param {string} voiceId - Voice used for every utterance.
   */
  constructor(cartesia, voiceId) {
    this.cartesia = cartesia;
    this.voiceId = voiceId;
    this.contextId = createContextId();
  }

  /**
   * Streams speech for `text` and hands every chunk to `playAudio`, waiting
   * for each chunk to be handled before taking the next one.
   * @param {string} text - Transcript to speak.
   */
  async speak(text) {
    const response = await this.cartesia.tts.sse({
      model_id: "sonic",
      transcript: text,
      voice: { mode: "id", id: this.voiceId },
      context_id: this.contextId,  // Maintain context
      output_format: {
        container: "raw",
        encoding: "pcm_s16le",
        sample_rate: 16000
      }
    });

    for await (const chunk of response) {
      await this.playAudio(chunk);
    }
  }

  /**
   * Playback hook; intentionally left empty in this example.
   * @param {*} chunk - One chunk yielded by the SSE response.
   */
  async playAudio(chunk) {
    // Audio playback implementation
  }

  /** Starts a new conversation: later speak() calls use a fresh context id. */
  resetContext() {
    this.contextId = createContextId();
  }
}

// Usage
// `cartesia` (a client instance) and `voiceId` must already be defined by the
// surrounding code.
const conversation = new ConversationManager(cartesia, voiceId);
await conversation.speak("Hello, how are you?");
await conversation.speak("That's great to hear!"); // Uses same context

Complete Chat Application

Full example with UI:
'use client';
import { useState, useRef } from 'react';
import Cartesia from '@cartesia/cartesia-js';

export default function CarterClone() {
  const [messages, setMessages] = useState([]);
  const [input, setInput] = useState('');
  const [isGenerating, setIsGenerating] = useState(false);
  
  const cartesiaRef = useRef(null);
  
  if (!cartesiaRef.current) {
    cartesiaRef.current = new Cartesia({
      apiKey: process.env.NEXT_PUBLIC_CARTESIA_KEY
    });
  }
  
  async function handleSend() {
    if (!input.trim() || isGenerating) return;
    
    const userMessage = { role: 'user', content: input };
    setMessages(prev => [...prev, userMessage]);
    setInput('');
    setIsGenerating(true);
    
    try {
      // Generate speech
      const response = await cartesiaRef.current.tts.bytes({
        model_id: "sonic",
        transcript: input,
        voice: { mode: "id", id: "voice-id" },
        _experimental_voice_controls: {
          emotion: ["curiosity"]
        }
      });
      
      // Play audio
      const audio = new Audio();
      audio.src = URL.createObjectURL(
        new Blob([response.audio], { type: 'audio/mp3' })
      );
      await audio.play();
      
      const aiMessage = { role: 'assistant', content: input };
      setMessages(prev => [...prev, aiMessage]);
      
    } catch (error) {
      console.error('Error:', error);
      alert('Failed to generate speech');
    } finally {
      setIsGenerating(false);
    }
  }
  
  return (
    <div className="flex flex-col h-screen">
      <div className="flex-1 overflow-y-auto p-4">
        {messages.map((msg, i) => (
          <div key={i} className={msg.role}>
            {msg.content}
          </div>
        ))}
      </div>
      
      <div className="p-4 border-t flex gap-2">
        <input
          value={input}
          onChange={(e) => setInput(e.target.value)}
          onKeyPress={(e) => e.key === 'Enter' && handleSend()}
          placeholder="Type a message..."
          className="flex-1 px-4 py-2 border rounded"
          disabled={isGenerating}
        />
        <button
          onClick={handleSend}
          disabled={isGenerating || !input.trim()}
          className="px-6 py-2 bg-cyan-500 text-white rounded"
        >
          {isGenerating ? 'Generating...' : 'Send'}
        </button>
      </div>
    </div>
  );
}

Resources

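Always keep your API keys secure. Never expose them in client-side code or commit them to version control. Note that the client-side examples above read the key from `NEXT_PUBLIC_CARTESIA_KEY`, and Next.js inlines every `NEXT_PUBLIC_` variable into the browser bundle; this is acceptable for local experiments only. In production, keep the key on the server and proxy requests through a route such as `app/api/tts/route.ts`.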