import type { TTSProvider, StreamingTTSProvider } from "@cloudflare/voice";

/** Construction options for {@link ElevenLabsTTS}. */
export interface ElevenLabsTTSOptions {
  /** ElevenLabs API key. */
  apiKey: string;
  /**
   * Voice ID. Browse voices at https://elevenlabs.io/app/voice-library
   * @default "JBFqnCBsd6RMkjVDRZzb" (George)
   */
  voiceId?: string;
  /** Model ID. @default "eleven_flash_v2_5" (lowest latency) */
  modelId?: string;
  /** Output format. @default "mp3_44100_128" */
  outputFormat?: string;
}

const DEFAULT_VOICE_ID = "JBFqnCBsd6RMkjVDRZzb"; // George
const DEFAULT_MODEL_ID = "eleven_flash_v2_5";
const DEFAULT_OUTPUT_FORMAT = "mp3_44100_128";

/**
 * ElevenLabs text-to-speech provider for the Agents voice pipeline.
 *
 * Implements both `TTSProvider` and `StreamingTTSProvider` from
 * `@cloudflare/voice`.
 *
 * - `synthesize(text)` — waits for the full audio response (simple, higher latency).
 * - `synthesizeStream(text)` — uses the ElevenLabs `/stream` endpoint, yielding audio
 *   chunks as they are generated (lower time-to-first-audio per sentence).
 *
 * Both methods are best-effort: failures are logged with `console.error`
 * and reported as `null` / an early end of the stream instead of throwing.
 *
 * Set as the `tts` provider on your VoiceAgent subclass:
 *
 * @example
 * ```typescript
 * import { Agent } from "agents";
 * import { withVoice } from "@cloudflare/voice";
 * import { ElevenLabsTTS } from "@cloudflare/voice-elevenlabs";
 *
 * const VoiceAgent = withVoice(Agent);
 *
 * export class MyAgent extends VoiceAgent {
 *   tts = new ElevenLabsTTS({ apiKey: this.env.ELEVENLABS_API_KEY });
 *
 *   async onTurn(transcript, context) { ... }
 * }
 * ```
 */
export class ElevenLabsTTS implements TTSProvider, StreamingTTSProvider {
  readonly #apiKey: string;
  readonly #voiceId: string;
  readonly #modelId: string;
  readonly #outputFormat: string;

  /**
   * @param options API key plus optional voice, model and output format;
   *   omitted optional fields fall back to the module defaults.
   */
  constructor(options: ElevenLabsTTSOptions) {
    this.#apiKey = options.apiKey;
    this.#voiceId = options.voiceId ?? DEFAULT_VOICE_ID;
    this.#modelId = options.modelId ?? DEFAULT_MODEL_ID;
    this.#outputFormat = options.outputFormat ?? DEFAULT_OUTPUT_FORMAT;
  }

  /**
   * Non-streaming TTS — sends the full text and waits for the complete
   * audio response. Simple but higher latency per sentence.
   *
   * @param text Text to synthesize.
   * @param signal Optional abort signal forwarded to `fetch`.
   * @returns The complete audio payload, or `null` when the request fails,
   *   is aborted, or the API answers with a non-2xx status (the failure is
   *   logged, never thrown).
   */
  async synthesize(
    text: string,
    signal?: AbortSignal
  ): Promise<ArrayBuffer | null> {
    try {
      const response = await this.#request("", text, signal);

      if (!response.ok) {
        console.error(
          `[ElevenLabsTTS] Error: ${response.status} ${response.statusText}`
        );
        return null;
      }

      return await response.arrayBuffer();
    } catch (error) {
      console.error("[ElevenLabsTTS] Error:", error);
      return null;
    }
  }

  /**
   * Streaming TTS — sends the full text and yields audio chunks as they
   * are generated by ElevenLabs. Uses the `/stream` endpoint which returns
   * a chunked HTTP response.
   *
   * This reduces time-to-first-audio within each sentence: the client
   * starts playing audio before the full sentence has been synthesized.
   *
   * If the consumer stops iterating early, or a read fails, the response
   * body is cancelled and its reader lock released so the connection is
   * not left streaming in the background.
   *
   * @param text Text to synthesize.
   * @param signal Optional abort signal forwarded to `fetch`.
   * @returns An async generator of non-empty audio chunks. On any failure
   *   the error is logged and the generator simply ends.
   */
  async *synthesizeStream(
    text: string,
    signal?: AbortSignal
  ): AsyncGenerator<ArrayBuffer> {
    try {
      const response = await this.#request("/stream", text, signal);

      if (!response.ok || !response.body) {
        console.error(
          `[ElevenLabsTTS] Stream error: ${response.status} ${response.statusText}`
        );
        return;
      }

      const reader = response.body.getReader();
      let drained = false;

      try {
        while (true) {
          const { done, value } = await reader.read();
          if (done) {
            drained = true;
            break;
          }
          if (value && value.byteLength > 0) {
            // `value` may be a view into a larger buffer; copy exactly the
            // viewed bytes into a standalone ArrayBuffer before yielding.
            yield value.slice().buffer;
          }
        }
      } finally {
        // Reached on normal completion, on a read error, and when the
        // consumer abandons the generator (`break` / `return()`).
        if (!drained) {
          try {
            await reader.cancel();
          } catch {
            // The stream has already errored; there is nothing to cancel.
          }
        }
        reader.releaseLock();
      }
    } catch (error) {
      console.error("[ElevenLabsTTS] Stream error:", error);
    }
  }

  /**
   * Issues the POST request shared by both synthesis modes.
   *
   * @param pathSuffix `""` for the buffered endpoint, `"/stream"` for the
   *   chunked one.
   * @param text Text to synthesize.
   * @param signal Optional abort signal forwarded to `fetch`.
   */
  #request(
    pathSuffix: "" | "/stream",
    text: string,
    signal?: AbortSignal
  ): Promise<Response> {
    // Voice ID and output format are caller-supplied; escape them so they
    // cannot alter the URL path or inject extra query parameters.
    const voice = encodeURIComponent(this.#voiceId);
    const format = encodeURIComponent(this.#outputFormat);

    return fetch(
      `https://api.elevenlabs.io/v1/text-to-speech/${voice}${pathSuffix}?output_format=${format}`,
      {
        method: "POST",
        headers: {
          "Content-Type": "application/json",
          "xi-api-key": this.#apiKey
        },
        body: JSON.stringify({
          text,
          model_id: this.#modelId
        }),
        signal
      }
    );
  }
}