// index.ts
import type { TTSProvider, StreamingTTSProvider } from "@cloudflare/voice";

/**
 * Construction options for the ElevenLabs text-to-speech provider.
 *
 * Only `apiKey` is required; every other field is optional.
 */
export interface ElevenLabsTTSOptions {
  /** ElevenLabs API key. */
  apiKey: string;
  /**
   * Voice ID. Browse voices at https://elevenlabs.io/app/voice-library
   *
   * @default "JBFqnCBsd6RMkjVDRZzb" (George)
   */
  voiceId?: string;
  /**
   * Model ID.
   *
   * @default "eleven_flash_v2_5" (lowest latency)
   */
  modelId?: string;
  /**
   * Output format.
   *
   * @default "mp3_44100_128"
   */
  outputFormat?: string;
}

// Fallback values applied when the corresponding option is omitted.

/** Default voice ID (the "George" voice). */
const DEFAULT_VOICE_ID = "JBFqnCBsd6RMkjVDRZzb"; // George
/** Default model ID. */
const DEFAULT_MODEL_ID = "eleven_flash_v2_5";
/** Default value for the `output_format` query parameter. */
const DEFAULT_OUTPUT_FORMAT = "mp3_44100_128";

/**
 * ElevenLabs text-to-speech provider for the Agents voice pipeline.
 *
 * Implements both `TTSProvider` and `StreamingTTSProvider` from `@cloudflare/voice`.
 *
 * - `synthesize(text)` — waits for the full audio response (simple, higher latency).
 * - `synthesizeStream(text)` — uses the ElevenLabs `/stream` endpoint, yielding audio
 *   chunks as they are generated (lower time-to-first-audio per sentence).
 *
 * Neither method throws on a failed request: failures are logged with
 * `console.error`, after which `synthesize` resolves to `null` and
 * `synthesizeStream` simply ends.
 *
 * Set as the `tts` provider on your VoiceAgent subclass:
 *
 * @example
 * ```typescript
 * import { Agent } from "agents";
 * import { withVoice } from "@cloudflare/voice";
 * import { ElevenLabsTTS } from "@cloudflare/voice-elevenlabs";
 *
 * const VoiceAgent = withVoice(Agent);
 *
 * export class MyAgent extends VoiceAgent<Env> {
 *   tts = new ElevenLabsTTS({ apiKey: this.env.ELEVENLABS_API_KEY });
 *
 *   async onTurn(transcript, context) { ... }
 * }
 * ```
 */
export class ElevenLabsTTS implements TTSProvider, StreamingTTSProvider {
  readonly #apiKey: string;
  readonly #voiceId: string;
  readonly #modelId: string;
  readonly #outputFormat: string;

  /**
   * @param options API key plus optional voice, model and output-format
   *   overrides; omitted optional fields fall back to the module defaults.
   */
  constructor(options: ElevenLabsTTSOptions) {
    this.#apiKey = options.apiKey;
    this.#voiceId = options.voiceId ?? DEFAULT_VOICE_ID;
    this.#modelId = options.modelId ?? DEFAULT_MODEL_ID;
    this.#outputFormat = options.outputFormat ?? DEFAULT_OUTPUT_FORMAT;
  }

  /**
   * Non-streaming TTS — sends the full text and waits for the complete
   * audio response. Simple but higher latency per sentence.
   *
   * @param text Text to synthesize.
   * @param signal Optional abort signal forwarded to `fetch`.
   * @returns The complete audio payload, or `null` if the request failed,
   *   was aborted, or returned a non-2xx status.
   */
  async synthesize(
    text: string,
    signal?: AbortSignal
  ): Promise<ArrayBuffer | null> {
    try {
      const response = await this.#request("", text, signal);

      if (!response.ok) {
        console.error(
          `[ElevenLabsTTS] Error: ${response.status} ${response.statusText}`
        );
        await this.#discardBody(response);
        return null;
      }

      return await response.arrayBuffer();
    } catch (error) {
      console.error("[ElevenLabsTTS] Error:", error);
      return null;
    }
  }

  /**
   * Streaming TTS — sends the full text and yields audio chunks as they
   * are generated by ElevenLabs. Uses the `/stream` endpoint which returns
   * a chunked HTTP response.
   *
   * This reduces time-to-first-audio within each sentence: the client
   * starts playing audio before the full sentence has been synthesized.
   *
   * If the consumer stops iterating early (or reading fails), the
   * underlying response stream is cancelled and its reader lock released.
   *
   * @param text Text to synthesize.
   * @param signal Optional abort signal forwarded to `fetch`.
   * @returns An async generator of non-empty audio chunks. It ends without
   *   yielding further chunks when the request fails or is aborted.
   */
  async *synthesizeStream(
    text: string,
    signal?: AbortSignal
  ): AsyncGenerator<ArrayBuffer> {
    try {
      const response = await this.#request("/stream", text, signal);

      if (!response.ok || !response.body) {
        console.error(
          `[ElevenLabsTTS] Stream error: ${response.status} ${response.statusText}`
        );
        await this.#discardBody(response);
        return;
      }

      const reader = response.body.getReader();
      let finished = false;
      try {
        while (true) {
          const { done, value } = await reader.read();
          if (done) {
            finished = true;
            break;
          }
          if (value && value.byteLength > 0) {
            // value is a Uint8Array that may be a view into a larger buffer —
            // yield a copy of exactly the bytes this chunk covers.
            yield value.buffer.slice(
              value.byteOffset,
              value.byteOffset + value.byteLength
            );
          }
        }
      } finally {
        // Reached on normal completion, on a read error, and when the
        // consumer abandons the generator mid-stream. Unless the stream ran
        // to completion, cancel it so the response body stops downloading.
        if (!finished) {
          await reader.cancel().catch(() => undefined);
        }
        reader.releaseLock();
      }
    } catch (error) {
      console.error("[ElevenLabsTTS] Stream error:", error);
    }
  }

  /**
   * Issues the text-to-speech POST request shared by both synthesis modes.
   *
   * The voice ID and output format are percent-encoded so reserved URL
   * characters in either value cannot alter the request path or query.
   *
   * @param pathSuffix `""` for the buffered endpoint, `"/stream"` for the
   *   chunked one.
   */
  #request(
    pathSuffix: "" | "/stream",
    text: string,
    signal?: AbortSignal
  ): Promise<Response> {
    const voice = encodeURIComponent(this.#voiceId);
    const format = encodeURIComponent(this.#outputFormat);
    return fetch(
      `https://api.elevenlabs.io/v1/text-to-speech/${voice}${pathSuffix}?output_format=${format}`,
      {
        method: "POST",
        headers: {
          "Content-Type": "application/json",
          "xi-api-key": this.#apiKey
        },
        body: JSON.stringify({
          text,
          model_id: this.#modelId
        }),
        signal
      }
    );
  }

  /** Best-effort cancellation of a response body that will not be read. */
  async #discardBody(response: Response): Promise<void> {
    try {
      await response.body?.cancel();
    } catch {
      // Nothing useful to do if cancelling fails; the failure itself has
      // already been reported by the caller.
    }
  }
}