branch:
types.ts
8183 bytesRaw
/**
 * Shared types for the voice pipeline.
 *
 * Used by both the server (voice.ts) and client (voice-client.ts)
 * to ensure protocol consistency.
 */

// --- Protocol version ---

/**
 * Current voice protocol version.
 * Bump this when making backwards-incompatible wire protocol changes.
 * The server sends this in the initial `welcome` message so clients
 * can detect version mismatches.
 */
export const VOICE_PROTOCOL_VERSION = 1;

// --- Voice status ---

export type VoiceStatus = "idle" | "listening" | "thinking" | "speaking";

// --- Audio format ---

/** Audio format the server uses for binary audio payloads. */
export type VoiceAudioFormat = "mp3" | "pcm16" | "wav" | "opus";

// --- Conversation message role ---

export type VoiceRole = "user" | "assistant";

// --- Wire protocol: Client → Server ---

export type VoiceClientMessage =
  | { type: "hello"; protocol_version?: number }
  | { type: "start_call"; preferred_format?: VoiceAudioFormat }
  | { type: "end_call" }
  | { type: "start_of_speech" }
  | { type: "end_of_speech" }
  | { type: "interrupt" }
  | { type: "text_message"; text: string };

// --- Wire protocol: Server → Client ---

export type VoiceServerMessage =
  | { type: "welcome"; protocol_version: number }
  | { type: "status"; status: VoiceStatus }
  | { type: "audio_config"; format: VoiceAudioFormat; sampleRate?: number }
  | { type: "transcript"; role: VoiceRole; text: string }
  | { type: "transcript_start"; role: VoiceRole }
  | { type: "transcript_delta"; text: string }
  | { type: "transcript_end"; text: string }
  | { type: "transcript_interim"; text: string }
  | {
      type: "metrics";
      vad_ms: number;
      stt_ms: number;
      llm_ms: number;
      tts_ms: number;
      first_audio_ms: number;
      total_ms: number;
    }
  | { type: "error"; message: string };

// --- Pipeline metrics (structured form for consumers) ---

export interface VoicePipelineMetrics {
  vad_ms: number;
  stt_ms: number;
  llm_ms: number;
  tts_ms: number;
  first_audio_ms: number;
  total_ms: number;
}

// --- Transcript message (client-side enriched form) ---

export interface TranscriptMessage {
  role: VoiceRole;
  text: string;
  timestamp: number;
}

// --- Provider interfaces ---

export interface STTProvider {
  transcribe(audioData: ArrayBuffer, signal?: AbortSignal): Promise<string>;
}

export interface TTSProvider {
  synthesize(text: string, signal?: AbortSignal): Promise<ArrayBuffer | null>;
}

export interface StreamingTTSProvider {
  synthesizeStream(
    text: string,
    signal?: AbortSignal
  ): AsyncGenerator<ArrayBuffer>;
}

export interface VADProvider {
  checkEndOfTurn(
    audioData: ArrayBuffer
  ): Promise<{ isComplete: boolean; probability: number }>;
}

// --- Streaming STT ---

/**
 * Streaming speech-to-text provider.
 *
 * Unlike the batch `STTProvider`, this transcribes audio incrementally
 * as it arrives, producing interim and final results in real time.
 * This eliminates STT latency from the critical path — by the time
 * the user stops speaking, the transcript is already (nearly) ready.
 *
 * Session lifecycle is per-utterance: one session per speech segment.
 */
export interface StreamingSTTProvider {
  /** Create a new transcription session for one utterance. */
  createSession(options?: StreamingSTTSessionOptions): StreamingSTTSession;
}

export interface StreamingSTTSessionOptions {
  /** Language code (e.g. "en"). */
  language?: string;
  /** Abort signal — aborted on interrupt or disconnect. */
  signal?: AbortSignal;
  /**
   * Called when the provider produces an interim (unstable) transcript.
   * This text may change as more audio arrives.
   */
  onInterim?: (text: string) => void;
  /**
   * Called when the provider finalizes a transcript segment.
   * This text is stable and will not change.
   * A single utterance may produce multiple onFinal calls
   * (e.g. the provider segments by clause or sentence).
   */
  onFinal?: (text: string) => void;
  /**
   * Called when the provider detects end-of-turn server-side.
   * The transcript is the complete, stable text for this turn.
   *
   * When set, the voice pipeline will start processing (LLM + TTS)
   * immediately without waiting for the client to send end_of_speech.
   * This eliminates client-side silence detection latency.
   *
   * Not all providers support this — it is optional. Providers that
   * do not detect end-of-turn (e.g. Deepgram with endpointing disabled)
   * should leave this unused.
   */
  onEndOfTurn?: (text: string) => void;
}

export interface StreamingSTTSession {
  /**
   * Feed raw PCM audio (16kHz mono 16-bit LE).
   * Fire-and-forget — the session buffers internally as needed.
   */
  feed(chunk: ArrayBuffer): void;

  /**
   * Signal that the speaker has finished.
   * Flushes any buffered audio and returns the final, stable transcript.
   * The session is closed after this call and cannot be reused.
   */
  finish(): Promise<string>;

  /**
   * Abort the session and release resources immediately.
   * No final transcript is produced. Used on interrupt/disconnect.
   */
  abort(): void;
}

// --- Audio input ---

/**
 * Pluggable audio input source for VoiceClient.
 *
 * When provided via `VoiceClientOptions.audioInput`, VoiceClient delegates
 * mic capture to this object instead of using its built-in AudioWorklet.
 * The audio input is responsible for capturing audio and routing it to the
 * server (however it chooses — WebRTC, SFU, direct binary, etc.).
 *
 * It must call `onAudioLevel` with RMS values so VoiceClient can run
 * silence detection, interrupt detection, and update the audio level UI.
 *
 * @example
 * ```typescript
 * class SFUAudioInput implements VoiceAudioInput {
 *   onAudioLevel: ((rms: number) => void) | null = null;
 *   async start() {
 *     // Set up WebRTC peer connection, SFU session, etc.
 *     // In a monitoring loop, call this.onAudioLevel?.(rms)
 *   }
 *   stop() {
 *     // Tear down WebRTC
 *   }
 * }
 * ```
 */
export interface VoiceAudioInput {
  /** Start capturing audio. Called by VoiceClient on startCall(). */
  start(): Promise<void>;
  /** Stop capturing audio. Called by VoiceClient on endCall() or disconnect(). */
  stop(): void;
  /**
   * Set by VoiceClient before start(). The audio input must call this
   * with RMS audio level values on each frame so VoiceClient can run
   * silence detection, interrupt detection, and update the UI.
   */
  onAudioLevel: ((rms: number) => void) | null;

  /**
   * Set by VoiceClient before start(). If the audio input provides
   * raw PCM audio (16kHz mono 16-bit LE), call this callback and
   * VoiceClient will forward the data to the server via its transport.
   *
   * This is needed when audio reaches the server through the same
   * WebSocket as protocol messages (e.g. SFU in local dev where the
   * SFU adapter can't connect back to localhost).
   *
   * If the audio input routes audio to the server through an external
   * path (e.g. SFU WebSocket adapter in production), this can be left
   * unused — the audio will arrive on a separate connection.
   */
  onAudioData?: ((pcm: ArrayBuffer) => void) | null;
}

// --- Voice transport ---

/**
 * Abstraction over the data channel between client and server.
 * The default implementation wraps PartySocket (WebSocket).
 * Implement this interface to use WebRTC, SFU, or other transports.
 */
export interface VoiceTransport {
  /** Send a JSON-serializable message to the server. */
  sendJSON(data: Record<string, unknown>): void;
  /** Send raw binary audio to the server. */
  sendBinary(data: ArrayBuffer): void;

  /** Open the connection. */
  connect(): void;
  /** Close the connection and release resources. */
  disconnect(): void;

  /** Whether the transport is currently connected and ready to send. */
  readonly connected: boolean;

  // --- Event callbacks (set by VoiceClient) ---
  onopen: (() => void) | null;
  onclose: (() => void) | null;
  onerror: ((error?: unknown) => void) | null;
  /** Called when a JSON string message arrives from the server. */
  onmessage: ((data: string | ArrayBuffer | Blob) => void) | null;
}