// Source: voice-react.tsx — deathbyknowledge/agents (13031 bytes)
import { useEffect, useRef, useState, useCallback, useMemo } from "react";
import {
  VoiceClient,
  type VoiceClientOptions,
  type VoiceStatus,
  type TranscriptMessage,
  type VoicePipelineMetrics
} from "./voice-client";

// Re-export types so consumers can import everything from agents/voice-react
export type {
  VoiceStatus,
  VoiceRole,
  VoiceAudioFormat,
  VoiceAudioInput,
  VoiceTransport,
  TranscriptMessage,
  VoicePipelineMetrics,
  VoiceClientOptions,
  VoiceClientEvent,
  VoiceClientEventMap
} from "./voice-client";
export { WebSocketVoiceTransport } from "./voice-client";

/**
 * Options accepted by useVoiceAgent.
 *
 * Includes every VoiceClientOptions field (the whole options object is handed
 * to the VoiceClient constructor) plus hook-only callbacks.
 */
export interface UseVoiceAgentOptions extends VoiceClientOptions {
  /**
   * Called when the hook tears down its client and connects a new one because
   * one of the connection-identity options changed: `agent`, `name`, `host`,
   * `silenceThreshold`, `silenceDurationMs`, `interruptThreshold` or
   * `interruptChunks`. Not called for the initial connection on mount.
   * Use this to show a toast or notification.
   *
   * The hook always invokes the callback from the most recent render, so it
   * does not need to be memoised.
   */
  onReconnect?: () => void;
}

/** State and actions returned by useVoiceAgent. */
export interface UseVoiceAgentReturn {
  /** Latest value from the client's "statuschange" event. Starts as "idle". */
  status: VoiceStatus;
  /** Latest value from the client's "transcriptchange" event. Starts empty. */
  transcript: TranscriptMessage[];
  /**
   * The current interim (partial) transcript from streaming STT.
   * Updates in real time as the user speaks. null when not available.
   */
  interimTranscript: string | null;
  /** Latest value from the client's "metricschange" event. Starts as null. */
  metrics: VoicePipelineMetrics | null;
  /** Latest value from the client's "audiolevelchange" event. Starts at 0. */
  audioLevel: number;
  /** Latest value from the client's "mutechange" event. Starts as false. */
  isMuted: boolean;
  /** Latest value from the client's "connectionchange" event. Starts as false. */
  connected: boolean;
  /** Latest value from the client's "error" event. Starts as null. */
  error: string | null;
  /** Delegates to VoiceClient.startCall() on the current client. */
  startCall: () => Promise<void>;
  /** Delegates to VoiceClient.endCall() on the current client. */
  endCall: () => void;
  /** Delegates to VoiceClient.toggleMute() on the current client. */
  toggleMute: () => void;
  /** Delegates to VoiceClient.sendText() on the current client. */
  sendText: (text: string) => void;
  /** Send arbitrary JSON to the agent (app-level messages). */
  sendJSON: (data: Record<string, unknown>) => void;
  /**
   * The last non-voice-protocol message received from the server
   * (payload of the client's "custommessage" event). Initially null.
   */
  lastCustomMessage: unknown;
}

// ---------------------------------------------------------------------------
// useVoiceInput — lightweight hook for voice-to-text dictation
// ---------------------------------------------------------------------------

/**
 * Options accepted by useVoiceInput.
 *
 * Every field is part of the hook's connection identity: changing any of them
 * disconnects the current VoiceClient and connects a new one.
 */
export interface UseVoiceInputOptions {
  /** Agent name (matches the server-side Durable Object class). */
  agent: string;
  /** Instance name for the agent. @default "default" */
  name?: string;
  /** Host to connect to. @default window.location.host */
  host?: string;

  /** RMS threshold below which audio is considered silence. @default 0.04 */
  silenceThreshold?: number;
  /** How long silence must last before sending end_of_speech (ms). @default 500 */
  silenceDurationMs?: number;
}

/** State and actions returned by useVoiceInput. */
export interface UseVoiceInputReturn {
  /**
   * Accumulated final transcript text from all utterances: the text of the
   * client's user-role transcript messages joined with single spaces.
   */
  transcript: string;
  /**
   * Current interim (partial) transcript from streaming STT.
   * Updates in real time as the user speaks. null when not available.
   */
  interimTranscript: string | null;
  /**
   * Whether the mic is actively listening. True while the client status is
   * "listening" or "thinking".
   */
  isListening: boolean;
  /** Current audio level (0–1) for visual feedback (e.g. waveform). */
  audioLevel: number;
  /** Whether the mic is muted. */
  isMuted: boolean;
  /** Any error message. */
  error: string | null;
  /**
   * Start listening — requests mic permission and begins streaming audio.
   * Delegates to VoiceClient.startCall().
   */
  start: () => Promise<void>;
  /** Stop listening — releases the mic. Delegates to VoiceClient.endCall(). */
  stop: () => void;
  /** Toggle mute (mic stays open but audio is not sent). */
  toggleMute: () => void;
  /** Clear the accumulated transcript. */
  clear: () => void;
}

/**
 * React hook for voice-to-text input. Captures microphone audio, streams it
 * to a server-side VoiceAgent for STT, and returns the transcript as a string.
 *
 * Unlike `useVoiceAgent`, this hook is optimised for dictation — it accumulates
 * user transcripts into a single string and ignores assistant responses / TTS.
 *
 * A `VoiceClient` is created and connected on mount. It is torn down and
 * replaced whenever any option (`agent`, `name`, `host`, `silenceThreshold`,
 * `silenceDurationMs`) changes, and disconnected on unmount.
 *
 * The returned action callbacks are referentially stable. They are no-ops
 * when no client exists (before the mount effect has run, or after unmount).
 *
 * @param options Connection and silence-detection settings for the client.
 * @returns Reactive dictation state plus `start` / `stop` / `toggleMute` /
 *   `clear` actions.
 *
 * @example
 * ```tsx
 * const { transcript, interimTranscript, isListening, start, stop } = useVoiceInput({
 *   agent: "voice-input-agent"
 * });
 *
 * <textarea value={transcript + (interimTranscript ? " " + interimTranscript : "")} />
 * <button onClick={isListening ? stop : start}>
 *   {isListening ? "Stop" : "Dictate"}
 * </button>
 * ```
 */
export function useVoiceInput(
  options: UseVoiceInputOptions
): UseVoiceInputReturn {
  // String key over every option; the connection effect re-runs only when
  // this value changes, not when the `options` object identity changes.
  const connectionKey = useMemo(
    () =>
      `${options.agent}:${options.name ?? "default"}:${options.host ?? ""}:${options.silenceThreshold ?? ""}:${options.silenceDurationMs ?? ""}`,
    [
      options.agent,
      options.name,
      options.host,
      options.silenceThreshold,
      options.silenceDurationMs
    ]
  );

  const clientRef = useRef<VoiceClient | null>(null);

  // Transcript messages that were present when `clear()` was last called.
  // The client keeps its own message history, so without this the next
  // "transcriptchange" would rebuild the string from that full history and
  // bring back text the caller just cleared.
  // NOTE(review): relies on the client keeping the same message objects
  // across transcript updates — confirm against VoiceClient. If it replaces
  // them, previously cleared text reappears on the next update.
  const clearedMessagesRef = useRef<WeakSet<object>>(new WeakSet<object>());

  const [transcript, setTranscript] = useState("");
  const [interimTranscript, setInterimTranscript] = useState<string | null>(
    null
  );
  const [isListening, setIsListening] = useState(false);
  const [audioLevel, setAudioLevel] = useState(0);
  const [isMuted, setIsMuted] = useState(false);
  const [error, setError] = useState<string | null>(null);

  // Connect on mount or when connection identity changes
  useEffect(() => {
    // Reset per-connection state. The accumulated transcript is intentionally
    // left untouched here; it is only replaced by transcript events or clear().
    setIsListening(false);
    setInterimTranscript(null);
    setAudioLevel(0);
    setIsMuted(false);
    setError(null);

    const client = new VoiceClient({
      agent: options.agent,
      name: options.name,
      host: options.host,
      silenceThreshold: options.silenceThreshold,
      silenceDurationMs: options.silenceDurationMs
    });
    clientRef.current = client;
    client.connect();

    // Sync user transcripts into a single accumulated string, skipping any
    // message that was dismissed by clear().
    const onTranscript = () => {
      const cleared = clearedMessagesRef.current;
      const userTexts = client.transcript
        .filter((m) => m.role === "user" && !cleared.has(m))
        .map((m) => m.text);
      setTranscript(userTexts.join(" "));
    };

    const onInterim = () => setInterimTranscript(client.interimTranscript);
    const onAudioLevel = () => setAudioLevel(client.audioLevel);
    const onMute = () => setIsMuted(client.isMuted);
    const onError = () => setError(client.error);

    // "thinking" still counts as listening for this hook.
    const onStatus = () => {
      const s = client.status;
      setIsListening(s === "listening" || s === "thinking");
    };

    client.addEventListener("transcriptchange", onTranscript);
    client.addEventListener("interimtranscript", onInterim);
    client.addEventListener("audiolevelchange", onAudioLevel);
    client.addEventListener("mutechange", onMute);
    client.addEventListener("error", onError);
    client.addEventListener("statuschange", onStatus);

    return () => {
      // Detach listeners before disconnecting so teardown events from the old
      // client cannot write into React state.
      client.removeEventListener("transcriptchange", onTranscript);
      client.removeEventListener("interimtranscript", onInterim);
      client.removeEventListener("audiolevelchange", onAudioLevel);
      client.removeEventListener("mutechange", onMute);
      client.removeEventListener("error", onError);
      client.removeEventListener("statuschange", onStatus);
      client.disconnect();
      // Release the ref so actions invoked after teardown become no-ops
      // instead of driving a disconnected client.
      if (clientRef.current === client) {
        clientRef.current = null;
      }
    };
    // eslint-disable-next-line react-hooks/exhaustive-deps -- reconnect when connection identity changes
  }, [connectionKey]);

  // Stable action callbacks — resolve the client at call time so they always
  // target the current connection, and do nothing when there is none.
  const start = useCallback(
    () => clientRef.current?.startCall() ?? Promise.resolve(),
    []
  );
  const stop = useCallback(() => {
    clientRef.current?.endCall();
  }, []);
  const toggleMute = useCallback(() => {
    clientRef.current?.toggleMute();
  }, []);
  const clear = useCallback(() => {
    // Remember everything currently in the client's history so later
    // transcript events only contribute messages that arrive after this call.
    clearedMessagesRef.current = new WeakSet<object>(
      clientRef.current?.transcript ?? []
    );
    setTranscript("");
  }, []);

  return {
    transcript,
    interimTranscript,
    isListening,
    audioLevel,
    isMuted,
    error,
    start,
    stop,
    toggleMute,
    clear
  };
}

// ---------------------------------------------------------------------------
// useVoiceAgent — full-featured hook for conversational voice agents
// ---------------------------------------------------------------------------

/**
 * React hook that wraps VoiceClient, syncing its state into React state.
 * All audio infrastructure (mic capture, playback, silence/interrupt detection,
 * voice protocol) is handled by VoiceClient — this hook just bridges to React.
 *
 * When the connection identity changes — any of `agent`, `name`, `host`,
 * `silenceThreshold`, `silenceDurationMs`, `interruptThreshold` or
 * `interruptChunks` — the hook disconnects the old client, resets all mirrored
 * state (including `lastCustomMessage`), creates a new client, and connects it.
 * The `onReconnect` callback fires when this happens, but not for the initial
 * connection on mount.
 *
 * The full `options` object is passed to the VoiceClient constructor at
 * connection time. Changes to options outside the identity list above do not
 * take effect until the next reconnect.
 *
 * The returned action callbacks are referentially stable. They are no-ops
 * when no client exists (before the mount effect has run, or after unmount).
 *
 * @param options VoiceClient options plus the optional `onReconnect` callback.
 * @returns Mirrored client state and actions bound to the current client.
 */
export function useVoiceAgent(
  options: UseVoiceAgentOptions
): UseVoiceAgentReturn {
  // Derive a stable key from the connection-identity fields.
  // When this changes, we tear down the old client and create a new one.
  const connectionKey = useMemo(
    () =>
      `${options.agent}:${options.name ?? "default"}:${options.host ?? ""}:${options.silenceThreshold ?? ""}:${options.silenceDurationMs ?? ""}:${options.interruptThreshold ?? ""}:${options.interruptChunks ?? ""}`,
    [
      options.agent,
      options.name,
      options.host,
      options.silenceThreshold,
      options.silenceDurationMs,
      options.interruptThreshold,
      options.interruptChunks
    ]
  );

  const clientRef = useRef<VoiceClient | null>(null);
  // Key used by the previous effect run; lets the effect tell a reconnect
  // apart from the initial mount.
  const prevKeyRef = useRef(connectionKey);
  // Always points at the latest callback so the effect never calls a stale one
  // and the callback does not have to be part of the effect's dependencies.
  const onReconnectRef = useRef(options.onReconnect);
  onReconnectRef.current = options.onReconnect;

  // React state mirrors VoiceClient state
  const [status, setStatus] = useState<VoiceStatus>("idle");
  const [transcript, setTranscript] = useState<TranscriptMessage[]>([]);
  const [metrics, setMetrics] = useState<VoicePipelineMetrics | null>(null);
  const [audioLevel, setAudioLevel] = useState(0);
  const [isMuted, setIsMuted] = useState(false);
  const [connected, setConnected] = useState(false);
  const [error, setError] = useState<string | null>(null);
  const [interimTranscript, setInterimTranscript] = useState<string | null>(
    null
  );
  const [lastCustomMessage, setLastCustomMessage] = useState<unknown>(null);

  // Connect on mount or when connection identity changes
  useEffect(() => {
    const isReconnect = prevKeyRef.current !== connectionKey;
    prevKeyRef.current = connectionKey;

    // Fire reconnect callback (e.g., to show a toast)
    if (isReconnect) {
      onReconnectRef.current?.();
    }

    // Reset state for a fresh connection. lastCustomMessage is included so a
    // message from the previous connection is not attributed to the new one.
    setStatus("idle");
    setTranscript([]);
    setMetrics(null);
    setAudioLevel(0);
    setIsMuted(false);
    setConnected(false);
    setError(null);
    setInterimTranscript(null);
    setLastCustomMessage(null);

    const client = new VoiceClient(options);
    clientRef.current = client;
    client.connect();

    // Sync handlers — forward each event payload straight into React state
    const onStatus = (s: VoiceStatus) => setStatus(s);
    const onTranscript = (t: TranscriptMessage[]) => setTranscript(t);
    const onMetrics = (m: VoicePipelineMetrics | null) => setMetrics(m);
    const onAudioLevel = (level: number) => setAudioLevel(level);
    const onMute = (muted: boolean) => setIsMuted(muted);
    const onConnection = (c: boolean) => setConnected(c);
    const onError = (e: string | null) => setError(e);
    const onInterim = (text: string | null) => setInterimTranscript(text);
    const onCustom = (msg: unknown) => setLastCustomMessage(msg);

    client.addEventListener("statuschange", onStatus);
    client.addEventListener("transcriptchange", onTranscript);
    client.addEventListener("interimtranscript", onInterim);
    client.addEventListener("metricschange", onMetrics);
    client.addEventListener("audiolevelchange", onAudioLevel);
    client.addEventListener("mutechange", onMute);
    client.addEventListener("connectionchange", onConnection);
    client.addEventListener("error", onError);
    client.addEventListener("custommessage", onCustom);

    return () => {
      // Detach listeners before disconnecting so teardown events from the old
      // client cannot write into React state.
      client.removeEventListener("statuschange", onStatus);
      client.removeEventListener("transcriptchange", onTranscript);
      client.removeEventListener("interimtranscript", onInterim);
      client.removeEventListener("metricschange", onMetrics);
      client.removeEventListener("audiolevelchange", onAudioLevel);
      client.removeEventListener("mutechange", onMute);
      client.removeEventListener("connectionchange", onConnection);
      client.removeEventListener("error", onError);
      client.removeEventListener("custommessage", onCustom);
      client.disconnect();
      // Release the ref so actions invoked after teardown become no-ops
      // instead of driving a disconnected client.
      if (clientRef.current === client) {
        clientRef.current = null;
      }
    };
    // eslint-disable-next-line react-hooks/exhaustive-deps -- reconnect when connection identity changes
  }, [connectionKey]);

  // Stable action callbacks — resolve the client at call time so they always
  // target the current connection, and do nothing when there is none.
  const startCall = useCallback(
    () => clientRef.current?.startCall() ?? Promise.resolve(),
    []
  );
  const endCall = useCallback(() => {
    clientRef.current?.endCall();
  }, []);
  const toggleMute = useCallback(() => {
    clientRef.current?.toggleMute();
  }, []);
  const sendText = useCallback((text: string) => {
    clientRef.current?.sendText(text);
  }, []);
  const sendJSON = useCallback((data: Record<string, unknown>) => {
    clientRef.current?.sendJSON(data);
  }, []);

  return {
    status,
    transcript,
    interimTranscript,
    metrics,
    audioLevel,
    isMuted,
    connected,
    error,
    startCall,
    endCall,
    toggleMute,
    sendText,
    sendJSON,
    lastCustomMessage
  };
}