// sentence-chunker.ts
/**
 * Sentence chunker — accumulates streaming text and yields complete sentences.
 *
 * Isolated and testable: no dependencies on the voice pipeline, Agent, or AI APIs.
 * Feed it tokens via `add()`, get back sentences via the return value.
 * Call `flush()` at end-of-stream to get any remaining text.
 *
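 * Current implementation: splits on sentence-ending punctuation (. ! ?) followed
 * by whitespace such as a space or newline. A terminator at the very end of the
 * buffered text is held back until more text arrives or `flush()` is called.
 * This is intentionally simple — optimize later with better heuristics
 * (abbreviations, decimal numbers, quoted speech, etc.).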
 */

/**
 * Characters treated as sentence-ending punctuation: period, exclamation
 * mark, and question mark. Typed as read-only because the set is shared
 * module state and is only ever queried.
 */
const SENTENCE_TERMINATORS: ReadonlySet<string> = new Set<string>([
  ".",
  "!",
  "?",
]);

/**
 * Minimum character count before we'll emit a sentence.
 *
 * The length is measured on the trimmed text from the start of the buffer up
 * to and including the terminator. This keeps fragments such as "Dr." or
 * "U.S." at the start of pending text from being emitted as standalone
 * sentences, while still allowing short responses like "Sure thing!" to
 * stream quickly.
 *
 * NOTE: this is only a heuristic. An abbreviation that appears after enough
 * preceding text (e.g. "Please see Dr. Smith") still satisfies the minimum
 * and is treated as a sentence boundary.
 */
const MIN_SENTENCE_LENGTH = 10;

/**
 * Accumulates streamed text and emits it one complete sentence at a time.
 *
 * @example
 * const chunker = new SentenceChunker();
 * chunker.add("Hello there, world.");  // [] — terminator is the last char so far
 * chunker.add(" How are");             // ["Hello there, world."]
 * chunker.flush();                     // ["How are"]
 */
export class SentenceChunker {
  /** Text received so far that has not yet been emitted as a sentence. */
  #buffer = "";

  /**
   * Add a chunk of text (e.g. a streamed LLM token).
   *
   * @param text - Text to append to the pending buffer.
   * @returns The complete sentences now available, in order. May contain
   *   zero, one, or several sentences depending on the input.
   */
  add(text: string): string[] {
    this.#buffer += text;
    return this.#extractSentences();
  }

  /**
   * Flush any remaining text in the buffer as a final sentence.
   * Call this when the LLM stream ends.
   *
   * @returns A one-element array holding the trimmed remaining text, or an
   *   empty array if nothing but whitespace is left. The buffer is emptied
   *   either way.
   */
  flush(): string[] {
    const remaining = this.#buffer.trim();
    this.#buffer = "";
    return remaining.length > 0 ? [remaining] : [];
  }

  /**
   * Reset the chunker, discarding any buffered text without emitting it.
   */
  reset(): void {
    this.#buffer = "";
  }

  /**
   * Repeatedly cut complete sentences off the front of the buffer.
   *
   * Each emitted sentence is trimmed, and the whitespace separating it from
   * the following text is dropped from the buffer. Text after the last
   * detected boundary stays buffered until more input arrives or `flush()`
   * is called.
   */
  #extractSentences(): string[] {
    const sentences: string[] = [];

    let boundary = this.#findSentenceBoundary();
    while (boundary !== -1) {
      // The slice always ends in a terminator, so it is never empty after trim.
      sentences.push(this.#buffer.slice(0, boundary + 1).trim());
      this.#buffer = this.#buffer.slice(boundary + 1).trimStart();
      boundary = this.#findSentenceBoundary();
    }

    return sentences;
  }

  /**
   * Find the index of the terminator that ends the first complete sentence.
   *
   * A terminator (. ! ?) counts as a boundary only when:
   * - it is immediately followed by a space, tab, carriage return, or newline, and
   * - the text from the start of the buffer through the terminator, ignoring
   *   leading whitespace, is at least `MIN_SENTENCE_LENGTH` characters long.
   *
   * A terminator that is the last character of the buffer is never a boundary:
   * more text might follow (e.g. "3.14" or "Dr. Smith").
   *
   * @returns The terminator's index in the buffer, or -1 if there is no
   *   complete sentence yet.
   */
  #findSentenceBoundary(): number {
    const buffer = this.#buffer;

    // Leading whitespace does not count toward the minimum length. Terminators
    // are never whitespace, so this offset is the same for every candidate.
    const leadingWhitespace = buffer.length - buffer.trimStart().length;

    // Stop one short of the end: the final character has no successor yet.
    for (let i = 0; i < buffer.length - 1; i++) {
      if (!SENTENCE_TERMINATORS.has(buffer.charAt(i))) continue;

      const next = buffer.charAt(i + 1);
      // Accept "\r" and "\t" too, so CRLF and tab-separated text still splits.
      const followedByWhitespace =
        next === " " || next === "\n" || next === "\r" || next === "\t";
      if (!followedByWhitespace) continue;

      // Too-short candidates (e.g. "Dr.") are skipped; scanning continues so a
      // later terminator can close a longer sentence that contains them.
      if (i + 1 - leadingWhitespace >= MIN_SENTENCE_LENGTH) {
        return i;
      }
    }

    return -1;
  }
}