// sentence-chunker.test.ts
import { describe, it, expect, beforeEach } from "vitest";
import { SentenceChunker } from "../sentence-chunker";

describe("SentenceChunker", () => {
  // Fresh instance per test so buffer state never leaks between cases.
  let splitter: SentenceChunker;

  beforeEach(() => {
    splitter = new SentenceChunker();
  });

  describe("add()", () => {
    it("returns nothing for a partial sentence", () => {
      // No terminator yet — everything stays buffered.
      expect(splitter.add("Hello, how are")).toEqual([]);
    });

    it("returns a sentence when text ends with terminator + space", () => {
      const emitted = splitter.add(
        "Hello, how are you doing today? I am doing well."
      );
      // Only the first sentence is emitted; the trailing one has no
      // following space/next sentence, so it remains in the buffer.
      expect(emitted).toEqual(["Hello, how are you doing today?"]);
    });

    it("accumulates across multiple add() calls", () => {
      // A sentence split across two calls emits once it completes.
      expect(splitter.add("Hello, how are you ")).toEqual([]);
      expect(splitter.add("doing today? ")).toEqual([
        "Hello, how are you doing today?"
      ]);
      expect(splitter.add("I'm great! ")).toEqual(["I'm great!"]);
      expect(splitter.add("Thanks for asking about it. ")).toEqual([
        "Thanks for asking about it."
      ]);
    });

    it("handles multiple sentences in one chunk", () => {
      const expected = [
        "This is the first sentence here.",
        "This is the second one right here.",
        "And a third one comes now."
      ];
      expect(
        splitter.add(
          "This is the first sentence here. This is the second one right here. And a third one comes now. "
        )
      ).toEqual(expected);
    });

    it("handles exclamation marks as terminators", () => {
      const emitted = splitter.add(
        "Wow that is so amazing! Tell me more about it. "
      );
      expect(emitted).toEqual([
        "Wow that is so amazing!",
        "Tell me more about it."
      ]);
    });

    it("handles question marks as terminators", () => {
      const emitted = splitter.add(
        "How are you doing today? I am doing very well. "
      );
      expect(emitted).toEqual([
        "How are you doing today?",
        "I am doing very well."
      ]);
    });

    it("does not split on very short fragments like 'Dr.'", () => {
      // "Dr." on its own is under MIN_SENTENCE_LENGTH (< 10 chars), so nothing emits.
      expect(splitter.add("Dr. ")).toEqual([]);
      // Once the buffered text grows long enough, the whole sentence emits intact.
      expect(splitter.add("Smith went to the store. ")).toEqual([
        "Dr. Smith went to the store."
      ]);
    });

    it("does not split on decimal numbers mid-word", () => {
      // The "." in "3.99" is followed by a digit, not a space — no split point.
      expect(splitter.add("The price was 3.99 ")).toEqual([]);
      const emitted = splitter.add(
        "dollars for everything today. Next item costs more. "
      );
      expect(emitted).toEqual([
        "The price was 3.99 dollars for everything today.",
        "Next item costs more."
      ]);
    });

    // TODO: Future optimisation — handle abbreviations like "Dr.", "U.S.", "etc."
    // These currently split if the preceding text is long enough.
    // See sentence-chunker.ts for the MIN_SENTENCE_LENGTH heuristic.
  });

  describe("flush()", () => {
    it("returns remaining buffer content", () => {
      splitter.add("This is an incomplete");
      expect(splitter.flush()).toEqual(["This is an incomplete"]);
    });

    it("returns empty array when buffer is empty", () => {
      expect(splitter.flush()).toEqual([]);
    });

    it("clears the buffer after flushing", () => {
      splitter.add("Some text here");
      splitter.flush();
      // Second flush finds nothing left.
      expect(splitter.flush()).toEqual([]);
    });

    it("returns remaining text after sentences have been extracted", () => {
      // The complete first sentence was already emitted by add();
      // only the trailing fragment remains for flush().
      splitter.add("First sentence is complete. Second is not");
      expect(splitter.flush()).toEqual(["Second is not"]);
    });
  });

  describe("reset()", () => {
    it("discards buffered text", () => {
      splitter.add("Some buffered text");
      splitter.reset();
      expect(splitter.flush()).toEqual([]);
    });
  });

  describe("streaming simulation", () => {
    it("simulates token-by-token LLM streaming", () => {
      // Tokens shaped like a typical LLM stream: sub-word pieces with
      // leading spaces and bare punctuation tokens.
      const tokens = [
        "Sure", ",", " I", "'d", " be", " happy", " to", " help", " you",
        " with", " that", ".", " Let", " me", " think", " about", " the",
        " best", " approach", " here", ".", " First", ",", " we", " should",
        " consider", " options", "."
      ];

      // Feed every token in order, then drain whatever is still buffered.
      const collected: string[] = [
        ...tokens.flatMap((token) => splitter.add(token)),
        ...splitter.flush()
      ];

      expect(collected).toEqual([
        "Sure, I'd be happy to help you with that.",
        "Let me think about the best approach here.",
        "First, we should consider options."
      ]);
    });
  });
});