// llm.spec.ts
import { test, expect } from "@playwright/test";

/**
 * E2E tests that make real LLM calls via Workers AI (@cf/moonshotai/kimi-k2.5).
 * These verify the full streaming pipeline: streamText → SSE → WebSocket → client.
 *
 * Uses the AI binding configured in wrangler.jsonc -- no API key needed.
 * The only OpenAI usage is BadKeyAgent, which tests error handling with an invalid key.
 */

// Wire-protocol message types exchanged with the agent over the WebSocket.
// The string values must match the server-side protocol constants exactly.
const MessageType = {
  CF_AGENT_USE_CHAT_REQUEST: "cf_agent_use_chat_request",
  CF_AGENT_USE_CHAT_RESPONSE: "cf_agent_use_chat_response",
  CF_AGENT_CHAT_CLEAR: "cf_agent_chat_clear"
} as const;

// Loosely-typed envelope for any JSON frame received over the socket;
// tests narrow on `type` (and fields like `done`/`body`) at the call site.
type WSMessage = {
  type: string;
  [key: string]: unknown;
};

/**
 * Builds the WebSocket URL for a chat-agent room from the HTTP base URL.
 * The replacement is anchored to the start of the string so only the scheme
 * is rewritten (http → ws, https → wss), never an "http" substring that
 * might appear later in the URL.
 */
function agentPath(baseURL: string, room: string) {
  return `${baseURL.replace(/^http/, "ws")}/agents/llm-chat-agent/${room}`;
}

/**
 * Opens a WebSocket, sends a chat request, collects all messages until done.
 */
/**
 * Opens a WebSocket to the agent, sends a single-user-message chat request,
 * and collects every JSON frame received until the server signals `done`
 * (or `timeoutMs` elapses, in which case the messages received so far are
 * returned and downstream assertions surface the failure).
 *
 * Runs inside the page context so the browser's native WebSocket is used.
 *
 * Fixes over the previous version: the fallback timeout is cleared once the
 * `done` signal arrives (it used to fire later and close/resolve a second
 * time), and a socket error cancels both timers before rejecting.
 */
async function sendChatAndCollect(
  page: import("@playwright/test").Page,
  wsUrl: string,
  userMessage: string,
  requestId: string,
  timeoutMs = 15000
): Promise<WSMessage[]> {
  return page.evaluate(
    ({ url, userMessage, requestId, timeoutMs, MT }) => {
      return new Promise<WSMessage[]>((resolve, reject) => {
        const ws = new WebSocket(url);
        const received: WSMessage[] = [];
        let settled = false;
        let poll: ReturnType<typeof setInterval> | undefined;
        let deadline: ReturnType<typeof setTimeout> | undefined;

        // Stop both timers, close the socket, and settle exactly once.
        const finish = (err?: Error) => {
          if (settled) return;
          settled = true;
          if (poll !== undefined) clearInterval(poll);
          if (deadline !== undefined) clearTimeout(deadline);
          ws.close();
          if (err) reject(err);
          else resolve(received);
        };

        ws.onmessage = (e) => {
          try {
            received.push(JSON.parse(e.data as string));
          } catch {
            // ignore non-JSON
          }
        };

        ws.onerror = () => finish(new Error("WebSocket error"));

        ws.onopen = () => {
          ws.send(
            JSON.stringify({
              type: MT.CF_AGENT_USE_CHAT_REQUEST,
              id: requestId,
              init: {
                method: "POST",
                body: JSON.stringify({
                  messages: [
                    {
                      id: `msg-${requestId}`,
                      role: "user",
                      parts: [{ type: "text", text: userMessage }]
                    }
                  ]
                })
              }
            })
          );

          // Poll for the terminal `done` response from the server.
          poll = setInterval(() => {
            const doneMsg = received.find(
              (m) => m.type === MT.CF_AGENT_USE_CHAT_RESPONSE && m.done === true
            );
            if (doneMsg) finish();
          }, 100);

          // Fallback: resolve with whatever arrived if the model stalls.
          deadline = setTimeout(() => finish(), timeoutMs);
        };
      });
    },
    { url: wsUrl, userMessage, requestId, timeoutMs, MT: MessageType }
  );
}

/** Extract parsed chunk bodies from stream responses */
/**
 * Parses the streamed chunk bodies out of chat-response frames.
 * Non-response frames, the terminal `done` frame, and frames without a
 * non-empty string body are skipped; remaining bodies are JSON-parsed
 * in arrival order.
 */
function extractChunkBodies(
  messages: WSMessage[]
): Array<{ type: string; [k: string]: unknown }> {
  const chunks: Array<{ type: string; [k: string]: unknown }> = [];
  for (const msg of messages) {
    if (msg.type !== MessageType.CF_AGENT_USE_CHAT_RESPONSE) continue;
    if (msg.done) continue;
    const body = msg.body;
    if (typeof body !== "string" || !body.trim()) continue;
    chunks.push(JSON.parse(body));
  }
  return chunks;
}

test.describe("LLM e2e (Workers AI)", () => {
  // Longer timeout for real LLM calls
  test.setTimeout(30_000);

  test.beforeEach(async ({ page }) => {
    await page.goto("about:blank");
  });

  test("SSE streaming: receives text-start, text-delta(s), text-end from real model", async ({
    page,
    baseURL
  }) => {
    const room = crypto.randomUUID();
    const collected = await sendChatAndCollect(
      page,
      agentPath(baseURL!, room),
      "Say hello in exactly 3 words.",
      "req-sse-1"
    );

    const chunks = extractChunkBodies(collected);
    const chunkTypes = chunks.map((c) => c.type);

    // SSE-style chunk framing must be present (not the plain-text format).
    for (const required of ["text-start", "text-delta", "text-end"]) {
      expect(chunkTypes).toContain(required);
    }

    // Reassemble the streamed text from the deltas.
    let fullText = "";
    for (const c of chunks) {
      if (c.type === "text-delta") fullText += c.delta as string;
    }

    // The model must have produced non-empty output.
    expect(fullText.length).toBeGreaterThan(0);
    console.log("LLM response:", fullText);

    // Stream lifecycle events must bracket the response.
    expect(chunkTypes).toContain("start");
    expect(chunkTypes).toContain("finish");
  });

  test("server-side tool call: model calls getWeather and returns result", async ({
    page,
    baseURL
  }) => {
    const room = crypto.randomUUID();
    const collected = await sendChatAndCollect(
      page,
      agentPath(baseURL!, room),
      "What is the weather in London?",
      "req-tool-1"
    );

    const chunks = extractChunkBodies(collected);
    const chunkTypes = chunks.map((c) => c.type);

    // Full tool lifecycle: the model calls getWeather, it executes
    // server-side, and the stream carries the result back to the client.
    expect(chunkTypes).toContain("tool-input-start");
    expect(chunkTypes).toContain("tool-input-available");
    expect(chunkTypes).toContain("tool-output-available");

    // The tool invocation should target getWeather.
    const toolInput = chunks.find((c) => c.type === "tool-input-available");
    expect(toolInput).toBeTruthy();
    expect(toolInput!.toolName).toBe("getWeather");

    // The execute function returns mocked data for London.
    const toolOutput = chunks.find((c) => c.type === "tool-output-available");
    expect(toolOutput).toBeTruthy();
    const output = toolOutput!.output as { city: string; temperature: number };
    expect(output.city).toBe("London");
    expect(output.temperature).toBe(22);

    // With maxSteps=3 the model usually continues with text after the tool
    // result, but stopping right after the tool output is also valid.
    const deltas = chunks.filter((c) => c.type === "text-delta");
    if (deltas.length > 0) {
      const fullText = deltas.map((c) => c.delta as string).join("");
      console.log("Tool call response:", fullText);
      expect(fullText.length).toBeGreaterThan(0);
    } else {
      console.log("Model did not produce text after tool result (valid)");
    }
  });

  test("message persistence with real LLM: messages survive reconnection", async ({
    page,
    baseURL
  }) => {
    const room = crypto.randomUUID();

    // Drive one full chat turn over the WebSocket.
    await sendChatAndCollect(
      page,
      agentPath(baseURL!, room),
      "Remember the number 42.",
      "req-persist-llm"
    );

    // The agent should have persisted both sides of the conversation,
    // retrievable over plain HTTP (i.e. without the original socket).
    const res = await page.request.get(
      `${baseURL}/agents/llm-chat-agent/${room}/get-messages`
    );
    expect(res.ok()).toBe(true);

    const persisted = await res.json();
    expect(persisted.length).toBeGreaterThanOrEqual(2); // user + assistant

    // The user message is stored under the id derived from the request id.
    const userMsg = persisted.find(
      (m: { id: string }) => m.id === "msg-req-persist-llm"
    );
    expect(userMsg).toBeTruthy();
    expect(userMsg.role).toBe("user");

    // The assistant reply must carry non-empty text parts from the model.
    const assistantMsgs = persisted.filter(
      (m: { role: string }) => m.role === "assistant"
    );
    expect(assistantMsgs.length).toBeGreaterThanOrEqual(1);

    const textParts = assistantMsgs[0].parts.filter(
      (p: { type: string }) => p.type === "text"
    );
    expect(textParts.length).toBeGreaterThanOrEqual(1);
    expect(textParts[0].text.length).toBeGreaterThan(0);
  });

  test("multi-step tool use: model uses addNumbers tool", async ({
    page,
    baseURL
  }) => {
    const room = crypto.randomUUID();
    const collected = await sendChatAndCollect(
      page,
      agentPath(baseURL!, room),
      "What is 17 + 25? Use the addNumbers tool to calculate.",
      "req-math-1"
    );

    const chunks = extractChunkBodies(collected);

    // The addNumbers tool must have been invoked.
    const toolInput = chunks.find(
      (c) => c.type === "tool-input-available" && c.toolName === "addNumbers"
    );
    expect(toolInput).toBeTruthy();

    // Its server-side execution must yield the correct sum.
    const toolOutput = chunks.find((c) => c.type === "tool-output-available");
    expect(toolOutput).toBeTruthy();
    expect((toolOutput!.output as { result: number }).result).toBe(42);

    // If the model continues with text, it should mention the result.
    const deltas = chunks.filter((c) => c.type === "text-delta");
    if (deltas.length > 0) {
      const fullText = deltas.map((c) => c.delta as string).join("");
      console.log("Math response:", fullText);
      expect(fullText).toContain("42");
    } else {
      console.log("Model did not produce text after tool result (valid)");
    }
  });

  test("step boundaries: multi-step response has start-step and finish-step", async ({
    page,
    baseURL
  }) => {
    const room = crypto.randomUUID();
    const collected = await sendChatAndCollect(
      page,
      agentPath(baseURL!, room),
      "What is the weather in Paris? Use the getWeather tool.",
      "req-steps-1"
    );

    const chunkTypes = extractChunkBodies(collected).map((c) => c.type);

    // A multi-step response is delimited by step boundary events.
    expect(chunkTypes).toContain("start-step");
    expect(chunkTypes).toContain("finish-step");

    // At least the tool-call step must be present; with maxSteps=3 the
    // model may also emit a continuation step.
    const startStepCount = chunkTypes.filter((t) => t === "start-step").length;
    expect(startStepCount).toBeGreaterThanOrEqual(1);

    // The tool lifecycle lives inside those steps.
    expect(chunkTypes).toContain("tool-input-available");
    expect(chunkTypes).toContain("tool-output-available");
  });

  test("clear history works after LLM conversation", async ({
    page,
    baseURL
  }) => {
    const room = crypto.randomUUID();
    const wsUrl = agentPath(baseURL!, room);

    // Run one chat turn so there is history to clear.
    await sendChatAndCollect(page, wsUrl, "Hello!", "req-clear-llm");

    // Sanity check: both user and assistant messages were persisted.
    const before = await page.request.get(
      `${baseURL}/agents/llm-chat-agent/${room}/get-messages`
    );
    const beforeData = await before.json();
    expect(beforeData.length).toBeGreaterThanOrEqual(2);

    // Issue the clear command over a fresh socket; the 500ms delay gives
    // the agent time to process before the socket is torn down.
    await page.evaluate(
      ({ url, MT }) => {
        return new Promise<void>((resolve) => {
          const ws = new WebSocket(url);
          ws.onopen = () => {
            ws.send(JSON.stringify({ type: MT.CF_AGENT_CHAT_CLEAR }));
            setTimeout(() => {
              ws.close();
              resolve();
            }, 500);
          };
        });
      },
      { url: wsUrl, MT: MessageType }
    );

    // The persisted history must now be empty.
    const after = await page.request.get(
      `${baseURL}/agents/llm-chat-agent/${room}/get-messages`
    );
    const afterData = await after.json();
    expect(afterData.length).toBe(0);
  });

  test("multi-turn context: LLM remembers previous messages", async ({
    page,
    baseURL
  }) => {
    const room = crypto.randomUUID();
    const wsUrl = agentPath(baseURL!, room);

    // First turn: tell the model a fact
    await sendChatAndCollect(
      page,
      wsUrl,
      "My name is Alice. Remember that.",
      "req-ctx-1"
    );

    // Second turn: ask about the fact, including the first message
    // (as the client would — the full conversation history is sent)
    // NOTE: the collector is inlined here rather than reusing
    // sendChatAndCollect because this request carries TWO messages
    // (the prior turn plus the new question).
    const messages = await page.evaluate(
      ({ url, MT }) => {
        return new Promise<Array<{ type: string; [k: string]: unknown }>>(
          (resolve) => {
            const ws = new WebSocket(url);
            const received: Array<{ type: string; [k: string]: unknown }> = [];

            // Buffer every JSON frame; non-JSON frames are ignored.
            ws.onmessage = (e) => {
              try {
                received.push(JSON.parse(e.data));
              } catch {
                // ignore
              }
            };

            ws.onopen = () => {
              ws.send(
                JSON.stringify({
                  type: MT.CF_AGENT_USE_CHAT_REQUEST,
                  id: "req-ctx-2",
                  init: {
                    method: "POST",
                    body: JSON.stringify({
                      messages: [
                        {
                          id: "msg-req-ctx-1",
                          role: "user",
                          parts: [
                            {
                              type: "text",
                              text: "My name is Alice. Remember that."
                            }
                          ]
                        },
                        {
                          id: "msg-req-ctx-2",
                          role: "user",
                          parts: [{ type: "text", text: "What is my name?" }]
                        }
                      ]
                    })
                  }
                })
              );

              // Poll until the server marks the response stream as done.
              const check = setInterval(() => {
                const done = received.find(
                  (m) =>
                    m.type === MT.CF_AGENT_USE_CHAT_RESPONSE && m.done === true
                );
                if (done) {
                  clearInterval(check);
                  ws.close();
                  resolve(received);
                }
              }, 100);

              // Fallback: resolve with whatever arrived after 15s so the
              // assertions below report the failure instead of a hang.
              setTimeout(() => {
                clearInterval(check);
                ws.close();
                resolve(received);
              }, 15000);
            };
          }
        );
      },
      { url: wsUrl, MT: MessageType }
    );

    // Extract text from the response
    const chunks = extractChunkBodies(messages);
    const fullText = chunks
      .filter((c) => c.type === "text-delta")
      .map((c) => c.delta as string)
      .join("");

    console.log("Context response:", fullText);
    // Model should mention "Alice" since it has the context
    expect(fullText.toLowerCase()).toContain("alice");
  });

  test("OpenAI metadata sanitization: no itemId in persisted messages", async ({
    page,
    baseURL
  }) => {
    const room = crypto.randomUUID();

    // Run one chat turn so the agent persists messages that could carry
    // OpenAI provider metadata.
    await sendChatAndCollect(
      page,
      agentPath(baseURL!, room),
      "Tell me a short fun fact.",
      "req-sanitize"
    );

    // Read back what was stored.
    const res = await page.request.get(
      `${baseURL}/agents/llm-chat-agent/${room}/get-messages`
    );
    expect(res.ok()).toBe(true);
    const persisted = await res.json();

    // No persisted part may retain OpenAI-specific identifiers.
    for (const msg of persisted) {
      for (const part of msg.parts || []) {
        const meta = part.providerMetadata?.openai;
        if (meta) {
          expect(meta.itemId).toBeUndefined();
          expect(meta.reasoningEncryptedContent).toBeUndefined();
        }
        const callMeta = part.callProviderMetadata?.openai;
        if (callMeta) {
          expect(callMeta.itemId).toBeUndefined();
        }
      }
    }

    // Sanitization must not have stripped the actual content.
    expect(persisted.length).toBeGreaterThanOrEqual(2);
    const assistant = persisted.find(
      (m: { role: string }) => m.role === "assistant"
    );
    expect(assistant).toBeTruthy();
    expect(assistant.parts.length).toBeGreaterThan(0);
  });

  test("multi-tab tool streaming: second connection sees tool-input-start and tool-output-available", async ({
    page,
    baseURL
  }) => {
    const room = crypto.randomUUID();
    const wsUrl = agentPath(baseURL!, room);

    // Open two connections to the same room.
    // ws1 sends the request, ws2 observes the broadcast.
    const result = await page.evaluate(
      ({ url, MT }) => {
        return new Promise<{
          ws2ChunkTypes: string[];
          ws2HasToolInputStart: boolean;
          ws2HasToolOutputAvailable: boolean;
        }>((resolve) => {
          const ws1 = new WebSocket(url);
          const ws2 = new WebSocket(url);
          const ws2ChunkTypes: string[] = [];
          let ws1Open = false;
          let ws2Open = false;

          // Send the chat request only once BOTH sockets are open so the
          // observer (ws2) cannot miss the start of the broadcast. The
          // extra 100ms gives the server time to register both connections.
          function maybeSend() {
            if (!ws1Open || !ws2Open) return;
            setTimeout(() => {
              ws1.send(
                JSON.stringify({
                  type: MT.CF_AGENT_USE_CHAT_REQUEST,
                  id: "req-multi-tab-tool",
                  init: {
                    method: "POST",
                    body: JSON.stringify({
                      messages: [
                        {
                          id: "mtt-1",
                          role: "user",
                          parts: [
                            {
                              type: "text",
                              text: "What is the weather in Tokyo? Use the getWeather tool."
                            }
                          ]
                        }
                      ]
                    })
                  }
                })
              );
            }, 100);
          }

          ws1.onopen = () => {
            ws1Open = true;
            maybeSend();
          };
          ws2.onopen = () => {
            ws2Open = true;
            maybeSend();
          };

          // Collect chunk types from ws2 (the observer)
          ws2.onmessage = (e) => {
            try {
              const data = JSON.parse(e.data);
              if (
                data.type === MT.CF_AGENT_USE_CHAT_RESPONSE &&
                typeof data.body === "string" &&
                data.body.trim()
              ) {
                try {
                  const chunk = JSON.parse(data.body);
                  ws2ChunkTypes.push(chunk.type);
                } catch {
                  // not JSON body
                }
              }
            } catch {
              // ignore
            }
          };

          // Wait for ws1 to get done signal
          const ws1Messages: Array<{
            type: string;
            [k: string]: unknown;
          }> = [];
          ws1.onmessage = (e) => {
            try {
              const data = JSON.parse(e.data);
              ws1Messages.push(data);
            } catch {
              // ignore
            }
          };

          const check = setInterval(() => {
            const done = ws1Messages.find(
              (m) => m.type === MT.CF_AGENT_USE_CHAT_RESPONSE && m.done === true
            );
            if (done) {
              clearInterval(check);
              // Grace period: let any broadcast frames still in flight to
              // ws2 arrive before snapshotting ws2ChunkTypes.
              setTimeout(() => {
                ws1.close();
                ws2.close();
                resolve({
                  ws2ChunkTypes,
                  ws2HasToolInputStart:
                    ws2ChunkTypes.includes("tool-input-start"),
                  ws2HasToolOutputAvailable: ws2ChunkTypes.includes(
                    "tool-output-available"
                  )
                });
              }, 300);
            }
          }, 100);

          // Safety net: resolve with whatever was observed after 20s so the
          // assertions below fail loudly instead of the evaluate hanging.
          setTimeout(() => {
            clearInterval(check);
            ws1.close();
            ws2.close();
            resolve({
              ws2ChunkTypes,
              ws2HasToolInputStart: ws2ChunkTypes.includes("tool-input-start"),
              ws2HasToolOutputAvailable: ws2ChunkTypes.includes(
                "tool-output-available"
              )
            });
          }, 20000);
        });
      },
      { url: wsUrl, MT: MessageType }
    );

    // ws2 should see the full tool lifecycle streamed to it
    expect(result.ws2ChunkTypes.length).toBeGreaterThan(0);
    // Should see tool-input-start (the tool being called)
    expect(result.ws2HasToolInputStart).toBe(true);
    // Should see tool-output-available (the tool result)
    expect(result.ws2HasToolOutputAvailable).toBe(true);
  });
});