CopilotKit

Voice

Real-time speech-to-text in the chat composer. The user speaks, the runtime transcribes, the agent runs the resulting prompt.


"""AG2 agent with weather and sales tools for CopilotKit showcase.Uses AG2's ConversableAgent with AGUIStream to exposethe agent via the AG-UI protocol."""from __future__ import annotationsimport jsonimport osfrom typing import Annotated, Anyfrom autogen import ConversableAgent, LLMConfigfrom autogen.ag_ui import AGUIStreamfrom dotenv import load_dotenvload_dotenv()# Import shared tool implementationsfrom tools import (    get_weather_impl,    query_data_impl,    manage_sales_todos_impl,    get_sales_todos_impl,    schedule_meeting_impl,    search_flights_impl,    build_a2ui_operations_from_tool_call,    RENDER_A2UI_TOOL_SCHEMA,)from tools.types import Flight# =====# Tools# =====async def get_weather(    location: Annotated[str, "City name to get weather for"],) -> dict[str, str | float]:    """Get current weather for a location."""    result = get_weather_impl(location)    return {        "city": result["city"],        "temperature": result["temperature"],        "feels_like": result["feels_like"],        "humidity": result["humidity"],        "wind_speed": result["wind_speed"],        "conditions": result["conditions"],    }async def query_data(    query: Annotated[str, "Natural language query for financial data"],) -> list:    """Query financial database for chart data."""    return query_data_impl(query)async def manage_sales_todos(    todos: Annotated[list, "Complete list of sales todos"],) -> dict:    """Manage the sales pipeline."""    return {"todos": manage_sales_todos_impl(todos)}async def get_sales_todos() -> list:    """Get the current sales pipeline."""    return get_sales_todos_impl(None)async def schedule_meeting(    reason: Annotated[str, "Reason for the meeting"],) -> dict:    """Schedule a meeting with user approval."""    return schedule_meeting_impl(reason)async def search_flights(    flights: Annotated[        list[dict[str, Any]], "List of flight objects to display as rich A2UI cards"    ],) -> str:    """Search for flights and display the results as rich cards. Return exactly 2 flights.    Each flight must have: airline, airlineLogo, flightNumber, origin, destination,    date (short readable format like "Tue, Mar 18" -- use near-future dates),    departureTime, arrivalTime, duration (e.g. "4h 25m"),    status (e.g. "On Time" or "Delayed"),    statusColor (hex color for status dot),    price (e.g. "$289"), and currency (e.g. "USD").    For airlineLogo use Google favicon API:    https://www.google.com/s2/favicons?domain={airline_domain}&sz=128    """    typed_flights: list[Flight] = [Flight(**f) for f in flights]    result = search_flights_impl(typed_flights)    return json.dumps(result)async def generate_a2ui(    context: Annotated[str, "Conversation context to generate UI for"],) -> str:    """Generate dynamic A2UI components based on the conversation.    A secondary LLM designs the UI schema and data. The result is    returned as an a2ui_operations container for the middleware to detect.    """    import openai    client = openai.OpenAI()    response = client.chat.completions.create(        model="gpt-4.1",        messages=[            {"role": "system", "content": context or "Generate a useful dashboard UI."},            {                "role": "user",                "content": "Generate a dynamic A2UI dashboard based on the conversation.",            },        ],        tools=[            {                "type": "function",                "function": RENDER_A2UI_TOOL_SCHEMA,            }        ],        tool_choice={"type": "function", "function": {"name": "render_a2ui"}},    )    choice = response.choices[0]    if choice.message.tool_calls:        args = json.loads(choice.message.tool_calls[0].function.arguments)        result = build_a2ui_operations_from_tool_call(args)        return json.dumps(result)    return json.dumps({"error": "LLM did not call render_a2ui"})# =====# Agent# =====agent = ConversableAgent(    name="assistant",    system_message=(        "You are a helpful sales assistant. You can look up current weather "        "for any city using the get_weather tool, query financial data with "        "query_data, manage the sales pipeline with manage_sales_todos and "        "get_sales_todos, schedule meetings with schedule_meeting, search "        "flights and display rich A2UI cards with search_flights, and "        "generate dynamic A2UI dashboards with generate_a2ui. "        "When asked about the weather, always use the tool rather than guessing. "        "Be concise and friendly in your responses."    ),    llm_config=LLMConfig({"model": "gpt-4o-mini", "stream": True}),    human_input_mode="NEVER",    # Guard against infinite tool-call loops: AG2's ConversableAgent with    # human_input_mode="NEVER" will keep executing tool calls indefinitely    # if the LLM keeps requesting them.  Without this limit the agent floods    # Railway's log stream (500 logs/sec rate-limit), becomes unresponsive    # to health probes, and gets killed by the watchdog.    max_consecutive_auto_reply=15,    functions=[        get_weather,        query_data,        manage_sales_todos,        get_sales_todos,        schedule_meeting,        search_flights,        generate_a2ui,    ],)# AG-UI stream wrapperstream = AGUIStream(agent)

You have a working chat surface and you want users to be able to speak instead of type. By the end of this guide, the chat composer will sprout a mic button, recorded audio will be transcribed by the runtime, and the transcript will auto-send to the agent like any other message.

When to use this#

  • Hands-free or accessibility flows where typing isn't the right input modality.
  • Mobile or kiosk surfaces where a long voice query is faster than thumb-typing.
  • Demo and test loops where you want canned audio to drive the chat without a microphone.

If you only need file uploads (audio, images, video, documents), use Multimodal Attachments instead. Voice is specifically about live transcription of recorded speech into chat input.

Frontend#

<CopilotChat /> renders the mic button automatically when the runtime advertises audioFileTranscriptionEnabled: true on its /info endpoint. There's nothing to wire up on the chat surface itself:

page.tsx
import { useCallback } from "react";import { CopilotKit } from "@copilotkit/react-core";import { CopilotChat } from "@copilotkit/react-core/v2";import { SampleAudioButton } from "./sample-audio-button";const RUNTIME_URL = "/api/copilotkit-voice";const AGENT_ID = "voice-demo";const SAMPLE_TEXT = "What is the weather in Tokyo?";// Voice demo (AG2). Two affordances on this page://// 1. The default mic button rendered by <CopilotChat /> when the runtime at//    RUNTIME_URL advertises `audioFileTranscriptionEnabled: true`.// 2. A "Play sample" button that synchronously injects a canned phrase into//    the composer (bypasses mic perms and the runtime entirely so Playwright//    + screenshot flows work too). Real transcription only happens through//    the mic path.export default function VoiceDemoPage() {  const handleTranscribed = useCallback((text: string) => {    if (typeof document === "undefined") return;    const textarea = document.querySelector<HTMLTextAreaElement>(      '[data-testid="copilot-chat-textarea"]',    );    if (!textarea) {      console.warn(        "[voice-demo] could not find copilot-chat-textarea to populate",      );      return;    }    const nativeSetter = Object.getOwnPropertyDescriptor(      window.HTMLTextAreaElement.prototype,      "value",    )?.set;    if (nativeSetter) {      nativeSetter.call(textarea, text);    } else {      textarea.value = text;    }    textarea.dispatchEvent(new Event("input", { bubbles: true }));    textarea.focus();  }, []);  return (    <CopilotKit      runtimeUrl={RUNTIME_URL}      agent={AGENT_ID}      useSingleEndpoint={false}    >      <div className="flex h-screen flex-col gap-3 p-6">        <header>          <h1 className="text-lg font-semibold">Voice input</h1>          <p className="text-sm text-black/60 dark:text-white/60">            Click the microphone to record, or play the bundled sample audio.            Speech is transcribed into the input field — you click send.          </p>        </header>        <SampleAudioButton          onTranscribed={handleTranscribed}          sampleText={SAMPLE_TEXT}        />        <div className="min-h-0 flex-1 overflow-hidden rounded-md border border-black/10 dark:border-white/10">          <CopilotChat agentId={AGENT_ID} className="h-full" />        </div>      </div>    </CopilotKit>  );}

When the user clicks the mic, the chat captures audio, POSTs it to the runtime's /transcribe endpoint, drops the resulting transcript into the composer, and submits.

Driving the demo without a mic#

For Playwright runs, screenshots, or any flow where prompting for mic permissions is awkward, ship a button that POSTs a bundled audio clip directly to the same /transcribe endpoint:

sample-audio-button.tsx
export function SampleAudioButton({  onTranscribed,  sampleText,}: SampleAudioButtonProps) {  return (    <div      data-testid="voice-sample-audio"      className="flex items-center gap-3 rounded-md border border-black/10 bg-black/[0.02] px-3 py-2 text-sm dark:border-white/10 dark:bg-white/[0.02]"    >      <button        type="button"        data-testid="voice-sample-audio-button"        onClick={() => onTranscribed(sampleText)}        className="rounded border border-black/10 bg-white px-3 py-1 text-xs font-medium hover:bg-black/5 dark:border-white/10 dark:bg-black/30 dark:hover:bg-white/10"      >        Play sample      </button>      <span className="text-black/60 dark:text-white/60">        Sample: &ldquo;{sampleText}&rdquo;      </span>    </div>  );}

The caller can drop the resulting text into the composer's textarea (matched via data-testid="copilot-chat-textarea") using the native value setter and a synthetic input event so React's managed state updates correctly.

Backend#

Wire up the V2 runtime with a TranscriptionService. The V1 wrapper drops the transcriptionService option, so use createCopilotRuntimeHandler from @copilotkit/runtime/v2 directly:

route.ts
import type { NextRequest } from "next/server";import {  CopilotRuntime,  TranscriptionService,  createCopilotRuntimeHandler,} from "@copilotkit/runtime/v2";import type { TranscribeFileOptions } from "@copilotkit/runtime/v2";import { HttpAgent } from "@ag-ui/client";import { TranscriptionServiceOpenAI } from "@copilotkit/voice";import OpenAI from "openai";const AGENT_URL = process.env.AGENT_URL || "http://localhost:8000";const voiceDemoAgent = new HttpAgent({ url: `${AGENT_URL}/` });class GuardedOpenAITranscriptionService extends TranscriptionService {  private delegate: TranscriptionServiceOpenAI | null;  constructor() {    super();    const apiKey = process.env.OPENAI_API_KEY;    this.delegate = apiKey      ? new TranscriptionServiceOpenAI({ openai: new OpenAI({ apiKey }) })      : null;  }  async transcribeFile(options: TranscribeFileOptions): Promise<string> {    if (!this.delegate) {      throw new Error(        "OPENAI_API_KEY not configured for this deployment (api key missing). " +          "Set OPENAI_API_KEY to enable voice transcription.",      );    }    return this.delegate.transcribeFile(options);  }}let cachedHandler: ((req: Request) => Promise<Response>) | null = null;function getHandler(): (req: Request) => Promise<Response> {  if (cachedHandler) return cachedHandler;  const runtime = new CopilotRuntime({    // @ts-ignore -- see main route.ts; published agents type generic mismatch    agents: {      "voice-demo": voiceDemoAgent,      default: voiceDemoAgent,    },    transcriptionService: new GuardedOpenAITranscriptionService(),  });  cachedHandler = createCopilotRuntimeHandler({    runtime,    basePath: "/api/copilotkit-voice",  });  return cachedHandler;}export const POST = (req: NextRequest) => getHandler()(req);export const GET = (req: NextRequest) => getHandler()(req);export const PUT = (req: NextRequest) => getHandler()(req);export const DELETE = (req: NextRequest) => getHandler()(req);

With transcriptionService set, the runtime advertises audioFileTranscriptionEnabled: true on /info (which is what tells the chat to render the mic button) and routes POST /transcribe to the service.

Custom transcription backends#

TranscriptionService from @copilotkit/runtime/v2 is an abstract class. Subclass it to plug in any transcription provider — Whisper, AssemblyAI, Deepgram, your own model. The library ships TranscriptionServiceOpenAI as the canonical reference implementation.

A useful pattern is wrapping your service in a guard that returns a clean 4xx when credentials aren't configured, instead of an opaque 5xx from the underlying SDK:

route.ts
import type { NextRequest } from "next/server";import {  CopilotRuntime,  TranscriptionService,  createCopilotRuntimeHandler,} from "@copilotkit/runtime/v2";import type { TranscribeFileOptions } from "@copilotkit/runtime/v2";import { HttpAgent } from "@ag-ui/client";import { TranscriptionServiceOpenAI } from "@copilotkit/voice";import OpenAI from "openai";const AGENT_URL = process.env.AGENT_URL || "http://localhost:8000";const voiceDemoAgent = new HttpAgent({ url: `${AGENT_URL}/` });class GuardedOpenAITranscriptionService extends TranscriptionService {  private delegate: TranscriptionServiceOpenAI | null;  constructor() {    super();    const apiKey = process.env.OPENAI_API_KEY;    this.delegate = apiKey      ? new TranscriptionServiceOpenAI({ openai: new OpenAI({ apiKey }) })      : null;  }  async transcribeFile(options: TranscribeFileOptions): Promise<string> {    if (!this.delegate) {      throw new Error(        "OPENAI_API_KEY not configured for this deployment (api key missing). " +          "Set OPENAI_API_KEY to enable voice transcription.",      );    }    return this.delegate.transcribeFile(options);  }}