Voice

Real-time speech-to-text in the chat composer. The user speaks, the runtime transcribes, the agent runs the resulting prompt.

"""AG2 agent with weather and sales tools for CopilotKit showcase.Uses AG2's ConversableAgent with AGUIStream to exposethe agent via the AG-UI protocol."""from __future__ import annotationsimport jsonimport loggingfrom typing import Annotated, Anyimport openaifrom autogen import ConversableAgent, LLMConfigfrom autogen.ag_ui import AGUIStreamfrom dotenv import load_dotenvfrom pydantic import ValidationErrorload_dotenv()# Import shared tool implementationsfrom tools import (    get_weather_impl,    query_data_impl,    manage_sales_todos_impl,    get_sales_todos_impl,    schedule_meeting_impl,    search_flights_impl,    build_a2ui_operations_from_tool_call,    RENDER_A2UI_TOOL_SCHEMA,)from tools.types import Flightfrom ._header_forwarding import get_forwarded_headersfrom ._request_context import get_latest_user_messagelogger = logging.getLogger(__name__)# Module-level async client: re-used across requests (httpx connection pool is# thread-safe). Using AsyncOpenAI inside an `async def` avoids blocking the# ASGI event loop on the secondary LLM call._async_openai_client = openai.AsyncOpenAI()# =====# Tools# =====async def get_weather(    location: Annotated[str, "City name to get weather for"],) -> str:    """Get current weather for a location."""    result = get_weather_impl(location)    # Return a JSON string (not a dict): autogen serializes dict returns with    # str(), producing a Python repr (single quotes) that the frontend's    # parseJsonResult/JSON.parse cannot parse — the weather card then renders    # "--" placeholders. Same pattern as search_flights below.    
return json.dumps(        {            "city": result["city"],            "temperature": result["temperature"],            "feels_like": result["feels_like"],            "humidity": result["humidity"],            "wind_speed": result["wind_speed"],            "conditions": result["conditions"],        }    )async def query_data(    query: Annotated[str, "Natural language query for financial data"],) -> str:    """Query financial database for chart data."""    # Return a JSON string (not a list): autogen serializes non-str returns    # with str(), producing a Python repr (single quotes) that the frontend's    # parseJsonResult/JSON.parse cannot parse. Same pattern as get_weather.    return json.dumps(query_data_impl(query))async def manage_sales_todos(    todos: Annotated[list, "Complete list of sales todos"],) -> str:    """Manage the sales pipeline."""    # See contract comment on query_data above — return JSON, not dict.    # SalesTodo is a Pydantic model; coerce via model_dump for serialisability.    result = [t.model_dump() for t in manage_sales_todos_impl(todos)]    return json.dumps({"todos": result})async def get_sales_todos() -> str:    """Get the current sales pipeline."""    # See contract comment on query_data above — return JSON, not list.    # SalesTodo is a Pydantic model; coerce via model_dump for serialisability.    return json.dumps([t.model_dump() for t in get_sales_todos_impl(None)])async def schedule_meeting(    reason: Annotated[str, "Reason for the meeting"],) -> str:    """Schedule a meeting with user approval."""    # See contract comment on query_data above — return JSON, not dict.    return json.dumps(schedule_meeting_impl(reason))async def search_flights(    flights: Annotated[        list[dict[str, Any]], "List of flight objects to display as rich A2UI cards"    ],) -> str:    """Search for flights and display the results as rich cards. Return exactly 2 flights.    
Each flight must have: airline, airlineLogo, flightNumber, origin, destination,    date (short readable format like "Tue, Mar 18" -- use near-future dates),    departureTime, arrivalTime, duration (e.g. "4h 25m"),    status (e.g. "On Time" or "Delayed"),    statusColor (hex color for status dot),    price (e.g. "$289"), and currency (e.g. "USD").    For airlineLogo use Google favicon API:    https://www.google.com/s2/favicons?domain={airline_domain}&sz=128    """    try:        typed_flights: list[Flight] = [Flight(**f) for f in flights]    except ValidationError as exc:        logger.warning(            "search_flights: invalid flight shape type=%s err=%s",            type(exc).__name__,            exc,            exc_info=True,        )        return json.dumps({"error": f"invalid flight shape: {exc}"})    result = search_flights_impl(typed_flights)    return json.dumps(result)async def generate_a2ui(    context: Annotated[str, "Conversation context to generate UI for"],) -> str:    """Generate dynamic A2UI components based on the conversation.    A secondary LLM designs the UI schema and data. The result is    returned as an a2ui_operations container for the middleware to detect.    """    # A13: AsyncOpenAI inside async def (was sync openai.OpenAI which blocks    # the ASGI event loop). Forward x-* headers via extra_headers in addition    # to the global httpx hook so aimock context routing is explicit at the    # call site.    #    # R2-A1 / A4: thread the latest user prompt from the inbound    # RunAgentInput.messages payload (captured into a per-request ContextVar    # by RequestUserMessageMiddleware — see agents/_request_context.py) into    # the inner LLM call so each pill's request body is byte-distinct.    # Without this, every pill landing on the omnibus agent (agentic-chat /    # tool-rendering / chat-customization-css / hitl) produces an IDENTICAL    # inner-LLM body and the aimock fixture cannot disambiguate. 
Falls back    # to the original hardcoded prompt when the middleware captured nothing    # (parse failure already logged at WARNING).    user_prompt = get_latest_user_message() or (        "Generate a dynamic A2UI dashboard based on the conversation."    )    forwarded = get_forwarded_headers()    try:        response = await _async_openai_client.chat.completions.create(            model="gpt-4.1",            messages=[                {                    "role": "system",                    "content": context or "Generate a useful dashboard UI.",                },                {                    "role": "user",                    "content": user_prompt,                },            ],            tools=[                {                    "type": "function",                    "function": RENDER_A2UI_TOOL_SCHEMA,                }            ],            tool_choice={"type": "function", "function": {"name": "render_a2ui"}},            extra_headers=forwarded or None,        )    except Exception as exc:        logger.error(            "generate_a2ui: inner LLM call failed type=%s err=%s",            type(exc).__name__,            exc,            exc_info=True,        )        return json.dumps({"error": f"inner LLM call failed: {type(exc).__name__}"})    if not response.choices:        logger.warning("generate_a2ui: LLM returned no choices")        return json.dumps({"error": "LLM returned no choices"})    choice = response.choices[0]    if not choice.message.tool_calls:        logger.warning("generate_a2ui: secondary LLM produced no render_a2ui tool call")        return json.dumps({"error": "LLM did not call render_a2ui"})    try:        args = json.loads(choice.message.tool_calls[0].function.arguments)        result = build_a2ui_operations_from_tool_call(args)        return json.dumps(result)    except (json.JSONDecodeError, KeyError, TypeError, ValueError) as exc:        logger.error(            "generate_a2ui: failed to parse render_a2ui args type=%s 
err=%s",            type(exc).__name__,            exc,            exc_info=True,        )        return json.dumps(            {"error": f"failed to parse render_a2ui args: {type(exc).__name__}"}        )# =====# Agent# =====agent = ConversableAgent(    name="assistant",    system_message=(        "You are a helpful sales assistant. You can look up current weather "        "for any city using the get_weather tool, query financial data with "        "query_data, manage the sales pipeline with manage_sales_todos and "        "get_sales_todos, schedule meetings with schedule_meeting, search "        "flights and display rich A2UI cards with search_flights, and "        "generate dynamic A2UI dashboards with generate_a2ui. "        "When asked about the weather, always use the tool rather than guessing. "        "Be concise and friendly in your responses."    ),    llm_config=LLMConfig({"model": "gpt-4o-mini", "stream": True}),    human_input_mode="NEVER",    # Guard against infinite tool-call loops: AG2's ConversableAgent with    # human_input_mode="NEVER" will keep executing tool calls indefinitely    # if the LLM keeps requesting them.  Without this limit the agent floods    # Railway's log stream (500 logs/sec rate-limit), becomes unresponsive    # to health probes, and gets killed by the watchdog.    max_consecutive_auto_reply=15,    functions=[        get_weather,        query_data,        manage_sales_todos,        get_sales_todos,        schedule_meeting,        search_flights,        generate_a2ui,    ],)# AG-UI stream wrapperstream = AGUIStream(agent)

You have a working chat surface and you want users to be able to speak instead of type. By the end of this guide, the chat composer will sprout a mic button, recorded audio will be transcribed by the runtime, and the transcript will auto-send to the agent like any other message.

When to use this

- Hands-free or accessibility flows where typing isn't the right input modality.
- Mobile or kiosk surfaces where a long voice query is faster than thumb-typing.
- Demo and test loops where you want canned audio to drive the chat without a microphone.

If you only need file uploads (audio, images, video, documents), use Multimodal Attachments instead. Voice is specifically about live transcription of recorded speech into chat input.

Frontend

<CopilotChat /> renders the mic button automatically when the runtime advertises audioFileTranscriptionEnabled: true on its /info endpoint. There's nothing to wire up on the chat surface itself:

page.tsx

import { CopilotKit } from "@copilotkit/react-core/v2";
import { VoiceChat } from "./voice-chat";

/**
 * Voice demo page: renders <VoiceChat /> inside a <CopilotKit> wrapper that
 * points at the voice runtime route and selects the "voice-demo" agent.
 */
export default function VoiceDemoPage() {
  // Why `enableInspector` is forced off:
  // The dev-only `<cpk-web-inspector>` overlay (auto-enabled on
  // localhost via shouldShowDevConsole) intercepts pointer events
  // on top of the voice sample-audio button, so dev/D5 probe runs
  // can't click it through Playwright. Production isn't localhost
  // so the inspector never mounts there — voice is D5 in prod and
  // D4 locally for this reason alone. Disable explicitly here so
  // the demo behaves the same in both environments.
  const page = (
    <CopilotKit
      runtimeUrl="/api/copilotkit-voice"
      agent="voice-demo"
      useSingleEndpoint={false}
      enableInspector={false}
    >
      <VoiceChat />
    </CopilotKit>
  );

  return page;
}

When the user clicks the mic, the chat captures audio, POSTs it to the runtime's /transcribe endpoint, drops the resulting transcript into the composer, and submits.

Driving the demo without a mic

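For Playwright runs, screenshots, or any flow where prompting for mic permissions is awkward, ship a button that bypasses the microphone. One option is to POST a bundled audio clip directly to the same /transcribe endpoint; the sample below takes the simpler route and hands a canned transcript (sampleText) straight to the caller's onTranscribed callback: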

sample-audio-button.tsx

/**
 * Button that feeds a canned transcript to the caller without recording
 * anything: clicking it invokes `onTranscribed` with `sampleText`.
 */
export function SampleAudioButton(props: SampleAudioButtonProps) {
  const { onTranscribed, sampleText } = props;

  // The tooltip previews exactly what a click will insert.
  const tooltip = `Inserts: "${sampleText}"`;
  const handleClick = () => onTranscribed(sampleText);

  return (
    <button
      type="button"
      // Stable hook for automated runs to locate the button.
      data-testid="voice-sample-audio-button"
      onClick={handleClick}
      title={tooltip}
      className="inline-flex w-fit items-center gap-2 rounded-md border border-black/10 bg-white px-3 py-1.5 text-xs font-medium hover:bg-black/5 dark:border-white/10 dark:bg-black/30 dark:hover:bg-white/10"
    >
      {/* Decorative glyph, hidden from assistive tech. */}
      <span aria-hidden>🎙</span>
      <span>Try a sample audio</span>
    </button>
  );
}

The caller can drop the resulting text into the composer's textarea (matched via data-testid="copilot-chat-textarea") using the native value setter and a synthetic input event so React's managed state updates correctly.

Backend

Wire up the V2 runtime with a TranscriptionService. The V1 wrapper drops the transcriptionService option, so use createCopilotRuntimeHandler from @copilotkit/runtime/v2 directly:

route.ts

import type { NextRequest } from "next/server";
import {
  CopilotRuntime,
  TranscriptionService,
  createCopilotRuntimeHandler,
} from "@copilotkit/runtime/v2";
import type { TranscribeFileOptions } from "@copilotkit/runtime/v2";
import { HttpAgent } from "@ag-ui/client";
import { TranscriptionServiceOpenAI } from "@copilotkit/voice";
import OpenAI from "openai";

// Base URL of the agent backend; falls back to a local default when unset.
const AGENT_URL = process.env.AGENT_URL || "http://localhost:8000";
const voiceDemoAgent = new HttpAgent({ url: `${AGENT_URL}/` });

/** Signature shared by the cached runtime handler and its accessor. */
type RuntimeHandler = (req: Request) => Promise<Response>;

/**
 * Transcription service that delegates to TranscriptionServiceOpenAI when
 * OPENAI_API_KEY is set and otherwise fails each transcription request with
 * an explicit configuration error.
 */
class GuardedOpenAITranscriptionService extends TranscriptionService {
  // null means no API key was present when the service was constructed.
  private delegate: TranscriptionServiceOpenAI | null;

  constructor() {
    super();
    const apiKey = process.env.OPENAI_API_KEY;
    if (apiKey) {
      this.delegate = new TranscriptionServiceOpenAI({
        openai: new OpenAI({ apiKey }),
      });
    } else {
      this.delegate = null;
    }
  }

  /** Transcribes via the OpenAI delegate, or throws when none is configured. */
  async transcribeFile(options: TranscribeFileOptions): Promise<string> {
    const service = this.delegate;
    if (service === null) {
      throw new Error(
        "OPENAI_API_KEY not configured for this deployment (api key missing). " +
          "Set OPENAI_API_KEY to enable voice transcription.",
      );
    }
    return service.transcribeFile(options);
  }
}

// Built on first use and reused for every later request.
let cachedHandler: RuntimeHandler | null = null;

/** Returns the runtime handler, constructing the runtime on the first call. */
function getHandler(): RuntimeHandler {
  if (cachedHandler === null) {
    const runtime = new CopilotRuntime({
      // @ts-ignore -- see main route.ts; published agents type generic mismatch
      agents: {
        "voice-demo": voiceDemoAgent,
        default: voiceDemoAgent,
      },
      transcriptionService: new GuardedOpenAITranscriptionService(),
    });
    cachedHandler = createCopilotRuntimeHandler({
      runtime,
      basePath: "/api/copilotkit-voice",
    });
  }
  return cachedHandler;
}

// Every exported HTTP verb is routed through the same lazily built handler.
const handle = (req: NextRequest) => getHandler()(req);

export const POST = handle;
export const GET = handle;
export const PUT = handle;
export const DELETE = handle;

With transcriptionService set, the runtime advertises audioFileTranscriptionEnabled: true on /info (which is what tells the chat to render the mic button) and routes POST /transcribe to the service.

Custom transcription backends

TranscriptionService from @copilotkit/runtime/v2 is an abstract class. Subclass it to plug in any transcription provider — Whisper, AssemblyAI, Deepgram, your own model. The library ships TranscriptionServiceOpenAI as the canonical reference implementation.

A useful pattern is wrapping your service in a guard that returns a clean 4xx when credentials aren't configured, instead of an opaque 5xx from the underlying SDK: