Skip to content

Commit efdaecf

Browse files
committed
Realtime: web demo
Add a web demo that uses a fastapi backend. Also remove the CLI UI demo.
1 parent dff0548 commit efdaecf

File tree

8 files changed

+986
-119
lines changed

8 files changed

+986
-119
lines changed

examples/realtime/app/README.md

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
# Realtime Demo App

A web-based realtime voice assistant demo with a FastAPI backend and a clean HTML/JS frontend.

## Features

- **Connect/Disconnect**: Simple button to establish a realtime session
- **Two-pane Interface**:
  - Left pane: Message thread showing the conversation transcript with user/assistant alignment
  - Right pane: Raw transport events display with collapsible event details
- **Voice Input**: Continuous microphone capture with mute/unmute control
- **Audio Playback**: Real-time audio output from the assistant
- **Clean UI**: Elegant, responsive design with smooth interactions

## Installation

Install the required dependencies:

```bash
uv add fastapi uvicorn websockets
```

## Usage

Start the application with a single command:

```bash
cd examples/realtime/app && uv run python server.py
```

Then open your browser to: http://localhost:8000

## How to Use

1. Click **Connect** to establish a realtime session
2. Audio capture starts automatically — just speak naturally
3. Click the **Mic On/Off** button to mute/unmute your microphone
4. Watch the conversation unfold in the left pane
5. Monitor raw events in the right pane (click to expand/collapse)
6. Click **Disconnect** when done

## Architecture

- **Backend**: FastAPI server with WebSocket connections for real-time communication
- **Session Management**: Each connection gets a unique session with the OpenAI Realtime API
- **Audio Processing**: 24kHz mono audio capture and playback
- **Event Handling**: Full event stream processing with transcript generation
- **Frontend**: Vanilla JavaScript with clean, responsive CSS

The demo showcases the core patterns for building realtime voice applications with the OpenAI Agents SDK.

examples/realtime/app/server.py

Lines changed: 172 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,172 @@
1+
import asyncio
2+
import base64
3+
import json
4+
import logging
5+
import struct
6+
from contextlib import asynccontextmanager
7+
from typing import Any, assert_never
8+
9+
from fastapi import FastAPI, WebSocket, WebSocketDisconnect
10+
from fastapi.responses import FileResponse
11+
from fastapi.staticfiles import StaticFiles
12+
13+
from agents import function_tool
14+
from agents.realtime import RealtimeAgent, RealtimeRunner, RealtimeSession, RealtimeSessionEvent
15+
16+
logging.basicConfig(level=logging.INFO)
17+
logger = logging.getLogger(__name__)
18+
19+
20+
@function_tool
def get_weather(city: str) -> str:
    """Return a (canned) weather report for *city*."""
    report = f"The weather in {city} is sunny."
    return report
24+
25+
26+
@function_tool
def get_secret_number() -> int:
    """Reveal the secret number when the user asks for it."""
    secret = 71
    return secret
30+
31+
32+
# Specialist agent: responds only in haiku; reachable from the main agent
# via handoff (no tools of its own).
haiku_agent = RealtimeAgent(
    name="Haiku Agent",
    instructions="You are a haiku poet. You must respond ONLY in traditional haiku format (5-7-5 syllables). Every response should be a proper haiku about the topic. Do not break character.",
    tools=[],
)

# Entry-point agent: owns the demo tools and can hand off to haiku_agent
# (the instructions reference the transfer_to_haiku_agent tool name).
agent = RealtimeAgent(
    name="Assistant",
    instructions="If the user wants poetry or haikus, you can hand them off to the haiku agent via the transfer_to_haiku_agent tool.",
    tools=[get_weather, get_secret_number],
    handoffs=[haiku_agent],
)
44+
45+
46+
class RealtimeWebSocketManager:
    """Tracks one realtime session + websocket pair per session_id."""

    def __init__(self):
        # Live session objects, keyed by session id.
        self.active_sessions: dict[str, RealtimeSession] = {}
        # Async context managers returned by RealtimeRunner.run(); retained
        # so disconnect() can __aexit__ them.
        self.session_contexts: dict[str, Any] = {}
        self.websockets: dict[str, WebSocket] = {}
        # Event-forwarding tasks. Keeping a strong reference prevents the
        # task from being garbage-collected while running (asyncio only
        # holds a weak reference to tasks) and lets disconnect() cancel it.
        self._event_tasks: dict[str, asyncio.Task] = {}

    async def connect(self, websocket: WebSocket, session_id: str):
        """Accept the websocket and start a realtime session for it."""
        await websocket.accept()
        self.websockets[session_id] = websocket

        runner = RealtimeRunner(agent)
        session_context = await runner.run()
        # Enter the context manually; the matching __aexit__ happens in
        # disconnect().
        session = await session_context.__aenter__()
        self.active_sessions[session_id] = session
        self.session_contexts[session_id] = session_context

        # Start forwarding session events to the websocket, keeping the
        # task reference (see _event_tasks comment above).
        self._event_tasks[session_id] = asyncio.create_task(
            self._process_events(session_id)
        )

    async def disconnect(self, session_id: str):
        """Tear down everything associated with session_id (idempotent)."""
        task = self._event_tasks.pop(session_id, None)
        if task is not None:
            task.cancel()
        if session_id in self.session_contexts:
            await self.session_contexts[session_id].__aexit__(None, None, None)
            del self.session_contexts[session_id]
        self.active_sessions.pop(session_id, None)
        self.websockets.pop(session_id, None)

    async def send_audio(self, session_id: str, audio_bytes: bytes):
        """Forward raw PCM audio to the session; unknown ids are ignored."""
        session = self.active_sessions.get(session_id)
        if session is not None:
            await session.send_audio(audio_bytes)

    async def _process_events(self, session_id: str):
        """Serialize every session event and push it over the websocket."""
        try:
            session = self.active_sessions[session_id]
            websocket = self.websockets[session_id]

            async for event in session:
                event_data = await self._serialize_event(event)
                await websocket.send_text(json.dumps(event_data))
        except asyncio.CancelledError:
            # Normal teardown path triggered by disconnect(); re-raise so
            # the task is marked cancelled rather than logged as an error.
            raise
        except Exception as e:
            logger.error(f"Error processing events for session {session_id}: {e}")

    async def _serialize_event(self, event: RealtimeSessionEvent) -> dict[str, Any]:
        """Convert a RealtimeSessionEvent into a JSON-safe dict for the UI."""
        base_event: dict[str, Any] = {
            "type": event.type,
        }

        if event.type == "agent_start":
            base_event["agent"] = event.agent.name
        elif event.type == "agent_end":
            base_event["agent"] = event.agent.name
        elif event.type == "handoff":
            base_event["from"] = event.from_agent.name
            base_event["to"] = event.to_agent.name
        elif event.type == "tool_start":
            base_event["tool"] = event.tool.name
        elif event.type == "tool_end":
            base_event["tool"] = event.tool.name
            base_event["output"] = str(event.output)
        elif event.type == "audio":
            # Raw audio bytes are base64-encoded so they survive JSON.
            base_event["audio"] = base64.b64encode(event.audio.data).decode("utf-8")
        elif event.type == "audio_interrupted":
            pass
        elif event.type == "audio_end":
            pass
        elif event.type == "history_updated":
            base_event["history"] = [item.model_dump(mode="json") for item in event.history]
        elif event.type == "history_added":
            pass
        elif event.type == "guardrail_tripped":
            base_event["guardrail_results"] = [
                {"name": result.guardrail.name} for result in event.guardrail_results
            ]
        elif event.type == "raw_model_event":
            base_event["raw_model_event"] = {
                "type": event.data.type,
            }
        elif event.type == "error":
            base_event["error"] = str(event.error) if hasattr(event, "error") else "Unknown error"
        else:
            # Exhaustiveness check: type-checkers flag any event type added
            # to the union without a branch above.
            assert_never(event)

        return base_event
130+
131+
132+
# Single process-wide manager shared by all websocket connections.
manager = RealtimeWebSocketManager()
133+
134+
135+
@asynccontextmanager
async def lifespan(app: FastAPI):
    # No startup/shutdown work yet; placeholder for future lifecycle hooks.
    yield
138+
139+
140+
# FastAPI application serving the websocket endpoint and static frontend.
app = FastAPI(lifespan=lifespan)
141+
142+
143+
@app.websocket("/ws/{session_id}")
async def websocket_endpoint(websocket: WebSocket, session_id: str):
    """Per-client websocket: receives mic audio, streams session events back.

    The client sends JSON messages; "audio" messages carry an int16 sample
    array which is repacked to raw bytes and forwarded to the session.
    """
    await manager.connect(websocket, session_id)
    try:
        while True:
            data = await websocket.receive_text()
            message = json.loads(data)

            if message["type"] == "audio":
                # Repack the JSON int16 sample array into raw PCM bytes.
                int16_data = message["data"]
                audio_bytes = struct.pack(f"{len(int16_data)}h", *int16_data)
                await manager.send_audio(session_id, audio_bytes)
    except WebSocketDisconnect:
        # Clean client disconnect; nothing extra to do beyond cleanup below.
        pass
    finally:
        # Always release the session — previously cleanup only ran on
        # WebSocketDisconnect, leaking the session on any other exception
        # (malformed JSON, send failure, cancellation).
        await manager.disconnect(session_id)
159+
160+
161+
# Serve the frontend; html=True makes StaticFiles answer "/" with
# static/index.html. NOTE(review): this mount is registered before the
# GET "/" route defined below, so that route appears shadowed — confirm.
app.mount("/", StaticFiles(directory="static", html=True), name="static")
162+
163+
164+
@app.get("/")
async def read_index():
    # NOTE(review): the StaticFiles mount at "/" is registered earlier and
    # matches all paths, so this route looks unreachable — verify. With
    # html=True the mount already serves static/index.html for "/".
    return FileResponse("static/index.html")
167+
168+
169+
if __name__ == "__main__":
    # Imported lazily so uvicorn is only required when run as a script.
    import uvicorn

    # Listen on all interfaces; port 8000 matches the README instructions.
    uvicorn.run(app, host="0.0.0.0", port=8000)

0 commit comments

Comments
 (0)