Skip to content

Commit 03f0aba

Browse files
Python: added WebRTC support for Azure OpenAI Realtime (#12078)
### Motivation and Context <!-- Thank you for your contribution to the semantic-kernel repo! Please help reviewers and future users, providing the following information: 1. Why is this change required? 2. What problem does it solve? 3. What scenario does it contribute to? 4. If it fixes an open issue, please link to the issue here. --> Added support for WebRTC for Azure OpenAI Realtime models. ### Description <!-- Describe your changes, the overall approach, the underlying design. These notes will help understanding how your code works. Thanks! --> ### Contribution Checklist <!-- Before submitting this PR, please make sure: --> - [x] The code builds clean without any errors or warnings - [x] The PR follows the [SK Contribution Guidelines](https://github.com/microsoft/semantic-kernel/blob/main/CONTRIBUTING.md) and the [pre-submission formatting script](https://github.com/microsoft/semantic-kernel/blob/main/CONTRIBUTING.md#development-scripts) raises no violations - [x] All unit tests pass, and I have added new tests where possible - [x] I didn't break anyone 😄
1 parent 66d0950 commit 03f0aba

File tree

8 files changed

+234
-58
lines changed

8 files changed

+234
-58
lines changed

python/samples/concepts/realtime/realtime_agent_with_function_calling_webrtc.py

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,9 @@
88
from samples.concepts.realtime.utils import AudioPlayerWebRTC, AudioRecorderWebRTC, check_audio_devices
99
from semantic_kernel.connectors.ai import FunctionChoiceBehavior
1010
from semantic_kernel.connectors.ai.open_ai import (
11+
AzureRealtimeWebRTC,
1112
ListenEvents,
1213
OpenAIRealtimeExecutionSettings,
13-
OpenAIRealtimeWebRTC,
1414
TurnDetection,
1515
)
1616
from semantic_kernel.contents import ChatHistory, RealtimeTextEvent
@@ -26,11 +26,11 @@
2626
This simple sample demonstrates how to use the OpenAI Realtime API to create
2727
an agent that can listen and respond directly through audio.
2828
It requires installing:
29-
- semantic-kernel[realtime]
29+
- semantic-kernel
3030
- pyaudio
3131
- sounddevice
3232
- pydub
33-
e.g. pip install pyaudio sounddevice pydub semantic-kernel[realtime]
33+
e.g. pip install pyaudio sounddevice pydub semantic-kernel
3434
3535
For more details of the exact setup, see the README.md in the realtime folder.
3636
"""
@@ -79,7 +79,11 @@ async def main() -> None:
7979
# and can also be passed in the receive method
8080
# You can also pass in kernel, plugins, chat_history or settings here.
8181
# For WebRTC the audio_track is required
82-
realtime_agent = OpenAIRealtimeWebRTC(audio_track=AudioRecorderWebRTC(), plugins=[Helpers()])
82+
realtime_agent = AzureRealtimeWebRTC(
83+
audio_track=AudioRecorderWebRTC(),
84+
region="swedencentral",
85+
plugins=[Helpers()],
86+
)
8387

8488
# Create the settings for the session
8589
# The realtime api, does not use a system message, but takes instructions as a parameter for a session
@@ -109,12 +113,12 @@ async def main() -> None:
109113

110114
# the context manager calls the create_session method on the client and starts listening to the audio stream
111115
async with (
112-
audio_player,
113116
realtime_agent(
114117
settings=settings,
115118
chat_history=chat_history,
116119
create_response=True,
117120
),
121+
audio_player,
118122
):
119123
async for event in realtime_agent.receive(audio_output_callback=audio_player.client_callback):
120124
match event:

python/samples/concepts/realtime/realtime_agent_with_function_calling_websocket.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -104,14 +104,14 @@ async def main() -> None:
104104

105105
# the context manager calls the create_session method on the agent and starts listening to the audio stream
106106
async with (
107-
audio_player,
108107
audio_recorder,
109108
realtime_agent(
110109
settings=settings,
111110
chat_history=chat_history,
112111
kernel=kernel,
113112
create_response=True,
114113
),
114+
audio_player,
115115
):
116116
# the audio_output_callback can be added here or in the constructor
117117
# using this gives the smoothest experience

python/semantic_kernel/connectors/ai/open_ai/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@
3737
from semantic_kernel.connectors.ai.open_ai.services._open_ai_realtime import ListenEvents, SendEvents
3838
from semantic_kernel.connectors.ai.open_ai.services.azure_audio_to_text import AzureAudioToText
3939
from semantic_kernel.connectors.ai.open_ai.services.azure_chat_completion import AzureChatCompletion
40-
from semantic_kernel.connectors.ai.open_ai.services.azure_realtime import AzureRealtimeWebsocket
40+
from semantic_kernel.connectors.ai.open_ai.services.azure_realtime import AzureRealtimeWebRTC, AzureRealtimeWebsocket
4141
from semantic_kernel.connectors.ai.open_ai.services.azure_text_completion import AzureTextCompletion
4242
from semantic_kernel.connectors.ai.open_ai.services.azure_text_embedding import AzureTextEmbedding
4343
from semantic_kernel.connectors.ai.open_ai.services.azure_text_to_audio import AzureTextToAudio
@@ -68,6 +68,7 @@
6868
"AzureEmbeddingDependency",
6969
"AzureOpenAISettings",
7070
"AzureRealtimeExecutionSettings",
71+
"AzureRealtimeWebRTC",
7172
"AzureRealtimeWebsocket",
7273
"AzureTextCompletion",
7374
"AzureTextEmbedding",

python/semantic_kernel/connectors/ai/open_ai/prompt_execution_settings/open_ai_realtime_execution_settings.py

Lines changed: 18 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -13,13 +13,16 @@ class InputAudioTranscription(KernelBaseModel):
1313
"""Input audio transcription settings.
1414
1515
Args:
16-
model: The model to use for transcription, currently only "whisper-1" is supported.
16+
model: The model to use for transcription, should be one of the following:
17+
- whisper-1
18+
- gpt-4o-transcribe
19+
- gpt-4o-mini-transcribe
1720
language: The language of the audio, should be in ISO-639-1 format, like 'en'.
1821
prompt: An optional text to guide the model's style or continue a previous audio segment.
1922
The prompt should match the audio language.
2023
"""
2124

22-
model: Literal["whisper-1"] | None = None
25+
model: Literal["whisper-1", "gpt-4o-transcribe", "gpt-4o-mini-transcribe"] | None = None
2326
language: str | None = None
2427
prompt: str | None = None
2528

@@ -28,19 +31,24 @@ class TurnDetection(KernelBaseModel):
2831
"""Turn detection settings.
2932
3033
Args:
31-
type: The type of turn detection, currently only "server_vad" is supported.
32-
threshold: The threshold for voice activity detection, should be between 0 and 1.
34+
type: The type of turn detection, server_vad or semantic_vad.
35+
create_response: Whether to create a response for each detected turn.
36+
eagerness: The eagerness of the voice activity detection, can be low, medium, high, or auto,
37+
used only for semantic_vad.
38+
interrupt_response: Whether to interrupt the response for each detected turn.
3339
prefix_padding_ms: The padding before the detected voice activity, in milliseconds.
3440
silence_duration_ms: The duration of silence to detect the end of a turn, in milliseconds.
35-
create_response: Whether to create a response for each detected turn.
41+
threshold: The threshold for voice activity detection, should be between 0 and 1, only for server_vad.
3642
3743
"""
3844

39-
type: Literal["server_vad"] = "server_vad"
40-
threshold: Annotated[float | None, Field(ge=0.0, le=1.0)] = None
45+
type: Literal["server_vad", "semantic_vad"] = "server_vad"
46+
create_response: bool | None = None
47+
eagerness: Literal["low", "medium", "high", "auto"] | None = None
48+
interrupt_response: bool | None = None
4149
prefix_padding_ms: Annotated[int | None, Field(ge=0)] = None
4250
silence_duration_ms: Annotated[int | None, Field(ge=0)] = None
43-
create_response: bool | None = None
51+
threshold: Annotated[float | None, Field(ge=0.0, le=1.0)] = None
4452

4553

4654
class OpenAIRealtimeExecutionSettings(PromptExecutionSettings):
@@ -68,8 +76,9 @@ class OpenAIRealtimeExecutionSettings(PromptExecutionSettings):
6876
"on the function choice configuration.",
6977
),
7078
] = None
71-
temperature: Annotated[float | None, Field(ge=0.0, le=2.0)] = None
79+
temperature: Annotated[float | None, Field(ge=0.6, le=1.2)] = None
7280
max_response_output_tokens: Annotated[int | Literal["inf"] | None, Field(gt=0)] = None
81+
input_audio_noise_reduction: dict[Literal["type"], Literal["near_field", "far_field"]] | None = None
7382

7483

7584
class AzureRealtimeExecutionSettings(OpenAIRealtimeExecutionSettings):

python/semantic_kernel/connectors/ai/open_ai/services/_open_ai_realtime.py

Lines changed: 33 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -10,11 +10,6 @@
1010
from enum import Enum
1111
from typing import TYPE_CHECKING, Any, ClassVar, Literal, cast
1212

13-
if sys.version_info >= (3, 12):
14-
from typing import override # pragma: no cover
15-
else:
16-
from typing_extensions import override # pragma: no cover
17-
1813
import numpy as np
1914
from aiohttp import ClientSession
2015
from aiortc import (
@@ -47,13 +42,15 @@
4742
from pydantic import Field, PrivateAttr
4843

4944
from semantic_kernel.connectors.ai.function_call_choice_configuration import FunctionCallChoiceConfiguration
50-
from semantic_kernel.connectors.ai.function_calling_utils import (
51-
prepare_settings_for_function_calling,
52-
)
45+
from semantic_kernel.connectors.ai.function_calling_utils import prepare_settings_for_function_calling
5346
from semantic_kernel.connectors.ai.function_choice_behavior import FunctionChoiceType
47+
from semantic_kernel.connectors.ai.open_ai.prompt_execution_settings.open_ai_realtime_execution_settings import (
48+
OpenAIRealtimeExecutionSettings,
49+
)
5450
from semantic_kernel.connectors.ai.open_ai.services.open_ai_handler import OpenAIHandler
5551
from semantic_kernel.connectors.ai.prompt_execution_settings import PromptExecutionSettings
5652
from semantic_kernel.connectors.ai.realtime_client_base import RealtimeClientBase
53+
from semantic_kernel.const import USER_AGENT
5754
from semantic_kernel.contents.audio_content import AudioContent
5855
from semantic_kernel.contents.chat_history import ChatHistory
5956
from semantic_kernel.contents.chat_message_content import ChatMessageContent
@@ -72,6 +69,7 @@
7269
from semantic_kernel.exceptions import ContentException
7370
from semantic_kernel.kernel import Kernel
7471
from semantic_kernel.utils.feature_stage_decorator import experimental
72+
from semantic_kernel.utils.telemetry.user_agent import SEMANTIC_KERNEL_USER_AGENT, prepend_semantic_kernel_to_user_agent
7573

7674
if TYPE_CHECKING:
7775
from aiortc.mediastreams import MediaStreamTrack
@@ -84,7 +82,13 @@
8482
from semantic_kernel.contents.chat_history import ChatHistory
8583
from semantic_kernel.functions.kernel_function_metadata import KernelFunctionMetadata
8684

87-
logger: logging.Logger = logging.getLogger(__name__)
85+
86+
if sys.version_info >= (3, 12):
87+
from typing import override # pragma: no cover
88+
else:
89+
from typing_extensions import override # pragma: no cover
90+
91+
logger: logging.Logger = logging.getLogger("semantic_kernel.connectors.ai.open_ai.realtime")
8892

8993

9094
# region utils
@@ -649,10 +653,6 @@ async def send(self, event: RealtimeEvents, **kwargs: Any) -> None:
649653

650654
@override
651655
def get_prompt_execution_settings_class(self) -> type["PromptExecutionSettings"]:
652-
from semantic_kernel.connectors.ai.open_ai.prompt_execution_settings.open_ai_realtime_execution_settings import ( # noqa
653-
OpenAIRealtimeExecutionSettings,
654-
)
655-
656656
return OpenAIRealtimeExecutionSettings
657657

658658
@override
@@ -725,14 +725,11 @@ async def create_session(
725725
try:
726726
ephemeral_token = await self._get_ephemeral_token()
727727
headers = {"Authorization": f"Bearer {ephemeral_token}", "Content-Type": "application/sdp"}
728+
headers = prepend_semantic_kernel_to_user_agent(headers)
728729

729730
async with (
730731
ClientSession() as session,
731-
session.post(
732-
f"{self.client.beta.realtime._client.base_url}realtime?model={self.ai_model_id}",
733-
headers=headers,
734-
data=offer.sdp,
735-
) as response,
732+
session.post(self._get_webrtc_url(), headers=headers, data=offer.sdp) as response,
736733
):
737734
if response.status not in [200, 201]:
738735
error_text = await response.text()
@@ -813,15 +810,13 @@ async def _on_data(self, data: str) -> None:
813810

814811
async def _get_ephemeral_token(self) -> str:
815812
"""Get an ephemeral token from OpenAI."""
816-
headers = {"Authorization": f"Bearer {self.client.api_key}", "Content-Type": "application/json"}
817-
data = {"model": self.ai_model_id, "voice": "echo"}
818-
813+
data = {"model": self.ai_model_id}
814+
headers, url = self._get_ephemeral_token_headers_and_url()
815+
headers = prepend_semantic_kernel_to_user_agent(headers)
819816
try:
820817
async with (
821818
ClientSession() as session,
822-
session.post(
823-
f"{self.client.beta.realtime._client.base_url}/realtime/sessions", headers=headers, json=data
824-
) as response,
819+
session.post(url, headers=headers, json=data) as response,
825820
):
826821
if response.status not in [200, 201]:
827822
error_text = await response.text()
@@ -834,6 +829,17 @@ async def _get_ephemeral_token(self) -> str:
834829
logger.error(f"Failed to get ephemeral token: {e!s}")
835830
raise
836831

832+
def _get_ephemeral_token_headers_and_url(self) -> tuple[dict[str, str], str]:
833+
"""Get the headers and URL for the ephemeral token request."""
834+
return {
835+
"Authorization": f"Bearer {self.client.api_key}",
836+
"Content-Type": "application/json",
837+
}, f"{self.client.beta.realtime._client.base_url}/realtime/sessions"
838+
839+
def _get_webrtc_url(self) -> str:
840+
"""Get the WebRTC URL."""
841+
return f"{self.client.beta.realtime._client.base_url}/realtime?model={self.ai_model_id}"
842+
837843

838844
# region Websocket
839845

@@ -888,7 +894,9 @@ async def create_session(
888894
**kwargs: Any,
889895
) -> None:
890896
"""Create a session in the service."""
891-
self.connection = await self.client.beta.realtime.connect(model=self.ai_model_id).enter()
897+
self.connection = await self.client.beta.realtime.connect(
898+
model=self.ai_model_id, extra_headers={USER_AGENT: SEMANTIC_KERNEL_USER_AGENT}
899+
).enter()
892900
self.connected.set()
893901
await self.update_session(settings=settings, chat_history=chat_history, **kwargs)
894902

python/semantic_kernel/connectors/ai/open_ai/services/azure_config_base.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,9 @@ def __init__(
105105
if deployment_name and ai_model_type != OpenAIModelTypes.REALTIME:
106106
args["azure_deployment"] = deployment_name
107107

108+
if "websocket_base_url" in kwargs:
109+
args["websocket_base_url"] = kwargs.pop("websocket_base_url")
110+
108111
client = AsyncAzureOpenAI(**args)
109112
args = {
110113
"ai_model_id": deployment_name,

0 commit comments

Comments
 (0)