diff --git a/packages/api/src/hooks/useSpeechToSpeech.ts b/packages/api/src/hooks/useSpeechToSpeech.ts
new file mode 100644
index 0000000000..4f529a2c08
--- /dev/null
+++ b/packages/api/src/hooks/useSpeechToSpeech.ts
@@ -0,0 +1,3 @@
+import useSpeechToSpeech from '../providers/SpeechToSpeech/useSpeechToSpeech';
+
+export default useSpeechToSpeech;
diff --git a/packages/api/src/hooks/useVoiceActivities.ts b/packages/api/src/hooks/useVoiceActivities.ts
new file mode 100644
index 0000000000..d65e142b17
--- /dev/null
+++ b/packages/api/src/hooks/useVoiceActivities.ts
@@ -0,0 +1,11 @@
+import { isVoiceActivity, type WebChatActivity } from 'botframework-webchat-core';
+import { useSelector } from './internal/WebChatReduxContext';
+
+const activitiesSelector = (state: { activities: WebChatActivity[] }) => state.activities;
+
+const of = (predicate: (activity: WebChatActivity) => boolean) => (state: { activities: WebChatActivity[] }) =>
+  activitiesSelector(state).filter(predicate);
+
+export default function useVoiceActivities(): [WebChatActivity[]] {
+  return [useSelector(of(activity => isVoiceActivity(activity)))];
+}
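For reference, the activities this hook selects are `event` activities carrying a `value.voiceLiveEvent` payload (see the `isVoiceActivity` guard added in `packages/core` below). A representative shape, assembled from the events handled elsewhere in this diff — the `id` and `from` fields are illustrative, not prescribed:

```ts
import type { WebChatActivity } from 'botframework-webchat-core';

// Illustrative only: a bot-originated audio delta as selected by useVoiceActivities.
const sampleVoiceActivity = {
  type: 'event',
  id: 'a-00001',
  from: { id: 'bot' },
  channelData: { 'webchat:sequence-id': 1 },
  value: {
    voiceLiveEvent: {
      type: 'response.audio.delta',
      delta: '…base64-encoded PCM16 audio…'
    }
  }
} as unknown as WebChatActivity;
```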
diff --git a/packages/api/src/providers/SpeechToSpeech/SpeechToSpeechComposer.tsx b/packages/api/src/providers/SpeechToSpeech/SpeechToSpeechComposer.tsx
new file mode 100644
index 0000000000..0ccf1a6f32
--- /dev/null
+++ b/packages/api/src/providers/SpeechToSpeech/SpeechToSpeechComposer.tsx
@@ -0,0 +1,134 @@
+import React, { useCallback, useEffect, useMemo, useRef, useState, type ReactNode } from 'react';
+import { isVoiceActivity, WebChatActivity } from 'botframework-webchat-core';
+import { useAudioPlayer } from './private/useAudioPlayer';
+import { useRecorder } from './private/useRecorder';
+import { useDebouncedNotifications, usePostActivity, useVoiceActivities } from '../../hooks';
+import SpeechToSpeechContext from './private/Context';
+import { SpeechState } from './types/SpeechState';
+
+export const SpeechToSpeechComposer: React.FC<{ readonly children: ReactNode }> = ({ children }) => {
+  const [voiceActivities] = useVoiceActivities();
+  const postActivity = usePostActivity();
+  const [{ connectivitystatus }] = useDebouncedNotifications();
+  const { playAudio, stopAudio, isPlaying } = useAudioPlayer();
+
+  const lastProcessedIndexRef = useRef(0);
+
+  // Remove once the activity protocol changes land; we would then get this as part of the signal activity.
+  const [speechState, setSpeechState] = useState<SpeechState>('idle');
+
+  const isConnected = useMemo(() => connectivitystatus?.message === 'connected', [connectivitystatus]);
+
+  const sendAudioChunk = useCallback(
+    (base64: string) => {
+      postActivity({
+        type: 'event',
+        name: 'stream.chunk',
+        value: { voiceLiveEvent: { type: 'input_audio_buffer.append', audio: base64 } }
+      } as any);
+    },
+    [postActivity]
+  );
+
+  const { recording, setRecording: baseSetRecording } = useRecorder(sendAudioChunk);
+
+  const cancelActiveResponse = useCallback(() => {
+    if (isPlaying) {
+      postActivity({
+        type: 'event',
+        value: { voiceLiveEvent: { type: 'response.cancel' } }
+      } as any);
+    }
+  }, [isPlaying, postActivity]);
+
+  const handleVoiceActivity = useCallback(
+    (activity: WebChatActivity) => {
+      if (!isVoiceActivity(activity)) {
+        return;
+      }
+
+      const { voiceLiveEvent } = activity.value;
+
+      switch (voiceLiveEvent.type) {
+        case 'input_audio_buffer.speech_started':
+          stopAudio();
+          setSpeechState('listening');
+          break;
+        case 'input_audio_buffer.speech_stopped':
+          setSpeechState('processing');
+          break;
+        case 'response.audio.delta':
+          if (voiceLiveEvent.delta && recording) {
+            playAudio(voiceLiveEvent.delta);
+          }
+          break;
+        case 'response.done':
+          if (!isPlaying) {
+            setSpeechState('listening');
+          }
+          break;
+        default:
+          break;
+      }
+    },
+    [isPlaying, playAudio, recording, stopAudio]
+  );
+
+  useEffect(() => {
+    const startIndex = lastProcessedIndexRef.current;
+
+    if (!voiceActivities.length || startIndex >= voiceActivities.length) {
+      return;
+    }
+
+    // If not recording, skip processing voice activities but update ref
+    // so next time we start recording, we only process new activities.
+    if (!recording) {
+      lastProcessedIndexRef.current = voiceActivities.length;
+      return;
+    }
+
+    for (let i = startIndex; i < voiceActivities.length; i++) {
+      // eslint-disable-next-line security/detect-object-injection
+      handleVoiceActivity(voiceActivities[i]);
+    }
+
+    if (isPlaying && speechState !== 'bot_speaking') {
+      setSpeechState('bot_speaking');
+    } else if (!isPlaying && speechState === 'bot_speaking') {
+      setSpeechState('listening');
+    }
+
+    lastProcessedIndexRef.current = voiceActivities.length;
+  }, [voiceActivities, recording, postActivity, isPlaying, playAudio, speechState, stopAudio, handleVoiceActivity]);
+
+  const setRecording = useCallback(
+    (shouldRecord: boolean) => {
+      if (!isConnected) {
+        return;
+      }
+
+      if (!recording) {
+        setSpeechState('listening');
+      } else {
+        stopAudio();
+        cancelActiveResponse();
+        setSpeechState('idle');
+      }
+
+      baseSetRecording(shouldRecord);
+    },
+    [isConnected, recording, baseSetRecording, stopAudio, cancelActiveResponse]
+  );
+
+  const contextValue = useMemo(
+    () => ({
+      recording,
+      setRecording,
+      speechState
+    }),
+    [recording, setRecording, speechState]
+  );
+
+  return <SpeechToSpeechContext.Provider value={contextValue}>{children}</SpeechToSpeechContext.Provider>;
+};
diff --git a/packages/api/src/providers/SpeechToSpeech/private/Context.ts b/packages/api/src/providers/SpeechToSpeech/private/Context.ts
new file mode 100644
index 0000000000..ce85310246
--- /dev/null
+++ b/packages/api/src/providers/SpeechToSpeech/private/Context.ts
@@ -0,0 +1,14 @@
+import { createContext } from 'react';
+import { SpeechState } from '../types/SpeechState';
+
+type SpeechToSpeechContextType = {
+  recording: boolean;
+  setRecording: (recording: boolean) => void;
+  speechState: SpeechState;
+};
+
+const SpeechToSpeechContext = createContext<SpeechToSpeechContextType>(undefined!);
+
+export default SpeechToSpeechContext;
+
+export type { SpeechToSpeechContextType };
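A minimal sketch of how a host might wire this provider up — `PushToTalkButton` and the import paths are hypothetical and not part of this change, and in a real host `<SpeechToSpeechComposer>` must itself render under Web Chat's API composer so that `usePostActivity` and the notification hooks resolve:

```tsx
import React from 'react';
import { SpeechToSpeechComposer } from './SpeechToSpeechComposer';
import useSpeechToSpeech from '../useSpeechToSpeech';

// Hypothetical push-to-talk toggle built on the new context.
const PushToTalkButton = () => {
  const [{ recording, setRecording, speechState }] = useSpeechToSpeech();

  return (
    <button onClick={() => setRecording(!recording)} type="button">
      {recording ? `Stop (${speechState})` : 'Talk'}
    </button>
  );
};

// Any component calling useSpeechToSpeech() must sit under the composer.
const SpeechPanel = () => (
  <SpeechToSpeechComposer>
    <PushToTalkButton />
  </SpeechToSpeechComposer>
);

export default SpeechPanel;
```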
diff --git a/packages/api/src/providers/SpeechToSpeech/private/useAudioPlayer.spec.tsx b/packages/api/src/providers/SpeechToSpeech/private/useAudioPlayer.spec.tsx
new file mode 100644
index 0000000000..8c1d42cb08
--- /dev/null
+++ b/packages/api/src/providers/SpeechToSpeech/private/useAudioPlayer.spec.tsx
@@ -0,0 +1,282 @@
+/** @jest-environment @happy-dom/jest-environment */
+///
+///
+
+import { render, type RenderResult } from '@testing-library/react';
+import React, { type ComponentType } from 'react';
+import { useAudioPlayer } from './useAudioPlayer';
+
+// Mock AudioContext and related APIs
+const mockAudioContext = {
+  sampleRate: 24000,
+  currentTime: 0,
+  destination: {},
+  state: 'running',
+  resume: jest.fn().mockResolvedValue(undefined),
+  close: jest.fn().mockResolvedValue(undefined),
+  createBuffer: jest.fn(),
+  createBufferSource: jest.fn()
+};
+
+const mockAudioBuffer = {
+  duration: 0.1, // 100ms
+  getChannelData: jest.fn().mockReturnValue(new Float32Array(2400))
+};
+
+const mockBufferSource = {
+  buffer: null,
+  connect: jest.fn(),
+  start: jest.fn(),
+  stop: jest.fn(),
+  disconnect: jest.fn(),
+  onended: null
+};
+
+// Mock global AudioContext
+global.AudioContext = jest.fn(() => mockAudioContext) as any;
+global.atob = jest.fn(str => str); // Simple mock for base64 decode
+
+type UseAudioPlayerReturn = ReturnType<typeof useAudioPlayer>;
+
+describe('useAudioPlayer', () => {
+  let HookApp: ComponentType;
+  let hookData: UseAudioPlayerReturn | undefined;
+  let renderResult: RenderResult;
+  const originalAudioContext = global.AudioContext;
+
+  beforeEach(() => {
+    jest.clearAllMocks();
+    mockAudioContext.currentTime = 0;
+    mockAudioContext.createBuffer.mockReturnValue(mockAudioBuffer);
+    mockAudioContext.createBufferSource.mockReturnValue(mockBufferSource);
+    mockBufferSource.buffer = null;
+    mockBufferSource.onended = null;
+
+    HookApp = () => {
+      hookData = useAudioPlayer();
+      return null;
+    };
+  });
+
+  afterEach(() => {
+    global.AudioContext = originalAudioContext;
+  });
+
+  describe('Initialization', () => {
+    test('should initialize with correct default values', () => {
+      render(<HookApp />);
+
+      expect(hookData?.isPlaying).toBe(false);
+      expect(typeof hookData?.playAudio).toBe('function');
+      expect(typeof hookData?.stopAudio).toBe('function');
+    });
+
+    test('should create AudioContext on first playAudio call', () => {
+      render(<HookApp />);
+
+      hookData?.playAudio('dGVzdA=='); // base64 for 'test'
+
+      expect(AudioContext).toHaveBeenCalledWith({ sampleRate: 24000 });
+    });
+
+    test('should reuse existing AudioContext on subsequent calls', () => {
+      render(<HookApp />);
+
+      hookData?.playAudio('dGVzdA==');
+      hookData?.playAudio('dGVzdDI=');
+
+      expect(AudioContext).toHaveBeenCalledTimes(1);
+    });
+  });
+
+  describe('Audio playback', () => {
+    beforeEach(() => {
+      renderResult = render(<HookApp />);
+    });
+
+    test('should process base64 audio data correctly', () => {
+      hookData?.playAudio('dGVzdA==');
+
+      expect(global.atob).toHaveBeenCalledWith('dGVzdA==');
+      expect(mockAudioContext.createBuffer).toHaveBeenCalledWith(1, expect.any(Number), 24000);
+      expect(mockAudioContext.createBufferSource).toHaveBeenCalled();
+    });
+
+    test('should set up audio buffer source correctly', () => {
+      hookData?.playAudio('dGVzdA==');
+
+      expect(mockBufferSource.connect).toHaveBeenCalledWith(mockAudioContext.destination);
+      expect(mockBufferSource.start).toHaveBeenCalled();
+      expect(mockBufferSource.buffer).toBe(mockAudioBuffer);
+    });
+
+    test('should resume AudioContext if needed', () => {
+      hookData?.playAudio('dGVzdA==');
+
+      expect(mockAudioContext.resume).toHaveBeenCalled();
+    });
+
+    test('should queue multiple audio chunks correctly', () => {
+      mockAudioBuffer.duration = 0.1; // 100ms
+
+      hookData?.playAudio('dGVzdA==');
+      hookData?.playAudio('dGVzdDI=');
+
+      expect(mockBufferSource.start).toHaveBeenCalledTimes(2);
+      // First chunk starts at currentTime (0), second at 0.1
+      expect(mockBufferSource.start).toHaveBeenNthCalledWith(1, 0);
+      expect(mockBufferSource.start).toHaveBeenNthCalledWith(2, 0.1);
+    });
+  });
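+
+  // isPlaying is derived from refs during render and does not trigger a re-render on its
+  // own, so the tests below call rerender() to re-read it after playAudio()/stopAudio().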
+
+  describe('isPlaying state', () => {
+    beforeEach(() => {
+      renderResult = render(<HookApp />);
+    });
+
+    test('should return true when audio is queued for playback', () => {
+      mockAudioContext.currentTime = 0;
+      mockAudioBuffer.duration = 0.1;
+
+      hookData?.playAudio('dGVzdA==');
+      renderResult.rerender(<HookApp />);
+
+      expect(hookData?.isPlaying).toBe(true);
+    });
+
+    test('should return false when no audio is queued', () => {
+      expect(hookData?.isPlaying).toBe(false);
+    });
+
+    test('should handle multiple chunks and playing state', () => {
+      mockAudioContext.currentTime = 0.05; // In the middle of first chunk
+      mockAudioBuffer.duration = 0.1;
+
+      hookData?.playAudio('dGVzdA=='); // 0 - 0.1
+      hookData?.playAudio('dGVzdDI='); // 0.1 - 0.2
+      renderResult.rerender(<HookApp />);
+
+      expect(hookData?.isPlaying).toBe(true);
+    });
+  });
+
+  describe('Audio cleanup', () => {
+    beforeEach(() => {
+      renderResult = render(<HookApp />);
+    });
+
+    test('should clean up buffer source on ended', () => {
+      hookData?.playAudio('dGVzdA==');
+
+      // Simulate audio ended
+      if (mockBufferSource.onended) {
+        mockBufferSource.onended();
+      }
+
+      expect(mockBufferSource.disconnect).toHaveBeenCalled();
+      expect(mockBufferSource.buffer).toBeNull();
+    });
+
+    test('should stop all audio and close context', () => {
+      hookData?.playAudio('dGVzdA==');
+
+      hookData?.stopAudio();
+      renderResult.rerender(<HookApp />);
+
+      expect(mockAudioContext.close).toHaveBeenCalled();
+      expect(hookData?.isPlaying).toBe(false);
+    });
+  });
+
+  describe('Error handling', () => {
+    beforeEach(() => {
+      renderResult = render(<HookApp />);
+    });
+
+    test('should handle invalid base64 data gracefully', () => {
+      expect(() => {
+        hookData?.playAudio('invalid-base64!@#');
+      }).not.toThrow();
+    });
+
+    test('should handle AudioContext creation failure', () => {
+      global.AudioContext = jest.fn(() => {
+        throw new Error('AudioContext not supported');
+      }) as any;
+
+      expect(() => {
+        hookData?.playAudio('dGVzdA==');
+      }).toThrow('AudioContext not supported');
+    });
+
+    test('should handle missing audio context in isPlaying', () => {
+      // Before any audio is played, audioCtxRef should be null
+      expect(hookData?.isPlaying).toBe(false);
+    });
+  });
+
+  describe('Real-world scenarios', () => {
+    beforeEach(() => {
+      renderResult = render(<HookApp />);
+    });
+
+    test('should handle streaming audio chunks', () => {
+      mockAudioBuffer.duration = 0.05; // 50ms chunks
+
+      // Simulate streaming 5 chunks
+      for (let i = 0; i < 5; i++) {
+        hookData?.playAudio(`chunk${i}`);
+      }
+
+      expect(mockBufferSource.start).toHaveBeenCalledTimes(5);
+      renderResult.rerender(<HookApp />);
+      expect(hookData?.isPlaying).toBe(true);
+    });
+
+    test('should handle playback interruption', () => {
+      hookData?.playAudio('dGVzdA==');
+      renderResult.rerender(<HookApp />);
+      expect(hookData?.isPlaying).toBe(true);
+
+      hookData?.stopAudio();
+      renderResult.rerender(<HookApp />);
+      expect(hookData?.isPlaying).toBe(false);
+      expect(mockAudioContext.close).toHaveBeenCalled();
+    });
+
+    test('should handle resume after stop', () => {
+      // Play, stop, then play again
+      hookData?.playAudio('dGVzdA==');
+      hookData?.stopAudio();
+      hookData?.playAudio('dGVzdDI=');
+
+      expect(AudioContext).toHaveBeenCalledTimes(2); // New context after stop
+    });
+  });
+
+  describe('Performance considerations', () => {
+    beforeEach(() => {
+      renderResult = render(<HookApp />);
+    });
+
+    test('should handle large audio data', () => {
+      const largeBase64 = 'A'.repeat(10000);
+
+      expect(() => {
+        hookData?.playAudio(largeBase64);
+      }).not.toThrow();
+    });
+
+    test('should handle rapid successive calls', () => {
+      for (let i = 0; i < 100; i++) {
+        // Ensure the mock "base64" data has an even length as Int16Array (which represents 16-bit audio samples) requires the underlying data to be in multiples of 2 bytes
+        hookData?.playAudio(`chunk${i}`.padEnd(8, ' '));
+      }
+
+      expect(mockBufferSource.start).toHaveBeenCalledTimes(100);
+    });
+  });
+});
diff --git a/packages/api/src/providers/SpeechToSpeech/private/useAudioPlayer.ts b/packages/api/src/providers/SpeechToSpeech/private/useAudioPlayer.ts
new file mode 100644
index 0000000000..6216932a8c
--- /dev/null
+++ b/packages/api/src/providers/SpeechToSpeech/private/useAudioPlayer.ts
@@ -0,0 +1,69 @@
+import { useRef, useCallback } from 'react';
+
+const SAMPLE_RATE = 24000;
+const INT16_SCALE = 32768;
+
+export function useAudioPlayer() {
+  const audioCtxRef = useRef<AudioContext | null>(null);
+  const nextPlayTimeRef = useRef(0);
+
+  const initAudio = useCallback(() => {
+    if (!audioCtxRef.current) {
+      audioCtxRef.current = new AudioContext({ sampleRate: SAMPLE_RATE });
+    }
+    return audioCtxRef.current;
+  }, []);
+
+  const playAudio = useCallback(
+    (base64: string) => {
+      const audioCtx = initAudio();
+      audioCtx.resume?.();
+
+      try {
+        const bytes = Uint8Array.from(atob(base64), c => c.charCodeAt(0));
+        const int16 = new Int16Array(bytes.buffer);
+        const float32 = new Float32Array(int16.length);
+
+        for (let i = 0; i < int16.length; i++) {
+          // eslint-disable-next-line security/detect-object-injection
+          float32[i] = int16[i] / INT16_SCALE;
+        }
+
+        const buffer = audioCtx.createBuffer(1, float32.length, SAMPLE_RATE);
+        buffer.getChannelData(0).set(float32);
+
+        const src = audioCtx.createBufferSource();
+        src.buffer = buffer;
+        src.connect(audioCtx.destination);
+
+        // Clear buffer when finished
+        src.onended = () => {
+          src.disconnect();
+          src.buffer = null;
+        };
+
+        nextPlayTimeRef.current = Math.max(nextPlayTimeRef.current, audioCtx.currentTime);
+        src.start(nextPlayTimeRef.current);
+        nextPlayTimeRef.current += buffer.duration;
+      } catch (error) {
+        console.warn('botframework-webchat: Error during audio playback in useAudioPlayer:', error);
+      }
+    },
+    [initAudio]
+  );
+
+  const stopAudio = useCallback(() => {
+    nextPlayTimeRef.current = 0;
+
+    if (audioCtxRef.current) {
+      audioCtxRef.current.close();
+      audioCtxRef.current = null;
+    }
+  }, []);
+
+  return {
+    playAudio,
+    stopAudio,
+    isPlaying: audioCtxRef.current ? audioCtxRef.current.currentTime < nextPlayTimeRef.current : false
+  };
+}
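The playback path assumes 24 kHz mono 16-bit little-endian PCM. As a standalone sanity check of the decode step above (not part of the diff): the `nextPlayTimeRef` bookkeeping schedules each decoded buffer at the tail of the previous one, or at `currentTime` once playback has caught up, which is what keeps streamed chunks gapless.

```ts
// Decode one PCM16 sample the way playAudio does (platform-typical little-endian).
const bytes = Uint8Array.from([0x00, 0xc0]); // 0xc000 as little-endian Int16 === -16384
const [sample] = new Int16Array(bytes.buffer);

console.log(sample); // -16384
console.log(sample / 32768); // -0.5 — the normalized Float32 value

// Duration follows from the sample rate: a 4800-byte chunk holds 2400 samples,
// and 2400 / 24000 Hz = 0.1 s of audio per chunk.
```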
diff --git a/packages/api/src/providers/SpeechToSpeech/private/useContext.ts b/packages/api/src/providers/SpeechToSpeech/private/useContext.ts
new file mode 100644
index 0000000000..50926b0a12
--- /dev/null
+++ b/packages/api/src/providers/SpeechToSpeech/private/useContext.ts
@@ -0,0 +1,15 @@
+import { useContext } from 'react';
+
+import SpeechToSpeechContext from './Context';
+
+import type { SpeechToSpeechContextType } from './Context';
+
+export default function useSpeechToSpeechContext(thrownOnUndefined = true): SpeechToSpeechContextType {
+  const contextValue = useContext(SpeechToSpeechContext);
+
+  if (thrownOnUndefined && !contextValue) {
+    throw new Error('botframework-webchat internal: This hook can only be used under <SpeechToSpeechComposer>.');
+  }
+
+  return contextValue;
+}
diff --git a/packages/api/src/providers/SpeechToSpeech/private/useRecorder.spec.tsx b/packages/api/src/providers/SpeechToSpeech/private/useRecorder.spec.tsx
new file mode 100644
index 0000000000..01368ceda2
--- /dev/null
+++ b/packages/api/src/providers/SpeechToSpeech/private/useRecorder.spec.tsx
@@ -0,0 +1,160 @@
+/** @jest-environment @happy-dom/jest-environment */
+///
+
+import { act, render, waitFor, type RenderResult } from '@testing-library/react';
+import React, { type ComponentType } from 'react';
+import { useRecorder } from './useRecorder';
+
+// --- Mocks ---
+
+const mockTrack = {
+  stop: jest.fn()
+};
+
+const mockMediaStream = {
+  getTracks: jest.fn(() => [mockTrack])
+};
+
+const mockMediaDevices = {
+  getUserMedia: jest.fn().mockResolvedValue(mockMediaStream)
+};
+
+const mockWorkletPort = {
+  postMessage: jest.fn(),
+  onmessage: null as ((event: { data: any }) => void) | null
+};
+
+const mockWorkletNode = {
+  connect: jest.fn(),
+  disconnect: jest.fn(),
+  port: mockWorkletPort
+};
+
+const mockAudioContext = {
+  state: 'running',
+  resume: jest.fn().mockResolvedValue(undefined),
+  createMediaStreamSource: jest.fn(() => ({
+    connect: jest.fn()
+  })),
+  destination: {},
+  audioWorklet: {
+    addModule: jest.fn().mockResolvedValue(undefined)
+  }
+};
+
+// --- Global Mocks Setup ---
+
+Object.defineProperty(global.navigator, 'mediaDevices', {
+  value: mockMediaDevices,
+  writable: true
+});
+
+global.AudioContext = jest.fn(() => mockAudioContext as any);
+global.AudioWorkletNode = jest.fn(() => mockWorkletNode as any);
+global.Blob = jest.fn((parts, options) => ({ parts, type: options?.type })) as any;
+global.URL.createObjectURL = jest.fn(() => 'blob:http://localhost/mock-url');
+global.URL.revokeObjectURL = jest.fn();
+global.btoa = jest.fn(str => `btoa(${str})`);
+
+// --- Tests ---
+
+describe('useRecorder', () => {
+  let onAudioChunk: jest.Mock;
+  let HookApp: ComponentType<{ onAudioChunk: (base64: string) => void }>;
+  let hookData: ReturnType<typeof useRecorder> | undefined;
+  // eslint-disable-next-line @typescript-eslint/no-unused-vars
+  let renderResult: RenderResult;
+
+  beforeEach(() => {
+    // Clear all mocks before each test
+    jest.clearAllMocks();
+    onAudioChunk = jest.fn();
+    hookData = undefined;
+    mockWorkletPort.onmessage = null;
+    (mockAudioContext.state as any) = 'running';
+
+    HookApp = ({ onAudioChunk }) => {
+      hookData = useRecorder(onAudioChunk);
+      return null;
+    };
+  });
+
+  test('should be initially not recording', () => {
+    render(<HookApp onAudioChunk={onAudioChunk} />);
+    expect(hookData?.recording).toBe(false);
+  });
+
+  test('should start recording when setRecording(true) is called', async () => {
+    renderResult = render(<HookApp onAudioChunk={onAudioChunk} />);
+
+    act(() => {
+      hookData?.setRecording(true);
+    });
+
+    await waitFor(() => expect(hookData?.recording).toBe(true));
+
+    expect(navigator.mediaDevices.getUserMedia).toHaveBeenCalledTimes(1);
+    expect(global.AudioContext).toHaveBeenCalledTimes(1);
+    expect(mockAudioContext.audioWorklet.addModule).toHaveBeenCalledTimes(1);
+    expect(global.AudioWorkletNode).toHaveBeenCalledWith(expect.anything(), 'audio-recorder');
+    expect(mockWorkletNode.connect).toHaveBeenCalledTimes(1);
+    expect(mockWorkletPort.postMessage).toHaveBeenCalledWith({ command: 'START' });
+  });
+
+  test('should stop recording when setRecording(false) is called', async () => {
+    renderResult = render(<HookApp onAudioChunk={onAudioChunk} />);
+
+    // Start recording
+    act(() => {
+      hookData?.setRecording(true);
+    });
+
+    await waitFor(() => expect(hookData?.recording).toBe(true));
+
+    // Stop recording
+    act(() => {
+      hookData?.setRecording(false);
+    });
+
+    await waitFor(() => expect(hookData?.recording).toBe(false));
+
+    expect(mockWorkletPort.postMessage).toHaveBeenCalledWith({ command: 'STOP' });
+    expect(mockWorkletNode.disconnect).toHaveBeenCalledTimes(1);
+    expect(mockTrack.stop).toHaveBeenCalledTimes(1);
+  });
+
+  test('should process audio chunks sent from the worklet', async () => {
+    render(<HookApp onAudioChunk={onAudioChunk} />);
+
+    act(() => {
+      hookData?.setRecording(true);
+    });
+
+    await waitFor(() => expect(mockWorkletPort.onmessage).not.toBeNull());
+
+    // Simulate a message from the audio worklet
+    const mockAudioData = new Float32Array([0.1, 0.2, -0.1]);
+    act(() => {
+      mockWorkletPort.onmessage!({
+        data: {
+          eventType: 'audio',
+          audioData: mockAudioData
+        }
+      });
+    });
+
+    await waitFor(() => expect(onAudioChunk).toHaveBeenCalledTimes(1));
+    expect(global.btoa).toHaveBeenCalled();
+  });
+
+  test('should handle suspended audio context by resuming it', async () => {
+    (mockAudioContext.state as any) = 'suspended';
+    render(<HookApp onAudioChunk={onAudioChunk} />);
+
+    act(() => {
+      hookData?.setRecording(true);
+    });
+
+    await waitFor(() => expect(mockAudioContext.resume).toHaveBeenCalledTimes(1));
+  });
+});
diff --git a/packages/api/src/providers/SpeechToSpeech/private/useRecorder.ts b/packages/api/src/providers/SpeechToSpeech/private/useRecorder.ts
new file mode 100644
index 0000000000..b9930cada1
--- /dev/null
+++ b/packages/api/src/providers/SpeechToSpeech/private/useRecorder.ts
@@ -0,0 +1,128 @@
+import { useRef, useState, useCallback } from 'react';
+
+const audioProcessorCode = `
+class AudioRecorderProcessor extends AudioWorkletProcessor {
+  constructor() {
+    super()
+    this.recording = false
+    this.buffer = []
+    this.port.onmessage = e => {
+      if (e.data.command === 'START') this.recording = true
+      else if (e.data.command === 'STOP') {
+        this.recording = false
+        if (this.buffer.length) this.sendBuffer()
+      }
+    }
+  }
+  sendBuffer() {
+    if (this.buffer.length) {
+      this.port.postMessage({
+        eventType: 'audio',
+        audioData: new Float32Array(this.buffer)
+      })
+      this.buffer = []
+    }
+  }
+  process(inputs) {
+    if (inputs[0]?.length && this.recording) {
+      this.buffer.push(...inputs[0][0])
+      if (this.buffer.length >= 2400) this.sendBuffer()
+    }
+    return true
+  }
+}
+registerProcessor('audio-recorder', AudioRecorderProcessor)
+`;
+
+const INT16_MIN = -32768;
+const INT16_MAX = 32767;
+const INT16_SCALE = 32767;
+
+export function useRecorder(onAudioChunk: (base64: string) => void) {
+  const [recording, setRecordingInternal] = useState(false);
+  const audioCtxRef = useRef<AudioContext | null>(null);
+  const workletRef = useRef<AudioWorkletNode | null>(null);
+  const streamRef = useRef<MediaStream | null>(null);
+
+  const initAudio = useCallback(async () => {
+    if (audioCtxRef.current) {
+      return;
+    }
+    const audioCtx = new AudioContext({ sampleRate: 24000 });
+    const blob = new Blob([audioProcessorCode], {
+      type: 'application/javascript'
+    });
+    // eslint-disable-next-line no-restricted-properties
+    const url = URL.createObjectURL(blob);
+    await audioCtx.audioWorklet.addModule(url);
+    URL.revokeObjectURL(url);
+    // eslint-disable-next-line require-atomic-updates
+    audioCtxRef.current = audioCtx;
+  }, []);
+
+  const startRecording = useCallback(async () => {
+    await initAudio();
+    const audioCtx = audioCtxRef.current!;
+    if (audioCtx.state === 'suspended') {
+      await audioCtx.resume();
+    }
+    const stream = await navigator.mediaDevices.getUserMedia({
+      audio: {
+        channelCount: 1,
+        sampleRate: 24000,
+        echoCancellation: true
+      }
+    });
+    streamRef.current = stream;
+    const source = audioCtx.createMediaStreamSource(stream);
+    const worklet = new AudioWorkletNode(audioCtx, 'audio-recorder');
+
+    worklet.port.onmessage = e => {
+      if (e.data.eventType === 'audio') {
+        const float32 = e.data.audioData;
+        const int16 = new Int16Array(float32.length);
+        for (let i = 0; i < float32.length; i++) {
+          // eslint-disable-next-line security/detect-object-injection
+          int16[i] = Math.max(INT16_MIN, Math.min(INT16_MAX, float32[i] * INT16_SCALE));
+        }
+        const base64 = btoa(String.fromCharCode(...new Uint8Array(int16.buffer)));
+        onAudioChunk(base64);
+      }
+    };
+
+    source.connect(worklet);
+    worklet.connect(audioCtx.destination);
+    worklet.port.postMessage({ command: 'START' });
+    workletRef.current = worklet;
+    setRecordingInternal(true);
+  }, [initAudio, onAudioChunk]);
+
+  const stopRecording = useCallback(() => {
+    if (workletRef.current) {
+      workletRef.current.port.postMessage({ command: 'STOP' });
+      workletRef.current.disconnect();
+      workletRef.current = null;
+    }
+    if (streamRef.current) {
+      streamRef.current.getTracks().forEach(track => track.stop());
+      streamRef.current = null;
+    }
+    setRecordingInternal(false);
+  }, []);
+
+  const setRecording = useCallback(
+    async (shouldRecord: boolean) => {
+      if (!shouldRecord && recording) {
+        stopRecording();
+      } else if (shouldRecord && !recording) {
+        await startRecording();
+      }
+    },
+    [recording, startRecording, stopRecording]
+  );
+
+  return {
+    recording,
+    setRecording
+  };
+}
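The capture path is the mirror image of playback: Float32 samples from the worklet are clamped, scaled to Int16 and base64-encoded. A standalone sketch of that step under the same constants (the values are worked examples, not part of the diff):

```ts
const INT16_MIN = -32768;
const INT16_MAX = 32767;
const INT16_SCALE = 32767;

// Clamp-and-scale exactly as the worklet message handler does.
const encode = (sample: number): number =>
  Math.max(INT16_MIN, Math.min(INT16_MAX, sample * INT16_SCALE));

console.log(encode(0.5)); // 16383.5 — truncated to 16383 when stored in an Int16Array
console.log(encode(1.2)); // 32767 — out-of-range input clamps at the positive rail
console.log(encode(-1.2)); // -32768 — and at the negative rail

// The worklet flushes every 2400 samples — one 100 ms chunk at 24 kHz — which is
// the cadence of the 'input_audio_buffer.append' events posted by the composer.
```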
diff --git a/packages/api/src/providers/SpeechToSpeech/types/SpeechState.ts b/packages/api/src/providers/SpeechToSpeech/types/SpeechState.ts
new file mode 100644
index 0000000000..62d5cc8c13
--- /dev/null
+++ b/packages/api/src/providers/SpeechToSpeech/types/SpeechState.ts
@@ -0,0 +1 @@
+export type SpeechState = 'idle' | 'listening' | 'processing' | 'bot_speaking';
diff --git a/packages/api/src/providers/SpeechToSpeech/useSpeechToSpeech.ts b/packages/api/src/providers/SpeechToSpeech/useSpeechToSpeech.ts
new file mode 100644
index 0000000000..d7ac3fac44
--- /dev/null
+++ b/packages/api/src/providers/SpeechToSpeech/useSpeechToSpeech.ts
@@ -0,0 +1,6 @@
+import { SpeechToSpeechContextType } from './private/Context';
+import useSpeechToSpeechContext from './private/useContext';
+
+export default function useSpeechToSpeech(): readonly [SpeechToSpeechContextType] {
+  return [useSpeechToSpeechContext()];
+}
diff --git a/packages/core/src/index.ts b/packages/core/src/index.ts
index e635e6a060..a81d494a07 100644
--- a/packages/core/src/index.ts
+++ b/packages/core/src/index.ts
@@ -42,6 +42,7 @@ import getOrgSchemaMessage from './utils/getOrgSchemaMessage';
 import isForbiddenPropertyName from './utils/isForbiddenPropertyName';
 import onErrorResumeNext from './utils/onErrorResumeNext';
 import singleToArray from './utils/singleToArray';
+import isVoiceActivity from './utils/voiceActivity/isVoiceActivity';
 
 export {
   CLEAR_SUGGESTED_ACTIONS,
@@ -96,6 +97,7 @@ export {
   getActivityLivestreamingMetadata,
   getOrgSchemaMessage,
   isForbiddenPropertyName,
+  isVoiceActivity,
   markActivity,
   onErrorResumeNext,
   parseAction,
diff --git a/packages/core/src/utils/voiceActivity/isVoiceActivity.spec.ts b/packages/core/src/utils/voiceActivity/isVoiceActivity.spec.ts
new file mode 100644
index 0000000000..c8d744595e
--- /dev/null
+++ b/packages/core/src/utils/voiceActivity/isVoiceActivity.spec.ts
@@ -0,0 +1,88 @@
+import isVoiceActivity from './isVoiceActivity';
+import { WebChatActivity } from '../../types/WebChatActivity';
+
+// Mock activity factory for testing
+const createMockActivity = (type: string = 'event', value?: any): WebChatActivity => ({
+  type: type as any,
+  id: 'test-activity-id',
+  from: { id: 'test-user' },
+  channelData: {
+    'webchat:sequence-id': 1
+  },
+  ...(value && { value })
+});
+
+const createMockVoiceActivity = (voiceEventType: string, additionalProps?: any): WebChatActivity =>
+  createMockActivity('event', {
+    voiceLiveEvent: {
+      type: voiceEventType,
+      ...additionalProps
+    }
+  });
+
+describe('isVoiceActivity', () => {
+  describe('Valid voice activities', () => {
+    test('should return true for event activity with voiceLiveEvent', () => {
+      const activity = createMockVoiceActivity('response.audio.delta', { delta: 'audiodata' });
+
+      const result = isVoiceActivity(activity);
+
+      expect(result).toBe(true);
+    });
+
+    test('should return true for voice activity with minimal voiceLiveEvent', () => {
+      const activity = createMockActivity('event', { voiceLiveEvent: {} });
+
+      const result = isVoiceActivity(activity);
+
+      expect(result).toBe(true);
+    });
+  });
+
+  describe('Invalid activities', () => {
+    const testCases = [
+      // Invalid by activity type
+      {
+        name: 'message activity with voiceLiveEvent',
+        activity: () => createMockActivity('message', { voiceLiveEvent: { type: 'response.audio.delta' } })
+      },
+      {
+        name: 'typing activity',
+        activity: () => createMockActivity('typing')
+      },
+      {
+        name: 'event activity with non-object value',
+        activity: () => ({ ...createMockActivity('event'), value: 'not an object' })
+      }
+    ];
+
+    test.each(testCases)('should return false for $name', ({ activity }) => {
+      const result = isVoiceActivity(activity());
+
+      expect(result).toBe(false);
+    });
+  });
+
+  describe('Real-world voice event types', () => {
+    const voiceEventTypes = [
+      'input_audio_buffer.append',
+      'input_audio_buffer.speech_started',
+      'input_audio_buffer.speech_stopped',
+      'conversation.item.input_audio_transcription.completed',
+      'response.audio.delta',
+      'response.audio_transcript.delta',
+      'response.audio_transcript.done',
+      'response.done',
+      'session.update',
+      'response.cancel'
+    ];
+
+    test.each(voiceEventTypes)('should return true for voice event type: %s', eventType => {
+      const activity = createMockVoiceActivity(eventType);
+
+      const result = isVoiceActivity(activity);
+
+      expect(result).toBe(true);
+    });
+  });
+});
diff --git a/packages/core/src/utils/voiceActivity/isVoiceActivity.ts b/packages/core/src/utils/voiceActivity/isVoiceActivity.ts
new file mode 100644
index 0000000000..e16154e590
--- /dev/null
+++ b/packages/core/src/utils/voiceActivity/isVoiceActivity.ts
@@ -0,0 +1,14 @@
+import { WebChatActivity } from '../../types/WebChatActivity';
+
+// This is an interim type guard until the activity protocol is ratified.
+const isVoiceActivity = (
+  activity: WebChatActivity
+): activity is WebChatActivity & {
+  value: { voiceLiveEvent: any };
+} =>
+  activity.type === 'event' &&
+  !!activity.value &&
+  typeof activity.value === 'object' &&
+  'voiceLiveEvent' in activity.value;
+
+export default isVoiceActivity;
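A small usage sketch of the new guard — the narrowing is what lets callers read `value.voiceLiveEvent` without casting (the function below is illustrative, not part of the diff):

```ts
import { isVoiceActivity, type WebChatActivity } from 'botframework-webchat-core';

function describeActivity(activity: WebChatActivity): string {
  if (isVoiceActivity(activity)) {
    // Narrowed: activity.value.voiceLiveEvent is safely accessible here.
    return `voice event: ${activity.value.voiceLiveEvent.type}`;
  }

  return `non-voice activity: ${activity.type}`;
}
```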