diff --git a/README.md b/README.md
index 12576326..c4b7c454 100644
--- a/README.md
+++ b/README.md
@@ -32,6 +32,7 @@ This repository contains Swift community-maintained implementation over [OpenAI]
 - [Audio Create Speech](#audio-create-speech)
 - [Audio Transcriptions](#audio-transcriptions)
 - [Audio Translations](#audio-translations)
+ - [Audio Chat](#audio-chat-gpt-4o-audio-preview)
 - [Structured Outputs](#structured-outputs)
 - [Specialized models](#specialized-models)
 - [Embeddings](#embeddings)
@@ -735,6 +736,136 @@ openAI.audioTranslations(query: query) { result in
 let result = try await openAI.audioTranslations(query: query)
 ```
+### Audio Chat (gpt-4o-audio-preview)
+
+The Audio Chat API enables audio-to-audio conversations with GPT-4o Audio models. This replaces the traditional STT→Chat→TTS pipeline with a single API call, providing 2-3x faster response times and better voice quality.
+
+**Supported Models:** `gpt-4o-audio-preview`, `gpt-4o-mini-audio-preview`
+
+**Important Format Requirements:**
+- **Input audio formats:** Only `wav` and `mp3` are supported
+- **Output audio formats:** `wav`, `mp3`, `flac`, `opus`, `pcm16`
+- **Recommended for streaming:** Use `pcm16` for output to get optimal streaming performance
+
+**Request:**
+
+```swift
+public struct AudioChatQuery: Codable, Equatable, Streamable, Sendable {
+    public let model: Model
+    public let messages: [Message]
+    public let modalities: [Modality]?   // [.text, .audio]
+    public let audio: AudioConfig?
+    public var stream: Bool
+
+    public struct AudioConfig {
+        public let voice: Voice          // .alloy, .echo, .fable, .onyx, .nova, .shimmer
+        public let format: AudioFormat   // .wav, .mp3, .flac, .opus, .pcm16
+    }
+
+    public struct Message {
+        public let role: ChatQuery.ChatCompletionMessageParam.Role
+        public let content: Content      // .text(String) or .parts([ContentPart])
+    }
+}
+
+public enum Modality: String, Codable, Sendable {
+    case text
+    case audio
+}
+```
+
+**Response:**
+
+```swift
+public struct AudioChatResult: Codable, Equatable, Sendable {
+    public let id: String
+    public let choices: [Choice]
+
+    public struct Choice {
+        public let message: Message
+    }
+
+    public struct Message {
+        public let content: String?     // Text transcript
+        public let audio: AudioOutput?
+        // Audio output: base64-encoded data and transcript
+    }
+
+    public struct AudioOutput {
+        public let data: String         // Base64-encoded audio
+        public let transcript: String
+    }
+}
+```
+
+**Example (Non-Streaming):**
+
+```swift
+let audioData = try Data(contentsOf: audioFileURL)
+let base64Audio = audioData.base64EncodedString()
+
+let query = AudioChatQuery(
+    model: .gpt_4o_audio_preview,
+    messages: [
+        .init(role: .system, content: .text("You are a helpful voice assistant.")),
+        .init(role: .user, content: .parts([
+            .init(inputAudio: .init(data: base64Audio, format: .wav))
+        ]))
+    ],
+    modalities: [.text, .audio],
+    audio: .init(voice: .alloy, format: .pcm16)
+)
+
+let result = try await openAI.audioChats(query: query)
+if let audioOutput = result.choices.first?.message.audio,
+   let audioData = Data(base64Encoded: audioOutput.data) {
+    // Use audioData and audioOutput.transcript
+}
+```
+
+**Example (Streaming):**
+
+```swift
+for try await chunk in openAI.audioChatsStream(query: query) {
+    if let audioDelta = chunk.choices.first?.delta.audio?.data,
+       let audioChunk = Data(base64Encoded: audioDelta) {
+        // Play audio chunk in real-time
+    }
+}
+```
+
+**AudioConversationManager Utility:**
+
+The SDK includes a convenient `AudioConversationManager` actor for managing multi-turn conversations with automatic history tracking:
+
+```swift
+let manager = AudioConversationManager(
+    openAI: openAI,
+    systemPrompt: "You are a helpful voice assistant.",
+    maxHistoryTurns: 10
+)
+
+// Send audio and get audio response
+let (responseAudio, responseTranscript) = try await manager.sendAudio(
+    audioData,
+    audioFormat: .wav,
+    voice: .alloy,
+    responseFormat: .pcm16
+)
+
+// Send text and get audio response
+let (textAudio, textTranscript) = try await manager.sendText(
+    "What's the weather like?",
+    voice: .alloy,
+    responseFormat: .pcm16
+)
+
+// Get conversation transcript
+let transcript = await manager.getTranscript()
+
+// Reset conversation
+await manager.reset()
+```
+
 Review [Audio Documentation](https://platform.openai.com/docs/api-reference/audio) for more info.
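**Playing Streamed PCM16 Audio (sketch):**

The streaming example above recommends `pcm16` output, but raw PCM cannot be handed to `AVAudioPlayer` directly. The sketch below is one way to schedule decoded chunks with `AVAudioEngine`; it assumes the API returns 16-bit little-endian mono samples at 24 kHz (verify against the current OpenAI audio docs), and `PCM16StreamPlayer` is a hypothetical helper, not a type shipped with this SDK. On iOS, configure `AVAudioSession` for playback before starting the engine.

```swift
import AVFoundation

/// Schedules raw PCM16 chunks for playback as they arrive from the stream.
final class PCM16StreamPlayer {
    private let engine = AVAudioEngine()
    private let player = AVAudioPlayerNode()
    // AVAudioPlayerNode expects non-interleaved Float32, so incoming Int16
    // samples are converted before being scheduled.
    private let format = AVAudioFormat(standardFormatWithSampleRate: 24_000, channels: 1)!

    init() throws {
        engine.attach(player)
        engine.connect(player, to: engine.mainMixerNode, format: format)
        try engine.start()
        player.play()
    }

    /// Converts one base64-decoded `pcm16` delta to Float32 and enqueues it.
    func enqueue(_ chunk: Data) {
        let sampleCount = chunk.count / 2
        guard sampleCount > 0,
              let buffer = AVAudioPCMBuffer(pcmFormat: format,
                                            frameCapacity: AVAudioFrameCount(sampleCount))
        else { return }
        buffer.frameLength = AVAudioFrameCount(sampleCount)

        chunk.withUnsafeBytes { (raw: UnsafeRawBufferPointer) in
            let channel = buffer.floatChannelData![0]
            for i in 0..<sampleCount {
                // Assemble each little-endian Int16 sample and normalize to [-1, 1].
                let sample = Int16(bitPattern: UInt16(raw[2 * i]) | (UInt16(raw[2 * i + 1]) << 8))
                channel[i] = Float(sample) / Float(Int16.max)
            }
        }
        player.scheduleBuffer(buffer, completionHandler: nil)
    }
}

// Usage, wired into the streaming example above:
let speaker = try PCM16StreamPlayer()
for try await chunk in openAI.audioChatsStream(query: query) {
    if let delta = chunk.choices.first?.delta.audio?.data,
       let pcm = Data(base64Encoded: delta) {
        speaker.enqueue(pcm)
    }
}
```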
## Structured Outputs diff --git a/Sources/OpenAI/OpenAI+OpenAIAsync.swift b/Sources/OpenAI/OpenAI+OpenAIAsync.swift index f4d47b94..13e7e738 100644 --- a/Sources/OpenAI/OpenAI+OpenAIAsync.swift +++ b/Sources/OpenAI/OpenAI+OpenAIAsync.swift @@ -105,7 +105,21 @@ extension OpenAI: OpenAIAsync { request: makeAudioTranslationsRequest(query: query) ) } - + + public func audioChats(query: AudioChatQuery) async throws -> AudioChatResult { + try await performRequestAsync( + request: makeAudioChatsRequest(query: query) + ) + } + + public func audioChatsStream( + query: AudioChatQuery + ) -> AsyncThrowingStream { + makeAsyncStream { onResult, completion in + audioChatsStream(query: query, onResult: onResult, completion: completion) + } + } + public func assistants() async throws -> AssistantsResult { try await assistants(after: nil) } diff --git a/Sources/OpenAI/OpenAI.swift b/Sources/OpenAI/OpenAI.swift index 4712fdda..8033ed3f 100644 --- a/Sources/OpenAI/OpenAI.swift +++ b/Sources/OpenAI/OpenAI.swift @@ -330,7 +330,19 @@ final public class OpenAI: OpenAIProtocol, @unchecked Sendable { public func audioTranslations(query: AudioTranslationQuery, completion: @escaping @Sendable (Result) -> Void) -> CancellableRequest { performRequest(request: makeAudioTranslationsRequest(query: query), completion: completion) } - + + public func audioChats(query: AudioChatQuery, completion: @escaping @Sendable (Result) -> Void) -> CancellableRequest { + performRequest(request: makeAudioChatsRequest(query: query.makeNonStreamable()), completion: completion) + } + + public func audioChatsStream(query: AudioChatQuery, onResult: @escaping @Sendable (Result) -> Void, completion: (@Sendable (Error?) -> Void)?) -> CancellableRequest { + performStreamingRequest( + request: JSONRequest(body: query.makeStreamable(), url: buildURL(path: .chats)), + onResult: onResult, + completion: completion + ) + } + public func audioCreateSpeech(query: AudioSpeechQuery, completion: @escaping @Sendable (Result) -> Void) -> CancellableRequest { performSpeechRequest(request: makeAudioCreateSpeechRequest(query: query), completion: completion) } diff --git a/Sources/OpenAI/Private/OpenAI+MakeRequest.swift b/Sources/OpenAI/Private/OpenAI+MakeRequest.swift index ff4d6718..1858e7dd 100644 --- a/Sources/OpenAI/Private/OpenAI+MakeRequest.swift +++ b/Sources/OpenAI/Private/OpenAI+MakeRequest.swift @@ -48,7 +48,11 @@ extension OpenAI { func makeAudioTranslationsRequest(query: AudioTranslationQuery) -> MultipartFormDataRequest { .init(body: query, url: buildURL(path: .audioTranslations)) } - + + func makeAudioChatsRequest(query: AudioChatQuery) -> JSONRequest { + .init(body: query, url: buildURL(path: .chats)) + } + func makeAudioCreateSpeechRequest(query: AudioSpeechQuery) -> JSONRequest { .init(body: query, url: buildURL(path: .audioSpeech)) } diff --git a/Sources/OpenAI/Public/Models/AudioChatQuery.swift b/Sources/OpenAI/Public/Models/AudioChatQuery.swift new file mode 100644 index 00000000..1bd5d144 --- /dev/null +++ b/Sources/OpenAI/Public/Models/AudioChatQuery.swift @@ -0,0 +1,248 @@ +// +// AudioChatQuery.swift +// +// +// Created by OpenAI SDK Contributors. 
+// + +import Foundation + +/// Creates an audio chat completion request for the gpt-4o-audio-preview model +/// Enables audio-to-audio conversations with a single API call, replacing the traditional STT→Chat→TTS pipeline +/// +/// **Format Requirements:** +/// - Input audio: Only `wav` and `mp3` formats are supported +/// - Output audio: `wav`, `mp3`, `flac`, `opus`, `pcm16` are supported +/// - Recommended for streaming: Use `pcm16` for output to get optimal performance +/// +/// https://platform.openai.com/docs/guides/audio +public struct AudioChatQuery: Codable, Equatable, Streamable, Sendable { + + /// ID of the model to use. Currently only `gpt-4o-audio-preview` and its variants support audio chat. + public let model: Model + + /// A list of messages comprising the conversation so far. + public let messages: [Message] + + /// Output types to enable for this request. Can include text and audio. + /// Defaults to [.text, .audio] + public let modalities: [Modality]? + + /// Configuration for audio output + public let audio: AudioConfig? + + /// What sampling temperature to use, between 0 and 2. + /// Defaults to 1 + public let temperature: Double? + + /// The maximum number of tokens to generate in the completion. + public let maxTokens: Int? + + /// Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency. + /// Defaults to 0 + public let frequencyPenalty: Double? + + /// Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text. + /// Defaults to 0 + public let presencePenalty: Double? + + /// Up to 4 sequences where the API will stop generating further tokens. + public let stop: [String]? + + /// If specified, our system will make a best effort to sample deterministically. + public let seed: Int? + + /// If set to true, the response will be streamed as Server-Sent Events. + /// Defaults to false + public var stream: Bool + + public init( + model: Model, + messages: [Message], + modalities: [Modality]? = [.text, .audio], + audio: AudioConfig? = nil, + temperature: Double? = nil, + maxTokens: Int? = nil, + frequencyPenalty: Double? = nil, + presencePenalty: Double? = nil, + stop: [String]? = nil, + seed: Int? 
= nil, + stream: Bool = false + ) { + self.model = model + self.messages = messages + self.modalities = modalities + self.audio = audio + self.temperature = temperature + self.maxTokens = maxTokens + self.frequencyPenalty = frequencyPenalty + self.presencePenalty = presencePenalty + self.stop = stop + self.seed = seed + self.stream = stream + } + + enum CodingKeys: String, CodingKey { + case model + case messages + case modalities + case audio + case temperature + case maxTokens = "max_tokens" + case frequencyPenalty = "frequency_penalty" + case presencePenalty = "presence_penalty" + case stop + case seed + case stream + } + + /// Audio output configuration + public struct AudioConfig: Codable, Equatable, Sendable { + /// The voice to use for audio output + public let voice: Voice + + /// The format of the audio output + /// Default is pcm16 which is optimal for streaming + public let format: AudioFormat + + public init(voice: Voice = .alloy, format: AudioFormat = .pcm16) { + self.voice = voice + self.format = format + } + + enum CodingKeys: String, CodingKey { + case voice + case format + } + + /// Voice options for audio output + public enum Voice: String, Codable, Sendable { + case alloy + case echo + case fable + case onyx + case nova + case shimmer + } + } + + /// A message in the conversation + public struct Message: Codable, Equatable, Sendable { + /// The role of the message author (system, user, assistant) + public let role: ChatQuery.ChatCompletionMessageParam.Role + + /// The content of the message + public let content: Content + + public init(role: ChatQuery.ChatCompletionMessageParam.Role, content: Content) { + self.role = role + self.content = content + } + + enum CodingKeys: String, CodingKey { + case role + case content + } + + /// Message content can be either simple text or an array of content parts + public enum Content: Codable, Equatable, Sendable { + case text(String) + case parts([ContentPart]) + + public init(from decoder: Decoder) throws { + let container = try decoder.singleValueContainer() + + if let text = try? container.decode(String.self) { + self = .text(text) + } else if let parts = try? container.decode([ContentPart].self) { + self = .parts(parts) + } else { + throw DecodingError.dataCorruptedError( + in: container, + debugDescription: "Content must be either a string or an array of parts" + ) + } + } + + public func encode(to encoder: Encoder) throws { + var container = encoder.singleValueContainer() + switch self { + case .text(let text): + try container.encode(text) + case .parts(let parts): + try container.encode(parts) + } + } + } + + /// A content part can be text or audio input + public struct ContentPart: Codable, Equatable, Sendable { + public let type: String + public let text: String? + public let inputAudio: InputAudio? 
+ + /// Create a text content part + public init(text: String) { + self.type = "text" + self.text = text + self.inputAudio = nil + } + + /// Create an audio input content part + public init(inputAudio: InputAudio) { + self.type = "input_audio" + self.text = nil + self.inputAudio = inputAudio + } + + enum CodingKeys: String, CodingKey { + case type + case text + case inputAudio = "input_audio" + } + } + + /// Audio input data + public struct InputAudio: Codable, Equatable, Sendable { + /// Base64-encoded audio data + public let data: String + + /// Format of the audio data + /// Note: Only wav and mp3 are supported for input audio + public let format: AudioFormat + + public init(data: String, format: AudioFormat = .wav) { + self.data = data + self.format = format + } + + enum CodingKeys: String, CodingKey { + case data + case format + } + } + } +} + +/// Audio format options for both input and output +/// +/// **Input audio formats:** Only `wav` and `mp3` are supported +/// **Output audio formats:** `wav`, `mp3`, `flac`, `opus`, `pcm16` +/// **Recommended for streaming:** Use `pcm16` for output to get optimal streaming performance +public enum AudioFormat: String, Codable, Sendable { + case wav + case mp3 + case flac + case opus + case pcm16 +} + +/// Output modality options for audio chat requests +/// +/// Specifies which types of output the model should generate +public enum Modality: String, Codable, Sendable { + /// Text output + case text + /// Audio output + case audio +} diff --git a/Sources/OpenAI/Public/Models/AudioChatResult.swift b/Sources/OpenAI/Public/Models/AudioChatResult.swift new file mode 100644 index 00000000..54fd98e3 --- /dev/null +++ b/Sources/OpenAI/Public/Models/AudioChatResult.swift @@ -0,0 +1,176 @@ +// +// AudioChatResult.swift +// +// +// Created by OpenAI SDK Contributors. +// + +import Foundation + +/// Response from an audio chat completion request +public struct AudioChatResult: Codable, Equatable, Sendable { + /// A unique identifier for the chat completion + public let id: String + + /// The object type, always "chat.completion" + public let object: String + + /// The Unix timestamp (in seconds) of when the chat completion was created + public let created: Int + + /// The model used for the chat completion + public let model: Model + + /// A list of chat completion choices + public let choices: [Choice] + + /// Usage statistics for the completion request + public let usage: Usage? + + /// The service tier used for processing the request + public let serviceTier: String? + + /// This fingerprint represents the backend configuration that the model runs with + public let systemFingerprint: String? + + enum CodingKeys: String, CodingKey { + case id + case object + case created + case model + case choices + case usage + case serviceTier = "service_tier" + case systemFingerprint = "system_fingerprint" + } + + public init( + id: String, + object: String, + created: Int, + model: Model, + choices: [Choice], + usage: Usage? = nil, + serviceTier: String? = nil, + systemFingerprint: String? = nil + ) { + self.id = id + self.object = object + self.created = created + self.model = model + self.choices = choices + self.usage = usage + self.serviceTier = serviceTier + self.systemFingerprint = systemFingerprint + } + + public init(from decoder: any Decoder) throws { + let container = try decoder.container(keyedBy: CodingKeys.self) + let parsingOptions = decoder.userInfo[.parsingOptions] as? ParsingOptions ?? 
[] + self.id = try container.decodeString(forKey: .id, parsingOptions: parsingOptions) + self.object = try container.decodeString(forKey: .object, parsingOptions: parsingOptions) + self.created = try container.decode(Int.self, forKey: .created) + self.model = try container.decode(Model.self, forKey: .model) + self.choices = try container.decode([Choice].self, forKey: .choices) + // Handle empty usage object gracefully + self.usage = try? container.decodeIfPresent(Usage.self, forKey: .usage) + self.serviceTier = try container.decodeIfPresent(String.self, forKey: .serviceTier) + self.systemFingerprint = try container.decodeIfPresent(String.self, forKey: .systemFingerprint) + } + + public struct Choice: Codable, Equatable, Sendable { + /// The index of the choice in the list of choices + public let index: Int + + /// A chat completion message generated by the model + public let message: Message + + /// The reason the model stopped generating tokens + public let finishReason: String? + + enum CodingKeys: String, CodingKey { + case index + case message + case finishReason = "finish_reason" + } + } + + public struct Message: Codable, Equatable, Sendable { + /// The role of the author of this message + public let role: String + + /// The text content of the message + public let content: String? + + /// Audio output from the model + public let audio: AudioOutput? + + enum CodingKeys: String, CodingKey { + case role + case content + case audio + } + } + + /// Audio output from the model + public struct AudioOutput: Codable, Equatable, Sendable { + /// Unique identifier for the audio response + public let id: String + + /// Base64-encoded audio data + public let data: String + + /// Transcript of the audio + public let transcript: String + + /// Unix timestamp (in seconds) when this audio data expires + public let expiresAt: Int + + enum CodingKeys: String, CodingKey { + case id + case data + case transcript + case expiresAt = "expires_at" + } + } + + /// Usage statistics for the completion request + public struct Usage: Codable, Equatable, Sendable { + /// Number of tokens in the prompt + public let promptTokens: Int + + /// Number of tokens in the generated completion + public let completionTokens: Int + + /// Total number of tokens used in the request (prompt + completion) + public let totalTokens: Int + + /// Breakdown of tokens used in the prompt + public let promptTokensDetails: TokenDetails? + + /// Breakdown of tokens used in the completion + public let completionTokensDetails: TokenDetails? + + enum CodingKeys: String, CodingKey { + case promptTokens = "prompt_tokens" + case completionTokens = "completion_tokens" + case totalTokens = "total_tokens" + case promptTokensDetails = "prompt_tokens_details" + case completionTokensDetails = "completion_tokens_details" + } + + /// Detailed breakdown of token usage + public struct TokenDetails: Codable, Equatable, Sendable { + /// Number of text tokens + public let textTokens: Int? + + /// Number of audio tokens + public let audioTokens: Int? + + enum CodingKeys: String, CodingKey { + case textTokens = "text_tokens" + case audioTokens = "audio_tokens" + } + } + } +} diff --git a/Sources/OpenAI/Public/Models/AudioChatStreamResult.swift b/Sources/OpenAI/Public/Models/AudioChatStreamResult.swift new file mode 100644 index 00000000..dc2b1692 --- /dev/null +++ b/Sources/OpenAI/Public/Models/AudioChatStreamResult.swift @@ -0,0 +1,129 @@ +// +// AudioChatStreamResult.swift +// +// +// Created by OpenAI SDK Contributors. 
+// + +import Foundation + +/// A chunk of an audio chat completion stream response +public struct AudioChatStreamResult: Codable, Equatable, Sendable { + /// A unique identifier for the chat completion + public let id: String + + /// The object type, always "chat.completion.chunk" + public let object: String + + /// The Unix timestamp (in seconds) of when the chat completion was created + public let created: Int + + /// The model used for the chat completion + public let model: Model + + /// A list of chat completion choices. Can be more than one if n > 1. + public let choices: [Choice] + + /// The service tier used for processing the request + public let serviceTier: String? + + /// This fingerprint represents the backend configuration that the model runs with + public let systemFingerprint: String? + + enum CodingKeys: String, CodingKey { + case id + case object + case created + case model + case choices + case serviceTier = "service_tier" + case systemFingerprint = "system_fingerprint" + } + + public init( + id: String, + object: String, + created: Int, + model: Model, + choices: [Choice], + serviceTier: String? = nil, + systemFingerprint: String? = nil + ) { + self.id = id + self.object = object + self.created = created + self.model = model + self.choices = choices + self.serviceTier = serviceTier + self.systemFingerprint = systemFingerprint + } + + public init(from decoder: any Decoder) throws { + let container = try decoder.container(keyedBy: CodingKeys.self) + let parsingOptions = decoder.userInfo[.parsingOptions] as? ParsingOptions ?? [] + self.id = try container.decodeString(forKey: .id, parsingOptions: parsingOptions) + self.object = try container.decodeString(forKey: .object, parsingOptions: parsingOptions) + self.created = try container.decode(Int.self, forKey: .created) + self.model = try container.decode(Model.self, forKey: .model) + self.choices = try container.decode([Choice].self, forKey: .choices) + self.serviceTier = try container.decodeIfPresent(String.self, forKey: .serviceTier) + self.systemFingerprint = try container.decodeIfPresent(String.self, forKey: .systemFingerprint) + } + + public struct Choice: Codable, Equatable, Sendable { + /// The index of the choice in the list of choices + public let index: Int + + /// A chunk of the message being generated + public let delta: Delta + + /// The reason the model stopped generating tokens + public let finishReason: String? + + enum CodingKeys: String, CodingKey { + case index + case delta + case finishReason = "finish_reason" + } + } + + /// Delta containing the incremental changes to the message + public struct Delta: Codable, Equatable, Sendable { + /// The role of the author of this message (only present in first chunk) + public let role: String? + + /// The text content delta + public let content: String? + + /// Audio output delta + public let audio: AudioDelta? + + enum CodingKeys: String, CodingKey { + case role + case content + case audio + } + } + + /// Incremental audio output + public struct AudioDelta: Codable, Equatable, Sendable { + /// Unique identifier for the audio response (only present in first audio chunk) + public let id: String? + + /// Base64-encoded chunk of audio data + public let data: String? + + /// Chunk of the transcript + public let transcript: String? + + /// Unix timestamp when this audio data expires (only present in first audio chunk) + public let expiresAt: Int? 
+ + enum CodingKeys: String, CodingKey { + case id + case data + case transcript + case expiresAt = "expires_at" + } + } +} diff --git a/Sources/OpenAI/Public/Protocols/OpenAIAsync.swift b/Sources/OpenAI/Public/Protocols/OpenAIAsync.swift index 8a6a2431..72680eb5 100644 --- a/Sources/OpenAI/Public/Protocols/OpenAIAsync.swift +++ b/Sources/OpenAI/Public/Protocols/OpenAIAsync.swift @@ -22,6 +22,8 @@ public protocol OpenAIAsync: Sendable { func audioTranscriptionsVerbose(query: AudioTranscriptionQuery) async throws -> AudioTranscriptionVerboseResult func audioTranscriptionStream(query: AudioTranscriptionQuery) -> AsyncThrowingStream func audioTranslations(query: AudioTranslationQuery) async throws -> AudioTranslationResult + func audioChats(query: AudioChatQuery) async throws -> AudioChatResult + func audioChatsStream(query: AudioChatQuery) -> AsyncThrowingStream func assistants() async throws -> AssistantsResult func assistants(after: String?) async throws -> AssistantsResult func assistantCreate(query: AssistantsQuery) async throws -> AssistantResult diff --git a/Sources/OpenAI/Public/Protocols/OpenAIProtocol.swift b/Sources/OpenAI/Public/Protocols/OpenAIProtocol.swift index dba875b5..611b4d0f 100644 --- a/Sources/OpenAI/Public/Protocols/OpenAIProtocol.swift +++ b/Sources/OpenAI/Public/Protocols/OpenAIProtocol.swift @@ -252,7 +252,33 @@ public protocol OpenAIProtocol: OpenAIModern { Returns a `Result` of type `AudioTranslationResult` if successful, or an `Error` if an error occurs. **/ @discardableResult func audioTranslations(query: AudioTranslationQuery, completion: @escaping @Sendable (Result) -> Void) -> CancellableRequest - + + /** + Performs audio chat completion using OpenAI's audio chat API and completes the operation asynchronously. + This enables audio-to-audio conversations using the gpt-4o-audio-preview model. + + - Parameter query: The `AudioChatQuery` instance containing the conversation messages and audio configuration. + - Parameter completion: The completion handler to be executed upon completion of the audio chat request. + Returns a `Result` of type `AudioChatResult` if successful, or an `Error` if an error occurs. + **/ + @discardableResult func audioChats(query: AudioChatQuery, completion: @escaping @Sendable (Result) -> Void) -> CancellableRequest + + /** + Performs audio chat completion by streaming the results in real-time using OpenAI's audio chat API. + + This method establishes a connection that remains open, receiving and delivering audio chunks incrementally as they are processed by the API. + + - Parameter query: The `AudioChatQuery` instance containing the conversation messages and audio configuration. + - Parameter onResult: A closure that is called multiple times as new audio chat results become available. It provides either a partial `AudioChatStreamResult` or an error encountered during the stream. + - Parameter completion: An optional closure executed once the stream is fully closed. It receives an `Error` if the stream terminated unexpectedly, or `nil` if it completed successfully. + - Returns: A `CancellableRequest` object that allows you to cancel the ongoing audio chat stream. + **/ + func audioChatsStream( + query: AudioChatQuery, + onResult: @escaping @Sendable (Result) -> Void, + completion: (@Sendable (Error?) -> Void)? + ) -> CancellableRequest + /** This function sends a assistants query to the OpenAI API to list assistants that have been created. 
diff --git a/Sources/OpenAI/Public/Utilities/AudioConversationManager.swift b/Sources/OpenAI/Public/Utilities/AudioConversationManager.swift new file mode 100644 index 00000000..d2f83f6c --- /dev/null +++ b/Sources/OpenAI/Public/Utilities/AudioConversationManager.swift @@ -0,0 +1,222 @@ +// +// AudioConversationManager.swift +// +// +// Created by OpenAI SDK Contributors. +// + +import Foundation + +/// Manages multi-turn audio conversations with history tracking +/// +/// This actor provides a convenient way to manage audio conversations using the audio chat API. +/// It automatically handles conversation history, including system prompts and message tracking. +/// +/// Example: +/// ```swift +/// let manager = AudioConversationManager( +/// openAI: openAI, +/// systemPrompt: "You are a helpful voice assistant" +/// ) +/// +/// // Send audio and get response +/// let (audioData, transcript) = try await manager.sendAudio(userAudioData) +/// ``` +@available(iOS 15.0, macOS 12.0, watchOS 8.0, *) +public actor AudioConversationManager { + private var conversationHistory: [AudioChatQuery.Message] = [] + private let openAI: OpenAIProtocol + private let systemPrompt: String? + private let maxHistoryTurns: Int + + /// Initialize a new audio conversation manager + /// + /// - Parameters: + /// - openAI: The OpenAI client instance + /// - systemPrompt: Optional system prompt to set context for the conversation + /// - maxHistoryTurns: Maximum number of conversation turns to keep in history (default: 10) + public init(openAI: OpenAIProtocol, systemPrompt: String? = nil, maxHistoryTurns: Int = 10) { + self.openAI = openAI + self.systemPrompt = systemPrompt + self.maxHistoryTurns = maxHistoryTurns + + if let systemPrompt = systemPrompt { + conversationHistory.append(.init( + role: .system, + content: .text(systemPrompt) + )) + } + } + + /// Send audio message and get audio response + /// + /// - Parameters: + /// - audioData: The audio data to send + /// - audioFormat: Format of the input audio (default: wav). 
Only wav and mp3 are supported for input + /// - voice: Voice to use for response (default: alloy) + /// - responseFormat: Format for the response audio (default: pcm16 for optimal streaming) + /// - Returns: A tuple containing the audio data and transcript of the response + /// - Throws: AudioConversationError or networking errors + public func sendAudio( + _ audioData: Data, + audioFormat: AudioFormat = .wav, + voice: AudioChatQuery.AudioConfig.Voice = .alloy, + responseFormat: AudioFormat = .pcm16 + ) async throws -> (audioData: Data, transcript: String) { + + let base64Audio = audioData.base64EncodedString() + + let userMessage = AudioChatQuery.Message( + role: .user, + content: .parts([ + .init(inputAudio: .init(data: base64Audio, format: audioFormat)) + ]) + ) + + conversationHistory.append(userMessage) + + // Prune history if needed + pruneHistoryIfNeeded() + + let query = AudioChatQuery( + model: .gpt_4o_audio_preview, + messages: conversationHistory, + modalities: [.text, .audio], + audio: .init(voice: voice, format: responseFormat) + ) + + let result = try await openAI.audioChats(query: query) + + guard let choice = result.choices.first, + let audio = choice.message.audio else { + throw AudioConversationError.noAudioResponse + } + + let assistantMessage = AudioChatQuery.Message( + role: .assistant, + content: .text(audio.transcript) + ) + conversationHistory.append(assistantMessage) + + guard let audioData = Data(base64Encoded: audio.data) else { + throw AudioConversationError.invalidAudioData + } + + return (audioData, audio.transcript) + } + + /// Send text message and get audio response + /// + /// - Parameters: + /// - text: The text message to send + /// - voice: Voice to use for response (default: alloy) + /// - responseFormat: Format for the response audio (default: pcm16) + /// - Returns: A tuple containing the audio data and transcript of the response + /// - Throws: AudioConversationError or networking errors + public func sendText( + _ text: String, + voice: AudioChatQuery.AudioConfig.Voice = .alloy, + responseFormat: AudioFormat = .pcm16 + ) async throws -> (audioData: Data, transcript: String) { + + let userMessage = AudioChatQuery.Message( + role: .user, + content: .text(text) + ) + + conversationHistory.append(userMessage) + + // Prune history if needed + pruneHistoryIfNeeded() + + let query = AudioChatQuery( + model: .gpt_4o_audio_preview, + messages: conversationHistory, + modalities: [.text, .audio], + audio: .init(voice: voice, format: responseFormat) + ) + + let result = try await openAI.audioChats(query: query) + + guard let choice = result.choices.first, + let audio = choice.message.audio else { + throw AudioConversationError.noAudioResponse + } + + let assistantMessage = AudioChatQuery.Message( + role: .assistant, + content: .text(audio.transcript) + ) + conversationHistory.append(assistantMessage) + + guard let audioData = Data(base64Encoded: audio.data) else { + throw AudioConversationError.invalidAudioData + } + + return (audioData, audio.transcript) + } + + /// Clear conversation history (keeps system prompt if provided) + public func reset() { + conversationHistory.removeAll() + if let systemPrompt = systemPrompt { + conversationHistory.append(.init( + role: .system, + content: .text(systemPrompt) + )) + } + } + + /// Get current conversation transcript as a formatted string + /// + /// - Returns: A multi-line string with the conversation history + public func getTranscript() -> String { + conversationHistory + .compactMap { message -> String? 
in + switch message.content { + case .text(let text): + return "\(message.role.rawValue): \(text)" + case .parts(let parts): + let textParts = parts.compactMap { $0.text } + if textParts.isEmpty { + return "\(message.role.rawValue): [audio]" + } + return "\(message.role.rawValue): \(textParts.joined(separator: " "))" + } + } + .joined(separator: "\n") + } + + /// Get the current number of messages in the conversation history + public func getMessageCount() -> Int { + conversationHistory.count + } + + // MARK: - Private Methods + + private func pruneHistoryIfNeeded() { + // Keep system message + last N turns (each turn is user + assistant = 2 messages) + let maxMessages = maxHistoryTurns * 2 + + if conversationHistory.count > maxMessages { + let systemMessages = conversationHistory.filter { $0.role == .system } + let recentMessages = Array(conversationHistory.suffix(maxMessages)) + conversationHistory = systemMessages + recentMessages.filter { $0.role != .system } + } + } +} + +/// Errors that can occur during audio conversation management +public enum AudioConversationError: Error, LocalizedError { + case noAudioResponse + case invalidAudioData + + public var errorDescription: String? { + switch self { + case .noAudioResponse: + return "No audio response received from the API" + case .invalidAudioData: + return "Invalid audio data received - could not decode base64" + } + } +} diff --git a/Tests/OpenAITests/AudioChatQueryCodingTests.swift b/Tests/OpenAITests/AudioChatQueryCodingTests.swift new file mode 100644 index 00000000..8523c4eb --- /dev/null +++ b/Tests/OpenAITests/AudioChatQueryCodingTests.swift @@ -0,0 +1,275 @@ +// +// AudioChatQueryCodingTests.swift +// OpenAI +// +// Created by OpenAI SDK Contributors. +// + +import Testing +@testable import OpenAI +import Foundation + +struct AudioChatQueryCodingTests { + @Test func encodeBasicTextMessage() async throws { + let query = AudioChatQuery( + model: .gpt_4o_audio_preview, + messages: [ + .init(role: .system, content: .text("You are a helpful assistant")), + .init(role: .user, content: .text("Hello")) + ], + modalities: [.text, .audio], + audio: .init(voice: .alloy, format: .pcm16) + ) + + let expected = """ + { + "model": "gpt-4o-audio-preview", + "messages": [ + { + "role": "system", + "content": "You are a helpful assistant" + }, + { + "role": "user", + "content": "Hello" + } + ], + "modalities": ["text", "audio"], + "audio": { + "voice": "alloy", + "format": "pcm16" + }, + "stream": false + } + """ + + let encodedQuery = try encodedAndComparable(query) + let decodedExpectation = try decodedAndComparable(expected) + #expect(encodedQuery == decodedExpectation) + } + + @Test func encodeAudioInputMessage() async throws { + let audioData = "base64encodedaudiodata" + let query = AudioChatQuery( + model: .gpt_4o_audio_preview, + messages: [ + .init(role: .user, content: .parts([ + .init(inputAudio: .init(data: audioData, format: .wav)) + ])) + ], + modalities: [.text, .audio], + audio: .init(voice: .onyx, format: .pcm16) + ) + + let expected = """ + { + "model": "gpt-4o-audio-preview", + "messages": [ + { + "role": "user", + "content": [ + { + "type": "input_audio", + "input_audio": { + "data": "base64encodedaudiodata", + "format": "wav" + } + } + ] + } + ], + "modalities": ["text", "audio"], + "audio": { + "voice": "onyx", + "format": "pcm16" + }, + "stream": false + } + """ + + #expect(try equal(query, expected)) + } + + @Test func encodeMixedContentParts() async throws { + let query = AudioChatQuery( + model: .gpt_4o_audio_preview, + 
messages: [ + .init(role: .user, content: .parts([ + .init(text: "Please analyze this audio:"), + .init(inputAudio: .init(data: "audiodata", format: .mp3)) + ])) + ], + audio: .init(voice: .shimmer, format: .opus) + ) + + let expected = """ + { + "model": "gpt-4o-audio-preview", + "messages": [ + { + "role": "user", + "content": [ + { + "type": "text", + "text": "Please analyze this audio:" + }, + { + "type": "input_audio", + "input_audio": { + "data": "audiodata", + "format": "mp3" + } + } + ] + } + ], + "modalities": ["text", "audio"], + "audio": { + "voice": "shimmer", + "format": "opus" + }, + "stream": false + } + """ + + #expect(try equal(query, expected)) + } + + @Test func encodeWithAllVoices() async throws { + let voices: [AudioChatQuery.AudioConfig.Voice] = [.alloy, .echo, .fable, .onyx, .nova, .shimmer] + + for voice in voices { + let query = AudioChatQuery( + model: .gpt_4o_audio_preview, + messages: [.init(role: .user, content: .text("Test"))], + audio: .init(voice: voice, format: .pcm16) + ) + + let encoded = try JSONEncoder().encode(query) + let decoded = try JSONDecoder().decode(AudioChatQuery.self, from: encoded) + + #expect(decoded.audio?.voice == voice) + } + } + + @Test func encodeWithAllFormats() async throws { + let formats: [AudioFormat] = [.wav, .mp3, .flac, .opus, .pcm16] + + for format in formats { + let query = AudioChatQuery( + model: .gpt_4o_audio_preview, + messages: [.init(role: .user, content: .text("Test"))], + audio: .init(voice: .alloy, format: format) + ) + + let encoded = try JSONEncoder().encode(query) + let decoded = try JSONDecoder().decode(AudioChatQuery.self, from: encoded) + + #expect(decoded.audio?.format == format) + } + } + + @Test func encodeStreamingQuery() async throws { + let query = AudioChatQuery( + model: .gpt_4o_audio_preview, + messages: [.init(role: .user, content: .text("Hello"))], + audio: .init(voice: .alloy, format: .pcm16), + stream: true + ) + + let expected = """ + { + "model": "gpt-4o-audio-preview", + "messages": [ + { + "role": "user", + "content": "Hello" + } + ], + "modalities": ["text", "audio"], + "audio": { + "voice": "alloy", + "format": "pcm16" + }, + "stream": true + } + """ + + #expect(try equal(query, expected)) + } + + @Test func encodeWithOptionalParameters() async throws { + let query = AudioChatQuery( + model: .gpt_4o_audio_preview, + messages: [.init(role: .user, content: .text("Test"))], + audio: .init(voice: .alloy, format: .pcm16), + temperature: 0.7, + maxTokens: 100, + frequencyPenalty: 0.5, + presencePenalty: 0.3, + stop: ["STOP"], + seed: 42 + ) + + let expected = """ + { + "model": "gpt-4o-audio-preview", + "messages": [ + { + "role": "user", + "content": "Test" + } + ], + "modalities": ["text", "audio"], + "audio": { + "voice": "alloy", + "format": "pcm16" + }, + "temperature": 0.7, + "max_tokens": 100, + "frequency_penalty": 0.5, + "presence_penalty": 0.3, + "stop": ["STOP"], + "seed": 42, + "stream": false + } + """ + + #expect(try equal(query, expected)) + } + + @Test func testStreamableProtocolConformance() async throws { + let query = AudioChatQuery( + model: .gpt_4o_audio_preview, + messages: [.init(role: .user, content: .text("Test"))], + audio: .init(voice: .alloy, format: .pcm16), + stream: false + ) + + // Test makeStreamable + let streamable = query.makeStreamable() + #expect(streamable.stream == true) + + // Test makeNonStreamable + let nonStreamable = streamable.makeNonStreamable() + #expect(nonStreamable.stream == false) + } + + private func equal(_ query: Codable, _ expected: String) 
throws -> Bool { + let encodedQuery = try encodedAndComparable(query) + let decodedExpectation = try decodedAndComparable(expected) + return encodedQuery == decodedExpectation + } + + private func encodedAndComparable(_ candidate: Codable) throws -> NSDictionary { + try jsonDataAsNSDictionary(try JSONEncoder().encode(candidate)) + } + + private func decodedAndComparable(_ candidate: String) throws -> NSDictionary { + try jsonDataAsNSDictionary(candidate.data(using: .utf8)!) + } + + private func jsonDataAsNSDictionary(_ data: Data) throws -> NSDictionary { + NSDictionary(dictionary: try JSONSerialization.jsonObject(with: data, options: []) as! [String: Any]) + } +} diff --git a/Tests/OpenAITests/AudioChatResultTests.swift b/Tests/OpenAITests/AudioChatResultTests.swift new file mode 100644 index 00000000..9d1cac66 --- /dev/null +++ b/Tests/OpenAITests/AudioChatResultTests.swift @@ -0,0 +1,258 @@ +// +// AudioChatResultTests.swift +// OpenAI +// +// Created by OpenAI SDK Contributors. +// + +import XCTest +@testable import OpenAI + +final class AudioChatResultTests: XCTestCase { + private let decoder = JSONDecoder() + + func testDecodeBasicAudioChatResult() throws { + let jsonString = """ + { + "id": "chatcmpl-123", + "object": "chat.completion", + "created": 1677652288, + "model": "gpt-4o-audio-preview", + "choices": [ + { + "index": 0, + "message": { + "role": "assistant", + "content": "Hello! How can I help you?", + "audio": { + "id": "audio-123", + "data": "base64encodedaudiodata", + "transcript": "Hello! How can I help you?", + "expires_at": 1234567890 + } + }, + "finish_reason": "stop" + } + ], + "usage": { + "prompt_tokens": 10, + "completion_tokens": 20, + "total_tokens": 30 + } + } + """ + + let result = try decode(jsonString) + XCTAssertEqual(result.id, "chatcmpl-123") + XCTAssertEqual(result.model, "gpt-4o-audio-preview") + XCTAssertEqual(result.choices.count, 1) + + let choice = result.choices[0] + XCTAssertEqual(choice.index, 0) + XCTAssertEqual(choice.message.role, "assistant") + XCTAssertEqual(choice.message.content, "Hello! How can I help you?") + XCTAssertEqual(choice.finishReason, "stop") + + let audio = try XCTUnwrap(choice.message.audio) + XCTAssertEqual(audio.id, "audio-123") + XCTAssertEqual(audio.data, "base64encodedaudiodata") + XCTAssertEqual(audio.transcript, "Hello! 
How can I help you?") + XCTAssertEqual(audio.expiresAt, 1234567890) + } + + func testDecodeWithoutAudio() throws { + let jsonString = """ + { + "id": "chatcmpl-456", + "object": "chat.completion", + "created": 1677652288, + "model": "gpt-4o-audio-preview", + "choices": [ + { + "index": 0, + "message": { + "role": "assistant", + "content": "Text-only response" + }, + "finish_reason": "stop" + } + ] + } + """ + + let result = try decode(jsonString) + XCTAssertEqual(result.id, "chatcmpl-456") + XCTAssertEqual(result.choices[0].message.content, "Text-only response") + XCTAssertNil(result.choices[0].message.audio) + } + + func testDecodeMultipleChoices() throws { + let jsonString = """ + { + "id": "chatcmpl-789", + "object": "chat.completion", + "created": 1677652288, + "model": "gpt-4o-audio-preview", + "choices": [ + { + "index": 0, + "message": { + "role": "assistant", + "content": "First response", + "audio": { + "id": "audio-1", + "data": "data1", + "transcript": "First response", + "expires_at": 1234567890 + } + }, + "finish_reason": "stop" + }, + { + "index": 1, + "message": { + "role": "assistant", + "content": "Second response", + "audio": { + "id": "audio-2", + "data": "data2", + "transcript": "Second response", + "expires_at": 1234567891 + } + }, + "finish_reason": "stop" + } + ] + } + """ + + let result = try decode(jsonString) + XCTAssertEqual(result.choices.count, 2) + XCTAssertEqual(result.choices[0].message.audio?.id, "audio-1") + XCTAssertEqual(result.choices[1].message.audio?.id, "audio-2") + } + + func testDecodeWithUsage() throws { + let jsonString = """ + { + "id": "chatcmpl-usage", + "object": "chat.completion", + "created": 1677652288, + "model": "gpt-4o-audio-preview", + "choices": [ + { + "index": 0, + "message": { + "role": "assistant", + "content": "Response" + }, + "finish_reason": "stop" + } + ], + "usage": { + "prompt_tokens": 100, + "completion_tokens": 50, + "total_tokens": 150 + } + } + """ + + let result = try decode(jsonString) + let usage = try XCTUnwrap(result.usage) + XCTAssertEqual(usage.promptTokens, 100) + XCTAssertEqual(usage.completionTokens, 50) + XCTAssertEqual(usage.totalTokens, 150) + } + + func testDecodeFailsForMissingId() { + let jsonString = """ + { + "object": "chat.completion", + "model": "gpt-4o-audio-preview" + } + """ + + do { + _ = try decode(jsonString) + XCTFail("Should throw error") + } catch let error as DecodingError { + switch error { + case .keyNotFound(let key, _): + XCTAssertEqual(key as! 
AudioChatResult.CodingKeys, AudioChatResult.CodingKeys.id) + default: + XCTFail("Unexpected error") + } + } catch { + XCTFail("Unexpected error") + } + } + + func testDecodeMissingIdSucceedsWithRelaxedOptions() throws { + let jsonString = """ + { + "object": "chat.completion", + "created": 1677652288, + "model": "gpt-4o-audio-preview", + "choices": [] + } + """ + + decoder.userInfo = [.parsingOptions: ParsingOptions.relaxed] + let result = try decode(jsonString) + XCTAssertEqual(result.id, "") + } + + func testDecodeEmptyUsage() throws { + let jsonString = """ + { + "id": "some_id", + "object": "chat.completion", + "created": 1677652288, + "model": "gpt-4o-audio-preview", + "choices": [], + "usage": {} + } + """ + + let result = try decode(jsonString) + XCTAssertEqual(result.model, "gpt-4o-audio-preview") + XCTAssertNil(result.usage) + } + + func testDecodeSystemFingerprint() throws { + let jsonString = """ + { + "id": "some_id", + "object": "chat.completion", + "created": 1677652288, + "model": "gpt-4o-audio-preview", + "choices": [], + "system_fingerprint": "fp_abc123" + } + """ + + let result = try decode(jsonString) + XCTAssertEqual(result.systemFingerprint, "fp_abc123") + } + + func testDecodeNullSystemFingerprint() throws { + let jsonString = """ + { + "id": "some_id", + "object": "chat.completion", + "created": 1677652288, + "model": "gpt-4o-audio-preview", + "choices": [], + "system_fingerprint": null + } + """ + + let result = try decode(jsonString) + XCTAssertNil(result.systemFingerprint) + } + + private func decode(_ jsonString: String) throws -> AudioChatResult { + let jsonData = jsonString.data(using: .utf8)! + return try decoder.decode(AudioChatResult.self, from: jsonData) + } +} diff --git a/Tests/OpenAITests/AudioChatStreamResultTests.swift b/Tests/OpenAITests/AudioChatStreamResultTests.swift new file mode 100644 index 00000000..930224df --- /dev/null +++ b/Tests/OpenAITests/AudioChatStreamResultTests.swift @@ -0,0 +1,257 @@ +// +// AudioChatStreamResultTests.swift +// OpenAI +// +// Created by OpenAI SDK Contributors. 
+// + +import XCTest +@testable import OpenAI + +final class AudioChatStreamResultTests: XCTestCase { + private let decoder = JSONDecoder() + + func testDecodeBasicStreamChunk() throws { + let jsonString = """ + { + "id": "chatcmpl-123", + "object": "chat.completion.chunk", + "created": 1677652288, + "model": "gpt-4o-audio-preview", + "choices": [ + { + "index": 0, + "delta": { + "role": "assistant", + "content": "Hello" + }, + "finish_reason": null + } + ] + } + """ + + let result = try decode(jsonString) + XCTAssertEqual(result.id, "chatcmpl-123") + XCTAssertEqual(result.model, "gpt-4o-audio-preview") + XCTAssertEqual(result.choices.count, 1) + + let choice = result.choices[0] + XCTAssertEqual(choice.index, 0) + XCTAssertEqual(choice.delta.role, "assistant") + XCTAssertEqual(choice.delta.content, "Hello") + XCTAssertNil(choice.finishReason) + } + + func testDecodeAudioDelta() throws { + let jsonString = """ + { + "id": "chatcmpl-456", + "object": "chat.completion.chunk", + "created": 1677652288, + "model": "gpt-4o-audio-preview", + "choices": [ + { + "index": 0, + "delta": { + "audio": { + "id": "audio-123", + "data": "base64chunk", + "transcript": "partial transcript" + } + }, + "finish_reason": null + } + ] + } + """ + + let result = try decode(jsonString) + let audio = try XCTUnwrap(result.choices[0].delta.audio) + XCTAssertEqual(audio.id, "audio-123") + XCTAssertEqual(audio.data, "base64chunk") + XCTAssertEqual(audio.transcript, "partial transcript") + } + + func testDecodeAudioDeltaWithOnlyData() throws { + let jsonString = """ + { + "id": "chatcmpl-789", + "object": "chat.completion.chunk", + "created": 1677652288, + "model": "gpt-4o-audio-preview", + "choices": [ + { + "index": 0, + "delta": { + "audio": { + "data": "audiochunk123" + } + }, + "finish_reason": null + } + ] + } + """ + + let result = try decode(jsonString) + let audio = try XCTUnwrap(result.choices[0].delta.audio) + XCTAssertEqual(audio.data, "audiochunk123") + XCTAssertNil(audio.id) + XCTAssertNil(audio.transcript) + } + + func testDecodeWithFinishReason() throws { + let jsonString = """ + { + "id": "chatcmpl-final", + "object": "chat.completion.chunk", + "created": 1677652288, + "model": "gpt-4o-audio-preview", + "choices": [ + { + "index": 0, + "delta": {}, + "finish_reason": "stop" + } + ] + } + """ + + let result = try decode(jsonString) + XCTAssertEqual(result.choices[0].finishReason, "stop") + } + + func testDecodeMultipleChoices() throws { + let jsonString = """ + { + "id": "chatcmpl-multi", + "object": "chat.completion.chunk", + "created": 1677652288, + "model": "gpt-4o-audio-preview", + "choices": [ + { + "index": 0, + "delta": { + "content": "First" + }, + "finish_reason": null + }, + { + "index": 1, + "delta": { + "content": "Second" + }, + "finish_reason": null + } + ] + } + """ + + let result = try decode(jsonString) + XCTAssertEqual(result.choices.count, 2) + XCTAssertEqual(result.choices[0].delta.content, "First") + XCTAssertEqual(result.choices[1].delta.content, "Second") + } + + func testDecodeEmptyDelta() throws { + let jsonString = """ + { + "id": "chatcmpl-empty", + "object": "chat.completion.chunk", + "created": 1677652288, + "model": "gpt-4o-audio-preview", + "choices": [ + { + "index": 0, + "delta": {}, + "finish_reason": null + } + ] + } + """ + + let result = try decode(jsonString) + XCTAssertNil(result.choices[0].delta.role) + XCTAssertNil(result.choices[0].delta.content) + XCTAssertNil(result.choices[0].delta.audio) + } + + // Note: AudioChatStreamResult does not include usage field 
unlike ChatStreamResult + + func testDecodeFailsForMissingId() { + let jsonString = """ + { + "object": "chat.completion.chunk", + "model": "gpt-4o-audio-preview" + } + """ + + do { + _ = try decode(jsonString) + XCTFail("Should throw error") + } catch let error as DecodingError { + switch error { + case .keyNotFound(let key, _): + XCTAssertEqual(key as! AudioChatStreamResult.CodingKeys, AudioChatStreamResult.CodingKeys.id) + default: + XCTFail("Unexpected error") + } + } catch { + XCTFail("Unexpected error") + } + } + + func testDecodeMissingIdSucceedsWithRelaxedOptions() throws { + let jsonString = """ + { + "object": "chat.completion.chunk", + "created": 1677652288, + "model": "gpt-4o-audio-preview", + "choices": [] + } + """ + + decoder.userInfo = [.parsingOptions: ParsingOptions.relaxed] + let result = try decode(jsonString) + XCTAssertEqual(result.id, "") + } + + + func testDecodeSystemFingerprint() throws { + let jsonString = """ + { + "id": "some_id", + "object": "chat.completion.chunk", + "created": 1677652288, + "model": "gpt-4o-audio-preview", + "choices": [], + "system_fingerprint": "fp_xyz789" + } + """ + + let result = try decode(jsonString) + XCTAssertEqual(result.systemFingerprint, "fp_xyz789") + } + + func testDecodeNullSystemFingerprint() throws { + let jsonString = """ + { + "id": "some_id", + "object": "chat.completion.chunk", + "created": 1677652288, + "model": "gpt-4o-audio-preview", + "choices": [], + "system_fingerprint": null + } + """ + + let result = try decode(jsonString) + XCTAssertNil(result.systemFingerprint) + } + + private func decode(_ jsonString: String) throws -> AudioChatStreamResult { + let jsonData = jsonString.data(using: .utf8)! + return try decoder.decode(AudioChatStreamResult.self, from: jsonData) + } +} diff --git a/Tests/OpenAITests/AudioConversationManagerTests.swift b/Tests/OpenAITests/AudioConversationManagerTests.swift new file mode 100644 index 00000000..35a9204f --- /dev/null +++ b/Tests/OpenAITests/AudioConversationManagerTests.swift @@ -0,0 +1,329 @@ +// +// AudioConversationManagerTests.swift +// OpenAI +// +// Created by OpenAI SDK Contributors. 
+// + +import XCTest +@testable import OpenAI +#if canImport(Combine) +import Combine +#endif + +@available(iOS 15.0, macOS 12.0, watchOS 8.0, *) +final class AudioConversationManagerTests: XCTestCase { + + func testInitializationWithSystemPrompt() async { + let systemPrompt = "You are a helpful assistant" + let manager = AudioConversationManager( + openAI: MockOpenAI(), + systemPrompt: systemPrompt + ) + + let messageCount = await manager.getMessageCount() + XCTAssertEqual(messageCount, 1, "Should have system message") + + let transcript = await manager.getTranscript() + XCTAssertTrue(transcript.contains(systemPrompt)) + } + + func testInitializationWithoutSystemPrompt() async { + let manager = AudioConversationManager(openAI: MockOpenAI()) + let messageCount = await manager.getMessageCount() + XCTAssertEqual(messageCount, 0, "Should have no messages") + } + + func testSendAudio() async throws { + let mockOpenAI = MockOpenAI() + let manager = AudioConversationManager(openAI: mockOpenAI) + + let audioData = Data("test audio".utf8) + let (responseAudio, transcript) = try await manager.sendAudio(audioData) + + XCTAssertFalse(responseAudio.isEmpty) + XCTAssertEqual(transcript, "Mock response") + + let messageCount = await manager.getMessageCount() + XCTAssertEqual(messageCount, 2, "Should have user message and assistant response") + } + + func testSendText() async throws { + let mockOpenAI = MockOpenAI() + let manager = AudioConversationManager(openAI: mockOpenAI) + + let (responseAudio, transcript) = try await manager.sendText("Hello") + + XCTAssertFalse(responseAudio.isEmpty) + XCTAssertEqual(transcript, "Mock response") + + let messageCount = await manager.getMessageCount() + XCTAssertEqual(messageCount, 2, "Should have user message and assistant response") + } + + func testConversationHistory() async throws { + let mockOpenAI = MockOpenAI() + let manager = AudioConversationManager( + openAI: mockOpenAI, + systemPrompt: "You are helpful" + ) + + _ = try await manager.sendText("First message") + _ = try await manager.sendText("Second message") + + let transcript = await manager.getTranscript() + XCTAssertTrue(transcript.contains("First message")) + XCTAssertTrue(transcript.contains("Second message")) + XCTAssertTrue(transcript.contains("Mock response")) + + let messageCount = await manager.getMessageCount() + XCTAssertEqual(messageCount, 5, "System + 2 user messages + 2 assistant responses") + } + + func testReset() async throws { + let mockOpenAI = MockOpenAI() + let systemPrompt = "You are helpful" + let manager = AudioConversationManager( + openAI: mockOpenAI, + systemPrompt: systemPrompt + ) + + _ = try await manager.sendText("Test message") + + var messageCount = await manager.getMessageCount() + XCTAssertGreaterThan(messageCount, 1) + + await manager.reset() + + messageCount = await manager.getMessageCount() + XCTAssertEqual(messageCount, 1, "Should only have system message after reset") + + let transcript = await manager.getTranscript() + XCTAssertTrue(transcript.contains(systemPrompt)) + XCTAssertFalse(transcript.contains("Test message")) + } + + func testHistoryPruning() async throws { + let mockOpenAI = MockOpenAI() + let manager = AudioConversationManager( + openAI: mockOpenAI, + systemPrompt: "System", + maxHistoryTurns: 2 + ) + + // Send 5 messages (exceeds maxHistoryTurns of 2) + for i in 1...5 { + _ = try await manager.sendText("Message \(i)") + } + + let transcript = await manager.getTranscript() + + // Should keep system message + last 2 turns (4 messages) + 
XCTAssertTrue(transcript.contains("System")) + XCTAssertFalse(transcript.contains("Message 1")) + XCTAssertFalse(transcript.contains("Message 2")) + XCTAssertFalse(transcript.contains("Message 3")) + XCTAssertTrue(transcript.contains("Message 4")) + XCTAssertTrue(transcript.contains("Message 5")) + } + + func testDifferentVoices() async throws { + let mockOpenAI = MockOpenAI() + let manager = AudioConversationManager(openAI: mockOpenAI) + + let voices: [AudioChatQuery.AudioConfig.Voice] = [.alloy, .echo, .fable, .onyx, .nova, .shimmer] + + for voice in voices { + _ = try await manager.sendText("Test", voice: voice) + // Verify it doesn't throw + } + } + + func testDifferentResponseFormats() async throws { + let mockOpenAI = MockOpenAI() + let manager = AudioConversationManager(openAI: mockOpenAI) + + let formats: [AudioFormat] = [.wav, .mp3, .flac, .opus, .pcm16] + + for format in formats { + _ = try await manager.sendText("Test", responseFormat: format) + // Verify it doesn't throw + } + } + + func testGetTranscriptWithAudioMessage() async throws { + let mockOpenAI = MockOpenAI() + let manager = AudioConversationManager(openAI: mockOpenAI) + + let audioData = Data("audio".utf8) + _ = try await manager.sendAudio(audioData) + + let transcript = await manager.getTranscript() + XCTAssertTrue(transcript.contains("[audio]"), "Should indicate audio message") + } + + func testErrorPropagation() async { + let mockOpenAI = MockOpenAI() + mockOpenAI.shouldThrowError = true + let manager = AudioConversationManager(openAI: mockOpenAI) + + do { + _ = try await manager.sendText("Test") + XCTFail("Should throw error") + } catch { + // Expected error + } + } +} + +// MARK: - Mock OpenAI + +@available(iOS 15.0, macOS 12.0, watchOS 8.0, *) +private class MockOpenAI: OpenAIProtocol { + var shouldThrowError = false + + func audioChats(query: AudioChatQuery, completion: @escaping @Sendable (Result) -> Void) -> any CancellableRequest { + if shouldThrowError { + completion(.failure(NSError(domain: "test", code: -1))) + } else { + let mockResult = AudioChatResult( + id: "test-id", + object: "chat.completion", + created: 0, + model: "gpt-4o-audio-preview", + choices: [ + .init( + index: 0, + message: .init( + role: "assistant", + content: "Mock response", + audio: .init( + id: "audio-id", + data: Data("mock audio".utf8).base64EncodedString(), + transcript: "Mock response", + expiresAt: 0 + ) + ), + finishReason: "stop" + ) + ], + usage: nil, + serviceTier: nil, + systemFingerprint: nil + ) + completion(.success(mockResult)) + } + return MockCancellableRequest() + } + + func audioChatsStream(query: AudioChatQuery, onResult: @escaping @Sendable (Result) -> Void, completion: (@Sendable (Error?) -> Void)?) 
+        MockCancellableRequest()
+    }
+
+    // Implement other required protocol methods with minimal implementations
+    var responses: ResponsesEndpointProtocol { fatalError("Not implemented") }
+    func images(query: ImagesQuery, completion: @escaping @Sendable (Result<ImagesResult, Error>) -> Void) -> any CancellableRequest { MockCancellableRequest() }
+    func imageEdits(query: ImageEditsQuery, completion: @escaping @Sendable (Result<ImagesResult, Error>) -> Void) -> any CancellableRequest { MockCancellableRequest() }
+    func imageVariations(query: ImageVariationsQuery, completion: @escaping @Sendable (Result<ImagesResult, Error>) -> Void) -> any CancellableRequest { MockCancellableRequest() }
+    func embeddings(query: EmbeddingsQuery, completion: @escaping @Sendable (Result<EmbeddingsResult, Error>) -> Void) -> any CancellableRequest { MockCancellableRequest() }
+    func chats(query: ChatQuery, completion: @escaping @Sendable (Result<ChatResult, Error>) -> Void) -> any CancellableRequest { MockCancellableRequest() }
+    func chatsStream(query: ChatQuery, onResult: @escaping @Sendable (Result<ChatStreamResult, Error>) -> Void, completion: (@Sendable (Error?) -> Void)?) -> any CancellableRequest { MockCancellableRequest() }
+    func model(query: ModelQuery, completion: @escaping @Sendable (Result<ModelResult, Error>) -> Void) -> any CancellableRequest { MockCancellableRequest() }
+    func models(completion: @escaping @Sendable (Result<ModelsResult, Error>) -> Void) -> any CancellableRequest { MockCancellableRequest() }
+    func moderations(query: ModerationsQuery, completion: @escaping @Sendable (Result<ModerationsResult, Error>) -> Void) -> any CancellableRequest { MockCancellableRequest() }
+    func audioCreateSpeech(query: AudioSpeechQuery, completion: @escaping @Sendable (Result<AudioSpeechResult, Error>) -> Void) -> any CancellableRequest { MockCancellableRequest() }
+    func audioCreateSpeechStream(query: AudioSpeechQuery, onResult: @escaping @Sendable (Result<AudioSpeechResult, Error>) -> Void, completion: (@Sendable (Error?) -> Void)?) -> any CancellableRequest { MockCancellableRequest() }
+    func audioTranscriptions(query: AudioTranscriptionQuery, completion: @escaping @Sendable (Result<AudioTranscriptionResult, Error>) -> Void) -> any CancellableRequest { MockCancellableRequest() }
+    func audioTranscriptionsVerbose(query: AudioTranscriptionQuery, completion: @escaping @Sendable (Result<AudioTranscriptionVerboseResult, Error>) -> Void) -> any CancellableRequest { MockCancellableRequest() }
+    func audioTranscriptionStream(query: AudioTranscriptionQuery, onResult: @escaping @Sendable (Result<AudioTranscriptionStreamResult, Error>) -> Void, completion: (@Sendable (Error?) -> Void)?) -> any CancellableRequest { MockCancellableRequest() }
+    func audioTranslations(query: AudioTranslationQuery, completion: @escaping @Sendable (Result<AudioTranslationResult, Error>) -> Void) -> any CancellableRequest { MockCancellableRequest() }
+    func assistants(after: String?, completion: @escaping @Sendable (Result<AssistantsResult, Error>) -> Void) -> any CancellableRequest { MockCancellableRequest() }
+    func assistantCreate(query: AssistantsQuery, completion: @escaping @Sendable (Result<AssistantResult, Error>) -> Void) -> any CancellableRequest { MockCancellableRequest() }
+    func assistantModify(query: AssistantsQuery, assistantId: String, completion: @escaping @Sendable (Result<AssistantResult, Error>) -> Void) -> any CancellableRequest { MockCancellableRequest() }
+    func threads(query: ThreadsQuery, completion: @escaping @Sendable (Result<ThreadsResult, Error>) -> Void) -> any CancellableRequest { MockCancellableRequest() }
+    func threadRun(query: ThreadRunQuery, completion: @escaping @Sendable (Result<RunResult, Error>) -> Void) -> any CancellableRequest { MockCancellableRequest() }
+    func runs(threadId: String, query: RunsQuery, completion: @escaping @Sendable (Result<RunResult, Error>) -> Void) -> any CancellableRequest { MockCancellableRequest() }
+    func runRetrieve(threadId: String, runId: String, completion: @escaping @Sendable (Result<RunResult, Error>) -> Void) -> any CancellableRequest { MockCancellableRequest() }
+    func runRetrieveSteps(threadId: String, runId: String, before: String?, completion: @escaping @Sendable (Result<RunRetrieveStepsResult, Error>) -> Void) -> any CancellableRequest { MockCancellableRequest() }
+    func runSubmitToolOutputs(threadId: String, runId: String, query: RunToolOutputsQuery, completion: @escaping @Sendable (Result<RunResult, Error>) -> Void) -> any CancellableRequest { MockCancellableRequest() }
+    func threadsMessages(threadId: String, before: String?, completion: @escaping @Sendable (Result<ThreadsMessagesResult, Error>) -> Void) -> any CancellableRequest { MockCancellableRequest() }
+    func threadsAddMessage(threadId: String, query: MessageQuery, completion: @escaping @Sendable (Result<ThreadAddMessageResult, Error>) -> Void) -> any CancellableRequest { MockCancellableRequest() }
+    func files(query: FilesQuery, completion: @escaping @Sendable (Result<FilesResult, Error>) -> Void) -> any CancellableRequest { MockCancellableRequest() }
+
+    // OpenAIAsync protocol requirements
+    func images(query: ImagesQuery) async throws -> ImagesResult { fatalError() }
+    func imageEdits(query: ImageEditsQuery) async throws -> ImagesResult { fatalError() }
+    func imageVariations(query: ImageVariationsQuery) async throws -> ImagesResult { fatalError() }
+    func embeddings(query: EmbeddingsQuery) async throws -> EmbeddingsResult { fatalError() }
+    func chats(query: ChatQuery) async throws -> ChatResult { fatalError() }
+    func chatsStream(query: ChatQuery) -> AsyncThrowingStream<ChatStreamResult, Error> { fatalError() }
+    func model(query: ModelQuery) async throws -> ModelResult { fatalError() }
+    func models() async throws -> ModelsResult { fatalError() }
+    func moderations(query: ModerationsQuery) async throws -> ModerationsResult { fatalError() }
+    func audioCreateSpeech(query: AudioSpeechQuery) async throws -> AudioSpeechResult { fatalError() }
+    func audioTranscriptions(query: AudioTranscriptionQuery) async throws -> AudioTranscriptionResult { fatalError() }
+    func audioTranscriptionsVerbose(query: AudioTranscriptionQuery) async throws -> AudioTranscriptionVerboseResult { fatalError() }
+    func audioTranscriptionStream(query: AudioTranscriptionQuery) -> AsyncThrowingStream<AudioTranscriptionStreamResult, Error> { fatalError() }
+    func audioTranslations(query: AudioTranslationQuery) async throws -> AudioTranslationResult { fatalError() }
+    func audioChats(query: AudioChatQuery) async throws -> AudioChatResult {
+        // Bridge the completion-based mock into the async requirement.
+        try await withCheckedThrowingContinuation { continuation in
+            _ = audioChats(query: query) { result in
+                continuation.resume(with: result)
+            }
+        }
+    }
+    func audioChatsStream(query: AudioChatQuery) -> AsyncThrowingStream<AudioChatStreamResult, Error> { fatalError() }
+    func assistants() async throws -> AssistantsResult { fatalError() }
+    func assistants(after: String?) async throws -> AssistantsResult { fatalError() }
+    func assistantCreate(query: AssistantsQuery) async throws -> AssistantResult { fatalError() }
+    func assistantModify(query: AssistantsQuery, assistantId: String) async throws -> AssistantResult { fatalError() }
+    func threads(query: ThreadsQuery) async throws -> ThreadsResult { fatalError() }
+    func threadRun(query: ThreadRunQuery) async throws -> RunResult { fatalError() }
+    func runs(threadId: String, query: RunsQuery) async throws -> RunResult { fatalError() }
+    func runRetrieve(threadId: String, runId: String) async throws -> RunResult { fatalError() }
+    func runRetrieveSteps(threadId: String, runId: String) async throws -> RunRetrieveStepsResult { fatalError() }
+    func runRetrieveSteps(threadId: String, runId: String, before: String?) async throws -> RunRetrieveStepsResult { fatalError() }
+    func runSubmitToolOutputs(threadId: String, runId: String, query: RunToolOutputsQuery) async throws -> RunResult { fatalError() }
+    func threadsMessages(threadId: String) async throws -> ThreadsMessagesResult { fatalError() }
+    func threadsMessages(threadId: String, before: String?) async throws -> ThreadsMessagesResult { fatalError() }
+    func threadsAddMessage(threadId: String, query: MessageQuery) async throws -> ThreadAddMessageResult { fatalError() }
+    func files(query: FilesQuery) async throws -> FilesResult { fatalError() }
+
+    #if canImport(Combine)
+    // OpenAICombine protocol requirements
+    func images(query: ImagesQuery) -> AnyPublisher<ImagesResult, Error> { fatalError() }
+    func imageEdits(query: ImageEditsQuery) -> AnyPublisher<ImagesResult, Error> { fatalError() }
+    func imageVariations(query: ImageVariationsQuery) -> AnyPublisher<ImagesResult, Error> { fatalError() }
+    func embeddings(query: EmbeddingsQuery) -> AnyPublisher<EmbeddingsResult, Error> { fatalError() }
+    func chats(query: ChatQuery) -> AnyPublisher<ChatResult, Error> { fatalError() }
+    func chatsStream(query: ChatQuery) -> AnyPublisher<Result<ChatStreamResult, Error>, Error> { fatalError() }
+    func model(query: ModelQuery) -> AnyPublisher<ModelResult, Error> { fatalError() }
+    func models() -> AnyPublisher<ModelsResult, Error> { fatalError() }
+    func moderations(query: ModerationsQuery) -> AnyPublisher<ModerationsResult, Error> { fatalError() }
+    func audioCreateSpeech(query: AudioSpeechQuery) -> AnyPublisher<AudioSpeechResult, Error> { fatalError() }
+    func audioTranscriptions(query: AudioTranscriptionQuery) -> AnyPublisher<AudioTranscriptionResult, Error> { fatalError() }
+    func audioTranslations(query: AudioTranslationQuery) -> AnyPublisher<AudioTranslationResult, Error> { fatalError() }
+    func audioChats(query: AudioChatQuery) -> AnyPublisher<AudioChatResult, Error> { fatalError() }
+    func audioChatsStream(query: AudioChatQuery) -> AnyPublisher<Result<AudioChatStreamResult, Error>, Error> { fatalError() }
+    func assistants() -> AnyPublisher<AssistantsResult, Error> { fatalError() }
+    func assistants(after: String?) -> AnyPublisher<AssistantsResult, Error> { fatalError() }
+    func assistantCreate(query: AssistantsQuery) -> AnyPublisher<AssistantResult, Error> { fatalError() }
+    func assistantModify(query: AssistantsQuery, assistantId: String) -> AnyPublisher<AssistantResult, Error> { fatalError() }
+    func threads(query: ThreadsQuery) -> AnyPublisher<ThreadsResult, Error> { fatalError() }
+    func threadRun(query: ThreadRunQuery) -> AnyPublisher<RunResult, Error> { fatalError() }
+    func runs(threadId: String, query: RunsQuery) -> AnyPublisher<RunResult, Error> { fatalError() }
+    func runRetrieve(threadId: String, runId: String) -> AnyPublisher<RunResult, Error> { fatalError() }
+    func runRetrieveSteps(threadId: String, runId: String) -> AnyPublisher<RunRetrieveStepsResult, Error> { fatalError() }
+    func runRetrieveSteps(threadId: String, runId: String, before: String?) -> AnyPublisher<RunRetrieveStepsResult, Error> { fatalError() }
+    func runSubmitToolOutputs(threadId: String, runId: String, query: RunToolOutputsQuery) -> AnyPublisher<RunResult, Error> { fatalError() }
+    func threadsMessages(threadId: String) -> AnyPublisher<ThreadsMessagesResult, Error> { fatalError() }
+    func threadsMessages(threadId: String, before: String?) -> AnyPublisher<ThreadsMessagesResult, Error> { fatalError() }
+    func threadsAddMessage(threadId: String, query: MessageQuery) -> AnyPublisher<ThreadAddMessageResult, Error> { fatalError() }
+    func files(query: FilesQuery) -> AnyPublisher<FilesResult, Error> { fatalError() }
+    #endif
+}
+
+@available(iOS 15.0, macOS 12.0, watchOS 8.0, *)
+private final class MockCancellableRequest: CancellableRequest {
+    func cancelRequest() {}
+}