From 59845eecc577984734746067a10acef9f6b63cbe Mon Sep 17 00:00:00 2001 From: altic-dev Date: Tue, 30 Jun 2026 07:40:04 -0700 Subject: [PATCH 01/10] add dictionary replacement training --- Sources/Fluid/Services/ASRService.swift | 64 +- .../Fluid/Services/FluidAudioProvider.swift | 4 + .../Fluid/Services/GlobalHotkeyManager.swift | 12 + .../LocalAPI/DictionaryAPIController.swift | 9 +- Sources/Fluid/Theme/NativeButtonStyles.swift | 14 +- Sources/Fluid/UI/CustomDictionaryView.swift | 928 +++++++++++++++++- .../DictationE2ETests.swift | 50 + 7 files changed, 1032 insertions(+), 49 deletions(-) diff --git a/Sources/Fluid/Services/ASRService.swift b/Sources/Fluid/Services/ASRService.swift index 29d4d7b2..f8a244fa 100644 --- a/Sources/Fluid/Services/ASRService.swift +++ b/Sources/Fluid/Services/ASRService.swift @@ -94,6 +94,7 @@ final class ASRService: ObservableObject { @Published var downloadProgress: Double? = nil @Published var downloadingModelId: String? = nil // Tracks which model is currently being downloaded @Published private(set) var isCancellingModelDownload: Bool = false + @Published private(set) var isDictionaryTrainingCaptureActive: Bool = false private var isStarting: Bool = false // Guard against re-entrant start() calls private var hasCompletedFirstTranscription: Bool = false // Track if model has warmed up with first transcription @@ -839,7 +840,7 @@ final class ASRService: ObservableObject { /// ## Errors /// If audio session configuration fails, the method will silently fail /// and `isRunning` will remain `false`. Check the debug logs for details. - func start() async { + func start(forDictionaryTraining: Bool = false) async { DebugLogger.shared.info("🎀 START() called - beginning recording session", source: "ASRService") guard self.micStatus == .authorized else { @@ -882,6 +883,7 @@ final class ASRService: ObservableObject { self.isStarting = true defer { self.isStarting = false } + self.isDictionaryTrainingCaptureActive = false do { DebugLogger.shared.debug("βš™οΈ Calling configureSession()...", source: "ASRService") @@ -907,6 +909,7 @@ final class ASRService: ObservableObject { } self.isRunning = true + self.isDictionaryTrainingCaptureActive = forDictionaryTraining DebugLogger.shared.info("βœ… isRunning set to TRUE", source: "ASRService") // Start monitoring the currently bound device for disconnection @@ -919,15 +922,18 @@ final class ASRService: ObservableObject { // Only start streaming for models that support it (large Whisper models are too slow) let model = SettingsStore.shared.selectedSpeechModel - if model.supportsStreaming { + if model.supportsStreaming, !forDictionaryTraining { DebugLogger.shared.debug("πŸ“‘ Starting streaming transcription...", source: "ASRService") self.benchmarkLog("streaming_timer_start intervalMs=\(Int((self.streamingChunkDurationSeconds * 1000).rounded())) minSamples=\(self.minimumStreamingPreviewSamples)") self.startStreamingTranscription() + } else if forDictionaryTraining { + DebugLogger.shared.debug("⏸️ Skipping streaming for dictionary training sample", source: "ASRService") } else { DebugLogger.shared.debug("⏸️ Skipping streaming - model '\(model.displayName)' does not support real-time chunk processing", source: "ASRService") } DebugLogger.shared.info("βœ… START() completed successfully", source: "ASRService") } catch { + self.isDictionaryTrainingCaptureActive = false DebugLogger.shared.error("Failed to start ASR session: \(error)", source: "ASRService") // Resume media if we paused it before the failure @@ -995,17 +1001,25 @@ final class ASRService: ObservableObject { /// final transcription pass. Use this for immediate stop cues that /// shouldn't wait on finalization. Only invoked when capture was actually /// running (i.e. not when `stop()` early-returns because `isRunning` is false). - func stop(onCaptureStopped: (@MainActor () -> Void)? = nil) async -> String { + func stop( + onCaptureStopped: (@MainActor () -> Void)? = nil, + forDictionaryTraining: Bool = false + ) async -> String { DebugLogger.shared.info("πŸ›‘ STOP() called - beginning shutdown sequence", source: "ASRService") self.lastCompletedAudioSnapshot = nil let stopStartedAt = Date().timeIntervalSince1970 self.benchmarkLog("stop_start ageMs=\(self.elapsedMilliseconds(since: self.benchmarkRecordingStartedAt)) bufferedSamples=\(self.audioBuffer.count)") guard self.isRunning else { + self.isDictionaryTrainingCaptureActive = false DebugLogger.shared.warning("⚠️ STOP() - not running, returning empty string", source: "ASRService") return "" } - defer { self.applyPendingParakeetVocabularyReloadIfNeeded() } + let useDictionaryTrainingPath = forDictionaryTraining || self.isDictionaryTrainingCaptureActive + defer { + self.applyPendingParakeetVocabularyReloadIfNeeded() + self.isDictionaryTrainingCaptureActive = false + } self.audioRouteRecoveryTask?.cancel() self.audioRouteRecoveryTask = nil @@ -1021,7 +1035,9 @@ final class ASRService: ObservableObject { self.audioCapturePipeline.setRecordingEnabled(false) DebugLogger.shared.debug("βœ… Capture pipeline disabled", source: "ASRService") - await self.runFastPreviewStopGraceIfNeeded() + if !useDictionaryTrainingPath { + await self.runFastPreviewStopGraceIfNeeded() + } // CRITICAL: Set isRunning to false before teardown so in-flight chunks stop safely. DebugLogger.shared.debug("🚫 Setting isRunning = false...", source: "ASRService") @@ -1136,8 +1152,19 @@ final class ASRService: ObservableObject { let finalStartedAt = Date().timeIntervalSince1970 let result: ASRTranscriptionResult let finalSource: String - if let fluidProvider = provider as? FluidAudioProvider, - let cachedResult = await fluidProvider.transcribeCachedStreamingPreviewIfAvailable(pcm) + if useDictionaryTrainingPath { + if let fluidProvider = provider as? FluidAudioProvider { + result = try await self.transcriptionExecutor.run { [fluidProvider] in + try await fluidProvider.transcribeDictionaryTraining(pcm) + } + } else { + result = try await self.transcriptionExecutor.run { [provider] in + try await provider.transcribeFinal(pcm) + } + } + finalSource = "dictionaryTraining" + } else if let fluidProvider = provider as? FluidAudioProvider, + let cachedResult = await fluidProvider.transcribeCachedStreamingPreviewIfAvailable(pcm) { result = cachedResult finalSource = "livePreview" @@ -1175,11 +1202,17 @@ final class ASRService: ObservableObject { } // Do not update self.finalText here to avoid instant binding insert in playground - let cleanedText = ASRService.applyCustomDictionary(ASRService.removeFillerWords(result.text)) - self.recordWordBoostHitIfAny(transcribedText: cleanedText) - DebugLogger.shared.debug("After post-processing: '\(cleanedText)'", source: "ASRService") - self.benchmarkLog("stop_end result=success totalMs=\(self.elapsedMilliseconds(since: stopStartedAt)) recordingAgeMs=\(self.elapsedMilliseconds(since: self.benchmarkRecordingStartedAt)) cleanedChars=\(cleanedText.count)") - if SettingsStore.shared.saveTranscriptionHistory, + let textWithoutFillers = ASRService.removeFillerWords(result.text) + let outputText = useDictionaryTrainingPath + ? textWithoutFillers + : ASRService.applyCustomDictionary(textWithoutFillers) + if !useDictionaryTrainingPath { + self.recordWordBoostHitIfAny(transcribedText: outputText) + } + DebugLogger.shared.debug("After post-processing: '\(outputText)'", source: "ASRService") + self.benchmarkLog("stop_end result=success totalMs=\(self.elapsedMilliseconds(since: stopStartedAt)) recordingAgeMs=\(self.elapsedMilliseconds(since: self.benchmarkRecordingStartedAt)) cleanedChars=\(outputText.count)") + if !useDictionaryTrainingPath, + SettingsStore.shared.saveTranscriptionHistory, SettingsStore.shared.saveAudioWithTranscriptionHistory, !capturedPCM.isEmpty { @@ -1196,7 +1229,7 @@ final class ASRService: ObservableObject { DebugLogger.shared.info("🎡 Resumed system media after transcription", source: "ASRService") } - return cleanedText + return outputText } catch { DebugLogger.shared.error("ASR transcription failed: \(error)", source: "ASRService") DebugLogger.shared.error("Error details: \(error.localizedDescription)", source: "ASRService") @@ -1307,7 +1340,10 @@ final class ASRService: ObservableObject { func stopWithoutTranscription() async { guard self.isRunning else { return } - defer { self.applyPendingParakeetVocabularyReloadIfNeeded() } + defer { + self.applyPendingParakeetVocabularyReloadIfNeeded() + self.isDictionaryTrainingCaptureActive = false + } self.audioRouteRecoveryTask?.cancel() self.audioRouteRecoveryTask = nil diff --git a/Sources/Fluid/Services/FluidAudioProvider.swift b/Sources/Fluid/Services/FluidAudioProvider.swift index 130b1fdd..1304e351 100644 --- a/Sources/Fluid/Services/FluidAudioProvider.swift +++ b/Sources/Fluid/Services/FluidAudioProvider.swift @@ -201,6 +201,10 @@ final class FluidAudioProvider: TranscriptionProvider { return ASRTranscriptionResult(text: result.text, confidence: result.confidence) } + func transcribeDictionaryTraining(_ samples: [Float]) async throws -> ASRTranscriptionResult { + try await self.transcribeStreaming(samples) + } + func transcribeFinal(_ samples: [Float]) async throws -> ASRTranscriptionResult { guard let manager = self.finalAsrManager ?? self.streamingAsrManager else { throw NSError( diff --git a/Sources/Fluid/Services/GlobalHotkeyManager.swift b/Sources/Fluid/Services/GlobalHotkeyManager.swift index c0272aa5..7c9b271d 100644 --- a/Sources/Fluid/Services/GlobalHotkeyManager.swift +++ b/Sources/Fluid/Services/GlobalHotkeyManager.swift @@ -1786,6 +1786,10 @@ final class GlobalHotkeyManager: NSObject { DebugLogger.shared.debug("Ignoring \(label) - stop already processing", source: "GlobalHotkeyManager") return false } + guard !self.asrService.isDictionaryTrainingCaptureActive else { + DebugLogger.shared.debug("Ignoring \(label) - dictionary training capture is active", source: "GlobalHotkeyManager") + return false + } return true } @@ -1835,6 +1839,10 @@ final class GlobalHotkeyManager: NSObject { DebugLogger.shared.debug("Ignoring stop - already processing", source: "GlobalHotkeyManager") return } + guard !self.asrService.isDictionaryTrainingCaptureActive else { + DebugLogger.shared.debug("Ignoring stop - dictionary training capture is active", source: "GlobalHotkeyManager") + return + } guard self.asrService.isRunning else { return @@ -1847,6 +1855,10 @@ final class GlobalHotkeyManager: NSObject { @MainActor private func stopRecordingInternal() async { guard self.asrService.isRunning else { return } + guard !self.asrService.isDictionaryTrainingCaptureActive else { + DebugLogger.shared.debug("Stop ignored - dictionary training capture is active", source: "GlobalHotkeyManager") + return + } guard !self.isProcessingStop else { DebugLogger.shared.debug("Stop already in progress, ignoring", source: "GlobalHotkeyManager") return diff --git a/Sources/Fluid/Services/LocalAPI/DictionaryAPIController.swift b/Sources/Fluid/Services/LocalAPI/DictionaryAPIController.swift index f0eeb714..99641ae9 100644 --- a/Sources/Fluid/Services/LocalAPI/DictionaryAPIController.swift +++ b/Sources/Fluid/Services/LocalAPI/DictionaryAPIController.swift @@ -132,6 +132,7 @@ struct DictionaryAPIController: LocalAPIRouteHandler { let incoming = try self.replacementEntries(from: payload) var stored = payload.mode == .replace ? [] : SettingsStore.shared.customDictionaryEntries + var incomingEntries: [SettingsStore.CustomDictionaryEntry] = [] for entry in incoming { let normalized = Self.storeEntry(from: entry) guard !normalized.triggers.isEmpty, @@ -144,10 +145,14 @@ struct DictionaryAPIController: LocalAPIRouteHandler { if let id = entry.id, existing.id == id { return true } return existing.replacement.caseInsensitiveCompare(normalized.replacement) == .orderedSame } - stored.append(normalized) + incomingEntries.removeAll { existing in + if let id = entry.id, existing.id == id { return true } + return existing.replacement.caseInsensitiveCompare(normalized.replacement) == .orderedSame + } + incomingEntries.append(normalized) } - SettingsStore.shared.customDictionaryEntries = stored + SettingsStore.shared.customDictionaryEntries = incomingEntries + stored ASRService.invalidateDictionaryCache() NotificationCenter.default.post(name: .parakeetVocabularyDidChange, object: nil) return self.getReplacements() diff --git a/Sources/Fluid/Theme/NativeButtonStyles.swift b/Sources/Fluid/Theme/NativeButtonStyles.swift index 06924650..9845e605 100644 --- a/Sources/Fluid/Theme/NativeButtonStyles.swift +++ b/Sources/Fluid/Theme/NativeButtonStyles.swift @@ -18,6 +18,7 @@ enum FluidButtonRole { case glass case compact case accent + case destructive case inline } @@ -77,6 +78,8 @@ extension View { self.buttonStyle(CompactButtonStyle(height: size.controlHeight)) case .accent: self.buttonStyle(AccentButtonStyle(compact: size.accentCompact)) + case .destructive: + self.buttonStyle(AccentButtonStyle(compact: size.accentCompact, tone: Color(nsColor: .systemRed))) case .inline: self.buttonStyle(InlineButtonStyle()) } @@ -389,9 +392,10 @@ struct CompactButtonStyle: ButtonStyle { struct AccentButtonStyle: ButtonStyle { var compact: Bool = false + var tone: Color? = nil func makeBody(configuration: Configuration) -> some View { - AccentButton(configuration: configuration, compact: self.compact) + AccentButton(configuration: configuration, compact: self.compact, tone: self.tone) } private struct AccentButton: View { @@ -399,12 +403,14 @@ struct AccentButtonStyle: ButtonStyle { @State private var isHovered = false let configuration: ButtonStyle.Configuration let compact: Bool + let tone: Color? private var shape: RoundedRectangle { RoundedRectangle(cornerRadius: self.compact ? 8 : self.theme.metrics.corners.md, style: .continuous) } var body: some View { + let tone = self.tone ?? self.theme.palette.accent self.configuration.label .fontWeight(.semibold) .padding(.horizontal, self.compact ? 12 : self.theme.metrics.spacing.lg) @@ -416,8 +422,8 @@ struct AccentButtonStyle: ButtonStyle { .fill( LinearGradient( colors: [ - self.theme.palette.accent, - self.theme.palette.accent.opacity(0.85), + tone, + tone.opacity(0.85), ], startPoint: .top, endPoint: .bottom @@ -429,7 +435,7 @@ struct AccentButtonStyle: ButtonStyle { .stroke(Color.white.opacity(self.isHovered ? 0.3 : 0.15), lineWidth: 1) ) .shadow( - color: self.theme.palette.accent.opacity(self.isHovered ? 0.5 : 0.3), + color: tone.opacity(self.isHovered ? 0.5 : 0.3), radius: self.isHovered ? 6 : 4, x: 0, y: self.isHovered ? 3 : 2 diff --git a/Sources/Fluid/UI/CustomDictionaryView.swift b/Sources/Fluid/UI/CustomDictionaryView.swift index 36a7e10c..8c74f7fa 100644 --- a/Sources/Fluid/UI/CustomDictionaryView.swift +++ b/Sources/Fluid/UI/CustomDictionaryView.swift @@ -12,9 +12,13 @@ import UniformTypeIdentifiers struct CustomDictionaryView: View { @Environment(\.theme) private var theme + @Environment(\.accessibilityReduceMotion) private var reduceMotion + @EnvironmentObject private var appServices: AppServices + + private var asr: ASRService { self.appServices.asr } + @State private var entries: [SettingsStore.CustomDictionaryEntry] = SettingsStore.shared.customDictionaryEntries @State private var boostTerms: [ParakeetVocabularyStore.VocabularyConfig.Term] = [] - @State private var showAddSheet = false @State private var editingEntry: SettingsStore.CustomDictionaryEntry? @State private var showAddBoostSheet = false @State private var editingBoostTerm: EditableBoostTerm? @@ -24,23 +28,114 @@ struct CustomDictionaryView: View { @State private var vocabBoostingEnabled: Bool = SettingsStore.shared.vocabularyBoostingEnabled @State private var isBoostingInfoPresented = false + @State private var trainingReplacement = "" + @State private var trainingVariants: [String] = [] + @State private var trainingStatusMessage = "Type the correct text." + @State private var trainingHasError = false + @State private var isTrainingActive = false + @State private var isTrainingRecording = false + @State private var isTrainingProcessing = false + @State private var replacementConfirmation: ReplacementConfirmation? + @State private var composerMode: DictionaryComposerMode = .train + @State private var manualTriggersText = "" + @State private var manualReplacement = "" + + private var normalizedTrainingReplacement: String { + self.trainingReplacement.trimmingCharacters(in: .whitespacesAndNewlines) + } + + private var trainingProgressText: String { + let count = self.trainingVariants.count + let target = count <= CustomDictionaryTrainingMerge.recommendedSamples + ? CustomDictionaryTrainingMerge.recommendedSamples + : CustomDictionaryTrainingMerge.maxSamples + return "\(count)/\(target)" + } + + private var shouldShowTrainingStatus: Bool { + self.trainingHasError || ( + !self.trainingStatusMessage.isEmpty && + self.trainingStatusMessage != "Type the correct text." + ) + } + + private var canUseTrainingRecorderButton: Bool { + self.isTrainingRecording || self.canRecordTrainingSample + } + + private var trainingRecorderTitle: String { + if self.isTrainingProcessing { + return "Working..." + } + if self.isTrainingRecording { + return "Listening..." + } + if self.normalizedTrainingReplacement.isEmpty { + return "Record sample" + } + return self.trainingVariants.isEmpty ? "Say it once" : "Say it again" + } + + private var trainingRecorderDetail: String { + self.normalizedTrainingReplacement.isEmpty + ? "Type the correct text first." + : "\"\(self.normalizedTrainingReplacement)\"" + } + + private var canStartTraining: Bool { + !self.normalizedTrainingReplacement.isEmpty && + !self.isTrainingRecording && + !self.isTrainingProcessing + } + + private var canRecordTrainingSample: Bool { + !self.normalizedTrainingReplacement.isEmpty && + !self.isTrainingProcessing && + !self.asr.isRunning && + self.trainingVariants.count < CustomDictionaryTrainingMerge.maxSamples + } + + private var canAddTrainedReplacement: Bool { + !self.normalizedTrainingReplacement.isEmpty && + !self.trainingVariants.isEmpty && + !self.isTrainingRecording && + !self.isTrainingProcessing + } + + private var manualTriggers: [String] { + CustomDictionaryManualEntry.parseTriggers(self.manualTriggersText) + } + + private var manualDuplicateTriggers: [String] { + self.manualTriggers.filter { self.allExistingTriggers().contains($0) } + } + + private var canAddManualReplacement: Bool { + !self.manualTriggers.isEmpty && + !self.manualReplacement.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty && + self.manualDuplicateTriggers.isEmpty + } + var body: some View { ScrollView(.vertical, showsIndicators: false) { VStack(alignment: .leading, spacing: self.theme.metrics.spacing.xl) { self.pageHeader VStack(alignment: .leading, spacing: self.theme.metrics.spacing.xxl) { - self.instantReplacementSection + self.trainReplacementSection + self.yourDictionarySection self.aiPostProcessingSection } } .frame(maxWidth: 860, alignment: .leading) .padding(self.theme.metrics.spacing.xl) } - .sheet(isPresented: self.$showAddSheet) { - AddDictionaryEntrySheet(existingTriggers: self.allExistingTriggers()) { newEntry in - self.entries.append(newEntry) - self.saveEntries() + .overlay { + if let confirmation = self.replacementConfirmation { + ReplacementConfirmationToast(confirmation: confirmation) + .padding(self.theme.metrics.spacing.xl) + .transition(.scale(scale: 0.92).combined(with: .opacity)) + .allowsHitTesting(false) } } .sheet(item: self.$editingEntry) { entry in @@ -73,6 +168,14 @@ struct CustomDictionaryView: View { .onAppear { self.loadBoostTerms() } + .onDisappear { + guard self.isTrainingRecording else { return } + Task { @MainActor in + _ = await self.asr.stop(forDictionaryTraining: true) + self.isTrainingRecording = false + self.isTrainingProcessing = false + } + } } // MARK: - Page Header @@ -129,17 +232,295 @@ struct CustomDictionaryView: View { .frame(width: 34, height: 34) } - // MARK: - Instant Replacement + // MARK: - Teach Words - private var instantReplacementSection: some View { + private var trainReplacementSection: some View { ThemedCard(style: .standard, hoverEffect: false) { VStack(alignment: .leading, spacing: self.theme.metrics.spacing.lg) { HStack(alignment: .center, spacing: self.theme.metrics.spacing.md) { - self.settingsIconTile(systemName: "arrow.left.arrow.right") + self.settingsIconTile(systemName: "mic.fill") + + VStack(alignment: .leading, spacing: 3) { + Text("Teach Words") + .font(self.theme.typography.sectionTitle) + Text("Show FluidVoice the right spelling, by voice or by typing.") + .font(self.theme.typography.caption) + .foregroundStyle(self.theme.palette.secondaryText) + } + } + + self.dictionaryComposerModePicker + + Group { + switch self.composerMode { + case .train: + self.trainReplacementComposer + case .manual: + self.manualReplacementComposer + } + } + .frame(height: 260, alignment: .topLeading) + } + } + .frame(maxWidth: .infinity, alignment: .leading) + } + + private var dictionaryComposerModePicker: some View { + VStack(alignment: .leading, spacing: self.theme.metrics.spacing.sm) { + self.dictionaryComposerModeSegmented + + Text(self.composerMode.detail) + .font(self.theme.typography.caption) + .foregroundStyle(self.theme.palette.secondaryText) + .fixedSize(horizontal: false, vertical: true) + } + } + + private var dictionaryComposerModeSegmented: some View { + HStack(spacing: 2) { + ForEach(DictionaryComposerMode.allCases) { mode in + DictionaryComposerModeTab( + mode: mode, + isSelected: self.composerMode == mode, + isDisabled: self.isTrainingRecording || self.isTrainingProcessing + ) { + self.selectComposerMode(mode) + } + } + } + .padding(3) + .background( + RoundedRectangle(cornerRadius: self.theme.metrics.corners.md, style: .continuous) + .fill(self.theme.palette.contentBackground.opacity(0.5)) + .overlay( + RoundedRectangle(cornerRadius: self.theme.metrics.corners.md, style: .continuous) + .stroke(self.theme.palette.cardBorder.opacity(0.25), lineWidth: 1) + ) + ) + } + + private var trainReplacementComposer: some View { + VStack(alignment: .leading, spacing: self.theme.metrics.spacing.md) { + TextField("Type the correct text, e.g. FluidVoice", text: self.$trainingReplacement) + .textFieldStyle(.roundedBorder) + .disabled(self.isTrainingRecording || self.isTrainingProcessing) + .onChange(of: self.trainingReplacement) { oldValue, newValue in + self.handleTrainingReplacementChange(oldValue: oldValue, newValue: newValue) + } + + self.trainingRecorderPanel + + if !self.trainingVariants.isEmpty { + self.trainingHeardSection + } + + self.trainingFooter + + Spacer(minLength: 0) + + Button { + self.addTrainedReplacement() + } label: { + Label("Add Replacement", systemImage: "plus") + .frame(maxWidth: .infinity) + .frame(height: 38) + } + .fluidButton(.accent, size: .small) + .disabled(!self.canAddTrainedReplacement) + .opacity(self.canAddTrainedReplacement ? 1 : 0.45) + } + } + + private var manualReplacementComposer: some View { + VStack(alignment: .leading, spacing: self.theme.metrics.spacing.md) { + ViewThatFits(in: .horizontal) { + HStack(alignment: .top, spacing: self.theme.metrics.spacing.md) { + self.manualTriggerField + self.manualReplacementField + } + + VStack(alignment: .leading, spacing: self.theme.metrics.spacing.md) { + self.manualTriggerField + self.manualReplacementField + } + } + + if !self.manualDuplicateTriggers.isEmpty { + Label("Already used: \(self.manualDuplicateTriggers.joined(separator: ", "))", systemImage: "exclamationmark.triangle.fill") + .font(self.theme.typography.caption) + .foregroundStyle(self.theme.palette.warning) + } + + if !self.manualTriggers.isEmpty || !self.manualReplacement.isEmpty { + FlowLayout(spacing: 6) { + ForEach(self.manualTriggers, id: \.self) { trigger in + DictionaryPreviewChip(text: trigger) + } + + Image(systemName: "arrow.right") + .font(self.theme.typography.caption) + .foregroundStyle(self.theme.palette.tertiaryText) + + Text(self.manualReplacement.trimmingCharacters(in: .whitespacesAndNewlines)) + .font(self.theme.typography.captionStrong) + .foregroundStyle(self.theme.palette.accent) + } + } + + Spacer(minLength: 0) + + Button { + self.addManualReplacementIfValid() + } label: { + Label("Add Replacement", systemImage: "plus") + .frame(maxWidth: .infinity) + .frame(height: 38) + } + .fluidButton(.accent, size: .small) + .disabled(!self.canAddManualReplacement) + .opacity(self.canAddManualReplacement ? 1 : 0.45) + } + } + + private var manualTriggerField: some View { + VStack(alignment: .leading, spacing: self.theme.metrics.spacing.sm) { + Text("When FluidVoice hears") + .font(self.theme.typography.captionStrong) + TextField("fluid voice, fluid boys", text: self.$manualTriggersText) + .textFieldStyle(.roundedBorder) + .onSubmit { self.addManualReplacementIfValid() } + Text("Separate multiple versions with commas.") + .font(self.theme.typography.caption) + .foregroundStyle(self.theme.palette.secondaryText) + } + } + + private var manualReplacementField: some View { + VStack(alignment: .leading, spacing: self.theme.metrics.spacing.sm) { + Text("Change it to") + .font(self.theme.typography.captionStrong) + TextField("FluidVoice", text: self.$manualReplacement) + .textFieldStyle(.roundedBorder) + .onSubmit { self.addManualReplacementIfValid() } + Text("This is what appears in your transcription.") + .font(self.theme.typography.caption) + .foregroundStyle(self.theme.palette.secondaryText) + } + } + + private var trainingRecorderPanel: some View { + HStack(alignment: .center, spacing: self.theme.metrics.spacing.md) { + VStack(alignment: .leading, spacing: 6) { + Text(self.trainingRecorderTitle) + .font(self.theme.typography.bodySmallStrong) + + Text(self.trainingRecorderDetail) + .font(self.theme.typography.caption) + .foregroundStyle(self.theme.palette.secondaryText) + .lineLimit(1) + + HStack(spacing: 7) { + TrainingProgressDots(count: self.trainingVariants.count) + Text("\(self.trainingProgressText) recorded") + .font(self.theme.typography.caption) + .foregroundStyle(self.theme.palette.tertiaryText) + } + } + + Spacer() + + Button { + Task { + if self.isTrainingRecording { + await self.stopTrainingSample() + } else { + await self.startTrainingSample() + } + } + } label: { + Label(self.isTrainingRecording ? "Stop" : "Record", systemImage: self.isTrainingRecording ? "stop.fill" : "mic.fill") + } + .fluidButton(self.isTrainingRecording ? .destructive : .accent, size: .small) + .disabled(!self.canUseTrainingRecorderButton) + .opacity(self.canUseTrainingRecorderButton ? 1 : 0.45) + } + .padding(self.theme.metrics.spacing.md) + .background( + RoundedRectangle(cornerRadius: self.theme.metrics.corners.md, style: .continuous) + .fill(self.theme.palette.contentBackground.opacity(0.5)) + .overlay( + RoundedRectangle(cornerRadius: self.theme.metrics.corners.md, style: .continuous) + .stroke(self.theme.palette.cardBorder.opacity(0.25), lineWidth: 1) + ) + ) + } + + private var trainingHeardSection: some View { + VStack(alignment: .leading, spacing: self.theme.metrics.spacing.sm) { + Text("Heard") + .font(self.theme.typography.captionStrong) + .foregroundStyle(self.theme.palette.secondaryText) + + FlowLayout(spacing: 6) { + ForEach(self.trainingVariants, id: \.self) { variant in + TrainingVariantChip(variant: variant) { + self.trainingVariants.removeAll { $0 == variant } + } + } + } + } + .padding(self.theme.metrics.spacing.md) + .background( + RoundedRectangle(cornerRadius: self.theme.metrics.corners.md, style: .continuous) + .fill(self.theme.palette.contentBackground.opacity(0.5)) + .overlay( + RoundedRectangle(cornerRadius: self.theme.metrics.corners.md, style: .continuous) + .stroke(self.theme.palette.cardBorder.opacity(0.25), lineWidth: 1) + ) + ) + } + + @ViewBuilder + private var trainingFooter: some View { + if self.shouldShowTrainingStatus || self.isTrainingActive || !self.trainingVariants.isEmpty { + HStack(spacing: self.theme.metrics.spacing.sm) { + if self.trainingHasError { + Label(self.trainingStatusMessage, systemImage: "exclamationmark.triangle.fill") + .font(self.theme.typography.caption) + .foregroundStyle(self.theme.palette.warning) + } else if self.shouldShowTrainingStatus { + Text(self.trainingStatusMessage) + .font(self.theme.typography.caption) + .foregroundStyle(self.theme.palette.secondaryText) + } + + if self.isTrainingActive || !self.trainingVariants.isEmpty || !self.normalizedTrainingReplacement.isEmpty { + Spacer() + + Button("Clear") { + self.resetTraining() + } + .fluidButton(.compact, size: .compact) + .disabled(self.isTrainingRecording || self.isTrainingProcessing) + .opacity(self.isTrainingRecording || self.isTrainingProcessing ? 0.45 : 1) + } else { + Spacer(minLength: 0) + } + } + } + } + + // MARK: - Your Dictionary + + private var yourDictionarySection: some View { + ThemedCard(style: .standard, hoverEffect: false) { + VStack(alignment: .leading, spacing: self.theme.metrics.spacing.lg) { + HStack(alignment: .center, spacing: self.theme.metrics.spacing.md) { + self.settingsIconTile(systemName: "book.closed.fill") VStack(alignment: .leading, spacing: 3) { HStack(spacing: 6) { - Text("Instant Replacement") + Text("Your Dictionary") .font(self.theme.typography.sectionTitle) if !self.entries.isEmpty { Text("(\(self.entries.count))") @@ -147,28 +528,19 @@ struct CustomDictionaryView: View { .foregroundStyle(self.theme.palette.tertiaryText) } } - Text("Replace phrases that are consistently transcribed incorrectly.") + Text("Words and phrases FluidVoice will correct automatically.") .font(self.theme.typography.caption) .foregroundStyle(self.theme.palette.secondaryText) } - - Spacer() - - Button { - self.showAddSheet = true - } label: { - Label("Add Replacement", systemImage: "plus") - } - .fluidButton(.accent, size: .small) } if self.entries.isEmpty { self.dictionaryEmptyState( title: "No replacements yet", - detail: "Add a phrase and the text it should become." - ) { - self.showAddSheet = true - } + detail: "Use Train Replacement or Manual Add above to create your first one." + ) + .frame(maxWidth: 760) + .frame(maxWidth: .infinity, alignment: .center) } else { self.entriesListView } @@ -187,6 +559,8 @@ struct CustomDictionaryView: View { ) } } + .frame(maxWidth: 760) + .frame(maxWidth: .infinity, alignment: .center) } // MARK: - Custom Words @@ -301,7 +675,7 @@ struct CustomDictionaryView: View { private func dictionaryEmptyState( title: String, detail: String, - action: @escaping () -> Void + action: (() -> Void)? = nil ) -> some View { HStack(spacing: self.theme.metrics.spacing.sm) { Image(systemName: "plus.circle") @@ -316,10 +690,12 @@ struct CustomDictionaryView: View { .foregroundStyle(self.theme.palette.secondaryText) } - Spacer() + if let action { + Spacer() - Button("Add", action: action) - .fluidButton(.compact, size: .compact) + Button("Add", action: action) + .fluidButton(.compact, size: .compact) + } } .padding(self.theme.metrics.spacing.md) .background( @@ -341,6 +717,160 @@ struct CustomDictionaryView: View { NotificationCenter.default.post(name: .parakeetVocabularyDidChange, object: nil) } + private func addReplacementEntry(_ entry: SettingsStore.CustomDictionaryEntry) { + self.entries.insert(entry, at: 0) + self.saveEntries() + self.showReplacementConfirmation( + title: "Replacement added", + detail: "It is at the top of the list." + ) + } + + private func selectComposerMode(_ mode: DictionaryComposerMode) { + guard !self.isTrainingRecording, !self.isTrainingProcessing else { return } + self.composerMode = mode + } + + private func addManualReplacementIfValid() { + guard self.canAddManualReplacement else { return } + let entry = SettingsStore.CustomDictionaryEntry( + triggers: self.manualTriggers, + replacement: self.manualReplacement.trimmingCharacters(in: .whitespacesAndNewlines) + ) + self.addReplacementEntry(entry) + self.manualTriggersText = "" + self.manualReplacement = "" + } + + private func beginTrainingReplacement() { + guard self.canStartTraining else { return } + self.isTrainingActive = true + self.trainingHasError = false + self.trainingStatusMessage = "" + } + + private func startTrainingSample() async { + guard self.canRecordTrainingSample else { return } + self.isTrainingActive = true + self.trainingHasError = false + self.trainingStatusMessage = "" + self.isTrainingRecording = true + + await self.asr.start(forDictionaryTraining: true) + if !self.asr.isRunning { + self.isTrainingRecording = false + self.trainingHasError = true + self.trainingStatusMessage = "Couldn't start recording. Check microphone access and try again." + } + } + + private func stopTrainingSample() async { + guard self.isTrainingRecording else { return } + self.isTrainingRecording = false + self.isTrainingProcessing = true + self.trainingHasError = false + self.trainingStatusMessage = "" + + let transcript = await self.asr.stop(forDictionaryTraining: true) + self.isTrainingProcessing = false + self.addTrainingVariant(from: transcript) + } + + private func addTrainingVariant(from transcript: String) { + guard let detected = CustomDictionaryTrainingMerge.normalizedTrigger(transcript) else { + self.trainingHasError = true + self.trainingStatusMessage = "Nothing heard. Try again." + return + } + + if detected.caseInsensitiveCompare(self.normalizedTrainingReplacement) == .orderedSame { + self.trainingHasError = false + self.trainingStatusMessage = "That sounded right. Try another if needed." + return + } + + if self.trainingVariants.contains(where: { $0.caseInsensitiveCompare(detected) == .orderedSame }) { + self.trainingHasError = false + self.trainingStatusMessage = "Already got that one." + return + } + + guard self.trainingVariants.count < CustomDictionaryTrainingMerge.maxSamples else { + self.trainingHasError = false + self.trainingStatusMessage = "You have enough. Add when ready." + return + } + + self.trainingVariants.append(detected) + self.trainingHasError = false + if self.trainingVariants.count >= CustomDictionaryTrainingMerge.maxSamples { + self.trainingStatusMessage = "You have enough. Add when ready." + } else if self.trainingVariants.count >= CustomDictionaryTrainingMerge.recommendedSamples { + self.trainingStatusMessage = "You can add it now." + } else { + self.trainingStatusMessage = "Got it." + } + } + + private func addTrainedReplacement() { + guard self.canAddTrainedReplacement else { return } + let replacementText = self.normalizedTrainingReplacement + let updatesExisting = self.entries.contains { + $0.replacement.caseInsensitiveCompare(replacementText) == .orderedSame + } + self.entries = CustomDictionaryTrainingMerge.mergedEntries( + current: self.entries, + replacement: replacementText, + triggers: self.trainingVariants + ) + self.saveEntries() + self.resetTraining() + self.showReplacementConfirmation( + title: updatesExisting ? "Replacement updated" : "Recorded", + detail: updatesExisting ? "Your variants are ready." : "Replacement added at the top." + ) + } + + private func resetTraining(statusMessage: String = "Type the correct text.") { + self.trainingReplacement = "" + self.trainingVariants = [] + self.trainingStatusMessage = statusMessage + self.trainingHasError = false + self.isTrainingActive = false + self.isTrainingRecording = false + self.isTrainingProcessing = false + } + + private func handleTrainingReplacementChange(oldValue: String, newValue: String) { + let oldKey = CustomDictionaryTrainingMerge.normalizedReplacement(oldValue).lowercased() + let newKey = CustomDictionaryTrainingMerge.normalizedReplacement(newValue).lowercased() + guard oldKey != newKey else { return } + + if !self.trainingVariants.isEmpty { + self.trainingVariants.removeAll() + } + self.isTrainingActive = false + self.trainingStatusMessage = newKey.isEmpty ? "Type the correct text." : "" + self.trainingHasError = false + } + + private func showReplacementConfirmation(title: String, detail: String) { + let confirmation = ReplacementConfirmation(title: title, detail: detail) + NSHapticFeedbackManager.defaultPerformer.perform(.levelChange, performanceTime: .now) + + withAnimation(self.reduceMotion ? nil : .spring(response: 0.26, dampingFraction: 0.78)) { + self.replacementConfirmation = confirmation + } + + Task { @MainActor in + try? await Task.sleep(nanoseconds: 1_650_000_000) + guard self.replacementConfirmation?.id == confirmation.id else { return } + withAnimation(self.reduceMotion ? nil : .easeOut(duration: 0.16)) { + self.replacementConfirmation = nil + } + } + } + private func loadBoostTerms() { do { self.boostTerms = try ParakeetVocabularyStore.shared.loadUserBoostTerms() @@ -489,6 +1019,346 @@ private struct EditableBoostTerm: Identifiable { let term: ParakeetVocabularyStore.VocabularyConfig.Term } +private enum DictionaryComposerMode: CaseIterable, Identifiable { + case train + case manual + + var id: Self { self } + + var title: String { + switch self { + case .train: + return "Train by Voice" + case .manual: + return "Add Manually" + } + } + + var systemImage: String { + switch self { + case .train: + return "mic.fill" + case .manual: + return "keyboard" + } + } + + var detail: String { + switch self { + case .train: + return "Say it a few times so FluidVoice can catch the versions it hears." + case .manual: + return "Type the misheard text and the spelling you want." + } + } +} + +private struct DictionaryComposerModeTab: View { + let mode: DictionaryComposerMode + let isSelected: Bool + let isDisabled: Bool + let action: () -> Void + + @Environment(\.theme) private var theme + @Environment(\.accessibilityReduceMotion) private var reduceMotion + @State private var isHovered = false + + var body: some View { + Button(action: self.action) { + HStack(spacing: self.theme.metrics.spacing.sm) { + Image(systemName: self.mode.systemImage) + .font(.system(size: 12, weight: .semibold)) + Text(self.mode.title) + .font(self.theme.typography.bodySmallStrong) + } + .foregroundStyle(self.foreground) + .frame(maxWidth: .infinity) + .frame(minHeight: 30) + .padding(.horizontal, self.theme.metrics.spacing.md) + .background(self.background) + .contentShape(RoundedRectangle(cornerRadius: self.theme.metrics.corners.sm, style: .continuous)) + } + .buttonStyle(.plain) + .disabled(self.isDisabled) + .opacity(self.isDisabled ? 0.55 : 1) + .onHover { hovering in + guard !self.reduceMotion else { + self.isHovered = hovering + return + } + withAnimation(.easeOut(duration: 0.14)) { + self.isHovered = hovering + } + } + .accessibilityAddTraits(self.isSelected ? .isSelected : []) + } + + private var foreground: Color { + self.isSelected ? Color.white : self.theme.palette.primaryText + } + + private var background: some View { + RoundedRectangle(cornerRadius: self.theme.metrics.corners.sm, style: .continuous) + .fill( + self.isSelected + ? self.theme.palette.accent + : (self.isHovered ? self.theme.palette.cardBackground.opacity(0.6) : Color.clear) + ) + } +} + +private enum CustomDictionaryManualEntry { + static func parseTriggers(_ text: String) -> [String] { + text + .split(separator: ",") + .map { $0.trimmingCharacters(in: .whitespacesAndNewlines).lowercased() } + .filter { !$0.isEmpty } + } +} + +enum CustomDictionaryTrainingMerge { + static let recommendedSamples = 5 + static let maxSamples = 10 + + private static let edgePunctuation = CharacterSet(charactersIn: ".,!?;:\"'β€œβ€β€˜β€™") + + static func normalizedReplacement(_ value: String) -> String { + value.trimmingCharacters(in: .whitespacesAndNewlines) + } + + static func normalizedTrigger(_ value: String) -> String? { + let edgeCharacters = CharacterSet.whitespacesAndNewlines.union(self.edgePunctuation) + let trimmed = value.trimmingCharacters(in: edgeCharacters).lowercased() + return trimmed.isEmpty ? nil : trimmed + } + + static func normalizedTriggers(from values: [String], intendedReplacement: String) -> [String] { + let replacement = self.normalizedReplacement(intendedReplacement) + var seen: Set = [] + var result: [String] = [] + result.reserveCapacity(values.count) + + for value in values { + guard let trigger = self.normalizedTrigger(value), + trigger.caseInsensitiveCompare(replacement) != .orderedSame, + !seen.contains(trigger) + else { + continue + } + seen.insert(trigger) + result.append(trigger) + if result.count >= self.maxSamples { + break + } + } + + return result + } + + static func mergedEntries( + current entries: [SettingsStore.CustomDictionaryEntry], + replacement: String, + triggers: [String] + ) -> [SettingsStore.CustomDictionaryEntry] { + let replacementText = self.normalizedReplacement(replacement) + let incomingTriggers = self.normalizedTriggers(from: triggers, intendedReplacement: replacementText) + guard !replacementText.isEmpty, !incomingTriggers.isEmpty else { return entries } + + let matchingIndex = entries.firstIndex { + $0.replacement.caseInsensitiveCompare(replacementText) == .orderedSame + } + let replacementID = matchingIndex.map { entries[$0].id } + let storedReplacementText = matchingIndex.map { entries[$0].replacement } ?? replacementText + let matchingEntries = entries.filter { + $0.replacement.caseInsensitiveCompare(storedReplacementText) == .orderedSame + } + let existingTriggers = matchingEntries.flatMap(\.triggers) + let combinedTriggers = self.normalizedTriggers( + from: existingTriggers + incomingTriggers, + intendedReplacement: storedReplacementText + ) + let triggerKeys = Set(combinedTriggers) + + let mergedEntry = replacementID.map { + SettingsStore.CustomDictionaryEntry( + id: $0, + triggers: combinedTriggers, + replacement: storedReplacementText + ) + } ?? SettingsStore.CustomDictionaryEntry( + triggers: combinedTriggers, + replacement: storedReplacementText + ) + + var didInsertMergedEntry = false + var updatedEntries: [SettingsStore.CustomDictionaryEntry] = [] + updatedEntries.reserveCapacity(entries.count + (matchingIndex == nil ? 1 : 0)) + + for entry in entries { + if entry.replacement.caseInsensitiveCompare(storedReplacementText) == .orderedSame { + if !didInsertMergedEntry { + updatedEntries.append(mergedEntry) + didInsertMergedEntry = true + } + continue + } + + let remainingTriggers = entry.triggers.filter { trigger in + guard let key = self.normalizedTrigger(trigger) else { return false } + return !triggerKeys.contains(key) + } + guard !remainingTriggers.isEmpty else { continue } + updatedEntries.append( + SettingsStore.CustomDictionaryEntry( + id: entry.id, + triggers: remainingTriggers, + replacement: entry.replacement + ) + ) + } + + if !didInsertMergedEntry { + updatedEntries.insert(mergedEntry, at: 0) + } + + return updatedEntries + } +} + +private struct ReplacementConfirmation: Identifiable, Equatable { + let id = UUID() + let title: String + let detail: String +} + +private struct ReplacementConfirmationToast: View { + let confirmation: ReplacementConfirmation + + @Environment(\.theme) private var theme + + var body: some View { + VStack(spacing: self.theme.metrics.spacing.sm) { + ZStack { + Circle() + .fill(self.theme.palette.accent.opacity(0.14)) + .frame(width: 58, height: 58) + + Circle() + .stroke(self.theme.palette.accent.opacity(0.24), lineWidth: 1) + .frame(width: 58, height: 58) + + Image(systemName: "checkmark") + .font(.system(size: 25, weight: .bold)) + .foregroundStyle(self.theme.palette.accent) + } + + VStack(spacing: 3) { + Text(self.confirmation.title) + .font(self.theme.typography.sectionTitle) + .foregroundStyle(self.theme.palette.primaryText) + Text(self.confirmation.detail) + .font(self.theme.typography.caption) + .foregroundStyle(self.theme.palette.secondaryText) + .multilineTextAlignment(.center) + } + } + .frame(minWidth: 220) + .padding(.horizontal, self.theme.metrics.spacing.xl) + .padding(.vertical, self.theme.metrics.spacing.lg) + .background( + RoundedRectangle(cornerRadius: self.theme.metrics.corners.lg, style: .continuous) + .fill(self.theme.palette.cardBackground.opacity(0.96)) + .overlay( + RoundedRectangle(cornerRadius: self.theme.metrics.corners.lg, style: .continuous) + .stroke(self.theme.palette.accent.opacity(0.3), lineWidth: 1) + ) + .shadow( + color: self.theme.palette.accent.opacity(0.24), + radius: 24, + x: 0, + y: 10 + ) + .shadow( + color: Color.black.opacity(0.16), + radius: 18, + x: 0, + y: 8 + ) + ) + .accessibilityElement(children: .combine) + } +} + +private struct TrainingVariantChip: View { + let variant: String + let onDelete: () -> Void + + @Environment(\.theme) private var theme + + var body: some View { + HStack(spacing: 4) { + Text(self.variant) + .font(self.theme.typography.caption) + + Button(action: self.onDelete) { + Image(systemName: "xmark.circle.fill") + .font(.system(size: 11, weight: .semibold)) + .foregroundStyle(self.theme.palette.tertiaryText) + } + .buttonStyle(.plain) + .help("Remove \(self.variant)") + } + .padding(.horizontal, 7) + .padding(.vertical, 4) + .background( + RoundedRectangle(cornerRadius: 5, style: .continuous) + .fill(self.theme.palette.cardBackground.opacity(0.85)) + .overlay( + RoundedRectangle(cornerRadius: 5, style: .continuous) + .stroke(self.theme.palette.cardBorder.opacity(0.35), lineWidth: 1) + ) + ) + } +} + +private struct DictionaryPreviewChip: View { + let text: String + + @Environment(\.theme) private var theme + + var body: some View { + Text(self.text) + .font(self.theme.typography.caption) + .padding(.horizontal, 7) + .padding(.vertical, 4) + .background( + RoundedRectangle(cornerRadius: 5, style: .continuous) + .fill(self.theme.palette.cardBackground.opacity(0.85)) + .overlay( + RoundedRectangle(cornerRadius: 5, style: .continuous) + .stroke(self.theme.palette.cardBorder.opacity(0.35), lineWidth: 1) + ) + ) + } +} + +private struct TrainingProgressDots: View { + let count: Int + + @Environment(\.theme) private var theme + + var body: some View { + HStack(spacing: 4) { + ForEach(0.. Date: Tue, 30 Jun 2026 09:47:19 -0700 Subject: [PATCH 02/10] speed up dictation capture start --- Sources/Fluid/ContentView.swift | 76 +++--- Sources/Fluid/Services/ASRService.swift | 230 ++++++++++++++---- .../Fluid/Services/DictationStartProbe.swift | 117 +++++++++ .../Fluid/Services/GlobalHotkeyManager.swift | 3 + 4 files changed, 346 insertions(+), 80 deletions(-) create mode 100644 Sources/Fluid/Services/DictationStartProbe.swift diff --git a/Sources/Fluid/ContentView.swift b/Sources/Fluid/ContentView.swift index 19878293..a178c6d9 100644 --- a/Sources/Fluid/ContentView.swift +++ b/Sources/Fluid/ContentView.swift @@ -2949,12 +2949,14 @@ struct ContentView: View { self.menuBarManager.showRecordingOverlayImmediately() } - if !self.isRecordingForCommand, !self.isRecordingForRewrite { - TranscriptionSoundPlayer.shared.playStartSound() - } + let shouldPlayStartSound = !self.isRecordingForCommand && !self.isRecordingForRewrite Task { - await self.asr.start() + await self.asr.start(onCaptureStarted: { + if shouldPlayStartSound { + TranscriptionSoundPlayer.shared.playStartSound() + } + }) if !self.asr.isRunning { self.menuBarManager.hideRecordingOverlayImmediately(reason: "asr_start_failed") } @@ -3167,7 +3169,7 @@ struct ContentView: View { "ContentView: selected model for dictate hotkey=\(SettingsStore.shared.selectedSpeechModel.displayName)", source: "ContentView" ) - self.beginDictationRecording(for: .primary, mode: .dictate) + await self.beginDictationRecording(for: .primary, mode: .dictate) }, stopAndProcessCallback: { let route = self.currentDictationOutputRouteForHotkeyStop() @@ -3176,11 +3178,11 @@ struct ContentView: View { }, promptModeCallback: { DebugLogger.shared.info("Prompt mode triggered", source: "ContentView") - self.beginDictationRecording(for: .secondary, mode: .promptMode) + await self.beginDictationRecording(for: .secondary, mode: .promptMode) }, promptSelectionCallback: { selection in DebugLogger.shared.info("Prompt selection shortcut triggered", source: "ContentView") - self.beginDictationRecording(for: selection, mode: .promptMode) + await self.beginDictationRecording(for: selection, mode: .promptMode) }, commandModeCallback: { DebugLogger.shared.info("Command mode triggered", source: "ContentView") @@ -3199,9 +3201,10 @@ struct ContentView: View { "Starting voice recording for command", source: "ContentView" ) - TranscriptionSoundPlayer.shared.playStartSound() Task { - await self.asr.start() + await self.asr.start(onCaptureStarted: { + TranscriptionSoundPlayer.shared.playStartSound() + }) } }, rewriteModeCallback: { @@ -3234,9 +3237,10 @@ struct ContentView: View { // Start recording immediately for the edit instruction DebugLogger.shared.info("Starting voice recording for edit mode", source: "ContentView") - TranscriptionSoundPlayer.shared.playStartSound() Task { - await self.asr.start() + await self.asr.start(onCaptureStarted: { + TranscriptionSoundPlayer.shared.playStartSound() + }) } }, isDictateRecordingProvider: { @@ -3532,7 +3536,7 @@ extension ContentView { } } - private func beginDictationRecording(for slot: SettingsStore.DictationShortcutSlot, mode: ActiveRecordingMode) { + private func beginDictationRecording(for slot: SettingsStore.DictationShortcutSlot, mode: ActiveRecordingMode) async { DebugLogger.shared.debug("Begin dictation recording for slot \(slot.rawValue)", source: "ContentView") self.appBench("begin_recording slot=\(slot.rawValue) mode=\(mode.rawValue)") if self.isOnboardingVoicePlaygroundStepActive { @@ -3542,30 +3546,30 @@ extension ContentView { self.settings.playgroundUsed = false self.playgroundUsed = false } - self.captureRecordingContext() - self.applyDictationPromptConfiguration(for: SettingsStore.shared.dictationPromptSelection(for: slot)) + self.appBench("pre_asr_state_start") self.applyDictationShortcutSelectionContext(for: slot) self.setActiveRecordingMode(mode) self.rewriteModeService.clearState() - self.appBench("overlay_mode_request mode=Dictation") - self.menuBarManager.setOverlayMode(.dictation) - self.menuBarManager.showRecordingOverlayImmediately() - self.appBench("overlay_mode_requested mode=Dictation") - self.prewarmPrivateAIDictationIfNeeded(for: slot) + self.appBench("pre_asr_state_end") - guard !self.asr.isRunning else { + let wasAlreadyRunning = self.asr.isRunning + if wasAlreadyRunning { self.appBench("asr_start_skipped reason=already_running") - return - } - if SettingsStore.shared.enableTranscriptionSounds { - TranscriptionSoundPlayer.shared.playStartSound() } - Task { + + if !wasAlreadyRunning { let asrStartStartedAt = ProcessInfo.processInfo.systemUptime DebugLogger.shared.benchmark("APP_BENCH", message: "asr_start_call", source: "AppBenchmark") - await self.asr.start() + await self.asr.start(onCaptureStarted: { + if SettingsStore.shared.enableTranscriptionSounds { + self.appBench("start_sound_start") + TranscriptionSoundPlayer.shared.playStartSound() + self.appBench("start_sound_end") + } + }) if !self.asr.isRunning { - self.menuBarManager.hideRecordingOverlayImmediately(reason: "asr_start_failed") + self.appBench("asr_start_failed") + return } DebugLogger.shared.benchmark( "APP_BENCH", @@ -3573,13 +3577,27 @@ extension ContentView { source: "AppBenchmark" ) } + + self.appBench("capture_context_start") + self.captureRecordingContext() + self.appBench("capture_context_end") + self.appBench("prompt_config_start") + self.applyDictationPromptConfiguration(for: SettingsStore.shared.dictationPromptSelection(for: slot)) + self.appBench("prompt_config_end") + self.appBench("overlay_mode_request mode=Dictation") + self.menuBarManager.setOverlayMode(.dictation) + self.menuBarManager.showRecordingOverlayImmediately() + self.appBench("overlay_mode_requested mode=Dictation") + self.appBench("prewarm_private_ai_start") + self.prewarmPrivateAIDictationIfNeeded(for: slot) + self.appBench("prewarm_private_ai_end") } - private func beginDictationRecording(for selection: SettingsStore.DictationPromptSelection, mode: ActiveRecordingMode) { + private func beginDictationRecording(for selection: SettingsStore.DictationPromptSelection, mode: ActiveRecordingMode) async { let settings = SettingsStore.shared settings.setDictationPromptSelection(selection, for: .secondary) self.applyDictationPromptConfiguration(for: selection) - self.beginDictationRecording(for: .secondary, mode: mode) + await self.beginDictationRecording(for: .secondary, mode: mode) } private func applyDictationPromptConfiguration(for selection: SettingsStore.DictationPromptSelection) { diff --git a/Sources/Fluid/Services/ASRService.swift b/Sources/Fluid/Services/ASRService.swift index f8a244fa..3d94624a 100644 --- a/Sources/Fluid/Services/ASRService.swift +++ b/Sources/Fluid/Services/ASRService.swift @@ -102,6 +102,18 @@ final class ASRService: ObservableObject { private var hasPendingParakeetVocabularyReload: Bool = false private var vocabularyChangeObserver: NSObjectProtocol? + private struct CaptureEngineConfiguration: Equatable { + let syncAudioDevicesWithSystem: Bool + let preferredInputDeviceUID: String? + let preferredOutputDeviceUID: String? + } + + private let fastRestartWarmEngineEnabled = true + private let fastRestartWarmEngineHoldNanoseconds: UInt64 = 8_000_000_000 + private var fastRestartWarmEngineShutdownTask: Task? + private var isEngineWarmForFastRestart = false + private var warmCaptureEngineConfiguration: CaptureEngineConfiguration? + // MARK: - Error Handling @Published var errorTitle: String = "Error" @@ -475,6 +487,9 @@ final class ASRService: ObservableObject { func resetTranscriptionProvider() { let newModel = SettingsStore.shared.selectedSpeechModel DebugLogger.shared.info("ASRService: Switching to '\(newModel.displayName)', resetting provider state...", source: "ASRService") + if !self.isRunning, self.isEngineWarmForFastRestart { + self.tearDownCaptureEngine(reason: "provider reset while warm", releaseAsync: true) + } self.isAsrReady = false self.modelsExistOnDisk = false @@ -650,6 +665,7 @@ final class ASRService: ObservableObject { } deinit { + self.fastRestartWarmEngineShutdownTask?.cancel() if let observer = self.vocabularyChangeObserver { NotificationCenter.default.removeObserver(observer) } @@ -840,7 +856,10 @@ final class ASRService: ObservableObject { /// ## Errors /// If audio session configuration fails, the method will silently fail /// and `isRunning` will remain `false`. Check the debug logs for details. - func start(forDictionaryTraining: Bool = false) async { + func start( + forDictionaryTraining: Bool = false, + onCaptureStarted: (@MainActor () -> Void)? = nil + ) async { DebugLogger.shared.info("🎀 START() called - beginning recording session", source: "ASRService") guard self.micStatus == .authorized else { @@ -872,10 +891,12 @@ final class ASRService: ObservableObject { self.benchmarkStreamingChunkIndex = 0 self.benchmarkCompletedStreamingChunks = 0 self.benchmarkLastChunkSampleCount = 0 + DictationStartProbe.shared.markASRStart(session: self.benchmarkSessionID) self.streamingChunkAnalyticsSuccessCount = 0 self.lastStreamingChunkFailureAnalyticsAt = nil (self.transcriptionProvider as? FluidAudioProvider)?.resetStreamingPreviewCache() self.audioCapturePipeline.setRecordingEnabled(true) + DictationStartProbe.shared.markCaptureEnabled(session: self.benchmarkSessionID) self.refreshWordBoostStatus() let dims = self.currentTranscriptionAnalyticsDimensions() self.benchmarkLog("recording_start model=\(dims.model) provider=\(dims.provider) supportsStreaming=\(SettingsStore.shared.selectedSpeechModel.supportsStreaming)") @@ -886,17 +907,20 @@ final class ASRService: ObservableObject { self.isDictionaryTrainingCaptureActive = false do { - DebugLogger.shared.debug("βš™οΈ Calling configureSession()...", source: "ASRService") - try self.configureSession() - DebugLogger.shared.debug("βœ… configureSession() completed", source: "ASRService") + let reusedWarmEngine = self.reuseWarmCaptureEngineIfAvailable() + if reusedWarmEngine { + DebugLogger.shared.debug("βœ… Warm capture engine reused", source: "ASRService") + } else { + DebugLogger.shared.debug("βš™οΈ Calling configureSession()...", source: "ASRService") + try self.configureSession() + DebugLogger.shared.debug("βœ… configureSession() completed", source: "ASRService") - DebugLogger.shared.debug("πŸš€ Calling startEngine()...", source: "ASRService") - try self.startEngine() - DebugLogger.shared.debug("βœ… startEngine() completed", source: "ASRService") + DebugLogger.shared.debug("πŸš€ Calling startEngine()...", source: "ASRService") + try self.startEngine() + DebugLogger.shared.debug("βœ… startEngine() completed", source: "ASRService") + } - DebugLogger.shared.debug("🎧 Setting up engine tap...", source: "ASRService") - try self.setupEngineTap() - DebugLogger.shared.debug("βœ… Engine tap setup complete", source: "ASRService") + onCaptureStarted?() // Pause system media AFTER successful audio setup but BEFORE setting isRunning // This ensures we only pause media when we know recording will succeed @@ -934,6 +958,7 @@ final class ASRService: ObservableObject { DebugLogger.shared.info("βœ… START() completed successfully", source: "ASRService") } catch { self.isDictionaryTrainingCaptureActive = false + self.tearDownCaptureEngine(reason: "start failure", releaseAsync: true) DebugLogger.shared.error("Failed to start ASR session: \(error)", source: "ASRService") // Resume media if we paused it before the failure @@ -1049,27 +1074,19 @@ final class ASRService: ObservableObject { self.stopMonitoringDevice() DebugLogger.shared.debug("βœ… Device monitoring stopped", source: "ASRService") - // Stop the audio engine to stop new audio from coming in - DebugLogger.shared.debug("🎧 Removing engine tap...", source: "ASRService") - self.removeEngineTap() - DebugLogger.shared.debug("βœ… Engine tap removed", source: "ASRService") - - DebugLogger.shared.debug("πŸ›‘ Calling engine.stop()...", source: "ASRService") - self.engine.stop() - DebugLogger.shared.debug("βœ… Engine stopped", source: "ASRService") + if useDictionaryTrainingPath || !self.fastRestartWarmEngineEnabled { + DebugLogger.shared.debug("πŸ›‘ Tearing down capture engine...", source: "ASRService") + self.tearDownCaptureEngine(reason: "recording stopped") + DebugLogger.shared.debug("βœ… Capture engine torn down", source: "ASRService") + } else { + self.scheduleFastRestartWarmEngineShutdown(reason: "recording stopped") + } // Capture has fully ended β€” invoke the callback so callers can play a // stop cue or release capture-dependent UI without waiting on the // (potentially slow) final transcription pass. await MainActor.run { onCaptureStopped?() } - // Recreate the engine instance instead of calling reset() to prevent format corruption - // VoiceInk approach: tearing down and rebuilding ensures fresh, valid audio format on restart - DebugLogger.shared.debug("πŸ—‘οΈ Deallocating old engine and creating fresh instance...", source: "ASRService") - self.engineStorage = nil // Explicitly release old engine - // New engine will be lazily created on next access via computed property - DebugLogger.shared.debug("βœ… Engine instance recreated", source: "ASRService") - // CRITICAL FIX: Await completion of streaming task AND any pending transcriptions // This prevents use-after-free crashes (EXC_BAD_ACCESS) when clearing buffer DebugLogger.shared.debug("⏳ Awaiting stopStreamingTimerAndAwait()...", source: "ASRService") @@ -1362,20 +1379,10 @@ final class ASRService: ObservableObject { // Stop monitoring device self.stopMonitoringDevice() - self.removeEngineTap() - DebugLogger.shared.debug("Engine tap removed", source: "ASRService") - - self.engine.stop() - DebugLogger.shared.debug("Engine stopped", source: "ASRService") - // Release old engine on a background thread β€” if the underlying device just died, // AVAudioEngine deallocation can block in CoreAudio's internal teardown. // No new engine is created here (it's lazy on next start()), so no overlap risk. - let oldEngine = self.engineStorage - self.engineStorage = nil - if let oldEngine { - DispatchQueue.global(qos: .utility).async { _ = oldEngine } - } + self.tearDownCaptureEngine(reason: "stop without transcription", releaseAsync: true) // CRITICAL FIX: Await completion of streaming task AND any pending transcriptions // This prevents use-after-free crashes (EXC_BAD_ACCESS) when clearing buffer @@ -1745,6 +1752,8 @@ final class ASRService: ObservableObject { var lastError: Error? while attempts < 3 { + var installedTapThisAttempt = false + var startedEngineThisAttempt = false do { // CRITICAL: Bind devices BEFORE prepare() - must be set before AudioUnit initialization // Note: This may fail for aggregate devices (Bluetooth, etc.) with OSStatus -10851 @@ -1781,12 +1790,39 @@ final class ASRService: ObservableObject { source: "ASRService" ) + DebugLogger.shared.debug("🎧 Setting up engine tap before engine.start()...", source: "ASRService") + do { + try self.setupEngineTap() + installedTapThisAttempt = true + DebugLogger.shared.debug("βœ… Pre-start engine tap setup complete", source: "ASRService") + } catch { + DebugLogger.shared.warning( + "⚠️ Pre-start tap setup failed; falling back to post-start tap setup: \(error.localizedDescription)", + source: "ASRService" + ) + } + try self.engine.start() + startedEngineThisAttempt = true DebugLogger.shared.info("AVAudioEngine started successfully on attempt \(attempts + 1)", source: "ASRService") + + if installedTapThisAttempt == false { + DebugLogger.shared.debug("🎧 Setting up engine tap after engine.start() fallback...", source: "ASRService") + try self.setupEngineTap() + installedTapThisAttempt = true + DebugLogger.shared.debug("βœ… Post-start engine tap setup complete", source: "ASRService") + } + return } catch { lastError = error attempts += 1 + if installedTapThisAttempt { + self.removeEngineTap() + } + if startedEngineThisAttempt || self.engine.isRunning { + self.engine.stop() + } // Log the actual error from AVFoundation DebugLogger.shared.error( @@ -1826,7 +1862,100 @@ final class ASRService: ObservableObject { } private func removeEngineTap() { - self.engine.inputNode.removeTap(onBus: 0) + guard let engine = self.engineStorage as? AVAudioEngine else { return } + engine.inputNode.removeTap(onBus: 0) + } + + private func cancelFastRestartWarmEngineShutdown() { + self.fastRestartWarmEngineShutdownTask?.cancel() + self.fastRestartWarmEngineShutdownTask = nil + } + + private func currentCaptureEngineConfiguration() -> CaptureEngineConfiguration { + CaptureEngineConfiguration( + syncAudioDevicesWithSystem: SettingsStore.shared.syncAudioDevicesWithSystem, + preferredInputDeviceUID: SettingsStore.shared.preferredInputDeviceUID, + preferredOutputDeviceUID: SettingsStore.shared.preferredOutputDeviceUID + ) + } + + private func tearDownCaptureEngine(reason: String, releaseAsync: Bool = false) { + self.cancelFastRestartWarmEngineShutdown() + self.isEngineWarmForFastRestart = false + self.warmCaptureEngineConfiguration = nil + self.audioCapturePipeline.setRecordingEnabled(false) + self.removeEngineTap() + + let engineToStop = self.engineStorage as? AVAudioEngine + if engineToStop?.isRunning == true { + engineToStop?.stop() + } + + let oldEngine = self.engineStorage + self.engineStorage = nil + DebugLogger.shared.debug("Capture engine torn down: \(reason)", source: "ASRService") + + guard releaseAsync, let oldEngine else { return } + DispatchQueue.global(qos: .utility).async { _ = oldEngine } + } + + private func scheduleFastRestartWarmEngineShutdown(reason: String) { + guard self.fastRestartWarmEngineEnabled, + let currentEngine = self.engineStorage as? AVAudioEngine, + currentEngine.isRunning + else { + self.tearDownCaptureEngine(reason: reason) + return + } + + self.cancelFastRestartWarmEngineShutdown() + self.isEngineWarmForFastRestart = true + self.warmCaptureEngineConfiguration = self.currentCaptureEngineConfiguration() + let delay = self.fastRestartWarmEngineHoldNanoseconds + DebugLogger.shared.info("Keeping capture engine warm for fast restart: \(reason)", source: "ASRService") + + self.fastRestartWarmEngineShutdownTask = Task { @MainActor [weak self, weak currentEngine] in + do { + try await Task.sleep(nanoseconds: delay) + } catch { + return + } + + guard let self, + !self.isRunning, + self.isEngineWarmForFastRestart, + let storedEngine = self.engineStorage as? AVAudioEngine, + storedEngine === currentEngine + else { return } + + self.tearDownCaptureEngine(reason: "fast restart warm window expired") + } + } + + private func reuseWarmCaptureEngineIfAvailable() -> Bool { + guard self.fastRestartWarmEngineEnabled, self.isEngineWarmForFastRestart else { return false } + + guard let warmEngine = self.engineStorage as? AVAudioEngine, + warmEngine.isRunning + else { + self.tearDownCaptureEngine(reason: "warm engine unavailable") + self.audioCapturePipeline.setRecordingEnabled(true) + return false + } + + guard self.warmCaptureEngineConfiguration == self.currentCaptureEngineConfiguration() else { + self.tearDownCaptureEngine(reason: "audio settings changed while warm") + self.audioCapturePipeline.setRecordingEnabled(true) + return false + } + + self.cancelFastRestartWarmEngineShutdown() + self.isEngineWarmForFastRestart = false + self.warmCaptureEngineConfiguration = nil + DictationStartProbe.shared.markTapInstalled(session: self.benchmarkSessionID) + self.benchmarkLog("warm_engine_reuse reused=true") + DebugLogger.shared.info("Reusing warm capture engine for fast restart", source: "ASRService") + return true } private func setupEngineTap() throws { @@ -1887,11 +2016,15 @@ final class ASRService: ObservableObject { input.installTap(onBus: 0, bufferSize: 4096, format: inFormat) { buffer, _ in pipeline.handle(buffer: buffer) } + DictationStartProbe.shared.markTapInstalled(session: self.benchmarkSessionID) DebugLogger.shared.debug("βœ… setupEngineTap() - COMPLETED", source: "ASRService") } private func scheduleAudioRouteRecovery(reason: String) { guard self.isRunning else { + if self.isEngineWarmForFastRestart { + self.tearDownCaptureEngine(reason: "audio route changed while warm: \(reason)", releaseAsync: true) + } self.audioLevelSubject.send(0.0) return } @@ -1931,19 +2064,11 @@ final class ASRService: ObservableObject { self.audioCapturePipeline.setRecordingEnabled(false) self.stopMonitoringDevice() - self.removeEngineTap() - self.engine.stop() - - let oldEngine = self.engineStorage - self.engineStorage = nil - if let oldEngine { - DispatchQueue.global(qos: .utility).async { _ = oldEngine } - } + self.tearDownCaptureEngine(reason: "audio route recovery", releaseAsync: true) do { try self.configureSession() try self.startEngine() - try self.setupEngineTap() self.audioCapturePipeline.setRecordingEnabled(true) if let currentDevice = self.getCurrentlyBoundInputDevice() { @@ -3239,12 +3364,17 @@ private final class AudioCapturePipeline { func setRecordingEnabled(_ enabled: Bool) { self.lock.lock() - defer { self.lock.unlock() } self.recordingEnabled = enabled + let shouldResetLevel = enabled == false if enabled == false { self.levelHistory.removeAll(keepingCapacity: true) self.smoothedLevel = 0.0 } + self.lock.unlock() + + if shouldResetLevel { + self.onLevel(0.0) + } } func handle(buffer: AVAudioPCMBuffer) { @@ -3252,10 +3382,7 @@ private final class AudioCapturePipeline { let enabled = self.recordingEnabled self.lock.unlock() - guard enabled else { - self.onLevel(0.0) - return - } + guard enabled else { return } let mono16k = Self.toMono16k(floatBuffer: buffer) guard mono16k.isEmpty == false else { @@ -3264,6 +3391,7 @@ private final class AudioCapturePipeline { } self.audioBuffer.append(mono16k) + DictationStartProbe.shared.markFirstAudio(sampleCount: mono16k.count) let level = self.calculateAudioLevel(mono16k) self.onLevel(level) } diff --git a/Sources/Fluid/Services/DictationStartProbe.swift b/Sources/Fluid/Services/DictationStartProbe.swift new file mode 100644 index 00000000..3df7b9be --- /dev/null +++ b/Sources/Fluid/Services/DictationStartProbe.swift @@ -0,0 +1,117 @@ +import CoreGraphics +import Foundation + +final class DictationStartProbe: @unchecked Sendable { + static let shared = DictationStartProbe() + + private struct InputEvent { + let kind: String + let uptime: TimeInterval + } + + private let lock = NSLock() + private var lastInputEvent: InputEvent? + private var activeEvent: InputEvent? + private var activeTriggerLabel: String? + private var firstAudioLogged = false + + private init() {} + + func markInputEvent(type: CGEventType, uptime: TimeInterval) { + self.lock.lock() + self.lastInputEvent = InputEvent(kind: Self.eventName(type), uptime: uptime) + self.lock.unlock() + } + + func markStartTrigger(label: String) { + let now = ProcessInfo.processInfo.systemUptime + self.lock.lock() + self.activeEvent = self.lastInputEvent + self.activeTriggerLabel = label + self.firstAudioLogged = false + let event = self.activeEvent + self.lock.unlock() + + let eventDelta = Self.deltaMilliseconds(from: event?.uptime, to: now) + DebugLogger.shared.benchmark( + "START_LATENCY", + message: "trigger label=\(label) event=\(event?.kind ?? "unknown") eventToTriggerMs=\(eventDelta)", + source: "DictationStartProbe" + ) + } + + func markASRStart(session: Int) { + let now = ProcessInfo.processInfo.systemUptime + let snapshot = self.snapshot() + DebugLogger.shared.benchmark( + "START_LATENCY", + message: "asr_start_enter session=\(session) label=\(snapshot.label) eventToASRStartMs=\(Self.deltaMilliseconds(from: snapshot.eventUptime, to: now))", + source: "DictationStartProbe" + ) + } + + func markCaptureEnabled(session: Int) { + let now = ProcessInfo.processInfo.systemUptime + let snapshot = self.snapshot() + DebugLogger.shared.benchmark( + "START_LATENCY", + message: "capture_enabled session=\(session) label=\(snapshot.label) eventToCaptureEnabledMs=\(Self.deltaMilliseconds(from: snapshot.eventUptime, to: now))", + source: "DictationStartProbe" + ) + } + + func markTapInstalled(session: Int) { + let now = ProcessInfo.processInfo.systemUptime + let snapshot = self.snapshot() + DebugLogger.shared.benchmark( + "START_LATENCY", + message: "tap_installed session=\(session) label=\(snapshot.label) eventToTapInstalledMs=\(Self.deltaMilliseconds(from: snapshot.eventUptime, to: now))", + source: "DictationStartProbe" + ) + } + + func markFirstAudio(sampleCount: Int) { + let now = ProcessInfo.processInfo.systemUptime + self.lock.lock() + guard self.firstAudioLogged == false else { + self.lock.unlock() + return + } + self.firstAudioLogged = true + let event = self.activeEvent + let label = self.activeTriggerLabel ?? "unknown" + self.lock.unlock() + + DebugLogger.shared.benchmark( + "START_LATENCY", + message: "first_audio label=\(label) samples=\(sampleCount) event=\(event?.kind ?? "unknown") eventToFirstAudioMs=\(Self.deltaMilliseconds(from: event?.uptime, to: now))", + source: "DictationStartProbe" + ) + } + + private func snapshot() -> (eventUptime: TimeInterval?, label: String) { + self.lock.lock() + defer { self.lock.unlock() } + return (self.activeEvent?.uptime, self.activeTriggerLabel ?? "unknown") + } + + private static func deltaMilliseconds(from start: TimeInterval?, to end: TimeInterval) -> Int { + guard let start else { return -1 } + return Int(((end - start) * 1000).rounded()) + } + + private static func eventName(_ type: CGEventType) -> String { + switch type { + case .keyDown: return "keyDown" + case .keyUp: return "keyUp" + case .flagsChanged: return "flagsChanged" + case .leftMouseDown: return "leftMouseDown" + case .leftMouseUp: return "leftMouseUp" + case .rightMouseDown: return "rightMouseDown" + case .rightMouseUp: return "rightMouseUp" + case .otherMouseDown: return "otherMouseDown" + case .otherMouseUp: return "otherMouseUp" + default: return "event\(type.rawValue)" + } + } +} diff --git a/Sources/Fluid/Services/GlobalHotkeyManager.swift b/Sources/Fluid/Services/GlobalHotkeyManager.swift index 7c9b271d..19bb2257 100644 --- a/Sources/Fluid/Services/GlobalHotkeyManager.swift +++ b/Sources/Fluid/Services/GlobalHotkeyManager.swift @@ -606,6 +606,7 @@ final class GlobalHotkeyManager: NSObject { if let tapRecoveryResult = self.handleTapDisableEvent(type: type, event: event) { return tapRecoveryResult } + DictationStartProbe.shared.markInputEvent(type: type, uptime: TimeInterval(event.timestamp) / 1_000_000_000) if self.isShortcutCaptureActiveProvider?() ?? false { self.resetModifierOnlyShortcutTracking() @@ -1726,6 +1727,7 @@ final class GlobalHotkeyManager: NSObject { } private func triggerDictationMode() { + DictationStartProbe.shared.markStartTrigger(label: "dictationMode") Task { @MainActor [weak self] in guard let self = self else { return } guard self.canTriggerRecordingAction("Dictate mode hotkey") else { return } @@ -1814,6 +1816,7 @@ final class GlobalHotkeyManager: NSObject { } private func startRecordingIfNeeded() { + DictationStartProbe.shared.markStartTrigger(label: "startRecordingIfNeeded") Task { @MainActor [weak self] in guard let self = self else { return } From e82025f30c9c8949fb537f35c42cea26179c486a Mon Sep 17 00:00:00 2001 From: altic-dev Date: Tue, 30 Jun 2026 16:08:12 -0700 Subject: [PATCH 03/10] perf(asr): prewarm capture startup --- Sources/Fluid/ContentView.swift | 30 +- Sources/Fluid/Services/ASRService.swift | 310 +++++++++++++++++- .../Fluid/Services/DictationStartProbe.swift | 34 ++ .../Fluid/Services/GlobalHotkeyManager.swift | 92 +++++- 4 files changed, 452 insertions(+), 14 deletions(-) diff --git a/Sources/Fluid/ContentView.swift b/Sources/Fluid/ContentView.swift index a178c6d9..7f23674c 100644 --- a/Sources/Fluid/ContentView.swift +++ b/Sources/Fluid/ContentView.swift @@ -2954,7 +2954,7 @@ struct ContentView: View { Task { await self.asr.start(onCaptureStarted: { if shouldPlayStartSound { - TranscriptionSoundPlayer.shared.playStartSound() + self.scheduleTranscriptionStartSound() } }) if !self.asr.isRunning { @@ -3203,7 +3203,7 @@ struct ContentView: View { ) Task { await self.asr.start(onCaptureStarted: { - TranscriptionSoundPlayer.shared.playStartSound() + self.scheduleTranscriptionStartSound() }) } }, @@ -3239,7 +3239,7 @@ struct ContentView: View { DebugLogger.shared.info("Starting voice recording for edit mode", source: "ContentView") Task { await self.asr.start(onCaptureStarted: { - TranscriptionSoundPlayer.shared.playStartSound() + self.scheduleTranscriptionStartSound() }) } }, @@ -3561,11 +3561,7 @@ extension ContentView { let asrStartStartedAt = ProcessInfo.processInfo.systemUptime DebugLogger.shared.benchmark("APP_BENCH", message: "asr_start_call", source: "AppBenchmark") await self.asr.start(onCaptureStarted: { - if SettingsStore.shared.enableTranscriptionSounds { - self.appBench("start_sound_start") - TranscriptionSoundPlayer.shared.playStartSound() - self.appBench("start_sound_end") - } + self.scheduleTranscriptionStartSound(logBenchmarks: true) }) if !self.asr.isRunning { self.appBench("asr_start_failed") @@ -3600,6 +3596,24 @@ extension ContentView { await self.beginDictationRecording(for: .secondary, mode: mode) } + private func scheduleTranscriptionStartSound(logBenchmarks: Bool = false) { + guard SettingsStore.shared.enableTranscriptionSounds else { return } + + if logBenchmarks { + self.appBench("start_sound_scheduled") + } + + Task { @MainActor in + if logBenchmarks { + DebugLogger.shared.benchmark("APP_BENCH", message: "start_sound_start", source: "AppBenchmark") + } + TranscriptionSoundPlayer.shared.playStartSound() + if logBenchmarks { + DebugLogger.shared.benchmark("APP_BENCH", message: "start_sound_end", source: "AppBenchmark") + } + } + } + private func applyDictationPromptConfiguration(for selection: SettingsStore.DictationPromptSelection) { let providerID: String let modelName: String diff --git a/Sources/Fluid/Services/ASRService.swift b/Sources/Fluid/Services/ASRService.swift index 3d94624a..39a08f9f 100644 --- a/Sources/Fluid/Services/ASRService.swift +++ b/Sources/Fluid/Services/ASRService.swift @@ -110,8 +110,14 @@ final class ASRService: ObservableObject { private let fastRestartWarmEngineEnabled = true private let fastRestartWarmEngineHoldNanoseconds: UInt64 = 8_000_000_000 + private let fastStartPrewarmHoldNanoseconds: UInt64 = 2_000_000_000 private var fastRestartWarmEngineShutdownTask: Task? + private var fastStartPrewarmTask: Task? + private var fastStartPrewarmShutdownTask: Task? + private var idleCaptureEnginePrepareTask: Task? private var isEngineWarmForFastRestart = false + private var isCaptureEnginePrewarmedForFastStart = false + private var isFastStartPrewarmingCaptureEngine = false private var warmCaptureEngineConfiguration: CaptureEngineConfiguration? // MARK: - Error Handling @@ -666,6 +672,9 @@ final class ASRService: ObservableObject { deinit { self.fastRestartWarmEngineShutdownTask?.cancel() + self.fastStartPrewarmTask?.cancel() + self.fastStartPrewarmShutdownTask?.cancel() + self.idleCaptureEnginePrepareTask?.cancel() if let observer = self.vocabularyChangeObserver { NotificationCenter.default.removeObserver(observer) } @@ -756,6 +765,7 @@ final class ASRService: ObservableObject { // Initialize device list cache self.cacheCurrentDeviceList(AudioDevice.listInputDevices()) + self.scheduleIdleCaptureEnginePreparation(reason: "startup") // Check if models exist on disk and auto-load if present // This is done in a Task to support async model detection (e.g., AppleSpeechAnalyzerProvider) @@ -826,6 +836,9 @@ final class ASRService: ObservableObject { Task { @MainActor in self.micPermissionGranted = granted self.micStatus = granted ? .authorized : .denied + if granted { + self.scheduleIdleCaptureEnginePreparation(reason: "mic permission granted") + } } } } @@ -860,6 +873,7 @@ final class ASRService: ObservableObject { forDictionaryTraining: Bool = false, onCaptureStarted: (@MainActor () -> Void)? = nil ) async { + let startBenchmarkStartedAt = Date().timeIntervalSince1970 DebugLogger.shared.info("🎀 START() called - beginning recording session", source: "ASRService") guard self.micStatus == .authorized else { @@ -878,6 +892,7 @@ final class ASRService: ObservableObject { self.isRecoveringAudioRoute = false DebugLogger.shared.debug("🧹 Clearing buffers and state", source: "ASRService") + let stateResetStartedAt = Date().timeIntervalSince1970 self.finalText.removeAll() self.audioBuffer.clear(keepingCapacity: true) // specific optimization for restart self.partialTranscription.removeAll() @@ -900,6 +915,9 @@ final class ASRService: ObservableObject { self.refreshWordBoostStatus() let dims = self.currentTranscriptionAnalyticsDimensions() self.benchmarkLog("recording_start model=\(dims.model) provider=\(dims.provider) supportsStreaming=\(SettingsStore.shared.selectedSpeechModel.supportsStreaming)") + self.benchmarkLog( + "start_state_reset elapsedMs=\(self.elapsedMilliseconds(since: stateResetStartedAt)) totalMs=\(self.elapsedMilliseconds(since: startBenchmarkStartedAt))" + ) DebugLogger.shared.debug("βœ… Buffers cleared", source: "ASRService") self.isStarting = true @@ -907,23 +925,40 @@ final class ASRService: ObservableObject { self.isDictionaryTrainingCaptureActive = false do { + let warmReuseStartedAt = Date().timeIntervalSince1970 let reusedWarmEngine = self.reuseWarmCaptureEngineIfAvailable() + self.benchmarkLog( + "start_warm_reuse_check reused=\(reusedWarmEngine) elapsedMs=\(self.elapsedMilliseconds(since: warmReuseStartedAt)) totalMs=\(self.elapsedMilliseconds(since: startBenchmarkStartedAt))" + ) if reusedWarmEngine { DebugLogger.shared.debug("βœ… Warm capture engine reused", source: "ASRService") } else { DebugLogger.shared.debug("βš™οΈ Calling configureSession()...", source: "ASRService") + let configureStartedAt = Date().timeIntervalSince1970 try self.configureSession() + self.benchmarkLog( + "start_configure elapsedMs=\(self.elapsedMilliseconds(since: configureStartedAt)) totalMs=\(self.elapsedMilliseconds(since: startBenchmarkStartedAt))" + ) DebugLogger.shared.debug("βœ… configureSession() completed", source: "ASRService") DebugLogger.shared.debug("πŸš€ Calling startEngine()...", source: "ASRService") + let engineStartStartedAt = Date().timeIntervalSince1970 try self.startEngine() + self.benchmarkLog( + "start_engine elapsedMs=\(self.elapsedMilliseconds(since: engineStartStartedAt)) totalMs=\(self.elapsedMilliseconds(since: startBenchmarkStartedAt))" + ) DebugLogger.shared.debug("βœ… startEngine() completed", source: "ASRService") } + let captureCallbackStartedAt = Date().timeIntervalSince1970 onCaptureStarted?() + self.benchmarkLog( + "start_capture_callback elapsedMs=\(self.elapsedMilliseconds(since: captureCallbackStartedAt)) totalMs=\(self.elapsedMilliseconds(since: startBenchmarkStartedAt))" + ) // Pause system media AFTER successful audio setup but BEFORE setting isRunning // This ensures we only pause media when we know recording will succeed + let mediaPauseStartedAt = Date().timeIntervalSince1970 if SettingsStore.shared.pauseMediaDuringTranscription { let didPause = await MediaPlaybackService.shared.pauseIfPlaying() self.didPauseMediaForThisSession = didPause @@ -931,21 +966,32 @@ final class ASRService: ObservableObject { DebugLogger.shared.info("🎡 Paused system media for transcription", source: "ASRService") } } + self.benchmarkLog( + "start_media_pause enabled=\(SettingsStore.shared.pauseMediaDuringTranscription) " + + "didPause=\(self.didPauseMediaForThisSession) " + + "elapsedMs=\(self.elapsedMilliseconds(since: mediaPauseStartedAt)) " + + "totalMs=\(self.elapsedMilliseconds(since: startBenchmarkStartedAt))" + ) self.isRunning = true self.isDictionaryTrainingCaptureActive = forDictionaryTraining DebugLogger.shared.info("βœ… isRunning set to TRUE", source: "ASRService") // Start monitoring the currently bound device for disconnection + let deviceMonitorStartedAt = Date().timeIntervalSince1970 if let currentDevice = getCurrentlyBoundInputDevice() { DebugLogger.shared.debug("πŸ‘€ Starting device monitoring for: \(currentDevice.name)", source: "ASRService") self.startMonitoringDevice(currentDevice.id) } else { DebugLogger.shared.debug("ℹ️ No device to monitor", source: "ASRService") } + self.benchmarkLog( + "start_device_monitor elapsedMs=\(self.elapsedMilliseconds(since: deviceMonitorStartedAt)) totalMs=\(self.elapsedMilliseconds(since: startBenchmarkStartedAt))" + ) // Only start streaming for models that support it (large Whisper models are too slow) let model = SettingsStore.shared.selectedSpeechModel + let streamingStartStartedAt = Date().timeIntervalSince1970 if model.supportsStreaming, !forDictionaryTraining { DebugLogger.shared.debug("πŸ“‘ Starting streaming transcription...", source: "ASRService") self.benchmarkLog("streaming_timer_start intervalMs=\(Int((self.streamingChunkDurationSeconds * 1000).rounded())) minSamples=\(self.minimumStreamingPreviewSamples)") @@ -955,6 +1001,12 @@ final class ASRService: ObservableObject { } else { DebugLogger.shared.debug("⏸️ Skipping streaming - model '\(model.displayName)' does not support real-time chunk processing", source: "ASRService") } + self.benchmarkLog( + "start_streaming_setup elapsedMs=\(self.elapsedMilliseconds(since: streamingStartStartedAt)) totalMs=\(self.elapsedMilliseconds(since: startBenchmarkStartedAt))" + ) + self.benchmarkLog( + "start_done reusedWarmEngine=\(reusedWarmEngine) totalMs=\(self.elapsedMilliseconds(since: startBenchmarkStartedAt))" + ) DebugLogger.shared.info("βœ… START() completed successfully", source: "ASRService") } catch { self.isDictionaryTrainingCaptureActive = false @@ -1408,13 +1460,18 @@ final class ASRService: ObservableObject { } private func configureSession() throws { + let configureStartedAt = Date().timeIntervalSince1970 DebugLogger.shared.debug("πŸ”§ configureSession() - ENTERED", source: "ASRService") + let runningCheckStartedAt = Date().timeIntervalSince1970 if self.engine.isRunning { DebugLogger.shared.debug("⚠️ Engine is running, stopping before configuration", source: "ASRService") self.engine.stop() DebugLogger.shared.debug("βœ… Engine stopped", source: "ASRService") } + self.benchmarkLog( + "start_configure_running_check elapsedMs=\(self.elapsedMilliseconds(since: runningCheckStartedAt))" + ) // No need to call engine.reset() here - we created a fresh engine in stop() // Accessing the engine property will either return the existing fresh engine, @@ -1423,18 +1480,29 @@ final class ASRService: ObservableObject { // Force input node instantiation (ensures the underlying AUHAL AudioUnit exists) DebugLogger.shared.debug("πŸ“ Forcing input node instantiation...", source: "ASRService") + let inputNodeStartedAt = Date().timeIntervalSince1970 _ = self.engine.inputNode + self.benchmarkLog( + "start_configure_input_node elapsedMs=\(self.elapsedMilliseconds(since: inputNodeStartedAt))" + ) DebugLogger.shared.debug("Input node instantiated", source: "ASRService") // Force output node instantiation for output device binding DebugLogger.shared.debug("πŸ“ Forcing output node instantiation...", source: "ASRService") + let outputNodeStartedAt = Date().timeIntervalSince1970 _ = self.engine.outputNode + self.benchmarkLog( + "start_configure_output_node elapsedMs=\(self.elapsedMilliseconds(since: outputNodeStartedAt))" + ) DebugLogger.shared.debug("βœ… Output node instantiated", source: "ASRService") // NOTE: Device binding occurs in startEngine() BEFORE engine.prepare() // Per CoreAudio docs, device must be set before AudioUnit initialization (prepare) // Since sync mode is always ON, binding actually no-ops and uses system defaults + self.benchmarkLog( + "start_configure_done elapsedMs=\(self.elapsedMilliseconds(since: configureStartedAt))" + ) DebugLogger.shared.debug("βœ… configureSession() - COMPLETED", source: "ASRService") } @@ -1747,6 +1815,7 @@ final class ASRService: ObservableObject { } private func startEngine() throws { + let startEngineStartedAt = Date().timeIntervalSince1970 DebugLogger.shared.debug("πŸš€ startEngine() - ENTERED", source: "ASRService") var attempts = 0 var lastError: Error? @@ -1759,11 +1828,19 @@ final class ASRService: ObservableObject { // Note: This may fail for aggregate devices (Bluetooth, etc.) with OSStatus -10851 // In that case, we fall back to system defaults (same as sync mode) DebugLogger.shared.debug("🎚️ Binding input device (before prepare)...", source: "ASRService") + let inputBindStartedAt = Date().timeIntervalSince1970 let inputBindOk = self.bindPreferredInputDeviceIfNeeded() + self.benchmarkLog( + "start_engine_input_bind attempt=\(attempts + 1) ok=\(inputBindOk) elapsedMs=\(self.elapsedMilliseconds(since: inputBindStartedAt)) totalMs=\(self.elapsedMilliseconds(since: startEngineStartedAt))" + ) DebugLogger.shared.debug("βœ… Input device binding result: \(inputBindOk)", source: "ASRService") DebugLogger.shared.debug("πŸ”Š Binding output device (before prepare)...", source: "ASRService") + let outputBindStartedAt = Date().timeIntervalSince1970 let outputBindOk = self.bindPreferredOutputDeviceIfNeeded() + self.benchmarkLog( + "start_engine_output_bind attempt=\(attempts + 1) ok=\(outputBindOk) elapsedMs=\(self.elapsedMilliseconds(since: outputBindStartedAt)) totalMs=\(self.elapsedMilliseconds(since: startEngineStartedAt))" + ) DebugLogger.shared.debug("βœ… Output device binding result: \(outputBindOk)", source: "ASRService") // If binding failed (e.g., aggregate device), engine will use system defaults @@ -1777,42 +1854,80 @@ final class ASRService: ObservableObject { // Prepare the engine to allocate resources and establish format SYNCHRONOUSLY // This ensures the audio graph is fully initialized before we proceed DebugLogger.shared.debug("πŸ“‹ Preparing engine (allocating resources)...", source: "ASRService") + let prepareStartedAt = Date().timeIntervalSince1970 self.engine.prepare() + self.benchmarkLog( + "start_engine_prepare attempt=\(attempts + 1) elapsedMs=\(self.elapsedMilliseconds(since: prepareStartedAt)) totalMs=\(self.elapsedMilliseconds(since: startEngineStartedAt))" + ) DebugLogger.shared.debug("βœ… Engine prepared", source: "ASRService") // Log engine state before attempting to start + let formatProbeStartedAt = Date().timeIntervalSince1970 let inputNode = self.engine.inputNode let inputFormat = inputNode.inputFormat(forBus: 0) + let outputSampleRate = inputNode.outputFormat(forBus: 0).sampleRate + self.benchmarkLog( + "start_engine_format_probe attempt=\(attempts + 1) " + + "elapsedMs=\(self.elapsedMilliseconds(since: formatProbeStartedAt)) " + + "totalMs=\(self.elapsedMilliseconds(since: startEngineStartedAt)) " + + "sampleRate=\(Int(inputFormat.sampleRate.rounded())) " + + "channels=\(inputFormat.channelCount)" + ) DebugLogger.shared.debug( "(startEngine(): before engine.start attempt \(attempts + 1)) " + - "Engine IO device = \(inputNode.outputFormat(forBus: 0).sampleRate)Hz, " + + "Engine IO device = \(outputSampleRate)Hz, " + "Input format = \(inputFormat.sampleRate)Hz \(inputFormat.channelCount)ch", source: "ASRService" ) DebugLogger.shared.debug("🎧 Setting up engine tap before engine.start()...", source: "ASRService") + let tapSetupStartedAt = Date().timeIntervalSince1970 do { try self.setupEngineTap() installedTapThisAttempt = true + self.benchmarkLog( + "start_engine_tap_setup attempt=\(attempts + 1) " + + "preStart=true success=true " + + "elapsedMs=\(self.elapsedMilliseconds(since: tapSetupStartedAt)) " + + "totalMs=\(self.elapsedMilliseconds(since: startEngineStartedAt))" + ) DebugLogger.shared.debug("βœ… Pre-start engine tap setup complete", source: "ASRService") } catch { + self.benchmarkLog( + "start_engine_tap_setup attempt=\(attempts + 1) " + + "preStart=true success=false " + + "elapsedMs=\(self.elapsedMilliseconds(since: tapSetupStartedAt)) " + + "totalMs=\(self.elapsedMilliseconds(since: startEngineStartedAt)) " + + "error=\(error.localizedDescription)" + ) DebugLogger.shared.warning( "⚠️ Pre-start tap setup failed; falling back to post-start tap setup: \(error.localizedDescription)", source: "ASRService" ) } + let engineStartCallStartedAt = Date().timeIntervalSince1970 try self.engine.start() startedEngineThisAttempt = true + self.benchmarkLog( + "start_engine_start_call attempt=\(attempts + 1) elapsedMs=\(self.elapsedMilliseconds(since: engineStartCallStartedAt)) totalMs=\(self.elapsedMilliseconds(since: startEngineStartedAt))" + ) DebugLogger.shared.info("AVAudioEngine started successfully on attempt \(attempts + 1)", source: "ASRService") if installedTapThisAttempt == false { DebugLogger.shared.debug("🎧 Setting up engine tap after engine.start() fallback...", source: "ASRService") + let fallbackTapStartedAt = Date().timeIntervalSince1970 try self.setupEngineTap() installedTapThisAttempt = true + self.benchmarkLog( + "start_engine_tap_setup attempt=\(attempts + 1) preStart=false success=true elapsedMs=\(self.elapsedMilliseconds(since: fallbackTapStartedAt)) totalMs=\(self.elapsedMilliseconds(since: startEngineStartedAt))" + ) DebugLogger.shared.debug("βœ… Post-start engine tap setup complete", source: "ASRService") } + self.benchmarkLog( + "start_engine_done attempt=\(attempts + 1) totalMs=\(self.elapsedMilliseconds(since: startEngineStartedAt))" + ) return } catch { lastError = error @@ -1871,6 +1986,152 @@ final class ASRService: ObservableObject { self.fastRestartWarmEngineShutdownTask = nil } + private func cancelFastStartPrewarmShutdown() { + self.fastStartPrewarmShutdownTask?.cancel() + self.fastStartPrewarmShutdownTask = nil + } + + private func scheduleIdleCaptureEnginePreparation( + reason: String, + delayNanoseconds: UInt64 = 1_000_000_000 + ) { + guard self.fastRestartWarmEngineEnabled else { return } + guard self.micStatus == .authorized else { return } + guard self.idleCaptureEnginePrepareTask == nil else { return } + + self.idleCaptureEnginePrepareTask = Task { @MainActor [weak self] in + do { + try await Task.sleep(nanoseconds: delayNanoseconds) + } catch { + return + } + + guard let self else { return } + self.idleCaptureEnginePrepareTask = nil + guard self.micStatus == .authorized, + self.isRunning == false, + self.isStarting == false, + self.isEngineWarmForFastRestart == false, + self.fastStartPrewarmTask == nil, + self.engineStorage == nil + else { return } + + let startedAt = Date().timeIntervalSince1970 + do { + try self.configureSession() + if SettingsStore.shared.syncAudioDevicesWithSystem { + let prepareStartedAt = Date().timeIntervalSince1970 + self.engine.prepare() + self.benchmarkLog( + "fast_start_idle_prepare_engine_prepare skipped=false elapsedMs=\(self.elapsedMilliseconds(since: prepareStartedAt))" + ) + } else { + self.benchmarkLog("fast_start_idle_prepare_engine_prepare skipped=true reason=independent_device_binding") + } + self.benchmarkLog( + "fast_start_idle_prepare_ready reason=\(reason) elapsedMs=\(self.elapsedMilliseconds(since: startedAt))" + ) + } catch { + self.tearDownCaptureEngine(reason: "idle capture engine prepare failed", releaseAsync: true) + self.benchmarkLog( + "fast_start_idle_prepare_failed reason=\(reason) elapsedMs=\(self.elapsedMilliseconds(since: startedAt)) error=\(error.localizedDescription)" + ) + } + } + } + + @discardableResult + func prewarmCaptureEngineForFastStart(reason: String) -> Bool { + guard self.fastRestartWarmEngineEnabled else { return false } + guard self.micStatus == .authorized else { return false } + guard self.isRunning == false, self.isStarting == false else { return false } + guard self.isEngineWarmForFastRestart == false else { return false } + guard self.fastStartPrewarmTask == nil else { return true } + + self.idleCaptureEnginePrepareTask?.cancel() + self.idleCaptureEnginePrepareTask = nil + self.benchmarkLog("fast_start_prewarm_requested reason=\(reason)") + self.fastStartPrewarmTask = Task { @MainActor [weak self] in + guard let self else { return } + guard Task.isCancelled == false, + self.isRunning == false, + self.isStarting == false, + self.isEngineWarmForFastRestart == false + else { + self.fastStartPrewarmTask = nil + return + } + let startedAt = Date().timeIntervalSince1970 + do { + self.audioCapturePipeline.setRecordingEnabled(false) + self.isFastStartPrewarmingCaptureEngine = true + defer { self.isFastStartPrewarmingCaptureEngine = false } + try self.configureSession() + guard Task.isCancelled == false else { + self.tearDownCaptureEngine(reason: "fast start prewarm cancelled after configure", releaseAsync: true) + self.fastStartPrewarmTask = nil + return + } + try self.startEngine() + + guard Task.isCancelled == false else { + self.tearDownCaptureEngine(reason: "fast start prewarm cancelled before ready", releaseAsync: true) + self.fastStartPrewarmTask = nil + return + } + + self.isEngineWarmForFastRestart = true + self.isCaptureEnginePrewarmedForFastStart = true + self.warmCaptureEngineConfiguration = self.currentCaptureEngineConfiguration() + self.benchmarkLog("fast_start_prewarm_ready reason=\(reason) elapsedMs=\(self.elapsedMilliseconds(since: startedAt))") + self.scheduleFastStartPrewarmShutdown(reason: reason) + } catch { + self.tearDownCaptureEngine(reason: "fast start prewarm failed", releaseAsync: true) + self.benchmarkLog("fast_start_prewarm_failed reason=\(reason) elapsedMs=\(self.elapsedMilliseconds(since: startedAt)) error=\(error.localizedDescription)") + } + + self.fastStartPrewarmTask = nil + } + return true + } + + func cancelCaptureEngineFastStartPrewarm(reason: String) { + self.fastStartPrewarmTask?.cancel() + self.fastStartPrewarmTask = nil + self.cancelFastStartPrewarmShutdown() + + guard self.isRunning == false, self.isCaptureEnginePrewarmedForFastStart else { return } + self.benchmarkLog("fast_start_prewarm_cancel reason=\(reason)") + self.tearDownCaptureEngine(reason: "fast start prewarm cancelled: \(reason)", releaseAsync: true) + } + + private func scheduleFastStartPrewarmShutdown(reason: String) { + guard let currentEngine = self.engineStorage as? AVAudioEngine, + currentEngine.isRunning + else { return } + + self.cancelFastStartPrewarmShutdown() + let delay = self.fastStartPrewarmHoldNanoseconds + self.fastStartPrewarmShutdownTask = Task { @MainActor [weak self, weak currentEngine] in + do { + try await Task.sleep(nanoseconds: delay) + } catch { + return + } + + guard let self, + !self.isRunning, + self.isCaptureEnginePrewarmedForFastStart, + let storedEngine = self.engineStorage as? AVAudioEngine, + storedEngine === currentEngine + else { return } + + self.benchmarkLog("fast_start_prewarm_expired reason=\(reason)") + self.tearDownCaptureEngine(reason: "fast start prewarm expired", releaseAsync: true) + self.scheduleIdleCaptureEnginePreparation(reason: "fast start prewarm expired") + } + } + private func currentCaptureEngineConfiguration() -> CaptureEngineConfiguration { CaptureEngineConfiguration( syncAudioDevicesWithSystem: SettingsStore.shared.syncAudioDevicesWithSystem, @@ -1880,8 +2141,13 @@ final class ASRService: ObservableObject { } private func tearDownCaptureEngine(reason: String, releaseAsync: Bool = false) { + self.idleCaptureEnginePrepareTask?.cancel() + self.idleCaptureEnginePrepareTask = nil self.cancelFastRestartWarmEngineShutdown() + self.cancelFastStartPrewarmShutdown() self.isEngineWarmForFastRestart = false + self.isCaptureEnginePrewarmedForFastStart = false + self.isFastStartPrewarmingCaptureEngine = false self.warmCaptureEngineConfiguration = nil self.audioCapturePipeline.setRecordingEnabled(false) self.removeEngineTap() @@ -1909,7 +2175,9 @@ final class ASRService: ObservableObject { } self.cancelFastRestartWarmEngineShutdown() + self.cancelFastStartPrewarmShutdown() self.isEngineWarmForFastRestart = true + self.isCaptureEnginePrewarmedForFastStart = false self.warmCaptureEngineConfiguration = self.currentCaptureEngineConfiguration() let delay = self.fastRestartWarmEngineHoldNanoseconds DebugLogger.shared.info("Keeping capture engine warm for fast restart: \(reason)", source: "ASRService") @@ -1929,6 +2197,10 @@ final class ASRService: ObservableObject { else { return } self.tearDownCaptureEngine(reason: "fast restart warm window expired") + self.scheduleIdleCaptureEnginePreparation( + reason: "fast restart warm window expired", + delayNanoseconds: 250_000_000 + ) } } @@ -1950,22 +2222,33 @@ final class ASRService: ObservableObject { } self.cancelFastRestartWarmEngineShutdown() + self.cancelFastStartPrewarmShutdown() + let reusedPrewarmedCapture = self.isCaptureEnginePrewarmedForFastStart self.isEngineWarmForFastRestart = false + self.isCaptureEnginePrewarmedForFastStart = false self.warmCaptureEngineConfiguration = nil - DictationStartProbe.shared.markTapInstalled(session: self.benchmarkSessionID) - self.benchmarkLog("warm_engine_reuse reused=true") + if self.isFastStartPrewarmingCaptureEngine == false { + DictationStartProbe.shared.markTapInstalled(session: self.benchmarkSessionID) + } + self.benchmarkLog("warm_engine_reuse reused=true prewarmed=\(reusedPrewarmedCapture)") DebugLogger.shared.info("Reusing warm capture engine for fast restart", source: "ASRService") return true } private func setupEngineTap() throws { + let tapStartedAt = Date().timeIntervalSince1970 DebugLogger.shared.debug("🎧 setupEngineTap() - ENTERED", source: "ASRService") + let inputNodeStartedAt = Date().timeIntervalSince1970 let input = self.engine.inputNode + self.benchmarkLog( + "start_tap_input_node elapsedMs=\(self.elapsedMilliseconds(since: inputNodeStartedAt))" + ) // On Intel Macs (especially after wake from sleep), the audio HAL may not have // finished initializing even after engine.start() returns. The format can be // temporarily 0Hz/0ch while the hardware negotiates with CoreAudio. // We retry a few times with small delays to handle this race condition. + let formatWaitStartedAt = Date().timeIntervalSince1970 var inFormat = input.inputFormat(forBus: 0) var retryCount = 0 let maxRetries = 5 @@ -1997,6 +2280,9 @@ final class ASRService: ObservableObject { // Re-query the format inFormat = input.inputFormat(forBus: 0) } + self.benchmarkLog( + "start_tap_format_ready elapsedMs=\(self.elapsedMilliseconds(since: formatWaitStartedAt)) retryCount=\(retryCount) sampleRate=\(Int(inFormat.sampleRate.rounded())) channels=\(inFormat.channelCount)" + ) if retryCount > 0 { DebugLogger.shared.info( @@ -2013,17 +2299,27 @@ final class ASRService: ObservableObject { self.inputFormat = inFormat let pipeline = self.audioCapturePipeline DebugLogger.shared.debug("🎧 Installing tap on bus 0...", source: "ASRService") + let installTapStartedAt = Date().timeIntervalSince1970 input.installTap(onBus: 0, bufferSize: 4096, format: inFormat) { buffer, _ in pipeline.handle(buffer: buffer) } - DictationStartProbe.shared.markTapInstalled(session: self.benchmarkSessionID) + self.benchmarkLog( + "start_tap_install elapsedMs=\(self.elapsedMilliseconds(since: installTapStartedAt)) totalMs=\(self.elapsedMilliseconds(since: tapStartedAt)) bufferSize=4096" + ) + if self.isFastStartPrewarmingCaptureEngine == false { + DictationStartProbe.shared.markTapInstalled(session: self.benchmarkSessionID) + } + self.benchmarkLog( + "start_tap_done totalMs=\(self.elapsedMilliseconds(since: tapStartedAt))" + ) DebugLogger.shared.debug("βœ… setupEngineTap() - COMPLETED", source: "ASRService") } private func scheduleAudioRouteRecovery(reason: String) { guard self.isRunning else { - if self.isEngineWarmForFastRestart { + if self.isEngineWarmForFastRestart || self.engineStorage != nil { self.tearDownCaptureEngine(reason: "audio route changed while warm: \(reason)", releaseAsync: true) + self.scheduleIdleCaptureEnginePreparation(reason: "audio route changed while idle: \(reason)") } self.audioLevelSubject.send(0.0) return @@ -3384,6 +3680,10 @@ private final class AudioCapturePipeline { guard enabled else { return } + DictationStartProbe.shared.markFirstTapBuffer( + frameLength: Int(buffer.frameLength), + sampleRate: buffer.format.sampleRate + ) let mono16k = Self.toMono16k(floatBuffer: buffer) guard mono16k.isEmpty == false else { self.onLevel(0.0) diff --git a/Sources/Fluid/Services/DictationStartProbe.swift b/Sources/Fluid/Services/DictationStartProbe.swift index 3df7b9be..bcf46689 100644 --- a/Sources/Fluid/Services/DictationStartProbe.swift +++ b/Sources/Fluid/Services/DictationStartProbe.swift @@ -13,6 +13,7 @@ final class DictationStartProbe: @unchecked Sendable { private var lastInputEvent: InputEvent? private var activeEvent: InputEvent? private var activeTriggerLabel: String? + private var firstTapBufferLogged = false private var firstAudioLogged = false private init() {} @@ -28,6 +29,7 @@ final class DictationStartProbe: @unchecked Sendable { self.lock.lock() self.activeEvent = self.lastInputEvent self.activeTriggerLabel = label + self.firstTapBufferLogged = false self.firstAudioLogged = false let event = self.activeEvent self.lock.unlock() @@ -70,6 +72,33 @@ final class DictationStartProbe: @unchecked Sendable { ) } + func markFirstTapBuffer(frameLength: Int, sampleRate: Double) { + let now = ProcessInfo.processInfo.systemUptime + self.lock.lock() + guard self.firstTapBufferLogged == false else { + self.lock.unlock() + return + } + self.firstTapBufferLogged = true + let event = self.activeEvent + let label = self.activeTriggerLabel ?? "unknown" + self.lock.unlock() + + let callbackDelta = Self.deltaMilliseconds(from: event?.uptime, to: now) + let bufferDuration = Self.bufferDurationMilliseconds(frameLength: frameLength, sampleRate: sampleRate) + let estimatedFirstSampleDelta = callbackDelta >= 0 && bufferDuration >= 0 + ? callbackDelta - bufferDuration + : -1 + DebugLogger.shared.benchmark( + "START_LATENCY", + message: "first_tap_buffer label=\(label) frames=\(frameLength) " + + "sampleRate=\(Int(sampleRate.rounded())) event=\(event?.kind ?? "unknown") " + + "eventToFirstTapBufferMs=\(callbackDelta) bufferMs=\(bufferDuration) " + + "estimatedEventToFirstSampleMs=\(estimatedFirstSampleDelta)", + source: "DictationStartProbe" + ) + } + func markFirstAudio(sampleCount: Int) { let now = ProcessInfo.processInfo.systemUptime self.lock.lock() @@ -100,6 +129,11 @@ final class DictationStartProbe: @unchecked Sendable { return Int(((end - start) * 1000).rounded()) } + private static func bufferDurationMilliseconds(frameLength: Int, sampleRate: Double) -> Int { + guard sampleRate > 0 else { return -1 } + return Int(((Double(frameLength) / sampleRate) * 1000).rounded()) + } + private static func eventName(_ type: CGEventType) -> String { switch type { case .keyDown: return "keyDown" diff --git a/Sources/Fluid/Services/GlobalHotkeyManager.swift b/Sources/Fluid/Services/GlobalHotkeyManager.swift index 19bb2257..d919ad33 100644 --- a/Sources/Fluid/Services/GlobalHotkeyManager.swift +++ b/Sources/Fluid/Services/GlobalHotkeyManager.swift @@ -72,6 +72,7 @@ final class GlobalHotkeyManager: NSObject { private var pasteLastTranscriptionCallback: (() -> Void)? private var hotkeyMode: HotkeyActivationMode = SettingsStore.shared.hotkeyMode private let automaticTapThresholdSeconds: TimeInterval = 0.4 + private var primaryDictationShortcutPrewarmActive = false private struct ModifierOnlyShortcutBehavior { let shortcut: HotkeyShortcut @@ -346,6 +347,7 @@ final class GlobalHotkeyManager: NSObject { } func updatePrimaryShortcuts(_ newShortcuts: [HotkeyShortcut]) { + self.cancelPrimaryDictationShortcutPrewarmIfNeeded(reason: "primary dictation shortcuts changed") self.primaryShortcuts = newShortcuts DebugLogger.shared.info("Updated transcription hotkeys", source: "GlobalHotkeyManager") } @@ -447,6 +449,7 @@ final class GlobalHotkeyManager: NSObject { @discardableResult private func setupGlobalHotkey() -> Bool { + self.cancelPrimaryDictationShortcutPrewarmIfNeeded(reason: "event tap reset") self.cleanupEventTap() if !AXIsProcessTrusted() { @@ -534,6 +537,7 @@ final class GlobalHotkeyManager: NSObject { private func markOtherInputDuringModifierOnly() { guard self.modifierOnlyKeyDown else { return } self.otherKeyPressedDuringModifier = true + self.cancelPrimaryDictationShortcutPrewarmIfNeeded(reason: "modifier combo input") if let pending = self.pendingHoldModeStart { pending.cancel() self.pendingHoldModeStart = nil @@ -542,6 +546,22 @@ final class GlobalHotkeyManager: NSObject { } } + private func prewarmPrimaryDictationShortcutIfNeeded(reason: String) { + guard self.asrService.isRunning == false else { + self.primaryDictationShortcutPrewarmActive = false + return + } + + self.primaryDictationShortcutPrewarmActive = self.asrService.prewarmCaptureEngineForFastStart(reason: reason) + } + + private func cancelPrimaryDictationShortcutPrewarmIfNeeded(reason: String) { + guard self.primaryDictationShortcutPrewarmActive else { return } + + self.primaryDictationShortcutPrewarmActive = false + self.asrService.cancelCaptureEngineFastStartPrewarm(reason: reason) + } + private func mouseButton(from event: CGEvent) -> Int { Int(event.getIntegerValueField(.mouseEventButtonNumber)) } @@ -579,6 +599,7 @@ final class GlobalHotkeyManager: NSObject { setModeKeyPressed: { self.isKeyPressed = $0 }, onHoldStart: { self.startRecordingIfNeeded() }, onToggleRelease: { + self.primaryDictationShortcutPrewarmActive = false if self.asrService.isRunning { let isSameMode = self.isDictateRecordingProvider?() ?? false DebugLogger.shared.info( @@ -651,6 +672,7 @@ final class GlobalHotkeyManager: NSObject { } if handled { + self.cancelPrimaryDictationShortcutPrewarmIfNeeded(reason: "cancel shortcut pressed") return nil // Consume event only if we did something } } @@ -711,11 +733,15 @@ final class GlobalHotkeyManager: NSObject { self.triggerPromptSelection(assignment.selection) } } + self.cancelPrimaryDictationShortcutPrewarmIfNeeded(reason: "prompt shortcut pressed") return nil } // Check prompt mode hotkey - if self.handlePromptModeKeyDown(keyCode: keyCode, modifiers: eventModifiers) { return nil } + if self.handlePromptModeKeyDown(keyCode: keyCode, modifiers: eventModifiers) { + self.cancelPrimaryDictationShortcutPrewarmIfNeeded(reason: "prompt mode shortcut pressed") + return nil + } // Check command mode hotkey first if self.commandModeShortcutEnabled, @@ -767,6 +793,7 @@ final class GlobalHotkeyManager: NSObject { self.triggerCommandMode() } } + self.cancelPrimaryDictationShortcutPrewarmIfNeeded(reason: "command mode shortcut pressed") return nil } @@ -818,6 +845,7 @@ final class GlobalHotkeyManager: NSObject { self.triggerRewriteMode() } } + self.cancelPrimaryDictationShortcutPrewarmIfNeeded(reason: "rewrite mode shortcut pressed") return nil } } @@ -825,10 +853,13 @@ final class GlobalHotkeyManager: NSObject { // Then check transcription hotkeys if let shortcut = self.primaryShortcuts.first(where: { $0.matches(keyCode: keyCode, modifiers: eventModifiers) }) { guard self.beginPrimaryShortcutPress(.keyboard(shortcut.keyCode)) else { return nil } + self.primaryDictationShortcutPrewarmActive = false self.handlePrimaryDictationTriggerDown() return nil } + self.cancelPrimaryDictationShortcutPrewarmIfNeeded(reason: "primary dictation combo used by another key") + case .keyUp: // Prompt mode key up (press and hold mode) if self.handlePromptModeKeyUp(keyCode: keyCode) { return nil } @@ -906,6 +937,7 @@ final class GlobalHotkeyManager: NSObject { if self.handleMouseShortcutDown(event, modifiers: eventModifiers) { return nil } + self.cancelPrimaryDictationShortcutPrewarmIfNeeded(reason: "primary dictation mouse combo used by another button") case .leftMouseUp, .rightMouseUp, .otherMouseUp: if self.handleMouseShortcutUp(event) { @@ -998,6 +1030,8 @@ final class GlobalHotkeyManager: NSObject { ) { return nil } } + self.handlePrimaryDictationShortcutPrewarmFlagsChanged(modifiers: eventModifiers) + default: break } @@ -1072,6 +1106,7 @@ final class GlobalHotkeyManager: NSObject { for behavior: ModifierOnlyShortcutBehavior, message: String ) { + self.cancelModifierOnlyPrewarmIfNeeded(for: behavior, reason: message) guard self.pendingHoldModeType == behavior.holdModeType else { return } self.otherKeyPressedDuringModifier = true self.pendingHoldModeStart?.cancel() @@ -1080,6 +1115,44 @@ final class GlobalHotkeyManager: NSObject { DebugLogger.shared.info(message, source: "GlobalHotkeyManager") } + private func prewarmModifierOnlyCaptureIfNeeded(for behavior: ModifierOnlyShortcutBehavior) { + guard behavior.holdModeType == .transcription, + self.asrService.isRunning == false + else { return } + + self.prewarmPrimaryDictationShortcutIfNeeded(reason: "primary modifier down") + } + + private func cancelModifierOnlyPrewarmIfNeeded( + for behavior: ModifierOnlyShortcutBehavior, + reason: String + ) { + guard behavior.holdModeType == .transcription else { return } + self.cancelPrimaryDictationShortcutPrewarmIfNeeded(reason: reason) + } + + private func handlePrimaryDictationShortcutPrewarmFlagsChanged(modifiers: NSEvent.ModifierFlags) { + guard self.asrService.isRunning == false else { + self.cancelPrimaryDictationShortcutPrewarmIfNeeded(reason: "primary dictation already running") + return + } + + let relevantModifiers = modifiers.intersection(HotkeyShortcut.relevantModifierMask) + let matchingShortcut = self.primaryShortcuts.contains { shortcut in + guard !shortcut.isModifierOnlyShortcut, + !shortcut.relevantModifierFlags.isEmpty + else { return false } + + return shortcut.relevantModifierFlags == relevantModifiers + } + + if matchingShortcut { + self.prewarmPrimaryDictationShortcutIfNeeded(reason: "primary dictation modifier prefix down") + } else { + self.cancelPrimaryDictationShortcutPrewarmIfNeeded(reason: "primary dictation modifier prefix changed") + } + } + private func handleAutomaticKeyRelease( for type: HotkeyHoldModeType, label: String, @@ -1307,6 +1380,9 @@ final class GlobalHotkeyManager: NSObject { if self.hotkeyMode == .hold { self.markHoldModeStartTriggered(for: behavior.holdModeType) } + if behavior.holdModeType == .transcription { + self.primaryDictationShortcutPrewarmActive = false + } behavior.onHoldStart() if self.hotkeyMode == .automatic { self.markAutomaticPressStarted(for: behavior.holdModeType) @@ -1330,6 +1406,8 @@ final class GlobalHotkeyManager: NSObject { if self.asrService.isRunning || didStart { DebugLogger.shared.info(behavior.holdReleaseMessage, source: "GlobalHotkeyManager") self.stopRecordingAfterRelease(for: behavior.holdModeType, label: self.label(for: behavior.holdModeType)) + } else { + self.cancelModifierOnlyPrewarmIfNeeded(for: behavior, reason: "modifier hold released before start") } } case .automatic: @@ -1351,10 +1429,17 @@ final class GlobalHotkeyManager: NSObject { DebugLogger.shared.debug(behavior.toggleIgnoredMessage, source: "GlobalHotkeyManager") } } + if behavior.holdModeType == .transcription, + self.primaryDictationShortcutPrewarmActive, + self.asrService.isRunning == false + { + self.cancelPrimaryDictationShortcutPrewarmIfNeeded(reason: "automatic modifier released without start") + } case .toggle: if wasCleanPress { behavior.onToggleRelease() } else { + self.cancelModifierOnlyPrewarmIfNeeded(for: behavior, reason: behavior.toggleIgnoredMessage) DebugLogger.shared.debug(behavior.toggleIgnoredMessage, source: "GlobalHotkeyManager") } } @@ -1379,6 +1464,7 @@ final class GlobalHotkeyManager: NSObject { self.isRewriteKeyPressed = false self.isPromptAssignmentKeyPressed = false self.activePrimaryShortcutPress = nil + self.cancelPrimaryDictationShortcutPrewarmIfNeeded(reason: "modifier tracking reset") if shouldStopActiveHold { switch reason { @@ -1551,6 +1637,7 @@ final class GlobalHotkeyManager: NSObject { self.otherKeyPressedDuringModifier = false self.modifierPressStartTime = Date() + self.prewarmModifierOnlyCaptureIfNeeded(for: behavior) self.scheduleModifierOnlyStart(for: behavior) return true } @@ -1592,6 +1679,7 @@ final class GlobalHotkeyManager: NSObject { self.otherKeyPressedDuringModifier = false self.modifierPressStartTime = Date() + self.prewarmModifierOnlyCaptureIfNeeded(for: behavior) self.scheduleModifierOnlyStart(for: behavior) return true } @@ -1682,6 +1770,7 @@ final class GlobalHotkeyManager: NSObject { if self.primaryShortcuts.contains(where: { $0.matchesMouse(button: mouseButton, modifiers: eventModifiers) }) { guard self.beginPrimaryShortcutPress(.mouse(mouseButton)) else { return true } + self.primaryDictationShortcutPrewarmActive = false self.handlePrimaryDictationTriggerDown() return true } @@ -1757,6 +1846,7 @@ final class GlobalHotkeyManager: NSObject { } func setHotkeyMode(_ mode: HotkeyActivationMode) { + self.cancelPrimaryDictationShortcutPrewarmIfNeeded(reason: "hotkey mode changed") let shouldStopActivePress = self.hotkeyMode != .toggle && self.asrService.isRunning && (self.isKeyPressed || self.isPromptModeKeyPressed || self.isCommandModeKeyPressed || self.isRewriteKeyPressed || self.isPromptAssignmentKeyPressed) From 780cc6aa6a739e9eb6b2b2e07c8742743929b0bc Mon Sep 17 00:00:00 2001 From: altic-dev Date: Tue, 30 Jun 2026 21:33:17 -0700 Subject: [PATCH 04/10] improve custom dictionary training --- Sources/Fluid/UI/CustomDictionaryView.swift | 442 +++++++++++++++++--- 1 file changed, 378 insertions(+), 64 deletions(-) diff --git a/Sources/Fluid/UI/CustomDictionaryView.swift b/Sources/Fluid/UI/CustomDictionaryView.swift index 8c74f7fa..0eda7d60 100644 --- a/Sources/Fluid/UI/CustomDictionaryView.swift +++ b/Sources/Fluid/UI/CustomDictionaryView.swift @@ -30,6 +30,10 @@ struct CustomDictionaryView: View { @State private var trainingReplacement = "" @State private var trainingVariants: [String] = [] + @State private var trainingSampleCount = 0 + @State private var lastTrainingOutput = "" + @State private var lastTrainingOutputIsCovered = false + @State private var consecutiveCoveredCaptures = 0 @State private var trainingStatusMessage = "Type the correct text." @State private var trainingHasError = false @State private var isTrainingActive = false @@ -39,17 +43,15 @@ struct CustomDictionaryView: View { @State private var composerMode: DictionaryComposerMode = .train @State private var manualTriggersText = "" @State private var manualReplacement = "" + @State private var isDictionaryExpanded = false private var normalizedTrainingReplacement: String { self.trainingReplacement.trimmingCharacters(in: .whitespacesAndNewlines) } private var trainingProgressText: String { - let count = self.trainingVariants.count - let target = count <= CustomDictionaryTrainingMerge.recommendedSamples - ? CustomDictionaryTrainingMerge.recommendedSamples - : CustomDictionaryTrainingMerge.maxSamples - return "\(count)/\(target)" + let count = self.trainingSampleCount + return "\(count) \(count == 1 ? "sample" : "samples") Β· up to \(CustomDictionaryTrainingMerge.maxSamples)" } private var shouldShowTrainingStatus: Bool { @@ -79,7 +81,69 @@ struct CustomDictionaryView: View { private var trainingRecorderDetail: String { self.normalizedTrainingReplacement.isEmpty ? "Type the correct text first." - : "\"\(self.normalizedTrainingReplacement)\"" + : "Keep trying until FluidVoice understands you 3 times in a row." + } + + private var trainingRecorderStatusText: String { + guard !self.lastTrainingOutput.isEmpty else { return "Record to check" } + if self.trainingAlreadyCorrectWithoutReplacement { + return "Already correct" + } + if self.trainingFinalOutputIsReady { + return "Ready to add" + } + return "\(self.trainingReadinessProgress)/\(CustomDictionaryTrainingMerge.readyCoveredCount) understood" + } + + private var trainingRecorderStatusColor: Color { + self.trainingFinalOutputIsReady || self.trainingAlreadyCorrectWithoutReplacement + ? self.theme.palette.success + : self.theme.palette.secondaryText + } + + private var trainingRecorderFillColor: Color { + self.trainingFinalOutputIsReady || self.trainingAlreadyCorrectWithoutReplacement + ? self.theme.palette.success + : self.theme.palette.accent + } + + private var trainingRecorderFillFraction: Double { + guard !self.lastTrainingOutput.isEmpty else { return 0 } + if self.trainingAlreadyCorrectWithoutReplacement { + return 1 + } + return Double(self.trainingReadinessProgress) / Double(CustomDictionaryTrainingMerge.readyCoveredCount) + } + + private var trainingFinalOutputIsReady: Bool { + !self.trainingAlreadyCorrectWithoutReplacement && + self.trainingOutputIsCovered && + self.consecutiveCoveredCaptures >= CustomDictionaryTrainingMerge.readyCoveredCount + } + + private var trainingAlreadyCorrectWithoutReplacement: Bool { + self.trainingVariants.isEmpty && + self.trainingOutputIsCovered && + !self.lastTrainingOutput.isEmpty && + self.lastTrainingOutput.caseInsensitiveCompare(self.normalizedTrainingReplacement) == .orderedSame && + self.consecutiveCoveredCaptures >= CustomDictionaryTrainingMerge.readyCoveredCount + } + + private var trainingReadinessProgress: Int { + guard !self.trainingAlreadyCorrectWithoutReplacement else { + return CustomDictionaryTrainingMerge.readyCoveredCount + } + guard self.trainingOutputIsCovered else { return 0 } + return min(self.consecutiveCoveredCaptures, CustomDictionaryTrainingMerge.readyCoveredCount) + } + + private var trainingOutputIsCovered: Bool { + self.lastTrainingOutputIsCovered + } + + private var trainingFinalOutputText: String { + guard !self.lastTrainingOutput.isEmpty else { return "Record to check" } + return self.trainingOutputIsCovered ? self.normalizedTrainingReplacement : self.lastTrainingOutput } private var canStartTraining: Bool { @@ -92,7 +156,7 @@ struct CustomDictionaryView: View { !self.normalizedTrainingReplacement.isEmpty && !self.isTrainingProcessing && !self.asr.isRunning && - self.trainingVariants.count < CustomDictionaryTrainingMerge.maxSamples + self.trainingSampleCount < CustomDictionaryTrainingMerge.maxSamples } private var canAddTrainedReplacement: Bool { @@ -102,6 +166,14 @@ struct CustomDictionaryView: View { !self.isTrainingProcessing } + private var trainedReplacementButtonTitle: String { + self.trainingAlreadyCorrectWithoutReplacement ? "No Replacement Needed" : "Add Replacement" + } + + private var shouldEmphasizeTrainedReplacementButton: Bool { + self.trainingFinalOutputIsReady && self.canAddTrainedReplacement + } + private var manualTriggers: [String] { CustomDictionaryManualEntry.parseTriggers(self.manualTriggersText) } @@ -259,7 +331,7 @@ struct CustomDictionaryView: View { self.manualReplacementComposer } } - .frame(height: 260, alignment: .topLeading) + .frame(height: 315, alignment: .topLeading) } } .frame(maxWidth: .infinity, alignment: .leading) @@ -300,7 +372,7 @@ struct CustomDictionaryView: View { } private var trainReplacementComposer: some View { - VStack(alignment: .leading, spacing: self.theme.metrics.spacing.md) { + VStack(alignment: .leading, spacing: self.theme.metrics.spacing.sm) { TextField("Type the correct text, e.g. FluidVoice", text: self.$trainingReplacement) .textFieldStyle(.roundedBorder) .disabled(self.isTrainingRecording || self.isTrainingProcessing) @@ -310,6 +382,8 @@ struct CustomDictionaryView: View { self.trainingRecorderPanel + self.trainingFinalOutputPanel + if !self.trainingVariants.isEmpty { self.trainingHeardSection } @@ -321,16 +395,35 @@ struct CustomDictionaryView: View { Button { self.addTrainedReplacement() } label: { - Label("Add Replacement", systemImage: "plus") + Label(self.trainedReplacementButtonTitle, systemImage: self.trainingAlreadyCorrectWithoutReplacement ? "checkmark" : "plus") .frame(maxWidth: .infinity) .frame(height: 38) } .fluidButton(.accent, size: .small) .disabled(!self.canAddTrainedReplacement) - .opacity(self.canAddTrainedReplacement ? 1 : 0.45) + .opacity(self.canAddTrainedReplacement || self.trainingAlreadyCorrectWithoutReplacement ? 1 : 0.45) + .overlay(self.trainedReplacementButtonReadyOutline) + .shadow( + color: self.shouldEmphasizeTrainedReplacementButton ? self.theme.palette.success.opacity(0.18) : .clear, + radius: self.shouldEmphasizeTrainedReplacementButton ? 14 : 0, + x: 0, + y: 5 + ) + .scaleEffect(self.shouldEmphasizeTrainedReplacementButton ? 1.006 : 1) + .animation(.spring(response: 0.28, dampingFraction: 0.72), value: self.shouldEmphasizeTrainedReplacementButton) } } + private var trainedReplacementButtonReadyOutline: some View { + RoundedRectangle(cornerRadius: self.theme.metrics.corners.md, style: .continuous) + .stroke( + self.shouldEmphasizeTrainedReplacementButton ? self.theme.palette.success.opacity(0.72) : .clear, + lineWidth: 1.5 + ) + .padding(-3) + .allowsHitTesting(false) + } + private var manualReplacementComposer: some View { VStack(alignment: .leading, spacing: self.theme.metrics.spacing.md) { ViewThatFits(in: .horizontal) { @@ -417,13 +510,20 @@ struct CustomDictionaryView: View { Text(self.trainingRecorderDetail) .font(self.theme.typography.caption) .foregroundStyle(self.theme.palette.secondaryText) - .lineLimit(1) + .lineLimit(2) + + self.trainingRecorderProgressRow HStack(spacing: 7) { - TrainingProgressDots(count: self.trainingVariants.count) - Text("\(self.trainingProgressText) recorded") + Text(self.trainingRecorderStatusText) + .font(self.theme.typography.captionStrong) + .foregroundStyle(self.trainingRecorderStatusColor) + .lineLimit(1) + + Text("Β· \(self.trainingProgressText) recorded") .font(self.theme.typography.caption) .foregroundStyle(self.theme.palette.tertiaryText) + .lineLimit(1) } } @@ -445,31 +545,95 @@ struct CustomDictionaryView: View { .opacity(self.canUseTrainingRecorderButton ? 1 : 0.45) } .padding(self.theme.metrics.spacing.md) - .background( + .background(self.trainingRecorderBackground) + } + + private var trainingRecorderBackground: some View { + GeometryReader { proxy in + let fillWidth = proxy.size.width * min(max(self.trainingRecorderFillFraction, 0), 1) + RoundedRectangle(cornerRadius: self.theme.metrics.corners.md, style: .continuous) .fill(self.theme.palette.contentBackground.opacity(0.5)) + .overlay(alignment: .leading) { + RoundedRectangle(cornerRadius: self.theme.metrics.corners.md, style: .continuous) + .fill(self.trainingRecorderFillColor.opacity(0.16)) + .frame(width: fillWidth) + } .overlay( RoundedRectangle(cornerRadius: self.theme.metrics.corners.md, style: .continuous) - .stroke(self.theme.palette.cardBorder.opacity(0.25), lineWidth: 1) + .stroke(self.trainingRecorderBorderColor, lineWidth: 1) ) - ) + .animation(.easeOut(duration: 0.18), value: self.trainingRecorderFillFraction) + } + .allowsHitTesting(false) + } + + private var trainingRecorderBorderColor: Color { + self.trainingFinalOutputIsReady || self.trainingAlreadyCorrectWithoutReplacement + ? self.theme.palette.success.opacity(0.28) + : self.theme.palette.cardBorder.opacity(0.25) + } + + private var trainingRecorderProgressBar: some View { + GeometryReader { proxy in + let width = proxy.size.width * min(max(self.trainingRecorderFillFraction, 0), 1) + + ZStack(alignment: .leading) { + Capsule(style: .continuous) + .fill(self.theme.palette.cardBorder.opacity(0.35)) + + Capsule(style: .continuous) + .fill(self.trainingRecorderFillColor) + .frame(width: width) + } + } + .frame(height: 5) + .animation(.easeOut(duration: 0.18), value: self.trainingRecorderFillFraction) + .accessibilityHidden(true) + } + + private var trainingRecorderProgressRow: some View { + HStack(spacing: self.theme.metrics.spacing.sm) { + self.trainingRecorderProgressBar + + Text("\(self.trainingReadinessProgress)/\(CustomDictionaryTrainingMerge.readyCoveredCount)") + .font(self.theme.typography.captionStrong) + .foregroundStyle(self.trainingRecorderStatusColor) + .monospacedDigit() + .frame(width: 34, alignment: .trailing) + } } private var trainingHeardSection: some View { - VStack(alignment: .leading, spacing: self.theme.metrics.spacing.sm) { - Text("Heard") + HStack(spacing: self.theme.metrics.spacing.sm) { + Text("Captured") .font(self.theme.typography.captionStrong) .foregroundStyle(self.theme.palette.secondaryText) - FlowLayout(spacing: 6) { - ForEach(self.trainingVariants, id: \.self) { variant in - TrainingVariantChip(variant: variant) { - self.trainingVariants.removeAll { $0 == variant } + HStack(spacing: 6) { + ForEach(Array(self.trainingVariants.prefix(5).enumerated()), id: \.element) { index, variant in + TrainingVariantChip(number: index + 1, variant: variant) { + self.removeTrainingVariant(variant) } } + + if self.trainingVariants.count > 5 { + Text("+\(self.trainingVariants.count - 5)") + .font(self.theme.typography.captionStrong) + .foregroundStyle(self.theme.palette.tertiaryText) + .padding(.horizontal, 7) + .padding(.vertical, 4) + .background( + RoundedRectangle(cornerRadius: 5, style: .continuous) + .fill(self.theme.palette.cardBackground.opacity(0.65)) + ) + } } + + Spacer(minLength: 0) } - .padding(self.theme.metrics.spacing.md) + .padding(.horizontal, self.theme.metrics.spacing.md) + .padding(.vertical, self.theme.metrics.spacing.sm) .background( RoundedRectangle(cornerRadius: self.theme.metrics.corners.md, style: .continuous) .fill(self.theme.palette.contentBackground.opacity(0.5)) @@ -480,6 +644,43 @@ struct CustomDictionaryView: View { ) } + private var trainingFinalOutputPanel: some View { + HStack(alignment: .center, spacing: self.theme.metrics.spacing.md) { + VStack(alignment: .leading, spacing: 5) { + Text("Final output") + .font(self.theme.typography.captionStrong) + .foregroundStyle(self.theme.palette.secondaryText) + + Text(self.trainingFinalOutputText) + .font(self.theme.typography.bodySmallStrong) + .foregroundStyle(self.lastTrainingOutput.isEmpty ? self.theme.palette.tertiaryText : self.theme.palette.primaryText) + .lineLimit(1) + + if !self.lastTrainingOutput.isEmpty, self.lastTrainingOutput.caseInsensitiveCompare(self.trainingFinalOutputText) != .orderedSame { + Text("Heard: \(self.lastTrainingOutput)") + .font(self.theme.typography.caption) + .foregroundStyle(self.theme.palette.tertiaryText) + .lineLimit(1) + } + } + + Spacer() + } + .padding(.horizontal, self.theme.metrics.spacing.md) + .padding(.vertical, self.theme.metrics.spacing.sm) + .background( + RoundedRectangle(cornerRadius: self.theme.metrics.corners.md, style: .continuous) + .fill(self.theme.palette.contentBackground.opacity(0.42)) + .overlay( + RoundedRectangle(cornerRadius: self.theme.metrics.corners.md, style: .continuous) + .stroke( + self.trainingFinalOutputIsReady ? self.theme.palette.success.opacity(0.28) : self.theme.palette.cardBorder.opacity(0.22), + lineWidth: 1 + ) + ) + ) + } + @ViewBuilder private var trainingFooter: some View { if self.shouldShowTrainingStatus || self.isTrainingActive || !self.trainingVariants.isEmpty { @@ -532,17 +733,39 @@ struct CustomDictionaryView: View { .font(self.theme.typography.caption) .foregroundStyle(self.theme.palette.secondaryText) } + + Spacer() + + Button { + withAnimation(self.reduceMotion ? nil : .easeOut(duration: 0.16)) { + self.isDictionaryExpanded.toggle() + } + } label: { + Image(systemName: self.isDictionaryExpanded ? "chevron.up" : "chevron.down") + .font(.system(size: 12, weight: .semibold)) + .foregroundStyle(self.theme.palette.secondaryText) + .frame(width: 28, height: 28) + .background( + RoundedRectangle(cornerRadius: self.theme.metrics.corners.sm, style: .continuous) + .fill(self.theme.palette.contentBackground.opacity(0.45)) + ) + } + .buttonStyle(.plain) + .help(self.isDictionaryExpanded ? "Collapse dictionary" : "Expand dictionary") + .accessibilityLabel(self.isDictionaryExpanded ? "Collapse dictionary" : "Expand dictionary") } - if self.entries.isEmpty { - self.dictionaryEmptyState( - title: "No replacements yet", - detail: "Use Train Replacement or Manual Add above to create your first one." - ) - .frame(maxWidth: 760) - .frame(maxWidth: .infinity, alignment: .center) - } else { - self.entriesListView + if self.isDictionaryExpanded { + if self.entries.isEmpty { + self.dictionaryEmptyState( + title: "No replacements yet", + detail: "Use Train Replacement or Manual Add above to create your first one." + ) + .frame(maxWidth: 760) + .frame(maxWidth: .infinity, alignment: .center) + } else { + self.entriesListView + } } } } @@ -664,6 +887,10 @@ struct CustomDictionaryView: View { .font(self.theme.typography.caption) .foregroundStyle(self.theme.palette.secondaryText) + Text("It can add close to a second to transcription time.") + .font(self.theme.typography.caption) + .foregroundStyle(self.theme.palette.secondaryText) + Text("If recognition gets worse, the model behaves unexpectedly, or you notice other issues after enabling it, turn Boosting off.") .font(self.theme.typography.caption) .foregroundStyle(self.theme.palette.secondaryText) @@ -778,37 +1005,64 @@ struct CustomDictionaryView: View { private func addTrainingVariant(from transcript: String) { guard let detected = CustomDictionaryTrainingMerge.normalizedTrigger(transcript) else { + self.lastTrainingOutput = "" + self.lastTrainingOutputIsCovered = false + self.consecutiveCoveredCaptures = 0 self.trainingHasError = true self.trainingStatusMessage = "Nothing heard. Try again." return } + self.lastTrainingOutput = detected + self.trainingSampleCount = min(self.trainingSampleCount + 1, CustomDictionaryTrainingMerge.maxSamples) + if detected.caseInsensitiveCompare(self.normalizedTrainingReplacement) == .orderedSame { + self.lastTrainingOutputIsCovered = true + self.consecutiveCoveredCaptures += 1 self.trainingHasError = false - self.trainingStatusMessage = "That sounded right. Try another if needed." + if self.consecutiveCoveredCaptures >= CustomDictionaryTrainingMerge.readyCoveredCount { + self.trainingStatusMessage = self.trainingVariants.isEmpty + ? "Looks good already. No replacement needed." + : "Looks ready. Add this replacement when you're ready." + } else { + self.trainingStatusMessage = "Covered. Try a couple more." + } return } - if self.trainingVariants.contains(where: { $0.caseInsensitiveCompare(detected) == .orderedSame }) { + let wasAlreadyCaptured = self.trainingVariants.contains { $0.caseInsensitiveCompare(detected) == .orderedSame } + let wasAlreadySaved = self.savedDictionaryCovers(detected) + + if wasAlreadyCaptured || wasAlreadySaved { + self.lastTrainingOutputIsCovered = true + self.consecutiveCoveredCaptures += 1 self.trainingHasError = false - self.trainingStatusMessage = "Already got that one." + if self.consecutiveCoveredCaptures >= CustomDictionaryTrainingMerge.readyCoveredCount { + self.trainingStatusMessage = "Looks ready. Add this replacement when you're ready." + } else if wasAlreadySaved { + self.trainingStatusMessage = "Covered by your dictionary." + } else { + self.trainingStatusMessage = "Already captured. Try a couple more." + } return } guard self.trainingVariants.count < CustomDictionaryTrainingMerge.maxSamples else { + self.lastTrainingOutputIsCovered = false + self.consecutiveCoveredCaptures = 0 self.trainingHasError = false - self.trainingStatusMessage = "You have enough. Add when ready." + self.trainingStatusMessage = "Max samples reached. Add it or clear one." return } self.trainingVariants.append(detected) + self.lastTrainingOutputIsCovered = false + self.consecutiveCoveredCaptures = 0 self.trainingHasError = false - if self.trainingVariants.count >= CustomDictionaryTrainingMerge.maxSamples { - self.trainingStatusMessage = "You have enough. Add when ready." - } else if self.trainingVariants.count >= CustomDictionaryTrainingMerge.recommendedSamples { - self.trainingStatusMessage = "You can add it now." + if self.trainingSampleCount >= CustomDictionaryTrainingMerge.maxSamples || self.trainingVariants.count >= CustomDictionaryTrainingMerge.maxSamples { + self.trainingStatusMessage = "Max samples reached. Add it or clear one." } else { - self.trainingStatusMessage = "Got it." + self.trainingStatusMessage = "New pronunciation captured. Add replacement to cover it." } } @@ -831,9 +1085,38 @@ struct CustomDictionaryView: View { ) } + private func removeTrainingVariant(_ variant: String) { + self.trainingVariants.removeAll { $0 == variant } + self.refreshLastTrainingCoverage() + } + + private func refreshLastTrainingCoverage() { + guard !self.lastTrainingOutput.isEmpty else { + self.lastTrainingOutputIsCovered = false + self.consecutiveCoveredCaptures = 0 + return + } + + let matchesReplacement = self.lastTrainingOutput.caseInsensitiveCompare(self.normalizedTrainingReplacement) == .orderedSame + let isStillCaptured = self.trainingVariants.contains { + $0.caseInsensitiveCompare(self.lastTrainingOutput) == .orderedSame + } + + if matchesReplacement || isStillCaptured || self.savedDictionaryCovers(self.lastTrainingOutput) { + self.lastTrainingOutputIsCovered = true + } else { + self.lastTrainingOutputIsCovered = false + self.consecutiveCoveredCaptures = 0 + } + } + private func resetTraining(statusMessage: String = "Type the correct text.") { self.trainingReplacement = "" self.trainingVariants = [] + self.trainingSampleCount = 0 + self.lastTrainingOutput = "" + self.lastTrainingOutputIsCovered = false + self.consecutiveCoveredCaptures = 0 self.trainingStatusMessage = statusMessage self.trainingHasError = false self.isTrainingActive = false @@ -846,14 +1129,52 @@ struct CustomDictionaryView: View { let newKey = CustomDictionaryTrainingMerge.normalizedReplacement(newValue).lowercased() guard oldKey != newKey else { return } - if !self.trainingVariants.isEmpty { - self.trainingVariants.removeAll() - } + self.trainingVariants = self.existingTrainingVariants(for: newValue) + self.trainingSampleCount = 0 + self.lastTrainingOutput = "" + self.lastTrainingOutputIsCovered = false + self.consecutiveCoveredCaptures = 0 self.isTrainingActive = false - self.trainingStatusMessage = newKey.isEmpty ? "Type the correct text." : "" + if newKey.isEmpty { + self.trainingStatusMessage = "Type the correct text." + } else if self.trainingVariants.isEmpty { + self.trainingStatusMessage = "" + } else { + self.trainingStatusMessage = "Loaded \(self.trainingVariants.count) saved \(self.trainingVariants.count == 1 ? "capture" : "captures")." + } self.trainingHasError = false } + private func existingTrainingVariants(for replacement: String) -> [String] { + let replacementText = CustomDictionaryTrainingMerge.normalizedReplacement(replacement) + guard !replacementText.isEmpty else { return [] } + + let triggers = self.entries + .filter { $0.replacement.caseInsensitiveCompare(replacementText) == .orderedSame } + .flatMap(\.triggers) + + return CustomDictionaryTrainingMerge.normalizedTriggers( + from: triggers, + intendedReplacement: replacementText + ) + } + + private func savedDictionaryCovers(_ trigger: String) -> Bool { + guard let triggerKey = CustomDictionaryTrainingMerge.normalizedTrigger(trigger), + !self.normalizedTrainingReplacement.isEmpty + else { + return false + } + + return self.entries.contains { entry in + entry.replacement.caseInsensitiveCompare(self.normalizedTrainingReplacement) == .orderedSame && + entry.triggers.contains { savedTrigger in + guard let savedKey = CustomDictionaryTrainingMerge.normalizedTrigger(savedTrigger) else { return false } + return savedKey == triggerKey + } + } + } + private func showReplacementConfirmation(title: String, detail: String) { let confirmation = ReplacementConfirmation(title: title, detail: detail) NSHapticFeedbackManager.defaultPerformer.perform(.levelChange, performanceTime: .now) @@ -1118,7 +1439,8 @@ private enum CustomDictionaryManualEntry { enum CustomDictionaryTrainingMerge { static let recommendedSamples = 5 - static let maxSamples = 10 + static let maxSamples = 20 + static let readyCoveredCount = 3 private static let edgePunctuation = CharacterSet(charactersIn: ".,!?;:\"'β€œβ€β€˜β€™") @@ -1290,6 +1612,7 @@ private struct ReplacementConfirmationToast: View { } private struct TrainingVariantChip: View { + let number: Int let variant: String let onDelete: () -> Void @@ -1297,8 +1620,15 @@ private struct TrainingVariantChip: View { var body: some View { HStack(spacing: 4) { + Text("\(self.number)") + .font(self.theme.typography.captionSmall) + .foregroundStyle(self.theme.palette.accent) + .frame(minWidth: 11) + Text(self.variant) .font(self.theme.typography.caption) + .lineLimit(1) + .truncationMode(.tail) Button(action: self.onDelete) { Image(systemName: "xmark.circle.fill") @@ -1308,6 +1638,7 @@ private struct TrainingVariantChip: View { .buttonStyle(.plain) .help("Remove \(self.variant)") } + .frame(maxWidth: 165) .padding(.horizontal, 7) .padding(.vertical, 4) .background( @@ -1342,23 +1673,6 @@ private struct DictionaryPreviewChip: View { } } -private struct TrainingProgressDots: View { - let count: Int - - @Environment(\.theme) private var theme - - var body: some View { - HStack(spacing: 4) { - ForEach(0.. Date: Tue, 30 Jun 2026 22:00:58 -0700 Subject: [PATCH 05/10] fix dictionary training review issues --- Sources/Fluid/UI/CustomDictionaryView.swift | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Sources/Fluid/UI/CustomDictionaryView.swift b/Sources/Fluid/UI/CustomDictionaryView.swift index 0eda7d60..5603e6c7 100644 --- a/Sources/Fluid/UI/CustomDictionaryView.swift +++ b/Sources/Fluid/UI/CustomDictionaryView.swift @@ -331,7 +331,7 @@ struct CustomDictionaryView: View { self.manualReplacementComposer } } - .frame(height: 315, alignment: .topLeading) + .frame(minHeight: 315, alignment: .topLeading) } } .frame(maxWidth: .infinity, alignment: .leading) @@ -401,7 +401,7 @@ struct CustomDictionaryView: View { } .fluidButton(.accent, size: .small) .disabled(!self.canAddTrainedReplacement) - .opacity(self.canAddTrainedReplacement || self.trainingAlreadyCorrectWithoutReplacement ? 1 : 0.45) + .opacity(self.canAddTrainedReplacement ? 1 : 0.45) .overlay(self.trainedReplacementButtonReadyOutline) .shadow( color: self.shouldEmphasizeTrainedReplacementButton ? self.theme.palette.success.opacity(0.18) : .clear, From b269532ada68993c43c69ec2e9ac010743b283bf Mon Sep 17 00:00:00 2001 From: altic-dev Date: Tue, 30 Jun 2026 22:56:11 -0700 Subject: [PATCH 06/10] fix training stop startup race --- Sources/Fluid/UI/CustomDictionaryView.swift | 38 ++++++++++++++++++++- 1 file changed, 37 insertions(+), 1 deletion(-) diff --git a/Sources/Fluid/UI/CustomDictionaryView.swift b/Sources/Fluid/UI/CustomDictionaryView.swift index 5603e6c7..98cd9d10 100644 --- a/Sources/Fluid/UI/CustomDictionaryView.swift +++ b/Sources/Fluid/UI/CustomDictionaryView.swift @@ -37,7 +37,9 @@ struct CustomDictionaryView: View { @State private var trainingStatusMessage = "Type the correct text." @State private var trainingHasError = false @State private var isTrainingActive = false + @State private var isTrainingStarting = false @State private var isTrainingRecording = false + @State private var trainingStopRequestedDuringStart = false @State private var isTrainingProcessing = false @State private var replacementConfirmation: ReplacementConfirmation? @State private var composerMode: DictionaryComposerMode = .train @@ -62,13 +64,20 @@ struct CustomDictionaryView: View { } private var canUseTrainingRecorderButton: Bool { - self.isTrainingRecording || self.canRecordTrainingSample + guard !self.trainingStopRequestedDuringStart, !self.isTrainingProcessing else { return false } + return self.isTrainingRecording || self.canRecordTrainingSample } private var trainingRecorderTitle: String { + if self.trainingStopRequestedDuringStart { + return "Stopping..." + } if self.isTrainingProcessing { return "Working..." } + if self.isTrainingStarting { + return "Starting..." + } if self.isTrainingRecording { return "Listening..." } @@ -981,19 +990,44 @@ struct CustomDictionaryView: View { self.isTrainingActive = true self.trainingHasError = false self.trainingStatusMessage = "" + self.trainingStopRequestedDuringStart = false + self.isTrainingStarting = true self.isTrainingRecording = true await self.asr.start(forDictionaryTraining: true) + self.isTrainingStarting = false if !self.asr.isRunning { self.isTrainingRecording = false + self.trainingStopRequestedDuringStart = false self.trainingHasError = true self.trainingStatusMessage = "Couldn't start recording. Check microphone access and try again." + return + } + + if self.trainingStopRequestedDuringStart { + await self.finishTrainingSampleStop() } } private func stopTrainingSample() async { + guard self.isTrainingRecording else { return } + guard !self.trainingStopRequestedDuringStart else { return } + + guard !self.isTrainingStarting, self.asr.isRunning else { + self.trainingStopRequestedDuringStart = true + self.trainingHasError = false + self.trainingStatusMessage = "Stopping..." + return + } + + await self.finishTrainingSampleStop() + } + + private func finishTrainingSampleStop() async { guard self.isTrainingRecording else { return } self.isTrainingRecording = false + self.isTrainingStarting = false + self.trainingStopRequestedDuringStart = false self.isTrainingProcessing = true self.trainingHasError = false self.trainingStatusMessage = "" @@ -1120,7 +1154,9 @@ struct CustomDictionaryView: View { self.trainingStatusMessage = statusMessage self.trainingHasError = false self.isTrainingActive = false + self.isTrainingStarting = false self.isTrainingRecording = false + self.trainingStopRequestedDuringStart = false self.isTrainingProcessing = false } From c7bfdb6efba3a0532614b230efbc94b4f0bbfb39 Mon Sep 17 00:00:00 2001 From: altic-dev Date: Tue, 30 Jun 2026 23:10:44 -0700 Subject: [PATCH 07/10] fix dictation startup review issues --- Sources/Fluid/ContentView.swift | 50 +++++++++++-------- Sources/Fluid/Services/ASRService.swift | 14 +++--- .../Services/TranscriptionSoundPlayer.swift | 22 ++++---- 3 files changed, 50 insertions(+), 36 deletions(-) diff --git a/Sources/Fluid/ContentView.swift b/Sources/Fluid/ContentView.swift index 7f23674c..b3286abf 100644 --- a/Sources/Fluid/ContentView.swift +++ b/Sources/Fluid/ContentView.swift @@ -2952,9 +2952,9 @@ struct ContentView: View { let shouldPlayStartSound = !self.isRecordingForCommand && !self.isRecordingForRewrite Task { - await self.asr.start(onCaptureStarted: { + await self.asr.start(beforeCaptureEnabled: { if shouldPlayStartSound { - self.scheduleTranscriptionStartSound() + await self.playTranscriptionStartSoundBeforeCapture() } }) if !self.asr.isRunning { @@ -3202,8 +3202,8 @@ struct ContentView: View { source: "ContentView" ) Task { - await self.asr.start(onCaptureStarted: { - self.scheduleTranscriptionStartSound() + await self.asr.start(beforeCaptureEnabled: { + await self.playTranscriptionStartSoundBeforeCapture() }) } }, @@ -3238,8 +3238,8 @@ struct ContentView: View { // Start recording immediately for the edit instruction DebugLogger.shared.info("Starting voice recording for edit mode", source: "ContentView") Task { - await self.asr.start(onCaptureStarted: { - self.scheduleTranscriptionStartSound() + await self.asr.start(beforeCaptureEnabled: { + await self.playTranscriptionStartSoundBeforeCapture() }) } }, @@ -3546,6 +3546,9 @@ extension ContentView { self.settings.playgroundUsed = false self.playgroundUsed = false } + self.appBench("capture_context_start") + self.captureRecordingContext() + self.appBench("capture_context_end") self.appBench("pre_asr_state_start") self.applyDictationShortcutSelectionContext(for: slot) self.setActiveRecordingMode(mode) @@ -3560,8 +3563,8 @@ extension ContentView { if !wasAlreadyRunning { let asrStartStartedAt = ProcessInfo.processInfo.systemUptime DebugLogger.shared.benchmark("APP_BENCH", message: "asr_start_call", source: "AppBenchmark") - await self.asr.start(onCaptureStarted: { - self.scheduleTranscriptionStartSound(logBenchmarks: true) + await self.asr.start(beforeCaptureEnabled: { + await self.playTranscriptionStartSoundBeforeCapture(logBenchmarks: true) }) if !self.asr.isRunning { self.appBench("asr_start_failed") @@ -3574,9 +3577,6 @@ extension ContentView { ) } - self.appBench("capture_context_start") - self.captureRecordingContext() - self.appBench("capture_context_end") self.appBench("prompt_config_start") self.applyDictationPromptConfiguration(for: SettingsStore.shared.dictationPromptSelection(for: slot)) self.appBench("prompt_config_end") @@ -3596,21 +3596,29 @@ extension ContentView { await self.beginDictationRecording(for: .secondary, mode: mode) } - private func scheduleTranscriptionStartSound(logBenchmarks: Bool = false) { + private func playTranscriptionStartSoundBeforeCapture(logBenchmarks: Bool = false) async { guard SettingsStore.shared.enableTranscriptionSounds else { return } if logBenchmarks { - self.appBench("start_sound_scheduled") + self.appBench("start_sound_start") } - Task { @MainActor in - if logBenchmarks { - DebugLogger.shared.benchmark("APP_BENCH", message: "start_sound_start", source: "AppBenchmark") - } - TranscriptionSoundPlayer.shared.playStartSound() - if logBenchmarks { - DebugLogger.shared.benchmark("APP_BENCH", message: "start_sound_end", source: "AppBenchmark") - } + let duration = TranscriptionSoundPlayer.shared.playStartSound() + let gateSeconds = min(max(duration + 0.04, 0), 1.0) + if logBenchmarks { + DebugLogger.shared.benchmark( + "APP_BENCH", + message: "start_sound_played durationMs=\(Int((duration * 1000).rounded())) gateMs=\(Int((gateSeconds * 1000).rounded()))", + source: "AppBenchmark" + ) + } + + if gateSeconds > 0 { + try? await Task.sleep(nanoseconds: UInt64(gateSeconds * 1_000_000_000)) + } + + if logBenchmarks { + self.appBench("start_sound_gate_done") } } diff --git a/Sources/Fluid/Services/ASRService.swift b/Sources/Fluid/Services/ASRService.swift index 39a08f9f..5942678c 100644 --- a/Sources/Fluid/Services/ASRService.swift +++ b/Sources/Fluid/Services/ASRService.swift @@ -871,7 +871,7 @@ final class ASRService: ObservableObject { /// and `isRunning` will remain `false`. Check the debug logs for details. func start( forDictionaryTraining: Bool = false, - onCaptureStarted: (@MainActor () -> Void)? = nil + beforeCaptureEnabled: (@MainActor () async -> Void)? = nil ) async { let startBenchmarkStartedAt = Date().timeIntervalSince1970 DebugLogger.shared.info("🎀 START() called - beginning recording session", source: "ASRService") @@ -910,8 +910,7 @@ final class ASRService: ObservableObject { self.streamingChunkAnalyticsSuccessCount = 0 self.lastStreamingChunkFailureAnalyticsAt = nil (self.transcriptionProvider as? FluidAudioProvider)?.resetStreamingPreviewCache() - self.audioCapturePipeline.setRecordingEnabled(true) - DictationStartProbe.shared.markCaptureEnabled(session: self.benchmarkSessionID) + self.audioCapturePipeline.setRecordingEnabled(false) self.refreshWordBoostStatus() let dims = self.currentTranscriptionAnalyticsDimensions() self.benchmarkLog("recording_start model=\(dims.model) provider=\(dims.provider) supportsStreaming=\(SettingsStore.shared.selectedSpeechModel.supportsStreaming)") @@ -951,7 +950,7 @@ final class ASRService: ObservableObject { } let captureCallbackStartedAt = Date().timeIntervalSince1970 - onCaptureStarted?() + await beforeCaptureEnabled?() self.benchmarkLog( "start_capture_callback elapsedMs=\(self.elapsedMilliseconds(since: captureCallbackStartedAt)) totalMs=\(self.elapsedMilliseconds(since: startBenchmarkStartedAt))" ) @@ -973,6 +972,9 @@ final class ASRService: ObservableObject { "totalMs=\(self.elapsedMilliseconds(since: startBenchmarkStartedAt))" ) + self.audioCapturePipeline.setRecordingEnabled(true) + DictationStartProbe.shared.markCaptureEnabled(session: self.benchmarkSessionID) + self.isRunning = true self.isDictionaryTrainingCaptureActive = forDictionaryTraining DebugLogger.shared.info("βœ… isRunning set to TRUE", source: "ASRService") @@ -2211,13 +2213,13 @@ final class ASRService: ObservableObject { warmEngine.isRunning else { self.tearDownCaptureEngine(reason: "warm engine unavailable") - self.audioCapturePipeline.setRecordingEnabled(true) + self.audioCapturePipeline.setRecordingEnabled(false) return false } guard self.warmCaptureEngineConfiguration == self.currentCaptureEngineConfiguration() else { self.tearDownCaptureEngine(reason: "audio settings changed while warm") - self.audioCapturePipeline.setRecordingEnabled(true) + self.audioCapturePipeline.setRecordingEnabled(false) return false } diff --git a/Sources/Fluid/Services/TranscriptionSoundPlayer.swift b/Sources/Fluid/Services/TranscriptionSoundPlayer.swift index 5c7ced21..7cfd2172 100644 --- a/Sources/Fluid/Services/TranscriptionSoundPlayer.swift +++ b/Sources/Fluid/Services/TranscriptionSoundPlayer.swift @@ -11,11 +11,12 @@ final class TranscriptionSoundPlayer { private init() {} - func playStartSound() { - guard SettingsStore.shared.enableTranscriptionSounds else { return } + @discardableResult + func playStartSound() -> TimeInterval { + guard SettingsStore.shared.enableTranscriptionSounds else { return 0 } let selected = SettingsStore.shared.transcriptionStartSound - guard let soundName = selected.startSoundFileName else { return } - self.play(soundName: soundName) + guard let soundName = selected.startSoundFileName else { return 0 } + return self.play(soundName: soundName) } func playStopSound() { @@ -38,10 +39,11 @@ final class TranscriptionSoundPlayer { self.play(soundName: soundName, overrideVolume: volume) } - private func play(soundName: String, overrideVolume: Float? = nil) { + @discardableResult + private func play(soundName: String, overrideVolume: Float? = nil) -> TimeInterval { guard let url = Bundle.main.url(forResource: soundName, withExtension: "m4a") else { DebugLogger.shared.error("Missing sound resource: \(soundName).m4a", source: "TranscriptionSoundPlayer") - return + return 0 } let settings = SettingsStore.shared @@ -49,7 +51,7 @@ final class TranscriptionSoundPlayer { if settings.transcriptionSoundIndependentVolume { let currentSystemVol = Self.getSystemVolume() - guard currentSystemVol > 0.001 else { return } + guard currentSystemVol > 0.001 else { return 0 } // Save current system volume and temporarily set it to desired level self.savedSystemVolume = currentSystemVol Self.setSystemVolume(desiredVolume) @@ -71,16 +73,17 @@ final class TranscriptionSoundPlayer { } else { player.volume = desiredVolume } - player.play() + guard player.play() else { return 0 } + let duration = player.duration // Restore system volume after the sound finishes if settings.transcriptionSoundIndependentVolume, let saved = self.savedSystemVolume { - let duration = player.duration DispatchQueue.main.asyncAfter(deadline: .now() + duration + 0.05) { [weak self] in Self.setSystemVolume(saved) self?.savedSystemVolume = nil } } + return duration } catch { // Restore system volume on error if let saved = self.savedSystemVolume { @@ -91,6 +94,7 @@ final class TranscriptionSoundPlayer { "Failed to play sound \(soundName).m4a: \(error.localizedDescription)", source: "TranscriptionSoundPlayer" ) + return 0 } } From 4ffa6e601536c72cc7d3ebb9b0a471865a925745 Mon Sep 17 00:00:00 2001 From: altic-dev Date: Tue, 30 Jun 2026 23:13:40 -0700 Subject: [PATCH 08/10] fix hotkey prewarm review issue --- .../Fluid/Services/GlobalHotkeyManager.swift | 24 ------------------- 1 file changed, 24 deletions(-) diff --git a/Sources/Fluid/Services/GlobalHotkeyManager.swift b/Sources/Fluid/Services/GlobalHotkeyManager.swift index d919ad33..cccdda36 100644 --- a/Sources/Fluid/Services/GlobalHotkeyManager.swift +++ b/Sources/Fluid/Services/GlobalHotkeyManager.swift @@ -1030,8 +1030,6 @@ final class GlobalHotkeyManager: NSObject { ) { return nil } } - self.handlePrimaryDictationShortcutPrewarmFlagsChanged(modifiers: eventModifiers) - default: break } @@ -1131,28 +1129,6 @@ final class GlobalHotkeyManager: NSObject { self.cancelPrimaryDictationShortcutPrewarmIfNeeded(reason: reason) } - private func handlePrimaryDictationShortcutPrewarmFlagsChanged(modifiers: NSEvent.ModifierFlags) { - guard self.asrService.isRunning == false else { - self.cancelPrimaryDictationShortcutPrewarmIfNeeded(reason: "primary dictation already running") - return - } - - let relevantModifiers = modifiers.intersection(HotkeyShortcut.relevantModifierMask) - let matchingShortcut = self.primaryShortcuts.contains { shortcut in - guard !shortcut.isModifierOnlyShortcut, - !shortcut.relevantModifierFlags.isEmpty - else { return false } - - return shortcut.relevantModifierFlags == relevantModifiers - } - - if matchingShortcut { - self.prewarmPrimaryDictationShortcutIfNeeded(reason: "primary dictation modifier prefix down") - } else { - self.cancelPrimaryDictationShortcutPrewarmIfNeeded(reason: "primary dictation modifier prefix changed") - } - } - private func handleAutomaticKeyRelease( for type: HotkeyHoldModeType, label: String, From 2d09afe124bd07e9612ea04e8a0d8a5c022a55a0 Mon Sep 17 00:00:00 2001 From: altic-dev Date: Tue, 30 Jun 2026 23:15:30 -0700 Subject: [PATCH 09/10] harden training stop cleanup --- Sources/Fluid/UI/CustomDictionaryView.swift | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/Sources/Fluid/UI/CustomDictionaryView.swift b/Sources/Fluid/UI/CustomDictionaryView.swift index 98cd9d10..c1c37a2e 100644 --- a/Sources/Fluid/UI/CustomDictionaryView.swift +++ b/Sources/Fluid/UI/CustomDictionaryView.swift @@ -252,9 +252,7 @@ struct CustomDictionaryView: View { .onDisappear { guard self.isTrainingRecording else { return } Task { @MainActor in - _ = await self.asr.stop(forDictionaryTraining: true) - self.isTrainingRecording = false - self.isTrainingProcessing = false + await self.stopTrainingSample() } } } From 5631029dd55671259c11a994b51bcb8d590c06f4 Mon Sep 17 00:00:00 2001 From: altic-dev Date: Wed, 1 Jul 2026 00:43:04 -0700 Subject: [PATCH 10/10] show dictation overlay before asr start --- Sources/Fluid/ContentView.swift | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/Sources/Fluid/ContentView.swift b/Sources/Fluid/ContentView.swift index b3286abf..8ed9394b 100644 --- a/Sources/Fluid/ContentView.swift +++ b/Sources/Fluid/ContentView.swift @@ -3555,6 +3555,14 @@ extension ContentView { self.rewriteModeService.clearState() self.appBench("pre_asr_state_end") + self.appBench("prompt_config_start") + self.applyDictationPromptConfiguration(for: SettingsStore.shared.dictationPromptSelection(for: slot)) + self.appBench("prompt_config_end") + self.appBench("overlay_mode_request mode=Dictation") + self.menuBarManager.setOverlayMode(.dictation) + self.menuBarManager.showRecordingOverlayImmediately() + self.appBench("overlay_mode_requested mode=Dictation") + let wasAlreadyRunning = self.asr.isRunning if wasAlreadyRunning { self.appBench("asr_start_skipped reason=already_running") @@ -3568,6 +3576,7 @@ extension ContentView { }) if !self.asr.isRunning { self.appBench("asr_start_failed") + self.menuBarManager.hideRecordingOverlayImmediately(reason: "asr_start_failed") return } DebugLogger.shared.benchmark( @@ -3577,13 +3586,6 @@ extension ContentView { ) } - self.appBench("prompt_config_start") - self.applyDictationPromptConfiguration(for: SettingsStore.shared.dictationPromptSelection(for: slot)) - self.appBench("prompt_config_end") - self.appBench("overlay_mode_request mode=Dictation") - self.menuBarManager.setOverlayMode(.dictation) - self.menuBarManager.showRecordingOverlayImmediately() - self.appBench("overlay_mode_requested mode=Dictation") self.appBench("prewarm_private_ai_start") self.prewarmPrivateAIDictationIfNeeded(for: slot) self.appBench("prewarm_private_ai_end")