Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions Fluid.xcodeproj/project.pbxproj
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
7C1C72F22EECBD1300E3BF4D /* SwiftWhisper in Frameworks */ = {isa = PBXBuildFile; productRef = 7C1C72F12EECBD1300E3BF4D /* SwiftWhisper */; };
7C3697892ED70F9C005874CE /* DynamicNotchKit in Frameworks */ = {isa = PBXBuildFile; productRef = 7C3697882ED70F9C005874CE /* DynamicNotchKit */; };
7C5AF14B2F15041600DE21B0 /* MediaRemoteAdapter in Frameworks */ = {isa = PBXBuildFile; productRef = 7C5AF14A2F15041600DE21B0 /* MediaRemoteAdapter */; };
6D19DEDFE36041A385D02553 /* VoiceCommandProcessorTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = 97032BFB139242FDA10E67BF /* VoiceCommandProcessorTests.swift */; };
7C91B0012F42AA0100C0DEF0 /* HotkeyShortcutTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = 7C91B0022F42AA0100C0DEF0 /* HotkeyShortcutTests.swift */; };
7CDB0A2D2F3C4D5600FB7CAD /* DictationE2ETests.swift in Sources */ = {isa = PBXBuildFile; fileRef = 7CDB0A292F3C4D5600FB7CAD /* DictationE2ETests.swift */; };
7CDB0A2E2F3C4D5600FB7CAD /* AudioFixtureLoader.swift in Sources */ = {isa = PBXBuildFile; fileRef = 7CDB0A2A2F3C4D5600FB7CAD /* AudioFixtureLoader.swift */; };
Expand All @@ -32,6 +33,7 @@
/* Begin PBXFileReference section */
7C078D8F2E3B339200FB7CAC /* FluidVoice Debug.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = "FluidVoice Debug.app"; sourceTree = BUILT_PRODUCTS_DIR; };
7CDB0A202F3C4D5600FB7CAD /* FluidDictationIntegrationTests.xctest */ = {isa = PBXFileReference; explicitFileType = wrapper.cfbundle; includeInIndex = 0; path = FluidDictationIntegrationTests.xctest; sourceTree = BUILT_PRODUCTS_DIR; };
97032BFB139242FDA10E67BF /* VoiceCommandProcessorTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = VoiceCommandProcessorTests.swift; sourceTree = "<group>"; };
7C91B0022F42AA0100C0DEF0 /* HotkeyShortcutTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = HotkeyShortcutTests.swift; sourceTree = "<group>"; };
7CDB0A292F3C4D5600FB7CAD /* DictationE2ETests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = DictationE2ETests.swift; sourceTree = "<group>"; };
7CDB0A2A2F3C4D5600FB7CAD /* AudioFixtureLoader.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = AudioFixtureLoader.swift; sourceTree = "<group>"; };
Expand Down Expand Up @@ -104,6 +106,7 @@
7CDB0A272F3C4D5600FB7CAD /* Resources */,
7CDB0A292F3C4D5600FB7CAD /* DictationE2ETests.swift */,
7C91B0022F42AA0100C0DEF0 /* HotkeyShortcutTests.swift */,
97032BFB139242FDA10E67BF /* VoiceCommandProcessorTests.swift */,
);
path = FluidDictationIntegrationTests;
sourceTree = "<group>";
Expand Down Expand Up @@ -258,6 +261,7 @@
7CDB0A2E2F3C4D5600FB7CAD /* AudioFixtureLoader.swift in Sources */,
7CDB0A2D2F3C4D5600FB7CAD /* DictationE2ETests.swift in Sources */,
7C91B0012F42AA0100C0DEF0 /* HotkeyShortcutTests.swift in Sources */,
6D19DEDFE36041A385D02553 /* VoiceCommandProcessorTests.swift in Sources */,
);
runOnlyForDeploymentPostprocessing = 0;
};
Expand Down
21 changes: 21 additions & 0 deletions Sources/Fluid/Persistence/SettingsStore.swift
Original file line number Diff line number Diff line change
Expand Up @@ -2408,6 +2408,21 @@ final class SettingsStore: ObservableObject {
}
}

// MARK: - Voice Command Settings

/// Whether spoken edit commands are recognized.
///
/// Backed by `Keys.voiceCommandsEnabled` in `defaults`. The setter announces
/// `objectWillChange` before the new value is written.
var voiceCommandsEnabled: Bool {
    get {
        return self.defaults.bool(forKey: Keys.voiceCommandsEnabled)
    }
    set {
        objectWillChange.send()
        self.defaults.set(newValue, forKey: Keys.voiceCommandsEnabled)
    }
}

/// Number of trailing words the scratch/delete voice command removes.
///
/// Backed by `Keys.voiceCommandScratchWordCount` in `defaults`. Values below 1
/// are never exposed or stored: the getter reports 1 for any non-positive
/// stored value and the setter raises its input to at least 1.
var voiceCommandScratchWordCount: Int {
    get {
        let stored = self.defaults.integer(forKey: Keys.voiceCommandScratchWordCount)
        guard stored > 0 else { return 1 }
        return stored
    }
    set {
        objectWillChange.send()
        let clamped = max(1, newValue)
        self.defaults.set(clamped, forKey: Keys.voiceCommandScratchWordCount)
    }
}

// MARK: - Rewrite Mode Settings

var rewriteModeHotkeyShortcut: HotkeyShortcut {
Expand Down Expand Up @@ -4316,6 +4331,10 @@ private extension SettingsStore {
static let commandModeLinkedToGlobal = "CommandModeLinkedToGlobal"
static let commandModeShortcutEnabled = "CommandModeShortcutEnabled"

// Voice Command Keys
// Storage keys used by the `voiceCommandsEnabled` and `voiceCommandScratchWordCount`
// accessors on `SettingsStore`.
static let voiceCommandsEnabled = "VoiceCommandsEnabled"
static let voiceCommandScratchWordCount = "VoiceCommandScratchWordCount"

// Prompt Mode Keys (Transcribe with Prompt)
static let promptModeHotkeyShortcut = "PromptModeHotkeyShortcut"
static let promptModeShortcutEnabled = "PromptModeShortcutEnabled"
Expand Down Expand Up @@ -4709,3 +4728,5 @@ extension SettingsStore {
return newModel
}
}

// `SettingsStore` already declares `voiceCommandsEnabled` and
// `voiceCommandScratchWordCount`, so this conformance needs no extra members.
extension SettingsStore: VoiceCommandSettings {}
28 changes: 21 additions & 7 deletions Sources/Fluid/Services/DictationPostProcessingService.swift
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,16 @@ final class DictationPostProcessingService {

func process(_ inputText: String, dictationSlot: SettingsStore.DictationShortcutSlot = .primary) async throws -> Result {
let trimmed = inputText.trimmingCharacters(in: .whitespacesAndNewlines)
guard !trimmed.isEmpty else {

let (voiceStripped, voicePendingAction) = VoiceCommandProcessor.detect(in: trimmed, settings: SettingsStore.shared)

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Run voice-command detection from the user dictation flow

This hook only executes inside `DictationPostProcessingService`, but normal hotkey dictation in `ContentView.stopAndProcessTranscription` calls `processTextWithAI` directly when AI is configured (ContentView.swift:2147) and otherwise uses `transcribedText` unchanged (ContentView.swift:2193); a repo-wide search shows this service is only called by LocalAPI/InferenceAPIController.swift:88. As a result, enabling `VoiceCommandsEnabled` in the app will not process `... scratch that` or `new line` for the main dictation workflow, only for the local `/v1/postprocess` API.

Useful? React with 👍 / 👎.

if let commandAction = voicePendingAction, voiceStripped.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty {
// Command-only utterance: apply the edit and bypass the LLM entirely.
let edited = VoiceCommandProcessor.apply(commandAction, to: "", settings: SettingsStore.shared)
return Result(text: edited, providerID: SettingsStore.shared.selectedProviderID, model: "")
}
let effectiveTrimmed = voiceStripped.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty ? trimmed : voiceStripped

guard !effectiveTrimmed.isEmpty else {
return Result(text: "", providerID: SettingsStore.shared.selectedProviderID, model: "")
}

Expand All @@ -49,7 +58,7 @@ final class DictationPostProcessingService {
isPrivateAIProvider || PrivateAIIntegrationService.shouldHandleDictation(model: resolved.model)
{
let response = try await PrivateAIIntegrationService.shared.enhanceDictation(
trimmed,
effectiveTrimmed,
runtime: PrivateAIIntegrationService.RuntimeConfiguration(
selectedProviderID: resolved.providerID,
providerKey: resolved.providerKey,
Expand All @@ -67,7 +76,7 @@ final class DictationPostProcessingService {
)
)
return Result(
text: ASRService.applyGAAVFormatting(response.outputText),
text: self.applyPendingVoiceEdit(ASRService.applyGAAVFormatting(response.outputText), action: voicePendingAction),
providerID: resolved.providerID,
model: resolved.model
)
Expand All @@ -77,7 +86,7 @@ final class DictationPostProcessingService {
let systemPrompt = ""
let userMessageContent = SettingsStore.renderDictationUserMessage(
promptText: promptText,
transcript: trimmed
transcript: effectiveTrimmed
)

if resolved.providerID == "apple-intelligence" {
Expand All @@ -86,10 +95,10 @@ final class DictationPostProcessingService {
let provider = AppleIntelligenceProvider()
let output = try await provider.process(systemPrompt: systemPrompt, userText: userMessageContent)
guard !output.isEmpty else { throw AIProcessingError.emptyResponse }
return Result(text: ASRService.applyGAAVFormatting(output), providerID: resolved.providerID, model: resolved.model)
return Result(text: self.applyPendingVoiceEdit(ASRService.applyGAAVFormatting(output), action: voicePendingAction), providerID: resolved.providerID, model: resolved.model)
}
#endif
return Result(text: trimmed, providerID: resolved.providerID, model: resolved.model)
return Result(text: self.applyPendingVoiceEdit(effectiveTrimmed, action: voicePendingAction), providerID: resolved.providerID, model: resolved.model)
}

guard !resolved.model.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty else {
Expand Down Expand Up @@ -131,12 +140,17 @@ final class DictationPostProcessingService {
throw AIProcessingError.emptyResponse
}
return Result(
text: ASRService.applyGAAVFormatting(response.content),
text: self.applyPendingVoiceEdit(ASRService.applyGAAVFormatting(response.content), action: voicePendingAction),
providerID: resolved.providerID,
model: resolved.model
)
}

/// Runs a detected voice-command edit over `text`, if there is one.
///
/// - Parameters:
///   - text: The text to edit.
///   - action: The pending edit, or `nil` when no command was detected.
/// - Returns: `text` unchanged when `action` is `nil`; otherwise the result of
///   `VoiceCommandProcessor.apply` using the shared settings store.
private func applyPendingVoiceEdit(_ text: String, action: EditAction?) -> String {
    if let pendingEdit = action {
        return VoiceCommandProcessor.apply(pendingEdit, to: text, settings: SettingsStore.shared)
    }
    return text
}

private func resolveProvider(settings: SettingsStore, dictationSlot: SettingsStore.DictationShortcutSlot) -> ResolvedProvider {
if settings.dictationPromptSelection(for: dictationSlot) == .privateAI,
let modelID = PrivateAIProviderPromptFormat.verifiedModelID(settings: settings)
Expand Down
122 changes: 122 additions & 0 deletions Sources/Fluid/Services/VoiceCommandProcessor.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
import Foundation

/// Read-only view of the preferences that drive voice-command handling.
protocol VoiceCommandSettings {
    /// Master switch; when `false`, `VoiceCommandProcessor.detect` reports no command.
    var voiceCommandsEnabled: Bool { get }
    /// Number of trailing words removed by the `.deleteLastWords` action.
    var voiceCommandScratchWordCount: Int { get }
}

/// An edit that a spoken command applies to already-transcribed text.
enum EditAction {
    /// Remove trailing words. `VoiceCommandProcessor.apply` takes the count from
    /// `VoiceCommandSettings.voiceCommandScratchWordCount`, not from this payload.
    case deleteLastWords(Int)
    /// Upper-case the first letter of the last word.
    case capitalizeLastWord
    /// Append the given string to the last word, ahead of its trailing punctuation.
    case appendAfterLastWord(String)
    /// Append a line break to the text.
    case insertNewline
}

/// A set of spoken phrases that all trigger the same edit.
struct VoiceCommand {
    /// Lowercase, space-separated trigger phrases.
    let phrases: [String]
    /// The edit performed when any of `phrases` ends the utterance.
    let action: EditAction
}

/// Detects a trailing spoken command in a transcript and applies its edit.
enum VoiceCommandProcessor {
    /// The recognized commands, checked in order; the first match wins.
    static let commands: [VoiceCommand] = [
        .init(phrases: ["scratch that", "delete that"], action: .deleteLastWords(1)),
        .init(phrases: ["capitalize that"], action: .capitalizeLastWord),
        .init(phrases: ["slash that"], action: .appendAfterLastWord("/")),
        .init(phrases: ["new line", "new paragraph"], action: .insertNewline),
    ]

    /// A word of the transcript in matching form, tied back to its position in
    /// the original string.
    private struct SpokenWord {
        /// Index in the original input where this word begins.
        let start: String.Index
        /// Lowercased word with trailing `. , ! ? ; :` removed.
        let canonical: String
    }

    /// Looks for a command phrase at the end of `input`.
    ///
    /// Matching is word-based and ignores case, whitespace runs, hyphens between
    /// words ("scratch-that"), and trailing punctuation on each word
    /// ("scratch that.", "new line,"), since speech recognizers commonly emit
    /// those variants.
    ///
    /// - Parameters:
    ///   - input: The transcript to inspect.
    ///   - settings: Source of the `voiceCommandsEnabled` switch.
    /// - Returns: When a command is found, the text preceding it (whitespace
    ///   collapsed to single spaces) and the command's action. Otherwise the
    ///   unmodified `input` and `nil`. An empty `input` yields `("", nil)`.
    static func detect(in input: String, settings: VoiceCommandSettings) -> (stripped: String, action: EditAction?) {
        guard settings.voiceCommandsEnabled else { return (input, nil) }
        guard !input.isEmpty else { return ("", nil) }

        // TODO(v2): support "literal new line" escape hatch
        let words = self.spokenWords(in: input)
        let spoken = words.map { $0.canonical }

        for command in self.commands {
            for phrase in command.phrases {
                let phraseWords = phrase.split(separator: " ").map(String.init)
                guard !phraseWords.isEmpty, phraseWords.count <= spoken.count else { continue }
                guard spoken.suffix(phraseWords.count).elementsEqual(phraseWords) else { continue }

                // Cut at the exact spot where the phrase starts in the original
                // text. Counting whitespace tokens instead would over-delete when
                // the phrase was transcribed as a single hyphenated token.
                let phraseStart = words[words.count - phraseWords.count].start
                return (self.stripPhraseSuffix(startingAt: phraseStart, from: input), command.action)
            }
        }

        return (input, nil)
    }

    /// Applies `action` to `text`.
    ///
    /// Every case except `.insertNewline` re-joins the whitespace-separated
    /// words of `text` with single spaces.
    ///
    /// - Parameters:
    ///   - action: The edit to perform.
    ///   - text: The text to edit.
    ///   - settings: Supplies the word count for `.deleteLastWords`.
    /// - Returns: The edited text. Word-based edits on text without any words
    ///   return `""`.
    static func apply(_ action: EditAction, to text: String, settings: VoiceCommandSettings) -> String {
        switch action {
        case .deleteLastWords:
            // `dropLast` clamps to the word count but traps on a negative
            // argument, so a non-positive setting is treated as "drop nothing".
            let count = max(0, settings.voiceCommandScratchWordCount)
            return self.tokenize(text).dropLast(count).joined(separator: " ")

        case .capitalizeLastWord:
            var tokens = self.tokenize(text)
            guard let last = tokens.last else { return "" }
            let (stem, punct) = self.stripTrailingPunct(from: last)
            // A last token made only of punctuation has nothing to capitalize.
            guard !stem.isEmpty else { return text }
            let capitalized = String(stem.prefix(1)).uppercased() + stem.dropFirst()
            tokens[tokens.count - 1] = capitalized + punct
            return tokens.joined(separator: " ")

        case let .appendAfterLastWord(suffix):
            var tokens = self.tokenize(text)
            guard let last = tokens.last else { return "" }
            let (stem, punct) = self.stripTrailingPunct(from: last)
            tokens[tokens.count - 1] = stem + suffix + punct
            return tokens.joined(separator: " ")

        case .insertNewline:
            return text + "\n"
        }
    }

    /// Whitespace and hyphens both separate words for command matching.
    private static func isWordSeparator(_ character: Character) -> Bool {
        return character.isWhitespace || character == "-"
    }

    /// Splits `input` into matchable words, remembering where each one starts.
    /// Pieces that consist only of trailing punctuation are skipped.
    private static func spokenWords(in input: String) -> [SpokenWord] {
        var words: [SpokenWord] = []
        var cursor = input.startIndex

        while cursor < input.endIndex {
            if self.isWordSeparator(input[cursor]) {
                cursor = input.index(after: cursor)
                continue
            }

            let start = cursor
            while cursor < input.endIndex, !self.isWordSeparator(input[cursor]) {
                cursor = input.index(after: cursor)
            }

            let canonical = self.stripTrailingPunct(from: input[start..<cursor].lowercased()).stem
            if !canonical.isEmpty {
                words.append(SpokenWord(start: start, canonical: canonical))
            }
        }

        return words
    }

    /// Split on whitespace, filter empty.
    private static func tokenize(_ s: String) -> [String] {
        return s.split(whereSeparator: { $0.isWhitespace }).map(String.init)
    }

    /// Returns (stem, trailingPunct) where trailingPunct is the trailing
    /// punctuation characters stripped from the token (e.g. "word." -> ("word", ".")).
    /// Only strips common trailing punctuation: . , ! ? ; :
    private static func stripTrailingPunct(from token: String) -> (stem: String, punct: String) {
        let punctSet: Set<Character> = [".", ",", "!", "?", ";", ":"]
        var stem = token
        var punct = ""
        while let last = stem.last, punctSet.contains(last) {
            punct = String(last) + punct
            stem.removeLast()
        }
        return (stem, punct)
    }

    /// Returns the part of `input` that precedes the matched command phrase.
    ///
    /// Separators left dangling in front of the phrase (spaces, or the hyphen in
    /// "report-scratch that") are dropped, and the remaining words are joined
    /// with single spaces.
    private static func stripPhraseSuffix(startingAt phraseStart: String.Index, from input: String) -> String {
        var kept = input[..<phraseStart]
        while let last = kept.last, self.isWordSeparator(last) {
            kept.removeLast()
        }
        return self.tokenize(String(kept)).joined(separator: " ")
    }
}
Loading