hack-ink · yvette-carlisle · May 22, 2026 · May 22, 2026 · May 22, 2026
diff --git a/README.md b/README.md
@@ -24,7 +24,7 @@ AI dictation App for macOS (MVP scaffold).
 - Pass-2 finalize pass using `gpt-4o-transcribe` for better punctuation and stability.
 - Optional Pass-3 rewrite for cleaner English output with numeric/proper noun protection.
 - Auto-paste into the app that was frontmost when recording began.
-- Configurable behavior and models via `config.toml`.
+- Configurable behavior and models via Settings-backed `config.toml`.
 
 For the normative product contract, constraints, and gaps, see the
 [Runtime Spec](docs/spec/runtime.md).
@@ -37,8 +37,8 @@ V1 target is **macOS-first** and aligned to the English-only voice input design.
 - Scope: ✅ Native macOS mic capture + OpenAI model pipeline only.
 - Limitation: ✅ Linux/Windows build is intentionally disabled.
 - Limitation: ⚠️ Known gaps are documented in the
-  [Runtime Spec](docs/spec/runtime.md) (runtime action wiring, config write-through,
-  CPAL fallback robustness, and rollout cleanup items).
+  [Runtime Spec](docs/spec/runtime.md) (explicit microphone picker, CPAL fallback
+  robustness, app-rule authoring, and rollout cleanup items).
 
 ## Usage
 
@@ -104,13 +104,14 @@ realtime_target_rate_hz = 24000
 
 [openai]
 api_base_url = "https://api.openai.com/v1"
-realtime_model = "gpt-4o-mini-transcribe"
+realtime_model = "gpt-realtime-2"
 finalize_model = "gpt-4o-transcribe"
 rewrite_model = "gpt-5.2-mini"
 language = "en"
 
 [openai.realtime]
 noise_reduction = "near_field" # near_field | far_field | off
+transcription_model = "gpt-4o-mini-transcribe"
 
 [rewrite]
 enabled = true
@@ -130,14 +131,14 @@ First-run onboarding checklist:
 - Microphone permission in **System Settings → Privacy & Security → Microphone**.
 - Accessibility permission in **Privacy & Security → Accessibility** (for Cmd+V fallback).
 - Input Monitoring permission in **Privacy & Security → Input Monitoring** (for global hotkey hooks).
-- Voxit uses request buttons to guide you through the permission prompts in sequence (Microphone → Accessibility → Input Monitoring); grant each permission and re-check when prompted.
+- Voxit Settings includes shortcut buttons for the relevant macOS privacy panes; grant each permission and re-check before a real dictation run.
 - Verify paste flow after permission grant and restart the app if needed.
 
 For the full guided sequence, see [First Run](docs/runbook/first-run.md).
 
 Runtime configuration remains sourced from `config.toml`. The current Swift Settings
-window persists shell preferences in macOS `UserDefaults`; writing those settings back
-through the Rust config path is a tracked runtime gap.
+window persists shell and model preferences in macOS `UserDefaults` and writes
+supported preferences back to `config.toml` through the Rust host FFI.
 
 ### Interaction
 
@@ -147,8 +148,10 @@ through the Rust config path is a tracked runtime gap.
 - While listening: panel shows live draft text and committed segments.
 - Stop recording: toggle key again or release key in hold mode.
 - Finalize: Pass-2 runs automatically; rewrite runs by default unless disabled in settings.
-- Microphone input selection is persisted in config as `audio.input_device_id` and `audio.input_device_name`.
-- Refresh workflow: the picker list is refreshed at startup and via the **Refresh microphones** control before choosing from a list of input-capable devices.
+- Model choice: Settings exposes editable OpenAI model IDs for realtime voice,
+  realtime transcript, finalize, and rewrite passes.
+- The Swift Settings audio picker currently exposes only the system default microphone;
+  explicit `audio.input_device_id` values set in `config.toml` are still resolved by Rust.
 - Runtime fallback: if a saved explicit device id is unavailable, Voxit falls back to the system default input device and continues recording.
 - Paste behavior: by default paste rewritten text after finalize, or paste raw transcript via available controls.
 - Output target: text is pasted into the app that was frontmost when dictation started.

diff --git a/docs/decisions/contextual-voice-layer.md b/docs/decisions/contextual-voice-layer.md
@@ -21,8 +21,8 @@ Consequences:
 - The main Voxit window is a control center for activity, app rules, profiles,
   glossary, prompt experiments, and debug/evaluation surfaces.
 - The Settings window stays separate and limited to app preferences such as startup,
-  shortcuts, microphone, permissions, account defaults, privacy, logging, and
-  notifications.
+  shortcuts, model choices, microphone, permissions, account defaults, privacy, logging,
+  and notifications.
 - Swift owns the native macOS presentation layer and UI glue. Rust owns durable product
   logic, context classification, prompt profile selection, voice session planning,
   output policy, and provider orchestration.

diff --git a/docs/reference/repository-layout.md b/docs/reference/repository-layout.md
@@ -16,8 +16,9 @@ files.
 ## Top-level surfaces
 
 - `native/macos-host/` holds the SwiftPM native macOS host. It owns platform UI
-  composition, the menu bar extra, the Voxit control-center window, the Settings
-  window, and links Rust through the host FFI static library.
+  composition, the menu bar extra, global hotkey observation, the floating recording
+  HUD, the Voxit control-center window, the Settings window, and links Rust through the
+  host FFI static library.
 - `packages/voxit-core/` holds the shared runtime logic, auth, OpenAI integration, and
   dictation pipeline code. Platform-neutral UI model types and contextual voice
   planning contracts also live here so hosts do not invent divergent state names,

diff --git a/docs/runbook/first-run.md b/docs/runbook/first-run.md
@@ -49,17 +49,17 @@ Verification:
 ## 4. Confirm runtime configuration
 
 - Open **Settings...** from the menu bar menu or press `Cmd+,` to confirm shell
-  preferences and permission shortcuts are available.
+  preferences, model choices, and permission shortcuts are available.
 - Check the config file at:
 
 ```text
 $HOME/Library/Application Support/voxit/config.toml
 ```
 
-- Confirm the default runtime hotkey and audio device settings look reasonable for the
-  machine.
-- If you need an explicit microphone, refresh the device list and select it before the
-  first real dictation run.
+- Confirm the default runtime hotkey, OpenAI model IDs, and system-default audio route
+  look reasonable for the machine.
+- If you need an explicit microphone before the Swift picker exposes one, set
+  `audio.input_device_id` and `audio.input_device_name` in `config.toml`.
 
 ## 5. Verify paste flow
 

diff --git a/docs/spec/contextual-voice.md b/docs/spec/contextual-voice.md
@@ -167,7 +167,7 @@ Swift hosts own:
 
 - menu bar, HUD, main window, and Settings presentation
 - macOS-specific context capture
-- permission prompts and native controls
+- permission panes and native controls
 - rendering Rust-owned snapshots and session plans
 - user confirmation UX
 

diff --git a/docs/spec/runtime.md b/docs/spec/runtime.md
@@ -93,25 +93,27 @@ State transitions:
 
 ### 4.2 Device picker lifecycle
 
-- On startup, the app refreshes available input-capable devices and caches the result.
-- A manual **Refresh microphones** action is available in the UI to repopulate the
-  picker.
-- Picker values map to:
-  - **System default** (`audio.input_device_id = 0`)
-  - an explicit input device id and name pair from a discovered device list
-- Selection changes persist `audio.input_device_name` and `audio.input_device_id` to
-  config.
+- The current Swift Settings audio picker exposes only **System default**
+  (`audio.input_device_id = 0`).
+- Rust can resolve explicit `audio.input_device_id` and `audio.input_device_name` values
+  supplied through config.
 - If a configured device id is invalid or stale when starting recording, the runtime
   falls back to system default and reports fallback in status or logs.
 
 ### 4.3 Pass1 transport
 
 - For each chunk, send `input_audio_buffer.append` payload frames to OpenAI Realtime.
 - Realtime session must be configured with:
+  - `model`: `openai.realtime_model` (default `gpt-realtime-2`)
+  - `reasoning.effort`: the Rust-selected contextual voice plan effort
   - `audio.input.format`: `audio/pcm` with sample rate from config (default `24000`)
-  - `audio.input.noise_reduction`: configured profile (default `near_field`)
-  - `audio.input.transcription.model`: Pass1 model
+  - `audio.input.noise_reduction`: configured profile (default `near_field`) or `null`
+    when set to `off`
+  - `audio.input.transcription.model`: `openai.realtime.transcription_model` (default
+    `gpt-4o-mini-transcribe`)
+  - `audio.input.transcription.language`: `openai.language` (default `en`)
   - `audio.input.turn_detection.type`: `server_vad`
+  - `audio.input.turn_detection.create_response`: `false`
 - Realtime events consumed by the UI:
   - `conversation.item.input_audio_transcription.delta` (draft)
   - `conversation.item.input_audio_transcription.completed` (committed)
@@ -167,8 +169,10 @@ State transitions:
 
 - Hotkey chord handling:
   - supported mode switch: toggle or hold
-  - the menu command uses the configured `hotkey.chord` presentation
-  - system-wide hotkey capture is not active yet
+  - system-wide and app-local key monitors observe the configured `hotkey.chord`
+  - pressing the chord presents the non-activating floating recording HUD and starts
+    dictation without making Voxit the target-app context
+  - toggle mode stops on the next chord press; hold mode stops on hotkey release
 - Menu bar behavior:
   - `MenuBarExtra` exposes `Open Voxit` (`Cmd+O`), `Settings...` (`Cmd+,`),
     `Start Dictation`, `Stop Dictation`, `Refresh Status` (`Cmd+R`), and `Quit Voxit`
@@ -185,15 +189,14 @@ State transitions:
     controls
   - Voxit control-center window: activity, app rules, profiles, glossary, prompt lab,
     and debug/evaluation surfaces
-  - Settings window: app preferences, shortcuts, microphone, permissions, account
-    defaults, privacy, logging, and notifications
-- Onboarding checklist provides request actions for required macOS permissions. The UI
-  prompts permission requests in order:
-  - Microphone: probe-based request and retry loop when denied
-  - Accessibility: system prompt request plus re-check
-  - Input Monitoring: system prompt request plus re-check
-- Grant each permission in macOS Privacy & Security settings when prompted, then
-  re-check in Voxit before continuing.
+  - Settings window: app preferences, shortcuts, model choices, microphone,
+    permissions, account defaults, privacy, logging, and notifications
+- Settings provides shortcut actions for required macOS permission panes:
+  - Microphone
+  - Accessibility
+  - Input Monitoring
+- Grant each permission in macOS Privacy & Security settings, then re-check before
+  continuing to a real dictation run.
 - "Paste raw now" is always available when finalization or rewrite is active and should
   bypass Pass3.
 - The Control Center exposes the current focused context, selected profile, profile
@@ -217,7 +220,7 @@ Supported sections and keys:
   `audio.input_device_id`, `audio.realtime_target_rate_hz`
 - `openai.api_base_url`, `openai.realtime_model`, `openai.finalize_model`,
   `openai.rewrite_model`, `openai.language`
-- `openai.realtime.noise_reduction`
+- `openai.realtime.noise_reduction`, `openai.realtime.transcription_model`
 - `rewrite.enabled`, `rewrite.auto`, `rewrite.guard_numbers`,
   `rewrite.max_output_chars`, `rewrite.style`
 - `paste.lock_frontmost_app`, `paste.method`
@@ -233,7 +236,10 @@ On load:
 Current Swift Settings window:
 
 - persists shell preferences in macOS `UserDefaults`
-- writes supported preferences through the Rust host FFI into `config.toml`
+- exposes editable OpenAI model IDs for realtime voice, realtime transcript, finalize,
+  and rewrite passes
+- writes supported shell and model preferences through the Rust host FFI into
+  `config.toml`
 
 ## 11) CI and Release
 
@@ -253,10 +259,6 @@ Current Swift Settings window:
 
 ## 13) Known Gaps
 
-- System-wide global hotkey capture is not implemented yet; the configured shortcut is
-  currently a Swift menu command.
-- The native HUD does not yet render Pass1 realtime draft/committed transcript events;
-  it shows active profile/state plus raw and final output after Pass2/Pass3.
 - App-rule authoring is not implemented yet; users can refresh focus context and
   manually override the active built-in profile.
 - The Swift Settings audio picker still exposes only System Default even though Rust can

diff --git a/native/macos-host/Sources/VoxitHostBridge/HostFFI.swift b/native/macos-host/Sources/VoxitHostBridge/HostFFI.swift
@@ -70,6 +70,8 @@ public struct HostSnapshot: Equatable, Sendable {
   public var hasFocusedContext: Bool
   public var selectedTextPresent: Bool
   public var hasRawTranscript: Bool
+  public var hasPass1CommittedTranscript: Bool
+  public var hasPass1DraftTranscript: Bool
   public var hasFinalOutput: Bool
   public var hasError: Bool
   public var recordingDurationMS: UInt64
@@ -80,6 +82,8 @@ public struct HostSnapshot: Equatable, Sendable {
   public var focusedElementRole: String?
   public var promptProfileID: String?
   public var promptDirective: String?
+  public var pass1CommittedTranscript: String?
+  public var pass1DraftTranscript: String?
   public var rawTranscript: String?
   public var finalOutput: String?
   public var lastError: String?
@@ -100,6 +104,8 @@ public struct HostSnapshot: Equatable, Sendable {
     hasFocusedContext: Bool,
     selectedTextPresent: Bool,
     hasRawTranscript: Bool,
+    hasPass1CommittedTranscript: Bool,
+    hasPass1DraftTranscript: Bool,
     hasFinalOutput: Bool,
     hasError: Bool,
     recordingDurationMS: UInt64,
@@ -110,6 +116,8 @@ public struct HostSnapshot: Equatable, Sendable {
     focusedElementRole: String?,
     promptProfileID: String?,
     promptDirective: String?,
+    pass1CommittedTranscript: String?,
+    pass1DraftTranscript: String?,
     rawTranscript: String?,
     finalOutput: String?,
     lastError: String?,
@@ -129,6 +137,8 @@ public struct HostSnapshot: Equatable, Sendable {
     self.hasFocusedContext = hasFocusedContext
     self.selectedTextPresent = selectedTextPresent
     self.hasRawTranscript = hasRawTranscript
+    self.hasPass1CommittedTranscript = hasPass1CommittedTranscript
+    self.hasPass1DraftTranscript = hasPass1DraftTranscript
     self.hasFinalOutput = hasFinalOutput
     self.hasError = hasError
     self.recordingDurationMS = recordingDurationMS
@@ -139,6 +149,8 @@ public struct HostSnapshot: Equatable, Sendable {
     self.focusedElementRole = focusedElementRole
     self.promptProfileID = promptProfileID
     self.promptDirective = promptDirective
+    self.pass1CommittedTranscript = pass1CommittedTranscript
+    self.pass1DraftTranscript = pass1DraftTranscript
     self.rawTranscript = rawTranscript
     self.finalOutput = finalOutput
     self.lastError = lastError
@@ -277,6 +289,34 @@ public final class VoxitHostSession {
     return try currentSnapshot()
   }
 
+  public func saveModelPreferences(  // Hands four model ID strings to the host FFI in one call, then returns a snapshot.
+    realtimeModel: String,  // forwarded as the 1st string argument of the FFI call
+    realtimeTranscriptionModel: String,  // forwarded as the 2nd string argument
+    finalizeModel: String,  // forwarded as the 3rd string argument
+    rewriteModel: String  // forwarded as the 4th string argument
+  ) throws -> HostSnapshot {  // errors from requireOk or currentSnapshot propagate to the caller
+    try realtimeModel.withCString { realtime in  // withCString pointers are only valid inside their closure, hence the nesting
+      try realtimeTranscriptionModel.withCString { realtimeTranscription in
+        try finalizeModel.withCString { finalize in
+          try rewriteModel.withCString { rewrite in
+            try requireOk(  // NOTE(review): presumably throws when the FFI status is not OK — confirm against requireOk
+              voxit_host_session_save_model_preferences(
+                handle,
+                realtime,
+                realtimeTranscription,
+                finalize,
+                rewrite
+              ),
+              context: "saving model preferences"
+            )
+          }
+        }
+      }
+    }
+
+    return try currentSnapshot()  // the returned snapshot is whatever currentSnapshot() reports after the save call
+  }
+
   public func setProfileOverride(_ profileKind: PromptProfileKind) throws -> HostSnapshot {
     try requireOk(
       voxit_host_session_set_profile_override(handle, encode(promptProfileKind: profileKind)),
@@ -321,6 +361,8 @@ public final class VoxitHostSession {
       hasFocusedContext: snapshot.has_focused_context != 0,
       selectedTextPresent: snapshot.selected_text_present != 0,
       hasRawTranscript: snapshot.has_raw_transcript != 0,
+      hasPass1CommittedTranscript: snapshot.has_pass1_committed_transcript != 0,
+      hasPass1DraftTranscript: snapshot.has_pass1_draft_transcript != 0,
       hasFinalOutput: snapshot.has_final_output != 0,
       hasError: snapshot.has_error != 0,
       recordingDurationMS: snapshot.recording_duration_ms,
@@ -331,6 +373,8 @@ public final class VoxitHostSession {
       focusedElementRole: try copyString(field: VOXIT_HOST_STRING_FOCUSED_ELEMENT_ROLE),
       promptProfileID: try copyString(field: VOXIT_HOST_STRING_PROMPT_PROFILE_ID),
       promptDirective: try copyString(field: VOXIT_HOST_STRING_PROMPT_DIRECTIVE),
+      pass1CommittedTranscript: try copyString(field: VOXIT_HOST_STRING_PASS1_COMMITTED_TRANSCRIPT),
+      pass1DraftTranscript: try copyString(field: VOXIT_HOST_STRING_PASS1_DRAFT_TRANSCRIPT),
       rawTranscript: try copyString(field: VOXIT_HOST_STRING_RAW_TRANSCRIPT),
       finalOutput: try copyString(field: VOXIT_HOST_STRING_FINAL_OUTPUT),
       lastError: try copyString(field: VOXIT_HOST_STRING_LAST_ERROR),