diff --git a/src/arch/host/configs/library_defconfig b/src/arch/host/configs/library_defconfig index 28c486bec58d..34ea0fe051f3 100644 --- a/src/arch/host/configs/library_defconfig +++ b/src/arch/host/configs/library_defconfig @@ -11,6 +11,7 @@ CONFIG_COMP_IIR=y CONFIG_COMP_IGO_NR=y CONFIG_COMP_LEVEL_MULTIPLIER=y CONFIG_COMP_MFCC=y +CONFIG_COMP_MFCC_VAD=y CONFIG_COMP_MODULE_ADAPTER=y CONFIG_COMP_MULTIBAND_DRC=y CONFIG_COMP_MUX=y diff --git a/src/audio/mfcc/CMakeLists.txt b/src/audio/mfcc/CMakeLists.txt index f8af79d1ca8a..433aa824e713 100644 --- a/src/audio/mfcc/CMakeLists.txt +++ b/src/audio/mfcc/CMakeLists.txt @@ -5,4 +5,7 @@ if(CONFIG_COMP_MFCC STREQUAL "m" AND DEFINED CONFIG_LLEXT) add_dependencies(app mfcc) else() add_local_sources(sof mfcc.c mfcc_setup.c mfcc_common.c mfcc_generic.c mfcc_hifi4.c mfcc_hifi3.c) + if(CONFIG_COMP_MFCC_VAD) + add_local_sources(sof mfcc_vad.c) + endif() endif() diff --git a/src/audio/mfcc/Kconfig b/src/audio/mfcc/Kconfig index f56cadb40de2..821a3d22018d 100644 --- a/src/audio/mfcc/Kconfig +++ b/src/audio/mfcc/Kconfig @@ -24,3 +24,14 @@ config COMP_MFCC The characteristic of the audio features are defined in the binary control blob. Directory tools/tune/mfcc contains a tool to create the configurations. + +config COMP_MFCC_VAD + bool "MFCC Voice Activity Detection" + depends on COMP_MFCC + default y + help + This option enables a Voice Activity Detector (VAD) that operates + on the Mel spectrum values produced by the MFCC component. In Mel-only + output mode the VAD flag is written as the first int32_t value after + the magic header word. The VAD tracks a per-bin noise floor and + detects speech using a weighted energy delta with hangover. 
diff --git a/src/audio/mfcc/mfcc_common.c b/src/audio/mfcc/mfcc_common.c index 1079864e9259..75e027794449 100644 --- a/src/audio/mfcc/mfcc_common.c +++ b/src/audio/mfcc/mfcc_common.c @@ -21,6 +21,10 @@ #include #include +#ifdef CONFIG_COMP_MFCC_VAD +#include +#endif + LOG_MODULE_REGISTER(mfcc_common, CONFIG_SOF_LOG_LEVEL); /* @@ -144,6 +148,10 @@ static int mfcc_stft_process(const struct comp_dev *dev, struct mfcc_comp_data * sat_int32(Q_MULTSR_32X32(s, config->mel_scale, 23, 12, 23)); } +#ifdef CONFIG_COMP_MFCC_VAD + /* Run VAD on the mel log spectrum before further processing */ + state->vad_flag = mfcc_vad_update(&cd->vad, state->mel_log_32); +#endif /* Store Q9.7 version in mel_spectra for s16 output mode */ for (j = 0; j < state->dct.num_in; j++) state->mel_spectra->data[j] = @@ -282,10 +290,14 @@ void mfcc_s16_default(struct processing_module *mod, struct input_stream_buffer /* If new output produced, set up pointer into scratch data and mark magic pending */ if (num_ceps > 0) { - if (state->mel_only) + if (state->mel_only) { state->out_data_ptr = state->mel_spectra->data; - else +#ifdef CONFIG_COMP_MFCC_VAD + state->vad_pending = true; +#endif + } else { state->out_data_ptr = state->cepstral_coef->data; + } state->out_remain = num_ceps; state->magic_pending = true; @@ -301,6 +313,15 @@ void mfcc_s16_default(struct processing_module *mod, struct input_stream_buffer state->magic_pending = false; } +#ifdef CONFIG_COMP_MFCC_VAD + /* Write VAD flag as first value after magic (as two int16_t = one int32_t) */ + if (state->vad_pending && sink_samples >= 2) { + w_ptr = mfcc_sink_copy_data_s16(sink, w_ptr, 2, (int16_t *)&state->vad_flag); + sink_samples -= 2; + state->vad_pending = false; + } +#endif + /* Write cepstral/mel data from scratch buffer */ to_copy = MIN(state->out_remain, sink_samples); if (to_copy > 0) { @@ -386,6 +407,9 @@ void mfcc_s24_default(struct processing_module *mod, struct input_stream_buffer state->mel_log_32[k] >>= 8; state->out_data_ptr_32 = 
state->mel_log_32; +#ifdef CONFIG_COMP_MFCC_VAD + state->vad_pending = true; +#endif } else { state->out_data_ptr = state->cepstral_coef->data; } @@ -404,6 +428,15 @@ void mfcc_s24_default(struct processing_module *mod, struct input_stream_buffer state->magic_pending = false; } +#ifdef CONFIG_COMP_MFCC_VAD + /* Write VAD flag as first value after magic */ + if (state->vad_pending && sink_samples >= 1) { + w_ptr = mfcc_sink_copy_data_s32(sink, w_ptr, 1, &state->vad_flag); + sink_samples -= 1; + state->vad_pending = false; + } +#endif + if (state->mel_only) { /* Write 32-bit mel data Q9.15, one value per int32_t */ to_copy = MIN(state->out_remain, sink_samples); @@ -461,6 +494,9 @@ void mfcc_s32_default(struct processing_module *mod, struct input_stream_buffer if (num_ceps > 0) { if (state->mel_only) { state->out_data_ptr_32 = state->mel_log_32; +#ifdef CONFIG_COMP_MFCC_VAD + state->vad_pending = true; +#endif } else { state->out_data_ptr = state->cepstral_coef->data; } @@ -479,6 +515,15 @@ void mfcc_s32_default(struct processing_module *mod, struct input_stream_buffer state->magic_pending = false; } +#ifdef CONFIG_COMP_MFCC_VAD + /* Write VAD flag as first value after magic */ + if (state->vad_pending && sink_samples >= 1) { + w_ptr = mfcc_sink_copy_data_s32(sink, w_ptr, 1, &state->vad_flag); + sink_samples -= 1; + state->vad_pending = false; + } +#endif + if (state->mel_only) { /* Write 32-bit mel data Q9.23, one value per int32_t */ to_copy = MIN(state->out_remain, sink_samples); diff --git a/src/audio/mfcc/mfcc_setup.c b/src/audio/mfcc/mfcc_setup.c index 1cad4b2b984e..aa83f93d8e3a 100644 --- a/src/audio/mfcc/mfcc_setup.c +++ b/src/audio/mfcc/mfcc_setup.c @@ -18,6 +18,10 @@ #include #include +#ifdef CONFIG_COMP_MFCC_VAD +#include +#endif + /* Definitions for cepstral lifter */ #define PI_Q23 Q_CONVERT_FLOAT(3.1415926536, 23) #define TWO_PI_Q23 Q_CONVERT_FLOAT(6.2831853072, 23) @@ -346,10 +350,22 @@ int mfcc_setup(struct processing_module *mod, int max_frames, int 
sample_rate, i
 	state->waiting_fill = true;
 	state->prev_samples_valid = false;
 	state->magic_pending = false;
+#ifdef CONFIG_COMP_MFCC_VAD
+	state->vad_pending = false;
+	state->vad_flag = 0;
+#endif
 	state->out_data_ptr = NULL;
 	state->out_data_ptr_32 = NULL;
 	state->out_remain = 0;
 
+#ifdef CONFIG_COMP_MFCC_VAD
+	ret = mfcc_vad_init(&cd->vad, config->num_mel_bins, sample_rate, mod);
+	if (ret < 0) {
+		comp_err(dev, "Failed VAD init");
+		goto free_lifter;
+	}
+#endif
+
 	comp_dbg(dev, "done");
 	return 0;
 
@@ -389,4 +405,8 @@ void mfcc_free_buffers(struct processing_module *mod)
 	mod_free(mod, cd->state.melfb.data);
 	mod_free(mod, cd->state.dct.matrix);
 	mod_free(mod, cd->state.lifter.matrix);
+#ifdef CONFIG_COMP_MFCC_VAD
+	mod_free(mod, cd->vad.noise_floor);
+	mod_free(mod, cd->vad.weights);
+#endif
 }
diff --git a/src/audio/mfcc/mfcc_vad.c b/src/audio/mfcc/mfcc_vad.c
new file mode 100644
index 000000000000..cdcc3d7eaa12
--- /dev/null
+++ b/src/audio/mfcc/mfcc_vad.c
@@ -0,0 +1,272 @@
+// SPDX-License-Identifier: BSD-3-Clause
+//
+// Copyright(c) 2026 Intel Corporation.
+//
+// Author: Seppo Ingalsuo
+
+/**
+ * \file mfcc_vad.c
+ * \brief Voice Activity Detection based on Mel spectrum energy.
+ *
+ * Implements a VAD that tracks per-bin noise floor and computes a
+ * speech-frequency weighted energy above the floor. Speech is declared
+ * when the weighted delta exceeds a threshold, with hangover to prevent
+ * rapid toggling.
+ */
+
+#include
+
+#ifdef CONFIG_COMP_MFCC_VAD
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+LOG_MODULE_DECLARE(mfcc, CONFIG_SOF_LOG_LEVEL);
+
+/**
+ * \brief A-weighting table: 1/3 octave band center frequencies in Hz (Q16.0).
+ *
+ * From IEC 61672-1:2013, source:
+ * https://acousticalengineer.com/a-weighting-table/
+ */
+#define A_WEIGHT_TABLE_SIZE 36
+
+static const int16_t a_weight_hz[A_WEIGHT_TABLE_SIZE] = {
+	6, 8, 10, 13, 16, 20, 25, 32,
+	40, 50, 63, 80, 100, 125, 160, 200,
+	250, 315, 400, 500, 630, 800, 1000, 1250,
+	1600, 2000, 2500, 3150, 4000, 5000, 6300, 8000,
+	10000, 12500, 16000, 20000,
+};
+
+/**
+ * \brief A-weighting linear amplitude, scaled so peak (at 2500 Hz) maps
+ * to INT16_MAX (32767). Original dB values converted via
+ * 10^(dB/20) then scaled by 32767 / max.
+ */
+static const int16_t a_weight_lin[A_WEIGHT_TABLE_SIZE] = {
+	2, 4, 9, 19, 43, 85, 162, 299,
+	531, 862, 1382, 2140, 3129, 4370, 6172, 8136,
+	10362, 13196, 16234, 19518, 22669, 25730, 28212, 30230,
+	31655, 32392, 32767, 32392, 31655, 30230, 27889, 24856,
+	21156, 17196, 13045, 9670,
+};
+
+/**
+ * \brief Compute A-weighted speech-frequency emphasis weights for Mel bins.
+ *
+ * Weights are computed by linearly interpolating the A-weighting table
+ * at each Mel bin center frequency. Output weights are in Q1.15 and
+ * sum to approximately 2^15. A weight that would normalize to exactly
+ * 1.0 (the single bin case) is saturated to INT16_MAX.
+ *
+ * \param[out] weights Output weight array.
+ * \param[in] num_mel Number of Mel bins.
+ * \param[in] sample_rate Sample rate in Hz.
+ */
+static void mfcc_vad_build_weights(int16_t *weights, int num_mel, int sample_rate)
+{
+	int32_t scaled, num;
+	int32_t sum = 0;
+	int32_t nyquist = sample_rate / 2;
+	int16_t f_hz, f0, f1, w, w0, w1, den;
+	int16_t mel_end, mel_step;
+	int i, j;
+
+	/* Reject bad counts before the division by num_mel + 1 below */
+	if (num_mel <= 0)
+		return;
+
+	/* The Hz-to-Mel call takes an int16_t, clamp instead of wrapping negative */
+	if (nyquist > INT16_MAX)
+		nyquist = INT16_MAX;
+
+	mel_end = psy_hz_to_mel((int16_t)nyquist);
+	mel_step = mel_end / (num_mel + 1);
+
+	for (i = 0; i < num_mel; i++) {
+		f_hz = psy_mel_to_hz((int16_t)((i + 1) * mel_step));
+
+		/* Find the table interval containing f_hz and interpolate */
+		if (f_hz <= a_weight_hz[0]) {
+			w = a_weight_lin[0];
+		} else if (f_hz >= a_weight_hz[A_WEIGHT_TABLE_SIZE - 1]) {
+			w = a_weight_lin[A_WEIGHT_TABLE_SIZE - 1];
+		} else {
+			/* Find j such that a_weight_hz[j] <= f_hz < a_weight_hz[j+1] */
+			for (j = 0; j < A_WEIGHT_TABLE_SIZE - 2; j++) {
+				if (f_hz < a_weight_hz[j + 1])
+					break;
+			}
+
+			/* Linear interpolation: w = w0 + (w1 - w0) * (f - f0) / (f1 - f0) */
+			f0 = a_weight_hz[j];
+			f1 = a_weight_hz[j + 1];
+			w0 = a_weight_lin[j];
+			w1 = a_weight_lin[j + 1];
+			num = (int32_t)(w1 - w0) * (f_hz - f0);
+			den = f1 - f0;
+			w = w0 + (int16_t)(num / den);
+		}
+
+		weights[i] = w;
+		sum += w;
+	}
+
+	/* Normalize weights so they sum to 1.0. With a single bin the result
+	 * is exactly 1.0 (32768), which does not fit Q1.15, so saturate.
+	 */
+	for (i = 0; i < num_mel; i++) {
+		scaled = ((int32_t)weights[i] << 16) / sum; /* Q1.16 */
+		scaled = Q_SHIFT_RND(scaled, 16, 15); /* Round to Q1.15 */
+		if (scaled > INT16_MAX)
+			scaled = INT16_MAX;
+
+		weights[i] = (int16_t)scaled;
+	}
+}
+
+int mfcc_vad_init(struct mfcc_vad_state *vad, int num_mel_bins, int sample_rate,
+		  struct processing_module *mod)
+{
+	if (!vad)
+		return -EINVAL;
+
+	/* Keep the buffer pointers defined on every exit path, the MFCC
+	 * buffer cleanup passes both of them to mod_free() unconditionally.
+	 */
+	vad->noise_floor = NULL;
+	vad->weights = NULL;
+
+	/* The bin count is stored in an int16_t field */
+	if (num_mel_bins <= 0 || num_mel_bins > INT16_MAX)
+		return -EINVAL;
+
+	vad->num_mel_bins = num_mel_bins;
+	vad->energy_threshold = MFCC_VAD_ENERGY_THRESHOLD;
+	vad->noise_rise_alpha_slow = MFCC_VAD_NOISE_RISE_ALPHA;
+	vad->noise_rise_alpha_fast = MFCC_VAD_NOISE_RISE_ALPHA_FAST;
+	vad->hangover_max = MFCC_VAD_HANGOVER_FRAMES;
+	vad->hangover_counter = 0;
+	vad->init_frames = MFCC_VAD_NOISE_INIT_FRAMES;
+	vad->frame_count = 0;
+	vad->is_speech = false;
+	vad->initialized = false;
+
+	/* Allocate per-bin noise floor */
+	vad->noise_floor = mod_zalloc(mod, num_mel_bins * sizeof(int32_t));
+	if (!vad->noise_floor)
+		return -ENOMEM;
+
+	/* Allocate and compute per-bin weights */
+	vad->weights = mod_zalloc(mod, num_mel_bins * sizeof(int16_t));
+	if (!vad->weights) {
+		mod_free(mod, vad->noise_floor);
+		vad->noise_floor = NULL;
+		return -ENOMEM;
+	}
+
+	mfcc_vad_build_weights(vad->weights, num_mel_bins, sample_rate);
+	return 0;
+}
+
+int mfcc_vad_update(struct mfcc_vad_state *vad, const int32_t *mel_log)
+{
+	int64_t energy_delta = 0;
+	int32_t delta;
+	int32_t p;
+	int16_t alpha;
+	int i;
+
+	if (!vad || !mel_log || !vad->noise_floor || !vad->weights)
+		return 0;
+
+	/* Saturate instead of overflowing, only the startup phase compares it */
+	if (vad->frame_count < INT32_MAX)
+		vad->frame_count++;
+
+	/* Initialize noise floor to first frame */
+	if (!vad->initialized) {
+		for (i = 0; i < vad->num_mel_bins; i++)
+			vad->noise_floor[i] = mel_log[i];
+
+		vad->initialized = true;
+	}
+
+	/* Select rise alpha based on convergence phase */
+	if (vad->frame_count <= vad->init_frames)
+		alpha = vad->noise_rise_alpha_fast;
+	else
+		alpha = vad->noise_rise_alpha_slow;
+
+	/* Update noise floor: follow down instantly, rise slowly */
+	for (i = 0; i < vad->num_mel_bins; i++) {
+		if (mel_log[i] < vad->noise_floor[i]) {
+			/* Instant follow-down */
+			vad->noise_floor[i] = mel_log[i];
+		} else {
+			/* Slow rise: floor += alpha * (mel - floor)
+			 * Q9.23 + Q1.15 * Q9.23 => need Q9.23 result
+			 * alpha is Q1.15, delta is Q9.23
+			 */
+			delta = mel_log[i] - vad->noise_floor[i];
+			p = (int32_t)Q_MULTSR_32X32((int64_t)alpha, delta, 15, 23, 23);
+			vad->noise_floor[i] += p;
+		}
+	}
+
+	/* Compute weighted energy delta above noise floor.
+	 * energy_delta = sum(weights[i] * (mel[i] - noise_floor[i]))
+	 * weights are Q1.15, mel delta is Q9.23
+	 * Product is Q10.38, accumulate in int64_t then shift to Q9.23
+	 */
+	for (i = 0; i < vad->num_mel_bins; i++) {
+		delta = mel_log[i] - vad->noise_floor[i];
+		if (delta > 0)
+			energy_delta += (int64_t)vad->weights[i] * delta;
+	}
+
+	/* Round accumulated energy from Q10.38 to Q9.23, saturate to int32 */
+	energy_delta = sat_int32(Q_SHIFT_RND(energy_delta, 38, 23));
+
+	if (energy_delta > vad->energy_threshold) {
+		vad->hangover_counter = vad->hangover_max;
+		vad->is_speech = true;
+	} else {
+		if (vad->hangover_counter > 0) {
+			vad->hangover_counter--;
+			vad->is_speech = true;
+		} else {
+			vad->is_speech = false;
+		}
+	}
+
+	return vad->is_speech ? 1 : 0;
+}
+
+void mfcc_vad_reset(struct mfcc_vad_state *vad)
+{
+	int i;
+
+	if (!vad)
+		return;
+
+	vad->frame_count = 0;
+	vad->hangover_counter = 0;
+	vad->is_speech = false;
+	vad->initialized = false;
+
+	/* Nothing to clear when the noise floor buffer is not allocated */
+	if (!vad->noise_floor)
+		return;
+
+	for (i = 0; i < vad->num_mel_bins; i++)
+		vad->noise_floor[i] = 0;
+}
+
+#endif /* CONFIG_COMP_MFCC_VAD */
diff --git a/src/audio/mfcc/tune/README.md b/src/audio/mfcc/tune/README.md
new file mode 100644
index 000000000000..5fef841efff1
--- /dev/null
+++ b/src/audio/mfcc/tune/README.md
@@ -0,0 +1,98 @@
+# SOF MFCC Tuning Tools
+
+This directory contains a tool to create configuration blob for SOF
+MFCC component. It's simply run in Matlab or Octave with command
+`setup_mfcc`. The MFCC configuration parameters can be edited from the
+script.
+
+## Testbench
+
+The configuration can be test run with testbench. First the test topologies
+need to be created with `scripts/build-tools.sh -t`. Next the testbench
+is built with `scripts/rebuild-testbench.sh`.
+
+Once the previous steps are done, a sample wav file can be processed
+with script `run_mfcc.sh`.
The script converts the input to raw 16 kHz +stereo format and runs the testbench for S16, S24, and S32 bit depths, +producing both cepstral coefficient (MFCC) and Mel spectrogram outputs. + +``` +./run_mfcc.sh /usr/share/sounds/alsa/Front_Center.wav +``` + +Output files from host testbench: + +| File | Content | +|------|---------| +| `mfcc_s16.raw`, `mfcc_s24.raw`, `mfcc_s32.raw` | Cepstral coefficients | +| `mel_s16.raw`, `mel_s24.raw`, `mel_s32.raw` | Mel spectrogram | + +If the `XTENSA_PATH` environment variable is set, the script also runs +the Xtensa build of the testbench (via `xt-run`) and produces additional +output files prefixed with `xt_`: + +| File | Content | +|------|---------| +| `xt_mfcc_s16.raw`, `xt_mfcc_s24.raw`, `xt_mfcc_s32.raw` | Cepstral coefficients | +| `xt_mel_s16.raw`, `xt_mel_s24.raw`, `xt_mel_s32.raw` | Mel spectrogram | + +## Decoding and Plotting + +All output files can be decoded and plotted at once in Matlab or Octave +with the `decode_all.m` script: + +```matlab +decode_all +``` + +This calls `decode_ceps` for each MFCC file (13 cepstral coefficients) and +`decode_mel` for each Mel file (80 Mel bins), plotting spectrograms for all +files that exist including the Xtensa variants. + +Individual files can also be decoded manually: + +```matlab +[ceps, t, n] = decode_ceps('mfcc_s16.raw', 13); +``` + +In the above it's known from configuration script that MFCC was set up to +output 13 cepstral coefficients from each FFT → Mel → DCT → Cepstral +coefficients computation run. + +The 80 bands Mel output can be visualized with command: + +```matlab +[mel, t, n] = decode_mel('mel_s16.raw', 80); +``` + +## Live Whisper Transcription with DSP VAD + +The directory contains a Python script `sof_mel_to_text_live_dsp_vad.py`. +It can be used with development topologies +`sof-arl-cs42l43-l0-cs35l56-l23-mfcc.tplg` and +`sof-mtl-rt713-l0-rt1316-l12-mfcc.tplg`. 
It captures Mel audio features and VAD flags from the default audio +device `hw:0,47` (headset microphone). +The captured frames with detected speech are sent to Whisper speech +recognizer model for conversion to text. + +### Prerequisites + +The script needs OpenVINO. Please follow the install procedure from +the official OpenVINO documentation. + +The following Python pip installs are needed into the same OpenVINO venv: + +```bash +pip install openvino openvino-tokenizers openvino-genai +pip install optimum[intel] +pip install transformers +pip install huggingface_hub +``` + +### NPU / GPU Support + +The script by default runs the Whisper encoder model in the NPU. To +use the NPU, install the driver from +the Intel NPU driver documentation. If the NPU is not +available, change the encoder to CPU with run option `--encoder-device CPU`. +With a GPU both `--encoder-device GPU` and `--decoder-device GPU` can be set. diff --git a/src/audio/mfcc/tune/README.txt b/src/audio/mfcc/tune/README.txt deleted file mode 100644 index a0c3189e81a3..000000000000 --- a/src/audio/mfcc/tune/README.txt +++ /dev/null @@ -1,52 +0,0 @@ -This directory contains a tool to create configuration blob for SOF -MFCC component. It's simply run in Matlab or Octave with command -"setup_mfcc". The MFCC configuration parameters can be edited from the -script. - -The configuration can be test run with testbench. First the test topologies -need to be created with "scripts/build-tools.sh -t". Next the testbench -is build with "scripts/rebuild-testbench.sh". - -Once the previous steps are done, a sample wav file can be processed -with script run_mfcc.sh. The script converts the input to raw 16 kHz -stereo format and runs the testbench for S16, S24, and S32 bit depths, -producing both cepstral coefficient (MFCC) and Mel spectrogram outputs. 
- -./run_mfcc.sh /usr/share/sounds/alsa/Front_Center.wav - -Output files from host testbench: - mfcc_s16.raw, mfcc_s24.raw, mfcc_s32.raw - cepstral coefficients - mel_s16.raw, mel_s24.raw, mel_s32.raw - Mel spectrogram - -If the XTENSA_PATH environment variable is set, the script also runs -the Xtensa build of the testbench (via xt-run) and produces additional -output files prefixed with "xt_": - xt_mfcc_s16.raw, xt_mfcc_s24.raw, xt_mfcc_s32.raw - xt_mel_s16.raw, xt_mel_s24.raw, xt_mel_s32.raw - -All output files can be decoded and plotted at once in Matlab or Octave -with the decode_all.m script: - -decode_all - -This calls decode_ceps for each MFCC file (13 cepstral coefficients) and -decode_mel for each Mel file (80 Mel bins), plotting spectrograms for all -files that exist including the Xtensa variants. - -Individual files can also be decoded manually: - -[ceps, t, n] = decode_ceps('mfcc_s16.raw', 13); - -In the above it's known from configuration script that MFCC was set up to -output 13 cepstral coefficients from each FFT -> Mel -> DCT -> Cepstral -coefficients computation run. - -The 80 bands Mel output can be visualized with command: - -[mel, t, n] = decode_mel('mel_s16.raw', 80); - -Other kind of signals have quite big visual difference in audio features. Try -e.g. other sound files found in computer. 
- -./run_mfcc.sh /usr/share/sounds/gnome/default/alerts/bark.ogg -./run_mfcc.sh /usr/share/sounds/gnome/default/alerts/sonar.ogg diff --git a/src/audio/mfcc/tune/decode_mel.m b/src/audio/mfcc/tune/decode_mel.m index f6a723aa2040..409fbccd9a52 100644 --- a/src/audio/mfcc/tune/decode_mel.m +++ b/src/audio/mfcc/tune/decode_mel.m @@ -1,4 +1,4 @@ -% [mel, t, n] = decode_mel(fn, num_mel, fmt, num_channels) +% [mel, t, n, vad] = decode_mel(fn, num_mel, fmt, num_channels) % % Input % fn - File with Mel data in .raw or .wav format @@ -10,17 +10,18 @@ % mel - Mel coefficients % t - time vector for plotting % n - mel 1..num_mel vector for plotting +% vad - VAD flag per frame from DSP % SPDX-License-Identifier: BSD-3-Clause % Copyright(c) 2026 Intel Corporation. -function [mel, t, n] = decode_mel(fn, num_mel, fmt, num_channels) +function [mel, t, n, vad] = decode_mel(fn, num_mel, fmt, num_channels) if nargin < 3 fmt = 's16'; end if nargin < 4 - num_channels = 1; + num_channels = 2; end % MFCC stream @@ -74,27 +75,43 @@ num_frames = num_frames - 1; end -t_mel = period_mel / num_channels / fs; -t = (0:num_frames -1) * t_mel; -n = 1:num_mel; +% VAD flag is first int32 after magic, followed by num_mel coefficients +payload_len = 1 + num_mel; -mel = zeros(num_mel, num_frames); +payload = zeros(payload_len, num_frames); for i = 1:num_frames i1 = idx(i) + num_magic; - i2 = i1 + num_mel - 1; - mel(:,i) = double(data(i1:i2)) / 2^qformat; + i2 = i1 + payload_len - 1; + payload(:,i) = double(data(i1:i2)); end -figure; +vad = payload(1, :); +mel = payload(2:payload_len, :) / 2^qformat; + +t_mel = period_mel / num_channels / fs; +t = (0:num_frames -1) * t_mel; +n = 1:num_mel; + +%figure(1); +figure imagesc(t, n, mel); axis xy; colormap(jet); colorbar; tstr = sprintf('SOF MFCC Mel coefficients (%s)', fn); title(tstr, 'Interpreter', 'None'); -xlabel('Time (s)'); ylabel('Mel coef #'); +figure +level = sum(mel(:,:)); +plot(t, vad) +ax = axis(); +axis([ax(1:2) -0.1 1.1]); +grid on; +title(tstr, 
'Interpreter', 'None'); +xlabel('Time (s)'); +ylabel('VAD flag'); + end function [data, num_channels] = get_file(fn, num_channels, fmt) diff --git a/src/audio/mfcc/tune/sof_mel_to_text_live_dsp_vad.py b/src/audio/mfcc/tune/sof_mel_to_text_live_dsp_vad.py new file mode 100644 index 000000000000..eeafb28f0b75 --- /dev/null +++ b/src/audio/mfcc/tune/sof_mel_to_text_live_dsp_vad.py @@ -0,0 +1,454 @@ +"""Live SOF mel capture with DSP VAD-triggered Whisper transcription. + +Captures mel frames from ALSA with embedded VAD flag from the DSP. +Frame format: [magic(int32), vad_flag(int32), mel[0..79](int32)] +When silence of 100ms is detected after speech, sends the buffered mel +features to Whisper (OpenVINO encoder+decoder) for transcription. +Capture continues running during Whisper inference. + +Usage: + python sof_mel_to_text_live_dsp_vad.py [--device hw:0,47] [--model whisper-medium-int4-ov] + python sof_mel_to_text_live_dsp_vad.py --plot # with live spectrogram +""" + +import argparse +import os +import struct +import subprocess +import threading +import time +import numpy as np +import openvino as ov +import huggingface_hub as hf_hub +from pathlib import Path + +# Graphics imports deferred until --plot is used +matplotlib = None +plt = None + +# SOF mel_s32.raw format constants (with DSP VAD flag) +SOF_MAGIC_S32 = np.int32(0x6D666363) # ASCII 'mfcc' as int32 +SOF_MAGIC_BYTES = struct.pack(' 3: + buf = buf[-3:] + return None, None, buf + end = idx + SOF_FRAME_BYTES + if end > len(buf): + buf = buf[idx:] + return None, None, buf + # Parse VAD flag (first int32 after magic) + vad_flag = struct.unpack_from('> \"{text}\"\n", flush=True) + else: + print(" [Whisper] empty result", flush=True) + + try: + while True: + data = proc.stdout.read(read_chunk) + if not data: + rc = proc.poll() + if rc is not None: + stderr_out = proc.stderr.read().decode(errors='replace') + print(f"\narecord exited with code {rc}") + if stderr_out: + print(f"stderr: {stderr_out}") + break + 
continue + + buf += data + + while True: + vad_flag, frame_ints, buf = find_frame_in_buffer(buf) + if frame_ints is None: + break + + frame_num += 1 + mel = decode_mel_frame(frame_ints) + speech = vad_flag != 0 + + # Print VAD transitions when not plotting + if plotter is None and speech != prev_speech: + t = frame_num * 0.01 + tag = "SPEECH" if speech else "SILENCE" + print(f" [{t:7.2f}s] {tag}", flush=True) + prev_speech = speech + + # Update plot + if plotter is not None: + plotter.update(mel, speech) + + # --- Speech buffering logic --- + if speech: + speech_buffer.append(mel.copy()) + silence_counter = 0 + was_speaking = True + else: + if was_speaking: + silence_counter += 1 + if silence_counter >= SILENCE_TRIGGER_FRAMES: + n = len(speech_buffer) + duration = n * 0.01 + t = frame_num * 0.01 + + if n < MIN_SPEECH_FRAMES: + # Too short — discard + speech_buffer.clear() + silence_counter = 0 + was_speaking = False + continue + + # Silence threshold reached — send to Whisper + print(f" [{t:7.2f}s] Transcribing {n} frames " + f"({duration:.1f}s)...", flush=True) + + if not transcriber.is_busy(): + frames_copy = list(speech_buffer) + transcriber.transcribe_async( + frames_copy, on_transcription) + else: + print(f" [{t:7.2f}s] (Whisper busy, " + f"dropping {n} frames)", flush=True) + + speech_buffer.clear() + silence_counter = 0 + was_speaking = False + + except (KeyboardInterrupt, BrokenPipeError): + pass + finally: + if proc.poll() is None: + proc.terminate() + try: + proc.wait(timeout=3) + except subprocess.TimeoutExpired: + proc.kill() + proc.wait() + if plotter is not None: + try: + plt.close(plotter.fig) + except Exception: + pass + print("\n\nCapture stopped.") + + +def main(): + parser = argparse.ArgumentParser( + description="Live SOF mel capture with DSP VAD-triggered Whisper transcription") + parser.add_argument('--device', '-D', default='hw:0,47', + help='ALSA capture device (default: hw:0,47)') + parser.add_argument('--rate', '-r', type=int, 
default=16000, + help='Sample rate for arecord (default: 16000)') + parser.add_argument('--model', '-m', default='whisper-medium-int4-ov', + help='Path to Whisper OpenVINO model directory') + parser.add_argument('--encoder-device', default='NPU', + help='OpenVINO device for encoder (default: NPU)') + parser.add_argument('--decoder-device', default='CPU', + help='OpenVINO device for decoder (default: CPU)') + parser.add_argument('--plot', action='store_true', + help='Show live scrolling mel spectrogram and VAD plot') + args = parser.parse_args() + model_id = "OpenVINO/" + os.path.basename(args.model) + if not os.path.isdir(args.model): + print(f"Downloading model {model_id} ...") + hf_hub.snapshot_download(model_id, local_dir=args.model) + + print("=== Live SOF Mel → Whisper Transcription (DSP VAD) ===\n") + run_capture(args.device, args.rate, args.model, args.encoder_device, + args.decoder_device, enable_plot=args.plot) + + +if __name__ == '__main__': + main() diff --git a/src/include/sof/audio/mfcc/mfcc_comp.h b/src/include/sof/audio/mfcc/mfcc_comp.h index 025eef116752..e0617e0f026f 100644 --- a/src/include/sof/audio/mfcc/mfcc_comp.h +++ b/src/include/sof/audio/mfcc/mfcc_comp.h @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -106,6 +107,10 @@ struct mfcc_state { bool waiting_fill; /**< booleans */ bool prev_samples_valid; bool magic_pending; /**< True when magic word not yet written for current output */ +#ifdef CONFIG_COMP_MFCC_VAD + bool vad_pending; /**< True when VAD flag not yet written for current output */ + int32_t vad_flag; /**< Current VAD result: 1 = speech, 0 = silence */ +#endif size_t sample_buffers_size; /**< bytes */ int16_t *out_data_ptr; /**< Read pointer into scratch data for multi-period output */ int32_t *out_data_ptr_32; /**< Read pointer for 32-bit mel-only output */ @@ -115,6 +120,9 @@ struct mfcc_state { /* MFCC component private data */ struct mfcc_comp_data { struct mfcc_state state; +#ifdef 
CONFIG_COMP_MFCC_VAD + struct mfcc_vad_state vad; +#endif struct comp_data_blob_handler *model_handler; struct sof_mfcc_config *config; int max_frames; diff --git a/src/include/sof/audio/mfcc/mfcc_vad.h b/src/include/sof/audio/mfcc/mfcc_vad.h new file mode 100644 index 000000000000..6eac1ae08a15 --- /dev/null +++ b/src/include/sof/audio/mfcc/mfcc_vad.h @@ -0,0 +1,101 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * + * Copyright(c) 2026 Intel Corporation. + * + * Author: Seppo Ingalsuo + */ + +/** + * \file mfcc_vad.h + * \brief Voice Activity Detection based on Mel spectrum energy. + * + * This VAD operates on the Q9.23 Mel log spectrum values produced by + * the MFCC component. It tracks a per-bin noise floor that follows + * the signal downward instantly and rises slowly, then computes a + * speech-weighted energy delta above the floor. + */ + +#ifndef __SOF_AUDIO_MFCC_MFCC_VAD_H__ +#define __SOF_AUDIO_MFCC_MFCC_VAD_H__ + +#include +#include + +#ifdef CONFIG_COMP_MFCC_VAD + +struct processing_module; + +/** + * \brief Number of frames for fast noise floor convergence at startup (~1 s at 10 ms/frame). + */ +#define MFCC_VAD_NOISE_INIT_FRAMES 100 + +/** + * \brief Slow noise floor rise coefficient in Q1.15 (0.0010 * 32768 = 33). + */ +#define MFCC_VAD_NOISE_RISE_ALPHA 33 + +/** + * \brief Fast noise floor rise coefficient in Q1.15 (0.05 * 32768 = 1638). + */ +#define MFCC_VAD_NOISE_RISE_ALPHA_FAST 1638 + +/** + * \brief Energy threshold for speech detection in Q9.23 (0.35 * 2^23 = 2936013). + */ +#define MFCC_VAD_ENERGY_THRESHOLD 2936013 + +/** + * \brief Hangover frame count to keep VAD active after last speech detection. + */ +#define MFCC_VAD_HANGOVER_FRAMES 20 + +/** + * \brief VAD state structure. 
+ */ +struct mfcc_vad_state { + int32_t *noise_floor; /**< Per-bin noise floor in Q9.23 */ + int16_t *weights; /**< Speech-frequency emphasis weights Q1.15 */ + int32_t energy_threshold; /**< Energy threshold Q9.23 */ + int16_t noise_rise_alpha_slow; /**< Slow rise alpha Q1.15 */ + int16_t noise_rise_alpha_fast; /**< Fast rise alpha Q1.15 */ + int16_t hangover_max; /**< Maximum hangover frames */ + int16_t hangover_counter; /**< Current hangover counter */ + int16_t num_mel_bins; /**< Number of Mel bins in use */ + int16_t init_frames; /**< Number of initial frames for fast convergence */ + int32_t frame_count; /**< Total frames processed */ + bool is_speech; /**< Current VAD decision */ + bool initialized; /**< True after first frame processed */ +}; + +/** + * \brief Initialize VAD state. + * + * \param[out] vad Pointer to VAD state to initialize. + * \param[in] num_mel_bins Number of Mel bins. + * \param[in] sample_rate Audio sample rate in Hz. + * \param[in] mod Processing module for memory allocation. + * \return 0 on success, negative error code on failure. + */ +int mfcc_vad_init(struct mfcc_vad_state *vad, int num_mel_bins, int sample_rate, + struct processing_module *mod); + +/** + * \brief Process one Mel spectrum frame and update VAD decision. + * + * \param[in,out] vad Pointer to VAD state. + * \param[in] mel_log Mel log spectrum in Q9.23, array of num_mel_bins values. + * \return 1 if speech detected, 0 if silence. + */ +int mfcc_vad_update(struct mfcc_vad_state *vad, const int32_t *mel_log); + +/** + * \brief Reset VAD state without changing configuration. + * + * \param[in,out] vad Pointer to VAD state. + */ +void mfcc_vad_reset(struct mfcc_vad_state *vad); + +#endif /* CONFIG_COMP_MFCC_VAD */ + +#endif /* __SOF_AUDIO_MFCC_MFCC_VAD_H__ */