diff --git a/src/arch/host/configs/library_defconfig b/src/arch/host/configs/library_defconfig
index 28c486bec58d..34ea0fe051f3 100644
--- a/src/arch/host/configs/library_defconfig
+++ b/src/arch/host/configs/library_defconfig
@@ -11,6 +11,7 @@ CONFIG_COMP_IIR=y
 CONFIG_COMP_IGO_NR=y
 CONFIG_COMP_LEVEL_MULTIPLIER=y
 CONFIG_COMP_MFCC=y
+CONFIG_COMP_MFCC_VAD=y
 CONFIG_COMP_MODULE_ADAPTER=y
 CONFIG_COMP_MULTIBAND_DRC=y
 CONFIG_COMP_MUX=y
diff --git a/src/audio/mfcc/CMakeLists.txt b/src/audio/mfcc/CMakeLists.txt
index f8af79d1ca8a..433aa824e713 100644
--- a/src/audio/mfcc/CMakeLists.txt
+++ b/src/audio/mfcc/CMakeLists.txt
@@ -5,4 +5,7 @@ if(CONFIG_COMP_MFCC STREQUAL "m" AND DEFINED CONFIG_LLEXT)
   add_dependencies(app mfcc)
 else()
   add_local_sources(sof mfcc.c mfcc_setup.c mfcc_common.c mfcc_generic.c mfcc_hifi4.c mfcc_hifi3.c)
+  if(CONFIG_COMP_MFCC_VAD)
+    add_local_sources(sof mfcc_vad.c)
+  endif()
 endif()
diff --git a/src/audio/mfcc/Kconfig b/src/audio/mfcc/Kconfig
index f56cadb40de2..821a3d22018d 100644
--- a/src/audio/mfcc/Kconfig
+++ b/src/audio/mfcc/Kconfig
@@ -24,3 +24,14 @@ config COMP_MFCC
 	  The characteristic of the audio features are defined in the binary
 	  control blob. Directory tools/tune/mfcc contains a tool to create
 	  the configurations.
+
+config COMP_MFCC_VAD
+	bool "MFCC Voice Activity Detection"
+	depends on COMP_MFCC
+	default y
+	help
+	  This option enables a Voice Activity Detector (VAD) that operates
+	  on the Mel spectrum values produced by the MFCC component. In Mel
+	  output mode the VAD flag follows the magic header word as one
+	  int32_t value. The VAD tracks a per-bin noise floor and detects
+	  speech using a weighted energy delta with hangover.
diff --git a/src/audio/mfcc/mfcc_common.c b/src/audio/mfcc/mfcc_common.c
index 1079864e9259..75e027794449 100644
--- a/src/audio/mfcc/mfcc_common.c
+++ b/src/audio/mfcc/mfcc_common.c
@@ -21,6 +21,10 @@
 #include <stddef.h>
 #include <stdint.h>
 
+#ifdef CONFIG_COMP_MFCC_VAD
+#include <sof/audio/mfcc/mfcc_vad.h>
+#endif
+
 LOG_MODULE_REGISTER(mfcc_common, CONFIG_SOF_LOG_LEVEL);
 
 /*
@@ -144,6 +148,10 @@ static int mfcc_stft_process(const struct comp_dev *dev, struct mfcc_comp_data *
 					sat_int32(Q_MULTSR_32X32(s, config->mel_scale, 23, 12, 23));
 			}
 
+#ifdef CONFIG_COMP_MFCC_VAD
+			/* Run VAD on the mel log spectrum before further processing */
+			state->vad_flag = mfcc_vad_update(&cd->vad, state->mel_log_32);
+#endif
 			/* Store Q9.7 version in mel_spectra for s16 output mode */
 			for (j = 0; j < state->dct.num_in; j++)
 				state->mel_spectra->data[j] =
@@ -282,10 +290,14 @@ void mfcc_s16_default(struct processing_module *mod, struct input_stream_buffer
 
 	/* If new output produced, set up pointer into scratch data and mark magic pending */
 	if (num_ceps > 0) {
-		if (state->mel_only)
+		if (state->mel_only) {
 			state->out_data_ptr = state->mel_spectra->data;
-		else
+#ifdef CONFIG_COMP_MFCC_VAD
+			state->vad_pending = true;
+#endif
+		} else {
 			state->out_data_ptr = state->cepstral_coef->data;
+		}
 
 		state->out_remain = num_ceps;
 		state->magic_pending = true;
@@ -301,6 +313,15 @@ void mfcc_s16_default(struct processing_module *mod, struct input_stream_buffer
 		state->magic_pending = false;
 	}
 
+#ifdef CONFIG_COMP_MFCC_VAD
+	/* Write VAD flag as first value after magic (as two int16_t = one int32_t) */
+	if (state->vad_pending && sink_samples >= 2) {
+		w_ptr = mfcc_sink_copy_data_s16(sink, w_ptr, 2, (int16_t *)&state->vad_flag);
+		sink_samples -= 2;
+		state->vad_pending = false;
+	}
+#endif
+
 	/* Write cepstral/mel data from scratch buffer */
 	to_copy = MIN(state->out_remain, sink_samples);
 	if (to_copy > 0) {
@@ -386,6 +407,9 @@ void mfcc_s24_default(struct processing_module *mod, struct input_stream_buffer
 				state->mel_log_32[k] >>= 8;
 
 			state->out_data_ptr_32 = state->mel_log_32;
+#ifdef CONFIG_COMP_MFCC_VAD
+			state->vad_pending = true;
+#endif
 		} else {
 			state->out_data_ptr = state->cepstral_coef->data;
 		}
@@ -404,6 +428,15 @@ void mfcc_s24_default(struct processing_module *mod, struct input_stream_buffer
 		state->magic_pending = false;
 	}
 
+#ifdef CONFIG_COMP_MFCC_VAD
+	/* Write VAD flag as first value after magic */
+	if (state->vad_pending && sink_samples >= 1) {
+		w_ptr = mfcc_sink_copy_data_s32(sink, w_ptr, 1, &state->vad_flag);
+		sink_samples -= 1;
+		state->vad_pending = false;
+	}
+#endif
+
 	if (state->mel_only) {
 		/* Write 32-bit mel data Q9.15, one value per int32_t */
 		to_copy = MIN(state->out_remain, sink_samples);
@@ -461,6 +494,9 @@ void mfcc_s32_default(struct processing_module *mod, struct input_stream_buffer
 	if (num_ceps > 0) {
 		if (state->mel_only) {
 			state->out_data_ptr_32 = state->mel_log_32;
+#ifdef CONFIG_COMP_MFCC_VAD
+			state->vad_pending = true;
+#endif
 		} else {
 			state->out_data_ptr = state->cepstral_coef->data;
 		}
@@ -479,6 +515,15 @@ void mfcc_s32_default(struct processing_module *mod, struct input_stream_buffer
 		state->magic_pending = false;
 	}
 
+#ifdef CONFIG_COMP_MFCC_VAD
+	/* Write VAD flag as first value after magic */
+	if (state->vad_pending && sink_samples >= 1) {
+		w_ptr = mfcc_sink_copy_data_s32(sink, w_ptr, 1, &state->vad_flag);
+		sink_samples -= 1;
+		state->vad_pending = false;
+	}
+#endif
+
 	if (state->mel_only) {
 		/* Write 32-bit mel data Q9.23, one value per int32_t */
 		to_copy = MIN(state->out_remain, sink_samples);
diff --git a/src/audio/mfcc/mfcc_setup.c b/src/audio/mfcc/mfcc_setup.c
index 1cad4b2b984e..aa83f93d8e3a 100644
--- a/src/audio/mfcc/mfcc_setup.c
+++ b/src/audio/mfcc/mfcc_setup.c
@@ -18,6 +18,10 @@
 #include <stddef.h>
 #include <stdint.h>
 
+#ifdef CONFIG_COMP_MFCC_VAD
+#include <sof/audio/mfcc/mfcc_vad.h>
+#endif
+
 /* Definitions for cepstral lifter */
 #define PI_Q23 Q_CONVERT_FLOAT(3.1415926536, 23)
 #define TWO_PI_Q23 Q_CONVERT_FLOAT(6.2831853072, 23)
@@ -346,10 +350,22 @@ int mfcc_setup(struct processing_module *mod, int max_frames, int sample_rate, i
 	state->waiting_fill = true;
 	state->prev_samples_valid = false;
 	state->magic_pending = false;
+#ifdef CONFIG_COMP_MFCC_VAD
+	state->vad_pending = false;
+	state->vad_flag = 0;
+#endif
 	state->out_data_ptr = NULL;
 	state->out_data_ptr_32 = NULL;
 	state->out_remain = 0;
 
+#ifdef CONFIG_COMP_MFCC_VAD
+	ret = mfcc_vad_init(&cd->vad, config->num_mel_bins, sample_rate, mod);
+	if (ret < 0) {
+		comp_err(dev, "Failed VAD init");
+		goto free_lifter;
+	}
+#endif
+
 	comp_dbg(dev, "done");
 	return 0;
 
@@ -389,4 +405,8 @@ void mfcc_free_buffers(struct processing_module *mod)
 	mod_free(mod, cd->state.melfb.data);
 	mod_free(mod, cd->state.dct.matrix);
 	mod_free(mod, cd->state.lifter.matrix);
+#ifdef CONFIG_COMP_MFCC_VAD
+	mod_free(mod, cd->vad.noise_floor);
+	mod_free(mod, cd->vad.weights);
+#endif
 }
diff --git a/src/audio/mfcc/mfcc_vad.c b/src/audio/mfcc/mfcc_vad.c
new file mode 100644
index 000000000000..cdcc3d7eaa12
--- /dev/null
+++ b/src/audio/mfcc/mfcc_vad.c
@@ -0,0 +1,244 @@
+// SPDX-License-Identifier: BSD-3-Clause
+//
+// Copyright(c) 2026 Intel Corporation.
+//
+// Author: Seppo Ingalsuo <seppo.ingalsuo@linux.intel.com>
+
+/**
+ * \file mfcc_vad.c
+ * \brief Voice Activity Detection based on Mel spectrum energy.
+ *
+ * Implements a VAD that tracks per-bin noise floor and computes a
+ * speech-frequency weighted energy above the floor. Speech is declared
+ * when the weighted delta exceeds a threshold, with hangover to prevent
+ * rapid toggling.
+ */
+
+#include <sof/audio/mfcc/mfcc_vad.h>
+
+#ifdef CONFIG_COMP_MFCC_VAD
+
+#include <sof/audio/component.h>
+#include <sof/audio/format.h>
+#include <sof/audio/module_adapter/module/module_interface.h>
+#include <sof/math/auditory.h>
+#include <sof/trace/trace.h>
+#include <errno.h>
+#include <stddef.h>
+
+LOG_MODULE_DECLARE(mfcc, CONFIG_SOF_LOG_LEVEL);
+
+/**
+ * \brief A-weighting table: 1/3 octave band center frequencies as integer Hz.
+ *
+ * Entry k pairs with a_weight_lin[k]. From IEC 61672-1:2013, source:
+ * https://acousticalengineer.com/a-weighting-table/
+ */
+#define A_WEIGHT_TABLE_SIZE	36
+
+static const int16_t a_weight_hz[A_WEIGHT_TABLE_SIZE] = {
+	    6,     8,    10,    13,    16,    20,    25,    32,
+	   40,    50,    63,    80,   100,   125,   160,   200,
+	  250,   315,   400,   500,   630,   800,  1000,  1250,
+	 1600,  2000,  2500,  3150,  4000,  5000,  6300,  8000,
+	10000, 12500, 16000, 20000,
+};
+
+/**
+ * \brief A-weighting gain as linear amplitude for each a_weight_hz[] entry.
+ *        Per the table origin: dB values converted via 10^(dB/20), then
+ *        scaled by 32767 / max so the peak (2500 Hz entry) is INT16_MAX.
+ */
+static const int16_t a_weight_lin[A_WEIGHT_TABLE_SIZE] = {
+	    2,     4,     9,    19,    43,    85,   162,   299,
+	  531,   862,  1382,  2140,  3129,  4370,  6172,  8136,
+	10362, 13196, 16234, 19518, 22669, 25730, 28212, 30230,
+	31655, 32392, 32767, 32392, 31655, 30230, 27889, 24856,
+	21156, 17196, 13045,  9670,
+};
+
+/**
+ * \brief Compute A-weighted speech-frequency emphasis weights for Mel bins.
+ *
+ * Bin centers are spaced evenly on the Mel scale between 0 and Nyquist and
+ * the A-weighting table is linearly interpolated at each center. Weights
+ * are Q1.15, sum to about 1.0, and are saturated to INT16_MAX.
+ *
+ * \param[out] weights Output weight array, num_mel entries.
+ * \param[in] num_mel Number of Mel bins; nothing is written if <= 0.
+ * \param[in] sample_rate Sample rate in Hz; Nyquist is clamped to int16_t.
+ */
+static void mfcc_vad_build_weights(int16_t *weights, int num_mel, int sample_rate)
+{
+	int32_t scaled, num;
+	int32_t sum = 0;
+	int16_t f_hz, f0, f1, w, w0, w1, den;
+	int16_t mel_step;
+	int i, j;
+
+	if (num_mel <= 0)
+		return;
+
+	mel_step = psy_hz_to_mel(sat_int16(sample_rate / 2)) / (num_mel + 1);
+	for (i = 0; i < num_mel; i++) {
+		f_hz = psy_mel_to_hz((int16_t)((i + 1) * mel_step));
+
+		/* Find the table interval containing f_hz and interpolate */
+		if (f_hz <= a_weight_hz[0]) {
+			w = a_weight_lin[0];
+		} else if (f_hz >= a_weight_hz[A_WEIGHT_TABLE_SIZE - 1]) {
+			w = a_weight_lin[A_WEIGHT_TABLE_SIZE - 1];
+		} else {
+			/* Find j such that a_weight_hz[j] <= f_hz < a_weight_hz[j+1] */
+			for (j = 0; j < A_WEIGHT_TABLE_SIZE - 2; j++) {
+				if (f_hz < a_weight_hz[j + 1])
+					break;
+			}
+
+			/* Linear interpolation: w = w0 + (w1 - w0) * (f - f0) / (f1 - f0) */
+			f0 = a_weight_hz[j];
+			f1 = a_weight_hz[j + 1];
+			w0 = a_weight_lin[j];
+			w1 = a_weight_lin[j + 1];
+			num = (int32_t)(w1 - w0) * (f_hz - f0);
+			den = f1 - f0;
+			w = w0 + (int16_t)(num / den);
+		}
+
+		weights[i] = w;
+		sum += w;
+	}
+
+	/* Normalize to sum 1.0; a lone bin rounds to 32768, so saturate to Q1.15 */
+	for (i = 0; i < num_mel; i++) {
+		scaled = ((int32_t)weights[i] << 16) / sum; /* Q1.16 */
+		weights[i] = sat_int16(Q_SHIFT_RND(scaled, 16, 15)); /* Round to Q1.15 */
+	}
+}
+
+/* Set VAD defaults and allocate the per-bin buffers; 0, -EINVAL or -ENOMEM. */
+int mfcc_vad_init(struct mfcc_vad_state *vad, int num_mel_bins, int sample_rate,
+		  struct processing_module *mod)
+{
+	if (!vad || num_mel_bins <= 0)
+		return -EINVAL;
+
+	vad->noise_floor = NULL; /* no stale pointers if an allocation fails */
+	vad->weights = NULL;
+	vad->num_mel_bins = num_mel_bins;
+	vad->energy_threshold = MFCC_VAD_ENERGY_THRESHOLD;
+	vad->noise_rise_alpha_slow = MFCC_VAD_NOISE_RISE_ALPHA;
+	vad->noise_rise_alpha_fast = MFCC_VAD_NOISE_RISE_ALPHA_FAST;
+	vad->hangover_max = MFCC_VAD_HANGOVER_FRAMES;
+	vad->hangover_counter = 0;
+	vad->init_frames = MFCC_VAD_NOISE_INIT_FRAMES;
+	vad->frame_count = 0;
+	vad->is_speech = false;
+	vad->initialized = false;
+
+	/* Allocate per-bin noise floor (zeroed; loaded from the first frame) */
+	vad->noise_floor = mod_zalloc(mod, num_mel_bins * sizeof(int32_t));
+	if (!vad->noise_floor)
+		return -ENOMEM;
+
+	/* Allocate and compute per-bin weights */
+	vad->weights = mod_zalloc(mod, num_mel_bins * sizeof(int16_t));
+	if (!vad->weights) {
+		mod_free(mod, vad->noise_floor);
+		vad->noise_floor = NULL;
+		return -ENOMEM;
+	}
+
+	mfcc_vad_build_weights(vad->weights, num_mel_bins, sample_rate);
+	return 0;
+}
+
+/* Update the noise floor from one Mel log frame; return 1 on speech, else 0. */
+int mfcc_vad_update(struct mfcc_vad_state *vad, const int32_t *mel_log)
+{
+	int64_t energy_delta = 0;
+	int32_t delta, p;
+	int16_t alpha;
+	int i;
+
+	if (!vad || !mel_log)
+		return 0;
+
+	/* Initialize noise floor to first frame */
+	if (!vad->initialized) {
+		for (i = 0; i < vad->num_mel_bins; i++)
+			vad->noise_floor[i] = mel_log[i];
+
+		vad->initialized = true;
+	}
+
+	/* Fast rise for the first init_frames frames; the count then stops (no wrap) */
+	if (vad->frame_count < vad->init_frames) {
+		vad->frame_count++;
+		alpha = vad->noise_rise_alpha_fast;
+	} else {
+		alpha = vad->noise_rise_alpha_slow;
+	}
+
+	/* Update noise floor: follow down instantly, rise slowly */
+	for (i = 0; i < vad->num_mel_bins; i++) {
+		if (mel_log[i] < vad->noise_floor[i]) {
+			/* Instant follow-down */
+			vad->noise_floor[i] = mel_log[i];
+		} else {
+			/* Slow rise: floor += alpha * (mel - floor)
+			 * Q9.23 + Q1.15 * Q9.23 => need Q9.23 result
+			 * alpha is Q1.15, delta is Q9.23
+			 */
+			delta = mel_log[i] - vad->noise_floor[i];
+			p = (int32_t)Q_MULTSR_32X32((int64_t)alpha, delta, 15, 23, 23);
+			vad->noise_floor[i] += p;
+		}
+	}
+
+	/* Compute weighted energy delta above noise floor.
+	 * energy_delta = sum(weights[i] * (mel[i] - noise_floor[i]))
+	 * weights are Q1.15, mel delta is Q9.23
+	 * Product is Q10.38, accumulate in int64_t then shift to Q9.23
+	 */
+	for (i = 0; i < vad->num_mel_bins; i++) {
+		delta = mel_log[i] - vad->noise_floor[i];
+		if (delta > 0)
+			energy_delta += (int64_t)vad->weights[i] * delta;
+	}
+
+	/* Round accumulated energy from Q10.38 to Q9.23, saturate to int32 */
+	energy_delta = sat_int32(Q_SHIFT_RND(energy_delta, 38, 23));
+
+	if (energy_delta > vad->energy_threshold) {
+		vad->hangover_counter = vad->hangover_max;
+		vad->is_speech = true;
+	} else {
+		if (vad->hangover_counter > 0) {
+			vad->hangover_counter--;
+			vad->is_speech = true;
+		} else {
+			vad->is_speech = false;
+		}
+	}
+
+	return vad->is_speech ? 1 : 0;
+}
+
+/* Return the detector to its pre-first-frame state; tolerates a NULL floor. */
+void mfcc_vad_reset(struct mfcc_vad_state *vad)
+{
+	int i;
+
+	if (!vad)
+		return;
+
+	vad->frame_count = 0;
+	vad->hangover_counter = 0;
+	vad->is_speech = false;
+	vad->initialized = false;
+	for (i = 0; vad->noise_floor && i < vad->num_mel_bins; i++)
+		vad->noise_floor[i] = 0;
+}
+
+#endif /* CONFIG_COMP_MFCC_VAD */
diff --git a/src/audio/mfcc/tune/README.md b/src/audio/mfcc/tune/README.md
new file mode 100644
index 000000000000..5fef841efff1
--- /dev/null
+++ b/src/audio/mfcc/tune/README.md
@@ -0,0 +1,98 @@
+# SOF MFCC Tuning Tools
+
+This directory contains a tool to create a configuration blob for the SOF
+MFCC component. Run it in Matlab or Octave with the command
+`setup_mfcc`. The MFCC configuration parameters can be edited in the
+script.
+
+## Testbench
+
+The configuration can be tested with the testbench. First the test topologies
+need to be created with `scripts/build-tools.sh -t`. Next the testbench
+is built with `scripts/rebuild-testbench.sh`.
+
+Once the previous steps are done, a sample wav file can be processed
+with script `run_mfcc.sh`. The script converts the input to raw 16 kHz
+stereo format and runs the testbench for S16, S24, and S32 bit depths,
+producing both cepstral coefficient (MFCC) and Mel spectrogram outputs.
+
+```
+./run_mfcc.sh /usr/share/sounds/alsa/Front_Center.wav
+```
+
+Output files from host testbench:
+
+| File | Content |
+|------|---------|
+| `mfcc_s16.raw`, `mfcc_s24.raw`, `mfcc_s32.raw` | Cepstral coefficients |
+| `mel_s16.raw`, `mel_s24.raw`, `mel_s32.raw` | Mel spectrogram |
+
+If the `XTENSA_PATH` environment variable is set, the script also runs
+the Xtensa build of the testbench (via `xt-run`) and produces additional
+output files prefixed with `xt_`:
+
+| File | Content |
+|------|---------|
+| `xt_mfcc_s16.raw`, `xt_mfcc_s24.raw`, `xt_mfcc_s32.raw` | Cepstral coefficients |
+| `xt_mel_s16.raw`, `xt_mel_s24.raw`, `xt_mel_s32.raw` | Mel spectrogram |
+
+## Decoding and Plotting
+
+All output files can be decoded and plotted at once in Matlab or Octave
+with the `decode_all.m` script:
+
+```matlab
+decode_all
+```
+
+This calls `decode_ceps` for each MFCC file (13 cepstral coefficients) and
+`decode_mel` for each Mel file (80 Mel bins), plotting spectrograms for all
+files that exist including the Xtensa variants.
+
+Individual files can also be decoded manually:
+
+```matlab
+[ceps, t, n] = decode_ceps('mfcc_s16.raw', 13);
+```
+
+In the example above, it is known from the configuration script that MFCC
+was set up to output 13 cepstral coefficients from each FFT → Mel → DCT →
+cepstral coefficients computation run.
+
+The 80-band Mel output can be visualized with the command:
+
+```matlab
+[mel, t, n] = decode_mel('mel_s16.raw', 80);
+```
+
+## Live Whisper Transcription with DSP VAD
+
+The directory contains a Python script `sof_mel_to_text_live_dsp_vad.py`.
+It can be used with the development topologies
+`sof-arl-cs42l43-l0-cs35l56-l23-mfcc.tplg` and
+`sof-mtl-rt713-l0-rt1316-l12-mfcc.tplg`. It captures Mel audio features
+and VAD flags from the default audio device `hw:0,47` (headset microphone).
+Captured frames with detected speech are sent to the Whisper speech
+recognition model for conversion to text.
+
+### Prerequisites
+
+The script needs OpenVINO. Please follow the install procedure from
+<https://docs.openvino.ai/2025/get-started/install-openvino.html>.
+
+The following Python packages must be installed with pip into the same OpenVINO venv:
+
+```bash
+pip install openvino openvino-tokenizers openvino-genai
+pip install optimum[intel]
+pip install transformers
+pip install huggingface_hub
+```
+
+### NPU / GPU Support
+
+By default the script runs the Whisper encoder model on the NPU. To
+use the NPU, install the driver from
+<https://github.com/intel/linux-npu-driver/releases>. If the NPU is not
+available, change the encoder to CPU with the run option `--encoder-device CPU`.
+With a GPU, both `--encoder-device GPU` and `--decoder-device GPU` can be set.
diff --git a/src/audio/mfcc/tune/README.txt b/src/audio/mfcc/tune/README.txt
deleted file mode 100644
index a0c3189e81a3..000000000000
--- a/src/audio/mfcc/tune/README.txt
+++ /dev/null
@@ -1,52 +0,0 @@
-This directory contains a tool to create configuration blob for SOF
-MFCC component. It's simply run in Matlab or Octave with command
-"setup_mfcc". The MFCC configuration parameters can be edited from the
-script.
-
-The configuration can be test run with testbench. First the test topologies
-need to be created with "scripts/build-tools.sh -t". Next the testbench
-is build with "scripts/rebuild-testbench.sh".
-
-Once the previous steps are done, a sample wav file can be processed
-with script run_mfcc.sh. The script converts the input to raw 16 kHz
-stereo format and runs the testbench for S16, S24, and S32 bit depths,
-producing both cepstral coefficient (MFCC) and Mel spectrogram outputs.
-
-./run_mfcc.sh /usr/share/sounds/alsa/Front_Center.wav
-
-Output files from host testbench:
-  mfcc_s16.raw, mfcc_s24.raw, mfcc_s32.raw   - cepstral coefficients
-  mel_s16.raw, mel_s24.raw, mel_s32.raw       - Mel spectrogram
-
-If the XTENSA_PATH environment variable is set, the script also runs
-the Xtensa build of the testbench (via xt-run) and produces additional
-output files prefixed with "xt_":
-  xt_mfcc_s16.raw, xt_mfcc_s24.raw, xt_mfcc_s32.raw
-  xt_mel_s16.raw, xt_mel_s24.raw, xt_mel_s32.raw
-
-All output files can be decoded and plotted at once in Matlab or Octave
-with the decode_all.m script:
-
-decode_all
-
-This calls decode_ceps for each MFCC file (13 cepstral coefficients) and
-decode_mel for each Mel file (80 Mel bins), plotting spectrograms for all
-files that exist including the Xtensa variants.
-
-Individual files can also be decoded manually:
-
-[ceps, t, n] = decode_ceps('mfcc_s16.raw', 13);
-
-In the above it's known from configuration script that MFCC was set up to
-output 13 cepstral coefficients from each FFT -> Mel -> DCT -> Cepstral
-coefficients computation run.
-
-The 80 bands Mel output can be visualized with command:
-
-[mel, t, n] = decode_mel('mel_s16.raw', 80);
-
-Other kind of signals have quite big visual difference in audio features. Try
-e.g. other sound files found in computer.
-
-./run_mfcc.sh /usr/share/sounds/gnome/default/alerts/bark.ogg
-./run_mfcc.sh /usr/share/sounds/gnome/default/alerts/sonar.ogg
diff --git a/src/audio/mfcc/tune/decode_mel.m b/src/audio/mfcc/tune/decode_mel.m
index f6a723aa2040..409fbccd9a52 100644
--- a/src/audio/mfcc/tune/decode_mel.m
+++ b/src/audio/mfcc/tune/decode_mel.m
@@ -1,4 +1,4 @@
-% [mel, t, n] = decode_mel(fn, num_mel, fmt, num_channels)
+% [mel, t, n, vad] = decode_mel(fn, num_mel, fmt, num_channels)
 %
 % Input
 %   fn - File with Mel data in .raw or .wav format
@@ -10,17 +10,18 @@
 %   mel - Mel coefficients
 %   t - time vector for plotting
 %   n - mel 1..num_mel vector for plotting
+%   vad - VAD flag per frame from DSP
 
 % SPDX-License-Identifier: BSD-3-Clause
 % Copyright(c) 2026 Intel Corporation.
 
-function [mel, t, n] = decode_mel(fn, num_mel, fmt, num_channels)
+function [mel, t, n, vad] = decode_mel(fn, num_mel, fmt, num_channels)
 
 if nargin < 3
 	fmt = 's16';
 end
 if nargin < 4
-	num_channels = 1;
+	num_channels = 2;
 end
 
 % MFCC stream
@@ -74,27 +75,43 @@
     num_frames = num_frames - 1;
 end
 
-t_mel = period_mel / num_channels / fs;
-t = (0:num_frames -1) * t_mel;
-n = 1:num_mel;
+% VAD flag is first int32 after magic, followed by num_mel coefficients
+payload_len = 1 + num_mel;
 
-mel = zeros(num_mel, num_frames);
+payload = zeros(payload_len, num_frames);
 for i = 1:num_frames
 	i1 = idx(i) + num_magic;
-	i2 = i1 + num_mel - 1;
-	mel(:,i) = double(data(i1:i2)) / 2^qformat;
+	i2 = i1 + payload_len - 1;
+	payload(:,i) = double(data(i1:i2));
 end
 
-figure;
+vad = payload(1, :);
+mel = payload(2:payload_len, :) / 2^qformat;
+
+t_mel = period_mel / num_channels / fs;
+t = (0:num_frames -1) * t_mel;
+n = 1:num_mel;
+
+%figure(1);
+figure
 imagesc(t, n, mel);
 axis xy;
 colormap(jet);
 colorbar;
 tstr = sprintf('SOF MFCC Mel coefficients (%s)', fn);
 title(tstr, 'Interpreter', 'None');
-xlabel('Time (s)');
 ylabel('Mel coef #');
 
+figure
+level = sum(mel(:,:));
+plot(t, vad)
+ax = axis();
+axis([ax(1:2) -0.1 1.1]);
+grid on;
+title(tstr, 'Interpreter', 'None');
+xlabel('Time (s)');
+ylabel('VAD flag');
+
 end
 
 function [data, num_channels] = get_file(fn, num_channels, fmt)
diff --git a/src/audio/mfcc/tune/sof_mel_to_text_live_dsp_vad.py b/src/audio/mfcc/tune/sof_mel_to_text_live_dsp_vad.py
new file mode 100644
index 000000000000..eeafb28f0b75
--- /dev/null
+++ b/src/audio/mfcc/tune/sof_mel_to_text_live_dsp_vad.py
@@ -0,0 +1,454 @@
+"""Live SOF mel capture with DSP VAD-triggered Whisper transcription.
+
+Captures mel frames from ALSA with embedded VAD flag from the DSP.
+Frame format: [magic(int32), vad_flag(int32), mel[0..79](int32)]
+When silence of 100ms is detected after speech, sends the buffered mel
+features to Whisper (OpenVINO encoder+decoder) for transcription.
+Capture continues running during Whisper inference.
+
+Usage:
+    python sof_mel_to_text_live_dsp_vad.py [--device hw:0,47] [--model whisper-medium-int4-ov]
+    python sof_mel_to_text_live_dsp_vad.py --plot  # with live spectrogram
+"""
+
+import argparse
+import os
+import struct
+import subprocess
+import threading
+import time
+import numpy as np
+import openvino as ov
+import huggingface_hub as hf_hub
+from pathlib import Path
+
+# Graphics imports deferred until --plot is used
+matplotlib = None
+plt = None
+
+# SOF mel_s32.raw format constants (with DSP VAD flag)
+SOF_MAGIC_S32 = np.int32(0x6D666363)  # ASCII 'mfcc' as int32
+SOF_MAGIC_BYTES = struct.pack('<i', 0x6D666363)
+SOF_NUM_MAGIC = 1
+SOF_NUM_VAD = 1              # VAD flag from DSP (1 = speech, 0 = silence)
+SOF_Q_FORMAT = 23            # Q9.23 fixed-point
+SOF_NUM_MEL = 80
+SOF_FRAME_INTS = SOF_NUM_MAGIC + SOF_NUM_VAD + SOF_NUM_MEL  # 82 int32 per frame
+SOF_FRAME_BYTES = SOF_FRAME_INTS * 4  # 328 bytes per frame
+
+# Speech buffering
+SILENCE_TRIGGER_MS = 100     # ms of silence after speech to trigger transcription
+SILENCE_TRIGGER_FRAMES = SILENCE_TRIGGER_MS // 10  # 10 frames at 10ms/frame
+MIN_SPEECH_MS = 500          # minimum speech duration to send to Whisper
+MIN_SPEECH_FRAMES = MIN_SPEECH_MS // 10  # 50 frames at 10ms/frame
+
+# Whisper model constants
+WHISPER_FEATURE_SIZE = 80
+WHISPER_NB_MAX_FRAMES = 3000  # 30 seconds at 10ms per frame
+
+
+def decode_mel_frame(raw_ints):
+    """Convert int32 Q9.23 mel values to float64 mel coefficients."""
+    return raw_ints.astype(np.float64) / (2 ** SOF_Q_FORMAT)
+
+
+# ---------- Optional scrolling plot ----------
+
+SPECTROGRAM_WIDTH = 100
+
+
+class MelPlotter:
+    """Real-time scrolling mel spectrogram with a VAD strip underneath."""
+    # num_mel: mel bins (image rows); width: frames kept on screen (columns).
+    def __init__(self, num_mel=SOF_NUM_MEL, width=SPECTROGRAM_WIDTH):
+        global matplotlib, plt  # fill in the deferred module-level names
+        import matplotlib as _mpl
+        _mpl.use('TkAgg')  # backend is selected before pyplot is imported
+        import matplotlib.pyplot as _plt
+        matplotlib = _mpl
+        plt = _plt
+
+        self.num_mel = num_mel
+        self.width = width
+        # Rolling history: the newest frame is the last column / last element.
+        self.mel_buf = np.zeros((num_mel, width), dtype=np.float64)
+        self.vad_buf = np.zeros(width, dtype=np.float64)
+        self.x = np.arange(width)
+
+        self.fig, (self.ax_mel, self.ax_vad) = plt.subplots(
+            2, 1, figsize=(10, 5),
+            gridspec_kw={'height_ratios': [5, 1]},
+            sharex=True
+        )
+        self.fig.tight_layout(pad=2.0)
+
+        self.im_mel = self.ax_mel.imshow(
+            self.mel_buf, aspect='auto', origin='lower',
+            interpolation='nearest', cmap='turbo',
+            vmin=-2.0, vmax=2.0  # fixed color scale, not auto-ranged
+        )
+        self.ax_mel.set_ylabel('Mel bin')
+        self.ax_mel.set_title('Mel Spectrogram (scrolling) — DSP VAD')
+
+        self.line_vad, = self.ax_vad.plot(
+            self.x, self.vad_buf, color='green', linewidth=1.5,
+            drawstyle='steps-post')
+        self.ax_vad.set_ylabel('VAD')
+        self.ax_vad.set_xlabel('Frame')
+        self.ax_vad.set_ylim(-0.1, 1.1)
+        self.ax_vad.set_yticks([0, 1])
+        self.ax_vad.set_yticklabels(['Silent', 'Speech'])
+        # Interactive mode with a non-blocking show; draw once up front.
+        plt.ion()
+        plt.show(block=False)
+        self.fig.canvas.draw()
+        self.fig.canvas.flush_events()
+
+    def update(self, mel_frame, is_speech):  # append one frame and redraw
+        self.mel_buf[:, :-1] = self.mel_buf[:, 1:]  # scroll left by one frame
+        self.mel_buf[:, -1] = mel_frame
+        self.vad_buf[:-1] = self.vad_buf[1:]
+        self.vad_buf[-1] = 1.0 if is_speech else 0.0
+        # Hand the shifted buffers to the existing image and line artists.
+        self.im_mel.set_data(self.mel_buf)
+        self.line_vad.set_ydata(self.vad_buf)
+
+        self.fig.canvas.draw_idle()
+        self.fig.canvas.flush_events()
+
+
+# ---------- Whisper inference ----------
+
+class WhisperTranscriber:
+    """Whisper encoder+decoder using OpenVINO, runs in a background thread."""
+
+    def __init__(self, model_path, encoder_device="NPU", decoder_device="CPU"):
+        self.model_path = model_path
+        core = ov.Core()
+        encoder_xml = str(Path(model_path) / "openvino_encoder_model.xml")
+        decoder_xml = str(Path(model_path) / "openvino_decoder_model.xml")
+        # NPU requires static shapes — fix [?,?,3000] to [1,80,3000]
+        encoder_model = core.read_model(encoder_xml)
+        encoder_model.reshape({0: [1, WHISPER_FEATURE_SIZE, WHISPER_NB_MAX_FRAMES]})
+        self.encoder = core.compile_model(encoder_model, encoder_device)
+        self.decoder = core.compile_model(decoder_xml, decoder_device)
+        self._load_tokenizer()
+        self._busy = False
+        self._lock = threading.Lock()
+
+    def _load_tokenizer(self):
+        """Load the tokenizer: transformers first, openvino_genai on ImportError."""
+        try:
+            from transformers import WhisperTokenizer
+            self.tokenizer = WhisperTokenizer.from_pretrained(self.model_path)
+            self._tokenizer_type = "hf"
+        except ImportError:
+            import openvino_genai as ov_genai
+            self.tokenizer = ov_genai.Tokenizer(self.model_path)
+            self._tokenizer_type = "ov"
+
+    def is_busy(self):
+        with self._lock:
+            return self._busy
+
+    def transcribe_async(self, mel_frames, callback):
+        """Start a background transcription; False if one is already running.
+
+        Args:
+            mel_frames: list of np.ndarray [80] mel frames
+            callback: function(text) called with result
+        """
+        with self._lock:
+            if self._busy:
+                return False
+            self._busy = True
+
+        t = threading.Thread(target=self._run, args=(mel_frames, callback),
+                             daemon=True)
+        t.start()
+        return True
+
+    def _run(self, mel_frames, callback):
+        try:
+            text = self._transcribe(mel_frames)
+            callback(text)
+        except Exception as e:
+            print(f"  [Whisper ERROR] {e}", flush=True)
+        finally:
+            with self._lock:
+                self._busy = False
+
+    def _transcribe(self, mel_frames):
+        """Encode mel frames and decode to text."""
+        n_frames = len(mel_frames)
+        if n_frames == 0:
+            return ""
+
+        # Stack frames into [80, n_frames]
+        features = np.column_stack(mel_frames).astype(np.float32)
+
+        # Pad to 3000 frames with the quietest value seen; extra frames are cut
+        silence_val = features.min()
+        padded = np.full((WHISPER_FEATURE_SIZE, WHISPER_NB_MAX_FRAMES),
+                         silence_val, dtype=np.float32)
+        n = min(n_frames, WHISPER_NB_MAX_FRAMES)
+        padded[:, :n] = features[:, :n]
+
+        # Encoder
+        t0 = time.time()
+        encoder_input = padded.reshape(1, WHISPER_FEATURE_SIZE, WHISPER_NB_MAX_FRAMES)
+        encoder_req = self.encoder.create_infer_request()
+        encoder_req.set_tensor("input_features", ov.Tensor(encoder_input))
+        encoder_req.infer()
+        hidden_state = encoder_req.get_tensor("last_hidden_state").data.copy()
+        t1 = time.time()
+        print(f"  [Whisper] encoder: {t1-t0:.2f}s", flush=True)
+
+        # Decoder: greedy decode
+        token_ids = self._greedy_decode(hidden_state)
+        t2 = time.time()
+        print(f"  [Whisper] decoder: {t2-t1:.2f}s ({len(token_ids)} tokens)",
+              flush=True)
+
+        # Convert to text
+        text_tokens = [t for t in token_ids if t < 50257]
+        # Both tokenizer back ends are called the same way here, so no
+        # branch on self._tokenizer_type is needed. Ids >= 50257 (the EOS id
+        # and the prompt ids used by _greedy_decode) were dropped above.
+        text = self.tokenizer.decode(text_tokens)
+
+        return text.strip()
+
+    def _greedy_decode(self, hidden_state, max_tokens=448):
+        """Greedy decoding loop."""
+        sot_tokens = [50258, 50259, 50359, 50363]
+        eos_token = 50257
+
+        decoder_req = self.decoder.create_infer_request()
+        input_names = [inp.get_any_name() for inp in self.decoder.inputs]
+        has_cache_position = "cache_position" in input_names
+
+        decoder_req.set_tensor("encoder_hidden_states", ov.Tensor(hidden_state))
+
+        # Prefill with SOT tokens
+        input_ids = np.array([sot_tokens], dtype=np.int64)
+        beam_idx = np.array([0], dtype=np.int32)
+
+        decoder_req.set_tensor("input_ids", ov.Tensor(input_ids))
+        if "beam_idx" in input_names:
+            decoder_req.set_tensor("beam_idx", ov.Tensor(beam_idx))
+        if has_cache_position:
+            cache_pos = np.arange(len(sot_tokens), dtype=np.int64).reshape(1, -1)
+            decoder_req.set_tensor("cache_position", ov.Tensor(cache_pos))
+
+        decoder_req.infer()
+        logits = decoder_req.get_tensor("logits").data
+        next_token = int(np.argmax(logits[0, -1, :]))
+
+        generated = [next_token]
+        position = len(sot_tokens)
+
+        for _ in range(max_tokens - 1):
+            if next_token == eos_token:
+                break
+
+            decoder_req.set_tensor("input_ids",
+                                   ov.Tensor(np.array([[next_token]], dtype=np.int64)))
+            if "beam_idx" in input_names:
+                decoder_req.set_tensor("beam_idx", ov.Tensor(beam_idx))
+            if has_cache_position:
+                decoder_req.set_tensor("cache_position",
+                                       ov.Tensor(np.array([[position]], dtype=np.int64)))
+
+            decoder_req.infer()
+            logits = decoder_req.get_tensor("logits").data
+            next_token = int(np.argmax(logits[0, -1, :]))
+            generated.append(next_token)
+            position += 1
+
+        return generated
+
+
+# ---------- Frame parser ----------
+
+def find_frame_in_buffer(buf):
+    """Find the first complete mel frame with DSP VAD flag in a byte buffer.
+
+    Frame layout: [magic(4B), vad_flag(4B), mel[0..79](320B)] = 328 bytes
+    Returns: (vad_flag, mel_ints, remaining_buf) or (None, None, buf)
+    """
+    while True:
+        idx = buf.find(SOF_MAGIC_BYTES)
+        if idx < 0:
+            if len(buf) > 3:
+                buf = buf[-3:]
+            return None, None, buf
+        end = idx + SOF_FRAME_BYTES
+        if end > len(buf):
+            buf = buf[idx:]
+            return None, None, buf
+        # Parse VAD flag (first int32 after magic)
+        vad_flag = struct.unpack_from('<i', buf, idx + 4)[0]
+        # Parse 80 mel coefficients (after magic + VAD)
+        mel_bytes = buf[idx + 8 : end]
+        mel_ints = np.frombuffer(mel_bytes, dtype=np.int32)
+        buf = buf[end:]
+        return vad_flag, mel_ints, buf
+
+
+# ---------- Main capture + transcription loop ----------
+
+def run_capture(device, rate, model_path, encoder_device, decoder_device,
+                enable_plot=False):
+    """Main capture loop: ALSA → DSP VAD → buffer speech → Whisper."""
+
+    plotter = MelPlotter() if enable_plot else None
+    transcriber = WhisperTranscriber(model_path, encoder_device=encoder_device,
+                                     decoder_device=decoder_device)
+
+    cmd = [
+        'arecord', '-D', device, '-f', 'S32_LE', '-c', '2',
+        '-r', str(rate), '-t', 'raw', '--buffer-size', '8192',
+    ]
+
+    print(f"Starting capture: {' '.join(cmd)}")
+    print(f"VAD source: DSP (embedded in stream)")
+    print(f"Silence trigger: {SILENCE_TRIGGER_MS}ms ({SILENCE_TRIGGER_FRAMES} frames)")
+    print(f"Whisper model: {model_path} (encoder: {encoder_device}, decoder: {decoder_device})")
+    print()
+
+    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+
+    buf = b''
+    read_chunk = SOF_FRAME_BYTES * 4
+    frame_num = 0
+    prev_speech = None
+
+    # Speech buffering state
+    speech_buffer = []         # list of mel frames during speech
+    silence_counter = 0        # consecutive silence frames after speech
+    was_speaking = False       # True if we have buffered speech frames
+
+    def on_transcription(text):
+        if text:
+            print(f"\n  >> \"{text}\"\n", flush=True)
+        else:
+            print("  [Whisper] empty result", flush=True)
+
+    try:
+        while True:
+            data = proc.stdout.read(read_chunk)
+            if not data:
+                rc = proc.poll()
+                if rc is not None:
+                    stderr_out = proc.stderr.read().decode(errors='replace')
+                    print(f"\narecord exited with code {rc}")
+                    if stderr_out:
+                        print(f"stderr: {stderr_out}")
+                    break
+                continue
+
+            buf += data
+
+            while True:
+                vad_flag, frame_ints, buf = find_frame_in_buffer(buf)
+                if frame_ints is None:
+                    break
+
+                frame_num += 1
+                mel = decode_mel_frame(frame_ints)
+                speech = vad_flag != 0
+
+                # Print VAD transitions when not plotting
+                if plotter is None and speech != prev_speech:
+                    t = frame_num * 0.01
+                    tag = "SPEECH" if speech else "SILENCE"
+                    print(f"  [{t:7.2f}s] {tag}", flush=True)
+                prev_speech = speech
+
+                # Update plot
+                if plotter is not None:
+                    plotter.update(mel, speech)
+
+                # --- Speech buffering logic ---
+                if speech:
+                    speech_buffer.append(mel.copy())
+                    silence_counter = 0
+                    was_speaking = True
+                else:
+                    if was_speaking:
+                        silence_counter += 1
+                        if silence_counter >= SILENCE_TRIGGER_FRAMES:
+                            n = len(speech_buffer)
+                            duration = n * 0.01
+                            t = frame_num * 0.01
+
+                            if n < MIN_SPEECH_FRAMES:
+                                # Too short — discard
+                                speech_buffer.clear()
+                                silence_counter = 0
+                                was_speaking = False
+                                continue
+
+                            # Silence threshold reached — send to Whisper
+                            print(f"  [{t:7.2f}s] Transcribing {n} frames "
+                                  f"({duration:.1f}s)...", flush=True)
+
+                            if not transcriber.is_busy():
+                                frames_copy = list(speech_buffer)
+                                transcriber.transcribe_async(
+                                    frames_copy, on_transcription)
+                            else:
+                                print(f"  [{t:7.2f}s] (Whisper busy, "
+                                      f"dropping {n} frames)", flush=True)
+
+                            speech_buffer.clear()
+                            silence_counter = 0
+                            was_speaking = False
+
+    except (KeyboardInterrupt, BrokenPipeError):
+        pass
+    finally:
+        if proc.poll() is None:
+            proc.terminate()
+            try:
+                proc.wait(timeout=3)
+            except subprocess.TimeoutExpired:
+                proc.kill()
+                proc.wait()
+        if plotter is not None:
+            try:
+                plt.close(plotter.fig)
+            except Exception:
+                pass
+        print("\n\nCapture stopped.")
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Live SOF mel capture with DSP VAD-triggered Whisper transcription")
+    parser.add_argument('--device', '-D', default='hw:0,47',
+                        help='ALSA capture device (default: hw:0,47)')
+    parser.add_argument('--rate', '-r', type=int, default=16000,
+                        help='Sample rate for arecord (default: 16000)')
+    parser.add_argument('--model', '-m', default='whisper-medium-int4-ov',
+                        help='Path to Whisper OpenVINO model directory')
+    parser.add_argument('--encoder-device', default='NPU',
+                        help='OpenVINO device for encoder (default: NPU)')
+    parser.add_argument('--decoder-device', default='CPU',
+                        help='OpenVINO device for decoder (default: CPU)')
+    parser.add_argument('--plot', action='store_true',
+                        help='Show live scrolling mel spectrogram and VAD plot')
+    args = parser.parse_args()
+    model_id = "OpenVINO/" + os.path.basename(args.model)
+    if not os.path.isdir(args.model):
+        print(f"Downloading model {model_id} ...")
+        hf_hub.snapshot_download(model_id, local_dir=args.model)
+
+    print("=== Live SOF Mel → Whisper Transcription (DSP VAD) ===\n")
+    run_capture(args.device, args.rate, args.model, args.encoder_device,
+                args.decoder_device, enable_plot=args.plot)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/src/include/sof/audio/mfcc/mfcc_comp.h b/src/include/sof/audio/mfcc/mfcc_comp.h
index 025eef116752..e0617e0f026f 100644
--- a/src/include/sof/audio/mfcc/mfcc_comp.h
+++ b/src/include/sof/audio/mfcc/mfcc_comp.h
@@ -12,6 +12,7 @@
 #include <sof/math/auditory.h>
 #include <sof/math/dct.h>
 #include <sof/math/fft.h>
+#include <sof/audio/mfcc/mfcc_vad.h>
 #include <stddef.h>
 #include <stdint.h>
 
@@ -106,6 +107,10 @@ struct mfcc_state {
 	bool waiting_fill; /**< booleans */
 	bool prev_samples_valid;
 	bool magic_pending; /**< True when magic word not yet written for current output */
+#ifdef CONFIG_COMP_MFCC_VAD
+	bool vad_pending; /**< True when VAD flag not yet written for current output */
+	int32_t vad_flag; /**< Current VAD result: 1 = speech, 0 = silence */
+#endif
 	size_t sample_buffers_size; /**< bytes */
 	int16_t *out_data_ptr; /**< Read pointer into scratch data for multi-period output */
 	int32_t *out_data_ptr_32; /**< Read pointer for 32-bit mel-only output */
@@ -115,6 +120,9 @@ struct mfcc_state {
 /* MFCC component private data */
 struct mfcc_comp_data {
 	struct mfcc_state state;
+#ifdef CONFIG_COMP_MFCC_VAD
+	struct mfcc_vad_state vad;
+#endif
 	struct comp_data_blob_handler *model_handler;
 	struct sof_mfcc_config *config;
 	int max_frames;
diff --git a/src/include/sof/audio/mfcc/mfcc_vad.h b/src/include/sof/audio/mfcc/mfcc_vad.h
new file mode 100644
index 000000000000..6eac1ae08a15
--- /dev/null
+++ b/src/include/sof/audio/mfcc/mfcc_vad.h
@@ -0,0 +1,101 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Copyright(c) 2026 Intel Corporation.
+ *
+ * Author: Seppo Ingalsuo <seppo.ingalsuo@linux.intel.com>
+ */
+
+/**
+ * \file mfcc_vad.h
+ * \brief Voice Activity Detection based on Mel spectrum energy.
+ *
+ * This VAD operates on the Q9.23 Mel log spectrum values produced by
+ * the MFCC component. It tracks a per-bin noise floor that follows
+ * the signal downward instantly and rises slowly, then computes a
+ * speech-weighted energy delta above the floor.
+ */
+
+#ifndef __SOF_AUDIO_MFCC_MFCC_VAD_H__
+#define __SOF_AUDIO_MFCC_MFCC_VAD_H__
+
+#include <stdint.h>
+#include <stdbool.h>
+
+#ifdef CONFIG_COMP_MFCC_VAD
+
+struct processing_module;
+
+/**
+ * \brief Number of frames for fast noise floor convergence at startup (~1 s at 10 ms/frame).
+ */
+#define MFCC_VAD_NOISE_INIT_FRAMES	100
+
+/**
+ * \brief Slow noise floor rise coefficient in Q1.15 (0.0010 * 32768 = 33, rounded).
+ */
+#define MFCC_VAD_NOISE_RISE_ALPHA	33
+
+/**
+ * \brief Fast noise floor rise coefficient in Q1.15 (0.05 * 32768 = 1638).
+ */
+#define MFCC_VAD_NOISE_RISE_ALPHA_FAST	1638
+
+/**
+ * \brief Energy threshold for speech detection in Q9.23 (0.35 * 2^23 = 2936013).
+ */
+#define MFCC_VAD_ENERGY_THRESHOLD	2936013
+
+/**
+ * \brief Hangover frame count to keep VAD active after last speech detection.
+ */
+#define MFCC_VAD_HANGOVER_FRAMES	20
+
+/**
+ * \brief VAD state: thresholds and coefficients, noise floor and weight buffers, running decision.
+ */
+struct mfcc_vad_state {
+	int32_t *noise_floor; /**< Per-bin noise floor in Q9.23 */
+	int16_t *weights; /**< Speech-frequency emphasis weights Q1.15 */
+	int32_t energy_threshold; /**< Energy threshold Q9.23 */
+	int16_t noise_rise_alpha_slow; /**< Slow rise alpha Q1.15 */
+	int16_t noise_rise_alpha_fast; /**< Fast rise alpha Q1.15 */
+	int16_t hangover_max; /**< Maximum hangover frames */
+	int16_t hangover_counter; /**< Current hangover counter */
+	int16_t num_mel_bins; /**< Number of Mel bins in use */
+	int16_t init_frames; /**< Number of initial frames for fast convergence */
+	int32_t frame_count; /**< Total frames processed */
+	bool is_speech; /**< Current VAD decision */
+	bool initialized; /**< True after first frame processed */
+};
+
+/**
+ * \brief Initialize VAD state.
+ *
+ * \param[out] vad Pointer to VAD state to initialize.
+ * \param[in] num_mel_bins Number of Mel bins.
+ * \param[in] sample_rate Audio sample rate in Hz.
+ * \param[in] mod Processing module for memory allocation.
+ * \return 0 on success, negative error code on failure.
+ */
+int mfcc_vad_init(struct mfcc_vad_state *vad, int num_mel_bins, int sample_rate,
+		  struct processing_module *mod);
+
+/**
+ * \brief Process one Mel spectrum frame and update VAD decision.
+ *
+ * \param[in,out] vad Pointer to VAD state.
+ * \param[in] mel_log Mel log spectrum in Q9.23, array of num_mel_bins values.
+ * \return 1 if speech detected, 0 if silence.
+ */
+int mfcc_vad_update(struct mfcc_vad_state *vad, const int32_t *mel_log);
+
+/**
+ * \brief Reset VAD state without changing configuration.
+ *
+ * \param[in,out] vad Pointer to VAD state.
+ */
+void mfcc_vad_reset(struct mfcc_vad_state *vad);
+
+#endif /* CONFIG_COMP_MFCC_VAD */
+
+#endif /* __SOF_AUDIO_MFCC_MFCC_VAD_H__ */