From faca46b0c765f399d4d66f23ec0acd666abd9385 Mon Sep 17 00:00:00 2001
From: kunitoki <kunitoki@gmail.com>
Date: Tue, 26 Aug 2025 17:04:08 +0200
Subject: [PATCH 01/37] Updates to FFT

---
 modules/yup_core/maths/yup_MathsFunctions.h   |  1 +
 .../yup_dsp/frequency/yup_FFTProcessor.cpp    | 48 ++++++++++++-------
 modules/yup_dsp/frequency/yup_FFTProcessor.h  |  6 ++-
 3 files changed, 37 insertions(+), 18 deletions(-)
diff --git a/modules/yup_core/maths/yup_MathsFunctions.h b/modules/yup_core/maths/yup_MathsFunctions.h
index 9e3b8ebe1..a9015671c 100644
--- a/modules/yup_core/maths/yup_MathsFunctions.h
+++ b/modules/yup_core/maths/yup_MathsFunctions.h
@@ -751,6 +751,7 @@ constexpr int nextPowerOfTwo (int n) noexcept
     n |= (n >> 4);
     n |= (n >> 8);
     n |= (n >> 16);
+
     return n + 1;
 }
 
diff --git a/modules/yup_dsp/frequency/yup_FFTProcessor.cpp b/modules/yup_dsp/frequency/yup_FFTProcessor.cpp
index 141853da6..45aac82a7 100644
--- a/modules/yup_dsp/frequency/yup_FFTProcessor.cpp
+++ b/modules/yup_dsp/frequency/yup_FFTProcessor.cpp
@@ -700,7 +700,17 @@ FFTProcessor& FFTProcessor::operator= (FFTProcessor&& other) noexcept
 }
 
 //==============================================================================
-// Public interface
+
+void FFTProcessor::setScaling (FFTScaling newScaling) noexcept
+{
+    // An unchanged mode leaves the cached scaling factor valid
+    if (scaling == newScaling)
+        return;
+
+    scaling = newScaling;
+    updateScalingFactor();
+}
+
 void FFTProcessor::setSize (int newSize)
 {
     jassert (isPowerOfTwo (newSize) && newSize >= 64 && newSize <= 65536);
@@ -709,6 +719,8 @@ void FFTProcessor::setSize (int newSize)
     {
         fftSize = newSize;
 
+        updateScalingFactor();
+
         if (engine)
             engine->initialize (fftSize);
     }
@@ -720,6 +732,7 @@ void FFTProcessor::performRealFFTForward (const float* realInput, float* complex
     jassert (engine != nullptr);
 
     engine->performRealFFTForward (realInput, complexOutput);
+
     applyScaling (complexOutput, fftSize * 2, true);
 }
 
@@ -729,6 +742,7 @@ void FFTProcessor::performRealFFTInverse (const float* complexInput, float* real
     jassert (engine != nullptr);
 
     engine->performRealFFTInverse (complexInput, realOutput);
+
     applyScaling (realOutput, fftSize, false);
 }
 
@@ -738,6 +752,7 @@ void FFTProcessor::performComplexFFTForward (const float* complexInput, float* c
     jassert (engine != nullptr);
 
     engine->performComplexFFTForward (complexInput, complexOutput);
+
     applyScaling (complexOutput, fftSize * 2, true);
 }
 
@@ -747,6 +762,7 @@ void FFTProcessor::performComplexFFTInverse (const float* complexInput, float* c
     jassert (engine != nullptr);
 
     engine->performComplexFFTInverse (complexInput, complexOutput);
+
     applyScaling (complexOutput, fftSize * 2, false);
 }
 
@@ -756,25 +772,25 @@ String FFTProcessor::getBackendName() const
 }
 
 //==============================================================================
-// Private implementation
-void FFTProcessor::applyScaling (float* data, int numElements, bool isForward)
+
+void FFTProcessor::updateScalingFactor()
 {
-    if (scaling == FFTScaling::none)
-        return;
+    if (scaling == FFTScaling::unitary)
+        scalingFactor = 1.0f / std::sqrt (static_cast<float> (fftSize));
 
-    float scale = 1.0f;
+    else if (scaling == FFTScaling::asymmetric)
+        scalingFactor = 1.0f / static_cast<float> (fftSize);
 
-    if (scaling == FFTScaling::unitary)
-    {
-        scale = 1.0f / std::sqrt (static_cast<float> (fftSize));
-    }
-    else if (scaling == FFTScaling::asymmetric && ! isForward)
-    {
-        scale = 1.0f / static_cast<float> (fftSize);
-    }
+    else
+        scalingFactor = 1.0f;
+}
+
+void FFTProcessor::applyScaling (float* data, int numElements, bool isForward) const
+{
+    if (scaling == FFTScaling::none || (scaling == FFTScaling::asymmetric && ! isForward))
+        return;
 
-    if (scale != 1.0f)
-        FloatVectorOperations::multiply (data, scale, numElements);
+    FloatVectorOperations::multiply (data, scalingFactor, numElements);
 }
 
 } // namespace yup
diff --git a/modules/yup_dsp/frequency/yup_FFTProcessor.h b/modules/yup_dsp/frequency/yup_FFTProcessor.h
index 5ecaa5d9f..976ec9351 100644
--- a/modules/yup_dsp/frequency/yup_FFTProcessor.h
+++ b/modules/yup_dsp/frequency/yup_FFTProcessor.h
@@ -89,7 +89,7 @@ class FFTProcessor
     int getSize() const noexcept { return fftSize; }
 
     /** Sets the FFT scaling mode */
-    void setScaling (FFTScaling newScaling) noexcept { scaling = newScaling; }
+    void setScaling (FFTScaling newScaling) noexcept;
 
     /** Gets the current scaling mode */
     FFTScaling getScaling() const noexcept { return scaling; }
@@ -139,11 +139,13 @@ class FFTProcessor
 
 private:
     //==============================================================================
-    void applyScaling (float* data, int numElements, bool isForward);
+    void updateScalingFactor ();
+    void applyScaling (float* data, int numElements, bool isForward) const;
 
     //==============================================================================
     int fftSize = -1;
     FFTScaling scaling = FFTScaling::none;
+    float scalingFactor = 1.0f;
 
     std::unique_ptr<Engine> engine;
 

From 1a307098c668f75f40ead1c42608a91bb6ea7fd9 Mon Sep 17 00:00:00 2001
From: kunitoki <kunitoki@gmail.com>
Date: Tue, 26 Aug 2025 17:04:32 +0200
Subject: [PATCH 02/37] Initial work on the convolution

---
 .../graphics/data/ir_e112_g12_dyn_us_6v6.wav  | Bin 0 -> 4488 bytes
 .../source/examples/ConvolutionDemo.h         | 475 ++++++++++++++
 examples/graphics/source/main.cpp             |   2 +
 .../convolution/yup_PartitionedConvolver.cpp  | 593 ++++++++++++++++++
 .../convolution/yup_PartitionedConvolver.h    | 160 +++++
 modules/yup_dsp/yup_dsp.cpp                   |   3 +
 modules/yup_dsp/yup_dsp.h                     |   3 +
 7 files changed, 1236 insertions(+)
 create mode 100644 examples/graphics/data/ir_e112_g12_dyn_us_6v6.wav
 create mode 100644 examples/graphics/source/examples/ConvolutionDemo.h
 create mode 100644 modules/yup_dsp/convolution/yup_PartitionedConvolver.cpp
 create mode 100644 modules/yup_dsp/convolution/yup_PartitionedConvolver.h

diff --git a/examples/graphics/data/ir_e112_g12_dyn_us_6v6.wav b/examples/graphics/data/ir_e112_g12_dyn_us_6v6.wav
new file mode 100644
index 0000000000000000000000000000000000000000..be8463d3aeab399d5330665e53987852d9df2c59
GIT binary patch
literal 4488
zcmXX~30zdw8vo9nJ2MOnAflkAf)r|ki5n_mMk%7=^1|E`F_+3Mdt#QQ=u6EtcguV#
z_FPhgA{7;v;^7)4TBt0FfGY|j2+qvC_k8a^{NC^U#+kWu?m6H0-_FH%2F1pv_z{hc
z88u+qta-srh)7`cPb2zzj39-YQ2d~Tk*#oj(6HFygA!s#`rvbX()?*Xd%xDB&ud*H
zB4|p|yrd|sqKI(Qaa&-Z43CNLF^3l~nCbhL5%^}Eb}is!UPjre79~Eef%fv|2U?D8
zk=yLHU$xH-k2vGzlgA|!3+9v*SG9HxcSSdBsdYSVRuNaa^X_*~4{J`#1mB5GUukl|
z^R6k=XzT1!H?y+(4&OXeaJ#7TL5Ra_-R?K8x!uptdQdx2bKri5J4Z?*DrP*Wc|5x&
zx$fBW%=!lnha1wLm(;ea?NhtIE~jCtYqZgqzoH%Di1>pO_4ggqpFgabUwyjj!>7~g
zN*$MEp=rCNmnGPAM3%ZjoE6Sg-AsQgh3Yf&esf#3laffK@=s%k5n*H+{dub>GOaSl
zTQ-`#O(Qr{o9?W3yzV-wuV!!MnW;|oG|yERnx-rLL>O)0vs^%Tm1^@k+hzMkpLCxL
zZ_VqNt-tkcbEMi<U8q{jhs{xzE0$F2JdbLR8jmcGLmm|#KDJ_y0FR$7zpML{YgEL4
z@;q84a+PA!CbdXqb*_1=Wsj9?7SCm#+dLO|CV2XI?y$w!LTy8Bmuz2oCU^zfQ|(3e
zJo_8=r=A^bpIZ|wr_77YOU!4?Z1%Afm=n$A>P~g9TA_|GE0%mqob?s!HjBfYY_3xy
z)hN?oB~2_8$HW+AqG_x8rFoNir@C8_{F(8WwpsgA@523+S>{S>n9bkz%-Yuys}4}+
z6R}19V))BY&Z8fdKB{WYQWu-dN+ES77l%*^-NNWY4`nZ7gBI!P=lszz&9T^Fa@IH_
zwe5P2@rC?V?v!2R3L{q!(9^YjS`+=g{?JI1CVnj6mrIRxdaf3&d1w*Zer>p3ug^4&
z8hOTAqe1Vkr)X`oewsszFtX%(R3*Gj<4oDgc(IXxHca{zR}WXHYlCZ|HcW3~%r)j1
z9gHkJUJuq|^yT^){kncZU!>P*y|t;Xb<R7E^^ScGyYpA)IajUbYdDQ;If{Sez5Eed
zd5WBC^w*ndtz1i-Q=Dg=i7tQbSM6P_U^luL3C0@ZDsYIDE#*aHzVU|PYuwNe=>O6G
z)`N{@hPON_C-EI_L%~$f$9NUb<+u2jjFnrAB0Wf7s=ccDY3;S4TB0^kYo~>4+q4P#
zV56I?l@ob7hwvG>K>kzim4=+fh1`ioPz;$Vm4kSnd{2&&+hic;@ERH~%*q^Pld?#O
zRT{)L(OcZ5o%8|4(GdECcGFL^j9#IWJd0yEn7uiWqj>@J){R!uMRHIJ;VsV5Kd1l|
zc4sfXBQxbUa*KQ_cXKkmB0dpu;v`jZYhEeG$=z~1PowoBL)l<*nl78#n-Y{QB3Sr~
zS;AA<r!dAr#jd<5X3!Dt#ZLJ`cH(8+jBe6FF+$m>3{jSeWSYtc<qWx5*2`)98`rZx
z4X1osAqFe8%4E}K(;ic*Dcy9-6r_$-=c;e1$4#Zm6A>yhXe}L}EA$ubq9{5IOv~hH
zc}s@z5}<LPt+@InouVr86)%hSqPY;lQ}h$-MVWX>>8^BCyp<Q?v8WJ5;+)ti5=0rz
zqHyX-pHne~i-Dq@xI#0?;I-U^>*Ygf=2+g%21nC7KxrrqqyaRB7Sln>qawOQTXEHg
z@_7>{azBpXFz&%)cpH~+6Y4~-l1jhvXFQbO;nn<*M^YwgtH-@xQ!KqiUgS@a^f9H;
zA5=(>$zKc>Ux{o{DZCVa#U-wYJ>pX_K<Koc!sskd<Q6PtYwWO`zv0C^lPB|79?41Q
ze=ncl1N<p43E_6!A2W(M9NitF=M*cpiek}18LoVwyrZ-O1~<e>u}gd-Qp7m0cNEoJ
z<vHA$HF;64kwauN`Or9L95zyo14h0P1T`L$f5^-7xIBnYPvijJ&jNTXgy)l}FZBR}
zpV1!51&=wjkCxL%G?Av`J&pdPCu9*V!AGPRCZ>p~`1dto7tg4Y3<?r&i_y3eEQ)9o
zy-RJ$!Ik_JnEOy0>Pqo6jb>8<P)_GKekKpgCGrdTgDjKHIg&>}*;_de%=e~Im^l~x
zZ-%l1#cc7D$OXr}#V(S(h9~jIe27(g9~_iYIb~Bay-ZiYP7vooi(krxa=F|i&&#Xw
zkQ^`TjdjK#W1#W1ao32Hi)Fri3HwjrDM0;m-pH9;hQyme8B|N1z)_3{5;tiDFlk3G
z_y%9$d+ZH8PJsirL$90Z3z~#qhtN3GITnfen7`$|T!WR%WK*!X9f{G2=7F&bbb~5@
zPA%zV6Rm(&wCE-*A{T4)C2y+Z8rGPo0ecifYnAxif;v(NQuqXT?8q+p2%}0mr8m^j
z9op>0{#-84$}IW2td>o<CltE`D%yls3c8Bp*P)t3WYuqMLY*sV2V4<K$2cBN$VB3#
z%RADBBxuccsL%^apTwUattRmdWXdA=d>y#h!Mk`Ls>tWN;4p{=A%{|EGIb$4F)F<V
zY!v8a82Fk@bLc~Q4=Apt@9A4wL!VM2jf4u`MFlbRIvm7Et4vPkqnynr`4`>^^hcmW
zI~(Zly3BzlPsnVHKcpr*!>{q^Vg(=O(|n$b_yx3Kf&zT$O<G0QNC-cnh-=s@0dAf}
z$Ehhaxk;>n_bPyIck+cVgXj(DdI@UUj|}~ZexR?wTW`4VH8?gJwG9H3QRIG-a$_Hb
zd`P6JGy(khBQFb3brGAP-tXx;cyv+&-J`R}uc_#7JhZnND`!B9X;A%4>W{Pth90}2
zrjfX-E2_8;6jwuW-MJ+<gR7%~^lD&n2^i+_56BcJPSZ2;fehjIcs*zEasG)@kv`w@
z9w3$uCf4!4fW?3KD%bII{)_(&ru%Rxzshf5wWW=7zd_plo3r>bs=fzZc~fifGny7a
z%}e0pq3An;df|OG>i-_~{u5QU1D`fj*&I0*gvvWmd*EQ8>z{cg6fptGl!QF`7CaPk
z6*K(Ylsw_?yZkr*j@JwB3LLWNKF*;UpmdD3;Cm9#83lC=#(pD!MH19=5WgDKTm%Xe
z&bJ)+X&%s?j5-z}f#zV&Sg<o0-cA65UQ`C;cA)p==r9E}Bq4#{1r7z=0hzRpzN4i`
z@g8tYS1>sOzD%YK^fOYp9FA)x!bBV4L?U0L)2J!~?5~Drd*TEx2llsE=a<3QC}0!=
z1w2Q8zd~<mcpAKkQ|JL4c>x|+3WNu66cjlF-p;{&PmpW(p#O__CfLK9;g(e7_C7Fm
ziqAo<7oeJ}{9k12T3!fmeT>?dqmv`xrX0KaQZwYBH}(vJGXltitQbU2u%?g&^_g+a
z8?)M=x9;e$1Li-&j_1L`Z{WWgsZCS^mQG{crJT(3;N0~%InH9|2Vlv+QHA}1<QQl=
z9y)&;?EBCIaF&BA-LtG%y%(I&66aezKL(@axLS==szip}!gmMG{4m_t4plt@J|&pd
zfRp|{Iy{1&Gw@!EGcp7-{NQm1GN(Cqezmc)m*ItQ_$wK?c^0ST1v*Np;O$FX`2^!r
zd~$#O0K5-?b9~X0N`YW=D6(WZbhHU)bmIU0`KZy2g+SkheeR);Y>fZHLpSiP3g>bm
zs>ug;B^Y-wu7QsXq)jRNf#LVD<7{vp34N=iu@lOD@qfhasJbomAAmmV(f4(%^B3-`
z!Ynr<Dw3xj-`pn}p~7(F%0MJh2mCEo031?~*2(ZoD)=Y{e$V+4{Jjmz>BHf8O+Y%I
z0b6c_OBz>q*IC}Uo)b>(2pvuY9{++9W`Z+!Vzvb@0T?d<-Hza(Cq@^1YKvsD!ha92
z&UyHKKeBW;&cxIB{*1lwoY)nkBkr?7GuNTUJake6)SRfk41HY&b~kaw3v+_OTqqa|
zz?w!QCiP(01(m))b;Y>%6w)Xid36HXIuA_Cv5NzD-@<QafbSmu4jx#Kvw0;{z5;&T
z0UaL3*vng>&G|T0mIDj7THI?~z}~e;k<Lhm5AoDA73_~e_fz2MDfk>m{h)^N$gRzI
z68RqK^>L#<qQK!u%>EELF&a-go#79n+D0c{KxU@HxksSw0@T+O_;o^eDpZ?~s@<7b
z0S0a0IRN?P2Mn8_V>cqtaK5?c_k~B|;D<n*$K}w{bud)}ZbIR}XnG3_y@~f;V4^*e
zwHefEM@Ctp`g(Zg5#I)?CsD^m?Bw=e5bg}YE;UePR%5RG0<~VpuNA=c8s=pKkweIk
zOSrcN*lWPN8oJHrTp*hbM$f~ig+Tfqa^L-Te+}|B3w*ETuYve(WONR8y^McPHO{_;
zokO6LKJd)ja9%9j?LnpZJqy2I1MW>xRWNWXKt&tD*e>kg2`szex(_n&HlBFSBV$gX
zm(!^6HqdeBR0KvxILC_rhwz5RqM_2McuxBQK3xUXeuNy2148Z-bu6+x1=&0gX&wc=
zdXf%J5wH(~vIilLh67J`s<i@&UU<3Dv?BFvsMdYT>%hW2bnGs0e<0>YLKl6Z>`<g)
z0Pb=ppeLB<2DJ=>8++ih2cFrgz(pnSG{9XeXut>S)u4}yc<T58+T6tl(c4A%pb)QI
WXeJF;(tyw<Tq%J%uj5sSjQbz-BD28&

literal 0
HcmV?d00001

diff --git a/examples/graphics/source/examples/ConvolutionDemo.h b/examples/graphics/source/examples/ConvolutionDemo.h
new file mode 100644
index 000000000..a5dc7256b
--- /dev/null
+++ b/examples/graphics/source/examples/ConvolutionDemo.h
@@ -0,0 +1,475 @@
+/*
+  ==============================================================================
+
+   This file is part of the YUP library.
+   Copyright (c) 2025 - kunitoki@gmail.com
+
+   YUP is an open source library subject to open-source licensing.
+
+   The code included in this file is provided under the terms of the ISC license
+   http://www.isc.org/downloads/software-support-policy/isc-license. Permission
+   to use, copy, modify, and/or distribute this software for any purpose with or
+   without fee is hereby granted provided that the above copyright notice and
+   this permission notice appear in all copies.
+
+   YUP IS PROVIDED "AS IS" WITHOUT ANY WARRANTY, AND ALL WARRANTIES, WHETHER
+   EXPRESSED OR IMPLIED, INCLUDING MERCHANTABILITY AND FITNESS FOR PURPOSE, ARE
+   DISCLAIMED.
+
+  ==============================================================================
+*/
+
+#pragma once
+
+#include <yup_dsp/yup_dsp.h>
+#include <yup_audio_formats/yup_audio_formats.h>
+#include <yup_audio_gui/yup_audio_gui.h>
+#include <yup_gui/yup_gui.h>
+
+#include <memory>
+#include <vector>
+#include <iostream>
+#include <atomic>
+
+//==============================================================================
+
+class ConvolutionDemo
+    : public yup::Component
+    , public yup::AudioIODeviceCallback
+    , public yup::Timer
+{
+public:
+    ConvolutionDemo()
+        : loadIRButton ("Load IR...")
+        , wetGainSlider (yup::Slider::LinearHorizontal)
+        , dryGainSlider (yup::Slider::LinearHorizontal)
+    {
+        formatManager.registerDefaultFormats();
+
+        // Configure the convolver layout first (presumably required before an IR is set - confirm)
+        convolver.setTypicalLayout (256, {256, 1024, 4096});
+
+        // Create the UI before loading: the IR loader updates the info label and waveform signal
+        createUI();
+
+        // Load default audio files (relies on the label and waveform signal created above)
+        loadAudioFile();
+        loadDefaultImpulseResponse();
+
+        // Audio device manager
+        audioDeviceManager.initialiseWithDefaultDevices (0, 2);
+
+        // Initialize smoothed values (reset again with the device rate in audioDeviceAboutToStart)
+        wetGain.reset (44100, 0.02);
+        dryGain.reset (44100, 0.02);
+        wetGain.setCurrentAndTargetValue (1.0f);
+        dryGain.setCurrentAndTargetValue (0.3f);
+
+        // Start timer for waveform updates
+        startTimerHz (30);
+    }
+
+    ~ConvolutionDemo() override
+    {
+        audioDeviceManager.removeAudioCallback (this);
+        audioDeviceManager.closeAudioDevice();
+    }
+
+    void resized() override
+    {
+        auto bounds = getLocalBounds().reduced (10);
+
+        // Top controls
+        auto topControls = bounds.removeFromTop (120);
+
+        // IR loading section
+        auto irSection = topControls.removeFromTop (60);
+        loadIRButton.setBounds (irSection.removeFromTop (30).reduced (5, 0));
+        irInfoLabel.setBounds (irSection.removeFromTop (25));
+
+        // Control sliders section
+        auto controlsSection = topControls;
+        auto wetSection = controlsSection.removeFromLeft (controlsSection.getWidth() / 2);
+        wetGainLabel.setBounds (wetSection.removeFromTop (25));
+        wetGainSlider.setBounds (wetSection.removeFromTop (30).reduced (5, 0));
+
+        dryGainLabel.setBounds (controlsSection.removeFromTop (25));
+        dryGainSlider.setBounds (controlsSection.removeFromTop (30).reduced (5, 0));
+
+        // IR waveform display takes remaining space
+        irWaveformDisplay.setBounds (bounds);
+    }
+
+    void visibilityChanged() override
+    {
+        if (! isVisible())
+            audioDeviceManager.removeAudioCallback (this);
+        else
+            audioDeviceManager.addAudioCallback (this);
+    }
+
+    void audioDeviceAboutToStart (yup::AudioIODevice* device) override
+    {
+        auto sampleRate = device->getCurrentSampleRate();
+
+        // Update smoothed values
+        wetGain.reset (sampleRate, 0.02);
+        dryGain.reset (sampleRate, 0.02);
+
+        // Reset convolver
+        convolver.reset();
+    }
+
+    void audioDeviceStopped() override
+    {
+    }
+
+    void audioDeviceIOCallbackWithContext (const float* const* inputChannelData,
+                                           int numInputChannels,
+                                           float* const* outputChannelData,
+                                           int numOutputChannels,
+                                           int numSamples,
+                                           const yup::AudioIODeviceCallbackContext& context) override
+    {
+        // Clear outputs
+        for (int ch = 0; ch < numOutputChannels; ++ch)
+        {
+            if (outputChannelData[ch] != nullptr)
+                yup::FloatVectorOperations::clear (outputChannelData[ch], numSamples);
+        }
+
+        if (numOutputChannels < 2 || audioBuffer.getNumSamples() == 0)
+            return;
+
+        // Prepare buffers for processing
+        tempDryBuffer.resize (static_cast<size_t> (numSamples));
+        tempWetBuffer.resize (static_cast<size_t> (numSamples));
+
+        // Process samples
+        const int totalSamples = audioBuffer.getNumSamples();
+        const int numChannels = audioBuffer.getNumChannels();
+
+        for (int i = 0; i < numSamples; ++i)
+        {
+            // Get the audio sample from the loaded file (mono to stereo if needed)
+            float audioSample = 0.0f;
+
+            if (numChannels == 1)
+            {
+                // Mono file
+                audioSample = audioBuffer.getSample (0, readPosition) * 0.5f;
+            }
+            else
+            {
+                // Stereo or multichannel - mix to mono
+                for (int ch = 0; ch < yup::jmin (2, numChannels); ++ch)
+                    audioSample += audioBuffer.getSample (ch, readPosition) * 0.5f;
+                audioSample /= yup::jmin (2, numChannels);
+            }
+
+            // Increment read position and wrap around for looping
+            readPosition++;
+            if (readPosition >= totalSamples)
+                readPosition = 0;
+
+            // Store dry signal
+            tempDryBuffer[static_cast<size_t> (i)] = audioSample;
+        }
+
+        // Process through convolver if IR is loaded
+        std::fill (tempWetBuffer.begin(), tempWetBuffer.end(), 0.0f);
+        if (hasImpulseResponse)
+            convolver.process (tempDryBuffer.data(), tempWetBuffer.data(), static_cast<size_t> (numSamples));
+
+        // Mix dry and wet signals with gains
+        for (int i = 0; i < numSamples; ++i)
+        {
+            float wetGainValue = wetGain.getNextValue();
+            float dryGainValue = dryGain.getNextValue();
+
+            float drySignal = tempDryBuffer[static_cast<size_t> (i)] * dryGainValue;
+            float wetSignal = tempWetBuffer[static_cast<size_t> (i)] * wetGainValue;
+            float mixedSignal = drySignal + wetSignal;
+
+            // Output to both channels (mono to stereo)
+            outputChannelData[0][i] = mixedSignal;
+            outputChannelData[1][i] = mixedSignal;
+        }
+    }
+
+    void timerCallback() override
+    {
+        // Update waveform display if needed
+        repaint();
+    }
+
+private:
+    void loadAudioFile()
+    {
+        // The demo "data" folder is resolved three directories above this source file
+        auto dataDir = yup::File (__FILE__)
+                           .getParentDirectory()
+                           .getParentDirectory()
+                           .getParentDirectory()
+                           .getChildFile ("data");
+
+        yup::File audioFile = dataDir.getChildFile ("break_boomblastic_92bpm.wav");
+        if (! audioFile.existsAsFile())
+        {
+            std::cerr << "Could not find break_boomblastic_92bpm.wav" << std::endl;
+            return;
+        }
+
+        // Load the audio file through the member format manager: its default formats
+        // are registered in the constructor before this method runs, so a second,
+        // shadowing local manager is unnecessary
+
+        if (auto reader = formatManager.createReaderFor (audioFile))
+        {
+            audioBuffer.setSize ((int) reader->numChannels, (int) reader->lengthInSamples);
+            reader->read (&audioBuffer, 0, (int) reader->lengthInSamples, 0, true, true);
+
+            std::cout << "Loaded audio file: " << audioFile.getFileName() << std::endl;
+            std::cout << "Sample rate: " << reader->sampleRate << " Hz" << std::endl;
+            std::cout << "Channels: " << reader->numChannels << std::endl;
+            std::cout << "Length: " << reader->lengthInSamples << " samples" << std::endl;
+        }
+        else
+        {
+            std::cerr << "Failed to create reader for audio file" << std::endl;
+        }
+    }
+
+    void loadDefaultImpulseResponse()
+    {
+        // Create the path to the default impulse response file
+        auto dataDir = yup::File (__FILE__)
+                           .getParentDirectory()
+                           .getParentDirectory()
+                           .getParentDirectory()
+                           .getChildFile ("data");
+
+        yup::File irFile = dataDir.getChildFile ("ir_e112_g12_dyn_us_6v6.wav");
+        loadImpulseResponseFromFile (irFile);
+    }
+
+    void loadImpulseResponseFromFile (const yup::File& file)
+    {
+        if (! file.existsAsFile())
+        {
+            std::cerr << "Could not find impulse response file: " << file.getFullPathName() << std::endl;
+            updateIRInfo ("No IR loaded");
+            return;
+        }
+
+        // Decode the whole file into impulseResponseBuffer using the member format manager
+        if (auto reader = formatManager.createReaderFor (file))
+        {
+            impulseResponseBuffer.setSize ((int) reader->numChannels, (int) reader->lengthInSamples);
+            reader->read (&impulseResponseBuffer, 0, (int) reader->lengthInSamples, 0, true, true);
+
+            // Downmix multichannel IRs to mono by averaging every channel into channel 0
+            if (impulseResponseBuffer.getNumChannels() > 1)
+            {
+                for (int i = 0; i < impulseResponseBuffer.getNumSamples(); ++i)
+                {
+                    float monoSample = 0.0f;
+                    for (int ch = 0; ch < impulseResponseBuffer.getNumChannels(); ++ch)
+                        monoSample += impulseResponseBuffer.getSample (ch, i);
+                    monoSample /= static_cast<float> (impulseResponseBuffer.getNumChannels());
+                    impulseResponseBuffer.setSample (0, i, monoSample);
+                }
+                impulseResponseBuffer.setSize (1, impulseResponseBuffer.getNumSamples(), true);
+            }
+
+            // Copy channel 0 into the flat vector that is handed to the convolver
+            const int numSamples = impulseResponseBuffer.getNumSamples();
+            impulseResponseData.resize (static_cast<size_t> (numSamples));
+
+            // Normalization hook: the gain is currently unity, so the IR is copied unscaled
+            float normalizationGain = 1.0f;
+            for (int i = 0; i < numSamples; ++i)
+                impulseResponseData[static_cast<size_t> (i)] = impulseResponseBuffer.getSample (0, i) * normalizationGain;
+
+            // Hand the IR to the convolver. NOTE(review): confirm this is safe while process() may be running
+            convolver.setImpulseResponse (impulseResponseData);
+            hasImpulseResponse = true;
+
+            std::cout << "Loaded impulse response: " << file.getFileName() << std::endl;
+            std::cout << "Sample rate: " << reader->sampleRate << " Hz" << std::endl;
+            std::cout << "Length: " << reader->lengthInSamples << " samples" << std::endl;
+
+            // Refresh the info label and the waveform plot
+            updateIRInfo (file.getFileName());
+            updateWaveformDisplay();
+        }
+        else
+        {
+            std::cerr << "Failed to create reader for impulse response file" << std::endl;
+            updateIRInfo ("Failed to load IR");
+        }
+    }
+
+    void createUI()
+    {
+        setOpaque (false);
+
+        // Get fonts
+        auto labelFont = yup::ApplicationTheme::getGlobalTheme()->getDefaultFont().withHeight (12.0f);
+        auto buttonFont = yup::ApplicationTheme::getGlobalTheme()->getDefaultFont().withHeight (14.0f);
+
+        // Load IR button
+        // loadIRButton.setFont (buttonFont);
+        loadIRButton.onClick = [this]
+        {
+            auto chooser = yup::FileChooser::create ("Load Impulse Response",
+                                                     yup::File(),
+                                                     "*.wav;*.aiff;*.aif");
+            chooser->browseForFileToOpen ([this] (bool success, const yup::Array<yup::File>& results)
+            {
+                if (success && results.size() > 0)
+                {
+                    loadImpulseResponseFromFile (results[0]);
+                }
+            });
+        };
+        addAndMakeVisible (loadIRButton);
+
+        // IR info label
+        irInfoLabel.setText ("Loading default IR...", yup::NotificationType::dontSendNotification);
+        irInfoLabel.setFont (labelFont);
+        irInfoLabel.setJustification (yup::Justification::center);
+        addAndMakeVisible (irInfoLabel);
+
+        // Wet gain slider
+        wetGainLabel.setText ("Wet Gain", yup::NotificationType::dontSendNotification);
+        wetGainLabel.setFont (labelFont);
+        addAndMakeVisible (wetGainLabel);
+
+        wetGainSlider.setRange (0.0, 2.0);
+        wetGainSlider.setValue (1.0);
+        wetGainSlider.onValueChanged = [this] (float value)
+        {
+            wetGain.setTargetValue (value);
+        };
+        addAndMakeVisible (wetGainSlider);
+
+        // Dry gain slider
+        dryGainLabel.setText ("Dry Gain", yup::NotificationType::dontSendNotification);
+        dryGainLabel.setFont (labelFont);
+        addAndMakeVisible (dryGainLabel);
+
+        dryGainSlider.setRange (0.0, 2.0);
+        dryGainSlider.setValue (0.3);
+        dryGainSlider.onValueChanged = [this] (float value)
+        {
+            dryGain.setTargetValue (value);
+        };
+        addAndMakeVisible (dryGainSlider);
+
+        // Configure IR waveform display
+        setupWaveformDisplay();
+        addAndMakeVisible (irWaveformDisplay);
+    }
+
+    void setupWaveformDisplay()
+    {
+        // Configure the CartesianPlane for waveform display
+        irWaveformDisplay.setTitle ("Impulse Response Waveform");
+
+        // Set linear axes
+        irWaveformDisplay.setXRange (0.0, 1.0);
+        irWaveformDisplay.setXScaleType (yup::CartesianPlane::AxisScaleType::linear);
+        irWaveformDisplay.setYRange (-1.0, 1.0);
+        irWaveformDisplay.setYScaleType (yup::CartesianPlane::AxisScaleType::linear);
+
+        // Set margins
+        irWaveformDisplay.setMargins (25, 25, 25, 25);
+
+        // Add grid lines
+        irWaveformDisplay.setVerticalGridLines ({ 0.0, 1.0 });
+        irWaveformDisplay.setHorizontalGridLines ({ -1.0, -0.5, 0.5, 1.0 });
+        irWaveformDisplay.addHorizontalGridLine (0.0, yup::Color (0xFF666666), 1.0f, true);
+
+        irWaveformDisplay.clearXAxisLabels();
+        irWaveformDisplay.setYAxisLabels ({ -1.0, -0.5, 0.5, 1.0 });
+
+        // Add waveform signal
+        waveformSignalIndex = irWaveformDisplay.addSignal ("IR", yup::Color (0xFF44AA44), 1.5f);
+
+        // Configure legend
+        irWaveformDisplay.setLegendVisible (false);
+    }
+
+    void updateWaveformDisplay()
+    {
+        if (impulseResponseData.empty())
+            return;
+
+        // The IR sample rate is not tracked here, so time is derived assuming 44.1kHz
+        constexpr double assumedSampleRate = 44100.0;
+
+        // Decimate to at most 2048 points (stride >= 1, so i * stride always stays in range)
+        const size_t numPoints = std::min (static_cast<size_t> (2048), impulseResponseData.size());
+        const size_t stride = impulseResponseData.size() / numPoints;
+
+        std::vector<yup::Point<double>> waveformData;
+        waveformData.reserve (numPoints);
+
+        for (size_t i = 0; i < numPoints; ++i)
+        {
+            // Use seconds on X so the points agree with the X range set below
+            const size_t sampleIndex = i * stride;
+            const double timeInSeconds = static_cast<double> (sampleIndex) / assumedSampleRate;
+            const double amplitude = static_cast<double> (impulseResponseData[sampleIndex]);
+            waveformData.emplace_back (timeInSeconds, amplitude);
+        }
+
+        // Update the display
+        irWaveformDisplay.updateSignalData (waveformSignalIndex, waveformData);
+
+        // Update X axis range to show time
+        const double lengthInSeconds = static_cast<double> (impulseResponseData.size()) / assumedSampleRate;
+        irWaveformDisplay.setXRange (0.0, lengthInSeconds);
+
+        // Update X axis labels to show time
+        std::vector<double> timeLabels;
+        for (int i = 0; i <= 4; ++i)
+            timeLabels.push_back (lengthInSeconds * static_cast<double> (i) / 4.0);
+        irWaveformDisplay.setXAxisLabels (timeLabels);
+    }
+
+    void updateIRInfo (const yup::String& info)
+    {
+        irInfoLabel.setText (info, yup::NotificationType::dontSendNotification);
+    }
+
+    // Audio
+    yup::AudioFormatManager formatManager;
+    yup::AudioDeviceManager audioDeviceManager;
+    yup::AudioBuffer<float> audioBuffer;
+    yup::AudioBuffer<float> impulseResponseBuffer;
+    std::vector<float> impulseResponseData;
+    int readPosition = 0;
+    std::atomic<bool> hasImpulseResponse = false;
+
+    // Processing
+    yup::PartitionedConvolver convolver;
+    std::vector<float> tempDryBuffer;
+    std::vector<float> tempWetBuffer;
+
+    // Smoothed parameters
+    yup::SmoothedValue<float> wetGain, dryGain;
+
+    // UI
+    yup::TextButton loadIRButton;
+    yup::Label irInfoLabel;
+    yup::Label wetGainLabel;
+    yup::Slider wetGainSlider;
+    yup::Label dryGainLabel;
+    yup::Slider dryGainSlider;
+    yup::CartesianPlane irWaveformDisplay;
+
+    // Display
+    int waveformSignalIndex = -1;
+};
diff --git a/examples/graphics/source/main.cpp b/examples/graphics/source/main.cpp
index 34e405208..f6d4d6084 100644
--- a/examples/graphics/source/main.cpp
+++ b/examples/graphics/source/main.cpp
@@ -40,6 +40,7 @@
 #include "examples/Artboard.h"
 #include "examples/Audio.h"
 #include "examples/CrossoverDemo.h"
+#include "examples/ConvolutionDemo.h"
 #include "examples/FilterDemo.h"
 #include "examples/LayoutFonts.h"
 #include "examples/FileChooser.h"
@@ -107,6 +108,7 @@ class CustomWindow
         registerDemo<SpectrumAnalyzerDemo> ("FFT Analyzer", counter++);
         registerDemo<FilterDemo> ("Filter Demo", counter++);
         registerDemo<CrossoverDemo> ("Crossover Demo", counter++);
+        registerDemo<ConvolutionDemo> ("Convolution Demo", counter++);
         registerDemo<LayoutFontsExample> ("Layout Fonts", counter++);
         registerDemo<VariableFontsExample> ("Variable Fonts", counter++);
         registerDemo<PathsExample> ("Paths", counter++);
diff --git a/modules/yup_dsp/convolution/yup_PartitionedConvolver.cpp b/modules/yup_dsp/convolution/yup_PartitionedConvolver.cpp
new file mode 100644
index 000000000..09319eba6
--- /dev/null
+++ b/modules/yup_dsp/convolution/yup_PartitionedConvolver.cpp
@@ -0,0 +1,593 @@
+/*
+  ==============================================================================
+
+   This file is part of the YUP library.
+   Copyright (c) 2025 - kunitoki@gmail.com
+
+   YUP is an open source library subject to open-source licensing.
+
+   The code included in this file is provided under the terms of the ISC license
+   http://www.isc.org/downloads/software-support-policy/isc-license. Permission
+   to use, copy, modify, and/or distribute this software for any purpose with or
+   without fee is hereby granted provided that the above copyright notice and
+   this permission notice appear in all copies.
+
+   YUP IS PROVIDED "AS IS" WITHOUT ANY WARRANTY, AND ALL WARRANTIES, WHETHER
+   EXPRESSED OR IMPLIED, INCLUDING MERCHANTABILITY AND FITNESS FOR PURPOSE, ARE
+   DISCLAIMED.
+
+  ==============================================================================
+*/
+
+#include <atomic>
+#include <thread>
+
+namespace yup
+{
+
+//==============================================================================
+// Complex multiply-accumulate for interleaved real/imaginary format
+// Y += A * B (complex multiplication)
+//==============================================================================
+static void complexMultiplyAccumulate (const float* A, const float* B, float* Y, int complexPairs)
+{
+    // Walk the three interleaved (re, im) arrays one complex value at a time
+    const float* lhs = A;
+    const float* rhs = B;
+    float* acc = Y;
+
+    for (int remaining = complexPairs; remaining > 0; --remaining, lhs += 2, rhs += 2, acc += 2)
+    {
+        const float lhsRe = lhs[0], lhsIm = lhs[1];
+        const float rhsRe = rhs[0], rhsIm = rhs[1];
+        acc[0] += lhsRe * rhsRe - lhsIm * rhsIm; // real part of the product
+        acc[1] += lhsRe * rhsIm + lhsIm * rhsRe; // imaginary part of the product
+    }
+}
+
+//==============================================================================
+// DirectFIR - Brute-force FIR implementation for early taps
+//==============================================================================
+class PartitionedConvolver::DirectFIR
+{
+public:
+    DirectFIR() = default;
+
+    /** Stores the taps, pre-multiplies them by `scaling` and clears the input history. */
+    void setTaps (std::vector<float> taps, float scaling)
+    {
+        taps_ = std::move (taps);
+        FloatVectorOperations::multiply (taps_.data(), scaling, taps_.size());
+        history_.assign (taps_.size(), 0.0f);
+        writeIndex_ = 0;
+    }
+
+    /** Zeroes the input history and rewinds the write cursor; the taps are kept. */
+    void reset()
+    {
+        writeIndex_ = 0;
+        std::fill (history_.begin(), history_.end(), 0.0f);
+    }
+
+    /** Adds the FIR response of `input` to `output`, one sample at a time. */
+    void process (const float* input, float* output, std::size_t numSamples)
+    {
+        const std::size_t tapCount = taps_.size();
+        if (tapCount == 0)
+            return;
+
+        for (std::size_t n = 0; n < numSamples; ++n)
+        {
+            // The newest sample goes at the write cursor of the circular history
+            history_[writeIndex_] = input[n];
+
+            // y[n] = sum(h[k] * x[n-k]), reading the history backwards from the cursor
+            float accumulator = 0.0f;
+            std::size_t cursor = writeIndex_;
+            for (std::size_t k = 0; k < tapCount; ++k)
+            {
+                accumulator += taps_[k] * history_[cursor];
+                cursor = (cursor == 0) ? tapCount - 1 : cursor - 1;
+            }
+
+            output[n] += accumulator;
+
+            // Step the write cursor forward, wrapping at the end of the history
+            writeIndex_ = (writeIndex_ + 1 == tapCount) ? 0 : writeIndex_ + 1;
+        }
+    }
+
+    /** Number of taps currently loaded. */
+    std::size_t getNumTaps() const { return taps_.size(); }
+
+private:
+    std::vector<float> taps_;     // Scaled filter coefficients
+    std::vector<float> history_;  // Circular buffer of past inputs, same length as taps_
+    std::size_t writeIndex_ = 0;  // Slot in history_ that receives the next input sample
+};
+
+//==============================================================================
+// FFTLayer - Single uniform-partitioned OLA layer
+//==============================================================================
+class PartitionedConvolver::FFTLayer
+{
+public:
+    FFTLayer() = default;
+    ~FFTLayer() = default;
+
+    FFTLayer (FFTLayer&& other) = default;
+    FFTLayer& operator= (FFTLayer&& other) = default;
+
+    void configure (int hopSize)
+    {
+        hopSize_ = hopSize;     // partition length in samples
+        fftSize_ = 2 * hopSize; // FFT frame is twice the partition length
+        fdlIndex_ = 0;
+        fftProcessor_.setSize (fftSize_);
+        fftProcessor_.setScaling (FFTProcessor::FFTScaling::asymmetric);
+        // Spectra are stored as fftSize_ * 2 floats; tempBuffer_ shares that size for the in-place FFT
+        const auto hopLength = static_cast<std::size_t> (hopSize_);
+        const auto spectrumLength = static_cast<std::size_t> (fftSize_) * 2;
+        overlapBuffer_.assign (hopLength, 0.0f);
+        timeBuffer_.assign (static_cast<std::size_t> (fftSize_), 0.0f);
+        frequencyBuffer_.assign (spectrumLength, 0.0f);
+        tempBuffer_.assign (spectrumLength, 0.0f);
+        configured_ = true;
+    }
+
+    int getHopSize() const { return hopSize_; }
+    int getFFTSize() const { return fftSize_; }
+    bool isConfigured() const { return configured_; }
+
+    /**
+        Splits the impulse response into hopSize_-sample partitions and stores the FFT of each one.
+
+        @param impulseResponse  Samples to partition; nullptr (or a zero length) leaves the layer empty
+        @param length           Number of samples available at impulseResponse
+        @param scaling          Gain multiplied into every sample before it is transformed
+
+        @return The number of impulse response samples stored in this layer.
+    */
+    std::size_t setImpulseResponse (const float* impulseResponse, std::size_t length, float scaling)
+    {
+        jassert (configured_);
+
+        // Without valid sizes nothing can be partitioned: only the processing state is cleared
+        if (fftSize_ <= 0 || hopSize_ <= 0)
+        {
+            resetState();
+            return 0;
+        }
+
+        frequencyPartitions_.clear();
+        frequencyDelayLine_.clear();
+
+        if (length == 0 || impulseResponse == nullptr)
+        {
+            resetState();
+            return 0;
+        }
+
+        const auto hop = static_cast<std::size_t> (hopSize_);
+        const auto spectrumLength = static_cast<std::size_t> (fftSize_) * 2;
+        const auto numPartitions = (length + hop - 1) / hop; // ceil (length / hop), >= 1 because length > 0
+
+        std::size_t processedSamples = 0;
+        frequencyPartitions_.reserve (numPartitions);
+
+        for (std::size_t p = 0; p < numPartitions; ++p)
+        {
+            const std::size_t offset = p * hop;
+            const std::size_t copyCount = std::min (hop, length - offset); // last partition may be short
+
+            // Scaled partition followed by zeros up to the end of the buffer
+            std::fill (tempBuffer_.begin(), tempBuffer_.end(), 0.0f);
+            for (std::size_t i = 0; i < copyCount; ++i)
+                tempBuffer_[i] = impulseResponse[offset + i] * scaling;
+
+            std::vector<float> partition (spectrumLength);
+            fftProcessor_.performRealFFTForward (tempBuffer_.data(), partition.data());
+            frequencyPartitions_.push_back (std::move (partition));
+
+            processedSamples += copyCount;
+        }
+
+        // One delay-line slot per partition; resetState() also rewinds fdlIndex_
+        frequencyDelayLine_.assign (numPartitions, std::vector<float> (spectrumLength, 0.0f));
+        resetState();
+
+        return processedSamples;
+    }
+
+    void resetState()
+    {
+        // Zeroes a buffer in place without changing its size
+        const auto zero = [] (std::vector<float>& buffer) { std::fill (buffer.begin(), buffer.end(), 0.0f); };
+        for (auto& spectrum : frequencyDelayLine_)
+            zero (spectrum);
+        zero (overlapBuffer_);
+        zero (timeBuffer_);
+        zero (frequencyBuffer_);
+        fdlIndex_ = 0;
+    }
+
+    void processHop (const float* inputHop, float* outputAccumulator)
+    {
+        jassert (configured_);
+        // Reads hopSize_ samples from inputHop and adds hopSize_ convolved samples into outputAccumulator
+        if (frequencyPartitions_.empty())
+            return; // no impulse response loaded, nothing to add
+
+        // 1) Transform current input hop to frequency domain (hop followed by zeros, FFT done in place)
+        std::fill (tempBuffer_.begin(), tempBuffer_.end(), 0.0f);
+        for (int i = 0; i < hopSize_; ++i)
+            tempBuffer_[i] = inputHop[i];
+
+        fftProcessor_.performRealFFTForward (tempBuffer_.data(), tempBuffer_.data());
+
+        // 2) Step the delay-line index backwards (with wrap) and copy the full complex buffer there, so fdlIndex_ is the newest spectrum
+        fdlIndex_ = (fdlIndex_ == 0) ? static_cast<int> (frequencyDelayLine_.size()) - 1 : fdlIndex_ - 1;
+        std::copy (tempBuffer_.begin(), tempBuffer_.begin() + (fftSize_ * 2), frequencyDelayLine_[static_cast<std::size_t> (fdlIndex_)].begin());
+
+        // 3) Frequency domain convolution: Y = sum(X[k-p] * H[p]), pairing partition p with the spectrum stored p hops ago
+        std::fill (frequencyBuffer_.data(), frequencyBuffer_.data() + (fftSize_ * 2), 0.0f);
+
+        int xIndex = fdlIndex_;
+        for (std::size_t p = 0; p < frequencyPartitions_.size(); ++p)
+        {
+            const float* X = frequencyDelayLine_[static_cast<std::size_t> (xIndex)].data();
+            const float* H = frequencyPartitions_[p].data();
+
+            // NOTE(review): only fftSize_ / 2 interleaved pairs are accumulated; a real FFT has fftSize_ / 2 + 1 unique bins - confirm against FFTProcessor's output layout that none is skipped
+            complexMultiplyAccumulate (X, H, frequencyBuffer_.data(), fftSize_ / 2);
+
+            // Move to next older spectrum
+            xIndex++;
+            if (xIndex >= static_cast<int> (frequencyDelayLine_.size()))
+                xIndex = 0;
+        }
+
+        // 4) Inverse FFT back to time domain
+        fftProcessor_.performRealFFTInverse (frequencyBuffer_.data(), timeBuffer_.data());
+
+        // 5) Overlap-Add: output first hopSize_ samples plus the saved tail, store last hopSize_ as overlap for the next hop
+        for (int i = 0; i < hopSize_; ++i)
+        {
+            outputAccumulator[i] += timeBuffer_[i] + overlapBuffer_[i];
+            overlapBuffer_[i] = timeBuffer_[i + hopSize_];
+        }
+    }
+
+    bool hasImpulseResponse() const { return !frequencyPartitions_.empty(); }
+
+private:
+    int hopSize_ = 0;
+    int fftSize_ = 0;
+
+    FFTProcessor fftProcessor_;
+
+    // IR partitions in frequency domain
+    std::vector<std::vector<float>> frequencyPartitions_;
+
+    // Frequency Delay Line (most recent at fdlIndex_)
+    std::vector<std::vector<float>> frequencyDelayLine_;
+    int fdlIndex_ = 0;
+
+    // Processing buffers
+    std::vector<float> overlapBuffer_;
+    std::vector<float> timeBuffer_;
+    std::vector<float> frequencyBuffer_;
+    std::vector<float> tempBuffer_;
+
+    bool configured_ = false;
+};
+
+//==============================================================================
+// PartitionedConvolver::Impl - Implementation details
+//==============================================================================
+class PartitionedConvolver::Impl
+{
+public:
+    Impl() = default;
+    ~Impl() = default;
+
+    void configureLayers (std::size_t directFIRTaps, const std::vector<LayerSpec>& layers)
+    {
+        directFIRTapCount_ = directFIRTaps;
+        layers_.clear();
+        layers_.resize (layers.size());
+        // Rebuilds one FFTLayer per spec and records the smallest hop size as the base processing hop
+        std::size_t maximumHopSize = 0;
+        // NOTE(review): maximumHopSize is std::size_t while LayerSpec::hopSize is int, so the std::max call below gets two different argument types - check that this compiles
+        baseHopSize_ = layers.empty() ? 0 : layers.front().hopSize; // 0 when there are no FFT layers
+        for (std::size_t i = 0; i < layers.size(); ++i)
+        {
+            layers_[i].configure (layers[i].hopSize);
+            if (i == 0)
+                baseHopSize_ = layers[i].hopSize;
+            else
+                baseHopSize_ = std::min (baseHopSize_, layers[i].hopSize);
+
+            maximumHopSize = std::max (maximumHopSize, layers[i].hopSize);
+        }
+
+        // Prepare staging buffers
+        inputStaging_.clear();
+        outputStaging_.assign (static_cast<std::size_t> (baseHopSize_), 0.0f);
+        inputCarry_.clear();
+
+        // Prepare per-layer accumulators
+        layerInputAccumulators_.assign (layers.size(), std::vector<float>());
+        layerOutputCarries_.assign (layers.size(), std::vector<float>());
+        layerTempOutput_.resize (maximumHopSize);
+    }
+
+    void setImpulseResponse (const float* impulseResponse, std::size_t length, const PartitionedConvolver::IRLoadOptions& options)
+    {
+        // Replaces the impulse response used by the direct FIR stage and by every FFT layer.
+        //
+        // The first directFIRTapCount_ samples become direct FIR taps and the rest is handed to the
+        // FFT layers in order. options.headroomDb is always applied as a gain; with options.normalize
+        // that gain is additionally divided by the absolute peak of the impulse response.
+        //
+        // Everything is built off to the side and only exchanged with the live state under the lock.
+        DirectFIR newFIR;
+        std::vector<FFTLayer> newLayers (layers_.size());
+
+        // Replacement layers always inherit the configured hop sizes, even when the impulse response
+        // is being cleared: a layer left with a zero hop size would make processUnsafe() spin forever.
+        for (std::size_t i = 0; i < newLayers.size(); ++i)
+            newLayers[i].configure (layers_[i].getHopSize());
+
+        if (impulseResponse != nullptr && length > 0)
+        {
+            // Linear gain for the requested headroom
+            float headroomScale = std::pow (10.0f, options.headroomDb / 20.0f);
+            if (options.normalize)
+            {
+                const auto minMax = FloatVectorOperations::findMinAndMax (impulseResponse, length);
+
+                const float peak = std::max (std::abs (minMax.getStart()), std::abs (minMax.getEnd()));
+                if (peak > 0.0f)
+                    headroomScale /= peak;
+            }
+
+            // Leading taps go to the direct FIR stage
+            const auto directTapsCount = std::min (directFIRTapCount_, length);
+            std::vector<float> directTaps (impulseResponse, impulseResponse + directTapsCount);
+            newFIR.setTaps (std::move (directTaps), headroomScale);
+
+            // Whatever follows is offered to the FFT layers in order
+            std::size_t consumed = directTapsCount;
+            for (auto& layer : newLayers)
+            {
+                if (consumed >= length)
+                    break; // nothing left: the remaining layers stay empty
+
+                const std::size_t remaining = length - consumed;
+                consumed += layer.setImpulseResponse (impulseResponse + consumed, remaining, headroomScale);
+            }
+        }
+
+        {
+            SpinLock::ScopedLockType lock (processingLock_);
+
+            // Swap instead of move-assigning: the previous FIR and layers end up in the locals and
+            // are destroyed when this function returns, after the lock has been released.
+            std::swap (directFIR_, newFIR);
+            layers_.swap (newLayers);
+
+            resetStateUnsafe();
+        }
+    }
+
+    void reset()
+    {
+        // Serialised against process() and setImpulseResponse() through the same lock
+        const SpinLock::ScopedLockType guard (processingLock_);
+        resetStateUnsafe();
+    }
+
+    void process (const float* input, float* output, std::size_t numSamples)
+    {
+        if (numSamples > 0) // empty blocks never touch the lock
+        {
+            const SpinLock::ScopedLockType guard (processingLock_);
+            processUnsafe (input, output, numSamples);
+        }
+    }
+
+private:
+    void resetStateUnsafe()
+    {
+        // Does no locking itself; reset() and setImpulseResponse() call it with processingLock_ held
+        directFIR_.reset();
+        for (auto& fftLayer : layers_)
+            fftLayer.resetState();
+
+        // Drop any buffered input and zero the staged output
+        inputCarry_.clear();
+        inputStaging_.clear();
+        std::fill (outputStaging_.begin(), outputStaging_.end(), 0.0f);
+        for (std::size_t i = 0; i < layerInputAccumulators_.size(); ++i)
+            layerInputAccumulators_[i].clear();
+        for (std::size_t i = 0; i < layerOutputCarries_.size(); ++i)
+            layerOutputCarries_[i].clear();
+    }
+
+    void processUnsafe (const float* input, float* output, std::size_t numSamples)
+    {
+        ensureWorkingBuffers (numSamples); // TODO - move outside of process
+
+        FloatVectorOperations::copy (workingInput_.data(), input, numSamples);
+        FloatVectorOperations::clear (workingOutput_.data(), numSamples);
+        // From here on everything is accumulated into workingOutput_, which is added to `output` at the end
+        // Stage 1: direct FIR on the whole block (no block size constraints)
+        directFIR_.process (workingInput_.data(), workingOutput_.data(), numSamples);
+        if (layers_.empty())
+        {
+            FloatVectorOperations::add (output, workingOutput_.data(), numSamples);
+            return;
+        }
+
+        // Stage 2: FFT layers. Input is queued in inputCarry_ and consumed baseHopSize_ samples at a time
+        appendToBuffer (inputCarry_, workingInput_.data(), numSamples);
+
+        std::size_t outputSamplesProduced = 0;
+        while (inputCarry_.size() >= static_cast<std::size_t> (baseHopSize_))
+        {
+            const std::size_t hopSize = static_cast<std::size_t> (baseHopSize_);
+            inputStaging_.assign (inputCarry_.begin(), inputCarry_.begin() + hopSize);
+
+            FloatVectorOperations::clear (outputStaging_.data(), outputStaging_.size());
+
+            for (std::size_t layerIndex = 0; layerIndex < layers_.size(); ++layerIndex)
+            {
+                auto& layer = layers_[layerIndex];
+                const int layerHopSize = layer.getHopSize();
+                // Each layer queues the same input and runs whenever it has gathered one full hop of its own size
+                appendToBuffer (layerInputAccumulators_[layerIndex], inputStaging_.data(), hopSize);
+
+                while (layerInputAccumulators_[layerIndex].size() >= static_cast<std::size_t> (layerHopSize))
+                {
+                    tempLayerHop_.assign (layerInputAccumulators_[layerIndex].begin(),
+                                          layerInputAccumulators_[layerIndex].begin() + layerHopSize);
+
+                    FloatVectorOperations::clear (layerTempOutput_.data(), layerHopSize);
+
+                    if (layer.hasImpulseResponse())
+                        layer.processHop (tempLayerHop_.data(), layerTempOutput_.data());
+
+                    appendToBuffer (layerOutputCarries_[layerIndex], layerTempOutput_.data(), static_cast<std::size_t> (layerHopSize));
+
+                    layerInputAccumulators_[layerIndex].erase (
+                        layerInputAccumulators_[layerIndex].begin(),
+                        layerInputAccumulators_[layerIndex].begin() + layerHopSize);
+                }
+                // Release at most one base hop of this layer's queued output per iteration
+                if (layerOutputCarries_[layerIndex].size() >= hopSize)
+                {
+                    FloatVectorOperations::add (outputStaging_.data(), layerOutputCarries_[layerIndex].data(), hopSize);
+
+                    layerOutputCarries_[layerIndex].erase (
+                        layerOutputCarries_[layerIndex].begin(),
+                        layerOutputCarries_[layerIndex].begin() + hopSize);
+                }
+            }
+            // NOTE(review): when earlier calls left samples in inputCarry_, more hops can complete than fit in this block; samplesToWrite then shrinks (down to 0) and that part of outputStaging_ is not kept - confirm intended
+            // Add staging output to main output, clamped to the room left in this block
+            const std::size_t samplesToWrite = std::min (hopSize, numSamples - outputSamplesProduced);
+            FloatVectorOperations::add (workingOutput_.data() + outputSamplesProduced, outputStaging_.data(), samplesToWrite);
+            outputSamplesProduced += samplesToWrite;
+
+            // Remove processed input from carry buffer
+            inputCarry_.erase (inputCarry_.begin(), inputCarry_.begin() + hopSize);
+        }
+
+        // Copy final result to output (accumulate)
+        FloatVectorOperations::add (output, workingOutput_.data(), numSamples);
+    }
+
+private:
+    void ensureWorkingBuffers (std::size_t numSamples)
+    {
+        if (workingInput_.size() < numSamples)
+            workingInput_.resize (numSamples);
+
+        if (workingOutput_.size() < numSamples)
+            workingOutput_.resize (numSamples);
+    }
+
+    void appendToBuffer (std::vector<float>& buffer, const float* data, std::size_t numSamples)
+    {
+        // Appends numSamples values read from data to the end of buffer.
+        //
+        // data must not point into buffer itself: growing the vector may reallocate its storage.
+        const float* const dataEnd = data + numSamples;
+        buffer.insert (buffer.end(), data, dataEnd);
+    }
+
+    std::size_t directFIRTapCount_ = 0;
+    int baseHopSize_ = 0;
+
+    DirectFIR directFIR_;
+    std::vector<FFTLayer> layers_;
+
+    // Working buffers
+    std::vector<float> workingInput_;
+    std::vector<float> workingOutput_;
+
+    // Staging for hop-based processing
+    std::vector<float> inputCarry_;
+    std::vector<float> inputStaging_;
+    std::vector<float> outputStaging_;
+
+    // Per-layer buffering
+    std::vector<std::vector<float>> layerInputAccumulators_;
+    std::vector<std::vector<float>> layerOutputCarries_;
+    std::vector<float> tempLayerHop_;
+    std::vector<float> layerTempOutput_;
+
+    mutable SpinLock processingLock_;
+};
+
+//==============================================================================
+// PartitionedConvolver implementation
+//==============================================================================
+
+PartitionedConvolver::PartitionedConvolver()
+    : pImpl (std::make_unique<Impl>())
+{
+}
+
+PartitionedConvolver::~PartitionedConvolver() = default;
+
+PartitionedConvolver::PartitionedConvolver (PartitionedConvolver&& other) noexcept
+    : pImpl (std::move (other.pImpl))
+{
+}
+
+PartitionedConvolver& PartitionedConvolver::operator= (PartitionedConvolver&& other) noexcept
+{
+    if (this != &other)
+        pImpl = std::move (other.pImpl);
+    return *this;
+}
+
+void PartitionedConvolver::configureLayers (std::size_t directFIRTaps, const std::vector<LayerSpec>& layers)
+{
+    pImpl->configureLayers (directFIRTaps, layers);
+}
+
+void PartitionedConvolver::setTypicalLayout (std::size_t directTaps, const std::vector<int>& hops)
+{
+    // Wrap every hop size in a LayerSpec, preserving the caller's order
+    std::vector<LayerSpec> specs (hops.size());
+    for (std::size_t index = 0; index < hops.size(); ++index)
+        specs[index].hopSize = hops[index];
+
+    // The actual configuration is shared with configureLayers()
+    configureLayers (directTaps, specs);
+}
+
+void PartitionedConvolver::setImpulseResponse (const float* impulseResponse, std::size_t length, const IRLoadOptions& options)
+{
+    pImpl->setImpulseResponse (impulseResponse, length, options);
+}
+
+void PartitionedConvolver::setImpulseResponse (const std::vector<float>& impulseResponse, const IRLoadOptions& options)
+{
+    setImpulseResponse (impulseResponse.data(), impulseResponse.size(), options);
+}
+
+void PartitionedConvolver::reset()
+{
+    pImpl->reset();
+}
+
+void PartitionedConvolver::process (const float* input, float* output, std::size_t numSamples)
+{
+    pImpl->process (input, output, numSamples);
+}
+
+} // namespace yup
diff --git a/modules/yup_dsp/convolution/yup_PartitionedConvolver.h b/modules/yup_dsp/convolution/yup_PartitionedConvolver.h
new file mode 100644
index 000000000..69c384ca1
--- /dev/null
+++ b/modules/yup_dsp/convolution/yup_PartitionedConvolver.h
@@ -0,0 +1,160 @@
+/*
+  ==============================================================================
+
+   This file is part of the YUP library.
+   Copyright (c) 2025 - kunitoki@gmail.com
+
+   YUP is an open source library subject to open-source licensing.
+
+   The code included in this file is provided under the terms of the ISC license
+   http://www.isc.org/downloads/software-support-policy/isc-license. Permission
+   to use, copy, modify, and/or distribute this software for any purpose with or
+   without fee is hereby granted provided that the above copyright notice and
+   this permission notice appear in all copies.
+
+   YUP IS PROVIDED "AS IS" WITHOUT ANY WARRANTY, AND ALL WARRANTIES, WHETHER
+   EXPRESSED OR IMPLIED, INCLUDING MERCHANTABILITY AND FITNESS FOR PURPOSE, ARE
+   DISCLAIMED.
+
+  ==============================================================================
+*/
+
+#pragma once
+
+namespace yup
+{
+
+//==============================================================================
+/**
+    Layered partitioned convolution engine optimized for real-time audio processing.
+    
+    Combines multiple processing strategies for efficient convolution:
+    - Direct FIR computation for early taps (low latency)
+    - One or more FFT-based Overlap-Add layers with uniform partitioning per layer
+    
+    The engine uses YUP's FFTProcessor for real FFT operations and supports:
+    - Arbitrary input/output block sizes with internal buffering
+    - Processing guarded by a spin lock (note: process() can still grow internal buffers, so it is not yet allocation-free)
+    - Configurable layer hierarchy for optimal CPU/latency trade-off
+    
+    Example usage:
+    @code
+    PartitionedConvolver convolver;
+    
+    // Configure layers: 256 direct taps + FFT layers with hops 256, 1024, 4096
+    convolver.setTypicalLayout(256, {256, 1024, 4096});
+    
+    // Set impulse response (e.g., reverb IR)
+    std::vector<float> impulseResponse = loadImpulseResponse();
+    convolver.setImpulseResponse(impulseResponse);
+    
+    // In audio callback (accumulates into output):
+    convolver.process(inputBuffer, outputBuffer, numSamples);
+    @endcode
+    
+    @note The process() method accumulates results into the output buffer.
+          Clear the output buffer first if overwrite behavior is desired.
+*/
+class PartitionedConvolver
+{
+public:
+    //==============================================================================
+    /** Configuration for a single FFT-based convolution layer */
+    struct LayerSpec
+    {
+        int hopSize;  /**< Partition size L (FFT size will be 2*L) */
+    };
+
+    //==============================================================================
+    /** Default constructor */
+    PartitionedConvolver();
+    
+    /** Destructor */
+    ~PartitionedConvolver();
+
+    // Non-copyable but movable (a moved-from instance no longer owns an implementation and must not be used)
+    PartitionedConvolver (PartitionedConvolver&& other) noexcept;
+    PartitionedConvolver& operator= (PartitionedConvolver&& other) noexcept;
+
+    //==============================================================================
+    /**
+        Configure the convolution layers before setting the impulse response.
+        
+        @param directFIRTaps  Number of early taps to process with direct FIR (for low latency)
+        @param layers         Vector of layer specifications with increasing hop sizes
+                             (e.g., {{256}, {1024}, {4096}} for 256→1024→4096 progression)
+    */
+    void configureLayers (std::size_t directFIRTaps, const std::vector<LayerSpec>& layers);
+    
+    /**
+        Convenience method to set a typical late-reverb configuration.
+        
+        @param directTaps  Number of direct FIR taps for early reflections
+        @param hops        Vector of hop sizes for FFT layers (geometrically increasing recommended)
+    */
+    void setTypicalLayout (std::size_t directTaps, const std::vector<int>& hops);
+
+    //==============================================================================
+    /** Options applied to an impulse response while it is being loaded. */
+    struct IRLoadOptions
+    {
+        IRLoadOptions()
+            : normalize (true)
+            , headroomDb (-12.0f)
+        {
+        }
+
+        bool normalize;    /**< Divide the gain by the response's absolute peak, so the peak ends up at headroomDb */
+        float headroomDb;  /**< Gain in dB applied to the whole response (always applied, default -12 dB) */
+    };
+
+    /**
+        Set the impulse response for convolution.
+        
+        @param impulseResponse  Pointer to impulse response samples
+        @param length          Number of samples in the impulse response
+        @param options         Normalisation and headroom settings applied while loading
+        @note This method is not real-time safe and should be called during initialization
+              or from a background thread when audio is paused.
+    */
+    void setImpulseResponse (const float* impulseResponse, std::size_t length, const IRLoadOptions& options = {});
+
+    /**
+        Set the impulse response from a vector.
+        @param impulseResponse  Vector containing impulse response samples
+        @param options          Normalisation and headroom settings applied while loading
+    */
+    void setImpulseResponse (const std::vector<float>& impulseResponse, const IRLoadOptions& options = {});
+
+    //==============================================================================
+    /**
+        Reset all internal processing state (clears delay lines, overlap buffers).
+        Impulse response partitions are preserved.
+    */
+    void reset();
+
+    /**
+        Process audio samples through the convolver.
+        
+        @param input       Input audio buffer
+        @param output      Output audio buffer (results are accumulated)
+        @param numSamples  Number of samples to process
+        
+        @note Results are accumulated into the output buffer. Clear it first if needed.
+        @note This method takes an internal spin lock and may grow internal buffers, so it is not yet allocation-free.
+    */
+    void process (const float* input, float* output, std::size_t numSamples);
+
+private:
+    //==============================================================================
+    class DirectFIR;
+    class FFTLayer;
+    class Impl;
+    
+    std::unique_ptr<Impl> pImpl;
+
+    //==============================================================================
+    YUP_DECLARE_NON_COPYABLE_WITH_LEAK_DETECTOR (PartitionedConvolver)
+};
+
+} // namespace yup
diff --git a/modules/yup_dsp/yup_dsp.cpp b/modules/yup_dsp/yup_dsp.cpp
index e7c1087c4..ced8951b1 100644
--- a/modules/yup_dsp/yup_dsp.cpp
+++ b/modules/yup_dsp/yup_dsp.cpp
@@ -40,3 +40,6 @@
 
 //==============================================================================
 #include "designers/yup_FilterDesigner.cpp"
+
+//==============================================================================
+#include "convolution/yup_PartitionedConvolver.cpp"
diff --git a/modules/yup_dsp/yup_dsp.h b/modules/yup_dsp/yup_dsp.h
index b6eca3c79..e9ab7c795 100644
--- a/modules/yup_dsp/yup_dsp.h
+++ b/modules/yup_dsp/yup_dsp.h
@@ -141,4 +141,7 @@
 // Dynamics processors
 #include "dynamics/yup_SoftClipper.h"
 
+// Convolution processors
+#include "convolution/yup_PartitionedConvolver.h"
+
 //==============================================================================

From 7e9b1a86dd10d5038b3cb98c550786bffe9ac141 Mon Sep 17 00:00:00 2001
From: kunitoki <kunitoki@gmail.com>
Date: Tue, 26 Aug 2025 17:19:01 +0200
Subject: [PATCH 03/37] More work

---
 .../source/examples/ConvolutionDemo.h         |  16 ++-
 .../convolution/yup_PartitionedConvolver.cpp  | 101 ++++++++++++++----
 .../convolution/yup_PartitionedConvolver.h    |  16 ++-
 3 files changed, 111 insertions(+), 22 deletions(-)

diff --git a/examples/graphics/source/examples/ConvolutionDemo.h b/examples/graphics/source/examples/ConvolutionDemo.h
index a5dc7256b..8885d8ae9 100644
--- a/examples/graphics/source/examples/ConvolutionDemo.h
+++ b/examples/graphics/source/examples/ConvolutionDemo.h
@@ -60,7 +60,7 @@ class ConvolutionDemo
         dryGain.setCurrentAndTargetValue (0.3f);
 
         // Configure convolver with typical layout
-        convolver.setTypicalLayout (256, {256, 1024, 4096});
+        convolver.setTypicalLayout (128, {128, 256, 512, 1024, 4096});
 
         // Create UI
         createUI();
@@ -118,6 +118,7 @@ class ConvolutionDemo
 
         // Reset convolver
         convolver.reset();
+        convolver.prepare (static_cast<std::size_t> (device->getCurrentBufferSizeSamples()));
     }
 
     void audioDeviceStopped() override
@@ -387,7 +388,7 @@ class ConvolutionDemo
         irWaveformDisplay.setMargins (25, 25, 25, 25);
 
         // Add grid lines
-        irWaveformDisplay.setVerticalGridLines ({ 0.0, 1.0 });
+         irWaveformDisplay.setVerticalGridLines ({ 0.0, 1.0 });
         irWaveformDisplay.setHorizontalGridLines ({ -1.0, -0.5, 0.5, 1.0 });
         irWaveformDisplay.addHorizontalGridLine (0.0, yup::Color (0xFF666666), 1.0f, true);
 
@@ -406,6 +407,13 @@ class ConvolutionDemo
         if (impulseResponseData.empty())
             return;
 
+        // Always apply peak headroom
+        float headroomScale = std::pow (10.0f, -12.0f / 20.0f);
+        const auto minMax = yup::FloatVectorOperations::findMinAndMax (impulseResponseData.data(), impulseResponseData.size());
+        const float peak = std::max (std::abs (minMax.getStart()), std::abs (minMax.getEnd()));
+        if (peak > 0.0f)
+            headroomScale /= peak;
+
         // Create waveform data points
         const size_t numPoints = std::min (static_cast<size_t> (2048), impulseResponseData.size());
         const size_t stride = impulseResponseData.size() / numPoints;
@@ -420,7 +428,7 @@ class ConvolutionDemo
                 sampleIndex = impulseResponseData.size() - 1;
 
             double normalizedTime = static_cast<double> (i) / static_cast<double> (numPoints - 1);
-            double amplitude = static_cast<double> (impulseResponseData[sampleIndex]);
+            double amplitude = static_cast<double> (impulseResponseData[sampleIndex] * headroomScale);
 
             waveformData.emplace_back (normalizedTime, amplitude);
         }
@@ -431,11 +439,13 @@ class ConvolutionDemo
         // Update X axis range to show time
         double lengthInSeconds = static_cast<double> (impulseResponseData.size()) / 44100.0; // Assume 44.1kHz
         irWaveformDisplay.setXRange (0.0, lengthInSeconds);
+        irWaveformDisplay.setVerticalGridLines ({ 0.0, lengthInSeconds });
 
         // Update X axis labels to show time
         std::vector<double> timeLabels;
         for (int i = 0; i <= 4; ++i)
             timeLabels.push_back (lengthInSeconds * static_cast<double> (i) / 4.0);
+
         irWaveformDisplay.setXAxisLabels (timeLabels);
     }
 
diff --git a/modules/yup_dsp/convolution/yup_PartitionedConvolver.cpp b/modules/yup_dsp/convolution/yup_PartitionedConvolver.cpp
index 09319eba6..01ee58d1f 100644
--- a/modules/yup_dsp/convolution/yup_PartitionedConvolver.cpp
+++ b/modules/yup_dsp/convolution/yup_PartitionedConvolver.cpp
@@ -308,18 +308,64 @@ class PartitionedConvolver::Impl
             else
                 baseHopSize_ = std::min (baseHopSize_, layers[i].hopSize);
 
-            maximumHopSize = std::max (maximumHopSize, layers[i].hopSize);
+            maximumHopSize = std::max (maximumHopSize, static_cast<std::size_t> (layers[i].hopSize));
         }
 
+        maxHopSize_ = maximumHopSize;
+        
+        // Clear staging buffers - will be allocated in prepare()
+        inputStaging_.clear();
+        outputStaging_.clear();
+        inputCarry_.clear();
+
+        // Clear per-layer accumulators - will be allocated in prepare()
+        layerInputAccumulators_.assign (layers.size(), std::vector<float>());
+        layerOutputCarries_.assign (layers.size(), std::vector<float>());
+        
+        layerTempOutput_.clear();
+        tempLayerHop_.clear();
+        
+        // Clear working buffers - will be allocated in prepare()
+        workingInput_.clear();
+        workingOutput_.clear();
+        
+        isPrepared_ = false;
+    }
+
+    void prepare (std::size_t maxBlockSize)
+    {
+        maxBlockSize_ = maxBlockSize;
+        
+        // Calculate buffer sizes based on block size and hop configurations
+        const std::size_t maxBufferSize = std::max (maxBlockSize * 4, maxHopSize_ * 16);
+        
         // Prepare staging buffers
         inputStaging_.clear();
         outputStaging_.assign (static_cast<std::size_t> (baseHopSize_), 0.0f);
         inputCarry_.clear();
+        inputCarry_.reserve (maxBufferSize);
 
         // Prepare per-layer accumulators
-        layerInputAccumulators_.assign (layers.size(), std::vector<float>());
-        layerOutputCarries_.assign (layers.size(), std::vector<float>());
-        layerTempOutput_.resize (maximumHopSize);
+        for (std::size_t i = 0; i < layerInputAccumulators_.size(); ++i)
+        {
+            layerInputAccumulators_[i].clear();
+            layerInputAccumulators_[i].reserve (maxBufferSize);
+            layerOutputCarries_[i].clear();
+            layerOutputCarries_[i].reserve (maxBufferSize);
+        }
+        
+        // Allocate temp buffers
+        if (maxHopSize_ > 0)
+        {
+            layerTempOutput_.resize (maxHopSize_);
+            tempLayerHop_.reserve (maxHopSize_);
+        }
+        
+        // Allocate working buffers
+        workingInput_.resize (maxBlockSize);
+        workingOutput_.resize (maxBlockSize);
+        
+        isPrepared_ = true;
     }
 
     void setImpulseResponse (const float* impulseResponse, std::size_t length, const PartitionedConvolver::IRLoadOptions& options)
@@ -417,7 +463,12 @@ class PartitionedConvolver::Impl
 
     void processUnsafe (const float* input, float* output, std::size_t numSamples)
     {
-        ensureWorkingBuffers (numSamples); // TODO - move outside of process
+        // Ensure prepare() was called
+        jassert (isPrepared_);
+        jassert (numSamples <= maxBlockSize_);
+        
+        if (!isPrepared_ || numSamples > maxBlockSize_)
+            return; // Fail gracefully in release builds
 
         FloatVectorOperations::copy (workingInput_.data(), input, numSamples);
         FloatVectorOperations::clear (workingOutput_.data(), numSamples);
@@ -431,7 +482,7 @@ class PartitionedConvolver::Impl
         }
 
         // Process FFT layers with hop-based processing
-        appendToBuffer (inputCarry_, workingInput_.data(), numSamples);
+        safeAppendToBuffer (inputCarry_, workingInput_.data(), numSamples);
 
         std::size_t outputSamplesProduced = 0;
         while (inputCarry_.size() >= static_cast<std::size_t> (baseHopSize_))
@@ -446,7 +497,7 @@ class PartitionedConvolver::Impl
                 auto& layer = layers_[layerIndex];
                 const int layerHopSize = layer.getHopSize();
 
-                appendToBuffer (layerInputAccumulators_[layerIndex], inputStaging_.data(), hopSize);
+                safeAppendToBuffer (layerInputAccumulators_[layerIndex], inputStaging_.data(), hopSize);
 
                 while (layerInputAccumulators_[layerIndex].size() >= static_cast<std::size_t> (layerHopSize))
                 {
@@ -458,7 +509,7 @@ class PartitionedConvolver::Impl
                     if (layer.hasImpulseResponse())
                         layer.processHop (tempLayerHop_.data(), layerTempOutput_.data());
 
-                    appendToBuffer (layerOutputCarries_[layerIndex], layerTempOutput_.data(), static_cast<std::size_t> (layerHopSize));
+                    safeAppendToBuffer (layerOutputCarries_[layerIndex], layerTempOutput_.data(), static_cast<std::size_t> (layerHopSize));
 
                     layerInputAccumulators_[layerIndex].erase (
                         layerInputAccumulators_[layerIndex].begin(),
@@ -489,26 +540,35 @@ class PartitionedConvolver::Impl
     }
 
 private:
-    void ensureWorkingBuffers (std::size_t numSamples)
-    {
-        if (workingInput_.size() < numSamples)
-            workingInput_.resize (numSamples);
-
-        if (workingOutput_.size() < numSamples)
-            workingOutput_.resize (numSamples);
-    }
 
-    void appendToBuffer (std::vector<float>& buffer, const float* data, std::size_t numSamples)
+    void safeAppendToBuffer (std::vector<float>& buffer, const float* data, std::size_t numSamples)
     {
         const std::size_t oldSize = buffer.size();
+        const std::size_t newSize = oldSize + numSamples;
 
-        buffer.resize (oldSize + numSamples);
+        // Ensure we never exceed the reserved capacity to avoid allocations
+        jassert (newSize <= buffer.capacity());
+        if (newSize > buffer.capacity())
+        {
+            // Truncate to prevent allocation - this is a safety measure
+            const std::size_t maxSamples = buffer.capacity() - oldSize;
+            if (maxSamples > 0)
+            {
+                buffer.resize (buffer.capacity());
+                std::copy (data, data + maxSamples, buffer.begin() + oldSize);
+            }
+            return;
+        }
 
+        buffer.resize (newSize);
         std::copy (data, data + numSamples, buffer.begin() + oldSize);
     }
 
     std::size_t directFIRTapCount_ = 0;
     int baseHopSize_ = 0;
+    std::size_t maxHopSize_ = 0;
+    std::size_t maxBlockSize_ = 0;
+    bool isPrepared_ = false;
 
     DirectFIR directFIR_;
     std::vector<FFTLayer> layers_;
@@ -580,6 +640,11 @@ void PartitionedConvolver::setImpulseResponse (const std::vector<float>& impulse
     setImpulseResponse (impulseResponse.data(), impulseResponse.size(), options);
 }
 
+void PartitionedConvolver::prepare (std::size_t maxBlockSize)
+{
+    pImpl->prepare (maxBlockSize);
+}
+
 void PartitionedConvolver::reset()
 {
     pImpl->reset();
diff --git a/modules/yup_dsp/convolution/yup_PartitionedConvolver.h b/modules/yup_dsp/convolution/yup_PartitionedConvolver.h
index 69c384ca1..5931e7ece 100644
--- a/modules/yup_dsp/convolution/yup_PartitionedConvolver.h
+++ b/modules/yup_dsp/convolution/yup_PartitionedConvolver.h
@@ -44,12 +44,15 @@ namespace yup
     // Configure layers: 256 direct taps + FFT layers with hops 256, 1024, 4096
     convolver.setTypicalLayout(256, {256, 1024, 4096});
     
+    // Prepare for processing with maximum block size (must be called before process)
+    convolver.prepare(512); // Maximum 512 samples per process() call
+    
     // Set impulse response (e.g., reverb IR)
     std::vector<float> impulseResponse = loadImpulseResponse();
     convolver.setImpulseResponse(impulseResponse);
     
     // In audio callback (accumulates into output):
-    convolver.process(inputBuffer, outputBuffer, numSamples);
+    convolver.process(inputBuffer, outputBuffer, numSamples); // numSamples <= 512
     @endcode
     
     @note The process() method accumulates results into the output buffer.
@@ -127,6 +130,17 @@ class PartitionedConvolver
     void setImpulseResponse (const std::vector<float>& impulseResponse, const IRLoadOptions& options = {});
 
     //==============================================================================
+    /**
+        Prepare the convolver for processing with a specific maximum block size.
+        
+        @param maxBlockSize  Maximum number of samples that will be passed to process()
+        
+        @note This method is not real-time safe and should be called during initialization
+              or when audio processing is paused. It pre-allocates all internal buffers
+              to handle the specified block size without further allocations.
+    */
+    void prepare (std::size_t maxBlockSize);
+
     /**
         Reset all internal processing state (clears delay lines, overlap buffers).
         Impulse response partitions are preserved.

From 29ba67e05a69b9f712cab8c17e4a49594163aa32 Mon Sep 17 00:00:00 2001
From: kunitoki <kunitoki@gmail.com>
Date: Tue, 26 Aug 2025 21:40:58 +0200
Subject: [PATCH 04/37] Fix typo

---
 modules/yup_dsp/frequency/yup_FFTProcessor.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/yup_dsp/frequency/yup_FFTProcessor.cpp b/modules/yup_dsp/frequency/yup_FFTProcessor.cpp
index 45aac82a7..c85263290 100644
--- a/modules/yup_dsp/frequency/yup_FFTProcessor.cpp
+++ b/modules/yup_dsp/frequency/yup_FFTProcessor.cpp
@@ -787,7 +787,7 @@ void FFTProcessor::updateScalingFactor()
 
 void FFTProcessor::applyScaling (float* data, int numElements, bool isForward) const
 {
-    if (scaling == FFTScaling::none || (scaling == FFTScaling::asymmetric && ! isForward))
+    if (scaling == FFTScaling::none || (scaling == FFTScaling::asymmetric && ! isForward))
         return;
 
     FloatVectorOperations::multiply (data, scalingFactor, numElements);

From fd316bf0e732526c1242ceeb82654c8790da7d47 Mon Sep 17 00:00:00 2001
From: kunitoki <kunitoki@gmail.com>
Date: Tue, 26 Aug 2025 22:15:15 +0200
Subject: [PATCH 05/37] Improved convolver

---
 modules/yup_audio_basics/yup_audio_basics.h   |   9 +
 .../convolution/yup_PartitionedConvolver.cpp  | 376 ++++++++++++++----
 .../convolution/yup_PartitionedConvolver.h    |  55 +--
 modules/yup_dsp/yup_dsp.cpp                   |   4 +
 4 files changed, 330 insertions(+), 114 deletions(-)

diff --git a/modules/yup_audio_basics/yup_audio_basics.h b/modules/yup_audio_basics/yup_audio_basics.h
index 944dd6ec6..fb69411df 100644
--- a/modules/yup_audio_basics/yup_audio_basics.h
+++ b/modules/yup_audio_basics/yup_audio_basics.h
@@ -69,11 +69,20 @@
 
 //==============================================================================
 #ifndef YUP_USE_SSE_INTRINSICS
+#if defined (__SSE__)
 #define YUP_USE_SSE_INTRINSICS 1
 #endif
+#endif
+
+#ifndef YUP_USE_AVX_INTRINSICS
+#if defined (__AVX2__)
+#define YUP_USE_AVX_INTRINSICS 1
+#endif
+#endif
 
 #if ! YUP_INTEL
 #undef YUP_USE_SSE_INTRINSICS
+#undef YUP_USE_AVX_INTRINSICS
 #endif
 
 #if __ARM_NEON__ && ! (YUP_USE_VDSP_FRAMEWORK || defined(YUP_USE_ARM_NEON))
diff --git a/modules/yup_dsp/convolution/yup_PartitionedConvolver.cpp b/modules/yup_dsp/convolution/yup_PartitionedConvolver.cpp
index 01ee58d1f..ef98c9a2f 100644
--- a/modules/yup_dsp/convolution/yup_PartitionedConvolver.cpp
+++ b/modules/yup_dsp/convolution/yup_PartitionedConvolver.cpp
@@ -19,26 +19,127 @@
   ==============================================================================
 */
 
-#include <atomic>
-#include <thread>
-
 namespace yup
 {
 
 //==============================================================================
-// Complex multiply-accumulate for interleaved real/imaginary format
-// Y += A * B (complex multiplication)
-//==============================================================================
-static void complexMultiplyAccumulate (const float* A, const float* B, float* Y, int complexPairs)
+
+/** Performs Y += A * B (complex multiply accumulate) where A, B, and Y
+    are arrays of interleaved complex<float> values [real, imag, real, imag...].
+
+    @param A pointer to input complex array
+    @param B pointer to input complex array
+    @param Y pointer to output complex array (accumulated)
+    @param complexPairs number of complex pairs (not number of floats!)
+*/
+static void complexMultiplyAccumulate (const float* A, const float* B, float* Y, int complexPairs) noexcept
 {
-    for (int i = 0; i < complexPairs; ++i)
+    int i = 0;
+
+#if YUP_USE_AVX_INTRINSICS
+    constexpr int simdWidth = 4; // AVX2 path: process 4 complex pairs (8 floats) at a time
+    for (; i <= complexPairs - simdWidth; i += simdWidth)
+    {
+        const int idx = i * 2;
+
+        __m256 a = _mm256_loadu_ps(A + idx);
+        __m256 b = _mm256_loadu_ps(B + idx);
+        __m256 y = _mm256_loadu_ps(Y + idx);
+
+        // a = [ar0 ai0 ar1 ai1 ar2 ai2 ar3 ai3]
+        // b = [br0 bi0 br1 bi1 br2 bi2 br3 bi3]
+
+        // swap real/imag within each complex pair of a and b
+        const __m256 a_shuffled = _mm256_permute_ps(a, _MM_SHUFFLE(2, 3, 0, 1));
+        const __m256 b_shuffled = _mm256_permute_ps(b, _MM_SHUFFLE(2, 3, 0, 1));
+
+        // real = ar*br - ai*bi (valid in even lanes). NOTE(review): FMA intrinsic - needs FMA enabled alongside AVX2
+        __m256 realPart = _mm256_fmsub_ps(a, b, _mm256_mul_ps(a_shuffled, b_shuffled));
+
+        // imag = ar*bi + ai*br
+        __m256 imagPart = _mm256_fmadd_ps(a, b_shuffled, _mm256_mul_ps(a_shuffled, b));
+
+        // interleave real/imag back: even lanes from realPart, odd lanes from imagPart
+        const __m256 interleaved = _mm256_blend_ps(realPart, imagPart, 0b10101010);
+
+        y = _mm256_add_ps(y, interleaved);
+        _mm256_storeu_ps(Y + idx, y);
+    }
+
+#elif YUP_USE_SSE_INTRINSICS
+    constexpr int simdWidth = 2; // SSE path: process 2 complex pairs (4 floats) at a time
+    for (; i <= complexPairs - simdWidth; i += simdWidth)
+    {
+        const int idx = i * 2;
+
+        __m128 a = _mm_loadu_ps(A + idx);
+        __m128 b = _mm_loadu_ps(B + idx);
+        __m128 y = _mm_loadu_ps(Y + idx);
+
+        // swap real/imag within each complex pair of a and b
+        const __m128 a_shuffled = _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 3, 0, 1));
+        const __m128 b_shuffled = _mm_shuffle_ps(b, b, _MM_SHUFFLE(2, 3, 0, 1));
+
+        // real = ar*br - ai*bi
+        __m128 realPart = _mm_sub_ps(_mm_mul_ps(a, b), _mm_mul_ps(a_shuffled, b_shuffled));
+
+        // imag = ar*bi + ai*br
+        __m128 imagPart = _mm_add_ps(_mm_mul_ps(a, b_shuffled), _mm_mul_ps(a_shuffled, b));
+
+        // realPart is only valid in lanes 0 and 2, so pick lanes 0/2 of both: [re0, im0, re1, im1]
+        const __m128 interleaved = _mm_movelh_ps(_mm_unpacklo_ps(realPart, imagPart), _mm_unpackhi_ps(realPart, imagPart));
+
+        y = _mm_add_ps(y, interleaved);
+        _mm_storeu_ps(Y + idx, y);
+    }
+
+#elif YUP_USE_ARM_NEON
+    constexpr int simdWidth = 2; // NEON path: process 2 complex pairs (4 floats) at a time
+    for (; i <= complexPairs - simdWidth; i += simdWidth)
+    {
+        const int idx = i * 2;
+
+        float32x4_t a = vld1q_f32(A + idx); // [ar0, ai0, ar1, ai1]
+        float32x4_t b = vld1q_f32(B + idx); // [br0, bi0, br1, bi1]
+        float32x4_t y = vld1q_f32(Y + idx);
+
+        // Shuffle a and b to get swapped real/imag for cross-multiplication
+        float32x4_t a_shuf = vrev64q_f32(a); // [ai0, ar0, ai1, ar1]
+        float32x4_t b_shuf = vrev64q_f32(b); // [bi0, br0, bi1, br1]
+
+        // real = ar*br - ai*bi
+        float32x4_t realPart = vsubq_f32(vmulq_f32(a, b), vmulq_f32(a_shuf, b_shuf));
+
+        // imag = ar*bi + ai*br
+        float32x4_t imagPart = vaddq_f32(vmulq_f32(a, b_shuf), vmulq_f32(a_shuf, b));
+
+        // Interleave real and imag: [real0, imag0, real1, imag1]
+        float32x2_t realLow = vget_low_f32(realPart);
+        float32x2_t imagLow = vget_low_f32(imagPart);
+        float32x2x2_t zippedLow = vzip_f32(realLow, imagLow);
+
+        float32x2_t realHigh = vget_high_f32(realPart);
+        float32x2_t imagHigh = vget_high_f32(imagPart);
+        float32x2x2_t zippedHigh = vzip_f32(realHigh, imagHigh);
+
+        float32x4_t interleaved = vcombine_f32(zippedLow.val[0], zippedHigh.val[0]);
+
+        y = vaddq_f32(y, interleaved);
+        vst1q_f32(Y + idx, y);
+    }
+
+#endif
+
+    for (; i < complexPairs; ++i)
     {
         const int ri = i * 2;
         const int ii = ri + 1;
+
         const float ar = A[ri];
         const float ai = A[ii];
         const float br = B[ri];
         const float bi = B[ii];
+
         // (ar + j*ai) * (br + j*bi) = (ar*br - ai*bi) + j*(ar*bi + ai*br)
         Y[ri] += ar * br - ai * bi;
         Y[ii] += ar * bi + ai * br;
@@ -46,8 +147,7 @@ static void complexMultiplyAccumulate (const float* A, const float* B, float* Y,
 }
 
 //==============================================================================
-// DirectFIR - Brute-force FIR implementation for early taps
-//==============================================================================
+
 class PartitionedConvolver::DirectFIR
 {
 public:
@@ -107,8 +207,7 @@ class PartitionedConvolver::DirectFIR
 };
 
 //==============================================================================
-// FFTLayer - Single uniform-partitioned OLA layer
-//==============================================================================
+
 class PartitionedConvolver::FFTLayer
 {
 public:
@@ -283,8 +382,100 @@ class PartitionedConvolver::FFTLayer
 };
 
 //==============================================================================
-// PartitionedConvolver::Impl - Implementation details
+/** Fixed-capacity FIFO of float samples kept in a ring; reads and writes are clamped to what is available. */
+class PartitionedConvolver::CircularBuffer
+{
+public:
+    CircularBuffer() = default;
+    /** Sets the capacity to `size` samples and resets the buffer to the empty state. */
+    void resize (std::size_t size)
+    {
+        buffer_.resize (size);
+        clear();
+    }
+    /** Zeroes the storage and resets both cursors, discarding any unread samples. */
+    void clear()
+    {
+        std::fill (buffer_.begin(), buffer_.end(), 0.0f);
+        writeIndex_ = 0;
+        readIndex_ = 0;
+        availableForRead_ = 0;
+    }
+    /** Occupancy queries: unread samples, remaining free space, and total capacity. */
+    std::size_t getAvailableForRead() const { return availableForRead_; }
+    std::size_t getAvailableForWrite() const { return buffer_.size() - availableForRead_; }
+    std::size_t getSize() const { return buffer_.size(); }
+    /** Appends up to numSamples samples from data; samples beyond the free space are dropped (flagged by jassert). */
+    void write (const float* data, std::size_t numSamples)
+    {
+        jassert (numSamples <= getAvailableForWrite());
+        numSamples = std::min (numSamples, getAvailableForWrite());
+        
+        if (numSamples == 0) return;
+        
+        const std::size_t beforeWrap = std::min (numSamples, buffer_.size() - writeIndex_);
+        const std::size_t afterWrap = numSamples - beforeWrap;
+        
+        std::copy (data, data + beforeWrap, buffer_.begin() + writeIndex_);
+        if (afterWrap > 0)
+            std::copy (data + beforeWrap, data + numSamples, buffer_.begin());
+        
+        writeIndex_ = (writeIndex_ + numSamples) % buffer_.size();
+        availableForRead_ += numSamples;
+    }
+    /** Pops up to numSamples of the oldest unread samples into data; a short read is flagged by jassert. */
+    void read (float* data, std::size_t numSamples)
+    {
+        jassert (numSamples <= getAvailableForRead());
+        numSamples = std::min (numSamples, getAvailableForRead());
+        
+        if (numSamples == 0) return;
+        
+        const std::size_t beforeWrap = std::min (numSamples, buffer_.size() - readIndex_);
+        const std::size_t afterWrap = numSamples - beforeWrap;
+        
+        std::copy (buffer_.begin() + readIndex_, buffer_.begin() + readIndex_ + beforeWrap, data);
+        if (afterWrap > 0)
+            std::copy (buffer_.begin(), buffer_.begin() + afterWrap, data + beforeWrap);
+        
+        readIndex_ = (readIndex_ + numSamples) % buffer_.size();
+        availableForRead_ -= numSamples;
+    }
+    /** Copies samples starting `offset` past the read cursor into data without consuming them. */
+    void peek (float* data, std::size_t numSamples, std::size_t offset = 0) const
+    {
+        jassert (numSamples + offset <= getAvailableForRead());
+        numSamples = std::min (numSamples, getAvailableForRead() - std::min (offset, getAvailableForRead())); // clamp offset first: the unsigned subtraction must not wrap
+        
+        if (numSamples == 0) return;
+        
+        const std::size_t startIndex = (readIndex_ + offset) % buffer_.size();
+        const std::size_t beforeWrap = std::min (numSamples, buffer_.size() - startIndex);
+        const std::size_t afterWrap = numSamples - beforeWrap;
+        
+        std::copy (buffer_.begin() + startIndex, buffer_.begin() + startIndex + beforeWrap, data);
+        if (afterWrap > 0)
+            std::copy (buffer_.begin(), buffer_.begin() + afterWrap, data + beforeWrap);
+    }
+    /** Discards up to numSamples of the oldest unread samples. */
+    void skip (std::size_t numSamples)
+    {
+        jassert (numSamples <= getAvailableForRead());
+        numSamples = std::min (numSamples, getAvailableForRead());
+        if (numSamples == 0) return; // also keeps the modulo below away from a zero-sized buffer
+        readIndex_ = (readIndex_ + numSamples) % buffer_.size();
+        availableForRead_ -= numSamples;
+    }
+
+private:
+    std::vector<float> buffer_;
+    std::size_t writeIndex_ = 0;
+    std::size_t readIndex_ = 0;
+    std::size_t availableForRead_ = 0;
+};
+
 //==============================================================================
+
 class PartitionedConvolver::Impl
 {
 public:
@@ -294,6 +485,7 @@ class PartitionedConvolver::Impl
     void configureLayers (std::size_t directFIRTaps, const std::vector<LayerSpec>& layers)
     {
         directFIRTapCount_ = directFIRTaps;
+
         layers_.clear();
         layers_.resize (layers.size());
 
@@ -316,11 +508,10 @@ class PartitionedConvolver::Impl
         // Clear staging buffers - will be allocated in prepare()
         inputStaging_.clear();
         outputStaging_.clear();
-        inputCarry_.clear();
 
-        // Clear per-layer accumulators - will be allocated in prepare()
-        layerInputAccumulators_.assign (layers.size(), std::vector<float>());
-        layerOutputCarries_.assign (layers.size(), std::vector<float>());
+        // Resize per-layer circular buffers - will be allocated in prepare()
+        layerInputBuffers_.resize (layers.size());
+        layerOutputBuffers_.resize (layers.size());
         
         layerTempOutput_.clear();
         tempLayerHop_.clear();
@@ -336,29 +527,26 @@ class PartitionedConvolver::Impl
     {
         maxBlockSize_ = maxBlockSize;
         
-        // Calculate buffer sizes based on block size and hop configurations
-        const std::size_t maxBufferSize = std::max (maxBlockSize * 4, maxHopSize_ * 16);
-        
-        // Prepare staging buffers
-        inputStaging_.clear();
+        // Calculate buffer sizes - generous but fixed allocation
+        const std::size_t inputBufferSize = maxBlockSize; // Input staging for all layers
+        const std::size_t outputBufferSize = maxHopSize_; // Output buffering for all layers
+
+        // Prepare main input staging
+        inputStaging_.resize (inputBufferSize);
         outputStaging_.assign (static_cast<std::size_t> (baseHopSize_), 0.0f);
-        inputCarry_.clear();
-        inputCarry_.reserve (maxBufferSize);
 
-        // Prepare per-layer accumulators
-        for (std::size_t i = 0; i < layerInputAccumulators_.size(); ++i)
+        // Prepare per-layer circular buffers
+        for (std::size_t i = 0; i < layerInputBuffers_.size(); ++i)
         {
-            layerInputAccumulators_[i].clear();
-            layerInputAccumulators_[i].reserve (maxBufferSize);
-            layerOutputCarries_[i].clear();
-            layerOutputCarries_[i].reserve (maxBufferSize);
+            layerInputBuffers_[i].resize (inputBufferSize);
+            layerOutputBuffers_[i].resize (outputBufferSize);
         }
         
         // Allocate temp buffers
         if (maxHopSize_ > 0)
         {
             layerTempOutput_.resize (maxHopSize_);
-            tempLayerHop_.reserve (maxHopSize_);
+            tempLayerHop_.resize (maxHopSize_);
         }
         
         // Allocate working buffers
@@ -447,15 +635,16 @@ class PartitionedConvolver::Impl
     void resetStateUnsafe()
     {
         directFIR_.reset();
-        inputStaging_.clear();
+        inputStagingReadIndex_ = 0;
+        inputStagingWriteIndex_ = 0;
+        inputStagingAvailable_ = 0;
         std::fill (outputStaging_.begin(), outputStaging_.end(), 0.0f);
-        inputCarry_.clear();
 
-        for (auto& acc : layerInputAccumulators_)
-            acc.clear();
+        for (auto& buffer : layerInputBuffers_)
+            buffer.clear();
 
-        for (auto& carry : layerOutputCarries_)
-            carry.clear();
+        for (auto& buffer : layerOutputBuffers_)
+            buffer.clear();
 
         for (auto& layer : layers_)
             layer.resetState();
@@ -481,48 +670,49 @@ class PartitionedConvolver::Impl
             return;
         }
 
-        // Process FFT layers with hop-based processing
-        safeAppendToBuffer (inputCarry_, workingInput_.data(), numSamples);
+        // Add input to main input staging buffer using circular buffer logic
+        writeToInputStaging (workingInput_.data(), numSamples);
 
         std::size_t outputSamplesProduced = 0;
-        while (inputCarry_.size() >= static_cast<std::size_t> (baseHopSize_))
+        while (getInputStagingAvailable() >= static_cast<std::size_t> (baseHopSize_))
         {
             const std::size_t hopSize = static_cast<std::size_t> (baseHopSize_);
-            inputStaging_.assign (inputCarry_.begin(), inputCarry_.begin() + hopSize);
-
+            
+            // Read hop from input staging
+            readFromInputStaging (tempLayerHop_.data(), hopSize);
+            
             FloatVectorOperations::clear (outputStaging_.data(), outputStaging_.size());
 
             for (std::size_t layerIndex = 0; layerIndex < layers_.size(); ++layerIndex)
             {
                 auto& layer = layers_[layerIndex];
                 const int layerHopSize = layer.getHopSize();
+                auto& inputBuffer = layerInputBuffers_[layerIndex];
+                auto& outputBuffer = layerOutputBuffers_[layerIndex];
 
-                safeAppendToBuffer (layerInputAccumulators_[layerIndex], inputStaging_.data(), hopSize);
+                // Write input hop to layer's input buffer
+                inputBuffer.write (tempLayerHop_.data(), hopSize);
 
-                while (layerInputAccumulators_[layerIndex].size() >= static_cast<std::size_t> (layerHopSize))
+                // Process complete layer hops
+                while (inputBuffer.getAvailableForRead() >= static_cast<std::size_t> (layerHopSize))
                 {
-                    tempLayerHop_.assign (layerInputAccumulators_[layerIndex].begin(),
-                                          layerInputAccumulators_[layerIndex].begin() + layerHopSize);
-
+                    // Read a full hop for this layer
+                    inputBuffer.read (tempLayerHop_.data(), static_cast<std::size_t> (layerHopSize));
+                    
                     FloatVectorOperations::clear (layerTempOutput_.data(), layerHopSize);
 
                     if (layer.hasImpulseResponse())
                         layer.processHop (tempLayerHop_.data(), layerTempOutput_.data());
 
-                    safeAppendToBuffer (layerOutputCarries_[layerIndex], layerTempOutput_.data(), static_cast<std::size_t> (layerHopSize));
-
-                    layerInputAccumulators_[layerIndex].erase (
-                        layerInputAccumulators_[layerIndex].begin(),
-                        layerInputAccumulators_[layerIndex].begin() + layerHopSize);
+                    // Write output to layer's output buffer
+                    outputBuffer.write (layerTempOutput_.data(), static_cast<std::size_t> (layerHopSize));
                 }
 
-                if (layerOutputCarries_[layerIndex].size() >= hopSize)
+                // Mix available output from this layer
+                if (outputBuffer.getAvailableForRead() >= hopSize)
                 {
-                    FloatVectorOperations::add (outputStaging_.data(), layerOutputCarries_[layerIndex].data(), hopSize);
-
-                    layerOutputCarries_[layerIndex].erase (
-                        layerOutputCarries_[layerIndex].begin(),
-                        layerOutputCarries_[layerIndex].begin() + hopSize);
+                    outputBuffer.read (layerTempOutput_.data(), hopSize);
+                    FloatVectorOperations::add (outputStaging_.data(), layerTempOutput_.data(), hopSize);
                 }
             }
 
@@ -530,9 +720,6 @@ class PartitionedConvolver::Impl
             const std::size_t samplesToWrite = std::min (hopSize, numSamples - outputSamplesProduced);
             FloatVectorOperations::add (workingOutput_.data() + outputSamplesProduced, outputStaging_.data(), samplesToWrite);
             outputSamplesProduced += samplesToWrite;
-
-            // Remove processed input from carry buffer
-            inputCarry_.erase (inputCarry_.begin(), inputCarry_.begin() + hopSize);
         }
 
         // Copy final result to output (accumulate)
@@ -540,29 +727,42 @@ class PartitionedConvolver::Impl
     }
 
 private:
-
-    void safeAppendToBuffer (std::vector<float>& buffer, const float* data, std::size_t numSamples)
+    void writeToInputStaging (const float* data, std::size_t numSamples)
     {
-        const std::size_t oldSize = buffer.size();
-        const std::size_t newSize = oldSize + numSamples;
-
-        // Ensure we never exceed the reserved capacity to avoid allocations
-        jassert (newSize <= buffer.capacity());
-        if (newSize > buffer.capacity())
-        {
-            // Truncate to prevent allocation - this is a safety measure
-            const std::size_t maxSamples = buffer.capacity() - oldSize;
-            if (maxSamples > 0)
-            {
-                buffer.resize (buffer.capacity());
-                std::copy (data, data + maxSamples, buffer.begin() + oldSize);
-            }
-            return;
-        }
-
-        buffer.resize (newSize);
-        std::copy (data, data + numSamples, buffer.begin() + oldSize);
+        const std::size_t available = inputStaging_.size() - inputStagingAvailable_;
+        jassert (numSamples <= available);
+        numSamples = std::min (numSamples, available);
+        if (numSamples == 0) return;
+        
+        const std::size_t beforeWrap = std::min (numSamples, inputStaging_.size() - inputStagingWriteIndex_);
+        const std::size_t afterWrap = numSamples - beforeWrap;
+        
+        std::copy (data, data + beforeWrap, inputStaging_.begin() + inputStagingWriteIndex_);
+        if (afterWrap > 0)
+            std::copy (data + beforeWrap, data + numSamples, inputStaging_.begin());
+        
+        inputStagingWriteIndex_ = (inputStagingWriteIndex_ + numSamples) % inputStaging_.size();
+        inputStagingAvailable_ += numSamples;
+    }
+    
+    void readFromInputStaging (float* data, std::size_t numSamples)
+    {
+        jassert (numSamples <= inputStagingAvailable_);
+        numSamples = std::min (numSamples, inputStagingAvailable_);
+        if (numSamples == 0) return;
+        
+        const std::size_t beforeWrap = std::min (numSamples, inputStaging_.size() - inputStagingReadIndex_);
+        const std::size_t afterWrap = numSamples - beforeWrap;
+        
+        std::copy (inputStaging_.begin() + inputStagingReadIndex_, inputStaging_.begin() + inputStagingReadIndex_ + beforeWrap, data);
+        if (afterWrap > 0)
+            std::copy (inputStaging_.begin(), inputStaging_.begin() + afterWrap, data + beforeWrap);
+        
+        inputStagingReadIndex_ = (inputStagingReadIndex_ + numSamples) % inputStaging_.size();
+        inputStagingAvailable_ -= numSamples;
     }
+    
+    std::size_t getInputStagingAvailable() const { return inputStagingAvailable_; }
 
     std::size_t directFIRTapCount_ = 0;
     int baseHopSize_ = 0;
@@ -577,14 +777,16 @@ class PartitionedConvolver::Impl
     std::vector<float> workingInput_;
     std::vector<float> workingOutput_;
 
-    // Staging for hop-based processing
-    std::vector<float> inputCarry_;
+    // Input staging with circular buffer management
     std::vector<float> inputStaging_;
+    std::size_t inputStagingReadIndex_ = 0;
+    std::size_t inputStagingWriteIndex_ = 0;
+    std::size_t inputStagingAvailable_ = 0;
     std::vector<float> outputStaging_;
 
-    // Per-layer buffering
-    std::vector<std::vector<float>> layerInputAccumulators_;
-    std::vector<std::vector<float>> layerOutputCarries_;
+    // Per-layer circular buffering
+    std::vector<CircularBuffer> layerInputBuffers_;
+    std::vector<CircularBuffer> layerOutputBuffers_;
     std::vector<float> tempLayerHop_;
     std::vector<float> layerTempOutput_;
 
diff --git a/modules/yup_dsp/convolution/yup_PartitionedConvolver.h b/modules/yup_dsp/convolution/yup_PartitionedConvolver.h
index 5931e7ece..a4e4d4d5b 100644
--- a/modules/yup_dsp/convolution/yup_PartitionedConvolver.h
+++ b/modules/yup_dsp/convolution/yup_PartitionedConvolver.h
@@ -27,34 +27,34 @@ namespace yup
 //==============================================================================
 /**
     Layered partitioned convolution engine optimized for real-time audio processing.
-    
+
     Combines multiple processing strategies for efficient convolution:
     - Direct FIR computation for early taps (low latency)
     - One or more FFT-based Overlap-Add layers with uniform partitioning per layer
-    
+
     The engine uses YUP's FFTProcessor for real FFT operations and supports:
     - Arbitrary input/output block sizes with internal buffering
     - Real-time safe processing (no heap allocations during process())
     - Configurable layer hierarchy for optimal CPU/latency trade-off
-    
+
     Example usage:
     @code
     PartitionedConvolver convolver;
-    
+
     // Configure layers: 256 direct taps + FFT layers with hops 256, 1024, 4096
     convolver.setTypicalLayout(256, {256, 1024, 4096});
-    
+
     // Prepare for processing with maximum block size (must be called before process)
     convolver.prepare(512); // Maximum 512 samples per process() call
-    
+
     // Set impulse response (e.g., reverb IR)
     std::vector<float> impulseResponse = loadImpulseResponse();
     convolver.setImpulseResponse(impulseResponse);
-    
+
     // In audio callback (accumulates into output):
     convolver.process(inputBuffer, outputBuffer, numSamples); // numSamples <= 512
     @endcode
-    
+
     @note The process() method accumulates results into the output buffer.
           Clear the output buffer first if overwrite behavior is desired.
 */
@@ -71,7 +71,7 @@ class PartitionedConvolver
     //==============================================================================
     /** Default constructor */
     PartitionedConvolver();
-    
+
     /** Destructor */
     ~PartitionedConvolver();
 
@@ -82,23 +82,23 @@ class PartitionedConvolver
     //==============================================================================
     /**
         Configure the convolution layers before setting the impulse response.
-        
+
         @param directFIRTaps  Number of early taps to process with direct FIR (for low latency)
         @param layers         Vector of layer specifications with increasing hop sizes
                              (e.g., {{256}, {1024}, {4096}} for 256→1024→4096 progression)
     */
     void configureLayers (std::size_t directFIRTaps, const std::vector<LayerSpec>& layers);
-    
+
     /**
         Convenience method to set a typical late-reverb configuration.
-        
+
         @param directTaps  Number of direct FIR taps for early reflections
         @param hops        Vector of hop sizes for FFT layers (geometrically increasing recommended)
     */
     void setTypicalLayout (std::size_t directTaps, const std::vector<int>& hops);
 
     //==============================================================================
-
+    /** Impulse response loading options. */
     struct IRLoadOptions
     {
         IRLoadOptions()
@@ -113,10 +113,10 @@ class PartitionedConvolver
 
     /**
         Set the impulse response for convolution.
-        
+
         @param impulseResponse  Pointer to impulse response samples
         @param length          Number of samples in the impulse response
-        
+
         @note This method is not real-time safe and should be called during initialization
               or from a background thread when audio is paused.
     */
@@ -124,7 +124,7 @@ class PartitionedConvolver
 
     /**
         Set the impulse response from a vector.
-        
+
         @param impulseResponse  Vector containing impulse response samples
     */
     void setImpulseResponse (const std::vector<float>& impulseResponse, const IRLoadOptions& options = {});
@@ -132,39 +132,40 @@ class PartitionedConvolver
     //==============================================================================
     /**
         Prepare the convolver for processing with a specific maximum block size.
-        
+
         @param maxBlockSize  Maximum number of samples that will be passed to process()
-        
+
         @note This method is not real-time safe and should be called during initialization
               or when audio processing is paused. It pre-allocates all internal buffers
               to handle the specified block size without further allocations.
     */
     void prepare (std::size_t maxBlockSize);
 
-    /**
-        Reset all internal processing state (clears delay lines, overlap buffers).
-        Impulse response partitions are preserved.
-    */
-    void reset();
-
     /**
         Process audio samples through the convolver.
-        
+
         @param input       Input audio buffer
         @param output      Output audio buffer (results are accumulated)
         @param numSamples  Number of samples to process
-        
+
         @note Results are accumulated into the output buffer. Clear it first if needed.
         @note This method is real-time safe with no heap allocations.
     */
     void process (const float* input, float* output, std::size_t numSamples);
 
+        /**
+        Reset all internal processing state (clears delay lines, overlap buffers).
+        Impulse response partitions are preserved.
+    */
+    void reset();
+
 private:
     //==============================================================================
     class DirectFIR;
     class FFTLayer;
+    class CircularBuffer;
     class Impl;
-    
+
     std::unique_ptr<Impl> pImpl;
 
     //==============================================================================
diff --git a/modules/yup_dsp/yup_dsp.cpp b/modules/yup_dsp/yup_dsp.cpp
index ced8951b1..ec420b409 100644
--- a/modules/yup_dsp/yup_dsp.cpp
+++ b/modules/yup_dsp/yup_dsp.cpp
@@ -30,6 +30,10 @@
 
 #include "yup_dsp.h"
 
+//==============================================================================
+#include <atomic>
+#include <thread>
+
 //==============================================================================
 #include "frequency/yup_FFTProcessor.cpp"
 #include "frequency/yup_SpectrumAnalyzerState.cpp"

From 897da5183d5229da31484cd2ea4e0b105f5afce6 Mon Sep 17 00:00:00 2001
From: kunitoki <kunitoki@gmail.com>
Date: Tue, 26 Aug 2025 22:16:06 +0200
Subject: [PATCH 06/37] Formatting

---
 .../convolution/yup_PartitionedConvolver.cpp  | 183 ++++++++++--------
 .../convolution/yup_PartitionedConvolver.h    |   4 +-
 2 files changed, 99 insertions(+), 88 deletions(-)

diff --git a/modules/yup_dsp/convolution/yup_PartitionedConvolver.cpp b/modules/yup_dsp/convolution/yup_PartitionedConvolver.cpp
index ef98c9a2f..d4ecc2e64 100644
--- a/modules/yup_dsp/convolution/yup_PartitionedConvolver.cpp
+++ b/modules/yup_dsp/convolution/yup_PartitionedConvolver.cpp
@@ -42,28 +42,28 @@ static void complexMultiplyAccumulate (const float* A, const float* B, float* Y,
     {
         const int idx = i * 2;
 
-        __m256 a = _mm256_loadu_ps(A + idx);
-        __m256 b = _mm256_loadu_ps(B + idx);
-        __m256 y = _mm256_loadu_ps(Y + idx);
+        __m256 a = _mm256_loadu_ps (A + idx);
+        __m256 b = _mm256_loadu_ps (B + idx);
+        __m256 y = _mm256_loadu_ps (Y + idx);
 
         // a = [ar0 ai0 ar1 ai1 ar2 ai2 ar3 ai3]
         // b = [br0 bi0 br1 bi1 br2 bi2 br3 bi3]
 
         // separate real and imag for a and b
-        const __m256 a_shuffled = _mm256_permute_ps(a, _MM_SHUFFLE(2, 3, 0, 1));
-        const __m256 b_shuffled = _mm256_permute_ps(b, _MM_SHUFFLE(2, 3, 0, 1));
+        const __m256 a_shuffled = _mm256_permute_ps (a, _MM_SHUFFLE (2, 3, 0, 1));
+        const __m256 b_shuffled = _mm256_permute_ps (b, _MM_SHUFFLE (2, 3, 0, 1));
 
         // real = ar*br - ai*bi
-        __m256 realPart = _mm256_fmsub_ps(a, b, _mm256_mul_ps(a_shuffled, b_shuffled));
+        __m256 realPart = _mm256_fmsub_ps (a, b, _mm256_mul_ps (a_shuffled, b_shuffled));
 
         // imag = ar*bi + ai*br
-        __m256 imagPart = _mm256_fmadd_ps(a, b_shuffled, _mm256_mul_ps(a_shuffled, b));
+        __m256 imagPart = _mm256_fmadd_ps (a, b_shuffled, _mm256_mul_ps (a_shuffled, b));
 
         // interleave real/imag back
-        const __m256 interleaved = _mm256_blend_ps(realPart, imagPart, 0b10101010);
+        const __m256 interleaved = _mm256_blend_ps (realPart, imagPart, 0b10101010);
 
-        y = _mm256_add_ps(y, interleaved);
-        _mm256_storeu_ps(Y + idx, y);
+        y = _mm256_add_ps (y, interleaved);
+        _mm256_storeu_ps (Y + idx, y);
     }
 
 #elif YUP_USE_SSE_INTRINSICS
@@ -72,25 +72,25 @@ static void complexMultiplyAccumulate (const float* A, const float* B, float* Y,
     {
         const int idx = i * 2;
 
-        __m128 a = _mm_loadu_ps(A + idx);
-        __m128 b = _mm_loadu_ps(B + idx);
-        __m128 y = _mm_loadu_ps(Y + idx);
+        __m128 a = _mm_loadu_ps (A + idx);
+        __m128 b = _mm_loadu_ps (B + idx);
+        __m128 y = _mm_loadu_ps (Y + idx);
 
         // separate real and imag for a and b
-        const __m128 a_shuffled = _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 3, 0, 1));
-        const __m128 b_shuffled = _mm_shuffle_ps(b, b, _MM_SHUFFLE(2, 3, 0, 1));
+        const __m128 a_shuffled = _mm_shuffle_ps (a, a, _MM_SHUFFLE (2, 3, 0, 1));
+        const __m128 b_shuffled = _mm_shuffle_ps (b, b, _MM_SHUFFLE (2, 3, 0, 1));
 
         // real = ar*br - ai*bi
-        __m128 realPart = _mm_sub_ps(_mm_mul_ps(a, b), _mm_mul_ps(a_shuffled, b_shuffled));
+        __m128 realPart = _mm_sub_ps (_mm_mul_ps (a, b), _mm_mul_ps (a_shuffled, b_shuffled));
 
         // imag = ar*bi + ai*br
-        __m128 imagPart = _mm_add_ps(_mm_mul_ps(a, b_shuffled), _mm_mul_ps(a_shuffled, b));
+        __m128 imagPart = _mm_add_ps (_mm_mul_ps (a, b_shuffled), _mm_mul_ps (a_shuffled, b));
 
         // interleave real/imag back
-        const __m128 interleaved = _mm_unpacklo_ps(realPart, imagPart);
+        const __m128 interleaved = _mm_unpacklo_ps (realPart, imagPart);
 
-        y = _mm_add_ps(y, interleaved);
-        _mm_storeu_ps(Y + idx, y);
+        y = _mm_add_ps (y, interleaved);
+        _mm_storeu_ps (Y + idx, y);
     }
 
 #elif YUP_USE_ARM_NEON
@@ -99,33 +99,33 @@ static void complexMultiplyAccumulate (const float* A, const float* B, float* Y,
     {
         const int idx = i * 2;
 
-        float32x4_t a = vld1q_f32(A + idx); // [ar0, ai0, ar1, ai1]
-        float32x4_t b = vld1q_f32(B + idx); // [br0, bi0, br1, bi1]
-        float32x4_t y = vld1q_f32(Y + idx);
+        float32x4_t a = vld1q_f32 (A + idx); // [ar0, ai0, ar1, ai1]
+        float32x4_t b = vld1q_f32 (B + idx); // [br0, bi0, br1, bi1]
+        float32x4_t y = vld1q_f32 (Y + idx);
 
         // Shuffle a and b to get swapped real/imag for cross-multiplication
-        float32x4_t a_shuf = vrev64q_f32(a); // [ai0, ar0, ai1, ar1]
-        float32x4_t b_shuf = vrev64q_f32(b); // [bi0, br0, bi1, br1]
+        float32x4_t a_shuf = vrev64q_f32 (a); // [ai0, ar0, ai1, ar1]
+        float32x4_t b_shuf = vrev64q_f32 (b); // [bi0, br0, bi1, br1]
 
         // real = ar*br - ai*bi
-        float32x4_t realPart = vsubq_f32(vmulq_f32(a, b), vmulq_f32(a_shuf, b_shuf));
+        float32x4_t realPart = vsubq_f32 (vmulq_f32 (a, b), vmulq_f32 (a_shuf, b_shuf));
 
         // imag = ar*bi + ai*br
-        float32x4_t imagPart = vaddq_f32(vmulq_f32(a, b_shuf), vmulq_f32(a_shuf, b));
+        float32x4_t imagPart = vaddq_f32 (vmulq_f32 (a, b_shuf), vmulq_f32 (a_shuf, b));
 
         // Interleave real and imag: [real0, imag0, real1, imag1]
-        float32x2_t realLow = vget_low_f32(realPart);
-        float32x2_t imagLow = vget_low_f32(imagPart);
-        float32x2x2_t zippedLow = vzip_f32(realLow, imagLow);
+        float32x2_t realLow = vget_low_f32 (realPart);
+        float32x2_t imagLow = vget_low_f32 (imagPart);
+        float32x2x2_t zippedLow = vzip_f32 (realLow, imagLow);
 
-        float32x2_t realHigh = vget_high_f32(realPart);
-        float32x2_t imagHigh = vget_high_f32(imagPart);
-        float32x2x2_t zippedHigh = vzip_f32(realHigh, imagHigh);
+        float32x2_t realHigh = vget_high_f32 (realPart);
+        float32x2_t imagHigh = vget_high_f32 (imagPart);
+        float32x2x2_t zippedHigh = vzip_f32 (realHigh, imagHigh);
 
-        float32x4_t interleaved = vcombine_f32(zippedLow.val[0], zippedHigh.val[0]);
+        float32x4_t interleaved = vcombine_f32 (zippedLow.val[0], zippedHigh.val[0]);
 
-        y = vaddq_f32(y, interleaved);
-        vst1q_f32(Y + idx, y);
+        y = vaddq_f32 (y, interleaved);
+        vst1q_f32 (Y + idx, y);
     }
 
 #endif
@@ -172,7 +172,8 @@ class PartitionedConvolver::DirectFIR
     void process (const float* input, float* output, std::size_t numSamples)
     {
         const std::size_t numTaps = taps_.size();
-        if (numTaps == 0) return;
+        if (numTaps == 0)
+            return;
 
         for (std::size_t i = 0; i < numSamples; ++i)
         {
@@ -228,14 +229,16 @@ class PartitionedConvolver::FFTLayer
         overlapBuffer_.assign (static_cast<std::size_t> (hopSize_), 0.0f);
         timeBuffer_.assign (static_cast<std::size_t> (fftSize_), 0.0f);
         frequencyBuffer_.assign (static_cast<std::size_t> (fftSize_) * 2, 0.0f);
-        tempBuffer_.assign (static_cast<std::size_t> (fftSize_) * 2, 0.0f);  // Must hold complex data for in-place FFT
+        tempBuffer_.assign (static_cast<std::size_t> (fftSize_) * 2, 0.0f); // Must hold complex data for in-place FFT
 
         fdlIndex_ = 0;
         configured_ = true;
     }
 
     int getHopSize() const { return hopSize_; }
+
     int getFFTSize() const { return fftSize_; }
+
     bool isConfigured() const { return configured_; }
 
     std::size_t setImpulseResponse (const float* impulseResponse, std::size_t length, float scaling)
@@ -357,7 +360,7 @@ class PartitionedConvolver::FFTLayer
         }
     }
 
-    bool hasImpulseResponse() const { return !frequencyPartitions_.empty(); }
+    bool hasImpulseResponse() const { return ! frequencyPartitions_.empty(); }
 
 private:
     int hopSize_ = 0;
@@ -387,13 +390,13 @@ class PartitionedConvolver::CircularBuffer
 {
 public:
     CircularBuffer() = default;
-    
+
     void resize (std::size_t size)
     {
         buffer_.resize (size);
         clear();
     }
-    
+
     void clear()
     {
         std::fill (buffer_.begin(), buffer_.end(), 0.0f);
@@ -401,68 +404,73 @@ class PartitionedConvolver::CircularBuffer
         readIndex_ = 0;
         availableForRead_ = 0;
     }
-    
+
     std::size_t getAvailableForRead() const { return availableForRead_; }
+
     std::size_t getAvailableForWrite() const { return buffer_.size() - availableForRead_; }
+
     std::size_t getSize() const { return buffer_.size(); }
-    
+
     void write (const float* data, std::size_t numSamples)
     {
         jassert (numSamples <= getAvailableForWrite());
         numSamples = std::min (numSamples, getAvailableForWrite());
-        
-        if (numSamples == 0) return;
-        
+
+        if (numSamples == 0)
+            return;
+
         const std::size_t beforeWrap = std::min (numSamples, buffer_.size() - writeIndex_);
         const std::size_t afterWrap = numSamples - beforeWrap;
-        
+
         std::copy (data, data + beforeWrap, buffer_.begin() + writeIndex_);
         if (afterWrap > 0)
             std::copy (data + beforeWrap, data + numSamples, buffer_.begin());
-        
+
         writeIndex_ = (writeIndex_ + numSamples) % buffer_.size();
         availableForRead_ += numSamples;
     }
-    
+
     void read (float* data, std::size_t numSamples)
     {
         jassert (numSamples <= getAvailableForRead());
         numSamples = std::min (numSamples, getAvailableForRead());
-        
-        if (numSamples == 0) return;
-        
+
+        if (numSamples == 0)
+            return;
+
         const std::size_t beforeWrap = std::min (numSamples, buffer_.size() - readIndex_);
         const std::size_t afterWrap = numSamples - beforeWrap;
-        
+
         std::copy (buffer_.begin() + readIndex_, buffer_.begin() + readIndex_ + beforeWrap, data);
         if (afterWrap > 0)
             std::copy (buffer_.begin(), buffer_.begin() + afterWrap, data + beforeWrap);
-        
+
         readIndex_ = (readIndex_ + numSamples) % buffer_.size();
         availableForRead_ -= numSamples;
     }
-    
+
     void peek (float* data, std::size_t numSamples, std::size_t offset = 0) const
     {
         jassert (numSamples + offset <= getAvailableForRead());
         numSamples = std::min (numSamples, getAvailableForRead() - offset);
-        
-        if (numSamples == 0) return;
-        
+
+        if (numSamples == 0)
+            return;
+
         const std::size_t startIndex = (readIndex_ + offset) % buffer_.size();
         const std::size_t beforeWrap = std::min (numSamples, buffer_.size() - startIndex);
         const std::size_t afterWrap = numSamples - beforeWrap;
-        
+
         std::copy (buffer_.begin() + startIndex, buffer_.begin() + startIndex + beforeWrap, data);
         if (afterWrap > 0)
             std::copy (buffer_.begin(), buffer_.begin() + afterWrap, data + beforeWrap);
     }
-    
+
     void skip (std::size_t numSamples)
     {
         jassert (numSamples <= getAvailableForRead());
         numSamples = std::min (numSamples, getAvailableForRead());
-        
+
         readIndex_ = (readIndex_ + numSamples) % buffer_.size();
         availableForRead_ -= numSamples;
     }
@@ -504,7 +512,7 @@ class PartitionedConvolver::Impl
         }
 
         maxHopSize_ = maximumHopSize;
-        
+
         // Clear staging buffers - will be allocated in prepare()
         inputStaging_.clear();
         outputStaging_.clear();
@@ -512,21 +520,21 @@ class PartitionedConvolver::Impl
         // Resize per-layer circular buffers - will be allocated in prepare()
         layerInputBuffers_.resize (layers.size());
         layerOutputBuffers_.resize (layers.size());
-        
+
         layerTempOutput_.clear();
         tempLayerHop_.clear();
-        
+
         // Clear working buffers - will be allocated in prepare()
         workingInput_.clear();
         workingOutput_.clear();
-        
+
         isPrepared_ = false;
     }
 
     void prepare (std::size_t maxBlockSize)
     {
         maxBlockSize_ = maxBlockSize;
-        
+
         // Calculate buffer sizes - generous but fixed allocation
         const std::size_t inputBufferSize = maxBlockSize; // Input staging for all layers
         const std::size_t outputBufferSize = maxHopSize_; // Output buffering for all layers
@@ -541,18 +549,18 @@ class PartitionedConvolver::Impl
             layerInputBuffers_[i].resize (inputBufferSize);
             layerOutputBuffers_[i].resize (outputBufferSize);
         }
-        
+
         // Allocate temp buffers
         if (maxHopSize_ > 0)
         {
             layerTempOutput_.resize (maxHopSize_);
             tempLayerHop_.resize (maxHopSize_);
         }
-        
+
         // Allocate working buffers
         workingInput_.resize (maxBlockSize);
         workingOutput_.resize (maxBlockSize);
-        
+
         isPrepared_ = true;
     }
 
@@ -624,7 +632,8 @@ class PartitionedConvolver::Impl
 
     void process (const float* input, float* output, std::size_t numSamples)
     {
-        if (numSamples == 0) return;
+        if (numSamples == 0)
+            return;
 
         SpinLock::ScopedLockType lock (processingLock_);
 
@@ -655,8 +664,8 @@ class PartitionedConvolver::Impl
         // Ensure prepare() was called
         jassert (isPrepared_);
         jassert (numSamples <= maxBlockSize_);
-        
-        if (!isPrepared_ || numSamples > maxBlockSize_)
+
+        if (! isPrepared_ || numSamples > maxBlockSize_)
             return; // Fail gracefully in release builds
 
         FloatVectorOperations::copy (workingInput_.data(), input, numSamples);
@@ -677,10 +686,10 @@ class PartitionedConvolver::Impl
         while (getInputStagingAvailable() >= static_cast<std::size_t> (baseHopSize_))
         {
             const std::size_t hopSize = static_cast<std::size_t> (baseHopSize_);
-            
+
             // Read hop from input staging
             readFromInputStaging (tempLayerHop_.data(), hopSize);
-            
+
             FloatVectorOperations::clear (outputStaging_.data(), outputStaging_.size());
 
             for (std::size_t layerIndex = 0; layerIndex < layers_.size(); ++layerIndex)
@@ -698,7 +707,7 @@ class PartitionedConvolver::Impl
                 {
                     // Read a full hop for this layer
                     inputBuffer.read (tempLayerHop_.data(), static_cast<std::size_t> (layerHopSize));
-                    
+
                     FloatVectorOperations::clear (layerTempOutput_.data(), layerHopSize);
 
                     if (layer.hasImpulseResponse())
@@ -732,36 +741,38 @@ class PartitionedConvolver::Impl
         const std::size_t available = inputStaging_.size() - inputStagingAvailable_;
         jassert (numSamples <= available);
         numSamples = std::min (numSamples, available);
-        if (numSamples == 0) return;
-        
+        if (numSamples == 0)
+            return;
+
         const std::size_t beforeWrap = std::min (numSamples, inputStaging_.size() - inputStagingWriteIndex_);
         const std::size_t afterWrap = numSamples - beforeWrap;
-        
+
         std::copy (data, data + beforeWrap, inputStaging_.begin() + inputStagingWriteIndex_);
         if (afterWrap > 0)
             std::copy (data + beforeWrap, data + numSamples, inputStaging_.begin());
-        
+
         inputStagingWriteIndex_ = (inputStagingWriteIndex_ + numSamples) % inputStaging_.size();
         inputStagingAvailable_ += numSamples;
     }
-    
+
     void readFromInputStaging (float* data, std::size_t numSamples)
     {
         jassert (numSamples <= inputStagingAvailable_);
         numSamples = std::min (numSamples, inputStagingAvailable_);
-        if (numSamples == 0) return;
-        
+        if (numSamples == 0)
+            return;
+
         const std::size_t beforeWrap = std::min (numSamples, inputStaging_.size() - inputStagingReadIndex_);
         const std::size_t afterWrap = numSamples - beforeWrap;
-        
+
         std::copy (inputStaging_.begin() + inputStagingReadIndex_, inputStaging_.begin() + inputStagingReadIndex_ + beforeWrap, data);
         if (afterWrap > 0)
             std::copy (inputStaging_.begin(), inputStaging_.begin() + afterWrap, data + beforeWrap);
-        
+
         inputStagingReadIndex_ = (inputStagingReadIndex_ + numSamples) % inputStaging_.size();
         inputStagingAvailable_ -= numSamples;
     }
-    
+
     std::size_t getInputStagingAvailable() const { return inputStagingAvailable_; }
 
     std::size_t directFIRTapCount_ = 0;
@@ -827,7 +838,7 @@ void PartitionedConvolver::setTypicalLayout (std::size_t directTaps, const std::
     layerSpecs.reserve (hops.size());
 
     for (int hop : hops)
-        layerSpecs.push_back ({hop});
+        layerSpecs.push_back ({ hop });
 
     configureLayers (directTaps, layerSpecs);
 }
diff --git a/modules/yup_dsp/convolution/yup_PartitionedConvolver.h b/modules/yup_dsp/convolution/yup_PartitionedConvolver.h
index a4e4d4d5b..78f3cb176 100644
--- a/modules/yup_dsp/convolution/yup_PartitionedConvolver.h
+++ b/modules/yup_dsp/convolution/yup_PartitionedConvolver.h
@@ -65,7 +65,7 @@ class PartitionedConvolver
     /** Configuration for a single FFT-based convolution layer */
     struct LayerSpec
     {
-        int hopSize;  /**< Partition size L (FFT size will be 2*L) */
+        int hopSize; /**< Partition size L (FFT size will be 2*L) */
     };
 
     //==============================================================================
@@ -153,7 +153,7 @@ class PartitionedConvolver
     */
     void process (const float* input, float* output, std::size_t numSamples);
 
-        /**
+    /**
         Reset all internal processing state (clears delay lines, overlap buffers).
         Impulse response partitions are preserved.
     */

From 230f934989211297ec53b26df832e096e0178278 Mon Sep 17 00:00:00 2001
From: kunitoki <kunitoki@gmail.com>
Date: Tue, 26 Aug 2025 22:54:39 +0200
Subject: [PATCH 07/37] More tweaks

---
 .../convolution/yup_PartitionedConvolver.cpp        | 13 +++----------
 1 file changed, 3 insertions(+), 10 deletions(-)

diff --git a/modules/yup_dsp/convolution/yup_PartitionedConvolver.cpp b/modules/yup_dsp/convolution/yup_PartitionedConvolver.cpp
index d4ecc2e64..88ea1c489 100644
--- a/modules/yup_dsp/convolution/yup_PartitionedConvolver.cpp
+++ b/modules/yup_dsp/convolution/yup_PartitionedConvolver.cpp
@@ -321,10 +321,7 @@ class PartitionedConvolver::FFTLayer
             return;
 
         // 1) Transform current input hop to frequency domain
-        std::fill (tempBuffer_.begin(), tempBuffer_.end(), 0.0f);
-        for (int i = 0; i < hopSize_; ++i)
-            tempBuffer_[i] = inputHop[i];
-
+        FloatVectorOperations::copy (tempBuffer_.data(), inputHop, hopSize_);
         fftProcessor_.performRealFFTForward (tempBuffer_.data(), tempBuffer_.data());
 
         // 2) Store in frequency delay line (circular buffer) - copy full complex buffer
@@ -332,7 +329,7 @@ class PartitionedConvolver::FFTLayer
         std::copy (tempBuffer_.begin(), tempBuffer_.begin() + (fftSize_ * 2), frequencyDelayLine_[static_cast<std::size_t> (fdlIndex_)].begin());
 
         // 3) Frequency domain convolution: Y = sum(X[k-p] * H[p])
-        std::fill (frequencyBuffer_.data(), frequencyBuffer_.data() + (fftSize_ * 2), 0.0f);
+        FloatVectorOperations::clear (frequencyBuffer_.data(), fftSize_ * 2);
 
         int xIndex = fdlIndex_;
         for (std::size_t p = 0; p < frequencyPartitions_.size(); ++p)
@@ -661,12 +658,10 @@ class PartitionedConvolver::Impl
 
     void processUnsafe (const float* input, float* output, std::size_t numSamples)
     {
-        // Ensure prepare() was called
         jassert (isPrepared_);
         jassert (numSamples <= maxBlockSize_);
-
         if (! isPrepared_ || numSamples > maxBlockSize_)
-            return; // Fail gracefully in release builds
+            return;
 
         FloatVectorOperations::copy (workingInput_.data(), input, numSamples);
         FloatVectorOperations::clear (workingOutput_.data(), numSamples);
@@ -689,7 +684,6 @@ class PartitionedConvolver::Impl
 
             // Read hop from input staging
             readFromInputStaging (tempLayerHop_.data(), hopSize);
-
             FloatVectorOperations::clear (outputStaging_.data(), outputStaging_.size());
 
             for (std::size_t layerIndex = 0; layerIndex < layers_.size(); ++layerIndex)
@@ -707,7 +701,6 @@ class PartitionedConvolver::Impl
                 {
                     // Read a full hop for this layer
                     inputBuffer.read (tempLayerHop_.data(), static_cast<std::size_t> (layerHopSize));
-
                     FloatVectorOperations::clear (layerTempOutput_.data(), layerHopSize);
 
                     if (layer.hasImpulseResponse())

From 15d5ce4c3e184c4bd1ce8bd6e8cef1a29d255240 Mon Sep 17 00:00:00 2001
From: kunitoki <kunitoki@gmail.com>
Date: Tue, 26 Aug 2025 23:30:47 +0200
Subject: [PATCH 08/37] More work on Convolution

---
 .../source/examples/ConvolutionDemo.h         |   2 +-
 modules/yup_audio_basics/yup_audio_basics.h   |   7 +
 .../convolution/yup_PartitionedConvolver.cpp  | 146 ++++++++++++++----
 modules/yup_dsp/yup_dsp.cpp                   |  17 ++
 4 files changed, 142 insertions(+), 30 deletions(-)

diff --git a/examples/graphics/source/examples/ConvolutionDemo.h b/examples/graphics/source/examples/ConvolutionDemo.h
index 8885d8ae9..4b1abc252 100644
--- a/examples/graphics/source/examples/ConvolutionDemo.h
+++ b/examples/graphics/source/examples/ConvolutionDemo.h
@@ -60,7 +60,7 @@ class ConvolutionDemo
         dryGain.setCurrentAndTargetValue (0.3f);
 
         // Configure convolver with typical layout
-        convolver.setTypicalLayout (128, {128, 256, 512, 1024, 4096});
+        convolver.setTypicalLayout (128, {128, 512, 2048});
 
         // Create UI
         createUI();
diff --git a/modules/yup_audio_basics/yup_audio_basics.h b/modules/yup_audio_basics/yup_audio_basics.h
index fb69411df..79fee813d 100644
--- a/modules/yup_audio_basics/yup_audio_basics.h
+++ b/modules/yup_audio_basics/yup_audio_basics.h
@@ -80,9 +80,16 @@
 #endif
 #endif
 
+#ifndef YUP_USE_FMA_INTRINSICS
+#if defined (__FMA__)
+#define YUP_USE_FMA_INTRINSICS 1
+#endif
+#endif
+
 #if ! YUP_INTEL
 #undef YUP_USE_SSE_INTRINSICS
 #undef YUP_USE_AVX_INTRINSICS
+#undef YUP_USE_FMA_INTRINSICS
 #endif
 
 #if __ARM_NEON__ && ! (YUP_USE_VDSP_FRAMEWORK || defined(YUP_USE_ARM_NEON))
diff --git a/modules/yup_dsp/convolution/yup_PartitionedConvolver.cpp b/modules/yup_dsp/convolution/yup_PartitionedConvolver.cpp
index 88ea1c489..03166fa30 100644
--- a/modules/yup_dsp/convolution/yup_PartitionedConvolver.cpp
+++ b/modules/yup_dsp/convolution/yup_PartitionedConvolver.cpp
@@ -155,11 +155,16 @@ class PartitionedConvolver::DirectFIR
 
     void setTaps (std::vector<float> taps, float scaling)
     {
-        taps_ = std::move (taps);
-        FloatVectorOperations::multiply (taps_.data(), scaling, taps_.size());
+        FloatVectorOperations::multiply (taps.data(), scaling, taps.size());
 
-        history_.assign (taps_.size(), 0.0f);
+        tapsReversed_ = std::move (taps);
+        std::reverse (tapsReversed_.begin(), tapsReversed_.end());
 
+        numTaps_ = tapsReversed_.size();
+        paddedLen_ = (numTaps_ + 3u) & ~3u;
+        tapsReversed_.resize (paddedLen_, 0.0f);
+
+        history_.assign (2 * numTaps_, 0.0f);
         writeIndex_ = 0;
     }
 
@@ -169,41 +174,120 @@ class PartitionedConvolver::DirectFIR
         writeIndex_ = 0;
     }
 
-    void process (const float* input, float* output, std::size_t numSamples)
+    void process (const float* input, float* output, std::size_t numSamples) noexcept
     {
-        const std::size_t numTaps = taps_.size();
-        if (numTaps == 0)
+        const std::size_t M = numTaps_;
+        if (M == 0)
             return;
 
+        const float* h = tapsReversed_.data();
         for (std::size_t i = 0; i < numSamples; ++i)
         {
-            history_[writeIndex_] = input[i];
+            const float x = input[i];
+
+            history_[writeIndex_] = x;
+            history_[writeIndex_ + M] = x;
+
+            const float* w = history_.data() + writeIndex_ + 1;
 
-            // Convolution: y[n] = sum(h[m] * x[n-m])
             float sum = 0.0f;
-            std::size_t readIndex = writeIndex_;
-            for (std::size_t m = 0; m < numTaps; ++m)
-            {
-                sum += taps_[m] * history_[readIndex];
-                if (readIndex == 0)
-                    readIndex = numTaps - 1;
-                else
-                    --readIndex;
-            }
+
+#if YUP_ENABLE_VDSP
+            vDSP_dotpr (w, 1, h, 1, &sum, M);
+#else
+            sum = dotProduct (w, h, M, paddedLen_);
+#endif
 
             output[i] += sum;
 
-            // Advance circular buffer
-            if (++writeIndex_ == numTaps)
+            if (++writeIndex_ == M)
                 writeIndex_ = 0;
         }
     }
 
-    std::size_t getNumTaps() const { return taps_.size(); }
+    std::size_t getNumTaps() const
+    {
+        return numTaps_;
+    }
 
 private:
-    std::vector<float> taps_;
+    static float dotProduct (const float* __restrict a, const float* __restrict b, std::size_t len, std::size_t paddedLen) noexcept
+    {
+        // Returns sum (a[k] * b[k]) for k in [0, len): SIMD main loop plus scalar tail (paddedLen is not read here).
+        float acc = 0.0f;
+        std::size_t i = 0; // shared by every SIMD branch below and by the scalar tail loop
+#if YUP_USE_AVX_INTRINSICS && YUP_USE_FMA_INTRINSICS
+        // 8-wide AVX + FMA path (unaligned loads, no alignment requirement on a or b)
+        __m256 vacc = _mm256_setzero_ps();
+        for (; i + 8 <= len; i += 8)
+        {
+            __m256 va = _mm256_loadu_ps (a + i);
+            __m256 vb = _mm256_loadu_ps (b + i);
+            vacc = _mm256_fmadd_ps (va, vb, vacc);
+        }
+        // horizontal add: fold the two 128-bit halves, then reduce the 4 lanes to one
+        __m128 low = _mm256_castps256_ps128 (vacc);
+        __m128 high = _mm256_extractf128_ps (vacc, 1);
+        __m128 vsum = _mm_add_ps (low, high);
+        vsum = _mm_hadd_ps (vsum, vsum);
+        vsum = _mm_hadd_ps (vsum, vsum);
+        acc += _mm_cvtss_f32 (vsum);
+
+#elif YUP_USE_SSE_INTRINSICS
+        __m128 vacc = _mm_setzero_ps();
+        // NOTE: `i` is declared once at function scope; redeclaring it in this branch does not compile.
+#if YUP_USE_FMA_INTRINSICS
+        for (; i + 4 <= len; i += 4)
+        {
+            __m128 va = _mm_loadu_ps (a + i);
+            __m128 vb = _mm_loadu_ps (b + i);
+            vacc = _mm_fmadd_ps (va, vb, vacc);
+        }
+#else
+        for (; i + 4 <= len; i += 4)
+        {
+            __m128 va = _mm_loadu_ps (a + i);
+            __m128 vb = _mm_loadu_ps (b + i);
+            vacc = _mm_add_ps (vacc, _mm_mul_ps (va, vb));
+        }
+#endif
+        // horizontal add using SSE1 shuffles only (_mm_movehdup_ps would require SSE3)
+        __m128 shuf = _mm_shuffle_ps (vacc, vacc, _MM_SHUFFLE (2, 3, 0, 1));
+        __m128 sums = _mm_add_ps (vacc, shuf);
+        shuf = _mm_movehl_ps (shuf, sums);
+        sums = _mm_add_ss (sums, shuf);
+        acc += _mm_cvtss_f32 (sums);
+
+#elif YUP_USE_ARM_NEON
+        float32x4_t vacc = vdupq_n_f32 (0.0f);
+        for (; i + 4 <= len; i += 4)
+        {
+            float32x4_t va = vld1q_f32 (a + i);
+            float32x4_t vb = vld1q_f32 (b + i);
+            vacc = vmlaq_f32 (vacc, va, vb);
+        }
+#if YUP_64BIT
+        acc += vaddvq_f32 (vacc);
+#else
+        float32x2_t vlow = vget_low_f32 (vacc);
+        float32x2_t vhigh = vget_high_f32 (vacc);
+        float32x2_t vsum2 = vpadd_f32 (vlow, vhigh);
+        vsum2 = vpadd_f32 (vsum2, vsum2);
+        acc += vget_lane_f32 (vsum2, 0);
+#endif
+
+#endif
+
+        for (; i < len; ++i)
+            acc += a[i] * b[i];
+
+        return acc;
+    }
+
+    std::vector<float> tapsReversed_;
     std::vector<float> history_;
+    std::size_t numTaps_ = 0;
+    std::size_t paddedLen_ = 0;
     std::size_t writeIndex_ = 0;
 };
 
@@ -532,19 +616,23 @@ class PartitionedConvolver::Impl
     {
         maxBlockSize_ = maxBlockSize;
 
-        // Calculate buffer sizes - generous but fixed allocation
-        const std::size_t inputBufferSize = maxBlockSize; // Input staging for all layers
-        const std::size_t outputBufferSize = maxHopSize_; // Output buffering for all layers
-
         // Prepare main input staging
-        inputStaging_.resize (inputBufferSize);
+        inputStaging_.resize (maxBlockSize);
         outputStaging_.assign (static_cast<std::size_t> (baseHopSize_), 0.0f);
 
-        // Prepare per-layer circular buffers
+        // Prepare per-layer circular buffers with layer-specific sizing
         for (std::size_t i = 0; i < layerInputBuffers_.size(); ++i)
         {
-            layerInputBuffers_[i].resize (inputBufferSize);
-            layerOutputBuffers_[i].resize (outputBufferSize);
+            const std::size_t layerHopSize = static_cast<std::size_t> (layers_[i].getHopSize());
+
+            // Input buffer: needs to accumulate up to layerHopSize samples plus incoming block
+            const std::size_t layerInputBufferSize = layerHopSize + maxBlockSize;
+            layerInputBuffers_[i].resize (layerInputBufferSize);
+
+            // Output buffer: needs to handle bursts of layerHopSize samples
+            // Size it to handle multiple hops since read rate (baseHopSize) may be much smaller than write rate (layerHopSize)
+            const std::size_t layerOutputBufferSize = layerHopSize * ((layerHopSize / static_cast<std::size_t> (baseHopSize_)) + 2);
+            layerOutputBuffers_[i].resize (layerOutputBufferSize);
         }
 
         // Allocate temp buffers
diff --git a/modules/yup_dsp/yup_dsp.cpp b/modules/yup_dsp/yup_dsp.cpp
index ec420b409..4d7e9a73e 100644
--- a/modules/yup_dsp/yup_dsp.cpp
+++ b/modules/yup_dsp/yup_dsp.cpp
@@ -30,6 +30,23 @@
 
 #include "yup_dsp.h"
 
+//==============================================================================
+#if YUP_USE_AVX_INTRINSICS || YUP_USE_FMA_INTRINSICS
+#include <immintrin.h>
+#endif
+
+#if YUP_USE_SSE_INTRINSICS
+#include <emmintrin.h>
+#endif
+
+#if YUP_USE_ARM_NEON
+#include <arm_neon.h>
+#endif
+
+#if (YUP_MAC || YUP_IOS) && YUP_ENABLE_VDSP
+#include <Accelerate/Accelerate.h>
+#endif
+
 //==============================================================================
 #include <atomic>
 #include <thread>

From c870ab3c7fe83bb0147fc5bbcabc60238d2e9421 Mon Sep 17 00:00:00 2001
From: kunitoki <kunitoki@gmail.com>
Date: Tue, 26 Aug 2025 23:45:03 +0200
Subject: [PATCH 09/37] Remove unused variable

---
 modules/yup_dsp/convolution/yup_PartitionedConvolver.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/modules/yup_dsp/convolution/yup_PartitionedConvolver.cpp b/modules/yup_dsp/convolution/yup_PartitionedConvolver.cpp
index 03166fa30..d627ad42f 100644
--- a/modules/yup_dsp/convolution/yup_PartitionedConvolver.cpp
+++ b/modules/yup_dsp/convolution/yup_PartitionedConvolver.cpp
@@ -195,7 +195,7 @@ class PartitionedConvolver::DirectFIR
 #if YUP_ENABLE_VDSP
             vDSP_dotpr (w, 1, h, 1, &sum, M);
 #else
-            sum = dotProduct (w, h, M, paddedLen_);
+            sum = dotProduct (w, h, M);
 #endif
 
             output[i] += sum;
@@ -211,7 +211,7 @@ class PartitionedConvolver::DirectFIR
     }
 
 private:
-    static float dotProduct (const float* __restrict a, const float* __restrict b, std::size_t len, std::size_t paddedLen) noexcept
+    static float dotProduct (const float* __restrict a, const float* __restrict b, std::size_t len) noexcept
     {
         float acc = 0.0f;
         std::size_t i = 0;

From 7f92cd3115638d3850719bb7c8398691b6c9692f Mon Sep 17 00:00:00 2001
From: kunitoki <kunitoki@gmail.com>
Date: Wed, 27 Aug 2025 00:08:42 +0200
Subject: [PATCH 10/37] More tweaks

---
 .../yup_dsp/convolution/yup_PartitionedConvolver.cpp | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/modules/yup_dsp/convolution/yup_PartitionedConvolver.cpp b/modules/yup_dsp/convolution/yup_PartitionedConvolver.cpp
index d627ad42f..078707bc4 100644
--- a/modules/yup_dsp/convolution/yup_PartitionedConvolver.cpp
+++ b/modules/yup_dsp/convolution/yup_PartitionedConvolver.cpp
@@ -616,8 +616,9 @@ class PartitionedConvolver::Impl
     {
         maxBlockSize_ = maxBlockSize;
 
-        // Prepare main input staging
-        inputStaging_.resize (maxBlockSize);
+        // Prepare main input staging - needs to accumulate up to baseHopSize samples plus incoming block
+        const std::size_t inputStagingSize = static_cast<std::size_t> (baseHopSize_) + maxBlockSize;
+        inputStaging_.resize (inputStagingSize);
         outputStaging_.assign (static_cast<std::size_t> (baseHopSize_), 0.0f);
 
         // Prepare per-layer circular buffers with layer-specific sizing
@@ -777,6 +778,9 @@ class PartitionedConvolver::Impl
             for (std::size_t layerIndex = 0; layerIndex < layers_.size(); ++layerIndex)
             {
                 auto& layer = layers_[layerIndex];
+                if (! layer.hasImpulseResponse())
+                    continue;
+
                 const int layerHopSize = layer.getHopSize();
                 auto& inputBuffer = layerInputBuffers_[layerIndex];
                 auto& outputBuffer = layerOutputBuffers_[layerIndex];
@@ -791,8 +795,8 @@ class PartitionedConvolver::Impl
                     inputBuffer.read (tempLayerHop_.data(), static_cast<std::size_t> (layerHopSize));
                     FloatVectorOperations::clear (layerTempOutput_.data(), layerHopSize);
 
-                    if (layer.hasImpulseResponse())
-                        layer.processHop (tempLayerHop_.data(), layerTempOutput_.data());
+                    // Process hop
+                    layer.processHop (tempLayerHop_.data(), layerTempOutput_.data());
 
                     // Write output to layer's output buffer
                     outputBuffer.write (layerTempOutput_.data(), static_cast<std::size_t> (layerHopSize));

From 09166e549f3fa8de1080d0a218fd6f6189248fd8 Mon Sep 17 00:00:00 2001
From: kunitoki <kunitoki@gmail.com>
Date: Wed, 27 Aug 2025 00:26:15 +0200
Subject: [PATCH 11/37] Less copying

---
 examples/graphics/source/examples/ConvolutionDemo.h      | 2 +-
 modules/yup_dsp/convolution/yup_PartitionedConvolver.cpp | 8 ++------
 2 files changed, 3 insertions(+), 7 deletions(-)

diff --git a/examples/graphics/source/examples/ConvolutionDemo.h b/examples/graphics/source/examples/ConvolutionDemo.h
index 4b1abc252..a984b57eb 100644
--- a/examples/graphics/source/examples/ConvolutionDemo.h
+++ b/examples/graphics/source/examples/ConvolutionDemo.h
@@ -60,7 +60,7 @@ class ConvolutionDemo
         dryGain.setCurrentAndTargetValue (0.3f);
 
         // Configure convolver with typical layout
-        convolver.setTypicalLayout (128, {128, 512, 2048});
+        convolver.setTypicalLayout (256, {256, 1024, 4096});
 
         // Create UI
         createUI();
diff --git a/modules/yup_dsp/convolution/yup_PartitionedConvolver.cpp b/modules/yup_dsp/convolution/yup_PartitionedConvolver.cpp
index 078707bc4..9eab19c50 100644
--- a/modules/yup_dsp/convolution/yup_PartitionedConvolver.cpp
+++ b/modules/yup_dsp/convolution/yup_PartitionedConvolver.cpp
@@ -606,7 +606,6 @@ class PartitionedConvolver::Impl
         tempLayerHop_.clear();
 
         // Clear working buffers - will be allocated in prepare()
-        workingInput_.clear();
         workingOutput_.clear();
 
         isPrepared_ = false;
@@ -644,7 +643,6 @@ class PartitionedConvolver::Impl
         }
 
         // Allocate working buffers
-        workingInput_.resize (maxBlockSize);
         workingOutput_.resize (maxBlockSize);
 
         isPrepared_ = true;
@@ -752,11 +750,10 @@ class PartitionedConvolver::Impl
         if (! isPrepared_ || numSamples > maxBlockSize_)
             return;
 
-        FloatVectorOperations::copy (workingInput_.data(), input, numSamples);
         FloatVectorOperations::clear (workingOutput_.data(), numSamples);
 
         // Process direct FIR (no block size constraints)
-        directFIR_.process (workingInput_.data(), workingOutput_.data(), numSamples);
+        directFIR_.process (input, workingOutput_.data(), numSamples);
         if (layers_.empty())
         {
             FloatVectorOperations::add (output, workingOutput_.data(), numSamples);
@@ -764,7 +761,7 @@ class PartitionedConvolver::Impl
         }
 
         // Add input to main input staging buffer using circular buffer logic
-        writeToInputStaging (workingInput_.data(), numSamples);
+        writeToInputStaging (input, numSamples);
 
         std::size_t outputSamplesProduced = 0;
         while (getInputStagingAvailable() >= static_cast<std::size_t> (baseHopSize_))
@@ -870,7 +867,6 @@ class PartitionedConvolver::Impl
     std::vector<FFTLayer> layers_;
 
     // Working buffers
-    std::vector<float> workingInput_;
     std::vector<float> workingOutput_;
 
     // Input staging with circular buffer management

From e8cefe637297b0e879ce4c5f6f926e16baf5a3ce Mon Sep 17 00:00:00 2001
From: kunitoki <kunitoki@gmail.com>
Date: Wed, 27 Aug 2025 00:27:49 +0200
Subject: [PATCH 12/37] Some tests

---
 tests/yup_dsp/yup_PartitionedConvolver.cpp | 838 +++++++++++++++++++++
 1 file changed, 838 insertions(+)
 create mode 100644 tests/yup_dsp/yup_PartitionedConvolver.cpp

diff --git a/tests/yup_dsp/yup_PartitionedConvolver.cpp b/tests/yup_dsp/yup_PartitionedConvolver.cpp
new file mode 100644
index 000000000..393a01a2f
--- /dev/null
+++ b/tests/yup_dsp/yup_PartitionedConvolver.cpp
@@ -0,0 +1,838 @@
+/*
+  ==============================================================================
+
+   This file is part of the YUP library.
+   Copyright (c) 2025 - kunitoki@gmail.com
+
+   YUP is an open source library subject to open-source licensing.
+
+   The code included in this file is provided under the terms of the ISC license
+   http://www.isc.org/downloads/software-support-policy/isc-license. Permission
+   to use, copy, modify, and/or distribute this software for any purpose with or
+   without fee is hereby granted provided that the above copyright notice and
+   this permission notice appear in all copies.
+
+   YUP IS PROVIDED "AS IS" WITHOUT ANY WARRANTY, AND ALL WARRANTIES, WHETHER
+   EXPRESSED OR IMPLIED, INCLUDING MERCHANTABILITY AND FITNESS FOR PURPOSE, ARE
+   DISCLAIMED.
+
+  ==============================================================================
+*/
+
+#include <yup_dsp/yup_dsp.h>
+
+#include <gtest/gtest.h>
+
+#include <random>
+#include <cmath>
+
+namespace yup::test
+{
+
+//==============================================================================
+class PartitionedConvolverTest : public ::testing::Test
+{
+protected:
+    void SetUp() override
+    {
+        generator.seed (42); // Constant seed so every run sees the same pseudo-random data
+    }
+
+    float randomFloat (float min = -1.0f, float max = 1.0f) // one uniform draw from the shared generator
+    {
+        std::uniform_real_distribution<float> distribution { min, max };
+        return distribution (generator);
+    }
+
+    void fillWithRandomData (std::vector<float>& buffer) // overwrites every element with randomFloat()
+    {
+        for (std::size_t index = 0; index < buffer.size(); ++index)
+            buffer[index] = randomFloat();
+    }
+
+    void fillWithSine (std::vector<float>& buffer, float frequency, float sampleRate)
+    {
+        for (size_t n = 0; n < buffer.size(); ++n)
+            buffer[n] = std::sin (2.0f * MathConstants<float>::pi * frequency * static_cast<float> (n) / sampleRate);
+    }
+
+    void clearBuffer (std::vector<float>& buffer) // zeroes the buffer, size unchanged
+    {
+        buffer.assign (buffer.size(), 0.0f);
+    }
+
+    float calculateRMS (const std::vector<float>& buffer) // root-mean-square; 0 for an empty buffer
+    {
+        float sumOfSquares = 0.0f;
+        for (const float value : buffer)
+            sumOfSquares += value * value;
+
+        if (buffer.empty())
+            return 0.0f;
+
+        return std::sqrt (sumOfSquares / static_cast<float> (buffer.size()));
+    }
+
+    float findPeak (const std::vector<float>& buffer) // largest absolute sample; 0 for an empty buffer
+    {
+        float largest = 0.0f;
+        for (const float value : buffer)
+        {
+            const float magnitude = std::abs (value);
+            largest = std::max (largest, magnitude);
+        }
+
+        return largest;
+    }
+
+    std::mt19937 generator;
+};
+
+//==============================================================================
+// Basic API Tests
+//==============================================================================
+
+TEST_F (PartitionedConvolverTest, DefaultConstruction)
+{
+    PartitionedConvolver convolver;
+    // Smoke test: default construction and destruction at end of scope must not crash
+}
+
+TEST_F (PartitionedConvolverTest, MoveSemantics)
+{
+    PartitionedConvolver source;
+    source.setTypicalLayout (64, { 64, 256 });
+    source.prepare (512);
+
+    // Hand the configured convolver over through the move constructor...
+    PartitionedConvolver moveConstructed (std::move (source));
+
+    // ...and onwards through the move assignment operator
+    PartitionedConvolver moveAssigned;
+    moveAssigned = std::move (moveConstructed);
+
+    // Reaching the end of scope without a crash is the pass condition
+}
+
+TEST_F (PartitionedConvolverTest, BasicConfiguration)
+{
+    PartitionedConvolver engine;
+
+    // Typical layout: 128 direct taps plus layers with hops 128, 512 and 2048
+    engine.setTypicalLayout (128, { 128, 512, 2048 });
+
+    // Preparing for a maximum block size of 512 must not crash
+    engine.prepare (512);
+
+    // Neither must resetting the freshly prepared convolver
+    engine.reset();
+}
+
+TEST_F (PartitionedConvolverTest, ConfigureLayers)
+{
+    // Three explicit layer specs built from 64, 256 and 1024
+    std::vector<PartitionedConvolver::LayerSpec> layerSpecs {
+        { 64 }, { 256 }, { 1024 }
+    };
+
+    PartitionedConvolver configured;
+    configured.configureLayers (32, layerSpecs);
+    configured.prepare (256);
+
+    // Passing means configuration and preparation finished without crashing
+}
+
+//==============================================================================
+// Impulse Response Tests
+//==============================================================================
+
+TEST_F (PartitionedConvolverTest, SetImpulseResponseVector)
+{
+    // 1000 uniformly random taps stand in for a real impulse response
+    std::vector<float> impulseResponse (1000);
+    fillWithRandomData (impulseResponse);
+
+    PartitionedConvolver target;
+    target.setTypicalLayout (64, { 64, 256 });
+    target.prepare (512);
+
+    target.setImpulseResponse (impulseResponse);
+
+    // Success criterion: the std::vector overload returns without crashing
+}
+
+TEST_F (PartitionedConvolverTest, SetImpulseResponsePointer)
+{
+    // 1000 uniformly random taps, handed over as a raw pointer plus length
+    std::vector<float> impulseResponse (1000);
+    fillWithRandomData (impulseResponse);
+
+    PartitionedConvolver target;
+    target.setTypicalLayout (64, { 64, 256 });
+    target.prepare (512);
+
+    target.setImpulseResponse (impulseResponse.data(), impulseResponse.size());
+
+    // Success criterion: the pointer/length overload returns without crashing
+}
+
+TEST_F (PartitionedConvolverTest, SetImpulseResponseWithOptions)
+{
+    // Load options: normalisation on, headroomDb set to -6
+    PartitionedConvolver::IRLoadOptions loadOptions;
+    loadOptions.headroomDb = -6.0f;
+    loadOptions.normalize = true;
+
+    std::vector<float> impulseResponse (1000);
+    fillWithRandomData (impulseResponse);
+
+    PartitionedConvolver target;
+    target.setTypicalLayout (64, { 64, 256 });
+    target.prepare (512);
+
+    target.setImpulseResponse (impulseResponse, loadOptions);
+    // Success criterion: loading with explicit options returns without crashing
+}
+
+TEST_F (PartitionedConvolverTest, EmptyImpulseResponse)
+{
+    PartitionedConvolver subject;
+    subject.setTypicalLayout (64, { 64, 256 });
+    subject.prepare (512);
+
+    std::vector<float> noTaps;
+    subject.setImpulseResponse (noTaps);
+
+    // Run one random block through the convolver after loading the empty IR
+    std::vector<float> output (256);
+    clearBuffer (output);
+    std::vector<float> input (256);
+    fillWithRandomData (input);
+
+    subject.process (input.data(), output.data(), input.size());
+
+    // With nothing to convolve against, every output sample must still be zero
+    for (float sample : output)
+        EXPECT_FLOAT_EQ (sample, 0.0f);
+}
+
+//==============================================================================
+// Audio Processing Tests
+//==============================================================================
+
+TEST_F (PartitionedConvolverTest, ImpulseResponseTest)
+{
+    // Sparse IR: full-scale tap at 0 followed by quieter taps at 10 and 50
+    std::vector<float> sparseIR (256, 0.0f);
+    sparseIR.front() = 1.0f;
+    sparseIR[10] = 0.5f;
+    sparseIR[50] = 0.25f;
+
+    PartitionedConvolver engine;
+    engine.setTypicalLayout (64, { 64, 256 });
+    engine.prepare (512);
+
+    engine.setImpulseResponse (sparseIR);
+
+    // Excite the convolver with a single unit sample at the start of the block
+    std::vector<float> excitation (512, 0.0f);
+    excitation.front() = 1.0f;
+
+    std::vector<float> response (512);
+    clearBuffer (response);
+
+    engine.process (excitation.data(), response.data(), excitation.size());
+
+    // The block should carry energy from the impulse response.
+    // Only the overall level is checked, not the individual tap positions.
+    float outputRMS = calculateRMS (response);
+    EXPECT_GT (outputRMS, 0.01f);
+}
+
+TEST_F (PartitionedConvolverTest, SineWaveConvolution)
+{
+    PartitionedConvolver engine;
+    engine.setTypicalLayout (128, { 128, 512 });
+    engine.prepare (2048);
+
+    // Moving-average lowpass IR:
+    // 32 identical taps, each equal to 1/32
+    const size_t numTaps = 32;
+    const float tapGain = 1.0f / static_cast<float> (numTaps);
+    std::vector<float> movingAverage (numTaps, tapGain);
+
+    engine.setImpulseResponse (movingAverage);
+
+    // Test signal: 1000 Hz sine generated for a 44100 Hz sample rate
+    std::vector<float> sine (2048);
+    const float sampleRate = 44100.0f;
+    const float frequency = 1000.0f;
+    fillWithSine (sine, frequency, sampleRate);
+
+    std::vector<float> filtered (2048);
+    clearBuffer (filtered);
+
+    engine.process (sine.data(), filtered.data(), sine.size());
+
+    // The filtered sine should still carry a clearly non-trivial level
+    float outputRMS = calculateRMS (filtered);
+    EXPECT_GT (outputRMS, 0.1f);
+}
+
+TEST_F (PartitionedConvolverTest, AccumulativeOutput)
+{
+    PartitionedConvolver engine;
+    engine.setTypicalLayout (64, { 64, 256 });
+    engine.prepare (256);
+
+    // Flat IR: 100 taps of 0.1
+    std::vector<float> flatIR (100, 0.1f);
+    engine.setImpulseResponse (flatIR);
+
+    std::vector<float> input (256);
+    fillWithRandomData (input);
+
+    // Start from a non-empty output buffer and keep a snapshot for comparison
+    std::vector<float> output (256);
+    fillWithRandomData (output);
+    const std::vector<float> originalOutput (output);
+
+    engine.process (input.data(), output.data(), input.size());
+
+    // process() must have changed at least one sample by more than 0.001
+    bool hasAccumulated = false;
+    size_t index = 0;
+
+    while (! hasAccumulated && index < output.size())
+    {
+        const float delta = std::abs (output[index] - originalOutput[index]);
+        hasAccumulated = delta > 0.001f;
+        ++index;
+    }
+    EXPECT_TRUE (hasAccumulated);
+}
+
+//==============================================================================
+// Latency Tests
+//==============================================================================
+
+TEST_F (PartitionedConvolverTest, LatencyMeasurement)
+{
+    // Each entry is (direct FIR taps, layer hop sizes); latency is measured for every layout
+    std::vector<std::pair<size_t, std::vector<int>>> configs = {
+        { 64, { 64 } },
+        { 128, { 128 } },
+        { 64, { 64, 256 } },
+        { 128, { 128, 512 } },
+        { 256, { 256, 1024 } }
+    };
+
+    for (const auto& [directTaps, hops] : configs)
+    {
+        PartitionedConvolver convolver;
+        convolver.setTypicalLayout (directTaps, hops);
+        convolver.prepare (1024);
+
+        // Unit impulse response
+        std::vector<float> ir (1000, 0.0f);
+        ir[0] = 1.0f;
+        convolver.setImpulseResponse (ir);
+
+        // Unit impulse input
+        std::vector<float> input (1024, 0.0f);
+        input[0] = 1.0f;
+
+        std::vector<float> output (1024);
+        clearBuffer (output);
+
+        convolver.process (input.data(), output.data(), input.size());
+
+        // Index of the first sample above threshold; stays at output.size() if the block is silent
+        size_t latencySamples = output.size();
+        for (size_t i = 0; i < output.size(); ++i)
+        {
+            if (std::abs (output[i]) > 0.001f)
+            {
+                latencySamples = i;
+                break;
+            }
+        }
+        EXPECT_LT (latencySamples, output.size()) << "No output above threshold";
+        // Latency should be reasonable (no more than twice the largest hop size)
+        const int maxHop = *std::max_element (hops.begin(), hops.end());
+        EXPECT_LE (latencySamples, static_cast<size_t> (maxHop * 2));
+
+        // With direct FIR, latency should be minimal
+        if (directTaps > 0)
+            EXPECT_LE (latencySamples, directTaps);
+    }
+}
+
+//==============================================================================
+// Partition Size Tests
+//==============================================================================
+
+TEST_F (PartitionedConvolverTest, VariousPartitionSizes)
+{
+    // Sweep several layouts; every one has a non-zero number of direct taps
+    std::vector<std::tuple<size_t, std::vector<int>, size_t>> testConfigs = {
+        // (directTaps, hops, maxBlockSize)
+        { 64, { 64 }, 512 },
+        { 32, { 64 }, 512 },
+        { 64, { 64, 256 }, 512 },
+        { 128, { 128, 512 }, 1024 },
+        { 128, { 128, 512, 2048 }, 2048 },
+        { 256, { 256, 1024, 4096 }, 4096 },
+        { 64, { 128, 256, 512 }, 1024 },
+        { 48, { 32, 128, 512 }, 1024 },
+        { 24, { 32, 64, 128 }, 1024 },
+    };
+
+    for (const auto& item : testConfigs)
+    {
+        const auto& directTaps = std::get<0> (item);
+        const auto& hops = std::get<1> (item);
+        const auto& maxBlockSize = std::get<2> (item);
+
+        SCOPED_TRACE (testing::Message() << "Config: directTaps=" << directTaps << " hops=[" << [&]()
+        {
+            std::string hopStr;
+            for (size_t i = 0; i < hops.size(); ++i)
+            {
+                if (i > 0)
+                    hopStr += ",";
+                hopStr += std::to_string (hops[i]);
+            }
+            return hopStr;
+        }() << "] maxBlockSize=" << maxBlockSize);
+
+        PartitionedConvolver convolver;
+
+        // Layout configuration and preparation must not throw
+        EXPECT_NO_THROW (convolver.setTypicalLayout (directTaps, hops));
+        EXPECT_NO_THROW (convolver.prepare (maxBlockSize));
+
+        // Known sparse IR of at most 500 taps (never longer than maxBlockSize)
+        std::vector<float> ir (std::min (static_cast<size_t> (500), maxBlockSize), 0.0f);
+        ir[0] = 1.0f; // Unit impulse at start
+        if (ir.size() > 100)
+            ir[100] = 0.5f; // Second, quieter tap 100 samples later
+        EXPECT_NO_THROW (convolver.setImpulseResponse (ir));
+
+        // Feed a unit impulse; only the output level is checked below, not the exact samples
+        std::vector<float> deltaInput (maxBlockSize, 0.0f);
+        deltaInput[0] = 1.0f; // Unit impulse
+        std::vector<float> deltaOutput (maxBlockSize);
+        clearBuffer (deltaOutput);
+
+        EXPECT_NO_THROW (convolver.process (deltaInput.data(), deltaOutput.data(), maxBlockSize));
+
+        // RMS over the whole block must exceed 0.003
+        float outputRMS = calculateRMS (deltaOutput);
+        EXPECT_GT (outputRMS, 0.003f) << "No significant convolution output detected";
+
+        // Peak anywhere in the block must exceed 0.1 (expected to come from the direct FIR)
+        EXPECT_GT (findPeak (deltaOutput), 0.1f) << "No immediate response detected";
+
+        // Follow up with random blocks of several sizes on the same convolver instance
+        std::vector<size_t> blockSizes = { 64, 128, 256, maxBlockSize };
+
+        for (size_t blockSize : blockSizes)
+        {
+            if (blockSize > maxBlockSize)
+                continue;
+
+            SCOPED_TRACE (testing::Message() << "BlockSize=" << blockSize);
+
+            std::vector<float> input (blockSize);
+            std::vector<float> output (blockSize);
+            fillWithRandomData (input);
+            clearBuffer (output);
+
+            EXPECT_NO_THROW (convolver.process (input.data(), output.data(), blockSize));
+
+            // Every output sample must be finite and below 100 in magnitude
+            for (float sample : output)
+            {
+                EXPECT_TRUE (std::isfinite (sample)) << "Non-finite output detected";
+                EXPECT_LT (std::abs (sample), 100.0f) << "Output amplitude too large";
+            }
+
+            // Non-trivial input must give non-trivial output (this outputRMS shadows the outer one)
+            float inputRMS = calculateRMS (input);
+            float outputRMS = calculateRMS (output);
+
+            if (inputRMS > 0.01f)
+            {
+                EXPECT_GT (outputRMS, 0.001f) << "Output unexpectedly quiet for significant input";
+            }
+        }
+    }
+}
+
+//==============================================================================
+// Stress Tests
+//==============================================================================
+
+TEST_F (PartitionedConvolverTest, StressTestDifferentBlockSizes)
+{
+    PartitionedConvolver convolver;
+    convolver.setTypicalLayout (128, { 128, 512, 2048 });
+    convolver.prepare (2048);
+
+    // Create a simple, well-behaved impulse response
+    std::vector<float> ir (1024, 0.0f);
+    // Exponentially decaying cosine in the first 200 taps, zeros afterwards
+    for (size_t i = 0; i < 200; ++i)
+    {
+        ir[i] = std::exp (-static_cast<float> (i) / 50.0f) * std::cos (2.0f * MathConstants<float>::pi * static_cast<float> (i) / 16.0f);
+    }
+
+    // Normalize by the largest magnitude (std::abs so a negative dominant tap still counts)
+    const float irPeak = std::abs (*std::max_element (ir.begin(), ir.end(), [] (float a, float b)
+    {
+        return std::abs (a) < std::abs (b);
+    }));
+    if (irPeak > 0.0f)
+    {
+        for (auto& sample : ir)
+            sample /= (irPeak * 2.0f); // Extra headroom
+    }
+
+    convolver.setImpulseResponse (ir);
+
+    // Test reasonable block sizes first
+    std::vector<size_t> blockSizes = { 32, 64, 128, 256, 512, 1024 };
+
+    float totalInputEnergy = 0.0f;
+    float totalOutputEnergy = 0.0f;
+
+    for (size_t blockSize : blockSizes)
+    {
+        SCOPED_TRACE (testing::Message() << "Processing blockSize=" << blockSize);
+
+        std::vector<float> input (blockSize);
+        std::vector<float> output (blockSize);
+        fillWithRandomData (input);
+        clearBuffer (output);
+
+        EXPECT_NO_THROW (convolver.process (input.data(), output.data(), blockSize));
+
+        // Most critical: no non-finite values
+        for (float sample : output)
+        {
+            EXPECT_TRUE (std::isfinite (sample)) << "Non-finite output in blockSize=" << blockSize;
+        }
+
+        float inputRMS = calculateRMS (input);
+        float outputRMS = calculateRMS (output);
+
+        if (std::isfinite (outputRMS))
+        {
+            totalInputEnergy += inputRMS * inputRMS * blockSize;
+            totalOutputEnergy += outputRMS * outputRMS * blockSize;
+        }
+
+        // Verify reasonable levels
+        float peak = findPeak (output);
+        EXPECT_LT (peak, 50.0f) << "Output peak too large for blockSize=" << blockSize;
+
+        // With direct taps, expect output for reasonable input
+        if (inputRMS > 0.01f)
+        {
+            EXPECT_GT (outputRMS, 0.0001f) << "No output for significant input, blockSize=" << blockSize;
+            EXPECT_LT (outputRMS, inputRMS * 5.0f) << "Output unreasonably high for blockSize=" << blockSize;
+        }
+    }
+
+    // Test challenging small block sizes
+    std::vector<size_t> smallBlockSizes = { 1, 7, 15 };
+
+    for (size_t blockSize : smallBlockSizes)
+    {
+        SCOPED_TRACE (testing::Message() << "Processing small blockSize=" << blockSize);
+
+        std::vector<float> input (blockSize);
+        std::vector<float> output (blockSize);
+        fillWithRandomData (input);
+        clearBuffer (output);
+
+        EXPECT_NO_THROW (convolver.process (input.data(), output.data(), blockSize));
+
+        // Critical: no non-finite values
+        for (float sample : output)
+        {
+            EXPECT_TRUE (std::isfinite (sample)) << "Non-finite output in small blockSize=" << blockSize;
+        }
+
+        // Reasonable bounds
+        float peak = findPeak (output);
+        EXPECT_LT (peak, 50.0f) << "Output peak too large for small blockSize=" << blockSize;
+    }
+
+    // Loose energy sanity check: total output within a factor of 0.01 to 10 of total input
+    if (totalInputEnergy > 0.0f && totalOutputEnergy > 0.0f)
+    {
+        EXPECT_GT (totalOutputEnergy, totalInputEnergy * 0.01f) << "Total output energy too low";
+        EXPECT_LT (totalOutputEnergy, totalInputEnergy * 10.0f) << "Total output energy too high";
+    }
+}
+
+//==============================================================================
+// Fuzzing and Edge-Case Tests
+//==============================================================================
+
+TEST_F (PartitionedConvolverTest, RandomizedFuzzing)
+{
+    // Generate random configurations and test them
+    std::uniform_int_distribution<int> hopDist (32, 2048);
+    std::uniform_int_distribution<size_t> directTapsDist (32, 512); // Always have some direct taps
+    std::uniform_int_distribution<size_t> blockSizeDist (32, 1024); // Upper bound equals maxBlockSize below
+
+    for (int trial = 0; trial < 10; ++trial) // Reduce trials for stability
+    {
+        SCOPED_TRACE (testing::Message() << "Fuzzing trial " << trial);
+
+        // Generate random configuration
+        const size_t directTaps = directTapsDist (generator);
+        const size_t numLayers = 1 + (generator() % 3); // 1-3 layers
+
+        std::vector<int> hops;
+        int prevHop = 32;
+        for (size_t i = 0; i < numLayers; ++i)
+        {
+            int hop = std::max (prevHop, hopDist (generator));
+            // Round down to a power of two; hops therefore never decrease from layer to layer
+            hop = 1 << static_cast<int> (std::log2 (hop));
+            hops.push_back (hop);
+            prevHop = hop;
+        }
+
+        const size_t maxBlockSize = 1024;
+
+        PartitionedConvolver convolver;
+
+        try
+        {
+            convolver.setTypicalLayout (directTaps, hops);
+            convolver.prepare (maxBlockSize);
+
+            // Random taps in [-0.1, 0.1] under an exponentially decaying envelope
+            std::vector<float> ir (512);
+            for (size_t i = 0; i < ir.size(); ++i)
+                ir[i] = std::exp (-static_cast<float> (i) / 100.0f) * randomFloat (-0.1f, 0.1f);
+
+            convolver.setImpulseResponse (ir);
+
+            // Test with impulse
+            std::vector<float> deltaInput (maxBlockSize, 0.0f);
+            deltaInput[0] = 1.0f;
+            std::vector<float> deltaOutput (maxBlockSize);
+            clearBuffer (deltaOutput);
+
+            convolver.process (deltaInput.data(), deltaOutput.data(), maxBlockSize);
+
+            float deltaRMS = calculateRMS (deltaOutput);
+            EXPECT_GT (deltaRMS, 0.001f) << "No convolution output in trial " << trial;
+
+            // Process several blocks
+            for (int block = 0; block < 5; ++block)
+            {
+                const size_t blockSize = blockSizeDist (generator);
+
+                std::vector<float> input (blockSize);
+                std::vector<float> output (blockSize);
+                fillWithRandomData (input);
+                clearBuffer (output);
+
+                convolver.process (input.data(), output.data(), blockSize);
+
+                // Audio quality checks
+                for (float sample : output)
+                {
+                    EXPECT_TRUE (std::isfinite (sample)) << "Non-finite output in trial " << trial << " block " << block;
+                    EXPECT_LT (std::abs (sample), 100.0f) << "Output too large in trial " << trial << " block " << block;
+                }
+            }
+        }
+        catch (const std::exception& e)
+        {
+            FAIL() << "Exception in fuzzing trial " << trial << ": " << e.what();
+        }
+    }
+}
+
+TEST_F (PartitionedConvolverTest, ShortImpulseResponseWithManyLayers)
+{
+    PartitionedConvolver convolver;
+
+    // Four layers (hops 128 to 4096) behind 64 direct taps
+    convolver.setTypicalLayout (64, { 128, 512, 2048, 4096 });
+    convolver.prepare (512);
+
+    // Only 32 random taps: fewer than the 64 direct taps, far fewer than any layer hop
+    std::vector<float> shortIR (32);
+    fillWithRandomData (shortIR);
+
+    // Loading must not throw; the intent is that no "zombie" layers are left behind
+    EXPECT_NO_THROW (convolver.setImpulseResponse (shortIR));
+
+    // One random 512-sample block must process without throwing (or hanging)
+    std::vector<float> input (512);
+    std::vector<float> output (512);
+    fillWithRandomData (input);
+    clearBuffer (output);
+
+    EXPECT_NO_THROW (convolver.process (input.data(), output.data(), input.size()));
+
+    // Output RMS must exceed 0.001 (expected to come from the direct FIR at least)
+    float outputRMS = calculateRMS (output);
+    EXPECT_GT (outputRMS, 0.001f);
+}
+
+TEST_F (PartitionedConvolverTest, IRShorterThanDirectTaps)
+{
+    PartitionedConvolver convolver;
+
+    // Layout with 128 direct taps followed by two layers (256, 1024)
+    convolver.setTypicalLayout (128, { 256, 1024 });
+    convolver.prepare (512);
+
+    // 64-sample IR: half the length of the 128 direct taps
+    std::vector<float> shortIR (64);
+    fillWithRandomData (shortIR);
+
+    EXPECT_NO_THROW (convolver.setImpulseResponse (shortIR));
+
+    // Processing a 512-sample block must not throw (only the direct FIR is expected to be active)
+    std::vector<float> input (512);
+    std::vector<float> output (512);
+    fillWithRandomData (input);
+    clearBuffer (output);
+
+    EXPECT_NO_THROW (convolver.process (input.data(), output.data(), input.size()));
+
+    // The block must carry signal: RMS above 0.001
+    float outputRMS = calculateRMS (output);
+    EXPECT_GT (outputRMS, 0.001f);
+}
+
+TEST_F (PartitionedConvolverTest, IRExactlyMatchesFirstLayer)
+{
+    PartitionedConvolver convolver;
+
+    // Layout with 64 direct taps followed by three layers (128, 512, 2048)
+    convolver.setTypicalLayout (64, { 128, 512, 2048 });
+    convolver.prepare (512);
+
+    // Boundary case: IR length equals the direct taps plus the first layer size (192 samples)
+    const std::size_t irLength = 64 + 128; // direct taps + first layer size
+    std::vector<float> ir (irLength);
+    fillWithRandomData (ir);
+
+    EXPECT_NO_THROW (convolver.setImpulseResponse (ir));
+
+    // Processing must not throw (first layer expected active, later layers inactive - not asserted)
+    std::vector<float> input (512);
+    std::vector<float> output (512);
+    fillWithRandomData (input);
+    clearBuffer (output);
+
+    EXPECT_NO_THROW (convolver.process (input.data(), output.data(), input.size()));
+
+    float outputRMS = calculateRMS (output);
+    EXPECT_GT (outputRMS, 0.001f);
+}
+
+TEST_F (PartitionedConvolverTest, ZeroLengthIR)
+{
+    PartitionedConvolver convolver;
+    convolver.setTypicalLayout (64, { 128, 512 });
+    convolver.prepare (512);
+
+    // Loading an empty vector as the impulse response must not throw
+    std::vector<float> emptyIR;
+    EXPECT_NO_THROW (convolver.setImpulseResponse (emptyIR));
+
+    // Processing a 512-sample block with the empty IR must not throw either
+    std::vector<float> input (512);
+    std::vector<float> output (512);
+    fillWithRandomData (input);
+    clearBuffer (output);
+
+    EXPECT_NO_THROW (convolver.process (input.data(), output.data(), input.size()));
+
+    // Every output sample must be within 1e-4 of zero
+    for (float sample : output)
+        EXPECT_NEAR (sample, 0.0f, 0.0001f);
+}
+
+TEST_F (PartitionedConvolverTest, ProgressiveIRLengths)
+{
+    // IR lengths from 10 to 2000 samples, i.e. from below the 64 direct taps to well past them
+    std::vector<size_t> irLengths = { 10, 50, 100, 200, 500, 1000, 2000 };
+
+    for (size_t irLength : irLengths)
+    {
+        SCOPED_TRACE (testing::Message() << "IR Length: " << irLength);
+
+        PartitionedConvolver convolver;
+        convolver.setTypicalLayout (64, { 128, 512, 2048 });
+        convolver.prepare (512);
+
+        std::vector<float> ir (irLength);
+        fillWithRandomData (ir);
+
+        EXPECT_NO_THROW (convolver.setImpulseResponse (ir));
+
+        // Run a single 512-sample block through a freshly configured convolver
+        std::vector<float> input (512);
+        std::vector<float> output (512);
+        fillWithRandomData (input);
+        clearBuffer (output);
+
+        EXPECT_NO_THROW (convolver.process (input.data(), output.data(), input.size()));
+
+        // Every sample must be finite and bounded
+        for (float sample : output)
+        {
+            EXPECT_TRUE (std::isfinite (sample));
+            EXPECT_LT (std::abs (sample), 100.0f); // Loose magnitude bound, not an exact level check
+        }
+    }
+}
+
+TEST_F (PartitionedConvolverTest, ResetFunctionality)
+{
+    PartitionedConvolver convolver;
+    convolver.setTypicalLayout (64, { 64, 256 });
+    convolver.prepare (512);
+
+    std::vector<float> ir (500);
+    fillWithRandomData (ir);
+    convolver.setImpulseResponse (ir);
+
+    // First pass: process one block so the convolver has seen input before reset() is called
+    std::vector<float> input (512);
+    std::vector<float> output1 (512);
+    fillWithRandomData (input);
+    clearBuffer (output1);
+
+    convolver.process (input.data(), output1.data(), input.size());
+
+    // Second pass: reset, then feed the very same input block again
+    convolver.reset();
+
+    std::vector<float> output2 (512);
+    clearBuffer (output2);
+
+    convolver.process (input.data(), output2.data(), input.size());
+
+    // Both passes must match sample by sample within a 1e-3 tolerance
+    for (size_t i = 0; i < output1.size(); ++i)
+    {
+        EXPECT_NEAR (output1[i], output2[i], 0.001f) << "Mismatch at sample " << i;
+    }
+}
+
+} // namespace yup::test
\ No newline at end of file

From 8e8a4c8edf9caa21670b2778c023eb1185639bc8 Mon Sep 17 00:00:00 2001
From: kunitoki <kunitoki@gmail.com>
Date: Wed, 27 Aug 2025 00:31:03 +0200
Subject: [PATCH 13/37] Fix tests

---
 tests/yup_dsp/yup_PartitionedConvolver.cpp | 193 +++++++++++++++++++--
 1 file changed, 178 insertions(+), 15 deletions(-)

diff --git a/tests/yup_dsp/yup_PartitionedConvolver.cpp b/tests/yup_dsp/yup_PartitionedConvolver.cpp
index 393a01a2f..c1ea2985f 100644
--- a/tests/yup_dsp/yup_PartitionedConvolver.cpp
+++ b/tests/yup_dsp/yup_PartitionedConvolver.cpp
@@ -95,7 +95,22 @@ class PartitionedConvolverTest : public ::testing::Test
 TEST_F (PartitionedConvolverTest, DefaultConstruction)
 {
     PartitionedConvolver convolver;
-    // Should not crash - basic construction test
+
+    // reset() must be callable on a default-constructed, unconfigured convolver
+    EXPECT_NO_THROW (convolver.reset());
+
+    // Layout configuration and prepare() must work after default construction
+    EXPECT_NO_THROW (convolver.setTypicalLayout (64, { 64, 256 }));
+    EXPECT_NO_THROW (convolver.prepare (512));
+
+    // Processing a silent 256-sample block with no impulse response loaded must not throw
+    std::vector<float> input (256, 0.0f);
+    std::vector<float> output (256, 0.0f);
+    EXPECT_NO_THROW (convolver.process (input.data(), output.data(), input.size()));
+
+    // Output must stay exactly zero: silent input and no impulse response
+    for (float sample : output)
+        EXPECT_EQ (sample, 0.0f);
 }
 
 TEST_F (PartitionedConvolverTest, MoveSemantics)
@@ -104,14 +119,35 @@ TEST_F (PartitionedConvolverTest, MoveSemantics)
     convolver1.setTypicalLayout (64, { 64, 256 });
     convolver1.prepare (512);
 
+    // Set up a known state
+    std::vector<float> ir (128, 0.0f);
+    ir[0] = 1.0f;
+    convolver1.setImpulseResponse (ir);
+
     // Move constructor
     PartitionedConvolver convolver2 = std::move (convolver1);
 
+    // Verify moved convolver works
+    std::vector<float> input (256, 0.0f);
+    input[0] = 1.0f;
+    std::vector<float> output (256, 0.0f);
+
+    EXPECT_NO_THROW (convolver2.process (input.data(), output.data(), input.size()));
+
+    // Should produce output from the moved convolver
+    float outputRMS = calculateRMS (output);
+    EXPECT_GT (outputRMS, 0.001f);
+
     // Move assignment
     PartitionedConvolver convolver3;
     convolver3 = std::move (convolver2);
 
-    // Should not crash
+    // Verify move-assigned convolver works
+    clearBuffer (output);
+    EXPECT_NO_THROW (convolver3.process (input.data(), output.data(), input.size()));
+
+    outputRMS = calculateRMS (output);
+    EXPECT_GT (outputRMS, 0.001f);
 }
 
 TEST_F (PartitionedConvolverTest, BasicConfiguration)
@@ -119,13 +155,37 @@ TEST_F (PartitionedConvolverTest, BasicConfiguration)
     PartitionedConvolver convolver;
 
     // Test typical layout configuration
-    convolver.setTypicalLayout (128, { 128, 512, 2048 });
+    EXPECT_NO_THROW (convolver.setTypicalLayout (128, { 128, 512, 2048 }));
 
-    // Should not crash
-    convolver.prepare (512);
+    // Should be able to prepare after configuration
+    EXPECT_NO_THROW (convolver.prepare (512));
+
+    // Verify configuration works by setting an impulse response
+    std::vector<float> ir (256, 0.0f);
+    ir[0] = 1.0f;
+    EXPECT_NO_THROW (convolver.setImpulseResponse (ir));
 
-    // Test reset
+    // Verify processing works after configuration
+    std::vector<float> input (256, 0.0f);
+    input[0] = 1.0f;
+    std::vector<float> output (256, 0.0f);
+
+    EXPECT_NO_THROW (convolver.process (input.data(), output.data(), input.size()));
+
+    // Should produce output
+    float outputRMS = calculateRMS (output);
+    EXPECT_GT (outputRMS, 0.001f);
+
+    // Test reset clears state
     convolver.reset();
+
+    // After reset, same input should produce same output (deterministic)
+    std::vector<float> output2 (256, 0.0f);
+    EXPECT_NO_THROW (convolver.process (input.data(), output2.data(), input.size()));
+
+    // Outputs should be very similar after reset
+    for (size_t i = 0; i < output.size(); ++i)
+        EXPECT_NEAR (output[i], output2[i], 0.001f);
 }
 
 TEST_F (PartitionedConvolverTest, ConfigureLayers)
@@ -136,10 +196,28 @@ TEST_F (PartitionedConvolverTest, ConfigureLayers)
         { 64 }, { 256 }, { 1024 }
     };
 
-    convolver.configureLayers (32, layers);
-    convolver.prepare (256);
+    EXPECT_NO_THROW (convolver.configureLayers (32, layers));
+    EXPECT_NO_THROW (convolver.prepare (256));
 
-    // Should not crash
+    // Verify the configuration works with an impulse response
+    std::vector<float> ir (500, 0.0f);
+    ir[0] = 1.0f;
+    ir[50] = 0.5f;
+    EXPECT_NO_THROW (convolver.setImpulseResponse (ir));
+
+    // Test processing with the configured layers
+    std::vector<float> input (256, 0.0f);
+    input[0] = 1.0f;
+    std::vector<float> output (256, 0.0f);
+
+    EXPECT_NO_THROW (convolver.process (input.data(), output.data(), input.size()));
+
+    // Should produce output from direct FIR immediately
+    float outputRMS = calculateRMS (output);
+    EXPECT_GT (outputRMS, 0.001f);
+
+    // Verify immediate response from direct taps
+    EXPECT_GT (findPeak (output), 0.1f);
 }
 
 //==============================================================================
@@ -156,9 +234,36 @@ TEST_F (PartitionedConvolverTest, SetImpulseResponseVector)
     std::vector<float> ir (1000);
     fillWithRandomData (ir);
 
-    convolver.setImpulseResponse (ir);
+    // Normalize to reasonable levels
+    float peak = findPeak (ir);
+    if (peak > 0.0f)
+    {
+        for (auto& sample : ir)
+            sample /= peak;
+    }
+
+    EXPECT_NO_THROW (convolver.setImpulseResponse (ir));
+
+    // Verify the impulse response was set by testing processing
+    std::vector<float> input (512, 0.0f);
+    input[0] = 1.0f;
+    std::vector<float> output (512, 0.0f);
+
+    EXPECT_NO_THROW (convolver.process (input.data(), output.data(), input.size()));
 
-    // Should not crash
+    // Should produce significant output
+    float outputRMS = calculateRMS (output);
+    EXPECT_GT (outputRMS, 0.001f);
+
+    // Test linearity - 2x input should give ~2x output
+    input[0] = 2.0f;
+    std::vector<float> output2 (512, 0.0f);
+
+    convolver.reset();
+    EXPECT_NO_THROW (convolver.process (input.data(), output2.data(), input.size()));
+
+    float output2RMS = calculateRMS (output2);
+    EXPECT_GT (output2RMS, outputRMS * 1.5f);
 }
 
 TEST_F (PartitionedConvolverTest, SetImpulseResponsePointer)
@@ -171,9 +276,35 @@ TEST_F (PartitionedConvolverTest, SetImpulseResponsePointer)
     std::vector<float> ir (1000);
     fillWithRandomData (ir);
 
-    convolver.setImpulseResponse (ir.data(), ir.size());
+    // Normalize to reasonable levels
+    float peak = findPeak (ir);
+    if (peak > 0.0f)
+    {
+        for (auto& sample : ir)
+            sample /= peak;
+    }
+
+    EXPECT_NO_THROW (convolver.setImpulseResponse (ir.data(), ir.size()));
+
+    // Verify both pointer and vector methods produce same result
+    std::vector<float> input (512, 0.0f);
+    input[0] = 1.0f;
+    std::vector<float> output1 (512, 0.0f);
+
+    EXPECT_NO_THROW (convolver.process (input.data(), output1.data(), input.size()));
 
-    // Should not crash
+    // Reset and test with vector method
+    PartitionedConvolver convolver2;
+    convolver2.setTypicalLayout (64, { 64, 256 });
+    convolver2.prepare (512);
+    convolver2.setImpulseResponse (ir);
+
+    std::vector<float> output2 (512, 0.0f);
+    EXPECT_NO_THROW (convolver2.process (input.data(), output2.data(), input.size()));
+
+    // Both methods should produce identical results
+    for (size_t i = 0; i < output1.size(); ++i)
+        EXPECT_NEAR (output1[i], output2[i], 0.0001f);
 }
 
 TEST_F (PartitionedConvolverTest, SetImpulseResponseWithOptions)
@@ -185,13 +316,45 @@ TEST_F (PartitionedConvolverTest, SetImpulseResponseWithOptions)
     std::vector<float> ir (1000);
     fillWithRandomData (ir);
 
+    // Make IR have a known peak
+    ir[0] = 2.0f; // Peak value
+
     PartitionedConvolver::IRLoadOptions options;
     options.normalize = true;
     options.headroomDb = -6.0f;
 
-    convolver.setImpulseResponse (ir, options);
+    EXPECT_NO_THROW (convolver.setImpulseResponse (ir, options));
+
+    // Test that normalization and headroom are applied
+    std::vector<float> input (512, 0.0f);
+    input[0] = 1.0f;
+    std::vector<float> output (512, 0.0f);
+
+    EXPECT_NO_THROW (convolver.process (input.data(), output.data(), input.size()));
+
+    // Should produce output, but peak should be limited by headroom
+    float outputPeak = findPeak (output);
+    EXPECT_GT (outputPeak, 0.001f);
+    EXPECT_LT (outputPeak, 1.0f); // Should be less than input due to headroom
+
+    // Compare with non-normalized version
+    PartitionedConvolver convolver2;
+    convolver2.setTypicalLayout (64, { 64, 256 });
+    convolver2.prepare (512);
+
+    PartitionedConvolver::IRLoadOptions options2;
+    options2.normalize = false;
+    options2.headroomDb = 0.0f;
+
+    convolver2.setImpulseResponse (ir, options2);
+
+    std::vector<float> output2 (512, 0.0f);
+    EXPECT_NO_THROW (convolver2.process (input.data(), output2.data(), input.size()));
+
+    float output2Peak = findPeak (output2);
 
-    // Should not crash
+    // Normalized version should have different peak
+    EXPECT_NE (outputPeak, output2Peak);
 }
 
 TEST_F (PartitionedConvolverTest, EmptyImpulseResponse)

From 2f6139384287e77138c2ce560368dc2f2515ed71 Mon Sep 17 00:00:00 2001
From: kunitoki <kunitoki@gmail.com>
Date: Wed, 27 Aug 2025 09:07:15 +0200
Subject: [PATCH 14/37] Include fixes

---
 modules/yup_audio_basics/yup_audio_basics.cpp | 23 --------
 modules/yup_audio_basics/yup_audio_basics.h   | 30 +++++++++++
 modules/yup_core/system/yup_PlatformDefs.h    | 20 +++----
 .../yup_dsp/frequency/yup_FFTProcessor.cpp    | 35 -------------
 modules/yup_dsp/yup_dsp.cpp                   | 52 +++++++++++++------
 modules/yup_dsp/yup_dsp.h                     |  4 +-
 6 files changed, 77 insertions(+), 87 deletions(-)

diff --git a/modules/yup_audio_basics/yup_audio_basics.cpp b/modules/yup_audio_basics/yup_audio_basics.cpp
index d3b7ad64d..5d128a9e3 100644
--- a/modules/yup_audio_basics/yup_audio_basics.cpp
+++ b/modules/yup_audio_basics/yup_audio_basics.cpp
@@ -48,29 +48,6 @@
 
 #include "yup_audio_basics.h"
 
-#if YUP_USE_SSE_INTRINSICS
-#include <emmintrin.h>
-#endif
-
-#if YUP_MAC || YUP_IOS
-#ifndef YUP_USE_VDSP_FRAMEWORK
-#define YUP_USE_VDSP_FRAMEWORK 1
-#endif
-
-#if YUP_USE_VDSP_FRAMEWORK
-#include <Accelerate/Accelerate.h>
-#endif
-
-#include "native/yup_AudioWorkgroup_apple.h"
-
-#elif YUP_USE_VDSP_FRAMEWORK
-#undef YUP_USE_VDSP_FRAMEWORK
-#endif
-
-#if YUP_USE_ARM_NEON
-#include <arm_neon.h>
-#endif
-
 #include "buffers/yup_FloatVectorOperations.cpp"
 #include "buffers/yup_AudioChannelSet.cpp"
 #include "buffers/yup_AudioProcessLoadMeasurer.cpp"
diff --git a/modules/yup_audio_basics/yup_audio_basics.h b/modules/yup_audio_basics/yup_audio_basics.h
index 79fee813d..22fc858d6 100644
--- a/modules/yup_audio_basics/yup_audio_basics.h
+++ b/modules/yup_audio_basics/yup_audio_basics.h
@@ -103,6 +103,36 @@
 #define YUP_USE_ARM_NEON 0
 #endif
 
+//==============================================================================
+#if YUP_USE_AVX_INTRINSICS || YUP_USE_FMA_INTRINSICS
+#include <immintrin.h>
+#endif
+
+#if YUP_USE_SSE_INTRINSICS
+#include <emmintrin.h>
+#endif
+
+#if YUP_USE_ARM_NEON
+#if YUP_64BIT && YUP_WINDOWS // 64-bit Windows ARM toolchains provide the NEON intrinsics in <arm64_neon.h>
+#include <arm64_neon.h>
+#else
+#include <arm_neon.h>
+#endif
+#endif
+
+#if (YUP_MAC || YUP_IOS) && __has_include(<Accelerate/Accelerate.h>)
+#ifndef YUP_USE_VDSP_FRAMEWORK
+#define YUP_USE_VDSP_FRAMEWORK 1
+#endif
+
+#if YUP_USE_VDSP_FRAMEWORK
+#include <Accelerate/Accelerate.h>
+#endif
+
+#elif YUP_USE_VDSP_FRAMEWORK
+#undef YUP_USE_VDSP_FRAMEWORK
+#endif
+
 //==============================================================================
 #include "buffers/yup_AudioDataConverters.h"
 YUP_BEGIN_IGNORE_WARNINGS_MSVC (4661)
diff --git a/modules/yup_core/system/yup_PlatformDefs.h b/modules/yup_core/system/yup_PlatformDefs.h
index b70e93747..b9c8ef125 100644
--- a/modules/yup_core/system/yup_PlatformDefs.h
+++ b/modules/yup_core/system/yup_PlatformDefs.h
@@ -97,24 +97,26 @@ namespace yup
 
     @see jassert()
 */
-#define YUP_BREAK_IN_DEBUGGER ::kill (0, SIGTRAP);
+#define YUP_BREAK_IN_DEBUGGER { ::kill (0, SIGTRAP); }
 #elif YUP_WASM
-#define YUP_BREAK_IN_DEBUGGER
+#define YUP_BREAK_IN_DEBUGGER { }
 #elif YUP_MSVC
 #pragma intrinsic(__debugbreak)
-#define YUP_BREAK_IN_DEBUGGER __debugbreak();
+#define YUP_BREAK_IN_DEBUGGER { __debugbreak(); }
 #elif YUP_INTEL && (YUP_GCC || YUP_CLANG || YUP_MAC)
 #if YUP_NO_INLINE_ASM
-#define YUP_BREAK_IN_DEBUGGER
+#define YUP_BREAK_IN_DEBUGGER { }
 #else
-#define YUP_BREAK_IN_DEBUGGER asm ("int $3");
+#define YUP_BREAK_IN_DEBUGGER { asm ("int $3"); }
 #endif
-#elif YUP_ARM && YUP_MAC
-#define YUP_BREAK_IN_DEBUGGER __builtin_debugtrap();
 #elif YUP_ANDROID
-#define YUP_BREAK_IN_DEBUGGER __builtin_trap();
+#define YUP_BREAK_IN_DEBUGGER { __builtin_trap(); }
+#elif YUP_ARM
+#if YUP_MAC || (YUP_WINDOWS && YUP_CLANG)
+#define YUP_BREAK_IN_DEBUGGER { __builtin_debugtrap(); }
+#endif
 #else
-#define YUP_BREAK_IN_DEBUGGER __asm int 3;
+#define YUP_BREAK_IN_DEBUGGER { __asm int 3; }
 #endif
 // clang-format on
 
diff --git a/modules/yup_dsp/frequency/yup_FFTProcessor.cpp b/modules/yup_dsp/frequency/yup_FFTProcessor.cpp
index c85263290..94f0daceb 100644
--- a/modules/yup_dsp/frequency/yup_FFTProcessor.cpp
+++ b/modules/yup_dsp/frequency/yup_FFTProcessor.cpp
@@ -19,41 +19,6 @@
   ==============================================================================
 */
 
-// Conditional includes based on available FFT backends
-#if ! YUP_FFT_FOUND_BACKEND && YUP_ENABLE_VDSP && (YUP_MAC || YUP_IOS) && __has_include(<Accelerate/Accelerate.h>)
-#include <Accelerate/Accelerate.h>
-#define YUP_FFT_USING_VDSP 1
-#define YUP_FFT_FOUND_BACKEND 1
-#endif
-
-#if ! YUP_FFT_FOUND_BACKEND && YUP_ENABLE_INTEL_IPP && __has_include(<ipp.h>)
-#include <ipp.h>
-#define YUP_FFT_USING_IPP 1
-#define YUP_FFT_FOUND_BACKEND 1
-#endif
-
-#if ! YUP_FFT_FOUND_BACKEND && YUP_ENABLE_FFTW3 && __has_include(<fftw3.h>)
-#include <fftw3.h>
-#define YUP_FFT_USING_FFTW3 1
-#define YUP_FFT_FOUND_BACKEND 1
-#endif
-
-#if ! YUP_FFT_FOUND_BACKEND && YUP_ENABLE_PFFFT && YUP_MODULE_AVAILABLE_pffft_library
-#include <pffft_library/pffft_library.h>
-#define YUP_FFT_USING_PFFFT 1
-#define YUP_FFT_FOUND_BACKEND 1
-#endif
-
-#if ! YUP_FFT_FOUND_BACKEND && YUP_ENABLE_OOURA
-#include "yup_OouraFFT8g.h"
-#define YUP_FFT_USING_OOURA 1
-#define YUP_FFT_FOUND_BACKEND 1
-#endif
-
-#if ! defined(YUP_FFT_FOUND_BACKEND)
-#error "Unable to find a proper FFT backend !"
-#endif
-
 namespace yup
 {
 
diff --git a/modules/yup_dsp/yup_dsp.cpp b/modules/yup_dsp/yup_dsp.cpp
index 4d7e9a73e..39d32fd85 100644
--- a/modules/yup_dsp/yup_dsp.cpp
+++ b/modules/yup_dsp/yup_dsp.cpp
@@ -31,36 +31,54 @@
 #include "yup_dsp.h"
 
 //==============================================================================
-#if YUP_USE_AVX_INTRINSICS || YUP_USE_FMA_INTRINSICS
-#include <immintrin.h>
+
+#include <atomic>
+#include <thread>
+
+//==============================================================================
+
+#if ! YUP_FFT_FOUND_BACKEND && YUP_ENABLE_VDSP && (YUP_MAC || YUP_IOS)
+#define YUP_FFT_USING_VDSP 1
+#define YUP_FFT_FOUND_BACKEND 1
 #endif
 
-#if YUP_USE_SSE_INTRINSICS
-#include <emmintrin.h>
+#if ! YUP_FFT_FOUND_BACKEND && YUP_ENABLE_INTEL_IPP && __has_include(<ipp.h>)
+#include <ipp.h>
+#define YUP_FFT_USING_IPP 1
+#define YUP_FFT_FOUND_BACKEND 1
 #endif
 
-#if YUP_USE_ARM_NEON
-#include <arm_neon.h>
+#if ! YUP_FFT_FOUND_BACKEND && YUP_ENABLE_FFTW3 && __has_include(<fftw3.h>)
+#include <fftw3.h>
+#define YUP_FFT_USING_FFTW3 1
+#define YUP_FFT_FOUND_BACKEND 1
 #endif
 
-#if (YUP_MAC || YUP_IOS) && YUP_ENABLE_VDSP
-#include <Accelerate/Accelerate.h>
+#if ! YUP_FFT_FOUND_BACKEND && YUP_ENABLE_PFFFT && YUP_MODULE_AVAILABLE_pffft_library
+#include <pffft_library/pffft_library.h>
+#define YUP_FFT_USING_PFFFT 1
+#define YUP_FFT_FOUND_BACKEND 1
 #endif
 
-//==============================================================================
-#include <atomic>
-#include <thread>
+#if ! YUP_FFT_FOUND_BACKEND && YUP_ENABLE_OOURA
+#include "frequency/yup_OouraFFT8g.h" // path relative to this file, same folder as the .cpp included below
+#define YUP_FFT_USING_OOURA 1
+#define YUP_FFT_FOUND_BACKEND 1
+#endif
+
+#if ! defined(YUP_FFT_FOUND_BACKEND)
+#error "Unable to find a proper FFT backend !"
+#endif
 
 //==============================================================================
+
 #include "frequency/yup_FFTProcessor.cpp"
 #include "frequency/yup_SpectrumAnalyzerState.cpp"
+#include "designers/yup_FilterDesigner.cpp"
+#include "convolution/yup_PartitionedConvolver.cpp"
+
+//==============================================================================
 
 #if YUP_ENABLE_OOURA && YUP_FFT_USING_OOURA
 #include "frequency/yup_OouraFFT8g.cpp"
 #endif
-
-//==============================================================================
-#include "designers/yup_FilterDesigner.cpp"
-
-//==============================================================================
-#include "convolution/yup_PartitionedConvolver.cpp"
diff --git a/modules/yup_dsp/yup_dsp.h b/modules/yup_dsp/yup_dsp.h
index e9ab7c795..c89002c13 100644
--- a/modules/yup_dsp/yup_dsp.h
+++ b/modules/yup_dsp/yup_dsp.h
@@ -68,7 +68,7 @@
     Enable Apple's vDSP backend.
 */
 #ifndef YUP_ENABLE_VDSP
-#if (YUP_MAC || YUP_IOS)
+#if (YUP_MAC || YUP_IOS) && YUP_USE_VDSP_FRAMEWORK
 #define YUP_ENABLE_VDSP 1
 #else
 #define YUP_ENABLE_VDSP 0
@@ -143,5 +143,3 @@
 
 // Convolution processors
 #include "convolution/yup_PartitionedConvolver.h"
-
-//==============================================================================

From 35318a9ef23e00c293a41c829f33e5e045883b90 Mon Sep 17 00:00:00 2001
From: kunitoki <kunitoki@gmail.com>
Date: Wed, 27 Aug 2025 09:17:42 +0200
Subject: [PATCH 15/37] Make things faster

---
 .github/workflows/build_android.yml  |  6 ++++++
 .github/workflows/build_ios.yml      |  9 +++++++++
 .github/workflows/build_linux.yml    | 12 ++++++++++++
 .github/workflows/build_macos.yml    | 12 ++++++++++++
 .github/workflows/build_wasm.yml     |  8 ++++++++
 .github/workflows/build_windows.yml  | 10 ++++++++++
 .github/workflows/coverage.yml       |  2 ++
 .github/workflows/python_linux.yml   |  2 ++
 .github/workflows/python_macos.yml   |  4 +++-
 .github/workflows/python_windows.yml |  2 ++
 10 files changed, 66 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/build_android.yml b/.github/workflows/build_android.yml
index ca95f690a..0d0612c36 100644
--- a/.github/workflows/build_android.yml
+++ b/.github/workflows/build_android.yml
@@ -37,6 +37,8 @@ jobs:
 
     steps:
         - uses: actions/checkout@v4
+          with:
+            fetch-depth: 0
         - uses: seanmiddleditch/gha-setup-ninja@master
         - name: Setup Java
           uses: actions/setup-java@v3
@@ -65,6 +67,8 @@ jobs:
     needs: [configure]
     steps:
     - uses: actions/checkout@v4
+      with:
+        fetch-depth: 0
     - uses: seanmiddleditch/gha-setup-ninja@master
     - name: Setup Java
       uses: actions/setup-java@v3
@@ -95,6 +99,8 @@ jobs:
     needs: [configure]
     steps:
     - uses: actions/checkout@v4
+      with:
+        fetch-depth: 0
     - uses: seanmiddleditch/gha-setup-ninja@master
     - name: Setup Java
       uses: actions/setup-java@v3
diff --git a/.github/workflows/build_ios.yml b/.github/workflows/build_ios.yml
index 51a60a5e8..49c990074 100644
--- a/.github/workflows/build_ios.yml
+++ b/.github/workflows/build_ios.yml
@@ -33,6 +33,9 @@ jobs:
 
     steps:
     - uses: actions/checkout@v4
+      with:
+        fetch-depth: 0
+
     - uses: seanmiddleditch/gha-setup-ninja@master
 
     - name: Configure
@@ -57,6 +60,8 @@ jobs:
     needs: [configure]
     steps:
     - uses: actions/checkout@v4
+      with:
+        fetch-depth: 0
     - uses: seanmiddleditch/gha-setup-ninja@master
     - uses: actions/cache/restore@v4
       id: cache-restore
@@ -76,6 +81,8 @@ jobs:
     needs: [configure]
     steps:
     - uses: actions/checkout@v4
+      with:
+        fetch-depth: 0
     - uses: seanmiddleditch/gha-setup-ninja@master
     - uses: actions/cache/restore@v4
       id: cache-restore
@@ -95,6 +102,8 @@ jobs:
     needs: [configure]
     steps:
     - uses: actions/checkout@v4
+      with:
+        fetch-depth: 0
     - uses: seanmiddleditch/gha-setup-ninja@master
     - uses: actions/cache/restore@v4
       id: cache-restore
diff --git a/.github/workflows/build_linux.yml b/.github/workflows/build_linux.yml
index 6f9b096ec..941b7e466 100644
--- a/.github/workflows/build_linux.yml
+++ b/.github/workflows/build_linux.yml
@@ -39,6 +39,8 @@ jobs:
 
     steps:
     - uses: actions/checkout@v4
+      with:
+        fetch-depth: 0
     - uses: seanmiddleditch/gha-setup-ninja@master
     - name: Install Dependencies
       run: sudo apt-get update && sudo apt-get install -y ${INSTALL_DEPS}
@@ -63,6 +65,8 @@ jobs:
     needs: [configure]
     steps:
     - uses: actions/checkout@v4
+      with:
+        fetch-depth: 0
     - uses: seanmiddleditch/gha-setup-ninja@master
     - run: sudo apt-get update && sudo apt-get install -y ${INSTALL_DEPS}
     - uses: actions/cache/restore@v4
@@ -85,6 +89,8 @@ jobs:
     needs: [configure]
     steps:
     - uses: actions/checkout@v4
+      with:
+        fetch-depth: 0
     - uses: seanmiddleditch/gha-setup-ninja@master
     - run: sudo apt-get update && sudo apt-get install -y ${INSTALL_DEPS}
     - uses: actions/cache/restore@v4
@@ -103,6 +109,8 @@ jobs:
     needs: [configure]
     steps:
     - uses: actions/checkout@v4
+      with:
+        fetch-depth: 0
     - uses: seanmiddleditch/gha-setup-ninja@master
     - run: sudo apt-get update && sudo apt-get install -y ${INSTALL_DEPS}
     - uses: actions/cache/restore@v4
@@ -121,6 +129,8 @@ jobs:
     needs: [configure]
     steps:
     - uses: actions/checkout@v4
+      with:
+        fetch-depth: 0
     - uses: seanmiddleditch/gha-setup-ninja@master
     - run: sudo apt-get update && sudo apt-get install -y ${INSTALL_DEPS}
     - uses: actions/cache/restore@v4
@@ -139,6 +149,8 @@ jobs:
     needs: [configure]
     steps:
     - uses: actions/checkout@v4
+      with:
+        fetch-depth: 0
     - uses: seanmiddleditch/gha-setup-ninja@master
     - run: sudo apt-get update && sudo apt-get install -y ${INSTALL_DEPS}
     - uses: actions/cache/restore@v4
diff --git a/.github/workflows/build_macos.yml b/.github/workflows/build_macos.yml
index f2e672586..dbfa6bd6f 100644
--- a/.github/workflows/build_macos.yml
+++ b/.github/workflows/build_macos.yml
@@ -32,6 +32,8 @@ jobs:
 
     steps:
     - uses: actions/checkout@v4
+      with:
+        fetch-depth: 0
     - uses: seanmiddleditch/gha-setup-ninja@master
 
     - name: Configure
@@ -54,6 +56,8 @@ jobs:
     needs: [configure]
     steps:
     - uses: actions/checkout@v4
+      with:
+        fetch-depth: 0
     - uses: seanmiddleditch/gha-setup-ninja@master
     - uses: actions/cache/restore@v4
       id: cache-restore
@@ -75,6 +79,8 @@ jobs:
     needs: [configure]
     steps:
     - uses: actions/checkout@v4
+      with:
+        fetch-depth: 0
     - uses: seanmiddleditch/gha-setup-ninja@master
     - uses: actions/cache/restore@v4
       id: cache-restore
@@ -92,6 +98,8 @@ jobs:
     needs: [configure]
     steps:
     - uses: actions/checkout@v4
+      with:
+        fetch-depth: 0
     - uses: seanmiddleditch/gha-setup-ninja@master
     - uses: actions/cache/restore@v4
       id: cache-restore
@@ -109,6 +117,8 @@ jobs:
     needs: [configure]
     steps:
     - uses: actions/checkout@v4
+      with:
+        fetch-depth: 0
     - uses: seanmiddleditch/gha-setup-ninja@master
     - uses: actions/cache/restore@v4
       id: cache-restore
@@ -126,6 +136,8 @@ jobs:
     needs: [configure]
     steps:
     - uses: actions/checkout@v4
+      with:
+        fetch-depth: 0
     - uses: seanmiddleditch/gha-setup-ninja@master
     - uses: actions/cache/restore@v4
       id: cache-restore
diff --git a/.github/workflows/build_wasm.yml b/.github/workflows/build_wasm.yml
index b88c425e7..1ac58ae20 100644
--- a/.github/workflows/build_wasm.yml
+++ b/.github/workflows/build_wasm.yml
@@ -38,6 +38,8 @@ jobs:
     runs-on: ubuntu-latest
     steps:
     - uses: actions/checkout@v4
+      with:
+        fetch-depth: 0
     - uses: seanmiddleditch/gha-setup-ninja@master
     - run: sudo apt-get update && sudo apt-get install -y ${INSTALL_DEPS}
     - name: Setup emsdk
@@ -58,6 +60,8 @@ jobs:
     runs-on: ubuntu-latest
     steps:
     - uses: actions/checkout@v4
+      with:
+        fetch-depth: 0
     - uses: seanmiddleditch/gha-setup-ninja@master
     - run: sudo apt-get update && sudo apt-get install -y ${INSTALL_DEPS}
     - name: Setup emsdk
@@ -74,6 +78,8 @@ jobs:
     runs-on: ubuntu-latest
     steps:
     - uses: actions/checkout@v4
+      with:
+        fetch-depth: 0
     - uses: seanmiddleditch/gha-setup-ninja@master
     - run: sudo apt-get update && sudo apt-get install -y ${INSTALL_DEPS}
     - name: Setup emsdk
@@ -90,6 +96,8 @@ jobs:
     runs-on: ubuntu-latest
     steps:
     - uses: actions/checkout@v4
+      with:
+        fetch-depth: 0
     - uses: seanmiddleditch/gha-setup-ninja@master
     - run: sudo apt-get update && sudo apt-get install -y ${INSTALL_DEPS}
     - name: Setup emsdk
diff --git a/.github/workflows/build_windows.yml b/.github/workflows/build_windows.yml
index 80dad6dd6..024ec032d 100644
--- a/.github/workflows/build_windows.yml
+++ b/.github/workflows/build_windows.yml
@@ -33,6 +33,8 @@ jobs:
     runs-on: windows-latest
     steps:
     - uses: actions/checkout@v4
+      with:
+        fetch-depth: 0
     - run: cmake ${{ github.workspace }} -B ${{ runner.workspace }}/build -DYUP_ENABLE_TESTS=ON
     - run: cmake --build ${{ runner.workspace }}/build --config Debug --parallel 4 --target yup_tests
     - working-directory: ${{ runner.workspace }}/build/tests/Debug
@@ -47,6 +49,8 @@ jobs:
     runs-on: windows-latest
     steps:
     - uses: actions/checkout@v4
+      with:
+        fetch-depth: 0
     - run: cmake ${{ github.workspace }} -B ${{ runner.workspace }}/build -DYUP_ENABLE_EXAMPLES=ON
     - run: cmake --build ${{ runner.workspace }}/build --config Debug --parallel 4 --target example_console
     - run: cmake --build ${{ runner.workspace }}/build --config Release --parallel 4 --target example_console
@@ -55,6 +59,8 @@ jobs:
     runs-on: windows-latest
     steps:
     - uses: actions/checkout@v4
+      with:
+        fetch-depth: 0
     - run: cmake ${{ github.workspace }} -B ${{ runner.workspace }}/build -DYUP_ENABLE_EXAMPLES=ON
     - run: cmake --build ${{ runner.workspace }}/build --config Debug --parallel 4 --target example_app
     - run: cmake --build ${{ runner.workspace }}/build --config Release --parallel 4 --target example_app
@@ -63,6 +69,8 @@ jobs:
     runs-on: windows-latest
     steps:
     - uses: actions/checkout@v4
+      with:
+        fetch-depth: 0
     - run: cmake ${{ github.workspace }} -B ${{ runner.workspace }}/build -DYUP_ENABLE_EXAMPLES=ON
     - run: cmake --build ${{ runner.workspace }}/build --config Debug --parallel 4 --target example_graphics
     - run: cmake --build ${{ runner.workspace }}/build --config Release --parallel 4 --target example_graphics
@@ -71,6 +79,8 @@ jobs:
     runs-on: windows-latest
     steps:
     - uses: actions/checkout@v4
+      with:
+        fetch-depth: 0
     - run: cmake ${{ github.workspace }} -B ${{ runner.workspace }}/build -DYUP_ENABLE_EXAMPLES=ON
     - run: cmake --build ${{ runner.workspace }}/build --config Debug --parallel 4 --target example_plugin_clap_plugin
     - run: cmake --build ${{ runner.workspace }}/build --config Release --parallel 4 --target example_plugin_clap_plugin
diff --git a/.github/workflows/coverage.yml b/.github/workflows/coverage.yml
index 0c0652eb2..1ad4e6340 100644
--- a/.github/workflows/coverage.yml
+++ b/.github/workflows/coverage.yml
@@ -59,6 +59,8 @@ jobs:
     steps:
     - name: Checkout repository
       uses: actions/checkout@v4
+      with:
+        fetch-depth: 0
 
     - name: Setup Ninja
       uses: seanmiddleditch/gha-setup-ninja@master
diff --git a/.github/workflows/python_linux.yml b/.github/workflows/python_linux.yml
index 00ea8c306..0861a4c52 100644
--- a/.github/workflows/python_linux.yml
+++ b/.github/workflows/python_linux.yml
@@ -40,6 +40,8 @@ jobs:
 
     steps:
       - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
 
       - name: Set up QEMU
         if: matrix.cibw_archs == 'aarch64'
diff --git a/.github/workflows/python_macos.yml b/.github/workflows/python_macos.yml
index 9fb101e05..a3d1d2846 100644
--- a/.github/workflows/python_macos.yml
+++ b/.github/workflows/python_macos.yml
@@ -34,10 +34,12 @@ jobs:
       fail-fast: true
       matrix:
         include:
-        - { os: macos-15, python: 311, platform_id: macosx_universal2, cibw_archs: universal2 }
+        - { os: macos-latest, python: 311, platform_id: macosx_universal2, cibw_archs: universal2 }
 
     steps:
       - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
 
       - name: Setup and install python
         uses: actions/setup-python@v5
diff --git a/.github/workflows/python_windows.yml b/.github/workflows/python_windows.yml
index ae90b3a7f..b4df3770f 100644
--- a/.github/workflows/python_windows.yml
+++ b/.github/workflows/python_windows.yml
@@ -41,6 +41,8 @@ jobs:
 
     steps:
       - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
 
       - name: Setup and install python
         uses: actions/setup-python@v5

From f6357559ef316790e73661d27703e08a2f01520d Mon Sep 17 00:00:00 2001
From: kunitoki <kunitoki@gmail.com>
Date: Wed, 27 Aug 2025 09:43:18 +0200
Subject: [PATCH 16/37] Missing include

---
 modules/yup_audio_basics/yup_audio_basics.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/modules/yup_audio_basics/yup_audio_basics.cpp b/modules/yup_audio_basics/yup_audio_basics.cpp
index 5d128a9e3..60c63befa 100644
--- a/modules/yup_audio_basics/yup_audio_basics.cpp
+++ b/modules/yup_audio_basics/yup_audio_basics.cpp
@@ -48,6 +48,10 @@
 
 #include "yup_audio_basics.h"
 
+#if YUP_MAC || YUP_IOS
+#include "native/yup_AudioWorkgroup_apple.h"
+#endif
+
 #include "buffers/yup_FloatVectorOperations.cpp"
 #include "buffers/yup_AudioChannelSet.cpp"
 #include "buffers/yup_AudioProcessLoadMeasurer.cpp"

From 81c4821d76275865ef9421c70ef917e187dbf145 Mon Sep 17 00:00:00 2001
From: kunitoki <kunitoki@gmail.com>
Date: Wed, 27 Aug 2025 09:43:36 +0200
Subject: [PATCH 17/37] Missing divide methods in float vector operations

---
 .../buffers/yup_FloatVectorOperations.cpp     | 155 ++++++++++++++++++
 .../buffers/yup_FloatVectorOperations.h       |  18 ++
 2 files changed, 173 insertions(+)

diff --git a/modules/yup_audio_basics/buffers/yup_FloatVectorOperations.cpp b/modules/yup_audio_basics/buffers/yup_FloatVectorOperations.cpp
index de9ffbfc8..447b9fec3 100644
--- a/modules/yup_audio_basics/buffers/yup_FloatVectorOperations.cpp
+++ b/modules/yup_audio_basics/buffers/yup_FloatVectorOperations.cpp
@@ -89,6 +89,8 @@ struct BasicOps32
 
     static forcedinline ParallelType mul (ParallelType a, ParallelType b) noexcept { return _mm_mul_ps (a, b); }
 
+    static forcedinline ParallelType div (ParallelType a, ParallelType b) noexcept  { return _mm_div_ps (a, b); }
+
     static forcedinline ParallelType max (ParallelType a, ParallelType b) noexcept { return _mm_max_ps (a, b); }
 
     static forcedinline ParallelType min (ParallelType a, ParallelType b) noexcept { return _mm_min_ps (a, b); }
@@ -148,6 +150,8 @@ struct BasicOps64
 
     static forcedinline ParallelType mul (ParallelType a, ParallelType b) noexcept { return _mm_mul_pd (a, b); }
 
+    static forcedinline ParallelType div (ParallelType a, ParallelType b) noexcept  { return _mm_div_pd (a, b); }
+
     static forcedinline ParallelType max (ParallelType a, ParallelType b) noexcept { return _mm_max_pd (a, b); }
 
     static forcedinline ParallelType min (ParallelType a, ParallelType b) noexcept { return _mm_min_pd (a, b); }
@@ -337,6 +341,8 @@ struct BasicOps32
 
     static forcedinline ParallelType mul (ParallelType a, ParallelType b) noexcept { return vmulq_f32 (a, b); }
 
+    static forcedinline ParallelType div (ParallelType a, ParallelType b) noexcept { return vdivq_f32 (a, b); }
+
     static forcedinline ParallelType max (ParallelType a, ParallelType b) noexcept { return vmaxq_f32 (a, b); }
 
     static forcedinline ParallelType min (ParallelType a, ParallelType b) noexcept { return vminq_f32 (a, b); }
@@ -411,6 +417,8 @@ struct BasicOps64
 
     static forcedinline ParallelType mul (ParallelType a, ParallelType b) noexcept { return a * b; }
 
+    static forcedinline ParallelType div (ParallelType a, ParallelType b) noexcept { return a / b; }
+
     static forcedinline ParallelType max (ParallelType a, ParallelType b) noexcept { return jmax (a, b); }
 
     static forcedinline ParallelType min (ParallelType a, ParallelType b) noexcept { return jmin (a, b); }
@@ -755,6 +763,63 @@ void copyWithMultiply (double* dest, const double* src, double multiplier, Size
 #endif
 }
 
+
+template <typename Size>
+void copyWithDividend (float* dest, const float* src, float dividend, Size num) noexcept
+{
+#if YUP_USE_VDSP_FRAMEWORK
+    vDSP_svdiv (&dividend, src, 1, dest, 1, (vDSP_Length) num);
+#else
+    YUP_PERFORM_VEC_OP_SRC_DEST (dest[i] = dividend / src[i],
+                                 Mode::div (divsd, s),
+                                 YUP_LOAD_SRC,
+                                 YUP_INCREMENT_SRC_DEST,
+                                 const Mode::ParallelType divsd = Mode::load1 (dividend);)
+#endif
+}
+
+template <typename Size>
+void copyWithDividend (double* dest, const double* src, double dividend, Size num) noexcept
+{
+#if YUP_USE_VDSP_FRAMEWORK
+    vDSP_svdivD (&dividend, src, 1, dest, 1, (vDSP_Length) num);
+#else
+    YUP_PERFORM_VEC_OP_SRC_DEST (dest[i] = dividend / src[i],
+                                 Mode::div (divsd, s),
+                                 YUP_LOAD_SRC,
+                                 YUP_INCREMENT_SRC_DEST,
+                                 const Mode::ParallelType divsd = Mode::load1 (dividend);)
+#endif
+}
+
+template <typename Size>
+void copyWithDivide (float* dest, const float* src, float divisor, Size num) noexcept
+{
+#if YUP_USE_VDSP_FRAMEWORK
+    vDSP_vsdiv (src, 1, &divisor, dest, 1, (vDSP_Length) num);
+#else
+    YUP_PERFORM_VEC_OP_SRC_DEST (dest[i] = src[i] / divisor,
+                                 Mode::div (s, divs),
+                                 YUP_LOAD_SRC,
+                                 YUP_INCREMENT_SRC_DEST,
+                                 const Mode::ParallelType divs = Mode::load1 (divisor);)
+#endif
+}
+
+template <typename Size>
+void copyWithDivide (double* dest, const double* src, double divisor, Size num) noexcept
+{
+#if YUP_USE_VDSP_FRAMEWORK
+    vDSP_vsdivD (src, 1, &divisor, dest, 1, (vDSP_Length) num);
+#else
+    YUP_PERFORM_VEC_OP_SRC_DEST (dest[i] = src[i] / divisor,
+                                 Mode::div (s, divs),
+                                 YUP_LOAD_SRC,
+                                 YUP_INCREMENT_SRC_DEST,
+                                 const Mode::ParallelType divs = Mode::load1 (divisor);)
+#endif
+}
+
 template <typename Size>
 void add (float* dest, float amount, Size num) noexcept
 {
@@ -1099,6 +1164,78 @@ void multiply (double* dest, const double* src, double multiplier, Size num) noe
                                  const Mode::ParallelType mult = Mode::load1 (multiplier);)
 }
 
+template <typename Size>
+void divide (float* dest, float divisor, Size num) noexcept
+{
+#if YUP_USE_VDSP_FRAMEWORK
+    vDSP_vsdiv (dest, 1, &divisor, dest, 1, (vDSP_Length) num);
+#else
+    YUP_PERFORM_VEC_OP_DEST (dest[i] /= divisor,
+                             Mode::div (d, divs),
+                             YUP_LOAD_DEST,
+                             const Mode::ParallelType divs = Mode::load1 (divisor);)
+#endif
+}
+
+template <typename Size>
+void divide (double* dest, double divisor, Size num) noexcept
+{
+#if YUP_USE_VDSP_FRAMEWORK
+    vDSP_vsdivD (dest, 1, &divisor, dest, 1, (vDSP_Length) num);
+#else
+    YUP_PERFORM_VEC_OP_DEST (dest[i] /= divisor,
+                             Mode::div (d, divs),
+                             YUP_LOAD_DEST,
+                             const Mode::ParallelType divs = Mode::load1 (divisor);)
+#endif
+}
+
+template <typename Size>
+void divide (float* dest, const float* src1, const float* src2, Size num) noexcept
+{
+#if YUP_USE_VDSP_FRAMEWORK
+    vDSP_vdiv (src2, 1, src1, 1, dest, 1, (vDSP_Length) num);
+#else
+    YUP_PERFORM_VEC_OP_SRC1_SRC2_DEST (dest[i] = src1[i] / src2[i],
+                                       Mode::div (s1, s2),
+                                       YUP_LOAD_SRC1_SRC2,
+                                       YUP_INCREMENT_SRC1_SRC2_DEST,)
+#endif
+}
+
+template <typename Size>
+void divide (double* dest, const double* src1, const double* src2, Size num) noexcept
+{
+#if YUP_USE_VDSP_FRAMEWORK
+    vDSP_vdivD (src2, 1, src1, 1, dest, 1, (vDSP_Length) num);
+#else
+    YUP_PERFORM_VEC_OP_SRC1_SRC2_DEST (dest[i] = src1[i] / src2[i],
+                                       Mode::div (s1, s2),
+                                       YUP_LOAD_SRC1_SRC2,
+                                       YUP_INCREMENT_SRC1_SRC2_DEST,)
+#endif
+}
+
+template <typename Size>
+void divide (float* dest, const float* src, float divisor, Size num) noexcept
+{
+    YUP_PERFORM_VEC_OP_SRC_DEST (dest[i] = src[i] / divisor,
+                                 Mode::div (s, divs),
+                                 YUP_LOAD_SRC,
+                                 YUP_INCREMENT_SRC_DEST,
+                                 const Mode::ParallelType divs = Mode::load1 (divisor);)
+}
+
+template <typename Size>
+void divide (double* dest, const double* src, double divisor, Size num) noexcept
+{
+    YUP_PERFORM_VEC_OP_SRC_DEST (dest[i] = src[i] / divisor,
+                                 Mode::div (s, divs),
+                                 YUP_LOAD_SRC,
+                                 YUP_INCREMENT_SRC_DEST,
+                                 const Mode::ParallelType divs = Mode::load1 (divisor);)
+}
+
 template <typename Size>
 void negate (float* dest, const float* src, Size num) noexcept
 {
@@ -1374,6 +1511,24 @@ void YUP_CALLTYPE FloatVectorOperationsBase<FloatType, CountType>::copyWithMulti
     FloatVectorHelpers::copyWithMultiply (dest, src, multiplier, numValues);
 }
 
+template <typename FloatType, typename CountType>
+void YUP_CALLTYPE FloatVectorOperationsBase<FloatType, CountType>::copyWithDividend (FloatType* dest,
+                                                                                     const FloatType* src,
+                                                                                     FloatType dividend,
+                                                                                     CountType numValues) noexcept
+{
+    FloatVectorHelpers::copyWithDividend (dest, src, dividend, numValues);
+}
+
+template <typename FloatType, typename CountType>
+void YUP_CALLTYPE FloatVectorOperationsBase<FloatType, CountType>::copyWithDivide (FloatType* dest,
+                                                                                   const FloatType* src,
+                                                                                   FloatType divisor,
+                                                                                   CountType numValues) noexcept
+{
+    FloatVectorHelpers::copyWithDivide (dest, src, divisor, numValues);
+}
+
 template <typename FloatType, typename CountType>
 void YUP_CALLTYPE FloatVectorOperationsBase<FloatType, CountType>::add (FloatType* dest,
                                                                         FloatType amountToAdd,
diff --git a/modules/yup_audio_basics/buffers/yup_FloatVectorOperations.h b/modules/yup_audio_basics/buffers/yup_FloatVectorOperations.h
index 3b7cd9e90..3b66168fc 100644
--- a/modules/yup_audio_basics/buffers/yup_FloatVectorOperations.h
+++ b/modules/yup_audio_basics/buffers/yup_FloatVectorOperations.h
@@ -84,6 +84,12 @@ struct FloatVectorOperationsBase
     /** Copies a vector of floating point numbers, multiplying each value by a given multiplier */
     static void YUP_CALLTYPE copyWithMultiply (FloatType* dest, const FloatType* src, FloatType multiplier, CountType numValues) noexcept;
 
+    /** Copies a vector of floating point numbers, dividing a dividend with each value (dest[i] = dividend / src[i]) */
+    static void YUP_CALLTYPE copyWithDividend (FloatType* dest, const FloatType* src, FloatType dividend, CountType numValues) noexcept;
+
+    /** Copies a vector of floating point numbers, dividing each value with a divisor (dest[i] = src[i] / divisor) */
+    static void YUP_CALLTYPE copyWithDivide (FloatType* dest, const FloatType* src, FloatType divisor, CountType numValues) noexcept;
+
     /** Adds a fixed value to the destination values. */
     static void YUP_CALLTYPE add (FloatType* dest, FloatType amountToAdd, CountType numValues) noexcept;
 
@@ -126,6 +132,18 @@ struct FloatVectorOperationsBase
     /** Multiplies each of the source values by a fixed multiplier and stores the result in the destination array. */
     static void YUP_CALLTYPE multiply (FloatType* dest, const FloatType* src, FloatType multiplier, CountType num) noexcept;
 
+    /** Divides the destination values by the source values. */
+    static void YUP_CALLTYPE divide (FloatType* dest, const FloatType* src, CountType numValues) noexcept;
+
+    /** Divides each source1 value by the corresponding source2 value, then stores it in the destination array. */
+    static void YUP_CALLTYPE divide (FloatType* dest, const FloatType* src1, const FloatType* src2, CountType numValues) noexcept;
+
+    /** Divides each of the destination values by a fixed divisor. */
+    static void YUP_CALLTYPE divide (FloatType* dest, FloatType divisor, CountType numValues) noexcept;
+
+    /** Divides each of the source values by a fixed divisor and stores the result in the destination array. */
+    static void YUP_CALLTYPE divide (FloatType* dest, const FloatType* src, FloatType divisor, CountType num) noexcept;
+
     /** Copies a source vector to a destination, negating each value. */
     static void YUP_CALLTYPE negate (FloatType* dest, const FloatType* src, CountType numValues) noexcept;
 

From 55ca8f88e28bb3355a42a689b5781aac43030081 Mon Sep 17 00:00:00 2001
From: kunitoki <kunitoki@gmail.com>
Date: Wed, 27 Aug 2025 09:55:10 +0200
Subject: [PATCH 18/37] More tests

---
 .../buffers/yup_FloatVectorOperations.cpp     | 84 ++++++++++++++++---
 .../buffers/yup_FloatVectorOperations.h       |  3 +
 .../yup_FloatVectorOperations.cpp             | 27 ++++++
 3 files changed, 102 insertions(+), 12 deletions(-)

diff --git a/modules/yup_audio_basics/buffers/yup_FloatVectorOperations.cpp b/modules/yup_audio_basics/buffers/yup_FloatVectorOperations.cpp
index 447b9fec3..7fbb129bf 100644
--- a/modules/yup_audio_basics/buffers/yup_FloatVectorOperations.cpp
+++ b/modules/yup_audio_basics/buffers/yup_FloatVectorOperations.cpp
@@ -1165,28 +1165,28 @@ void multiply (double* dest, const double* src, double multiplier, Size num) noe
 }
 
 template <typename Size>
-void divide (float* dest, float divisor, Size num) noexcept
+void divide (float* dest, const float* src, Size num) noexcept
 {
 #if YUP_USE_VDSP_FRAMEWORK
-    vDSP_vsdiv (dest, 1, &divisor, dest, 1, (vDSP_Length) num);
+    vDSP_vdiv (src, 1, dest, 1, dest, 1, (vDSP_Length) num);
 #else
-    YUP_PERFORM_VEC_OP_DEST (dest[i] /= divisor,
-                             Mode::div (d, divs),
-                             YUP_LOAD_DEST,
-                             const Mode::ParallelType divs = Mode::load1 (divisor);)
+    YUP_PERFORM_VEC_OP_SRC_DEST (dest[i] /= src[i],
+                                 Mode::div (d, s),
+                                 YUP_LOAD_SRC_DEST,
+                                 YUP_INCREMENT_SRC_DEST, )
 #endif
 }
 
 template <typename Size>
-void divide (double* dest, double divisor, Size num) noexcept
+void divide (double* dest, const double* src, Size num) noexcept
 {
 #if YUP_USE_VDSP_FRAMEWORK
-    vDSP_vsdivD (dest, 1, &divisor, dest, 1, (vDSP_Length) num);
+    vDSP_vdivD (src, 1, dest, 1, dest, 1, (vDSP_Length) num);
 #else
-    YUP_PERFORM_VEC_OP_DEST (dest[i] /= divisor,
-                             Mode::div (d, divs),
-                             YUP_LOAD_DEST,
-                             const Mode::ParallelType divs = Mode::load1 (divisor);)
+    YUP_PERFORM_VEC_OP_SRC_DEST (dest[i] /= src[i],
+                                 Mode::div (d, s),
+                                 YUP_LOAD_SRC_DEST,
+                                 YUP_INCREMENT_SRC_DEST, )
 #endif
 }
 
@@ -1216,6 +1216,32 @@ void divide (double* dest, const double* src1, const double* src2, Size num) noe
 #endif
 }
 
+template <typename Size>
+void divide (float* dest, float divisor, Size num) noexcept
+{
+#if YUP_USE_VDSP_FRAMEWORK
+    vDSP_vsdiv (dest, 1, &divisor, dest, 1, (vDSP_Length) num);
+#else
+    YUP_PERFORM_VEC_OP_DEST (dest[i] /= divisor,
+                             Mode::div (d, divs),
+                             YUP_LOAD_DEST,
+                             const Mode::ParallelType divs = Mode::load1 (divisor);)
+#endif
+}
+
+template <typename Size>
+void divide (double* dest, double divisor, Size num) noexcept
+{
+#if YUP_USE_VDSP_FRAMEWORK
+    vDSP_vsdivD (dest, 1, &divisor, dest, 1, (vDSP_Length) num);
+#else
+    YUP_PERFORM_VEC_OP_DEST (dest[i] /= divisor,
+                             Mode::div (d, divs),
+                             YUP_LOAD_DEST,
+                             const Mode::ParallelType divs = Mode::load1 (divisor);)
+#endif
+}
+
 template <typename Size>
 void divide (float* dest, const float* src, float divisor, Size num) noexcept
 {
@@ -1650,6 +1676,40 @@ void YUP_CALLTYPE FloatVectorOperationsBase<FloatType, CountType>::multiply (Flo
     FloatVectorHelpers::multiply (dest, src, multiplier, num);
 }
 
+template <typename FloatType, typename CountType>
+void YUP_CALLTYPE FloatVectorOperationsBase<FloatType, CountType>::divide (FloatType* dest,
+                                                                           const FloatType* src,
+                                                                           CountType numValues) noexcept
+{
+    FloatVectorHelpers::divide (dest, src, numValues);
+}
+
+template <typename FloatType, typename CountType>
+void YUP_CALLTYPE FloatVectorOperationsBase<FloatType, CountType>::divide (FloatType* dest,
+                                                                           const FloatType* src1,
+                                                                           const FloatType* src2,
+                                                                           CountType numValues) noexcept
+{
+    FloatVectorHelpers::divide (dest, src1, src2, numValues);
+}
+
+template <typename FloatType, typename CountType>
+void YUP_CALLTYPE FloatVectorOperationsBase<FloatType, CountType>::divide (FloatType* dest,
+                                                                           FloatType divisor,
+                                                                           CountType numValues) noexcept
+{
+    FloatVectorHelpers::divide (dest, divisor, numValues);
+}
+
+template <typename FloatType, typename CountType>
+void YUP_CALLTYPE FloatVectorOperationsBase<FloatType, CountType>::divide (FloatType* dest,
+                                                                           const FloatType* src,
+                                                                           FloatType divisor,
+                                                                           CountType num) noexcept
+{
+    FloatVectorHelpers::divide (dest, src, divisor, num);
+}
+
 template <typename FloatType, typename CountType>
 void YUP_CALLTYPE FloatVectorOperationsBase<FloatType, CountType>::negate (FloatType* dest,
                                                                            const FloatType* src,
diff --git a/modules/yup_audio_basics/buffers/yup_FloatVectorOperations.h b/modules/yup_audio_basics/buffers/yup_FloatVectorOperations.h
index 3b66168fc..b83831b82 100644
--- a/modules/yup_audio_basics/buffers/yup_FloatVectorOperations.h
+++ b/modules/yup_audio_basics/buffers/yup_FloatVectorOperations.h
@@ -186,11 +186,14 @@ struct NameForwarder : public Bases...
         Bases::fill...,
         Bases::copy...,
         Bases::copyWithMultiply...,
+        Bases::copyWithDividend...,
+        Bases::copyWithDivide...,
         Bases::add...,
         Bases::subtract...,
         Bases::addWithMultiply...,
         Bases::subtractWithMultiply...,
         Bases::multiply...,
+        Bases::divide...,
         Bases::negate...,
         Bases::abs...,
         Bases::min...,
diff --git a/tests/yup_audio_basics/yup_FloatVectorOperations.cpp b/tests/yup_audio_basics/yup_FloatVectorOperations.cpp
index 223cbda7d..b4fc11b6c 100644
--- a/tests/yup_audio_basics/yup_FloatVectorOperations.cpp
+++ b/tests/yup_audio_basics/yup_FloatVectorOperations.cpp
@@ -125,6 +125,33 @@ class FloatVectorOperationsTests : public ::testing::Test
             FloatVectorOperations::fill (data2, (ValueType) 3, num);
             FloatVectorOperations::addWithMultiply (data1, data1, data2, num);
             EXPECT_TRUE (areAllValuesEqual (data1, num, (ValueType) 8));
+
+            FloatVectorOperations::fill (data1, (ValueType) 8, num);
+            FloatVectorOperations::copyWithDividend (data2, data1, (ValueType) 16, num);
+            EXPECT_TRUE (areAllValuesEqual (data2, num, (ValueType) 2));
+
+            FloatVectorOperations::fill (data1, (ValueType) 12, num);
+            FloatVectorOperations::copyWithDivide (data2, data1, (ValueType) 3, num);
+            EXPECT_TRUE (areAllValuesEqual (data2, num, (ValueType) 4));
+
+            FloatVectorOperations::fill (data1, (ValueType) 20, num);
+            FloatVectorOperations::divide (data1, (ValueType) 4, num);
+            EXPECT_TRUE (areAllValuesEqual (data1, num, (ValueType) 5));
+
+            FloatVectorOperations::fill (data1, (ValueType) 15, num);
+            FloatVectorOperations::fill (data2, (ValueType) 3, num);
+            HeapBlock<ValueType> result (num + 16);
+#if YUP_ARM
+            ValueType* const resultData = result;
+#else
+            ValueType* const resultData = addBytesToPointer (result.get(), random.nextInt (16));
+#endif
+            FloatVectorOperations::divide (resultData, data1, data2, num);
+            EXPECT_TRUE (areAllValuesEqual (resultData, num, (ValueType) 5));
+
+            FloatVectorOperations::fill (data1, (ValueType) 18, num);
+            FloatVectorOperations::divide (data2, data1, (ValueType) 6, num);
+            EXPECT_TRUE (areAllValuesEqual (data2, num, (ValueType) 3));
         }
 
         static void fillRandomly (Random& random, ValueType* d, int num)

From f5d66f61ea0c3214a4212d7523c40e8421671239 Mon Sep 17 00:00:00 2001
From: kunitoki <kunitoki@gmail.com>
Date: Wed, 27 Aug 2025 09:56:51 +0200
Subject: [PATCH 19/37] More comments

---
 tests/yup_audio_basics/yup_FloatVectorOperations.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/yup_audio_basics/yup_FloatVectorOperations.cpp b/tests/yup_audio_basics/yup_FloatVectorOperations.cpp
index b4fc11b6c..3d7bdb91e 100644
--- a/tests/yup_audio_basics/yup_FloatVectorOperations.cpp
+++ b/tests/yup_audio_basics/yup_FloatVectorOperations.cpp
@@ -144,6 +144,8 @@ class FloatVectorOperationsTests : public ::testing::Test
 #if YUP_ARM
             ValueType* const resultData = result;
 #else
+            // These tests deliberately operate on misaligned memory and will be flagged up by
+            // checks for undefined behavior!
             ValueType* const resultData = addBytesToPointer (result.get(), random.nextInt (16));
 #endif
             FloatVectorOperations::divide (resultData, data1, data2, num);

From 61b5bee3d1a89858f96c8827cd682956adff4f39 Mon Sep 17 00:00:00 2001
From: kunitoki <kunitoki@gmail.com>
Date: Wed, 27 Aug 2025 10:00:40 +0200
Subject: [PATCH 20/37] Restored fixed to float conversions

---
 .../buffers/yup_FloatVectorOperations.cpp     | 29 +++++++++++++++++++
 .../buffers/yup_FloatVectorOperations.h       |  6 ++++
 2 files changed, 35 insertions(+)

diff --git a/modules/yup_audio_basics/buffers/yup_FloatVectorOperations.cpp b/modules/yup_audio_basics/buffers/yup_FloatVectorOperations.cpp
index 7fbb129bf..accf84ce3 100644
--- a/modules/yup_audio_basics/buffers/yup_FloatVectorOperations.cpp
+++ b/modules/yup_audio_basics/buffers/yup_FloatVectorOperations.cpp
@@ -1501,6 +1501,23 @@ double findMaximum (const double* src, Size num) noexcept
 #endif
 }
 
+template <typename Size>
+void convertFixedToFloat (float* dest, const int* src, float multiplier, Size num) noexcept
+{
+#if YUP_USE_ARM_NEON
+    YUP_PERFORM_VEC_OP_SRC_DEST (dest[i] = (float) src[i] * multiplier,
+                                 vmulq_n_f32 (vcvtq_f32_s32 (vld1q_s32 (src)), multiplier),
+                                 YUP_LOAD_NONE,
+                                 YUP_INCREMENT_SRC_DEST, )
+#else
+    YUP_PERFORM_VEC_OP_SRC_DEST (dest[i] = (float) src[i] * multiplier,
+                                 Mode::mul (mult, _mm_cvtepi32_ps (_mm_loadu_si128 (reinterpret_cast<const __m128i*> (src)))),
+                                 YUP_LOAD_NONE,
+                                 YUP_INCREMENT_SRC_DEST,
+                                 const Mode::ParallelType mult = Mode::load1 (multiplier);)
+#endif
+}
+
 } // namespace
 } // namespace FloatVectorHelpers
 
@@ -1802,6 +1819,18 @@ template struct FloatVectorOperationsBase<double, size_t>;
 
 //==============================================================================
 
+void YUP_CALLTYPE FloatVectorOperations::convertFixedToFloat (float* dest, const int* src, float multiplier, size_t num) noexcept
+{
+   FloatVectorHelpers::convertFixedToFloat (dest, src, multiplier, num);
+}
+
+void YUP_CALLTYPE FloatVectorOperations::convertFixedToFloat (float* dest, const int* src, float multiplier, int num) noexcept
+{
+    FloatVectorHelpers::convertFixedToFloat (dest, src, multiplier, num);
+}
+
+//==============================================================================
+
 intptr_t YUP_CALLTYPE FloatVectorOperations::getFpStatusRegister() noexcept
 {
     intptr_t fpsr = 0;
diff --git a/modules/yup_audio_basics/buffers/yup_FloatVectorOperations.h b/modules/yup_audio_basics/buffers/yup_FloatVectorOperations.h
index b83831b82..033c33b49 100644
--- a/modules/yup_audio_basics/buffers/yup_FloatVectorOperations.h
+++ b/modules/yup_audio_basics/buffers/yup_FloatVectorOperations.h
@@ -220,6 +220,12 @@ struct NameForwarder : public Bases...
 class YUP_API FloatVectorOperations : public detail::NameForwarder<FloatVectorOperationsBase<float, int>, FloatVectorOperationsBase<float, size_t>, FloatVectorOperationsBase<double, int>, FloatVectorOperationsBase<double, size_t>>
 {
 public:
+    /** */
+    static void JUCE_CALLTYPE convertFixedToFloat (float* dest, const int* src, float multiplier, int num) noexcept;
+
+    /** */
+    static void JUCE_CALLTYPE convertFixedToFloat (float* dest, const int* src, float multiplier, size_t num) noexcept;
+
     /** This method enables or disables the SSE/NEON flush-to-zero mode. */
     static void YUP_CALLTYPE enableFlushToZeroMode (bool shouldEnable) noexcept;
 

From 9b1e6ee458f5a98746769b50c37994f4e6ecd654 Mon Sep 17 00:00:00 2001
From: kunitoki <kunitoki@gmail.com>
Date: Wed, 27 Aug 2025 10:32:56 +0200
Subject: [PATCH 21/37] More work on FloatVectorOperations

---
 .../buffers/yup_FloatVectorOperations.cpp     | 45 ++++++++++++++++++-
 .../buffers/yup_FloatVectorOperations.h       | 10 +++--
 .../yup_FloatVectorOperations.cpp             | 22 +++++++++
 3 files changed, 72 insertions(+), 5 deletions(-)

diff --git a/modules/yup_audio_basics/buffers/yup_FloatVectorOperations.cpp b/modules/yup_audio_basics/buffers/yup_FloatVectorOperations.cpp
index accf84ce3..bcb9d614b 100644
--- a/modules/yup_audio_basics/buffers/yup_FloatVectorOperations.cpp
+++ b/modules/yup_audio_basics/buffers/yup_FloatVectorOperations.cpp
@@ -83,6 +83,10 @@ struct BasicOps32
 
     static forcedinline void storeU (Type* dest, ParallelType a) noexcept { _mm_storeu_ps (dest, a); }
 
+    static forcedinline void storeU (int* dest, ParallelType a) noexcept { _mm_storeu_si128 (reinterpret_cast<__m128i*> (dest), _mm_castps_si128 (a)); }
+
+    static forcedinline void storeA (int* dest, ParallelType a) noexcept { _mm_store_si128 (reinterpret_cast<__m128i*> (dest), _mm_castps_si128 (a)); }
+
     static forcedinline ParallelType add (ParallelType a, ParallelType b) noexcept { return _mm_add_ps (a, b); }
 
     static forcedinline ParallelType sub (ParallelType a, ParallelType b) noexcept { return _mm_sub_ps (a, b); }
@@ -335,6 +339,10 @@ struct BasicOps32
 
     static forcedinline void storeU (Type* dest, ParallelType a) noexcept { vst1q_f32 (dest, a); }
 
+    static forcedinline void storeU (int* dest, ParallelType a) noexcept { vst1q_f32 (reinterpret_cast<float*> (dest), a); }
+
+    static forcedinline void storeA (int* dest, ParallelType a) noexcept { vst1q_f32 (reinterpret_cast<float*> (dest), a); }
+
     static forcedinline ParallelType add (ParallelType a, ParallelType b) noexcept { return vaddq_f32 (a, b); }
 
     static forcedinline ParallelType sub (ParallelType a, ParallelType b) noexcept { return vsubq_f32 (a, b); }
@@ -1509,12 +1517,37 @@ void convertFixedToFloat (float* dest, const int* src, float multiplier, Size nu
                                  vmulq_n_f32 (vcvtq_f32_s32 (vld1q_s32 (src)), multiplier),
                                  YUP_LOAD_NONE,
                                  YUP_INCREMENT_SRC_DEST, )
-#else
+#elif YUP_USE_SSE_INTRINSICS
     YUP_PERFORM_VEC_OP_SRC_DEST (dest[i] = (float) src[i] * multiplier,
                                  Mode::mul (mult, _mm_cvtepi32_ps (_mm_loadu_si128 (reinterpret_cast<const __m128i*> (src)))),
                                  YUP_LOAD_NONE,
                                  YUP_INCREMENT_SRC_DEST,
                                  const Mode::ParallelType mult = Mode::load1 (multiplier);)
+#else
+    for (Size i = 0; i < num; ++i)
+        dest[i] = (float) src[i] * multiplier;
+#endif
+}
+
+// Converts floats to fixed-point ints: dest[i] = (int) (src[i] * multiplier). Every path truncates toward zero;
+// SSE uses _mm_cvttps_epi32 because _mm_cvtps_epi32 rounds and would disagree with the scalar (int) cast.
+template <typename Size>
+void convertFloatToFixed (int* dest, const float* src, float multiplier, Size num) noexcept
+{
+#if YUP_USE_ARM_NEON
+    YUP_PERFORM_VEC_OP_SRC_DEST (dest[i] = (int) (src[i] * multiplier),
+                                 vreinterpretq_f32_s32 (vcvtq_s32_f32 (vmulq_n_f32 (vld1q_f32 (src), multiplier))),
+                                 YUP_LOAD_NONE,
+                                 YUP_INCREMENT_SRC_DEST, )
+#elif YUP_USE_SSE_INTRINSICS
+    YUP_PERFORM_VEC_OP_SRC_DEST (dest[i] = (int) (src[i] * multiplier),
+                                 _mm_castsi128_ps (_mm_cvttps_epi32 (_mm_mul_ps (_mm_loadu_ps (src), mult))),
+                                 YUP_LOAD_NONE,
+                                 YUP_INCREMENT_SRC_DEST,
+                                 const Mode::ParallelType mult = Mode::load1 (multiplier);)
+#else
+    for (Size i = 0; i < num; ++i)
+        dest[i] = (int) (src[i] * multiplier);
 #endif
 }
 
@@ -1829,6 +1862,16 @@ void YUP_CALLTYPE FloatVectorOperations::convertFixedToFloat (float* dest, const
     FloatVectorHelpers::convertFixedToFloat (dest, src, multiplier, num);
 }
 
+void YUP_CALLTYPE FloatVectorOperations::convertFloatToFixed (int* dest, const float* src, float multiplier, size_t num) noexcept
+{
+   FloatVectorHelpers::convertFloatToFixed (dest, src, multiplier, num);
+}
+
+void YUP_CALLTYPE FloatVectorOperations::convertFloatToFixed (int* dest, const float* src, float multiplier, int num) noexcept
+{
+    FloatVectorHelpers::convertFloatToFixed (dest, src, multiplier, num);
+}
+
 //==============================================================================
 
 intptr_t YUP_CALLTYPE FloatVectorOperations::getFpStatusRegister() noexcept
diff --git a/modules/yup_audio_basics/buffers/yup_FloatVectorOperations.h b/modules/yup_audio_basics/buffers/yup_FloatVectorOperations.h
index 033c33b49..18d373ad2 100644
--- a/modules/yup_audio_basics/buffers/yup_FloatVectorOperations.h
+++ b/modules/yup_audio_basics/buffers/yup_FloatVectorOperations.h
@@ -220,11 +220,13 @@ struct NameForwarder : public Bases...
 class YUP_API FloatVectorOperations : public detail::NameForwarder<FloatVectorOperationsBase<float, int>, FloatVectorOperationsBase<float, size_t>, FloatVectorOperationsBase<double, int>, FloatVectorOperationsBase<double, size_t>>
 {
 public:
-    /** */
-    static void JUCE_CALLTYPE convertFixedToFloat (float* dest, const int* src, float multiplier, int num) noexcept;
+    /** Convert fixed integer signal to float applying a multiplier. */
+    static void YUP_CALLTYPE convertFixedToFloat (float* dest, const int* src, float multiplier, int num) noexcept;
+    static void YUP_CALLTYPE convertFixedToFloat (float* dest, const int* src, float multiplier, size_t num) noexcept;
 
-    /** */
-    static void JUCE_CALLTYPE convertFixedToFloat (float* dest, const int* src, float multiplier, size_t num) noexcept;
+    /** Convert float signal to int applying a multiplier. */
+    static void YUP_CALLTYPE convertFloatToFixed (int* dest, const float* src, float multiplier, int num) noexcept;
+    static void YUP_CALLTYPE convertFloatToFixed (int* dest, const float* src, float multiplier, size_t num) noexcept;
 
     /** This method enables or disables the SSE/NEON flush-to-zero mode. */
     static void YUP_CALLTYPE enableFlushToZeroMode (bool shouldEnable) noexcept;
diff --git a/tests/yup_audio_basics/yup_FloatVectorOperations.cpp b/tests/yup_audio_basics/yup_FloatVectorOperations.cpp
index 3d7bdb91e..b7c0fa294 100644
--- a/tests/yup_audio_basics/yup_FloatVectorOperations.cpp
+++ b/tests/yup_audio_basics/yup_FloatVectorOperations.cpp
@@ -154,6 +154,28 @@ class FloatVectorOperationsTests : public ::testing::Test
             FloatVectorOperations::fill (data1, (ValueType) 18, num);
             FloatVectorOperations::divide (data2, data1, (ValueType) 6, num);
             EXPECT_TRUE (areAllValuesEqual (data2, num, (ValueType) 3));
+
+            fillRandomly (random, int1, num);
+            const ValueType multiplier = (ValueType) (1.0 / (1 << 16));
+
+            if constexpr (std::is_same_v<ValueType, float>)
+            {
+                convertFixed (data1, int1, multiplier, num);
+                FloatVectorOperations::convertFixedToFloat (data2, int1, multiplier, num);
+                EXPECT_TRUE (buffersMatch (data1, data2, num));
+
+                convertFloatToFixed (int1, data1, 1.0f / multiplier, num);
+                HeapBlock<int> int2 (num + 16);
+#if YUP_ARM
+                int* const intData = int2;
+#else
+                int* const intData = addBytesToPointer (int2.get(), random.nextInt (16));
+#endif
+                FloatVectorOperations::convertFloatToFixed (intData, data1, 1.0f / multiplier, num);
+
+                for (int i = 0; i < num; ++i)
+                    EXPECT_EQ (int1[i], intData[i]);
+            }
         }
 
         static void fillRandomly (Random& random, ValueType* d, int num)

From 3fea1ab88479383ff3c54c1c865be29034ad0c31 Mon Sep 17 00:00:00 2001
From: Yup Bot <yup-bot@users.noreply.github.com>
Date: Wed, 27 Aug 2025 08:33:40 +0000
Subject: [PATCH 22/37] Code formatting

---
 .../buffers/yup_FloatVectorOperations.cpp           | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/modules/yup_audio_basics/buffers/yup_FloatVectorOperations.cpp b/modules/yup_audio_basics/buffers/yup_FloatVectorOperations.cpp
index bcb9d614b..0108a98cb 100644
--- a/modules/yup_audio_basics/buffers/yup_FloatVectorOperations.cpp
+++ b/modules/yup_audio_basics/buffers/yup_FloatVectorOperations.cpp
@@ -93,7 +93,7 @@ struct BasicOps32
 
     static forcedinline ParallelType mul (ParallelType a, ParallelType b) noexcept { return _mm_mul_ps (a, b); }
 
-    static forcedinline ParallelType div (ParallelType a, ParallelType b) noexcept  { return _mm_div_ps (a, b); }
+    static forcedinline ParallelType div (ParallelType a, ParallelType b) noexcept { return _mm_div_ps (a, b); }
 
     static forcedinline ParallelType max (ParallelType a, ParallelType b) noexcept { return _mm_max_ps (a, b); }
 
@@ -154,7 +154,7 @@ struct BasicOps64
 
     static forcedinline ParallelType mul (ParallelType a, ParallelType b) noexcept { return _mm_mul_pd (a, b); }
 
-    static forcedinline ParallelType div (ParallelType a, ParallelType b) noexcept  { return _mm_div_pd (a, b); }
+    static forcedinline ParallelType div (ParallelType a, ParallelType b) noexcept { return _mm_div_pd (a, b); }
 
     static forcedinline ParallelType max (ParallelType a, ParallelType b) noexcept { return _mm_max_pd (a, b); }
 
@@ -771,7 +771,6 @@ void copyWithMultiply (double* dest, const double* src, double multiplier, Size
 #endif
 }
 
-
 template <typename Size>
 void copyWithDividend (float* dest, const float* src, float dividend, Size num) noexcept
 {
@@ -1207,7 +1206,7 @@ void divide (float* dest, const float* src1, const float* src2, Size num) noexce
     YUP_PERFORM_VEC_OP_SRC1_SRC2_DEST (dest[i] = src1[i] / src2[i],
                                        Mode::div (s1, s2),
                                        YUP_LOAD_SRC1_SRC2,
-                                       YUP_INCREMENT_SRC1_SRC2_DEST,)
+                                       YUP_INCREMENT_SRC1_SRC2_DEST, )
 #endif
 }
 
@@ -1220,7 +1219,7 @@ void divide (double* dest, const double* src1, const double* src2, Size num) noe
     YUP_PERFORM_VEC_OP_SRC1_SRC2_DEST (dest[i] = src1[i] / src2[i],
                                        Mode::div (s1, s2),
                                        YUP_LOAD_SRC1_SRC2,
-                                       YUP_INCREMENT_SRC1_SRC2_DEST,)
+                                       YUP_INCREMENT_SRC1_SRC2_DEST, )
 #endif
 }
 
@@ -1854,7 +1853,7 @@ template struct FloatVectorOperationsBase<double, size_t>;
 
 void YUP_CALLTYPE FloatVectorOperations::convertFixedToFloat (float* dest, const int* src, float multiplier, size_t num) noexcept
 {
-   FloatVectorHelpers::convertFixedToFloat (dest, src, multiplier, num);
+    FloatVectorHelpers::convertFixedToFloat (dest, src, multiplier, num);
 }
 
 void YUP_CALLTYPE FloatVectorOperations::convertFixedToFloat (float* dest, const int* src, float multiplier, int num) noexcept
@@ -1864,7 +1863,7 @@ void YUP_CALLTYPE FloatVectorOperations::convertFixedToFloat (float* dest, const
 
 void YUP_CALLTYPE FloatVectorOperations::convertFloatToFixed (int* dest, const float* src, float multiplier, size_t num) noexcept
 {
-   FloatVectorHelpers::convertFloatToFixed (dest, src, multiplier, num);
+    FloatVectorHelpers::convertFloatToFixed (dest, src, multiplier, num);
 }
 
 void YUP_CALLTYPE FloatVectorOperations::convertFloatToFixed (int* dest, const float* src, float multiplier, int num) noexcept

From 3a1632f8893afc6651951c0df1cbc2cc83c5b98e Mon Sep 17 00:00:00 2001
From: kunitoki <kunitoki@gmail.com>
Date: Wed, 27 Aug 2025 11:00:38 +0200
Subject: [PATCH 23/37] Add conversions to and from float and double

---
 .../buffers/yup_FloatVectorOperations.cpp     | 78 +++++++++++++++++++
 .../buffers/yup_FloatVectorOperations.h       |  8 ++
 2 files changed, 86 insertions(+)

diff --git a/modules/yup_audio_basics/buffers/yup_FloatVectorOperations.cpp b/modules/yup_audio_basics/buffers/yup_FloatVectorOperations.cpp
index 0108a98cb..ef87efbe7 100644
--- a/modules/yup_audio_basics/buffers/yup_FloatVectorOperations.cpp
+++ b/modules/yup_audio_basics/buffers/yup_FloatVectorOperations.cpp
@@ -1550,6 +1550,70 @@ void convertFloatToFixed (int* dest, const float* src, float multiplier, Size nu
 #endif
 }
 
+// Narrows `num` doubles from src to floats in dest: vDSP when enabled, otherwise two values per
+// SIMD iteration (NEON or SSE) followed by a scalar loop for whatever is left.
+template <typename Size>
+void convertDoubleToFloat (float* dest, const double* src, Size num) noexcept
+{
+#if YUP_USE_VDSP_FRAMEWORK
+    vDSP_vdpsp (src, 1, dest, 1, (vDSP_Length) num);
+#else
+    Size i = 0;
+#if YUP_USE_ARM_NEON
+    // NOTE(review): float64x2_t / vcvt_f32_f64 exist only on AArch64 - confirm this macro is never set for 32-bit ARM.
+    for (; i + 2 <= num; i += 2)
+    {
+        float64x2_t d = vld1q_f64 (src + i);
+        float32x2_t f = vcvt_f32_f64 (d);
+        vst1_f32 (dest + i, f);
+    }
+#elif YUP_USE_SSE_INTRINSICS
+    for (; i + 2 <= num; i += 2)
+    {
+        __m128d d = _mm_loadu_pd (src + i);
+        __m128 f  = _mm_cvtpd_ps (d);
+        _mm_storel_pi ((__m64*) (dest + i), f);
+    }
+#endif
+
+   for (; i < num; ++i)
+       dest[i] = (float) src[i];
+#endif
+}
+
+// Widens `num` floats from src to doubles in dest: vDSP when enabled, otherwise NEON (two values per
+// iteration) or SSE (four per iteration), followed by a scalar loop for whatever is left.
+template <typename Size>
+void convertFloatToDouble (double* dest, const float* src, Size num) noexcept
+{
+#if YUP_USE_VDSP_FRAMEWORK
+    vDSP_vspdp (src, 1, dest, 1, (vDSP_Length) num);
+#else
+    Size i = 0;
+#if YUP_USE_ARM_NEON
+    for (; i + 2 <= num; i += 2)
+    {
+        float32x2_t f = vld1_f32 (src + i);
+        float64x2_t d = vcvt_f64_f32 (f);
+        vst1q_f64 (dest + i, d);
+    }
+#elif YUP_USE_SSE_INTRINSICS
+    for (; i + 4 <= num; i += 4)
+    {
+        __m128 f = _mm_loadu_ps (src + i);
+        // _mm_cvtps_pd converts only the two low lanes, so lanes 2-3 are moved down with movehl first.
+        __m128d d0 = _mm_cvtps_pd (f);
+        __m128d d1 = _mm_cvtps_pd (_mm_movehl_ps (f, f));
+        _mm_storeu_pd (dest + i, d0);
+        _mm_storeu_pd (dest + i + 2, d1);
+    }
+#endif
+
+   for (; i < num; ++i)
+       dest[i] = (double) src[i];
+#endif
+}
+
 } // namespace
 } // namespace FloatVectorHelpers
 
@@ -1871,6 +1929,26 @@ void YUP_CALLTYPE FloatVectorOperations::convertFloatToFixed (int* dest, const f
     FloatVectorHelpers::convertFloatToFixed (dest, src, multiplier, num);
 }
 
+void YUP_CALLTYPE convertFloatToDouble (double* dest, const float* src, int num) noexcept
+{
+    FloatVectorHelpers::convertFloatToDouble (dest, src, num);
+}
+
+void YUP_CALLTYPE convertFloatToDouble (double* dest, const float* src, size_t num) noexcept
+{
+    FloatVectorHelpers::convertFloatToDouble (dest, src, num);
+}
+
+void YUP_CALLTYPE convertDoubleToFloat (float* dest, const double* src, int num) noexcept
+{
+    FloatVectorHelpers::convertDoubleToFloat (dest, src, num);
+}
+
+void YUP_CALLTYPE convertDoubleToFloat (float* dest, const double* src, size_t num) noexcept
+{
+    FloatVectorHelpers::convertDoubleToFloat (dest, src, num);
+}
+
 //==============================================================================
 
 intptr_t YUP_CALLTYPE FloatVectorOperations::getFpStatusRegister() noexcept
diff --git a/modules/yup_audio_basics/buffers/yup_FloatVectorOperations.h b/modules/yup_audio_basics/buffers/yup_FloatVectorOperations.h
index 18d373ad2..9efe0f1ea 100644
--- a/modules/yup_audio_basics/buffers/yup_FloatVectorOperations.h
+++ b/modules/yup_audio_basics/buffers/yup_FloatVectorOperations.h
@@ -228,6 +228,14 @@ class YUP_API FloatVectorOperations : public detail::NameForwarder<FloatVectorOp
     static void YUP_CALLTYPE convertFloatToFixed (int* dest, const float* src, float multiplier, int num) noexcept;
     static void YUP_CALLTYPE convertFloatToFixed (int* dest, const float* src, float multiplier, size_t num) noexcept;
 
+    /** Convert float signal to double. */
+    static void YUP_CALLTYPE convertFloatToDouble (double* dest, const float* src, int num) noexcept;
+    static void YUP_CALLTYPE convertFloatToDouble (double* dest, const float* src, size_t num) noexcept;
+
+    /** Convert double signal to float. */
+    static void YUP_CALLTYPE convertDoubleToFloat (float* dest, const double* src, int num) noexcept;
+    static void YUP_CALLTYPE convertDoubleToFloat (float* dest, const double* src, size_t num) noexcept;
+
     /** This method enables or disables the SSE/NEON flush-to-zero mode. */
     static void YUP_CALLTYPE enableFlushToZeroMode (bool shouldEnable) noexcept;
 

From 5f3cf189d474a1e591af6dc69d0bb33316ed5bfe Mon Sep 17 00:00:00 2001
From: kunitoki <kunitoki@gmail.com>
Date: Wed, 27 Aug 2025 11:01:07 +0200
Subject: [PATCH 24/37] Unset device open

---
 modules/yup_audio_devices/native/yup_OpenSL_android.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/modules/yup_audio_devices/native/yup_OpenSL_android.cpp b/modules/yup_audio_devices/native/yup_OpenSL_android.cpp
index 942b59410..c20f039b9 100644
--- a/modules/yup_audio_devices/native/yup_OpenSL_android.cpp
+++ b/modules/yup_audio_devices/native/yup_OpenSL_android.cpp
@@ -1073,6 +1073,9 @@ class OpenSLAudioIODevice final : public AudioIODevice
     void close() override
     {
         stop();
+
+        deviceOpen = false;
+
         session = nullptr;
         callback = nullptr;
     }

From 942dccb1eac29b5d14b65559ae4efa9da88f1ac0 Mon Sep 17 00:00:00 2001
From: kunitoki <kunitoki@gmail.com>
Date: Wed, 27 Aug 2025 11:29:35 +0200
Subject: [PATCH 25/37] More tests

---
 .../buffers/yup_FloatVectorOperations.cpp     |   8 +-
 .../yup_FloatVectorOperations.cpp             | 163 +++++++++++++-----
 2 files changed, 121 insertions(+), 50 deletions(-)

diff --git a/modules/yup_audio_basics/buffers/yup_FloatVectorOperations.cpp b/modules/yup_audio_basics/buffers/yup_FloatVectorOperations.cpp
index ef87efbe7..246c17b92 100644
--- a/modules/yup_audio_basics/buffers/yup_FloatVectorOperations.cpp
+++ b/modules/yup_audio_basics/buffers/yup_FloatVectorOperations.cpp
@@ -1929,22 +1929,22 @@ void YUP_CALLTYPE FloatVectorOperations::convertFloatToFixed (int* dest, const f
     FloatVectorHelpers::convertFloatToFixed (dest, src, multiplier, num);
 }
 
-void YUP_CALLTYPE convertFloatToDouble (double* dest, const float* src, int num) noexcept
+void YUP_CALLTYPE FloatVectorOperations::convertFloatToDouble (double* dest, const float* src, int num) noexcept
 {
     FloatVectorHelpers::convertFloatToDouble (dest, src, num);
 }
 
-void YUP_CALLTYPE convertFloatToDouble (double* dest, const float* src, size_t num) noexcept
+void YUP_CALLTYPE FloatVectorOperations::convertFloatToDouble (double* dest, const float* src, size_t num) noexcept
 {
     FloatVectorHelpers::convertFloatToDouble (dest, src, num);
 }
 
-void YUP_CALLTYPE convertDoubleToFloat (float* dest, const double* src, int num) noexcept
+void YUP_CALLTYPE FloatVectorOperations::convertDoubleToFloat (float* dest, const double* src, int num) noexcept
 {
     FloatVectorHelpers::convertDoubleToFloat (dest, src, num);
 }
 
-void YUP_CALLTYPE convertDoubleToFloat (float* dest, const double* src, size_t num) noexcept
+void YUP_CALLTYPE FloatVectorOperations::convertDoubleToFloat (float* dest, const double* src, size_t num) noexcept
 {
     FloatVectorHelpers::convertDoubleToFloat (dest, src, num);
 }
diff --git a/tests/yup_audio_basics/yup_FloatVectorOperations.cpp b/tests/yup_audio_basics/yup_FloatVectorOperations.cpp
index b7c0fa294..85a551710 100644
--- a/tests/yup_audio_basics/yup_FloatVectorOperations.cpp
+++ b/tests/yup_audio_basics/yup_FloatVectorOperations.cpp
@@ -154,28 +154,6 @@ class FloatVectorOperationsTests : public ::testing::Test
             FloatVectorOperations::fill (data1, (ValueType) 18, num);
             FloatVectorOperations::divide (data2, data1, (ValueType) 6, num);
             EXPECT_TRUE (areAllValuesEqual (data2, num, (ValueType) 3));
-
-            fillRandomly (random, int1, num);
-            const ValueType multiplier = (ValueType) (1.0 / (1 << 16));
-
-            if constexpr (std::is_same_v<ValueType, float>)
-            {
-                convertFixed (data1, int1, multiplier, num);
-                FloatVectorOperations::convertFixedToFloat (data2, int1, multiplier, num);
-                EXPECT_TRUE (buffersMatch (data1, data2, num));
-
-                convertFloatToFixed (int1, data1, 1.0f / multiplier, num);
-                HeapBlock<int> int2 (num + 16);
-#if YUP_ARM
-                int* const intData = int2;
-#else
-                int* const intData = addBytesToPointer (int2.get(), random.nextInt (16));
-#endif
-                FloatVectorOperations::convertFloatToFixed (intData, data1, 1.0f / multiplier, num);
-
-                for (int i = 0; i < num; ++i)
-                    EXPECT_EQ (int1[i], intData[i]);
-            }
         }
 
         static void fillRandomly (Random& random, ValueType* d, int num)
@@ -190,30 +168,6 @@ class FloatVectorOperationsTests : public ::testing::Test
                 *d++ = random.nextInt();
         }
 
-        static void convertFixed (float* d, const int* s, ValueType multiplier, int num)
-        {
-            while (--num >= 0)
-                *d++ = (float) *s++ * multiplier;
-        }
-
-        static void convertFixedToDouble (double* d, const int* s, double multiplier, int num)
-        {
-            while (--num >= 0)
-                *d++ = (double) *s++ * multiplier;
-        }
-
-        static void convertFloatToFixed (int* d, const float* s, float multiplier, int num)
-        {
-            while (--num >= 0)
-                *d++ = (int) (*s++ * multiplier);
-        }
-
-        static void convertDoubleToFixed (int* d, const double* s, double multiplier, int num)
-        {
-            while (--num >= 0)
-                *d++ = (int) (*s++ * multiplier);
-        }
-
         static bool areAllValuesEqual (const ValueType* d, int num, ValueType target)
         {
             while (--num >= 0)
@@ -237,6 +191,43 @@ class FloatVectorOperationsTests : public ::testing::Test
             return std::abs (v1 - v2) < std::numeric_limits<ValueType>::epsilon();
         }
     };
+
+    template <class ValueType>
+    static bool valuesMatch (ValueType v1, ValueType v2)
+    {
+        return std::abs (v1 - v2) < std::numeric_limits<ValueType>::epsilon();
+    }
+
+    template <class ValueType>
+    static bool buffersMatch (const ValueType* d1, const ValueType* d2, int num)
+    {
+        while (--num >= 0)
+        {
+            if (! valuesMatch (*d1++, *d2++))
+                return false;
+        }
+
+        return true;
+    }
+
+    static void convertFixedToFloat (float* d, const int* s, float multiplier, int num)
+    {
+        while (--num >= 0)
+            *d++ = (float) *s++ * multiplier;
+    }
+
+    static void convertFloatToFixed (int* d, const float* s, float multiplier, int num)
+    {
+        while (--num >= 0)
+            *d++ = (int) (*s++ * multiplier);
+    }
+
+    template <class ValueType>
+    static void fillRandomly (Random& random, ValueType* d, int num)
+    {
+        while (--num >= 0)
+            *d++ = (ValueType) (random.nextDouble() * 1000.0);
+    }
 };
 
 TEST_F (FloatVectorOperationsTests, BasicOperations)
@@ -247,3 +238,83 @@ TEST_F (FloatVectorOperationsTests, BasicOperations)
         TestRunner<double>::runTest (Random::getSystemRandom());
     }
 }
+
+TEST_F (FloatVectorOperationsTests, FloatToFixedAndBack)
+{
+    Random& random = Random::getSystemRandom();
+
+    for (int i = 1000; --i >= 0;)
+    {
+        const int range = random.nextBool() ? 500 : 10;
+        const int num = random.nextInt (range) + 1;
+
+        HeapBlock<float> buffer1 (num + 16), buffer2 (num + 16);
+        HeapBlock<int> buffer3 (num + 16, true);
+
+#if YUP_ARM
+        float* const data1 = buffer1;
+        float* const data2 = buffer2;
+        int* const int1 = buffer3;
+#else
+        // These tests deliberately operate on misaligned memory and will be flagged up by
+        // checks for undefined behavior!
+        float* const data1 = addBytesToPointer (buffer1.get(), random.nextInt (16));
+        float* const data2 = addBytesToPointer (buffer2.get(), random.nextInt (16));
+        int* const int1 = addBytesToPointer (buffer3.get(), random.nextInt (16));
+#endif
+
+        fillRandomly (random, data1, num);
+        fillRandomly (random, data2, num);
+
+        fillRandomly (random, int1, num);
+        const auto multiplier = (float) (1.0 / (1 << 16));
+
+        convertFixedToFloat (data1, int1, multiplier, num);
+        FloatVectorOperations::convertFixedToFloat (data2, int1, multiplier, num);
+        EXPECT_TRUE (buffersMatch (data1, data2, num));
+
+        convertFloatToFixed (int1, data1, 1.0f / multiplier, num);
+        HeapBlock<int> int2 (num + 16);
+#if YUP_ARM
+        int* const intData = int2;
+#else
+        int* const intData = addBytesToPointer (int2.get(), random.nextInt (16));
+#endif
+        FloatVectorOperations::convertFloatToFixed (intData, data1, 1.0f / multiplier, num);
+
+        for (int i = 0; i < num; ++i)
+            EXPECT_EQ (int1[i], intData[i]);
+    }
+}
+
+TEST_F (FloatVectorOperationsTests, FloatToDoubleAndBack)
+{
+    Random& random = Random::getSystemRandom();
+
+    for (int i = 1000; --i >= 0;)
+    {
+        const int range = random.nextBool() ? 500 : 10;
+        const int num = random.nextInt (range) + 1;
+
+        HeapBlock<float> floatBuffer (num + 16);
+        HeapBlock<double> doubleBuffer (num + 16);
+
+#if YUP_ARM
+        float* const floatData = floatBuffer;
+        double* const doubleData = doubleBuffer;
+#else
+        float* const floatData = addBytesToPointer (floatBuffer.get(), random.nextInt (16));
+        double* const doubleData = addBytesToPointer (doubleBuffer.get(), random.nextInt (16));
+#endif
+
+        fillRandomly (random, floatData, num);
+        FloatVectorOperations::convertFloatToDouble (doubleData, floatData, num);
+        for (int i = 0; i < num; ++i)
+            EXPECT_NEAR ((float) doubleData[i], (float) floatData[i], std::numeric_limits<float>::epsilon());
+
+        fillRandomly (random, doubleData, num);
+        FloatVectorOperations::convertDoubleToFloat (floatData, doubleData, num);
+        for (int i = 0; i < num; ++i)
+            EXPECT_NEAR ((float) floatData[i], (float) doubleData[i], std::numeric_limits<float>::epsilon());
+    }
+}

From ef4f8527540e9586d611129adbf8b4649c466871 Mon Sep 17 00:00:00 2001
From: kunitoki <kunitoki@gmail.com>
Date: Wed, 27 Aug 2025 11:42:19 +0200
Subject: [PATCH 26/37] More fixes

---
 modules/yup_core/text/yup_String.h |  9 ++++-----
 tests/yup_core/yup_String.cpp      | 12 ++++++++++++
 2 files changed, 16 insertions(+), 5 deletions(-)

diff --git a/modules/yup_core/text/yup_String.h b/modules/yup_core/text/yup_String.h
index 60d4110af..4cd5ef796 100644
--- a/modules/yup_core/text/yup_String.h
+++ b/modules/yup_core/text/yup_String.h
@@ -1193,11 +1193,10 @@ class YUP_API String final
             return "0";
         }
 
-        auto numDigitsBeforePoint = (int) std::ceil (std::log10 (number < 0 ? -number : number));
-
-        auto shift = numberOfSignificantFigures - numDigitsBeforePoint;
-        auto factor = std::pow (10.0, shift);
-        auto rounded = std::round (number * factor) / factor;
+        const auto numDigitsBeforePoint = (int) std::floor (std::log10 (std::abs (number)) + DecimalType (1));
+        const auto shift = numberOfSignificantFigures - numDigitsBeforePoint;
+        const auto factor = std::pow (10.0, shift);
+        const auto rounded = std::round (number * factor) / factor;
 
         std::stringstream ss;
         ss << std::fixed << std::setprecision (std::max (shift, 0)) << rounded;
diff --git a/tests/yup_core/yup_String.cpp b/tests/yup_core/yup_String.cpp
index 0aedea73b..bb9dcb64a 100644
--- a/tests/yup_core/yup_String.cpp
+++ b/tests/yup_core/yup_String.cpp
@@ -571,6 +571,18 @@ TEST_F (StringTests, SignificantFigures)
     EXPECT_EQ (String::toDecimalStringWithSignificantFigures (2.8647, 6), String ("2.86470"));
 
     EXPECT_EQ (String::toDecimalStringWithSignificantFigures (-0.0000000000019, 1), String ("-0.000000000002"));
+
+    EXPECT_EQ (String::toDecimalStringWithSignificantFigures (0.001, 7), String ("0.001000000"));
+    EXPECT_EQ (String::toDecimalStringWithSignificantFigures (0.01, 7), String ("0.01000000"));
+    EXPECT_EQ (String::toDecimalStringWithSignificantFigures (0.1, 7), String ("0.1000000"));
+    EXPECT_EQ (String::toDecimalStringWithSignificantFigures (1, 7), String ("1.000000"));
+    EXPECT_EQ (String::toDecimalStringWithSignificantFigures (10, 7), String ("10.00000"));
+    EXPECT_EQ (String::toDecimalStringWithSignificantFigures (100, 7), String ("100.0000"));
+    EXPECT_EQ (String::toDecimalStringWithSignificantFigures (1000, 7), String ("1000.000"));
+    EXPECT_EQ (String::toDecimalStringWithSignificantFigures (10000, 7), String ("10000.00"));
+    EXPECT_EQ (String::toDecimalStringWithSignificantFigures (100000, 7), String ("100000.0"));
+    EXPECT_EQ (String::toDecimalStringWithSignificantFigures (1000000, 7), String ("1000000"));
+    EXPECT_EQ (String::toDecimalStringWithSignificantFigures (10000000, 7), String ("10000000"));
 }
 
 TEST_F (StringTests, FloatTrimming)

From 5c58a34b93a0900594a04631f2ab7d6b816f878d Mon Sep 17 00:00:00 2001
From: Yup Bot <yup-bot@users.noreply.github.com>
Date: Wed, 27 Aug 2025 09:52:44 +0000
Subject: [PATCH 27/37] Code formatting

---
 .../buffers/yup_FloatVectorOperations.cpp              | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/modules/yup_audio_basics/buffers/yup_FloatVectorOperations.cpp b/modules/yup_audio_basics/buffers/yup_FloatVectorOperations.cpp
index 246c17b92..42c601386 100644
--- a/modules/yup_audio_basics/buffers/yup_FloatVectorOperations.cpp
+++ b/modules/yup_audio_basics/buffers/yup_FloatVectorOperations.cpp
@@ -1568,13 +1568,13 @@ void convertDoubleToFloat (float* dest, const double* src, Size num) noexcept
     for (; i + 2 <= num; i += 2)
     {
         __m128d d = _mm_loadu_pd (src + i);
-        __m128 f  = _mm_cvtpd_ps (d);
+        __m128 f = _mm_cvtpd_ps (d);
         _mm_storel_pi ((__m64*) (dest + i), f);
     }
 #endif
 
-   for (; i < num; ++i)
-       dest[i] = (float) src[i];
+    for (; i < num; ++i)
+        dest[i] = (float) src[i];
 #endif
 }
 
@@ -1603,8 +1603,8 @@ void convertFloatToDouble (double* dest, const float* src, Size num) noexcept
     }
 #endif
 
-   for (; i < num; ++i)
-       dest[i] = (double) src[i];
+    for (; i < num; ++i)
+        dest[i] = (double) src[i];
 #endif
 }
 

From dd6ecd79db424d9d5ae5bb51684d8fb3ec57460e Mon Sep 17 00:00:00 2001
From: kunitoki <kunitoki@gmail.com>
Date: Wed, 27 Aug 2025 11:55:30 +0200
Subject: [PATCH 28/37] Fix warning

---
 modules/yup_dsp/convolution/yup_PartitionedConvolver.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/modules/yup_dsp/convolution/yup_PartitionedConvolver.cpp b/modules/yup_dsp/convolution/yup_PartitionedConvolver.cpp
index 9eab19c50..68dfe45d7 100644
--- a/modules/yup_dsp/convolution/yup_PartitionedConvolver.cpp
+++ b/modules/yup_dsp/convolution/yup_PartitionedConvolver.cpp
@@ -235,7 +235,6 @@ class PartitionedConvolver::DirectFIR
 
 #elif YUP_USE_SSE_INTRINSICS
         __m128 vacc = _mm_setzero_ps();
-        std::size_t i = 0;
 #if YUP_USE_FMA_INTRINSICS
         for (; i + 4 <= len; i += 4)
         {

From ca39bd75bddce1406d755036c0824e3504845819 Mon Sep 17 00:00:00 2001
From: kunitoki <kunitoki@gmail.com>
Date: Wed, 27 Aug 2025 11:58:52 +0200
Subject: [PATCH 29/37] Fix SSE3 > SSE2

---
 modules/yup_dsp/convolution/yup_PartitionedConvolver.cpp | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/modules/yup_dsp/convolution/yup_PartitionedConvolver.cpp b/modules/yup_dsp/convolution/yup_PartitionedConvolver.cpp
index 68dfe45d7..e189a5dc0 100644
--- a/modules/yup_dsp/convolution/yup_PartitionedConvolver.cpp
+++ b/modules/yup_dsp/convolution/yup_PartitionedConvolver.cpp
@@ -217,7 +217,6 @@ class PartitionedConvolver::DirectFIR
         std::size_t i = 0;
 
 #if YUP_USE_AVX_INTRINSICS && YUP_USE_FMA_INTRINSICS
-        // 8-wide AVX2 FMA path
         __m256 vacc = _mm256_setzero_ps();
         for (; i + 8 <= len; i += 8)
         {
@@ -225,7 +224,6 @@ class PartitionedConvolver::DirectFIR
             __m256 vb = _mm256_loadu_ps (b + i);
             vacc = _mm256_fmadd_ps (va, vb, vacc);
         }
-        // horizontal add
         __m128 low = _mm256_castps256_ps128 (vacc);
         __m128 high = _mm256_extractf128_ps (vacc, 1);
         __m128 vsum = _mm_add_ps (low, high);
@@ -250,8 +248,7 @@ class PartitionedConvolver::DirectFIR
             vacc = _mm_add_ps (vacc, _mm_mul_ps (va, vb));
         }
 #endif
-        // horizontal add
-        __m128 shuf = _mm_movehdup_ps (vacc);
+        __m128 shuf = _mm_shuffle_ps (vacc, vacc, _MM_SHUFFLE (2, 3, 0, 1));
         __m128 sums = _mm_add_ps (vacc, shuf);
         shuf = _mm_movehl_ps (shuf, sums);
         sums = _mm_add_ss (sums, shuf);

From 1ed1edb50778a40efacb0de8107babb591701c2d Mon Sep 17 00:00:00 2001
From: kunitoki <kunitoki@gmail.com>
Date: Thu, 11 Sep 2025 21:19:16 +0200
Subject: [PATCH 30/37] Still delay running

---
 tests/main.cpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tests/main.cpp b/tests/main.cpp
index 37d5d7637..35d8eae6f 100644
--- a/tests/main.cpp
+++ b/tests/main.cpp
@@ -80,7 +80,10 @@ struct TestApplication : yup::YUPApplication
         else
         {
             // Run suites individually
-            runNextSuite (0);
+            yup::MessageManager::callAsync ([this]
+            {
+                runNextSuite (0);
+            });
         }
     }
 

From 47094537872f8c492d4f7551113639e7e5a8a326 Mon Sep 17 00:00:00 2001
From: kunitoki <kunitoki@gmail.com>
Date: Thu, 11 Sep 2025 22:01:58 +0200
Subject: [PATCH 31/37] More work

---
 .../source/examples/ConvolutionDemo.h         |   8 +-
 .../convolution/yup_PartitionedConvolver.cpp  | 566 ++++++++++--------
 .../convolution/yup_PartitionedConvolver.h    |   1 +
 modules/yup_dsp/yup_dsp.h                     |   1 +
 tests/yup_dsp/yup_PartitionedConvolver.cpp    | 281 +++++++++
 5 files changed, 592 insertions(+), 265 deletions(-)

diff --git a/examples/graphics/source/examples/ConvolutionDemo.h b/examples/graphics/source/examples/ConvolutionDemo.h
index a984b57eb..4253d8973 100644
--- a/examples/graphics/source/examples/ConvolutionDemo.h
+++ b/examples/graphics/source/examples/ConvolutionDemo.h
@@ -60,7 +60,7 @@ class ConvolutionDemo
         dryGain.setCurrentAndTargetValue (0.3f);
 
         // Configure convolver with typical layout
-        convolver.setTypicalLayout (256, {256, 1024, 4096});
+        convolver.setTypicalLayout (256, { 256, 1024, 4096 });
 
         // Create UI
         createUI();
@@ -293,7 +293,9 @@ class ConvolutionDemo
                 impulseResponseData[static_cast<size_t> (i)] = impulseResponseBuffer.getSample (0, i) * normalizationGain;
 
             // Set impulse response in convolver
-            convolver.setImpulseResponse (impulseResponseData);
+            yup::PartitionedConvolver::IRLoadOptions loadOptions;
+            loadOptions.trimEndSilenceBelowDb = -60.0f;
+            convolver.setImpulseResponse (impulseResponseData, loadOptions);
             hasImpulseResponse = true;
 
             std::cout << "Loaded impulse response: " << file.getFileName() << std::endl;
@@ -388,7 +390,7 @@ class ConvolutionDemo
         irWaveformDisplay.setMargins (25, 25, 25, 25);
 
         // Add grid lines
-         irWaveformDisplay.setVerticalGridLines ({ 0.0, 1.0 });
+        irWaveformDisplay.setVerticalGridLines ({ 0.0, 1.0 });
         irWaveformDisplay.setHorizontalGridLines ({ -1.0, -0.5, 0.5, 1.0 });
         irWaveformDisplay.addHorizontalGridLine (0.0, yup::Color (0xFF666666), 1.0f, true);
 
diff --git a/modules/yup_dsp/convolution/yup_PartitionedConvolver.cpp b/modules/yup_dsp/convolution/yup_PartitionedConvolver.cpp
index e189a5dc0..133d27335 100644
--- a/modules/yup_dsp/convolution/yup_PartitionedConvolver.cpp
+++ b/modules/yup_dsp/convolution/yup_PartitionedConvolver.cpp
@@ -46,20 +46,12 @@ static void complexMultiplyAccumulate (const float* A, const float* B, float* Y,
         __m256 b = _mm256_loadu_ps (B + idx);
         __m256 y = _mm256_loadu_ps (Y + idx);
 
-        // a = [ar0 ai0 ar1 ai1 ar2 ai2 ar3 ai3]
-        // b = [br0 bi0 br1 bi1 br2 bi2 br3 bi3]
-
-        // separate real and imag for a and b
         const __m256 a_shuffled = _mm256_permute_ps (a, _MM_SHUFFLE (2, 3, 0, 1));
         const __m256 b_shuffled = _mm256_permute_ps (b, _MM_SHUFFLE (2, 3, 0, 1));
 
-        // real = ar*br - ai*bi
         __m256 realPart = _mm256_fmsub_ps (a, b, _mm256_mul_ps (a_shuffled, b_shuffled));
-
-        // imag = ar*bi + ai*br
         __m256 imagPart = _mm256_fmadd_ps (a, b_shuffled, _mm256_mul_ps (a_shuffled, b));
 
-        // interleave real/imag back
         const __m256 interleaved = _mm256_blend_ps (realPart, imagPart, 0b10101010);
 
         y = _mm256_add_ps (y, interleaved);
@@ -76,17 +68,12 @@ static void complexMultiplyAccumulate (const float* A, const float* B, float* Y,
         __m128 b = _mm_loadu_ps (B + idx);
         __m128 y = _mm_loadu_ps (Y + idx);
 
-        // separate real and imag for a and b
         const __m128 a_shuffled = _mm_shuffle_ps (a, a, _MM_SHUFFLE (2, 3, 0, 1));
         const __m128 b_shuffled = _mm_shuffle_ps (b, b, _MM_SHUFFLE (2, 3, 0, 1));
 
-        // real = ar*br - ai*bi
         __m128 realPart = _mm_sub_ps (_mm_mul_ps (a, b), _mm_mul_ps (a_shuffled, b_shuffled));
-
-        // imag = ar*bi + ai*br
         __m128 imagPart = _mm_add_ps (_mm_mul_ps (a, b_shuffled), _mm_mul_ps (a_shuffled, b));
 
-        // interleave real/imag back
         const __m128 interleaved = _mm_unpacklo_ps (realPart, imagPart);
 
         y = _mm_add_ps (y, interleaved);
@@ -94,38 +81,29 @@ static void complexMultiplyAccumulate (const float* A, const float* B, float* Y,
     }
 
 #elif YUP_USE_ARM_NEON
-    constexpr int simdWidth = 2; // NEON path: process 2 complex pairs (4 floats) at a time
+    constexpr int simdWidth = 4;
     for (; i <= complexPairs - simdWidth; i += simdWidth)
     {
         const int idx = i * 2;
 
-        float32x4_t a = vld1q_f32 (A + idx); // [ar0, ai0, ar1, ai1]
-        float32x4_t b = vld1q_f32 (B + idx); // [br0, bi0, br1, bi1]
-        float32x4_t y = vld1q_f32 (Y + idx);
-
-        // Shuffle a and b to get swapped real/imag for cross-multiplication
-        float32x4_t a_shuf = vrev64q_f32 (a); // [ai0, ar0, ai1, ar1]
-        float32x4_t b_shuf = vrev64q_f32 (b); // [bi0, br0, bi1, br1]
+        float32x4x2_t a = vld2q_f32 (A + idx);
+        float32x4x2_t b = vld2q_f32 (B + idx);
+        float32x4x2_t y = vld2q_f32 (Y + idx);
 
-        // real = ar*br - ai*bi
-        float32x4_t realPart = vsubq_f32 (vmulq_f32 (a, b), vmulq_f32 (a_shuf, b_shuf));
+        float32x4_t ar = a.val[0], ai = a.val[1];
+        float32x4_t br = b.val[0], bi = b.val[1];
+        float32x4_t yr = y.val[0], yi = y.val[1];
 
-        // imag = ar*bi + ai*br
-        float32x4_t imagPart = vaddq_f32 (vmulq_f32 (a, b_shuf), vmulq_f32 (a_shuf, b));
+        float32x4_t real = vmulq_f32 (ar, br);
+        real = vfmsq_f32 (real, ai, bi);
+        float32x4_t imag = vmulq_f32 (ar, bi);
+        imag = vfmaq_f32 (imag, ai, br);
 
-        // Interleave real and imag: [real0, imag0, real1, imag1]
-        float32x2_t realLow = vget_low_f32 (realPart);
-        float32x2_t imagLow = vget_low_f32 (imagPart);
-        float32x2x2_t zippedLow = vzip_f32 (realLow, imagLow);
+        yr = vaddq_f32 (yr, real);
+        yi = vaddq_f32 (yi, imag);
 
-        float32x2_t realHigh = vget_high_f32 (realPart);
-        float32x2_t imagHigh = vget_high_f32 (imagPart);
-        float32x2x2_t zippedHigh = vzip_f32 (realHigh, imagHigh);
-
-        float32x4_t interleaved = vcombine_f32 (zippedLow.val[0], zippedHigh.val[0]);
-
-        y = vaddq_f32 (y, interleaved);
-        vst1q_f32 (Y + idx, y);
+        float32x4x2_t out = { yr, yi };
+        vst2q_f32 (Y + idx, out); // interleave back
     }
 
 #endif
@@ -140,7 +118,6 @@ static void complexMultiplyAccumulate (const float* A, const float* B, float* Y,
         const float br = B[ri];
         const float bi = B[ii];
 
-        // (ar + j*ai) * (br + j*bi) = (ar*br - ai*bi) + j*(ar*bi + ai*br)
         Y[ri] += ar * br - ai * bi;
         Y[ii] += ar * bi + ai * br;
     }
@@ -157,38 +134,38 @@ class PartitionedConvolver::DirectFIR
     {
         FloatVectorOperations::multiply (taps.data(), scaling, taps.size());
 
-        tapsReversed_ = std::move (taps);
-        std::reverse (tapsReversed_.begin(), tapsReversed_.end());
+        tapsReversed = std::move (taps);
+        std::reverse (tapsReversed.begin(), tapsReversed.end());
 
-        numTaps_ = tapsReversed_.size();
-        paddedLen_ = (numTaps_ + 3u) & ~3u;
-        tapsReversed_.resize (paddedLen_, 0.0f);
+        numTaps = tapsReversed.size();
+        paddedLen = (numTaps + 3u) & ~3u;
+        tapsReversed.resize (paddedLen, 0.0f);
 
-        history_.assign (2 * numTaps_, 0.0f);
-        writeIndex_ = 0;
+        history.assign (2 * numTaps, 0.0f);
+        writeIndex = 0;
     }
 
     void reset()
     {
-        std::fill (history_.begin(), history_.end(), 0.0f);
-        writeIndex_ = 0;
+        std::fill (history.begin(), history.end(), 0.0f);
+        writeIndex = 0;
     }
 
     void process (const float* input, float* output, std::size_t numSamples) noexcept
     {
-        const std::size_t M = numTaps_;
+        const std::size_t M = numTaps;
         if (M == 0)
             return;
 
-        const float* h = tapsReversed_.data();
+        const float* h = tapsReversed.data();
         for (std::size_t i = 0; i < numSamples; ++i)
         {
             const float x = input[i];
 
-            history_[writeIndex_] = x;
-            history_[writeIndex_ + M] = x;
+            history[writeIndex] = x;
+            history[writeIndex + M] = x;
 
-            const float* w = history_.data() + writeIndex_ + 1;
+            const float* w = history.data() + writeIndex + 1;
 
             float sum = 0.0f;
 
@@ -200,14 +177,14 @@ class PartitionedConvolver::DirectFIR
 
             output[i] += sum;
 
-            if (++writeIndex_ == M)
-                writeIndex_ = 0;
+            if (++writeIndex == M)
+                writeIndex = 0;
         }
     }
 
     std::size_t getNumTaps() const
     {
-        return numTaps_;
+        return numTaps;
     }
 
 private:
@@ -280,11 +257,11 @@ class PartitionedConvolver::DirectFIR
         return acc;
     }
 
-    std::vector<float> tapsReversed_;
-    std::vector<float> history_;
-    std::size_t numTaps_ = 0;
-    std::size_t paddedLen_ = 0;
-    std::size_t writeIndex_ = 0;
+    std::vector<float> tapsReversed;
+    std::vector<float> history;
+    std::size_t numTaps = 0;
+    std::size_t paddedLen = 0;
+    std::size_t writeIndex = 0;
 };
 
 //==============================================================================
@@ -298,41 +275,41 @@ class PartitionedConvolver::FFTLayer
     FFTLayer (FFTLayer&& other) = default;
     FFTLayer& operator= (FFTLayer&& other) = default;
 
-    void configure (int hopSize)
+    void configure (int newHopSize)
     {
-        hopSize_ = hopSize;
-        fftSize_ = hopSize * 2;
+        hopSize = newHopSize;
+        fftSize = hopSize * 2;
 
-        fftProcessor_.setSize (fftSize_);
-        fftProcessor_.setScaling (FFTProcessor::FFTScaling::asymmetric);
+        fftProcessor.setSize (fftSize);
+        fftProcessor.setScaling (FFTProcessor::FFTScaling::asymmetric);
 
-        overlapBuffer_.assign (static_cast<std::size_t> (hopSize_), 0.0f);
-        timeBuffer_.assign (static_cast<std::size_t> (fftSize_), 0.0f);
-        frequencyBuffer_.assign (static_cast<std::size_t> (fftSize_) * 2, 0.0f);
-        tempBuffer_.assign (static_cast<std::size_t> (fftSize_) * 2, 0.0f); // Must hold complex data for in-place FFT
+        overlapBuffer.assign (static_cast<std::size_t> (hopSize), 0.0f);
+        timeBuffer.assign (static_cast<std::size_t> (fftSize), 0.0f);
+        frequencyBuffer.assign (static_cast<std::size_t> (fftSize) * 2, 0.0f);
+        tempBuffer.assign (static_cast<std::size_t> (fftSize) * 2, 0.0f); // Must hold complex data for in-place FFT
 
-        fdlIndex_ = 0;
-        configured_ = true;
+        fdlIndex = 0;
+        configured = true;
     }
 
-    int getHopSize() const { return hopSize_; }
+    int getHopSize() const { return hopSize; }
 
-    int getFFTSize() const { return fftSize_; }
+    int getFFTSize() const { return fftSize; }
 
-    bool isConfigured() const { return configured_; }
+    bool isConfigured() const { return configured; }
 
     std::size_t setImpulseResponse (const float* impulseResponse, std::size_t length, float scaling)
     {
-        jassert (configured_);
+        jassert (configured);
 
-        if (fftSize_ <= 0 || hopSize_ <= 0)
+        if (fftSize <= 0 || hopSize <= 0)
         {
             resetState();
             return 0;
         }
 
-        frequencyPartitions_.clear();
-        frequencyDelayLine_.clear();
+        frequencyPartitions.clear();
+        frequencyDelayLine.clear();
 
         if (length == 0 || impulseResponse == nullptr)
         {
@@ -340,7 +317,7 @@ class PartitionedConvolver::FFTLayer
             return 0;
         }
 
-        const auto numPartitions = (length + static_cast<std::size_t> (hopSize_) - 1) / static_cast<std::size_t> (hopSize_);
+        const auto numPartitions = (length + static_cast<std::size_t> (hopSize) - 1) / static_cast<std::size_t> (hopSize);
         if (numPartitions == 0)
         {
             resetState();
@@ -348,33 +325,33 @@ class PartitionedConvolver::FFTLayer
         }
 
         std::size_t processedSamples = 0;
-        frequencyPartitions_.reserve (numPartitions);
+        frequencyPartitions.reserve (numPartitions);
 
         for (std::size_t p = 0; p < numPartitions; ++p)
         {
             std::vector<float> partition;
-            partition.resize (static_cast<std::size_t> (fftSize_) * 2);
+            partition.resize (static_cast<std::size_t> (fftSize) * 2);
 
-            std::fill (tempBuffer_.begin(), tempBuffer_.end(), 0.0f);
+            std::fill (tempBuffer.begin(), tempBuffer.end(), 0.0f);
 
-            const std::size_t offset = p * static_cast<std::size_t> (hopSize_);
-            const std::size_t copyCount = std::min (static_cast<std::size_t> (hopSize_), length - offset);
+            const std::size_t offset = p * static_cast<std::size_t> (hopSize);
+            const std::size_t copyCount = std::min (static_cast<std::size_t> (hopSize), length - offset);
 
             if (copyCount > 0 && offset < length)
             {
                 for (std::size_t i = 0; i < copyCount && offset + i < length; ++i)
-                    tempBuffer_[i] = impulseResponse[offset + i] * scaling;
+                    tempBuffer[i] = impulseResponse[offset + i] * scaling;
             }
 
-            fftProcessor_.performRealFFTForward (tempBuffer_.data(), partition.data());
+            fftProcessor.performRealFFTForward (tempBuffer.data(), partition.data());
 
-            frequencyPartitions_.push_back (std::move (partition));
+            frequencyPartitions.push_back (std::move (partition));
 
             processedSamples += copyCount;
         }
 
-        frequencyDelayLine_.assign (numPartitions, std::vector<float> (static_cast<std::size_t> (fftSize_) * 2, 0.0f));
-        fdlIndex_ = 0;
+        frequencyDelayLine.assign (numPartitions, std::vector<float> (static_cast<std::size_t> (fftSize) * 2, 0.0f));
+        fdlIndex = 0;
 
         resetState();
 
@@ -383,82 +360,82 @@ class PartitionedConvolver::FFTLayer
 
     void resetState()
     {
-        fdlIndex_ = 0;
+        fdlIndex = 0;
 
-        for (auto& partition : frequencyDelayLine_)
+        for (auto& partition : frequencyDelayLine)
             std::fill (partition.begin(), partition.end(), 0.0f);
 
-        std::fill (overlapBuffer_.begin(), overlapBuffer_.end(), 0.0f);
-        std::fill (timeBuffer_.begin(), timeBuffer_.end(), 0.0f);
-        std::fill (frequencyBuffer_.begin(), frequencyBuffer_.end(), 0.0f);
+        std::fill (overlapBuffer.begin(), overlapBuffer.end(), 0.0f);
+        std::fill (timeBuffer.begin(), timeBuffer.end(), 0.0f);
+        std::fill (frequencyBuffer.begin(), frequencyBuffer.end(), 0.0f);
     }
 
     void processHop (const float* inputHop, float* outputAccumulator)
     {
-        jassert (configured_);
+        jassert (configured);
 
-        if (frequencyPartitions_.empty())
+        if (frequencyPartitions.empty())
             return;
 
         // 1) Transform current input hop to frequency domain
-        FloatVectorOperations::copy (tempBuffer_.data(), inputHop, hopSize_);
-        fftProcessor_.performRealFFTForward (tempBuffer_.data(), tempBuffer_.data());
+        FloatVectorOperations::copy (tempBuffer.data(), inputHop, hopSize);
+        fftProcessor.performRealFFTForward (tempBuffer.data(), tempBuffer.data());
 
         // 2) Store in frequency delay line (circular buffer) - copy full complex buffer
-        fdlIndex_ = (fdlIndex_ == 0) ? static_cast<int> (frequencyDelayLine_.size()) - 1 : fdlIndex_ - 1;
-        std::copy (tempBuffer_.begin(), tempBuffer_.begin() + (fftSize_ * 2), frequencyDelayLine_[static_cast<std::size_t> (fdlIndex_)].begin());
+        fdlIndex = (fdlIndex == 0) ? static_cast<int> (frequencyDelayLine.size()) - 1 : fdlIndex - 1;
+        std::copy (tempBuffer.begin(), tempBuffer.begin() + (fftSize * 2), frequencyDelayLine[static_cast<std::size_t> (fdlIndex)].begin());
 
         // 3) Frequency domain convolution: Y = sum(X[k-p] * H[p])
-        FloatVectorOperations::clear (frequencyBuffer_.data(), fftSize_ * 2);
+        FloatVectorOperations::clear (frequencyBuffer.data(), fftSize * 2);
 
-        int xIndex = fdlIndex_;
-        for (std::size_t p = 0; p < frequencyPartitions_.size(); ++p)
+        int xIndex = fdlIndex;
+        for (std::size_t p = 0; p < frequencyPartitions.size(); ++p)
         {
-            const float* X = frequencyDelayLine_[static_cast<std::size_t> (xIndex)].data();
-            const float* H = frequencyPartitions_[p].data();
+            const float* X = frequencyDelayLine[static_cast<std::size_t> (xIndex)].data();
+            const float* H = frequencyPartitions[p].data();
 
             // fftSize_/2 gives the number of complex pairs for real FFT
-            complexMultiplyAccumulate (X, H, frequencyBuffer_.data(), fftSize_ / 2);
+            complexMultiplyAccumulate (X, H, frequencyBuffer.data(), fftSize / 2);
 
             // Move to next older spectrum
             xIndex++;
-            if (xIndex >= static_cast<int> (frequencyDelayLine_.size()))
+            if (xIndex >= static_cast<int> (frequencyDelayLine.size()))
                 xIndex = 0;
         }
 
         // 4) Inverse FFT back to time domain
-        fftProcessor_.performRealFFTInverse (frequencyBuffer_.data(), timeBuffer_.data());
+        fftProcessor.performRealFFTInverse (frequencyBuffer.data(), timeBuffer.data());
 
-        // 5) Overlap-Add: output first hopSize_ samples, store last hopSize_ as overlap
-        for (int i = 0; i < hopSize_; ++i)
+        // 5) Overlap-Add: output first hopSize samples, store last hopSize as overlap
+        for (int i = 0; i < hopSize; ++i)
         {
-            outputAccumulator[i] += timeBuffer_[i] + overlapBuffer_[i];
-            overlapBuffer_[i] = timeBuffer_[i + hopSize_];
+            outputAccumulator[i] += timeBuffer[i] + overlapBuffer[i];
+            overlapBuffer[i] = timeBuffer[i + hopSize];
         }
     }
 
-    bool hasImpulseResponse() const { return ! frequencyPartitions_.empty(); }
+    bool hasImpulseResponse() const { return ! frequencyPartitions.empty(); }
 
 private:
-    int hopSize_ = 0;
-    int fftSize_ = 0;
+    int hopSize = 0;
+    int fftSize = 0;
 
-    FFTProcessor fftProcessor_;
+    FFTProcessor fftProcessor;
 
     // IR partitions in frequency domain
-    std::vector<std::vector<float>> frequencyPartitions_;
+    std::vector<std::vector<float>> frequencyPartitions;
 
-    // Frequency Delay Line (most recent at fdlIndex_)
-    std::vector<std::vector<float>> frequencyDelayLine_;
-    int fdlIndex_ = 0;
+    // Frequency Delay Line (most recent at fdlIndex)
+    std::vector<std::vector<float>> frequencyDelayLine;
+    int fdlIndex = 0;
 
     // Processing buffers
-    std::vector<float> overlapBuffer_;
-    std::vector<float> timeBuffer_;
-    std::vector<float> frequencyBuffer_;
-    std::vector<float> tempBuffer_;
+    std::vector<float> overlapBuffer;
+    std::vector<float> timeBuffer;
+    std::vector<float> frequencyBuffer;
+    std::vector<float> tempBuffer;
 
-    bool configured_ = false;
+    bool configured = false;
 };
 
 //==============================================================================
@@ -470,23 +447,23 @@ class PartitionedConvolver::CircularBuffer
 
     void resize (std::size_t size)
     {
-        buffer_.resize (size);
+        buffer.resize (size);
         clear();
     }
 
     void clear()
     {
-        std::fill (buffer_.begin(), buffer_.end(), 0.0f);
-        writeIndex_ = 0;
-        readIndex_ = 0;
-        availableForRead_ = 0;
+        std::fill (buffer.begin(), buffer.end(), 0.0f);
+        writeIndex = 0;
+        readIndex = 0;
+        availableForRead = 0;
     }
 
-    std::size_t getAvailableForRead() const { return availableForRead_; }
+    std::size_t getAvailableForRead() const { return availableForRead; }
 
-    std::size_t getAvailableForWrite() const { return buffer_.size() - availableForRead_; }
+    std::size_t getAvailableForWrite() const { return buffer.size() - availableForRead; }
 
-    std::size_t getSize() const { return buffer_.size(); }
+    std::size_t getSize() const { return buffer.size(); }
 
     void write (const float* data, std::size_t numSamples)
     {
@@ -496,15 +473,15 @@ class PartitionedConvolver::CircularBuffer
         if (numSamples == 0)
             return;
 
-        const std::size_t beforeWrap = std::min (numSamples, buffer_.size() - writeIndex_);
+        const std::size_t beforeWrap = std::min (numSamples, buffer.size() - writeIndex);
         const std::size_t afterWrap = numSamples - beforeWrap;
 
-        std::copy (data, data + beforeWrap, buffer_.begin() + writeIndex_);
+        std::copy (data, data + beforeWrap, buffer.begin() + writeIndex);
         if (afterWrap > 0)
-            std::copy (data + beforeWrap, data + numSamples, buffer_.begin());
+            std::copy (data + beforeWrap, data + numSamples, buffer.begin());
 
-        writeIndex_ = (writeIndex_ + numSamples) % buffer_.size();
-        availableForRead_ += numSamples;
+        writeIndex = (writeIndex + numSamples) % buffer.size();
+        availableForRead += numSamples;
     }
 
     void read (float* data, std::size_t numSamples)
@@ -515,15 +492,15 @@ class PartitionedConvolver::CircularBuffer
         if (numSamples == 0)
             return;
 
-        const std::size_t beforeWrap = std::min (numSamples, buffer_.size() - readIndex_);
+        const std::size_t beforeWrap = std::min (numSamples, buffer.size() - readIndex);
         const std::size_t afterWrap = numSamples - beforeWrap;
 
-        std::copy (buffer_.begin() + readIndex_, buffer_.begin() + readIndex_ + beforeWrap, data);
+        std::copy (buffer.begin() + readIndex, buffer.begin() + readIndex + beforeWrap, data);
         if (afterWrap > 0)
-            std::copy (buffer_.begin(), buffer_.begin() + afterWrap, data + beforeWrap);
+            std::copy (buffer.begin(), buffer.begin() + afterWrap, data + beforeWrap);
 
-        readIndex_ = (readIndex_ + numSamples) % buffer_.size();
-        availableForRead_ -= numSamples;
+        readIndex = (readIndex + numSamples) % buffer.size();
+        availableForRead -= numSamples;
     }
 
     void peek (float* data, std::size_t numSamples, std::size_t offset = 0) const
@@ -534,13 +511,13 @@ class PartitionedConvolver::CircularBuffer
         if (numSamples == 0)
             return;
 
-        const std::size_t startIndex = (readIndex_ + offset) % buffer_.size();
-        const std::size_t beforeWrap = std::min (numSamples, buffer_.size() - startIndex);
+        const std::size_t startIndex = (readIndex + offset) % buffer.size();
+        const std::size_t beforeWrap = std::min (numSamples, buffer.size() - startIndex);
         const std::size_t afterWrap = numSamples - beforeWrap;
 
-        std::copy (buffer_.begin() + startIndex, buffer_.begin() + startIndex + beforeWrap, data);
+        std::copy (buffer.begin() + startIndex, buffer.begin() + startIndex + beforeWrap, data);
         if (afterWrap > 0)
-            std::copy (buffer_.begin(), buffer_.begin() + afterWrap, data + beforeWrap);
+            std::copy (buffer.begin(), buffer.begin() + afterWrap, data + beforeWrap);
     }
 
     void skip (std::size_t numSamples)
@@ -548,15 +525,15 @@ class PartitionedConvolver::CircularBuffer
         jassert (numSamples <= getAvailableForRead());
         numSamples = std::min (numSamples, getAvailableForRead());
 
-        readIndex_ = (readIndex_ + numSamples) % buffer_.size();
-        availableForRead_ -= numSamples;
+        readIndex = (readIndex + numSamples) % buffer.size();
+        availableForRead -= numSamples;
     }
 
 private:
-    std::vector<float> buffer_;
-    std::size_t writeIndex_ = 0;
-    std::size_t readIndex_ = 0;
-    std::size_t availableForRead_ = 0;
+    std::vector<float> buffer;
+    std::size_t writeIndex = 0;
+    std::size_t readIndex = 0;
+    std::size_t availableForRead = 0;
 };
 
 //==============================================================================
@@ -567,96 +544,156 @@ class PartitionedConvolver::Impl
     Impl() = default;
     ~Impl() = default;
 
-    void configureLayers (std::size_t directFIRTaps, const std::vector<LayerSpec>& layers)
+    void configureLayers (std::size_t directFIRTaps, const std::vector<LayerSpec>& newLayers)
     {
-        directFIRTapCount_ = directFIRTaps;
+        directFIRTapCount = directFIRTaps;
 
-        layers_.clear();
-        layers_.resize (layers.size());
+        layers.clear();
+        layers.resize (newLayers.size());
 
         std::size_t maximumHopSize = 0;
 
-        baseHopSize_ = layers.empty() ? 0 : layers.front().hopSize;
-        for (std::size_t i = 0; i < layers.size(); ++i)
+        baseHopSize = newLayers.empty() ? 0 : newLayers.front().hopSize;
+        for (std::size_t i = 0; i < newLayers.size(); ++i)
         {
-            layers_[i].configure (layers[i].hopSize);
+            layers[i].configure (newLayers[i].hopSize);
             if (i == 0)
-                baseHopSize_ = layers[i].hopSize;
+                baseHopSize = newLayers[i].hopSize;
             else
-                baseHopSize_ = std::min (baseHopSize_, layers[i].hopSize);
+                baseHopSize = std::min (baseHopSize, newLayers[i].hopSize);
 
-            maximumHopSize = std::max (maximumHopSize, static_cast<std::size_t> (layers[i].hopSize));
+            maximumHopSize = std::max (maximumHopSize, static_cast<std::size_t> (newLayers[i].hopSize));
         }
 
-        maxHopSize_ = maximumHopSize;
+        maxHopSize = maximumHopSize;
 
         // Clear staging buffers - will be allocated in prepare()
-        inputStaging_.clear();
-        outputStaging_.clear();
+        inputStaging.clear();
+        outputStaging.clear();
 
         // Resize per-layer circular buffers - will be allocated in prepare()
-        layerInputBuffers_.resize (layers.size());
-        layerOutputBuffers_.resize (layers.size());
+        layerInputBuffers.resize (layers.size());
+        layerOutputBuffers.resize (layers.size());
 
-        layerTempOutput_.clear();
-        tempLayerHop_.clear();
+        layerTempOutput.clear();
+        tempLayerHop.clear();
 
         // Clear working buffers - will be allocated in prepare()
-        workingOutput_.clear();
+        workingOutput.clear();
 
-        isPrepared_ = false;
+        isPrepared = false;
     }
 
     void prepare (std::size_t maxBlockSize)
     {
-        maxBlockSize_ = maxBlockSize;
+        this->maxBlockSize = maxBlockSize;
 
         // Prepare main input staging - needs to accumulate up to baseHopSize samples plus incoming block
-        const std::size_t inputStagingSize = static_cast<std::size_t> (baseHopSize_) + maxBlockSize;
-        inputStaging_.resize (inputStagingSize);
-        outputStaging_.assign (static_cast<std::size_t> (baseHopSize_), 0.0f);
+        const std::size_t inputStagingSize = static_cast<std::size_t> (baseHopSize) + maxBlockSize;
+        inputStaging.resize (inputStagingSize);
+        outputStaging.assign (static_cast<std::size_t> (baseHopSize), 0.0f);
 
         // Prepare per-layer circular buffers with layer-specific sizing
-        for (std::size_t i = 0; i < layerInputBuffers_.size(); ++i)
+        for (std::size_t i = 0; i < layerInputBuffers.size(); ++i)
         {
-            const std::size_t layerHopSize = static_cast<std::size_t> (layers_[i].getHopSize());
+            const std::size_t layerHopSize = static_cast<std::size_t> (layers[i].getHopSize());
 
             // Input buffer: needs to accumulate up to layerHopSize samples plus incoming block
             const std::size_t layerInputBufferSize = layerHopSize + maxBlockSize;
-            layerInputBuffers_[i].resize (layerInputBufferSize);
+            layerInputBuffers[i].resize (layerInputBufferSize);
 
             // Output buffer: needs to handle bursts of layerHopSize samples
             // Size it to handle multiple hops since read rate (baseHopSize) may be much smaller than write rate (layerHopSize)
-            const std::size_t layerOutputBufferSize = layerHopSize * ((layerHopSize / static_cast<std::size_t> (baseHopSize_)) + 2);
-            layerOutputBuffers_[i].resize (layerOutputBufferSize);
+            const std::size_t layerOutputBufferSize = layerHopSize * ((layerHopSize / static_cast<std::size_t> (baseHopSize)) + 2);
+            layerOutputBuffers[i].resize (layerOutputBufferSize);
         }
 
         // Allocate temp buffers
-        if (maxHopSize_ > 0)
+        if (maxHopSize > 0)
         {
-            layerTempOutput_.resize (maxHopSize_);
-            tempLayerHop_.resize (maxHopSize_);
+            layerTempOutput.resize (maxHopSize);
+            tempLayerHop.resize (maxHopSize);
         }
 
         // Allocate working buffers
-        workingOutput_.resize (maxBlockSize);
+        workingOutput.resize (maxBlockSize);
+
+        isPrepared = true;
+    }
+
+    /** Finds how many leading samples of an impulse response to keep when trimming trailing silence.
+
+        Whole windows are scanned backwards from the end until one has an RMS of at least thresholdDb.
+        Returns 0 for a null or empty IR, otherwise a count in [1, length] that is safe to read.
+    */
+    std::size_t trimSilenceFromEnd (const float* impulseResponse, std::size_t length, float thresholdDb)
+    {
+        if (impulseResponse == nullptr || length == 0)
+            return 0;
+
+        const float threshold = std::pow (10.0f, thresholdDb / 20.0f);
+
+        // For short IRs, use smaller window size and be more conservative
+        const std::size_t minRetainLength = std::max (std::size_t (32), length / 4);
+        const std::size_t windowSize = std::min (std::size_t (1024), std::max (std::size_t (64), length / 20));
+
+        // First pass: scan whole windows from the end; the first windowSize samples are not scanned here
+        std::size_t significantContentEnd = 0;
+        for (std::size_t i = length; i > windowSize; i -= windowSize)
+        {
+            const std::size_t startIdx = i - windowSize;
+
+            float rmsSquared = 0.0f;
+            for (std::size_t j = startIdx; j < i; ++j)
+                rmsSquared += impulseResponse[j] * impulseResponse[j];
+
+            const float rms = std::sqrt (rmsSquared / static_cast<float> (windowSize));
+            if (rms >= threshold)
+            {
+                significantContentEnd = i;
+                break;
+            }
+        }
 
-        isPrepared_ = true;
+        // If no significant content found, check the beginning more carefully
+        if (significantContentEnd == 0)
+        {
+            const std::size_t checkLength = std::min (minRetainLength, length);
+            float rmsSquared = 0.0f;
+            for (std::size_t j = 0; j < checkLength; ++j)
+                rmsSquared += impulseResponse[j] * impulseResponse[j];
+
+            const float rms = std::sqrt (rmsSquared / static_cast<float> (checkLength));
+            if (rms < threshold)
+                return 1;
+        }
+
+        // Keep at least a minimum number of samples (stricter for short IRs), but never report more
+        // samples than the IR holds: minRetainLength is at least 32 and can exceed a very short length
+        const std::size_t minimumLength = (length <= 200) ? minRetainLength : windowSize;
+
+        return std::min (length, std::max (significantContentEnd, minimumLength));
     }
 
     void setImpulseResponse (const float* impulseResponse, std::size_t length, const PartitionedConvolver::IRLoadOptions& options)
     {
         DirectFIR newFIR;
-        std::vector<FFTLayer> newLayers (layers_.size());
+        std::vector<FFTLayer> newLayers (layers.size());
+
+        std::size_t trimmedLength = length;
 
         // Safety check
-        if (impulseResponse != nullptr && length > 0)
+        if (impulseResponse != nullptr && trimmedLength > 0)
         {
+            // Trim end silence if requested
+            if (options.trimEndSilenceBelowDb)
+                trimmedLength = trimSilenceFromEnd (impulseResponse, length, *options.trimEndSilenceBelowDb);
+
             // Always apply peak headroom
             float headroomScale = std::pow (10.0f, options.headroomDb / 20.0f);
             if (options.normalize)
             {
-                const auto minMax = FloatVectorOperations::findMinAndMax (impulseResponse, length);
+                const auto minMax = FloatVectorOperations::findMinAndMax (impulseResponse, trimmedLength);
 
                 const float peak = std::max (std::abs (minMax.getStart()), std::abs (minMax.getEnd()));
                 if (peak > 0.0f)
@@ -666,7 +703,7 @@ class PartitionedConvolver::Impl
             // Update DirectFIR in-place
             std::vector<float> directTaps;
 
-            const auto directTapsCount = std::min (directFIRTapCount_, length);
+            const auto directTapsCount = std::min (directFIRTapCount, trimmedLength);
             if (directTapsCount > 0)
             {
                 directTaps.reserve (directTapsCount);
@@ -680,9 +717,9 @@ class PartitionedConvolver::Impl
             for (std::size_t i = 0; i < newLayers.size(); ++i)
             {
                 auto& layer = newLayers[i];
-                layer.configure (layers_[i].getHopSize());
+                layer.configure (layers[i].getHopSize());
 
-                const std::size_t remaining = (consumed < length) ? (length - consumed) : 0;
+                const std::size_t remaining = (consumed < trimmedLength) ? (trimmedLength - consumed) : 0;
                 if (remaining == 0)
                 {
                     layer.setImpulseResponse (nullptr, 0, headroomScale);
@@ -694,10 +731,10 @@ class PartitionedConvolver::Impl
         }
 
         {
-            SpinLock::ScopedLockType lock (processingLock_);
+            SpinLock::ScopedLockType lock (processingLock);
 
-            directFIR_ = std::move (newFIR);
-            layers_ = std::move (newLayers);
+            directFIR = std::move (newFIR);
+            layers = std::move (newLayers);
 
             resetStateUnsafe();
         }
@@ -705,7 +742,7 @@ class PartitionedConvolver::Impl
 
     void reset()
     {
-        SpinLock::ScopedLockType lock (processingLock_);
+        SpinLock::ScopedLockType lock (processingLock);
 
         resetStateUnsafe();
     }
@@ -715,7 +752,7 @@ class PartitionedConvolver::Impl
         if (numSamples == 0)
             return;
 
-        SpinLock::ScopedLockType lock (processingLock_);
+        SpinLock::ScopedLockType lock (processingLock);
 
         processUnsafe (input, output, numSamples);
     }
@@ -723,36 +760,36 @@ class PartitionedConvolver::Impl
 private:
     void resetStateUnsafe()
     {
-        directFIR_.reset();
-        inputStagingReadIndex_ = 0;
-        inputStagingWriteIndex_ = 0;
-        inputStagingAvailable_ = 0;
-        std::fill (outputStaging_.begin(), outputStaging_.end(), 0.0f);
+        directFIR.reset();
+        inputStagingReadIndex = 0;
+        inputStagingWriteIndex = 0;
+        inputStagingAvailable = 0;
+        std::fill (outputStaging.begin(), outputStaging.end(), 0.0f);
 
-        for (auto& buffer : layerInputBuffers_)
+        for (auto& buffer : layerInputBuffers)
             buffer.clear();
 
-        for (auto& buffer : layerOutputBuffers_)
+        for (auto& buffer : layerOutputBuffers)
             buffer.clear();
 
-        for (auto& layer : layers_)
+        for (auto& layer : layers)
             layer.resetState();
     }
 
     void processUnsafe (const float* input, float* output, std::size_t numSamples)
     {
-        jassert (isPrepared_);
-        jassert (numSamples <= maxBlockSize_);
-        if (! isPrepared_ || numSamples > maxBlockSize_)
+        jassert (isPrepared);
+        jassert (numSamples <= maxBlockSize);
+        if (! isPrepared || numSamples > maxBlockSize)
             return;
 
-        FloatVectorOperations::clear (workingOutput_.data(), numSamples);
+        FloatVectorOperations::clear (workingOutput.data(), numSamples);
 
         // Process direct FIR (no block size constraints)
-        directFIR_.process (input, workingOutput_.data(), numSamples);
-        if (layers_.empty())
+        directFIR.process (input, workingOutput.data(), numSamples);
+        if (layers.empty())
         {
-            FloatVectorOperations::add (output, workingOutput_.data(), numSamples);
+            FloatVectorOperations::add (output, workingOutput.data(), numSamples);
             return;
         }
 
@@ -760,125 +797,125 @@ class PartitionedConvolver::Impl
         writeToInputStaging (input, numSamples);
 
         std::size_t outputSamplesProduced = 0;
-        while (getInputStagingAvailable() >= static_cast<std::size_t> (baseHopSize_))
+        while (getInputStagingAvailable() >= static_cast<std::size_t> (baseHopSize))
         {
-            const std::size_t hopSize = static_cast<std::size_t> (baseHopSize_);
+            const std::size_t hopSize = static_cast<std::size_t> (baseHopSize);
 
             // Read hop from input staging
-            readFromInputStaging (tempLayerHop_.data(), hopSize);
-            FloatVectorOperations::clear (outputStaging_.data(), outputStaging_.size());
+            readFromInputStaging (tempLayerHop.data(), hopSize);
+            FloatVectorOperations::clear (outputStaging.data(), outputStaging.size());
 
-            for (std::size_t layerIndex = 0; layerIndex < layers_.size(); ++layerIndex)
+            for (std::size_t layerIndex = 0; layerIndex < layers.size(); ++layerIndex)
             {
-                auto& layer = layers_[layerIndex];
+                auto& layer = layers[layerIndex];
                 if (! layer.hasImpulseResponse())
                     continue;
 
                 const int layerHopSize = layer.getHopSize();
-                auto& inputBuffer = layerInputBuffers_[layerIndex];
-                auto& outputBuffer = layerOutputBuffers_[layerIndex];
+                auto& inputBuffer = layerInputBuffers[layerIndex];
+                auto& outputBuffer = layerOutputBuffers[layerIndex];
 
                 // Write input hop to layer's input buffer
-                inputBuffer.write (tempLayerHop_.data(), hopSize);
+                inputBuffer.write (tempLayerHop.data(), hopSize);
 
                 // Process complete layer hops
                 while (inputBuffer.getAvailableForRead() >= static_cast<std::size_t> (layerHopSize))
                 {
                     // Read a full hop for this layer
-                    inputBuffer.read (tempLayerHop_.data(), static_cast<std::size_t> (layerHopSize));
-                    FloatVectorOperations::clear (layerTempOutput_.data(), layerHopSize);
+                    inputBuffer.read (tempLayerHop.data(), static_cast<std::size_t> (layerHopSize));
+                    FloatVectorOperations::clear (layerTempOutput.data(), layerHopSize);
 
                     // Process hop
-                    layer.processHop (tempLayerHop_.data(), layerTempOutput_.data());
+                    layer.processHop (tempLayerHop.data(), layerTempOutput.data());
 
                     // Write output to layer's output buffer
-                    outputBuffer.write (layerTempOutput_.data(), static_cast<std::size_t> (layerHopSize));
+                    outputBuffer.write (layerTempOutput.data(), static_cast<std::size_t> (layerHopSize));
                 }
 
                 // Mix available output from this layer
                 if (outputBuffer.getAvailableForRead() >= hopSize)
                 {
-                    outputBuffer.read (layerTempOutput_.data(), hopSize);
-                    FloatVectorOperations::add (outputStaging_.data(), layerTempOutput_.data(), hopSize);
+                    outputBuffer.read (layerTempOutput.data(), hopSize);
+                    FloatVectorOperations::add (outputStaging.data(), layerTempOutput.data(), hopSize);
                 }
             }
 
             // Add staging output to main output
             const std::size_t samplesToWrite = std::min (hopSize, numSamples - outputSamplesProduced);
-            FloatVectorOperations::add (workingOutput_.data() + outputSamplesProduced, outputStaging_.data(), samplesToWrite);
+            FloatVectorOperations::add (workingOutput.data() + outputSamplesProduced, outputStaging.data(), samplesToWrite);
             outputSamplesProduced += samplesToWrite;
         }
 
         // Copy final result to output (accumulate)
-        FloatVectorOperations::add (output, workingOutput_.data(), numSamples);
+        FloatVectorOperations::add (output, workingOutput.data(), numSamples);
     }
 
 private:
     void writeToInputStaging (const float* data, std::size_t numSamples)
     {
-        const std::size_t available = inputStaging_.size() - inputStagingAvailable_;
+        const std::size_t available = inputStaging.size() - inputStagingAvailable;
         jassert (numSamples <= available);
         numSamples = std::min (numSamples, available);
         if (numSamples == 0)
             return;
 
-        const std::size_t beforeWrap = std::min (numSamples, inputStaging_.size() - inputStagingWriteIndex_);
+        const std::size_t beforeWrap = std::min (numSamples, inputStaging.size() - inputStagingWriteIndex);
         const std::size_t afterWrap = numSamples - beforeWrap;
 
-        std::copy (data, data + beforeWrap, inputStaging_.begin() + inputStagingWriteIndex_);
+        std::copy (data, data + beforeWrap, inputStaging.begin() + inputStagingWriteIndex);
         if (afterWrap > 0)
-            std::copy (data + beforeWrap, data + numSamples, inputStaging_.begin());
+            std::copy (data + beforeWrap, data + numSamples, inputStaging.begin());
 
-        inputStagingWriteIndex_ = (inputStagingWriteIndex_ + numSamples) % inputStaging_.size();
-        inputStagingAvailable_ += numSamples;
+        inputStagingWriteIndex = (inputStagingWriteIndex + numSamples) % inputStaging.size();
+        inputStagingAvailable += numSamples;
     }
 
     void readFromInputStaging (float* data, std::size_t numSamples)
     {
-        jassert (numSamples <= inputStagingAvailable_);
-        numSamples = std::min (numSamples, inputStagingAvailable_);
+        jassert (numSamples <= inputStagingAvailable);
+        numSamples = std::min (numSamples, inputStagingAvailable);
         if (numSamples == 0)
             return;
 
-        const std::size_t beforeWrap = std::min (numSamples, inputStaging_.size() - inputStagingReadIndex_);
+        const std::size_t beforeWrap = std::min (numSamples, inputStaging.size() - inputStagingReadIndex);
         const std::size_t afterWrap = numSamples - beforeWrap;
 
-        std::copy (inputStaging_.begin() + inputStagingReadIndex_, inputStaging_.begin() + inputStagingReadIndex_ + beforeWrap, data);
+        std::copy (inputStaging.begin() + inputStagingReadIndex, inputStaging.begin() + inputStagingReadIndex + beforeWrap, data);
         if (afterWrap > 0)
-            std::copy (inputStaging_.begin(), inputStaging_.begin() + afterWrap, data + beforeWrap);
+            std::copy (inputStaging.begin(), inputStaging.begin() + afterWrap, data + beforeWrap);
 
-        inputStagingReadIndex_ = (inputStagingReadIndex_ + numSamples) % inputStaging_.size();
-        inputStagingAvailable_ -= numSamples;
+        inputStagingReadIndex = (inputStagingReadIndex + numSamples) % inputStaging.size();
+        inputStagingAvailable -= numSamples;
     }
 
-    std::size_t getInputStagingAvailable() const { return inputStagingAvailable_; }
+    std::size_t getInputStagingAvailable() const { return inputStagingAvailable; }
 
-    std::size_t directFIRTapCount_ = 0;
-    int baseHopSize_ = 0;
-    std::size_t maxHopSize_ = 0;
-    std::size_t maxBlockSize_ = 0;
-    bool isPrepared_ = false;
+    std::size_t directFIRTapCount = 0;
+    int baseHopSize = 0;
+    std::size_t maxHopSize = 0;
+    std::size_t maxBlockSize = 0;
+    bool isPrepared = false;
 
-    DirectFIR directFIR_;
-    std::vector<FFTLayer> layers_;
+    DirectFIR directFIR;
+    std::vector<FFTLayer> layers;
 
     // Working buffers
-    std::vector<float> workingOutput_;
+    std::vector<float> workingOutput;
 
     // Input staging with circular buffer management
-    std::vector<float> inputStaging_;
-    std::size_t inputStagingReadIndex_ = 0;
-    std::size_t inputStagingWriteIndex_ = 0;
-    std::size_t inputStagingAvailable_ = 0;
-    std::vector<float> outputStaging_;
+    std::vector<float> inputStaging;
+    std::size_t inputStagingReadIndex = 0;
+    std::size_t inputStagingWriteIndex = 0;
+    std::size_t inputStagingAvailable = 0;
+    std::vector<float> outputStaging;
 
     // Per-layer circular buffering
-    std::vector<CircularBuffer> layerInputBuffers_;
-    std::vector<CircularBuffer> layerOutputBuffers_;
-    std::vector<float> tempLayerHop_;
-    std::vector<float> layerTempOutput_;
+    std::vector<CircularBuffer> layerInputBuffers;
+    std::vector<CircularBuffer> layerOutputBuffers;
+    std::vector<float> tempLayerHop;
+    std::vector<float> layerTempOutput;
 
-    mutable SpinLock processingLock_;
+    mutable SpinLock processingLock;
 };
 
 //==============================================================================
@@ -915,7 +952,12 @@ void PartitionedConvolver::setTypicalLayout (std::size_t directTaps, const std::
     layerSpecs.reserve (hops.size());
 
     for (int hop : hops)
-        layerSpecs.push_back ({ hop });
+    {
+        if (hop < 64)
+            directTaps += static_cast<std::size_t> (hop);
+        else
+            layerSpecs.push_back ({ nextPowerOfTwo (hop) });
+    }
 
     configureLayers (directTaps, layerSpecs);
 }
diff --git a/modules/yup_dsp/convolution/yup_PartitionedConvolver.h b/modules/yup_dsp/convolution/yup_PartitionedConvolver.h
index 78f3cb176..b73f98fa8 100644
--- a/modules/yup_dsp/convolution/yup_PartitionedConvolver.h
+++ b/modules/yup_dsp/convolution/yup_PartitionedConvolver.h
@@ -109,6 +109,7 @@ class PartitionedConvolver
 
         bool normalize;
         float headroomDb;
+        std::optional<float> trimEndSilenceBelowDb;
     };
 
     /**
diff --git a/modules/yup_dsp/yup_dsp.h b/modules/yup_dsp/yup_dsp.h
index c89002c13..c42e03212 100644
--- a/modules/yup_dsp/yup_dsp.h
+++ b/modules/yup_dsp/yup_dsp.h
@@ -97,6 +97,7 @@
 #include <cmath>
 #include <complex>
 #include <memory>
+#include <optional>
 #include <vector>
 
 //==============================================================================
diff --git a/tests/yup_dsp/yup_PartitionedConvolver.cpp b/tests/yup_dsp/yup_PartitionedConvolver.cpp
index c1ea2985f..b8a4f09b7 100644
--- a/tests/yup_dsp/yup_PartitionedConvolver.cpp
+++ b/tests/yup_dsp/yup_PartitionedConvolver.cpp
@@ -998,4 +998,285 @@ TEST_F (PartitionedConvolverTest, ResetFunctionality)
     }
 }
 
+//==============================================================================
+// IR Trimming Tests
+//==============================================================================
+
+TEST_F (PartitionedConvolverTest, IRTrimmingBasicFunctionality)
+{
+    PartitionedConvolver convolver;
+    convolver.setTypicalLayout (64, { 64, 256 });
+    convolver.prepare (512);
+
+    // Create IR with significant content at start and silence at end
+    const size_t originalLength = 2000;
+    const size_t significantLength = 800;
+    std::vector<float> ir (originalLength, 0.0f);
+
+    // Fill first part with meaningful signal
+    for (size_t i = 0; i < significantLength; ++i)
+    {
+        ir[i] = std::exp (-static_cast<float> (i) / 100.0f) * std::sin (2.0f * MathConstants<float>::pi * i / 32.0f);
+    }
+
+    // Add very quiet noise at the end (below -60dB)
+    for (size_t i = significantLength; i < originalLength; ++i)
+    {
+        ir[i] = randomFloat (-0.001f, 0.001f); // ~ -60dB
+    }
+
+    // Test without trimming
+    convolver.setImpulseResponse (ir);
+    std::vector<float> input (512, 0.0f);
+    input[0] = 1.0f;
+    std::vector<float> outputWithoutTrim (512, 0.0f);
+    convolver.process (input.data(), outputWithoutTrim.data(), input.size());
+    convolver.reset();
+
+    // Test with trimming at -50dB threshold
+    PartitionedConvolver::IRLoadOptions options;
+    options.trimEndSilenceBelowDb = -50.0f;
+    convolver.setImpulseResponse (ir, options);
+
+    std::vector<float> outputWithTrim (512, 0.0f);
+    convolver.process (input.data(), outputWithTrim.data(), input.size());
+
+    // Both should produce similar output in the early samples
+    float correlationSum = 0.0f;
+    float norm1 = 0.0f, norm2 = 0.0f;
+
+    for (size_t i = 0; i < 200; ++i) // Compare first 200 samples
+    {
+        correlationSum += outputWithoutTrim[i] * outputWithTrim[i];
+        norm1 += outputWithoutTrim[i] * outputWithoutTrim[i];
+        norm2 += outputWithTrim[i] * outputWithTrim[i];
+    }
+
+    if (norm1 > 0.0f && norm2 > 0.0f)
+    {
+        float correlation = correlationSum / std::sqrt (norm1 * norm2);
+        EXPECT_GT (correlation, 0.95f) << "Trimmed and untrimmed outputs should be highly correlated in early samples";
+    }
+}
+
+TEST_F (PartitionedConvolverTest, IRTrimmingWithDifferentThresholds)
+{
+    PartitionedConvolver convolver;
+    convolver.setTypicalLayout (64, { 64, 256 });
+    convolver.prepare (512);
+
+    // Create IR with exponentially decaying tail
+    const size_t originalLength = 2000;
+    std::vector<float> ir (originalLength);
+
+    for (size_t i = 0; i < originalLength; ++i)
+    {
+        float decay = std::exp (-static_cast<float> (i) / 200.0f);
+        ir[i] = decay * std::sin (2.0f * MathConstants<float>::pi * i / 16.0f);
+    }
+
+    std::vector<float> thresholds = { -20.0f, -40.0f, -60.0f, -80.0f };
+    std::vector<float> outputEnergies;
+
+    for (float threshold : thresholds)
+    {
+        PartitionedConvolver::IRLoadOptions options;
+        options.trimEndSilenceBelowDb = threshold;
+        convolver.setImpulseResponse (ir, options);
+
+        std::vector<float> input (512, 0.0f);
+        input[0] = 1.0f;
+        std::vector<float> output (512, 0.0f);
+        convolver.process (input.data(), output.data(), input.size());
+
+        float energy = 0.0f;
+        for (float sample : output)
+            energy += sample * sample;
+
+        outputEnergies.push_back (energy);
+        convolver.reset();
+    }
+
+    // Energy at each lower threshold (-20 -> -80 dB) may exceed the previous one by at most 10%
+    for (size_t i = 1; i < outputEnergies.size(); ++i)
+    {
+        EXPECT_LE (outputEnergies[i], outputEnergies[i - 1] * 1.1f)
+            << "More aggressive trimming threshold should not significantly increase output energy";
+    }
+}
+
+TEST_F (PartitionedConvolverTest, IRTrimmingVeryShortIR)
+{
+    PartitionedConvolver convolver;
+    convolver.setTypicalLayout (64, { 64, 256 });
+    convolver.prepare (512);
+
+    // Very short IR that shouldn't be trimmed much
+    std::vector<float> shortIR (100);
+    for (size_t i = 0; i < shortIR.size(); ++i)
+    {
+        shortIR[i] = std::sin (2.0f * MathConstants<float>::pi * i / 8.0f);
+    }
+
+    PartitionedConvolver::IRLoadOptions options;
+    options.trimEndSilenceBelowDb = -40.0f;
+
+    // Should not crash or produce errors with short IR
+    EXPECT_NO_THROW (convolver.setImpulseResponse (shortIR, options));
+
+    std::vector<float> input (512, 0.0f);
+    input[0] = 1.0f;
+    std::vector<float> output (512, 0.0f);
+
+    EXPECT_NO_THROW (convolver.process (input.data(), output.data(), input.size()));
+
+    // Should still produce meaningful output
+    float outputRMS = calculateRMS (output);
+    EXPECT_GT (outputRMS, 0.01f);
+}
+
+TEST_F (PartitionedConvolverTest, IRTrimmingAllSilence)
+{
+    PartitionedConvolver convolver;
+    convolver.setTypicalLayout (64, { 64, 256 });
+    convolver.prepare (512);
+
+    // IR with only very quiet content
+    std::vector<float> quietIR (1000);
+    for (size_t i = 0; i < quietIR.size(); ++i)
+    {
+        quietIR[i] = randomFloat (-0.0001f, 0.0001f); // Very quiet, ~ -80dB
+    }
+
+    PartitionedConvolver::IRLoadOptions options;
+    options.normalize = false;              // Don't normalize the quiet IR
+    options.trimEndSilenceBelowDb = -60.0f; // Should trim most/all of it
+
+    EXPECT_NO_THROW (convolver.setImpulseResponse (quietIR, options));
+
+    std::vector<float> input (512);
+    fillWithRandomData (input);
+    std::vector<float> output (512, 0.0f);
+
+    EXPECT_NO_THROW (convolver.process (input.data(), output.data(), input.size()));
+
+    // Output should be very quiet or silent
+    float outputRMS = calculateRMS (output);
+    EXPECT_LT (outputRMS, 0.001f); // Should be very quiet with normalization disabled and aggressive trimming
+}
+
+TEST_F (PartitionedConvolverTest, IRTrimmingWithNormalization)
+{
+    PartitionedConvolver convolver;
+    convolver.setTypicalLayout (64, { 64, 256 });
+    convolver.prepare (512);
+
+    // Create IR with large peak but quiet tail
+    std::vector<float> ir (1500);
+    for (size_t i = 0; i < ir.size(); ++i)
+    {
+        if (i < 100)
+            ir[i] = 2.0f * std::exp (-static_cast<float> (i) / 50.0f); // Large peak
+        else
+            ir[i] = 0.01f * randomFloat (-0.1f, 0.1f); // Quiet tail
+    }
+
+    PartitionedConvolver::IRLoadOptions options;
+    options.normalize = true;
+    options.headroomDb = -6.0f;
+    options.trimEndSilenceBelowDb = -50.0f;
+
+    EXPECT_NO_THROW (convolver.setImpulseResponse (ir, options));
+
+    std::vector<float> input (512, 0.0f);
+    input[0] = 1.0f;
+    std::vector<float> output (512, 0.0f);
+
+    EXPECT_NO_THROW (convolver.process (input.data(), output.data(), input.size()));
+
+    // Should produce reasonable output levels due to normalization
+    float outputPeak = findPeak (output);
+    EXPECT_GT (outputPeak, 0.1f);
+    EXPECT_LT (outputPeak, 1.0f); // Should be limited by headroom
+}
+
+TEST_F (PartitionedConvolverTest, IRTrimmingExactBoundary)
+{
+    PartitionedConvolver convolver;
+    convolver.setTypicalLayout (64, { 64, 256 });
+    convolver.prepare (512);
+
+    // Create IR whose tail sits just below the trimming threshold
+    const size_t significantLength = 1000;
+    const size_t totalLength = 1500;
+    std::vector<float> ir (totalLength, 0.0f);
+
+    // Significant content
+    for (size_t i = 0; i < significantLength; ++i)
+    {
+        ir[i] = std::exp (-static_cast<float> (i) / 200.0f);
+    }
+
+    // Tail content at 90% of the threshold level (-50dB = 0.00316)
+    const float thresholdLevel = std::pow (10.0f, -50.0f / 20.0f);
+    for (size_t i = significantLength; i < totalLength; ++i)
+    {
+        ir[i] = thresholdLevel * 0.9f; // Slightly below threshold
+    }
+
+    PartitionedConvolver::IRLoadOptions options;
+    options.trimEndSilenceBelowDb = -50.0f;
+
+    EXPECT_NO_THROW (convolver.setImpulseResponse (ir, options));
+
+    std::vector<float> input (512, 0.0f);
+    input[0] = 1.0f;
+    std::vector<float> output (512, 0.0f);
+
+    EXPECT_NO_THROW (convolver.process (input.data(), output.data(), input.size()));
+
+    // Should work correctly at boundary conditions
+    float outputRMS = calculateRMS (output);
+    EXPECT_GT (outputRMS, 0.001f);
+}
+
+TEST_F (PartitionedConvolverTest, IRTrimmingConsistency)
+{
+    // Test that trimming produces consistent results across multiple calls
+    PartitionedConvolver convolver1, convolver2;
+    convolver1.setTypicalLayout (64, { 64, 256 });
+    convolver1.prepare (512);
+    convolver2.setTypicalLayout (64, { 64, 256 });
+    convolver2.prepare (512);
+
+    std::vector<float> ir (1000);
+    fillWithRandomData (ir);
+    // Add quiet tail
+    for (size_t i = 600; i < ir.size(); ++i)
+    {
+        ir[i] *= 0.001f; // Make very quiet
+    }
+
+    PartitionedConvolver::IRLoadOptions options;
+    options.trimEndSilenceBelowDb = -50.0f;
+
+    // Set same IR with trimming on both convolvers
+    convolver1.setImpulseResponse (ir, options);
+    convolver2.setImpulseResponse (ir, options);
+
+    std::vector<float> input (512);
+    fillWithRandomData (input);
+    std::vector<float> output1 (512, 0.0f);
+    std::vector<float> output2 (512, 0.0f);
+
+    convolver1.process (input.data(), output1.data(), input.size());
+    convolver2.process (input.data(), output2.data(), input.size());
+
+    // Both should produce identical results
+    for (size_t i = 0; i < output1.size(); ++i)
+    {
+        EXPECT_NEAR (output1[i], output2[i], 0.0001f) << "Inconsistent trimming results at sample " << i;
+    }
+}
+
 } // namespace yup::test
\ No newline at end of file

From 35c4e5c2c678f610696b1d27aa3f7158e7981211 Mon Sep 17 00:00:00 2001
From: kunitoki <kunitoki@gmail.com>
Date: Thu, 11 Sep 2025 22:58:14 +0200
Subject: [PATCH 32/37] Moved the DirectFIR outside of the
 PartitionedConvolution

---
 .../source/examples/ConvolutionDemo.h         |  17 +-
 examples/graphics/source/main.cpp             |  17 +-
 .../convolution/yup_PartitionedConvolver.cpp  | 196 ++----
 .../convolution/yup_PartitionedConvolver.h    |  16 +-
 modules/yup_dsp/filters/yup_DirectFIR.cpp     | 227 +++++++
 modules/yup_dsp/filters/yup_DirectFIR.h       | 158 +++++
 modules/yup_dsp/yup_dsp.cpp                   |   1 +
 modules/yup_dsp/yup_dsp.h                     |   1 +
 tests/yup_dsp/yup_DirectFIR.cpp               | 585 ++++++++++++++++++
 tests/yup_dsp/yup_PartitionedConvolver.cpp    |  24 +-
 10 files changed, 1050 insertions(+), 192 deletions(-)
 create mode 100644 modules/yup_dsp/filters/yup_DirectFIR.cpp
 create mode 100644 modules/yup_dsp/filters/yup_DirectFIR.h
 create mode 100644 tests/yup_dsp/yup_DirectFIR.cpp

diff --git a/examples/graphics/source/examples/ConvolutionDemo.h b/examples/graphics/source/examples/ConvolutionDemo.h
index 4253d8973..221cb37cb 100644
--- a/examples/graphics/source/examples/ConvolutionDemo.h
+++ b/examples/graphics/source/examples/ConvolutionDemo.h
@@ -294,13 +294,15 @@ class ConvolutionDemo
 
             // Set impulse response in convolver
             yup::PartitionedConvolver::IRLoadOptions loadOptions;
-            loadOptions.trimEndSilenceBelowDb = -60.0f;
+            loadOptions.trimEndSilenceBelowDb = -36.0f;
             convolver.setImpulseResponse (impulseResponseData, loadOptions);
+            impulseLength = static_cast<int> (convolver.getImpulseLength());
             hasImpulseResponse = true;
 
             std::cout << "Loaded impulse response: " << file.getFileName() << std::endl;
             std::cout << "Sample rate: " << reader->sampleRate << " Hz" << std::endl;
             std::cout << "Length: " << reader->lengthInSamples << " samples" << std::endl;
+            std::cout << "Effective Length: " << impulseLength << " samples" << std::endl;
 
             // Update UI
             updateIRInfo (file.getFileName());
@@ -409,6 +411,8 @@ class ConvolutionDemo
         if (impulseResponseData.empty())
             return;
 
+        const size_t length = static_cast<size_t> (impulseLength);
+
         // Always apply peak headroom
         float headroomScale = std::pow (10.0f, -12.0f / 20.0f);
         const auto minMax = yup::FloatVectorOperations::findMinAndMax (impulseResponseData.data(), impulseResponseData.size());
@@ -417,8 +421,8 @@ class ConvolutionDemo
             headroomScale /= peak;
 
         // Create waveform data points
-        const size_t numPoints = std::min (static_cast<size_t> (2048), impulseResponseData.size());
-        const size_t stride = impulseResponseData.size() / numPoints;
+        const size_t numPoints = std::min (static_cast<size_t> (getWidth()), length);
+        const size_t stride = length / numPoints;
 
         std::vector<yup::Point<double>> waveformData;
         waveformData.reserve (numPoints);
@@ -426,8 +430,8 @@ class ConvolutionDemo
         for (size_t i = 0; i < numPoints; ++i)
         {
             size_t sampleIndex = i * stride;
-            if (sampleIndex >= impulseResponseData.size())
-                sampleIndex = impulseResponseData.size() - 1;
+            if (sampleIndex >= length)
+                sampleIndex = length - 1;
 
             double normalizedTime = static_cast<double> (i) / static_cast<double> (numPoints - 1);
             double amplitude = static_cast<double> (impulseResponseData[sampleIndex] * headroomScale);
@@ -439,7 +443,7 @@ class ConvolutionDemo
         irWaveformDisplay.updateSignalData (waveformSignalIndex, waveformData);
 
         // Update X axis range to show time
-        double lengthInSeconds = static_cast<double> (impulseResponseData.size()) / 44100.0; // Assume 44.1kHz
+        double lengthInSeconds = static_cast<double> (length) / 44100.0; // Assume 44.1kHz
         irWaveformDisplay.setXRange (0.0, lengthInSeconds);
         irWaveformDisplay.setVerticalGridLines ({ 0.0, lengthInSeconds });
 
@@ -463,6 +467,7 @@ class ConvolutionDemo
     yup::AudioBuffer<float> impulseResponseBuffer;
     std::vector<float> impulseResponseData;
     int readPosition = 0;
+    int impulseLength = 0;
     std::atomic<bool> hasImpulseResponse = false;
 
     // Processing
diff --git a/examples/graphics/source/main.cpp b/examples/graphics/source/main.cpp
index f6d4d6084..65b6b648b 100644
--- a/examples/graphics/source/main.cpp
+++ b/examples/graphics/source/main.cpp
@@ -309,18 +309,23 @@ struct Application : yup::YUPApplication
 
         yup::Logger::outputDebugString ("Starting app " + commandLineParameters);
 
-        window = std::make_unique<CustomWindow>();
+        yup::MessageManager::callAsync ([this]
+        {
+            yup::Process::makeForegroundProcess();
+
+            window = std::make_unique<CustomWindow>();
 
 #if YUP_IOS
-        window->centreWithSize ({ 320, 480 });
+            window->centreWithSize ({ 320, 480 });
 #elif YUP_ANDROID
-        window->centreWithSize ({ 1080, 2400 });
-        // window->setFullScreen(true);
+            window->centreWithSize ({ 1080, 2400 });
+            // window->setFullScreen(true);
 #else
-        window->centreWithSize ({ 600, 800 });
+            window->centreWithSize ({ 600, 800 });
 #endif
 
-        window->setVisible (true);
+            window->setVisible (true);
+        });
     }
 
     void shutdown() override
diff --git a/modules/yup_dsp/convolution/yup_PartitionedConvolver.cpp b/modules/yup_dsp/convolution/yup_PartitionedConvolver.cpp
index 133d27335..566ba72cf 100644
--- a/modules/yup_dsp/convolution/yup_PartitionedConvolver.cpp
+++ b/modules/yup_dsp/convolution/yup_PartitionedConvolver.cpp
@@ -32,7 +32,7 @@ namespace yup
     @param Y pointer to output complex array (accumulated)
     @param complexPairs number of complex pairs (not number of floats!)
 */
-static void complexMultiplyAccumulate (const float* A, const float* B, float* Y, int complexPairs) noexcept
+static void complexMultiplyAccumulate (const float* __restrict A, const float* __restrict B, float* __restrict Y, int complexPairs) noexcept
 {
     int i = 0;
 
@@ -125,147 +125,6 @@ static void complexMultiplyAccumulate (const float* A, const float* B, float* Y,
 
 //==============================================================================
 
-class PartitionedConvolver::DirectFIR
-{
-public:
-    DirectFIR() = default;
-
-    void setTaps (std::vector<float> taps, float scaling)
-    {
-        FloatVectorOperations::multiply (taps.data(), scaling, taps.size());
-
-        tapsReversed = std::move (taps);
-        std::reverse (tapsReversed.begin(), tapsReversed.end());
-
-        numTaps = tapsReversed.size();
-        paddedLen = (numTaps + 3u) & ~3u;
-        tapsReversed.resize (paddedLen, 0.0f);
-
-        history.assign (2 * numTaps, 0.0f);
-        writeIndex = 0;
-    }
-
-    void reset()
-    {
-        std::fill (history.begin(), history.end(), 0.0f);
-        writeIndex = 0;
-    }
-
-    void process (const float* input, float* output, std::size_t numSamples) noexcept
-    {
-        const std::size_t M = numTaps;
-        if (M == 0)
-            return;
-
-        const float* h = tapsReversed.data();
-        for (std::size_t i = 0; i < numSamples; ++i)
-        {
-            const float x = input[i];
-
-            history[writeIndex] = x;
-            history[writeIndex + M] = x;
-
-            const float* w = history.data() + writeIndex + 1;
-
-            float sum = 0.0f;
-
-#if YUP_ENABLE_VDSP
-            vDSP_dotpr (w, 1, h, 1, &sum, M);
-#else
-            sum = dotProduct (w, h, M);
-#endif
-
-            output[i] += sum;
-
-            if (++writeIndex == M)
-                writeIndex = 0;
-        }
-    }
-
-    std::size_t getNumTaps() const
-    {
-        return numTaps;
-    }
-
-private:
-    static float dotProduct (const float* __restrict a, const float* __restrict b, std::size_t len) noexcept
-    {
-        float acc = 0.0f;
-        std::size_t i = 0;
-
-#if YUP_USE_AVX_INTRINSICS && YUP_USE_FMA_INTRINSICS
-        __m256 vacc = _mm256_setzero_ps();
-        for (; i + 8 <= len; i += 8)
-        {
-            __m256 va = _mm256_loadu_ps (a + i);
-            __m256 vb = _mm256_loadu_ps (b + i);
-            vacc = _mm256_fmadd_ps (va, vb, vacc);
-        }
-        __m128 low = _mm256_castps256_ps128 (vacc);
-        __m128 high = _mm256_extractf128_ps (vacc, 1);
-        __m128 vsum = _mm_add_ps (low, high);
-        vsum = _mm_hadd_ps (vsum, vsum);
-        vsum = _mm_hadd_ps (vsum, vsum);
-        acc += _mm_cvtss_f32 (vsum);
-
-#elif YUP_USE_SSE_INTRINSICS
-        __m128 vacc = _mm_setzero_ps();
-#if YUP_USE_FMA_INTRINSICS
-        for (; i + 4 <= len; i += 4)
-        {
-            __m128 va = _mm_loadu_ps (a + i);
-            __m128 vb = _mm_loadu_ps (b + i);
-            vacc = _mm_fmadd_ps (va, vb, vacc);
-        }
-#else
-        for (; i + 4 <= len; i += 4)
-        {
-            __m128 va = _mm_loadu_ps (a + i);
-            __m128 vb = _mm_loadu_ps (b + i);
-            vacc = _mm_add_ps (vacc, _mm_mul_ps (va, vb));
-        }
-#endif
-        __m128 shuf = _mm_shuffle_ps (vacc, vacc, _MM_SHUFFLE (2, 3, 0, 1));
-        __m128 sums = _mm_add_ps (vacc, shuf);
-        shuf = _mm_movehl_ps (shuf, sums);
-        sums = _mm_add_ss (sums, shuf);
-        acc += _mm_cvtss_f32 (sums);
-
-#elif YUP_USE_ARM_NEON
-        float32x4_t vacc = vdupq_n_f32 (0.0f);
-        for (; i + 4 <= len; i += 4)
-        {
-            float32x4_t va = vld1q_f32 (a + i);
-            float32x4_t vb = vld1q_f32 (b + i);
-            vacc = vmlaq_f32 (vacc, va, vb);
-        }
-#if YUP_64BIT
-        acc += vaddvq_f32 (vacc);
-#else
-        float32x2_t vlow = vget_low_f32 (vacc);
-        float32x2_t vhigh = vget_high_f32 (vacc);
-        float32x2_t vsum2 = vpadd_f32 (vlow, vhigh);
-        vsum2 = vpadd_f32 (vsum2, vsum2);
-        acc += vget_lane_f32 (vsum2, 0);
-#endif
-
-#endif
-
-        for (; i < len; ++i)
-            acc += a[i] * b[i];
-
-        return acc;
-    }
-
-    std::vector<float> tapsReversed;
-    std::vector<float> history;
-    std::size_t numTaps = 0;
-    std::size_t paddedLen = 0;
-    std::size_t writeIndex = 0;
-};
-
-//==============================================================================
-
 class PartitionedConvolver::FFTLayer
 {
 public:
@@ -544,9 +403,9 @@ class PartitionedConvolver::Impl
     Impl() = default;
     ~Impl() = default;
 
-    void configureLayers (std::size_t directFIRTaps, const std::vector<LayerSpec>& newLayers)
+    void configureLayers (std::size_t directFIRCoefficients, const std::vector<LayerSpec>& newLayers)
     {
-        directFIRTapCount = directFIRTaps;
+        directFIRCoefficientCount = directFIRCoefficients;
 
         layers.clear();
         layers.resize (newLayers.size());
@@ -659,6 +518,7 @@ class PartitionedConvolver::Impl
         if (significantContentEnd == 0)
         {
             const std::size_t checkLength = std::min (minRetainLength, length);
+
             float rmsSquared = 0.0f;
             for (std::size_t j = 0; j < checkLength; ++j)
                 rmsSquared += impulseResponse[j] * impulseResponse[j];
@@ -685,12 +545,10 @@ class PartitionedConvolver::Impl
         // Safety check
         if (impulseResponse != nullptr && trimmedLength > 0)
         {
-            // Trim end silence if requested
-            if (options.trimEndSilenceBelowDb)
-                trimmedLength = trimSilenceFromEnd (impulseResponse, length, *options.trimEndSilenceBelowDb);
-
             // Always apply peak headroom
             float headroomScale = std::pow (10.0f, options.headroomDb / 20.0f);
+
+            // Normalize peaks
             if (options.normalize)
             {
                 const auto minMax = FloatVectorOperations::findMinAndMax (impulseResponse, trimmedLength);
@@ -700,20 +558,24 @@ class PartitionedConvolver::Impl
                     headroomScale /= peak;
             }
 
+            // Trim end silence if requested
+            if (options.trimEndSilenceBelowDb)
+                trimmedLength = trimSilenceFromEnd (impulseResponse, length, *options.trimEndSilenceBelowDb);
+
             // Update DirectFIR in-place
-            std::vector<float> directTaps;
+            std::vector<float> directCoefficients;
 
-            const auto directTapsCount = std::min (directFIRTapCount, trimmedLength);
-            if (directTapsCount > 0)
+            const auto directCoefficientsCount = std::min (directFIRCoefficientCount, trimmedLength);
+            if (directCoefficientsCount > 0)
             {
-                directTaps.reserve (directTapsCount);
-                directTaps.assign (impulseResponse, impulseResponse + directTapsCount);
+                directCoefficients.reserve (directCoefficientsCount);
+                directCoefficients.assign (impulseResponse, impulseResponse + directCoefficientsCount);
             }
 
-            newFIR.setTaps (std::move (directTaps), headroomScale);
+            newFIR.setCoefficients (std::move (directCoefficients), headroomScale);
 
             // Update FFT layers
-            std::size_t consumed = directTapsCount;
+            std::size_t consumed = directCoefficientsCount;
             for (std::size_t i = 0; i < newLayers.size(); ++i)
             {
                 auto& layer = newLayers[i];
@@ -735,11 +597,17 @@ class PartitionedConvolver::Impl
 
             directFIR = std::move (newFIR);
             layers = std::move (newLayers);
+            finalImpulseLength = trimmedLength;
 
             resetStateUnsafe();
         }
     }
 
+    std::size_t getImpulseLength() const
+    {
+        return finalImpulseLength;
+    }
+
     void reset()
     {
         SpinLock::ScopedLockType lock (processingLock);
@@ -890,10 +758,11 @@ class PartitionedConvolver::Impl
 
     std::size_t getInputStagingAvailable() const { return inputStagingAvailable; }
 
-    std::size_t directFIRTapCount = 0;
+    std::size_t directFIRCoefficientCount = 0;
     int baseHopSize = 0;
     std::size_t maxHopSize = 0;
     std::size_t maxBlockSize = 0;
+    std::size_t finalImpulseLength = 0;
     bool isPrepared = false;
 
     DirectFIR directFIR;
@@ -941,12 +810,12 @@ PartitionedConvolver& PartitionedConvolver::operator= (PartitionedConvolver&& ot
     return *this;
 }
 
-void PartitionedConvolver::configureLayers (std::size_t directFIRTaps, const std::vector<LayerSpec>& layers)
+void PartitionedConvolver::configureLayers (std::size_t directFIRCoefficients, const std::vector<LayerSpec>& layers)
 {
-    pImpl->configureLayers (directFIRTaps, layers);
+    pImpl->configureLayers (directFIRCoefficients, layers);
 }
 
-void PartitionedConvolver::setTypicalLayout (std::size_t directTaps, const std::vector<int>& hops)
+void PartitionedConvolver::setTypicalLayout (std::size_t directCoefficients, const std::vector<int>& hops)
 {
     std::vector<LayerSpec> layerSpecs;
     layerSpecs.reserve (hops.size());
@@ -954,12 +823,12 @@ void PartitionedConvolver::setTypicalLayout (std::size_t directTaps, const std::
     for (int hop : hops)
     {
         if (hop < 64)
-            directTaps += static_cast<std::size_t> (hop);
+            directCoefficients += static_cast<std::size_t> (hop);
         else
             layerSpecs.push_back ({ nextPowerOfTwo (hop) });
     }
 
-    configureLayers (directTaps, layerSpecs);
+    configureLayers (directCoefficients, layerSpecs);
 }
 
 void PartitionedConvolver::setImpulseResponse (const float* impulseResponse, std::size_t length, const IRLoadOptions& options)
@@ -972,6 +841,11 @@ void PartitionedConvolver::setImpulseResponse (const std::vector<float>& impulse
     setImpulseResponse (impulseResponse.data(), impulseResponse.size(), options);
 }
 
+std::size_t PartitionedConvolver::getImpulseLength() const
+{
+    return pImpl->getImpulseLength();
+}
+
 void PartitionedConvolver::prepare (std::size_t maxBlockSize)
 {
     pImpl->prepare (maxBlockSize);
diff --git a/modules/yup_dsp/convolution/yup_PartitionedConvolver.h b/modules/yup_dsp/convolution/yup_PartitionedConvolver.h
index b73f98fa8..f319bc217 100644
--- a/modules/yup_dsp/convolution/yup_PartitionedConvolver.h
+++ b/modules/yup_dsp/convolution/yup_PartitionedConvolver.h
@@ -29,7 +29,7 @@ namespace yup
     Layered partitioned convolution engine optimized for real-time audio processing.
 
     Combines multiple processing strategies for efficient convolution:
-    - Direct FIR computation for early taps (low latency)
+    - Direct FIR computation for early coefficients (low latency)
     - One or more FFT-based Overlap-Add layers with uniform partitioning per layer
 
     The engine uses YUP's FFTProcessor for real FFT operations and supports:
@@ -41,7 +41,7 @@ namespace yup
     @code
     PartitionedConvolver convolver;
 
-    // Configure layers: 256 direct taps + FFT layers with hops 256, 1024, 4096
+    // Configure layers: 256 direct coefficients + FFT layers with hops 256, 1024, 4096
     convolver.setTypicalLayout(256, {256, 1024, 4096});
 
     // Prepare for processing with maximum block size (must be called before process)
@@ -83,19 +83,19 @@ class PartitionedConvolver
     /**
         Configure the convolution layers before setting the impulse response.
 
-        @param directFIRTaps  Number of early taps to process with direct FIR (for low latency)
+        @param directFIRCoefficients  Number of early coefficients to process with direct FIR (for low latency)
         @param layers         Vector of layer specifications with increasing hop sizes
                              (e.g., {{256}, {1024}, {4096}} for 256→1024→4096 progression)
     */
-    void configureLayers (std::size_t directFIRTaps, const std::vector<LayerSpec>& layers);
+    void configureLayers (std::size_t directFIRCoefficients, const std::vector<LayerSpec>& layers);
 
     /**
         Convenience method to set a typical late-reverb configuration.
 
-        @param directTaps  Number of direct FIR taps for early reflections
+        @param directCoefficients  Number of direct FIR coefficients for early reflections
         @param hops        Vector of hop sizes for FFT layers (geometrically increasing recommended)
     */
-    void setTypicalLayout (std::size_t directTaps, const std::vector<int>& hops);
+    void setTypicalLayout (std::size_t directCoefficients, const std::vector<int>& hops);
 
     //==============================================================================
     /** Impulse response loading options. */
@@ -130,6 +130,9 @@ class PartitionedConvolver
     */
     void setImpulseResponse (const std::vector<float>& impulseResponse, const IRLoadOptions& options = {});
 
+    /** Returns the length in samples of the currently loaded impulse response, after any end-silence trimming. */
+    std::size_t getImpulseLength() const;
+
     //==============================================================================
     /**
         Prepare the convolver for processing with a specific maximum block size.
@@ -162,7 +165,6 @@ class PartitionedConvolver
 
 private:
     //==============================================================================
-    class DirectFIR;
     class FFTLayer;
     class CircularBuffer;
     class Impl;
diff --git a/modules/yup_dsp/filters/yup_DirectFIR.cpp b/modules/yup_dsp/filters/yup_DirectFIR.cpp
new file mode 100644
index 000000000..1c5d34da9
--- /dev/null
+++ b/modules/yup_dsp/filters/yup_DirectFIR.cpp
@@ -0,0 +1,227 @@
+/*
+  ==============================================================================
+
+   This file is part of the YUP library.
+   Copyright (c) 2025 - kunitoki@gmail.com
+
+   YUP is an open source library subject to open-source licensing.
+
+   The code included in this file is provided under the terms of the ISC license
+   http://www.isc.org/downloads/software-support-policy/isc-license. Permission
+   to use, copy, modify, and/or distribute this software for any purpose with or
+   without fee is hereby granted provided that the above copyright notice and
+   this permission notice appear in all copies.
+
+   YUP IS PROVIDED "AS IS" WITHOUT ANY WARRANTY, AND ALL WARRANTIES, WHETHER
+   EXPRESSED OR IMPLIED, INCLUDING MERCHANTABILITY AND FITNESS FOR PURPOSE, ARE
+   DISCLAIMED.
+
+  ==============================================================================
+*/
+
+namespace yup
+{
+
+namespace
+{
+
+//==============================================================================
+
+float dotProduct (const float* __restrict a, const float* __restrict b, std::size_t len) noexcept
+{
+    float acc = 0.0f;
+    std::size_t i = 0;
+
+#if YUP_USE_AVX_INTRINSICS && YUP_USE_FMA_INTRINSICS
+    __m256 vacc = _mm256_setzero_ps();
+    for (; i + 8 <= len; i += 8)
+    {
+        __m256 va = _mm256_loadu_ps (a + i);
+        __m256 vb = _mm256_loadu_ps (b + i);
+        vacc = _mm256_fmadd_ps (va, vb, vacc);
+    }
+    __m128 low = _mm256_castps256_ps128 (vacc);
+    __m128 high = _mm256_extractf128_ps (vacc, 1);
+    __m128 vsum = _mm_add_ps (low, high);
+    vsum = _mm_hadd_ps (vsum, vsum);
+    vsum = _mm_hadd_ps (vsum, vsum);
+    acc += _mm_cvtss_f32 (vsum);
+
+#elif YUP_USE_SSE_INTRINSICS
+    __m128 vacc = _mm_setzero_ps();
+#if YUP_USE_FMA_INTRINSICS
+    for (; i + 4 <= len; i += 4)
+    {
+        __m128 va = _mm_loadu_ps (a + i);
+        __m128 vb = _mm_loadu_ps (b + i);
+        vacc = _mm_fmadd_ps (va, vb, vacc);
+    }
+#else
+    for (; i + 4 <= len; i += 4)
+    {
+        __m128 va = _mm_loadu_ps (a + i);
+        __m128 vb = _mm_loadu_ps (b + i);
+        vacc = _mm_add_ps (vacc, _mm_mul_ps (va, vb));
+    }
+#endif
+    __m128 shuf = _mm_shuffle_ps (vacc, vacc, _MM_SHUFFLE (2, 3, 0, 1));
+    __m128 sums = _mm_add_ps (vacc, shuf);
+    shuf = _mm_movehl_ps (shuf, sums);
+    sums = _mm_add_ss (sums, shuf);
+    acc += _mm_cvtss_f32 (sums);
+
+#elif YUP_USE_ARM_NEON
+    float32x4_t vacc = vdupq_n_f32 (0.0f);
+    for (; i + 4 <= len; i += 4)
+    {
+        float32x4_t va = vld1q_f32 (a + i);
+        float32x4_t vb = vld1q_f32 (b + i);
+        vacc = vmlaq_f32 (vacc, va, vb);
+    }
+#if YUP_64BIT
+    acc += vaddvq_f32 (vacc);
+#else
+    float32x2_t vlow = vget_low_f32 (vacc);
+    float32x2_t vhigh = vget_high_f32 (vacc);
+    float32x2_t vsum2 = vpadd_f32 (vlow, vhigh);
+    vsum2 = vpadd_f32 (vsum2, vsum2);
+    acc += vget_lane_f32 (vsum2, 0);
+#endif
+
+#endif
+
+    // Handle remaining samples
+    for (; i < len; ++i)
+        acc += a[i] * b[i];
+
+    return acc;
+}
+
+} // namespace
+
+//==============================================================================
+
+DirectFIR::DirectFIR() = default;
+
+DirectFIR::~DirectFIR() = default;
+
+DirectFIR::DirectFIR (DirectFIR&& other) noexcept
+    : coefficientsReversed (std::move (other.coefficientsReversed))
+    , history (std::move (other.history))
+    , numCoefficients (std::exchange (other.numCoefficients, 0))
+    , paddedLen (std::exchange (other.paddedLen, 0))
+    , writeIndex (std::exchange (other.writeIndex, 0))
+    , currentScaling (std::exchange (other.currentScaling, 1.0f))
+{
+}
+
+DirectFIR& DirectFIR::operator= (DirectFIR&& other) noexcept
+{
+    if (this != &other)
+    {
+        coefficientsReversed = std::move (other.coefficientsReversed);
+        history = std::move (other.history);
+        numCoefficients = std::exchange (other.numCoefficients, 0);
+        paddedLen = std::exchange (other.paddedLen, 0);
+        writeIndex = std::exchange (other.writeIndex, 0);
+        currentScaling = std::exchange (other.currentScaling, 1.0f);
+    }
+    return *this;
+}
+
+void DirectFIR::setCoefficients (std::vector<float> coefficients, float scaling)
+{
+    currentScaling = scaling;
+    if (! approximatelyEqual (currentScaling, 1.0f))
+        FloatVectorOperations::multiply (coefficients.data(), scaling, coefficients.size());
+
+    coefficientsReversed = std::move (coefficients);
+    std::reverse (coefficientsReversed.begin(), coefficientsReversed.end());
+
+    numCoefficients = coefficientsReversed.size();
+    paddedLen = (numCoefficients + 3u) & ~3u; // Round up to multiple of 4 for SIMD
+    coefficientsReversed.resize (paddedLen, 0.0f);
+
+    history.assign (2 * numCoefficients, 0.0f);
+
+    reset();
+}
+
+void DirectFIR::setCoefficients (const float* coefficients, std::size_t numCoefficientsIn, float scaling)
+{
+    // A null or empty input clears the filter. Route it through the vector overload so the
+    // stored coefficients, padded length, history and scaling are all reset together.
+    if (coefficients == nullptr || numCoefficientsIn == 0)
+    {
+        setCoefficients (std::vector<float> {}, scaling);
+        return;
+    }
+
+    setCoefficients (std::vector<float> (coefficients, coefficients + numCoefficientsIn), scaling);
+}
+
+std::size_t DirectFIR::getNumCoefficients() const noexcept
+{
+    return numCoefficients;
+}
+
+bool DirectFIR::hasCoefficients() const noexcept
+{
+    return numCoefficients > 0;
+}
+
+const std::vector<float>& DirectFIR::getCoefficients() const noexcept
+{
+    return coefficientsReversed;
+}
+
+float DirectFIR::getScaling() const noexcept
+{
+    return currentScaling;
+}
+
+void DirectFIR::reset()
+{
+    std::fill (history.begin(), history.end(), 0.0f);
+    writeIndex = 0;
+}
+
+void DirectFIR::process (const float* input, float* output, std::size_t numSamples) noexcept
+{
+    const std::size_t M = numCoefficients;
+    if (M == 0 || input == nullptr || output == nullptr)
+        return;
+
+    const float* h = coefficientsReversed.data();
+
+    for (std::size_t i = 0; i < numSamples; ++i)
+    {
+        const float x = input[i];
+
+        // Update circular buffer with current input sample
+        history[writeIndex] = x;
+        history[writeIndex + M] = x; // Duplicate for efficient circular access
+
+        // Point to the start of the delay line for this sample
+        const float* w = history.data() + writeIndex + 1;
+
+        float sum = 0.0f;
+
+#if YUP_ENABLE_VDSP
+        // Use Apple's optimized vDSP if available
+        vDSP_dotpr (w, 1, h, 1, &sum, M);
+#else
+        // Use our own SIMD-optimized dot product
+        sum = dotProduct (w, h, M);
+#endif
+
+        // Accumulate result into output
+        output[i] += sum;
+
+        // Advance circular buffer write pointer
+        if (++writeIndex == M)
+            writeIndex = 0;
+    }
+}
+
+} // namespace yup
diff --git a/modules/yup_dsp/filters/yup_DirectFIR.h b/modules/yup_dsp/filters/yup_DirectFIR.h
new file mode 100644
index 000000000..23abdaeb6
--- /dev/null
+++ b/modules/yup_dsp/filters/yup_DirectFIR.h
@@ -0,0 +1,158 @@
+/*
+  ==============================================================================
+
+   This file is part of the YUP library.
+   Copyright (c) 2025 - kunitoki@gmail.com
+
+   YUP is an open source library subject to open-source licensing.
+
+   The code included in this file is provided under the terms of the ISC license
+   http://www.isc.org/downloads/software-support-policy/isc-license. Permission
+   to use, copy, modify, and/or distribute this software for any purpose with or
+   without fee is hereby granted provided that the above copyright notice and
+   this permission notice appear in all copies.
+
+   YUP IS PROVIDED "AS IS" WITHOUT ANY WARRANTY, AND ALL WARRANTIES, WHETHER
+   EXPRESSED OR IMPLIED, INCLUDING MERCHANTABILITY AND FITNESS FOR PURPOSE, ARE
+   DISCLAIMED.
+
+  ==============================================================================
+*/
+
+#pragma once
+
+namespace yup
+{
+
+//==============================================================================
+/**
+    Direct-form FIR (Finite Impulse Response) filter processor optimized for real-time audio.
+
+    Implements a time-domain FIR filter using direct convolution with SIMD optimizations.
+    This class is ideal for low-latency applications where the number of coefficients is relatively
+    small (typically < 512 coefficients), as it provides zero algorithmic delay.
+
+    Features:
+    - Zero algorithmic latency (only processing delay)
+    - SIMD-optimized convolution (AVX+FMA, SSE, ARM NEON, vDSP)
+    - Circular buffer implementation for efficient sample history management
+    - Real-time safe processing (no heap allocations during process())
+    - Support for arbitrary block sizes
+
+    Example usage:
+    @code
+    DirectFIR fir;
+
+    // Set filter coefficients (e.g., lowpass filter)
+    std::vector<float> coeffs = calculateLowpassCoeffs(44100.0f, 1000.0f, 64);
+    fir.setCoefficients(coeffs, 1.0f); // coeffs with 1.0x scaling
+
+    // Optional: clear the sample history again before starting a new stream
+    fir.reset();
+
+    // In audio callback:
+    fir.process(inputBuffer, outputBuffer, numSamples); // Accumulates into output
+    @endcode
+
+    @note The process() method accumulates results into the output buffer.
+          Clear the output buffer first if overwrite behavior is desired.
+
+    @see PartitionedConvolver for longer impulse responses using FFT-based convolution
+*/
+class DirectFIR
+{
+public:
+    //==============================================================================
+    /** Default constructor */
+    DirectFIR();
+
+    /** Destructor */
+    ~DirectFIR();
+
+    // Non-copyable but movable
+    DirectFIR (DirectFIR&& other) noexcept;
+    DirectFIR& operator= (DirectFIR&& other) noexcept;
+
+    //==============================================================================
+    /**
+        Set the FIR filter coefficients.
+
+        @param coefficients  Vector containing the FIR coefficients in time order
+        @param scaling       Scaling factor to apply to all coefficients
+
+        @note This method is not real-time safe and should be called during initialization
+              or when audio processing is paused.
+    */
+    void setCoefficients (std::vector<float> coefficients, float scaling = 1.0f);
+
+    /**
+        Set the FIR filter coefficients from a raw pointer.
+
+        @param coefficients     Pointer to FIR coefficients array
+        @param numCoefficients  Number of coefficients
+        @param scaling          Scaling factor to apply to all coefficients
+
+        @note This method is not real-time safe and should be called during initialization
+              or when audio processing is paused.
+    */
+    void setCoefficients (const float* coefficients, std::size_t numCoefficients, float scaling = 1.0f);
+
+    /**
+        Get the number of filter coefficients.
+
+        @return Number of coefficients in the current filter
+    */
+    std::size_t getNumCoefficients() const noexcept;
+
+    /**
+        Check if the filter has been configured with coefficients.
+
+        @return True if coefficients have been set, false otherwise
+    */
+    bool hasCoefficients() const noexcept;
+
+    /**
+        Get the current filter coefficients.
+
+        @return Vector containing the scaled coefficients, time-reversed and zero-padded to a multiple of 4
+    */
+    const std::vector<float>& getCoefficients() const noexcept;
+
+    /**
+        Get the current scaling factor applied to coefficients.
+
+        @return Current scaling factor
+    */
+    float getScaling() const noexcept;
+
+    //==============================================================================
+    /**
+        Reset all internal processing state (clears sample history).
+        Filter coefficients are preserved.
+    */
+    void reset();
+
+    /**
+        Process audio samples through the FIR filter.
+
+        @param input       Input audio buffer
+        @param output      Output audio buffer (results are accumulated)
+        @param numSamples  Number of samples to process
+
+        @note Results are accumulated into the output buffer. Clear it first if needed.
+        @note This method is real-time safe with no heap allocations.
+    */
+    void process (const float* input, float* output, std::size_t numSamples) noexcept;
+
+private:
+    std::vector<float> coefficientsReversed;
+    std::vector<float> history;
+    std::size_t numCoefficients = 0;
+    std::size_t paddedLen = 0;
+    std::size_t writeIndex = 0;
+    float currentScaling = 1.0f;
+
+    YUP_DECLARE_NON_COPYABLE_WITH_LEAK_DETECTOR (DirectFIR)
+};
+
+} // namespace yup
diff --git a/modules/yup_dsp/yup_dsp.cpp b/modules/yup_dsp/yup_dsp.cpp
index 39d32fd85..0c3488b70 100644
--- a/modules/yup_dsp/yup_dsp.cpp
+++ b/modules/yup_dsp/yup_dsp.cpp
@@ -75,6 +75,7 @@
 #include "frequency/yup_FFTProcessor.cpp"
 #include "frequency/yup_SpectrumAnalyzerState.cpp"
 #include "designers/yup_FilterDesigner.cpp"
+#include "filters/yup_DirectFIR.cpp"
 #include "convolution/yup_PartitionedConvolver.cpp"
 
 //==============================================================================
diff --git a/modules/yup_dsp/yup_dsp.h b/modules/yup_dsp/yup_dsp.h
index c42e03212..5181dfb6c 100644
--- a/modules/yup_dsp/yup_dsp.h
+++ b/modules/yup_dsp/yup_dsp.h
@@ -138,6 +138,7 @@
 #include "filters/yup_StateVariableFilter.h"
 #include "filters/yup_ButterworthFilter.h"
 #include "filters/yup_LinkwitzRileyFilter.h"
+#include "filters/yup_DirectFIR.h"
 
 // Dynamics processors
 #include "dynamics/yup_SoftClipper.h"
diff --git a/tests/yup_dsp/yup_DirectFIR.cpp b/tests/yup_dsp/yup_DirectFIR.cpp
new file mode 100644
index 000000000..e46071a19
--- /dev/null
+++ b/tests/yup_dsp/yup_DirectFIR.cpp
@@ -0,0 +1,585 @@
+/*
+  ==============================================================================
+
+   This file is part of the YUP library.
+   Copyright (c) 2025 - kunitoki@gmail.com
+
+   YUP is an open source library subject to open-source licensing.
+
+   The code included in this file is provided under the terms of the ISC license
+   http://www.isc.org/downloads/software-support-policy/isc-license. Permission
+   to use, copy, modify, and/or distribute this software for any purpose with or
+   without fee is hereby granted provided that the above copyright notice and
+   this permission notice appear in all copies.
+
+   YUP IS PROVIDED "AS IS" WITHOUT ANY WARRANTY, AND ALL WARRANTIES, WHETHER
+   EXPRESSED OR IMPLIED, INCLUDING MERCHANTABILITY AND FITNESS FOR PURPOSE, ARE
+   DISCLAIMED.
+
+  ==============================================================================
+*/
+
+#include <yup_dsp/yup_dsp.h>
+
+#include <gtest/gtest.h>
+
+#include <random>
+#include <cmath>
+
+namespace yup::test
+{
+
+//==============================================================================
+class DirectFIRTest : public ::testing::Test
+{
+protected:
+    void SetUp() override
+    {
+        generator.seed (42); // Fixed seed for reproducible tests
+    }
+    /** Returns one value drawn uniformly from [min, max) using the seeded generator. */
+    float randomFloat (float min = -1.0f, float max = 1.0f)
+    {
+        std::uniform_real_distribution<float> dist (min, max);
+        return dist (generator);
+    }
+    /** Overwrites every element of buffer with randomFloat() using its default range. */
+    void fillWithRandomData (std::vector<float>& buffer)
+    {
+        for (auto& sample : buffer)
+            sample = randomFloat();
+    }
+    /** Fills buffer with a unit-amplitude sine of the given frequency, starting at phase zero. */
+    void fillWithSine (std::vector<float>& buffer, float frequency, float sampleRate)
+    {
+        for (size_t i = 0; i < buffer.size(); ++i)
+            buffer[i] = std::sin (2.0f * MathConstants<float>::pi * frequency * static_cast<float> (i) / sampleRate);
+    }
+    /** Sets every element of buffer to zero. */
+    void clearBuffer (std::vector<float>& buffer)
+    {
+        std::fill (buffer.begin(), buffer.end(), 0.0f);
+    }
+    /** Returns the root-mean-square of buffer, or 0 for an empty buffer. */
+    float calculateRMS (const std::vector<float>& buffer)
+    {
+        if (buffer.empty())
+            return 0.0f;
+
+        float sum = 0.0f;
+        for (float sample : buffer)
+            sum += sample * sample;
+
+        return std::sqrt (sum / static_cast<float> (buffer.size()));
+    }
+    /** Returns the largest absolute sample value in buffer, or 0 for an empty buffer. */
+    float findPeak (const std::vector<float>& buffer)
+    {
+        if (buffer.empty())
+            return 0.0f;
+
+        float peak = 0.0f;
+        for (float sample : buffer)
+            peak = std::max (peak, std::abs (sample));
+
+        return peak;
+    }
+    /** Builds a Hann-windowed sinc lowpass: h[i] = 2*fc * sinc (2*fc * (i - center)) * w[i], with fc = cutoffFreq / sampleRate. */
+    std::vector<float> createLowpassCoefficients (int numCoefficients, float cutoffFreq, float sampleRate)
+    {
+        std::vector<float> coefficients (numCoefficients);
+        float fc = cutoffFreq / sampleRate;
+        int center = numCoefficients / 2;
+
+        for (int i = 0; i < numCoefficients; ++i)
+        {
+            if (i == center)
+                coefficients[i] = 2.0f * fc;
+            else
+            {
+                float x = 2.0f * MathConstants<float>::pi * fc * (i - center);
+                coefficients[i] = 2.0f * fc * std::sin (x) / x; // same 2*fc gain as the center tap (the x -> 0 limit)
+            }
+
+            // Apply Hann window
+            float w = 0.5f - 0.5f * std::cos (2.0f * MathConstants<float>::pi * i / (numCoefficients - 1));
+            coefficients[i] *= w;
+        }
+
+        return coefficients;
+    }
+
+    std::mt19937 generator;
+};
+
+//==============================================================================
+// Basic API Tests
+//==============================================================================
+
+TEST_F (DirectFIRTest, DefaultConstruction)
+{
+    DirectFIR fir;
+
+    // A freshly constructed filter reports no coefficients and unity scaling
+    EXPECT_EQ (fir.getNumCoefficients(), 0);
+    EXPECT_FALSE (fir.hasCoefficients());
+    EXPECT_EQ (fir.getScaling(), 1.0f);
+
+    // Running it over silence must not throw
+    constexpr std::size_t blockLength = 256;
+    std::vector<float> silence (blockLength, 0.0f), result (blockLength, 0.0f);
+    EXPECT_NO_THROW (fir.process (silence.data(), result.data(), silence.size()));
+
+    // With no coefficients loaded, nothing may be accumulated into the output
+    for (std::size_t i = 0; i < result.size(); ++i)
+        EXPECT_EQ (result[i], 0.0f);
+}
+
+TEST_F (DirectFIRTest, MoveSemantics)
+{
+    DirectFIR fir1;
+    std::vector<float> taps { 1.0f, 0.5f, 0.25f };
+    fir1.setCoefficients (taps, 2.0f);
+
+    // Construct a second filter by moving from the first
+    DirectFIR fir2 = std::move (fir1);
+
+    // The destination must have taken over coefficients and scaling
+    EXPECT_TRUE (fir2.hasCoefficients());
+    EXPECT_EQ (fir2.getNumCoefficients(), 3);
+    EXPECT_EQ (fir2.getScaling(), 2.0f);
+
+    // The moved-from filter is expected to report no coefficients
+    EXPECT_EQ (fir1.getNumCoefficients(), 0);
+
+    // Feed a unit impulse through the move-constructed filter
+    std::vector<float> impulse (10, 0.0f);
+    std::vector<float> response (10, 0.0f);
+    impulse.front() = 1.0f;
+
+    EXPECT_NO_THROW (fir2.process (impulse.data(), response.data(), impulse.size()));
+
+    // Sum of magnitudes must exceed 1 because the taps were scaled by 2
+    float magnitudeSum = 0.0f;
+    for (std::size_t i = 0; i < response.size(); ++i)
+        magnitudeSum += std::abs (response[i]);
+    EXPECT_GT (magnitudeSum, 1.0f);
+
+    // Now exercise move assignment
+    DirectFIR fir3;
+    fir3 = std::move (fir2);
+
+    EXPECT_TRUE (fir3.hasCoefficients());
+    EXPECT_EQ (fir3.getNumCoefficients(), 3);
+    EXPECT_EQ (fir3.getScaling(), 2.0f);
+}
+
+//==============================================================================
+// Coefficient Setting Tests
+//==============================================================================
+
+TEST_F (DirectFIRTest, SetCoefficientsVector)
+{
+    DirectFIR fir;
+    std::vector<float> taps { 0.1f, 0.5f, 1.0f, 0.5f, 0.1f };
+
+    fir.setCoefficients (taps, 1.0f);
+
+    EXPECT_TRUE (fir.hasCoefficients());
+    EXPECT_EQ (fir.getNumCoefficients(), 5);
+    EXPECT_EQ (fir.getScaling(), 1.0f);
+
+    // Internal storage is expected to round 5 taps up to 8 (a multiple of 4)
+    const auto& stored = fir.getCoefficients();
+    EXPECT_EQ (stored.size(), 8);
+}
+
+TEST_F (DirectFIRTest, SetCoefficientsPointer)
+{
+    DirectFIR fir;
+    float rawTaps[] = { 0.2f, 0.4f, 0.6f, 0.8f };
+
+    // Pointer + length overload with a scaling factor of 2
+    fir.setCoefficients (rawTaps, 4, 2.0f);
+    EXPECT_TRUE (fir.hasCoefficients());
+    EXPECT_EQ (fir.getNumCoefficients(), 4);
+    EXPECT_EQ (fir.getScaling(), 2.0f);
+}
+
+TEST_F (DirectFIRTest, SetCoefficientsNullptr)
+{
+    DirectFIR fir;
+
+    // Start from a filter that holds two valid taps
+    std::vector<float> taps { 1.0f, 0.5f };
+    fir.setCoefficients (taps);
+    EXPECT_TRUE (fir.hasCoefficients());
+
+    // A null pointer with zero length is expected to empty the filter again
+    fir.setCoefficients (nullptr, 0, 1.0f);
+    EXPECT_EQ (fir.getNumCoefficients(), 0);
+    EXPECT_FALSE (fir.hasCoefficients());
+}
+
+TEST_F (DirectFIRTest, SetCoefficientsWithScaling)
+{
+    DirectFIR fir;
+    std::vector<float> taps { 1.0f, 1.0f, 1.0f };
+
+    fir.setCoefficients (taps, 0.5f);
+
+    // Drive the filter with an impulse of amplitude 2
+    std::vector<float> impulse (10, 0.0f);
+    std::vector<float> response (10, 0.0f);
+    impulse.front() = 2.0f;
+
+    fir.process (impulse.data(), response.data(), impulse.size());
+
+    // Each unit tap is scaled by 0.5 and excited by 2.0, so it contributes 2.0 * 0.5 = 1.0;
+    // three taps therefore sum to 3.0 over the first five output samples.
+    const float expectedSum = 3.0f;
+    float actualSum = 0.0f;
+    for (auto it = response.begin(); it != response.begin() + 5; ++it)
+        actualSum += *it;
+
+    EXPECT_NEAR (actualSum, expectedSum, 0.001f);
+}
+
+//==============================================================================
+// Processing Tests
+//==============================================================================
+
+TEST_F (DirectFIRTest, ImpulseResponse)
+{
+    DirectFIR fir;
+    std::vector<float> taps { 1.0f, 0.5f, 0.25f };
+    fir.setCoefficients (taps);
+
+    // Excite the filter with a unit impulse at sample 0
+    std::vector<float> impulse (10, 0.0f);
+    std::vector<float> response (10, 0.0f);
+    impulse.front() = 1.0f;
+
+    fir.process (impulse.data(), response.data(), impulse.size());
+
+    // The response must replay the taps in their original order (h0, h1, h2)
+    // and stay at zero once the taps are exhausted.
+    for (size_t i = 0; i < response.size(); ++i)
+    {
+        const float expected = i < taps.size() ? taps[i] : 0.0f;
+
+        EXPECT_NEAR (response[i], expected, 0.001f);
+    }
+}
+
+TEST_F (DirectFIRTest, AccumulativeOutput)
+{
+    DirectFIR fir;
+    std::vector<float> taps { 0.5f, 0.5f };
+    fir.setCoefficients (taps);
+
+    // Constant input of ones, and an output buffer that already holds ones
+    std::vector<float> ones (5, 1.0f);
+    std::vector<float> mixed (5, 1.0f);
+    const std::vector<float> before = mixed;
+
+    fir.process (ones.data(), mixed.data(), ones.size());
+
+    // process() adds to the existing content, and this filter's contribution is
+    // positive for positive input, so every sample must have grown.
+    auto previous = before.begin();
+    for (auto current = mixed.begin(); current != mixed.end(); ++current, ++previous)
+        EXPECT_GT (*current, *previous);
+}
+
+TEST_F (DirectFIRTest, Linearity)
+{
+    DirectFIR fir;
+    std::vector<float> taps = createLowpassCoefficients (32, 1000.0f, 44100.0f);
+    fir.setCoefficients (taps);
+
+    // Reference signal: 512 random samples
+    std::vector<float> reference (512);
+    fillWithRandomData (reference);
+
+    // Second signal: the same samples with twice the amplitude
+    std::vector<float> doubled = reference;
+    FloatVectorOperations::multiply (doubled.data(), 2.0f, doubled.size());
+
+    // Filter both signals, each time starting from a cleared history
+    std::vector<float> referenceOut (512, 0.0f);
+    fir.reset();
+    fir.process (reference.data(), referenceOut.data(), reference.size());
+
+    std::vector<float> doubledOut (512, 0.0f);
+    fir.reset();
+    fir.process (doubled.data(), doubledOut.data(), doubled.size());
+
+    // Doubling the input must double the output.
+    // Samples close to zero are skipped to avoid dividing by (almost) zero.
+    for (size_t i = 0; i < referenceOut.size(); ++i)
+        if (std::abs (referenceOut[i]) > 0.001f)
+            EXPECT_NEAR (doubledOut[i] / referenceOut[i], 2.0f, 0.01f);
+}
+
+TEST_F (DirectFIRTest, Reset)
+{
+    DirectFIR fir;
+    std::vector<float> taps { 1.0f, 0.8f, 0.6f, 0.4f, 0.2f };
+    fir.setCoefficients (taps);
+
+    std::vector<float> noise (20);
+    fillWithRandomData (noise);
+
+    // First pass fills the filter's sample history
+    std::vector<float> firstPass (20, 0.0f);
+    fir.process (noise.data(), firstPass.data(), noise.size());
+
+    // Clear the history, then run the very same input again
+    std::vector<float> secondPass (20, 0.0f);
+    fir.reset();
+    fir.process (noise.data(), secondPass.data(), noise.size());
+
+    // A reset filter must reproduce the first pass
+    for (size_t i = 0; i < firstPass.size(); ++i)
+        EXPECT_NEAR (firstPass[i], secondPass[i], 0.0001f);
+}
+
+//==============================================================================
+// Signal Processing Tests
+//==============================================================================
+
+TEST_F (DirectFIRTest, LowpassFiltering)
+{
+    DirectFIR fir;
+
+    // 64-tap lowpass with a 1 kHz cutoff at 44.1 kHz
+    std::vector<float> taps = createLowpassCoefficients (64, 1000.0f, 44100.0f);
+    fir.setCoefficients (taps);
+
+    const float sampleRate = 44100.0f;
+    const size_t bufferSize = 2048;
+    // Leading samples are ignored because of the start-up transient
+    const size_t skipSamples = 100;
+
+    // RMS of a signal measured after the transient has passed
+    const auto steadyStateRMS = [&] (const std::vector<float>& signal)
+    {
+        float energy = 0.0f;
+        for (size_t i = skipSamples; i < bufferSize; ++i)
+            energy += signal[i] * signal[i];
+
+        return std::sqrt (energy / (bufferSize - skipSamples));
+    };
+
+    // 500 Hz lies in the passband
+    std::vector<float> passbandIn (bufferSize);
+    std::vector<float> passbandOut (bufferSize, 0.0f);
+    fillWithSine (passbandIn, 500.0f, sampleRate);
+    fir.process (passbandIn.data(), passbandOut.data(), bufferSize);
+
+    // 5 kHz lies in the stopband; start again from a cleared history
+    fir.reset();
+    std::vector<float> stopbandIn (bufferSize);
+    std::vector<float> stopbandOut (bufferSize, 0.0f);
+    fillWithSine (stopbandIn, 5000.0f, sampleRate);
+    fir.process (stopbandIn.data(), stopbandOut.data(), bufferSize);
+
+    const float lowFreqRMS = steadyStateRMS (passbandOut);
+    const float highFreqRMS = steadyStateRMS (stopbandOut);
+
+    // The passband tone must come out at more than twice the stopband level
+    EXPECT_GT (lowFreqRMS, highFreqRMS * 2.0f);
+}
+
+TEST_F (DirectFIRTest, BlockSizeIndependence)
+{
+    DirectFIR fir;
+    std::vector<float> taps = createLowpassCoefficients (48, 2000.0f, 44100.0f);
+    fir.setCoefficients (taps);
+
+    const size_t totalSamples = 1024;
+    std::vector<float> noise (totalSamples);
+    fillWithRandomData (noise);
+
+    // Reference: the whole signal in a single call
+    std::vector<float> singleCall (totalSamples, 0.0f);
+    fir.reset();
+    fir.process (noise.data(), singleCall.data(), totalSamples);
+
+    // Candidate: the same signal split into blocks of mixed sizes
+    std::vector<float> chunked (totalSamples, 0.0f);
+    fir.reset();
+    size_t position = 0;
+
+    // Filters the next `length` samples and advances the read/write position
+    const auto processChunk = [&] (size_t length)
+    {
+        fir.process (noise.data() + position, chunked.data() + position, length);
+        position += length;
+    };
+
+    const std::vector<size_t> schedule = { 32, 64, 128, 256, 32, 128, 64 };
+    for (const size_t requested : schedule)
+    {
+        // Never run past the end of the signal
+        const size_t length = std::min (requested, totalSamples - position);
+        if (length == 0)
+            break;
+
+        processChunk (length);
+    }
+
+    // The schedule covers only part of the signal; finish in blocks of at most 128
+    while (position < totalSamples)
+    {
+        const size_t remaining = totalSamples - position;
+        processChunk (std::min (remaining, size_t (128)));
+    }
+
+    // Both runs must agree sample for sample (within tolerance)
+    for (size_t i = 0; i < totalSamples; ++i)
+        EXPECT_NEAR (singleCall[i], chunked[i], 0.0001f);
+}
+
+//==============================================================================
+// Edge Cases and Error Handling
+//==============================================================================
+
+TEST_F (DirectFIRTest, ZeroSamples)
+{
+    DirectFIR fir;
+    std::vector<float> taps { 1.0f, 0.5f };
+    fir.setCoefficients (taps);
+
+    std::vector<float> ones (10, 1.0f);
+    std::vector<float> untouched (10, 0.0f);
+
+    // A zero-length request must be accepted without throwing...
+    EXPECT_NO_THROW (fir.process (ones.data(), untouched.data(), 0));
+
+    // ...and must not write anything to the output buffer
+    for (size_t i = 0; i < untouched.size(); ++i)
+        EXPECT_EQ (untouched[i], 0.0f);
+}
+
+TEST_F (DirectFIRTest, NullPointers)
+{
+    DirectFIR fir;
+    std::vector<float> taps { 1.0f };
+    fir.setCoefficients (taps);
+
+    std::vector<float> scratch (10, 0.0f);
+    const std::size_t length = scratch.size();
+    float* const valid = scratch.data();
+
+    // Missing input
+    EXPECT_NO_THROW (fir.process (nullptr, valid, length));
+    // Missing output
+    EXPECT_NO_THROW (fir.process (valid, nullptr, length));
+    // Missing input and output
+    EXPECT_NO_THROW (fir.process (nullptr, nullptr, length));
+}
+
+TEST_F (DirectFIRTest, LargeTapCounts)
+{
+    DirectFIR fir;
+
+    // 512 taps describing an exponentially decaying sinusoid
+    std::vector<float> taps (512);
+    for (size_t n = 0; n < taps.size(); ++n)
+        taps[n] = std::exp (-static_cast<float> (n) / 100.0f) * std::sin (2.0f * MathConstants<float>::pi * n / 16.0f);
+
+    EXPECT_NO_THROW (fir.setCoefficients (taps));
+    EXPECT_EQ (fir.getNumCoefficients(), 512);
+
+    // Filter 1024 random samples
+    std::vector<float> noise (1024);
+    std::vector<float> filtered (1024, 0.0f);
+    fillWithRandomData (noise);
+
+    EXPECT_NO_THROW (fir.process (noise.data(), filtered.data(), noise.size()));
+
+    // The result should be neither silent nor blown up
+    const float level = calculateRMS (filtered);
+    EXPECT_GT (level, 0.001f);
+    EXPECT_LT (level, 10.0f);
+}
+
+TEST_F (DirectFIRTest, SingleTap)
+{
+    DirectFIR fir;
+    std::vector<float> taps { 0.75f };
+    fir.setCoefficients (taps);
+
+    EXPECT_EQ (fir.getNumCoefficients(), 1);
+
+    // With a single tap the filter degenerates to a gain of 0.75
+    std::vector<float> ramp { 1.0f, 2.0f, 3.0f, 4.0f, 5.0f };
+    std::vector<float> scaled (5, 0.0f);
+
+    fir.process (ramp.data(), scaled.data(), ramp.size());
+
+    for (size_t i = 0; i < ramp.size(); ++i)
+        EXPECT_NEAR (scaled[i], ramp[i] * 0.75f, 0.001f);
+}
+
+//==============================================================================
+// Performance and Memory Tests
+//==============================================================================
+
+TEST_F (DirectFIRTest, MemoryAlignment)
+{
+    DirectFIR fir;
+
+    // 37 taps: deliberately not a multiple of 4
+    const size_t tapCount = 37;
+    std::vector<float> taps (tapCount, 0.1f);
+    fir.setCoefficients (taps);
+
+    // The stored coefficients are expected to be padded up to a multiple of 4
+    const auto& stored = fir.getCoefficients();
+    EXPECT_GE (stored.size(), 37);
+    EXPECT_EQ (stored.size() % 4, 0);
+
+    // Everything beyond the original taps must be zero padding
+    for (size_t i = tapCount; i < stored.size(); ++i)
+        EXPECT_EQ (stored[i], 0.0f);
+}
+
+TEST_F (DirectFIRTest, StressTest)
+{
+    DirectFIR fir;
+
+    // 256 taps: a damped cosine over the normalised time range [0, 1)
+    std::vector<float> taps (256);
+    for (size_t n = 0; n < taps.size(); ++n)
+    {
+        const float t = static_cast<float> (n) / 256.0f;
+        taps[n] = std::exp (-t * 5.0f) * std::cos (20.0f * MathConstants<float>::pi * t);
+    }
+    fir.setCoefficients (taps);
+
+    // Run consecutive blocks of very different lengths through the same filter instance
+    const std::vector<size_t> lengths = { 1, 7, 32, 63, 128, 255, 512, 1023 };
+
+    for (const size_t length : lengths)
+    {
+        SCOPED_TRACE (testing::Message() << "Block size: " << length);
+
+        std::vector<float> noise (length);
+        std::vector<float> filtered (length, 0.0f);
+        fillWithRandomData (noise);
+
+        EXPECT_NO_THROW (fir.process (noise.data(), filtered.data(), length));
+
+        // Every output sample must be finite and stay within a generous bound
+        for (size_t i = 0; i < filtered.size(); ++i)
+        {
+            EXPECT_TRUE (std::isfinite (filtered[i]));
+            EXPECT_LT (std::abs (filtered[i]), 100.0f);
+        }
+    }
+}
+
+} // namespace yup::test
\ No newline at end of file
diff --git a/tests/yup_dsp/yup_PartitionedConvolver.cpp b/tests/yup_dsp/yup_PartitionedConvolver.cpp
index b8a4f09b7..5f83c46b1 100644
--- a/tests/yup_dsp/yup_PartitionedConvolver.cpp
+++ b/tests/yup_dsp/yup_PartitionedConvolver.cpp
@@ -490,10 +490,10 @@ TEST_F (PartitionedConvolverTest, LatencyMeasurement)
         { 256, { 256, 1024 } }
     };
 
-    for (const auto& [directTaps, hops] : configs)
+    for (const auto& [directCoefficients, hops] : configs)
     {
         PartitionedConvolver convolver;
-        convolver.setTypicalLayout (directTaps, hops);
+        convolver.setTypicalLayout (directCoefficients, hops);
         convolver.prepare (1024);
 
         // Unit impulse response
@@ -526,8 +526,8 @@ TEST_F (PartitionedConvolverTest, LatencyMeasurement)
         EXPECT_LE (latencySamples, static_cast<size_t> (maxHop * 2));
 
         // With direct FIR, latency should be minimal
-        if (directTaps > 0)
-            EXPECT_LE (latencySamples, directTaps);
+        if (directCoefficients > 0)
+            EXPECT_LE (latencySamples, directCoefficients);
     }
 }
 
@@ -537,9 +537,9 @@ TEST_F (PartitionedConvolverTest, LatencyMeasurement)
 
 TEST_F (PartitionedConvolverTest, VariousPartitionSizes)
 {
-    // Test various partition configurations - all with direct taps for immediate response
+    // Test various partition configurations - all with direct coefficients for immediate response
     std::vector<std::tuple<size_t, std::vector<int>, size_t>> testConfigs = {
-        // (directTaps, hops, maxBlockSize)
+        // (directCoefficients, hops, maxBlockSize)
         { 64, { 64 }, 512 },
         { 32, { 64 }, 512 },
         { 64, { 64, 256 }, 512 },
@@ -553,11 +553,11 @@ TEST_F (PartitionedConvolverTest, VariousPartitionSizes)
 
     for (const auto& item : testConfigs)
     {
-        const auto& directTaps = std::get<0> (item);
+        const auto& directCoefficients = std::get<0> (item);
         const auto& hops = std::get<1> (item);
         const auto& maxBlockSize = std::get<2> (item);
 
-        SCOPED_TRACE (testing::Message() << "Config: directTaps=" << directTaps << " hops=[" << [&]()
+        SCOPED_TRACE (testing::Message() << "Config: directCoefficients=" << directCoefficients << " hops=[" << [&]()
         {
             std::string hopStr;
             for (size_t i = 0; i < hops.size(); ++i)
@@ -572,7 +572,7 @@ TEST_F (PartitionedConvolverTest, VariousPartitionSizes)
         PartitionedConvolver convolver;
 
         // Configure and verify setup
-        EXPECT_NO_THROW (convolver.setTypicalLayout (directTaps, hops));
+        EXPECT_NO_THROW (convolver.setTypicalLayout (directCoefficients, hops));
         EXPECT_NO_THROW (convolver.prepare (maxBlockSize));
 
         // Create a simple known impulse response
@@ -749,7 +749,7 @@ TEST_F (PartitionedConvolverTest, RandomizedFuzzing)
 {
     // Generate random configurations and test them
     std::uniform_int_distribution<int> hopDist (32, 2048);
-    std::uniform_int_distribution<size_t> directTapsDist (32, 512); // Always have some direct taps
+    std::uniform_int_distribution<size_t> directCoefficientsDist (32, 512); // Always have some direct coefficients
     std::uniform_int_distribution<size_t> blockSizeDist (32, 1024);
 
     for (int trial = 0; trial < 10; ++trial) // Reduce trials for stability
@@ -757,7 +757,7 @@ TEST_F (PartitionedConvolverTest, RandomizedFuzzing)
         SCOPED_TRACE (testing::Message() << "Fuzzing trial " << trial);
 
         // Generate random configuration
-        const size_t directTaps = directTapsDist (generator);
+        const size_t directCoefficients = directCoefficientsDist (generator);
         const size_t numLayers = 1 + (generator() % 3); // 1-3 layers
 
         std::vector<int> hops;
@@ -777,7 +777,7 @@ TEST_F (PartitionedConvolverTest, RandomizedFuzzing)
 
         try
         {
-            convolver.setTypicalLayout (directTaps, hops);
+            convolver.setTypicalLayout (directCoefficients, hops);
             convolver.prepare (maxBlockSize);
 
             // Simple impulse response

From 9ffd8d2e2e72a29cd45779ecacf5d95855ab14e3 Mon Sep 17 00:00:00 2001
From: kunitoki <kunitoki@gmail.com>
Date: Fri, 12 Sep 2025 08:23:12 +0200
Subject: [PATCH 33/37] FIR Implementation

---
 .../convolution/yup_PartitionedConvolver.cpp  |   4 +-
 .../yup_dsp/designers/yup_FilterDesigner.cpp  | 166 ++++++++++
 .../yup_dsp/designers/yup_FilterDesigner.h    |  72 ++++
 modules/yup_dsp/filters/yup_DirectFIR.cpp     | 227 -------------
 modules/yup_dsp/filters/yup_DirectFIR.h       | 207 ++++++++++--
 modules/yup_dsp/utilities/yup_DspMath.cpp     | 104 ++++++
 modules/yup_dsp/utilities/yup_DspMath.h       |  18 +
 modules/yup_dsp/yup_dsp.cpp                   |   2 +-
 tests/yup_dsp/yup_DirectFIR.cpp               |  48 +--
 tests/yup_dsp/yup_FilterDesigner.cpp          | 307 ++++++++++++++++++
 10 files changed, 875 insertions(+), 280 deletions(-)
 delete mode 100644 modules/yup_dsp/filters/yup_DirectFIR.cpp
 create mode 100644 modules/yup_dsp/utilities/yup_DspMath.cpp

diff --git a/modules/yup_dsp/convolution/yup_PartitionedConvolver.cpp b/modules/yup_dsp/convolution/yup_PartitionedConvolver.cpp
index 566ba72cf..f5be2c0fb 100644
--- a/modules/yup_dsp/convolution/yup_PartitionedConvolver.cpp
+++ b/modules/yup_dsp/convolution/yup_PartitionedConvolver.cpp
@@ -537,7 +537,7 @@ class PartitionedConvolver::Impl
 
     void setImpulseResponse (const float* impulseResponse, std::size_t length, const PartitionedConvolver::IRLoadOptions& options)
     {
-        DirectFIR newFIR;
+        DirectFIRFloat newFIR;
         std::vector<FFTLayer> newLayers (layers.size());
 
         std::size_t trimmedLength = length;
@@ -765,7 +765,7 @@ class PartitionedConvolver::Impl
     std::size_t finalImpulseLength = 0;
     bool isPrepared = false;
 
-    DirectFIR directFIR;
+    DirectFIRFloat directFIR;
     std::vector<FFTLayer> layers;
 
     // Working buffers
diff --git a/modules/yup_dsp/designers/yup_FilterDesigner.cpp b/modules/yup_dsp/designers/yup_FilterDesigner.cpp
index fc32560d1..a92949a08 100644
--- a/modules/yup_dsp/designers/yup_FilterDesigner.cpp
+++ b/modules/yup_dsp/designers/yup_FilterDesigner.cpp
@@ -625,6 +625,172 @@ int FilterDesigner<CoeffType>::designLinkwitzRiley (
     return static_cast<int> (lowCoeffs.size());
 }
 
+//==============================================================================
+// FIR Filter Design Implementations
+//==============================================================================
+
+// Windowed-sinc lowpass design: samples the ideal sinc response, applies the requested
+// window and scales the taps so that they sum to one. The requested tap count is passed
+// through nextOdd() first, so the returned length can differ from numCoefficients.
+template <typename CoeffType>
+std::vector<CoeffType> FilterDesigner<CoeffType>::designFIRLowpass (
+    int numCoefficients,
+    CoeffType cutoffFreq,
+    double sampleRate,
+    WindowType windowType) noexcept
+{
+    jassert (numCoefficients > 0);
+    jassert (cutoffFreq > static_cast<CoeffType> (0.0));
+    jassert (sampleRate > 0.0);
+    jassert (cutoffFreq < static_cast<CoeffType> (sampleRate / 2.0));
+
+    // nextOdd() presumably yields an odd tap count, giving the response a single middle tap
+    const int tapCount = nextOdd (numCoefficients);
+    const int middle = (tapCount - 1) / 2;
+    const auto piValue = MathConstants<CoeffType>::pi;
+
+    // Cutoff expressed as a fraction of the Nyquist frequency
+    const auto cutoffRatio = static_cast<CoeffType> (2.0) * cutoffFreq / static_cast<CoeffType> (sampleRate);
+
+    // Sample the ideal lowpass sinc and shape it with the requested window in one pass
+    std::vector<CoeffType> taps (tapCount);
+    for (int i = 0; i < tapCount; ++i)
+    {
+        const auto offset = static_cast<CoeffType> (i - middle);
+
+        // The middle tap is the limit of sin (pi * r * k) / (pi * k) for k -> 0, i.e. r itself
+        const CoeffType ideal = (i == middle)
+                                    ? cutoffRatio
+                                    : std::sin (piValue * cutoffRatio * offset) / (piValue * offset);
+
+        taps[i] = ideal * WindowFunctions<CoeffType>::getValue (windowType, i, tapCount);
+    }
+
+    // Scale for unity gain at DC, unless the taps sum to exactly zero
+    const auto dcGain = std::accumulate (taps.begin(), taps.end(), static_cast<CoeffType> (0.0));
+    if (dcGain != static_cast<CoeffType> (0.0))
+    {
+        for (auto& tap : taps)
+            tap /= dcGain;
+    }
+
+    return taps;
+}
+
+template <typename CoeffType>
+std::vector<CoeffType> FilterDesigner<CoeffType>::designFIRHighpass (
+    int numCoefficients,
+    CoeffType cutoffFreq,
+    double sampleRate,
+    WindowType windowType) noexcept
+{
+    jassert (numCoefficients > 0);
+    jassert (cutoffFreq > static_cast<CoeffType> (0.0));
+    jassert (sampleRate > 0.0);
+    jassert (cutoffFreq < static_cast<CoeffType> (sampleRate / 2.0));
+
+    // Start from the matching lowpass (same cutoff, same window), which is normalised to unity gain at DC
+    numCoefficients = nextOdd (numCoefficients);
+    auto coefficients = designFIRLowpass (numCoefficients, cutoffFreq, sampleRate, windowType);
+
+    // Spectral inversion, step 1: negate every lowpass coefficient
+    const int center = (numCoefficients - 1) / 2;
+    for (int i = 0; i < numCoefficients; ++i)
+        coefficients[i] = -coefficients[i];
+
+    // Step 2: add a unit impulse at the center tap
+    coefficients[center] += static_cast<CoeffType> (1.0);
+
+    // Nyquist gain with the alternating sign taken relative to the center tap, so an odd center index cannot flip the polarity
+    CoeffType hpi (0.0);
+    for (int n = 0; n < numCoefficients; ++n)
+        hpi += coefficients[n] * (((n ^ center) & 1) ? static_cast<CoeffType> (-1.0) : static_cast<CoeffType> (1.0));
+
+    if (hpi != static_cast<CoeffType> (0.0)) // normalise to unity gain at Nyquist; skipped when that gain is exactly zero
+        for (auto& c : coefficients)
+            c /= hpi;
+
+    return coefficients;
+}
+
+template <typename CoeffType>
+std::vector<CoeffType> FilterDesigner<CoeffType>::designFIRBandpass (
+    int numCoefficients,
+    CoeffType lowCutoffFreq,
+    CoeffType highCutoffFreq,
+    double sampleRate,
+    WindowType windowType) noexcept
+{
+    jassert (numCoefficients > 0);
+    jassert (lowCutoffFreq > static_cast<CoeffType> (0.0));
+    jassert (highCutoffFreq > lowCutoffFreq);
+    jassert (sampleRate > 0.0);
+    jassert (highCutoffFreq < static_cast<CoeffType> (sampleRate / 2.0));
+
+    const int tapCount = nextOdd (numCoefficients);
+    const int middle = (tapCount - 1) / 2;
+    const auto piValue = MathConstants<CoeffType>::pi;
+
+    // Band edges expressed as fractions of the Nyquist frequency
+    const auto lowerEdge = static_cast<CoeffType> (2.0) * lowCutoffFreq / static_cast<CoeffType> (sampleRate);
+    const auto upperEdge = static_cast<CoeffType> (2.0) * highCutoffFreq / static_cast<CoeffType> (sampleRate);
+
+    // Ideal bandpass = sinc at the upper edge minus sinc at the lower edge.
+    // Returns the un-windowed tap at signed distance k from the middle tap.
+    const auto idealTap = [&] (int k) -> CoeffType
+    {
+        if (k == 0)
+            return upperEdge - lowerEdge;
+
+        const auto n = static_cast<CoeffType> (k);
+        const auto xHigh = piValue * upperEdge * n;
+        const auto xLow = piValue * lowerEdge * n;
+
+        return (std::sin (xHigh) - std::sin (xLow)) / (piValue * n);
+    };
+
+    // Evaluate every tap and apply the requested window in the same pass
+    std::vector<CoeffType> taps (tapCount);
+    for (int i = 0; i < tapCount; ++i)
+    {
+        const auto windowValue = WindowFunctions<CoeffType>::getValue (windowType, i, tapCount);
+
+        taps[i] = idealTap (i - middle) * windowValue;
+    }
+
+    return taps;
+}
+
+template <typename CoeffType>
+std::vector<CoeffType> FilterDesigner<CoeffType>::designFIRBandstop (
+    int numCoefficients,
+    CoeffType lowCutoffFreq,
+    CoeffType highCutoffFreq,
+    double sampleRate,
+    WindowType windowType) noexcept
+{
+    jassert (numCoefficients > 0);
+    jassert (lowCutoffFreq > static_cast<CoeffType> (0.0));
+    jassert (highCutoffFreq > lowCutoffFreq);
+    jassert (sampleRate > 0.0);
+    jassert (highCutoffFreq < static_cast<CoeffType> (sampleRate / 2.0));
+
+    // A bandstop is the complement of the bandpass covering the same band,
+    // so design that bandpass first.
+    const int tapCount = nextOdd (numCoefficients);
+    auto taps = designFIRBandpass (tapCount, lowCutoffFreq, highCutoffFreq, sampleRate, windowType);
+
+    // Spectral inversion, step 1: flip the sign of every bandpass tap
+    for (auto tap = taps.begin(); tap != taps.begin() + tapCount; ++tap)
+        *tap = -*tap;
+
+    // Step 2: add a unit impulse at the middle tap
+    const int middle = (tapCount - 1) / 2;
+    taps[middle] += static_cast<CoeffType> (1.0);
+
+    return taps;
+}
+
 //==============================================================================
 
 template class FilterDesigner<float>;
diff --git a/modules/yup_dsp/designers/yup_FilterDesigner.h b/modules/yup_dsp/designers/yup_FilterDesigner.h
index 0dba2770e..baed267a2 100644
--- a/modules/yup_dsp/designers/yup_FilterDesigner.h
+++ b/modules/yup_dsp/designers/yup_FilterDesigner.h
@@ -625,6 +625,78 @@ class FilterDesigner
     {
         return designLinkwitzRiley (8, crossoverFreq, sampleRate, lowCoeffs, highCoeffs);
     }
+
+    //==============================================================================
+    // FIR Filter Design
+    //==============================================================================
+
+    /**
+        Designs FIR lowpass filter coefficients using windowed sinc method.
+
+        @param numCoefficients  The number of filter coefficients (filter order + 1)
+        @param cutoffFreq       The cutoff frequency in Hz
+        @param sampleRate       The sample rate in Hz
+        @param windowType       The window function to apply (default: Hanning)
+
+        @returns               Vector of FIR coefficients suitable for DirectFIR
+    */
+    static std::vector<CoeffType> designFIRLowpass (
+        int numCoefficients,
+        CoeffType cutoffFreq,
+        double sampleRate,
+        WindowType windowType = WindowType::hann) noexcept;
+
+    /**
+        Designs FIR highpass filter coefficients using windowed sinc method.
+
+        @param numCoefficients  The number of filter coefficients (filter order + 1)
+        @param cutoffFreq       The cutoff frequency in Hz
+        @param sampleRate       The sample rate in Hz
+        @param windowType       The window function to apply (default: Hanning)
+
+        @returns               Vector of FIR coefficients suitable for DirectFIR
+    */
+    static std::vector<CoeffType> designFIRHighpass (
+        int numCoefficients,
+        CoeffType cutoffFreq,
+        double sampleRate,
+        WindowType windowType = WindowType::hann) noexcept;
+
+    /**
+        Designs FIR bandpass filter coefficients using windowed sinc method.
+
+        @param numCoefficients  The number of filter coefficients (filter order + 1)
+        @param lowCutoffFreq    The lower cutoff frequency in Hz
+        @param highCutoffFreq   The upper cutoff frequency in Hz
+        @param sampleRate       The sample rate in Hz
+        @param windowType       The window function to apply (default: Hanning)
+
+        @returns               Vector of FIR coefficients suitable for DirectFIR
+    */
+    static std::vector<CoeffType> designFIRBandpass (
+        int numCoefficients,
+        CoeffType lowCutoffFreq,
+        CoeffType highCutoffFreq,
+        double sampleRate,
+        WindowType windowType = WindowType::hann) noexcept;
+
+    /**
+        Designs FIR bandstop filter coefficients using windowed sinc method.
+
+        @param numCoefficients  The requested number of coefficients (filter order + 1); the implementation applies nextOdd() to it
+        @param lowCutoffFreq    The lower cutoff frequency in Hz
+        @param highCutoffFreq   The upper cutoff frequency in Hz
+        @param sampleRate       The sample rate in Hz
+        @param windowType       The window function to apply (default: Hanning)
+
+        @returns               Vector of FIR coefficients suitable for DirectFIR
+    */
+    static std::vector<CoeffType> designFIRBandstop (
+        int numCoefficients,
+        CoeffType lowCutoffFreq,
+        CoeffType highCutoffFreq,
+        double sampleRate,
+        WindowType windowType = WindowType::hann) noexcept;
 };
 
 } // namespace yup
diff --git a/modules/yup_dsp/filters/yup_DirectFIR.cpp b/modules/yup_dsp/filters/yup_DirectFIR.cpp
deleted file mode 100644
index 1c5d34da9..000000000
--- a/modules/yup_dsp/filters/yup_DirectFIR.cpp
+++ /dev/null
@@ -1,227 +0,0 @@
-/*
-  ==============================================================================
-
-   This file is part of the YUP library.
-   Copyright (c) 2025 - kunitoki@gmail.com
-
-   YUP is an open source library subject to open-source licensing.
-
-   The code included in this file is provided under the terms of the ISC license
-   http://www.isc.org/downloads/software-support-policy/isc-license. Permission
-   to use, copy, modify, and/or distribute this software for any purpose with or
-   without fee is hereby granted provided that the above copyright notice and
-   this permission notice appear in all copies.
-
-   YUP IS PROVIDED "AS IS" WITHOUT ANY WARRANTY, AND ALL WARRANTIES, WHETHER
-   EXPRESSED OR IMPLIED, INCLUDING MERCHANTABILITY AND FITNESS FOR PURPOSE, ARE
-   DISCLAIMED.
-
-  ==============================================================================
-*/
-
-namespace yup
-{
-
-namespace
-{
-
-//==============================================================================
-
-float dotProduct (const float* __restrict a, const float* __restrict b, std::size_t len) noexcept
-{
-    float acc = 0.0f;
-    std::size_t i = 0;
-
-#if YUP_USE_AVX_INTRINSICS && YUP_USE_FMA_INTRINSICS
-    __m256 vacc = _mm256_setzero_ps();
-    for (; i + 8 <= len; i += 8)
-    {
-        __m256 va = _mm256_loadu_ps (a + i);
-        __m256 vb = _mm256_loadu_ps (b + i);
-        vacc = _mm256_fmadd_ps (va, vb, vacc);
-    }
-    __m128 low = _mm256_castps256_ps128 (vacc);
-    __m128 high = _mm256_extractf128_ps (vacc, 1);
-    __m128 vsum = _mm_add_ps (low, high);
-    vsum = _mm_hadd_ps (vsum, vsum);
-    vsum = _mm_hadd_ps (vsum, vsum);
-    acc += _mm_cvtss_f32 (vsum);
-
-#elif YUP_USE_SSE_INTRINSICS
-    __m128 vacc = _mm_setzero_ps();
-#if YUP_USE_FMA_INTRINSICS
-    for (; i + 4 <= len; i += 4)
-    {
-        __m128 va = _mm_loadu_ps (a + i);
-        __m128 vb = _mm_loadu_ps (b + i);
-        vacc = _mm_fmadd_ps (va, vb, vacc);
-    }
-#else
-    for (; i + 4 <= len; i += 4)
-    {
-        __m128 va = _mm_loadu_ps (a + i);
-        __m128 vb = _mm_loadu_ps (b + i);
-        vacc = _mm_add_ps (vacc, _mm_mul_ps (va, vb));
-    }
-#endif
-    __m128 shuf = _mm_shuffle_ps (vacc, vacc, _MM_SHUFFLE (2, 3, 0, 1));
-    __m128 sums = _mm_add_ps (vacc, shuf);
-    shuf = _mm_movehl_ps (shuf, sums);
-    sums = _mm_add_ss (sums, shuf);
-    acc += _mm_cvtss_f32 (sums);
-
-#elif YUP_USE_ARM_NEON
-    float32x4_t vacc = vdupq_n_f32 (0.0f);
-    for (; i + 4 <= len; i += 4)
-    {
-        float32x4_t va = vld1q_f32 (a + i);
-        float32x4_t vb = vld1q_f32 (b + i);
-        vacc = vmlaq_f32 (vacc, va, vb);
-    }
-#if YUP_64BIT
-    acc += vaddvq_f32 (vacc);
-#else
-    float32x2_t vlow = vget_low_f32 (vacc);
-    float32x2_t vhigh = vget_high_f32 (vacc);
-    float32x2_t vsum2 = vpadd_f32 (vlow, vhigh);
-    vsum2 = vpadd_f32 (vsum2, vsum2);
-    acc += vget_lane_f32 (vsum2, 0);
-#endif
-
-#endif
-
-    // Handle remaining samples
-    for (; i < len; ++i)
-        acc += a[i] * b[i];
-
-    return acc;
-}
-
-} // namespace
-
-//==============================================================================
-
-DirectFIR::DirectFIR() = default;
-
-DirectFIR::~DirectFIR() = default;
-
-DirectFIR::DirectFIR (DirectFIR&& other) noexcept
-    : coefficientsReversed (std::move (other.coefficientsReversed))
-    , history (std::move (other.history))
-    , numCoefficients (std::exchange (other.numCoefficients, 0))
-    , paddedLen (std::exchange (other.paddedLen, 0))
-    , writeIndex (std::exchange (other.writeIndex, 0))
-    , currentScaling (std::exchange (other.currentScaling, 1.0f))
-{
-}
-
-DirectFIR& DirectFIR::operator= (DirectFIR&& other) noexcept
-{
-    if (this != &other)
-    {
-        coefficientsReversed = std::move (other.coefficientsReversed);
-        history = std::move (other.history);
-        numCoefficients = std::exchange (other.numCoefficients, 0);
-        paddedLen = std::exchange (other.paddedLen, 0);
-        writeIndex = std::exchange (other.writeIndex, 0);
-        currentScaling = std::exchange (other.currentScaling, 1.0f);
-    }
-    return *this;
-}
-
-void DirectFIR::setCoefficients (std::vector<float> coefficients, float scaling)
-{
-    currentScaling = scaling;
-    if (! approximatelyEqual (currentScaling, 1.0f))
-        FloatVectorOperations::multiply (coefficients.data(), scaling, coefficients.size());
-
-    coefficientsReversed = std::move (coefficients);
-    std::reverse (coefficientsReversed.begin(), coefficientsReversed.end());
-
-    numCoefficients = coefficientsReversed.size();
-    paddedLen = (numCoefficients + 3u) & ~3u; // Round up to multiple of 4 for SIMD
-    coefficientsReversed.resize (paddedLen, 0.0f);
-
-    history.assign (2 * numCoefficients, 0.0f);
-
-    reset();
-}
-
-void DirectFIR::setCoefficients (const float* coefficients, std::size_t numCoefficientsIn, float scaling)
-{
-    if (coefficients == nullptr || numCoefficientsIn == 0)
-    {
-        reset();
-        numCoefficients = 0;
-        return;
-    }
-
-    std::vector<float> coefficientsVector (coefficients, coefficients + numCoefficientsIn);
-    setCoefficients (std::move (coefficientsVector), scaling);
-}
-
-std::size_t DirectFIR::getNumCoefficients() const noexcept
-{
-    return numCoefficients;
-}
-
-bool DirectFIR::hasCoefficients() const noexcept
-{
-    return numCoefficients > 0;
-}
-
-const std::vector<float>& DirectFIR::getCoefficients() const noexcept
-{
-    return coefficientsReversed;
-}
-
-float DirectFIR::getScaling() const noexcept
-{
-    return currentScaling;
-}
-
-void DirectFIR::reset()
-{
-    std::fill (history.begin(), history.end(), 0.0f);
-    writeIndex = 0;
-}
-
-void DirectFIR::process (const float* input, float* output, std::size_t numSamples) noexcept
-{
-    const std::size_t M = numCoefficients;
-    if (M == 0 || input == nullptr || output == nullptr)
-        return;
-
-    const float* h = coefficientsReversed.data();
-
-    for (std::size_t i = 0; i < numSamples; ++i)
-    {
-        const float x = input[i];
-
-        // Update circular buffer with current input sample
-        history[writeIndex] = x;
-        history[writeIndex + M] = x; // Duplicate for efficient circular access
-
-        // Point to the start of the delay line for this sample
-        const float* w = history.data() + writeIndex + 1;
-
-        float sum = 0.0f;
-
-#if YUP_ENABLE_VDSP
-        // Use Apple's optimized vDSP if available
-        vDSP_dotpr (w, 1, h, 1, &sum, M);
-#else
-        // Use our own SIMD-optimized dot product
-        sum = dotProduct (w, h, M);
-#endif
-
-        // Accumulate result into output
-        output[i] += sum;
-
-        // Advance circular buffer write pointer
-        if (++writeIndex == M)
-            writeIndex = 0;
-    }
-}
-
-} // namespace yup
diff --git a/modules/yup_dsp/filters/yup_DirectFIR.h b/modules/yup_dsp/filters/yup_DirectFIR.h
index 23abdaeb6..cdedac031 100644
--- a/modules/yup_dsp/filters/yup_DirectFIR.h
+++ b/modules/yup_dsp/filters/yup_DirectFIR.h
@@ -38,40 +38,65 @@ namespace yup
     - Circular buffer implementation for efficient sample history management
     - Real-time safe processing (no heap allocations during process())
     - Support for arbitrary block sizes
+    - Inherits FilterBase interface for frequency response analysis
 
     Example usage:
     @code
-    DirectFIR fir;
+    DirectFIR<float, float> fir;
 
     // Set filter coefficients (e.g., lowpass filter)
-    std::vector<float> coeffs = calculateLowpassCoeffs(44100.0f, 1000.0f, 64);
-    fir.setTaps(coeffs, 1.0f); // coeffs with 1.0x scaling
+    auto coeffs = FilterDesigner<float>::designFIRLowpass(64, 1000.0f, 44100.0);
+    fir.setCoefficients(coeffs);
 
     // Prepare for processing
-    fir.prepare(512); // Maximum 512 samples per process() call
+    fir.prepare(44100.0, 512);
 
     // In audio callback:
-    fir.process(inputBuffer, outputBuffer, numSamples); // Accumulates into output
+    fir.processBlock(inputBuffer, outputBuffer, numSamples); // Accumulates into outputBuffer
     @endcode
 
-    @note The process() method accumulates results into the output buffer.
-          Clear the output buffer first if overwrite behavior is desired.
+    @tparam SampleType  Type for audio samples (float or double)
+    @tparam CoeffType   Type for internal coefficients (defaults to double)
 
     @see PartitionedConvolver for longer impulse responses using FFT-based convolution
+    @see FilterBase for frequency response methods
 */
-class DirectFIR
+template <typename SampleType, typename CoeffType = double>
+class DirectFIR : public FilterBase<SampleType, CoeffType>
 {
 public:
     //==============================================================================
     /** Default constructor */
-    DirectFIR();
+    DirectFIR() = default;
 
     /** Destructor */
-    ~DirectFIR();
-
-    // Non-copyable but movable
-    DirectFIR (DirectFIR&& other) noexcept;
-    DirectFIR& operator= (DirectFIR&& other) noexcept;
+    ~DirectFIR() override = default;
+
+    /** Move constructor */
+    DirectFIR (DirectFIR&& other) noexcept
+        : coefficientsReversed (std::move (other.coefficientsReversed))
+        , history (std::move (other.history))
+        , numCoefficients (std::exchange (other.numCoefficients, 0))
+        , paddedLen (std::exchange (other.paddedLen, 0))
+        , writeIndex (std::exchange (other.writeIndex, 0))
+        , currentScaling (std::exchange (other.currentScaling, CoeffType (1)))
+    {
+    }
+
+    /** Move assignment */
+    DirectFIR& operator= (DirectFIR&& other) noexcept
+    {
+        if (this != &other)
+        {
+            coefficientsReversed = std::move (other.coefficientsReversed);
+            history = std::move (other.history);
+            numCoefficients = std::exchange (other.numCoefficients, 0);
+            paddedLen = std::exchange (other.paddedLen, 0);
+            writeIndex = std::exchange (other.writeIndex, 0);
+            currentScaling = std::exchange (other.currentScaling, CoeffType (1));
+        }
+        return *this;
+    }
 
     //==============================================================================
     /**
@@ -83,7 +108,22 @@ class DirectFIR
         @note This method is not real-time safe and should be called during initialization
               or when audio processing is paused.
     */
-    void setCoefficients (std::vector<float> coefficients, float scaling = 1.0f);
+    void setCoefficients (std::vector<CoeffType> coefficients, CoeffType scaling = CoeffType (1))
+    {
+        currentScaling = scaling;
+        if (! approximatelyEqual (currentScaling, CoeffType (1))) // Same type on both sides for any CoeffType
+            FloatVectorOperations::multiply (coefficients.data(), scaling, coefficients.size());
+
+        coefficientsReversed = std::move (coefficients);
+        std::reverse (coefficientsReversed.begin(), coefficientsReversed.end()); // Stored time-reversed for the dot product
+
+        numCoefficients = coefficientsReversed.size();
+        paddedLen = (numCoefficients + 3u) & ~std::size_t (3); // Round up to multiple of 4 for SIMD (size_t-wide mask)
+        coefficientsReversed.resize (paddedLen, CoeffType (0));
+
+        history.assign (2 * numCoefficients, SampleType (0)); // Double length: each sample is mirrored M slots apart
+        reset();
+    }
 
     /**
         Set the FIR filter coefficients from a raw pointer.
@@ -95,45 +135,151 @@ class DirectFIR
         @note This method is not real-time safe and should be called during initialization
               or when audio processing is paused.
     */
-    void setCoefficients (const float* coefficients, std::size_t numCoefficients, float scaling = 1.0f);
+    void setCoefficients (const CoeffType* coefficients, std::size_t numCoefficientsIn, CoeffType scaling = CoeffType (1))
+    {
+        if (coefficients == nullptr || numCoefficientsIn == 0)
+        {
+            reset();
+            numCoefficients = 0;
+            return;
+        }
+
+        std::vector<CoeffType> coefficientsVector (coefficients, coefficients + numCoefficientsIn); // Copy in CoeffType, not float
+        setCoefficients (std::move (coefficientsVector), scaling);
+    }
 
     /**
         Get the number of filter coefficients.
 
         @return Number of coefficients in the current filter
     */
-    std::size_t getNumCoefficients() const noexcept;
+    std::size_t getNumCoefficients() const noexcept
+    {
+        return numCoefficients;
+    }
 
     /**
         Check if the filter has been configured with coefficients.
 
         @return True if coefficients have been set, false otherwise
     */
-    bool hasCoefficients() const noexcept;
+    bool hasCoefficients() const noexcept
+    {
+        return numCoefficients > 0;
+    }
 
     /**
         Get the current filter coefficients.
 
         @return Vector containing the current coefficients (time-reversed for processing)
     */
-    const std::vector<float>& getCoefficients() const noexcept;
+    const std::vector<CoeffType>& getCoefficients() const noexcept
+    {
+        return coefficientsReversed;
+    }
 
     /**
         Get the current scaling factor applied to coefficients.
 
         @return Current scaling factor
     */
-    float getScaling() const noexcept;
+    CoeffType getScaling() const noexcept
+    {
+        return currentScaling;
+    }
 
     //==============================================================================
     /**
         Reset all internal processing state (clears sample history).
         Filter coefficients are preserved.
     */
-    void reset();
+    void reset() noexcept override
+    {
+        std::fill (history.begin(), history.end(), 0.0f);
+        writeIndex = 0;
+    }
+
+    /**
+        Prepares the filter for processing with the given sample rate and block size.
+
+        @param sampleRate     The sample rate in Hz
+        @param maximumBlockSize  The maximum number of samples that will be processed at once
+    */
+    void prepare (double sampleRate, int maximumBlockSize) override
+    {
+        this->sampleRate = sampleRate;
+        this->maximumBlockSize = maximumBlockSize;
+    }
+
+    /**
+        Processes a single sample.
+
+        @param inputSample  The input sample to process
+        @returns           The filtered output sample
+    */
+    SampleType processSample (SampleType inputSample) noexcept override
+    {
+        const std::size_t M = numCoefficients;
+        if (M == 0)
+            return SampleType (0); // No taps set: history may be empty, so never index into it
+
+        // Write the sample twice, M slots apart, so the last M samples are always contiguous
+        history[writeIndex] = inputSample;
+        history[writeIndex + M] = inputSample;
+
+        // Start of the contiguous M-sample window that ends at the sample just written
+        const SampleType* w = history.data() + writeIndex + 1;
+        if (++writeIndex == M) // Advance circular buffer write pointer
+            writeIndex = 0;
+
+        // Coefficients go first: dotProduct accumulates in the type of its first argument
+        return dotProduct (coefficientsReversed.data(), w, M);
+    }
 
     /**
-        Process audio samples through the FIR filter.
+        Processes a block of samples, adding the filtered result to outputBuffer.
+
+        @param inputBuffer   Pointer to the input samples
+        @param outputBuffer  Pointer to the output buffer (accumulated into, not overwritten)
+        @param numSamples    Number of samples to process
+    */
+    void processBlock (const SampleType* inputBuffer, SampleType* outputBuffer, int numSamples) noexcept override
+    {
+        if (numCoefficients == 0 || inputBuffer == nullptr || outputBuffer == nullptr)
+            return;
+
+        for (int i = 0; i < numSamples; ++i)
+            outputBuffer[i] += processSample (inputBuffer[i]);
+    }
+
+    /**
+        Returns the complex frequency response at the given frequency.
+
+        @param frequency  The frequency in Hz
+        @returns         The complex frequency response
+    */
+    Complex<CoeffType> getComplexResponse (CoeffType frequency) const override
+    {
+        if (numCoefficients == 0)
+            return Complex<CoeffType> (0, 0);
+
+        const CoeffType omega = MathConstants<CoeffType>::twoPi * frequency / static_cast<CoeffType> (this->sampleRate);
+
+        Complex<CoeffType> response (0, 0);
+
+        // H(e^jω) = Σ h[n] * e^(-jωn) for n = 0 to N-1; h[n] is read back-to-front from coefficientsReversed
+        for (std::size_t n = 0; n < numCoefficients; ++n)
+        {
+            const CoeffType angle = -omega * static_cast<CoeffType> (n);
+            Complex<CoeffType> exponential (std::cos (angle), std::sin (angle));
+            response += static_cast<CoeffType> (coefficientsReversed[numCoefficients - 1 - n]) * exponential;
+        }
+
+        return response;
+    }
+
+    /**
+        Process audio samples through the FIR filter (legacy method).
 
         @param input       Input audio buffer
         @param output      Output audio buffer (results are accumulated)
@@ -141,18 +287,27 @@ class DirectFIR
 
         @note Results are accumulated into the output buffer. Clear it first if needed.
         @note This method is real-time safe with no heap allocations.
+        @note Use processBlock() for new code
     */
-    void process (const float* input, float* output, std::size_t numSamples) noexcept;
+    void process (const SampleType* input, SampleType* output, std::size_t numSamples) noexcept
+    {
+        processBlock (input, output, static_cast<int> (numSamples));
+    }
 
 private:
-    std::vector<float> coefficientsReversed;
-    std::vector<float> history;
+    std::vector<CoeffType> coefficientsReversed;
+    std::vector<SampleType> history;
     std::size_t numCoefficients = 0;
     std::size_t paddedLen = 0;
     std::size_t writeIndex = 0;
-    float currentScaling = 1.0f;
+    CoeffType currentScaling = CoeffType (1);
 
     YUP_DECLARE_NON_COPYABLE_WITH_LEAK_DETECTOR (DirectFIR)
 };
 
+//==============================================================================
+/** Type aliases for backward compatibility and convenience */
+using DirectFIRFloat = DirectFIR<float, float>;
+using DirectFIRDouble = DirectFIR<double, double>;
+
 } // namespace yup
diff --git a/modules/yup_dsp/utilities/yup_DspMath.cpp b/modules/yup_dsp/utilities/yup_DspMath.cpp
new file mode 100644
index 000000000..b14bade41
--- /dev/null
+++ b/modules/yup_dsp/utilities/yup_DspMath.cpp
@@ -0,0 +1,104 @@
+/*
+  ==============================================================================
+
+   This file is part of the YUP library.
+   Copyright (c) 2025 - kunitoki@gmail.com
+
+   YUP is an open source library subject to open-source licensing.
+
+   The code included in this file is provided under the terms of the ISC license
+   http://www.isc.org/downloads/software-support-policy/isc-license. Permission
+   to use, copy, modify, and/or distribute this software for any purpose with or
+   without fee is hereby granted provided that the above copyright notice and
+   this permission notice appear in all copies.
+
+   YUP IS PROVIDED "AS IS" WITHOUT ANY WARRANTY, AND ALL WARRANTIES, WHETHER
+   EXPRESSED OR IMPLIED, INCLUDING MERCHANTABILITY AND FITNESS FOR PURPOSE, ARE
+   DISCLAIMED.
+
+  ==============================================================================
+*/
+
+namespace yup
+{
+
+//==============================================================================
+
+template <>
+float dotProduct (const float* __restrict a, const float* __restrict b, std::size_t length) noexcept
+{
+    float accumulation = 0.0f;
+
+#if YUP_ENABLE_VDSP
+    vDSP_dotpr (a, 1, b, 1, &accumulation, length);
+
+#else
+    std::size_t i = 0;
+
+#if YUP_USE_AVX_INTRINSICS && YUP_USE_FMA_INTRINSICS
+    __m256 vacc = _mm256_setzero_ps();
+    for (; i + 8 <= length; i += 8)
+    {
+        __m256 va = _mm256_loadu_ps (a + i);
+        __m256 vb = _mm256_loadu_ps (b + i);
+        vacc = _mm256_fmadd_ps (va, vb, vacc);
+    }
+    __m128 low = _mm256_castps256_ps128 (vacc);
+    __m128 high = _mm256_extractf128_ps (vacc, 1);
+    __m128 vsum = _mm_add_ps (low, high);
+    vsum = _mm_hadd_ps (vsum, vsum);
+    vsum = _mm_hadd_ps (vsum, vsum);
+    accumulation += _mm_cvtss_f32 (vsum);
+
+#elif YUP_USE_SSE_INTRINSICS
+    __m128 vacc = _mm_setzero_ps();
+#if YUP_USE_FMA_INTRINSICS
+    for (; i + 4 <= length; i += 4)
+    {
+        __m128 va = _mm_loadu_ps (a + i);
+        __m128 vb = _mm_loadu_ps (b + i);
+        vacc = _mm_fmadd_ps (va, vb, vacc);
+    }
+#else
+    for (; i + 4 <= length; i += 4)
+    {
+        __m128 va = _mm_loadu_ps (a + i);
+        __m128 vb = _mm_loadu_ps (b + i);
+        vacc = _mm_add_ps (vacc, _mm_mul_ps (va, vb));
+    }
+#endif
+    __m128 shuf = _mm_shuffle_ps (vacc, vacc, _MM_SHUFFLE (2, 3, 0, 1));
+    __m128 sums = _mm_add_ps (vacc, shuf);
+    shuf = _mm_movehl_ps (shuf, sums);
+    sums = _mm_add_ss (sums, shuf);
+    accumulation += _mm_cvtss_f32 (sums);
+
+#elif YUP_USE_ARM_NEON
+    float32x4_t vacc = vdupq_n_f32 (0.0f);
+    for (; i + 4 <= length; i += 4)
+    {
+        float32x4_t va = vld1q_f32 (a + i);
+        float32x4_t vb = vld1q_f32 (b + i);
+        vacc = vmlaq_f32 (vacc, va, vb);
+    }
+#if YUP_64BIT
+    accumulation += vaddvq_f32 (vacc);
+#else
+    float32x2_t vlow = vget_low_f32 (vacc);
+    float32x2_t vhigh = vget_high_f32 (vacc);
+    float32x2_t vsum2 = vpadd_f32 (vlow, vhigh);
+    vsum2 = vpadd_f32 (vsum2, vsum2);
+    accumulation += vget_lane_f32 (vsum2, 0);
+#endif
+
+#endif
+
+    // Handle remaining samples
+    for (; i < length; ++i)
+        accumulation += a[i] * b[i];
+#endif
+
+    return accumulation;
+}
+
+} // namespace yup
diff --git a/modules/yup_dsp/utilities/yup_DspMath.h b/modules/yup_dsp/utilities/yup_DspMath.h
index 920522777..c058bbc2c 100644
--- a/modules/yup_dsp/utilities/yup_DspMath.h
+++ b/modules/yup_dsp/utilities/yup_DspMath.h
@@ -110,6 +110,24 @@ FloatType fastCos (FloatType x) noexcept
 
 //==============================================================================
 
+/** Generic dot product fallback: accumulates in CoeffType (the type of a) and returns the sum cast to SampleType */
+template <typename CoeffType, typename SampleType>
+SampleType dotProduct (const CoeffType* __restrict a, const SampleType* __restrict b, std::size_t length) noexcept
+{
+    CoeffType acc = CoeffType (0);
+
+    for (std::size_t i = 0; i < length; ++i)
+        acc += a[i] * static_cast<CoeffType> (b[i]);
+
+    return static_cast<SampleType> (acc);
+}
+
+/** Fast specialization for dotProduct using SIMD */
+template <>
+float dotProduct (const float* __restrict a, const float* __restrict b, std::size_t length) noexcept;
+
+//==============================================================================
+
 /** Bilinear transform from s-plane to z-plane with frequency warping */
 template <typename FloatType>
 void bilinearTransform (FloatType& a0, FloatType& a1, FloatType& a2, FloatType& b0, FloatType& b1, FloatType& b2, FloatType frequency, FloatType sampleRate) noexcept
diff --git a/modules/yup_dsp/yup_dsp.cpp b/modules/yup_dsp/yup_dsp.cpp
index 0c3488b70..85c09b0b8 100644
--- a/modules/yup_dsp/yup_dsp.cpp
+++ b/modules/yup_dsp/yup_dsp.cpp
@@ -75,8 +75,8 @@
 #include "frequency/yup_FFTProcessor.cpp"
 #include "frequency/yup_SpectrumAnalyzerState.cpp"
 #include "designers/yup_FilterDesigner.cpp"
-#include "filters/yup_DirectFIR.cpp"
 #include "convolution/yup_PartitionedConvolver.cpp"
+#include "utilities/yup_DspMath.cpp"
 
 //==============================================================================
 
diff --git a/tests/yup_dsp/yup_DirectFIR.cpp b/tests/yup_dsp/yup_DirectFIR.cpp
index e46071a19..9d79d8066 100644
--- a/tests/yup_dsp/yup_DirectFIR.cpp
+++ b/tests/yup_dsp/yup_DirectFIR.cpp
@@ -118,7 +118,7 @@ class DirectFIRTest : public ::testing::Test
 
 TEST_F (DirectFIRTest, DefaultConstruction)
 {
-    DirectFIR fir;
+    DirectFIRFloat fir;
 
     // Default state should be safe
     EXPECT_EQ (fir.getNumCoefficients(), 0);
@@ -137,12 +137,12 @@ TEST_F (DirectFIRTest, DefaultConstruction)
 
 TEST_F (DirectFIRTest, MoveSemantics)
 {
-    DirectFIR fir1;
+    DirectFIRFloat fir1;
     std::vector<float> coefficients = { 1.0f, 0.5f, 0.25f };
     fir1.setCoefficients (coefficients, 2.0f);
 
     // Move constructor
-    DirectFIR fir2 = std::move (fir1);
+    DirectFIRFloat fir2 = std::move (fir1);
 
     // Verify moved filter works
     EXPECT_EQ (fir2.getNumCoefficients(), 3);
@@ -166,7 +166,7 @@ TEST_F (DirectFIRTest, MoveSemantics)
     EXPECT_GT (outputSum, 1.0f); // Should be > 1 due to scaling
 
     // Move assignment
-    DirectFIR fir3;
+    DirectFIRFloat fir3;
     fir3 = std::move (fir2);
 
     EXPECT_EQ (fir3.getNumCoefficients(), 3);
@@ -180,7 +180,7 @@ TEST_F (DirectFIRTest, MoveSemantics)
 
 TEST_F (DirectFIRTest, SetCoefficientsVector)
 {
-    DirectFIR fir;
+    DirectFIRFloat fir;
     std::vector<float> coefficients = { 0.1f, 0.5f, 1.0f, 0.5f, 0.1f };
 
     fir.setCoefficients (coefficients, 1.0f);
@@ -196,7 +196,7 @@ TEST_F (DirectFIRTest, SetCoefficientsVector)
 
 TEST_F (DirectFIRTest, SetCoefficientsPointer)
 {
-    DirectFIR fir;
+    DirectFIRFloat fir;
     float coefficients[] = { 0.2f, 0.4f, 0.6f, 0.8f };
 
     fir.setCoefficients (coefficients, 4, 2.0f);
@@ -208,7 +208,7 @@ TEST_F (DirectFIRTest, SetCoefficientsPointer)
 
 TEST_F (DirectFIRTest, SetCoefficientsNullptr)
 {
-    DirectFIR fir;
+    DirectFIRFloat fir;
 
     // First set some valid coefficients
     std::vector<float> coefficients = { 1.0f, 0.5f };
@@ -223,7 +223,7 @@ TEST_F (DirectFIRTest, SetCoefficientsNullptr)
 
 TEST_F (DirectFIRTest, SetCoefficientsWithScaling)
 {
-    DirectFIR fir;
+    DirectFIRFloat fir;
     std::vector<float> coefficients = { 1.0f, 1.0f, 1.0f };
 
     fir.setCoefficients (coefficients, 0.5f);
@@ -251,7 +251,7 @@ TEST_F (DirectFIRTest, SetCoefficientsWithScaling)
 
 TEST_F (DirectFIRTest, ImpulseResponse)
 {
-    DirectFIR fir;
+    DirectFIRFloat fir;
     std::vector<float> coefficients = { 1.0f, 0.5f, 0.25f };
     fir.setCoefficients (coefficients);
 
@@ -274,7 +274,7 @@ TEST_F (DirectFIRTest, ImpulseResponse)
 
 TEST_F (DirectFIRTest, AccumulativeOutput)
 {
-    DirectFIR fir;
+    DirectFIRFloat fir;
     std::vector<float> coefficients = { 0.5f, 0.5f };
     fir.setCoefficients (coefficients);
 
@@ -294,8 +294,8 @@ TEST_F (DirectFIRTest, AccumulativeOutput)
 
 TEST_F (DirectFIRTest, Linearity)
 {
-    DirectFIR fir;
-    std::vector<float> coefficients = createLowpassCoefficients (32, 1000.0f, 44100.0f);
+    DirectFIRFloat fir;
+    auto coefficients = FilterDesigner<float>::designFIRLowpass (32, 1000.0f, 44100.0f);
     fir.setCoefficients (coefficients);
 
     std::vector<float> input (512);
@@ -324,7 +324,7 @@ TEST_F (DirectFIRTest, Linearity)
 
 TEST_F (DirectFIRTest, Reset)
 {
-    DirectFIR fir;
+    DirectFIRFloat fir;
     std::vector<float> coefficients = { 1.0f, 0.8f, 0.6f, 0.4f, 0.2f };
     fir.setCoefficients (coefficients);
 
@@ -351,10 +351,10 @@ TEST_F (DirectFIRTest, Reset)
 
 TEST_F (DirectFIRTest, LowpassFiltering)
 {
-    DirectFIR fir;
+    DirectFIRFloat fir;
 
     // Create lowpass filter coefficients
-    std::vector<float> coefficients = createLowpassCoefficients (64, 1000.0f, 44100.0f);
+    auto coefficients = FilterDesigner<float>::designFIRLowpass (64, 1000.0f, 44100.0);
     fir.setCoefficients (coefficients);
 
     const float sampleRate = 44100.0f;
@@ -394,8 +394,8 @@ TEST_F (DirectFIRTest, LowpassFiltering)
 
 TEST_F (DirectFIRTest, BlockSizeIndependence)
 {
-    DirectFIR fir;
-    std::vector<float> coefficients = createLowpassCoefficients (48, 2000.0f, 44100.0f);
+    DirectFIRFloat fir;
+    auto coefficients = FilterDesigner<float>::designFIRLowpass (48, 2000.0f, 44100.0);
     fir.setCoefficients (coefficients);
 
     const size_t totalSamples = 1024;
@@ -448,7 +448,7 @@ TEST_F (DirectFIRTest, BlockSizeIndependence)
 
 TEST_F (DirectFIRTest, ZeroSamples)
 {
-    DirectFIR fir;
+    DirectFIRFloat fir;
     std::vector<float> coefficients = { 1.0f, 0.5f };
     fir.setCoefficients (coefficients);
 
@@ -465,7 +465,7 @@ TEST_F (DirectFIRTest, ZeroSamples)
 
 TEST_F (DirectFIRTest, NullPointers)
 {
-    DirectFIR fir;
+    DirectFIRFloat fir;
     std::vector<float> coefficients = { 1.0f };
     fir.setCoefficients (coefficients);
 
@@ -483,7 +483,7 @@ TEST_F (DirectFIRTest, NullPointers)
 
 TEST_F (DirectFIRTest, LargeTapCounts)
 {
-    DirectFIR fir;
+    DirectFIRFloat fir;
 
     // Test with relatively large number of coefficients
     std::vector<float> coefficients (512);
@@ -508,7 +508,7 @@ TEST_F (DirectFIRTest, LargeTapCounts)
 
 TEST_F (DirectFIRTest, SingleTap)
 {
-    DirectFIR fir;
+    DirectFIRFloat fir;
     std::vector<float> coefficients = { 0.75f };
     fir.setCoefficients (coefficients);
 
@@ -530,7 +530,7 @@ TEST_F (DirectFIRTest, SingleTap)
 
 TEST_F (DirectFIRTest, MemoryAlignment)
 {
-    DirectFIR fir;
+    DirectFIRFloat fir;
 
     // Coefficient count that's not a multiple of 4
     std::vector<float> coefficients (37);
@@ -549,7 +549,7 @@ TEST_F (DirectFIRTest, MemoryAlignment)
 
 TEST_F (DirectFIRTest, StressTest)
 {
-    DirectFIR fir;
+    DirectFIRFloat fir;
 
     // Create complex impulse response
     std::vector<float> coefficients (256);
@@ -582,4 +582,4 @@ TEST_F (DirectFIRTest, StressTest)
     }
 }
 
-} // namespace yup::test
\ No newline at end of file
+} // namespace yup::test
diff --git a/tests/yup_dsp/yup_FilterDesigner.cpp b/tests/yup_dsp/yup_FilterDesigner.cpp
index ca43acc07..63b17071a 100644
--- a/tests/yup_dsp/yup_FilterDesigner.cpp
+++ b/tests/yup_dsp/yup_FilterDesigner.cpp
@@ -23,6 +23,8 @@
 
 #include <gtest/gtest.h>
 
+#include <fstream>
+
 using namespace yup;
 
 //==============================================================================
@@ -358,3 +360,308 @@ TEST_F (FilterDesignerTests, FloatPrecisionConsistency)
     EXPECT_NEAR (doubleCoeffs.a1, static_cast<double> (floatCoeffs.a1), toleranceF);
     EXPECT_NEAR (doubleCoeffs.a2, static_cast<double> (floatCoeffs.a2), toleranceF);
 }
+
+//==============================================================================
+// FIR Filter Design Tests
+//==============================================================================
+
+TEST_F (FilterDesignerTests, FirLowpassBasicProperties)
+{
+    const int numCoeffs = 65; // Odd number for symmetric filter
+    auto coeffs = FilterDesigner<float>::designFIRLowpass (numCoeffs, 1000.0f, sampleRate);
+
+    // Should return the correct number of coefficients
+    EXPECT_EQ (coeffs.size(), numCoeffs);
+
+    // All coefficients should be finite
+    for (const auto& coeff : coeffs)
+        EXPECT_TRUE (std::isfinite (coeff));
+
+    // FIR filter should be symmetric for linear phase
+    const int center = (numCoeffs - 1) / 2;
+    for (int i = 0; i < center; ++i)
+        EXPECT_NEAR (coeffs[i], coeffs[numCoeffs - 1 - i], toleranceF);
+
+    // Center coefficient should be largest for lowpass
+    for (int i = 0; i < numCoeffs; ++i)
+    {
+        if (i != center)
+            EXPECT_GE (coeffs[center], coeffs[i]);
+    }
+}
+
+TEST_F (FilterDesignerTests, FirHighpassBasicProperties)
+{
+    const int numCoeffs = 65;
+    auto coeffs = FilterDesigner<float>::designFIRHighpass (numCoeffs, 1000.0f, sampleRate);
+
+    // Should return the correct number of coefficients
+    EXPECT_EQ (coeffs.size(), numCoeffs);
+
+    // All coefficients should be finite
+    for (const auto& coeff : coeffs)
+        EXPECT_TRUE (std::isfinite (coeff));
+
+    // FIR filter should be symmetric for linear phase
+    const int center = (numCoeffs - 1) / 2;
+    for (int i = 0; i < center; ++i)
+        EXPECT_NEAR (coeffs[i], coeffs[numCoeffs - 1 - i], toleranceF);
+
+    // Sum of coefficients should be approximately zero for highpass (DC gain = 0)
+    // Note: windowing can cause small deviations from ideal DC gain
+    float sum = 0.0f;
+    for (const auto& coeff : coeffs)
+        sum += coeff;
+
+    EXPECT_NEAR (sum, 0.0f, 0.05f); // Relaxed tolerance for windowed FIR
+}
+
+TEST_F (FilterDesignerTests, FirBandpassBasicProperties)
+{
+    const int numCoeffs = 65;
+    auto coeffs = FilterDesigner<float>::designFIRBandpass (numCoeffs, 800.0f, 1200.0f, sampleRate);
+
+    // Should return the correct number of coefficients
+    EXPECT_EQ (coeffs.size(), numCoeffs);
+
+    // All coefficients should be finite
+    for (const auto& coeff : coeffs)
+        EXPECT_TRUE (std::isfinite (coeff));
+
+    // FIR filter should be symmetric for linear phase
+    const int center = (numCoeffs - 1) / 2;
+    for (int i = 0; i < center; ++i)
+        EXPECT_NEAR (coeffs[i], coeffs[numCoeffs - 1 - i], toleranceF);
+
+    // Sum of coefficients should be approximately zero for bandpass (DC gain = 0)
+    // Note: windowing can cause small deviations from ideal DC gain
+    float sum = 0.0f;
+    for (const auto& coeff : coeffs)
+        sum += coeff;
+
+    EXPECT_NEAR (sum, 0.0f, 0.15f); // Relaxed tolerance for windowed FIR
+}
+
+TEST_F (FilterDesignerTests, FirBandstopBasicProperties)
+{
+    const int numCoeffs = 65;
+    auto coeffs = FilterDesigner<float>::designFIRBandstop (numCoeffs, 800.0f, 1200.0f, sampleRate);
+
+    // Should return the correct number of coefficients
+    EXPECT_EQ (coeffs.size(), numCoeffs);
+
+    // All coefficients should be finite
+    for (const auto& coeff : coeffs)
+        EXPECT_TRUE (std::isfinite (coeff));
+
+    // FIR filter should be symmetric for linear phase
+    const int center = (numCoeffs - 1) / 2;
+    for (int i = 0; i < center; ++i)
+        EXPECT_NEAR (coeffs[i], coeffs[numCoeffs - 1 - i], toleranceF);
+
+    // Sum of coefficients should be approximately 1.0 for bandstop (DC gain = 1)
+    // Note: windowing can cause small deviations from ideal DC gain
+    float sum = 0.0f;
+    for (const auto& coeff : coeffs)
+        sum += coeff;
+
+    EXPECT_NEAR (sum, 1.0f, 0.15f); // Relaxed tolerance for windowed FIR
+}
+
+TEST_F (FilterDesignerTests, FirDifferentWindowTypes)
+{
+    const int numCoeffs = 33;
+
+    // Test different window types
+    auto hannCoeffs = FilterDesigner<float>::designFIRLowpass (numCoeffs, 1000.0f, sampleRate, WindowType::hann);
+    auto hammingCoeffs = FilterDesigner<float>::designFIRLowpass (numCoeffs, 1000.0f, sampleRate, WindowType::hamming);
+    auto blackmanCoeffs = FilterDesigner<float>::designFIRLowpass (numCoeffs, 1000.0f, sampleRate, WindowType::blackman);
+
+    // All should have same size
+    EXPECT_EQ (hannCoeffs.size(), numCoeffs);
+    EXPECT_EQ (hammingCoeffs.size(), numCoeffs);
+    EXPECT_EQ (blackmanCoeffs.size(), numCoeffs);
+
+    // All coefficients should be finite
+    for (int i = 0; i < numCoeffs; ++i)
+    {
+        EXPECT_TRUE (std::isfinite (hannCoeffs[i]));
+        EXPECT_TRUE (std::isfinite (hammingCoeffs[i]));
+        EXPECT_TRUE (std::isfinite (blackmanCoeffs[i]));
+    }
+
+    // Different windows should produce different coefficients
+    bool coeffsDifferent = false;
+    for (int i = 0; i < numCoeffs; ++i)
+    {
+        if (std::abs (hannCoeffs[i] - blackmanCoeffs[i]) > toleranceF)
+        {
+            coeffsDifferent = true;
+            break;
+        }
+    }
+    EXPECT_TRUE (coeffsDifferent);
+}
+
+TEST_F (FilterDesignerTests, FirFloatDoubleConsistency)
+{
+    const int numCoeffs = 33;
+
+    auto doubleCoeffs = FilterDesigner<double>::designFIRLowpass (numCoeffs, 1000.0, sampleRate);
+    auto floatCoeffs = FilterDesigner<float>::designFIRLowpass (numCoeffs, 1000.0f, sampleRate);
+
+    EXPECT_EQ (doubleCoeffs.size(), floatCoeffs.size());
+
+    // Coefficients should be very similar between float and double precision
+    for (int i = 0; i < numCoeffs; ++i)
+        EXPECT_NEAR (doubleCoeffs[i], static_cast<double> (floatCoeffs[i]), toleranceF);
+}
+
+TEST_F (FilterDesignerTests, ExportFIRCoefficientsForAnalysis)
+{
+    const int numCoeffs = 65;
+    const float sampleRateF = 44100.0f;
+
+    // Design different FIR filters
+    auto lowpass = FilterDesigner<float>::designFIRLowpass (numCoeffs, 1000.0f, sampleRateF);
+    auto highpass = FilterDesigner<float>::designFIRHighpass (numCoeffs, 1000.0f, sampleRateF);
+    auto bandpass = FilterDesigner<float>::designFIRBandpass (numCoeffs, 800.0f, 1200.0f, sampleRateF);
+    auto bandstop = FilterDesigner<float>::designFIRBandstop (numCoeffs, 800.0f, 1200.0f, sampleRateF);
+
+    // Different windows for lowpass
+    auto lowpassHann = FilterDesigner<float>::designFIRLowpass (numCoeffs, 1000.0f, sampleRateF, WindowType::hann);
+    auto lowpassHamming = FilterDesigner<float>::designFIRLowpass (numCoeffs, 1000.0f, sampleRateF, WindowType::hamming);
+    auto lowpassBlackman = FilterDesigner<float>::designFIRLowpass (numCoeffs, 1000.0f, sampleRateF, WindowType::blackman);
+
+    // Helper lambda to write coefficients to file
+    auto writeCoeffs = [] (const std::vector<float>& coeffs, const std::string& filename)
+    {
+        std::ofstream file (filename);
+        if (file.is_open())
+        {
+            for (size_t i = 0; i < coeffs.size(); ++i)
+            {
+                file << coeffs[i];
+                if (i < coeffs.size() - 1)
+                    file << "\n";
+            }
+            file.close();
+        }
+    };
+
+    // Write all coefficient sets to files
+    writeCoeffs (lowpass, "fir_lowpass_1000hz.txt");
+    writeCoeffs (highpass, "fir_highpass_1000hz.txt");
+    writeCoeffs (bandpass, "fir_bandpass_800_1200hz.txt");
+    writeCoeffs (bandstop, "fir_bandstop_800_1200hz.txt");
+    writeCoeffs (lowpassHann, "fir_lowpass_hann_1000hz.txt");
+    writeCoeffs (lowpassHamming, "fir_lowpass_hamming_1000hz.txt");
+    writeCoeffs (lowpassBlackman, "fir_lowpass_blackman_1000hz.txt");
+
+    // Create a Python script to plot the frequency responses
+    std::ofstream pyScript ("plot_fir_responses.py");
+    if (pyScript.is_open())
+    {
+        pyScript << R"(#!/usr/bin/env python3
+import numpy as np
+import matplotlib.pyplot as plt
+from scipy import signal
+
+def load_coeffs(filename):
+    with open(filename, 'r') as f:
+        return [float(line.strip()) for line in f.readlines()]
+
+def plot_frequency_response(coeffs, title, sample_rate=44100):
+    w, h = signal.freqz(coeffs, worN=8000, fs=sample_rate)
+    
+    plt.figure(figsize=(12, 8))
+    
+    # Magnitude response
+    plt.subplot(2, 1, 1)
+    plt.plot(w, 20 * np.log10(np.abs(h)))
+    plt.title(f'{title} - Magnitude Response')
+    plt.xlabel('Frequency (Hz)')
+    plt.ylabel('Magnitude (dB)')
+    plt.grid(True)
+    plt.xlim(0, sample_rate/2)
+    plt.ylim(-80, 5)
+    
+    # Phase response
+    plt.subplot(2, 1, 2)
+    plt.plot(w, np.unwrap(np.angle(h)) * 180 / np.pi)
+    plt.title(f'{title} - Phase Response')
+    plt.xlabel('Frequency (Hz)')
+    plt.ylabel('Phase (degrees)')
+    plt.grid(True)
+    plt.xlim(0, sample_rate/2)
+    
+    plt.tight_layout()
+    plt.savefig(f'{title.lower().replace(" ", "_").replace("-", "_")}_response.png', dpi=150, bbox_inches='tight')
+    plt.show()
+
+# Load and plot all FIR filter responses
+filters = [
+    ('fir_lowpass_1000hz.txt', 'FIR Lowpass 1000Hz'),
+    ('fir_highpass_1000hz.txt', 'FIR Highpass 1000Hz'), 
+    ('fir_bandpass_800_1200hz.txt', 'FIR Bandpass 800-1200Hz'),
+    ('fir_bandstop_800_1200hz.txt', 'FIR Bandstop 800-1200Hz'),
+    ('fir_lowpass_hann_1000hz.txt', 'FIR Lowpass Hann Window'),
+    ('fir_lowpass_hamming_1000hz.txt', 'FIR Lowpass Hamming Window'),
+    ('fir_lowpass_blackman_1000hz.txt', 'FIR Lowpass Blackman Window')
+]
+
+for filename, title in filters:
+    try:
+        coeffs = load_coeffs(filename)
+        plot_frequency_response(coeffs, title)
+    except FileNotFoundError:
+        print(f"File {filename} not found!")
+
+# Compare window types on same plot
+plt.figure(figsize=(12, 6))
+window_files = [
+    ('fir_lowpass_hann_1000hz.txt', 'Hann', 'blue'),
+    ('fir_lowpass_hamming_1000hz.txt', 'Hamming', 'red'),
+    ('fir_lowpass_blackman_1000hz.txt', 'Blackman', 'green')
+]
+
+for filename, label, color in window_files:
+    try:
+        coeffs = load_coeffs(filename)
+        w, h = signal.freqz(coeffs, worN=8000, fs=44100)
+        plt.plot(w, 20 * np.log10(np.abs(h)), label=label, color=color)
+    except FileNotFoundError:
+        print(f"File {filename} not found!")
+
+plt.title('FIR Lowpass 1000Hz - Window Comparison')
+plt.xlabel('Frequency (Hz)')
+plt.ylabel('Magnitude (dB)')
+plt.grid(True)
+plt.legend()
+plt.xlim(0, 22050)
+plt.ylim(-80, 5)
+plt.savefig('fir_window_comparison.png', dpi=150, bbox_inches='tight')
+plt.show()
+
+print("All plots generated successfully!")
+)";
+        pyScript.close();
+    }
+
+    // Just verify the files were created - the actual validation will be done visually with Python
+    EXPECT_EQ (lowpass.size(), numCoeffs);
+    EXPECT_EQ (highpass.size(), numCoeffs);
+    EXPECT_EQ (bandpass.size(), numCoeffs);
+    EXPECT_EQ (bandstop.size(), numCoeffs);
+
+    std::cout << "\nFIR coefficient files and Python plotting script created:\n";
+    std::cout << "- fir_lowpass_1000hz.txt\n";
+    std::cout << "- fir_highpass_1000hz.txt\n";
+    std::cout << "- fir_bandpass_800_1200hz.txt\n";
+    std::cout << "- fir_bandstop_800_1200hz.txt\n";
+    std::cout << "- fir_lowpass_hann_1000hz.txt\n";
+    std::cout << "- fir_lowpass_hamming_1000hz.txt\n";
+    std::cout << "- fir_lowpass_blackman_1000hz.txt\n";
+    std::cout << "- plot_fir_responses.py\n\n";
+    std::cout << "Run: python3 plot_fir_responses.py (requires numpy, matplotlib, scipy)\n";
+}

From 66b0e1b0a695a87e65e96a357789cfca92d54258 Mon Sep 17 00:00:00 2001
From: kunitoki <kunitoki@gmail.com>
Date: Fri, 12 Sep 2025 08:24:05 +0200
Subject: [PATCH 34/37] Remove validations

---
 tests/yup_dsp/yup_FilterDesigner.cpp | 149 ---------------------------
 1 file changed, 149 deletions(-)

diff --git a/tests/yup_dsp/yup_FilterDesigner.cpp b/tests/yup_dsp/yup_FilterDesigner.cpp
index 63b17071a..5f031090b 100644
--- a/tests/yup_dsp/yup_FilterDesigner.cpp
+++ b/tests/yup_dsp/yup_FilterDesigner.cpp
@@ -516,152 +516,3 @@ TEST_F (FilterDesignerTests, FirFloatDoubleConsistency)
     for (int i = 0; i < numCoeffs; ++i)
         EXPECT_NEAR (doubleCoeffs[i], static_cast<double> (floatCoeffs[i]), toleranceF);
 }
-
-TEST_F (FilterDesignerTests, ExportFIRCoefficientsForAnalysis)
-{
-    const int numCoeffs = 65;
-    const float sampleRateF = 44100.0f;
-
-    // Design different FIR filters
-    auto lowpass = FilterDesigner<float>::designFIRLowpass (numCoeffs, 1000.0f, sampleRateF);
-    auto highpass = FilterDesigner<float>::designFIRHighpass (numCoeffs, 1000.0f, sampleRateF);
-    auto bandpass = FilterDesigner<float>::designFIRBandpass (numCoeffs, 800.0f, 1200.0f, sampleRateF);
-    auto bandstop = FilterDesigner<float>::designFIRBandstop (numCoeffs, 800.0f, 1200.0f, sampleRateF);
-
-    // Different windows for lowpass
-    auto lowpassHann = FilterDesigner<float>::designFIRLowpass (numCoeffs, 1000.0f, sampleRateF, WindowType::hann);
-    auto lowpassHamming = FilterDesigner<float>::designFIRLowpass (numCoeffs, 1000.0f, sampleRateF, WindowType::hamming);
-    auto lowpassBlackman = FilterDesigner<float>::designFIRLowpass (numCoeffs, 1000.0f, sampleRateF, WindowType::blackman);
-
-    // Helper lambda to write coefficients to file
-    auto writeCoeffs = [] (const std::vector<float>& coeffs, const std::string& filename)
-    {
-        std::ofstream file (filename);
-        if (file.is_open())
-        {
-            for (size_t i = 0; i < coeffs.size(); ++i)
-            {
-                file << coeffs[i];
-                if (i < coeffs.size() - 1)
-                    file << "\n";
-            }
-            file.close();
-        }
-    };
-
-    // Write all coefficient sets to files
-    writeCoeffs (lowpass, "fir_lowpass_1000hz.txt");
-    writeCoeffs (highpass, "fir_highpass_1000hz.txt");
-    writeCoeffs (bandpass, "fir_bandpass_800_1200hz.txt");
-    writeCoeffs (bandstop, "fir_bandstop_800_1200hz.txt");
-    writeCoeffs (lowpassHann, "fir_lowpass_hann_1000hz.txt");
-    writeCoeffs (lowpassHamming, "fir_lowpass_hamming_1000hz.txt");
-    writeCoeffs (lowpassBlackman, "fir_lowpass_blackman_1000hz.txt");
-
-    // Create a Python script to plot the frequency responses
-    std::ofstream pyScript ("plot_fir_responses.py");
-    if (pyScript.is_open())
-    {
-        pyScript << R"(#!/usr/bin/env python3
-import numpy as np
-import matplotlib.pyplot as plt
-from scipy import signal
-
-def load_coeffs(filename):
-    with open(filename, 'r') as f:
-        return [float(line.strip()) for line in f.readlines()]
-
-def plot_frequency_response(coeffs, title, sample_rate=44100):
-    w, h = signal.freqz(coeffs, worN=8000, fs=sample_rate)
-    
-    plt.figure(figsize=(12, 8))
-    
-    # Magnitude response
-    plt.subplot(2, 1, 1)
-    plt.plot(w, 20 * np.log10(np.abs(h)))
-    plt.title(f'{title} - Magnitude Response')
-    plt.xlabel('Frequency (Hz)')
-    plt.ylabel('Magnitude (dB)')
-    plt.grid(True)
-    plt.xlim(0, sample_rate/2)
-    plt.ylim(-80, 5)
-    
-    # Phase response
-    plt.subplot(2, 1, 2)
-    plt.plot(w, np.unwrap(np.angle(h)) * 180 / np.pi)
-    plt.title(f'{title} - Phase Response')
-    plt.xlabel('Frequency (Hz)')
-    plt.ylabel('Phase (degrees)')
-    plt.grid(True)
-    plt.xlim(0, sample_rate/2)
-    
-    plt.tight_layout()
-    plt.savefig(f'{title.lower().replace(" ", "_").replace("-", "_")}_response.png', dpi=150, bbox_inches='tight')
-    plt.show()
-
-# Load and plot all FIR filter responses
-filters = [
-    ('fir_lowpass_1000hz.txt', 'FIR Lowpass 1000Hz'),
-    ('fir_highpass_1000hz.txt', 'FIR Highpass 1000Hz'), 
-    ('fir_bandpass_800_1200hz.txt', 'FIR Bandpass 800-1200Hz'),
-    ('fir_bandstop_800_1200hz.txt', 'FIR Bandstop 800-1200Hz'),
-    ('fir_lowpass_hann_1000hz.txt', 'FIR Lowpass Hann Window'),
-    ('fir_lowpass_hamming_1000hz.txt', 'FIR Lowpass Hamming Window'),
-    ('fir_lowpass_blackman_1000hz.txt', 'FIR Lowpass Blackman Window')
-]
-
-for filename, title in filters:
-    try:
-        coeffs = load_coeffs(filename)
-        plot_frequency_response(coeffs, title)
-    except FileNotFoundError:
-        print(f"File {filename} not found!")
-
-# Compare window types on same plot
-plt.figure(figsize=(12, 6))
-window_files = [
-    ('fir_lowpass_hann_1000hz.txt', 'Hann', 'blue'),
-    ('fir_lowpass_hamming_1000hz.txt', 'Hamming', 'red'),
-    ('fir_lowpass_blackman_1000hz.txt', 'Blackman', 'green')
-]
-
-for filename, label, color in window_files:
-    try:
-        coeffs = load_coeffs(filename)
-        w, h = signal.freqz(coeffs, worN=8000, fs=44100)
-        plt.plot(w, 20 * np.log10(np.abs(h)), label=label, color=color)
-    except FileNotFoundError:
-        print(f"File {filename} not found!")
-
-plt.title('FIR Lowpass 1000Hz - Window Comparison')
-plt.xlabel('Frequency (Hz)')
-plt.ylabel('Magnitude (dB)')
-plt.grid(True)
-plt.legend()
-plt.xlim(0, 22050)
-plt.ylim(-80, 5)
-plt.savefig('fir_window_comparison.png', dpi=150, bbox_inches='tight')
-plt.show()
-
-print("All plots generated successfully!")
-)";
-        pyScript.close();
-    }
-
-    // Just verify the files were created - the actual validation will be done visually with Python
-    EXPECT_EQ (lowpass.size(), numCoeffs);
-    EXPECT_EQ (highpass.size(), numCoeffs);
-    EXPECT_EQ (bandpass.size(), numCoeffs);
-    EXPECT_EQ (bandstop.size(), numCoeffs);
-
-    std::cout << "\nFIR coefficient files and Python plotting script created:\n";
-    std::cout << "- fir_lowpass_1000hz.txt\n";
-    std::cout << "- fir_highpass_1000hz.txt\n";
-    std::cout << "- fir_bandpass_800_1200hz.txt\n";
-    std::cout << "- fir_bandstop_800_1200hz.txt\n";
-    std::cout << "- fir_lowpass_hann_1000hz.txt\n";
-    std::cout << "- fir_lowpass_hamming_1000hz.txt\n";
-    std::cout << "- fir_lowpass_blackman_1000hz.txt\n";
-    std::cout << "- plot_fir_responses.py\n\n";
-    std::cout << "Run: python3 plot_fir_responses.py (requires numpy, matplotlib, scipy)\n";
-}

From 0dfd4ea3f10ce7e1d24caae82b2ff37cb1114157 Mon Sep 17 00:00:00 2001
From: kunitoki <kunitoki@gmail.com>
Date: Fri, 12 Sep 2025 08:28:15 +0200
Subject: [PATCH 35/37] Export FIR coefficients for analysis

---
 tests/yup_dsp/yup_FilterDesigner.cpp | 149 +++++++++++++++++++++++++++
 1 file changed, 149 insertions(+)

diff --git a/tests/yup_dsp/yup_FilterDesigner.cpp b/tests/yup_dsp/yup_FilterDesigner.cpp
index 5f031090b..120c4ed08 100644
--- a/tests/yup_dsp/yup_FilterDesigner.cpp
+++ b/tests/yup_dsp/yup_FilterDesigner.cpp
@@ -516,3 +516,152 @@ TEST_F (FilterDesignerTests, FirFloatDoubleConsistency)
     for (int i = 0; i < numCoeffs; ++i)
         EXPECT_NEAR (doubleCoeffs[i], static_cast<double> (floatCoeffs[i]), toleranceF);
 }
+
+TEST_F (FilterDesignerTests, DISABLED_ExportFIRCoefficientsForAnalysis)
+{
+    const int numCoeffs = 129;
+    const float sampleRateF = 44100.0f;
+
+    // Design different FIR filters
+    auto lowpass = FilterDesigner<float>::designFIRLowpass (numCoeffs, 10000.0f, sampleRateF);
+    auto highpass = FilterDesigner<float>::designFIRHighpass (numCoeffs, 10000.0f, sampleRateF);
+    auto bandpass = FilterDesigner<float>::designFIRBandpass (numCoeffs, 8000.0f, 12000.0f, sampleRateF);
+    auto bandstop = FilterDesigner<float>::designFIRBandstop (numCoeffs, 8000.0f, 12000.0f, sampleRateF);
+
+    // Different windows for lowpass
+    auto lowpassHann = FilterDesigner<float>::designFIRLowpass (numCoeffs, 10000.0f, sampleRateF, WindowType::hann);
+    auto lowpassHamming = FilterDesigner<float>::designFIRLowpass (numCoeffs, 10000.0f, sampleRateF, WindowType::hamming);
+    auto lowpassBlackman = FilterDesigner<float>::designFIRLowpass (numCoeffs, 10000.0f, sampleRateF, WindowType::blackman);
+
+    // Helper lambda to write coefficients to file
+    auto writeCoeffs = [] (const std::vector<float>& coeffs, const std::string& filename)
+    {
+        std::ofstream file (filename);
+        if (file.is_open())
+        {
+            for (size_t i = 0; i < coeffs.size(); ++i)
+            {
+                file << coeffs[i];
+                if (i < coeffs.size() - 1)
+                    file << "\n";
+            }
+            file.close();
+        }
+    };
+
+    // Write all coefficient sets to files
+    writeCoeffs (lowpass, "fir_lowpass_10000hz.txt");
+    writeCoeffs (highpass, "fir_highpass_10000hz.txt");
+    writeCoeffs (bandpass, "fir_bandpass_8000_12000hz.txt");
+    writeCoeffs (bandstop, "fir_bandstop_8000_12000hz.txt");
+    writeCoeffs (lowpassHann, "fir_lowpass_hann_10000hz.txt");
+    writeCoeffs (lowpassHamming, "fir_lowpass_hamming_10000hz.txt");
+    writeCoeffs (lowpassBlackman, "fir_lowpass_blackman_10000hz.txt");
+
+    // Create a Python script to plot the frequency responses
+    std::ofstream pyScript ("plot_fir_responses.py");
+    if (pyScript.is_open())
+    {
+        pyScript << R"(#!/usr/bin/env python3
+import numpy as np
+import matplotlib.pyplot as plt
+from scipy import signal
+
+def load_coeffs(filename):
+    with open(filename, 'r') as f:
+        return [float(line.strip()) for line in f.readlines()]
+
+def plot_frequency_response(coeffs, title, sample_rate=44100):
+    w, h = signal.freqz(coeffs, worN=8000, fs=sample_rate)
+    
+    plt.figure(figsize=(12, 8))
+    
+    # Magnitude response
+    plt.subplot(2, 1, 1)
+    plt.plot(w, 20 * np.log10(np.abs(h)))
+    plt.title(f'{title} - Magnitude Response')
+    plt.xlabel('Frequency (Hz)')
+    plt.ylabel('Magnitude (dB)')
+    plt.grid(True)
+    plt.xlim(0, sample_rate/2)
+    plt.ylim(-80, 5)
+    
+    # Phase response
+    plt.subplot(2, 1, 2)
+    plt.plot(w, np.unwrap(np.angle(h)) * 180 / np.pi)
+    plt.title(f'{title} - Phase Response')
+    plt.xlabel('Frequency (Hz)')
+    plt.ylabel('Phase (degrees)')
+    plt.grid(True)
+    plt.xlim(0, sample_rate/2)
+    
+    plt.tight_layout()
+    plt.savefig(f'{title.lower().replace(" ", "_").replace("-", "_")}_response.png', dpi=150, bbox_inches='tight')
+    plt.show()
+
+# Load and plot all FIR filter responses
+filters = [
+    ('fir_lowpass_10000hz.txt', 'FIR Lowpass 10000Hz'),
+    ('fir_highpass_10000hz.txt', 'FIR Highpass 10000Hz'), 
+    ('fir_bandpass_8000_12000hz.txt', 'FIR Bandpass 8000-12000Hz'),
+    ('fir_bandstop_8000_12000hz.txt', 'FIR Bandstop 8000-12000Hz'),
+    ('fir_lowpass_hann_10000hz.txt', 'FIR Lowpass Hann Window'),
+    ('fir_lowpass_hamming_10000hz.txt', 'FIR Lowpass Hamming Window'),
+    ('fir_lowpass_blackman_10000hz.txt', 'FIR Lowpass Blackman Window')
+]
+
+for filename, title in filters:
+    try:
+        coeffs = load_coeffs(filename)
+        plot_frequency_response(coeffs, title)
+    except FileNotFoundError:
+        print(f"File {filename} not found!")
+
+# Compare window types on same plot
+plt.figure(figsize=(12, 6))
+window_files = [
+    ('fir_lowpass_hann_10000hz.txt', 'Hann', 'blue'),
+    ('fir_lowpass_hamming_10000hz.txt', 'Hamming', 'red'),
+    ('fir_lowpass_blackman_10000hz.txt', 'Blackman', 'green')
+]
+
+for filename, label, color in window_files:
+    try:
+        coeffs = load_coeffs(filename)
+        w, h = signal.freqz(coeffs, worN=8000, fs=44100)
+        plt.plot(w, 20 * np.log10(np.abs(h)), label=label, color=color)
+    except FileNotFoundError:
+        print(f"File {filename} not found!")
+
+plt.title('FIR Lowpass 10000Hz - Window Comparison')
+plt.xlabel('Frequency (Hz)')
+plt.ylabel('Magnitude (dB)')
+plt.grid(True)
+plt.legend()
+plt.xlim(0, 22050)
+plt.ylim(-80, 5)
+plt.savefig('fir_window_comparison.png', dpi=150, bbox_inches='tight')
+plt.show()
+
+print("All plots generated successfully!")
+)";
+        pyScript.close();
+    }
+
+    // Only sanity-check the coefficient counts here - the actual validation will be done visually with Python
+    EXPECT_EQ (lowpass.size(), numCoeffs);
+    EXPECT_EQ (highpass.size(), numCoeffs);
+    EXPECT_EQ (bandpass.size(), numCoeffs);
+    EXPECT_EQ (bandstop.size(), numCoeffs);
+
+    std::cout << "\nFIR coefficient files and Python plotting script created:\n";
+    std::cout << "- fir_lowpass_10000hz.txt\n";
+    std::cout << "- fir_highpass_10000hz.txt\n";
+    std::cout << "- fir_bandpass_8000_12000hz.txt\n";
+    std::cout << "- fir_bandstop_8000_12000hz.txt\n";
+    std::cout << "- fir_lowpass_hann_10000hz.txt\n";
+    std::cout << "- fir_lowpass_hamming_10000hz.txt\n";
+    std::cout << "- fir_lowpass_blackman_10000hz.txt\n";
+    std::cout << "- plot_fir_responses.py\n\n";
+    std::cout << "Run: python3 plot_fir_responses.py (requires numpy, matplotlib, scipy)\n";
+}

From cf8851f813eb125bad4304ee0b8f029ae6482b76 Mon Sep 17 00:00:00 2001
From: kunitoki <kunitoki@gmail.com>
Date: Fri, 12 Sep 2025 09:50:12 +0200
Subject: [PATCH 36/37] Fix issues in FIR filter

---
 modules/yup_dsp/filters/yup_DirectFIR.h |  2 +-
 tests/yup_dsp/yup_DirectFIR.cpp         | 82 ++++++++++++-------------
 2 files changed, 42 insertions(+), 42 deletions(-)

diff --git a/modules/yup_dsp/filters/yup_DirectFIR.h b/modules/yup_dsp/filters/yup_DirectFIR.h
index cdedac031..5a2dea78b 100644
--- a/modules/yup_dsp/filters/yup_DirectFIR.h
+++ b/modules/yup_dsp/filters/yup_DirectFIR.h
@@ -111,7 +111,7 @@ class DirectFIR : public FilterBase<SampleType, CoeffType>
     void setCoefficients (std::vector<CoeffType> coefficients, CoeffType scaling = CoeffType (1))
     {
         currentScaling = scaling;
-        if (! approximatelyEqual (currentScaling, 1.0f))
+        if (! approximatelyEqual (currentScaling, CoeffType (1)))
             FloatVectorOperations::multiply (coefficients.data(), scaling, coefficients.size());
 
         coefficientsReversed = std::move (coefficients);
diff --git a/tests/yup_dsp/yup_DirectFIR.cpp b/tests/yup_dsp/yup_DirectFIR.cpp
index 9d79d8066..a58a5e3cd 100644
--- a/tests/yup_dsp/yup_DirectFIR.cpp
+++ b/tests/yup_dsp/yup_DirectFIR.cpp
@@ -118,7 +118,7 @@ class DirectFIRTest : public ::testing::Test
 
 TEST_F (DirectFIRTest, DefaultConstruction)
 {
-    DirectFIRFloat fir;
+    DirectFIR<float, float> fir;
 
     // Default state should be safe
     EXPECT_EQ (fir.getNumCoefficients(), 0);
@@ -128,7 +128,7 @@ TEST_F (DirectFIRTest, DefaultConstruction)
     // Should handle empty processing gracefully
     std::vector<float> input (256, 0.0f);
     std::vector<float> output (256, 0.0f);
-    EXPECT_NO_THROW (fir.process (input.data(), output.data(), input.size()));
+    EXPECT_NO_THROW (fir.processBlock (input.data(), output.data(), static_cast<int> (input.size())));
 
     // Output should remain zero without coefficients
     for (float sample : output)
@@ -137,12 +137,12 @@ TEST_F (DirectFIRTest, DefaultConstruction)
 
 TEST_F (DirectFIRTest, MoveSemantics)
 {
-    DirectFIRFloat fir1;
+    DirectFIR<float, float> fir1;
     std::vector<float> coefficients = { 1.0f, 0.5f, 0.25f };
     fir1.setCoefficients (coefficients, 2.0f);
 
     // Move constructor
-    DirectFIRFloat fir2 = std::move (fir1);
+    DirectFIR<float, float> fir2 = std::move (fir1);
 
     // Verify moved filter works
     EXPECT_EQ (fir2.getNumCoefficients(), 3);
@@ -157,7 +157,7 @@ TEST_F (DirectFIRTest, MoveSemantics)
     input[0] = 1.0f;
     std::vector<float> output (10, 0.0f);
 
-    EXPECT_NO_THROW (fir2.process (input.data(), output.data(), input.size()));
+    EXPECT_NO_THROW (fir2.processBlock (input.data(), output.data(), static_cast<int> (input.size())));
 
     // Should produce scaled output
     float outputSum = 0.0f;
@@ -166,7 +166,7 @@ TEST_F (DirectFIRTest, MoveSemantics)
     EXPECT_GT (outputSum, 1.0f); // Should be > 1 due to scaling
 
     // Move assignment
-    DirectFIRFloat fir3;
+    DirectFIR<float, float> fir3;
     fir3 = std::move (fir2);
 
     EXPECT_EQ (fir3.getNumCoefficients(), 3);
@@ -180,7 +180,7 @@ TEST_F (DirectFIRTest, MoveSemantics)
 
 TEST_F (DirectFIRTest, SetCoefficientsVector)
 {
-    DirectFIRFloat fir;
+    DirectFIR<float, float> fir;
     std::vector<float> coefficients = { 0.1f, 0.5f, 1.0f, 0.5f, 0.1f };
 
     fir.setCoefficients (coefficients, 1.0f);
@@ -196,7 +196,7 @@ TEST_F (DirectFIRTest, SetCoefficientsVector)
 
 TEST_F (DirectFIRTest, SetCoefficientsPointer)
 {
-    DirectFIRFloat fir;
+    DirectFIR<float, float> fir;
     float coefficients[] = { 0.2f, 0.4f, 0.6f, 0.8f };
 
     fir.setCoefficients (coefficients, 4, 2.0f);
@@ -208,7 +208,7 @@ TEST_F (DirectFIRTest, SetCoefficientsPointer)
 
 TEST_F (DirectFIRTest, SetCoefficientsNullptr)
 {
-    DirectFIRFloat fir;
+    DirectFIR<float, float> fir;
 
     // First set some valid coefficients
     std::vector<float> coefficients = { 1.0f, 0.5f };
@@ -223,7 +223,7 @@ TEST_F (DirectFIRTest, SetCoefficientsNullptr)
 
 TEST_F (DirectFIRTest, SetCoefficientsWithScaling)
 {
-    DirectFIRFloat fir;
+    DirectFIR<float, float> fir;
     std::vector<float> coefficients = { 1.0f, 1.0f, 1.0f };
 
     fir.setCoefficients (coefficients, 0.5f);
@@ -233,7 +233,7 @@ TEST_F (DirectFIRTest, SetCoefficientsWithScaling)
     input[0] = 2.0f; // Unit impulse scaled by 2
     std::vector<float> output (10, 0.0f);
 
-    fir.process (input.data(), output.data(), input.size());
+    fir.processBlock (input.data(), output.data(), static_cast<int> (input.size()));
 
     // Output should reflect the coefficient scaling
     // Each coefficient was originally 1.0, scaled by 0.5, so output per coefficient = 2.0 * 0.5 = 1.0
@@ -251,7 +251,7 @@ TEST_F (DirectFIRTest, SetCoefficientsWithScaling)
 
 TEST_F (DirectFIRTest, ImpulseResponse)
 {
-    DirectFIRFloat fir;
+    DirectFIR<float, float> fir;
     std::vector<float> coefficients = { 1.0f, 0.5f, 0.25f };
     fir.setCoefficients (coefficients);
 
@@ -260,7 +260,7 @@ TEST_F (DirectFIRTest, ImpulseResponse)
     input[0] = 1.0f;
     std::vector<float> output (10, 0.0f);
 
-    fir.process (input.data(), output.data(), input.size());
+    fir.processBlock (input.data(), output.data(), static_cast<int> (input.size()));
 
     // Should get the impulse response (coefficients in original order)
     EXPECT_NEAR (output[0], 1.0f, 0.001f);  // First coefficient h0
@@ -274,7 +274,7 @@ TEST_F (DirectFIRTest, ImpulseResponse)
 
 TEST_F (DirectFIRTest, AccumulativeOutput)
 {
-    DirectFIRFloat fir;
+    DirectFIR<float, float> fir;
     std::vector<float> coefficients = { 0.5f, 0.5f };
     fir.setCoefficients (coefficients);
 
@@ -285,7 +285,7 @@ TEST_F (DirectFIRTest, AccumulativeOutput)
     std::fill (output.begin(), output.end(), 1.0f);
     std::vector<float> originalOutput = output;
 
-    fir.process (input.data(), output.data(), input.size());
+    fir.processBlock (input.data(), output.data(), static_cast<int> (input.size()));
 
     // Output should contain original data plus filter result
     for (size_t i = 0; i < output.size(); ++i)
@@ -294,7 +294,7 @@ TEST_F (DirectFIRTest, AccumulativeOutput)
 
 TEST_F (DirectFIRTest, Linearity)
 {
-    DirectFIRFloat fir;
+    DirectFIR<float, float> fir;
     auto coefficients = FilterDesigner<float>::designFIRLowpass (32, 1000.0f, 44100.0f);
     fir.setCoefficients (coefficients);
 
@@ -309,10 +309,10 @@ TEST_F (DirectFIRTest, Linearity)
     std::vector<float> output2 (512, 0.0f);
 
     fir.reset();
-    fir.process (input.data(), output1.data(), input.size());
+    fir.processBlock (input.data(), output1.data(), static_cast<int> (input.size()));
 
     fir.reset();
-    fir.process (input2.data(), output2.data(), input2.size());
+    fir.processBlock (input2.data(), output2.data(), input2.size());
 
     // output2 should be approximately 2x output1
     for (size_t i = 0; i < output1.size(); ++i)
@@ -324,7 +324,7 @@ TEST_F (DirectFIRTest, Linearity)
 
 TEST_F (DirectFIRTest, Reset)
 {
-    DirectFIRFloat fir;
+    DirectFIR<float, float> fir;
     std::vector<float> coefficients = { 1.0f, 0.8f, 0.6f, 0.4f, 0.2f };
     fir.setCoefficients (coefficients);
 
@@ -333,12 +333,12 @@ TEST_F (DirectFIRTest, Reset)
     std::vector<float> output1 (20, 0.0f);
 
     // Process some data to build up internal state
-    fir.process (input.data(), output1.data(), input.size());
+    fir.processBlock (input.data(), output1.data(), static_cast<int> (input.size()));
 
     // Reset and process same input
     fir.reset();
     std::vector<float> output2 (20, 0.0f);
-    fir.process (input.data(), output2.data(), input.size());
+    fir.processBlock (input.data(), output2.data(), static_cast<int> (input.size()));
 
     // Outputs should be identical after reset
     for (size_t i = 0; i < output1.size(); ++i)
@@ -351,7 +351,7 @@ TEST_F (DirectFIRTest, Reset)
 
 TEST_F (DirectFIRTest, LowpassFiltering)
 {
-    DirectFIRFloat fir;
+    DirectFIR<float, float> fir;
 
     // Create lowpass filter coefficients
     auto coefficients = FilterDesigner<float>::designFIRLowpass (64, 1000.0f, 44100.0);
@@ -365,7 +365,7 @@ TEST_F (DirectFIRTest, LowpassFiltering)
     fillWithSine (lowFreqInput, 500.0f, sampleRate);
     std::vector<float> lowFreqOutput (bufferSize, 0.0f);
 
-    fir.process (lowFreqInput.data(), lowFreqOutput.data(), bufferSize);
+    fir.processBlock (lowFreqInput.data(), lowFreqOutput.data(), bufferSize);
 
     // Test with high frequency (should be attenuated)
     fir.reset();
@@ -373,7 +373,7 @@ TEST_F (DirectFIRTest, LowpassFiltering)
     fillWithSine (highFreqInput, 5000.0f, sampleRate);
     std::vector<float> highFreqOutput (bufferSize, 0.0f);
 
-    fir.process (highFreqInput.data(), highFreqOutput.data(), bufferSize);
+    fir.processBlock (highFreqInput.data(), highFreqOutput.data(), bufferSize);
 
     // Compare RMS levels (skip first samples due to transient)
     const size_t skipSamples = 100;
@@ -394,7 +394,7 @@ TEST_F (DirectFIRTest, LowpassFiltering)
 
 TEST_F (DirectFIRTest, BlockSizeIndependence)
 {
-    DirectFIRFloat fir;
+    DirectFIR<float, float> fir;
     auto coefficients = FilterDesigner<float>::designFIRLowpass (48, 2000.0f, 44100.0);
     fir.setCoefficients (coefficients);
 
@@ -405,7 +405,7 @@ TEST_F (DirectFIRTest, BlockSizeIndependence)
     // Process in one big block
     fir.reset();
     std::vector<float> output1 (totalSamples, 0.0f);
-    fir.process (input.data(), output1.data(), totalSamples);
+    fir.processBlock (input.data(), output1.data(), totalSamples);
 
     // Process in smaller blocks
     fir.reset();
@@ -424,7 +424,7 @@ TEST_F (DirectFIRTest, BlockSizeIndependence)
         if (blockSize == 0)
             break;
 
-        fir.process (input.data() + processed, output2.data() + processed, blockSize);
+        fir.processBlock (input.data() + processed, output2.data() + processed, blockSize);
         processed += blockSize;
     }
 
@@ -433,7 +433,7 @@ TEST_F (DirectFIRTest, BlockSizeIndependence)
     {
         size_t remaining = totalSamples - processed;
         size_t blockSize = std::min (remaining, size_t (128)); // Process in chunks of 128
-        fir.process (input.data() + processed, output2.data() + processed, blockSize);
+        fir.processBlock (input.data() + processed, output2.data() + processed, blockSize);
         processed += blockSize;
     }
 
@@ -448,7 +448,7 @@ TEST_F (DirectFIRTest, BlockSizeIndependence)
 
 TEST_F (DirectFIRTest, ZeroSamples)
 {
-    DirectFIRFloat fir;
+    DirectFIR<float, float> fir;
     std::vector<float> coefficients = { 1.0f, 0.5f };
     fir.setCoefficients (coefficients);
 
@@ -456,7 +456,7 @@ TEST_F (DirectFIRTest, ZeroSamples)
     std::vector<float> output (10, 0.0f);
 
     // Processing zero samples should be safe
-    EXPECT_NO_THROW (fir.process (input.data(), output.data(), 0));
+    EXPECT_NO_THROW (fir.processBlock (input.data(), output.data(), 0));
 
     // Output should remain unchanged
     for (float sample : output)
@@ -465,25 +465,25 @@ TEST_F (DirectFIRTest, ZeroSamples)
 
 TEST_F (DirectFIRTest, NullPointers)
 {
-    DirectFIRFloat fir;
+    DirectFIR<float, float> fir;
     std::vector<float> coefficients = { 1.0f };
     fir.setCoefficients (coefficients);
 
     std::vector<float> buffer (10, 0.0f);
 
     // Null input pointer should be handled gracefully
-    EXPECT_NO_THROW (fir.process (nullptr, buffer.data(), 10));
+    EXPECT_NO_THROW (fir.processBlock (nullptr, buffer.data(), 10));
 
     // Null output pointer should be handled gracefully
-    EXPECT_NO_THROW (fir.process (buffer.data(), nullptr, 10));
+    EXPECT_NO_THROW (fir.processBlock (buffer.data(), nullptr, 10));
 
     // Both null should be handled gracefully
-    EXPECT_NO_THROW (fir.process (nullptr, nullptr, 10));
+    EXPECT_NO_THROW (fir.processBlock (nullptr, nullptr, 10));
 }
 
 TEST_F (DirectFIRTest, LargeTapCounts)
 {
-    DirectFIRFloat fir;
+    DirectFIR<float, float> fir;
 
     // Test with relatively large number of coefficients
     std::vector<float> coefficients (512);
@@ -498,7 +498,7 @@ TEST_F (DirectFIRTest, LargeTapCounts)
     std::vector<float> output (1024, 0.0f);
     fillWithRandomData (input);
 
-    EXPECT_NO_THROW (fir.process (input.data(), output.data(), input.size()));
+    EXPECT_NO_THROW (fir.processBlock (input.data(), output.data(), static_cast<int> (input.size())));
 
     // Should produce reasonable output
     float rms = calculateRMS (output);
@@ -508,7 +508,7 @@ TEST_F (DirectFIRTest, LargeTapCounts)
 
 TEST_F (DirectFIRTest, SingleTap)
 {
-    DirectFIRFloat fir;
+    DirectFIR<float, float> fir;
     std::vector<float> coefficients = { 0.75f };
     fir.setCoefficients (coefficients);
 
@@ -518,7 +518,7 @@ TEST_F (DirectFIRTest, SingleTap)
     std::vector<float> input = { 1.0f, 2.0f, 3.0f, 4.0f, 5.0f };
     std::vector<float> output (5, 0.0f);
 
-    fir.process (input.data(), output.data(), input.size());
+    fir.processBlock (input.data(), output.data(), static_cast<int> (input.size()));
 
     for (size_t i = 0; i < input.size(); ++i)
         EXPECT_NEAR (output[i], input[i] * 0.75f, 0.001f);
@@ -530,7 +530,7 @@ TEST_F (DirectFIRTest, SingleTap)
 
 TEST_F (DirectFIRTest, MemoryAlignment)
 {
-    DirectFIRFloat fir;
+    DirectFIR<float, float> fir;
 
     // Coefficient count that's not a multiple of 4
     std::vector<float> coefficients (37);
@@ -549,7 +549,7 @@ TEST_F (DirectFIRTest, MemoryAlignment)
 
 TEST_F (DirectFIRTest, StressTest)
 {
-    DirectFIRFloat fir;
+    DirectFIR<float, float> fir;
 
     // Create complex impulse response
     std::vector<float> coefficients (256);
@@ -571,7 +571,7 @@ TEST_F (DirectFIRTest, StressTest)
         std::vector<float> output (blockSize, 0.0f);
         fillWithRandomData (input);
 
-        EXPECT_NO_THROW (fir.process (input.data(), output.data(), blockSize));
+        EXPECT_NO_THROW (fir.processBlock (input.data(), output.data(), blockSize));
 
         // Verify output quality
         for (float sample : output)

From 0a4b74ab87c9145195f6598d323b7c8311455fd0 Mon Sep 17 00:00:00 2001
From: kunitoki <kunitoki@gmail.com>
Date: Sun, 12 Oct 2025 11:30:07 +0200
Subject: [PATCH 37/37] More FIR specializations

---
 .../graphics/source/examples/FilterDemo.h     | 226 +++++++++++++++++-
 .../convolution/yup_PartitionedConvolver.cpp  |  10 +-
 .../yup_dsp/designers/yup_FilterDesigner.cpp  |  48 ++--
 .../yup_dsp/designers/yup_FilterDesigner.h    |  24 +-
 modules/yup_dsp/filters/yup_DirectFIR.h       |  65 ++---
 .../yup_dsp/windowing/yup_WindowFunctions.h   |  82 +++++--
 tests/yup_dsp/yup_DirectFIR.cpp               |  17 +-
 tests/yup_dsp/yup_FilterDesigner.cpp          |  45 ++--
 8 files changed, 402 insertions(+), 115 deletions(-)

diff --git a/examples/graphics/source/examples/FilterDemo.h b/examples/graphics/source/examples/FilterDemo.h
index 317fd98c2..8c16d7b56 100644
--- a/examples/graphics/source/examples/FilterDemo.h
+++ b/examples/graphics/source/examples/FilterDemo.h
@@ -951,6 +951,7 @@ class FilterDemo
         filterTypeCombo->addItem ("State Variable", 3);
         filterTypeCombo->addItem ("First Order", 4);
         filterTypeCombo->addItem ("Butterworth", 5);
+        filterTypeCombo->addItem ("FIR Filter", 6);
         filterTypeCombo->setSelectedId (1);
         filterTypeCombo->onSelectedItemChanged = [this]
         {
@@ -976,6 +977,42 @@ class FilterDemo
         };
         addAndMakeVisible (*responseTypeCombo);
 
+        // FIR-specific controls
+        firCoefficientsSlider = std::make_unique<yup::Slider> (yup::Slider::LinearBarHorizontal, "FIR Length");
+        firCoefficientsSlider->setRange ({ 16.0, 256.0 });
+        firCoefficientsSlider->setValue (64.0);
+        firCoefficientsSlider->onValueChanged = [this] (float value)
+        {
+            updateAnalysisDisplays();
+        };
+        addAndMakeVisible (*firCoefficientsSlider);
+
+        firWindowCombo = std::make_unique<yup::ComboBox> ("FIR Window");
+        firWindowCombo->addItem ("Hann", 1);
+        firWindowCombo->addItem ("Hamming", 2);
+        firWindowCombo->addItem ("Blackman", 3);
+        firWindowCombo->addItem ("Kaiser", 4);
+        firWindowCombo->addItem ("Rectangle", 5);
+        firWindowCombo->addItem ("Rakshit-Ullah", 6);
+        firWindowCombo->setSelectedId (1);
+        firWindowCombo->onSelectedItemChanged = [this]
+        {
+            updateWindowParameterRange();
+            updateAnalysisDisplays();
+        };
+        addAndMakeVisible (*firWindowCombo);
+
+        // FIR window parameter control (for adjustable windows like Kaiser and Rakshit-Ullah)
+        firWindowParameterSlider = std::make_unique<yup::Slider> (yup::Slider::LinearBarHorizontal, "Window Parameter");
+        firWindowParameterSlider->setRange ({ 0.0005, 10.0 });
+        firWindowParameterSlider->setSkewFactorFromMidpoint (1.0);
+        firWindowParameterSlider->setValue (1.0);
+        firWindowParameterSlider->onValueChanged = [this] (float value)
+        {
+            updateAnalysisDisplays();
+        };
+        addAndMakeVisible (*firWindowParameterSlider);
+
         // Parameter controls with smoothed parameter updates
         frequencySlider = std::make_unique<yup::Slider> (yup::Slider::LinearBarHorizontal, "Frequency");
         frequencySlider->setRange ({ 20.0, 20000.0 });
@@ -1066,7 +1103,7 @@ class FilterDemo
         // Labels for parameter controls
         auto font = yup::ApplicationTheme::getGlobalTheme()->getDefaultFont().withHeight (10.0f);
 
-        for (const auto& labelText : { "Filter Type:", "Response Type:", "Frequency:", "Frequency 2:", "Q/Resonance:", "Gain (dB):", "Order:", "Noise Level:", "Output Level:" })
+        for (const auto& labelText : { "Filter Type:", "Response Type:", "Frequency:", "Frequency 2:", "Q/Resonance:", "Gain (dB):", "Order:", "FIR Length:", "FIR Window:", "Window Param:", "Noise Level:", "Output Level:" })
         {
             auto label = parameterLabels.add (std::make_unique<yup::Label> (labelText));
             label->setText (labelText);
@@ -1092,8 +1129,11 @@ class FilterDemo
             { parameterLabels[4], qSlider.get() },
             { parameterLabels[5], gainSlider.get() },
             { parameterLabels[6], orderSlider.get() },
-            { parameterLabels[7], noiseGainSlider.get() },
-            { parameterLabels[8], outputGainSlider.get() }
+            { parameterLabels[7], firCoefficientsSlider.get() },
+            { parameterLabels[8], firWindowCombo.get() },
+            { parameterLabels[9], firWindowParameterSlider.get() },
+            { parameterLabels[10], noiseGainSlider.get() },
+            { parameterLabels[11], outputGainSlider.get() }
         };
 
         for (auto& [label, component] : layouts)
@@ -1114,6 +1154,7 @@ class FilterDemo
         audioSvf = std::make_shared<yup::StateVariableFilter<float>>();
         audioFirstOrder = std::make_shared<yup::FirstOrderFilter<float>>();
         audioButterworthFilter = std::make_shared<yup::ButterworthFilter<float>>();
+        audioDirectFIR = std::make_shared<yup::DirectFIR<float>>();
 
         // Create instances of all filter types for UI thread
         uiRbj = std::make_shared<yup::RbjFilter<float>>();
@@ -1121,14 +1162,15 @@ class FilterDemo
         uiSvf = std::make_shared<yup::StateVariableFilter<float>>();
         uiFirstOrder = std::make_shared<yup::FirstOrderFilter<float>>();
         uiButterworthFilter = std::make_shared<yup::ButterworthFilter<float>>();
+        uiDirectFIR = std::make_shared<yup::DirectFIR<float>>();
 
         // Store in arrays for easy management
         allAudioFilters = {
-            audioRbj, audioZoelzer, audioSvf, audioFirstOrder, audioButterworthFilter
+            audioRbj, audioZoelzer, audioSvf, audioFirstOrder, audioButterworthFilter, audioDirectFIR
         };
 
         allUIFilters = {
-            uiRbj, uiZoelzer, uiSvf, uiFirstOrder, uiButterworthFilter
+            uiRbj, uiZoelzer, uiSvf, uiFirstOrder, uiButterworthFilter, uiDirectFIR
         };
 
         // Set default filters
@@ -1144,7 +1186,9 @@ class FilterDemo
     {
         noiseGeneratorAmplitude.setCurrentAndTargetValue (0.1f);
         outputGain.setCurrentAndTargetValue (0.5f);
+        updateWindowParameterRange(); // Set initial window parameter range
         updateCurrentFilter();
+        updateControlVisibility(); // Set initial visibility
     }
 
     void updateCurrentFilter()
@@ -1171,6 +1215,9 @@ class FilterDemo
             case 5:
                 currentUIFilter = uiButterworthFilter;
                 break;
+            case 6:
+                currentUIFilter = uiDirectFIR;
+                break;
             default:
                 currentUIFilter = uiRbj;
                 break;
@@ -1189,6 +1236,9 @@ class FilterDemo
         // Update UI filter with current parameters
         updateUIFilterParameters();
 
+        // Update control visibility based on filter type
+        updateControlVisibility();
+
         // Update displays using UI filter
         frequencyResponsePlot.setFilter (currentUIFilter);
         frequencyResponsePlot.updateResponseData();
@@ -1201,12 +1251,12 @@ class FilterDemo
             return;
 
         double freq = smoothedFrequency.getNextValue();
-        double freq2 = smoothedFrequency2.getNextValue();
+        double freq2 = yup::jmax (freq, (double) smoothedFrequency2.getNextValue());
         double q = smoothedQ.getNextValue();
         double gain = smoothedGain.getNextValue();
         int order = yup::jlimit (2, 16, static_cast<int> (smoothedOrder.getNextValue()));
 
-        updateFilterParameters (currentAudioFilter.get(), freq, freq2, q, gain, order);
+        updateFilterParameters (currentAudioFilter.get(), firCoefficients, freq, freq2, q, gain, order);
     }
 
     void updateUIFilterParameters()
@@ -1215,15 +1265,15 @@ class FilterDemo
             return;
 
         double freq = frequencySlider->getValue();
-        double freq2 = frequency2Slider->getValue();
+        double freq2 = yup::jmax (freq, frequency2Slider->getValue());
         double q = qSlider->getValue();
         double gain = gainSlider->getValue();
         int order = yup::jlimit (2, 16, static_cast<int> (orderSlider->getValue()));
 
-        updateFilterParameters (currentUIFilter.get(), freq, freq2, q, gain, order);
+        updateFilterParameters (currentUIFilter.get(), firCoefficientsUI, freq, freq2, q, gain, order);
     }
 
-    void updateFilterParameters (yup::FilterBase<float>* filter, double freq, double freq2, double q, double gain, int order)
+    void updateFilterParameters (yup::FilterBase<float>* filter, std::vector<double>& coefficients, double freq, double freq2, double q, double gain, int order)
     {
         // Update parameters based on filter type using direct UI values
         if (auto rf = dynamic_cast<yup::RbjFilter<float>*> (filter))
@@ -1246,6 +1296,10 @@ class FilterDemo
         {
             bf->setParameters (getFilterMode (currentResponseTypeId), order, freq, yup::jmax (freq2, freq * 1.01), currentSampleRate);
         }
+        else if (auto fir = dynamic_cast<yup::DirectFIR<float>*> (filter))
+        {
+            updateFIRFilterParameters (fir, coefficients, freq, freq2);
+        }
     }
 
     void updateCurrentAudioFilter()
@@ -1268,6 +1322,9 @@ class FilterDemo
             case 5:
                 currentAudioFilter = audioButterworthFilter;
                 break;
+            case 6:
+                currentAudioFilter = audioDirectFIR;
+                break;
             default:
                 currentAudioFilter = audioRbj;
                 break;
@@ -1344,6 +1401,147 @@ class FilterDemo
         polesZerosDisplay.updatePolesZeros (poles, zeros);
     }
 
+    void updateControlVisibility()
+    {
+        // Shows only the controls that apply to the selected filter type and rebuilds the response type list
+        const bool isFIRFilter = (currentFilterTypeId.load() == 6);
+        const int previousResponse = responseTypeCombo->getSelectedId();
+
+        // Rebuild the list first: the frequency 2 visibility below depends on the final response type
+        responseTypeCombo->clear();
+        responseTypeCombo->addItem ("Lowpass", 1);
+        responseTypeCombo->addItem ("Highpass", 2);
+
+        if (isFIRFilter)
+        {
+            // Only the responses that updateFIRFilterParameters has a dedicated design for
+            responseTypeCombo->addItem ("Bandpass", 3);
+            responseTypeCombo->addItem ("Bandstop", 5);
+            if (previousResponse == 1 || previousResponse == 2 || previousResponse == 3 || previousResponse == 5)
+            {
+                responseTypeCombo->setSelectedId (previousResponse, yup::dontSendNotification);
+            }
+            else
+            {
+                // The selection changes silently, so sync the stored id or the filter keeps the unlisted response
+                responseTypeCombo->setSelectedId (1, yup::dontSendNotification);
+                currentResponseTypeId = 1;
+            }
+        }
+        else
+        {
+            // Every other filter type offers the full response type list
+            responseTypeCombo->addItem ("Bandpass CSG", 3);
+            responseTypeCombo->addItem ("Bandpass CPG", 4);
+            responseTypeCombo->addItem ("Bandstop", 5);
+            responseTypeCombo->addItem ("Peak", 6);
+            responseTypeCombo->addItem ("Low Shelf", 7);
+            responseTypeCombo->addItem ("High Shelf", 8);
+            responseTypeCombo->addItem ("Allpass", 9);
+            responseTypeCombo->setSelectedId (previousResponse, yup::dontSendNotification);
+        }
+
+        // FIR-specific controls
+        firCoefficientsSlider->setVisible (isFIRFilter);
+        firWindowCombo->setVisible (isFIRFilter);
+        parameterLabels[7]->setVisible (isFIRFilter); // FIR Length label
+        parameterLabels[8]->setVisible (isFIRFilter); // FIR Window label
+
+        // The window parameter only applies to the adjustable windows: Kaiser (4) and Rakshit-Ullah (6)
+        const int windowId = firWindowCombo->getSelectedId();
+        const bool needsWindowParameter = isFIRFilter && (windowId == 4 || windowId == 6);
+        firWindowParameterSlider->setVisible (needsWindowParameter);
+        parameterLabels[9]->setVisible (needsWindowParameter); // Window Parameter label
+
+        // Q, gain and order are hidden for FIR and shown for every other filter type
+        qSlider->setVisible (! isFIRFilter);
+        gainSlider->setVisible (! isFIRFilter);
+        orderSlider->setVisible (! isFIRFilter);
+        parameterLabels[4]->setVisible (! isFIRFilter); // Q label
+        parameterLabels[5]->setVisible (! isFIRFilter); // Gain label
+        parameterLabels[6]->setVisible (! isFIRFilter); // Order label
+
+        // Frequency 2 is only visible for bandpass/bandstop responses (ids 3 to 5)
+        const int responseTypeId = currentResponseTypeId.load();
+        const bool needsFreq2 = (responseTypeId >= 3 && responseTypeId <= 5);
+        frequency2Slider->setVisible (needsFreq2);
+        parameterLabels[3]->setVisible (needsFreq2); // Frequency 2 label
+
+        repaint();
+    }
+
+    void updateWindowParameterRange()
+    {
+        // Reconfigures the window parameter slider for the selected FIR window
+        const int windowId = firWindowCombo->getSelectedId();
+        if (windowId == 4)
+        {
+            // Kaiser: the slider holds the beta parameter
+            firWindowParameterSlider->setRange ({ 0.0, 20.0 });
+            firWindowParameterSlider->setSkewFactorFromMidpoint (8.0);
+            firWindowParameterSlider->setValue (8.0);
+        }
+        else if (windowId == 6)
+        {
+            // Rakshit-Ullah: the slider holds the r parameter
+            firWindowParameterSlider->setRange ({ 0.0001, 100.0 });
+            firWindowParameterSlider->setSkewFactorFromMidpoint (1.0);
+            firWindowParameterSlider->setValue (1.0);
+        }
+        else
+        {
+            // Remaining windows do not use the parameter; the skew factor is left as it was
+            firWindowParameterSlider->setRange ({ 0.0, 10.0 });
+            firWindowParameterSlider->setValue (1.0);
+        }
+
+        updateControlVisibility();
+    }
+
+    void updateFIRFilterParameters (yup::DirectFIR<float>* fir, std::vector<double>& coeffs, double freq, double freq2)
+    {
+        // Designs the windowed FIR for the current response type into coeffs, then loads it into fir
+        using Designer = yup::FilterDesigner<double>;
+        const int length = static_cast<int> (firCoefficientsSlider->getValue());
+        const auto window = getFIRWindowType (firWindowCombo->getSelectedId());
+        const auto mode = getFilterMode (currentResponseTypeId);
+        const double param = firWindowParameterSlider->getValue(); // Kaiser / Rakshit-Ullah window parameter
+
+        if (mode.test (yup::FilterMode::lowpass))
+            Designer::designFIRLowpass (coeffs, length, freq, currentSampleRate, window, param);
+        else if (mode.test (yup::FilterMode::highpass))
+            Designer::designFIRHighpass (coeffs, length, freq, currentSampleRate, window, param);
+        else if (mode.test (yup::FilterMode::bandpassCsg | yup::FilterMode::bandpassCpg))
+            Designer::designFIRBandpass (coeffs, length, freq, freq2, currentSampleRate, window, param);
+        else if (mode.test (yup::FilterMode::bandstop))
+            Designer::designFIRBandstop (coeffs, length, freq, freq2, currentSampleRate, window, param);
+        else
+            Designer::designFIRLowpass (coeffs, length, freq, currentSampleRate, window, param); // Any other response falls back to lowpass
+
+        fir->setCoefficients (coeffs.data(), coeffs.size());
+    }
+
+    yup::WindowType getFIRWindowType (int windowId)
+    {
+        // Maps a "FIR Window" combo item id onto the corresponding yup::WindowType
+        switch (windowId)
+        {
+            case 2:
+                return yup::WindowType::hamming;
+            case 3:
+                return yup::WindowType::blackman;
+            case 4:
+                return yup::WindowType::kaiser;
+            case 5:
+                return yup::WindowType::rectangular;
+            case 6:
+                return yup::WindowType::rakshitUllah;
+            default:
+                break;
+        }
+        return yup::WindowType::hann; // Id 1 (Hann) and any unrecognised id
+    }
+
     yup::FilterModeType getFilterMode (int responseTypeId)
     {
         switch (responseTypeId)
@@ -1391,6 +1589,9 @@ class FilterDemo
     std::vector<std::complex<double>> poles;
     std::vector<std::complex<double>> zeros;
 
+    std::vector<double> firCoefficients = std::vector<double> (512, 0.0); // 512 zeros; braces would build the 2-element list { 512, 0 }
+    std::vector<double> firCoefficientsUI = std::vector<double> (512, 0.0); // Same pre-sizing for the UI-side design buffer
+
     // Filter type settings (thread-safe storage)
     std::atomic<int> currentFilterTypeId { 1 };
     std::atomic<int> currentResponseTypeId { 1 };
@@ -1401,6 +1602,7 @@ class FilterDemo
     std::shared_ptr<yup::StateVariableFilter<float>> audioSvf;
     std::shared_ptr<yup::FirstOrderFilter<float>> audioFirstOrder;
     std::shared_ptr<yup::ButterworthFilter<float>> audioButterworthFilter;
+    std::shared_ptr<yup::DirectFIR<float>> audioDirectFIR;
 
     // UI thread filter instances
     std::shared_ptr<yup::RbjFilter<float>> uiRbj;
@@ -1408,6 +1610,7 @@ class FilterDemo
     std::shared_ptr<yup::StateVariableFilter<float>> uiSvf;
     std::shared_ptr<yup::FirstOrderFilter<float>> uiFirstOrder;
     std::shared_ptr<yup::ButterworthFilter<float>> uiButterworthFilter;
+    std::shared_ptr<yup::DirectFIR<float>> uiDirectFIR;
 
     std::vector<std::shared_ptr<yup::FilterBase<float>>> allAudioFilters;
     std::vector<std::shared_ptr<yup::FilterBase<float>>> allUIFilters;
@@ -1423,6 +1626,9 @@ class FilterDemo
     std::unique_ptr<yup::Slider> qSlider;
     std::unique_ptr<yup::Slider> gainSlider;
     std::unique_ptr<yup::Slider> orderSlider;
+    std::unique_ptr<yup::Slider> firCoefficientsSlider;
+    std::unique_ptr<yup::ComboBox> firWindowCombo;
+    std::unique_ptr<yup::Slider> firWindowParameterSlider;
     std::unique_ptr<yup::Slider> noiseGainSlider;
     std::unique_ptr<yup::Slider> outputGainSlider;
     yup::OwnedArray<yup::Label> parameterLabels;
diff --git a/modules/yup_dsp/convolution/yup_PartitionedConvolver.cpp b/modules/yup_dsp/convolution/yup_PartitionedConvolver.cpp
index f5be2c0fb..ac027993e 100644
--- a/modules/yup_dsp/convolution/yup_PartitionedConvolver.cpp
+++ b/modules/yup_dsp/convolution/yup_PartitionedConvolver.cpp
@@ -563,16 +563,8 @@ class PartitionedConvolver::Impl
                 trimmedLength = trimSilenceFromEnd (impulseResponse, length, *options.trimEndSilenceBelowDb);
 
             // Update DirectFIR in-place
-            std::vector<float> directCoefficients;
-
             const auto directCoefficientsCount = std::min (directFIRCoefficientCount, trimmedLength);
-            if (directCoefficientsCount > 0)
-            {
-                directCoefficients.reserve (directCoefficientsCount);
-                directCoefficients.assign (impulseResponse, impulseResponse + directCoefficientsCount);
-            }
-
-            newFIR.setCoefficients (std::move (directCoefficients), headroomScale);
+            newFIR.setCoefficients (impulseResponse, directCoefficientsCount, headroomScale);
 
             // Update FFT layers
             std::size_t consumed = directCoefficientsCount;
diff --git a/modules/yup_dsp/designers/yup_FilterDesigner.cpp b/modules/yup_dsp/designers/yup_FilterDesigner.cpp
index a92949a08..2972eb0c0 100644
--- a/modules/yup_dsp/designers/yup_FilterDesigner.cpp
+++ b/modules/yup_dsp/designers/yup_FilterDesigner.cpp
@@ -630,11 +630,13 @@ int FilterDesigner<CoeffType>::designLinkwitzRiley (
 //==============================================================================
 
 template <typename CoeffType>
-std::vector<CoeffType> FilterDesigner<CoeffType>::designFIRLowpass (
+void FilterDesigner<CoeffType>::designFIRLowpass (
+    std::vector<CoeffType>& coefficients,
     int numCoefficients,
     CoeffType cutoffFreq,
     double sampleRate,
-    WindowType windowType) noexcept
+    WindowType windowType,
+    CoeffType windowParameter) noexcept
 {
     jassert (numCoefficients > 0);
     jassert (cutoffFreq > static_cast<CoeffType> (0.0));
@@ -642,7 +644,7 @@ std::vector<CoeffType> FilterDesigner<CoeffType>::designFIRLowpass (
     jassert (cutoffFreq < static_cast<CoeffType> (sampleRate / 2.0));
 
     numCoefficients = nextOdd (numCoefficients);
-    std::vector<CoeffType> coefficients (numCoefficients);
+    coefficients.resize (numCoefficients);
 
     const auto normalizedCutoff = static_cast<CoeffType> (2.0) * cutoffFreq / static_cast<CoeffType> (sampleRate);
     const int center = (numCoefficients - 1) / 2;
@@ -664,25 +666,27 @@ std::vector<CoeffType> FilterDesigner<CoeffType>::designFIRLowpass (
     // Apply window function
     for (int i = 0; i < numCoefficients; ++i)
     {
-        const auto windowValue = WindowFunctions<CoeffType>::getValue (windowType, i, numCoefficients);
+        const auto windowValue = WindowFunctions<CoeffType>::getValue (windowType, i, numCoefficients, windowParameter);
         coefficients[i] *= windowValue;
     }
 
     // Normalization
     const auto sum = std::accumulate (coefficients.begin(), coefficients.end(), static_cast<CoeffType> (0.0));
     if (sum != static_cast<CoeffType> (0.0))
+    {
         for (auto& c : coefficients)
             c /= sum;
-
-    return coefficients;
+    }
 }
 
 template <typename CoeffType>
-std::vector<CoeffType> FilterDesigner<CoeffType>::designFIRHighpass (
+void FilterDesigner<CoeffType>::designFIRHighpass (
+    std::vector<CoeffType>& coefficients,
     int numCoefficients,
     CoeffType cutoffFreq,
     double sampleRate,
-    WindowType windowType) noexcept
+    WindowType windowType,
+    CoeffType windowParameter) noexcept
 {
     jassert (numCoefficients > 0);
     jassert (cutoffFreq > static_cast<CoeffType> (0.0));
@@ -691,7 +695,7 @@ std::vector<CoeffType> FilterDesigner<CoeffType>::designFIRHighpass (
 
     // Generate lowpass first
     numCoefficients = nextOdd (numCoefficients);
-    auto coefficients = designFIRLowpass (numCoefficients, cutoffFreq, sampleRate, windowType);
+    designFIRLowpass (coefficients, numCoefficients, cutoffFreq, sampleRate, windowType, windowParameter);
 
     // Convert to highpass using spectral inversion
     const int center = (numCoefficients - 1) / 2;
@@ -707,19 +711,21 @@ std::vector<CoeffType> FilterDesigner<CoeffType>::designFIRHighpass (
         hpi += coefficients[n] * ((n & 1) ? static_cast<CoeffType> (-1.0) : static_cast<CoeffType> (1.0));
 
     if (hpi != static_cast<CoeffType> (0.0))
+    {
         for (auto& c : coefficients)
             c /= hpi;
-
-    return coefficients;
+    }
 }
 
 template <typename CoeffType>
-std::vector<CoeffType> FilterDesigner<CoeffType>::designFIRBandpass (
+void FilterDesigner<CoeffType>::designFIRBandpass (
+    std::vector<CoeffType>& coefficients,
     int numCoefficients,
     CoeffType lowCutoffFreq,
     CoeffType highCutoffFreq,
     double sampleRate,
-    WindowType windowType) noexcept
+    WindowType windowType,
+    CoeffType windowParameter) noexcept
 {
     jassert (numCoefficients > 0);
     jassert (lowCutoffFreq > static_cast<CoeffType> (0.0));
@@ -728,7 +734,7 @@ std::vector<CoeffType> FilterDesigner<CoeffType>::designFIRBandpass (
     jassert (highCutoffFreq < static_cast<CoeffType> (sampleRate / 2.0));
 
     numCoefficients = nextOdd (numCoefficients);
-    std::vector<CoeffType> coefficients (numCoefficients);
+    coefficients.resize (numCoefficients);
 
     const auto normalizedLow = static_cast<CoeffType> (2.0) * lowCutoffFreq / static_cast<CoeffType> (sampleRate);
     const auto normalizedHigh = static_cast<CoeffType> (2.0) * highCutoffFreq / static_cast<CoeffType> (sampleRate);
@@ -754,20 +760,20 @@ std::vector<CoeffType> FilterDesigner<CoeffType>::designFIRBandpass (
     // Apply window function
     for (int i = 0; i < numCoefficients; ++i)
     {
-        const auto windowValue = WindowFunctions<CoeffType>::getValue (windowType, i, numCoefficients);
+        const auto windowValue = WindowFunctions<CoeffType>::getValue (windowType, i, numCoefficients, windowParameter);
         coefficients[i] *= windowValue;
     }
-
-    return coefficients;
 }
 
 template <typename CoeffType>
-std::vector<CoeffType> FilterDesigner<CoeffType>::designFIRBandstop (
+void FilterDesigner<CoeffType>::designFIRBandstop (
+    std::vector<CoeffType>& coefficients,
     int numCoefficients,
     CoeffType lowCutoffFreq,
     CoeffType highCutoffFreq,
     double sampleRate,
-    WindowType windowType) noexcept
+    WindowType windowType,
+    CoeffType windowParameter) noexcept
 {
     jassert (numCoefficients > 0);
     jassert (lowCutoffFreq > static_cast<CoeffType> (0.0));
@@ -777,7 +783,7 @@ std::vector<CoeffType> FilterDesigner<CoeffType>::designFIRBandstop (
 
     // Generate bandpass first
     numCoefficients = nextOdd (numCoefficients);
-    auto coefficients = designFIRBandpass (numCoefficients, lowCutoffFreq, highCutoffFreq, sampleRate, windowType);
+    designFIRBandpass (coefficients, numCoefficients, lowCutoffFreq, highCutoffFreq, sampleRate, windowType, windowParameter);
 
     // Convert to bandstop using spectral inversion
     const int center = (numCoefficients - 1) / 2;
@@ -787,8 +793,6 @@ std::vector<CoeffType> FilterDesigner<CoeffType>::designFIRBandstop (
 
     // Add unit impulse at center
     coefficients[center] += static_cast<CoeffType> (1.0);
-
-    return coefficients;
 }
 
 //==============================================================================
diff --git a/modules/yup_dsp/designers/yup_FilterDesigner.h b/modules/yup_dsp/designers/yup_FilterDesigner.h
index baed267a2..f74b7097d 100644
--- a/modules/yup_dsp/designers/yup_FilterDesigner.h
+++ b/modules/yup_dsp/designers/yup_FilterDesigner.h
@@ -640,11 +640,13 @@ class FilterDesigner
 
         @returns               Vector of FIR coefficients suitable for DirectFIR
     */
-    static std::vector<CoeffType> designFIRLowpass (
+    static void designFIRLowpass (
+        std::vector<CoeffType>& coefficients,
         int numCoefficients,
         CoeffType cutoffFreq,
         double sampleRate,
-        WindowType windowType = WindowType::hann) noexcept;
+        WindowType windowType = WindowType::hann,
+        CoeffType windowParameter = CoeffType (8)) noexcept;
 
     /**
         Designs FIR highpass filter coefficients using windowed sinc method.
@@ -656,11 +658,13 @@ class FilterDesigner
 
         @returns               Vector of FIR coefficients suitable for DirectFIR
     */
-    static std::vector<CoeffType> designFIRHighpass (
+    static void designFIRHighpass (
+        std::vector<CoeffType>& coefficients,
         int numCoefficients,
         CoeffType cutoffFreq,
         double sampleRate,
-        WindowType windowType = WindowType::hann) noexcept;
+        WindowType windowType = WindowType::hann,
+        CoeffType windowParameter = CoeffType (8)) noexcept;
 
     /**
         Designs FIR bandpass filter coefficients using windowed sinc method.
@@ -673,12 +677,14 @@ class FilterDesigner
 
         @returns               Vector of FIR coefficients suitable for DirectFIR
     */
-    static std::vector<CoeffType> designFIRBandpass (
+    static void designFIRBandpass (
+        std::vector<CoeffType>& coefficients,
         int numCoefficients,
         CoeffType lowCutoffFreq,
         CoeffType highCutoffFreq,
         double sampleRate,
-        WindowType windowType = WindowType::hann) noexcept;
+        WindowType windowType = WindowType::hann,
+        CoeffType windowParameter = CoeffType (8)) noexcept;
 
     /**
         Designs FIR bandstop filter coefficients using windowed sinc method.
@@ -691,12 +697,14 @@ class FilterDesigner
 
         @returns               Vector of FIR coefficients suitable for DirectFIR
     */
-    static std::vector<CoeffType> designFIRBandstop (
+    static void designFIRBandstop (
+        std::vector<CoeffType>& coefficients,
         int numCoefficients,
         CoeffType lowCutoffFreq,
         CoeffType highCutoffFreq,
         double sampleRate,
-        WindowType windowType = WindowType::hann) noexcept;
+        WindowType windowType = WindowType::hann,
+        CoeffType windowParameter = CoeffType (8)) noexcept;
 };
 
 } // namespace yup
diff --git a/modules/yup_dsp/filters/yup_DirectFIR.h b/modules/yup_dsp/filters/yup_DirectFIR.h
index 5a2dea78b..b637714c8 100644
--- a/modules/yup_dsp/filters/yup_DirectFIR.h
+++ b/modules/yup_dsp/filters/yup_DirectFIR.h
@@ -100,29 +100,17 @@ class DirectFIR : public FilterBase<SampleType, CoeffType>
 
     //==============================================================================
     /**
-        Set the FIR filter coefficients.
+        Set the FIR filter coefficients from a span (forwards to the pointer/size overload).
 
-        @param coefficients  Vector containing the FIR coefficients in time order
-        @param scaling       Scaling factor to apply to all coefficients
+        @param coefficients     Span viewing the FIR coefficients to copy
+        @param scaling          Scaling factor to apply to all coefficients
 
         @note This method is not real-time safe and should be called during initialization
-              or when audio processing is paused.
+              or when audio processing is paused, unless the coefficients vector has already been set with a greater or equal size.
     */
-    void setCoefficients (std::vector<CoeffType> coefficients, CoeffType scaling = CoeffType (1))
+    void setCoefficients (yup::Span<const CoeffType> coefficients, CoeffType scaling = CoeffType (1))
     {
-        currentScaling = scaling;
-        if (! approximatelyEqual (currentScaling, CoeffType (1)))
-            FloatVectorOperations::multiply (coefficients.data(), scaling, coefficients.size());
-
-        coefficientsReversed = std::move (coefficients);
-        std::reverse (coefficientsReversed.begin(), coefficientsReversed.end());
-
-        numCoefficients = coefficientsReversed.size();
-        paddedLen = (numCoefficients + 3u) & ~3u; // Round up to multiple of 4 for SIMD
-        coefficientsReversed.resize (paddedLen, 0.0f);
-
-        history.assign (2 * numCoefficients, 0.0f);
-        reset();
+        setCoefficients (coefficients.data(), coefficients.size(), scaling);
     }
 
     /**
@@ -133,7 +121,7 @@ class DirectFIR : public FilterBase<SampleType, CoeffType>
         @param scaling          Scaling factor to apply to all coefficients
 
         @note This method is not real-time safe and should be called during initialization
-              or when audio processing is paused.
+              or when audio processing is paused, unless the coefficients vector has already been set with a greater or equal size.
     */
     void setCoefficients (const CoeffType* coefficients, std::size_t numCoefficientsIn, CoeffType scaling = CoeffType (1))
     {
@@ -144,8 +132,21 @@ class DirectFIR : public FilterBase<SampleType, CoeffType>
             return;
         }
 
-        std::vector<float> coefficientsVector (coefficients, coefficients + numCoefficientsIn);
-        setCoefficients (std::move (coefficientsVector), scaling);
+        numCoefficients = numCoefficientsIn;
+        paddedLen = (numCoefficientsIn + 3u) & ~3u; // Round up to multiple of 4 for SIMD
+
+        coefficientsReversed.assign (paddedLen, 0.0f); // assign, not resize: taps from a previous, longer filter must not survive in the zero padding
+
+        currentScaling = scaling;
+        if (! approximatelyEqual (currentScaling, CoeffType (1)))
+            FloatVectorOperations::copyWithMultiply (coefficientsReversed.data(), coefficients, scaling, static_cast<int> (numCoefficientsIn));
+        else
+            FloatVectorOperations::copy (coefficientsReversed.data(), coefficients, static_cast<int> (numCoefficientsIn));
+
+        std::reverse (coefficientsReversed.begin(), coefficientsReversed.begin() + numCoefficients);
+
+        history.assign (2 * numCoefficients, 0.0f); // clear old samples (as the replaced code did); assign reuses existing capacity
+        writeIndex = 0;
     }
 
     /**
@@ -263,19 +264,29 @@ class DirectFIR : public FilterBase<SampleType, CoeffType>
         if (numCoefficients == 0)
             return Complex<CoeffType> (0, 0);
 
+        // ω = 2π f / Fs
         const CoeffType omega = MathConstants<CoeffType>::twoPi * frequency / static_cast<CoeffType> (this->sampleRate);
 
-        Complex<CoeffType> response (0, 0);
+        // Standard FIR frequency response: H(e^{jω}) = Σ_{n=0}^{N-1} h[n] * e^{-jωn}, with N = numCoefficients
+        // coefficientsReversed stores: [h[N-1], h[N-2], ..., h[1], h[0]]
+        // So coefficientsReversed[k] = h[N-1-k], and we need: Σ h[n] * e^{-jωn}
+
+        // e^{-jω}
+        const Complex<CoeffType> ez_neg { std::cos (omega), -std::sin (omega) };
+
+        // Accumulate: Σ_{n=0}^{N-1} h[n] * e^{-jωn}
+        // Since coefficientsReversed[k] = h[N-1-k], we have h[n] = coefficientsReversed[N-1-n]
+        Complex<CoeffType> sum { 0, 0 };
+        Complex<CoeffType> ez_neg_n { 1, 0 }; // e^{-jω*0} = 1; multiplied by e^{-jω} once per tap in the loop below
 
-        // H(e^jω) = Σ h[n] * e^(-jωn) for n = 0 to N-1
         for (std::size_t n = 0; n < numCoefficients; ++n)
         {
-            const CoeffType angle = -omega * static_cast<CoeffType> (n);
-            Complex<CoeffType> exponential (std::cos (angle), std::sin (angle));
-            response += static_cast<CoeffType> (coefficientsReversed[numCoefficients - 1 - n]) * exponential;
+            const CoeffType h_n = coefficientsReversed[numCoefficients - 1 - n];
+            sum += h_n * ez_neg_n;
+            ez_neg_n *= ez_neg;
         }
 
-        return response;
+        return sum;
     }
 
     /**
diff --git a/modules/yup_dsp/windowing/yup_WindowFunctions.h b/modules/yup_dsp/windowing/yup_WindowFunctions.h
index e3968b8cb..2e9ddadfa 100644
--- a/modules/yup_dsp/windowing/yup_WindowFunctions.h
+++ b/modules/yup_dsp/windowing/yup_WindowFunctions.h
@@ -35,21 +35,22 @@ namespace yup
 */
 enum class WindowType
 {
-    rectangular,    /**< Rectangular (no windowing) */
-    hann,           /**< Hann window (raised cosine) */
-    hamming,        /**< Hamming window */
-    blackman,       /**< Blackman window */
-    blackmanHarris, /**< Blackman-Harris window (4-term) */
-    kaiser,         /**< Kaiser window (parameterizable) */
-    gaussian,       /**< Gaussian window */
-    tukey,          /**< Tukey window (tapered cosine) */
-    bartlett,       /**< Bartlett window (triangular) */
-    welch,          /**< Welch window (parabolic) */
-    flattop,        /**< Flat-top window */
-    cosine,         /**< Cosine window */
-    lanczos,        /**< Lanczos window (sinc) */
-    nuttall,        /**< Nuttall window */
-    blackmanNuttall /**< Blackman-Nuttall window */
+    rectangular,     /**< Rectangular (no windowing) */
+    hann,            /**< Hann window (raised cosine) */
+    hamming,         /**< Hamming window */
+    blackman,        /**< Blackman window */
+    blackmanHarris,  /**< Blackman-Harris window (4-term) */
+    kaiser,          /**< Kaiser window (parameterizable) */
+    gaussian,        /**< Gaussian window */
+    tukey,           /**< Tukey window (tapered cosine) */
+    bartlett,        /**< Bartlett window (triangular) */
+    welch,           /**< Welch window (parabolic) */
+    flattop,         /**< Flat-top window */
+    cosine,          /**< Cosine window */
+    lanczos,         /**< Lanczos window (sinc) */
+    nuttall,         /**< Nuttall window */
+    blackmanNuttall, /**< Blackman-Nuttall window */
+    rakshitUllah     /**< Rakshit-Ullah adjustable window (novel) */
 };
 
 //==============================================================================
@@ -131,6 +132,8 @@ class WindowFunctions
                 return nuttall (n, N);
             case WindowType::blackmanNuttall:
                 return blackmanNuttall (n, N);
+            case WindowType::rakshitUllah:
+                return rakshitUllah (n, N, parameter);
             default:
                 return rectangular (n, N);
         }
@@ -353,6 +356,55 @@ class WindowFunctions
         return a0 - a1 * std::cos (factor) + a2 * std::cos (FloatType (2) * factor) - a3 * std::cos (FloatType (3) * factor);
     }
 
+    /**
+        Rakshit-Ullah adjustable window function.
+
+        Computes the product of a hyperbolic-tangent term (y1) and a weighted-cosine term (y2),
+        then raises it to the power r. Proposed by Hrishi Rakshit and Muhammad Ahsan Ullah (2015).
+
+        @param n Sample index (0 to N-1)
+        @param N Window length; for N <= 1 the function returns 1
+        @param r Controlling parameter (default 1.0). Higher values give better side-lobe roll-off.
+               Common values: 0.0005, 1.18, 1.618, 30, 75
+        @return Window value at sample n
+
+        @note Reference: "FIR Filter Design Using An Adjustable Novel Window and Its Applications"
+              International Journal of Engineering and Technology (IJET), 2015
+    */
+    static FloatType rakshitUllah (int n, int N, FloatType r = FloatType (1)) noexcept
+    {
+        if (N <= 1)
+            return FloatType (1);
+
+        // Constants from the paper
+        constexpr auto alpha = FloatType (2);
+        constexpr auto B = FloatType (2);
+
+        // Hyperbolic tangent component (y1): difference of two tanh curves offset by +/- cosh^2(alpha) around the centre
+        const auto center = (N - 1) / FloatType (2);
+        const auto coshAlpha = std::cosh (alpha);
+        const auto coshAlphaSquared = coshAlpha * coshAlpha;
+
+        const auto arg1 = (n - center + coshAlphaSquared) / B;
+        const auto arg2 = (n - center - coshAlphaSquared) / B;
+
+        const auto y1 = std::tanh (arg1) - std::tanh (arg2);
+
+        // Weighted cosine component (y2); factor = 2*pi*n / (N - 1)
+        const auto factor = MathConstants<FloatType>::twoPi * n / (N - 1);
+        const auto y2 = FloatType (0.375) - FloatType (0.5) * std::cos (factor)
+                      + FloatType (0.125) * std::cos (FloatType (2) * factor);
+
+        // Product of the two components; the exponent r is applied below
+        const auto window = y1 * y2;
+
+        // Apply the controlling parameter r: r == 1 returns the product as-is, otherwise |window|^r with the sign kept
+        if (approximatelyEqual (r, FloatType (1)))
+            return window;
+        else
+            return std::pow (std::abs (window), r) * (window >= FloatType (0) ? FloatType (1) : FloatType (-1));
+    }
+
 private:
     //==============================================================================
     /** Modified Bessel function of the first kind, order 0 */
diff --git a/tests/yup_dsp/yup_DirectFIR.cpp b/tests/yup_dsp/yup_DirectFIR.cpp
index a58a5e3cd..0f1747c20 100644
--- a/tests/yup_dsp/yup_DirectFIR.cpp
+++ b/tests/yup_dsp/yup_DirectFIR.cpp
@@ -295,7 +295,8 @@ TEST_F (DirectFIRTest, AccumulativeOutput)
 TEST_F (DirectFIRTest, Linearity)
 {
     DirectFIR<float, float> fir;
-    auto coefficients = FilterDesigner<float>::designFIRLowpass (32, 1000.0f, 44100.0f);
+    std::vector<float> coefficients;
+    FilterDesigner<float>::designFIRLowpass (coefficients, 32, 1000.0f, 44100.0f);
     fir.setCoefficients (coefficients);
 
     std::vector<float> input (512);
@@ -312,7 +313,7 @@ TEST_F (DirectFIRTest, Linearity)
     fir.processBlock (input.data(), output1.data(), static_cast<int> (input.size()));
 
     fir.reset();
-    fir.processBlock (input2.data(), output2.data(), input2.size());
+    fir.processBlock (input2.data(), output2.data(), static_cast<int> (input2.size()));
 
     // output2 should be approximately 2x output1
     for (size_t i = 0; i < output1.size(); ++i)
@@ -354,7 +355,8 @@ TEST_F (DirectFIRTest, LowpassFiltering)
     DirectFIR<float, float> fir;
 
     // Create lowpass filter coefficients
-    auto coefficients = FilterDesigner<float>::designFIRLowpass (64, 1000.0f, 44100.0);
+    std::vector<float> coefficients;
+    FilterDesigner<float>::designFIRLowpass (coefficients, 64, 1000.0f, 44100.0);
     fir.setCoefficients (coefficients);
 
     const float sampleRate = 44100.0f;
@@ -395,7 +397,8 @@ TEST_F (DirectFIRTest, LowpassFiltering)
 TEST_F (DirectFIRTest, BlockSizeIndependence)
 {
     DirectFIR<float, float> fir;
-    auto coefficients = FilterDesigner<float>::designFIRLowpass (48, 2000.0f, 44100.0);
+    std::vector<float> coefficients;
+    FilterDesigner<float>::designFIRLowpass (coefficients, 48, 2000.0f, 44100.0);
     fir.setCoefficients (coefficients);
 
     const size_t totalSamples = 1024;
@@ -424,7 +427,7 @@ TEST_F (DirectFIRTest, BlockSizeIndependence)
         if (blockSize == 0)
             break;
 
-        fir.processBlock (input.data() + processed, output2.data() + processed, blockSize);
+        fir.processBlock (input.data() + processed, output2.data() + processed, static_cast<int> (blockSize));
         processed += blockSize;
     }
 
@@ -433,7 +436,7 @@ TEST_F (DirectFIRTest, BlockSizeIndependence)
     {
         size_t remaining = totalSamples - processed;
         size_t blockSize = std::min (remaining, size_t (128)); // Process in chunks of 128
-        fir.processBlock (input.data() + processed, output2.data() + processed, blockSize);
+        fir.processBlock (input.data() + processed, output2.data() + processed, static_cast<int> (blockSize));
         processed += blockSize;
     }
 
@@ -571,7 +574,7 @@ TEST_F (DirectFIRTest, StressTest)
         std::vector<float> output (blockSize, 0.0f);
         fillWithRandomData (input);
 
-        EXPECT_NO_THROW (fir.processBlock (input.data(), output.data(), blockSize));
+        EXPECT_NO_THROW (fir.processBlock (input.data(), output.data(), static_cast<int> (blockSize)));
 
         // Verify output quality
         for (float sample : output)
diff --git a/tests/yup_dsp/yup_FilterDesigner.cpp b/tests/yup_dsp/yup_FilterDesigner.cpp
index 120c4ed08..5ddc72b6d 100644
--- a/tests/yup_dsp/yup_FilterDesigner.cpp
+++ b/tests/yup_dsp/yup_FilterDesigner.cpp
@@ -368,7 +368,8 @@ TEST_F (FilterDesignerTests, FloatPrecisionConsistency)
 TEST_F (FilterDesignerTests, FirLowpassBasicProperties)
 {
     const int numCoeffs = 65; // Odd number for symmetric filter
-    auto coeffs = FilterDesigner<float>::designFIRLowpass (numCoeffs, 1000.0f, sampleRate);
+    std::vector<float> coeffs;
+    FilterDesigner<float>::designFIRLowpass (coeffs, numCoeffs, 1000.0f, sampleRate);
 
     // Should return the correct number of coefficients
     EXPECT_EQ (coeffs.size(), numCoeffs);
@@ -393,7 +394,8 @@ TEST_F (FilterDesignerTests, FirLowpassBasicProperties)
 TEST_F (FilterDesignerTests, FirHighpassBasicProperties)
 {
     const int numCoeffs = 65;
-    auto coeffs = FilterDesigner<float>::designFIRHighpass (numCoeffs, 1000.0f, sampleRate);
+    std::vector<float> coeffs;
+    FilterDesigner<float>::designFIRHighpass (coeffs, numCoeffs, 1000.0f, sampleRate);
 
     // Should return the correct number of coefficients
     EXPECT_EQ (coeffs.size(), numCoeffs);
@@ -419,7 +421,8 @@ TEST_F (FilterDesignerTests, FirHighpassBasicProperties)
 TEST_F (FilterDesignerTests, FirBandpassBasicProperties)
 {
     const int numCoeffs = 65;
-    auto coeffs = FilterDesigner<float>::designFIRBandpass (numCoeffs, 800.0f, 1200.0f, sampleRate);
+    std::vector<float> coeffs;
+    FilterDesigner<float>::designFIRBandpass (coeffs, numCoeffs, 800.0f, 1200.0f, sampleRate);
 
     // Should return the correct number of coefficients
     EXPECT_EQ (coeffs.size(), numCoeffs);
@@ -445,7 +448,8 @@ TEST_F (FilterDesignerTests, FirBandpassBasicProperties)
 TEST_F (FilterDesignerTests, FirBandstopBasicProperties)
 {
     const int numCoeffs = 65;
-    auto coeffs = FilterDesigner<float>::designFIRBandstop (numCoeffs, 800.0f, 1200.0f, sampleRate);
+    std::vector<float> coeffs;
+    FilterDesigner<float>::designFIRBandstop (coeffs, numCoeffs, 800.0f, 1200.0f, sampleRate);
 
     // Should return the correct number of coefficients
     EXPECT_EQ (coeffs.size(), numCoeffs);
@@ -473,9 +477,11 @@ TEST_F (FilterDesignerTests, FirDifferentWindowTypes)
     const int numCoeffs = 33;
 
     // Test different window types
-    auto hannCoeffs = FilterDesigner<float>::designFIRLowpass (numCoeffs, 1000.0f, sampleRate, WindowType::hann);
-    auto hammingCoeffs = FilterDesigner<float>::designFIRLowpass (numCoeffs, 1000.0f, sampleRate, WindowType::hamming);
-    auto blackmanCoeffs = FilterDesigner<float>::designFIRLowpass (numCoeffs, 1000.0f, sampleRate, WindowType::blackman);
+    std::vector<float> hannCoeffs, hammingCoeffs, blackmanCoeffs;
+
+    FilterDesigner<float>::designFIRLowpass (hannCoeffs, numCoeffs, 1000.0f, sampleRate, WindowType::hann);
+    FilterDesigner<float>::designFIRLowpass (hammingCoeffs, numCoeffs, 1000.0f, sampleRate, WindowType::hamming);
+    FilterDesigner<float>::designFIRLowpass (blackmanCoeffs, numCoeffs, 1000.0f, sampleRate, WindowType::blackman);
 
     // All should have same size
     EXPECT_EQ (hannCoeffs.size(), numCoeffs);
@@ -507,8 +513,11 @@ TEST_F (FilterDesignerTests, FirFloatDoubleConsistency)
 {
     const int numCoeffs = 33;
 
-    auto doubleCoeffs = FilterDesigner<double>::designFIRLowpass (numCoeffs, 1000.0, sampleRate);
-    auto floatCoeffs = FilterDesigner<float>::designFIRLowpass (numCoeffs, 1000.0f, sampleRate);
+    std::vector<double> doubleCoeffs;
+    FilterDesigner<double>::designFIRLowpass (doubleCoeffs, numCoeffs, 1000.0, sampleRate);
+
+    std::vector<float> floatCoeffs;
+    FilterDesigner<float>::designFIRLowpass (floatCoeffs, numCoeffs, 1000.0f, sampleRate);
 
     EXPECT_EQ (doubleCoeffs.size(), floatCoeffs.size());
 
@@ -519,19 +528,21 @@ TEST_F (FilterDesignerTests, FirFloatDoubleConsistency)
 
 TEST_F (FilterDesignerTests, DISABLED_ExportFIRCoefficientsForAnalysis)
 {
-    const int numCoeffs = 129;
+    const int numCoeffs = 97;
     const float sampleRateF = 44100.0f;
 
     // Design different FIR filters
-    auto lowpass = FilterDesigner<float>::designFIRLowpass (numCoeffs, 10000.0f, sampleRateF);
-    auto highpass = FilterDesigner<float>::designFIRHighpass (numCoeffs, 10000.0f, sampleRateF);
-    auto bandpass = FilterDesigner<float>::designFIRBandpass (numCoeffs, 8000.0f, 12000.0f, sampleRateF);
-    auto bandstop = FilterDesigner<float>::designFIRBandstop (numCoeffs, 8000.0f, 12000.0f, sampleRateF);
+    std::vector<float> lowpass, highpass, bandpass, bandstop;
+    FilterDesigner<float>::designFIRLowpass (lowpass, numCoeffs, 10000.0f, sampleRateF);
+    FilterDesigner<float>::designFIRHighpass (highpass, numCoeffs, 10000.0f, sampleRateF);
+    FilterDesigner<float>::designFIRBandpass (bandpass, numCoeffs, 8000.0f, 12000.0f, sampleRateF);
+    FilterDesigner<float>::designFIRBandstop (bandstop, numCoeffs, 8000.0f, 12000.0f, sampleRateF);
 
     // Different windows for lowpass
-    auto lowpassHann = FilterDesigner<float>::designFIRLowpass (numCoeffs, 10000.0f, sampleRateF, WindowType::hann);
-    auto lowpassHamming = FilterDesigner<float>::designFIRLowpass (numCoeffs, 10000.0f, sampleRateF, WindowType::hamming);
-    auto lowpassBlackman = FilterDesigner<float>::designFIRLowpass (numCoeffs, 10000.0f, sampleRateF, WindowType::blackman);
+    std::vector<float> lowpassHann, lowpassHamming, lowpassBlackman;
+    FilterDesigner<float>::designFIRLowpass (lowpassHann, numCoeffs, 10000.0f, sampleRateF, WindowType::hann);
+    FilterDesigner<float>::designFIRLowpass (lowpassHamming, numCoeffs, 10000.0f, sampleRateF, WindowType::hamming);
+    FilterDesigner<float>::designFIRLowpass (lowpassBlackman, numCoeffs, 10000.0f, sampleRateF, WindowType::blackman);
 
     // Helper lambda to write coefficients to file
     auto writeCoeffs = [] (const std::vector<float>& coeffs, const std::string& filename)