From e0fd6e85225588f437194dd5a975fa794beb1618 Mon Sep 17 00:00:00 2001
From: Conrad Delgado <conraddelgado@Conrads-MacBook-Air-6.local>
Date: Mon, 19 May 2025 21:41:00 -0700
Subject: [PATCH 01/30] periodic ibs,forcing,fft,drag,

---
 runs/3d_1sphere_periodic/case.py             |  154 +++
 runs/3d_drag_test/case.py                    |  145 +++
 runs/3d_periodic_ibs_test/case.py            |  155 +++
 src/common/m_boundary_common.fpp             |  149 ++-
 src/common/m_checker_common.fpp              |    5 +
 src/common/m_constants.fpp                   |    2 +-
 src/common/m_mpi_common.fpp                  |  263 ++++-
 src/post_process/m_global_parameters.fpp     |    8 +
 src/post_process/m_mpi_proxy.fpp             |   16 +-
 src/post_process/m_start_up.f90              |    3 +-
 src/pre_process/m_compute_levelset.fpp       |  103 ++
 src/pre_process/m_data_output.fpp            |   84 +-
 src/pre_process/m_global_parameters.fpp      |    8 +
 src/pre_process/m_initial_condition.fpp      |    6 +-
 src/pre_process/m_mpi_proxy.fpp              |   16 +-
 src/pre_process/m_patches.fpp                |   70 +-
 src/pre_process/m_start_up.fpp               |    3 +-
 src/simulation/m_additional_forcing.fpp      |  158 +++
 src/simulation/m_checker.fpp                 |    8 +
 src/simulation/m_compute_particle_forces.fpp |   72 ++
 src/simulation/m_global_parameters.fpp       |   24 +
 src/simulation/m_ibm.fpp                     |  144 ++-
 src/simulation/m_mpi_proxy.fpp               |   20 +-
 src/simulation/m_rhs.fpp                     |  123 +-
 src/simulation/m_start_up.fpp                |  188 ++--
 src/simulation/m_time_steppers.fpp           |   61 +-
 src/simulation/m_volume_filtering.fpp        | 1049 ++++++++++++++++++
 src/simulation/p_main.fpp                    |    6 +
 toolchain/mfc/run/case_dicts.py              |   12 +-
 voronoi/gen_voronoi_2D.py                    |   99 ++
 voronoi/gen_voronoi_3D.py                    |   98 ++
 31 files changed, 3098 insertions(+), 154 deletions(-)
 create mode 100644 runs/3d_1sphere_periodic/case.py
 create mode 100644 runs/3d_drag_test/case.py
 create mode 100644 runs/3d_periodic_ibs_test/case.py
 create mode 100644 src/simulation/m_additional_forcing.fpp
 create mode 100644 src/simulation/m_compute_particle_forces.fpp
 create mode 100644 src/simulation/m_volume_filtering.fpp
 create mode 100644 voronoi/gen_voronoi_2D.py
 create mode 100644 voronoi/gen_voronoi_3D.py

diff --git a/runs/3d_1sphere_periodic/case.py b/runs/3d_1sphere_periodic/case.py
new file mode 100644
index 0000000000..857841ad0c
--- /dev/null
+++ b/runs/3d_1sphere_periodic/case.py
@@ -0,0 +1,154 @@
+import json
+import math
+import numpy as np
+
+Mu = 1.84e-05
+gam_a = 1.4
+R = 287.0
+
+D = 0.1
+
+P = 101325 # Pa
+rho = 1.225 # kg/m^3
+
+T = P/(rho*R)
+
+M = 1.2
+Re = 1500.0
+v1 = M*(gam_a*P/rho)**(1.0/2.0)
+
+mu = rho*v1*D/Re # dynamic viscosity for current case
+
+#print('mu: ', mu)
+#print('v1: ', v1)
+#print('rho: ', rho)
+#print('Kn = ' + str( np.sqrt(np.pi*gam_a/2)*(M/Re) )) # Kn < 0.01 = continuum flow
+
+dt = 4.0E-06
+Nt = 100
+t_save = 10
+
+Nx = 63
+Ny = 63
+Nz = 63
+
+# immersed boundary dictionary
+ib_dict = {}
+ib_dict.update({
+    f"patch_ib({1})%geometry": 8,
+    f"patch_ib({1})%x_centroid": 0.0,
+    f"patch_ib({1})%y_centroid": 0.0,
+    f"patch_ib({1})%z_centroid": 0.0,
+    f"patch_ib({1})%radius": D / 2,
+    f"patch_ib({1})%slip": "F",
+    })
+
+# Configuring case dictionary
+case_dict = {
+    # Logistics
+    "run_time_info": "T",
+    # Computational Domain Parameters
+    # x direction
+    "x_domain%beg": -5.0 * D,
+    "x_domain%end": 5.0 * D,
+    # y direction
+    "y_domain%beg": -5.0 * D,
+    "y_domain%end": 5.0 * D,
+    # z direction
+    "z_domain%beg": -5.0 * D,
+    "z_domain%end": 5.0 * D,
+    "cyl_coord": "F",
+    "m": Nx,
+    "n": Ny,
+    "p": Nz,
+    "dt": dt,
+    "t_step_start": 0,
+    "t_step_stop": Nt,  # 3000
+    "t_step_save": t_save,  # 10
+    # Simulation Algorithm Parameters
+    # Only one patch is necessary: the air tube
+    "num_patches": 1,
+    # Use the 5 equation model
+    "model_eqns": 2,
+    # 6 equations model does not need the K \div(u) term
+    "alt_soundspeed": "F",
+    # One fluid: air
+    "num_fluids": 1,
+    # time step
+    "mpp_lim": "F",
+    # Correct errors when computing speed of sound
+    "mixture_err": "T",
+    # Use TVD RK3 for time marching
+    "time_stepper": 3,
+    # Reconstruct the primitive variables to minimize spurious oscillations
+    # Use WENO5
+    "weno_order": 5,
+    "weno_eps": 1.0e-14,
+    "weno_Re_flux": "T",
+    "weno_avg": "T",
+    "avg_state": 2,
+    "mapped_weno": "T",
+    "null_weights": "F",
+    "mp_weno": "T",
+    "riemann_solver": 2,
+    "low_Mach": 1,
+    "wave_speeds": 1,
+    # periodic bc
+    "bc_x%beg": -1,
+    "bc_x%end": -1,
+    "bc_y%beg": -1,
+    "bc_y%end": -1,
+    "bc_z%beg": -1,
+    "bc_z%end": -1,
+    # Set IB to True and add 1 patch
+    "ib": "T",
+    "num_ibs": 1,
+    "viscous": "T",
+    # Formatted Database Files Structure Parameters
+    "format": 1,
+    "precision": 2,
+    "prim_vars_wrt": "T",
+    "E_wrt": "T",
+    #"q_filtered_wrt": "T",
+    "parallel_io": "T",
+    
+    "patch_icpp(1)%geometry": 9,
+    "patch_icpp(1)%x_centroid": 0.0,
+    # Uniform medium density, centroid is at the center of the domain
+    "patch_icpp(1)%y_centroid": 0.0,
+    "patch_icpp(1)%z_centroid": 0.0,
+    "patch_icpp(1)%length_x": 10 * D,
+    "patch_icpp(1)%length_y": 10 * D,
+    "patch_icpp(1)%length_z": 10 * D,
+    # Specify the patch primitive variables
+    "patch_icpp(1)%vel(1)": v1,
+    "patch_icpp(1)%vel(2)": 0.0e00,
+    "patch_icpp(1)%vel(3)": 0.0e00,
+    "patch_icpp(1)%pres": P,
+    "patch_icpp(1)%alpha_rho(1)": rho,
+    "patch_icpp(1)%alpha(1)": 1.0e00,
+    # Patch: Sphere Immersed Boundary
+    # Fluids Physical Parameters
+    "fluid_pp(1)%gamma": 1.0e00 / (gam_a - 1.0e00),  # 2.50(Not 1.40)
+    "fluid_pp(1)%pi_inf": 0,
+    "fluid_pp(1)%Re(1)": Re,
+
+    # new case additions
+    "periodic_forcing": "T",
+    "periodic_ibs": "T",
+    #"compute_CD_vi": "F",
+    #"compute_CD_si": "F",
+    #"fourier_transform_filtering": "T",
+
+    "u_inf_ref": v1,
+    "rho_inf_ref": rho,
+    "T_inf_ref": T,
+    "mu_visc": mu,
+
+    "store_levelset": "F",
+    "slab_domain_decomposition": "T", 
+    }
+
+case_dict.update(ib_dict)
+
+print(json.dumps(case_dict))
diff --git a/runs/3d_drag_test/case.py b/runs/3d_drag_test/case.py
new file mode 100644
index 0000000000..2eb50ebc62
--- /dev/null
+++ b/runs/3d_drag_test/case.py
@@ -0,0 +1,145 @@
+import json
+import math
+import numpy as np
+
+Mu = 1.84e-05
+gam_a = 1.4
+R = 287.0
+
+D = 0.1
+
+P = 101325 # Pa
+rho = 1.225 # kg/m^3
+
+T = P/(rho*R)
+
+M = 1.2
+Re = 1500.0
+v1 = M*(gam_a*P/rho)**(1.0/2.0)
+
+mu = rho*v1*D/Re # dynamic viscosity for current case
+
+#print('mu: ', mu)
+#print('v1: ', v1)
+#print('rho: ', rho)
+#print('Kn = ' + str( np.sqrt(np.pi*gam_a/2)*(M/Re) )) # Kn < 0.01 = continuum flow
+
+dt = 4.0E-06
+Nt = 100
+t_save = 1
+
+Nx = 99
+Ny = 99
+Nz = 99
+
+# immersed boundary dictionary
+ib_dict = {}
+ib_dict.update({
+    f"patch_ib({1})%geometry": 8,
+    f"patch_ib({1})%x_centroid": 0.0,
+    f"patch_ib({1})%y_centroid": 0.0,
+    f"patch_ib({1})%z_centroid": 0.0,
+    f"patch_ib({1})%radius": D / 2,
+    f"patch_ib({1})%slip": "F",
+    })
+
+# Configuring case dictionary
+case_dict = {
+    # Logistics
+    "run_time_info": "T",
+    # Computational Domain Parameters
+    # x direction
+    "x_domain%beg": -5.0 * D,
+    "x_domain%end": 5.0 * D,
+    # y direction
+    "y_domain%beg": -5.0 * D,
+    "y_domain%end": 5.0 * D,
+    # z direction
+    "z_domain%beg": -5.0 * D,
+    "z_domain%end": 5.0 * D,
+    "cyl_coord": "F",
+    "m": Nx,
+    "n": Ny,
+    "p": Nz,
+    "dt": dt,
+    "t_step_start": 0,
+    "t_step_stop": Nt,  # 3000
+    "t_step_save": t_save,  # 10
+    # Simulation Algorithm Parameters
+    # Only one patch is necessary: the air tube
+    "num_patches": 1,
+    # Use the 5 equation model
+    "model_eqns": 2,
+    # 6 equations model does not need the K \div(u) term
+    "alt_soundspeed": "F",
+    # One fluids: air
+    "num_fluids": 1,
+    # time step
+    "mpp_lim": "F",
+    # Correct errors when computing speed of sound
+    "mixture_err": "T",
+    # Use TVD RK3 for time marching
+    "time_stepper": 3,
+    # Reconstruct the primitive variables to minimize spurious oscillations
+    # Use WENO5
+    "weno_order": 5,
+    "weno_eps": 1.0e-14,
+    "weno_Re_flux": "T",
+    "weno_avg": "T",
+    "avg_state": 2,
+    "mapped_weno": "T",
+    "null_weights": "F",
+    "mp_weno": "T",
+    "riemann_solver": 2,
+    "low_Mach": 1,
+    "wave_speeds": 1,
+    # ghost cell extrapolation
+    "bc_x%beg": -3,
+    "bc_x%end": -3,
+    "bc_y%beg": -3,
+    "bc_y%end": -3,
+    "bc_z%beg": -3,
+    "bc_z%end": -3,
+    # Set IB to True and add 1 patch
+    "ib": "T",
+    "num_ibs": 1,
+    "viscous": "T",
+    # Formatted Database Files Structure Parameters
+    "format": 1,
+    "precision": 2,
+    "prim_vars_wrt": "T",
+    "E_wrt": "T",
+    "parallel_io": "T",
+    
+    "patch_icpp(1)%geometry": 9,
+    "patch_icpp(1)%x_centroid": 0.0,
+    # Uniform medium density, centroid is at the center of the domain
+    "patch_icpp(1)%y_centroid": 0.0,
+    "patch_icpp(1)%z_centroid": 0.0,
+    "patch_icpp(1)%length_x": 10 * D,
+    "patch_icpp(1)%length_y": 10 * D,
+    "patch_icpp(1)%length_z": 10 * D,
+    # Specify the patch primitive variables
+    "patch_icpp(1)%vel(1)": v1,
+    "patch_icpp(1)%vel(2)": 0.0e00,
+    "patch_icpp(1)%vel(3)": 0.0e00,
+    "patch_icpp(1)%pres": P,
+    "patch_icpp(1)%alpha_rho(1)": rho,
+    "patch_icpp(1)%alpha(1)": 1.0e00,
+    # Patch: Sphere Immersed Boundary
+    # Fluids Physical Parameters
+    "fluid_pp(1)%gamma": 1.0e00 / (gam_a - 1.0e00),  # 2.50(Not 1.40)
+    "fluid_pp(1)%pi_inf": 0,
+    "fluid_pp(1)%Re(1)": Re,
+
+    # new case additions
+    "compute_CD": "T",
+    "mu_visc": mu,
+    "u_inf_ref": v1,
+    "rho_inf_ref": rho,
+    "T_inf_ref": T,
+    }
+
+case_dict.update(ib_dict)
+
+print(json.dumps(case_dict))
diff --git a/runs/3d_periodic_ibs_test/case.py b/runs/3d_periodic_ibs_test/case.py
new file mode 100644
index 0000000000..9a63a3f4a3
--- /dev/null
+++ b/runs/3d_periodic_ibs_test/case.py
@@ -0,0 +1,155 @@
+import json
+import math
+import numpy as np
+
+Mu = 1.84e-05
+gam_a = 1.4
+R = 287.0
+
+D = 0.1
+
+P = 101325 # Pa
+rho = 1.225 # kg/m^3
+
+T = P/(rho*R)
+
+M = 1.2
+Re = 1500.0
+v1 = M*(gam_a*P/rho)**(1.0/2.0)
+
+mu = rho*v1*D/Re # dynamic viscosity for current case
+
+#print('mu: ', mu)
+#print('v1: ', v1)
+#print('rho: ', rho)
+#print('Kn = ' + str( np.sqrt(np.pi*gam_a/2)*(M/Re) )) # Kn < 0.01 = continuum flow
+
+dt = 4.0E-06
+Nt = 5
+t_save = 1
+
+Nx = 63
+Ny = 63
+Nz = 63
+
+# immersed boundary dictionary
+ib_dict = {}
+ib_dict.update({
+    f"patch_ib({1})%geometry": 8,
+    f"patch_ib({1})%x_centroid": 0.5,
+    f"patch_ib({1})%y_centroid": 0.5,
+    f"patch_ib({1})%z_centroid": 0.5,
+    f"patch_ib({1})%radius": D / 2,
+    f"patch_ib({1})%slip": "F",
+
+    f"patch_ib({2})%geometry": 8,
+    f"patch_ib({2})%x_centroid": 0.0,
+    f"patch_ib({2})%y_centroid": 0.0,
+    f"patch_ib({2})%z_centroid": 0.0,
+    f"patch_ib({2})%radius": D / 2,
+    f"patch_ib({2})%slip": "F",
+
+    f"patch_ib({3})%geometry": 8,
+    f"patch_ib({3})%x_centroid": 0.0,
+    f"patch_ib({3})%y_centroid": 0.5,
+    f"patch_ib({3})%z_centroid": 0.25,
+    f"patch_ib({3})%radius": D / 2,
+    f"patch_ib({3})%slip": "F",
+    })
+
+# Configuring case dictionary
+case_dict = {
+    # Logistics
+    "run_time_info": "T",
+    # Computational Domain Parameters
+    # x direction
+    "x_domain%beg": -5.0 * D,
+    "x_domain%end": 5.0 * D,
+    # y direction
+    "y_domain%beg": -5.0 * D,
+    "y_domain%end": 5.0 * D,
+    # z direction
+    "z_domain%beg": -5.0 * D,
+    "z_domain%end": 5.0 * D,
+    "cyl_coord": "F",
+    "m": Nx,
+    "n": Ny,
+    "p": Nz,
+    "dt": dt,
+    "t_step_start": 0,
+    "t_step_stop": Nt,  # 3000
+    "t_step_save": t_save,  # 10
+    # Simulation Algorithm Parameters
+    # Only one patch is necessary: the air tube
+    "num_patches": 1,
+    # Use the 5 equation model
+    "model_eqns": 2,
+    # 6 equations model does not need the K \div(u) term
+    "alt_soundspeed": "F",
+    # One fluids: air
+    "num_fluids": 1,
+    # time step
+    "mpp_lim": "F",
+    # Correct errors when computing speed of sound
+    "mixture_err": "T",
+    # Use TVD RK3 for time marching
+    "time_stepper": 3,
+    # Reconstruct the primitive variables to minimize spurious oscillations
+    # Use WENO5
+    "weno_order": 5,
+    "weno_eps": 1.0e-14,
+    "weno_Re_flux": "T",
+    "weno_avg": "T",
+    "avg_state": 2,
+    "mapped_weno": "T",
+    "null_weights": "F",
+    "mp_weno": "T",
+    "riemann_solver": 2,
+    "low_Mach": 1,
+    "wave_speeds": 1,
+    # periodic bc
+    "bc_x%beg": -1,
+    "bc_x%end": -1,
+    "bc_y%beg": -1,
+    "bc_y%end": -1,
+    "bc_z%beg": -1,
+    "bc_z%end": -1,
+    # Set IB to True and add 3 patches
+    "ib": "T",
+    "num_ibs": 3,
+    "viscous": "T",
+    # Formatted Database Files Structure Parameters
+    "format": 1,
+    "precision": 2,
+    "prim_vars_wrt": "T",
+    "E_wrt": "T",
+    "parallel_io": "T",
+    
+    "patch_icpp(1)%geometry": 9,
+    "patch_icpp(1)%x_centroid": 0.0,
+    # Uniform medium density, centroid is at the center of the domain
+    "patch_icpp(1)%y_centroid": 0.0,
+    "patch_icpp(1)%z_centroid": 0.0,
+    "patch_icpp(1)%length_x": 10 * D,
+    "patch_icpp(1)%length_y": 10 * D,
+    "patch_icpp(1)%length_z": 10 * D,
+    # Specify the patch primitive variables
+    "patch_icpp(1)%vel(1)": v1,
+    "patch_icpp(1)%vel(2)": 0.0e00,
+    "patch_icpp(1)%vel(3)": 0.0e00,
+    "patch_icpp(1)%pres": P,
+    "patch_icpp(1)%alpha_rho(1)": rho,
+    "patch_icpp(1)%alpha(1)": 1.0e00,
+    # Patch: Sphere Immersed Boundary
+    # Fluids Physical Parameters
+    "fluid_pp(1)%gamma": 1.0e00 / (gam_a - 1.0e00),  # 2.50(Not 1.40)
+    "fluid_pp(1)%pi_inf": 0,
+    "fluid_pp(1)%Re(1)": Re,
+
+    # new case additions
+    "periodic_ibs": "T",
+    }
+
+case_dict.update(ib_dict)
+
+print(json.dumps(case_dict))
diff --git a/src/common/m_boundary_common.fpp b/src/common/m_boundary_common.fpp
index 55ac2fec82..eb07e7d9eb 100644
--- a/src/common/m_boundary_common.fpp
+++ b/src/common/m_boundary_common.fpp
@@ -32,7 +32,8 @@ module m_boundary_common
  s_populate_variables_buffers, &
  s_create_mpi_types, &
  s_populate_capillary_buffers, &
- s_finalize_boundary_common_module
+ s_finalize_boundary_common_module, & 
+ s_populate_scalarfield_buffers
 
     public :: bc_buffers, bcxb, bcxe, bcyb, bcye, bczb, bcze
 
@@ -238,6 +239,152 @@ contains
 
     end subroutine s_populate_variables_buffers
 
+    !> Populates the buffers of a single scalar field in the x-, y- and z-directions. Used in unclosed term calculation
+    subroutine s_populate_scalarfield_buffers(q_temp)
+
+        type(scalar_field), intent(inout) :: q_temp !< Scalar field whose buffers are populated
+
+        ! Only periodic BCs (-1) are applied locally; any other BC value falls through to the MPI send/recv call
+
+        ! X-dir
+        select case (bc_x%beg)
+        case (-1)     ! Periodic BC at beginning
+            call s_periodic_scalarfield(q_temp, 1, -1)
+        case default ! Processor BC at beginning
+            call s_mpi_sendrecv_variables_buffers_scalarfield( &
+                q_temp, 1, -1)
+        end select
+
+        select case (bc_x%end)
+        case (-1)     ! Periodic BC at end
+            call s_periodic_scalarfield(q_temp, 1, 1)
+        case default ! Processor BC at end
+            call s_mpi_sendrecv_variables_buffers_scalarfield( &
+                q_temp, 1, 1)
+        end select
+
+        ! Y-dir
+        select case (bc_y%beg)
+        case (-1)     ! Periodic BC at beginning
+            call s_periodic_scalarfield(q_temp, 2, -1)
+        case default ! Processor BC at beginning
+            call s_mpi_sendrecv_variables_buffers_scalarfield( &
+                q_temp, 2, -1)
+        end select
+
+        select case (bc_y%end)
+        case (-1)     ! Periodic BC at end
+            call s_periodic_scalarfield(q_temp, 2, 1)
+        case default ! Processor BC at end
+            call s_mpi_sendrecv_variables_buffers_scalarfield( &
+                q_temp, 2, 1)
+        end select
+
+        ! Z-dir
+        select case (bc_z%beg)
+        case (-1)     ! Periodic BC at beginning
+            call s_periodic_scalarfield(q_temp, 3, -1)
+        case default ! Processor BC at beginning
+            call s_mpi_sendrecv_variables_buffers_scalarfield( &
+                q_temp, 3, -1)
+        end select
+
+        select case (bc_z%end)
+        case (-1)     ! Periodic BC at end
+            call s_periodic_scalarfield(q_temp, 3, 1)
+        case default ! Processor BC at end
+            call s_mpi_sendrecv_variables_buffers_scalarfield( &
+                q_temp, 3, 1)
+        end select
+    
+    end subroutine s_populate_scalarfield_buffers
+
+    subroutine s_periodic_scalarfield(q_temp, bc_dir, bc_loc)
+        ! Fills buff_size layers of cells outside one face of q_temp with the cells from the opposite side
+        type(scalar_field), intent(inout) :: q_temp
+        integer, intent(in) :: bc_dir, bc_loc !< Direction (1 = x, 2 = y, 3 = z) and side (-1 = beginning, else end)
+
+        integer :: j, k, l, q, i
+
+        ! x-direction: only the interior y/z range (0:n, 0:p) is traversed
+        if (bc_dir == 1) then
+            if (bc_loc == -1) then !< bc_x%beg: cells -buff_size:-1 receive cells m-buff_size+1:m
+                !$acc parallel loop collapse(3) gang vector default(present)
+                do l = 0, p
+                    do k = 0, n
+                        do j = 1, buff_size
+                            q_temp%sf(-j, k, l) = &
+                                q_temp%sf(m - (j - 1), k, l)
+                        end do
+                    end do
+                end do
+
+            else !< bc_x%end: cells m+1:m+buff_size receive cells 0:buff_size-1
+                !$acc parallel loop collapse(3) gang vector default(present)
+                do l = 0, p
+                    do k = 0, n
+                        do j = 1, buff_size
+                            q_temp%sf(m + j, k, l) = &
+                                q_temp%sf(j - 1, k, l)
+                        end do
+                    end do
+                end do
+            end if
+
+        ! y-direction: the x range includes the x buffers (-buff_size:m+buff_size)
+        elseif (bc_dir == 2) then
+            if (bc_loc == -1) then !< bc_y%beg: cells -buff_size:-1 receive cells n-buff_size+1:n
+                !$acc parallel loop collapse(3) gang vector default(present)
+                do k = 0, p
+                    do j = 1, buff_size
+                        do l = -buff_size, m + buff_size
+                            q_temp%sf(l, -j, k) = &
+                                q_temp%sf(l, n - (j - 1), k)
+                        end do
+                    end do
+                end do
+
+            else !< bc_y%end: cells n+1:n+buff_size receive cells 0:buff_size-1
+                !$acc parallel loop collapse(3) gang vector default(present)
+                do k = 0, p
+                    do j = 1, buff_size
+                        do l = -buff_size, m + buff_size
+                            q_temp%sf(l, n + j, k) = &
+                                q_temp%sf(l, j - 1, k)
+                        end do
+                    end do
+                end do
+            end if
+
+        ! z-direction: the x and y ranges include their buffers
+        elseif (bc_dir == 3) then
+            if (bc_loc == -1) then !< bc_z%beg: cells -buff_size:-1 receive cells p-buff_size+1:p
+                !$acc parallel loop collapse(3) gang vector default(present)
+                do j = 1, buff_size
+                    do l = -buff_size, n + buff_size
+                        do k = -buff_size, m + buff_size
+                            q_temp%sf(k, l, -j) = &
+                                q_temp%sf(k, l, p - (j - 1))
+                        end do
+                    end do
+                end do
+
+            else !< bc_z%end: cells p+1:p+buff_size receive cells 0:buff_size-1
+                !$acc parallel loop collapse(3) gang vector default(present)
+                do j = 1, buff_size
+                    do l = -buff_size, n + buff_size
+                        do k = -buff_size, m + buff_size
+                            q_temp%sf(k, l, p + j) = &
+                                q_temp%sf(k, l, j - 1)
+                        end do
+                    end do
+                end do
+            end if
+
+        end if
+
+    end subroutine s_periodic_scalarfield
+
     subroutine s_ghost_cell_extrapolation(q_prim_vf, pb, mv, bc_dir, bc_loc, k, l)
 #ifdef _CRAYFTN
         !DIR$ INLINEALWAYS s_ghost_cell_extrapolation
diff --git a/src/common/m_checker_common.fpp b/src/common/m_checker_common.fpp
index 7abadf29be..2f6a505001 100644
--- a/src/common/m_checker_common.fpp
+++ b/src/common/m_checker_common.fpp
@@ -174,6 +174,11 @@ contains
         @:PROHIBIT(ib .and. n <= 0, "Immersed Boundaries do not work in 1D")
         @:PROHIBIT(ib .and. (num_ibs <= 0 .or. num_ibs > num_patches_max), "num_ibs must be between 1 and num_patches_max")
         @:PROHIBIT((.not. ib) .and. num_ibs > 0, "num_ibs is set, but ib is not enabled")
+        #:for X in ['x', 'y', 'z']
+            #:for BOUND in ['beg', 'end']
+                @:PROHIBIT(periodic_ibs .and. bc_${X}$%${BOUND}$ /= BC_PERIODIC, "periodic ibs requires periodic BCs, bc_${X}$%${BOUND}$ must = -1")   
+            #:endfor
+        #:endfor
     end subroutine s_check_inputs_ibm
 
 #endif
diff --git a/src/common/m_constants.fpp b/src/common/m_constants.fpp
index 1d09d98fcf..ac2a614860 100644
--- a/src/common/m_constants.fpp
+++ b/src/common/m_constants.fpp
@@ -21,7 +21,7 @@ module m_constants
     integer, parameter :: fourier_rings = 5                       !< Fourier filter ring limit
     integer, parameter :: num_fluids_max = 10                     !< Maximum number of fluids in the simulation
     integer, parameter :: num_probes_max = 10                     !< Maximum number of flow probes in the simulation
-    integer, parameter :: num_patches_max = 10
+    integer, parameter :: num_patches_max = 1000
     integer, parameter :: num_bc_patches_max = 10
     integer, parameter :: pathlen_max = 400
     integer, parameter :: nnode = 4    !< Number of QBMM nodes
diff --git a/src/common/m_mpi_common.fpp b/src/common/m_mpi_common.fpp
index 4645e59c13..25cd6fda5d 100644
--- a/src/common/m_mpi_common.fpp
+++ b/src/common/m_mpi_common.fpp
@@ -43,6 +43,14 @@ module m_mpi_common
     integer :: halo_size, nVars
     !$acc declare create(halo_size, nVars)
 
+    real(wp), private, allocatable, dimension(:), target :: buff_send_scalarfield
+    !! This variable is utilized to pack and send the buffer of any scalar field to neighboring processors
+
+    real(wp), private, allocatable, dimension(:), target :: buff_recv_scalarfield
+    !! This variable is utilized to receive and unpack the buffer of any scalar field from neighboring processors
+
+    !$acc declare create(buff_send_scalarfield, buff_recv_scalarfield)
+
 contains
 
     !> The computation of parameters, the allocation of memory,
@@ -91,6 +99,18 @@ contains
         allocate (buff_send(0:halo_size))
 
         allocate (buff_recv(0:ubound(buff_send, 1)))
+
+#ifdef MFC_SIMULATION
+        if (fourier_transform_filtering) then
+            @:ALLOCATE(buff_send_scalarfield(0:-1 + buff_size*1* &
+                                     & (m + 2*buff_size + 1)* &
+                                     & (n + 2*buff_size + 1)* &
+                                     & (p + 2*buff_size + 1)/ &
+                                     & (min(m, n, p) + 2*buff_size + 1)))
+
+            @:ALLOCATE(buff_recv_scalarfield(0:ubound(buff_send_scalarfield, 1)))
+        end if  
+#endif
 #endif
 
     end subroutine s_initialize_mpi_common_module
@@ -232,14 +252,18 @@ contains
 
 #ifdef MFC_PRE_PROCESS
             MPI_IO_IB_DATA%var%sf => ib_markers%sf
-            MPI_IO_levelset_DATA%var%sf => levelset%sf
-            MPI_IO_levelsetnorm_DATA%var%sf => levelset_norm%sf
+            if (store_levelset) then 
+                MPI_IO_levelset_DATA%var%sf => levelset%sf
+                MPI_IO_levelsetnorm_DATA%var%sf => levelset_norm%sf
+            end if
 #else
             MPI_IO_IB_DATA%var%sf => ib_markers%sf(0:m, 0:n, 0:p)
 
 #ifndef MFC_POST_PROCESS
-            MPI_IO_levelset_DATA%var%sf => levelset%sf(0:m, 0:n, 0:p, 1:num_ibs)
-            MPI_IO_levelsetnorm_DATA%var%sf => levelset_norm%sf(0:m, 0:n, 0:p, 1:num_ibs, 1:3)
+            if (store_levelset) then 
+                MPI_IO_levelset_DATA%var%sf => levelset%sf(0:m, 0:n, 0:p, 1:num_ibs)
+                MPI_IO_levelsetnorm_DATA%var%sf => levelset_norm%sf(0:m, 0:n, 0:p, 1:num_ibs, 1:3)
+            end if 
 #endif
 
 #endif
@@ -1071,6 +1095,233 @@ contains
 
     end subroutine s_mpi_sendrecv_variables_buffers
 
+    !> Exchanges the buffer layers of a single scalar field with the neighbouring processor in
+    !! direction mpi_dir (1 = x, 2 = y, 3 = z) on side pbc_loc (-1 = beginning, +1 = end).
+    !! Exactly one value is packed per cell, matching the allocation of buff_send/recv_scalarfield.
+    subroutine s_mpi_sendrecv_variables_buffers_scalarfield(q_temp, mpi_dir, pbc_loc)
+
+        type(scalar_field), intent(inout) :: q_temp !< Scalar field whose buffer cells are overwritten
+        integer, intent(in) :: mpi_dir, pbc_loc !< Exchange direction (1-3) and side (-1 or +1)
+
+        integer :: i, j, k, l, r !< Generic loop iterators
+
+        integer :: buffer_counts(1:3), buffer_count
+
+        type(int_bounds_info) :: boundary_conditions(1:3)
+        integer :: beg_end(1:2), grid_dims(1:3)
+        integer :: dst_proc, src_proc, recv_tag, send_tag
+
+        logical :: beg_end_geq_0
+
+        integer :: pack_offset, unpack_offset
+
+        real(wp), pointer :: p_send, p_recv
+#ifdef MFC_MPI
+
+        call nvtxStartRange("RHS-COMM-PACKBUF")
+        ! Only one value per cell is exchanged, so counts and indices use a stride of 1 rather than v_size
+
+        buffer_counts = (/ &
+                        buff_size*1*(n + 1)*(p + 1), &
+                        buff_size*1*(m + 2*buff_size + 1)*(p + 1), &
+                        buff_size*1*(m + 2*buff_size + 1)*(n + 2*buff_size + 1) &
+                        /)
+
+        buffer_count = buffer_counts(mpi_dir)
+        boundary_conditions = (/bc_x, bc_y, bc_z/)
+        beg_end = (/boundary_conditions(mpi_dir)%beg, boundary_conditions(mpi_dir)%end/)
+        beg_end_geq_0 = beg_end(max(pbc_loc, 0) - pbc_loc + 1) >= 0
+
+        ! Implements:
+        ! pbc_loc  bc_x >= 0 -> [send/recv]_tag  [dst/src]_proc
+        ! -1 (=0)      0            ->     [1,0]       [0,0]      | 0 0 [1,0] [beg,beg]
+        ! -1 (=0)      1            ->     [0,0]       [1,0]      | 0 1 [0,0] [end,beg]
+        ! +1 (=1)      0            ->     [0,1]       [1,1]      | 1 0 [0,1] [end,end]
+        ! +1 (=1)      1            ->     [1,1]       [0,1]      | 1 1 [1,1] [beg,end]
+
+        send_tag = f_logical_to_int(.not. f_xor(beg_end_geq_0, pbc_loc == 1))
+        recv_tag = f_logical_to_int(pbc_loc == 1)
+
+        dst_proc = beg_end(1 + f_logical_to_int(f_xor(pbc_loc == 1, beg_end_geq_0)))
+        src_proc = beg_end(1 + f_logical_to_int(pbc_loc == 1))
+
+        grid_dims = (/m, n, p/)
+
+        pack_offset = 0
+        if (f_xor(pbc_loc == 1, beg_end_geq_0)) then
+            pack_offset = grid_dims(mpi_dir) - buff_size + 1
+        end if
+
+        unpack_offset = 0
+        if (pbc_loc == 1) then
+            unpack_offset = grid_dims(mpi_dir) + buff_size + 1
+        end if
+
+        ! Pack Buffer to Send
+        #:for mpi_dir in [1, 2, 3]
+            if (mpi_dir == ${mpi_dir}$) then
+                #:if mpi_dir == 1
+                    !$acc parallel loop collapse(4) gang vector default(present) private(r)
+                    do l = 0, p
+                        do k = 0, n
+                            do j = 0, buff_size - 1
+                                do i = 1, 1
+                                    r = (i - 1) + (j + buff_size*(k + (n + 1)*l))
+                                    buff_send_scalarfield(r) = q_temp%sf(j + pack_offset, k, l)
+                                end do
+                            end do
+                        end do
+                    end do
+                #:elif mpi_dir == 2
+                    !$acc parallel loop collapse(4) gang vector default(present) private(r)
+                    do i = 1, 1
+                        do l = 0, p
+                            do k = 0, buff_size - 1
+                                do j = -buff_size, m + buff_size
+                                    r = (i - 1) + &
+                                        ((j + buff_size) + (m + 2*buff_size + 1)* &
+                                         (k + buff_size*l))
+                                    buff_send_scalarfield(r) = q_temp%sf(j, k + pack_offset, l)
+                                end do
+                            end do
+                        end do
+                    end do
+                #:else
+                    !$acc parallel loop collapse(4) gang vector default(present) private(r)
+                    do i = 1, 1
+                        do l = 0, buff_size - 1
+                            do k = -buff_size, n + buff_size
+                                do j = -buff_size, m + buff_size
+                                    r = (i - 1) + &
+                                        ((j + buff_size) + (m + 2*buff_size + 1)* &
+                                         ((k + buff_size) + (n + 2*buff_size + 1)*l))
+                                    buff_send_scalarfield(r) = q_temp%sf(j, k, l + pack_offset)
+                                end do
+                            end do
+                        end do
+                    end do
+                #:endif
+            end if
+        #:endfor
+        call nvtxEndRange ! Packbuf
+
+        p_send => buff_send_scalarfield(0)
+        p_recv => buff_recv_scalarfield(0)
+
+        ! Send/Recv
+#ifdef MFC_SIMULATION
+        #:for rdma_mpi in [False, True]
+            if (rdma_mpi .eqv. ${'.true.' if rdma_mpi else '.false.'}$) then
+                #:if rdma_mpi
+                    !$acc data attach(p_send, p_recv)
+                    !$acc host_data use_device(p_send, p_recv)
+                    call nvtxStartRange("RHS-COMM-SENDRECV-RDMA")
+                #:else
+                    call nvtxStartRange("RHS-COMM-DEV2HOST")
+                    !$acc update host(buff_send_scalarfield)
+                    call nvtxEndRange
+                    call nvtxStartRange("RHS-COMM-SENDRECV-NO-RMDA")
+                #:endif
+
+                call MPI_SENDRECV( &
+                    p_send, buffer_count, mpi_p, dst_proc, send_tag, &
+                    p_recv, buffer_count, mpi_p, src_proc, recv_tag, &
+                    MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
+
+                call nvtxEndRange ! RHS-MPI-SENDRECV-(NO)-RDMA
+
+                #:if rdma_mpi
+                    !$acc end host_data
+                    !$acc end data
+                    !$acc wait
+                #:else
+                    call nvtxStartRange("RHS-COMM-HOST2DEV")
+                    !$acc update device(buff_recv_scalarfield)
+                    call nvtxEndRange
+                #:endif
+            end if
+        #:endfor
+#else
+        call MPI_SENDRECV( &
+            p_send, buffer_count, mpi_p, dst_proc, send_tag, &
+            p_recv, buffer_count, mpi_p, src_proc, recv_tag, &
+            MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
+#endif
+
+        ! Unpack Received Buffer
+        call nvtxStartRange("RHS-COMM-UNPACKBUF")
+        #:for mpi_dir in [1, 2, 3]
+            if (mpi_dir == ${mpi_dir}$) then
+                #:if mpi_dir == 1
+                    !$acc parallel loop collapse(4) gang vector default(present) private(r)
+                    do l = 0, p
+                        do k = 0, n
+                            do j = -buff_size, -1
+                                do i = 1, 1
+                                    r = (i - 1) + &
+                                        (j + buff_size*((k + 1) + (n + 1)*l))
+                                    q_temp%sf(j + unpack_offset, k, l) = buff_recv_scalarfield(r)
+#if defined(__INTEL_COMPILER)
+                                    if (ieee_is_nan(q_temp%sf(j + unpack_offset, k, l))) then
+                                        print *, "Error", j, k, l, i
+                                        error stop "NaN(s) in recv"
+                                    end if
+#endif
+                                end do
+                            end do
+                        end do
+                    end do
+                #:elif mpi_dir == 2
+                    !$acc parallel loop collapse(4) gang vector default(present) private(r)
+                    do i = 1, 1
+                        do l = 0, p
+                            do k = -buff_size, -1
+                                do j = -buff_size, m + buff_size
+                                    r = (i - 1) + &
+                                        ((j + buff_size) + (m + 2*buff_size + 1)* &
+                                         ((k + buff_size) + buff_size*l))
+                                    q_temp%sf(j, k + unpack_offset, l) = buff_recv_scalarfield(r)
+#if defined(__INTEL_COMPILER)
+                                    if (ieee_is_nan(q_temp%sf(j, k + unpack_offset, l))) then
+                                        print *, "Error", j, k, l, i
+                                        error stop "NaN(s) in recv"
+                                    end if
+#endif
+                                end do
+                            end do
+                        end do
+                    end do
+                #:else
+                    ! Unpacking buffer in the z-direction
+                    !$acc parallel loop collapse(4) gang vector default(present) private(r)
+                    do i = 1, 1
+                        do l = -buff_size, -1
+                            do k = -buff_size, n + buff_size
+                                do j = -buff_size, m + buff_size
+                                    r = (i - 1) + &
+                                        ((j + buff_size) + (m + 2*buff_size + 1)* &
+                                         ((k + buff_size) + (n + 2*buff_size + 1)* &
+                                          (l + buff_size)))
+                                    q_temp%sf(j, k, l + unpack_offset) = buff_recv_scalarfield(r)
+#if defined(__INTEL_COMPILER)
+                                    if (ieee_is_nan(q_temp%sf(j, k, l + unpack_offset))) then
+                                        print *, "Error", j, k, l, i
+                                        error stop "NaN(s) in recv"
+                                    end if
+#endif
+                                end do
+                            end do
+                        end do
+                    end do
+                #:endif
+            end if
+        #:endfor
+        call nvtxEndRange
+
+#endif
+
+    end subroutine s_mpi_sendrecv_variables_buffers_scalarfield
+
     subroutine s_mpi_sendrecv_capilary_variables_buffers(c_divs_vf, mpi_dir, pbc_loc)
 
         type(scalar_field), dimension(num_dims + 1), intent(inout) :: c_divs_vf
@@ -1299,6 +1550,10 @@ contains
 
 #ifdef MFC_MPI
         deallocate (buff_send, buff_recv)
+#ifdef MFC_SIMULATION
+        @:DEALLOCATE(buff_send_scalarfield)
+        @:DEALLOCATE(buff_recv_scalarfield)
+#endif
 #endif
 
     end subroutine s_finalize_mpi_common_module
diff --git a/src/post_process/m_global_parameters.fpp b/src/post_process/m_global_parameters.fpp
index 00c5f0ec27..9db5321c55 100644
--- a/src/post_process/m_global_parameters.fpp
+++ b/src/post_process/m_global_parameters.fpp
@@ -319,6 +319,10 @@ module m_global_parameters
 
     real(wp) :: Bx0 !< Constant magnetic field in the x-direction (1D)
 
+    logical :: periodic_ibs
+    logical :: store_levelset
+    logical :: slab_domain_decomposition
+
 contains
 
     !> Assigns default values to user inputs prior to reading
@@ -460,6 +464,10 @@ contains
         ! MHD
         Bx0 = dflt_real
 
+        periodic_ibs = .false.
+        store_levelset = .true.
+        slab_domain_decomposition = .false.
+
     end subroutine s_assign_default_values_to_user_inputs
 
     !>  Computation of parameters, allocation procedures, and/or
diff --git a/src/post_process/m_mpi_proxy.fpp b/src/post_process/m_mpi_proxy.fpp
index 8fc70bfe77..9e368d7fa4 100644
--- a/src/post_process/m_mpi_proxy.fpp
+++ b/src/post_process/m_mpi_proxy.fpp
@@ -171,7 +171,8 @@ contains
             & 'adv_n', 'ib', 'cfl_adap_dt', 'cfl_const_dt', 'cfl_dt',          &
             & 'surface_tension', 'hyperelasticity', 'bubbles_lagrange',        &
             & 'rkck_adap_dt', 'output_partial_domain', 'relativity',           &
-            & 'cont_damage' ]
+            & 'cont_damage', 'periodic_ibs', 'store_levelset',                 &
+            & 'slab_domain_decomposition' ]
             call MPI_BCAST(${VAR}$, 1, MPI_LOGICAL, 0, MPI_COMM_WORLD, ierr)
         #:endfor
 
@@ -297,6 +298,19 @@ contains
 
                     end do
 
+                else if (slab_domain_decomposition) then 
+                    if (proc_rank == 0) then 
+                        print *, 'slab domain decomposition...'
+                    end if 
+                    
+                    ! continuous x and y direction, block decomposition in z
+                    num_procs_x = 1
+                    num_procs_y = 1
+                    num_procs_z = num_procs
+                    ierr = -1
+                    if (mod((p+1), num_procs_z) == 0) then 
+                        ierr = 0
+                    end if
                 else
 
                     ! Initial values of the processor factorization optimization
diff --git a/src/post_process/m_start_up.f90 b/src/post_process/m_start_up.f90
index 8fece2820b..f04efd75e6 100644
--- a/src/post_process/m_start_up.f90
+++ b/src/post_process/m_start_up.f90
@@ -84,7 +84,8 @@ subroutine s_read_input_file
             relax_model, cf_wrt, sigma, adv_n, ib, num_ibs, &
             cfl_adap_dt, cfl_const_dt, t_save, t_stop, n_start, &
             cfl_target, surface_tension, bubbles_lagrange, rkck_adap_dt, &
-            sim_data, hyperelasticity, Bx0, relativity, cont_damage
+            sim_data, hyperelasticity, Bx0, relativity, cont_damage, & 
+            periodic_ibs, store_levelset, slab_domain_decomposition
 
         ! Inquiring the status of the post_process.inp file
         file_loc = 'post_process.inp'
diff --git a/src/pre_process/m_compute_levelset.fpp b/src/pre_process/m_compute_levelset.fpp
index d339157be6..1a7547bb6b 100644
--- a/src/pre_process/m_compute_levelset.fpp
+++ b/src/pre_process/m_compute_levelset.fpp
@@ -474,6 +474,11 @@ contains
         real(wp) :: x_centroid, y_centroid, z_centroid
         real(wp), dimension(3) :: dist_vec
 
+        real(wp) :: x_domain_beg, x_domain_end, y_domain_beg, y_domain_end, z_domain_beg, z_domain_end
+        real(wp) :: x_pcen, y_pcen, z_pcen !< periodically projected centroids of sphere
+        real(wp), dimension(7, 3) :: dist_vec_per
+        real(wp), dimension(7) :: dist_per
+
         integer :: i, j, k !< Loop index variables
 
         radius = patch_ib(ib_patch_id)%radius
@@ -481,6 +486,37 @@ contains
         y_centroid = patch_ib(ib_patch_id)%y_centroid
         z_centroid = patch_ib(ib_patch_id)%z_centroid
 
+        call s_mpi_allreduce_min(x_domain%beg, x_domain_beg)
+        call s_mpi_allreduce_max(x_domain%end, x_domain_end)
+        call s_mpi_allreduce_min(y_domain%beg, y_domain_beg)
+        call s_mpi_allreduce_max(y_domain%end, y_domain_end)
+        call s_mpi_allreduce_min(z_domain%beg, z_domain_beg)
+        call s_mpi_allreduce_max(z_domain%end, z_domain_end)
+
+        if (periodic_ibs) then
+            if ((x_centroid - x_domain_beg) <= radius) then
+                x_pcen = x_domain_end + (x_centroid - x_domain_beg)
+            else if ((x_domain_end - x_centroid) <= radius) then 
+                x_pcen = x_domain_beg - (x_domain_end - x_centroid)
+            else 
+                x_pcen = x_centroid
+            end if
+            if ((y_centroid - y_domain_beg) <= radius) then
+                y_pcen = y_domain_end + (y_centroid - y_domain_beg)
+            else if ((y_domain_end - y_centroid) <= radius) then 
+                y_pcen = y_domain_beg - (y_domain_end - y_centroid)
+            else 
+                y_pcen = y_centroid
+            end if
+            if ((z_centroid - z_domain_beg) <= radius) then
+                z_pcen = z_domain_end + (z_centroid - z_domain_beg)
+            else if ((z_domain_end - z_centroid) <= radius) then 
+                z_pcen = z_domain_beg - (z_domain_end - z_centroid)
+            else 
+                z_pcen = z_centroid
+            end if
+        end if
+
         do i = 0, m
             do j = 0, n
                 do k = 0, p
@@ -488,6 +524,73 @@ contains
                     dist_vec(2) = y_cc(j) - y_centroid
                     dist_vec(3) = z_cc(k) - z_centroid
                     dist = sqrt(sum(dist_vec**2))
+
+                    ! all permutations of periodically projected ib
+                    if (periodic_ibs) then
+                        dist_vec_per(1, 1) = x_cc(i) - x_pcen 
+                        dist_vec_per(1, 2) = y_cc(j) - y_pcen
+                        dist_vec_per(1, 3) = z_cc(k) - z_pcen
+                        dist_per(1) = sqrt(sum(dist_vec_per(1, :)**2))
+                        if (dist_per(1) < dist) then    
+                            dist = dist_per(1)
+                            dist_vec = dist_vec_per(1, :)
+                        end if 
+
+                        dist_vec_per(2, 1) = x_cc(i) - x_pcen 
+                        dist_vec_per(2, 2) = y_cc(j) - y_centroid
+                        dist_vec_per(2, 3) = z_cc(k) - z_pcen
+                        dist_per(2) = sqrt(sum(dist_vec_per(2, :)**2))
+                        if (dist_per(2) < dist) then    
+                            dist = dist_per(2)
+                            dist_vec = dist_vec_per(2, :)
+                        end if
+
+                        dist_vec_per(3, 1) = x_cc(i) - x_pcen 
+                        dist_vec_per(3, 2) = y_cc(j) - y_pcen
+                        dist_vec_per(3, 3) = z_cc(k) - z_centroid
+                        dist_per(3) = sqrt(sum(dist_vec_per(3, :)**2))
+                        if (dist_per(3) < dist) then    
+                            dist = dist_per(3)
+                            dist_vec = dist_vec_per(3, :)
+                        end if
+
+                        dist_vec_per(4, 1) = x_cc(i) - x_pcen 
+                        dist_vec_per(4, 2) = y_cc(j) - y_centroid
+                        dist_vec_per(4, 3) = z_cc(k) - z_centroid
+                        dist_per(4) = sqrt(sum(dist_vec_per(4, :)**2))
+                        if (dist_per(4) < dist) then    
+                            dist = dist_per(4)
+                            dist_vec = dist_vec_per(4, :)
+                        end if
+
+                        dist_vec_per(5, 1) = x_cc(i) - x_centroid
+                        dist_vec_per(5, 2) = y_cc(j) - y_pcen
+                        dist_vec_per(5, 3) = z_cc(k) - z_pcen
+                        dist_per(5) = sqrt(sum(dist_vec_per(5, :)**2))
+                        if (dist_per(5) < dist) then    
+                            dist = dist_per(5)
+                            dist_vec = dist_vec_per(5, :)
+                        end if
+
+                        dist_vec_per(6, 1) = x_cc(i) - x_centroid
+                        dist_vec_per(6, 2) = y_cc(j) - y_pcen
+                        dist_vec_per(6, 3) = z_cc(k) - z_centroid
+                        dist_per(6) = sqrt(sum(dist_vec_per(6, :)**2))
+                        if (dist_per(6) < dist) then    
+                            dist = dist_per(6)
+                            dist_vec = dist_vec_per(6, :)
+                        end if
+
+                        dist_vec_per(7, 1) = x_cc(i) - x_centroid
+                        dist_vec_per(7, 2) = y_cc(j) - y_centroid
+                        dist_vec_per(7, 3) = z_cc(k) - z_pcen
+                        dist_per(7) = sqrt(sum(dist_vec_per(7, :)**2))
+                        if (dist_per(7) < dist) then    
+                            dist = dist_per(7)
+                            dist_vec = dist_vec_per(7, :)
+                        end if
+                    end if
+
                     levelset%sf(i, j, k, ib_patch_id) = dist - radius
                     if (dist == 0) then
                         levelset_norm%sf(i, j, k, ib_patch_id, :) = (/1, 0, 0/)
diff --git a/src/pre_process/m_data_output.fpp b/src/pre_process/m_data_output.fpp
index 7ae637f034..0030749793 100644
--- a/src/pre_process/m_data_output.fpp
+++ b/src/pre_process/m_data_output.fpp
@@ -217,17 +217,19 @@ contains
         end if
 
         ! Outtputting Levelset Info
-        file_loc = trim(t_step_dir)//'/levelset.dat'
+        if (store_levelset) then 
+            file_loc = trim(t_step_dir)//'/levelset.dat'
 
-        open (1, FILE=trim(file_loc), FORM='unformatted', STATUS=status)
-        write (1) levelset%sf
-        close (1)
+            open (1, FILE=trim(file_loc), FORM='unformatted', STATUS=status)
+            write (1) levelset%sf
+            close (1)
 
-        file_loc = trim(t_step_dir)//'/levelset_norm.dat'
+            file_loc = trim(t_step_dir)//'/levelset_norm.dat'
 
-        open (1, FILE=trim(file_loc), FORM='unformatted', STATUS=status)
-        write (1) levelset_norm%sf
-        close (1)
+            open (1, FILE=trim(file_loc), FORM='unformatted', STATUS=status)
+            write (1) levelset_norm%sf
+            close (1)
+        end if
 
         ! Outputting Conservative Variables
         do i = 1, sys_size
@@ -774,45 +776,47 @@ contains
 
             call MPI_FILE_CLOSE(ifile, ierr)
 
-            ! Levelset
-            write (file_loc, '(A)') 'levelset.dat'
-            file_loc = trim(restart_dir)//trim(mpiiofs)//trim(file_loc)
-            inquire (FILE=trim(file_loc), EXIST=file_exist)
-            if (file_exist .and. proc_rank == 0) then
-                call MPI_FILE_DELETE(file_loc, mpi_info_int, ierr)
-            end if
-            call MPI_FILE_OPEN(MPI_COMM_WORLD, file_loc, ior(MPI_MODE_WRONLY, MPI_MODE_CREATE), &
-                               mpi_info_int, ifile, ierr)
+            if (store_levelset) then 
+                ! Levelset
+                write (file_loc, '(A)') 'levelset.dat'
+                file_loc = trim(restart_dir)//trim(mpiiofs)//trim(file_loc)
+                inquire (FILE=trim(file_loc), EXIST=file_exist)
+                if (file_exist .and. proc_rank == 0) then
+                    call MPI_FILE_DELETE(file_loc, mpi_info_int, ierr)
+                end if
+                call MPI_FILE_OPEN(MPI_COMM_WORLD, file_loc, ior(MPI_MODE_WRONLY, MPI_MODE_CREATE), &
+                                mpi_info_int, ifile, ierr)
 
-            ! Initial displacement to skip at beginning of file
-            disp = 0
+                ! Initial displacement to skip at beginning of file
+                disp = 0
 
-            call MPI_FILE_SET_VIEW(ifile, disp, mpi_p, MPI_IO_levelset_DATA%view, &
-                                   'native', mpi_info_int, ierr)
-            call MPI_FILE_WRITE_ALL(ifile, MPI_IO_levelset_DATA%var%sf, data_size*num_ibs, &
-                                    mpi_p, status, ierr)
+                call MPI_FILE_SET_VIEW(ifile, disp, mpi_p, MPI_IO_levelset_DATA%view, &
+                                    'native', mpi_info_int, ierr)
+                call MPI_FILE_WRITE_ALL(ifile, MPI_IO_levelset_DATA%var%sf, data_size*num_ibs, &
+                                        mpi_p, status, ierr)
 
-            call MPI_FILE_CLOSE(ifile, ierr)
+                call MPI_FILE_CLOSE(ifile, ierr)
 
-            ! Levelset Norm
-            write (file_loc, '(A)') 'levelset_norm.dat'
-            file_loc = trim(restart_dir)//trim(mpiiofs)//trim(file_loc)
-            inquire (FILE=trim(file_loc), EXIST=file_exist)
-            if (file_exist .and. proc_rank == 0) then
-                call MPI_FILE_DELETE(file_loc, mpi_info_int, ierr)
-            end if
-            call MPI_FILE_OPEN(MPI_COMM_WORLD, file_loc, ior(MPI_MODE_WRONLY, MPI_MODE_CREATE), &
-                               mpi_info_int, ifile, ierr)
+                ! Levelset Norm
+                write (file_loc, '(A)') 'levelset_norm.dat'
+                file_loc = trim(restart_dir)//trim(mpiiofs)//trim(file_loc)
+                inquire (FILE=trim(file_loc), EXIST=file_exist)
+                if (file_exist .and. proc_rank == 0) then
+                    call MPI_FILE_DELETE(file_loc, mpi_info_int, ierr)
+                end if
+                call MPI_FILE_OPEN(MPI_COMM_WORLD, file_loc, ior(MPI_MODE_WRONLY, MPI_MODE_CREATE), &
+                                mpi_info_int, ifile, ierr)
 
-            ! Initial displacement to skip at beginning of file
-            disp = 0
+                ! Initial displacement to skip at beginning of file
+                disp = 0
 
-            call MPI_FILE_SET_VIEW(ifile, disp, mpi_p, MPI_IO_levelsetnorm_DATA%view, &
-                                   'native', mpi_info_int, ierr)
-            call MPI_FILE_WRITE_ALL(ifile, MPI_IO_levelsetnorm_DATA%var%sf, data_size*num_ibs*3, &
-                                    mpi_p, status, ierr)
+                call MPI_FILE_SET_VIEW(ifile, disp, mpi_p, MPI_IO_levelsetnorm_DATA%view, &
+                                    'native', mpi_info_int, ierr)
+                call MPI_FILE_WRITE_ALL(ifile, MPI_IO_levelsetnorm_DATA%var%sf, data_size*num_ibs*3, &
+                                        mpi_p, status, ierr)
 
-            call MPI_FILE_CLOSE(ifile, ierr)
+                call MPI_FILE_CLOSE(ifile, ierr)
+            end if
         end if
 
         if (ib) then
diff --git a/src/pre_process/m_global_parameters.fpp b/src/pre_process/m_global_parameters.fpp
index 8305f47996..fa8966922c 100644
--- a/src/pre_process/m_global_parameters.fpp
+++ b/src/pre_process/m_global_parameters.fpp
@@ -283,6 +283,10 @@ module m_global_parameters
     !! conditions data to march the solution in the physical computational domain
     !! to the next time-step.
 
+    logical :: periodic_ibs
+    logical :: store_levelset
+    logical :: slab_domain_decomposition
+
 contains
 
     !>  Assigns default values to user inputs prior to reading
@@ -550,6 +554,10 @@ contains
 
         Bx0 = dflt_real
 
+        periodic_ibs = .false.
+        store_levelset = .true.
+        slab_domain_decomposition = .false.
+
     end subroutine s_assign_default_values_to_user_inputs
 
     !> Computation of parameters, allocation procedures, and/or
diff --git a/src/pre_process/m_initial_condition.fpp b/src/pre_process/m_initial_condition.fpp
index 0efc1c225d..78f2e73cb5 100644
--- a/src/pre_process/m_initial_condition.fpp
+++ b/src/pre_process/m_initial_condition.fpp
@@ -92,8 +92,10 @@ contains
 
         allocate (ib_markers%sf(0:m, 0:n, 0:p))
 
-        allocate (levelset%sf(0:m, 0:n, 0:p, 1:num_ibs))
-        allocate (levelset_norm%sf(0:m, 0:n, 0:p, 1:num_ibs, 1:3))
+        if (store_levelset) then 
+            allocate (levelset%sf(0:m, 0:n, 0:p, 1:num_ibs))
+            allocate (levelset_norm%sf(0:m, 0:n, 0:p, 1:num_ibs, 1:3))
+        end if
 
         if (qbmm .and. .not. polytropic) then
             !Allocate bubble pressure pb and vapor mass mv for non-polytropic qbmm at all quad nodes and R0 bins
diff --git a/src/pre_process/m_mpi_proxy.fpp b/src/pre_process/m_mpi_proxy.fpp
index a0bea3caee..691ba56add 100644
--- a/src/pre_process/m_mpi_proxy.fpp
+++ b/src/pre_process/m_mpi_proxy.fpp
@@ -70,7 +70,8 @@ contains
             & 'qbmm', 'file_per_process', 'adv_n', 'ib' , 'cfl_adap_dt',       &
             & 'cfl_const_dt', 'cfl_dt', 'surface_tension',                     &
             & 'hyperelasticity', 'pre_stress', 'elliptic_smoothing', 'viscous',&
-            & 'bubbles_lagrange', 'bc_io', 'mhd', 'relativity', 'cont_damage'  ]
+            & 'bubbles_lagrange', 'bc_io', 'mhd', 'relativity', 'cont_damage', & 
+            & 'periodic_ibs', 'store_levelset', 'slab_domain_decomposition' ]
             call MPI_BCAST(${VAR}$, 1, MPI_LOGICAL, 0, MPI_COMM_WORLD, ierr)
         #:endfor
         call MPI_BCAST(fluid_rho(1), num_fluids_max, MPI_LOGICAL, 0, MPI_COMM_WORLD, ierr)
@@ -251,6 +252,19 @@ contains
 
                     end do
 
+                else if (slab_domain_decomposition) then 
+                    if (proc_rank == 0) then 
+                        print *, 'slab domain decomposition...'
+                    end if 
+                    
+                    ! continuous x and y direction, block decomposition in z
+                    num_procs_x = 1
+                    num_procs_y = 1
+                    num_procs_z = num_procs
+                    ierr = -1
+                    if (mod((p+1), num_procs_z) == 0) then 
+                        ierr = 0
+                    end if
                 else
 
                     ! Initial values of the processor factorization optimization
diff --git a/src/pre_process/m_patches.fpp b/src/pre_process/m_patches.fpp
index c01f91654b..40cb8a86fe 100644
--- a/src/pre_process/m_patches.fpp
+++ b/src/pre_process/m_patches.fpp
@@ -125,7 +125,9 @@ contains
 
                 if (patch_ib(i)%geometry == 8) then
                     call s_sphere(i, ib_markers_sf, q_prim_vf, ib)
-                    call s_sphere_levelset(levelset, levelset_norm, i)
+                    if (store_levelset) then 
+                        call s_sphere_levelset(levelset, levelset_norm, i)
+                    end if 
                 elseif (patch_ib(i)%geometry == 9) then
                     call s_cuboid(i, ib_markers_sf, q_prim_vf, ib)
                     call s_cuboid_levelset(levelset, levelset_norm, i)
@@ -1724,6 +1726,9 @@ contains
             !! Variables to initialize the pressure field that corresponds to the
             !! bubble-collapse test case found in Tiwari et al. (2013)
 
+        real(wp) :: x_domain_beg, x_domain_end, y_domain_beg, y_domain_end, z_domain_beg, z_domain_end
+        real(wp) :: x_pcen, y_pcen, z_pcen
+
         ! Transferring spherical patch's radius, centroid, smoothing patch
         ! identity and smoothing coefficient information
         if (present(ib)) then
@@ -1749,6 +1754,39 @@ contains
         ! and verifying whether the current patch has permission to write to
         ! that cell. If both queries check out, the primitive variables of
         ! the current patch are assigned to this cell.
+
+        call s_mpi_allreduce_min(x_domain%beg, x_domain_beg)
+        call s_mpi_allreduce_max(x_domain%end, x_domain_end)
+        call s_mpi_allreduce_min(y_domain%beg, y_domain_beg)
+        call s_mpi_allreduce_max(y_domain%end, y_domain_end)
+        call s_mpi_allreduce_min(z_domain%beg, z_domain_beg)
+        call s_mpi_allreduce_max(z_domain%end, z_domain_end)
+
+        ! periodically projected sphere centroid
+        if (periodic_ibs .and. present(ib)) then 
+            if ((x_centroid - x_domain_beg) <= radius) then
+                x_pcen = x_domain_end + (x_centroid - x_domain_beg)
+            else if ((x_domain_end - x_centroid) <= radius) then 
+                x_pcen = x_domain_beg - (x_domain_end - x_centroid)
+            else 
+                x_pcen = x_centroid
+            end if
+            if ((y_centroid - y_domain_beg) <= radius) then
+                y_pcen = y_domain_end + (y_centroid - y_domain_beg)
+            else if ((y_domain_end - y_centroid) <= radius) then 
+                y_pcen = y_domain_beg - (y_domain_end - y_centroid)
+            else 
+                y_pcen = y_centroid
+            end if
+            if ((z_centroid - z_domain_beg) <= radius) then
+                z_pcen = z_domain_end + (z_centroid - z_domain_beg)
+            else if ((z_domain_end - z_centroid) <= radius) then 
+                z_pcen = z_domain_beg - (z_domain_end - z_centroid)
+            else 
+                z_pcen = z_centroid
+            end if
+        end if
+
         do k = 0, p
             do j = 0, n
                 do i = 0, m
@@ -1788,6 +1826,36 @@ contains
                             @:analytical()
                         end if
                     end if
+
+                    if (periodic_ibs .and. present(ib)) then    
+                        ! check every permutation of the projected cell location
+                        if (((x_cc(i) - x_pcen)**2 &
+                            + (cart_y - y_pcen)**2 &
+                            + (cart_z - z_pcen)**2 <= radius**2) &
+                            .or. ((x_cc(i) - x_pcen)**2 &
+                            + (cart_y - y_centroid)**2 &
+                            + (cart_z - z_centroid)**2 <= radius**2) &
+                            .or. ((x_cc(i) - x_pcen)**2 &
+                            + (cart_y - y_pcen)**2 &
+                            + (cart_z - z_centroid)**2 <= radius**2) &
+                            .or. ((x_cc(i) - x_pcen)**2 &
+                            + (cart_y - y_centroid)**2 &
+                            + (cart_z - z_pcen)**2 <= radius**2) & 
+                            .or. ((x_cc(i) - x_centroid)**2 &
+                            + (cart_y - y_pcen)**2 &
+                            + (cart_z - z_centroid)**2 <= radius**2) &
+                            .or. ((x_cc(i) - x_centroid)**2 &
+                            + (cart_y - y_pcen)**2 &
+                            + (cart_z - z_pcen)**2 <= radius**2) &
+                            .or. ((x_cc(i) - x_centroid)**2 &
+                            + (cart_y - y_centroid)**2 &
+                            + (cart_z - z_pcen)**2 <= radius**2)) &
+                            then
+
+                            ! Updating the patch identities bookkeeping variable
+                            patch_id_fp(i, j, k) = patch_id
+                        end if
+                    end if
                 end do
             end do
         end do
diff --git a/src/pre_process/m_start_up.fpp b/src/pre_process/m_start_up.fpp
index 8749c22278..71ca6523b2 100644
--- a/src/pre_process/m_start_up.fpp
+++ b/src/pre_process/m_start_up.fpp
@@ -149,7 +149,8 @@ contains
             n_start_old, surface_tension, hyperelasticity, pre_stress, &
             rkck_adap_dt, elliptic_smoothing, elliptic_smoothing_iters, &
             viscous, bubbles_lagrange, bc_x, bc_y, bc_z, num_bc_patches, &
-            patch_bc, Bx0, relativity, cont_damage
+            patch_bc, Bx0, relativity, cont_damage, & 
+            periodic_ibs, store_levelset, slab_domain_decomposition
 
         ! Inquiring the status of the pre_process.inp file
         file_loc = 'pre_process.inp'
diff --git a/src/simulation/m_additional_forcing.fpp b/src/simulation/m_additional_forcing.fpp
new file mode 100644
index 0000000000..17765413bd
--- /dev/null
+++ b/src/simulation/m_additional_forcing.fpp
@@ -0,0 +1,158 @@
+#:include 'macros.fpp'
+
+module m_additional_forcing
+    use m_derived_types 
+
+    use m_global_parameters
+
+    use m_ibm
+
+    use m_mpi_proxy 
+
+    use m_volume_filtering
+
+    implicit none
+
+    private; public :: s_initialize_additional_forcing_module, & 
+ s_add_periodic_forcing, s_finalize_additional_forcing_module, & 
+ s_compute_phase_average, s_compute_periodic_forcing;
+
+    real(wp), allocatable, dimension(:) :: q_bar ! 1:3 rho*u, 4 rho, 5 T
+    type(scalar_field), allocatable, dimension(:) :: q_periodic_force
+    real(wp), allocatable, dimension(:) :: q_spatial_avg, q_spatial_avg_glb ! 1:3 rho*u, 4 rho, 5 T
+    real(wp) :: volfrac_phi
+    integer :: N_x_total_glb
+
+    !$acc declare create(q_bar, q_periodic_force, q_spatial_avg, q_spatial_avg_glb, volfrac_phi, N_x_total_glb)
+
+contains
+
+    !> Allocates the forcing work arrays, zeroes the running average q_bar, and caches volfrac_phi and N_x_total_glb
+    subroutine s_initialize_additional_forcing_module
+        integer :: i
+        if (periodic_forcing) then
+            @:ALLOCATE(q_bar(1:5))
+            q_bar = 0._wp ! read by s_compute_phase_average before its first write
+            !$acc update device(q_bar)
+            @:ALLOCATE(q_periodic_force(1:8))
+            do i = 1, 8
+                @:ALLOCATE(q_periodic_force(i)%sf(0:m, 0:n, 0:p))
+                @:ACC_SETUP_SFs(q_periodic_force(i))
+            end do
+            @:ALLOCATE(q_spatial_avg(1:5))
+            @:ALLOCATE(q_spatial_avg_glb(1:5))
+        end if
+        volfrac_phi = num_ibs * 4._wp/3._wp * pi * patch_ib(1)%radius**3 / ((x_domain%end - x_domain%beg)*(y_domain%end - y_domain%beg)*(z_domain%end - z_domain%beg))
+        N_x_total_glb = (m_glb + 1) * (n_glb + 1) * (p_glb + 1)
+        !$acc update device(volfrac_phi, N_x_total_glb)
+    end subroutine s_initialize_additional_forcing_module
+
+    !> Adds the periodic forcing terms (Khalloufi and Capecelatro) to the RHS, masked by fluid_indicator_function_I
+    subroutine s_add_periodic_forcing(rhs_vf)
+        type(scalar_field), dimension(sys_size), intent(inout) :: rhs_vf
+        integer :: i, j, k
+        ! NOTE(review): equations 1, 2 and 5 are addressed by literal index; presumably single-fluid 3D ordering -- confirm
+        !$acc parallel loop collapse(3) gang vector default(present)
+        do i = 0, m
+            do j = 0, n
+                do k = 0, p
+                    rhs_vf(1)%sf(i, j, k) = rhs_vf(1)%sf(i, j, k) + q_periodic_force(7)%sf(i, j, k) * fluid_indicator_function_I%sf(i, j, k) ! continuity: f_rho
+                    rhs_vf(2)%sf(i, j, k) = rhs_vf(2)%sf(i, j, k) + q_periodic_force(1)%sf(i, j, k) * fluid_indicator_function_I%sf(i, j, k) * fluid_indicator_function_I%sf(i, j, k) ! x momentum: f_u(1); NOTE(review): indicator is applied twice on this line only -- confirm intended
+                    rhs_vf(5)%sf(i, j, k) = rhs_vf(5)%sf(i, j, k) + (q_periodic_force(4)%sf(i, j, k) + q_periodic_force(8)%sf(i, j, k)) * fluid_indicator_function_I%sf(i, j, k) ! energy: u*f_u(1) + f_T
+                end do
+            end do
+        end do
+    end subroutine s_add_periodic_forcing
+
+    !> Computes the fluid-masked spatial mean of rho*u (1:3), rho (4) and T (5) over the global grid
+    !! and folds it into the running time average q_bar
+    subroutine s_compute_phase_average(q_cons_vf, t_step)
+        type(scalar_field), dimension(sys_size), intent(in) :: q_cons_vf
+        integer, intent(in) :: t_step !< running-mean weight; the update below assumes it starts at 1 -- TODO confirm against caller
+        integer :: i, j, k
+
+        ! Clear the accumulator on host AND device: the reduction below adds onto the device copy
+        q_spatial_avg(:) = 0._wp
+        !$acc update device(q_spatial_avg)
+
+        ! spatial average
+        !$acc parallel loop collapse(3) gang vector default(present) reduction(+:q_spatial_avg(:))
+        do i = 0, m 
+            do j = 0, n 
+                do k = 0, p 
+                    ! slot 4: fluid-masked density
+                    q_spatial_avg(4) = q_spatial_avg(4) + q_cons_vf(1)%sf(i, j, k) * fluid_indicator_function_I%sf(i, j, k)
+                    ! slot 5: temperature; 0.4/287 is presumably (gamma - 1)/R for air -- TODO confirm
+                    q_spatial_avg(5) = q_spatial_avg(5) + (0.4_wp/287._wp * (q_cons_vf(5)%sf(i, j, k)/q_cons_vf(1)%sf(i, j, k) & 
+                                        - 0.5_wp * ((q_cons_vf(2)%sf(i, j, k)/q_cons_vf(1)%sf(i, j, k))**2 & 
+                                        + (q_cons_vf(3)%sf(i, j, k)/q_cons_vf(1)%sf(i, j, k))**2 & 
+                                        + (q_cons_vf(4)%sf(i, j, k)/q_cons_vf(1)%sf(i, j, k))**2))) * fluid_indicator_function_I%sf(i, j, k)
+                    ! slots 1:3: fluid-masked momentum components
+                    q_spatial_avg(1) = q_spatial_avg(1) + (q_cons_vf(2)%sf(i, j, k)) * fluid_indicator_function_I%sf(i, j, k)
+                    q_spatial_avg(2) = q_spatial_avg(2) + (q_cons_vf(3)%sf(i, j, k)) * fluid_indicator_function_I%sf(i, j, k)
+                    q_spatial_avg(3) = q_spatial_avg(3) + (q_cons_vf(4)%sf(i, j, k)) * fluid_indicator_function_I%sf(i, j, k)
+                end do
+            end do
+        end do
+
+        !$acc update host(q_spatial_avg(:))
+
+        ! Sum over ranks, then normalize by the global cell count (host side)
+        do i = 1, 5 
+            call s_mpi_allreduce_sum(q_spatial_avg(i), q_spatial_avg_glb(i))
+            q_spatial_avg_glb(i) = q_spatial_avg_glb(i) / real(N_x_total_glb, wp)
+        end do
+
+        ! time average
+        do i = 1, 5 
+            q_bar(i) = ( (q_spatial_avg_glb(i) + (t_step - 1._wp)*q_bar(i)) / t_step ) 
+        end do
+
+        ! s_compute_periodic_forcing reads q_bar inside a device kernel, so publish the host values
+        !$acc update device(q_spatial_avg_glb, q_bar)
+    end subroutine s_compute_phase_average
+
+    !> Fills q_periodic_force(1:8) with the periodic forcing terms described in Khalloufi and Capecelatro
+    subroutine s_compute_periodic_forcing(q_cons_vf)
+        type(scalar_field), dimension(sys_size), intent(in) :: q_cons_vf
+
+        integer :: i, j, k
+
+        !$acc parallel loop collapse(3) gang vector default(present)
+        do i = 0, m
+            do j = 0, n
+                do k = 0, p
+                    ! f_u (1:3): (rho_inf_ref*u_inf_ref - q_bar/(1 - volfrac_phi))/dt; NOTE(review): same target for all three components -- confirm
+                    q_periodic_force(1)%sf(i, j, k) = (rho_inf_ref*u_inf_ref - q_bar(1)/(1._wp - volfrac_phi)) / dt
+                    q_periodic_force(2)%sf(i, j, k) = (rho_inf_ref*u_inf_ref - q_bar(2)/(1._wp - volfrac_phi)) / dt
+                    q_periodic_force(3)%sf(i, j, k) = (rho_inf_ref*u_inf_ref - q_bar(3)/(1._wp - volfrac_phi)) / dt
+
+                    ! u*f_u (4:6): local velocity component (momentum / density) times the matching f_u
+                    q_periodic_force(4)%sf(i, j, k) = q_cons_vf(2)%sf(i, j, k)/q_cons_vf(1)%sf(i, j, k) * q_periodic_force(1)%sf(i, j, k)
+                    q_periodic_force(5)%sf(i, j, k) = q_cons_vf(3)%sf(i, j, k)/q_cons_vf(1)%sf(i, j, k) * q_periodic_force(2)%sf(i, j, k)
+                    q_periodic_force(6)%sf(i, j, k) = q_cons_vf(4)%sf(i, j, k)/q_cons_vf(1)%sf(i, j, k) * q_periodic_force(3)%sf(i, j, k)
+
+                    ! f_rho (7): (rho_inf_ref - q_bar(4)/(1 - volfrac_phi))/dt
+                    q_periodic_force(7)%sf(i, j, k) = (rho_inf_ref - q_bar(4)/(1._wp - volfrac_phi)) / dt
+
+                    ! f_T (8): local density / 1.4 times the temperature deficit per dt; NOTE(review): 1.4 is presumably gamma -- confirm
+                    q_periodic_force(8)%sf(i, j, k) = (q_cons_vf(1)%sf(i, j, k) / 1.4_wp) * (T_inf_ref - q_bar(5)/(1._wp - volfrac_phi)) / dt
+                end do 
+            end do
+        end do
+    end subroutine s_compute_periodic_forcing
+
+    !> Releases every array owned by this module; a no-op unless periodic_forcing allocated them
+    subroutine s_finalize_additional_forcing_module
+        integer :: l
+        if (.not. periodic_forcing) return
+        do l = 1, 8
+            @:DEALLOCATE(q_periodic_force(l)%sf)
+        end do
+        @:DEALLOCATE(q_periodic_force)
+        @:DEALLOCATE(q_bar)
+        @:DEALLOCATE(q_spatial_avg)
+        @:DEALLOCATE(q_spatial_avg_glb)
+    end subroutine s_finalize_additional_forcing_module
+
+end module m_additional_forcing
\ No newline at end of file
diff --git a/src/simulation/m_checker.fpp b/src/simulation/m_checker.fpp
index f3fe8eb6d5..04c1076f2a 100644
--- a/src/simulation/m_checker.fpp
+++ b/src/simulation/m_checker.fpp
@@ -346,6 +346,14 @@ contains
     subroutine s_check_inputs_misc
         @:PROHIBIT(probe_wrt .and. fd_order == dflt_int, "fd_order must be specified for probe_wrt")
         @:PROHIBIT(integral_wrt .and. (.not. bubbles_euler))
+        #:for X in ['x', 'y', 'z']
+            #:for BOUND in ['beg', 'end']
+                @:PROHIBIT(periodic_forcing .and. bc_${X}$%${BOUND}$ /= BC_PERIODIC, &
+                    "Periodic forcing requires all BCs to be periodic")
+                @:PROHIBIT(fourier_transform_filtering .and. bc_${X}$%${BOUND}$ /= BC_PERIODIC, &
+                    "Explicit filtering of flow data requires all BCs to be periodic due to fourier transform")
+            #:endfor
+        #:endfor
     end subroutine s_check_inputs_misc
 
     subroutine s_check_inputs_mhd
diff --git a/src/simulation/m_compute_particle_forces.fpp b/src/simulation/m_compute_particle_forces.fpp
new file mode 100644
index 0000000000..fd84657f5f
--- /dev/null
+++ b/src/simulation/m_compute_particle_forces.fpp
@@ -0,0 +1,72 @@
+#:include 'macros.fpp'
+
+module m_compute_particle_forces
+    ! Drag diagnostic: accumulates div_pres_visc_stress(momxb) per immersed-boundary marker
+    use m_derived_types
+    use m_global_parameters
+    use m_ibm
+    use m_mpi_proxy
+
+    implicit none
+
+    private; public :: s_initialize_particle_forces_module, &
+                       s_compute_drag_coefficient, s_finalize_particle_forces_module
+
+    !> Per-rank force sums, one slot per ib_markers value (0:num_ibs)
+    real(wp), allocatable, dimension(:) :: FD_calc
+    !$acc declare create(FD_calc)
+
+contains
+
+    !> Allocates FD_calc(0:num_ibs) when compute_CD is set.
+    subroutine s_initialize_particle_forces_module
+        if (compute_CD) then
+            @:ALLOCATE(FD_calc(0:num_ibs))
+        end if
+    end subroutine s_initialize_particle_forces_module
+
+    !> Sums div_pres_visc_stress(momxb)*dx*dy*dz per marker value, reduces the sums over
+    !! all ranks and prints C_D on rank 0. @param div_pres_visc_stress only momxb is read
+    subroutine s_compute_drag_coefficient(div_pres_visc_stress)
+        type(scalar_field), dimension(momxb:momxe), intent(in) :: div_pres_visc_stress
+        real(wp), dimension(0:num_ibs) :: FD_global
+        real(wp) :: drag_coeff
+        integer :: i, j, k
+
+        !$acc parallel loop gang vector default(present)
+        do i = 0, num_ibs
+            FD_calc(i) = 0._wp
+        end do
+
+        ! i innermost: contiguous sf(i, j, k) access; atomic since many cells share a marker slot
+        !$acc parallel loop collapse(3) gang vector default(present)
+        do k = 0, p
+            do j = 0, n
+                do i = 0, m
+                    !$acc atomic
+                    FD_calc(ib_markers%sf(i, j, k)) = FD_calc(ib_markers%sf(i, j, k)) &
+                                                    + div_pres_visc_stress(momxb)%sf(i, j, k) * dx(i) * dy(j) * dz(k)
+                end do
+            end do
+        end do
+
+        !$acc update host(FD_calc(:))
+        do i = 0, num_ibs
+            call s_mpi_allreduce_sum(FD_calc(i), FD_global(i))
+        end do
+
+        ! NOTE(review): only IB 1 is reported, scaled by pi*radius**2 of patch_ib(1) (presumably a sphere)
+        drag_coeff = FD_global(1) / (0.5_wp * rho_inf_ref * (u_inf_ref**2) * pi * (patch_ib(1)%radius**2))
+        if (proc_rank == 0) then
+            print *, 'C_D: ', drag_coeff
+        end if
+    end subroutine s_compute_drag_coefficient
+
+    !> Frees FD_calc when compute_CD is set.
+    subroutine s_finalize_particle_forces_module
+        if (compute_CD) then
+            @:DEALLOCATE(FD_calc)
+        end if
+    end subroutine s_finalize_particle_forces_module
+
+end module m_compute_particle_forces
diff --git a/src/simulation/m_global_parameters.fpp b/src/simulation/m_global_parameters.fpp
index 1bdc9f1d97..0ae8d7763e 100644
--- a/src/simulation/m_global_parameters.fpp
+++ b/src/simulation/m_global_parameters.fpp
@@ -501,6 +501,19 @@ module m_global_parameters
     !$acc declare create(tau_star, cont_damage_s, alpha_bar)
     !> @}
 
+    logical :: periodic_ibs
+    logical :: compute_CD
+    real(wp) :: mu_visc !< reference viscosity
+    real(wp) :: u_inf_ref !< reference freestream velocity
+    real(wp) :: rho_inf_ref !< reference freestream density 
+    real(wp) :: T_inf_ref !< reference freestream temperature
+    logical :: periodic_forcing
+    logical :: fourier_transform_filtering
+    logical :: store_levelset
+    logical :: slab_domain_decomposition
+
+    !$acc declare create(mu_visc, u_inf_ref, rho_inf_ref, T_inf_ref)
+
 contains
 
     !> Assigns default values to the user inputs before reading
@@ -776,6 +789,17 @@ contains
             relativity = .false.
         #:endif
 
+        periodic_ibs = .false.
+        compute_CD = .false.
+        mu_visc = dflt_real
+        u_inf_ref = dflt_real
+        rho_inf_ref = dflt_real
+        T_inf_ref = dflt_real
+        periodic_forcing = .false.
+        fourier_transform_filtering = .false.
+        store_levelset = .true.
+        slab_domain_decomposition = .false.
+
     end subroutine s_assign_default_values_to_user_inputs
 
     !>  The computation of parameters, the allocation of memory,
diff --git a/src/simulation/m_ibm.fpp b/src/simulation/m_ibm.fpp
index 0c18d17327..c5eb54bd87 100644
--- a/src/simulation/m_ibm.fpp
+++ b/src/simulation/m_ibm.fpp
@@ -45,6 +45,9 @@ module m_ibm
     integer :: num_inner_gps !< Number of ghost points
     !$acc declare create(gp_layers, num_gps, num_inner_gps)
 
+    real(wp) :: x_domain_beg_glb, x_domain_end_glb, y_domain_beg_glb, y_domain_end_glb, z_domain_beg_glb, z_domain_end_glb !< global domain beginning/end
+    !$acc declare create(x_domain_beg_glb, x_domain_end_glb, y_domain_beg_glb, y_domain_end_glb, z_domain_beg_glb, z_domain_end_glb)
+
 contains
 
     !>  Allocates memory for the variables in the IBM module
@@ -53,10 +56,12 @@ contains
         if (p > 0) then
             @:ALLOCATE(ib_markers%sf(-gp_layers:m+gp_layers, &
                 -gp_layers:n+gp_layers, -gp_layers:p+gp_layers))
-            @:ALLOCATE(levelset%sf(-gp_layers:m+gp_layers, &
-                -gp_layers:n+gp_layers, -gp_layers:p+gp_layers, 1:num_ibs))
-            @:ALLOCATE(levelset_norm%sf(-gp_layers:m+gp_layers, &
-                -gp_layers:n+gp_layers, -gp_layers:p+gp_layers, 1:num_ibs, 1:3))
+            if (store_levelset) then 
+                @:ALLOCATE(levelset%sf(-gp_layers:m+gp_layers, &
+                    -gp_layers:n+gp_layers, -gp_layers:p+gp_layers, 1:num_ibs))
+                @:ALLOCATE(levelset_norm%sf(-gp_layers:m+gp_layers, &
+                    -gp_layers:n+gp_layers, -gp_layers:p+gp_layers, 1:num_ibs, 1:3))
+            end if
         else
             @:ALLOCATE(ib_markers%sf(-gp_layers:m+gp_layers, &
                 -gp_layers:n+gp_layers, 0:0))
@@ -67,9 +72,11 @@ contains
         end if
 
         @:ACC_SETUP_SFs(ib_markers)
-        @:ACC_SETUP_SFs(levelset)
-        @:ACC_SETUP_SFs(levelset_norm)
-
+        if (store_levelset) then 
+            @:ACC_SETUP_SFs(levelset)
+            @:ACC_SETUP_SFs(levelset_norm)
+        end if
+        
         !$acc enter data copyin(num_gps, num_inner_gps)
 
     end subroutine s_initialize_ibm_module
@@ -106,6 +113,14 @@ contains
         call s_compute_interpolation_coeffs(ghost_points)
         !$acc update device(ghost_points)
 
+        call s_mpi_allreduce_min(x_domain%beg, x_domain_beg_glb)
+        call s_mpi_allreduce_max(x_domain%end, x_domain_end_glb)
+        call s_mpi_allreduce_min(y_domain%beg, y_domain_beg_glb)
+        call s_mpi_allreduce_max(y_domain%end, y_domain_end_glb)
+        call s_mpi_allreduce_min(z_domain%beg, z_domain_beg_glb)
+        call s_mpi_allreduce_max(z_domain%end, z_domain_end_glb)
+        !$acc update device(x_domain_beg_glb, x_domain_end_glb, y_domain_beg_glb, y_domain_end_glb, z_domain_beg_glb, z_domain_end_glb)
+
     end subroutine s_ibm_setup
 
     !>  Subroutine that updates the conservative variables at the ghost points
@@ -362,6 +377,13 @@ contains
         integer :: dir
         integer :: index
 
+        real(wp) :: radius, x_centroid, y_centroid, z_centroid
+        real(wp) :: x_pcen, y_pcen, z_pcen 
+        real(wp) :: dist_calc
+        real(wp), dimension(3) :: dist_vec
+        real(wp), dimension(7, 3) :: dist_vec_per
+        real(wp), dimension(7) :: dist_per
+
         do q = 1, num_gps
             gp = ghost_points(q)
             i = gp%loc(1)
@@ -377,8 +399,106 @@ contains
 
             ! Calculate and store the precise location of the image point
             patch_id = gp%ib_patch_id
-            dist = abs(levelset%sf(i, j, k, patch_id))
-            norm(:) = levelset_norm%sf(i, j, k, patch_id, :)
+            if (store_levelset) then 
+                dist = abs(levelset%sf(i, j, k, patch_id))
+                norm(:) = levelset_norm%sf(i, j, k, patch_id, :)
+            else ! compute levelset and levelset_norm on the fly
+                radius = patch_ib(patch_id)%radius
+                x_centroid = patch_ib(patch_id)%x_centroid
+                y_centroid = patch_ib(patch_id)%y_centroid
+                z_centroid = patch_ib(patch_id)%z_centroid
+                if ((x_centroid - x_domain_beg_glb) <= radius) then
+                    x_pcen = x_domain_end_glb + (x_centroid - x_domain_beg_glb)
+                else if ((x_domain_end_glb - x_centroid) <= radius) then 
+                    x_pcen = x_domain_beg_glb - (x_domain_end_glb - x_centroid)
+                else 
+                    x_pcen = x_centroid
+                end if
+                if ((y_centroid - y_domain_beg_glb) <= radius) then
+                    y_pcen = y_domain_end_glb + (y_centroid - y_domain_beg_glb)
+                else if ((y_domain_end_glb - y_centroid) <= radius) then 
+                    y_pcen = y_domain_beg_glb - (y_domain_end_glb - y_centroid)
+                else 
+                    y_pcen = y_centroid
+                end if
+                if ((z_centroid - z_domain_beg_glb) <= radius) then
+                    z_pcen = z_domain_end_glb + (z_centroid - z_domain_beg_glb)
+                else if ((z_domain_end_glb - z_centroid) <= radius) then 
+                    z_pcen = z_domain_beg_glb - (z_domain_end_glb - z_centroid)
+                else 
+                    z_pcen = z_centroid
+                end if
+                dist_vec(1) = x_cc(i) - x_centroid
+                dist_vec(2) = y_cc(j) - y_centroid
+                dist_vec(3) = z_cc(k) - z_centroid
+                dist_calc = sqrt(sum(dist_vec**2))
+                ! all permutations of periodically projected ib
+                if (periodic_ibs) then
+                    dist_vec_per(1, 1) = x_cc(i) - x_pcen 
+                    dist_vec_per(1, 2) = y_cc(j) - y_pcen
+                    dist_vec_per(1, 3) = z_cc(k) - z_pcen
+                    dist_per(1) = sqrt(sum(dist_vec_per(1, :)**2))
+                    if (dist_per(1) < dist_calc) then    
+                        dist_calc = dist_per(1)
+                        dist_vec = dist_vec_per(1, :)
+                    end if 
+                    dist_vec_per(2, 1) = x_cc(i) - x_pcen 
+                    dist_vec_per(2, 2) = y_cc(j) - y_centroid
+                    dist_vec_per(2, 3) = z_cc(k) - z_pcen
+                    dist_per(2) = sqrt(sum(dist_vec_per(2, :)**2))
+                    if (dist_per(2) < dist_calc) then    
+                        dist_calc = dist_per(2)
+                        dist_vec = dist_vec_per(2, :)
+                    end if
+                    dist_vec_per(3, 1) = x_cc(i) - x_pcen 
+                    dist_vec_per(3, 2) = y_cc(j) - y_pcen
+                    dist_vec_per(3, 3) = z_cc(k) - z_centroid
+                    dist_per(3) = sqrt(sum(dist_vec_per(3, :)**2))
+                    if (dist_per(3) < dist_calc) then    
+                        dist_calc = dist_per(3)
+                        dist_vec = dist_vec_per(3, :)
+                    end if
+                    dist_vec_per(4, 1) = x_cc(i) - x_pcen 
+                    dist_vec_per(4, 2) = y_cc(j) - y_centroid
+                    dist_vec_per(4, 3) = z_cc(k) - z_centroid
+                    dist_per(4) = sqrt(sum(dist_vec_per(4, :)**2))
+                    if (dist_per(4) < dist_calc) then    
+                        dist_calc = dist_per(4)
+                        dist_vec = dist_vec_per(4, :)
+                    end if
+                    dist_vec_per(5, 1) = x_cc(i) - x_centroid
+                    dist_vec_per(5, 2) = y_cc(j) - y_pcen
+                    dist_vec_per(5, 3) = z_cc(k) - z_pcen
+                    dist_per(5) = sqrt(sum(dist_vec_per(5, :)**2))
+                    if (dist_per(5) < dist_calc) then    
+                        dist_calc = dist_per(5)
+                        dist_vec = dist_vec_per(5, :)
+                    end if
+                    dist_vec_per(6, 1) = x_cc(i) - x_centroid
+                    dist_vec_per(6, 2) = y_cc(j) - y_pcen
+                    dist_vec_per(6, 3) = z_cc(k) - z_centroid
+                    dist_per(6) = sqrt(sum(dist_vec_per(6, :)**2))
+                    if (dist_per(6) < dist_calc) then    
+                        dist_calc = dist_per(6)
+                        dist_vec = dist_vec_per(6, :)
+                    end if
+                    dist_vec_per(7, 1) = x_cc(i) - x_centroid
+                    dist_vec_per(7, 2) = y_cc(j) - y_centroid
+                    dist_vec_per(7, 3) = z_cc(k) - z_pcen
+                    dist_per(7) = sqrt(sum(dist_vec_per(7, :)**2))
+                    if (dist_per(7) < dist_calc) then    
+                        dist_calc = dist_per(7)
+                        dist_vec = dist_vec_per(7, :)
+                    end if
+                end if
+                dist = abs(dist_calc - radius)
+                if (dist_calc == 0) then
+                    norm(:) = (/1, 0, 0/)
+                else
+                    norm(:) = dist_vec(:)/dist_calc
+                end if
+            end if ! end store_levelset if statement
+
             ghost_points(q)%ip_loc(:) = physical_loc(:) + 2*dist*norm(:)
 
             ! Find the closest grid point to the image point
@@ -863,8 +983,10 @@ contains
     subroutine s_finalize_ibm_module()
 
         @:DEALLOCATE(ib_markers%sf)
-        @:DEALLOCATE(levelset%sf)
-        @:DEALLOCATE(levelset_norm%sf)
+        if (store_levelset) then
+            @:DEALLOCATE(levelset%sf)
+            @:DEALLOCATE(levelset_norm%sf)
+        end if
 
     end subroutine s_finalize_ibm_module
 
diff --git a/src/simulation/m_mpi_proxy.fpp b/src/simulation/m_mpi_proxy.fpp
index 33b61a9284..b1f1c28c8c 100644
--- a/src/simulation/m_mpi_proxy.fpp
+++ b/src/simulation/m_mpi_proxy.fpp
@@ -91,7 +91,9 @@ contains
             & 'bc_z%grcbc_in', 'bc_z%grcbc_out', 'bc_z%grcbc_vel_out',          &
             & 'cfl_adap_dt', 'cfl_const_dt', 'cfl_dt', 'surface_tension',        &
             & 'viscous', 'shear_stress', 'bulk_stress', 'bubbles_lagrange',     &
-            & 'hyperelasticity', 'rkck_adap_dt', 'bc_io', 'powell', 'cont_damage' ]
+            & 'hyperelasticity', 'rkck_adap_dt', 'bc_io', 'powell', 'cont_damage', &
+            & 'periodic_ibs', 'compute_CD', 'periodic_forcing', 'fourier_transform_filtering', & 
+            & 'store_levelset', 'slab_domain_decomposition' ]
             call MPI_BCAST(${VAR}$, 1, MPI_LOGICAL, 0, MPI_COMM_WORLD, ierr)
         #:endfor
 
@@ -130,7 +132,8 @@ contains
             & 'x_domain%beg', 'x_domain%end', 'y_domain%beg', 'y_domain%end',    &
             & 'z_domain%beg', 'z_domain%end', 'x_a', 'x_b', 'y_a', 'y_b', 'z_a', &
             & 'z_b', 't_stop', 't_save', 'cfl_target', 'rkck_tolerance', 'Bx0',  &
-            & 'tau_star', 'cont_damage_s', 'alpha_bar' ]
+            & 'tau_star', 'cont_damage_s', 'alpha_bar', 'mu_visc', 'u_inf_ref',  & 
+            & 'rho_inf_ref', 'T_inf_ref' ]
             call MPI_BCAST(${VAR}$, 1, mpi_p, 0, MPI_COMM_WORLD, ierr)
         #:endfor
 
@@ -294,6 +297,19 @@ contains
 
                     end do
 
+                else if (slab_domain_decomposition) then 
+                    if (proc_rank == 0) then 
+                        print *, 'slab domain decomposition...'
+                    end if 
+                    
+                    ! continuous x and y direction, block decomposition in z
+                    num_procs_x = 1
+                    num_procs_y = 1
+                    num_procs_z = num_procs
+                    ierr = -1
+                    if (mod((p+1), num_procs_z) == 0) then 
+                        ierr = 0
+                    end if
                 else
 
                     ! Initial estimate of optimal processor topology
diff --git a/src/simulation/m_rhs.fpp b/src/simulation/m_rhs.fpp
index 6930f3caa7..626aed96ce 100644
--- a/src/simulation/m_rhs.fpp
+++ b/src/simulation/m_rhs.fpp
@@ -609,7 +609,7 @@ contains
 
     end subroutine s_initialize_rhs_module
 
-    subroutine s_compute_rhs(q_cons_vf, q_T_sf, q_prim_vf, bc_type, rhs_vf, pb, rhs_pb, mv, rhs_mv, t_step, time_avg)
+    subroutine s_compute_rhs(q_cons_vf, q_T_sf, q_prim_vf, bc_type, rhs_vf, pb, rhs_pb, mv, rhs_mv, t_step, time_avg, div_pres_visc_stress)
 
         type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf
         type(scalar_field), intent(inout) :: q_T_sf
@@ -620,6 +620,7 @@ contains
         real(wp), dimension(idwbuff(1)%beg:, idwbuff(2)%beg:, idwbuff(3)%beg:, 1:, 1:), intent(inout) :: mv, rhs_mv
         integer, intent(in) :: t_step
         real(wp), intent(inout) :: time_avg
+        type(scalar_field), dimension(momxb:momxe), optional, intent(inout) :: div_pres_visc_stress
 
         real(wp), dimension(0:m, 0:n, 0:p) :: nbub
         real(wp) :: t_start, t_finish
@@ -809,7 +810,8 @@ contains
                                                  rhs_vf, &
                                                  q_cons_qp, &
                                                  q_prim_qp, &
-                                                 flux_src_n(id))
+                                                 flux_src_n(id), & 
+                                                 div_pres_visc_stress)
             call nvtxEndRange
 
             ! RHS additions for hypoelasticity
@@ -828,7 +830,8 @@ contains
                                                       flux_src_n(id)%vf, &
                                                       dq_prim_dx_qp(1)%vf, &
                                                       dq_prim_dy_qp(1)%vf, &
-                                                      dq_prim_dz_qp(1)%vf)
+                                                      dq_prim_dz_qp(1)%vf, & 
+                                                      div_pres_visc_stress)
                 call nvtxEndRange
             end if
 
@@ -935,13 +938,14 @@ contains
 
     end subroutine s_compute_rhs
 
-    subroutine s_compute_advection_source_term(idir, rhs_vf, q_cons_vf, q_prim_vf, flux_src_n_vf)
+    subroutine s_compute_advection_source_term(idir, rhs_vf, q_cons_vf, q_prim_vf, flux_src_n_vf, div_pres_visc_stress)
 
         integer, intent(in) :: idir
         type(scalar_field), dimension(sys_size), intent(inout) :: rhs_vf
         type(vector_field), intent(inout) :: q_cons_vf
         type(vector_field), intent(inout) :: q_prim_vf
         type(vector_field), intent(inout) :: flux_src_n_vf
+        type(scalar_field), dimension(momxb:momxe), optional, intent(inout) :: div_pres_visc_stress  
 
         integer :: i, j, k, l, q
 
@@ -994,6 +998,25 @@ contains
                 end do
             end do
 
+            ! particle forces loop, x-dir
+            if (compute_CD .and. present(div_pres_visc_stress)) then
+                !$acc parallel loop collapse(3) gang vector default(present)
+                do k = 0, p
+                    do j = 0, n 
+                        do i = 0, m 
+                            !$acc loop seq
+                            do l = momxb, momxe
+                                div_pres_visc_stress(l)%sf(i, j, k) = 1._wp/dx(i) * & 
+                                                          (flux_n(1)%vf(l)%sf(i-1, j, k) - & 
+                                                           flux_n(1)%vf(l)%sf(i, j, k)) - 0.5_wp/dx(i) * & 
+                                                          (q_cons_vf%vf(2)%sf(i+1, j, k)*q_cons_vf%vf(l)%sf(i+1, j, k)/q_cons_vf%vf(1)%sf(i+1, j, k) - & 
+                                                           q_cons_vf%vf(2)%sf(i-1, j, k)*q_cons_vf%vf(l)%sf(i-1, j, k)/q_cons_vf%vf(1)%sf(i-1, j, k))
+                            end do 
+                        end do 
+                    end do 
+                end do 
+            end if
+
             if (model_eqns == 3) then
                 !$acc parallel loop collapse(4) gang vector default(present)
                 do l = 0, p
@@ -1104,6 +1127,25 @@ contains
                 end do
             end do
 
+            ! particle forces loop, y-dir
+            if (compute_CD .and. present(div_pres_visc_stress)) then
+                !$acc parallel loop collapse(3) gang vector default(present)
+                do k = 0, p 
+                    do j = 0, n 
+                        do i = 0, m 
+                            !$acc loop seq
+                            do l = momxb, momxe 
+                                div_pres_visc_stress(l)%sf(i, j, k) = div_pres_visc_stress(l)%sf(i, j, k) + 1._wp/dy(j) * & 
+                                                          (flux_n(2)%vf(l)%sf(i, j-1, k) - & 
+                                                           flux_n(2)%vf(l)%sf(i, j, k)) - 0.5_wp/dy(j) * & 
+                                                          (q_cons_vf%vf(3)%sf(i, j+1, k)*q_cons_vf%vf(l)%sf(i, j+1, k)/q_cons_vf%vf(1)%sf(i, j+1, k) - & 
+                                                           q_cons_vf%vf(3)%sf(i, j-1, k)*q_cons_vf%vf(l)%sf(i, j-1, k)/q_cons_vf%vf(1)%sf(i, j-1, k))
+                            end do  
+                        end do 
+                    end do
+                end do
+            end if
+
             if (model_eqns == 3) then
                 !$acc parallel loop collapse(4) gang vector default(present)
                 do l = 0, p
@@ -1310,6 +1352,25 @@ contains
                 end do
             end if
 
+            ! particle forces loop, z-dir
+            if (compute_CD .and. present(div_pres_visc_stress)) then
+                !$acc parallel loop collapse(3) gang vector default(present)
+                do k = 0, p 
+                    do j = 0, n 
+                        do i = 0, m 
+                            !$acc loop seq
+                            do l = momxb, momxe 
+                                div_pres_visc_stress(l)%sf(i, j, k) = div_pres_visc_stress(l)%sf(i, j, k) + 1._wp/dz(k) * & 
+                                                          (flux_n(3)%vf(l)%sf(i, j, k-1) - & 
+                                                           flux_n(3)%vf(l)%sf(i, j, k)) - 0.5_wp/dz(k) * & 
+                                                          (q_cons_vf%vf(4)%sf(i, j, k+1)*q_cons_vf%vf(l)%sf(i, j, k+1)/q_cons_vf%vf(1)%sf(i, j, k+1) - & 
+                                                           q_cons_vf%vf(4)%sf(i, j, k-1)*q_cons_vf%vf(l)%sf(i, j, k-1)/q_cons_vf%vf(1)%sf(i, j, k-1))
+                            end do  
+                        end do 
+                    end do 
+                end do 
+            end if
+
             if (model_eqns == 3) then
                 !$acc parallel loop collapse(4) gang vector default(present)
                 do l = 0, p
@@ -1491,13 +1552,14 @@ contains
     end subroutine s_compute_advection_source_term
 
     subroutine s_compute_additional_physics_rhs(idir, q_prim_vf, rhs_vf, flux_src_n, &
-                                                dq_prim_dx_vf, dq_prim_dy_vf, dq_prim_dz_vf)
+                                                dq_prim_dx_vf, dq_prim_dy_vf, dq_prim_dz_vf, div_pres_visc_stress)
 
         integer, intent(in) :: idir
         type(scalar_field), dimension(sys_size), intent(in) :: q_prim_vf
         type(scalar_field), dimension(sys_size), intent(inout) :: rhs_vf
         type(scalar_field), dimension(sys_size), intent(in) :: flux_src_n
         type(scalar_field), dimension(sys_size), intent(in) :: dq_prim_dx_vf, dq_prim_dy_vf, dq_prim_dz_vf
+        type(scalar_field), dimension(momxb:momxe), optional, intent(inout) :: div_pres_visc_stress
 
         integer :: i, j, k, l
 
@@ -1533,6 +1595,23 @@ contains
                 end do
             end do
 
+            ! particle momentum exchange, viscous stress tensor, x-dir
+            if (compute_CD .and. present(div_pres_visc_stress)) then
+                !$acc parallel loop collapse(3) gang vector default(present)
+                do k = 0, p 
+                    do j = 0, n 
+                        do i = 0, m 
+                            !$acc loop seq
+                            do l = momxb, momxe
+                                div_pres_visc_stress(l)%sf(i, j, k) = div_pres_visc_stress(l)%sf(i, j, k) + 1._wp/dx(i) * & 
+                                                       (flux_src_n(l)%sf(i-1, j, k) - & 
+                                                        flux_src_n(l)%sf(i, j, k))
+                            end do 
+                        end do 
+                    end do 
+                end do
+            end if
+
         elseif (idir == 2) then ! y-direction
 
             if (surface_tension) then
@@ -1615,6 +1694,23 @@ contains
                 end do
             end if
 
+            ! particle momentum exchange, viscous stress tensor, y-dir
+            if (compute_CD .and. present(div_pres_visc_stress)) then
+                !$acc parallel loop collapse(3) gang vector default(present)
+                do k = 0, p 
+                    do j = 0, n 
+                        do i = 0, m 
+                            !$acc loop seq
+                            do l = momxb, momxe
+                                div_pres_visc_stress(l)%sf(i, j, k) = div_pres_visc_stress(l)%sf(i, j, k) + 1._wp/dy(j) * & 
+                                                       (flux_src_n(l)%sf(i, j-1, k) - & 
+                                                        flux_src_n(l)%sf(i, j, k))
+                            end do 
+                        end do 
+                    end do
+                end do
+            end if
+
             ! Applying the geometrical viscous Riemann source fluxes calculated as average
             ! of values at cell boundaries
             if (cyl_coord) then
@@ -1700,6 +1796,23 @@ contains
                 end do
             end do
 
+            ! particle momentum exchange, viscous stress tensor, z-dir
+            if (compute_CD .and. present(div_pres_visc_stress)) then
+                !$acc parallel loop collapse(3) gang vector default(present)
+                do k = 0, p 
+                    do j = 0, n 
+                        do i = 0, m 
+                            !$acc loop seq
+                            do l = momxb, momxe 
+                                div_pres_visc_stress(l)%sf(i, j, k) = div_pres_visc_stress(l)%sf(i, j, k) + 1._wp/dz(k) * & 
+                                                       (flux_src_n(l)%sf(i, j, k-1) - & 
+                                                        flux_src_n(l)%sf(i, j, k))
+                            end do 
+                        end do 
+                    end do 
+                end do 
+            end if 
+
             if (grid_geometry == 3) then
                 !$acc parallel loop collapse(3) gang vector default(present)
                 do l = 0, p
diff --git a/src/simulation/m_start_up.fpp b/src/simulation/m_start_up.fpp
index a45346673c..d2e9e344a3 100644
--- a/src/simulation/m_start_up.fpp
+++ b/src/simulation/m_start_up.fpp
@@ -89,6 +89,12 @@ module m_start_up
 
     use m_mhd
 
+    use m_compute_particle_forces
+
+    use m_additional_forcing 
+
+    use m_volume_filtering
+
     implicit none
 
     private; public :: s_read_input_file, &
@@ -180,7 +186,10 @@ contains
             bubbles_lagrange, lag_params, &
             rkck_adap_dt, rkck_tolerance, &
             hyperelasticity, R0ref, num_bc_patches, Bx0, powell, &
-            cont_damage, tau_star, cont_damage_s, alpha_bar
+            cont_damage, tau_star, cont_damage_s, alpha_bar, & 
+            periodic_ibs, compute_CD, mu_visc, u_inf_ref, rho_inf_ref, T_inf_ref, & 
+            periodic_forcing, fourier_transform_filtering, store_levelset, & 
+            slab_domain_decomposition
 
         ! Checking that an input file has been provided by the user. If it
         ! has, then the input file is read in, otherwise, simulation exits.
@@ -434,33 +443,35 @@ contains
                 call s_mpi_abort(trim(file_path)//' is missing. Exiting.')
             end if
 
-            ! Read Levelset
-            write (file_path, '(A)') &
-                trim(t_step_dir)//'/levelset.dat'
-            inquire (FILE=trim(file_path), EXIST=file_exist)
-            if (file_exist) then
-                open (2, FILE=trim(file_path), &
-                        FORM='unformatted', &
-                        ACTION='read', &
-                        STATUS='old')
-                read (2) levelset%sf(0:m, 0:n, 0:p, 1:num_ibs); close (2)
-                ! print*, 'check', STL_levelset(106, 50, 0, 1)
-            else
-                call s_mpi_abort(trim(file_path)//' is missing. Exiting.')
-            end if
+            if (store_levelset) then
+                ! Read Levelset
+                write (file_path, '(A)') &
+                    trim(t_step_dir)//'/levelset.dat'
+                inquire (FILE=trim(file_path), EXIST=file_exist)
+                if (file_exist) then
+                    open (2, FILE=trim(file_path), &
+                            FORM='unformatted', &
+                            ACTION='read', &
+                            STATUS='old')
+                    read (2) levelset%sf(0:m, 0:n, 0:p, 1:num_ibs); close (2)
+                    ! print*, 'check', STL_levelset(106, 50, 0, 1)
+                else
+                    call s_mpi_abort(trim(file_path)//' is missing. Exiting.')
+                end if
 
-            ! Read Levelset Norm
-            write (file_path, '(A)') &
-                trim(t_step_dir)//'/levelset_norm.dat'
-            inquire (FILE=trim(file_path), EXIST=file_exist)
-            if (file_exist) then
-                open (2, FILE=trim(file_path), &
-                        FORM='unformatted', &
-                        ACTION='read', &
-                        STATUS='old')
-                read (2) levelset_norm%sf(0:m, 0:n, 0:p, 1:num_ibs, 1:3); close (2)
-            else
-                call s_mpi_abort(trim(file_path)//' is missing. Exiting.')
+                ! Read Levelset Norm
+                write (file_path, '(A)') &
+                    trim(t_step_dir)//'/levelset_norm.dat'
+                inquire (FILE=trim(file_path), EXIST=file_exist)
+                if (file_exist) then
+                    open (2, FILE=trim(file_path), &
+                            FORM='unformatted', &
+                            ACTION='read', &
+                            STATUS='old')
+                    read (2) levelset_norm%sf(0:m, 0:n, 0:p, 1:num_ibs, 1:3); close (2)
+                else
+                    call s_mpi_abort(trim(file_path)//' is missing. Exiting.')
+                end if
             end if
 
             do i = 1, num_ibs
@@ -693,44 +704,46 @@ contains
                         call s_mpi_abort('File '//trim(file_loc)//' is missing. Exiting.')
                     end if
 
-                    ! Read Levelset
-                    write (file_loc, '(A)') 'levelset.dat'
-                    file_loc = trim(case_dir)//'/restart_data'//trim(mpiiofs)//trim(file_loc)
-                    inquire (FILE=trim(file_loc), EXIST=file_exist)
+                    if (store_levelset) then
+                        ! Read Levelset
+                        write (file_loc, '(A)') 'levelset.dat'
+                        file_loc = trim(case_dir)//'/restart_data'//trim(mpiiofs)//trim(file_loc)
+                        inquire (FILE=trim(file_loc), EXIST=file_exist)
 
-                    if (file_exist) then
+                        if (file_exist) then
 
-                        call MPI_FILE_OPEN(MPI_COMM_WORLD, file_loc, MPI_MODE_RDONLY, mpi_info_int, ifile, ierr)
+                            call MPI_FILE_OPEN(MPI_COMM_WORLD, file_loc, MPI_MODE_RDONLY, mpi_info_int, ifile, ierr)
 
-                        disp = 0
+                            disp = 0
 
-                        call MPI_FILE_SET_VIEW(ifile, disp, mpi_p, MPI_IO_levelset_DATA%view, &
-                                               'native', mpi_info_int, ierr)
-                        call MPI_FILE_READ(ifile, MPI_IO_levelset_DATA%var%sf, data_size * num_ibs, &
-                                           mpi_p, status, ierr)
+                            call MPI_FILE_SET_VIEW(ifile, disp, mpi_p, MPI_IO_levelset_DATA%view, &
+                                                'native', mpi_info_int, ierr)
+                            call MPI_FILE_READ(ifile, MPI_IO_levelset_DATA%var%sf, data_size * num_ibs, &
+                                            mpi_p, status, ierr)
 
-                    else
-                        call s_mpi_abort('File '//trim(file_loc)//' is missing. Exiting.')
-                    end if
+                        else
+                            call s_mpi_abort('File '//trim(file_loc)//' is missing. Exiting.')
+                        end if
 
-                    ! Read Levelset Norm
-                    write (file_loc, '(A)') 'levelset_norm.dat'
-                    file_loc = trim(case_dir)//'/restart_data'//trim(mpiiofs)//trim(file_loc)
-                    inquire (FILE=trim(file_loc), EXIST=file_exist)
+                        ! Read Levelset Norm
+                        write (file_loc, '(A)') 'levelset_norm.dat'
+                        file_loc = trim(case_dir)//'/restart_data'//trim(mpiiofs)//trim(file_loc)
+                        inquire (FILE=trim(file_loc), EXIST=file_exist)
 
-                    if (file_exist) then
+                        if (file_exist) then
 
-                        call MPI_FILE_OPEN(MPI_COMM_WORLD, file_loc, MPI_MODE_RDONLY, mpi_info_int, ifile, ierr)
+                            call MPI_FILE_OPEN(MPI_COMM_WORLD, file_loc, MPI_MODE_RDONLY, mpi_info_int, ifile, ierr)
 
-                        disp = 0
+                            disp = 0
 
-                        call MPI_FILE_SET_VIEW(ifile, disp, mpi_p, MPI_IO_levelsetnorm_DATA%view, &
-                                               'native', mpi_info_int, ierr)
-                        call MPI_FILE_READ(ifile, MPI_IO_levelsetnorm_DATA%var%sf, data_size * num_ibs * 3, &
-                                           mpi_p, status, ierr)
+                            call MPI_FILE_SET_VIEW(ifile, disp, mpi_p, MPI_IO_levelsetnorm_DATA%view, &
+                                                'native', mpi_info_int, ierr)
+                            call MPI_FILE_READ(ifile, MPI_IO_levelsetnorm_DATA%var%sf, data_size * num_ibs * 3, &
+                                            mpi_p, status, ierr)
 
-                    else
-                        call s_mpi_abort('File '//trim(file_loc)//' is missing. Exiting.')
+                        else
+                            call s_mpi_abort('File '//trim(file_loc)//' is missing. Exiting.')
+                        end if
                     end if
 
                 end if
@@ -842,44 +855,46 @@ contains
                         call s_mpi_abort('File '//trim(file_loc)//' is missing. Exiting.')
                     end if
 
-                    ! Read Levelset
-                    write (file_loc, '(A)') 'levelset.dat'
-                    file_loc = trim(case_dir)//'/restart_data'//trim(mpiiofs)//trim(file_loc)
-                    inquire (FILE=trim(file_loc), EXIST=file_exist)
+                    if (store_levelset) then
+                        ! Read Levelset
+                        write (file_loc, '(A)') 'levelset.dat'
+                        file_loc = trim(case_dir)//'/restart_data'//trim(mpiiofs)//trim(file_loc)
+                        inquire (FILE=trim(file_loc), EXIST=file_exist)
 
-                    if (file_exist) then
+                        if (file_exist) then
 
-                        call MPI_FILE_OPEN(MPI_COMM_WORLD, file_loc, MPI_MODE_RDONLY, mpi_info_int, ifile, ierr)
+                            call MPI_FILE_OPEN(MPI_COMM_WORLD, file_loc, MPI_MODE_RDONLY, mpi_info_int, ifile, ierr)
 
-                        disp = 0
+                            disp = 0
 
-                        call MPI_FILE_SET_VIEW(ifile, disp, mpi_p, MPI_IO_levelset_DATA%view, &
-                                               'native', mpi_info_int, ierr)
-                        call MPI_FILE_READ(ifile, MPI_IO_levelset_DATA%var%sf, data_size * num_ibs, &
-                                           mpi_p, status, ierr)
+                            call MPI_FILE_SET_VIEW(ifile, disp, mpi_p, MPI_IO_levelset_DATA%view, &
+                                                'native', mpi_info_int, ierr)
+                            call MPI_FILE_READ(ifile, MPI_IO_levelset_DATA%var%sf, data_size * num_ibs, &
+                                            mpi_p, status, ierr)
 
-                    else
-                        call s_mpi_abort('File '//trim(file_loc)//' is missing. Exiting.')
-                    end if
+                        else
+                            call s_mpi_abort('File '//trim(file_loc)//' is missing. Exiting.')
+                        end if
 
-                    ! Read Levelset Norm
-                    write (file_loc, '(A)') 'levelset_norm.dat'
-                    file_loc = trim(case_dir)//'/restart_data'//trim(mpiiofs)//trim(file_loc)
-                    inquire (FILE=trim(file_loc), EXIST=file_exist)
+                        ! Read Levelset Norm
+                        write (file_loc, '(A)') 'levelset_norm.dat'
+                        file_loc = trim(case_dir)//'/restart_data'//trim(mpiiofs)//trim(file_loc)
+                        inquire (FILE=trim(file_loc), EXIST=file_exist)
 
-                    if (file_exist) then
+                        if (file_exist) then
 
-                        call MPI_FILE_OPEN(MPI_COMM_WORLD, file_loc, MPI_MODE_RDONLY, mpi_info_int, ifile, ierr)
+                            call MPI_FILE_OPEN(MPI_COMM_WORLD, file_loc, MPI_MODE_RDONLY, mpi_info_int, ifile, ierr)
 
-                        disp = 0
+                            disp = 0
 
-                        call MPI_FILE_SET_VIEW(ifile, disp, mpi_p, MPI_IO_levelsetnorm_DATA%view, &
-                                               'native', mpi_info_int, ierr)
-                        call MPI_FILE_READ(ifile, MPI_IO_levelsetnorm_DATA%var%sf, data_size * num_ibs * 3, &
-                                           mpi_p, status, ierr)
+                            call MPI_FILE_SET_VIEW(ifile, disp, mpi_p, MPI_IO_levelsetnorm_DATA%view, &
+                                                'native', mpi_info_int, ierr)
+                            call MPI_FILE_READ(ifile, MPI_IO_levelsetnorm_DATA%var%sf, data_size * num_ibs * 3, &
+                                            mpi_p, status, ierr)
 
-                    else
-                        call s_mpi_abort('File '//trim(file_loc)//' is missing. Exiting.')
+                        else
+                            call s_mpi_abort('File '//trim(file_loc)//' is missing. Exiting.')
+                        end if
                     end if
 
                 end if
@@ -1552,6 +1567,10 @@ contains
 
         if (mhd .and. powell) call s_initialize_mhd_powell_module
 
+        call s_initialize_particle_forces_module()
+        call s_initialize_additional_forcing_module()
+        if (fourier_transform_filtering) call s_initialize_fftw_explicit_filter_module()
+
     end subroutine s_initialize_modules
 
     subroutine s_initialize_mpi_domain
@@ -1663,6 +1682,9 @@ contains
         if (ib) then
             !$acc update device(ib_markers%sf)
         end if
+
+        !$acc update device(mu_visc, u_inf_ref, rho_inf_ref, T_inf_ref)
+
     end subroutine s_initialize_gpu_vars
 
     subroutine s_finalize_modules
@@ -1691,6 +1713,10 @@ contains
         if (bodyForces) call s_finalize_body_forces_module()
         if (mhd .and. powell) call s_finalize_mhd_powell_module
 
+        call s_finalize_particle_forces_module()
+        call s_finalize_additional_forcing_module()
+        if (fourier_transform_filtering) call s_finalize_fftw_explicit_filter_module
+
         ! Terminating MPI execution environment
         call s_mpi_finalize()
     end subroutine s_finalize_modules
diff --git a/src/simulation/m_time_steppers.fpp b/src/simulation/m_time_steppers.fpp
index f8cdb7a7ac..8291e2d9e7 100644
--- a/src/simulation/m_time_steppers.fpp
+++ b/src/simulation/m_time_steppers.fpp
@@ -46,6 +46,12 @@ module m_time_steppers
 
     use m_body_forces
 
+    use m_compute_particle_forces
+
+    use m_additional_forcing 
+
+    use m_volume_filtering
+
     implicit none
 
     type(vector_field), allocatable, dimension(:) :: q_cons_ts !<
@@ -79,7 +85,12 @@ module m_time_steppers
     integer, private :: num_ts !<
     !! Number of time stages in the time-stepping scheme
 
+    type(scalar_field), allocatable, dimension(:) :: div_pres_visc_stress
+
+    type(scalar_field), allocatable, dimension(:) :: q_cons_filtered
+
     !$acc declare create(q_cons_ts, q_prim_vf, q_T_sf, rhs_vf, rhs_ts_rkck, q_prim_ts, rhs_mv, rhs_pb, max_dt)
+    !$acc declare create(div_pres_visc_stress)
 
 contains
 
@@ -355,6 +366,26 @@ contains
             end do
         end do
 
+        if (compute_CD) then
+            @:ALLOCATE(div_pres_visc_stress(momxb:momxe))
+            do i = momxb, momxe
+                @:ALLOCATE(div_pres_visc_stress(i)%sf(idwbuff(1)%beg:idwbuff(1)%end, &
+                    idwbuff(2)%beg:idwbuff(2)%end, &
+                    idwbuff(3)%beg:idwbuff(3)%end))
+                @:ACC_SETUP_SFs(div_pres_visc_stress(i))
+            end do
+        end if
+
+        if (fourier_transform_filtering) then 
+            @:ALLOCATE(q_cons_filtered(1:sys_size))
+            do i = 1, sys_size
+                @:ALLOCATE(q_cons_filtered(i)%sf(idwbuff(1)%beg:idwbuff(1)%end, &
+                    idwbuff(2)%beg:idwbuff(2)%end, &
+                    idwbuff(3)%beg:idwbuff(3)%end))
+                @:ACC_SETUP_SFs(q_cons_filtered(i))
+            end do
+        end if
+
     end subroutine s_initialize_time_steppers_module
 
     !> 1st order TVD RK time-stepping algorithm
@@ -670,7 +701,20 @@ contains
             call nvtxStartRange("TIMESTEP")
         end if
 
-        call s_compute_rhs(q_cons_ts(1)%vf, q_T_sf, q_prim_vf, bc_type, rhs_vf, pb_ts(1)%sf, rhs_pb, mv_ts(1)%sf, rhs_mv, t_step, time_avg)
+        if (periodic_forcing) then 
+            call s_compute_phase_average(q_cons_ts(1)%vf, t_step+1)
+            call s_compute_periodic_forcing(q_cons_ts(1)%vf)
+        end if
+
+        call s_compute_rhs(q_cons_ts(1)%vf, q_T_sf, q_prim_vf, bc_type, rhs_vf, pb_ts(1)%sf, rhs_pb, mv_ts(1)%sf, rhs_mv, t_step, time_avg, div_pres_visc_stress)
+
+        if (compute_CD) then
+            call s_compute_drag_coefficient(div_pres_visc_stress)
+        end if
+
+        if (periodic_forcing) then 
+            call s_add_periodic_forcing(rhs_vf)
+        end if
 
         if (run_time_info) then
             call s_write_run_time_information(q_prim_vf, t_step)
@@ -761,6 +805,10 @@ contains
 
         call s_compute_rhs(q_cons_ts(2)%vf, q_T_sf, q_prim_vf, bc_type, rhs_vf, pb_ts(2)%sf, rhs_pb, mv_ts(2)%sf, rhs_mv, t_step, time_avg)
 
+        if (periodic_forcing) then 
+            call s_add_periodic_forcing(rhs_vf)
+        end if
+
         if (bubbles_lagrange) then
             call s_compute_EL_coupled_solver(q_cons_ts(2)%vf, q_prim_vf, rhs_vf, stage=2)
             call s_update_lagrange_tdv_rk(stage=2)
@@ -837,6 +885,10 @@ contains
         ! Stage 3 of 3
         call s_compute_rhs(q_cons_ts(2)%vf, q_T_sf, q_prim_vf, bc_type, rhs_vf, pb_ts(2)%sf, rhs_pb, mv_ts(2)%sf, rhs_mv, t_step, time_avg)
 
+        if (periodic_forcing) then 
+            call s_add_periodic_forcing(rhs_vf)
+        end if
+
         if (bubbles_lagrange) then
             call s_compute_EL_coupled_solver(q_cons_ts(2)%vf, q_prim_vf, rhs_vf, stage=3)
             call s_update_lagrange_tdv_rk(stage=3)
@@ -1328,6 +1380,22 @@ contains
             @:DEALLOCATE(rhs_vf)
         end if
 
+        ! Free the work arrays allocated under compute_CD
+        if (compute_CD) then
+            do i = momxb, momxe
+                @:DEALLOCATE(div_pres_visc_stress(i)%sf)
+            end do
+            @:DEALLOCATE(div_pres_visc_stress)
+        end if
+
+        ! Free the filtered conservative variables allocated under fourier_transform_filtering
+        if (fourier_transform_filtering) then
+            do i = 1, sys_size
+                @:DEALLOCATE(q_cons_filtered(i)%sf)
+            end do
+            @:DEALLOCATE(q_cons_filtered)
+        end if
+
         ! Writing the footer of and closing the run-time information file
         if (proc_rank == 0 .and. run_time_info) then
             call s_close_run_time_information_file()
diff --git a/src/simulation/m_volume_filtering.fpp b/src/simulation/m_volume_filtering.fpp
new file mode 100644
index 0000000000..6caffaa4c5
--- /dev/null
+++ b/src/simulation/m_volume_filtering.fpp
@@ -0,0 +1,1049 @@
+#:include 'macros.fpp'
+
+module m_volume_filtering
+
+    use, intrinsic :: iso_c_binding
+
+    use m_derived_types        !< Definitions of the derived types
+
+    use m_global_parameters    !< Definitions of the global parameters
+
+    use m_mpi_proxy            !< Message passing interface (MPI) module proxy
+
+    use m_ibm
+
+#ifdef MFC_MPI
+    use mpi                    !< Message passing interface (MPI) module
+#endif
+
+#if defined(MFC_OpenACC) && defined(__PGI)
+    use cufft
+#endif
+
+    implicit none
+
+    private; public :: s_initialize_fftw_explicit_filter_module, &
+ s_apply_fftw_filter_cons, & 
+ s_initialize_filtering_kernel, s_initialize_fluid_indicator_function, & 
+ s_initialize_filtered_fluid_indicator_function, & 
+ s_finalize_fftw_explicit_filter_module, & 
+ s_apply_fftw_filter_tensor, s_apply_fftw_filter_scalarfield
+
+#if !defined(MFC_OpenACC)
+    include 'fftw3.f03'
+#endif
+
+    integer :: ierr   
+
+    ! fluid indicator function (1 = fluid, 0 = otherwise)
+    type(scalar_field), public :: fluid_indicator_function_I
+
+    !$acc declare create(fluid_indicator_function_I)
+
+#if defined(MFC_OpenACC)
+    ! GPU plans
+    integer :: plan_x_fwd_gpu, plan_x_bwd_gpu, plan_y_gpu, plan_z_gpu
+#else
+    ! CPU plans
+    type(c_ptr) :: plan_x_r2c_fwd, plan_x_c2r_bwd
+    type(c_ptr) :: plan_y_c2c_fwd, plan_y_c2c_bwd 
+    type(c_ptr) :: plan_z_c2c_fwd, plan_z_c2c_bwd
+    type(c_ptr) :: plan_x_r2c_kernelG, plan_y_c2c_kernelG, plan_z_c2c_kernelG
+#endif
+
+    ! domain size information (global, complex, local)
+    integer :: Nx, Ny, Nz, NxC, Nyloc, Nzloc
+
+    ! 1D real and complex vectors for FFT routines
+    real(c_double), allocatable :: data_real_in1d(:) 
+    complex(c_double_complex), allocatable :: data_cmplx_out1d(:)
+    complex(c_double_complex), allocatable :: data_cmplx_out1dy(:)
+
+    ! 3D arrays for slab transposes
+    complex(c_double_complex), allocatable :: data_cmplx_slabz(:, :, :), data_cmplx_slaby(:, :, :)
+
+    ! input/output array for FFT routine
+    real(c_double), allocatable :: data_real_3D_slabz(:, :, :)
+
+    ! filtering kernel in physical space
+    real(c_double), allocatable :: real_kernelG_in(:, :, :)
+
+    ! FFT of filtering kernel
+    complex(c_double_complex), allocatable :: cmplx_kernelG1d(:)
+
+    !$acc declare create(Nx, Ny, Nz, NxC, Nyloc, Nzloc)
+    !$acc declare create(data_real_in1d, data_cmplx_out1d, data_cmplx_out1dy, data_cmplx_slabz, data_cmplx_slaby, data_real_3D_slabz, real_kernelG_in, cmplx_kernelG1d)
+
+contains
+
+    !> Set the FFT extents, allocate the work arrays and create the batched 1D FFT plans (cuFFT under OpenACC, FFTW otherwise) used for explicit filtering
+    subroutine s_initialize_fftw_explicit_filter_module
+        integer :: size_n(1), inembed(1), onembed(1) !< transform length and input/output embedding sizes handed to the plan constructors
+
+        ! global number of cells in each direction
+        Nx = m_glb + 1
+        Ny = n_glb + 1
+        Nz = p_glb + 1
+
+        ! number of complex coefficients per x-line produced by the real-to-complex transform
+        NxC = Nx/2 + 1
+
+        ! local sizes on each processor; NOTE(review): Ny / num_procs is an integer division, so Ny is presumably required to be a multiple of num_procs - confirm this is validated elsewhere
+        Nyloc = Ny / num_procs
+        Nzloc = p + 1
+
+        !$acc update device(Nx, Ny, Nz, NxC, Nyloc, Nzloc)
+        ! work arrays: z-slab shaped arrays are (Nx or NxC, Ny, Nzloc), the y-slab shaped array is (NxC, Nyloc, Nz)
+        @:ALLOCATE(data_real_in1d(Nx*Ny*Nzloc))
+        @:ALLOCATE(data_cmplx_out1d(NxC*Ny*Nz/num_procs))
+        @:ALLOCATE(data_cmplx_out1dy(NxC*Ny*Nz/num_procs))
+        @:ALLOCATE(cmplx_kernelG1d(NxC*Nyloc*Nz))
+        @:ALLOCATE(real_kernelG_in(Nx, Ny, Nzloc))
+        @:ALLOCATE(data_real_3D_slabz(Nx, Ny, Nzloc))
+        @:ALLOCATE(data_cmplx_slabz(NxC, Ny, Nzloc))
+        @:ALLOCATE(data_cmplx_slaby(NxC, Nyloc, Nz))
+
+#if defined(MFC_OpenACC)
+        ! GPU FFT plans; NOTE(review): the status stored in ierr is overwritten by each call and never checked
+        ! X - plans: real-to-complex (forward) and complex-to-real (backward), Ny*Nzloc transforms of length Nx
+        size_n(1) = Nx
+        inembed(1) = Nx
+        onembed(1) = NxC
+        ierr = cufftPlanMany(plan_x_fwd_gpu, 1, size_n, inembed, 1, Nx, onembed, 1, NxC, CUFFT_D2Z, Ny*Nzloc)
+        size_n(1) = Nx
+        inembed(1) = NxC
+        onembed(1) = Nx  
+        ierr = cufftPlanMany(plan_x_bwd_gpu, 1, size_n, inembed, 1, NxC, onembed, 1, Nx, CUFFT_Z2D, Ny*Nzloc)
+        ! Y - plan: complex-to-complex, NxC*Nzloc transforms of length Ny
+        size_n(1) = Ny
+        inembed(1) = Ny
+        onembed(1) = Ny
+        ierr = cufftPlanMany(plan_y_gpu, 1, size_n, inembed, 1, Ny, onembed, 1, Ny, CUFFT_Z2Z, NxC*Nzloc)
+        ! Z - plan: complex-to-complex, NxC*Nyloc transforms of length Nz
+        size_n(1) = Nz 
+        inembed(1) = Nz 
+        onembed(1) = Nz 
+        ierr = cufftPlanMany(plan_z_gpu, 1, size_n, inembed, 1, Nz, onembed, 1, Nz, CUFFT_Z2Z, NxC*Nyloc)
+#else
+        ! CPU FFT plans; per the FFTW manual FFTW_MEASURE may overwrite the plan arrays while planning, and the arrays were only just allocated above
+        ! X - direction plans: real-to-complex forward and complex-to-real backward
+        size_n(1) = Nx
+        inembed(1) = Nx
+        onembed(1) = NxC
+        plan_x_r2c_fwd = fftw_plan_many_dft_r2c(1, size_n, Ny*Nzloc, &                  ! rank, n, howmany
+                                                data_real_in1d, inembed, 1, Nx, &       ! in, inembed, istride, idist
+                                                data_cmplx_out1d, onembed, 1, NxC, &    ! out, onembed, ostride, odist
+                                                FFTW_MEASURE)                           ! flags (r2c plans take no sign argument)
+        size_n(1) = Nx
+        inembed(1) = NxC
+        onembed(1) = Nx                                                         
+        plan_x_c2r_bwd = fftw_plan_many_dft_c2r(1, size_n, Ny*Nzloc, & 
+                                                data_cmplx_out1d, inembed, 1, NxC, & 
+                                                data_real_in1d, onembed, 1, Nx, & 
+                                                FFTW_MEASURE)
+        ! Y - direction plans: in-place complex-to-complex transforms on data_cmplx_out1dy
+        size_n(1) = Ny
+        inembed(1) = Ny
+        onembed(1) = Ny
+        plan_y_c2c_fwd = fftw_plan_many_dft(1, size_n, NxC*Nzloc, & 
+                                            data_cmplx_out1dy, inembed, 1, Ny, & 
+                                            data_cmplx_out1dy, onembed, 1, Ny, & 
+                                            FFTW_FORWARD, FFTW_MEASURE)
+        plan_y_c2c_bwd = fftw_plan_many_dft(1, size_n, NxC*Nzloc, & 
+                                            data_cmplx_out1dy, inembed, 1, Ny, & 
+                                            data_cmplx_out1dy, onembed, 1, Ny, & 
+                                            FFTW_BACKWARD, FFTW_MEASURE)
+        ! Z - direction plans: in-place complex-to-complex transforms on data_cmplx_out1d
+        size_n(1) = Nz 
+        inembed(1) = Nz 
+        onembed(1) = Nz 
+        plan_z_c2c_fwd = fftw_plan_many_dft(1, size_n, NxC*Nyloc, & 
+                                            data_cmplx_out1d, inembed, 1, Nz, & 
+                                            data_cmplx_out1d, onembed, 1, Nz, & 
+                                            FFTW_FORWARD, FFTW_MEASURE)
+        plan_z_c2c_bwd = fftw_plan_many_dft(1, size_n, NxC*Nyloc, & 
+                                            data_cmplx_out1d, inembed, 1, Nz, &
+                                            data_cmplx_out1d, onembed, 1, Nz, & 
+                                            FFTW_BACKWARD, FFTW_MEASURE)
+        ! forward plans for filtering kernel (the X and Z plans target cmplx_kernelG1d instead of data_cmplx_out1d)
+        ! X kernel plan
+        size_n(1) = Nx
+        inembed(1) = Nx
+        onembed(1) = NxC
+        plan_x_r2c_kernelG = fftw_plan_many_dft_r2c(1, size_n, Ny*Nzloc, &                    
+                                                    data_real_in1d, inembed, 1, Nx, &        
+                                                    cmplx_kernelG1d, onembed, 1, NxC, &    
+                                                    FFTW_MEASURE)          
+        ! Y kernel plan (created with the same arguments as plan_y_c2c_fwd)
+        size_n(1) = Ny
+        inembed(1) = Ny
+        onembed(1) = Ny
+        plan_y_c2c_kernelG = fftw_plan_many_dft(1, size_n, NxC*Nzloc, & 
+                                                data_cmplx_out1dy, inembed, 1, Ny, & 
+                                                data_cmplx_out1dy, onembed, 1, Ny, & 
+                                                FFTW_FORWARD, FFTW_MEASURE)
+        ! Z kernel plan
+        size_n(1) = Nz 
+        inembed(1) = Nz 
+        onembed(1) = Nz 
+        plan_z_c2c_kernelG = fftw_plan_many_dft(1, size_n, NxC*Nyloc, & 
+                                                cmplx_kernelG1d, inembed, 1, Nz, & 
+                                                cmplx_kernelG1d, onembed, 1, Nz, & 
+                                                FFTW_FORWARD, FFTW_MEASURE)
+#endif
+    end subroutine s_initialize_fftw_explicit_filter_module
+
+    !> Build the periodic gaussian filtering kernel in physical space, normalize it, and store its 3D DFT in cmplx_kernelG1d
+    subroutine s_initialize_filtering_kernel
+        real(dp) :: sigma_stddev !< standard deviation of the gaussian kernel
+        real(dp) :: Lx, Ly, Lz !< global domain lengths
+        real(dp) :: x_r, y_r, z_r !< periodically wrapped distances measured from the global domain origin
+        real(dp) :: r2 !< squared wrapped distance
+        real(dp) :: G_norm_int, G_norm_int_glb !< rank-local and global volume integral of the unnormalized kernel
+        integer :: i, j, k
+
+        ! gaussian filter width; NOTE(review): hard-coded - presumably tied to the particle size of the intended case, confirm and consider making it a case parameter
+        sigma_stddev = 3.0_dp * 0.05_dp
+
+        Lx = x_domain_end_glb - x_domain_beg_glb
+        Ly = y_domain_end_glb - y_domain_beg_glb
+        Lz = z_domain_end_glb - z_domain_beg_glb
+
+        G_norm_int = 0.0_dp
+        ! evaluate the kernel on the local cells and accumulate its volume integral; min(d, L - d) wraps each distance periodically
+        !$acc parallel loop collapse(3) gang vector default(present) reduction(+:G_norm_int) copyin(Lx, Ly, Lz, sigma_stddev) private(x_r, y_r, z_r, r2)
+        do i = 0, m
+            do j = 0, n
+                do k = 0, p
+                    x_r = min(abs(x_cc(i) - x_domain_beg_glb), Lx - abs(x_cc(i) - x_domain_beg_glb))
+                    y_r = min(abs(y_cc(j) - y_domain_beg_glb), Ly - abs(y_cc(j) - y_domain_beg_glb))
+                    z_r = min(abs(z_cc(k) - z_domain_beg_glb), Lz - abs(z_cc(k) - z_domain_beg_glb))
+                    ! NOTE(review): if *_domain_beg_glb is the domain boundary (cell face), the kernel peak lies half a cell before the first cell center - confirm this offset is intended
+                    r2 = x_r**2 + y_r**2 + z_r**2
+
+                    real_kernelG_in(i+1, j+1, k+1) = exp(-r2 / (2.0_dp*sigma_stddev**2))
+
+                    G_norm_int = G_norm_int + real_kernelG_in(i+1, j+1, k+1)*dx(i)*dy(j)*dz(k)
+                end do
+            end do
+        end do
+        ! combine the rank-local integrals into G_norm_int_glb
+        call s_mpi_allreduce_sum(G_norm_int, G_norm_int_glb)
+
+        ! FFT of kernel (X transform, Y transform, z-slab -> y-slab transpose, Z transform)
+        ! normalize kernel by its global volume integral
+        !$acc parallel loop collapse(3) gang vector default(present) copyin(G_norm_int_glb)
+        do i = 1, Nx
+            do j = 1, Ny
+                do k = 1, Nzloc
+                    data_real_3D_slabz(i, j, k) = real_kernelG_in(i, j, k) / G_norm_int_glb
+                end do
+            end do
+        end do
+
+        ! 3D z-slab -> 1D x, y, z
+        !$acc parallel loop collapse(3) gang vector default(present)
+        do i = 1, Nx
+            do j = 1, Ny
+                do k = 1, Nzloc
+                    data_real_in1d(i + (j-1)*Nx + (k-1)*Nx*Ny) = data_real_3D_slabz(i, j, k)
+                end do
+            end do
+        end do
+
+        ! X FFT; NOTE(review): the cuFFT calls in this routine are not wrapped in an "acc host_data use_device" region - confirm device addresses are actually passed
+#if defined(MFC_OpenACC)
+        ierr = cufftExecD2Z(plan_x_fwd_gpu, data_real_in1d, cmplx_kernelG1d)
+#else
+        call fftw_execute_dft_r2c(plan_x_r2c_kernelG, data_real_in1d, cmplx_kernelG1d)
+#endif
+
+        ! 1D x, y, z -> 1D y, x, z (CMPLX)
+        !$acc parallel loop collapse(3) gang vector default(present)
+        do i = 1, NxC
+            do j = 1, Ny
+                do k = 1, Nzloc
+                    data_cmplx_out1dy(j + (i-1)*Ny + (k-1)*Ny*NxC) = cmplx_kernelG1d(i + (j-1)*NxC + (k-1)*NxC*Ny)
+                end do
+            end do
+        end do
+
+        ! Y FFT (in place)
+#if defined(MFC_OpenACC)
+        ierr = cufftExecZ2Z(plan_y_gpu, data_cmplx_out1dy, data_cmplx_out1dy, CUFFT_FORWARD)
+#else
+        call fftw_execute_dft(plan_y_c2c_kernelG, data_cmplx_out1dy, data_cmplx_out1dy)
+#endif
+
+        ! 1D y, x, z -> 3D z-slab
+        !$acc parallel loop collapse(3) gang vector default(present)
+        do i = 1, NxC
+            do j = 1, Ny
+                do k = 1, Nzloc
+                    data_cmplx_slabz(i, j, k) = data_cmplx_out1dy(j + (i-1)*Ny + (k-1)*Ny*NxC)
+                end do
+            end do
+        end do
+
+        ! transpose z-slab to y-slab
+        call s_mpi_transpose_slabZ2Y
+
+        ! 3D y-slab -> 1D z, x, y
+        !$acc parallel loop collapse(3) gang vector default(present)
+        do i = 1, NxC
+            do j = 1, Nyloc
+                do k = 1, Nz
+                    cmplx_kernelG1d(k + (i-1)*Nz + (j-1)*Nz*NxC) = data_cmplx_slaby(i, j, k)
+                end do
+            end do
+        end do
+
+        ! Z FFT (in place)
+#if defined(MFC_OpenACC)
+        ierr = cufftExecZ2Z(plan_z_gpu, cmplx_kernelG1d, cmplx_kernelG1d, CUFFT_FORWARD)
+#else
+        call fftw_execute_dft(plan_z_c2c_kernelG, cmplx_kernelG1d, cmplx_kernelG1d)
+#endif
+
+        ! scale the kernel spectrum by 1/(Nx*Ny*Nz); NOTE(review): the kernel integral above is weighted by dx*dy*dz, which appears to match 1/(Nx*Ny*Nz) only for a uniform grid on a unit-volume domain - confirm
+        !$acc parallel loop collapse(3) gang vector default(present)
+        do i = 1, NxC
+            do j = 1, Nyloc
+                do k = 1, Nz
+                    cmplx_kernelG1d(k + (i-1)*Nz + (j-1)*Nz*NxC) = cmplx_kernelG1d(k + (i-1)*Nz + (j-1)*Nz*NxC) / (real(Nx*Ny*Nz, dp))
+                end do
+            end do
+        end do
+
+        ! return cmplx_kernelG1d: 1D z, x, y
+    end subroutine s_initialize_filtering_kernel
+
+    !> Allocate fluid_indicator_function_I and set it to 1 where ib_markers is zero and to 0 elsewhere
+    subroutine s_initialize_fluid_indicator_function 
+        integer :: i, j, k !< cell indices
+
+        @:ALLOCATE(fluid_indicator_function_I%sf(0:m, 0:n, 0:p))
+        @:ACC_SETUP_SFs(fluid_indicator_function_I)
+
+        ! define fluid indicator function on the cells 0:m, 0:n, 0:p (the extent allocated above)
+        !$acc parallel loop collapse(3) gang vector default(present)
+        do i = 0, m
+            do j = 0, n 
+                do k = 0, p
+                    if (ib_markers%sf(i, j, k) == 0) then 
+                        fluid_indicator_function_I%sf(i, j, k) = 1.0_dp
+                    else 
+                        fluid_indicator_function_I%sf(i, j, k) = 0.0_dp
+                    end if
+                end do
+            end do
+        end do
+
+    end subroutine s_initialize_fluid_indicator_function
+
+    !> Filter fluid_indicator_function_I with the gaussian kernel and return the result in filtered_fluid_indicator_function
+    subroutine s_initialize_filtered_fluid_indicator_function(filtered_fluid_indicator_function)
+        type(scalar_field) :: filtered_fluid_indicator_function !< on return, %sf(0:Nx-1, 0:Ny-1, 0:Nzloc-1) holds the filtered indicator
+
+        integer :: i, j, k
+
+        ! copy the fluid indicator function into the FFT work array; the filtered result is written to the dummy argument
+        !$acc parallel loop collapse(3) gang vector default(present)
+        do i = 1, Nx 
+            do j = 1, Ny 
+                do k = 1, Nzloc 
+                    data_real_3D_slabz(i, j, k) = fluid_indicator_function_I%sf(i-1, j-1, k-1)
+                end do 
+            end do 
+        end do 
+
+        call s_mpi_FFT_fwd 
+        ! multiply the transformed data by the kernel spectrum (both indexed as 1D z, x, y)
+        !$acc parallel loop collapse(3) gang vector default(present)
+        do i = 1, NxC 
+            do j = 1, Nyloc 
+                do k = 1, Nz
+                    data_cmplx_out1d(k + (i-1)*Nz + (j-1)*Nz*NxC) = data_cmplx_out1d(k + (i-1)*Nz + (j-1)*Nz*NxC) * cmplx_kernelG1d(k + (i-1)*Nz + (j-1)*Nz*NxC)
+                end do
+            end do 
+        end do
+
+        call s_mpi_FFT_bwd
+        ! copy back and divide by Nx*Ny*Nz, presumably to undo the scaling of an unnormalized FFT round trip - TODO confirm against s_mpi_FFT_fwd/s_mpi_FFT_bwd
+        !$acc parallel loop collapse(3) gang vector default(present)
+        do i = 1, Nx 
+            do j = 1, Ny 
+                do k = 1, Nzloc
+                    filtered_fluid_indicator_function%sf(i-1, j-1, k-1) = data_real_3D_slabz(i, j, k) / (real(Nx*Ny*Nz, dp))
+                end do 
+            end do
+        end do
+
+    end subroutine s_initialize_filtered_fluid_indicator_function
+
+    !> Apply the gaussian filter to the conservative variables (fluid-volume integral) and store the results in q_cons_filtered
+    subroutine s_apply_fftw_filter_cons(q_cons_vf, q_cons_filtered)
+        type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf !< unfiltered conservative variables
+        type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_filtered !< filtered output; entry advxb is read as the filtered fluid indicator function
+
+        integer :: l
+        ! NOTE(review): index sys_size is skipped, presumably because it coincides with advxb (which holds the filtered indicator) - confirm for cases where advxb < sys_size
+        do l = 1, sys_size-1
+            call s_apply_fftw_filter_scalarfield(q_cons_filtered(advxb), .true., q_cons_vf(l), q_cons_filtered(l))
+        end do 
+
+    end subroutine s_apply_fftw_filter_cons
+
+    !> Apply the gaussian filter to a scalar field weighted by the fluid (or particle) indicator, then divide by the filtered fluid indicator
+    subroutine s_apply_fftw_filter_scalarfield(filtered_fluid_indicator_function, fluid_quantity, q_temp_in, q_temp_out)
+        type(scalar_field), intent(in) :: filtered_fluid_indicator_function !< filtered fluid indicator used to normalize the result
+        type(scalar_field), intent(inout) :: q_temp_in !< field to filter; overwritten with the result when q_temp_out is absent
+        type(scalar_field), intent(inout), optional :: q_temp_out !< if present, receives the result and q_temp_in is not written
+
+        logical, intent(in) :: fluid_quantity !< whether or not convolution integral is over V_f or V_p^(i) - integral over fluid volume or particle volume
+
+        integer :: i, j, k
+        ! weight the input by the indicator: I when fluid_quantity is true, (1 - I) otherwise
+        if (fluid_quantity) then 
+            !$acc parallel loop collapse(3) gang vector default(present)
+            do i = 0, m 
+                do j = 0, n 
+                    do k = 0, p 
+                        data_real_3D_slabz(i+1, j+1, k+1) = q_temp_in%sf(i, j, k) * fluid_indicator_function_I%sf(i, j, k)
+                    end do 
+                end do 
+            end do
+        else 
+            !$acc parallel loop collapse(3) gang vector default(present)
+            do i = 0, m 
+                do j = 0, n 
+                    do k = 0, p 
+                        data_real_3D_slabz(i+1, j+1, k+1) = q_temp_in%sf(i, j, k) * (1.0_dp - fluid_indicator_function_I%sf(i, j, k))
+                    end do 
+                end do 
+            end do
+        end if
+
+        call s_mpi_FFT_fwd 
+        ! multiply the transformed data by the kernel spectrum (both indexed as 1D z, x, y)
+        !$acc parallel loop collapse(3) gang vector default(present)
+        do i = 1, NxC 
+            do j = 1, Nyloc 
+                do k = 1, Nz 
+                    data_cmplx_out1d(k + (i-1)*Nz + (j-1)*Nz*NxC) = data_cmplx_out1d(k + (i-1)*Nz + (j-1)*Nz*NxC) * cmplx_kernelG1d(k + (i-1)*Nz + (j-1)*Nz*NxC)
+                end do 
+            end do 
+        end do
+
+        call s_mpi_FFT_bwd
+        ! NOTE(review): the divisions below have no guard against filtered_fluid_indicator_function being zero or tiny, which could presumably happen far inside a particle - confirm
+        if (present(q_temp_out)) then 
+            !$acc parallel loop collapse(3) gang vector default(present)
+            do i = 0, m
+                do j = 0, n
+                    do k = 0, p
+                        q_temp_out%sf(i, j, k) = data_real_3D_slabz(i+1, j+1, k+1) / (real(Nx*Ny*Nz, dp) * filtered_fluid_indicator_function%sf(i, j, k))
+                    end do 
+                end do 
+            end do
+        else 
+            !$acc parallel loop collapse(3) gang vector default(present)
+            do i = 0, m
+                do j = 0, n 
+                    do k = 0, p 
+                        q_temp_in%sf(i, j, k) = data_real_3D_slabz(i+1, j+1, k+1) / (real(Nx*Ny*Nz, dp) * filtered_fluid_indicator_function%sf(i, j, k))      
+                    end do 
+                end do 
+            end do
+        end if
+
+    end subroutine s_apply_fftw_filter_scalarfield
+
+    !< run the gaussian filter over the stress tensors and the momentum RHS that feed the unclosed terms
+    subroutine s_apply_fftw_filter_tensor(pt_Re_stress, R_mu, q_cons_filtered, rhs_rhouu, pImT_filtered)
+        type(vector_field), dimension(1:num_dims), intent(inout) :: pt_Re_stress
+        type(vector_field), dimension(1:num_dims), intent(inout) :: R_mu
+        type(scalar_field), dimension(sys_size), intent(in) :: q_cons_filtered
+        type(scalar_field), dimension(momxb:momxe), intent(inout) :: rhs_rhouu
+        type(scalar_field), dimension(1:num_dims), intent(inout) :: pImT_filtered
+
+        integer :: row, col !< tensor / vector component indices
+
+        ! pseudo turbulent reynolds stress: every component is filtered with the flag set to .true.
+        do row = 1, num_dims
+            do col = 1, num_dims
+                call s_apply_fftw_filter_scalarfield(q_cons_filtered(advxb), .true., pt_Re_stress(row)%vf(col))
+            end do
+        end do
+
+        ! effective viscosity: same call pattern as the reynolds stress
+        do row = 1, num_dims
+            do col = 1, num_dims
+                call s_apply_fftw_filter_scalarfield(q_cons_filtered(advxb), .true., R_mu(row)%vf(col))
+            end do
+        end do
+
+        ! interphase momentum exchange: flag .false., result is written to pImT_filtered
+        do row = 1, num_dims
+            call s_apply_fftw_filter_scalarfield(q_cons_filtered(advxb), .false., rhs_rhouu(momxb - 1 + row), pImT_filtered(row))
+        end do
+
+    end subroutine s_apply_fftw_filter_tensor
+
+    !< transpose domain from z-slabs (data_cmplx_slabz) to y-slabs (data_cmplx_slaby) on each processor
+    subroutine s_mpi_transpose_slabZ2Y
+        complex(c_double_complex), allocatable :: sendbuf(:), recvbuf(:)
+        integer :: dest_rank, src_rank
+        integer :: i, j, k
+        ! one chunk of NxC*Nyloc*Nzloc values per rank in each buffer
+        allocate(sendbuf(NxC*Nyloc*Nzloc*num_procs))
+        allocate(recvbuf(NxC*Nyloc*Nzloc*num_procs))
+        ! pack: every element of sendbuf is written on the device, so it only has to be copied out
+        !$acc parallel loop collapse(4) gang vector default(present) copyout(sendbuf)
+        do dest_rank = 0, num_procs-1
+            do k = 1, Nzloc
+                do j = 1, Nyloc
+                    do i = 1, NxC
+                        sendbuf(i + (j-1)*NxC + (k-1)*NxC*Nyloc + dest_rank*NxC*Nyloc*Nzloc) = data_cmplx_slabz(i, j+dest_rank*Nyloc, k)
+                    end do
+                end do
+            end do
+        end do
+        ! chunk r of sendbuf goes to rank r; chunk r of recvbuf arrives from rank r
+        call MPI_Alltoall(sendbuf, NxC*Nyloc*Nzloc, MPI_DOUBLE_COMPLEX, &
+                          recvbuf, NxC*Nyloc*Nzloc, MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD, ierr)
+        ! unpack: recvbuf is only read on the device, so it only has to be copied in
+        !$acc parallel loop collapse(4) gang vector default(present) copyin(recvbuf)
+        do src_rank = 0, num_procs-1
+            do k = 1, Nzloc
+                do j = 1, Nyloc
+                    do i = 1, NxC
+                        data_cmplx_slaby(i, j, k+src_rank*Nzloc) = recvbuf(i + (j-1)*NxC + (k-1)*NxC*Nyloc + src_rank*NxC*Nyloc*Nzloc)
+                    end do
+                end do
+            end do
+        end do
+
+        deallocate(sendbuf, recvbuf)
+    end subroutine s_mpi_transpose_slabZ2Y
+
+    !< transpose domain from y-slabs (data_cmplx_slaby) to z-slabs (data_cmplx_slabz) on each processor
+    subroutine s_mpi_transpose_slabY2Z
+        complex(c_double_complex), allocatable :: sendbuf(:), recvbuf(:)
+        integer :: dest_rank, src_rank
+        integer :: i, j, k
+        ! one chunk of NxC*Nyloc*Nzloc values per rank in each buffer
+        allocate(sendbuf(NxC*Nyloc*Nzloc*num_procs))
+        allocate(recvbuf(NxC*Nyloc*Nzloc*num_procs))
+        ! pack: every element of sendbuf is written on the device, so it only has to be copied out
+        !$acc parallel loop collapse(4) gang vector default(present) copyout(sendbuf)
+        do dest_rank = 0, num_procs-1
+            do k = 1, Nzloc
+                do j = 1, Nyloc
+                    do i = 1, NxC
+                        sendbuf(i + (j-1)*NxC + (k-1)*NxC*Nyloc + dest_rank*NxC*Nyloc*Nzloc) = data_cmplx_slaby(i, j, k+dest_rank*Nzloc)
+                    end do
+                end do
+            end do
+        end do
+        ! chunk r of sendbuf goes to rank r; chunk r of recvbuf arrives from rank r
+        call MPI_Alltoall(sendbuf, NxC*Nyloc*Nzloc, MPI_DOUBLE_COMPLEX, &
+                          recvbuf, NxC*Nyloc*Nzloc, MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD, ierr)
+        ! unpack: recvbuf is only read on the device, so it only has to be copied in
+        !$acc parallel loop collapse(4) gang vector default(present) copyin(recvbuf)
+        do src_rank = 0, num_procs-1
+            do k = 1, Nzloc
+                do j = 1, Nyloc
+                    do i = 1, NxC
+                        data_cmplx_slabz(i, j+src_rank*Nyloc, k) = recvbuf(i + (j-1)*NxC + (k-1)*NxC*Nyloc + src_rank*NxC*Nyloc*Nzloc)
+                    end do
+                end do
+            end do
+        end do
+
+        deallocate(sendbuf, recvbuf)
+    end subroutine s_mpi_transpose_slabY2Z
+
+    !< compute forward FFT (x, then y, then z after a slab transpose); input: data_real_3D_slabz, output: data_cmplx_out1d ordered z, x, y
+    subroutine s_mpi_FFT_fwd
+        integer :: i, j, k
+        ! NOTE(review): the cuFFT calls below receive the work arrays directly, with no host_data use_device region around them - confirm device addresses reach cuFFT
+        ! 3D z-slab -> 1D x, y, z (x is the fastest-varying index of the flattened array)
+        !$acc parallel loop collapse(3) gang vector default(present)
+        do i = 1, Nx 
+            do j = 1, Ny 
+                do k = 1, Nzloc
+                    data_real_in1d(i + (j-1)*Nx + (k-1)*Nx*Ny) = data_real_3D_slabz(i, j, k)
+                end do 
+            end do 
+        end do
+
+        ! X FFT: real input, complex output (the loops below use NxC points along x)
+#if defined(MFC_OpenACC)
+        ierr = cufftExecD2Z(plan_x_fwd_gpu, data_real_in1d, data_cmplx_out1d)
+#else
+        call fftw_execute_dft_r2c(plan_x_r2c_fwd, data_real_in1d, data_cmplx_out1d)
+#endif
+
+        ! 1D x, y, z -> 1D y, x, z (CMPLX), so that y becomes the fastest-varying index
+        !$acc parallel loop collapse(3) gang vector default(present)
+        do i = 1, NxC
+            do j = 1, Ny 
+                do k = 1, Nzloc
+                    data_cmplx_out1dy(j + (i-1)*Ny + (k-1)*Ny*NxC) = data_cmplx_out1d(i + (j-1)*NxC + (k-1)*NxC*Ny)
+                end do 
+            end do 
+        end do
+
+        ! Y FFT: complex-to-complex, in place
+#if defined(MFC_OpenACC)
+        ierr = cufftExecZ2Z(plan_y_gpu, data_cmplx_out1dy, data_cmplx_out1dy, CUFFT_FORWARD)
+#else
+        call fftw_execute_dft(plan_y_c2c_fwd, data_cmplx_out1dy, data_cmplx_out1dy)
+#endif 
+
+        ! 1D y, x, z -> 3D z-slab
+        !$acc parallel loop collapse(3) gang vector default(present)
+        do i = 1, NxC 
+            do j = 1, Ny 
+                do k = 1, Nzloc
+                    data_cmplx_slabz(i, j, k) = data_cmplx_out1dy(j + (i-1)*Ny + (k-1)*Ny*NxC)
+                end do 
+            end do 
+        end do 
+
+        ! transpose z-slab to y-slab (fills data_cmplx_slaby, which holds all Nz points along z)
+        call s_mpi_transpose_slabZ2Y 
+
+        ! 3D y-slab -> 1D z, x, y
+        !$acc parallel loop collapse(3) gang vector default(present)
+        do i = 1, NxC 
+            do j = 1, Nyloc 
+                do k = 1, Nz
+                    data_cmplx_out1d(k + (i-1)*Nz + (j-1)*Nz*NxC) = data_cmplx_slaby(i, j, k)
+                end do 
+            end do 
+        end do
+
+        ! Z FFT: complex-to-complex, in place
+#if defined(MFC_OpenACC)
+        ierr = cufftExecZ2Z(plan_z_gpu, data_cmplx_out1d, data_cmplx_out1d, CUFFT_FORWARD)
+#else
+        call fftw_execute_dft(plan_z_c2c_fwd, data_cmplx_out1d, data_cmplx_out1d)
+#endif
+
+        ! return data_cmplx_out1d: 1D z, x, y
+    end subroutine s_mpi_FFT_fwd
+
+    !< compute inverse FFT (z, then y, then x); input: data_cmplx_out1d ordered z, x, y, output: data_real_3D_slabz (no 1/(Nx*Ny*Nz) scaling is applied in this routine)
+    subroutine s_mpi_FFT_bwd
+        integer :: i, j, k
+        ! NOTE(review): the cuFFT calls below receive the work arrays directly, with no host_data use_device region around them - confirm device addresses reach cuFFT
+        ! Z inv FFT: complex-to-complex, in place
+#if defined(MFC_OpenACC)
+        ierr = cufftExecZ2Z(plan_z_gpu, data_cmplx_out1d, data_cmplx_out1d, CUFFT_INVERSE)
+#else
+        call fftw_execute_dft(plan_z_c2c_bwd, data_cmplx_out1d, data_cmplx_out1d)
+#endif
+
+        ! 1D z, x, y -> 3D y-slab
+        !$acc parallel loop collapse(3) gang vector default(present)
+        do i = 1, NxC 
+            do j = 1, Nyloc 
+                do k = 1, Nz 
+                    data_cmplx_slaby(i, j, k) = data_cmplx_out1d(k + (i-1)*Nz + (j-1)*Nz*NxC)
+                end do 
+            end do 
+        end do
+
+        ! transpose y-slab to z-slab (fills data_cmplx_slabz, which holds all Ny points along y)
+        call s_mpi_transpose_slabY2Z
+
+        ! 3D z-slab -> 1D y, x, z
+        !$acc parallel loop collapse(3) gang vector default(present)
+        do i = 1, NxC 
+            do j = 1, Ny 
+                do k = 1, Nzloc
+                    data_cmplx_out1dy(j + (i-1)*Ny + (k-1)*Ny*NxC) = data_cmplx_slabz(i, j, k)
+                end do 
+            end do 
+        end do
+
+        ! Y inv FFT: complex-to-complex, in place
+#if defined(MFC_OpenACC)
+        ierr = cufftExecZ2Z(plan_y_gpu, data_cmplx_out1dy, data_cmplx_out1dy, CUFFT_INVERSE)
+#else
+        call fftw_execute_dft(plan_y_c2c_bwd, data_cmplx_out1dy, data_cmplx_out1dy)
+#endif
+
+        ! 1D y, x, z -> 1D x, y, z, so that x becomes the fastest-varying index again
+        !$acc parallel loop collapse(3) gang vector default(present)
+        do i = 1, NxC 
+            do j = 1, Ny 
+                do k = 1, Nzloc
+                    data_cmplx_out1d(i + (j-1)*NxC + (k-1)*NxC*Ny) = data_cmplx_out1dy(j + (i-1)*Ny + (k-1)*Ny*NxC)
+                end do 
+            end do 
+        end do
+
+        ! X inv FFT: complex input, real output
+#if defined(MFC_OpenACC)
+        ierr = cufftExecZ2D(plan_x_bwd_gpu, data_cmplx_out1d, data_real_in1d)
+#else
+        call fftw_execute_dft_c2r(plan_x_c2r_bwd, data_cmplx_out1d, data_real_in1d)
+#endif
+
+        ! 1D x, y, z -> 3D z-slab
+        !$acc parallel loop collapse(3) gang vector default(present)
+        do i = 1, Nx 
+            do j = 1, Ny 
+                do k = 1, Nzloc
+                    data_real_3D_slabz(i, j, k) = data_real_in1d(i + (j-1)*Nx + (k-1)*Nx*Ny)
+                end do 
+            end do 
+        end do
+
+    end subroutine s_mpi_FFT_bwd
+
+    !< setup for calculation of unclosed terms in volume filtered momentum eqn: fills pt_Re_stress with rho*(u x u) and R_mu with the viscous stress of the unfiltered field
+    subroutine s_setup_terms_filtering(q_cons_vf, pt_Re_stress, R_mu)
+        type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf
+        type(vector_field), dimension(1:num_dims), intent(inout) :: pt_Re_stress
+        type(vector_field), dimension(1:num_dims), intent(inout) :: R_mu
+
+        integer :: i, j, k, l, q
+
+        ! pseudo turbulent reynolds stress setup
+        !$acc parallel loop collapse(3) gang vector default(present)
+        do i = 0, m
+            do j = 0, n
+                do k = 0, p
+                    !$acc loop seq
+                    do l = 1, num_dims
+                        !$acc loop seq
+                        do q = 1, num_dims
+                            pt_Re_stress(l)%vf(q)%sf(i, j, k) = (q_cons_vf(momxb-1+l)%sf(i, j, k) * q_cons_vf(momxb-1+q)%sf(i, j, k)) / q_cons_vf(1)%sf(i, j, k) ! (rho*u x rho*u)/rho = rho*(u x u)
+                        end do
+                    end do
+                end do
+            end do
+        end do
+
+        ! set density and momentum buffers (the central differences below read the i-1/i+1, j-1/j+1, k-1/k+1 neighbours)
+#ifdef MFC_MPI
+        do i = 1, momxe
+            call s_populate_scalarfield_buffers(q_cons_vf(i))
+        end do
+#else
+        do i = 1, momxe
+            q_cons_vf(i)%sf(-buff_size:-1, :, :) = q_cons_vf(i)%sf(m-buff_size+1:m, :, :)
+            q_cons_vf(i)%sf(m+1:m+buff_size, :, :) = q_cons_vf(i)%sf(0:buff_size-1, :, :)
+
+            q_cons_vf(i)%sf(:, -buff_size:-1, :) = q_cons_vf(i)%sf(:, n-buff_size+1:n, :)
+            q_cons_vf(i)%sf(:, n+1:n+buff_size, :) = q_cons_vf(i)%sf(:, 0:buff_size-1, :)
+
+            q_cons_vf(i)%sf(:, :, -buff_size:-1) = q_cons_vf(i)%sf(:, :, p-buff_size+1:p)
+            q_cons_vf(i)%sf(:, :, p+1:p+buff_size) = q_cons_vf(i)%sf(:, :, 0:buff_size-1)
+        end do
+#endif
+
+        ! R_mu setup: diagonal entries are mu*(2*du_l/dx_l - 2/3*div(u)), velocities taken as momentum/density
+        !$acc parallel loop collapse(3) gang vector default(present)
+        do i = 0, m
+            do j = 0, n
+                do k = 0, p
+                    R_mu(1)%vf(1)%sf(i, j, k) = mu_visc * (2._wp*(q_cons_vf(momxb)%sf(i+1, j, k)/q_cons_vf(1)%sf(i+1, j, k) - q_cons_vf(momxb)%sf(i-1, j, k)/q_cons_vf(1)%sf(i-1, j, k))/(2._wp*dx(i)) &
+                                                - 2._wp/3._wp*((q_cons_vf(momxb)%sf(i+1, j, k)/q_cons_vf(1)%sf(i+1, j, k) - q_cons_vf(momxb)%sf(i-1, j, k)/q_cons_vf(1)%sf(i-1, j, k))/(2._wp*dx(i)) &
+                                                + (q_cons_vf(momxb+1)%sf(i, j+1, k)/q_cons_vf(1)%sf(i, j+1, k) - q_cons_vf(momxb+1)%sf(i, j-1, k)/q_cons_vf(1)%sf(i, j-1, k))/(2._wp*dy(j)) &
+                                                + (q_cons_vf(momxb+2)%sf(i, j, k+1)/q_cons_vf(1)%sf(i, j, k+1) - q_cons_vf(momxb+2)%sf(i, j, k-1)/q_cons_vf(1)%sf(i, j, k-1))/(2._wp*dz(k))))
+
+                    R_mu(2)%vf(2)%sf(i, j, k) = mu_visc * (2._wp*(q_cons_vf(momxb+1)%sf(i, j+1, k)/q_cons_vf(1)%sf(i, j+1, k) - q_cons_vf(momxb+1)%sf(i, j-1, k)/q_cons_vf(1)%sf(i, j-1, k))/(2._wp*dy(j)) &
+                                                - 2._wp/3._wp*((q_cons_vf(momxb)%sf(i+1, j, k)/q_cons_vf(1)%sf(i+1, j, k) - q_cons_vf(momxb)%sf(i-1, j, k)/q_cons_vf(1)%sf(i-1, j, k))/(2._wp*dx(i)) &
+                                                + (q_cons_vf(momxb+1)%sf(i, j+1, k)/q_cons_vf(1)%sf(i, j+1, k) - q_cons_vf(momxb+1)%sf(i, j-1, k)/q_cons_vf(1)%sf(i, j-1, k))/(2._wp*dy(j)) &
+                                                + (q_cons_vf(momxb+2)%sf(i, j, k+1)/q_cons_vf(1)%sf(i, j, k+1) - q_cons_vf(momxb+2)%sf(i, j, k-1)/q_cons_vf(1)%sf(i, j, k-1))/(2._wp*dz(k))))
+
+                    R_mu(3)%vf(3)%sf(i, j, k) = mu_visc * (2._wp*(q_cons_vf(momxb+2)%sf(i, j, k+1)/q_cons_vf(1)%sf(i, j, k+1) - q_cons_vf(momxb+2)%sf(i, j, k-1)/q_cons_vf(1)%sf(i, j, k-1))/(2._wp*dz(k)) &
+                                                - 2._wp/3._wp*((q_cons_vf(momxb)%sf(i+1, j, k)/q_cons_vf(1)%sf(i+1, j, k) - q_cons_vf(momxb)%sf(i-1, j, k)/q_cons_vf(1)%sf(i-1, j, k))/(2._wp*dx(i)) &
+                                                + (q_cons_vf(momxb+1)%sf(i, j+1, k)/q_cons_vf(1)%sf(i, j+1, k) - q_cons_vf(momxb+1)%sf(i, j-1, k)/q_cons_vf(1)%sf(i, j-1, k))/(2._wp*dy(j)) &
+                                                + (q_cons_vf(momxb+2)%sf(i, j, k+1)/q_cons_vf(1)%sf(i, j, k+1) - q_cons_vf(momxb+2)%sf(i, j, k-1)/q_cons_vf(1)%sf(i, j, k-1))/(2._wp*dz(k))))
+                    ! off-diagonal entries are mu*(du_l/dx_q + du_q/dx_l); no further density division, so they carry the same units as the diagonal
+                    R_mu(1)%vf(2)%sf(i, j, k) = mu_visc * ((q_cons_vf(momxb)%sf(i, j+1, k)/q_cons_vf(1)%sf(i, j+1, k) - q_cons_vf(momxb)%sf(i, j-1, k)/q_cons_vf(1)%sf(i, j-1, k))/(2._wp*dy(j)) &
+                                                + (q_cons_vf(momxb+1)%sf(i+1, j, k)/q_cons_vf(1)%sf(i+1, j, k) - q_cons_vf(momxb+1)%sf(i-1, j, k)/q_cons_vf(1)%sf(i-1, j, k))/(2._wp*dx(i)))
+
+                    R_mu(2)%vf(1)%sf(i, j, k) = R_mu(1)%vf(2)%sf(i, j, k)
+
+                    R_mu(1)%vf(3)%sf(i, j, k) = mu_visc * ((q_cons_vf(momxb)%sf(i, j, k+1)/q_cons_vf(1)%sf(i, j, k+1) - q_cons_vf(momxb)%sf(i, j, k-1)/q_cons_vf(1)%sf(i, j, k-1))/(2._wp*dz(k)) &
+                                                + (q_cons_vf(momxb+2)%sf(i+1, j, k)/q_cons_vf(1)%sf(i+1, j, k) - q_cons_vf(momxb+2)%sf(i-1, j, k)/q_cons_vf(1)%sf(i-1, j, k))/(2._wp*dx(i)))
+
+                    R_mu(3)%vf(1)%sf(i, j, k) = R_mu(1)%vf(3)%sf(i, j, k)
+
+                    R_mu(2)%vf(3)%sf(i, j, k) = mu_visc * ((q_cons_vf(momxb+1)%sf(i, j, k+1)/q_cons_vf(1)%sf(i, j, k+1) - q_cons_vf(momxb+1)%sf(i, j, k-1)/q_cons_vf(1)%sf(i, j, k-1))/(2._wp*dz(k)) &
+                                                + (q_cons_vf(momxb+2)%sf(i, j+1, k)/q_cons_vf(1)%sf(i, j+1, k) - q_cons_vf(momxb+2)%sf(i, j-1, k)/q_cons_vf(1)%sf(i, j-1, k))/(2._wp*dy(j)))
+
+                    R_mu(3)%vf(2)%sf(i, j, k) = R_mu(2)%vf(3)%sf(i, j, k)
+                end do
+            end do
+        end do
+
+    end subroutine s_setup_terms_filtering
+    !< subtract the stress of the filtered field from the filtered pt_Re_stress, scale by q_cons_filtered(advxb), and store |div| of the result in mag_div_Ru
+    subroutine s_compute_pseudo_turbulent_reynolds_stress(q_cons_filtered, pt_Re_stress, mag_div_Ru)
+        type(scalar_field), dimension(sys_size), intent(in) :: q_cons_filtered
+        type(vector_field), dimension(1:num_dims), intent(inout) :: pt_Re_stress
+        type(scalar_field), intent(inout) :: mag_div_Ru
+        real(wp), dimension(1:num_dims, 0:m, 0:n, 0:p) :: div_Ru !< divergence of each tensor row at the interior cells
+        integer :: i, j, k, l, q
+
+        !$acc parallel loop collapse(3) gang vector default(present)
+        do i = 0, m
+            do j = 0, n
+                do k = 0, p
+                    !$acc loop seq
+                    do l = 1, num_dims
+                        !$acc loop seq
+                        do q = 1, num_dims
+                            pt_Re_stress(l)%vf(q)%sf(i, j, k) = pt_Re_stress(l)%vf(q)%sf(i, j, k) &
+                                                              - (q_cons_filtered(momxb-1+l)%sf(i, j, k) * q_cons_filtered(momxb-1+q)%sf(i, j, k) / q_cons_filtered(1)%sf(i, j, k))
+                        end do
+                    end do
+                end do
+            end do
+        end do
+
+        !$acc parallel loop collapse(3) gang vector default(present)
+        do i = 0, m
+            do j = 0, n
+                do k = 0, p
+                    !$acc loop seq
+                    do l = 1, num_dims
+                        !$acc loop seq
+                        do q = 1, num_dims
+                            pt_Re_stress(l)%vf(q)%sf(i, j, k) = pt_Re_stress(l)%vf(q)%sf(i, j, k) * q_cons_filtered(advxb)%sf(i, j, k)
+                        end do
+                    end do
+                end do
+            end do
+        end do
+
+        ! set boundary buffer zone values
+#ifdef MFC_MPI
+        do l = 1, num_dims
+            do q = 1, num_dims
+                call s_populate_scalarfield_buffers(pt_Re_stress(l)%vf(q))
+            end do
+        end do
+#else
+        do l = 1, num_dims
+            do q = 1, num_dims
+                pt_Re_stress(l)%vf(q)%sf(-buff_size:-1, :, :) = pt_Re_stress(l)%vf(q)%sf(m-buff_size+1:m, :, :)
+                pt_Re_stress(l)%vf(q)%sf(m+1:m+buff_size, :, :) = pt_Re_stress(l)%vf(q)%sf(0:buff_size-1, :, :)
+
+                pt_Re_stress(l)%vf(q)%sf(:, -buff_size:-1, :) = pt_Re_stress(l)%vf(q)%sf(:, n-buff_size+1:n, :)
+                pt_Re_stress(l)%vf(q)%sf(:, n+1:n+buff_size, :) = pt_Re_stress(l)%vf(q)%sf(:, 0:buff_size-1, :)
+
+                pt_Re_stress(l)%vf(q)%sf(:, :, -buff_size:-1) = pt_Re_stress(l)%vf(q)%sf(:, :, p-buff_size+1:p)
+                pt_Re_stress(l)%vf(q)%sf(:, :, p+1:p+buff_size) = pt_Re_stress(l)%vf(q)%sf(:, :, 0:buff_size-1)
+            end do
+        end do
+#endif
+
+        ! div(Ru), using CD2 FD scheme; div_Ru is written in full on the device, so copyout is enough
+        !$acc parallel loop collapse(3) gang vector default(present) copyout(div_Ru)
+        do i = 0, m
+            do j = 0, n
+                do k = 0, p
+                    !$acc loop seq
+                    do l = 1, num_dims
+                        div_Ru(l, i, j, k) = (pt_Re_stress(l)%vf(1)%sf(i+1, j, k) - pt_Re_stress(l)%vf(1)%sf(i-1, j, k))/(2._wp*dx(i)) &
+                                           + (pt_Re_stress(l)%vf(2)%sf(i, j+1, k) - pt_Re_stress(l)%vf(2)%sf(i, j-1, k))/(2._wp*dy(j)) &
+                                           + (pt_Re_stress(l)%vf(3)%sf(i, j, k+1) - pt_Re_stress(l)%vf(3)%sf(i, j, k-1))/(2._wp*dz(k))
+                    end do
+                end do
+            end do
+        end do
+
+        !$acc parallel loop collapse(3) gang vector default(present) copyin(div_Ru)
+        do i = 0, m
+            do j = 0, n
+                do k = 0, p
+                    mag_div_Ru%sf(i, j, k) = sqrt(div_Ru(1, i, j, k)**2 + div_Ru(2, i, j, k)**2 + div_Ru(3, i, j, k)**2)
+                end do
+            end do
+        end do
+
+    end subroutine s_compute_pseudo_turbulent_reynolds_stress
+    !< subtract the viscous stress of the filtered field from the filtered R_mu, scale by q_cons_filtered(advxb), and store |div| of the result in mag_div_R_mu
+    subroutine s_compute_R_mu(q_cons_filtered, R_mu, mag_div_R_mu)
+        type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_filtered
+        type(vector_field), dimension(1:num_dims), intent(inout) :: R_mu
+        type(scalar_field), intent(inout) :: mag_div_R_mu
+        real(wp), dimension(1:num_dims, 0:m, 0:n, 0:p) :: div_R_mu !< divergence of each tensor row at the interior cells
+
+        integer :: i, j, k, l, q
+
+        ! set buffers for filtered momentum quantities and density
+#ifdef MFC_MPI
+        do i = 1, momxe
+            call s_populate_scalarfield_buffers(q_cons_filtered(i))
+        end do
+#else
+        do i = 1, momxe
+            q_cons_filtered(i)%sf(-buff_size:-1, :, :) = q_cons_filtered(i)%sf(m-buff_size+1:m, :, :)
+            q_cons_filtered(i)%sf(m+1:m+buff_size, :, :) = q_cons_filtered(i)%sf(0:buff_size-1, :, :)
+
+            q_cons_filtered(i)%sf(:, -buff_size:-1, :) = q_cons_filtered(i)%sf(:, n-buff_size+1:n, :)
+            q_cons_filtered(i)%sf(:, n+1:n+buff_size, :) = q_cons_filtered(i)%sf(:, 0:buff_size-1, :)
+
+            q_cons_filtered(i)%sf(:, :, -buff_size:-1) = q_cons_filtered(i)%sf(:, :, p-buff_size+1:p)
+            q_cons_filtered(i)%sf(:, :, p+1:p+buff_size) = q_cons_filtered(i)%sf(:, :, 0:buff_size-1)
+        end do
+#endif
+
+        ! calculate R_mu: same stress formula as in s_setup_terms_filtering, evaluated on the filtered field (off-diagonals without a density division)
+        !$acc parallel loop collapse(3) gang vector default(present)
+        do i = 0, m
+            do j = 0, n
+                do k = 0, p
+                    R_mu(1)%vf(1)%sf(i, j, k) = R_mu(1)%vf(1)%sf(i, j, k) - mu_visc * (2._wp*(q_cons_filtered(momxb)%sf(i+1, j, k)/q_cons_filtered(1)%sf(i+1, j, k) - q_cons_filtered(momxb)%sf(i-1, j, k)/q_cons_filtered(1)%sf(i-1, j, k))/(2._wp*dx(i)) &
+                                            - 2._wp/3._wp*((q_cons_filtered(momxb)%sf(i+1, j, k)/q_cons_filtered(1)%sf(i+1, j, k) - q_cons_filtered(momxb)%sf(i-1, j, k)/q_cons_filtered(1)%sf(i-1, j, k))/(2._wp*dx(i)) &
+                                            + (q_cons_filtered(momxb+1)%sf(i, j+1, k)/q_cons_filtered(1)%sf(i, j+1, k) - q_cons_filtered(momxb+1)%sf(i, j-1, k)/q_cons_filtered(1)%sf(i, j-1, k))/(2._wp*dy(j)) &
+                                            + (q_cons_filtered(momxb+2)%sf(i, j, k+1)/q_cons_filtered(1)%sf(i, j, k+1) - q_cons_filtered(momxb+2)%sf(i, j, k-1)/q_cons_filtered(1)%sf(i, j, k-1))/(2._wp*dz(k))))
+
+                    R_mu(2)%vf(2)%sf(i, j, k) = R_mu(2)%vf(2)%sf(i, j, k) - mu_visc * (2._wp*(q_cons_filtered(momxb+1)%sf(i, j+1, k)/q_cons_filtered(1)%sf(i, j+1, k) - q_cons_filtered(momxb+1)%sf(i, j-1, k)/q_cons_filtered(1)%sf(i, j-1, k))/(2._wp*dy(j)) &
+                                            - 2._wp/3._wp*((q_cons_filtered(momxb)%sf(i+1, j, k)/q_cons_filtered(1)%sf(i+1, j, k) - q_cons_filtered(momxb)%sf(i-1, j, k)/q_cons_filtered(1)%sf(i-1, j, k))/(2._wp*dx(i)) &
+                                            + (q_cons_filtered(momxb+1)%sf(i, j+1, k)/q_cons_filtered(1)%sf(i, j+1, k) - q_cons_filtered(momxb+1)%sf(i, j-1, k)/q_cons_filtered(1)%sf(i, j-1, k))/(2._wp*dy(j)) &
+                                            + (q_cons_filtered(momxb+2)%sf(i, j, k+1)/q_cons_filtered(1)%sf(i, j, k+1) - q_cons_filtered(momxb+2)%sf(i, j, k-1)/q_cons_filtered(1)%sf(i, j, k-1))/(2._wp*dz(k))))
+
+                    R_mu(3)%vf(3)%sf(i, j, k) = R_mu(3)%vf(3)%sf(i, j, k) - mu_visc * (2._wp*(q_cons_filtered(momxb+2)%sf(i, j, k+1)/q_cons_filtered(1)%sf(i, j, k+1) - q_cons_filtered(momxb+2)%sf(i, j, k-1)/q_cons_filtered(1)%sf(i, j, k-1))/(2._wp*dz(k)) &
+                                            - 2._wp/3._wp*((q_cons_filtered(momxb)%sf(i+1, j, k)/q_cons_filtered(1)%sf(i+1, j, k) - q_cons_filtered(momxb)%sf(i-1, j, k)/q_cons_filtered(1)%sf(i-1, j, k))/(2._wp*dx(i)) &
+                                            + (q_cons_filtered(momxb+1)%sf(i, j+1, k)/q_cons_filtered(1)%sf(i, j+1, k) - q_cons_filtered(momxb+1)%sf(i, j-1, k)/q_cons_filtered(1)%sf(i, j-1, k))/(2._wp*dy(j)) &
+                                            + (q_cons_filtered(momxb+2)%sf(i, j, k+1)/q_cons_filtered(1)%sf(i, j, k+1) - q_cons_filtered(momxb+2)%sf(i, j, k-1)/q_cons_filtered(1)%sf(i, j, k-1))/(2._wp*dz(k))))
+
+                    R_mu(1)%vf(2)%sf(i, j, k) = R_mu(1)%vf(2)%sf(i, j, k) - mu_visc * ((q_cons_filtered(momxb)%sf(i, j+1, k)/q_cons_filtered(1)%sf(i, j+1, k) - q_cons_filtered(momxb)%sf(i, j-1, k)/q_cons_filtered(1)%sf(i, j-1, k))/(2._wp*dy(j)) &
+                                            + (q_cons_filtered(momxb+1)%sf(i+1, j, k)/q_cons_filtered(1)%sf(i+1, j, k) - q_cons_filtered(momxb+1)%sf(i-1, j, k)/q_cons_filtered(1)%sf(i-1, j, k))/(2._wp*dx(i)))
+
+                    R_mu(2)%vf(1)%sf(i, j, k) = R_mu(1)%vf(2)%sf(i, j, k)
+
+                    R_mu(1)%vf(3)%sf(i, j, k) = R_mu(1)%vf(3)%sf(i, j, k) - mu_visc * ((q_cons_filtered(momxb)%sf(i, j, k+1)/q_cons_filtered(1)%sf(i, j, k+1) - q_cons_filtered(momxb)%sf(i, j, k-1)/q_cons_filtered(1)%sf(i, j, k-1))/(2._wp*dz(k)) &
+                                            + (q_cons_filtered(momxb+2)%sf(i+1, j, k)/q_cons_filtered(1)%sf(i+1, j, k) - q_cons_filtered(momxb+2)%sf(i-1, j, k)/q_cons_filtered(1)%sf(i-1, j, k))/(2._wp*dx(i)))
+
+                    R_mu(3)%vf(1)%sf(i, j, k) = R_mu(1)%vf(3)%sf(i, j, k)
+
+                    R_mu(2)%vf(3)%sf(i, j, k) = R_mu(2)%vf(3)%sf(i, j, k) - mu_visc * ((q_cons_filtered(momxb+1)%sf(i, j, k+1)/q_cons_filtered(1)%sf(i, j, k+1) - q_cons_filtered(momxb+1)%sf(i, j, k-1)/q_cons_filtered(1)%sf(i, j, k-1))/(2._wp*dz(k)) &
+                                            + (q_cons_filtered(momxb+2)%sf(i, j+1, k)/q_cons_filtered(1)%sf(i, j+1, k) - q_cons_filtered(momxb+2)%sf(i, j-1, k)/q_cons_filtered(1)%sf(i, j-1, k))/(2._wp*dy(j)))
+
+                    R_mu(3)%vf(2)%sf(i, j, k) = R_mu(2)%vf(3)%sf(i, j, k)
+
+                end do
+            end do
+        end do
+
+        !$acc parallel loop collapse(3) gang vector default(present)
+        do i = 0, m
+            do j = 0, n
+                do k = 0, p
+                    !$acc loop seq
+                    do l = 1, num_dims
+                        !$acc loop seq
+                        do q = 1, num_dims
+                            R_mu(l)%vf(q)%sf(i, j, k) = R_mu(l)%vf(q)%sf(i, j, k) * q_cons_filtered(advxb)%sf(i, j, k)
+                        end do
+                    end do
+                end do
+            end do
+        end do
+
+        ! set boundary buffer zone values
+#ifdef MFC_MPI
+        do l = 1, num_dims
+            do q = 1, num_dims
+                call s_populate_scalarfield_buffers(R_mu(l)%vf(q))
+            end do
+        end do
+#else
+        do l = 1, num_dims
+            do q = 1, num_dims
+                R_mu(l)%vf(q)%sf(-buff_size:-1, :, :) = R_mu(l)%vf(q)%sf(m-buff_size+1:m, :, :)
+                R_mu(l)%vf(q)%sf(m+1:m+buff_size, :, :) = R_mu(l)%vf(q)%sf(0:buff_size-1, :, :)
+
+                R_mu(l)%vf(q)%sf(:, -buff_size:-1, :) = R_mu(l)%vf(q)%sf(:, n-buff_size+1:n, :)
+                R_mu(l)%vf(q)%sf(:, n+1:n+buff_size, :) = R_mu(l)%vf(q)%sf(:, 0:buff_size-1, :)
+
+                R_mu(l)%vf(q)%sf(:, :, -buff_size:-1) = R_mu(l)%vf(q)%sf(:, :, p-buff_size+1:p)
+                R_mu(l)%vf(q)%sf(:, :, p+1:p+buff_size) = R_mu(l)%vf(q)%sf(:, :, 0:buff_size-1)
+            end do
+        end do
+#endif
+
+        ! div(R_mu), using CD2 FD scheme; div_R_mu is written in full on the device, so copyout is enough
+        !$acc parallel loop collapse(3) gang vector default(present) copyout(div_R_mu)
+        do i = 0, m
+            do j = 0, n
+                do k = 0, p
+                    !$acc loop seq
+                    do l = 1, num_dims
+                        div_R_mu(l, i, j, k) = (R_mu(l)%vf(1)%sf(i+1, j, k) - R_mu(l)%vf(1)%sf(i-1, j, k))/(2._wp*dx(i)) &
+                                           + (R_mu(l)%vf(2)%sf(i, j+1, k) - R_mu(l)%vf(2)%sf(i, j-1, k))/(2._wp*dy(j)) &
+                                           + (R_mu(l)%vf(3)%sf(i, j, k+1) - R_mu(l)%vf(3)%sf(i, j, k-1))/(2._wp*dz(k))
+                    end do
+                end do
+            end do
+        end do
+
+        !$acc parallel loop collapse(3) gang vector default(present) copyin(div_R_mu)
+        do i = 0, m
+            do j = 0, n
+                do k = 0, p
+                    mag_div_R_mu%sf(i, j, k) = sqrt(div_R_mu(1, i, j, k)**2 + div_R_mu(2, i, j, k)**2 + div_R_mu(3, i, j, k)**2)
+                end do
+            end do
+        end do
+
+    end subroutine s_compute_R_mu
+
+    subroutine s_compute_interphase_momentum_exchange_term(pImT_filtered, mag_F_IMET)
+        type(scalar_field), dimension(1:num_dims), intent(in) :: pImT_filtered
+        type(scalar_field), intent(inout) :: mag_F_IMET
+        ! writes the pointwise Euclidean norm of components 1..3 of pImT_filtered into mag_F_IMET
+        integer :: ix, iy, iz
+
+        !$acc parallel loop collapse(3) gang vector default(present)
+        do ix = 0, m
+            do iy = 0, n
+                do iz = 0, p
+                    mag_F_IMET%sf(ix, iy, iz) = sqrt(pImT_filtered(1)%sf(ix, iy, iz)**2 &
+                                                     + pImT_filtered(2)%sf(ix, iy, iz)**2 &
+                                                     + pImT_filtered(3)%sf(ix, iy, iz)**2)
+                end do
+            end do
+        end do
+
+    end subroutine s_compute_interphase_momentum_exchange_term
+
+    subroutine s_finalize_fftw_explicit_filter_module
+        ! release the work arrays and the FFT plans used by the filtering routines
+        @:DEALLOCATE(data_real_3D_slabz, data_cmplx_slabz, data_cmplx_slaby)
+        @:DEALLOCATE(data_real_in1d, data_cmplx_out1d, data_cmplx_out1dy)
+        @:DEALLOCATE(cmplx_kernelG1d, real_kernelG_in)
+        @:DEALLOCATE(fluid_indicator_function_I%sf)
+
+#if defined(MFC_OpenACC)
+        ierr = cufftDestroy(plan_z_gpu)
+        ierr = cufftDestroy(plan_y_gpu)
+        ierr = cufftDestroy(plan_x_bwd_gpu)
+        ierr = cufftDestroy(plan_x_fwd_gpu)
+#else
+        call fftw_destroy_plan(plan_z_c2c_kernelG)
+        call fftw_destroy_plan(plan_y_c2c_kernelG)
+        call fftw_destroy_plan(plan_x_r2c_kernelG)
+        call fftw_destroy_plan(plan_z_c2c_bwd)
+        call fftw_destroy_plan(plan_z_c2c_fwd)
+        call fftw_destroy_plan(plan_y_c2c_bwd)
+        call fftw_destroy_plan(plan_y_c2c_fwd)
+        call fftw_destroy_plan(plan_x_c2r_bwd)
+        call fftw_destroy_plan(plan_x_r2c_fwd)
+#endif
+
+    end subroutine s_finalize_fftw_explicit_filter_module
+
+end module m_volume_filtering
\ No newline at end of file
diff --git a/src/simulation/p_main.fpp b/src/simulation/p_main.fpp
index 57dcbfbdf6..7f07744d8b 100644
--- a/src/simulation/p_main.fpp
+++ b/src/simulation/p_main.fpp
@@ -22,6 +22,8 @@ program p_main
 
     use m_nvtx
 
+    use m_volume_filtering
+
     implicit none
 
     integer :: t_step !< Iterator for the time-stepping loop
@@ -54,6 +56,10 @@ program p_main
     call s_initialize_gpu_vars()
     call nvtxEndRange
 
+    call s_initialize_fluid_indicator_function()
+    if (fourier_transform_filtering) call s_initialize_filtering_kernel()
+    if (fourier_transform_filtering) call s_initialize_filtered_fluid_indicator_function(q_cons_filtered(advxb))
+
     ! Setting the time-step iterator to the first time-step
     if (cfl_dt) then
         t_step = 0
diff --git a/toolchain/mfc/run/case_dicts.py b/toolchain/mfc/run/case_dicts.py
index a2d2ca559b..e425e53b6d 100644
--- a/toolchain/mfc/run/case_dicts.py
+++ b/toolchain/mfc/run/case_dicts.py
@@ -61,6 +61,9 @@ def analytic(self):
     'Bx0': ParamType.REAL,
     'relativity': ParamType.LOG,
     'cont_damage': ParamType.LOG,
+    'periodic_ibs': ParamType.LOG,
+    'store_levelset': ParamType.LOG,
+    'slab_domain_decomposition': ParamType.LOG,
 }
 
 PRE_PROCESS = COMMON.copy()
@@ -103,7 +106,7 @@ def analytic(self):
     'bubbles_lagrange': ParamType.LOG,
 })
 
-for ib_id in range(1, 10+1):
+for ib_id in range(1, 1000+1):
     for real_attr, ty in [("geometry", ParamType.INT), ("radius", ParamType.REAL),
                           ("theta", ParamType.REAL), ("slip", ParamType.LOG),
                           ("c", ParamType.REAL), ("p", ParamType.REAL),
@@ -297,6 +300,13 @@ def analytic(self):
     'tau_star': ParamType.REAL,
     'cont_damage_s': ParamType.REAL,
     'alpha_bar': ParamType.REAL,
+    'compute_CD': ParamType.LOG,
+    'mu_visc': ParamType.REAL, 
+    'u_inf_ref': ParamType.REAL,
+    'rho_inf_ref': ParamType.REAL,
+    'T_inf_ref': ParamType.REAL,
+    'periodic_forcing': ParamType.LOG,
+    'fourier_transform_filtering': ParamType.LOG,
 })
 
 for var in [ 'heatTransfer_model', 'massTransfer_model', 'pressure_corrector',
diff --git a/voronoi/gen_voronoi_2D.py b/voronoi/gen_voronoi_2D.py
new file mode 100644
index 0000000000..73beb4b8d7
--- /dev/null
+++ b/voronoi/gen_voronoi_2D.py
@@ -0,0 +1,99 @@
+import numpy as np
+import matplotlib
+import matplotlib.pyplot as plt
+import freud
+
+
+# lloyd relaxation
+def compute_simplex_centroid(simplex_vertices):
+    """Return the centroids of a batch of simplices, one column per simplex.
+
+    The last axis of ``simplex_vertices`` is sliced at indices 0, 1 and 2 and
+    each slice is averaged over axis 1; the three means are stacked as rows.
+    For an (n, 3, 3) input the result therefore has shape (3, n).
+    """
+    component_means = [
+        np.mean(simplex_vertices[:, :, component], axis=1)
+        for component in range(3)
+    ]
+    return np.array(component_means)
+
+def compute_simplex_area(simplex_vertices):
+    """Return the area of each triangle in an (n, 3 vertices, 3 coords) array."""
+    # Slice the vertex axis (axis 1): slicing the coordinate axis instead mixes
+    # x/y/z components into fake "vertices" and the area stops being
+    # translation invariant.
+    a, b, c = (simplex_vertices[:, k, :] for k in range(3))
+
+    return 0.5 * np.linalg.norm(np.cross(b - a, c - a), axis=1)
+
+def lloyd_relaxation_2d(initial_points, box, w=1.0, iterations=20):
+    """Relax a periodic point set towards its Voronoi-cell centroids.
+
+    Each iteration tessellates the current points, fans every cell into
+    triangles (point, vertex j-1, vertex j), moves the point by w times the
+    offset to the area-weighted centroid and wraps the result into box.
+    Returns the relaxed points; initial_points is left unmodified.
+    """
+    # Work on a copy: `points[i, :] += ...` would otherwise edit the caller's array.
+    points = np.array(initial_points, dtype=float)
+
+    for _ in range(iterations):
+        # Tessellate the current points (not the stale initial_points) each pass.
+        vertices = freud.locality.Voronoi().compute((box, points)).polytopes
+
+        for i, cell in enumerate(vertices):
+            fan = np.array([(points[i, :], cell[j - 1], cell[j]) for j in range(len(cell))])
+            areas = compute_simplex_area(fan)
+            centroid = np.sum(compute_simplex_centroid(fan) * areas, axis=1) / np.sum(areas)
+
+            points[i, :] += w * (centroid - points[i, :])
+
+        points = box.wrap(points)
+
+    return points
+
+if (__name__ == '__main__'):
+    print('running 2D...')
+
+    # setup: area fraction phi, circle diameter D, side length L of the square box
+    phi = 0.4
+    D = 0.1
+    L = 10*D
+
+    N = int( 4*phi*L**2 / (np.pi*D**2) )  # phi*L**2 divided by the circle area pi*(D/2)**2, truncated
+    print(f'volume fraction phi: {phi}, number of circles: {N}')
+
+    x_i = L/2 * np.random.uniform(-1, 1, N)
+    y_i = L/2 * np.random.uniform(-1, 1, N)
+    z_i = L/2 * np.random.uniform(-1, 1, N) * 0  # multiplied by 0: every point has z = 0
+
+    initial_points = np.stack((x_i, y_i, z_i), axis=1)
+
+    box = freud.box.Box.square(L)
+    voro = freud.locality.Voronoi()
+
+    cells = voro.compute((box, initial_points)).polytopes  # 'cells' is not read again in this script
+
+    # plot the tessellation of the random initial points, with the points overlaid
+    plt.figure()
+    ax = plt.gca()
+    voro.plot(ax=ax, cmap='RdBu')
+    ax.scatter(initial_points[:, 0], initial_points[:, 1], s=5, c='k')
+    plt.show()
+    plt.close()
+
+    # calculate relaxed points, then recompute the tessellation on them for plotting
+    relaxed_points = lloyd_relaxation_2d(initial_points, box, w=1.5, iterations=25)
+    voro.compute((box, relaxed_points))
+
+    # plot relaxed distribution
+    plt.figure()
+    ax = plt.gca()
+    voro.plot(ax=ax, cmap='RdBu')
+    ax.scatter(relaxed_points[:, 0], relaxed_points[:, 1], s=5, c='k')
+    plt.show()
+    plt.close()
+
+
+
diff --git a/voronoi/gen_voronoi_3D.py b/voronoi/gen_voronoi_3D.py
new file mode 100644
index 0000000000..ce700acb5d
--- /dev/null
+++ b/voronoi/gen_voronoi_3D.py
@@ -0,0 +1,98 @@
+import os
+import numpy as np
+import matplotlib
+import matplotlib.pyplot as plt
+import freud
+
+
+# lloyd relaxation
+def compute_tetrahedron_centroid(tetrahedron_vertices):
+    """Return the mean of the given vertices, taken over axis 0."""
+    return np.asanyarray(tetrahedron_vertices).mean(axis=0)
+
+def compute_tetrahedron_volume(tetrahedron_vertices):
+    """Return |det([v1 - v0, v2 - v0, v3 - v0])| / 6 for the four vertices."""
+    origin, p, q, r = tetrahedron_vertices
+    # Edge vectors measured from the first vertex, arranged as matrix columns.
+    edges = np.array([p - origin, q - origin, r - origin]).T
+    return np.abs(np.linalg.det(edges)) / 6
+
+def lloyd_relaxation_3d(initial_points, box, w=1, iterations=10):
+    """Relax a periodic 3D point set towards weighted Voronoi-cell centroids.
+
+    Every iteration tessellates the current points, builds one tetrahedron
+    per cell vertex j from (point, vertex j, j+1, j+2) with cyclic indices,
+    shifts the point by w times the offset to the volume-weighted centroid
+    of those tetrahedra and wraps the result into box.
+    Returns the relaxed points; initial_points is left unmodified.
+    """
+    # Copy first: the in-place `+=` below used to overwrite the caller's array,
+    # so "initial" points inspected after the call had already been moved.
+    points = np.array(initial_points, dtype=float)
+
+    for _ in range(iterations):
+        vertices = freud.locality.Voronoi().compute((box, points)).polytopes
+
+        for i, cell in enumerate(vertices):
+            n = len(cell)
+            tetrahedra = [[points[i, :], cell[j], cell[(j + 1) % n], cell[(j + 2) % n]] for j in range(n)]
+            centroids = np.array([compute_tetrahedron_centroid(t) for t in tetrahedra])
+            volumes = np.array([compute_tetrahedron_volume(t) for t in tetrahedra])
+            total_volume = np.sum(volumes)
+
+            # Skip degenerate cells instead of dividing by a vanishing volume.
+            if total_volume > 1.0e-12:
+                centroid = np.sum(centroids * volumes[:, np.newaxis], axis=0) / total_volume
+                points[i, :] += w * (centroid - points[i, :])
+        points = box.wrap(points)
+
+    return points
+
+if (__name__ == '__main__'):
+    print('running 3D...')
+
+    # setup: target volume fraction phi, sphere diameter D, side length L of the cubic box
+    phi = 0.05
+    str_phi = '005'
+
+    D = 0.1
+    L = 10*D
+
+    output_dir = '../examples/phi'+str_phi
+    # makedirs also creates a missing parent directory and tolerates an existing one
+    os.makedirs(output_dir, exist_ok=True)
+
+    N_sphere = int( 6*phi*L**3 / (np.pi*D**3) )
+    print(f'volume fraction phi: {phi}, number of spheres: {N_sphere}')
+    print(f'actual phi value: {N_sphere*4/3*np.pi*(D/2)**3/(L**3)}')
+
+    x_i = L/2 * np.random.uniform(-1, 1, N_sphere)
+    y_i = L/2 * np.random.uniform(-1, 1, N_sphere)
+    z_i = L/2 * np.random.uniform(-1, 1, N_sphere)
+
+    initial_points = np.stack((x_i, y_i, z_i), axis=1)
+    box = freud.box.Box.cube(L)
+    # relax a copy so initial_points still holds the random seeds for the plot below
+    relaxed_points = lloyd_relaxation_3d(initial_points.copy(), box, iterations=30)
+    print(np.shape(relaxed_points))
+
+    np.savetxt(output_dir+'/sphere_array_locations.txt', relaxed_points)
+
+    # check no spheres overlap: each pair once, using the minimum-image separation in the box
+    for i in range(N_sphere):
+        for j in range(i + 1, N_sphere):
+            delta = relaxed_points[i, :] - relaxed_points[j, :]
+            dist = np.linalg.norm(delta - L*np.round(delta/L))
+            if (dist <= 1.05*D):
+                print(f'spheres overlaping, dist={dist}, spheres #: {i}, {j}')
+                print(f'locations: ({relaxed_points[i, :]}), ({relaxed_points[j, :]})')
+
+    fig = plt.figure(figsize=(10,5))
+    ax1 = fig.add_subplot(121, projection='3d')
+    ax1.scatter(initial_points[:, 0], initial_points[:, 1], initial_points[:, 2], color='blue', s=10)
+    ax1.set_title('initial points')
+    ax2 = fig.add_subplot(122, projection='3d')
+    ax2.scatter(relaxed_points[:, 0], relaxed_points[:, 1], relaxed_points[:, 2], color='red', s=10)
+    ax2.set_title('relaxed points')
+    plt.show()
+    plt.close()

From bd91e4f08b6eb5545498f84aae007d1e7dfd7b88 Mon Sep 17 00:00:00 2001
From: Conrad Delgado <conraddelgado@Conrads-MacBook-Air-6.local>
Date: Mon, 9 Jun 2025 21:41:46 -0600
Subject: [PATCH 02/30] statistics computation

---
 runs/3d_1sphere_filtering/case.py        | 154 +++++++++++++
 src/common/m_mpi_common.fpp              |  34 ++-
 src/post_process/m_data_input.f90        | 279 ++++++++++++++++++++++-
 src/post_process/m_global_parameters.fpp |  15 ++
 src/post_process/m_mpi_proxy.fpp         |   2 +-
 src/post_process/m_start_up.f90          |  30 ++-
 src/simulation/m_compute_statistics.fpp  | 130 +++++++++++
 src/simulation/m_data_output.fpp         |  34 ++-
 src/simulation/m_global_parameters.fpp   |  12 +
 src/simulation/m_rhs.fpp                 |  12 +-
 src/simulation/m_start_up.fpp            |  14 +-
 src/simulation/m_time_steppers.fpp       | 158 ++++++++++++-
 src/simulation/m_volume_filtering.fpp    |  32 +--
 toolchain/mfc/run/case_dicts.py          |   1 +
 14 files changed, 867 insertions(+), 40 deletions(-)
 create mode 100644 runs/3d_1sphere_filtering/case.py
 create mode 100644 src/simulation/m_compute_statistics.fpp

diff --git a/runs/3d_1sphere_filtering/case.py b/runs/3d_1sphere_filtering/case.py
new file mode 100644
index 0000000000..fa38be1ff0
--- /dev/null
+++ b/runs/3d_1sphere_filtering/case.py
@@ -0,0 +1,154 @@
+import json
+import math
+import numpy as np
+# Case file: builds the case dictionary below and prints it to stdout as JSON.
+Mu = 1.84e-05  # not referenced again in this file (the lowercase mu below is what is used)
+gam_a = 1.4
+R = 287.0
+
+D = 0.1  # reference length; the immersed-boundary radius below is D / 2
+
+P = 101325 # Pa
+rho = 1.225 # kg/m^3
+
+T = P/(rho*R)  # passed to the case as T_inf_ref
+
+M = 1.2
+Re = 1500.0
+v1 = M*(gam_a*P/rho)**(1.0/2.0)  # M times sqrt(gam_a*P/rho)
+
+mu = rho*v1*D/Re # dynamic viscosity for current case
+
+#print('mu: ', mu)
+#print('v1: ', v1)
+#print('rho: ', rho)
+#print('Kn = ' + str( np.sqrt(np.pi*gam_a/2)*(M/Re) )) # Kn < 0.01 = continuum flow
+
+dt = 4.0E-06
+Nt = 31
+t_save = 1
+
+Nx = 63
+Ny = 63
+Nz = 63
+
+# immersed boundary dictionary
+ib_dict = {}
+ib_dict.update({
+    f"patch_ib({1})%geometry": 8,
+    f"patch_ib({1})%x_centroid": 0.0,
+    f"patch_ib({1})%y_centroid": 0.0,
+    f"patch_ib({1})%z_centroid": 0.0,
+    f"patch_ib({1})%radius": D / 2,
+    f"patch_ib({1})%slip": "F",
+    })
+
+# Configuring case dictionary
+case_dict = {
+    # Logistics
+    "run_time_info": "T",
+    # Computational Domain Parameters
+    # x direction
+    "x_domain%beg": -5.0 * D,
+    "x_domain%end": 5.0 * D,
+    # y direction
+    "y_domain%beg": -5.0 * D,
+    "y_domain%end": 5.0 * D,
+    # z direction
+    "z_domain%beg": -5.0 * D,
+    "z_domain%end": 5.0 * D,
+    "cyl_coord": "F",
+    "m": Nx,
+    "n": Ny,
+    "p": Nz,
+    "dt": dt,
+    "t_step_start": 0,
+    "t_step_stop": Nt,  # 3000
+    "t_step_save": t_save,  # 10
+    # Simulation Algorithm Parameters
+    # Only one patch is necessary
+    "num_patches": 1,
+    # Use the 5 equation model
+    "model_eqns": 2,
+    # 6 equations model does not need the K \div(u) term
+    "alt_soundspeed": "F",
+    # One fluid: air
+    "num_fluids": 1,
+    # time step
+    "mpp_lim": "F",
+    # Correct errors when computing speed of sound
+    "mixture_err": "T",
+    # Use TVD RK3 for time marching
+    "time_stepper": 3,
+    # Reconstruct the primitive variables to minimize spurious
+    # Use WENO5
+    "weno_order": 5,
+    "weno_eps": 1.0e-14,
+    "weno_Re_flux": "T",
+    "weno_avg": "T",
+    "avg_state": 2,
+    "mapped_weno": "T",
+    "null_weights": "F",
+    "mp_weno": "T",
+    "riemann_solver": 2,
+    "low_Mach": 1,
+    "wave_speeds": 1,
+    # periodic bc
+    "bc_x%beg": -1,
+    "bc_x%end": -1,
+    "bc_y%beg": -1,
+    "bc_y%end": -1,
+    "bc_z%beg": -1,
+    "bc_z%end": -1,
+    # Set IB to True and add 1 patch
+    "ib": "T",
+    "num_ibs": 1,
+    "viscous": "T",
+    # Formatted Database Files Structure Parameters
+    "format": 1,
+    "precision": 2,
+    "prim_vars_wrt": "T",
+    "E_wrt": "T",
+    "q_filtered_wrt": "T",
+    "parallel_io": "T",
+    # Patch: Constant Tube filled with air
+    # Specify the cylindrical air tube grid geometry
+    "patch_icpp(1)%geometry": 9,
+    "patch_icpp(1)%x_centroid": 0.0,
+    # Uniform medium density, centroid is at the center of the domain
+    "patch_icpp(1)%y_centroid": 0.0,
+    "patch_icpp(1)%z_centroid": 0.0,
+    "patch_icpp(1)%length_x": 10 * D,
+    "patch_icpp(1)%length_y": 10 * D,
+    "patch_icpp(1)%length_z": 10 * D,
+    # Specify the patch primitive variables
+    "patch_icpp(1)%vel(1)": v1,
+    "patch_icpp(1)%vel(2)": 0.0e00,
+    "patch_icpp(1)%vel(3)": 0.0e00,
+    "patch_icpp(1)%pres": P,
+    "patch_icpp(1)%alpha_rho(1)": rho,
+    "patch_icpp(1)%alpha(1)": 1.0e00,
+    # Patch: Sphere Immersed Boundary
+    # Fluids Physical Parameters
+    "fluid_pp(1)%gamma": 1.0e00 / (gam_a - 1.0e00),  # 2.50(Not 1.40)
+    "fluid_pp(1)%pi_inf": 0,
+    "fluid_pp(1)%Re(1)": Re,
+
+    # new case additions
+    "periodic_forcing": "T",
+    "periodic_ibs": "T",
+    "compute_CD": "F",
+    "fourier_transform_filtering": "T",
+
+    "u_inf_ref": v1,
+    "rho_inf_ref": rho,
+    "T_inf_ref": T,
+    "mu_visc": mu,
+
+    "store_levelset": "F",
+    "slab_domain_decomposition": "T", 
+    }
+
+case_dict.update(ib_dict)
+
+print(json.dumps(case_dict))
diff --git a/src/common/m_mpi_common.fpp b/src/common/m_mpi_common.fpp
index 25cd6fda5d..8214120fe7 100644
--- a/src/common/m_mpi_common.fpp
+++ b/src/common/m_mpi_common.fpp
@@ -153,7 +153,7 @@ contains
     !! @param levelset closest distance from every cell to the IB
     !! @param levelset_norm normalized vector from every cell to the closest point to the IB
     !! @param beta Eulerian void fraction from lagrangian bubbles
-    subroutine s_initialize_mpi_data(q_cons_vf, ib_markers, levelset, levelset_norm, beta)
+    subroutine s_initialize_mpi_data(q_cons_vf, ib_markers, levelset, levelset_norm, beta, R_u_stat, R_mu_stat, F_IMET_stat)
 
         type(scalar_field), &
             dimension(sys_size), &
@@ -174,6 +174,10 @@ contains
         type(scalar_field), &
             intent(in), optional :: beta
 
+        type(scalar_field), dimension(2:4), intent(in), optional :: R_u_stat
+        type(scalar_field), dimension(2:4), intent(in), optional :: R_mu_stat
+        type(scalar_field), dimension(2:4), intent(in), optional :: F_IMET_stat
+
         integer, dimension(num_dims) :: sizes_glb, sizes_loc
         integer, dimension(1) :: airfoil_glb, airfoil_loc, airfoil_start
 
@@ -187,6 +191,8 @@ contains
 
         if (present(beta)) then
             alt_sys = sys_size + 1
+        else if (present(R_u_stat) .and. present(R_mu_stat) .and. present(F_IMET_stat)) then
+            alt_sys = sys_size + 9
         else
             alt_sys = sys_size
         end if
@@ -194,6 +200,18 @@ contains
         do i = 1, sys_size
             MPI_IO_DATA%var(i)%sf => q_cons_vf(i)%sf(0:m, 0:n, 0:p)
         end do
+        
+        if (present(R_u_stat) .and. present(R_mu_stat) .and. present(F_IMET_stat)) then 
+            do i = sys_size+1, sys_size+3
+                MPI_IO_DATA%var(i)%sf => R_u_stat(i-sys_size+1)%sf(0:m, 0:n, 0:p)
+            end do
+            do i = sys_size+4, sys_size+6
+                MPI_IO_DATA%var(i)%sf => R_mu_stat(i-sys_size-2)%sf(0:m, 0:n, 0:p)
+            end do
+            do i = sys_size+7, sys_size+9 
+                MPI_IO_DATA%var(i)%sf => F_IMET_stat(i-sys_size-5)%sf(0:m, 0:n, 0:p)
+            end do 
+        end if
 
         if (present(beta)) then
             MPI_IO_DATA%var(alt_sys)%sf => beta%sf(0:m, 0:n, 0:p)
@@ -1124,7 +1142,7 @@ contains
         buffer_counts = (/ &
                         buff_size*1*(n + 1)*(p + 1), &
                         buff_size*1*(m + 2*buff_size + 1)*(p + 1), &
-                        buff_size*v_size*(m + 2*buff_size + 1)*(n + 2*buff_size + 1) &
+                        buff_size*1*(m + 2*buff_size + 1)*(n + 2*buff_size + 1) &
                         /)
 
         buffer_count = buffer_counts(mpi_dir)
@@ -1166,7 +1184,7 @@ contains
                         do k = 0, n
                             do j = 0, buff_size - 1
                                 do i = 1, 1
-                                    r = (i - 1) + v_size*(j + buff_size*(k + (n + 1)*l))
+                                    r = (i - 1) + 1*(j + buff_size*(k + (n + 1)*l))
                                     buff_send_scalarfield(r) = q_temp%sf(j + pack_offset, k, l)
                                 end do
                             end do
@@ -1178,7 +1196,7 @@ contains
                         do l = 0, p
                             do k = 0, buff_size - 1
                                 do j = -buff_size, m + buff_size
-                                    r = (i - 1) + v_size* &
+                                    r = (i - 1) + 1* &
                                         ((j + buff_size) + (m + 2*buff_size + 1)* &
                                          (k + buff_size*l))
                                     buff_send_scalarfield(r) = q_temp%sf(j, k + pack_offset, l)
@@ -1192,7 +1210,7 @@ contains
                         do l = 0, buff_size - 1
                             do k = -buff_size, n + buff_size
                                 do j = -buff_size, m + buff_size
-                                    r = (i - 1) + v_size* &
+                                    r = (i - 1) + 1* &
                                         ((j + buff_size) + (m + 2*buff_size + 1)* &
                                          ((k + buff_size) + (n + 2*buff_size + 1)*l))
                                     buff_send_scalarfield(r) = q_temp%sf(j, k, l + pack_offset)
@@ -1258,7 +1276,7 @@ contains
                         do k = 0, n
                             do j = -buff_size, -1
                                 do i = 1, 1
-                                    r = (i - 1) + v_size* &
+                                    r = (i - 1) + 1* &
                                         (j + buff_size*((k + 1) + (n + 1)*l))
                                     q_temp%sf(j + unpack_offset, k, l) = buff_recv_scalarfield(r)
 #if defined(__INTEL_COMPILER)
@@ -1277,7 +1295,7 @@ contains
                         do l = 0, p
                             do k = -buff_size, -1
                                 do j = -buff_size, m + buff_size
-                                    r = (i - 1) + v_size* &
+                                    r = (i - 1) + 1* &
                                         ((j + buff_size) + (m + 2*buff_size + 1)* &
                                          ((k + buff_size) + buff_size*l))
                                     q_temp%sf(j, k + unpack_offset, l) = buff_recv_scalarfield(r)
@@ -1298,7 +1316,7 @@ contains
                         do l = -buff_size, -1
                             do k = -buff_size, n + buff_size
                                 do j = -buff_size, m + buff_size
-                                    r = (i - 1) + v_size* &
+                                    r = (i - 1) + 1* &
                                         ((j + buff_size) + (m + 2*buff_size + 1)* &
                                          ((k + buff_size) + (n + 2*buff_size + 1)* &
                                           (l + buff_size)))
diff --git a/src/post_process/m_data_input.f90 b/src/post_process/m_data_input.f90
index 5385ef7619..69b13707b1 100644
--- a/src/post_process/m_data_input.f90
+++ b/src/post_process/m_data_input.f90
@@ -29,6 +29,7 @@ module m_data_input
  s_read_parallel_data_files, &
  s_populate_grid_variables_buffer_regions, &
  s_populate_conservative_variables_buffer_regions, &
+ s_populate_filtered_variables_buffer_regions, &
  s_finalize_data_input_module
 
     abstract interface
@@ -60,6 +61,10 @@ end subroutine s_read_abstract_data_files
     ! type(scalar_field), public :: ib_markers !<
     type(integer_field), public :: ib_markers
 
+    type(scalar_field), allocatable, dimension(:), public :: R_u_stat
+    type(scalar_field), allocatable, dimension(:), public :: R_mu_stat
+    type(scalar_field), allocatable, dimension(:), public :: F_IMET_stat
+
     procedure(s_read_abstract_data_files), pointer :: s_read_data_files => null()
 
 contains
@@ -296,6 +301,8 @@ subroutine s_read_parallel_data_files(t_step)
 
         if (bubbles_lagrange) then
             alt_sys = sys_size + 1
+        else if (q_filtered_wrt) then
+            alt_sys = sys_size + 9
         else
             alt_sys = sys_size
         end if
@@ -454,7 +461,14 @@ subroutine s_read_parallel_data_files(t_step)
 
                 ! Initialize MPI data I/O
                 if (ib) then
-                    call s_initialize_mpi_data(q_cons_vf, ib_markers)
+                    if (q_filtered_wrt) then
+                        call s_initialize_mpi_data(q_cons_vf, ib_markers, &
+                                                   R_u_stat=R_u_stat, & 
+                                                   R_mu_stat=R_mu_stat, & 
+                                                   F_IMET_stat=F_IMET_stat)
+                    else 
+                        call s_initialize_mpi_data(q_cons_vf, ib_markers)
+                    end if
                 elseif (bubbles_lagrange) then
                     call s_initialize_mpi_data(q_cons_vf, beta=q_particle(1))
                 else
@@ -481,6 +495,18 @@ subroutine s_read_parallel_data_files(t_step)
                         ! Initial displacement to skip at beginning of file
                         disp = m_MOK*max(MOK, n_MOK)*max(MOK, p_MOK)*WP_MOK*(var_MOK - 1)
 
+                        call MPI_FILE_SET_VIEW(ifile, disp, mpi_p, MPI_IO_DATA%view(i), &
+                                               'native', mpi_info_int, ierr)
+                        call MPI_FILE_READ_ALL(ifile, MPI_IO_DATA%var(i)%sf, data_size, &
+                                               mpi_p, status, ierr)
+                    end do
+                else if (q_filtered_wrt) then
+                    do i = 1, alt_sys
+                        var_MOK = int(i, MPI_OFFSET_KIND)
+
+                        ! Initial displacement to skip at beginning of file
+                        disp = m_MOK*max(MOK, n_MOK)*max(MOK, p_MOK)*WP_MOK*(var_MOK - 1)
+
                         call MPI_FILE_SET_VIEW(ifile, disp, mpi_p, MPI_IO_DATA%view(i), &
                                                'native', mpi_info_int, ierr)
                         call MPI_FILE_READ_ALL(ifile, MPI_IO_DATA%var(i)%sf, data_size, &
@@ -1302,6 +1328,224 @@ subroutine s_populate_conservative_variables_buffer_regions(q_particle)
 
     end subroutine s_populate_conservative_variables_buffer_regions
 
+    subroutine s_populate_filtered_variables_buffer_regions(q_particle)
+        ! Fill the buffer cells of R_u_stat, R_mu_stat and F_IMET_stat (indices 2:4), or of q_particle if present.
+        type(scalar_field), intent(inout), optional :: q_particle
+
+        integer :: i, j, k !< Generic loop iterators
+        ! NOTE(review): non-periodic branches exchange q_cons_vf, not the *_stat fields -- confirm this is intended.
+        ! Populating Buffer Regions in the x-direction
+
+        ! Periodic BC at the beginning
+        if (bc_x%beg == BC_PERIODIC) then
+
+            do j = 1, buff_size
+                if (present(q_particle)) then
+                    q_particle%sf(-j, 0:n, 0:p) = &
+                        q_particle%sf((m + 1) - j, 0:n, 0:p)
+                else
+                    do i = 2, 4
+                        R_u_stat(i)%sf(-j, 0:n, 0:p) = &
+                            R_u_stat(i)%sf((m + 1) - j, 0:n, 0:p)
+                        R_mu_stat(i)%sf(-j, 0:n, 0:p) = &
+                            R_mu_stat(i)%sf((m + 1) - j, 0:n, 0:p)
+                        F_IMET_stat(i)%sf(-j, 0:n, 0:p) = &
+                            F_IMET_stat(i)%sf((m + 1) - j, 0:n, 0:p)
+                    end do
+                end if
+            end do
+
+            ! Processor BC at the beginning
+        else
+            if (present(q_particle)) then
+                call s_mpi_sendrecv_cons_vars_buffer_regions(q_cons_vf, &
+                                                             'beg', 'x', q_particle)
+            else
+                call s_mpi_sendrecv_cons_vars_buffer_regions(q_cons_vf, &
+                                                             'beg', 'x')
+            end if
+
+        end if
+
+        ! Periodic BC at the end
+        if (bc_x%end == BC_PERIODIC) then
+
+            do j = 1, buff_size
+                if (present(q_particle)) then
+                    q_particle%sf(m + j, 0:n, 0:p) = &
+                        q_particle%sf(j - 1, 0:n, 0:p)
+                else
+                    do i = 2, 4
+                        R_u_stat(i)%sf(m + j, 0:n, 0:p) = &
+                            R_u_stat(i)%sf(j - 1, 0:n, 0:p)
+                        R_mu_stat(i)%sf(m + j, 0:n, 0:p) = &
+                            R_mu_stat(i)%sf(j - 1, 0:n, 0:p)
+                        F_IMET_stat(i)%sf(m + j, 0:n, 0:p) = &
+                            F_IMET_stat(i)%sf(j - 1, 0:n, 0:p)
+                    end do
+                end if
+            end do
+
+            ! Processor BC at the end
+        else
+
+            if (present(q_particle)) then
+                call s_mpi_sendrecv_cons_vars_buffer_regions(q_cons_vf, &
+                                                             'end', 'x', q_particle)
+            else
+                call s_mpi_sendrecv_cons_vars_buffer_regions(q_cons_vf, &
+                                                             'end', 'x')
+            end if
+
+        end if
+
+        ! END: Populating Buffer Regions in the x-direction
+
+        ! Populating Buffer Regions in the y-direction
+
+        if (n > 0) then
+
+            ! Periodic BC at the beginning
+            if (bc_y%beg == BC_PERIODIC) then
+
+                do j = 1, buff_size
+                    if (present(q_particle)) then
+                        q_particle%sf(:, -j, 0:p) = &
+                            q_particle%sf(:, (n + 1) - j, 0:p)
+                    else
+                        do i = 2, 4
+                            R_u_stat(i)%sf(:, -j, 0:p) = &
+                                R_u_stat(i)%sf(:, (n + 1) - j, 0:p)
+                            R_mu_stat(i)%sf(:, -j, 0:p) = &
+                                R_mu_stat(i)%sf(:, (n + 1) - j, 0:p)
+                            F_IMET_stat(i)%sf(:, -j, 0:p) = &
+                                F_IMET_stat(i)%sf(:, (n + 1) - j, 0:p)
+                        end do
+                    end if
+                end do
+
+                ! Processor BC at the beginning
+            else
+                if (present(q_particle)) then
+                    call s_mpi_sendrecv_cons_vars_buffer_regions(q_cons_vf, &
+                                                                 'beg', 'y', q_particle)
+                else
+                    call s_mpi_sendrecv_cons_vars_buffer_regions(q_cons_vf, &
+                                                                 'beg', 'y')
+                end if
+
+            end if
+
+            ! Periodic BC at the end
+            if (bc_y%end == BC_PERIODIC) then
+
+                do j = 1, buff_size
+                    if (present(q_particle)) then
+                        q_particle%sf(:, n + j, 0:p) = &
+                            q_particle%sf(:, j - 1, 0:p)
+                    else
+                        do i = 2, 4
+                            R_u_stat(i)%sf(:, n + j, 0:p) = &
+                                R_u_stat(i)%sf(:, j - 1, 0:p)
+                            R_mu_stat(i)%sf(:, n + j, 0:p) = &
+                                R_mu_stat(i)%sf(:, j - 1, 0:p)
+                            F_IMET_stat(i)%sf(:, n + j, 0:p) = &
+                                F_IMET_stat(i)%sf(:, j - 1, 0:p)
+                        end do
+                    end if
+                end do
+
+                ! Processor BC at the end
+            else
+
+                if (present(q_particle)) then
+                    call s_mpi_sendrecv_cons_vars_buffer_regions(q_cons_vf, &
+                                                                 'end', 'y', q_particle)
+                else
+                    call s_mpi_sendrecv_cons_vars_buffer_regions(q_cons_vf, &
+                                                                 'end', 'y')
+                end if
+
+            end if
+
+            ! END: Populating Buffer Regions in the y-direction
+
+            ! Populating Buffer Regions in the z-direction
+
+            if (p > 0) then
+
+                ! Periodic BC at the beginning
+                if (bc_z%beg == BC_PERIODIC) then
+
+                    do j = 1, buff_size
+                        if (present(q_particle)) then
+                            q_particle%sf(:, :, -j) = &
+                                q_particle%sf(:, :, (p + 1) - j)
+                        else
+                            do i = 2, 4
+                                R_u_stat(i)%sf(:, :, -j) = &
+                                    R_u_stat(i)%sf(:, :, (p + 1) - j)
+                                R_mu_stat(i)%sf(:, :, -j) = &
+                                    R_mu_stat(i)%sf(:, :, (p + 1) - j)
+                                F_IMET_stat(i)%sf(:, :, -j) = &
+                                    F_IMET_stat(i)%sf(:, :, (p + 1) - j)
+                            end do
+                        end if
+                    end do
+
+                    ! Processor BC at the beginning
+                else
+
+                    if (present(q_particle)) then
+                        call s_mpi_sendrecv_cons_vars_buffer_regions(q_cons_vf, &
+                                                                     'beg', 'z', q_particle)
+                    else
+                        call s_mpi_sendrecv_cons_vars_buffer_regions(q_cons_vf, &
+                                                                     'beg', 'z')
+                    end if
+
+                end if
+
+                ! Periodic BC at the end
+                if (bc_z%end == BC_PERIODIC) then
+
+                    do j = 1, buff_size
+                        if (present(q_particle)) then
+                            q_particle%sf(:, :, p + j) = &
+                                q_particle%sf(:, :, j - 1)
+                        else
+                            do i = 2, 4
+                                R_u_stat(i)%sf(:, :, p + j) = &
+                                    R_u_stat(i)%sf(:, :, j - 1)
+                                R_mu_stat(i)%sf(:, :, p + j) = &
+                                    R_mu_stat(i)%sf(:, :, j - 1)
+                                F_IMET_stat(i)%sf(:, :, p + j) = &
+                                    F_IMET_stat(i)%sf(:, :, j - 1)
+                            end do
+                        end if
+                    end do
+
+                    ! Processor BC at the end
+                else
+
+                    if (present(q_particle)) then
+                        call s_mpi_sendrecv_cons_vars_buffer_regions(q_cons_vf, &
+                                                                     'end', 'z', q_particle)
+                    else
+                        call s_mpi_sendrecv_cons_vars_buffer_regions(q_cons_vf, &
+                                                                     'end', 'z')
+                    end if
+
+                end if
+
+            end if
+
+        end if
+
+        ! END: Populating Buffer Regions in the z-direction
+
+    end subroutine s_populate_filtered_variables_buffer_regions
+
     !>  Computation of parameters, allocation procedures, and/or
         !!      any other tasks needed to properly setup the module
     subroutine s_initialize_data_input_module
@@ -1315,6 +1559,10 @@ subroutine s_initialize_data_input_module
         allocate (q_prim_vf(1:sys_size))
         if (bubbles_lagrange) allocate (q_particle(1))
 
+        if (q_filtered_wrt) allocate (R_u_stat(2:4))
+        if (q_filtered_wrt) allocate (R_mu_stat(2:4))
+        if (q_filtered_wrt) allocate (F_IMET_stat(2:4))
+
         ! Allocating the parts of the conservative and primitive variables
         ! that do require the direct knowledge of the dimensionality of the
         ! simulation
@@ -1352,6 +1600,20 @@ subroutine s_initialize_data_input_module
                                         -buff_size:p + buff_size))
                 end if
 
+                if (q_filtered_wrt) then
+                    do i = 2, 4
+                        allocate (R_u_stat(i)%sf(-buff_size:m + buff_size, &
+                                                     -buff_size:n + buff_size, &
+                                                     -buff_size:p + buff_size))
+                        allocate (R_mu_stat(i)%sf(-buff_size:m + buff_size, &
+                                                     -buff_size:n + buff_size, &
+                                                     -buff_size:p + buff_size))
+                        allocate (F_IMET_stat(i)%sf(-buff_size:m + buff_size, &
+                                                     -buff_size:n + buff_size, &
+                                                     -buff_size:p + buff_size))
+                    end do
+                end if
+                
                 ! Simulation is 2D
             else
 
@@ -1444,6 +1706,21 @@ subroutine s_finalize_data_input_module
             deallocate (q_T_sf%sf)
         end if
 
+        if (q_filtered_wrt) then 
+            do i = 2, 4 
+                deallocate (R_u_stat(i)%sf)
+            end do 
+            deallocate(R_u_stat)
+            do i = 2, 4 
+                deallocate (R_mu_stat(i)%sf)
+            end do 
+            deallocate(R_mu_stat)
+            do i = 2, 4 
+                deallocate (F_IMET_stat(i)%sf)
+            end do 
+            deallocate(F_IMET_stat)
+        end if
+
         s_read_data_files => null()
 
     end subroutine s_finalize_data_input_module
diff --git a/src/post_process/m_global_parameters.fpp b/src/post_process/m_global_parameters.fpp
index 9db5321c55..ec6a3ca3f9 100644
--- a/src/post_process/m_global_parameters.fpp
+++ b/src/post_process/m_global_parameters.fpp
@@ -322,6 +322,7 @@ module m_global_parameters
     logical :: periodic_ibs
     logical :: store_levelset
     logical :: slab_domain_decomposition
+    logical :: q_filtered_wrt
 
 contains
 
@@ -467,6 +468,7 @@ contains
         periodic_ibs = .false.
         store_levelset = .true.
         slab_domain_decomposition = .false.
+        q_filtered_wrt = .false.
 
     end subroutine s_assign_default_values_to_user_inputs
 
@@ -778,6 +780,13 @@ contains
                 allocate (MPI_IO_DATA%var(i)%sf(0:m, 0:n, 0:p))
                 MPI_IO_DATA%var(i)%sf => null()
             end do
+        else if (q_filtered_wrt) then
+            allocate (MPI_IO_DATA%view(1:sys_size+9))
+            allocate (MPI_IO_DATA%var(1:sys_size+9))
+            do i = 1, sys_size+9
+                allocate (MPI_IO_DATA%var(i)%sf(0:m, 0:n, 0:p))
+                MPI_IO_DATA%var(i)%sf => null()
+            end do
         else
             allocate (MPI_IO_DATA%view(1:sys_size))
             allocate (MPI_IO_DATA%var(1:sys_size))
@@ -964,6 +973,12 @@ contains
 
             if (bubbles_lagrange) MPI_IO_DATA%var(sys_size + 1)%sf => null()
 
+            if (q_filtered_wrt) then 
+                do i = sys_size+1, sys_size+9
+                    MPI_IO_DATA%var(i)%sf => null()
+                end do
+            end if
+
             deallocate (MPI_IO_DATA%var)
             deallocate (MPI_IO_DATA%view)
         end if
diff --git a/src/post_process/m_mpi_proxy.fpp b/src/post_process/m_mpi_proxy.fpp
index 9e368d7fa4..357bb326a7 100644
--- a/src/post_process/m_mpi_proxy.fpp
+++ b/src/post_process/m_mpi_proxy.fpp
@@ -172,7 +172,7 @@ contains
             & 'surface_tension', 'hyperelasticity', 'bubbles_lagrange',        &
             & 'rkck_adap_dt', 'output_partial_domain', 'relativity',           &
             & 'cont_damage', 'periodic_ibs', 'store_levelset',                 &
-            & 'slab_domain_decomposition' ]
+            & 'slab_domain_decomposition', 'q_filtered_wrt' ]
             call MPI_BCAST(${VAR}$, 1, MPI_LOGICAL, 0, MPI_COMM_WORLD, ierr)
         #:endfor
 
diff --git a/src/post_process/m_start_up.f90 b/src/post_process/m_start_up.f90
index f04efd75e6..8a61c72e07 100644
--- a/src/post_process/m_start_up.f90
+++ b/src/post_process/m_start_up.f90
@@ -85,7 +85,8 @@ subroutine s_read_input_file
             cfl_adap_dt, cfl_const_dt, t_save, t_stop, n_start, &
             cfl_target, surface_tension, bubbles_lagrange, rkck_adap_dt, &
             sim_data, hyperelasticity, Bx0, relativity, cont_damage, & 
-            periodic_ibs, store_levelset, slab_domain_decomposition
+            periodic_ibs, store_levelset, slab_domain_decomposition, &
+            q_filtered_wrt
 
         ! Inquiring the status of the post_process.inp file
         file_loc = 'post_process.inp'
@@ -179,6 +180,7 @@ subroutine s_perform_time_step(t_step)
         ! Populating the buffer regions of the conservative variables
         if (buff_size > 0) then
             call s_populate_conservative_variables_buffer_regions()
+            if (q_filtered_wrt) call s_populate_filtered_variables_buffer_regions()
             if (bubbles_lagrange) call s_populate_conservative_variables_buffer_regions(q_particle(1))
         end if
 
@@ -323,6 +325,32 @@ subroutine s_save_data(t_step, varname, pres, c, H)
             end if
         end do
 
+        ! Adding filtered quantities
+        if (q_filtered_wrt) then
+            ! statistics fields R_u_stat, R_mu_stat and F_IMET_stat (indices 2:4 each)
+            do i = 2, 4
+                q_sf = R_u_stat(i)%sf(x_beg:x_end, y_beg:y_end, z_beg:z_end)
+                write (varname, '(A,I0)') 'R_u_stats', i
+                call s_write_variable_to_formatted_database_file(varname, t_step)
+
+                varname(:) = ' '
+            end do
+            do i = 2, 4
+                q_sf = R_mu_stat(i)%sf(x_beg:x_end, y_beg:y_end, z_beg:z_end)
+                write (varname, '(A,I0)') 'R_mu_stats', i
+                call s_write_variable_to_formatted_database_file(varname, t_step)
+
+                varname(:) = ' '
+            end do
+            do i = 2, 4
+                q_sf = F_IMET_stat(i)%sf(x_beg:x_end, y_beg:y_end, z_beg:z_end)
+                write (varname, '(A,I0)') 'F_IMET_stats', i
+                call s_write_variable_to_formatted_database_file(varname, t_step)
+
+                varname(:) = ' '
+            end do
+        end if
+
         ! Adding the species' concentrations to the formatted database file
         if (chemistry) then
             do i = 1, num_species
diff --git a/src/simulation/m_compute_statistics.fpp b/src/simulation/m_compute_statistics.fpp
new file mode 100644
index 0000000000..bcd6732cf2
--- /dev/null
+++ b/src/simulation/m_compute_statistics.fpp
@@ -0,0 +1,156 @@
+#:include 'macros.fpp'
+
+!> @brief Running single-pass estimates of the variance, skewness and excess
+!!        kurtosis of scalar fields, refreshed each time a new sample is added
+module m_compute_statistics
+    use m_derived_types
+
+    use m_global_parameters
+
+    use m_mpi_proxy
+
+    implicit none
+
+    private; public :: s_initialize_statistics_module, s_finalize_statistics_module, s_compute_s_order_statistics
+
+    !> Running mean of each tracked quantity (first index: quantity id, 1 to 3)
+    type(scalar_field), allocatable, dimension(:) :: xnbar_stat
+
+    !> Newest sample minus the previous running mean, per quantity id
+    type(scalar_field), allocatable, dimension(:) :: delta_stat
+
+    !> Running 2nd-, 3rd- and 4th-order sums about the mean, stored in vf(2:4) per quantity id
+    type(vector_field), allocatable, dimension(:) :: Msn_stat
+
+    !$acc declare create(xnbar_stat, delta_stat, Msn_stat)
+
+contains
+
+    !> Allocates the running-mean, increment and moment-sum fields on cells
+        !! (0:m, 0:n, 0:p) for each of the three tracked quantities
+    subroutine s_initialize_statistics_module
+        integer :: i, j
+
+        @:ALLOCATE(xnbar_stat(1:3))
+        do i = 1, 3
+            @:ALLOCATE(xnbar_stat(i)%sf(0:m, 0:n, 0:p))
+            @:ACC_SETUP_SFs(xnbar_stat(i))
+        end do
+
+        @:ALLOCATE(delta_stat(1:3))
+        do i = 1, 3
+            @:ALLOCATE(delta_stat(i)%sf(0:m, 0:n, 0:p))
+            @:ACC_SETUP_SFs(delta_stat(i))
+        end do
+
+        ! Sized 1:3 like the other work arrays: the first index is the quantity
+        ! id rather than a spatial direction, so it must not depend on num_dims
+        @:ALLOCATE(Msn_stat(1:3))
+        do i = 1, 3
+            @:ALLOCATE(Msn_stat(i)%vf(2:4))
+            do j = 2, 4
+                @:ALLOCATE(Msn_stat(i)%vf(j)%sf(0:m, 0:n, 0:p))
+            end do
+            @:ACC_SETUP_VFs(Msn_stat(i))
+        end do
+
+    end subroutine s_initialize_statistics_module
+
+    !> Folds one new sample of a scalar field into the running sums of
+        !! quantity id and refreshes the statistics derived from them
+        !! @param q_temp New sample; read on cells (0:m, 0:n, 0:p)
+        !! @param n_step Number of samples including this one; 1 restarts the accumulation
+        !! @param s_order_stat Output fields: (2) sample variance, (3) skewness, (4) excess kurtosis
+        !! @param id Index (1 to 3) selecting which set of running sums is updated
+    subroutine s_compute_s_order_statistics(q_temp, n_step, s_order_stat, id)
+        type(scalar_field), intent(in) :: q_temp
+        integer, intent(in) :: n_step
+        type(scalar_field), dimension(2:4), intent(inout) :: s_order_stat
+        integer, intent(in) :: id
+        real(wp) :: ns
+        integer :: i, j, k
+
+        ns = real(n_step, wp)
+
+        if (n_step == 1) then
+            ! First sample: it is the mean and every higher-order sum is zero
+            !$acc parallel loop collapse(3) gang vector default(present)
+            do i = 0, m
+                do j = 0, n
+                    do k = 0, p
+                        xnbar_stat(id)%sf(i, j, k) = q_temp%sf(i, j, k)
+                        Msn_stat(id)%vf(2)%sf(i, j, k) = 0.0_wp
+                        Msn_stat(id)%vf(3)%sf(i, j, k) = 0.0_wp
+                        Msn_stat(id)%vf(4)%sf(i, j, k) = 0.0_wp
+                        s_order_stat(2)%sf(i, j, k) = 0.0_wp
+                        s_order_stat(3)%sf(i, j, k) = 0.0_wp
+                        s_order_stat(4)%sf(i, j, k) = 0.0_wp
+                    end do
+                end do
+            end do
+        else
+            !$acc parallel loop collapse(3) gang vector default(present) copyin(ns)
+            do i = 0, m
+                do j = 0, n
+                    do k = 0, p
+                        delta_stat(id)%sf(i, j, k) = q_temp%sf(i, j, k) - xnbar_stat(id)%sf(i, j, k)
+
+                        xnbar_stat(id)%sf(i, j, k) = xnbar_stat(id)%sf(i, j, k) + delta_stat(id)%sf(i, j, k)/ns
+
+                        ! The 4th-order sum needs the previous 2nd- and 3rd-order sums and
+                        ! the 3rd needs the previous 2nd, hence the descending update order
+                        Msn_stat(id)%vf(4)%sf(i, j, k) = Msn_stat(id)%vf(4)%sf(i, j, k) &
+                                                + (delta_stat(id)%sf(i, j, k)**4)*(ns - 1.0_wp)*(ns**2 - 3.0_wp*ns + 3.0_wp)/(ns**3) &
+                                                + 6.0_wp*(delta_stat(id)%sf(i, j, k)**2)*Msn_stat(id)%vf(2)%sf(i, j, k)/(ns**2) &
+                                                - 4.0_wp*delta_stat(id)%sf(i, j, k)*Msn_stat(id)%vf(3)%sf(i, j, k)/ns
+
+                        Msn_stat(id)%vf(3)%sf(i, j, k) = Msn_stat(id)%vf(3)%sf(i, j, k) &
+                                                + (delta_stat(id)%sf(i, j, k)**3)*(ns - 1.0_wp)*(ns - 2.0_wp)/(ns**2) &
+                                                - 3.0_wp*delta_stat(id)%sf(i, j, k)*Msn_stat(id)%vf(2)%sf(i, j, k)/ns
+
+                        Msn_stat(id)%vf(2)%sf(i, j, k) = Msn_stat(id)%vf(2)%sf(i, j, k) &
+                                                + (delta_stat(id)%sf(i, j, k)**2)*(ns - 1.0_wp)/ns
+
+                        s_order_stat(2)%sf(i, j, k) = Msn_stat(id)%vf(2)%sf(i, j, k)/(ns - 1.0_wp)
+
+                        ! Skewness and kurtosis divide by powers of the 2nd-order sum;
+                        ! store 0 where it vanishes rather than the NaN from 0/0
+                        if (Msn_stat(id)%vf(2)%sf(i, j, k) > 0.0_wp) then
+                            s_order_stat(3)%sf(i, j, k) = sqrt(ns)*Msn_stat(id)%vf(3)%sf(i, j, k)/(Msn_stat(id)%vf(2)%sf(i, j, k)**1.5_wp)
+                            s_order_stat(4)%sf(i, j, k) = ns*Msn_stat(id)%vf(4)%sf(i, j, k)/(Msn_stat(id)%vf(2)%sf(i, j, k)**2) - 3.0_wp
+                        else
+                            s_order_stat(3)%sf(i, j, k) = 0.0_wp
+                            s_order_stat(4)%sf(i, j, k) = 0.0_wp
+                        end if
+                    end do
+                end do
+            end do
+        end if
+
+    end subroutine s_compute_s_order_statistics
+
+    !> Releases every field allocated by s_initialize_statistics_module
+    subroutine s_finalize_statistics_module
+        integer :: i, j
+
+        do i = 1, 3
+            @:DEALLOCATE(xnbar_stat(i)%sf)
+        end do
+        @:DEALLOCATE(xnbar_stat)
+
+        do i = 1, 3
+            @:DEALLOCATE(delta_stat(i)%sf)
+        end do
+        @:DEALLOCATE(delta_stat)
+
+        do i = 1, 3
+            do j = 2, 4
+                @:DEALLOCATE(Msn_stat(i)%vf(j)%sf)
+            end do
+            @:DEALLOCATE(Msn_stat(i)%vf)
+        end do
+        @:DEALLOCATE(Msn_stat)
+
+    end subroutine s_finalize_statistics_module
+
+end module m_compute_statistics
\ No newline at end of file
diff --git a/src/simulation/m_data_output.fpp b/src/simulation/m_data_output.fpp
index 56b21207ff..63b8fa2b32 100644
--- a/src/simulation/m_data_output.fpp
+++ b/src/simulation/m_data_output.fpp
@@ -76,7 +76,7 @@ contains
         !! @param q_cons_vf Conservative variables
         !! @param q_prim_vf Primitive variables
         !! @param t_step Current time step
-    subroutine s_write_data_files(q_cons_vf, q_T_sf, q_prim_vf, t_step, beta)
+    subroutine s_write_data_files(q_cons_vf, q_T_sf, q_prim_vf, t_step, beta, R_u_stat, R_mu_stat, F_IMET_stat)
 
         type(scalar_field), &
             dimension(sys_size), &
@@ -94,10 +94,14 @@ contains
         type(scalar_field), &
             intent(inout), optional :: beta
 
+        type(scalar_field), dimension(2:4), intent(inout), optional :: R_u_stat
+        type(scalar_field), dimension(2:4), intent(inout), optional :: R_mu_stat
+        type(scalar_field), dimension(2:4), intent(inout), optional :: F_IMET_stat
+
         if (.not. parallel_io) then
             call s_write_serial_data_files(q_cons_vf, q_T_sf, q_prim_vf, t_step, beta)
         else
-            call s_write_parallel_data_files(q_cons_vf, q_prim_vf, t_step, beta)
+            call s_write_parallel_data_files(q_cons_vf, q_prim_vf, t_step, beta, R_u_stat, R_mu_stat, F_IMET_stat)
         end if
 
     end subroutine s_write_data_files
@@ -786,12 +790,15 @@ contains
         !!  @param q_prim_vf Cell-average primitive variables
         !!  @param t_step Current time-step
         !!  @param beta Eulerian void fraction from lagrangian bubbles
-    subroutine s_write_parallel_data_files(q_cons_vf, q_prim_vf, t_step, beta)
+    subroutine s_write_parallel_data_files(q_cons_vf, q_prim_vf, t_step, beta, R_u_stat, R_mu_stat, F_IMET_stat)
 
         type(scalar_field), dimension(sys_size), intent(in) :: q_cons_vf
         type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf
         integer, intent(in) :: t_step
         type(scalar_field), intent(inout), optional :: beta
+        type(scalar_field), dimension(2:4), intent(inout), optional :: R_u_stat
+        type(scalar_field), dimension(2:4), intent(inout), optional :: R_mu_stat
+        type(scalar_field), dimension(2:4), intent(inout), optional :: F_IMET_stat
 
 #ifdef MFC_MPI
 
@@ -813,6 +820,8 @@ contains
 
         if (present(beta)) then
             alt_sys = sys_size + 1
+        else if (present(R_u_stat) .and. present(R_mu_stat) .and. present(F_IMET_stat)) then
+            alt_sys = sys_size + 9
         else
             alt_sys = sys_size
         end if
@@ -896,7 +905,12 @@ contains
             ! Initialize MPI data I/O
 
             if (ib) then
-                call s_initialize_mpi_data(q_cons_vf, ib_markers, levelset, levelset_norm)
+                if (present(R_u_stat) .and. present(R_mu_stat) .and. present(F_IMET_stat)) then
+                    call s_initialize_mpi_data(q_cons_vf, ib_markers, levelset, levelset_norm, & 
+                                               R_u_stat=R_u_stat, R_mu_stat=R_mu_stat, F_IMET_stat=F_IMET_stat)
+                else
+                    call s_initialize_mpi_data(q_cons_vf, ib_markers, levelset, levelset_norm)
+                end if
             elseif (present(beta)) then
                 call s_initialize_mpi_data(q_cons_vf, beta=beta)
             else
@@ -951,6 +965,18 @@ contains
                                                 mpi_p, status, ierr)
                     end do
                 end if
+            else if (fourier_transform_filtering) then
+                do i = 1, alt_sys
+                    var_MOK = int(i, MPI_OFFSET_KIND)
+
+                    ! Initial displacement to skip at beginning of file
+                    disp = m_MOK*max(MOK, n_MOK)*max(MOK, p_MOK)*WP_MOK*(var_MOK - 1)
+
+                    call MPI_FILE_SET_VIEW(ifile, disp, mpi_p, MPI_IO_DATA%view(i), &
+                                           'native', mpi_info_int, ierr)
+                    call MPI_FILE_WRITE_ALL(ifile, MPI_IO_DATA%var(i)%sf, data_size, &
+                                            mpi_p, status, ierr)
+                end do
             else
                 do i = 1, sys_size !TODO: check if correct (sys_size
                     var_MOK = int(i, MPI_OFFSET_KIND)
diff --git a/src/simulation/m_global_parameters.fpp b/src/simulation/m_global_parameters.fpp
index 0ae8d7763e..0158af546f 100644
--- a/src/simulation/m_global_parameters.fpp
+++ b/src/simulation/m_global_parameters.fpp
@@ -1153,6 +1153,9 @@ contains
         elseif (bubbles_lagrange) then
             allocate (MPI_IO_DATA%view(1:sys_size + 1))
             allocate (MPI_IO_DATA%var(1:sys_size + 1))
+        else if (fourier_transform_filtering) then 
+            allocate (MPI_IO_DATA%view(1:sys_size+9))
+            allocate (MPI_IO_DATA%var(1:sys_size+9))
         else
             allocate (MPI_IO_DATA%view(1:sys_size))
             allocate (MPI_IO_DATA%var(1:sys_size))
@@ -1172,6 +1175,11 @@ contains
                 allocate (MPI_IO_DATA%var(i)%sf(0:m, 0:n, 0:p))
                 MPI_IO_DATA%var(i)%sf => null()
             end do
+        else if (fourier_transform_filtering) then 
+            do i = sys_size+1, sys_size+9
+                allocate (MPI_IO_DATA%var(i)%sf(0:m, 0:n, 0:p))
+                MPI_IO_DATA%var(i)%sf => null()
+            end do
         end if
 
         ! Configuring the WENO average flag that will be used to regulate
@@ -1344,6 +1352,10 @@ contains
                 do i = 1, sys_size + 1
                     MPI_IO_DATA%var(i)%sf => null()
                 end do
+            else if (fourier_transform_filtering) then 
+                do i = 1, sys_size+9
+                    MPI_IO_DATA%var(i)%sf => null()
+                end do
             else
                 do i = 1, sys_size
                     MPI_IO_DATA%var(i)%sf => null()
diff --git a/src/simulation/m_rhs.fpp b/src/simulation/m_rhs.fpp
index 626aed96ce..884e6a07ad 100644
--- a/src/simulation/m_rhs.fpp
+++ b/src/simulation/m_rhs.fpp
@@ -999,7 +999,7 @@ contains
             end do
 
             ! particle forces loop, x-dir
-            if (compute_CD .and. present(div_pres_visc_stress)) then
+            if ((compute_CD .or. fourier_transform_filtering) .and. present(div_pres_visc_stress)) then
                 !$acc parallel loop collapse(3) gang vector default(present)
                 do k = 0, p
                     do j = 0, n 
@@ -1128,7 +1128,7 @@ contains
             end do
 
             ! particle forces loop, y-dir
-            if (compute_CD .and. present(div_pres_visc_stress)) then
+            if ((compute_CD .or. fourier_transform_filtering) .and. present(div_pres_visc_stress)) then
                 !$acc parallel loop collapse(3) gang vector default(present)
                 do k = 0, p 
                     do j = 0, n 
@@ -1353,7 +1353,7 @@ contains
             end if
 
             ! particle forces loop, z-dir
-            if (compute_CD .and. present(div_pres_visc_stress)) then
+            if ((compute_CD .or. fourier_transform_filtering) .and. present(div_pres_visc_stress)) then
                 !$acc parallel loop collapse(3) gang vector default(present)
                 do k = 0, p 
                     do j = 0, n 
@@ -1596,7 +1596,7 @@ contains
             end do
 
             ! particle momentum exchange, viscous stress tensor, x-dir
-            if (compute_CD .and. present(div_pres_visc_stress)) then
+            if ((compute_CD .or. fourier_transform_filtering) .and. present(div_pres_visc_stress)) then
                 !$acc parallel loop collapse(3) gang vector default(present)
                 do k = 0, p 
                     do j = 0, n 
@@ -1695,7 +1695,7 @@ contains
             end if
 
             ! particle momentum exchange, viscous stress tensor, y-dir
-            if (compute_CD .and. present(div_pres_visc_stress)) then
+            if ((compute_CD .or. fourier_transform_filtering) .and. present(div_pres_visc_stress)) then
                 !$acc parallel loop collapse(3) gang vector default(present)
                 do k = 0, p 
                     do j = 0, n 
@@ -1797,7 +1797,7 @@ contains
             end do
 
             ! particle momentum exchange, viscous stress tensor, z-dir
-            if (compute_CD .and. present(div_pres_visc_stress)) then
+            if ((compute_CD .or. fourier_transform_filtering) .and. present(div_pres_visc_stress)) then
                 !$acc parallel loop collapse(3) gang vector default(present)
                 do k = 0, p 
                     do j = 0, n 
diff --git a/src/simulation/m_start_up.fpp b/src/simulation/m_start_up.fpp
index d2e9e344a3..4c3fd33495 100644
--- a/src/simulation/m_start_up.fpp
+++ b/src/simulation/m_start_up.fpp
@@ -1420,6 +1420,11 @@ contains
 
         call cpu_time(start)
         call nvtxStartRange("SAVE-DATA")
+        do i = 2, 4 
+            !$acc update host(R_u_stat(i)%sf)
+            !$acc update host(R_mu_stat(i)%sf)
+            !$acc update host(F_IMET_stat(i)%sf)
+        end do  
         do i = 1, sys_size
             !$acc update host(q_cons_ts(1)%vf(i)%sf)
             do l = 0, p
@@ -1452,7 +1457,12 @@ contains
             call s_write_restart_lag_bubbles(save_count) !parallel
             if (lag_params%write_bubbles_stats) call s_write_lag_bubble_stats()
         else
-            call s_write_data_files(q_cons_ts(1)%vf, q_T_sf, q_prim_vf, save_count)
+            if (fourier_transform_filtering) then
+                call s_write_data_files(q_cons_ts(1)%vf, q_T_sf, q_prim_vf, save_count, &
+                                        R_u_stat=R_u_stat, R_mu_stat=R_mu_stat, F_IMET_stat=F_IMET_stat)
+            else
+                call s_write_data_files(q_cons_ts(1)%vf, q_T_sf, q_prim_vf, save_count)
+            end if
         end if
 
         call nvtxEndRange
@@ -1571,6 +1581,8 @@ contains
         call s_initialize_additional_forcing_module()
         if (fourier_transform_filtering) call s_initialize_fftw_explicit_filter_module()
 
+        call s_initialize_statistics_module()
+
     end subroutine s_initialize_modules
 
     subroutine s_initialize_mpi_domain
diff --git a/src/simulation/m_time_steppers.fpp b/src/simulation/m_time_steppers.fpp
index 8291e2d9e7..343a542e4b 100644
--- a/src/simulation/m_time_steppers.fpp
+++ b/src/simulation/m_time_steppers.fpp
@@ -52,6 +52,8 @@ module m_time_steppers
 
     use m_volume_filtering
 
+    use m_compute_statistics
+
     implicit none
 
     type(vector_field), allocatable, dimension(:) :: q_cons_ts !<
@@ -89,8 +91,22 @@ module m_time_steppers
 
     type(scalar_field), allocatable, dimension(:) :: q_cons_filtered
 
+    type(vector_field), allocatable, dimension(:) :: pt_Re_stress
+    type(vector_field), allocatable, dimension(:) :: R_mu
+    type(scalar_field), allocatable, dimension(:) :: pres_visc_stress_filtered
+
+    type(scalar_field) :: mag_div_Ru
+    type(scalar_field) :: mag_div_R_mu
+    type(scalar_field) :: mag_F_IMET
+
+    type(scalar_field), allocatable, dimension(:) :: R_u_stat
+    type(scalar_field), allocatable, dimension(:) :: R_mu_stat
+    type(scalar_field), allocatable, dimension(:) :: F_IMET_stat
+
     !$acc declare create(q_cons_ts, q_prim_vf, q_T_sf, rhs_vf, rhs_ts_rkck, q_prim_ts, rhs_mv, rhs_pb, max_dt)
-    !$acc declare create(div_pres_visc_stress)
+    !$acc declare create(div_pres_visc_stress, q_cons_filtered, pt_Re_stress, R_mu, pres_visc_stress_filtered)
+    !$acc declare create(mag_div_Ru, mag_div_R_mu, mag_F_IMET)
+    !$acc declare create(R_u_stat, R_mu_stat, F_IMET_stat)
 
 contains
 
@@ -366,7 +382,7 @@ contains
             end do
         end do
 
-        if (compute_CD) then
+        if (compute_CD .or. fourier_transform_filtering) then
             @:ALLOCATE(div_pres_visc_stress(momxb:momxe))
             do i = momxb, momxe
                 @:ALLOCATE(div_pres_visc_stress(i)%sf(idwbuff(1)%beg:idwbuff(1)%end, &
@@ -384,8 +400,73 @@ contains
                     idwbuff(3)%beg:idwbuff(3)%end))
                 @:ACC_SETUP_SFs(q_cons_filtered(i))
             end do
+
+            @:ALLOCATE(pt_Re_stress(1:num_dims))
+            do i = 1, num_dims
+                @:ALLOCATE(pt_Re_stress(i)%vf(1:num_dims))
+            end do
+            do i = 1, num_dims
+                do j = 1, num_dims
+                    @:ALLOCATE(pt_Re_stress(i)%vf(j)%sf(idwbuff(1)%beg:idwbuff(1)%end, &
+                        idwbuff(2)%beg:idwbuff(2)%end, &
+                        idwbuff(3)%beg:idwbuff(3)%end))
+                end do
+                @:ACC_SETUP_VFs(pt_Re_stress(i))
+            end do
+
+            @:ALLOCATE(R_mu(1:num_dims))
+            do i = 1, num_dims
+                @:ALLOCATE(R_mu(i)%vf(1:num_dims))
+            end do
+            do i = 1, num_dims
+                do j = 1, num_dims
+                    @:ALLOCATE(R_mu(i)%vf(j)%sf(idwbuff(1)%beg:idwbuff(1)%end, &
+                        idwbuff(2)%beg:idwbuff(2)%end, &
+                        idwbuff(3)%beg:idwbuff(3)%end))
+                end do
+                @:ACC_SETUP_VFs(R_mu(i))
+            end do
+
+            @:ALLOCATE(pres_visc_stress_filtered(1:num_dims))
+            do i = 1, num_dims
+                @:ALLOCATE(pres_visc_stress_filtered(i)%sf(idwbuff(1)%beg:idwbuff(1)%end, &
+                    idwbuff(2)%beg:idwbuff(2)%end, &
+                    idwbuff(3)%beg:idwbuff(3)%end))
+                @:ACC_SETUP_SFs(pres_visc_stress_filtered(i))
+            end do
+
+            @:ALLOCATE(mag_div_Ru%sf(idwbuff(1)%beg:idwbuff(1)%end, &
+                idwbuff(2)%beg:idwbuff(2)%end, &
+                idwbuff(3)%beg:idwbuff(3)%end))
+            @:ACC_SETUP_SFs(mag_div_Ru)
+
+            @:ALLOCATE(mag_div_R_mu%sf(idwbuff(1)%beg:idwbuff(1)%end, &
+                idwbuff(2)%beg:idwbuff(2)%end, &
+                idwbuff(3)%beg:idwbuff(3)%end))
+            @:ACC_SETUP_SFs(mag_div_R_mu)
+
+            @:ALLOCATE(mag_F_IMET%sf(idwbuff(1)%beg:idwbuff(1)%end, &
+                idwbuff(2)%beg:idwbuff(2)%end, &
+                idwbuff(3)%beg:idwbuff(3)%end))
+            @:ACC_SETUP_SFs(mag_F_IMET)
         end if
 
+        @:ALLOCATE(R_u_stat(2:4))
+        do i = 2, 4
+            @:ALLOCATE(R_u_stat(i)%sf(0:m, 0:n, 0:p))
+            @:ACC_SETUP_SFs(R_u_stat(i))
+        end do
+        @:ALLOCATE(R_mu_stat(2:4))
+        do i = 2, 4
+            @:ALLOCATE(R_mu_stat(i)%sf(0:m, 0:n, 0:p))
+            @:ACC_SETUP_SFs(R_mu_stat(i))
+        end do
+        @:ALLOCATE(F_IMET_stat(2:4))
+        do i = 2, 4
+            @:ALLOCATE(F_IMET_stat(i)%sf(0:m, 0:n, 0:p))
+            @:ACC_SETUP_SFs(F_IMET_stat(i))
+        end do
+
     end subroutine s_initialize_time_steppers_module
 
     !> 1st order TVD RK time-stepping algorithm
@@ -694,6 +775,8 @@ contains
 
         real(wp) :: start, finish
 
+        integer :: n_step
+
         ! Stage 1 of 3
 
         if (.not. adap_dt) then
@@ -708,6 +791,30 @@ contains
 
         call s_compute_rhs(q_cons_ts(1)%vf, q_T_sf, q_prim_vf, bc_type, rhs_vf, pb_ts(1)%sf, rhs_pb, mv_ts(1)%sf, rhs_mv, t_step, time_avg, div_pres_visc_stress)
 
+        if (fourier_transform_filtering) then 
+            call s_apply_fftw_filter_cons(q_cons_ts(1)%vf, q_cons_filtered)
+            call s_setup_terms_filtering(q_cons_ts(1)%vf, pt_Re_stress, R_mu)
+            call s_apply_fftw_filter_tensor(pt_Re_stress, R_mu, q_cons_filtered, div_pres_visc_stress, pres_visc_stress_filtered)
+            call s_compute_pseudo_turbulent_reynolds_stress(q_cons_filtered, pt_Re_stress, mag_div_Ru)
+            call s_compute_R_mu(q_cons_filtered, R_mu, mag_div_R_mu)
+            call s_compute_interphase_momentum_exchange_term(pres_visc_stress_filtered, mag_F_IMET)
+        end if
+        
+        if (t_step > 5) then
+            n_step = t_step - 5
+            print *, n_step
+            call s_compute_s_order_statistics(mag_div_Ru, n_step, R_u_stat, 1)
+            !call s_compute_s_order_statistics(mag_div_R_mu, n_step, R_mu_stat, 2)
+            !call s_compute_s_order_statistics(mag_F_IMET, n_step, F_IMET_stat, 3)
+        end if
+
+
+        ! R_u_stat(2)%sf(0:m, 0:n, 0:p) = q_cons_filtered(6)%sf(0:m, 0:n, 0:p)
+        ! R_u_stat(3)%sf(0:m, 0:n, 0:p) = mag_div_Ru%sf(0:m, 0:n, 0:p)
+        ! R_u_stat(4)%sf(0:m, 0:n, 0:p) = mag_div_R_mu%sf(0:m, 0:n, 0:p)
+        ! R_mu_stat(2)%sf(0:m, 0:n, 0:p) = mag_F_IMET%sf(0:m, 0:n, 0:p)
+
+
         if (compute_CD) then
             call s_compute_drag_coefficient(div_pres_visc_stress)
         end if
@@ -1380,13 +1487,58 @@ contains
             @:DEALLOCATE(rhs_vf)
         end if
 
-        if (compute_CD) then
+        if (compute_CD .or. fourier_transform_filtering) then
             do i = momxb, momxe
                 @:DEALLOCATE(div_pres_visc_stress(i)%sf)
             end do
             @:DEALLOCATE(div_pres_visc_stress)
         end if
 
+        if (fourier_transform_filtering) then 
+            do i = 1, sys_size
+                @:DEALLOCATE(q_cons_filtered(i)%sf)
+            end do
+            @:DEALLOCATE(q_cons_filtered)
+
+            do i = 1, num_dims
+                do j = 1, num_dims
+                    @:DEALLOCATE(pt_Re_stress(i)%vf(j)%sf)
+                end do
+                @:DEALLOCATE(pt_Re_stress(i)%vf)
+            end do
+            @:DEALLOCATE(pt_Re_stress)
+
+            do i = 1, num_dims
+                do j = 1, num_dims
+                    @:DEALLOCATE(R_mu(i)%vf(j)%sf)
+                end do
+                @:DEALLOCATE(R_mu(i)%vf)
+            end do
+            @:DEALLOCATE(R_mu)
+
+            do i = 1, num_dims
+                @:DEALLOCATE(pres_visc_stress_filtered(i)%sf)
+            end do
+            @:DEALLOCATE(pres_visc_stress_filtered)
+
+            @:DEALLOCATE(mag_div_Ru%sf)
+            @:DEALLOCATE(mag_div_R_mu%sf)
+            @:DEALLOCATE(mag_F_IMET%sf)
+        end if
+
+        do i = 2, 4
+            @:DEALLOCATE(R_u_stat(i)%sf)
+        end do
+        @:DEALLOCATE(R_u_stat)
+        do i = 2, 4
+            @:DEALLOCATE(R_mu_stat(i)%sf)
+        end do
+        @:DEALLOCATE(R_mu_stat)
+        do i = 2, 4
+            @:DEALLOCATE(F_IMET_stat(i)%sf)
+        end do
+        @:DEALLOCATE(F_IMET_stat)
+
         ! Writing the footer of and closing the run-time information file
         if (proc_rank == 0 .and. run_time_info) then
             call s_close_run_time_information_file()
diff --git a/src/simulation/m_volume_filtering.fpp b/src/simulation/m_volume_filtering.fpp
index 6caffaa4c5..adb102df35 100644
--- a/src/simulation/m_volume_filtering.fpp
+++ b/src/simulation/m_volume_filtering.fpp
@@ -12,6 +12,8 @@ module m_volume_filtering
 
     use m_ibm
 
+    use m_boundary_common
+
 #ifdef MFC_MPI
     use mpi                    !< Message passing interface (MPI) module
 #endif
@@ -23,11 +25,11 @@ module m_volume_filtering
     implicit none
 
     private; public :: s_initialize_fftw_explicit_filter_module, &
- s_apply_fftw_filter_cons, & 
- s_initialize_filtering_kernel, s_initialize_fluid_indicator_function, & 
- s_initialize_filtered_fluid_indicator_function, & 
+ s_initialize_filtering_kernel, s_initialize_fluid_indicator_function, s_initialize_filtered_fluid_indicator_function, & 
  s_finalize_fftw_explicit_filter_module, & 
- s_apply_fftw_filter_tensor, s_apply_fftw_filter_scalarfield
+ s_apply_fftw_filter_cons, s_apply_fftw_filter_tensor, s_apply_fftw_filter_scalarfield, &
+ s_mpi_transpose_slabZ2Y, s_mpi_transpose_slabY2Z, s_mpi_FFT_fwd, s_mpi_FFT_bwd, &
+ s_setup_terms_filtering, s_compute_pseudo_turbulent_reynolds_stress, s_compute_R_mu, s_compute_interphase_momentum_exchange_term
 
 #if !defined(MFC_OpenACC)
     include 'fftw3.f03'
@@ -460,12 +462,12 @@ contains
     end subroutine s_apply_fftw_filter_scalarfield
 
     !< apply the gaussian filter to the requisite tensors to compute unclosed terms of interest
-    subroutine s_apply_fftw_filter_tensor(pt_Re_stress, R_mu, q_cons_filtered, rhs_rhouu, pImT_filtered)
+    subroutine s_apply_fftw_filter_tensor(pt_Re_stress, R_mu, q_cons_filtered, div_pres_visc_stress, pres_visc_stress_filtered)
         type(vector_field), dimension(1:num_dims), intent(inout) :: pt_Re_stress
         type(vector_field), dimension(1:num_dims), intent(inout) :: R_mu
         type(scalar_field), dimension(sys_size), intent(in) :: q_cons_filtered
-        type(scalar_field), dimension(momxb:momxe), intent(inout) :: rhs_rhouu
-        type(scalar_field), dimension(1:num_dims), intent(inout) :: pImT_filtered
+        type(scalar_field), dimension(momxb:momxe), intent(inout) :: div_pres_visc_stress
+        type(scalar_field), dimension(1:num_dims), intent(inout) :: pres_visc_stress_filtered
 
         integer :: i, j, k, l, q
 
@@ -485,7 +487,7 @@ contains
 
         ! interphase momentum exchange
         do l = 1, num_dims
-            call s_apply_fftw_filter_scalarfield(q_cons_filtered(advxb), .false., rhs_rhouu(momxb-1+l), pImT_filtered(l))
+            call s_apply_fftw_filter_scalarfield(q_cons_filtered(advxb), .false., div_pres_visc_stress(momxb-1+l), pres_visc_stress_filtered(l))
         end do 
 
     end subroutine s_apply_fftw_filter_tensor
@@ -983,8 +985,8 @@ contains
                     !$acc loop seq
                     do l = 1, num_dims
                         div_R_mu(l, i, j, k) = (R_mu(l)%vf(1)%sf(i+1, j, k) - R_mu(l)%vf(1)%sf(i-1, j, k))/(2._wp*dx(i)) &
-                                           + (R_mu(l)%vf(2)%sf(i, j+1, k) - R_mu(l)%vf(2)%sf(i, j-1, k))/(2._wp*dy(j)) & 
-                                           + (R_mu(l)%vf(3)%sf(i, j, k+1) - R_mu(l)%vf(3)%sf(i, j, k-1))/(2._wp*dz(k))
+                                             + (R_mu(l)%vf(2)%sf(i, j+1, k) - R_mu(l)%vf(2)%sf(i, j-1, k))/(2._wp*dy(j)) & 
+                                             + (R_mu(l)%vf(3)%sf(i, j, k+1) - R_mu(l)%vf(3)%sf(i, j, k-1))/(2._wp*dz(k))
                     end do
                 end do
             end do
@@ -1001,8 +1003,8 @@ contains
 
     end subroutine s_compute_R_mu
 
-    subroutine s_compute_interphase_momentum_exchange_term(pImT_filtered, mag_F_IMET)
-        type(scalar_field), dimension(1:num_dims), intent(in) :: pImT_filtered
+    subroutine s_compute_interphase_momentum_exchange_term(pres_visc_stress_filtered, mag_F_IMET)
+        type(scalar_field), dimension(1:num_dims), intent(in) :: pres_visc_stress_filtered
         type(scalar_field), intent(inout) :: mag_F_IMET
 
         integer :: i, j, k, l, q, ii
@@ -1011,9 +1013,9 @@ contains
         do i = 0, m
             do j = 0, n
                 do k = 0, p 
-                    mag_F_IMET%sf(i, j, k) = sqrt(pImT_filtered(1)%sf(i, j, k)**2 & 
-                                               + pImT_filtered(2)%sf(i, j, k)**2 & 
-                                               + pImT_filtered(3)%sf(i, j, k)**2)
+                    mag_F_IMET%sf(i, j, k) = sqrt(pres_visc_stress_filtered(1)%sf(i, j, k)**2 & 
+                                                + pres_visc_stress_filtered(2)%sf(i, j, k)**2 & 
+                                                + pres_visc_stress_filtered(3)%sf(i, j, k)**2)
                 end do
             end do
         end do 
diff --git a/toolchain/mfc/run/case_dicts.py b/toolchain/mfc/run/case_dicts.py
index e425e53b6d..b3a26a65b5 100644
--- a/toolchain/mfc/run/case_dicts.py
+++ b/toolchain/mfc/run/case_dicts.py
@@ -446,6 +446,7 @@ def analytic(self):
     'surface_tension': ParamType.LOG,
     'output_partial_domain': ParamType.LOG,
     'bubbles_lagrange': ParamType.LOG,
+    'q_filtered_wrt': ParamType.LOG,
 })
 
 for cmp_id in range(1,3+1):

From 3b1572631045452a5c247863ef8a4e8d727ccc58 Mon Sep 17 00:00:00 2001
From: Conrad Delgado <conraddelgado@Conrads-MacBook-Air-6.local>
Date: Tue, 24 Jun 2025 22:22:02 -0600
Subject: [PATCH 03/30] autocorrelation function calc

---
 runs/3d_1sphere_filtering/case.py       |   1 +
 runs/phi01/case.py                      | 160 ++++++++++++++++++++++++
 src/simulation/m_additional_forcing.fpp |   3 +-
 src/simulation/m_compute_statistics.fpp |  48 ++++++-
 src/simulation/m_global_parameters.fpp  |   2 +
 src/simulation/m_mpi_proxy.fpp          |   2 +-
 src/simulation/m_start_up.fpp           |   2 +-
 src/simulation/m_time_steppers.fpp      |  11 +-
 toolchain/mfc/run/case_dicts.py         |   1 +
 voronoi/gen_voronoi_3D.py               |   6 +-
 10 files changed, 224 insertions(+), 12 deletions(-)
 create mode 100644 runs/phi01/case.py

diff --git a/runs/3d_1sphere_filtering/case.py b/runs/3d_1sphere_filtering/case.py
index fa38be1ff0..9a8c4b1f4c 100644
--- a/runs/3d_1sphere_filtering/case.py
+++ b/runs/3d_1sphere_filtering/case.py
@@ -147,6 +147,7 @@
 
     "store_levelset": "F",
     "slab_domain_decomposition": "T", 
+    "compute_autocorrelation": "T",
     }
 
 case_dict.update(ib_dict)
diff --git a/runs/phi01/case.py b/runs/phi01/case.py
new file mode 100644
index 0000000000..56390a1943
--- /dev/null
+++ b/runs/phi01/case.py
@@ -0,0 +1,160 @@
+import json
+import math
+import numpy as np
+
+Mu = 1.84e-05
+gam_a = 1.4
+R = 287.0
+
+D = 0.1
+
+P = 101325 # Pa
+rho = 1.225 # kg/m^3
+
+T = P/(rho*R)
+
+M = 1.2
+Re = 1500.0
+v1 = M*(gam_a*P/rho)**(1.0/2.0)
+
+mu = rho*v1*D/Re # dynamic viscosity for current case
+
+#print('mu: ', mu)
+#print('v1: ', v1)
+#print('rho: ', rho)
+#print('Kn = ' + str( np.sqrt(np.pi*gam_a/2)*(M/Re) )) # Kn < 0.01 = continuum flow
+
+dt = 4.0E-06
+Nt = 10
+t_save = 1
+
+Nx = 99
+Ny = 99
+Nz = 99
+
+# load initial sphere locations
+sphere_loc = np.loadtxt('sphere_array_locations.txt')
+N_sphere = len(sphere_loc)
+
+# immersed boundary dictionary
+ib_dict = {}
+for i in range(N_sphere):
+    ib_dict.update({
+        f"patch_ib({i+1})%geometry": 8,
+        f"patch_ib({i+1})%x_centroid": sphere_loc[i, 0],
+        f"patch_ib({i+1})%y_centroid": sphere_loc[i, 1],
+        f"patch_ib({i+1})%z_centroid": sphere_loc[i, 2],
+        f"patch_ib({i+1})%radius": D / 2,
+        f"patch_ib({i+1})%slip": "F",
+        })
+
+# Configuring case dictionary
+case_dict = {
+    # Logistics
+    "run_time_info": "T",
+    # Computational Domain Parameters
+    # x direction
+    "x_domain%beg": -5.0 * D,
+    "x_domain%end": 5.0 * D,
+    # y direction
+    "y_domain%beg": -5.0 * D,
+    "y_domain%end": 5.0 * D,
+    # z direction
+    "z_domain%beg": -5.0 * D,
+    "z_domain%end": 5.0 * D,
+    "cyl_coord": "F",
+    "m": Nx,
+    "n": Ny,
+    "p": Nz,
+    "dt": dt,
+    "t_step_start": 0,
+    "t_step_stop": Nt,  # 3000
+    "t_step_save": t_save,  # 10
+    # Simulation Algorithm Parameters
+    # Only one patch is necessary: the air tube
+    "num_patches": 1,
+    # Use the 5 equation model
+    "model_eqns": 2,
+    # 6 equations model does not need the K \div(u) term
+    "alt_soundspeed": "F",
+    # One fluid: air
+    "num_fluids": 1,
+    # time step
+    "mpp_lim": "F",
+    # Correct errors when computing speed of sound
+    "mixture_err": "T",
+    # Use TVD RK3 for time marching
+    "time_stepper": 3,
+    # Reconstruct the primitive variables to minimize spurious oscillations
+    # Use WENO5
+    "weno_order": 5,
+    "weno_eps": 1.0e-14,
+    "weno_Re_flux": "T",
+    "weno_avg": "T",
+    "avg_state": 2,
+    "mapped_weno": "T",
+    "null_weights": "F",
+    "mp_weno": "T",
+    "riemann_solver": 2,
+    "low_Mach": 1,
+    "wave_speeds": 1,
+    # periodic bc
+    "bc_x%beg": -1,
+    "bc_x%end": -1,
+    "bc_y%beg": -1,
+    "bc_y%end": -1,
+    "bc_z%beg": -1,
+    "bc_z%end": -1,
+    # Enable immersed boundaries: one IB patch per sphere (num_ibs = N_sphere)
+    "ib": "T",
+    "num_ibs": N_sphere,
+    "viscous": "T",
+    # Formatted Database Files Structure Parameters
+    "format": 1,
+    "precision": 2,
+    "prim_vars_wrt": "T",
+    "E_wrt": "T",
+    "q_filtered_wrt": "T",
+    "parallel_io": "T",
+    # Patch: Constant Tube filled with air
+    # Specify the cylindrical air tube grid geometry
+    "patch_icpp(1)%geometry": 9,
+    "patch_icpp(1)%x_centroid": 0.0,
+    # Uniform medium density, centroid is at the center of the domain
+    "patch_icpp(1)%y_centroid": 0.0,
+    "patch_icpp(1)%z_centroid": 0.0,
+    "patch_icpp(1)%length_x": 10 * D,
+    "patch_icpp(1)%length_y": 10 * D,
+    "patch_icpp(1)%length_z": 10 * D,
+    # Specify the patch primitive variables
+    "patch_icpp(1)%vel(1)": v1,
+    "patch_icpp(1)%vel(2)": 0.0e00,
+    "patch_icpp(1)%vel(3)": 0.0e00,
+    "patch_icpp(1)%pres": P,
+    "patch_icpp(1)%alpha_rho(1)": rho,
+    "patch_icpp(1)%alpha(1)": 1.0e00,
+    # Patch: Sphere Immersed Boundary
+    # Fluids Physical Parameters
+    "fluid_pp(1)%gamma": 1.0e00 / (gam_a - 1.0e00),  # 2.50(Not 1.40)
+    "fluid_pp(1)%pi_inf": 0,
+    "fluid_pp(1)%Re(1)": Re,
+
+    # new case additions
+    "periodic_forcing": "T",
+    "periodic_ibs": "T",
+    "compute_CD": "F",
+    "fourier_transform_filtering": "T",
+
+    "u_inf_ref": v1,
+    "rho_inf_ref": rho,
+    "T_inf_ref": T,
+    "mu_visc": mu,
+
+    "store_levelset": "F",
+    "slab_domain_decomposition": "T", 
+    "compute_autocorrelation": "T",
+    }
+
+case_dict.update(ib_dict)
+
+print(json.dumps(case_dict))
diff --git a/src/simulation/m_additional_forcing.fpp b/src/simulation/m_additional_forcing.fpp
index 17765413bd..2971ba0e9a 100644
--- a/src/simulation/m_additional_forcing.fpp
+++ b/src/simulation/m_additional_forcing.fpp
@@ -19,7 +19,8 @@ module m_additional_forcing
 
     real(wp), allocatable, dimension(:) :: q_bar ! 1:3 rho*u, 4 rho, 5 T
     type(scalar_field), allocatable, dimension(:) :: q_periodic_force
-    real(wp), allocatable, dimension(:) :: q_spatial_avg, q_spatial_avg_glb ! 1:3 rho*u, 4 rho, 5 T
+    real(wp), allocatable, dimension(:) :: q_spatial_avg
+    real(wp), allocatable, dimension(:), public :: q_spatial_avg_glb ! 1:3 rho*u, 4 rho, 5 T
     real(wp) :: volfrac_phi
     integer :: N_x_total_glb
 
diff --git a/src/simulation/m_compute_statistics.fpp b/src/simulation/m_compute_statistics.fpp
index bcd6732cf2..1721b0706c 100644
--- a/src/simulation/m_compute_statistics.fpp
+++ b/src/simulation/m_compute_statistics.fpp
@@ -7,9 +7,12 @@ module m_compute_statistics
 
     use m_mpi_proxy 
 
+    use m_additional_forcing
+
     implicit none
 
-    private; public :: s_initialize_statistics_module, s_finalize_statistics_module, s_compute_s_order_statistics
+    private; public :: s_initialize_statistics_module, s_finalize_statistics_module, &
+ s_compute_s_order_statistics, s_autocorrelation_function
 
     type(scalar_field), allocatable, dimension(:) :: xnbar_stat
 
@@ -17,6 +20,10 @@ module m_compute_statistics
 
     type(vector_field), allocatable, dimension(:) :: Msn_stat
 
+    real(wp), allocatable, dimension(:) :: xm_th
+
+    real(wp), allocatable, dimension(:) :: x_mom_autocorr
+
     !$acc declare create(xnbar_stat, delta_stat, Msn_stat)
 
 contains
@@ -46,6 +53,11 @@ contains
             @:ACC_SETUP_VFs(Msn_stat(i))
         end do
 
+        if (compute_autocorrelation) then
+            @:ALLOCATE(xm_th(t_step_stop))
+            @:ALLOCATE(x_mom_autocorr(t_step_stop))
+        end if
+
     end subroutine s_initialize_statistics_module
 
     subroutine s_compute_s_order_statistics(q_temp, n_step, s_order_stat, id)
@@ -106,6 +118,40 @@ contains
 
     end subroutine s_compute_s_order_statistics
 
+    !> Store the probe value q_cons_vf(2) at (m/4, n/4, p/4) and print the autocorrelation of its history
+    subroutine s_autocorrelation_function(n_step, q_cons_vf)
+        integer, intent(in) :: n_step !< 1-based index of the current sample in xm_th
+        type(scalar_field), dimension(sys_size), intent(in) :: q_cons_vf
+        real(wp) :: q_avg, q_var
+        integer :: s
+
+        ! the history buffers are only allocated when compute_autocorrelation is set
+        if (.not. allocated(xm_th)) return
+        if (n_step < 1 .or. n_step > size(xm_th)) return
+
+        !$acc update host(q_cons_vf(2))
+        xm_th(n_step) = q_cons_vf(2)%sf(m/4, n/4, p/4)
+        if (n_step < 2) return
+
+        ! mean and (biased) variance of the samples recorded so far
+        q_avg = sum(xm_th(1:n_step))/real(n_step, wp)
+        q_var = sum((xm_th(1:n_step) - q_avg)**2)/real(n_step, wp)
+
+        ! a constant history has zero variance; return rather than divide by zero
+        if (q_var <= 0.0_wp) return
+
+        ! entry s+1 holds the normalized autocorrelation at lag s
+        do s = 0, n_step - 1
+            x_mom_autocorr(s + 1) = sum((xm_th(1:n_step - s) - q_avg)*(xm_th(1 + s:n_step) - q_avg)) &
+                                    /(real(n_step - s, wp)*q_var)
+        end do
+
+        print *, q_cons_vf(2)%sf(m/4, n/4, p/4)
+        print *, 'Autocorrelation at lag 0:', x_mom_autocorr(1)
+        print *, 'Autocorrelation at lag N/2:', x_mom_autocorr(n_step/2 + 1)
+        print *, 'Autocorrelation at max lag:', x_mom_autocorr(n_step)
+    end subroutine s_autocorrelation_function
+
     subroutine s_finalize_statistics_module
         integer :: i, j
         do i = 1, 3
diff --git a/src/simulation/m_global_parameters.fpp b/src/simulation/m_global_parameters.fpp
index 0158af546f..eaea7c04e3 100644
--- a/src/simulation/m_global_parameters.fpp
+++ b/src/simulation/m_global_parameters.fpp
@@ -511,6 +511,7 @@ module m_global_parameters
     logical :: fourier_transform_filtering
     logical :: store_levelset
     logical :: slab_domain_decomposition
+    logical :: compute_autocorrelation
 
     !$acc declare create(mu_visc, u_inf_ref, rho_inf_ref, T_inf_ref)
 
@@ -799,6 +800,7 @@ contains
         fourier_transform_filtering = .false.
         store_levelset = .true.
         slab_domain_decomposition = .false.
+        compute_autocorrelation = .false.
 
     end subroutine s_assign_default_values_to_user_inputs
 
diff --git a/src/simulation/m_mpi_proxy.fpp b/src/simulation/m_mpi_proxy.fpp
index b1f1c28c8c..6171f9faf3 100644
--- a/src/simulation/m_mpi_proxy.fpp
+++ b/src/simulation/m_mpi_proxy.fpp
@@ -93,7 +93,7 @@ contains
             & 'viscous', 'shear_stress', 'bulk_stress', 'bubbles_lagrange',     &
             & 'hyperelasticity', 'rkck_adap_dt', 'bc_io', 'powell', 'cont_damage', &
             & 'periodic_ibs', 'compute_CD', 'periodic_forcing', 'fourier_transform_filtering', & 
-            & 'store_levelset', 'slab_domain_decomposition' ]
+            & 'store_levelset', 'slab_domain_decomposition', 'compute_autocorrelation' ]
             call MPI_BCAST(${VAR}$, 1, MPI_LOGICAL, 0, MPI_COMM_WORLD, ierr)
         #:endfor
 
diff --git a/src/simulation/m_start_up.fpp b/src/simulation/m_start_up.fpp
index 4c3fd33495..62b641da19 100644
--- a/src/simulation/m_start_up.fpp
+++ b/src/simulation/m_start_up.fpp
@@ -189,7 +189,7 @@ contains
             cont_damage, tau_star, cont_damage_s, alpha_bar, & 
             periodic_ibs, compute_CD, mu_visc, u_inf_ref, rho_inf_ref, T_inf_ref, & 
             periodic_forcing, fourier_transform_filtering, store_levelset, & 
-            slab_domain_decomposition
+            slab_domain_decomposition, compute_autocorrelation
 
         ! Checking that an input file has been provided by the user. If it
         ! has, then the input file is read in, otherwise, simulation exits.
diff --git a/src/simulation/m_time_steppers.fpp b/src/simulation/m_time_steppers.fpp
index 343a542e4b..b88766eccd 100644
--- a/src/simulation/m_time_steppers.fpp
+++ b/src/simulation/m_time_steppers.fpp
@@ -800,12 +800,13 @@ contains
             call s_compute_interphase_momentum_exchange_term(pres_visc_stress_filtered, mag_F_IMET)
         end if
         
-        if (t_step > 5) then
-            n_step = t_step - 5
-            print *, n_step
+
+        call s_autocorrelation_function(t_step+1, q_cons_ts(1)%vf)
+        if (t_step > 10) then
+            n_step = t_step - 10
             call s_compute_s_order_statistics(mag_div_Ru, n_step, R_u_stat, 1)
-            !call s_compute_s_order_statistics(mag_div_R_mu, n_step, R_mu_stat, 2)
-            !call s_compute_s_order_statistics(mag_F_IMET, n_step, F_IMET_stat, 3)
+            call s_compute_s_order_statistics(mag_div_R_mu, n_step, R_mu_stat, 2)
+            call s_compute_s_order_statistics(mag_F_IMET, n_step, F_IMET_stat, 3)
         end if
 
 
diff --git a/toolchain/mfc/run/case_dicts.py b/toolchain/mfc/run/case_dicts.py
index b3a26a65b5..a58d29869f 100644
--- a/toolchain/mfc/run/case_dicts.py
+++ b/toolchain/mfc/run/case_dicts.py
@@ -307,6 +307,7 @@ def analytic(self):
     'T_inf_ref': ParamType.REAL,
     'periodic_forcing': ParamType.LOG,
     'fourier_transform_filtering': ParamType.LOG,
+    'compute_autocorrelation': ParamType.LOG,
 })
 
 for var in [ 'heatTransfer_model', 'massTransfer_model', 'pressure_corrector',
diff --git a/voronoi/gen_voronoi_3D.py b/voronoi/gen_voronoi_3D.py
index ce700acb5d..c56a02fb8e 100644
--- a/voronoi/gen_voronoi_3D.py
+++ b/voronoi/gen_voronoi_3D.py
@@ -52,13 +52,13 @@ def lloyd_relaxation_3d(initial_points, box, w=1, iterations=10):
     print('running 3D...')
 
     # setup 
-    phi = 0.05
-    str_phi = '005'
+    phi = 0.1
+    str_phi = '01'
 
     D = 0.1
     L = 10*D
 
-    output_dir = '../examples/phi'+str_phi
+    output_dir = '../runs/phi'+str_phi
     if os.path.exists(output_dir) == False:
         os.mkdir(output_dir)
 

From aa4a876d3c36f12b2964d5768b38b011e73e5ef5 Mon Sep 17 00:00:00 2001
From: Conrad Delgado <conraddelgado@Conrads-MacBook-Air-6.local>
Date: Tue, 24 Jun 2025 22:28:44 -0600
Subject: [PATCH 04/30] sphere locations for phi=0.1

---
 runs/phi01/sphere_array_locations.txt | 190 ++++++++++++++++++++++++++
 1 file changed, 190 insertions(+)
 create mode 100644 runs/phi01/sphere_array_locations.txt

diff --git a/runs/phi01/sphere_array_locations.txt b/runs/phi01/sphere_array_locations.txt
new file mode 100644
index 0000000000..cb062253cc
--- /dev/null
+++ b/runs/phi01/sphere_array_locations.txt
@@ -0,0 +1,190 @@
+-2.269415855407714844e-01 -1.414051055908203125e-01 3.922535181045532227e-01
+4.000198841094970703e-01 2.981948852539062500e-02 -2.832174301147460938e-01
+-3.220155239105224609e-01 -3.898024559020996094e-01 -3.041059970855712891e-01
+2.814270257949829102e-01 -7.608795166015625000e-02 -1.437755823135375977e-01
+-2.728327512741088867e-01 4.227894544601440430e-01 3.520679473876953125e-02
+-4.947633743286132812e-01 -4.232151508331298828e-01 -2.972397804260253906e-01
+-1.808261871337890625e-02 2.877434492111206055e-01 -2.310247421264648438e-01
+3.818988800048828125e-01 3.529353141784667969e-01 1.727198362350463867e-01
+-2.346787452697753906e-01 2.829644680023193359e-01 1.594238281250000000e-01
+-4.887726306915283203e-01 4.662406444549560547e-02 7.227540016174316406e-02
+-2.048213481903076172e-01 4.885343313217163086e-01 -2.821706533432006836e-01
+-4.693455696105957031e-01 4.566423892974853516e-01 1.360166072845458984e-02
+-2.810692787170410156e-02 3.964089155197143555e-01 2.224528789520263672e-01
+-4.457854032516479492e-01 2.029451131820678711e-01 -2.691650390625000000e-01
+-4.315460920333862305e-01 3.888773918151855469e-02 -4.190684556961059570e-01
+4.569005966186523438e-01 4.780390262603759766e-01 -1.672872304916381836e-01
+4.523042440414428711e-01 2.975084781646728516e-01 -5.123972892761230469e-03
+1.155309677124023438e-01 3.826811313629150391e-01 3.157733678817749023e-01
+-3.529649972915649414e-01 3.223993778228759766e-01 3.534083366394042969e-01
+-1.699209213256835938e-03 -3.757699728012084961e-01 4.251234531402587891e-01
+3.104512691497802734e-01 3.631212711334228516e-01 3.740961551666259766e-01
+3.886995315551757812e-01 -4.476237297058105469e-01 3.331944942474365234e-01
+-8.131015300750732422e-02 3.511540889739990234e-01 6.623625755310058594e-02
+5.544662475585937500e-03 2.087895870208740234e-01 -4.609942436218261719e-03
+-2.697887420654296875e-01 7.647264003753662109e-02 1.385573148727416992e-01
+-4.056740999221801758e-01 -2.304553985595703125e-03 -2.276074886322021484e-01
+-3.986057043075561523e-01 -8.398652076721191406e-02 8.779549598693847656e-02
+1.455659866333007812e-01 -5.315554141998291016e-02 3.587335348129272461e-01
+-3.624105453491210938e-02 -1.932673454284667969e-01 3.783030509948730469e-01
+2.404289245605468750e-01 2.313592433929443359e-01 -9.129595756530761719e-02
+4.290236234664916992e-01 -2.806437015533447266e-01 -3.928461074829101562e-01
+3.948264122009277344e-01 1.061335802078247070e-01 -1.345469951629638672e-01
+4.199941158294677734e-01 -5.409121513366699219e-02 -4.431722164154052734e-01
+-1.276044845581054688e-01 5.453205108642578125e-02 4.209027290344238281e-01
+2.240920066833496094e-01 5.745470523834228516e-02 -2.274198532104492188e-01
+3.475044965744018555e-01 -1.186680793762207031e-02 3.881464004516601562e-01
+-1.399791240692138672e-02 -5.303645133972167969e-02 3.192350864410400391e-01
+3.149266242980957031e-01 -4.960085153579711914e-01 4.852926731109619141e-01
+1.159789562225341797e-01 7.240676879882812500e-02 -2.081871032714843750e-03
+3.457980155944824219e-01 -4.685097932815551758e-01 1.311070919036865234e-01
+-3.134734630584716797e-01 -1.447633504867553711e-01 2.294397354125976562e-01
+-2.322396039962768555e-01 4.453787803649902344e-01 2.214672565460205078e-01
+7.549452781677246094e-02 2.149226665496826172e-01 1.942512989044189453e-01
+4.877026081085205078e-01 9.565687179565429688e-02 4.446644783020019531e-01
+2.452219724655151367e-01 -1.041567325592041016e-02 -4.420824050903320312e-01
+3.802477121353149414e-01 -2.260215282440185547e-01 -6.829130649566650391e-02
+4.026585817337036133e-01 -9.730875492095947266e-02 5.328035354614257812e-02
+-1.340943574905395508e-01 -2.988189458847045898e-01 4.915304183959960938e-01
+1.499507427215576172e-01 -1.232669353485107422e-01 -3.215692043304443359e-01
+7.229900360107421875e-02 1.496689319610595703e-01 -1.584017276763916016e-01
+4.887890815734863281e-02 -2.996931076049804688e-01 -6.179094314575195312e-02
+3.264107704162597656e-01 1.829891204833984375e-01 4.166131019592285156e-01
+3.418397903442382812e-01 -3.681684732437133789e-01 -1.888689994812011719e-01
+-1.746954917907714844e-01 3.889560699462890625e-03 2.538719177246093750e-01
+-1.082150936126708984e-01 -1.183983087539672852e-01 -4.667383432388305664e-01
+4.464948177337646484e-02 7.829546928405761719e-02 -4.987317323684692383e-01
+2.724659442901611328e-01 3.989661931991577148e-01 -2.271368503570556641e-01
+2.325954437255859375e-01 2.180564403533935547e-01 7.740092277526855469e-02
+4.475378990173339844e-01 8.053278923034667969e-02 2.720277309417724609e-01
+2.500159740447998047e-01 1.361670494079589844e-01 -4.378540515899658203e-01
+-8.050751686096191406e-02 2.042385339736938477e-01 4.733436107635498047e-01
+6.334328651428222656e-02 3.953868150711059570e-01 -1.099604368209838867e-01
+-6.584823131561279297e-02 4.609835147857666016e-01 -2.351213693618774414e-01
+-3.965889215469360352e-01 2.626715898513793945e-01 -4.403696060180664062e-01
+-4.123499393463134766e-01 4.679954051971435547e-01 -4.630439281463623047e-01
+-3.268948793411254883e-01 -2.706754207611083984e-01 4.083846807479858398e-01
+4.519817829132080078e-01 -4.413130283355712891e-01 -4.950367212295532227e-01
+1.736700534820556641e-01 -4.334635734558105469e-01 3.858578205108642578e-01
+-2.476015090942382812e-01 -8.808064460754394531e-02 -3.171390295028686523e-01
+1.416424512863159180e-01 6.130337715148925781e-03 1.613216400146484375e-01
+1.161313056945800781e-01 -8.472347259521484375e-02 -6.638598442077636719e-02
+6.862294673919677734e-02 7.571196556091308594e-02 -3.263452053070068359e-01
+-2.883186340332031250e-01 1.637139320373535156e-01 -1.617478132247924805e-01
+4.712302684783935547e-01 -1.252410411834716797e-01 2.302359342575073242e-01
+-3.321516513824462891e-02 -3.931099176406860352e-01 -1.693089008331298828e-01
+4.347057342529296875e-01 3.060367107391357422e-01 -1.781182289123535156e-01
+4.378421306610107422e-01 -2.324944734573364258e-01 4.174745082855224609e-01
+1.022851467132568359e-02 -1.360912322998046875e-01 6.093466281890869141e-02
+1.258683204650878906e-01 -2.447234392166137695e-01 3.956108093261718750e-01
+-1.879813671112060547e-01 3.079674243927001953e-01 3.408046960830688477e-01
+-3.804820775985717773e-01 3.240450620651245117e-01 -1.224457025527954102e-01
+4.557719230651855469e-01 -3.179004192352294922e-01 2.294783592224121094e-01
+-1.324630975723266602e-01 -2.825807332992553711e-01 2.794981002807617188e-02
+-9.088420867919921875e-02 -4.938784837722778320e-01 -4.559993743896484375e-01
+4.321286678314208984e-01 1.908559799194335938e-01 -4.160747528076171875e-01
+4.761004447937011719e-01 -3.449964523315429688e-02 -9.512662887573242188e-02
+-3.295238018035888672e-01 -4.874784946441650391e-01 3.628075122833251953e-01
+-3.269430398941040039e-01 4.961208105087280273e-01 -1.530282497406005859e-01
+1.903204917907714844e-01 4.334928989410400391e-01 1.328067779541015625e-01
+-1.938850879669189453e-01 -3.347861766815185547e-01 3.228425979614257812e-01
+-7.716512680053710938e-02 -1.792883872985839844e-01 -1.214803457260131836e-01
+2.945523262023925781e-01 4.375331401824951172e-01 -4.941940307617187500e-02
+2.805604934692382812e-01 3.923368453979492188e-02 3.594279289245605469e-03
+-3.963446617126464844e-02 4.087066650390625000e-02 1.291446685791015625e-01
+3.017591238021850586e-01 -4.672487974166870117e-01 -3.370153903961181641e-01
+-5.923175811767578125e-02 -1.029053926467895508e-01 -2.954306602478027344e-01
+-4.299471378326416016e-01 1.944204568862915039e-01 1.885912418365478516e-01
+1.226736307144165039e-01 -4.231331348419189453e-01 -4.431772232055664062e-01
+-1.630305051803588867e-01 1.654865741729736328e-01 1.177084445953369141e-02
+-2.820068597793579102e-01 -1.914020776748657227e-01 4.649567604064941406e-02
+-1.803944110870361328e-01 -5.573785305023193359e-02 3.654372692108154297e-02
+3.560798168182373047e-01 -2.656357288360595703e-01 1.175208091735839844e-01
+4.641888141632080078e-01 3.300178050994873047e-01 4.690952301025390625e-01
+-3.651157617568969727e-01 4.143847227096557617e-01 -3.058776855468750000e-01
+4.892169237136840820e-01 4.351882934570312500e-01 3.436787128448486328e-01
+1.252651214599609375e-02 1.140588521957397461e-01 3.147521018981933594e-01
+2.564185857772827148e-01 4.870939254760742188e-01 2.839587926864624023e-01
+2.440360784530639648e-01 2.740068435668945312e-01 2.384872436523437500e-01
+-1.093761920928955078e-01 2.005448341369628906e-01 2.263984680175781250e-01
+2.751414775848388672e-01 3.257715702056884766e-01 -4.216753244400024414e-01
+-2.344570159912109375e-01 3.708822727203369141e-01 -4.901626110076904297e-01
+3.321342468261718750e-01 2.205178737640380859e-01 -2.770333290100097656e-01
+-3.562602996826171875e-01 2.268432378768920898e-01 3.148293495178222656e-02
+-2.453712224960327148e-01 -3.159594535827636719e-01 -1.403638124465942383e-01
+-4.225530624389648438e-01 1.738572120666503906e-01 4.009822607040405273e-01
+-2.291325330734252930e-01 3.076609373092651367e-01 -2.942405939102172852e-01
+-4.163160324096679688e-01 -1.362502574920654297e-01 -4.328134059906005859e-01
+1.602690219879150391e-01 4.211304187774658203e-01 4.947811365127563477e-01
+1.699512004852294922e-01 -3.455421924591064453e-01 1.857841014862060547e-01
+1.917399168014526367e-01 -2.274444103240966797e-01 -1.499438285827636719e-01
+5.063652992248535156e-02 -7.577204704284667969e-02 -4.671556949615478516e-01
+1.856522560119628906e-01 1.085456609725952148e-01 3.598620891571044922e-01
+2.133283615112304688e-01 -1.748585700988769531e-01 5.385351181030273438e-02
+1.607365608215332031e-01 2.551939487457275391e-01 -2.725876569747924805e-01
+4.380518198013305664e-01 2.549636363983154297e-01 2.876336574554443359e-01
+-2.457389831542968750e-01 -4.205622673034667969e-01 -4.621033668518066406e-01
+-4.958317279815673828e-01 -4.657427072525024414e-01 1.988265514373779297e-01
+6.845688819885253906e-02 2.681604623794555664e-01 -4.308686256408691406e-01
+-4.200789928436279297e-01 3.732924461364746094e-01 1.710724830627441406e-01
+3.544092178344726562e-02 -3.218197822570800781e-01 8.597135543823242188e-02
+-5.194902420043945312e-02 1.222956180572509766e-02 -5.136704444885253906e-02
+1.391673088073730469e-01 2.476061582565307617e-01 4.182490110397338867e-01
+-1.033620834350585938e-01 3.683781623840332031e-02 -3.891081809997558594e-01
+-4.138703346252441406e-01 -3.311948776245117188e-01 -4.624009132385253906e-01
+-9.261775016784667969e-02 1.478457450866699219e-01 -1.957361698150634766e-01
+2.608032226562500000e-01 -1.573407649993896484e-01 4.948087930679321289e-01
+-1.243667602539062500e-01 -4.962480068206787109e-01 3.667256832122802734e-01
+-4.454655647277832031e-01 -2.705636024475097656e-01 1.070654392242431641e-01
+-4.106376171112060547e-01 -1.618578433990478516e-01 -6.648111343383789062e-02
+3.302078247070312500e-01 -2.219557762145996094e-02 1.648344993591308594e-01
+-1.774271726608276367e-01 3.244402408599853516e-01 -9.372758865356445312e-02
+2.811634540557861328e-01 1.279127597808837891e-01 2.315803766250610352e-01
+2.449696063995361328e-01 -3.595451116561889648e-01 -6.689429283142089844e-03
+5.237126350402832031e-02 -2.531653642654418945e-01 -4.304802417755126953e-01
+-2.635989189147949219e-01 -2.267163991928100586e-01 -4.170490503311157227e-01
+-2.721209526062011719e-01 1.574560403823852539e-01 2.993257045745849609e-01
+-3.956274986267089844e-01 2.191853523254394531e-02 2.550070285797119141e-01
+-1.563029289245605469e-01 -2.704749107360839844e-01 -2.991802692413330078e-01
+7.597208023071289062e-02 -1.699868440628051758e-01 2.227045297622680664e-01
+-3.653595447540283203e-01 -4.391734600067138672e-01 1.462922096252441406e-01
+1.705410480499267578e-01 -4.559497833251953125e-01 -1.512272357940673828e-01
+-1.343528032302856445e-01 -1.545200347900390625e-01 2.051105499267578125e-01
+5.652284622192382812e-02 -3.860473632812500000e-02 -1.806387901306152344e-01
+7.492136955261230469e-02 -4.894123077392578125e-01 3.830230236053466797e-02
+2.993867397308349609e-01 -3.184000253677368164e-01 2.854095697402954102e-01
+9.030818939208984375e-02 4.506881237030029297e-01 -3.190367221832275391e-01
+1.546680927276611328e-01 -3.337359428405761719e-01 -2.724964618682861328e-01
+1.143584251403808594e-01 3.319869041442871094e-01 3.964900970458984375e-02
+3.128879070281982422e-01 -1.711206436157226562e-01 -2.891231775283813477e-01
+3.134812116622924805e-01 -3.195825815200805664e-01 4.452092647552490234e-01
+-4.257751703262329102e-01 -3.556568622589111328e-01 3.161740303039550781e-01
+-4.604424238204956055e-01 1.566462516784667969e-01 -7.651758193969726562e-02
+-4.535093307495117188e-01 -1.047830581665039062e-01 3.779829740524291992e-01
+-7.651937007904052734e-02 3.510303497314453125e-01 -4.015958309173583984e-01
+-5.069994926452636719e-02 -3.019337654113769531e-01 2.270703315734863281e-01
+4.413585662841796875e-01 3.929922580718994141e-01 -3.560695648193359375e-01
+2.530579566955566406e-01 -3.169052600860595703e-01 -4.228000640869140625e-01
+-6.997537612915039062e-02 1.933835744857788086e-01 -3.526034355163574219e-01
+-1.785504817962646484e-01 1.803159713745117188e-03 -1.765856742858886719e-01
+-1.506757736206054688e-02 3.296717405319213867e-01 4.055316448211669922e-01
+-4.429192543029785156e-01 -3.453509807586669922e-01 -1.209781169891357422e-01
+-2.643674612045288086e-01 1.182488203048706055e-01 -3.157637119293212891e-01
+4.684782028198242188e-02 -4.617999792098999023e-01 2.509958744049072266e-01
+-3.250834941864013672e-01 2.819657325744628906e-03 4.174815416336059570e-01
+-3.355050086975097656e-02 -4.035353660583496094e-01 -3.605549335479736328e-01
+-3.662085533142089844e-01 -2.316244840621948242e-01 -2.756531238555908203e-01
+-2.576720714569091797e-01 -1.255595684051513672e-02 -4.626390933990478516e-01
+-3.275632858276367188e-01 2.991151809692382812e-02 -4.782438278198242188e-02
+4.056546688079833984e-01 1.594020128250122070e-01 7.798624038696289062e-02
+-2.715262174606323242e-01 -3.173813819885253906e-01 1.938788890838623047e-01
+3.270006179809570312e-02 -2.296169996261596680e-01 -2.338488101959228516e-01
+-1.381781101226806641e-01 -4.450683593750000000e-01 1.390277147293090820e-01
+4.581812620162963867e-01 -4.004166126251220703e-01 5.525112152099609375e-03
+-2.281215190887451172e-01 -1.310509443283081055e-01 -1.401650905609130859e-01
+-2.425242662429809570e-01 1.733251810073852539e-01 -4.973032474517822266e-01
+-1.258821487426757812e-01 -4.724828004837036133e-01 -5.991733074188232422e-02
+4.821944236755371094e-01 -1.722755432128906250e-01 -2.475099563598632812e-01
+2.750682830810546875e-02 4.665797948837280273e-01 4.664119482040405273e-01
+-3.053290843963623047e-01 -3.777220249176025391e-01 2.397775650024414062e-03
+2.908480167388916016e-01 -1.594734191894531250e-01 2.671622037887573242e-01

From d39eca955ce9a2501659bdb50038340d4f9c1ba6 Mon Sep 17 00:00:00 2001
From: conradd3 <conradd3@illinois.edu>
Date: Mon, 18 Aug 2025 14:33:05 -0500
Subject: [PATCH 05/30] Reorganization for PR

---
 examples/3D_ibm_sphere_periodic/case.py      | 107 ++++++
 runs/3d_1sphere_filtering/case.py            |   2 +-
 runs/3d_1sphere_periodic/case.py             |   2 +-
 runs/phi01/case.py                           |   2 +-
 src/common/m_mpi_common.fpp                  |  20 +-
 src/post_process/m_data_input.f90            | 108 +++---
 src/post_process/m_start_up.f90              |  12 +-
 src/simulation/m_additional_forcing.fpp      |  16 +-
 src/simulation/m_checker.fpp                 |   2 +-
 src/simulation/m_compute_particle_forces.fpp |   6 +-
 src/simulation/m_data_output.fpp             |  26 +-
 src/simulation/m_global_parameters.fpp       |  10 +-
 src/simulation/m_mpi_proxy.fpp               |   2 +-
 src/simulation/m_rhs.fpp                     |  40 +--
 src/simulation/m_start_up.fpp                |  23 +-
 src/simulation/m_time_steppers.fpp           | 200 +++--------
 src/simulation/m_volume_filtering.fpp        | 357 +++++++++++++------
 src/simulation/p_main.fpp                    |   3 +-
 toolchain/mfc/run/case_dicts.py              |   2 +-
 19 files changed, 552 insertions(+), 388 deletions(-)
 create mode 100644 examples/3D_ibm_sphere_periodic/case.py

diff --git a/examples/3D_ibm_sphere_periodic/case.py b/examples/3D_ibm_sphere_periodic/case.py
new file mode 100644
index 0000000000..41938f69fd
--- /dev/null
+++ b/examples/3D_ibm_sphere_periodic/case.py
@@ -0,0 +1,107 @@
+import json
+import math
+
+Mu = 1.84e-05
+gam_a = 1.4
+
+D = 0.1
+
+# Configuring case dictionary
+print(
+    json.dumps(
+        {
+            # Logistics
+            "run_time_info": "T",
+            # Computational Domain Parameters
+            # x direction
+            "x_domain%beg": -5 * D,
+            "x_domain%end": 5.0 * D,
+            # y direction
+            "y_domain%beg": -2.5 * D,
+            "y_domain%end": 2.5 * D,
+            # z direction
+            "z_domain%beg": -2.5 * D,
+            "z_domain%end": 2.5 * D,
+            "cyl_coord": "F",
+            "m": 99,
+            "n": 99,
+            "p": 99,
+            "dt": 1.0e-6,
+            "t_step_start": 0,
+            "t_step_stop": 200,  # 3000
+            "t_step_save": 10,  # 10
+            # Simulation Algorithm Parameters
+            # Only one patch is necessary: the air filling the domain
+            "num_patches": 1,
+            # Use the 5 equation model
+            "model_eqns": 2,
+            # Single fluid: the K \div(u) term (alt_soundspeed) is not needed
+            "alt_soundspeed": "F",
+            # One fluid: air
+            "num_fluids": 1,
+            # Do not apply the mixture physical parameter limits (mpp_lim)
+            "mpp_lim": "F",
+            # Correct errors when computing speed of sound
+            "mixture_err": "T",
+            # Use TVD RK3 for time marching
+            "time_stepper": 3,
+            # Reconstruct the primitive variables to minimize spurious oscillations
+            # Use WENO5
+            "weno_order": 5,
+            "weno_eps": 1.0e-16,
+            "weno_Re_flux": "T",
+            "weno_avg": "T",
+            "avg_state": 2,
+            "mapped_weno": "T",
+            "null_weights": "F",
+            "mp_weno": "T",
+            "riemann_solver": 2,
+            "wave_speeds": 1,
+            # Periodic BCs
+            "bc_x%beg": -1,
+            "bc_x%end": -1,
+            "bc_y%beg": -1,
+            "bc_y%end": -1,
+            "bc_z%beg": -1,
+            "bc_z%end": -1,
+            # Set IB to True and add 1 patch
+            "ib": "T",
+            "num_ibs": 1,
+            "viscous": "T",
+            # Formatted Database Files Structure Parameters
+            "format": 1,
+            "precision": 2,
+            "prim_vars_wrt": "T",
+            "E_wrt": "T",
+            "parallel_io": "T",
+            # Patch 1: uniform air filling the whole computational domain
+            # Box-shaped patch; the lengths below equal the domain extents
+            "patch_icpp(1)%geometry": 9,
+            "patch_icpp(1)%x_centroid": 0.0,
+            # Uniform medium density, centroid is at the center of the domain
+            "patch_icpp(1)%y_centroid": 0.0,
+            "patch_icpp(1)%z_centroid": 0.0,
+            "patch_icpp(1)%length_x": 10 * D,
+            "patch_icpp(1)%length_y": 5 * D,
+            "patch_icpp(1)%length_z": 5 * D,
+            # Specify the patch primitive variables
+            "patch_icpp(1)%vel(1)": 527.2e00,
+            "patch_icpp(1)%vel(2)": 0.0e00,
+            "patch_icpp(1)%vel(3)": 0.0e00,
+            "patch_icpp(1)%pres": 10918.2549,
+            "patch_icpp(1)%alpha_rho(1)": 0.2199,
+            "patch_icpp(1)%alpha(1)": 1.0e00,
+            # Patch: Sphere Immersed Boundary
+            "patch_ib(1)%geometry": 8,
+            "patch_ib(1)%x_centroid": -3.0e-3,
+            "patch_ib(1)%y_centroid": 0.0,
+            "patch_ib(1)%z_centroid": 0.0,
+            "patch_ib(1)%radius": D / 2,
+            "patch_ib(1)%slip": "T",
+            # Fluids Physical Parameters
+            "fluid_pp(1)%gamma": 1.0e00 / (gam_a - 1.0e00),  # 2.50(Not 1.40)
+            "fluid_pp(1)%pi_inf": 0,
+            "fluid_pp(1)%Re(1)": 7535533.2,
+        }
+    )
+)
diff --git a/runs/3d_1sphere_filtering/case.py b/runs/3d_1sphere_filtering/case.py
index 9a8c4b1f4c..0964ea5dd4 100644
--- a/runs/3d_1sphere_filtering/case.py
+++ b/runs/3d_1sphere_filtering/case.py
@@ -138,7 +138,7 @@
     "periodic_forcing": "T",
     "periodic_ibs": "T",
     "compute_CD": "F",
-    "fourier_transform_filtering": "T",
+    "volume_filtering_momentum_eqn": "T",
 
     "u_inf_ref": v1,
     "rho_inf_ref": rho,
diff --git a/runs/3d_1sphere_periodic/case.py b/runs/3d_1sphere_periodic/case.py
index 857841ad0c..f4512b5f00 100644
--- a/runs/3d_1sphere_periodic/case.py
+++ b/runs/3d_1sphere_periodic/case.py
@@ -138,7 +138,7 @@
     "periodic_ibs": "T",
     #"compute_CD_vi": "F",
     #"compute_CD_si": "F",
-    #"fourier_transform_filtering": "T",
+    #"volume_filtering_momentum_eqn": "T",
 
     "u_inf_ref": v1,
     "rho_inf_ref": rho,
diff --git a/runs/phi01/case.py b/runs/phi01/case.py
index 56390a1943..8e7a5bff4b 100644
--- a/runs/phi01/case.py
+++ b/runs/phi01/case.py
@@ -143,7 +143,7 @@
     "periodic_forcing": "T",
     "periodic_ibs": "T",
     "compute_CD": "F",
-    "fourier_transform_filtering": "T",
+    "volume_filtering_momentum_eqn": "T",
 
     "u_inf_ref": v1,
     "rho_inf_ref": rho,
diff --git a/src/common/m_mpi_common.fpp b/src/common/m_mpi_common.fpp
index 8214120fe7..662d096665 100644
--- a/src/common/m_mpi_common.fpp
+++ b/src/common/m_mpi_common.fpp
@@ -101,7 +101,7 @@ contains
         allocate (buff_recv(0:ubound(buff_send, 1)))
 
 #ifdef MFC_SIMULATION
-        if (fourier_transform_filtering) then
+        if (volume_filtering_momentum_eqn) then
             @:ALLOCATE(buff_send_scalarfield(0:-1 + buff_size*1* &
                                      & (m + 2*buff_size + 1)* &
                                      & (n + 2*buff_size + 1)* &
@@ -153,7 +153,7 @@ contains
     !! @param levelset closest distance from every cell to the IB
     !! @param levelset_norm normalized vector from every cell to the closest point to the IB
     !! @param beta Eulerian void fraction from lagrangian bubbles
-    subroutine s_initialize_mpi_data(q_cons_vf, ib_markers, levelset, levelset_norm, beta, R_u_stat, R_mu_stat, F_IMET_stat)
+    subroutine s_initialize_mpi_data(q_cons_vf, ib_markers, levelset, levelset_norm, beta, stat_reynolds_stress, stat_eff_visc, stat_int_mom_exch)
 
         type(scalar_field), &
             dimension(sys_size), &
@@ -174,9 +174,9 @@ contains
         type(scalar_field), &
             intent(in), optional :: beta
 
-        type(scalar_field), dimension(2:4), intent(in), optional :: R_u_stat
-        type(scalar_field), dimension(2:4), intent(in), optional :: R_mu_stat
-        type(scalar_field), dimension(2:4), intent(in), optional :: F_IMET_stat
+        type(scalar_field), dimension(2:4), intent(in), optional :: stat_reynolds_stress
+        type(scalar_field), dimension(2:4), intent(in), optional :: stat_eff_visc
+        type(scalar_field), dimension(2:4), intent(in), optional :: stat_int_mom_exch
 
         integer, dimension(num_dims) :: sizes_glb, sizes_loc
         integer, dimension(1) :: airfoil_glb, airfoil_loc, airfoil_start
@@ -191,7 +191,7 @@ contains
 
         if (present(beta)) then
             alt_sys = sys_size + 1
-        else if (present(R_u_stat) .and. present(R_mu_stat) .and. present(F_IMET_stat)) then
+        else if (present(stat_reynolds_stress) .and. present(stat_eff_visc) .and. present(stat_int_mom_exch)) then
             alt_sys = sys_size + 9
         else
             alt_sys = sys_size
@@ -201,15 +201,15 @@ contains
             MPI_IO_DATA%var(i)%sf => q_cons_vf(i)%sf(0:m, 0:n, 0:p)
         end do
         
-        if (present(R_u_stat) .and. present(R_mu_stat) .and. present(F_IMET_stat)) then 
+        if (present(stat_reynolds_stress) .and. present(stat_eff_visc) .and. present(stat_int_mom_exch)) then 
             do i = sys_size+1, sys_size+3
-                MPI_IO_DATA%var(i)%sf => R_u_stat(i-sys_size+1)%sf(0:m, 0:n, 0:p)
+                MPI_IO_DATA%var(i)%sf => stat_reynolds_stress(i-sys_size+1)%sf(0:m, 0:n, 0:p)
             end do
             do i = sys_size+4, sys_size+6
-                MPI_IO_DATA%var(i)%sf => R_mu_stat(i-sys_size-2)%sf(0:m, 0:n, 0:p)
+                MPI_IO_DATA%var(i)%sf => stat_eff_visc(i-sys_size-2)%sf(0:m, 0:n, 0:p)
             end do
             do i = sys_size+7, sys_size+9 
-                MPI_IO_DATA%var(i)%sf => F_IMET_stat(i-sys_size-5)%sf(0:m, 0:n, 0:p)
+                MPI_IO_DATA%var(i)%sf => stat_int_mom_exch(i-sys_size-5)%sf(0:m, 0:n, 0:p)
             end do 
         end if
 
diff --git a/src/post_process/m_data_input.f90 b/src/post_process/m_data_input.f90
index 69b13707b1..1efc1b97d3 100644
--- a/src/post_process/m_data_input.f90
+++ b/src/post_process/m_data_input.f90
@@ -61,9 +61,9 @@ end subroutine s_read_abstract_data_files
     ! type(scalar_field), public :: ib_markers !<
     type(integer_field), public :: ib_markers
 
-    type(scalar_field), allocatable, dimension(:), public :: R_u_stat
-    type(scalar_field), allocatable, dimension(:), public :: R_mu_stat
-    type(scalar_field), allocatable, dimension(:), public :: F_IMET_stat
+    type(scalar_field), allocatable, dimension(:), public :: stat_reynolds_stress
+    type(scalar_field), allocatable, dimension(:), public :: stat_eff_visc
+    type(scalar_field), allocatable, dimension(:), public :: stat_int_mom_exch
 
     procedure(s_read_abstract_data_files), pointer :: s_read_data_files => null()
 
@@ -463,9 +463,9 @@ subroutine s_read_parallel_data_files(t_step)
                 if (ib) then
                     if (q_filtered_wrt) then
                         call s_initialize_mpi_data(q_cons_vf, ib_markers, &
-                                                   R_u_stat=R_u_stat, & 
-                                                   R_mu_stat=R_mu_stat, & 
-                                                   F_IMET_stat=F_IMET_stat)
+                                                   stat_reynolds_stress=stat_reynolds_stress, & 
+                                                   stat_eff_visc=stat_eff_visc, & 
+                                                   stat_int_mom_exch=stat_int_mom_exch)
                     else 
                         call s_initialize_mpi_data(q_cons_vf, ib_markers)
                     end if
@@ -1345,12 +1345,12 @@ subroutine s_populate_filtered_variables_buffer_regions(q_particle)
                         q_particle%sf((m + 1) - j, 0:n, 0:p)
                 else
                     do i = 2, 4
-                        R_u_stat(i)%sf(-j, 0:n, 0:p) = &
-                            R_u_stat(i)%sf((m + 1) - j, 0:n, 0:p)
-                        R_mu_stat(i)%sf(-j, 0:n, 0:p) = &
-                            R_mu_stat(i)%sf((m + 1) - j, 0:n, 0:p)
-                        F_IMET_stat(i)%sf(-j, 0:n, 0:p) = &
-                            F_IMET_stat(i)%sf((m + 1) - j, 0:n, 0:p)
+                        stat_reynolds_stress(i)%sf(-j, 0:n, 0:p) = &
+                            stat_reynolds_stress(i)%sf((m + 1) - j, 0:n, 0:p)
+                        stat_eff_visc(i)%sf(-j, 0:n, 0:p) = &
+                            stat_eff_visc(i)%sf((m + 1) - j, 0:n, 0:p)
+                        stat_int_mom_exch(i)%sf(-j, 0:n, 0:p) = &
+                            stat_int_mom_exch(i)%sf((m + 1) - j, 0:n, 0:p)
                     end do
                 end if
             end do
@@ -1376,12 +1376,12 @@ subroutine s_populate_filtered_variables_buffer_regions(q_particle)
                         q_particle%sf(j - 1, 0:n, 0:p)
                 else
                     do i = 2, 4
-                        R_u_stat(i)%sf(m + j, 0:n, 0:p) = &
-                            R_u_stat(i)%sf(j - 1, 0:n, 0:p)
-                        R_mu_stat(i)%sf(m + j, 0:n, 0:p) = &
-                            R_mu_stat(i)%sf(j - 1, 0:n, 0:p)
-                        F_IMET_stat(i)%sf(m + j, 0:n, 0:p) = &
-                            F_IMET_stat(i)%sf(j - 1, 0:n, 0:p)
+                        stat_reynolds_stress(i)%sf(m + j, 0:n, 0:p) = &
+                            stat_reynolds_stress(i)%sf(j - 1, 0:n, 0:p)
+                        stat_eff_visc(i)%sf(m + j, 0:n, 0:p) = &
+                            stat_eff_visc(i)%sf(j - 1, 0:n, 0:p)
+                        stat_int_mom_exch(i)%sf(m + j, 0:n, 0:p) = &
+                            stat_int_mom_exch(i)%sf(j - 1, 0:n, 0:p)
                     end do
                 end if
             end do
@@ -1414,12 +1414,12 @@ subroutine s_populate_filtered_variables_buffer_regions(q_particle)
                             q_particle%sf(:, (n + 1) - j, 0:p)
                     else
                         do i = 2, 4
-                            R_u_stat(i)%sf(:, -j, 0:p) = &
-                                R_u_stat(i)%sf(:, (n + 1) - j, 0:p)
-                            R_mu_stat(i)%sf(:, -j, 0:p) = &
-                                R_mu_stat(i)%sf(:, (n + 1) - j, 0:p)
-                            F_IMET_stat(i)%sf(:, -j, 0:p) = &
-                                F_IMET_stat(i)%sf(:, (n + 1) - j, 0:p)
+                            stat_reynolds_stress(i)%sf(:, -j, 0:p) = &
+                                stat_reynolds_stress(i)%sf(:, (n + 1) - j, 0:p)
+                            stat_eff_visc(i)%sf(:, -j, 0:p) = &
+                                stat_eff_visc(i)%sf(:, (n + 1) - j, 0:p)
+                            stat_int_mom_exch(i)%sf(:, -j, 0:p) = &
+                                stat_int_mom_exch(i)%sf(:, (n + 1) - j, 0:p)
                         end do
                     end if
                 end do
@@ -1445,12 +1445,12 @@ subroutine s_populate_filtered_variables_buffer_regions(q_particle)
                             q_particle%sf(:, j - 1, 0:p)
                     else
                         do i = 2, 4
-                            R_u_stat(i)%sf(:, n + j, 0:p) = &
-                                R_u_stat(i)%sf(:, j - 1, 0:p)
-                            R_mu_stat(i)%sf(:, n + j, 0:p) = &
-                                R_mu_stat(i)%sf(:, j - 1, 0:p)
-                            F_IMET_stat(i)%sf(:, n + j, 0:p) = &
-                                F_IMET_stat(i)%sf(:, j - 1, 0:p)
+                            stat_reynolds_stress(i)%sf(:, n + j, 0:p) = &
+                                stat_reynolds_stress(i)%sf(:, j - 1, 0:p)
+                            stat_eff_visc(i)%sf(:, n + j, 0:p) = &
+                                stat_eff_visc(i)%sf(:, j - 1, 0:p)
+                            stat_int_mom_exch(i)%sf(:, n + j, 0:p) = &
+                                stat_int_mom_exch(i)%sf(:, j - 1, 0:p)
                         end do
                     end if
                 end do
@@ -1483,12 +1483,12 @@ subroutine s_populate_filtered_variables_buffer_regions(q_particle)
                                 q_particle%sf(:, :, (p + 1) - j)
                         else
                             do i = 2, 4
-                                R_u_stat(i)%sf(:, :, -j) = &
-                                    R_u_stat(i)%sf(:, :, (p + 1) - j)
-                                R_mu_stat(i)%sf(:, :, -j) = &
-                                    R_mu_stat(i)%sf(:, :, (p + 1) - j)
-                                F_IMET_stat(i)%sf(:, :, -j) = &
-                                    F_IMET_stat(i)%sf(:, :, (p + 1) - j)
+                                stat_reynolds_stress(i)%sf(:, :, -j) = &
+                                    stat_reynolds_stress(i)%sf(:, :, (p + 1) - j)
+                                stat_eff_visc(i)%sf(:, :, -j) = &
+                                    stat_eff_visc(i)%sf(:, :, (p + 1) - j)
+                                stat_int_mom_exch(i)%sf(:, :, -j) = &
+                                    stat_int_mom_exch(i)%sf(:, :, (p + 1) - j)
                             end do
                         end if
                     end do
@@ -1515,12 +1515,12 @@ subroutine s_populate_filtered_variables_buffer_regions(q_particle)
                                 q_particle%sf(:, :, j - 1)
                         else
                             do i = 2, 4
-                                R_u_stat(i)%sf(:, :, p + j) = &
-                                    R_u_stat(i)%sf(:, :, j - 1)
-                                R_mu_stat(i)%sf(:, :, p + j) = &
-                                    R_mu_stat(i)%sf(:, :, j - 1)
-                                F_IMET_stat(i)%sf(:, :, p + j) = &
-                                    F_IMET_stat(i)%sf(:, :, j - 1)
+                                stat_reynolds_stress(i)%sf(:, :, p + j) = &
+                                    stat_reynolds_stress(i)%sf(:, :, j - 1)
+                                stat_eff_visc(i)%sf(:, :, p + j) = &
+                                    stat_eff_visc(i)%sf(:, :, j - 1)
+                                stat_int_mom_exch(i)%sf(:, :, p + j) = &
+                                    stat_int_mom_exch(i)%sf(:, :, j - 1)
                             end do
                         end if
                     end do
@@ -1559,9 +1559,9 @@ subroutine s_initialize_data_input_module
         allocate (q_prim_vf(1:sys_size))
         if (bubbles_lagrange) allocate (q_particle(1))
 
-        if (q_filtered_wrt) allocate (R_u_stat(2:4))
-        if (q_filtered_wrt) allocate (R_mu_stat(2:4))
-        if (q_filtered_wrt) allocate (F_IMET_stat(2:4))
+        if (q_filtered_wrt) allocate (stat_reynolds_stress(2:4))
+        if (q_filtered_wrt) allocate (stat_eff_visc(2:4))
+        if (q_filtered_wrt) allocate (stat_int_mom_exch(2:4))
 
         ! Allocating the parts of the conservative and primitive variables
         ! that do require the direct knowledge of the dimensionality of the
@@ -1602,13 +1602,13 @@ subroutine s_initialize_data_input_module
 
                 if (q_filtered_wrt) then
                     do i = 2, 4
-                        allocate (R_u_stat(i)%sf(-buff_size:m + buff_size, &
+                        allocate (stat_reynolds_stress(i)%sf(-buff_size:m + buff_size, &
                                                      -buff_size:n + buff_size, &
                                                      -buff_size:p + buff_size))
-                        allocate (R_mu_stat(i)%sf(-buff_size:m + buff_size, &
+                        allocate (stat_eff_visc(i)%sf(-buff_size:m + buff_size, &
                                                      -buff_size:n + buff_size, &
                                                      -buff_size:p + buff_size))
-                        allocate (F_IMET_stat(i)%sf(-buff_size:m + buff_size, &
+                        allocate (stat_int_mom_exch(i)%sf(-buff_size:m + buff_size, &
                                                      -buff_size:n + buff_size, &
                                                      -buff_size:p + buff_size))
                     end do
@@ -1708,17 +1708,17 @@ subroutine s_finalize_data_input_module
 
         if (q_filtered_wrt) then 
             do i = 2, 4 
-                deallocate (R_u_stat(i)%sf)
+                deallocate (stat_reynolds_stress(i)%sf)
             end do 
-            deallocate(R_u_stat)
+            deallocate(stat_reynolds_stress)
             do i = 2, 4 
-                deallocate (R_mu_stat(i)%sf)
+                deallocate (stat_eff_visc(i)%sf)
             end do 
-            deallocate(R_mu_stat)
+            deallocate(stat_eff_visc)
             do i = 2, 4 
-                deallocate (F_IMET_stat(i)%sf)
+                deallocate (stat_int_mom_exch(i)%sf)
             end do 
-            deallocate(F_IMET_stat)
+            deallocate(stat_int_mom_exch)
         end if
 
         s_read_data_files => null()
diff --git a/src/post_process/m_start_up.f90 b/src/post_process/m_start_up.f90
index 8a61c72e07..b454764c3e 100644
--- a/src/post_process/m_start_up.f90
+++ b/src/post_process/m_start_up.f90
@@ -329,22 +329,22 @@ subroutine s_save_data(t_step, varname, pres, c, H)
         if (q_filtered_wrt) then
             ! filtered cons vars
             do i = 2, 4
-                q_sf = R_u_stat(i)%sf(x_beg:x_end, y_beg:y_end, z_beg:z_end)
-                write (varname, '(A,I0)') 'R_u_stats', i
+                q_sf = stat_reynolds_stress(i)%sf(x_beg:x_end, y_beg:y_end, z_beg:z_end)
+                write (varname, '(A,I0)') 'stat_reynolds_stresss', i
                 call s_write_variable_to_formatted_database_file(varname, t_step)
 
                 varname(:) = ' '
             end do
             do i = 2, 4
-                q_sf = R_mu_stat(i)%sf(x_beg:x_end, y_beg:y_end, z_beg:z_end)
-                write (varname, '(A,I0)') 'R_mu_stats', i
+                q_sf = stat_eff_visc(i)%sf(x_beg:x_end, y_beg:y_end, z_beg:z_end)
+                write (varname, '(A,I0)') 'stat_eff_viscs', i
                 call s_write_variable_to_formatted_database_file(varname, t_step)
 
                 varname(:) = ' '
             end do
             do i = 2, 4
-                q_sf = F_IMET_stat(i)%sf(x_beg:x_end, y_beg:y_end, z_beg:z_end)
-                write (varname, '(A,I0)') 'F_IMET_stats', i
+                q_sf = stat_int_mom_exch(i)%sf(x_beg:x_end, y_beg:y_end, z_beg:z_end)
+                write (varname, '(A,I0)') 'stat_int_mom_exchs', i
                 call s_write_variable_to_formatted_database_file(varname, t_step)
 
                 varname(:) = ' '
diff --git a/src/simulation/m_additional_forcing.fpp b/src/simulation/m_additional_forcing.fpp
index 2971ba0e9a..cc90cce4ef 100644
--- a/src/simulation/m_additional_forcing.fpp
+++ b/src/simulation/m_additional_forcing.fpp
@@ -57,9 +57,9 @@ contains
         do i = 0, m
             do j = 0, n
                 do k = 0, p
-                    rhs_vf(1)%sf(i, j, k) = rhs_vf(1)%sf(i, j, k) + q_periodic_force(7)%sf(i, j, k) * fluid_indicator_function_I%sf(i, j, k) ! continuity
-                    rhs_vf(2)%sf(i, j, k) = rhs_vf(2)%sf(i, j, k) + q_periodic_force(1)%sf(i, j, k) * fluid_indicator_function_I%sf(i, j, k) * fluid_indicator_function_I%sf(i, j, k) ! x momentum
-                    rhs_vf(5)%sf(i, j, k) = rhs_vf(5)%sf(i, j, k) + (q_periodic_force(4)%sf(i, j, k) + q_periodic_force(8)%sf(i, j, k)) * fluid_indicator_function_I%sf(i, j, k) ! energy
+                    rhs_vf(1)%sf(i, j, k) = rhs_vf(1)%sf(i, j, k) + q_periodic_force(7)%sf(i, j, k) * fluid_indicator_function%sf(i, j, k) ! continuity
+                    rhs_vf(2)%sf(i, j, k) = rhs_vf(2)%sf(i, j, k) + q_periodic_force(1)%sf(i, j, k) * fluid_indicator_function%sf(i, j, k) * fluid_indicator_function%sf(i, j, k) ! x momentum
+                    rhs_vf(5)%sf(i, j, k) = rhs_vf(5)%sf(i, j, k) + (q_periodic_force(4)%sf(i, j, k) + q_periodic_force(8)%sf(i, j, k)) * fluid_indicator_function%sf(i, j, k) ! energy
                 end do
             end do
         end do
@@ -80,15 +80,15 @@ contains
         do i = 0, m 
             do j = 0, n 
                 do k = 0, p 
-                    q_spatial_avg(4) = q_spatial_avg(4) + q_cons_vf(1)%sf(i, j, k) * fluid_indicator_function_I%sf(i, j, k)
+                    q_spatial_avg(4) = q_spatial_avg(4) + q_cons_vf(1)%sf(i, j, k) * fluid_indicator_function%sf(i, j, k)
                     q_spatial_avg(5) = q_spatial_avg(5) + (0.4_wp/287._wp * (q_cons_vf(5)%sf(i, j, k)/q_cons_vf(1)%sf(i, j, k) & 
                                         - 0.5_wp * ((q_cons_vf(2)%sf(i, j, k)/q_cons_vf(1)%sf(i, j, k))**2 & 
                                         + (q_cons_vf(3)%sf(i, j, k)/q_cons_vf(1)%sf(i, j, k))**2 & 
-                                        + (q_cons_vf(4)%sf(i, j, k)/q_cons_vf(1)%sf(i, j, k))**2))) * fluid_indicator_function_I%sf(i, j, k)
+                                        + (q_cons_vf(4)%sf(i, j, k)/q_cons_vf(1)%sf(i, j, k))**2))) * fluid_indicator_function%sf(i, j, k)
                                         
-                    q_spatial_avg(1) = q_spatial_avg(1) + (q_cons_vf(2)%sf(i, j, k)) * fluid_indicator_function_I%sf(i, j, k)
-                    q_spatial_avg(2) = q_spatial_avg(2) + (q_cons_vf(3)%sf(i, j, k)) * fluid_indicator_function_I%sf(i, j, k)
-                    q_spatial_avg(3) = q_spatial_avg(3) + (q_cons_vf(4)%sf(i, j, k)) * fluid_indicator_function_I%sf(i, j, k)
+                    q_spatial_avg(1) = q_spatial_avg(1) + (q_cons_vf(2)%sf(i, j, k)) * fluid_indicator_function%sf(i, j, k)
+                    q_spatial_avg(2) = q_spatial_avg(2) + (q_cons_vf(3)%sf(i, j, k)) * fluid_indicator_function%sf(i, j, k)
+                    q_spatial_avg(3) = q_spatial_avg(3) + (q_cons_vf(4)%sf(i, j, k)) * fluid_indicator_function%sf(i, j, k)
                 end do
             end do
         end do
diff --git a/src/simulation/m_checker.fpp b/src/simulation/m_checker.fpp
index 04c1076f2a..d0c2c278ec 100644
--- a/src/simulation/m_checker.fpp
+++ b/src/simulation/m_checker.fpp
@@ -350,7 +350,7 @@ contains
             #:for BOUND in ['beg', 'end']
                 @:PROHIBIT(periodic_forcing .and. bc_${X}$%${BOUND}$ /= BC_PERIODIC, &
                     "Periodic forcing requires all BCs to be periodic")
-                @:PROHIBIT(fourier_transform_filtering .and. bc_${X}$%${BOUND}$ /= BC_PERIODIC, &
+                @:PROHIBIT(volume_filtering_momentum_eqn .and. bc_${X}$%${BOUND}$ /= BC_PERIODIC, &
                     "Explicit filtering of flow data requires all BCs to be periodic due to fourier transform")
             #:endfor
         #:endfor
diff --git a/src/simulation/m_compute_particle_forces.fpp b/src/simulation/m_compute_particle_forces.fpp
index fd84657f5f..8a1ef5f092 100644
--- a/src/simulation/m_compute_particle_forces.fpp
+++ b/src/simulation/m_compute_particle_forces.fpp
@@ -27,8 +27,8 @@ contains
 
     end subroutine s_initialize_particle_forces_module
 
-    subroutine s_compute_drag_coefficient(div_pres_visc_stress)
-        type(scalar_field), dimension(momxb:momxe), intent(in) :: div_pres_visc_stress
+    subroutine s_compute_drag_coefficient(pres_visc_stress)
+        type(scalar_field), dimension(momxb:momxe), intent(in) :: pres_visc_stress
         real(wp), dimension(0:num_ibs) :: FD_global
         real(wp) :: drag_coeff 
         integer :: i, j, k
@@ -44,7 +44,7 @@ contains
                 do k = 0, p  
                     !$acc atomic
                     FD_calc(ib_markers%sf(i, j, k)) = FD_calc(ib_markers%sf(i, j, k)) & 
-                                                    + div_pres_visc_stress(momxb)%sf(i, j, k) * dx(i) * dy(j) * dz(k)
+                                                    + pres_visc_stress(momxb)%sf(i, j, k) * dx(i) * dy(j) * dz(k)
                 end do 
             end do 
         end do
diff --git a/src/simulation/m_data_output.fpp b/src/simulation/m_data_output.fpp
index 63b8fa2b32..f43cebc798 100644
--- a/src/simulation/m_data_output.fpp
+++ b/src/simulation/m_data_output.fpp
@@ -76,7 +76,7 @@ contains
         !! @param q_cons_vf Conservative variables
         !! @param q_prim_vf Primitive variables
         !! @param t_step Current time step
-    subroutine s_write_data_files(q_cons_vf, q_T_sf, q_prim_vf, t_step, beta, R_u_stat, R_mu_stat, F_IMET_stat)
+    subroutine s_write_data_files(q_cons_vf, q_T_sf, q_prim_vf, t_step, beta, stat_reynolds_stress, stat_eff_visc, stat_int_mom_exch)
 
         type(scalar_field), &
             dimension(sys_size), &
@@ -94,14 +94,14 @@ contains
         type(scalar_field), &
             intent(inout), optional :: beta
 
-        type(scalar_field), dimension(2:4), intent(inout), optional :: R_u_stat
-        type(scalar_field), dimension(2:4), intent(inout), optional :: R_mu_stat
-        type(scalar_field), dimension(2:4), intent(inout), optional :: F_IMET_stat
+        type(scalar_field), dimension(2:4), intent(inout), optional :: stat_reynolds_stress
+        type(scalar_field), dimension(2:4), intent(inout), optional :: stat_eff_visc
+        type(scalar_field), dimension(2:4), intent(inout), optional :: stat_int_mom_exch
 
         if (.not. parallel_io) then
             call s_write_serial_data_files(q_cons_vf, q_T_sf, q_prim_vf, t_step, beta)
         else
-            call s_write_parallel_data_files(q_cons_vf, q_prim_vf, t_step, beta, R_u_stat, R_mu_stat, F_IMET_stat)
+            call s_write_parallel_data_files(q_cons_vf, q_prim_vf, t_step, beta, stat_reynolds_stress, stat_eff_visc, stat_int_mom_exch)
         end if
 
     end subroutine s_write_data_files
@@ -790,15 +790,15 @@ contains
         !!  @param q_prim_vf Cell-average primitive variables
         !!  @param t_step Current time-step
         !!  @param beta Eulerian void fraction from lagrangian bubbles
-    subroutine s_write_parallel_data_files(q_cons_vf, q_prim_vf, t_step, beta, R_u_stat, R_mu_stat, F_IMET_stat)
+    subroutine s_write_parallel_data_files(q_cons_vf, q_prim_vf, t_step, beta, stat_reynolds_stress, stat_eff_visc, stat_int_mom_exch)
 
         type(scalar_field), dimension(sys_size), intent(in) :: q_cons_vf
         type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf
         integer, intent(in) :: t_step
         type(scalar_field), intent(inout), optional :: beta
-        type(scalar_field), dimension(2:4), intent(inout), optional :: R_u_stat
-        type(scalar_field), dimension(2:4), intent(inout), optional :: R_mu_stat
-        type(scalar_field), dimension(2:4), intent(inout), optional :: F_IMET_stat
+        type(scalar_field), dimension(2:4), intent(inout), optional :: stat_reynolds_stress
+        type(scalar_field), dimension(2:4), intent(inout), optional :: stat_eff_visc
+        type(scalar_field), dimension(2:4), intent(inout), optional :: stat_int_mom_exch
 
 #ifdef MFC_MPI
 
@@ -820,7 +820,7 @@ contains
 
         if (present(beta)) then
             alt_sys = sys_size + 1
-        else if (present(R_u_stat) .and. present(R_mu_stat) .and. present(F_IMET_stat)) then
+        else if (present(stat_reynolds_stress) .and. present(stat_eff_visc) .and. present(stat_int_mom_exch)) then
             alt_sys = sys_size + 9
         else
             alt_sys = sys_size
@@ -905,9 +905,9 @@ contains
             ! Initialize MPI data I/O
 
             if (ib) then
-                if (present(R_u_stat) .and. present(R_mu_stat) .and. present(F_IMET_stat)) then
+                if (present(stat_reynolds_stress) .and. present(stat_eff_visc) .and. present(stat_int_mom_exch)) then
                     call s_initialize_mpi_data(q_cons_vf, ib_markers, levelset, levelset_norm, & 
-                                               R_u_stat=R_u_stat, R_mu_stat=R_mu_stat, F_IMET_stat=F_IMET_stat)
+                                               stat_reynolds_stress=stat_reynolds_stress, stat_eff_visc=stat_eff_visc, stat_int_mom_exch=stat_int_mom_exch)
                 else
                     call s_initialize_mpi_data(q_cons_vf, ib_markers, levelset, levelset_norm)
                 end if
@@ -965,7 +965,7 @@ contains
                                                 mpi_p, status, ierr)
                     end do
                 end if
-            else if (fourier_transform_filtering) then
+            else if (volume_filtering_momentum_eqn) then
                 do i = 1, alt_sys
                     var_MOK = int(i, MPI_OFFSET_KIND)
 
diff --git a/src/simulation/m_global_parameters.fpp b/src/simulation/m_global_parameters.fpp
index eaea7c04e3..a71e17a69d 100644
--- a/src/simulation/m_global_parameters.fpp
+++ b/src/simulation/m_global_parameters.fpp
@@ -508,7 +508,7 @@ module m_global_parameters
     real(wp) :: rho_inf_ref !< reference freestream density 
     real(wp) :: T_inf_ref !< reference freestream temperature
     logical :: periodic_forcing
-    logical :: fourier_transform_filtering
+    logical :: volume_filtering_momentum_eqn
     logical :: store_levelset
     logical :: slab_domain_decomposition
     logical :: compute_autocorrelation
@@ -797,7 +797,7 @@ contains
         rho_inf_ref = dflt_real
         T_inf_ref = dflt_real
         periodic_forcing = .false.
-        fourier_transform_filtering = .false.
+        volume_filtering_momentum_eqn = .false.
         store_levelset = .true.
         slab_domain_decomposition = .false.
         compute_autocorrelation = .false.
@@ -1155,7 +1155,7 @@ contains
         elseif (bubbles_lagrange) then
             allocate (MPI_IO_DATA%view(1:sys_size + 1))
             allocate (MPI_IO_DATA%var(1:sys_size + 1))
-        else if (fourier_transform_filtering) then 
+        else if (volume_filtering_momentum_eqn) then 
             allocate (MPI_IO_DATA%view(1:sys_size+9))
             allocate (MPI_IO_DATA%var(1:sys_size+9))
         else
@@ -1177,7 +1177,7 @@ contains
                 allocate (MPI_IO_DATA%var(i)%sf(0:m, 0:n, 0:p))
                 MPI_IO_DATA%var(i)%sf => null()
             end do
-        else if (fourier_transform_filtering) then 
+        else if (volume_filtering_momentum_eqn) then 
             do i = sys_size+1, sys_size+9
                 allocate (MPI_IO_DATA%var(i)%sf(0:m, 0:n, 0:p))
                 MPI_IO_DATA%var(i)%sf => null()
@@ -1354,7 +1354,7 @@ contains
                 do i = 1, sys_size + 1
                     MPI_IO_DATA%var(i)%sf => null()
                 end do
-            else if (fourier_transform_filtering) then 
+            else if (volume_filtering_momentum_eqn) then 
                 do i = 1, sys_size+9
                     MPI_IO_DATA%var(i)%sf => null()
                 end do
diff --git a/src/simulation/m_mpi_proxy.fpp b/src/simulation/m_mpi_proxy.fpp
index 6171f9faf3..c2579cc057 100644
--- a/src/simulation/m_mpi_proxy.fpp
+++ b/src/simulation/m_mpi_proxy.fpp
@@ -92,7 +92,7 @@ contains
             & 'cfl_adap_dt', 'cfl_const_dt', 'cfl_dt', 'surface_tension',        &
             & 'viscous', 'shear_stress', 'bulk_stress', 'bubbles_lagrange',     &
             & 'hyperelasticity', 'rkck_adap_dt', 'bc_io', 'powell', 'cont_damage', &
-            & 'periodic_ibs', 'compute_CD', 'periodic_forcing', 'fourier_transform_filtering', & 
+            & 'periodic_ibs', 'compute_CD', 'periodic_forcing', 'volume_filtering_momentum_eqn', & 
             & 'store_levelset', 'slab_domain_decomposition', 'compute_autocorrelation' ]
             call MPI_BCAST(${VAR}$, 1, MPI_LOGICAL, 0, MPI_COMM_WORLD, ierr)
         #:endfor
diff --git a/src/simulation/m_rhs.fpp b/src/simulation/m_rhs.fpp
index 884e6a07ad..eea4a49260 100644
--- a/src/simulation/m_rhs.fpp
+++ b/src/simulation/m_rhs.fpp
@@ -609,7 +609,7 @@ contains
 
     end subroutine s_initialize_rhs_module
 
-    subroutine s_compute_rhs(q_cons_vf, q_T_sf, q_prim_vf, bc_type, rhs_vf, pb, rhs_pb, mv, rhs_mv, t_step, time_avg, div_pres_visc_stress)
+    subroutine s_compute_rhs(q_cons_vf, q_T_sf, q_prim_vf, bc_type, rhs_vf, pb, rhs_pb, mv, rhs_mv, t_step, time_avg, pres_visc_stress)
 
         type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf
         type(scalar_field), intent(inout) :: q_T_sf
@@ -620,7 +620,7 @@ contains
         real(wp), dimension(idwbuff(1)%beg:, idwbuff(2)%beg:, idwbuff(3)%beg:, 1:, 1:), intent(inout) :: mv, rhs_mv
         integer, intent(in) :: t_step
         real(wp), intent(inout) :: time_avg
-        type(scalar_field), dimension(momxb:momxe), optional, intent(inout) :: div_pres_visc_stress
+        type(scalar_field), dimension(momxb:momxe), optional, intent(inout) :: pres_visc_stress
 
         real(wp), dimension(0:m, 0:n, 0:p) :: nbub
         real(wp) :: t_start, t_finish
@@ -811,7 +811,7 @@ contains
                                                  q_cons_qp, &
                                                  q_prim_qp, &
                                                  flux_src_n(id), & 
-                                                 div_pres_visc_stress)
+                                                 pres_visc_stress)
             call nvtxEndRange
 
             ! RHS additions for hypoelasticity
@@ -831,7 +831,7 @@ contains
                                                       dq_prim_dx_qp(1)%vf, &
                                                       dq_prim_dy_qp(1)%vf, &
                                                       dq_prim_dz_qp(1)%vf, & 
-                                                      div_pres_visc_stress)
+                                                      pres_visc_stress)
                 call nvtxEndRange
             end if
 
@@ -938,14 +938,14 @@ contains
 
     end subroutine s_compute_rhs
 
-    subroutine s_compute_advection_source_term(idir, rhs_vf, q_cons_vf, q_prim_vf, flux_src_n_vf, div_pres_visc_stress)
+    subroutine s_compute_advection_source_term(idir, rhs_vf, q_cons_vf, q_prim_vf, flux_src_n_vf, pres_visc_stress)
 
         integer, intent(in) :: idir
         type(scalar_field), dimension(sys_size), intent(inout) :: rhs_vf
         type(vector_field), intent(inout) :: q_cons_vf
         type(vector_field), intent(inout) :: q_prim_vf
         type(vector_field), intent(inout) :: flux_src_n_vf
-        type(scalar_field), dimension(momxb:momxe), optional, intent(inout) :: div_pres_visc_stress  
+        type(scalar_field), dimension(momxb:momxe), optional, intent(inout) :: pres_visc_stress  
 
         integer :: i, j, k, l, q
 
@@ -999,14 +999,14 @@ contains
             end do
 
             ! particle forces loop, x-dir
-            if ((compute_CD .or. fourier_transform_filtering) .and. present(div_pres_visc_stress)) then
+            if ((compute_CD .or. volume_filtering_momentum_eqn) .and. present(pres_visc_stress)) then
                 !$acc parallel loop collapse(3) gang vector default(present)
                 do k = 0, p
                     do j = 0, n 
                         do i = 0, m 
                             !$acc loop seq
                             do l = momxb, momxe
-                                div_pres_visc_stress(l)%sf(i, j, k) = 1._wp/dx(i) * & 
+                                pres_visc_stress(l)%sf(i, j, k) = 1._wp/dx(i) * & 
                                                           (flux_n(1)%vf(l)%sf(i-1, j, k) - & 
                                                            flux_n(1)%vf(l)%sf(i, j, k)) - 0.5_wp/dx(i) * & 
                                                           (q_cons_vf%vf(2)%sf(i+1, j, k)*q_cons_vf%vf(l)%sf(i+1, j, k)/q_cons_vf%vf(1)%sf(i+1, j, k) - & 
@@ -1128,14 +1128,14 @@ contains
             end do
 
             ! particle forces loop, y-dir
-            if ((compute_CD .or. fourier_transform_filtering) .and. present(div_pres_visc_stress)) then
+            if ((compute_CD .or. volume_filtering_momentum_eqn) .and. present(pres_visc_stress)) then
                 !$acc parallel loop collapse(3) gang vector default(present)
                 do k = 0, p 
                     do j = 0, n 
                         do i = 0, m 
                             !$acc loop seq
                             do l = momxb, momxe 
-                                div_pres_visc_stress(l)%sf(i, j, k) = div_pres_visc_stress(l)%sf(i, j, k) + 1._wp/dy(j) * & 
+                                pres_visc_stress(l)%sf(i, j, k) = pres_visc_stress(l)%sf(i, j, k) + 1._wp/dy(j) * & 
                                                           (flux_n(2)%vf(l)%sf(i, j-1, k) - & 
                                                            flux_n(2)%vf(l)%sf(i, j, k)) - 0.5_wp/dy(j) * & 
                                                           (q_cons_vf%vf(3)%sf(i, j+1, k)*q_cons_vf%vf(l)%sf(i, j+1, k)/q_cons_vf%vf(1)%sf(i, j+1, k) - & 
@@ -1353,14 +1353,14 @@ contains
             end if
 
             ! particle forces loop, z-dir
-            if ((compute_CD .or. fourier_transform_filtering) .and. present(div_pres_visc_stress)) then
+            if ((compute_CD .or. volume_filtering_momentum_eqn) .and. present(pres_visc_stress)) then
                 !$acc parallel loop collapse(3) gang vector default(present)
                 do k = 0, p 
                     do j = 0, n 
                         do i = 0, m 
                             !$acc loop seq
                             do l = momxb, momxe 
-                                div_pres_visc_stress(l)%sf(i, j, k) = div_pres_visc_stress(l)%sf(i, j, k) + 1._wp/dz(k) * & 
+                                pres_visc_stress(l)%sf(i, j, k) = pres_visc_stress(l)%sf(i, j, k) + 1._wp/dz(k) * & 
                                                           (flux_n(3)%vf(l)%sf(i, j, k-1) - & 
                                                            flux_n(3)%vf(l)%sf(i, j, k)) - 0.5_wp/dz(k) * & 
                                                           (q_cons_vf%vf(4)%sf(i, j, k+1)*q_cons_vf%vf(l)%sf(i, j, k+1)/q_cons_vf%vf(1)%sf(i, j, k+1) - & 
@@ -1552,14 +1552,14 @@ contains
     end subroutine s_compute_advection_source_term
 
     subroutine s_compute_additional_physics_rhs(idir, q_prim_vf, rhs_vf, flux_src_n, &
-                                                dq_prim_dx_vf, dq_prim_dy_vf, dq_prim_dz_vf, div_pres_visc_stress)
+                                                dq_prim_dx_vf, dq_prim_dy_vf, dq_prim_dz_vf, pres_visc_stress)
 
         integer, intent(in) :: idir
         type(scalar_field), dimension(sys_size), intent(in) :: q_prim_vf
         type(scalar_field), dimension(sys_size), intent(inout) :: rhs_vf
         type(scalar_field), dimension(sys_size), intent(in) :: flux_src_n
         type(scalar_field), dimension(sys_size), intent(in) :: dq_prim_dx_vf, dq_prim_dy_vf, dq_prim_dz_vf
-        type(scalar_field), dimension(momxb:momxe), optional, intent(inout) :: div_pres_visc_stress
+        type(scalar_field), dimension(momxb:momxe), optional, intent(inout) :: pres_visc_stress
 
         integer :: i, j, k, l
 
@@ -1596,14 +1596,14 @@ contains
             end do
 
             ! particle momentum exchange, viscous stress tensor, x-dir
-            if ((compute_CD .or. fourier_transform_filtering) .and. present(div_pres_visc_stress)) then
+            if ((compute_CD .or. volume_filtering_momentum_eqn) .and. present(pres_visc_stress)) then
                 !$acc parallel loop collapse(3) gang vector default(present)
                 do k = 0, p 
                     do j = 0, n 
                         do i = 0, m 
                             !$acc loop seq
                             do l = momxb, momxe
-                                div_pres_visc_stress(l)%sf(i, j, k) = div_pres_visc_stress(l)%sf(i, j, k) + 1._wp/dx(i) * & 
+                                pres_visc_stress(l)%sf(i, j, k) = pres_visc_stress(l)%sf(i, j, k) + 1._wp/dx(i) * & 
                                                        (flux_src_n(l)%sf(i-1, j, k) - & 
                                                         flux_src_n(l)%sf(i, j, k))
                             end do 
@@ -1695,14 +1695,14 @@ contains
             end if
 
             ! particle momentum exchange, viscous stress tensor, y-dir
-            if ((compute_CD .or. fourier_transform_filtering) .and. present(div_pres_visc_stress)) then
+            if ((compute_CD .or. volume_filtering_momentum_eqn) .and. present(pres_visc_stress)) then
                 !$acc parallel loop collapse(3) gang vector default(present)
                 do k = 0, p 
                     do j = 0, n 
                         do i = 0, m 
                             !$acc loop seq
                             do l = momxb, momxe
-                                div_pres_visc_stress(l)%sf(i, j, k) = div_pres_visc_stress(l)%sf(i, j, k) + 1._wp/dy(j) * & 
+                                pres_visc_stress(l)%sf(i, j, k) = pres_visc_stress(l)%sf(i, j, k) + 1._wp/dy(j) * & 
                                                        (flux_src_n(l)%sf(i, j-1, k) - & 
                                                         flux_src_n(l)%sf(i, j, k))
                             end do 
@@ -1797,14 +1797,14 @@ contains
             end do
 
             ! particle momentum exchange, viscous stress tensor, z-dir
-            if ((compute_CD .or. fourier_transform_filtering) .and. present(div_pres_visc_stress)) then
+            if ((compute_CD .or. volume_filtering_momentum_eqn) .and. present(pres_visc_stress)) then
                 !$acc parallel loop collapse(3) gang vector default(present)
                 do k = 0, p 
                     do j = 0, n 
                         do i = 0, m 
                             !$acc loop seq
                             do l = momxb, momxe 
-                                div_pres_visc_stress(l)%sf(i, j, k) = div_pres_visc_stress(l)%sf(i, j, k) + 1._wp/dz(k) * & 
+                                pres_visc_stress(l)%sf(i, j, k) = pres_visc_stress(l)%sf(i, j, k) + 1._wp/dz(k) * & 
                                                        (flux_src_n(l)%sf(i, j, k-1) - & 
                                                         flux_src_n(l)%sf(i, j, k))
                             end do 
diff --git a/src/simulation/m_start_up.fpp b/src/simulation/m_start_up.fpp
index 62b641da19..c34bd05321 100644
--- a/src/simulation/m_start_up.fpp
+++ b/src/simulation/m_start_up.fpp
@@ -95,6 +95,8 @@ module m_start_up
 
     use m_volume_filtering
 
+    use m_compute_statistics
+
     implicit none
 
     private; public :: s_read_input_file, &
@@ -188,7 +190,7 @@ contains
             hyperelasticity, R0ref, num_bc_patches, Bx0, powell, &
             cont_damage, tau_star, cont_damage_s, alpha_bar, & 
             periodic_ibs, compute_CD, mu_visc, u_inf_ref, rho_inf_ref, T_inf_ref, & 
-            periodic_forcing, fourier_transform_filtering, store_levelset, & 
+            periodic_forcing, volume_filtering_momentum_eqn, store_levelset, & 
             slab_domain_decomposition, compute_autocorrelation
 
         ! Checking that an input file has been provided by the user. If it
@@ -1341,6 +1343,11 @@ contains
 
         if (relax) call s_infinite_relaxation_k(q_cons_ts(1)%vf)
 
+        ! Volume filter flow variables, compute unclosed terms and their statistics
+        if (volume_filtering_momentum_eqn) then 
+            call s_volume_filter_momentum_eqn(q_cons_ts(1)%vf)
+        end if
+
         ! Time-stepping loop controls
 
         t_step = t_step + 1
@@ -1421,9 +1428,9 @@ contains
         call cpu_time(start)
         call nvtxStartRange("SAVE-DATA")
         do i = 2, 4 
-            !$acc update host(R_u_stat(i)%sf)
-            !$acc update host(R_mu_stat(i)%sf)
-            !$acc update host(F_IMET_stat(i)%sf)
+            !$acc update host(stat_reynolds_stress(i)%sf)
+            !$acc update host(stat_eff_visc(i)%sf)
+            !$acc update host(stat_int_mom_exch(i)%sf)
         end do  
         do i = 1, sys_size
             !$acc update host(q_cons_ts(1)%vf(i)%sf)
@@ -1457,9 +1464,9 @@ contains
             call s_write_restart_lag_bubbles(save_count) !parallel
             if (lag_params%write_bubbles_stats) call s_write_lag_bubble_stats()
         else
-            if (fourier_transform_filtering) then
+            if (volume_filtering_momentum_eqn) then
                 call s_write_data_files(q_cons_ts(1)%vf, q_T_sf, q_prim_vf, save_count, &
-                                        R_u_stat=R_u_stat, R_mu_stat=R_mu_stat, F_IMET_stat=F_IMET_stat)
+                                        stat_reynolds_stress=stat_reynolds_stress, stat_eff_visc=stat_eff_visc, stat_int_mom_exch=stat_int_mom_exch)
             else
                 call s_write_data_files(q_cons_ts(1)%vf, q_T_sf, q_prim_vf, save_count)
             end if
@@ -1579,7 +1586,7 @@ contains
 
         call s_initialize_particle_forces_module()
         call s_initialize_additional_forcing_module()
-        if (fourier_transform_filtering) call s_initialize_fftw_explicit_filter_module()
+        if (volume_filtering_momentum_eqn) call s_initialize_fftw_explicit_filter_module()
 
         call s_initialize_statistics_module()
 
@@ -1727,7 +1734,7 @@ contains
 
         call s_finalize_particle_forces_module()
         call s_finalize_additional_forcing_module()
-        if (fourier_transform_filtering) call s_finalize_fftw_explicit_filter_module
+        if (volume_filtering_momentum_eqn) call s_finalize_fftw_explicit_filter_module
 
         ! Terminating MPI execution environment
         call s_mpi_finalize()
diff --git a/src/simulation/m_time_steppers.fpp b/src/simulation/m_time_steppers.fpp
index b88766eccd..9fdbb519e0 100644
--- a/src/simulation/m_time_steppers.fpp
+++ b/src/simulation/m_time_steppers.fpp
@@ -87,26 +87,13 @@ module m_time_steppers
     integer, private :: num_ts !<
     !! Number of time stages in the time-stepping scheme
 
-    type(scalar_field), allocatable, dimension(:) :: div_pres_visc_stress
-
-    type(scalar_field), allocatable, dimension(:) :: q_cons_filtered
-
-    type(vector_field), allocatable, dimension(:) :: pt_Re_stress
-    type(vector_field), allocatable, dimension(:) :: R_mu
-    type(scalar_field), allocatable, dimension(:) :: pres_visc_stress_filtered
-
-    type(scalar_field) :: mag_div_Ru
-    type(scalar_field) :: mag_div_R_mu
-    type(scalar_field) :: mag_F_IMET
-
-    type(scalar_field), allocatable, dimension(:) :: R_u_stat
-    type(scalar_field), allocatable, dimension(:) :: R_mu_stat
-    type(scalar_field), allocatable, dimension(:) :: F_IMET_stat
+    type(scalar_field), allocatable, dimension(:) :: stat_reynolds_stress
+    type(scalar_field), allocatable, dimension(:) :: stat_eff_visc
+    type(scalar_field), allocatable, dimension(:) :: stat_int_mom_exch
 
     !$acc declare create(q_cons_ts, q_prim_vf, q_T_sf, rhs_vf, rhs_ts_rkck, q_prim_ts, rhs_mv, rhs_pb, max_dt)
-    !$acc declare create(div_pres_visc_stress, q_cons_filtered, pt_Re_stress, R_mu, pres_visc_stress_filtered)
-    !$acc declare create(mag_div_Ru, mag_div_R_mu, mag_F_IMET)
-    !$acc declare create(R_u_stat, R_mu_stat, F_IMET_stat)
+
+    !$acc declare create(stat_reynolds_stress, stat_eff_visc, stat_int_mom_exch)
 
 contains
 
@@ -382,89 +369,30 @@ contains
             end do
         end do
 
-        if (compute_CD .or. fourier_transform_filtering) then
-            @:ALLOCATE(div_pres_visc_stress(momxb:momxe))
+        if (compute_CD .or. volume_filtering_momentum_eqn) then
+            @:ALLOCATE(pres_visc_stress(momxb:momxe))
             do i = momxb, momxe
-                @:ALLOCATE(div_pres_visc_stress(i)%sf(idwbuff(1)%beg:idwbuff(1)%end, &
+                @:ALLOCATE(pres_visc_stress(i)%sf(idwbuff(1)%beg:idwbuff(1)%end, &
                     idwbuff(2)%beg:idwbuff(2)%end, &
                     idwbuff(3)%beg:idwbuff(3)%end))
-                @:ACC_SETUP_SFs(div_pres_visc_stress(i))
+                @:ACC_SETUP_SFs(pres_visc_stress(i))
             end do
         end if
 
-        if (fourier_transform_filtering) then 
-            @:ALLOCATE(q_cons_filtered(1:sys_size))
-            do i = 1, sys_size
-                @:ALLOCATE(q_cons_filtered(i)%sf(idwbuff(1)%beg:idwbuff(1)%end, &
-                    idwbuff(2)%beg:idwbuff(2)%end, &
-                    idwbuff(3)%beg:idwbuff(3)%end))
-                @:ACC_SETUP_SFs(q_cons_filtered(i))
-            end do
-
-            @:ALLOCATE(pt_Re_stress(1:num_dims))
-            do i = 1, num_dims
-                @:ALLOCATE(pt_Re_stress(i)%vf(1:num_dims))
-            end do
-            do i = 1, num_dims
-                do j = 1, num_dims
-                    @:ALLOCATE(pt_Re_stress(i)%vf(j)%sf(idwbuff(1)%beg:idwbuff(1)%end, &
-                        idwbuff(2)%beg:idwbuff(2)%end, &
-                        idwbuff(3)%beg:idwbuff(3)%end))
-                end do
-                @:ACC_SETUP_VFs(pt_Re_stress(i))
-            end do
-
-            @:ALLOCATE(R_mu(1:num_dims))
-            do i = 1, num_dims
-                @:ALLOCATE(R_mu(i)%vf(1:num_dims))
-            end do
-            do i = 1, num_dims
-                do j = 1, num_dims
-                    @:ALLOCATE(R_mu(i)%vf(j)%sf(idwbuff(1)%beg:idwbuff(1)%end, &
-                        idwbuff(2)%beg:idwbuff(2)%end, &
-                        idwbuff(3)%beg:idwbuff(3)%end))
-                end do
-                @:ACC_SETUP_VFs(R_mu(i))
-            end do
-
-            @:ALLOCATE(pres_visc_stress_filtered(1:num_dims))
-            do i = 1, num_dims
-                @:ALLOCATE(pres_visc_stress_filtered(i)%sf(idwbuff(1)%beg:idwbuff(1)%end, &
-                    idwbuff(2)%beg:idwbuff(2)%end, &
-                    idwbuff(3)%beg:idwbuff(3)%end))
-                @:ACC_SETUP_SFs(pres_visc_stress_filtered(i))
-            end do
-
-            @:ALLOCATE(mag_div_Ru%sf(idwbuff(1)%beg:idwbuff(1)%end, &
-                idwbuff(2)%beg:idwbuff(2)%end, &
-                idwbuff(3)%beg:idwbuff(3)%end))
-            @:ACC_SETUP_SFs(mag_div_Ru)
-
-            @:ALLOCATE(mag_div_R_mu%sf(idwbuff(1)%beg:idwbuff(1)%end, &
-                idwbuff(2)%beg:idwbuff(2)%end, &
-                idwbuff(3)%beg:idwbuff(3)%end))
-            @:ACC_SETUP_SFs(mag_div_R_mu)
-
-            @:ALLOCATE(mag_F_IMET%sf(idwbuff(1)%beg:idwbuff(1)%end, &
-                idwbuff(2)%beg:idwbuff(2)%end, &
-                idwbuff(3)%beg:idwbuff(3)%end))
-            @:ACC_SETUP_SFs(mag_F_IMET)
-        end if
-
-        @:ALLOCATE(R_u_stat(2:4))
+        @:ALLOCATE(stat_reynolds_stress(2:4))
         do i = 2, 4
-            @:ALLOCATE(R_u_stat(i)%sf(0:m, 0:n, 0:p))
-            @:ACC_SETUP_SFs(R_u_stat(i))
+            @:ALLOCATE(stat_reynolds_stress(i)%sf(0:m, 0:n, 0:p))
+            @:ACC_SETUP_SFs(stat_reynolds_stress(i))
         end do
-        @:ALLOCATE(R_mu_stat(2:4))
+        @:ALLOCATE(stat_eff_visc(2:4))
         do i = 2, 4
-            @:ALLOCATE(R_mu_stat(i)%sf(0:m, 0:n, 0:p))
-            @:ACC_SETUP_SFs(R_mu_stat(i))
+            @:ALLOCATE(stat_eff_visc(i)%sf(0:m, 0:n, 0:p))
+            @:ACC_SETUP_SFs(stat_eff_visc(i))
         end do
-        @:ALLOCATE(F_IMET_stat(2:4))
+        @:ALLOCATE(stat_int_mom_exch(2:4))
         do i = 2, 4
-            @:ALLOCATE(F_IMET_stat(i)%sf(0:m, 0:n, 0:p))
-            @:ACC_SETUP_SFs(F_IMET_stat(i))
+            @:ALLOCATE(stat_int_mom_exch(i)%sf(0:m, 0:n, 0:p))
+            @:ACC_SETUP_SFs(stat_int_mom_exch(i))
         end do
 
     end subroutine s_initialize_time_steppers_module
@@ -789,35 +717,35 @@ contains
             call s_compute_periodic_forcing(q_cons_ts(1)%vf)
         end if
 
-        call s_compute_rhs(q_cons_ts(1)%vf, q_T_sf, q_prim_vf, bc_type, rhs_vf, pb_ts(1)%sf, rhs_pb, mv_ts(1)%sf, rhs_mv, t_step, time_avg, div_pres_visc_stress)
+        call s_compute_rhs(q_cons_ts(1)%vf, q_T_sf, q_prim_vf, bc_type, rhs_vf, pb_ts(1)%sf, rhs_pb, mv_ts(1)%sf, rhs_mv, t_step, time_avg, pres_visc_stress)
 
-        if (fourier_transform_filtering) then 
-            call s_apply_fftw_filter_cons(q_cons_ts(1)%vf, q_cons_filtered)
-            call s_setup_terms_filtering(q_cons_ts(1)%vf, pt_Re_stress, R_mu)
-            call s_apply_fftw_filter_tensor(pt_Re_stress, R_mu, q_cons_filtered, div_pres_visc_stress, pres_visc_stress_filtered)
-            call s_compute_pseudo_turbulent_reynolds_stress(q_cons_filtered, pt_Re_stress, mag_div_Ru)
-            call s_compute_R_mu(q_cons_filtered, R_mu, mag_div_R_mu)
-            call s_compute_interphase_momentum_exchange_term(pres_visc_stress_filtered, mag_F_IMET)
-        end if
+        ! if (volume_filtering_momentum_eqn) then 
+        !     call s_apply_fftw_filter_cons(q_cons_ts(1)%vf, q_cons_filtered)
+        !     call s_setup_terms_filtering(q_cons_ts(1)%vf, reynolds_stress, eff_visc)
+        !     call s_apply_fftw_filter_tensor(reynolds_stress, eff_visc, q_cons_filtered, pres_visc_stress, int_mom_exch)
+        !     call s_compute_pseudo_turbulent_reynolds_stress(q_cons_filtered, reynolds_stress, mag_reynolds_stress)
+        !     call s_compute_eff_visc(q_cons_filtered, eff_visc, mag_eff_visc)
+        !     call s_compute_interphase_momentum_exchange_term(int_mom_exch, mag_int_mom_exch)
+        ! end if
         
 
-        call s_autocorrelation_function(t_step+1, q_cons_ts(1)%vf)
-        if (t_step > 10) then
-            n_step = t_step - 10
-            call s_compute_s_order_statistics(mag_div_Ru, n_step, R_u_stat, 1)
-            call s_compute_s_order_statistics(mag_div_R_mu, n_step, R_mu_stat, 2)
-            call s_compute_s_order_statistics(mag_F_IMET, n_step, F_IMET_stat, 3)
-        end if
+        ! call s_autocorrelation_function(t_step+1, q_cons_ts(1)%vf)
+        ! if (t_step > 10) then
+        !     n_step = t_step - 10
+        !     call s_compute_s_order_statistics(mag_reynolds_stress, n_step, stat_reynolds_stress, 1)
+        !     call s_compute_s_order_statistics(mag_eff_visc, n_step, stat_eff_visc, 2)
+        !     call s_compute_s_order_statistics(mag_int_mom_exch, n_step, stat_int_mom_exch, 3)
+        ! end if
 
 
-        ! R_u_stat(2)%sf(0:m, 0:n, 0:p) = q_cons_filtered(6)%sf(0:m, 0:n, 0:p)
-        ! R_u_stat(3)%sf(0:m, 0:n, 0:p) = mag_div_Ru%sf(0:m, 0:n, 0:p)
-        ! R_u_stat(4)%sf(0:m, 0:n, 0:p) = mag_div_R_mu%sf(0:m, 0:n, 0:p)
-        ! R_mu_stat(2)%sf(0:m, 0:n, 0:p) = mag_F_IMET%sf(0:m, 0:n, 0:p)
+        ! stat_reynolds_stress(2)%sf(0:m, 0:n, 0:p) = q_cons_filtered(6)%sf(0:m, 0:n, 0:p)
+        ! stat_reynolds_stress(3)%sf(0:m, 0:n, 0:p) = mag_reynolds_stress%sf(0:m, 0:n, 0:p)
+        ! stat_reynolds_stress(4)%sf(0:m, 0:n, 0:p) = mag_eff_visc%sf(0:m, 0:n, 0:p)
+        ! stat_eff_visc(2)%sf(0:m, 0:n, 0:p) = mag_int_mom_exch%sf(0:m, 0:n, 0:p)
 
 
         if (compute_CD) then
-            call s_compute_drag_coefficient(div_pres_visc_stress)
+            call s_compute_drag_coefficient(pres_visc_stress)
         end if
 
         if (periodic_forcing) then 
@@ -1488,57 +1416,25 @@ contains
             @:DEALLOCATE(rhs_vf)
         end if
 
-        if (compute_CD .or. fourier_transform_filtering) then
+        if (compute_CD .or. volume_filtering_momentum_eqn) then
             do i = momxb, momxe
-                @:DEALLOCATE(div_pres_visc_stress(i)%sf)
+                @:DEALLOCATE(pres_visc_stress(i)%sf)
             end do
-            @:DEALLOCATE(div_pres_visc_stress)
-        end if
-
-        if (fourier_transform_filtering) then 
-            do i = 1, sys_size
-                @:DEALLOCATE(q_cons_filtered(i)%sf)
-            end do
-            @:DEALLOCATE(q_cons_filtered)
-
-            do i = 1, num_dims
-                do j = 1, num_dims
-                    @:DEALLOCATE(pt_Re_stress(i)%vf(j)%sf)
-                end do
-                @:DEALLOCATE(pt_Re_stress(i)%vf)
-            end do
-            @:DEALLOCATE(pt_Re_stress)
-
-            do i = 1, num_dims
-                do j = 1, num_dims
-                    @:DEALLOCATE(R_mu(i)%vf(j)%sf)
-                end do
-                @:DEALLOCATE(R_mu(i)%vf)
-            end do
-            @:DEALLOCATE(R_mu)
-
-            do i = 1, num_dims
-                @:DEALLOCATE(pres_visc_stress_filtered(i)%sf)
-            end do
-            @:DEALLOCATE(pres_visc_stress_filtered)
-
-            @:DEALLOCATE(mag_div_Ru%sf)
-            @:DEALLOCATE(mag_div_R_mu%sf)
-            @:DEALLOCATE(mag_F_IMET%sf)
+            @:DEALLOCATE(pres_visc_stress)
         end if
 
         do i = 2, 4
-            @:DEALLOCATE(R_u_stat(i)%sf)
+            @:DEALLOCATE(stat_reynolds_stress(i)%sf)
         end do
-        @:DEALLOCATE(R_u_stat)
+        @:DEALLOCATE(stat_reynolds_stress)
         do i = 2, 4
-            @:DEALLOCATE(R_mu_stat(i)%sf)
+            @:DEALLOCATE(stat_eff_visc(i)%sf)
         end do
-        @:DEALLOCATE(R_mu_stat)
+        @:DEALLOCATE(stat_eff_visc)
         do i = 2, 4
-            @:DEALLOCATE(F_IMET_stat(i)%sf)
+            @:DEALLOCATE(stat_int_mom_exch(i)%sf)
         end do
-        @:DEALLOCATE(F_IMET_stat)
+        @:DEALLOCATE(stat_int_mom_exch)
 
         ! Writing the footer of and closing the run-time information file
         if (proc_rank == 0 .and. run_time_info) then
diff --git a/src/simulation/m_volume_filtering.fpp b/src/simulation/m_volume_filtering.fpp
index adb102df35..6b1e981bfc 100644
--- a/src/simulation/m_volume_filtering.fpp
+++ b/src/simulation/m_volume_filtering.fpp
@@ -25,11 +25,11 @@ module m_volume_filtering
     implicit none
 
     private; public :: s_initialize_fftw_explicit_filter_module, &
- s_initialize_filtering_kernel, s_initialize_fluid_indicator_function, s_initialize_filtered_fluid_indicator_function, & 
+ s_initialize_filtering_kernel, s_initialize_fluid_indicator_function, & 
  s_finalize_fftw_explicit_filter_module, & 
  s_apply_fftw_filter_cons, s_apply_fftw_filter_tensor, s_apply_fftw_filter_scalarfield, &
  s_mpi_transpose_slabZ2Y, s_mpi_transpose_slabY2Z, s_mpi_FFT_fwd, s_mpi_FFT_bwd, &
- s_setup_terms_filtering, s_compute_pseudo_turbulent_reynolds_stress, s_compute_R_mu, s_compute_interphase_momentum_exchange_term
+ s_setup_terms_filtering, s_compute_pseudo_turbulent_reynolds_stress, s_compute_effective_viscosity, s_compute_interphase_momentum_exchange
 
 #if !defined(MFC_OpenACC)
     include 'fftw3.f03'
@@ -38,9 +38,25 @@ module m_volume_filtering
     integer :: ierr   
 
     ! fluid indicator function (1 = fluid, 0 = otherwise)
-    type(scalar_field), public :: fluid_indicator_function_I
+    type(scalar_field), public :: fluid_indicator_function
+    type(scalar_field) :: filtered_fluid_indicator_function
 
-    !$acc declare create(fluid_indicator_function_I)
+    ! volume-filtered conservative variables
+    type(scalar_field), allocatable, dimension(:) :: q_cons_filtered
+
+    ! unclosed terms in the momentum equation; pres_visc_stress is public because m_time_steppers allocates it and passes it to s_compute_rhs
+    type(scalar_field), allocatable, dimension(:), public :: pres_visc_stress
+    type(vector_field), allocatable, dimension(:) :: reynolds_stress
+    type(vector_field), allocatable, dimension(:) :: eff_visc
+    type(scalar_field), allocatable, dimension(:) :: int_mom_exch
+
+    ! magnitudes of the unclosed terms in the momentum equation
+    type(scalar_field) :: mag_reynolds_stress
+    type(scalar_field) :: mag_eff_visc
+    type(scalar_field) :: mag_int_mom_exch
+
+    !$acc declare create(fluid_indicator_function, filtered_fluid_indicator_function, q_cons_filtered)
+    !$acc declare create(pres_visc_stress, reynolds_stress, eff_visc, int_mom_exch, mag_reynolds_stress, mag_eff_visc, mag_int_mom_exch)
 
 #if defined(MFC_OpenACC)
     ! GPU plans
@@ -80,8 +96,72 @@ contains
 
     !< create fft plans to be used for explicit filtering of data 
     subroutine s_initialize_fftw_explicit_filter_module
+        integer :: i, j, k
         integer :: size_n(1), inembed(1), onembed(1)
 
+        @:ALLOCATE(fluid_indicator_function%sf(0:m, 0:n, 0:p))
+        @:ACC_SETUP_SFs(fluid_indicator_function)
+
+        @:ALLOCATE(filtered_fluid_indicator_function%sf(0:m, 0:n, 0:p))
+        @:ACC_SETUP_SFs(filtered_fluid_indicator_function)
+        ! q_cons_filtered: one field per equation (1:sys_size), on the buffered index range
+        @:ALLOCATE(q_cons_filtered(1:sys_size))
+        do i = 1, sys_size
+            @:ALLOCATE(q_cons_filtered(i)%sf(idwbuff(1)%beg:idwbuff(1)%end, &
+                idwbuff(2)%beg:idwbuff(2)%end, &
+                idwbuff(3)%beg:idwbuff(3)%end))
+            @:ACC_SETUP_SFs(q_cons_filtered(i))
+        end do
+        ! reynolds_stress: num_dims x num_dims components, each on the buffered index range
+        @:ALLOCATE(reynolds_stress(1:num_dims))
+        do i = 1, num_dims
+            @:ALLOCATE(reynolds_stress(i)%vf(1:num_dims))
+        end do
+        do i = 1, num_dims
+            do j = 1, num_dims
+                @:ALLOCATE(reynolds_stress(i)%vf(j)%sf(idwbuff(1)%beg:idwbuff(1)%end, &
+                    idwbuff(2)%beg:idwbuff(2)%end, &
+                    idwbuff(3)%beg:idwbuff(3)%end))
+            end do
+            @:ACC_SETUP_VFs(reynolds_stress(i))
+        end do
+        ! eff_visc: same num_dims x num_dims layout as reynolds_stress
+        @:ALLOCATE(eff_visc(1:num_dims))
+        do i = 1, num_dims
+            @:ALLOCATE(eff_visc(i)%vf(1:num_dims))
+        end do
+        do i = 1, num_dims
+            do j = 1, num_dims
+                @:ALLOCATE(eff_visc(i)%vf(j)%sf(idwbuff(1)%beg:idwbuff(1)%end, &
+                    idwbuff(2)%beg:idwbuff(2)%end, &
+                    idwbuff(3)%beg:idwbuff(3)%end))
+            end do
+            @:ACC_SETUP_VFs(eff_visc(i))
+        end do
+        ! int_mom_exch: one field per spatial dimension
+        @:ALLOCATE(int_mom_exch(1:num_dims))
+        do i = 1, num_dims
+            @:ALLOCATE(int_mom_exch(i)%sf(idwbuff(1)%beg:idwbuff(1)%end, &
+                idwbuff(2)%beg:idwbuff(2)%end, &
+                idwbuff(3)%beg:idwbuff(3)%end))
+            @:ACC_SETUP_SFs(int_mom_exch(i))
+        end do
+        ! single scalar fields mag_reynolds_stress, mag_eff_visc, mag_int_mom_exch
+        @:ALLOCATE(mag_reynolds_stress%sf(idwbuff(1)%beg:idwbuff(1)%end, &
+            idwbuff(2)%beg:idwbuff(2)%end, &
+            idwbuff(3)%beg:idwbuff(3)%end))
+        @:ACC_SETUP_SFs(mag_reynolds_stress)
+
+        @:ALLOCATE(mag_eff_visc%sf(idwbuff(1)%beg:idwbuff(1)%end, &
+            idwbuff(2)%beg:idwbuff(2)%end, &
+            idwbuff(3)%beg:idwbuff(3)%end))
+        @:ACC_SETUP_SFs(mag_eff_visc)
+
+        @:ALLOCATE(mag_int_mom_exch%sf(idwbuff(1)%beg:idwbuff(1)%end, &
+            idwbuff(2)%beg:idwbuff(2)%end, &
+            idwbuff(3)%beg:idwbuff(3)%end))
+        @:ACC_SETUP_SFs(mag_int_mom_exch)
+
         !< global sizes 
         Nx = m_glb + 1
         Ny = n_glb + 1
@@ -320,41 +400,30 @@ contains
         ! return cmplx_kernelG1d: 1D z, x, y
     end subroutine s_initialize_filtering_kernel
 
-    !< initialize fluid indicator function
+    !< initialize fluid indicator function and filtered fluid indicator function
     subroutine s_initialize_fluid_indicator_function 
         integer :: i, j, k 
 
-        @:ALLOCATE(fluid_indicator_function_I%sf(0:m, 0:n, 0:p))
-        @:ACC_SETUP_SFs(fluid_indicator_function_I)
-
         ! define fluid indicator function
         !$acc parallel loop collapse(3) gang vector default(present)
         do i = 0, m
             do j = 0, n 
                 do k = 0, p
                     if (ib_markers%sf(i, j, k) == 0) then 
-                        fluid_indicator_function_I%sf(i, j, k) = 1.0_dp
+                        fluid_indicator_function%sf(i, j, k) = 1.0_dp
                     else 
-                        fluid_indicator_function_I%sf(i, j, k) = 0.0_dp
+                        fluid_indicator_function%sf(i, j, k) = 0.0_dp
                     end if
                 end do
             end do
         end do
 
-    end subroutine s_initialize_fluid_indicator_function
-
-    !< compute the filtered fluid indicator function counterpart
-    subroutine s_initialize_filtered_fluid_indicator_function(filtered_fluid_indicator_function)
-        type(scalar_field) :: filtered_fluid_indicator_function
-
-        integer :: i, j, k
-
-        ! filter fluid indicator function -> stored in q_cons_vf(advxb)
+        ! filter fluid indicator function 
         !$acc parallel loop collapse(3) gang vector default(present)
         do i = 1, Nx 
             do j = 1, Ny 
                 do k = 1, Nzloc 
-                    data_real_3D_slabz(i, j, k) = fluid_indicator_function_I%sf(i-1, j-1, k-1)
+                    data_real_3D_slabz(i, j, k) = fluid_indicator_function%sf(i-1, j-1, k-1)
                 end do 
             end do 
         end do 
@@ -381,7 +450,7 @@ contains
             end do
         end do
 
-    end subroutine s_initialize_filtered_fluid_indicator_function
+    end subroutine s_initialize_fluid_indicator_function
 
     !< apply the gaussian filter to the conservative variables and compute their filtered components
     subroutine s_apply_fftw_filter_cons(q_cons_vf, q_cons_filtered)
@@ -390,12 +459,26 @@ contains
 
         integer :: l
 
-        do l = 1, sys_size-1
-            call s_apply_fftw_filter_scalarfield(q_cons_filtered(advxb), .true., q_cons_vf(l), q_cons_filtered(l))
+        do l = contxb, momxe
+            call s_apply_fftw_filter_scalarfield(filtered_fluid_indicator_function, .true., q_cons_vf(l), q_cons_filtered(l))
         end do 
 
     end subroutine s_apply_fftw_filter_cons
 
+    !< calculate the unclosed terms present in the volume filtered momentum equation (driver: runs the filtering stages below in order)
+    subroutine s_volume_filter_momentum_eqn(q_cons_vf)
+        type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf !< unfiltered conservative variables
+        integer :: i, j, k ! NOTE(review): i, j, k are not referenced in this routine
+        ! q_cons_filtered, reynolds_stress, eff_visc, pres_visc_stress, int_mom_exch and the mag_* fields are not declared here (module-level state)
+        call s_apply_fftw_filter_cons(q_cons_vf, q_cons_filtered) ! filter q_cons_vf(contxb:momxe) into q_cons_filtered
+        call s_setup_terms_filtering(q_cons_vf, reynolds_stress, eff_visc) ! build (rho*u_l)*(rho*u_q)/rho and the mu_visc velocity-gradient tensor from the unfiltered field
+        call s_apply_fftw_filter_tensor(reynolds_stress, eff_visc, q_cons_filtered, pres_visc_stress, int_mom_exch) ! filter both tensors; filter pres_visc_stress into int_mom_exch
+        call s_compute_pseudo_turbulent_reynolds_stress(q_cons_filtered, reynolds_stress, mag_reynolds_stress) ! subtract the filtered-field product, scale by the filtered indicator, store |div| in mag_reynolds_stress
+        call s_compute_effective_viscosity(q_cons_filtered, eff_visc, mag_eff_visc) ! subtract the filtered-field viscous term, scale by the filtered indicator; presumably stores |div| in mag_eff_visc - confirm
+        call s_compute_interphase_momentum_exchange(int_mom_exch, mag_int_mom_exch) ! presumably stores the magnitude of int_mom_exch in mag_int_mom_exch - body not shown here, confirm
+
+    end subroutine s_volume_filter_momentum_eqn
+
     !< applies the gaussian filter to an arbitrary scalar field
     subroutine s_apply_fftw_filter_scalarfield(filtered_fluid_indicator_function, fluid_quantity, q_temp_in, q_temp_out)
         type(scalar_field), intent(in) :: filtered_fluid_indicator_function
@@ -411,7 +494,7 @@ contains
             do i = 0, m 
                 do j = 0, n 
                     do k = 0, p 
-                        data_real_3D_slabz(i+1, j+1, k+1) = q_temp_in%sf(i, j, k) * fluid_indicator_function_I%sf(i, j, k)
+                        data_real_3D_slabz(i+1, j+1, k+1) = q_temp_in%sf(i, j, k) * fluid_indicator_function%sf(i, j, k)
                     end do 
                 end do 
             end do
@@ -420,7 +503,7 @@ contains
             do i = 0, m 
                 do j = 0, n 
                     do k = 0, p 
-                        data_real_3D_slabz(i+1, j+1, k+1) = q_temp_in%sf(i, j, k) * (1.0_dp - fluid_indicator_function_I%sf(i, j, k))
+                        data_real_3D_slabz(i+1, j+1, k+1) = q_temp_in%sf(i, j, k) * (1.0_dp - fluid_indicator_function%sf(i, j, k))
                     end do 
                 end do 
             end do
@@ -462,36 +545,77 @@ contains
     end subroutine s_apply_fftw_filter_scalarfield
 
     !< apply the gaussian filter to the requisite tensors to compute unclosed terms of interest
-    subroutine s_apply_fftw_filter_tensor(pt_Re_stress, R_mu, q_cons_filtered, div_pres_visc_stress, pres_visc_stress_filtered)
-        type(vector_field), dimension(1:num_dims), intent(inout) :: pt_Re_stress
-        type(vector_field), dimension(1:num_dims), intent(inout) :: R_mu
+    subroutine s_apply_fftw_filter_tensor(reynolds_stress, eff_visc, q_cons_filtered, pres_visc_stress, int_mom_exch)
+        type(vector_field), dimension(1:num_dims), intent(inout) :: reynolds_stress !< every (l, q) component is passed through the scalar filter with flag .true.
+        type(vector_field), dimension(1:num_dims), intent(inout) :: eff_visc !< every (l, q) component is passed through the scalar filter with flag .true.
         type(scalar_field), dimension(sys_size), intent(in) :: q_cons_filtered
-        type(scalar_field), dimension(momxb:momxe), intent(inout) :: div_pres_visc_stress
-        type(scalar_field), dimension(1:num_dims), intent(inout) :: pres_visc_stress_filtered
+        type(scalar_field), dimension(momxb:momxe), intent(inout) :: pres_visc_stress !< filter input for the interphase term, indexed as momxb-1+l
+        type(scalar_field), dimension(1:num_dims), intent(inout) :: int_mom_exch !< filter output for the interphase term, one field per dimension
 
         integer :: i, j, k, l, q
 
         ! pseudo turbulent reynolds stress
         do l = 1, num_dims 
             do q = 1, num_dims
-                call s_apply_fftw_filter_scalarfield(q_cons_filtered(advxb), .true., pt_Re_stress(l)%vf(q))
+                call s_apply_fftw_filter_scalarfield(filtered_fluid_indicator_function, .true., reynolds_stress(l)%vf(q)) ! no output field is passed; presumably the component is overwritten in place - confirm
             end do
         end do 
 
         ! effective viscosity
         do l = 1, num_dims 
             do q = 1, num_dims
-                call s_apply_fftw_filter_scalarfield(q_cons_filtered(advxb), .true., R_mu(l)%vf(q))
+                call s_apply_fftw_filter_scalarfield(filtered_fluid_indicator_function, .true., eff_visc(l)%vf(q)) ! no output field is passed; presumably the component is overwritten in place - confirm
             end do
         end do 
 
         ! interphase momentum exchange
         do l = 1, num_dims
-            call s_apply_fftw_filter_scalarfield(q_cons_filtered(advxb), .false., div_pres_visc_stress(momxb-1+l), pres_visc_stress_filtered(l))
+            call s_apply_fftw_filter_scalarfield(filtered_fluid_indicator_function, .false., pres_visc_stress(momxb-1+l), int_mom_exch(l)) ! .false. presumably selects the (1 - fluid_indicator_function) weighting in the scalar filter - confirm
         end do 
 
     end subroutine s_apply_fftw_filter_tensor
 
+    !< compute the viscous stress tensor tau(row)%vf(column) at every interior cell from central-difference velocity gradients
+    subroutine s_compute_stress_tensor(q_cons_vf)
+        type(scalar_field), dimension(sys_size), intent(in) :: q_cons_vf !< conservative variables; the i-1/i+1, j-1/j+1 and k-1/k+1 neighbours are read
+        real(wp) :: dudx, dudy, dudz, dvdx, dvdy, dvdz, dwdx, dwdy, dwdz ! spatial velocity derivatives
+        integer :: i, j, k
+        ! NOTE(review): tau is not declared in this routine; assumed to be a module-level vector_field array laid out like eff_visc - confirm
+        !$acc parallel loop collapse(3) gang vector default(present)
+        do i = 0, m
+            do j = 0, n
+                do k = 0, p
+                    ! velocity = momentum/density at the two neighbouring cells; the stencil is local to each process
+                    dudx = ( q_cons_vf(momxb)%sf(i+1, j, k)/q_cons_vf(1)%sf(i+1, j, k) - q_cons_vf(momxb)%sf(i-1, j, k)/q_cons_vf(1)%sf(i-1, j, k) ) / (dx(i-1) + dx(i+1))
+                    dudy = ( q_cons_vf(momxb)%sf(i, j+1, k)/q_cons_vf(1)%sf(i, j+1, k) - q_cons_vf(momxb)%sf(i, j-1, k)/q_cons_vf(1)%sf(i, j-1, k) ) / (dy(j-1) + dy(j+1))
+                    dudz = ( q_cons_vf(momxb)%sf(i, j, k+1)/q_cons_vf(1)%sf(i, j, k+1) - q_cons_vf(momxb)%sf(i, j, k-1)/q_cons_vf(1)%sf(i, j, k-1) ) / (dz(k-1) + dz(k+1))
+
+                    dvdx = ( q_cons_vf(momxb+1)%sf(i+1, j, k)/q_cons_vf(1)%sf(i+1, j, k) - q_cons_vf(momxb+1)%sf(i-1, j, k)/q_cons_vf(1)%sf(i-1, j, k) ) / (dx(i-1) + dx(i+1))
+                    dvdy = ( q_cons_vf(momxb+1)%sf(i, j+1, k)/q_cons_vf(1)%sf(i, j+1, k) - q_cons_vf(momxb+1)%sf(i, j-1, k)/q_cons_vf(1)%sf(i, j-1, k) ) / (dy(j-1) + dy(j+1))
+                    dvdz = ( q_cons_vf(momxb+1)%sf(i, j, k+1)/q_cons_vf(1)%sf(i, j, k+1) - q_cons_vf(momxb+1)%sf(i, j, k-1)/q_cons_vf(1)%sf(i, j, k-1) ) / (dz(k-1) + dz(k+1))
+
+                    dwdx = ( q_cons_vf(momxb+2)%sf(i+1, j, k)/q_cons_vf(1)%sf(i+1, j, k) - q_cons_vf(momxb+2)%sf(i-1, j, k)/q_cons_vf(1)%sf(i-1, j, k) ) / (dx(i-1) + dx(i+1))
+                    dwdy = ( q_cons_vf(momxb+2)%sf(i, j+1, k)/q_cons_vf(1)%sf(i, j+1, k) - q_cons_vf(momxb+2)%sf(i, j-1, k)/q_cons_vf(1)%sf(i, j-1, k) ) / (dy(j-1) + dy(j+1))
+                    dwdz = ( q_cons_vf(momxb+2)%sf(i, j, k+1)/q_cons_vf(1)%sf(i, j, k+1) - q_cons_vf(momxb+2)%sf(i, j, k-1)/q_cons_vf(1)%sf(i, j, k-1) ) / (dz(k-1) + dz(k+1))
+
+                    ! viscous stress tensor, tau(row, column) = mu_visc*(grad u + grad u^T - 2/3 (div u) I), stored per cell
+                    tau(1)%vf(1)%sf(i, j, k) = mu_visc * (4._wp/3._wp * dudx - 2._wp/3._wp * (dvdy + dwdz))
+                    tau(1)%vf(2)%sf(i, j, k) = mu_visc * (dudy + dvdx)
+                    tau(1)%vf(3)%sf(i, j, k) = mu_visc * (dudz + dwdx)
+                    tau(2)%vf(1)%sf(i, j, k) = mu_visc * (dvdx + dudy)
+                    tau(2)%vf(2)%sf(i, j, k) = mu_visc * (4._wp/3._wp * dvdy - 2._wp/3._wp * (dudx + dwdz))
+                    tau(2)%vf(3)%sf(i, j, k) = mu_visc * (dvdz + dwdy)
+                    tau(3)%vf(1)%sf(i, j, k) = mu_visc * (dwdx + dudz)
+                    tau(3)%vf(2)%sf(i, j, k) = mu_visc * (dwdy + dvdz)
+                    tau(3)%vf(3)%sf(i, j, k) = mu_visc * (4._wp/3._wp * dwdz - 2._wp/3._wp * (dudx + dvdy))
+                    ! only the viscous part is assembled here; no pressure contribution is added to tau
+
+                end do
+            end do
+        end do
+
+    end subroutine s_compute_stress_tensor
+
     !< transpose domain from z-slabs to y-slabs on each processor
     subroutine s_mpi_transpose_slabZ2Y
         complex(c_double_complex), allocatable :: sendbuf(:), recvbuf(:)
@@ -708,10 +832,10 @@ contains
     end subroutine s_mpi_FFT_bwd
 
     !< setup for calculation of unclosed terms in volume filtered momentum eqn
-    subroutine s_setup_terms_filtering(q_cons_vf, pt_Re_stress, R_mu)
+    subroutine s_setup_terms_filtering(q_cons_vf, reynolds_stress, eff_visc)
         type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf
-        type(vector_field), dimension(1:num_dims), intent(inout) :: pt_Re_stress
-        type(vector_field), dimension(1:num_dims), intent(inout) :: R_mu
+        type(vector_field), dimension(1:num_dims), intent(inout) :: reynolds_stress
+        type(vector_field), dimension(1:num_dims), intent(inout) :: eff_visc
 
         integer :: i, j, k, l, q
 
@@ -724,7 +848,7 @@ contains
                     do l = 1, num_dims
                         !$acc loop seq
                         do q = 1, num_dims
-                            pt_Re_stress(l)%vf(q)%sf(i, j, k) = (q_cons_vf(momxb-1+l)%sf(i, j, k) * q_cons_vf(momxb-1+q)%sf(i, j, k)) / q_cons_vf(1)%sf(i, j, k) ! (rho*u x rho*u)/rho = rho*(u x u) 
+                            reynolds_stress(l)%vf(q)%sf(i, j, k) = (q_cons_vf(momxb-1+l)%sf(i, j, k) * q_cons_vf(momxb-1+q)%sf(i, j, k)) / q_cons_vf(1)%sf(i, j, k) ! (rho*u x rho*u)/rho = rho*(u x u) 
                         end do
                     end do
                 end do
@@ -749,50 +873,50 @@ contains
         end do
 #endif
         
-        ! R_mu setup
+        ! effective viscosity setup
         !$acc parallel loop collapse(3) gang vector default(present)
         do i = 0, m
             do j = 0, n
                 do k = 0, p
-                    R_mu(1)%vf(1)%sf(i, j, k) = mu_visc * (2._wp*(q_cons_vf(momxb)%sf(i+1, j, k)/q_cons_vf(1)%sf(i+1, j, k) - q_cons_vf(momxb)%sf(i-1, j, k)/q_cons_vf(1)%sf(i-1, j, k))/(2._wp*dx(i)) & 
+                    eff_visc(1)%vf(1)%sf(i, j, k) = mu_visc * (2._wp*(q_cons_vf(momxb)%sf(i+1, j, k)/q_cons_vf(1)%sf(i+1, j, k) - q_cons_vf(momxb)%sf(i-1, j, k)/q_cons_vf(1)%sf(i-1, j, k))/(2._wp*dx(i)) & 
                                                 - 2._wp/3._wp*((q_cons_vf(momxb)%sf(i+1, j, k)/q_cons_vf(1)%sf(i+1, j, k) - q_cons_vf(momxb)%sf(i-1, j, k)/q_cons_vf(1)%sf(i-1, j, k))/(2._wp*dx(i)) & 
                                                 + (q_cons_vf(momxb+1)%sf(i, j+1, k)/q_cons_vf(1)%sf(i, j+1, k) - q_cons_vf(momxb+1)%sf(i, j-1, k)/q_cons_vf(1)%sf(i, j-1, k))/(2._wp*dy(j)) & 
                                                 + (q_cons_vf(momxb+2)%sf(i, j, k+1)/q_cons_vf(1)%sf(i, j, k+1) - q_cons_vf(momxb+2)%sf(i, j, k-1)/q_cons_vf(1)%sf(i, j, k-1))/(2._wp*dz(k))))
 
-                    R_mu(2)%vf(2)%sf(i, j, k) = mu_visc * (2._wp*(q_cons_vf(momxb+1)%sf(i, j+1, k)/q_cons_vf(1)%sf(i, j+1, k) - q_cons_vf(momxb+1)%sf(i, j-1, k)/q_cons_vf(1)%sf(i, j-1, k))/(2._wp*dy(j)) & 
+                    eff_visc(2)%vf(2)%sf(i, j, k) = mu_visc * (2._wp*(q_cons_vf(momxb+1)%sf(i, j+1, k)/q_cons_vf(1)%sf(i, j+1, k) - q_cons_vf(momxb+1)%sf(i, j-1, k)/q_cons_vf(1)%sf(i, j-1, k))/(2._wp*dy(j)) & 
                                                 - 2._wp/3._wp*((q_cons_vf(momxb)%sf(i+1, j, k)/q_cons_vf(1)%sf(i+1, j, k) - q_cons_vf(momxb)%sf(i-1, j, k)/q_cons_vf(1)%sf(i-1, j, k))/(2._wp*dx(i)) & 
                                                 + (q_cons_vf(momxb+1)%sf(i, j+1, k)/q_cons_vf(1)%sf(i, j+1, k) - q_cons_vf(momxb+1)%sf(i, j-1, k)/q_cons_vf(1)%sf(i, j-1, k))/(2._wp*dy(j)) & 
                                                 + (q_cons_vf(momxb+2)%sf(i, j, k+1)/q_cons_vf(1)%sf(i, j, k+1) - q_cons_vf(momxb+2)%sf(i, j, k-1)/q_cons_vf(1)%sf(i, j, k-1))/(2._wp*dz(k))))
 
-                    R_mu(3)%vf(3)%sf(i, j, k) = mu_visc * (2._wp*(q_cons_vf(momxb+2)%sf(i, j, k+1)/q_cons_vf(1)%sf(i, j, k+1) - q_cons_vf(momxb+2)%sf(i, j, k-1)/q_cons_vf(1)%sf(i, j, k-1))/(2._wp*dz(k)) & 
+                    eff_visc(3)%vf(3)%sf(i, j, k) = mu_visc * (2._wp*(q_cons_vf(momxb+2)%sf(i, j, k+1)/q_cons_vf(1)%sf(i, j, k+1) - q_cons_vf(momxb+2)%sf(i, j, k-1)/q_cons_vf(1)%sf(i, j, k-1))/(2._wp*dz(k)) & 
                                                 - 2._wp/3._wp*((q_cons_vf(momxb)%sf(i+1, j, k)/q_cons_vf(1)%sf(i+1, j, k) - q_cons_vf(momxb)%sf(i-1, j, k)/q_cons_vf(1)%sf(i-1, j, k))/(2._wp*dx(i)) & 
                                                 + (q_cons_vf(momxb+1)%sf(i, j+1, k)/q_cons_vf(1)%sf(i, j+1, k) - q_cons_vf(momxb+1)%sf(i, j-1, k)/q_cons_vf(1)%sf(i, j-1, k))/(2._wp*dy(j)) & 
                                                 + (q_cons_vf(momxb+2)%sf(i, j, k+1)/q_cons_vf(1)%sf(i, j, k+1) - q_cons_vf(momxb+2)%sf(i, j, k-1)/q_cons_vf(1)%sf(i, j, k-1))/(2._wp*dz(k))))
 
-                    R_mu(1)%vf(2)%sf(i, j, k) = mu_visc * ((q_cons_vf(momxb)%sf(i, j+1, k)/q_cons_vf(1)%sf(i, j+1, k) - q_cons_vf(momxb)%sf(i, j-1, k)/q_cons_vf(1)%sf(i, j-1, k))/(2._wp*dy(j))/q_cons_vf(1)%sf(i, j, k) & 
+                    eff_visc(1)%vf(2)%sf(i, j, k) = mu_visc * ((q_cons_vf(momxb)%sf(i, j+1, k)/q_cons_vf(1)%sf(i, j+1, k) - q_cons_vf(momxb)%sf(i, j-1, k)/q_cons_vf(1)%sf(i, j-1, k))/(2._wp*dy(j))/q_cons_vf(1)%sf(i, j, k) & 
                                                 + (q_cons_vf(momxb+1)%sf(i+1, j, k)/q_cons_vf(1)%sf(i+1, j, k) - q_cons_vf(momxb+1)%sf(i-1, j, k)/q_cons_vf(1)%sf(i-1, j, k))/(2._wp*dx(i))/q_cons_vf(1)%sf(i, j, k))
                                             
-                    R_mu(2)%vf(1)%sf(i, j, k) = R_mu(1)%vf(2)%sf(i, j, k)
+                    eff_visc(2)%vf(1)%sf(i, j, k) = eff_visc(1)%vf(2)%sf(i, j, k)
 
-                    R_mu(1)%vf(3)%sf(i, j, k) = mu_visc * ((q_cons_vf(momxb)%sf(i, j, k+1)/q_cons_vf(1)%sf(i, j, k+1) - q_cons_vf(momxb)%sf(i, j, k-1)/q_cons_vf(1)%sf(i, j, k-1))/(2._wp*dz(k))/q_cons_vf(1)%sf(i, j, k) & 
+                    eff_visc(1)%vf(3)%sf(i, j, k) = mu_visc * ((q_cons_vf(momxb)%sf(i, j, k+1)/q_cons_vf(1)%sf(i, j, k+1) - q_cons_vf(momxb)%sf(i, j, k-1)/q_cons_vf(1)%sf(i, j, k-1))/(2._wp*dz(k))/q_cons_vf(1)%sf(i, j, k) & 
                                                 + (q_cons_vf(momxb+2)%sf(i+1, j, k)/q_cons_vf(1)%sf(i+1, j, k) - q_cons_vf(momxb+2)%sf(i-1, j, k)/q_cons_vf(1)%sf(i-1, j, k))/(2._wp*dx(i))/q_cons_vf(1)%sf(i, j, k))
 
-                    R_mu(3)%vf(1)%sf(i, j, k) = R_mu(1)%vf(3)%sf(i, j, k)
+                    eff_visc(3)%vf(1)%sf(i, j, k) = eff_visc(1)%vf(3)%sf(i, j, k)
 
-                    R_mu(2)%vf(3)%sf(i, j, k) = mu_visc * ((q_cons_vf(momxb+1)%sf(i, j, k+1)/q_cons_vf(1)%sf(i, j, k+1) - q_cons_vf(momxb+1)%sf(i, j, k-1)/q_cons_vf(1)%sf(i, j, k-1))/(2._wp*dz(k))/q_cons_vf(1)%sf(i, j, k) & 
+                    eff_visc(2)%vf(3)%sf(i, j, k) = mu_visc * ((q_cons_vf(momxb+1)%sf(i, j, k+1)/q_cons_vf(1)%sf(i, j, k+1) - q_cons_vf(momxb+1)%sf(i, j, k-1)/q_cons_vf(1)%sf(i, j, k-1))/(2._wp*dz(k))/q_cons_vf(1)%sf(i, j, k) & 
                                                 + (q_cons_vf(momxb+2)%sf(i, j+1, k)/q_cons_vf(1)%sf(i, j+1, k) - q_cons_vf(momxb+2)%sf(i, j-1, k)/q_cons_vf(1)%sf(i, j-1, k))/(2._wp*dy(j))/q_cons_vf(1)%sf(i, j, k))
 
-                    R_mu(3)%vf(2)%sf(i, j, k) = R_mu(2)%vf(3)%sf(i, j, k)
+                    eff_visc(3)%vf(2)%sf(i, j, k) = eff_visc(2)%vf(3)%sf(i, j, k)
                 end do
             end do
         end do
 
     end subroutine s_setup_terms_filtering
 
-    subroutine s_compute_pseudo_turbulent_reynolds_stress(q_cons_filtered, pt_Re_stress, mag_div_Ru)
+    subroutine s_compute_pseudo_turbulent_reynolds_stress(q_cons_filtered, reynolds_stress, mag_reynolds_stress)
         type(scalar_field), dimension(sys_size), intent(in) :: q_cons_filtered
-        type(vector_field), dimension(1:num_dims), intent(inout) :: pt_Re_stress
-        type(scalar_field), intent(inout) :: mag_div_Ru
+        type(vector_field), dimension(1:num_dims), intent(inout) :: reynolds_stress
+        type(scalar_field), intent(inout) :: mag_reynolds_stress
         real(wp), dimension(1:num_dims, 0:m, 0:n, 0:p) :: div_Ru
         integer :: i, j, k, l, q    
 
@@ -804,7 +928,7 @@ contains
                     do l = 1, num_dims
                         !$acc loop seq
                         do q = 1, num_dims
-                            pt_Re_stress(l)%vf(q)%sf(i, j, k) = pt_Re_stress(l)%vf(q)%sf(i, j, k) &
+                            reynolds_stress(l)%vf(q)%sf(i, j, k) = reynolds_stress(l)%vf(q)%sf(i, j, k) &
                                                               - (q_cons_filtered(momxb-1+l)%sf(i, j, k) * q_cons_filtered(momxb-1+q)%sf(i, j, k) / q_cons_filtered(1)%sf(i, j, k))
                         end do
                     end do
@@ -820,7 +944,7 @@ contains
                     do l = 1, num_dims
                         !$acc loop seq
                         do q = 1, num_dims
-                            pt_Re_stress(l)%vf(q)%sf(i, j, k) = pt_Re_stress(l)%vf(q)%sf(i, j, k) * q_cons_filtered(advxb)%sf(i, j, k)
+                            reynolds_stress(l)%vf(q)%sf(i, j, k) = reynolds_stress(l)%vf(q)%sf(i, j, k) * filtered_fluid_indicator_function%sf(i, j, k)
                         end do 
                     end do 
                 end do
@@ -831,20 +955,20 @@ contains
 #ifdef MFC_MPI
         do l = 1, num_dims 
             do q = 1, num_dims
-                call s_populate_scalarfield_buffers(pt_Re_stress(l)%vf(q))
+                call s_populate_scalarfield_buffers(reynolds_stress(l)%vf(q))
             end do 
         end do
 #else
         do l = 1, num_dims
             do q = 1, num_dims
-                pt_Re_stress(l)%vf(q)%sf(-buff_size:-1, :, :) = pt_Re_stress(l)%vf(q)%sf(m-buff_size+1:m, :, :)
-                pt_Re_stress(l)%vf(q)%sf(m+1:m+buff_size, :, :) = pt_Re_stress(l)%vf(q)%sf(0:buff_size-1, :, :)
+                reynolds_stress(l)%vf(q)%sf(-buff_size:-1, :, :) = reynolds_stress(l)%vf(q)%sf(m-buff_size+1:m, :, :)
+                reynolds_stress(l)%vf(q)%sf(m+1:m+buff_size, :, :) = reynolds_stress(l)%vf(q)%sf(0:buff_size-1, :, :)
 
-                pt_Re_stress(l)%vf(q)%sf(:, -buff_size:-1, :) = pt_Re_stress(l)%vf(q)%sf(:, n-buff_size+1:n, :)
-                pt_Re_stress(l)%vf(q)%sf(:, n+1:n+buff_size, :) = pt_Re_stress(l)%vf(q)%sf(:, 0:buff_size-1, :)
+                reynolds_stress(l)%vf(q)%sf(:, -buff_size:-1, :) = reynolds_stress(l)%vf(q)%sf(:, n-buff_size+1:n, :)
+                reynolds_stress(l)%vf(q)%sf(:, n+1:n+buff_size, :) = reynolds_stress(l)%vf(q)%sf(:, 0:buff_size-1, :)
 
-                pt_Re_stress(l)%vf(q)%sf(:, :, -buff_size:-1) = pt_Re_stress(l)%vf(q)%sf(:, :, p-buff_size+1:p)
-                pt_Re_stress(l)%vf(q)%sf(:, :, p+1:p+buff_size) = pt_Re_stress(l)%vf(q)%sf(:, :, 0:buff_size-1)
+                reynolds_stress(l)%vf(q)%sf(:, :, -buff_size:-1) = reynolds_stress(l)%vf(q)%sf(:, :, p-buff_size+1:p)
+                reynolds_stress(l)%vf(q)%sf(:, :, p+1:p+buff_size) = reynolds_stress(l)%vf(q)%sf(:, :, 0:buff_size-1)
             end do
         end do
 #endif
@@ -856,9 +980,9 @@ contains
                 do k = 0, p
                     !$acc loop seq
                     do l = 1, num_dims
-                        div_Ru(l, i, j, k) = (pt_Re_stress(l)%vf(1)%sf(i+1, j, k) - pt_Re_stress(l)%vf(1)%sf(i-1, j, k))/(2._wp*dx(i)) &
-                                           + (pt_Re_stress(l)%vf(2)%sf(i, j+1, k) - pt_Re_stress(l)%vf(2)%sf(i, j-1, k))/(2._wp*dy(j)) & 
-                                           + (pt_Re_stress(l)%vf(3)%sf(i, j, k+1) - pt_Re_stress(l)%vf(3)%sf(i, j, k-1))/(2._wp*dz(k))
+                        div_Ru(l, i, j, k) = (reynolds_stress(l)%vf(1)%sf(i+1, j, k) - reynolds_stress(l)%vf(1)%sf(i-1, j, k))/(2._wp*dx(i)) &
+                                           + (reynolds_stress(l)%vf(2)%sf(i, j+1, k) - reynolds_stress(l)%vf(2)%sf(i, j-1, k))/(2._wp*dy(j)) & 
+                                           + (reynolds_stress(l)%vf(3)%sf(i, j, k+1) - reynolds_stress(l)%vf(3)%sf(i, j, k-1))/(2._wp*dz(k))
                     end do
                 end do
             end do
@@ -868,18 +992,18 @@ contains
         do i = 0, m
             do j = 0, n
                 do k = 0, p 
-                    mag_div_Ru%sf(i, j, k) = sqrt(div_Ru(1, i, j, k)**2 + div_Ru(2, i, j, k)**2 + div_Ru(3, i, j, k)**2)
+                    mag_reynolds_stress%sf(i, j, k) = sqrt(div_Ru(1, i, j, k)**2 + div_Ru(2, i, j, k)**2 + div_Ru(3, i, j, k)**2)
                 end do
             end do
         end do
 
     end subroutine s_compute_pseudo_turbulent_reynolds_stress
 
-    subroutine s_compute_R_mu(q_cons_filtered, R_mu, mag_div_R_mu)
+    subroutine s_compute_effective_viscosity(q_cons_filtered, eff_visc, mag_eff_visc)
         type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_filtered
-        type(vector_field), dimension(1:num_dims), intent(inout) :: R_mu
-        type(scalar_field), intent(inout) :: mag_div_R_mu
-        real(wp), dimension(1:num_dims, 0:m, 0:n, 0:p) :: div_R_mu
+        type(vector_field), dimension(1:num_dims), intent(inout) :: eff_visc
+        type(scalar_field), intent(inout) :: mag_eff_visc
+        real(wp), dimension(1:num_dims, 0:m, 0:n, 0:p) :: div_eff_visc
 
         integer :: i, j, k, l, q
 
@@ -901,40 +1025,40 @@ contains
         end do
 #endif
 
-        ! calculate R_mu
+        ! calculate eff_visc
         !$acc parallel loop collapse(3) gang vector default(present)
         do i = 0, m
             do j = 0, n
                 do k = 0, p
-                    R_mu(1)%vf(1)%sf(i, j, k) = R_mu(1)%vf(1)%sf(i, j, k) - mu_visc * (2._wp*(q_cons_filtered(momxb)%sf(i+1, j, k)/q_cons_filtered(1)%sf(i+1, j, k) - q_cons_filtered(momxb)%sf(i-1, j, k)/q_cons_filtered(1)%sf(i-1, j, k))/(2._wp*dx(i)) & 
+                    eff_visc(1)%vf(1)%sf(i, j, k) = eff_visc(1)%vf(1)%sf(i, j, k) - mu_visc * (2._wp*(q_cons_filtered(momxb)%sf(i+1, j, k)/q_cons_filtered(1)%sf(i+1, j, k) - q_cons_filtered(momxb)%sf(i-1, j, k)/q_cons_filtered(1)%sf(i-1, j, k))/(2._wp*dx(i)) & 
                                             - 2._wp/3._wp*((q_cons_filtered(momxb)%sf(i+1, j, k)/q_cons_filtered(1)%sf(i+1, j, k) - q_cons_filtered(momxb)%sf(i-1, j, k)/q_cons_filtered(1)%sf(i-1, j, k))/(2._wp*dx(i)) & 
                                             + (q_cons_filtered(momxb+1)%sf(i, j+1, k)/q_cons_filtered(1)%sf(i, j+1, k) - q_cons_filtered(momxb+1)%sf(i, j-1, k)/q_cons_filtered(1)%sf(i, j-1, k))/(2._wp*dy(j)) & 
                                             + (q_cons_filtered(momxb+2)%sf(i, j, k+1)/q_cons_filtered(1)%sf(i, j, k+1) - q_cons_filtered(momxb+2)%sf(i, j, k-1)/q_cons_filtered(1)%sf(i, j, k-1))/(2._wp*dz(k))))
 
-                    R_mu(2)%vf(2)%sf(i, j, k) = R_mu(2)%vf(2)%sf(i, j, k) - mu_visc * (2._wp*(q_cons_filtered(momxb+1)%sf(i, j+1, k)/q_cons_filtered(1)%sf(i, j+1, k) - q_cons_filtered(momxb+1)%sf(i, j-1, k)/q_cons_filtered(1)%sf(i, j-1, k))/(2._wp*dy(j)) & 
+                    eff_visc(2)%vf(2)%sf(i, j, k) = eff_visc(2)%vf(2)%sf(i, j, k) - mu_visc * (2._wp*(q_cons_filtered(momxb+1)%sf(i, j+1, k)/q_cons_filtered(1)%sf(i, j+1, k) - q_cons_filtered(momxb+1)%sf(i, j-1, k)/q_cons_filtered(1)%sf(i, j-1, k))/(2._wp*dy(j)) & 
                                             - 2._wp/3._wp*((q_cons_filtered(momxb)%sf(i+1, j, k)/q_cons_filtered(1)%sf(i+1, j, k) - q_cons_filtered(momxb)%sf(i-1, j, k)/q_cons_filtered(1)%sf(i-1, j, k))/(2._wp*dx(i)) & 
                                             + (q_cons_filtered(momxb+1)%sf(i, j+1, k)/q_cons_filtered(1)%sf(i, j+1, k) - q_cons_filtered(momxb+1)%sf(i, j-1, k)/q_cons_filtered(1)%sf(i, j-1, k))/(2._wp*dy(j)) & 
                                             + (q_cons_filtered(momxb+2)%sf(i, j, k+1)/q_cons_filtered(1)%sf(i, j, k+1) - q_cons_filtered(momxb+2)%sf(i, j, k-1)/q_cons_filtered(1)%sf(i, j, k-1))/(2._wp*dz(k))))
 
-                    R_mu(3)%vf(3)%sf(i, j, k) = R_mu(3)%vf(3)%sf(i, j, k) - mu_visc * (2._wp*(q_cons_filtered(momxb+2)%sf(i, j, k+1)/q_cons_filtered(1)%sf(i, j, k+1) - q_cons_filtered(momxb+2)%sf(i, j, k-1)/q_cons_filtered(1)%sf(i, j, k-1))/(2._wp*dz(k)) & 
+                    eff_visc(3)%vf(3)%sf(i, j, k) = eff_visc(3)%vf(3)%sf(i, j, k) - mu_visc * (2._wp*(q_cons_filtered(momxb+2)%sf(i, j, k+1)/q_cons_filtered(1)%sf(i, j, k+1) - q_cons_filtered(momxb+2)%sf(i, j, k-1)/q_cons_filtered(1)%sf(i, j, k-1))/(2._wp*dz(k)) & 
                                             - 2._wp/3._wp*((q_cons_filtered(momxb)%sf(i+1, j, k)/q_cons_filtered(1)%sf(i+1, j, k) - q_cons_filtered(momxb)%sf(i-1, j, k)/q_cons_filtered(1)%sf(i-1, j, k))/(2._wp*dx(i)) & 
                                             + (q_cons_filtered(momxb+1)%sf(i, j+1, k)/q_cons_filtered(1)%sf(i, j+1, k) - q_cons_filtered(momxb+1)%sf(i, j-1, k)/q_cons_filtered(1)%sf(i, j-1, k))/(2._wp*dy(j)) & 
                                             + (q_cons_filtered(momxb+2)%sf(i, j, k+1)/q_cons_filtered(1)%sf(i, j, k+1) - q_cons_filtered(momxb+2)%sf(i, j, k-1)/q_cons_filtered(1)%sf(i, j, k-1))/(2._wp*dz(k))))
 
-                    R_mu(1)%vf(2)%sf(i, j, k) = R_mu(1)%vf(2)%sf(i, j, k) - mu_visc * ((q_cons_filtered(momxb)%sf(i, j+1, k)/q_cons_filtered(1)%sf(i, j+1, k) - q_cons_filtered(momxb)%sf(i, j-1, k)/q_cons_filtered(1)%sf(i, j-1, k))/(2._wp*dy(j))/q_cons_filtered(1)%sf(i, j, k) & 
+                    eff_visc(1)%vf(2)%sf(i, j, k) = eff_visc(1)%vf(2)%sf(i, j, k) - mu_visc * ((q_cons_filtered(momxb)%sf(i, j+1, k)/q_cons_filtered(1)%sf(i, j+1, k) - q_cons_filtered(momxb)%sf(i, j-1, k)/q_cons_filtered(1)%sf(i, j-1, k))/(2._wp*dy(j))/q_cons_filtered(1)%sf(i, j, k) & 
                                             + (q_cons_filtered(momxb+1)%sf(i+1, j, k)/q_cons_filtered(1)%sf(i+1, j, k) - q_cons_filtered(momxb+1)%sf(i-1, j, k)/q_cons_filtered(1)%sf(i-1, j, k))/(2._wp*dx(i))/q_cons_filtered(1)%sf(i, j, k))
                                         
-                    R_mu(2)%vf(1)%sf(i, j, k) = R_mu(1)%vf(2)%sf(i, j, k)
+                    eff_visc(2)%vf(1)%sf(i, j, k) = eff_visc(1)%vf(2)%sf(i, j, k)
 
-                    R_mu(1)%vf(3)%sf(i, j, k) = R_mu(1)%vf(3)%sf(i, j, k) - mu_visc * ((q_cons_filtered(momxb)%sf(i, j, k+1)/q_cons_filtered(1)%sf(i, j, k+1) - q_cons_filtered(momxb)%sf(i, j, k-1)/q_cons_filtered(1)%sf(i, j, k-1))/(2._wp*dz(k))/q_cons_filtered(1)%sf(i, j, k) & 
+                    eff_visc(1)%vf(3)%sf(i, j, k) = eff_visc(1)%vf(3)%sf(i, j, k) - mu_visc * ((q_cons_filtered(momxb)%sf(i, j, k+1)/q_cons_filtered(1)%sf(i, j, k+1) - q_cons_filtered(momxb)%sf(i, j, k-1)/q_cons_filtered(1)%sf(i, j, k-1))/(2._wp*dz(k))/q_cons_filtered(1)%sf(i, j, k) & 
                                             + (q_cons_filtered(momxb+2)%sf(i+1, j, k)/q_cons_filtered(1)%sf(i+1, j, k) - q_cons_filtered(momxb+2)%sf(i-1, j, k)/q_cons_filtered(1)%sf(i-1, j, k))/(2._wp*dx(i))/q_cons_filtered(1)%sf(i, j, k))
 
-                    R_mu(3)%vf(1)%sf(i, j, k) = R_mu(1)%vf(3)%sf(i, j, k)
+                    eff_visc(3)%vf(1)%sf(i, j, k) = eff_visc(1)%vf(3)%sf(i, j, k)
 
-                    R_mu(2)%vf(3)%sf(i, j, k) = R_mu(2)%vf(3)%sf(i, j, k) - mu_visc * ((q_cons_filtered(momxb+1)%sf(i, j, k+1)/q_cons_filtered(1)%sf(i, j, k+1) - q_cons_filtered(momxb+1)%sf(i, j, k-1)/q_cons_filtered(1)%sf(i, j, k-1))/(2._wp*dz(k))/q_cons_filtered(1)%sf(i, j, k) & 
+                    eff_visc(2)%vf(3)%sf(i, j, k) = eff_visc(2)%vf(3)%sf(i, j, k) - mu_visc * ((q_cons_filtered(momxb+1)%sf(i, j, k+1)/q_cons_filtered(1)%sf(i, j, k+1) - q_cons_filtered(momxb+1)%sf(i, j, k-1)/q_cons_filtered(1)%sf(i, j, k-1))/(2._wp*dz(k))/q_cons_filtered(1)%sf(i, j, k) & 
                                             + (q_cons_filtered(momxb+2)%sf(i, j+1, k)/q_cons_filtered(1)%sf(i, j+1, k) - q_cons_filtered(momxb+2)%sf(i, j-1, k)/q_cons_filtered(1)%sf(i, j-1, k))/(2._wp*dy(j))/q_cons_filtered(1)%sf(i, j, k))
 
-                    R_mu(3)%vf(2)%sf(i, j, k) = R_mu(2)%vf(3)%sf(i, j, k)
+                    eff_visc(3)%vf(2)%sf(i, j, k) = eff_visc(2)%vf(3)%sf(i, j, k)
                     
                 end do
             end do
@@ -948,7 +1072,7 @@ contains
                     do l = 1, num_dims
                         !$acc loop seq
                         do q = 1, num_dims
-                            R_mu(l)%vf(q)%sf(i, j, k) = R_mu(l)%vf(q)%sf(i, j, k) * q_cons_filtered(advxb)%sf(i, j, k)
+                            eff_visc(l)%vf(q)%sf(i, j, k) = eff_visc(l)%vf(q)%sf(i, j, k) * filtered_fluid_indicator_function%sf(i, j, k)
                         end do 
                     end do 
                 end do
@@ -959,53 +1083,53 @@ contains
 #ifdef MFC_MPI
         do l = 1, num_dims
             do q = 1, num_dims
-                call s_populate_scalarfield_buffers(R_mu(l)%vf(q))
+                call s_populate_scalarfield_buffers(eff_visc(l)%vf(q))
             end do
         end do
 #else
         do l = 1, num_dims
             do q = 1, num_dims
-                R_mu(l)%vf(q)%sf(-buff_size:-1, :, :) = R_mu(l)%vf(q)%sf(m-buff_size+1:m, :, :)
-                R_mu(l)%vf(q)%sf(m+1:m+buff_size, :, :) = R_mu(l)%vf(q)%sf(0:buff_size-1, :, :)
+                eff_visc(l)%vf(q)%sf(-buff_size:-1, :, :) = eff_visc(l)%vf(q)%sf(m-buff_size+1:m, :, :)
+                eff_visc(l)%vf(q)%sf(m+1:m+buff_size, :, :) = eff_visc(l)%vf(q)%sf(0:buff_size-1, :, :)
 
-                R_mu(l)%vf(q)%sf(:, -buff_size:-1, :) = R_mu(l)%vf(q)%sf(:, n-buff_size+1:n, :)
-                R_mu(l)%vf(q)%sf(:, n+1:n+buff_size, :) = R_mu(l)%vf(q)%sf(:, 0:buff_size-1, :)
+                eff_visc(l)%vf(q)%sf(:, -buff_size:-1, :) = eff_visc(l)%vf(q)%sf(:, n-buff_size+1:n, :)
+                eff_visc(l)%vf(q)%sf(:, n+1:n+buff_size, :) = eff_visc(l)%vf(q)%sf(:, 0:buff_size-1, :)
 
-                R_mu(l)%vf(q)%sf(:, :, -buff_size:-1) = R_mu(l)%vf(q)%sf(:, :, p-buff_size+1:p)
-                R_mu(l)%vf(q)%sf(:, :, p+1:p+buff_size) = R_mu(l)%vf(q)%sf(:, :, 0:buff_size-1)
+                eff_visc(l)%vf(q)%sf(:, :, -buff_size:-1) = eff_visc(l)%vf(q)%sf(:, :, p-buff_size+1:p)
+                eff_visc(l)%vf(q)%sf(:, :, p+1:p+buff_size) = eff_visc(l)%vf(q)%sf(:, :, 0:buff_size-1)
             end do
         end do
 #endif
 
-        ! div(R_mu), using CD2 FD scheme 
-        !$acc parallel loop collapse(3) gang vector default(present) copy(div_R_mu)
+        ! div(eff_visc), using CD2 FD scheme 
+        !$acc parallel loop collapse(3) gang vector default(present) copy(div_eff_visc)
         do i = 0, m
             do j = 0, n 
                 do k = 0, p
                     !$acc loop seq
                     do l = 1, num_dims
-                        div_R_mu(l, i, j, k) = (R_mu(l)%vf(1)%sf(i+1, j, k) - R_mu(l)%vf(1)%sf(i-1, j, k))/(2._wp*dx(i)) &
-                                             + (R_mu(l)%vf(2)%sf(i, j+1, k) - R_mu(l)%vf(2)%sf(i, j-1, k))/(2._wp*dy(j)) & 
-                                             + (R_mu(l)%vf(3)%sf(i, j, k+1) - R_mu(l)%vf(3)%sf(i, j, k-1))/(2._wp*dz(k))
+                        div_eff_visc(l, i, j, k) = (eff_visc(l)%vf(1)%sf(i+1, j, k) - eff_visc(l)%vf(1)%sf(i-1, j, k))/(2._wp*dx(i)) &
+                                             + (eff_visc(l)%vf(2)%sf(i, j+1, k) - eff_visc(l)%vf(2)%sf(i, j-1, k))/(2._wp*dy(j)) & 
+                                             + (eff_visc(l)%vf(3)%sf(i, j, k+1) - eff_visc(l)%vf(3)%sf(i, j, k-1))/(2._wp*dz(k))
                     end do
                 end do
             end do
         end do
 
-        !$acc parallel loop collapse(3) gang vector default(present) copyin(div_R_mu)
+        !$acc parallel loop collapse(3) gang vector default(present) copyin(div_eff_visc)
         do i = 0, m
             do j = 0, n
                 do k = 0, p 
-                    mag_div_R_mu%sf(i, j, k) = sqrt(div_R_mu(1, i, j, k)**2 + div_R_mu(2, i, j, k)**2 + div_R_mu(3, i, j, k)**2)
+                    mag_eff_visc%sf(i, j, k) = sqrt(div_eff_visc(1, i, j, k)**2 + div_eff_visc(2, i, j, k)**2 + div_eff_visc(3, i, j, k)**2)
                 end do
             end do
         end do
 
-    end subroutine s_compute_R_mu
+    end subroutine s_compute_effective_viscosity
 
-    subroutine s_compute_interphase_momentum_exchange_term(pres_visc_stress_filtered, mag_F_IMET)
-        type(scalar_field), dimension(1:num_dims), intent(in) :: pres_visc_stress_filtered
-        type(scalar_field), intent(inout) :: mag_F_IMET
+    subroutine s_compute_interphase_momentum_exchange(int_mom_exch, mag_int_mom_exch)
+        type(scalar_field), dimension(1:num_dims), intent(in) :: int_mom_exch
+        type(scalar_field), intent(inout) :: mag_int_mom_exch
 
         integer :: i, j, k, l, q, ii
 
@@ -1013,17 +1137,48 @@ contains
         do i = 0, m
             do j = 0, n
                 do k = 0, p 
-                    mag_F_IMET%sf(i, j, k) = sqrt(pres_visc_stress_filtered(1)%sf(i, j, k)**2 & 
-                                                + pres_visc_stress_filtered(2)%sf(i, j, k)**2 & 
-                                                + pres_visc_stress_filtered(3)%sf(i, j, k)**2)
+                    mag_int_mom_exch%sf(i, j, k) = sqrt(int_mom_exch(1)%sf(i, j, k)**2 & 
+                                                + int_mom_exch(2)%sf(i, j, k)**2 & 
+                                                + int_mom_exch(3)%sf(i, j, k)**2)
                 end do
             end do
         end do 
 
-    end subroutine s_compute_interphase_momentum_exchange_term
+    end subroutine s_compute_interphase_momentum_exchange
 
     subroutine s_finalize_fftw_explicit_filter_module
-        @:DEALLOCATE(fluid_indicator_function_I%sf)
+        @:DEALLOCATE(fluid_indicator_function%sf)
+        @:DEALLOCATE(filtered_fluid_indicator_function%sf)
+
+        do i = 1, sys_size
+            @:DEALLOCATE(q_cons_filtered(i)%sf)
+        end do
+        @:DEALLOCATE(q_cons_filtered)
+
+        do i = 1, num_dims
+            do j = 1, num_dims
+                @:DEALLOCATE(reynolds_stress(i)%vf(j)%sf)
+            end do
+            @:DEALLOCATE(reynolds_stress(i)%vf)
+        end do
+        @:DEALLOCATE(reynolds_stress)
+
+        do i = 1, num_dims
+            do j = 1, num_dims
+                @:DEALLOCATE(eff_visc(i)%vf(j)%sf)
+            end do
+            @:DEALLOCATE(eff_visc(i)%vf)
+        end do
+        @:DEALLOCATE(eff_visc)
+
+        do i = 1, num_dims
+            @:DEALLOCATE(int_mom_exch(i)%sf)
+        end do
+        @:DEALLOCATE(int_mom_exch)
+
+        @:DEALLOCATE(mag_reynolds_stress%sf)
+        @:DEALLOCATE(mag_eff_visc%sf)
+        @:DEALLOCATE(mag_int_mom_exch%sf)
 
         @:DEALLOCATE(data_real_in1d, data_cmplx_out1d, data_cmplx_out1dy)
         @:DEALLOCATE(cmplx_kernelG1d, real_kernelG_in)
diff --git a/src/simulation/p_main.fpp b/src/simulation/p_main.fpp
index 7f07744d8b..4c3ae9b62b 100644
--- a/src/simulation/p_main.fpp
+++ b/src/simulation/p_main.fpp
@@ -56,9 +56,8 @@ program p_main
     call s_initialize_gpu_vars()
     call nvtxEndRange
 
+    if (volume_filtering_momentum_eqn) call s_initialize_filtering_kernel()
     call s_initialize_fluid_indicator_function()
-    if (fourier_transform_filtering) call s_initialize_filtering_kernel()
-    if (fourier_transform_filtering) call s_initialize_filtered_fluid_indicator_function(q_cons_filtered(advxb))
 
     ! Setting the time-step iterator to the first time-step
     if (cfl_dt) then
diff --git a/toolchain/mfc/run/case_dicts.py b/toolchain/mfc/run/case_dicts.py
index a58d29869f..212d7a6cb6 100644
--- a/toolchain/mfc/run/case_dicts.py
+++ b/toolchain/mfc/run/case_dicts.py
@@ -306,7 +306,7 @@ def analytic(self):
     'rho_inf_ref': ParamType.REAL,
     'T_inf_ref': ParamType.REAL,
     'periodic_forcing': ParamType.LOG,
-    'fourier_transform_filtering': ParamType.LOG,
+    'volume_filtering_momentum_eqn': ParamType.LOG,
     'compute_autocorrelation': ParamType.LOG,
 })
 

From c35d5778c64ef7ca010b2ed7e0e6e2e67c2f698a Mon Sep 17 00:00:00 2001
From: conradd3 <conradd3@illinois.edu>
Date: Mon, 25 Aug 2025 11:33:52 -0500
Subject: [PATCH 06/30] statistics verification -> test gpus

---
 runs/3d_1sphere_filtering/case.py             |   9 +-
 runs/3d_1sphere_periodic/case.py              |   1 -
 runs/3d_drag_test/case.py                     |   1 -
 .../{ => centered}/case.py                    |  35 +-
 runs/3d_periodic_ibs_test/off-center/case.py  | 146 +++
 runs/phi01/case.py                            |   1 -
 src/common/m_mpi_common.fpp                   |   6 +-
 src/simulation/m_compute_statistics.fpp       | 258 +++---
 src/simulation/m_global_parameters.fpp        |   6 +-
 src/simulation/m_mpi_proxy.fpp                |   4 +-
 src/simulation/m_rhs.fpp                      | 127 +--
 src/simulation/m_start_up.fpp                 |  38 +-
 src/simulation/m_time_steppers.fpp            | 100 +-
 src/simulation/m_volume_filtering.fpp         | 865 ++++++++++--------
 src/simulation/p_main.fpp                     |   5 +-
 toolchain/mfc/run/case_dicts.py               |   2 +-
 16 files changed, 832 insertions(+), 772 deletions(-)
 rename runs/3d_periodic_ibs_test/{ => centered}/case.py (82%)
 create mode 100644 runs/3d_periodic_ibs_test/off-center/case.py

diff --git a/runs/3d_1sphere_filtering/case.py b/runs/3d_1sphere_filtering/case.py
index 0964ea5dd4..c6d138c110 100644
--- a/runs/3d_1sphere_filtering/case.py
+++ b/runs/3d_1sphere_filtering/case.py
@@ -25,8 +25,8 @@
 #print('Kn = ' + str( np.sqrt(np.pi*gam_a/2)*(M/Re) )) # Kn < 0.01 = continuum flow
 
 dt = 4.0E-06
-Nt = 31
-t_save = 1
+Nt = 1000
+t_save = 10
 
 Nx = 63
 Ny = 63
@@ -65,6 +65,7 @@
     "t_step_start": 0,
     "t_step_stop": Nt,  # 3000
     "t_step_save": t_save,  # 10
+    "t_step_stat_start": 100,
     # Simulation Algorithm Parameters
     # Only one patches are necessary, the air tube
     "num_patches": 1,
@@ -132,18 +133,16 @@
     # Fluids Physical Parameters
     "fluid_pp(1)%gamma": 1.0e00 / (gam_a - 1.0e00),  # 2.50(Not 1.40)
     "fluid_pp(1)%pi_inf": 0,
-    "fluid_pp(1)%Re(1)": Re,
+    "fluid_pp(1)%Re(1)": 1.0 / mu,
 
     # new case additions
     "periodic_forcing": "T",
     "periodic_ibs": "T",
-    "compute_CD": "F",
     "volume_filtering_momentum_eqn": "T",
 
     "u_inf_ref": v1,
     "rho_inf_ref": rho,
     "T_inf_ref": T,
-    "mu_visc": mu,
 
     "store_levelset": "F",
     "slab_domain_decomposition": "T", 
diff --git a/runs/3d_1sphere_periodic/case.py b/runs/3d_1sphere_periodic/case.py
index f4512b5f00..d8760b7909 100644
--- a/runs/3d_1sphere_periodic/case.py
+++ b/runs/3d_1sphere_periodic/case.py
@@ -143,7 +143,6 @@
     "u_inf_ref": v1,
     "rho_inf_ref": rho,
     "T_inf_ref": T,
-    "mu_visc": mu,
 
     "store_levelset": "F",
     "slab_domain_decomposition": "T", 
diff --git a/runs/3d_drag_test/case.py b/runs/3d_drag_test/case.py
index 2eb50ebc62..9a78272a88 100644
--- a/runs/3d_drag_test/case.py
+++ b/runs/3d_drag_test/case.py
@@ -134,7 +134,6 @@
 
     # new case additions
     "compute_CD": "T",
-    "mu_visc": mu,
     "u_inf_ref": v1,
     "rho_inf_ref": rho,
     "T_inf_ref": T,
diff --git a/runs/3d_periodic_ibs_test/case.py b/runs/3d_periodic_ibs_test/centered/case.py
similarity index 82%
rename from runs/3d_periodic_ibs_test/case.py
rename to runs/3d_periodic_ibs_test/centered/case.py
index 9a63a3f4a3..a3199b3933 100644
--- a/runs/3d_periodic_ibs_test/case.py
+++ b/runs/3d_periodic_ibs_test/centered/case.py
@@ -15,7 +15,7 @@
 
 M = 1.2
 Re = 1500.0
-v1 = M*(gam_a*P/rho)**(1.0/2.0)
+v1 = M*np.sqrt((gam_a*P/rho))
 
 mu = rho*v1*D/Re # dynamic viscosity for current case
 
@@ -25,8 +25,8 @@
 #print('Kn = ' + str( np.sqrt(np.pi*gam_a/2)*(M/Re) )) # Kn < 0.01 = continuum flow
 
 dt = 4.0E-06
-Nt = 5
-t_save = 1
+Nt = 100
+t_save = 10
 
 Nx = 63
 Ny = 63
@@ -36,25 +36,11 @@
 ib_dict = {}
 ib_dict.update({
     f"patch_ib({1})%geometry": 8,
-    f"patch_ib({1})%x_centroid": 0.5,
-    f"patch_ib({1})%y_centroid": 0.5,
-    f"patch_ib({1})%z_centroid": 0.5,
+    f"patch_ib({1})%x_centroid": 0.0,
+    f"patch_ib({1})%y_centroid": 0.0,
+    f"patch_ib({1})%z_centroid": 0.0,
     f"patch_ib({1})%radius": D / 2,
     f"patch_ib({1})%slip": "F",
-
-    f"patch_ib({2})%geometry": 8,
-    f"patch_ib({2})%x_centroid": 0.0,
-    f"patch_ib({2})%y_centroid": 0.0,
-    f"patch_ib({2})%z_centroid": 0.0,
-    f"patch_ib({2})%radius": D / 2,
-    f"patch_ib({2})%slip": "F",
-
-    f"patch_ib({3})%geometry": 8,
-    f"patch_ib({3})%x_centroid": 0.0,
-    f"patch_ib({3})%y_centroid": 0.5,
-    f"patch_ib({3})%z_centroid": 0.25,
-    f"patch_ib({3})%radius": D / 2,
-    f"patch_ib({3})%slip": "F",
     })
 
 # Configuring case dictionary
@@ -116,7 +102,7 @@
     "bc_z%end": -1,
     # Set IB to True and add 1 patch
     "ib": "T",
-    "num_ibs": 3,
+    "num_ibs": 1,
     "viscous": "T",
     # Formatted Database Files Structure Parameters
     "format": 1,
@@ -144,10 +130,15 @@
     # Fluids Physical Parameters
     "fluid_pp(1)%gamma": 1.0e00 / (gam_a - 1.0e00),  # 2.50(Not 1.40)
     "fluid_pp(1)%pi_inf": 0,
-    "fluid_pp(1)%Re(1)": Re,
+    "fluid_pp(1)%Re(1)": 1.0 / mu,
 
     # new case additions
+    "periodic_forcing": "T",
     "periodic_ibs": "T",
+
+    "u_inf_ref": v1,
+    "rho_inf_ref": rho,
+    "T_inf_ref": T,
     }
 
 case_dict.update(ib_dict)
diff --git a/runs/3d_periodic_ibs_test/off-center/case.py b/runs/3d_periodic_ibs_test/off-center/case.py
new file mode 100644
index 0000000000..ecd3d7c9f5
--- /dev/null
+++ b/runs/3d_periodic_ibs_test/off-center/case.py
@@ -0,0 +1,146 @@
+import json
+import math
+import numpy as np
+
+Mu = 1.84e-05
+gam_a = 1.4
+R = 287.0
+
+D = 0.1
+
+P = 101325 # Pa
+rho = 1.225 # kg/m^3
+
+T = P/(rho*R)
+
+M = 1.2
+Re = 1500.0
+v1 = M*(gam_a*P/rho)**(1.0/2.0)
+
+mu = rho*v1*D/Re # dynamic viscosity for current case
+
+#print('mu: ', mu)
+#print('v1: ', v1)
+#print('rho: ', rho)
+#print('Kn = ' + str( np.sqrt(np.pi*gam_a/2)*(M/Re) )) # Kn < 0.01 = continuum flow
+
+dt = 4.0E-06
+Nt = 100
+t_save = 10
+
+Nx = 63
+Ny = 63
+Nz = 63
+
+# immersed boundary dictionary
+ib_dict = {}
+ib_dict.update({
+    f"patch_ib({1})%geometry": 8,
+    f"patch_ib({1})%x_centroid": 15.0 * D,
+    f"patch_ib({1})%y_centroid": 15.0 * D,
+    f"patch_ib({1})%z_centroid": 15.0 * D,
+    f"patch_ib({1})%radius": D / 2,
+    f"patch_ib({1})%slip": "F",
+    })
+
+# Configuring case dictionary
+case_dict = {
+    # Logistics
+    "run_time_info": "T",
+    # Computational Domain Parameters
+    # x direction
+    "x_domain%beg": 5.0 * D,
+    "x_domain%end": 15.0 * D,
+    # y direction
+    "y_domain%beg": 5.0 * D,
+    "y_domain%end": 15.0 * D,
+    # z direction
+    "z_domain%beg": 5.0 * D,
+    "z_domain%end": 15.0 * D,
+    "cyl_coord": "F",
+    "m": Nx,
+    "n": Ny,
+    "p": Nz,
+    "dt": dt,
+    "t_step_start": 0,
+    "t_step_stop": Nt,  # 3000
+    "t_step_save": t_save,  # 10
+    # Simulation Algorithm Parameters
+    # Only one patch is necessary: a single box of air spanning the whole domain
+    "num_patches": 1,
+    # Use the 5 equation model
+    "model_eqns": 2,
+    # 6 equations model does not need the K \div(u) term
+    "alt_soundspeed": "F",
+    # One fluids: air
+    "num_fluids": 1,
+    # time step
+    "mpp_lim": "F",
+    # Correct errors when computing speed of sound
+    "mixture_err": "T",
+    # Use TVD RK3 for time marching
+    "time_stepper": 3,
+    # Reconstruct the primitive variables to minimize spurious
+    # Use WENO5
+    "weno_order": 5,
+    "weno_eps": 1.0e-14,
+    "weno_Re_flux": "T",
+    "weno_avg": "T",
+    "avg_state": 2,
+    "mapped_weno": "T",
+    "null_weights": "F",
+    "mp_weno": "T",
+    "riemann_solver": 2,
+    "low_Mach": 1,
+    "wave_speeds": 1,
+    # periodic bc
+    "bc_x%beg": -1,
+    "bc_x%end": -1,
+    "bc_y%beg": -1,
+    "bc_y%end": -1,
+    "bc_z%beg": -1,
+    "bc_z%end": -1,
+    # Set IB to True and add 1 patch
+    "ib": "T",
+    "num_ibs": 1,
+    "viscous": "T",
+    # Formatted Database Files Structure Parameters
+    "format": 1,
+    "precision": 2,
+    "prim_vars_wrt": "T",
+    "E_wrt": "T",
+    "parallel_io": "T",
+    
+    "patch_icpp(1)%geometry": 9,
+    "patch_icpp(1)%x_centroid": 10.0*D,
+    # Uniform medium density, centroid is at the center of the domain
+    "patch_icpp(1)%y_centroid": 10.0*D,
+    "patch_icpp(1)%z_centroid": 10.0*D,
+    "patch_icpp(1)%length_x": 10 * D,
+    "patch_icpp(1)%length_y": 10 * D,
+    "patch_icpp(1)%length_z": 10 * D,
+    # Specify the patch primitive variables
+    "patch_icpp(1)%vel(1)": v1,
+    "patch_icpp(1)%vel(2)": 0.0e00,
+    "patch_icpp(1)%vel(3)": 0.0e00,
+    "patch_icpp(1)%pres": P,
+    "patch_icpp(1)%alpha_rho(1)": rho,
+    "patch_icpp(1)%alpha(1)": 1.0e00,
+    # Patch: Sphere Immersed Boundary
+    # Fluids Physical Parameters
+    "fluid_pp(1)%gamma": 1.0e00 / (gam_a - 1.0e00),  # 2.50(Not 1.40)
+    "fluid_pp(1)%pi_inf": 0,
+    "fluid_pp(1)%Re(1)": 1.0 / mu,
+
+    # new case additions
+    "periodic_forcing": "T",
+    "periodic_ibs": "T",
+
+    "u_inf_ref": v1,
+    "rho_inf_ref": rho,
+    "T_inf_ref": T,
+    }
+
+case_dict.update(ib_dict)
+
+print(json.dumps(case_dict))
diff --git a/runs/phi01/case.py b/runs/phi01/case.py
index 8e7a5bff4b..1d5b26d462 100644
--- a/runs/phi01/case.py
+++ b/runs/phi01/case.py
@@ -148,7 +148,6 @@
     "u_inf_ref": v1,
     "rho_inf_ref": rho,
     "T_inf_ref": T,
-    "mu_visc": mu,
 
     "store_levelset": "F",
     "slab_domain_decomposition": "T", 
diff --git a/src/common/m_mpi_common.fpp b/src/common/m_mpi_common.fpp
index 662d096665..d3dcab1ac7 100644
--- a/src/common/m_mpi_common.fpp
+++ b/src/common/m_mpi_common.fpp
@@ -1569,8 +1569,10 @@ contains
 #ifdef MFC_MPI
         deallocate (buff_send, buff_recv)
 #ifdef MFC_SIMULATION
-        @:DEALLOCATE(buff_send_scalarfield)
-        @:DEALLOCATE(buff_recv_scalarfield)
+        if (volume_filtering_momentum_eqn) then
+            @:DEALLOCATE(buff_send_scalarfield)
+            @:DEALLOCATE(buff_recv_scalarfield)
+        end if 
 #endif
 #endif
 
diff --git a/src/simulation/m_compute_statistics.fpp b/src/simulation/m_compute_statistics.fpp
index 1721b0706c..93b8d6502d 100644
--- a/src/simulation/m_compute_statistics.fpp
+++ b/src/simulation/m_compute_statistics.fpp
@@ -12,165 +12,199 @@ module m_compute_statistics
     implicit none
 
     private; public :: s_initialize_statistics_module, s_finalize_statistics_module, &
- s_compute_s_order_statistics, s_autocorrelation_function
+    s_compute_statistics_momentum_unclosed_terms, s_update_statistics, &
+    s_compute_234_order_statistics
+ 
+    ! terms for computing 1st, 2nd, 3rd, and 4th order statistical moments
+    type(scalar_field), allocatable, dimension(:) :: Msn_reynolds_stress
+    type(scalar_field), allocatable, dimension(:) :: Msn_eff_visc
+    type(scalar_field), allocatable, dimension(:) :: Msn_int_mom_exch
 
-    type(scalar_field), allocatable, dimension(:) :: xnbar_stat
+    ! 2nd, 3rd, and 4th statistical moments for unclosed terms in volume filtered momentum equation
+    type(scalar_field), allocatable, dimension(:), public :: stat_reynolds_stress
+    type(scalar_field), allocatable, dimension(:), public :: stat_eff_visc
+    type(scalar_field), allocatable, dimension(:), public :: stat_int_mom_exch
 
-    type(scalar_field), allocatable, dimension(:) :: delta_stat
+    !$acc declare create(Msn_reynolds_stress, Msn_eff_visc, Msn_int_mom_exch)
 
-    type(vector_field), allocatable, dimension(:) :: Msn_stat
-
-    real(wp), allocatable, dimension(:) :: xm_th
-
-    real(wp), allocatable, dimension(:) :: x_mom_autocorr
-
-    !$acc declare create(xnbar_stat, delta_stat, Msn_stat)
+    !$acc declare create(stat_reynolds_stress, stat_eff_visc, stat_int_mom_exch)
 
 contains
 
+    !> Allocate the running-moment accumulators (Msn_*) and the derived statistics (stat_*)
+    !! for the three unclosed momentum terms, and zero them on both host and device.
     subroutine s_initialize_statistics_module
-        integer :: i, j
-        @:ALLOCATE(xnbar_stat(1:3))
-        do i = 1, 3
-            @:ALLOCATE(xnbar_stat(i)%sf(0:m, 0:n, 0:p))
-            @:ACC_SETUP_SFs(xnbar_stat(i))
+        integer :: i
+
+        @:ALLOCATE(Msn_reynolds_stress(1:4))
+        do i = 1, 4
+            @:ALLOCATE(Msn_reynolds_stress(i)%sf(0:m, 0:n, 0:p))
+            @:ACC_SETUP_SFs(Msn_reynolds_stress(i))
+            ! the running-moment recurrences accumulate in place, so every moment must start at zero
+            Msn_reynolds_stress(i)%sf = 0._wp
+            !$acc update device(Msn_reynolds_stress(i)%sf)
         end do
 
-        @:ALLOCATE(delta_stat(1:3))
-        do i = 1, 3
-            @:ALLOCATE(delta_stat(i)%sf(0:m, 0:n, 0:p))
-            @:ACC_SETUP_SFs(delta_stat(i))
+        @:ALLOCATE(Msn_eff_visc(1:4))
+        do i = 1, 4
+            @:ALLOCATE(Msn_eff_visc(i)%sf(0:m, 0:n, 0:p))
+            @:ACC_SETUP_SFs(Msn_eff_visc(i))
+            Msn_eff_visc(i)%sf = 0._wp
+            !$acc update device(Msn_eff_visc(i)%sf)
         end do
 
-        @:ALLOCATE(Msn_stat(1:num_dims))
-        do i = 1, 3
-            @:ALLOCATE(Msn_stat(i)%vf(2:4))
+        @:ALLOCATE(Msn_int_mom_exch(1:4))
+        do i = 1, 4
+            @:ALLOCATE(Msn_int_mom_exch(i)%sf(0:m, 0:n, 0:p))
+            @:ACC_SETUP_SFs(Msn_int_mom_exch(i))
+            Msn_int_mom_exch(i)%sf = 0._wp
+            !$acc update device(Msn_int_mom_exch(i)%sf)
         end do
-        do i = 1, 3
-            do j = 2, 4
-                @:ALLOCATE(Msn_stat(i)%vf(j)%sf(0:m, 0:n, 0:p))
-            end do
-            @:ACC_SETUP_VFs(Msn_stat(i))
+
+        @:ALLOCATE(stat_reynolds_stress(2:4))
+        do i = 2, 4
+            @:ALLOCATE(stat_reynolds_stress(i)%sf(0:m, 0:n, 0:p))
+            @:ACC_SETUP_SFs(stat_reynolds_stress(i))
+            stat_reynolds_stress(i)%sf = 0._wp
+            !$acc update device(stat_reynolds_stress(i)%sf)
         end do
 
-        if (compute_autocorrelation) then
-            @:ALLOCATE(xm_th(t_step_stop))
-            @:ALLOCATE(x_mom_autocorr(t_step_stop))
-        end if
+        @:ALLOCATE(stat_eff_visc(2:4))
+        do i = 2, 4
+            @:ALLOCATE(stat_eff_visc(i)%sf(0:m, 0:n, 0:p))
+            @:ACC_SETUP_SFs(stat_eff_visc(i))
+            stat_eff_visc(i)%sf = 0._wp
+            !$acc update device(stat_eff_visc(i)%sf)
+        end do
+
+        @:ALLOCATE(stat_int_mom_exch(2:4))
+        do i = 2, 4
+            @:ALLOCATE(stat_int_mom_exch(i)%sf(0:m, 0:n, 0:p))
+            @:ACC_SETUP_SFs(stat_int_mom_exch(i))
+            stat_int_mom_exch(i)%sf = 0._wp
+            !$acc update device(stat_int_mom_exch(i)%sf)
+        end do
 
     end subroutine s_initialize_statistics_module
 
-    subroutine s_compute_s_order_statistics(q_temp, n_step, s_order_stat, id)
-        type(scalar_field), intent(in) :: q_temp
+    !> Fold one new sample of each unclosed momentum term into its running moments and, once
+    !! more than three samples exist, refresh the variance, skewness and excess kurtosis
+    !! (the bias-corrected estimators divide by (ns - 2) and (ns - 3)).
+    !! @param n_step sample count including the current sample (used as n in the recurrences)
+    !! @param reynolds_stress, eff_visc, int_mom_exch scalar fields sampled at this step
+    subroutine s_compute_statistics_momentum_unclosed_terms(n_step, reynolds_stress, eff_visc, int_mom_exch)
+        type(scalar_field), intent(in) :: reynolds_stress 
+        type(scalar_field), intent(in) :: eff_visc
+        type(scalar_field), intent(in) :: int_mom_exch
+        
         integer, intent(in) :: n_step
-        type(scalar_field), dimension(2:4), intent(inout) :: s_order_stat
-        integer, intent(in) :: id
-        real(wp) :: ns
-        integer :: i, j, k, ii
+        real(wp) :: ns 
 
         ns = real(n_step, wp)
 
-        if (n_step == 1) then
-            !$acc parallel loop collapse(3) gang vector default(present)
-            do i = 0, m 
-                do j = 0, n
-                    do k = 0, p
-                        xnbar_stat(id)%sf(i, j, k) = q_temp%sf(i, j, k)
-                        Msn_stat(id)%vf(2)%sf(i, j, k) = 0.0_wp
-                        Msn_stat(id)%vf(3)%sf(i, j, k) = 0.0_wp
-                        Msn_stat(id)%vf(4)%sf(i, j, k) = 0.0_wp
-                        s_order_stat(2)%sf(i, j, k) = 0.0_wp
-                        s_order_stat(3)%sf(i, j, k) = 0.0_wp
-                        s_order_stat(4)%sf(i, j, k) = 0.0_wp
-                    end do 
-                end do
-            end do
-        else 
-            !$acc parallel loop collapse(3) gang vector default(present) copyin(ns)
-            do i = 0, m 
-                do j = 0, n
-                    do k = 0, p
-                        delta_stat(id)%sf(i, j, k) = q_temp%sf(i, j, k) - xnbar_stat(id)%sf(i, j, k)
-
-                        xnbar_stat(id)%sf(i, j, k) = xnbar_stat(id)%sf(i, j, k) + delta_stat(id)%sf(i, j, k)/ns
-
-                        Msn_stat(id)%vf(4)%sf(i, j, k) = Msn_stat(id)%vf(4)%sf(i, j, k) & 
-                                                + (delta_stat(id)%sf(i, j, k)**4)*(ns - 1.0_wp)*(ns**2 - 3.0_wp*ns + 3.0_wp)/(ns**3) &
-                                                + 6.0_wp*(delta_stat(id)%sf(i, j, k)**2)*Msn_stat(id)%vf(2)%sf(i, j, k)/(ns**2) &
-                                                - 4.0_wp*delta_stat(id)%sf(i, j, k)*Msn_stat(id)%vf(3)%sf(i, j, k)/ns
-
-                        Msn_stat(id)%vf(3)%sf(i, j, k) = Msn_stat(id)%vf(3)%sf(i, j, k) & 
-                                                + (delta_stat(id)%sf(i, j, k)**3)*(ns - 1.0_wp)*(ns - 2.0_wp)/(ns**2) & 
-                                                - 3.0_wp*delta_stat(id)%sf(i, j, k)*Msn_stat(id)%vf(2)%sf(i, j, k)/ns
-
-                        Msn_stat(id)%vf(2)%sf(i, j, k) = Msn_stat(id)%vf(2)%sf(i, j, k) &
-                                                + (delta_stat(id)%sf(i, j, k)**2)*(ns - 1.0_wp)/ns
-
-                        s_order_stat(2)%sf(i, j, k) = Msn_stat(id)%vf(2)%sf(i, j, k)/(ns - 1.0_wp)
-
-                        s_order_stat(3)%sf(i, j, k) = sqrt(ns)*Msn_stat(id)%vf(3)%sf(i, j, k)/(Msn_stat(id)%vf(2)%sf(i, j, k)**1.5_wp)
-
-                        s_order_stat(4)%sf(i, j, k) = ns*Msn_stat(id)%vf(4)%sf(i, j, k)/(Msn_stat(id)%vf(2)%sf(i, j, k)**2) - 3.0_wp
-                    end do 
-                end do
-            end do
+        ! update M1, M2, M3, M4
+        call s_update_statistics(ns, reynolds_stress, Msn_reynolds_stress)
+        call s_update_statistics(ns, eff_visc, Msn_eff_visc)
+        call s_update_statistics(ns, int_mom_exch, Msn_int_mom_exch)
+
+        ! compute 2nd, 3rd, 4th order statistical moments
+        if (n_step > 3) then 
+            call s_compute_234_order_statistics(ns, Msn_reynolds_stress, stat_reynolds_stress) 
+            call s_compute_234_order_statistics(ns, Msn_eff_visc, stat_eff_visc) 
+            call s_compute_234_order_statistics(ns, Msn_int_mom_exch, stat_int_mom_exch)  
         end if
 
-    end subroutine s_compute_s_order_statistics
+    end subroutine s_compute_statistics_momentum_unclosed_terms
 
-    subroutine s_autocorrelation_function(n_step, q_cons_vf)
-        integer, intent(in) :: n_step
-        type(scalar_field), dimension(sys_size), intent(in) :: q_cons_vf
-        real(wp) :: q_avg, q_var, test
-        integer :: i, j, k, s, it
-
-        !$acc update host(q_cons_vf(2))
-        xm_th(n_step) = q_cons_vf(2)%sf(m/4, n/4, p/4)
-
-        if (n_step > 1) then
-            ! compute average
-            q_avg = sum(xm_th(1:n_step)) / real(n_step, wp)
-
-            ! compute variance
-            q_var = sum((xm_th(1:n_step) - q_avg)**2) / real(n_step, wp)
-
-            ! compute autocorrelation function
-            do s = 0, n_step - 1
-                x_mom_autocorr(s+1) = 0.0_wp
-                do it = 1, n_step - s
-                    x_mom_autocorr(s+1) = x_mom_autocorr(s+1) + (xm_th(it) - q_avg) * (xm_th(it+s) - q_avg)
-                end do
-                x_mom_autocorr(s+1) = x_mom_autocorr(s+1) / ((n_step - s) * q_var)
-            end do
-            
-            print *, q_cons_vf(2)%sf(m/4, n/4, p/4)
-            print *, 'Autocorrelation at lag 0:', x_mom_autocorr(1) 
-            print *, 'Autocorrelation at lag N/2:', x_mom_autocorr(n_step/2)
-            print *, 'Autocorrelation at max lag:', x_mom_autocorr(n_step)
-
-        end if
+    !> One-pass (Welford-type) update of the running mean and of the 2nd-4th central moment
+    !! sums with a new sample; M4 and M3 are updated before M2 because they use its old value.
+    !! @param ns sample count including the new sample
+    !! @param q_temp new sample
+    !! @param Msn Msn(1) is the running mean, Msn(2:4) the central moment sums
+    subroutine s_update_statistics(ns, q_temp, Msn)
+        type(scalar_field), intent(in) :: q_temp
+        type(scalar_field), dimension(1:4), intent(inout) :: Msn
+
+        real(wp), intent(in) :: ns
+        real(wp) :: delta, delta_n, delta_n2, delta_f
+        integer :: i, j, k
+
+        !$acc parallel loop collapse(3) gang vector default(present) copyin(ns) private(delta, delta_n, delta_n2, delta_f)
+        do i = 0, m 
+            do j = 0, n 
+                do k = 0, p
+                    delta = q_temp%sf(i, j, k) - Msn(1)%sf(i, j, k)
+                    delta_n = delta / ns
+                    delta_n2 = delta_n**2
+                    delta_f = delta * delta_n * (ns - 1._wp)
+
+                    Msn(1)%sf(i, j, k) = Msn(1)%sf(i, j, k) + delta_n
+                    Msn(4)%sf(i, j, k) = Msn(4)%sf(i, j, k) + delta_f * delta_n2 * (ns**2 - 3._wp*ns + 3._wp) + 6._wp * delta_n2 * Msn(2)%sf(i, j, k) - 4._wp * delta_n * Msn(3)%sf(i, j, k)
+                    Msn(3)%sf(i, j, k) = Msn(3)%sf(i, j, k) + delta_f * delta_n * (ns - 2._wp) - 3._wp * delta_n * Msn(2)%sf(i, j, k)
+                    Msn(2)%sf(i, j, k) = Msn(2)%sf(i, j, k) + delta_f
+                end do 
+            end do 
+        end do
+        
+    end subroutine s_update_statistics
+
+    !> Convert the running moment sums into the sample variance, the bias-corrected skewness
+    !! and the bias-corrected excess kurtosis. The (ns - 2) and (ns - 3) divisors need ns > 3;
+    !! the caller in this module only invokes it for n_step > 3.
+    !! @param ns number of samples accumulated in Msn
+    !! @param Msn running mean and 2nd-4th central moment sums
+    !! @param q_stat q_stat(2) variance, q_stat(3) skewness, q_stat(4) excess kurtosis
+    subroutine s_compute_234_order_statistics(ns, Msn, q_stat)
+        type(scalar_field), dimension(1:4), intent(in) :: Msn
+        type(scalar_field), dimension(2:4), intent(inout) :: q_stat
+
+        real(wp), intent(in) :: ns
+        integer :: i, j, k
+
+        !$acc parallel loop collapse(3) gang vector default(present) copyin(ns)
+        do i = 0, m 
+            do j = 0, n 
+                do k = 0, p 
+                    q_stat(2)%sf(i, j, k) = Msn(2)%sf(i, j, k) / (ns - 1._wp)
+                    q_stat(3)%sf(i, j, k) = sqrt(ns - 1._wp) / (ns - 2._wp) * ns * Msn(3)%sf(i, j, k) / (Msn(2)%sf(i, j, k)**1.5_wp)
+                    q_stat(4)%sf(i, j, k) = (ns - 1._wp) / ((ns - 2._wp) * (ns - 3._wp)) * ((ns + 1._wp) * (ns * Msn(4)%sf(i, j, k) / (Msn(2)%sf(i, j, k)**2) - 3._wp) + 6._wp)
+                end do 
+            end do 
+        end do
 
-    end subroutine s_autocorrelation_function
+    end subroutine s_compute_234_order_statistics
 
     subroutine s_finalize_statistics_module
         integer :: i, j
-        do i = 1, 3
-            @:DEALLOCATE(xnbar_stat(i)%sf)
+        do i = 1, 4
+            @:DEALLOCATE(Msn_reynolds_stress(i)%sf)
+        end do
+        @:DEALLOCATE(Msn_reynolds_stress)
+
+        do i = 1, 4
+            @:DEALLOCATE(Msn_eff_visc(i)%sf)
+        end do
+        @:DEALLOCATE(Msn_eff_visc)
+
+        do i = 1, 4
+            @:DEALLOCATE(Msn_int_mom_exch(i)%sf)
         end do
-        @:DEALLOCATE(xnbar_stat)
+        @:DEALLOCATE(Msn_int_mom_exch)
 
-        do i = 1, 3
-            @:DEALLOCATE(delta_stat(i)%sf)
+        do i = 2, 4
+            @:DEALLOCATE(stat_reynolds_stress(i)%sf)
         end do
-        @:DEALLOCATE(delta_stat)
+        @:DEALLOCATE(stat_reynolds_stress)
 
-        do i = 1, 3
-            do j = 2, 4
-                @:DEALLOCATE(Msn_stat(i)%vf(j)%sf)
-            end do
-            @:DEALLOCATE(Msn_stat(i)%vf)
+        do i = 2, 4
+            @:DEALLOCATE(stat_eff_visc(i)%sf)
         end do
-        @:DEALLOCATE(Msn_stat)
+        @:DEALLOCATE(stat_eff_visc)
+
+        do i = 2, 4
+            @:DEALLOCATE(stat_int_mom_exch(i)%sf)
+        end do
+        @:DEALLOCATE(stat_int_mom_exch)
+
     end subroutine s_finalize_statistics_module
 
-end module m_compute_statistics
\ No newline at end of file
+end module m_compute_statistics
diff --git a/src/simulation/m_global_parameters.fpp b/src/simulation/m_global_parameters.fpp
index a71e17a69d..bcd8c74dec 100644
--- a/src/simulation/m_global_parameters.fpp
+++ b/src/simulation/m_global_parameters.fpp
@@ -503,7 +503,6 @@ module m_global_parameters
 
     logical :: periodic_ibs
     logical :: compute_CD
-    real(wp) :: mu_visc !< reference viscosity
     real(wp) :: u_inf_ref !< reference freestream velocity
     real(wp) :: rho_inf_ref !< reference freestream density 
     real(wp) :: T_inf_ref !< reference freestream temperature
@@ -512,8 +511,9 @@ module m_global_parameters
     logical :: store_levelset
     logical :: slab_domain_decomposition
     logical :: compute_autocorrelation
+    integer :: t_step_stat_start
 
-    !$acc declare create(mu_visc, u_inf_ref, rho_inf_ref, T_inf_ref)
+    !$acc declare create(u_inf_ref, rho_inf_ref, T_inf_ref)
 
 contains
 
@@ -792,7 +792,6 @@ contains
 
         periodic_ibs = .false.
         compute_CD = .false.
-        mu_visc = dflt_real
         u_inf_ref = dflt_real
         rho_inf_ref = dflt_real
         T_inf_ref = dflt_real
@@ -801,6 +800,7 @@ contains
         store_levelset = .true.
         slab_domain_decomposition = .false.
         compute_autocorrelation = .false.
+        t_step_stat_start = dflt_int
 
     end subroutine s_assign_default_values_to_user_inputs
 
diff --git a/src/simulation/m_mpi_proxy.fpp b/src/simulation/m_mpi_proxy.fpp
index c2579cc057..730f5ead50 100644
--- a/src/simulation/m_mpi_proxy.fpp
+++ b/src/simulation/m_mpi_proxy.fpp
@@ -132,8 +132,11 @@ contains
             & 'x_domain%beg', 'x_domain%end', 'y_domain%beg', 'y_domain%end',    &
             & 'z_domain%beg', 'z_domain%end', 'x_a', 'x_b', 'y_a', 'y_b', 'z_a', &
             & 'z_b', 't_stop', 't_save', 'cfl_target', 'rkck_tolerance', 'Bx0',  &
-            & 'tau_star', 'cont_damage_s', 'alpha_bar', 'mu_visc', 'u_inf_ref',  & 
-            & 'rho_inf_ref', 'T_inf_ref' ]
+            & 'tau_star', 'cont_damage_s', 'alpha_bar', 'u_inf_ref',  & 
+            & 'rho_inf_ref', 'T_inf_ref' ]
             call MPI_BCAST(${VAR}$, 1, mpi_p, 0, MPI_COMM_WORLD, ierr)
         #:endfor
+
+        ! t_step_stat_start is declared as an integer, so it must not be broadcast with the real datatype mpi_p
+        call MPI_BCAST(t_step_stat_start, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr)
 
diff --git a/src/simulation/m_rhs.fpp b/src/simulation/m_rhs.fpp
index eea4a49260..fd25b1c017 100644
--- a/src/simulation/m_rhs.fpp
+++ b/src/simulation/m_rhs.fpp
@@ -61,6 +61,8 @@ module m_rhs
 
     use m_mhd
 
+    use m_additional_forcing
+
     implicit none
 
     private; public :: s_initialize_rhs_module, &
@@ -609,7 +611,7 @@ contains
 
     end subroutine s_initialize_rhs_module
 
-    subroutine s_compute_rhs(q_cons_vf, q_T_sf, q_prim_vf, bc_type, rhs_vf, pb, rhs_pb, mv, rhs_mv, t_step, time_avg, pres_visc_stress)
+    subroutine s_compute_rhs(q_cons_vf, q_T_sf, q_prim_vf, bc_type, rhs_vf, pb, rhs_pb, mv, rhs_mv, t_step, time_avg)
 
         type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf
         type(scalar_field), intent(inout) :: q_T_sf
@@ -620,7 +622,6 @@ contains
         real(wp), dimension(idwbuff(1)%beg:, idwbuff(2)%beg:, idwbuff(3)%beg:, 1:, 1:), intent(inout) :: mv, rhs_mv
         integer, intent(in) :: t_step
         real(wp), intent(inout) :: time_avg
-        type(scalar_field), dimension(momxb:momxe), optional, intent(inout) :: pres_visc_stress
 
         real(wp), dimension(0:m, 0:n, 0:p) :: nbub
         real(wp) :: t_start, t_finish
@@ -810,8 +811,7 @@ contains
                                                  rhs_vf, &
                                                  q_cons_qp, &
                                                  q_prim_qp, &
-                                                 flux_src_n(id), & 
-                                                 pres_visc_stress)
+                                                 flux_src_n(id))
             call nvtxEndRange
 
             ! RHS additions for hypoelasticity
@@ -830,8 +830,7 @@ contains
                                                       flux_src_n(id)%vf, &
                                                       dq_prim_dx_qp(1)%vf, &
                                                       dq_prim_dy_qp(1)%vf, &
-                                                      dq_prim_dz_qp(1)%vf, & 
-                                                      pres_visc_stress)
+                                                      dq_prim_dz_qp(1)%vf)
                 call nvtxEndRange
             end if
 
@@ -911,6 +910,8 @@ contains
 
         if (cont_damage) call s_compute_damage_state(q_cons_qp%vf, rhs_vf)
 
+        if (periodic_forcing) call s_add_periodic_forcing(rhs_vf)
+
         ! END: Additional physics and source terms
 
         if (run_time_info .or. probe_wrt .or. ib .or. bubbles_lagrange) then
@@ -938,14 +939,13 @@ contains
 
     end subroutine s_compute_rhs
 
-    subroutine s_compute_advection_source_term(idir, rhs_vf, q_cons_vf, q_prim_vf, flux_src_n_vf, pres_visc_stress)
+    subroutine s_compute_advection_source_term(idir, rhs_vf, q_cons_vf, q_prim_vf, flux_src_n_vf)
 
         integer, intent(in) :: idir
         type(scalar_field), dimension(sys_size), intent(inout) :: rhs_vf
         type(vector_field), intent(inout) :: q_cons_vf
         type(vector_field), intent(inout) :: q_prim_vf
         type(vector_field), intent(inout) :: flux_src_n_vf
-        type(scalar_field), dimension(momxb:momxe), optional, intent(inout) :: pres_visc_stress  
 
         integer :: i, j, k, l, q
 
@@ -998,25 +998,6 @@ contains
                 end do
             end do
 
-            ! particle forces loop, x-dir
-            if ((compute_CD .or. volume_filtering_momentum_eqn) .and. present(pres_visc_stress)) then
-                !$acc parallel loop collapse(3) gang vector default(present)
-                do k = 0, p
-                    do j = 0, n 
-                        do i = 0, m 
-                            !$acc loop seq
-                            do l = momxb, momxe
-                                pres_visc_stress(l)%sf(i, j, k) = 1._wp/dx(i) * & 
-                                                          (flux_n(1)%vf(l)%sf(i-1, j, k) - & 
-                                                           flux_n(1)%vf(l)%sf(i, j, k)) - 0.5_wp/dx(i) * & 
-                                                          (q_cons_vf%vf(2)%sf(i+1, j, k)*q_cons_vf%vf(l)%sf(i+1, j, k)/q_cons_vf%vf(1)%sf(i+1, j, k) - & 
-                                                           q_cons_vf%vf(2)%sf(i-1, j, k)*q_cons_vf%vf(l)%sf(i-1, j, k)/q_cons_vf%vf(1)%sf(i-1, j, k))
-                            end do 
-                        end do 
-                    end do 
-                end do 
-            end if
-
             if (model_eqns == 3) then
                 !$acc parallel loop collapse(4) gang vector default(present)
                 do l = 0, p
@@ -1127,25 +1108,6 @@ contains
                 end do
             end do
 
-            ! particle forces loop, y-dir
-            if ((compute_CD .or. volume_filtering_momentum_eqn) .and. present(pres_visc_stress)) then
-                !$acc parallel loop collapse(3) gang vector default(present)
-                do k = 0, p 
-                    do j = 0, n 
-                        do i = 0, m 
-                            !$acc loop seq
-                            do l = momxb, momxe 
-                                pres_visc_stress(l)%sf(i, j, k) = pres_visc_stress(l)%sf(i, j, k) + 1._wp/dy(j) * & 
-                                                          (flux_n(2)%vf(l)%sf(i, j-1, k) - & 
-                                                           flux_n(2)%vf(l)%sf(i, j, k)) - 0.5_wp/dy(j) * & 
-                                                          (q_cons_vf%vf(3)%sf(i, j+1, k)*q_cons_vf%vf(l)%sf(i, j+1, k)/q_cons_vf%vf(1)%sf(i, j+1, k) - & 
-                                                           q_cons_vf%vf(3)%sf(i, j-1, k)*q_cons_vf%vf(l)%sf(i, j-1, k)/q_cons_vf%vf(1)%sf(i, j-1, k))
-                            end do  
-                        end do 
-                    end do
-                end do
-            end if
-
             if (model_eqns == 3) then
                 !$acc parallel loop collapse(4) gang vector default(present)
                 do l = 0, p
@@ -1352,25 +1314,6 @@ contains
                 end do
             end if
 
-            ! particle forces loop, z-dir
-            if ((compute_CD .or. volume_filtering_momentum_eqn) .and. present(pres_visc_stress)) then
-                !$acc parallel loop collapse(3) gang vector default(present)
-                do k = 0, p 
-                    do j = 0, n 
-                        do i = 0, m 
-                            !$acc loop seq
-                            do l = momxb, momxe 
-                                pres_visc_stress(l)%sf(i, j, k) = pres_visc_stress(l)%sf(i, j, k) + 1._wp/dz(k) * & 
-                                                          (flux_n(3)%vf(l)%sf(i, j, k-1) - & 
-                                                           flux_n(3)%vf(l)%sf(i, j, k)) - 0.5_wp/dz(k) * & 
-                                                          (q_cons_vf%vf(4)%sf(i, j, k+1)*q_cons_vf%vf(l)%sf(i, j, k+1)/q_cons_vf%vf(1)%sf(i, j, k+1) - & 
-                                                           q_cons_vf%vf(4)%sf(i, j, k-1)*q_cons_vf%vf(l)%sf(i, j, k-1)/q_cons_vf%vf(1)%sf(i, j, k-1))
-                            end do  
-                        end do 
-                    end do 
-                end do 
-            end if
-
             if (model_eqns == 3) then
                 !$acc parallel loop collapse(4) gang vector default(present)
                 do l = 0, p
@@ -1552,14 +1495,13 @@ contains
     end subroutine s_compute_advection_source_term
 
     subroutine s_compute_additional_physics_rhs(idir, q_prim_vf, rhs_vf, flux_src_n, &
-                                                dq_prim_dx_vf, dq_prim_dy_vf, dq_prim_dz_vf, pres_visc_stress)
+                                                dq_prim_dx_vf, dq_prim_dy_vf, dq_prim_dz_vf)
 
         integer, intent(in) :: idir
         type(scalar_field), dimension(sys_size), intent(in) :: q_prim_vf
         type(scalar_field), dimension(sys_size), intent(inout) :: rhs_vf
         type(scalar_field), dimension(sys_size), intent(in) :: flux_src_n
         type(scalar_field), dimension(sys_size), intent(in) :: dq_prim_dx_vf, dq_prim_dy_vf, dq_prim_dz_vf
-        type(scalar_field), dimension(momxb:momxe), optional, intent(inout) :: pres_visc_stress
 
         integer :: i, j, k, l
 
@@ -1595,23 +1537,6 @@ contains
                 end do
             end do
 
-            ! particle momentum exchange, viscous stress tensor, x-dir
-            if ((compute_CD .or. volume_filtering_momentum_eqn) .and. present(pres_visc_stress)) then
-                !$acc parallel loop collapse(3) gang vector default(present)
-                do k = 0, p 
-                    do j = 0, n 
-                        do i = 0, m 
-                            !$acc loop seq
-                            do l = momxb, momxe
-                                pres_visc_stress(l)%sf(i, j, k) = pres_visc_stress(l)%sf(i, j, k) + 1._wp/dx(i) * & 
-                                                       (flux_src_n(l)%sf(i-1, j, k) - & 
-                                                        flux_src_n(l)%sf(i, j, k))
-                            end do 
-                        end do 
-                    end do 
-                end do
-            end if
-
         elseif (idir == 2) then ! y-direction
 
             if (surface_tension) then
@@ -1694,23 +1619,6 @@ contains
                 end do
             end if
 
-            ! particle momentum exchange, viscous stress tensor, y-dir
-            if ((compute_CD .or. volume_filtering_momentum_eqn) .and. present(pres_visc_stress)) then
-                !$acc parallel loop collapse(3) gang vector default(present)
-                do k = 0, p 
-                    do j = 0, n 
-                        do i = 0, m 
-                            !$acc loop seq
-                            do l = momxb, momxe
-                                pres_visc_stress(l)%sf(i, j, k) = pres_visc_stress(l)%sf(i, j, k) + 1._wp/dy(j) * & 
-                                                       (flux_src_n(l)%sf(i, j-1, k) - & 
-                                                        flux_src_n(l)%sf(i, j, k))
-                            end do 
-                        end do 
-                    end do
-                end do
-            end if
-
             ! Applying the geometrical viscous Riemann source fluxes calculated as average
             ! of values at cell boundaries
             if (cyl_coord) then
@@ -1796,23 +1704,6 @@ contains
                 end do
             end do
 
-            ! particle momentum exchange, viscous stress tensor, z-dir
-            if ((compute_CD .or. volume_filtering_momentum_eqn) .and. present(pres_visc_stress)) then
-                !$acc parallel loop collapse(3) gang vector default(present)
-                do k = 0, p 
-                    do j = 0, n 
-                        do i = 0, m 
-                            !$acc loop seq
-                            do l = momxb, momxe 
-                                pres_visc_stress(l)%sf(i, j, k) = pres_visc_stress(l)%sf(i, j, k) + 1._wp/dz(k) * & 
-                                                       (flux_src_n(l)%sf(i, j, k-1) - & 
-                                                        flux_src_n(l)%sf(i, j, k))
-                            end do 
-                        end do 
-                    end do 
-                end do 
-            end if 
-
             if (grid_geometry == 3) then
                 !$acc parallel loop collapse(3) gang vector default(present)
                 do l = 0, p
diff --git a/src/simulation/m_start_up.fpp b/src/simulation/m_start_up.fpp
index c34bd05321..5ac4e4dad6 100644
--- a/src/simulation/m_start_up.fpp
+++ b/src/simulation/m_start_up.fpp
@@ -189,9 +189,9 @@ contains
             rkck_adap_dt, rkck_tolerance, &
             hyperelasticity, R0ref, num_bc_patches, Bx0, powell, &
             cont_damage, tau_star, cont_damage_s, alpha_bar, & 
-            periodic_ibs, compute_CD, mu_visc, u_inf_ref, rho_inf_ref, T_inf_ref, & 
+            periodic_ibs, compute_CD, u_inf_ref, rho_inf_ref, T_inf_ref, & 
             periodic_forcing, volume_filtering_momentum_eqn, store_levelset, & 
-            slab_domain_decomposition, compute_autocorrelation
+            slab_domain_decomposition, compute_autocorrelation, t_step_stat_start
 
         ! Checking that an input file has been provided by the user. If it
         ! has, then the input file is read in, otherwise, simulation exits.
@@ -1346,6 +1346,29 @@ contains
         ! Volume filter flow variables, compute unclosed terms and their statistics
         if (volume_filtering_momentum_eqn) then 
             call s_volume_filter_momentum_eqn(q_cons_ts(1)%vf)
+
+            if (t_step > t_step_stat_start) then    
+                call s_compute_statistics_momentum_unclosed_terms(t_step - t_step_stat_start, mag_reynolds_stress, mag_eff_visc, mag_int_mom_exch)
+
+                ! write(100, *) mag_reynolds_stress%sf(10, 10, 10)
+                ! write(101, *) stat_reynolds_stress(2)%sf(10, 10, 10), stat_reynolds_stress(3)%sf(10, 10, 10), stat_reynolds_stress(4)%sf(10, 10, 10)
+            end if
+
+            ! TEMPORARY: debug file output used for verification and validation (V&V)
+            ! if (t_step == 1) then 
+            !     open(unit=100, file='dat_reynolds_stress.txt', status='replace', action='write')
+            !     open(unit=101, file='stat_reynolds_stress.txt', status='replace', action='write')
+            ! end if
+            ! if (t_step == 999) then 
+            !     close(100)
+            !     close(101)
+            ! end if
+
+        end if
+
+        if (periodic_forcing) then 
+            call s_compute_phase_average(q_cons_ts(1)%vf, t_step+1)
+            call s_compute_periodic_forcing(q_cons_ts(1)%vf)
         end if
 
         ! Time-stepping loop controls
@@ -1585,10 +1608,11 @@ contains
         if (mhd .and. powell) call s_initialize_mhd_powell_module
 
         call s_initialize_particle_forces_module()
-        call s_initialize_additional_forcing_module()
-        if (volume_filtering_momentum_eqn) call s_initialize_fftw_explicit_filter_module()
-
-        call s_initialize_statistics_module()
+        if (periodic_forcing) call s_initialize_additional_forcing_module()
+        if (volume_filtering_momentum_eqn) then 
+            call s_initialize_fftw_explicit_filter_module()
+            call s_initialize_statistics_module()
+        end if
 
     end subroutine s_initialize_modules
 
@@ -1702,7 +1726,7 @@ contains
             !$acc update device(ib_markers%sf)
         end if
 
-        !$acc update device(mu_visc, u_inf_ref, rho_inf_ref, T_inf_ref)
+        !$acc update device(u_inf_ref, rho_inf_ref, T_inf_ref)
 
     end subroutine s_initialize_gpu_vars
 
diff --git a/src/simulation/m_time_steppers.fpp b/src/simulation/m_time_steppers.fpp
index 9fdbb519e0..5132efbb23 100644
--- a/src/simulation/m_time_steppers.fpp
+++ b/src/simulation/m_time_steppers.fpp
@@ -87,14 +87,8 @@ module m_time_steppers
     integer, private :: num_ts !<
     !! Number of time stages in the time-stepping scheme
 
-    type(scalar_field), allocatable, dimension(:) :: stat_reynolds_stress
-    type(scalar_field), allocatable, dimension(:) :: stat_eff_visc
-    type(scalar_field), allocatable, dimension(:) :: stat_int_mom_exch
-
     !$acc declare create(q_cons_ts, q_prim_vf, q_T_sf, rhs_vf, rhs_ts_rkck, q_prim_ts, rhs_mv, rhs_pb, max_dt)
 
-    !$acc declare create(stat_reynolds_stress, stat_eff_visc, stat_int_mom_exch)
-
 contains
 
     !> The computation of parameters, the allocation of memory,
@@ -369,32 +363,6 @@ contains
             end do
         end do
 
-        if (compute_CD .or. volume_filtering_momentum_eqn) then
-            @:ALLOCATE(pres_visc_stress(momxb:momxe))
-            do i = momxb, momxe
-                @:ALLOCATE(pres_visc_stress(i)%sf(idwbuff(1)%beg:idwbuff(1)%end, &
-                    idwbuff(2)%beg:idwbuff(2)%end, &
-                    idwbuff(3)%beg:idwbuff(3)%end))
-                @:ACC_SETUP_SFs(pres_visc_stress(i))
-            end do
-        end if
-
-        @:ALLOCATE(stat_reynolds_stress(2:4))
-        do i = 2, 4
-            @:ALLOCATE(stat_reynolds_stress(i)%sf(0:m, 0:n, 0:p))
-            @:ACC_SETUP_SFs(stat_reynolds_stress(i))
-        end do
-        @:ALLOCATE(stat_eff_visc(2:4))
-        do i = 2, 4
-            @:ALLOCATE(stat_eff_visc(i)%sf(0:m, 0:n, 0:p))
-            @:ACC_SETUP_SFs(stat_eff_visc(i))
-        end do
-        @:ALLOCATE(stat_int_mom_exch(2:4))
-        do i = 2, 4
-            @:ALLOCATE(stat_int_mom_exch(i)%sf(0:m, 0:n, 0:p))
-            @:ACC_SETUP_SFs(stat_int_mom_exch(i))
-        end do
-
     end subroutine s_initialize_time_steppers_module
 
     !> 1st order TVD RK time-stepping algorithm
@@ -712,45 +680,7 @@ contains
             call nvtxStartRange("TIMESTEP")
         end if
 
-        if (periodic_forcing) then 
-            call s_compute_phase_average(q_cons_ts(1)%vf, t_step+1)
-            call s_compute_periodic_forcing(q_cons_ts(1)%vf)
-        end if
-
-        call s_compute_rhs(q_cons_ts(1)%vf, q_T_sf, q_prim_vf, bc_type, rhs_vf, pb_ts(1)%sf, rhs_pb, mv_ts(1)%sf, rhs_mv, t_step, time_avg, pres_visc_stress)
-
-        ! if (volume_filtering_momentum_eqn) then 
-        !     call s_apply_fftw_filter_cons(q_cons_ts(1)%vf, q_cons_filtered)
-        !     call s_setup_terms_filtering(q_cons_ts(1)%vf, reynolds_stress, eff_visc)
-        !     call s_apply_fftw_filter_tensor(reynolds_stress, eff_visc, q_cons_filtered, pres_visc_stress, int_mom_exch)
-        !     call s_compute_pseudo_turbulent_reynolds_stress(q_cons_filtered, reynolds_stress, mag_reynolds_stress)
-        !     call s_compute_eff_visc(q_cons_filtered, eff_visc, mag_eff_visc)
-        !     call s_compute_interphase_momentum_exchange_term(int_mom_exch, mag_int_mom_exch)
-        ! end if
-        
-
-        ! call s_autocorrelation_function(t_step+1, q_cons_ts(1)%vf)
-        ! if (t_step > 10) then
-        !     n_step = t_step - 10
-        !     call s_compute_s_order_statistics(mag_reynolds_stress, n_step, stat_reynolds_stress, 1)
-        !     call s_compute_s_order_statistics(mag_eff_visc, n_step, stat_eff_visc, 2)
-        !     call s_compute_s_order_statistics(mag_int_mom_exch, n_step, stat_int_mom_exch, 3)
-        ! end if
-
-
-        ! stat_reynolds_stress(2)%sf(0:m, 0:n, 0:p) = q_cons_filtered(6)%sf(0:m, 0:n, 0:p)
-        ! stat_reynolds_stress(3)%sf(0:m, 0:n, 0:p) = mag_reynolds_stress%sf(0:m, 0:n, 0:p)
-        ! stat_reynolds_stress(4)%sf(0:m, 0:n, 0:p) = mag_eff_visc%sf(0:m, 0:n, 0:p)
-        ! stat_eff_visc(2)%sf(0:m, 0:n, 0:p) = mag_int_mom_exch%sf(0:m, 0:n, 0:p)
-
-
-        if (compute_CD) then
-            call s_compute_drag_coefficient(pres_visc_stress)
-        end if
-
-        if (periodic_forcing) then 
-            call s_add_periodic_forcing(rhs_vf)
-        end if
+        call s_compute_rhs(q_cons_ts(1)%vf, q_T_sf, q_prim_vf, bc_type, rhs_vf, pb_ts(1)%sf, rhs_pb, mv_ts(1)%sf, rhs_mv, t_step, time_avg)
 
         if (run_time_info) then
             call s_write_run_time_information(q_prim_vf, t_step)
@@ -841,10 +771,6 @@ contains
 
         call s_compute_rhs(q_cons_ts(2)%vf, q_T_sf, q_prim_vf, bc_type, rhs_vf, pb_ts(2)%sf, rhs_pb, mv_ts(2)%sf, rhs_mv, t_step, time_avg)
 
-        if (periodic_forcing) then 
-            call s_add_periodic_forcing(rhs_vf)
-        end if
-
         if (bubbles_lagrange) then
             call s_compute_EL_coupled_solver(q_cons_ts(2)%vf, q_prim_vf, rhs_vf, stage=2)
             call s_update_lagrange_tdv_rk(stage=2)
@@ -921,10 +847,6 @@ contains
         ! Stage 3 of 3
         call s_compute_rhs(q_cons_ts(2)%vf, q_T_sf, q_prim_vf, bc_type, rhs_vf, pb_ts(2)%sf, rhs_pb, mv_ts(2)%sf, rhs_mv, t_step, time_avg)
 
-        if (periodic_forcing) then 
-            call s_add_periodic_forcing(rhs_vf)
-        end if
-
         if (bubbles_lagrange) then
             call s_compute_EL_coupled_solver(q_cons_ts(2)%vf, q_prim_vf, rhs_vf, stage=3)
             call s_update_lagrange_tdv_rk(stage=3)
@@ -1416,26 +1338,6 @@ contains
             @:DEALLOCATE(rhs_vf)
         end if
 
-        if (compute_CD .or. volume_filtering_momentum_eqn) then
-            do i = momxb, momxe
-                @:DEALLOCATE(pres_visc_stress(i)%sf)
-            end do
-            @:DEALLOCATE(pres_visc_stress)
-        end if
-
-        do i = 2, 4
-            @:DEALLOCATE(stat_reynolds_stress(i)%sf)
-        end do
-        @:DEALLOCATE(stat_reynolds_stress)
-        do i = 2, 4
-            @:DEALLOCATE(stat_eff_visc(i)%sf)
-        end do
-        @:DEALLOCATE(stat_eff_visc)
-        do i = 2, 4
-            @:DEALLOCATE(stat_int_mom_exch(i)%sf)
-        end do
-        @:DEALLOCATE(stat_int_mom_exch)
-
         ! Writing the footer of and closing the run-time information file
         if (proc_rank == 0 .and. run_time_info) then
             call s_close_run_time_information_file()
diff --git a/src/simulation/m_volume_filtering.fpp b/src/simulation/m_volume_filtering.fpp
index 6b1e981bfc..fa44071328 100644
--- a/src/simulation/m_volume_filtering.fpp
+++ b/src/simulation/m_volume_filtering.fpp
@@ -14,6 +14,8 @@ module m_volume_filtering
 
     use m_boundary_common
 
+    use m_nvtx
+
 #ifdef MFC_MPI
     use mpi                    !< Message passing interface (MPI) module
 #endif
@@ -26,8 +28,9 @@ module m_volume_filtering
 
     private; public :: s_initialize_fftw_explicit_filter_module, &
  s_initialize_filtering_kernel, s_initialize_fluid_indicator_function, & 
- s_finalize_fftw_explicit_filter_module, & 
- s_apply_fftw_filter_cons, s_apply_fftw_filter_tensor, s_apply_fftw_filter_scalarfield, &
+ s_initialize_filtered_fluid_indicator_function, s_finalize_fftw_explicit_filter_module, & 
+ s_apply_fftw_filter_cons, s_volume_filter_momentum_eqn, s_apply_fftw_filter_tensor, s_apply_fftw_filter_scalarfield, &
+ s_compute_viscous_stress_tensor, s_compute_stress_tensor, s_compute_divergence_stress_tensor, &
  s_mpi_transpose_slabZ2Y, s_mpi_transpose_slabY2Z, s_mpi_FFT_fwd, s_mpi_FFT_bwd, &
  s_setup_terms_filtering, s_compute_pseudo_turbulent_reynolds_stress, s_compute_effective_viscosity, s_compute_interphase_momentum_exchange
 
@@ -39,24 +42,34 @@ module m_volume_filtering
 
     ! fluid indicator function (1 = fluid, 0 = otherwise)
     type(scalar_field), public :: fluid_indicator_function
-    type(scalar_field) :: filtered_fluid_indicator_function
+    type(scalar_field), public :: filtered_fluid_indicator_function
 
     ! volume filtered conservative variables
     type(scalar_field), allocatable, dimension(:) :: q_cons_filtered
 
-    ! unclosed terms in momentum eqn
-    type(scalar_field), allocatable, dimension(:) :: pres_visc_stress
+    ! viscous and pressure+viscous stress tensors
+    type(vector_field), allocatable, dimension(:) :: visc_stress
+    type(vector_field), allocatable, dimension(:) :: pres_visc_stress
+
+    ! divergence of the pressure+viscous stress tensor
+    type(scalar_field), allocatable, dimension(:) :: div_pres_visc_stress
+    
+    ! unclosed terms in volume filtered momentum equation
     type(vector_field), allocatable, dimension(:) :: reynolds_stress
     type(vector_field), allocatable, dimension(:) :: eff_visc
     type(scalar_field), allocatable, dimension(:) :: int_mom_exch
 
-    ! magnitude of unclosed terms in momentum eqn
-    type(scalar_field) :: mag_reynolds_stress
-    type(scalar_field) :: mag_eff_visc
-    type(scalar_field) :: mag_int_mom_exch
+    ! magnitude of unclosed terms in momentum equation
+    type(scalar_field), public :: mag_reynolds_stress
+    type(scalar_field), public :: mag_eff_visc
+    type(scalar_field), public :: mag_int_mom_exch
+
+    real(wp), allocatable, dimension(:, :) :: Res
 
     !$acc declare create(fluid_indicator_function, filtered_fluid_indicator_function, q_cons_filtered)
-    !$acc declare create(pres_visc_stress, reynolds_stress, eff_visc, int_mom_exch, mag_reynolds_stress, mag_eff_visc, mag_int_mom_exch)
+    !$acc declare create(visc_stress, pres_visc_stress, div_pres_visc_stress)
+    !$acc declare create(reynolds_stress, eff_visc, int_mom_exch, mag_reynolds_stress, mag_eff_visc, mag_int_mom_exch)
+    !$acc declare create(Res)
 
 #if defined(MFC_OpenACC)
     ! GPU plans
@@ -98,12 +111,6 @@ contains
     subroutine s_initialize_fftw_explicit_filter_module
         integer :: i, j, k
         integer :: size_n(1), inembed(1), onembed(1)
-
-        @:ALLOCATE(fluid_indicator_function%sf(0:m, 0:n, 0:p))
-        @:ACC_SETUP_SFs(fluid_indicator_function)
-
-        @:ALLOCATE(filtered_fluid_indicator_function%sf(0:m, 0:n, 0:p))
-        @:ACC_SETUP_SFs(filtered_fluid_indicator_function)
         
         @:ALLOCATE(q_cons_filtered(1:sys_size))
         do i = 1, sys_size
@@ -113,18 +120,52 @@ contains
             @:ACC_SETUP_SFs(q_cons_filtered(i))
         end do
 
+        @:ALLOCATE(visc_stress(1:num_dims))
+        do i = 1, num_dims
+            @:ALLOCATE(visc_stress(i)%vf(1:num_dims))
+        end do
+        do i = 1, num_dims
+            do j = 1, num_dims 
+                @:ALLOCATE(visc_stress(i)%vf(j)%sf(idwbuff(1)%beg:idwbuff(1)%end, &
+                    idwbuff(2)%beg:idwbuff(2)%end, &
+                    idwbuff(3)%beg:idwbuff(3)%end))
+            end do 
+            @:ACC_SETUP_VFs(visc_stress(i))
+        end do
+
+        @:ALLOCATE(pres_visc_stress(1:num_dims))
+        do i = 1, num_dims
+            @:ALLOCATE(pres_visc_stress(i)%vf(1:num_dims))
+        end do
+        do i = 1, num_dims
+            do j = 1, num_dims 
+                @:ALLOCATE(pres_visc_stress(i)%vf(j)%sf(idwbuff(1)%beg:idwbuff(1)%end, &
+                    idwbuff(2)%beg:idwbuff(2)%end, &
+                    idwbuff(3)%beg:idwbuff(3)%end))
+            end do 
+            @:ACC_SETUP_VFs(pres_visc_stress(i))
+        end do
+
+        @:ALLOCATE(div_pres_visc_stress(1:num_dims))
+        do i = 1, num_dims
+            @:ALLOCATE(div_pres_visc_stress(i)%sf(idwbuff(1)%beg:idwbuff(1)%end, &
+                idwbuff(2)%beg:idwbuff(2)%end, &
+                idwbuff(3)%beg:idwbuff(3)%end))
+            @:ACC_SETUP_SFs(div_pres_visc_stress(i))
+        end do
+
         @:ALLOCATE(reynolds_stress(1:num_dims))
-            do i = 1, num_dims
-                @:ALLOCATE(reynolds_stress(i)%vf(1:num_dims))
-            end do
-            do i = 1, num_dims
-                do j = 1, num_dims
-                    @:ALLOCATE(reynolds_stress(i)%vf(j)%sf(idwbuff(1)%beg:idwbuff(1)%end, &
-                        idwbuff(2)%beg:idwbuff(2)%end, &
-                        idwbuff(3)%beg:idwbuff(3)%end))
-                end do
-                @:ACC_SETUP_VFs(reynolds_stress(i))
+        do i = 1, num_dims
+            @:ALLOCATE(reynolds_stress(i)%vf(1:num_dims))
+        end do
+        do i = 1, num_dims
+            do j = 1, num_dims
+                @:ALLOCATE(reynolds_stress(i)%vf(j)%sf(idwbuff(1)%beg:idwbuff(1)%end, &
+                    idwbuff(2)%beg:idwbuff(2)%end, &
+                    idwbuff(3)%beg:idwbuff(3)%end))
             end do
+            @:ACC_SETUP_VFs(reynolds_stress(i))
+        end do
 
         @:ALLOCATE(eff_visc(1:num_dims))
         do i = 1, num_dims
@@ -162,6 +203,19 @@ contains
             idwbuff(3)%beg:idwbuff(3)%end))
         @:ACC_SETUP_SFs(mag_int_mom_exch)
 
+        if (viscous) then
+            @:ALLOCATE(Res(1:2, 1:maxval(Re_size)))
+        end if
+
+        if (viscous) then
+            do i = 1, 2
+                do j = 1, Re_size(i)
+                    Res(i, j) = fluid_pp(Re_idx(i, j))%Re(i)
+                end do
+            end do
+            !$acc update device(Res, Re_idx, Re_size)
+        end if
+
         !< global sizes 
         Nx = m_glb + 1
         Ny = n_glb + 1
@@ -404,6 +458,9 @@ contains
     subroutine s_initialize_fluid_indicator_function 
         integer :: i, j, k 
 
+        @:ALLOCATE(fluid_indicator_function%sf(0:m, 0:n, 0:p))
+        @:ACC_SETUP_SFs(fluid_indicator_function)
+
         ! define fluid indicator function
         !$acc parallel loop collapse(3) gang vector default(present)
         do i = 0, m
@@ -417,6 +474,14 @@ contains
                 end do
             end do
         end do
+    
+    end subroutine s_initialize_fluid_indicator_function
+
+    subroutine s_initialize_filtered_fluid_indicator_function
+        integer :: i, j, k
+        
+        @:ALLOCATE(filtered_fluid_indicator_function%sf(0:m, 0:n, 0:p))
+        @:ACC_SETUP_SFs(filtered_fluid_indicator_function)
 
         ! filter fluid indicator function 
         !$acc parallel loop collapse(3) gang vector default(present)
@@ -450,32 +515,24 @@ contains
             end do
         end do
 
-    end subroutine s_initialize_fluid_indicator_function
-
-    !< apply the gaussian filter to the conservative variables and compute their filtered components
-    subroutine s_apply_fftw_filter_cons(q_cons_vf, q_cons_filtered)
-        type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf
-        type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_filtered
-
-        integer :: l
-
-        do l = contxb, momxe
-            call s_apply_fftw_filter_scalarfield(filtered_fluid_indicator_function, .true., q_cons_vf(l), q_cons_filtered(l))
-        end do 
-
-    end subroutine s_apply_fftw_filter_cons
+    end subroutine s_initialize_filtered_fluid_indicator_function
 
     !< calculate the unclosed terms present in the volume filtered momentum equation
     subroutine s_volume_filter_momentum_eqn(q_cons_vf)
         type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf
         integer :: i, j, k
 
+        call nvtxStartRange("FILTER-CONSERVATIVE-VARIABLES")
         call s_apply_fftw_filter_cons(q_cons_vf, q_cons_filtered)
-        call s_setup_terms_filtering(q_cons_vf, reynolds_stress, eff_visc)
-        call s_apply_fftw_filter_tensor(reynolds_stress, eff_visc, q_cons_filtered, pres_visc_stress, int_mom_exch)
+        call nvtxEndRange
+
+        call nvtxStartRange("COMPUTE-MOMENTUM-UNCLOSED-TERMS")
+        call s_setup_terms_filtering(q_cons_vf, reynolds_stress, visc_stress, pres_visc_stress, div_pres_visc_stress)
+        call s_apply_fftw_filter_tensor(reynolds_stress, visc_stress, eff_visc, div_pres_visc_stress, int_mom_exch)
         call s_compute_pseudo_turbulent_reynolds_stress(q_cons_filtered, reynolds_stress, mag_reynolds_stress)
-        call s_compute_effective_viscosity(q_cons_filtered, eff_visc, mag_eff_visc)
+        call s_compute_effective_viscosity(q_cons_filtered, eff_visc, visc_stress, mag_eff_visc)
         call s_compute_interphase_momentum_exchange(int_mom_exch, mag_int_mom_exch)
+        call nvtxEndRange
 
     end subroutine s_volume_filter_momentum_eqn
 
@@ -544,372 +601,196 @@ contains
 
     end subroutine s_apply_fftw_filter_scalarfield
 
+    !< gaussian-filter the conservative variables contxb..momxe of q_cons_vf, storing each result in the matching entry of q_cons_filtered
+    subroutine s_apply_fftw_filter_cons(q_cons_vf, q_cons_filtered)
+        type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_filtered
+        type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf
+
+        integer :: eqn  !< index of the conservative variable being filtered
+
+        filter_vars: do eqn = contxb, momxe
+            call s_apply_fftw_filter_scalarfield(filtered_fluid_indicator_function, .true., q_cons_vf(eqn), q_cons_filtered(eqn))
+        end do filter_vars
+
+    end subroutine s_apply_fftw_filter_cons
+
     !< apply the gaussian filter to the requisite tensors to compute unclosed terms of interest
-    subroutine s_apply_fftw_filter_tensor(reynolds_stress, eff_visc, q_cons_filtered, pres_visc_stress, int_mom_exch)
+    subroutine s_apply_fftw_filter_tensor(reynolds_stress, visc_stress, eff_visc, div_pres_visc_stress, int_mom_exch)
         type(vector_field), dimension(1:num_dims), intent(inout) :: reynolds_stress
+        type(vector_field), dimension(1:num_dims), intent(inout) :: visc_stress  !< unfiltered viscous stress, input of the second filter pass
         type(vector_field), dimension(1:num_dims), intent(inout) :: eff_visc
-        type(scalar_field), dimension(sys_size), intent(in) :: q_cons_filtered
-        type(scalar_field), dimension(momxb:momxe), intent(inout) :: pres_visc_stress
+        type(scalar_field), dimension(1:num_dims), intent(inout) :: div_pres_visc_stress  !< input of the third filter pass
         type(scalar_field), dimension(1:num_dims), intent(inout) :: int_mom_exch
 
-        integer :: i, j, k, l, q
+        integer :: row, col  !< tensor/vector component indices
 
         ! pseudo turbulent reynolds stress
-        do l = 1, num_dims 
-            do q = 1, num_dims
-                call s_apply_fftw_filter_scalarfield(filtered_fluid_indicator_function, .true., reynolds_stress(l)%vf(q))
+        do row = 1, num_dims  ! each component is passed alone, with no separate output field
+            do col = 1, num_dims
+                call s_apply_fftw_filter_scalarfield(filtered_fluid_indicator_function, .true., reynolds_stress(row)%vf(col))
             end do
         end do 
 
         ! effective viscosity
-        do l = 1, num_dims 
-            do q = 1, num_dims
-                call s_apply_fftw_filter_scalarfield(filtered_fluid_indicator_function, .true., eff_visc(l)%vf(q))
+        do row = 1, num_dims  ! visc_stress component in, eff_visc component out
+            do col = 1, num_dims
+                call s_apply_fftw_filter_scalarfield(filtered_fluid_indicator_function, .true., visc_stress(row)%vf(col), eff_visc(row)%vf(col))
             end do
         end do 
 
         ! interphase momentum exchange
-        do l = 1, num_dims
-            call s_apply_fftw_filter_scalarfield(filtered_fluid_indicator_function, .false., pres_visc_stress(momxb-1+l), int_mom_exch(l))
+        do row = 1, num_dims  ! div_pres_visc_stress component in, int_mom_exch component out (flag is .false. here)
+            call s_apply_fftw_filter_scalarfield(filtered_fluid_indicator_function, .false., div_pres_visc_stress(row), int_mom_exch(row))
         end do 
 
     end subroutine s_apply_fftw_filter_tensor
 
-    ! compute pressure and viscous stress tensors
-    subroutine s_compute_stress_tensor(q_cons_vf)
+    !< compute the viscous stress tensor from central differences of velocity (momentum/density taken from q_cons_vf), divided by Res(1, 1)
+    subroutine s_compute_viscous_stress_tensor(visc_stress, q_cons_vf)
+        type(vector_field), dimension(num_dims), intent(inout) :: visc_stress 
         type(scalar_field), dimension(sys_size), intent(in) :: q_cons_vf
-        real(wp) :: dudx, dudy, dudz, dvdx, dvdy, dvdz, dqdx, dwdy, dwdz ! spatial velocity derivatives
+        real(wp) :: dudx, dudy, dudz, dvdx, dvdy, dvdz, dwdx, dwdy, dwdz ! spatial velocity derivatives
         integer :: i, j, k 
 
-        !$acc parallel loop collapse(3) gang vector default(present)
+        !$acc parallel loop collapse(3) gang vector default(present) private(dudx, dudy, dudz, dvdx, dvdy, dvdz, dwdx, dwdy, dwdz)
         do i = 0, m 
             do j = 0, n 
                 do k = 0, p
-                    ! local to each process
+                    ! velocity gradients, local to each process; the i-1/i+1, j-1/j+1, k-1/k+1 stencil reads one cell of q_cons_vf outside 0:m, 0:n, 0:p
                     dudx = ( q_cons_vf(2)%sf(i+1, j, k)/q_cons_vf(1)%sf(i+1, j, k) - q_cons_vf(2)%sf(i-1, j, k)/q_cons_vf(1)%sf(i-1, j, k) ) / (dx(i-1) + dx(i+1))
                     dudy = ( q_cons_vf(2)%sf(i, j+1, k)/q_cons_vf(1)%sf(i, j+1, k) - q_cons_vf(2)%sf(i, j-1, k)/q_cons_vf(1)%sf(i, j-1, k) ) / (dy(j-1) + dy(j+1))
-                    dudz = ( q_cons_vf(2)%vf(i, j, k+1)/q_cons_vf(1)%sf(i, j, k+1) - q_cons_vf(2)%sf(i, j, k-1)/q_cons_vf(1)%sf(i, j, k-1) ) / (dz(k-1) + dz(k+1))
+                    dudz = ( q_cons_vf(2)%sf(i, j, k+1)/q_cons_vf(1)%sf(i, j, k+1) - q_cons_vf(2)%sf(i, j, k-1)/q_cons_vf(1)%sf(i, j, k-1) ) / (dz(k-1) + dz(k+1))
 
                     dvdx = ( q_cons_vf(3)%sf(i+1, j, k)/q_cons_vf(1)%sf(i+1, j, k) - q_cons_vf(3)%sf(i-1, j, k)/q_cons_vf(1)%sf(i-1, j, k) ) / (dx(i-1) + dx(i+1))
                     dvdy = ( q_cons_vf(3)%sf(i, j+1, k)/q_cons_vf(1)%sf(i, j+1, k) - q_cons_vf(3)%sf(i, j-1, k)/q_cons_vf(1)%sf(i, j-1, k) ) / (dy(j-1) + dy(j+1))
-                    dvdz = ( q_cons_vf(3)%vf(i, j, k+1)/q_cons_vf(1)%sf(i, j, k+1) - q_cons_vf(3)%sf(i, j, k-1)/q_cons_vf(1)%sf(i, j, k-1) ) / (dz(k-1) + dz(k+1))
+                    dvdz = ( q_cons_vf(3)%sf(i, j, k+1)/q_cons_vf(1)%sf(i, j, k+1) - q_cons_vf(3)%sf(i, j, k-1)/q_cons_vf(1)%sf(i, j, k-1) ) / (dz(k-1) + dz(k+1))
 
                     dwdx = ( q_cons_vf(4)%sf(i+1, j, k)/q_cons_vf(1)%sf(i+1, j, k) - q_cons_vf(4)%sf(i-1, j, k)/q_cons_vf(1)%sf(i-1, j, k) ) / (dx(i-1) + dx(i+1))
                     dwdy = ( q_cons_vf(4)%sf(i, j+1, k)/q_cons_vf(1)%sf(i, j+1, k) - q_cons_vf(4)%sf(i, j-1, k)/q_cons_vf(1)%sf(i, j-1, k) ) / (dy(j-1) + dy(j+1))
-                    dwdz = ( q_cons_vf(4)%vf(i, j, k+1)/q_cons_vf(1)%sf(i, j, k+1) - q_cons_vf(4)%sf(i, j, k-1)/q_cons_vf(1)%sf(i, j, k-1) ) / (dz(k-1) + dz(k+1))
-
-                    ! viscous stress tensor, tau(row, column)
-                    tau(1)%vf(1) = mu * (4._wp/3._wp * dudx - 2._wp/3._wp * (dvdy + dwdz))
-                    tau(1)%vf(2) = mu * (dudy + dvdx)
-                    tau(1)%vf(3) = mu * (dudz + dwdx)
-                    tau(2)%vf(1) = mu * (dvdx + dudy)
-                    tau(2)%vf(2) = mu * (4._wp/3._wp * dvdy - 2._wp/3._wp * (dudx + dwdz))
-                    tau(2)%vf(3) = mu * (dvdz + dwdy)
-                    tau(3)%vf(1) = mu * (dwdx + dudz)
-                    tau(3)%vf(2) = mu * (dwdy + dvdz) 
-                    tau(3)%vf(3) = mu * (4._wp/3._wp * dwdz - 2._wp/3._wp * (dudx + dvdy))
-
-
+                    dwdz = ( q_cons_vf(4)%sf(i, j, k+1)/q_cons_vf(1)%sf(i, j, k+1) - q_cons_vf(4)%sf(i, j, k-1)/q_cons_vf(1)%sf(i, j, k-1) ) / (dz(k-1) + dz(k+1))
+                    ! NOTE(review): indices 1 and 2:4 are hard-coded here, while s_compute_stress_tensor uses contxb/momxb - confirm they coincide for every supported case
+                    ! viscous stress tensor, visc_stress(row, column); 1/Res(1, 1) scales every component (presumably the dynamic viscosity - confirm)
+                    visc_stress(1)%vf(1)%sf(i, j, k) = (4._wp/3._wp * dudx - 2._wp/3._wp * (dvdy + dwdz)) / Res(1, 1)
+                    visc_stress(1)%vf(2)%sf(i, j, k) = (dudy + dvdx) / Res(1, 1)
+                    visc_stress(1)%vf(3)%sf(i, j, k) = (dudz + dwdx) / Res(1, 1)
+                    visc_stress(2)%vf(1)%sf(i, j, k) = (dvdx + dudy) / Res(1, 1)
+                    visc_stress(2)%vf(2)%sf(i, j, k) = (4._wp/3._wp * dvdy - 2._wp/3._wp * (dudx + dwdz)) / Res(1, 1)
+                    visc_stress(2)%vf(3)%sf(i, j, k) = (dvdz + dwdy) / Res(1, 1)
+                    visc_stress(3)%vf(1)%sf(i, j, k) = (dwdx + dudz) / Res(1, 1)
+                    visc_stress(3)%vf(2)%sf(i, j, k) = (dwdy + dvdz) / Res(1, 1)
+                    visc_stress(3)%vf(3)%sf(i, j, k) = (4._wp/3._wp * dwdz - 2._wp/3._wp * (dudx + dvdy)) / Res(1, 1)
                 end do 
             end do 
         end do
 
-    end subroutine s_compute_stress_tensor
-
-    !< transpose domain from z-slabs to y-slabs on each processor
-    subroutine s_mpi_transpose_slabZ2Y
-        complex(c_double_complex), allocatable :: sendbuf(:), recvbuf(:)
-        integer :: dest_rank, src_rank
+    end subroutine s_compute_viscous_stress_tensor
+    !< compute the pressure-viscous stress tensor: pressure on the diagonal minus visc_stress, on cells 0:m, 0:n, 0:p
+    subroutine s_compute_stress_tensor(pres_visc_stress, visc_stress, q_cons_vf)
+        type(vector_field), dimension(num_dims), intent(inout) :: pres_visc_stress
+        type(vector_field), dimension(num_dims), intent(in) :: visc_stress
+        type(scalar_field), dimension(sys_size), intent(in) :: q_cons_vf
+        real(wp) :: pressure  ! cell pressure recovered from the total energy, private to each loop iteration
         integer :: i, j, k
 
-        allocate(sendbuf(NxC*Nyloc*Nzloc*num_procs))
-        allocate(recvbuf(NxC*Nyloc*Nzloc*num_procs))
-
-        !$acc parallel loop collapse(4) gang vector default(present) copy(sendbuf)
-        do dest_rank = 0, num_procs-1
-            do k = 1, Nzloc 
-                do j = 1, Nyloc
-                    do i = 1, NxC
-                        sendbuf(i + (j-1)*NxC + (k-1)*NxC*Nyloc + dest_rank*NxC*Nyloc*Nzloc) = data_cmplx_slabz(i, j+dest_rank*Nyloc, k)
-                    end do 
-                end do
-            end do
-        end do
-
-        call MPI_Alltoall(sendbuf, NxC*Nyloc*Nzloc, MPI_DOUBLE_COMPLEX, & 
-                          recvbuf, NxC*Nyloc*Nzloc, MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD, ierr)
-
-        !$acc parallel loop collapse(4) gang vector default(present) copy(recvbuf)
-        do src_rank = 0, num_procs-1
-            do k = 1, Nzloc 
-                do j = 1, Nyloc
-                    do i = 1, NxC
-                        data_cmplx_slaby(i, j, k+src_rank*Nzloc) = recvbuf(i + (j-1)*NxC + (k-1)*NxC*Nyloc + src_rank*NxC*Nyloc*Nzloc)
-                    end do 
-                end do
+        !$acc parallel loop collapse(3) gang vector default(present) private(pressure)
+        do i = 0, m 
+            do j = 0, n 
+                do k = 0, p
+                    pressure = (q_cons_vf(E_idx)%sf(i, j, k) - 0.5_wp * (q_cons_vf(momxb)%sf(i, j, k)**2 + q_cons_vf(momxb+1)%sf(i, j, k)**2 + q_cons_vf(momxb+2)%sf(i, j, k)**2) &
+                             / q_cons_vf(contxb)%sf(i, j, k) - pi_infs(1) - qvs(1)) / (gammas(1))
+                    ! pressure above = (energy - 0.5*|momentum|^2/density - pi_infs(1) - qvs(1)) / gammas(1); only entry 1 of the fluid parameters is used
+                    pres_visc_stress(1)%vf(1)%sf(i, j, k) = pressure - visc_stress(1)%vf(1)%sf(i, j, k)
+                    pres_visc_stress(1)%vf(2)%sf(i, j, k) = - visc_stress(1)%vf(2)%sf(i, j, k) 
+                    pres_visc_stress(1)%vf(3)%sf(i, j, k) = - visc_stress(1)%vf(3)%sf(i, j, k)
+                    pres_visc_stress(2)%vf(1)%sf(i, j, k) = - visc_stress(2)%vf(1)%sf(i, j, k)
+                    pres_visc_stress(2)%vf(2)%sf(i, j, k) = pressure - visc_stress(2)%vf(2)%sf(i, j, k) 
+                    pres_visc_stress(2)%vf(3)%sf(i, j, k) = - visc_stress(2)%vf(3)%sf(i, j, k)
+                    pres_visc_stress(3)%vf(1)%sf(i, j, k) = - visc_stress(3)%vf(1)%sf(i, j, k)
+                    pres_visc_stress(3)%vf(2)%sf(i, j, k) = - visc_stress(3)%vf(2)%sf(i, j, k)
+                    pres_visc_stress(3)%vf(3)%sf(i, j, k) = pressure - visc_stress(3)%vf(3)%sf(i, j, k)
+                end do 
             end do 
-        end do
+        end do 
 
-        deallocate(sendbuf, recvbuf)
-    end subroutine s_mpi_transpose_slabZ2Y
+    end subroutine s_compute_stress_tensor
 
-    !< transpose domain from y-slabs to z-slabs on each processor
-    subroutine s_mpi_transpose_slabY2Z 
-        complex(c_double_complex), allocatable :: sendbuf(:), recvbuf(:)
-        integer :: dest_rank, src_rank
+    !< compute the divergence of the pressure-viscous stress tensor by central differences; reads stress_tensor one cell outside 0:m, 0:n, 0:p
+    subroutine s_compute_divergence_stress_tensor(div_stress_tensor, stress_tensor)
+        type(scalar_field), dimension(num_dims), intent(inout) :: div_stress_tensor  ! component c receives the sum over r of d(stress_tensor(r)%vf(c))/dx_r
+        type(vector_field), dimension(num_dims), intent(in) :: stress_tensor
         integer :: i, j, k
 
-        allocate(sendbuf(NxC*Nyloc*Nzloc*num_procs))
-        allocate(recvbuf(NxC*Nyloc*Nzloc*num_procs))
+        !$acc parallel loop collapse(3) gang vector default(present)
+        do i = 0, m 
+            do j = 0, n 
+                do k = 0, p
+                    div_stress_tensor(1)%sf(i, j, k) = (stress_tensor(1)%vf(1)%sf(i+1, j, k) - stress_tensor(1)%vf(1)%sf(i-1, j, k)) / (dx(i-1) + dx(i+1)) &
+                                                     + (stress_tensor(2)%vf(1)%sf(i, j+1, k) - stress_tensor(2)%vf(1)%sf(i, j-1, k)) / (dy(j-1) + dy(j+1)) &
+                                                     + (stress_tensor(3)%vf(1)%sf(i, j, k+1) - stress_tensor(3)%vf(1)%sf(i, j, k-1)) / (dz(k-1) + dz(k+1))
 
-        !$acc parallel loop collapse(4) gang vector default(present) copy(sendbuf)
-        do dest_rank = 0, num_procs-1
-            do k = 1, Nzloc 
-                do j = 1, Nyloc 
-                    do i = 1, NxC 
-                        sendbuf(i + (j-1)*NxC + (k-1)*NxC*Nyloc + dest_rank*NxC*Nyloc*Nzloc) = data_cmplx_slaby(i, j, k+dest_rank*Nzloc)
-                    end do 
+                    div_stress_tensor(2)%sf(i, j, k) = (stress_tensor(1)%vf(2)%sf(i+1, j, k) - stress_tensor(1)%vf(2)%sf(i-1, j, k)) / (dx(i-1) + dx(i+1)) & 
+                                                     + (stress_tensor(2)%vf(2)%sf(i, j+1, k) - stress_tensor(2)%vf(2)%sf(i, j-1, k)) / (dy(j-1) + dy(j+1)) & 
+                                                     + (stress_tensor(3)%vf(2)%sf(i, j, k+1) - stress_tensor(3)%vf(2)%sf(i, j, k-1)) / (dz(k-1) + dz(k+1))
+                    ! third component: same three-term pattern using the %vf(3) entries
+                    div_stress_tensor(3)%sf(i, j, k) = (stress_tensor(1)%vf(3)%sf(i+1, j, k) - stress_tensor(1)%vf(3)%sf(i-1, j, k)) / (dx(i-1) + dx(i+1)) & 
+                                                     + (stress_tensor(2)%vf(3)%sf(i, j+1, k) - stress_tensor(2)%vf(3)%sf(i, j-1, k)) / (dy(j-1) + dy(j+1)) & 
+                                                     + (stress_tensor(3)%vf(3)%sf(i, j, k+1) - stress_tensor(3)%vf(3)%sf(i, j ,k-1)) / (dz(k-1) + dz(k+1))
                 end do 
             end do 
         end do
 
-        call MPI_Alltoall(sendbuf, NxC*Nyloc*Nzloc, MPI_DOUBLE_COMPLEX, & 
-                          recvbuf, NxC*Nyloc*Nzloc, MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD, ierr)
+    end subroutine s_compute_divergence_stress_tensor
 
-        !$acc parallel loop collapse(4) gang vector default(present) copy(recvbuf) 
-        do src_rank = 0, num_procs-1
-            do k = 1, Nzloc
-                do j = 1, Nyloc 
-                    do i = 1, NxC 
-                        data_cmplx_slabz(i, j+src_rank*Nyloc, k) = recvbuf(i + (j-1)*NxC + (k-1)*NxC*Nyloc + src_rank*NxC*Nyloc*Nzloc)
-                    end do 
-                end do
-            end do 
-        end do
-        
-        deallocate(sendbuf, recvbuf)
-    end subroutine s_mpi_transpose_slabY2Z
+    !< setup for calculation of unclosed terms in volume filtered momentum eqn: builds the unfiltered reynolds_stress, visc_stress, pres_visc_stress and div_pres_visc_stress from q_cons_vf
+    subroutine s_setup_terms_filtering(q_cons_vf, reynolds_stress, visc_stress, pres_visc_stress, div_pres_visc_stress)
+        type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf
+        type(vector_field), dimension(1:num_dims), intent(inout) :: reynolds_stress
+        type(vector_field), dimension(1:num_dims), intent(inout) :: visc_stress
+        type(vector_field), dimension(1:num_dims), intent(inout) :: pres_visc_stress
+        type(scalar_field), dimension(1:num_dims), intent(inout) :: div_pres_visc_stress
 
-    !< compute forward FFT, input: data_real_3D_slabz, output: data_cmplx_out1d
-    subroutine s_mpi_FFT_fwd
-        integer :: i, j, k
+        integer :: i, j, k, l, q  ! i, j, k: cell indices (i is reused as the variable index in the buffer loops); l, q: tensor components
 
-        ! 3D z-slab -> 1D x, y, z
+        ! pseudo turbulent reynolds stress setup: momentum(l)*momentum(q)/density on cells 0:m, 0:n, 0:p
         !$acc parallel loop collapse(3) gang vector default(present)
-        do i = 1, Nx 
-            do j = 1, Ny 
-                do k = 1, Nzloc
-                    data_real_in1d(i + (j-1)*Nx + (k-1)*Nx*Ny) = data_real_3D_slabz(i, j, k)
-                end do 
+        do i = 0, m
+            do j = 0, n
+                do k = 0, p
+                    !$acc loop seq
+                    do l = 1, num_dims
+                        !$acc loop seq
+                        do q = 1, num_dims
+                            reynolds_stress(l)%vf(q)%sf(i, j, k) = (q_cons_vf(momxb-1+l)%sf(i, j, k) * q_cons_vf(momxb-1+q)%sf(i, j, k)) / q_cons_vf(1)%sf(i, j, k) ! (rho*u x rho*u)/rho = rho*(u x u) 
+                        end do
+                    end do
+                end do
             end do 
         end do
 
-        ! X FFT
-#if defined(MFC_OpenACC)
-        ierr = cufftExecD2Z(plan_x_fwd_gpu, data_real_in1d, data_cmplx_out1d)
-#else
-        call fftw_execute_dft_r2c(plan_x_r2c_fwd, data_real_in1d, data_cmplx_out1d)
-#endif
-
-        ! 1D x, y, z -> 1D y, x, z (CMPLX)
-        !$acc parallel loop collapse(3) gang vector default(present)
-        do i = 1, NxC
-            do j = 1, Ny 
-                do k = 1, Nzloc
-                    data_cmplx_out1dy(j + (i-1)*Ny + (k-1)*Ny*NxC) = data_cmplx_out1d(i + (j-1)*NxC + (k-1)*NxC*Ny)
-                end do 
-            end do 
+        ! set density and momentum buffers (variables 1:momxe); the non-MPI branch copies the opposite interior cells, i.e. a periodic wrap
+#ifdef MFC_MPI
+        do i = 1, momxe 
+            call s_populate_scalarfield_buffers(q_cons_vf(i))
         end do
-
-        ! Y FFT 
-#if defined(MFC_OpenACC)
-        ierr = cufftExecZ2Z(plan_y_gpu, data_cmplx_out1dy, data_cmplx_out1dy, CUFFT_FORWARD)
 #else
-        call fftw_execute_dft(plan_y_c2c_fwd, data_cmplx_out1dy, data_cmplx_out1dy)
-#endif 
-
-        ! 1D y, x, z -> 3D z-slab
-        !$acc parallel loop collapse(3) gang vector default(present)
-        do i = 1, NxC 
-            do j = 1, Ny 
-                do k = 1, Nzloc
-                    data_cmplx_slabz(i, j, k) = data_cmplx_out1dy(j + (i-1)*Ny + (k-1)*Ny*NxC)
-                end do 
-            end do 
-        end do 
+        do i = 1, momxe
+            q_cons_vf(i)%sf(-buff_size:-1, :, :) = q_cons_vf(i)%sf(m-buff_size+1:m, :, :)
+            q_cons_vf(i)%sf(m+1:m+buff_size, :, :) = q_cons_vf(i)%sf(0:buff_size-1, :, :)
 
-        ! transpose z-slab to y-slab
-        call s_mpi_transpose_slabZ2Y 
+            q_cons_vf(i)%sf(:, -buff_size:-1, :) = q_cons_vf(i)%sf(:, n-buff_size+1:n, :)
+            q_cons_vf(i)%sf(:, n+1:n+buff_size, :) = q_cons_vf(i)%sf(:, 0:buff_size-1, :)
 
-        ! 3D y-slab -> 1D z, x, y
-        !$acc parallel loop collapse(3) gang vector default(present)
-        do i = 1, NxC 
-            do j = 1, Nyloc 
-                do k = 1, Nz
-                    data_cmplx_out1d(k + (i-1)*Nz + (j-1)*Nz*NxC) = data_cmplx_slaby(i, j, k)
-                end do 
-            end do 
+            q_cons_vf(i)%sf(:, :, -buff_size:-1) = q_cons_vf(i)%sf(:, :, p-buff_size+1:p)
+            q_cons_vf(i)%sf(:, :, p+1:p+buff_size) = q_cons_vf(i)%sf(:, :, 0:buff_size-1)
         end do
-
-        ! Z FFT
-#if defined(MFC_OpenACC)
-        ierr = cufftExecZ2Z(plan_z_gpu, data_cmplx_out1d, data_cmplx_out1d, CUFFT_FORWARD)
-#else
-        call fftw_execute_dft(plan_z_c2c_fwd, data_cmplx_out1d, data_cmplx_out1d)
 #endif
+        
+        ! effective viscosity setup, return viscous stress tensor (its difference stencil reads the buffers filled above)
+        call s_compute_viscous_stress_tensor(visc_stress, q_cons_vf)
+        ! pressure-viscous stress: pressure on the diagonal minus visc_stress
-        ! return data_cmplx_out1d: 1D z, x, y
-    end subroutine s_mpi_FFT_fwd
+        call s_compute_stress_tensor(pres_visc_stress, visc_stress, q_cons_vf)
+        ! NOTE(review): the divergence reads pres_visc_stress at i-1/i+1, j-1/j+1, k-1/k+1, but only cells 0:m, 0:n, 0:p are written above and no buffer fill for it is visible in this routine - confirm
-    !< compute inverse FFT, input: data_cmplx_out1d, output: data_real_3D_slabz
-    subroutine s_mpi_FFT_bwd
-        integer :: i, j, k
-
-        ! Z inv FFT 
-#if defined(MFC_OpenACC)
-        ierr = cufftExecZ2Z(plan_z_gpu, data_cmplx_out1d, data_cmplx_out1d, CUFFT_INVERSE)
-#else
-        call fftw_execute_dft(plan_z_c2c_bwd, data_cmplx_out1d, data_cmplx_out1d)
-#endif
-
-        ! 1D z, x, y -> 3D y-slab
-        !$acc parallel loop collapse(3) gang vector default(present)
-        do i = 1, NxC 
-            do j = 1, Nyloc 
-                do k = 1, Nz 
-                    data_cmplx_slaby(i, j, k) = data_cmplx_out1d(k + (i-1)*Nz + (j-1)*Nz*NxC)
-                end do 
-            end do 
-        end do
-
-        ! transpose y-slab to z-slab
-        call s_mpi_transpose_slabY2Z
-
-        ! 3D z-slab -> 1D y, x, z
-        !$acc parallel loop collapse(3) gang vector default(present)
-        do i = 1, NxC 
-            do j = 1, Ny 
-                do k = 1, Nzloc
-                    data_cmplx_out1dy(j + (i-1)*Ny + (k-1)*Ny*NxC) = data_cmplx_slabz(i, j, k)
-                end do 
-            end do 
-        end do
-
-        ! Y inv FFT 
-#if defined(MFC_OpenACC)
-        ierr = cufftExecZ2Z(plan_y_gpu, data_cmplx_out1dy, data_cmplx_out1dy, CUFFT_INVERSE)
-#else
-        call fftw_execute_dft(plan_y_c2c_bwd, data_cmplx_out1dy, data_cmplx_out1dy)
-#endif
-
-        ! 1D y, x, z -> 1D x, y, z 
-        !$acc parallel loop collapse(3) gang vector default(present)
-        do i = 1, NxC 
-            do j = 1, Ny 
-                do k = 1, Nzloc
-                    data_cmplx_out1d(i + (j-1)*NxC + (k-1)*NxC*Ny) = data_cmplx_out1dy(j + (i-1)*Ny + (k-1)*Ny*NxC)
-                end do 
-            end do 
-        end do
-
-        ! X inv FFT
-#if defined(MFC_OpenACC)
-        ierr = cufftExecZ2D(plan_x_bwd_gpu, data_cmplx_out1d, data_real_in1d)
-#else
-        call fftw_execute_dft_c2r(plan_x_c2r_bwd, data_cmplx_out1d, data_real_in1d)
-#endif
-
-        ! 1D x, y, z -> 3D z-slab
-        !$acc parallel loop collapse(3) gang vector default(present)
-        do i = 1, Nx 
-            do j = 1, Ny 
-                do k = 1, Nzloc
-                    data_real_3D_slabz(i, j, k) = data_real_in1d(i + (j-1)*Nx + (k-1)*Nx*Ny)
-                end do 
-            end do 
-        end do
-
-    end subroutine s_mpi_FFT_bwd
-
-    !< setup for calculation of unclosed terms in volume filtered momentum eqn
-    subroutine s_setup_terms_filtering(q_cons_vf, reynolds_stress, eff_visc)
-        type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf
-        type(vector_field), dimension(1:num_dims), intent(inout) :: reynolds_stress
-        type(vector_field), dimension(1:num_dims), intent(inout) :: eff_visc
-
-        integer :: i, j, k, l, q
-
-        ! pseudo turbulent reynolds stress setup
-        !$acc parallel loop collapse(3) gang vector default(present)
-        do i = 0, m
-            do j = 0, n
-                do k = 0, p
-                    !$acc loop seq
-                    do l = 1, num_dims
-                        !$acc loop seq
-                        do q = 1, num_dims
-                            reynolds_stress(l)%vf(q)%sf(i, j, k) = (q_cons_vf(momxb-1+l)%sf(i, j, k) * q_cons_vf(momxb-1+q)%sf(i, j, k)) / q_cons_vf(1)%sf(i, j, k) ! (rho*u x rho*u)/rho = rho*(u x u) 
-                        end do
-                    end do
-                end do
-            end do 
-        end do
-
-        ! set density and momentum buffers
-#ifdef MFC_MPI
-        do i = 1, momxe 
-            call s_populate_scalarfield_buffers(q_cons_vf(i))
-        end do
-#else
-        do i = 1, momxe
-            q_cons_vf(i)%sf(-buff_size:-1, :, :) = q_cons_vf(i)%sf(m-buff_size+1:m, :, :)
-            q_cons_vf(i)%sf(m+1:m+buff_size, :, :) = q_cons_vf(i)%sf(0:buff_size-1, :, :)
-
-            q_cons_vf(i)%sf(:, -buff_size:-1, :) = q_cons_vf(i)%sf(:, n-buff_size+1:n, :)
-            q_cons_vf(i)%sf(:, n+1:n+buff_size, :) = q_cons_vf(i)%sf(:, 0:buff_size-1, :)
-
-            q_cons_vf(i)%sf(:, :, -buff_size:-1) = q_cons_vf(i)%sf(:, :, p-buff_size+1:p)
-            q_cons_vf(i)%sf(:, :, p+1:p+buff_size) = q_cons_vf(i)%sf(:, :, 0:buff_size-1)
-        end do
-#endif
-        
-        ! effective viscosity setup
-        !$acc parallel loop collapse(3) gang vector default(present)
-        do i = 0, m
-            do j = 0, n
-                do k = 0, p
-                    eff_visc(1)%vf(1)%sf(i, j, k) = mu_visc * (2._wp*(q_cons_vf(momxb)%sf(i+1, j, k)/q_cons_vf(1)%sf(i+1, j, k) - q_cons_vf(momxb)%sf(i-1, j, k)/q_cons_vf(1)%sf(i-1, j, k))/(2._wp*dx(i)) & 
-                                                - 2._wp/3._wp*((q_cons_vf(momxb)%sf(i+1, j, k)/q_cons_vf(1)%sf(i+1, j, k) - q_cons_vf(momxb)%sf(i-1, j, k)/q_cons_vf(1)%sf(i-1, j, k))/(2._wp*dx(i)) & 
-                                                + (q_cons_vf(momxb+1)%sf(i, j+1, k)/q_cons_vf(1)%sf(i, j+1, k) - q_cons_vf(momxb+1)%sf(i, j-1, k)/q_cons_vf(1)%sf(i, j-1, k))/(2._wp*dy(j)) & 
-                                                + (q_cons_vf(momxb+2)%sf(i, j, k+1)/q_cons_vf(1)%sf(i, j, k+1) - q_cons_vf(momxb+2)%sf(i, j, k-1)/q_cons_vf(1)%sf(i, j, k-1))/(2._wp*dz(k))))
-
-                    eff_visc(2)%vf(2)%sf(i, j, k) = mu_visc * (2._wp*(q_cons_vf(momxb+1)%sf(i, j+1, k)/q_cons_vf(1)%sf(i, j+1, k) - q_cons_vf(momxb+1)%sf(i, j-1, k)/q_cons_vf(1)%sf(i, j-1, k))/(2._wp*dy(j)) & 
-                                                - 2._wp/3._wp*((q_cons_vf(momxb)%sf(i+1, j, k)/q_cons_vf(1)%sf(i+1, j, k) - q_cons_vf(momxb)%sf(i-1, j, k)/q_cons_vf(1)%sf(i-1, j, k))/(2._wp*dx(i)) & 
-                                                + (q_cons_vf(momxb+1)%sf(i, j+1, k)/q_cons_vf(1)%sf(i, j+1, k) - q_cons_vf(momxb+1)%sf(i, j-1, k)/q_cons_vf(1)%sf(i, j-1, k))/(2._wp*dy(j)) & 
-                                                + (q_cons_vf(momxb+2)%sf(i, j, k+1)/q_cons_vf(1)%sf(i, j, k+1) - q_cons_vf(momxb+2)%sf(i, j, k-1)/q_cons_vf(1)%sf(i, j, k-1))/(2._wp*dz(k))))
-
-                    eff_visc(3)%vf(3)%sf(i, j, k) = mu_visc * (2._wp*(q_cons_vf(momxb+2)%sf(i, j, k+1)/q_cons_vf(1)%sf(i, j, k+1) - q_cons_vf(momxb+2)%sf(i, j, k-1)/q_cons_vf(1)%sf(i, j, k-1))/(2._wp*dz(k)) & 
-                                                - 2._wp/3._wp*((q_cons_vf(momxb)%sf(i+1, j, k)/q_cons_vf(1)%sf(i+1, j, k) - q_cons_vf(momxb)%sf(i-1, j, k)/q_cons_vf(1)%sf(i-1, j, k))/(2._wp*dx(i)) & 
-                                                + (q_cons_vf(momxb+1)%sf(i, j+1, k)/q_cons_vf(1)%sf(i, j+1, k) - q_cons_vf(momxb+1)%sf(i, j-1, k)/q_cons_vf(1)%sf(i, j-1, k))/(2._wp*dy(j)) & 
-                                                + (q_cons_vf(momxb+2)%sf(i, j, k+1)/q_cons_vf(1)%sf(i, j, k+1) - q_cons_vf(momxb+2)%sf(i, j, k-1)/q_cons_vf(1)%sf(i, j, k-1))/(2._wp*dz(k))))
-
-                    eff_visc(1)%vf(2)%sf(i, j, k) = mu_visc * ((q_cons_vf(momxb)%sf(i, j+1, k)/q_cons_vf(1)%sf(i, j+1, k) - q_cons_vf(momxb)%sf(i, j-1, k)/q_cons_vf(1)%sf(i, j-1, k))/(2._wp*dy(j))/q_cons_vf(1)%sf(i, j, k) & 
-                                                + (q_cons_vf(momxb+1)%sf(i+1, j, k)/q_cons_vf(1)%sf(i+1, j, k) - q_cons_vf(momxb+1)%sf(i-1, j, k)/q_cons_vf(1)%sf(i-1, j, k))/(2._wp*dx(i))/q_cons_vf(1)%sf(i, j, k))
-                                            
-                    eff_visc(2)%vf(1)%sf(i, j, k) = eff_visc(1)%vf(2)%sf(i, j, k)
-
-                    eff_visc(1)%vf(3)%sf(i, j, k) = mu_visc * ((q_cons_vf(momxb)%sf(i, j, k+1)/q_cons_vf(1)%sf(i, j, k+1) - q_cons_vf(momxb)%sf(i, j, k-1)/q_cons_vf(1)%sf(i, j, k-1))/(2._wp*dz(k))/q_cons_vf(1)%sf(i, j, k) & 
-                                                + (q_cons_vf(momxb+2)%sf(i+1, j, k)/q_cons_vf(1)%sf(i+1, j, k) - q_cons_vf(momxb+2)%sf(i-1, j, k)/q_cons_vf(1)%sf(i-1, j, k))/(2._wp*dx(i))/q_cons_vf(1)%sf(i, j, k))
-
-                    eff_visc(3)%vf(1)%sf(i, j, k) = eff_visc(1)%vf(3)%sf(i, j, k)
-
-                    eff_visc(2)%vf(3)%sf(i, j, k) = mu_visc * ((q_cons_vf(momxb+1)%sf(i, j, k+1)/q_cons_vf(1)%sf(i, j, k+1) - q_cons_vf(momxb+1)%sf(i, j, k-1)/q_cons_vf(1)%sf(i, j, k-1))/(2._wp*dz(k))/q_cons_vf(1)%sf(i, j, k) & 
-                                                + (q_cons_vf(momxb+2)%sf(i, j+1, k)/q_cons_vf(1)%sf(i, j+1, k) - q_cons_vf(momxb+2)%sf(i, j-1, k)/q_cons_vf(1)%sf(i, j-1, k))/(2._wp*dy(j))/q_cons_vf(1)%sf(i, j, k))
-
-                    eff_visc(3)%vf(2)%sf(i, j, k) = eff_visc(2)%vf(3)%sf(i, j, k)
-                end do
-            end do
-        end do
+        call s_compute_divergence_stress_tensor(div_pres_visc_stress, pres_visc_stress)
 
     end subroutine s_setup_terms_filtering
 
@@ -929,7 +810,7 @@ contains
                         !$acc loop seq
                         do q = 1, num_dims
                             reynolds_stress(l)%vf(q)%sf(i, j, k) = reynolds_stress(l)%vf(q)%sf(i, j, k) &
-                                                              - (q_cons_filtered(momxb-1+l)%sf(i, j, k) * q_cons_filtered(momxb-1+q)%sf(i, j, k) / q_cons_filtered(1)%sf(i, j, k))
+                                - (q_cons_filtered(momxb-1+l)%sf(i, j, k) * q_cons_filtered(momxb-1+q)%sf(i, j, k) / q_cons_filtered(1)%sf(i, j, k))
                         end do
                     end do
                 end do
@@ -999,9 +880,10 @@ contains
 
     end subroutine s_compute_pseudo_turbulent_reynolds_stress
 
-    subroutine s_compute_effective_viscosity(q_cons_filtered, eff_visc, mag_eff_visc)
+    subroutine s_compute_effective_viscosity(q_cons_filtered, eff_visc, visc_stress, mag_eff_visc)
         type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_filtered
         type(vector_field), dimension(1:num_dims), intent(inout) :: eff_visc
+        type(vector_field), dimension(1:num_dims), intent(inout) :: visc_stress
         type(scalar_field), intent(inout) :: mag_eff_visc
         real(wp), dimension(1:num_dims, 0:m, 0:n, 0:p) :: div_eff_visc
 
@@ -1025,41 +907,21 @@ contains
         end do
 #endif
 
+        ! calculate stress tensor with filtered quantities 
+        call s_compute_viscous_stress_tensor(visc_stress, q_cons_filtered)
+
         ! calculate eff_visc
         !$acc parallel loop collapse(3) gang vector default(present)
         do i = 0, m
             do j = 0, n
                 do k = 0, p
-                    eff_visc(1)%vf(1)%sf(i, j, k) = eff_visc(1)%vf(1)%sf(i, j, k) - mu_visc * (2._wp*(q_cons_filtered(momxb)%sf(i+1, j, k)/q_cons_filtered(1)%sf(i+1, j, k) - q_cons_filtered(momxb)%sf(i-1, j, k)/q_cons_filtered(1)%sf(i-1, j, k))/(2._wp*dx(i)) & 
-                                            - 2._wp/3._wp*((q_cons_filtered(momxb)%sf(i+1, j, k)/q_cons_filtered(1)%sf(i+1, j, k) - q_cons_filtered(momxb)%sf(i-1, j, k)/q_cons_filtered(1)%sf(i-1, j, k))/(2._wp*dx(i)) & 
-                                            + (q_cons_filtered(momxb+1)%sf(i, j+1, k)/q_cons_filtered(1)%sf(i, j+1, k) - q_cons_filtered(momxb+1)%sf(i, j-1, k)/q_cons_filtered(1)%sf(i, j-1, k))/(2._wp*dy(j)) & 
-                                            + (q_cons_filtered(momxb+2)%sf(i, j, k+1)/q_cons_filtered(1)%sf(i, j, k+1) - q_cons_filtered(momxb+2)%sf(i, j, k-1)/q_cons_filtered(1)%sf(i, j, k-1))/(2._wp*dz(k))))
-
-                    eff_visc(2)%vf(2)%sf(i, j, k) = eff_visc(2)%vf(2)%sf(i, j, k) - mu_visc * (2._wp*(q_cons_filtered(momxb+1)%sf(i, j+1, k)/q_cons_filtered(1)%sf(i, j+1, k) - q_cons_filtered(momxb+1)%sf(i, j-1, k)/q_cons_filtered(1)%sf(i, j-1, k))/(2._wp*dy(j)) & 
-                                            - 2._wp/3._wp*((q_cons_filtered(momxb)%sf(i+1, j, k)/q_cons_filtered(1)%sf(i+1, j, k) - q_cons_filtered(momxb)%sf(i-1, j, k)/q_cons_filtered(1)%sf(i-1, j, k))/(2._wp*dx(i)) & 
-                                            + (q_cons_filtered(momxb+1)%sf(i, j+1, k)/q_cons_filtered(1)%sf(i, j+1, k) - q_cons_filtered(momxb+1)%sf(i, j-1, k)/q_cons_filtered(1)%sf(i, j-1, k))/(2._wp*dy(j)) & 
-                                            + (q_cons_filtered(momxb+2)%sf(i, j, k+1)/q_cons_filtered(1)%sf(i, j, k+1) - q_cons_filtered(momxb+2)%sf(i, j, k-1)/q_cons_filtered(1)%sf(i, j, k-1))/(2._wp*dz(k))))
-
-                    eff_visc(3)%vf(3)%sf(i, j, k) = eff_visc(3)%vf(3)%sf(i, j, k) - mu_visc * (2._wp*(q_cons_filtered(momxb+2)%sf(i, j, k+1)/q_cons_filtered(1)%sf(i, j, k+1) - q_cons_filtered(momxb+2)%sf(i, j, k-1)/q_cons_filtered(1)%sf(i, j, k-1))/(2._wp*dz(k)) & 
-                                            - 2._wp/3._wp*((q_cons_filtered(momxb)%sf(i+1, j, k)/q_cons_filtered(1)%sf(i+1, j, k) - q_cons_filtered(momxb)%sf(i-1, j, k)/q_cons_filtered(1)%sf(i-1, j, k))/(2._wp*dx(i)) & 
-                                            + (q_cons_filtered(momxb+1)%sf(i, j+1, k)/q_cons_filtered(1)%sf(i, j+1, k) - q_cons_filtered(momxb+1)%sf(i, j-1, k)/q_cons_filtered(1)%sf(i, j-1, k))/(2._wp*dy(j)) & 
-                                            + (q_cons_filtered(momxb+2)%sf(i, j, k+1)/q_cons_filtered(1)%sf(i, j, k+1) - q_cons_filtered(momxb+2)%sf(i, j, k-1)/q_cons_filtered(1)%sf(i, j, k-1))/(2._wp*dz(k))))
-
-                    eff_visc(1)%vf(2)%sf(i, j, k) = eff_visc(1)%vf(2)%sf(i, j, k) - mu_visc * ((q_cons_filtered(momxb)%sf(i, j+1, k)/q_cons_filtered(1)%sf(i, j+1, k) - q_cons_filtered(momxb)%sf(i, j-1, k)/q_cons_filtered(1)%sf(i, j-1, k))/(2._wp*dy(j))/q_cons_filtered(1)%sf(i, j, k) & 
-                                            + (q_cons_filtered(momxb+1)%sf(i+1, j, k)/q_cons_filtered(1)%sf(i+1, j, k) - q_cons_filtered(momxb+1)%sf(i-1, j, k)/q_cons_filtered(1)%sf(i-1, j, k))/(2._wp*dx(i))/q_cons_filtered(1)%sf(i, j, k))
-                                        
-                    eff_visc(2)%vf(1)%sf(i, j, k) = eff_visc(1)%vf(2)%sf(i, j, k)
-
-                    eff_visc(1)%vf(3)%sf(i, j, k) = eff_visc(1)%vf(3)%sf(i, j, k) - mu_visc * ((q_cons_filtered(momxb)%sf(i, j, k+1)/q_cons_filtered(1)%sf(i, j, k+1) - q_cons_filtered(momxb)%sf(i, j, k-1)/q_cons_filtered(1)%sf(i, j, k-1))/(2._wp*dz(k))/q_cons_filtered(1)%sf(i, j, k) & 
-                                            + (q_cons_filtered(momxb+2)%sf(i+1, j, k)/q_cons_filtered(1)%sf(i+1, j, k) - q_cons_filtered(momxb+2)%sf(i-1, j, k)/q_cons_filtered(1)%sf(i-1, j, k))/(2._wp*dx(i))/q_cons_filtered(1)%sf(i, j, k))
-
-                    eff_visc(3)%vf(1)%sf(i, j, k) = eff_visc(1)%vf(3)%sf(i, j, k)
-
-                    eff_visc(2)%vf(3)%sf(i, j, k) = eff_visc(2)%vf(3)%sf(i, j, k) - mu_visc * ((q_cons_filtered(momxb+1)%sf(i, j, k+1)/q_cons_filtered(1)%sf(i, j, k+1) - q_cons_filtered(momxb+1)%sf(i, j, k-1)/q_cons_filtered(1)%sf(i, j, k-1))/(2._wp*dz(k))/q_cons_filtered(1)%sf(i, j, k) & 
-                                            + (q_cons_filtered(momxb+2)%sf(i, j+1, k)/q_cons_filtered(1)%sf(i, j+1, k) - q_cons_filtered(momxb+2)%sf(i, j-1, k)/q_cons_filtered(1)%sf(i, j-1, k))/(2._wp*dy(j))/q_cons_filtered(1)%sf(i, j, k))
-
-                    eff_visc(3)%vf(2)%sf(i, j, k) = eff_visc(2)%vf(3)%sf(i, j, k)
-                    
+                    !$acc loop seq
+                    do l = 1, num_dims
+                        !$acc loop seq
+                        do q = 1, num_dims
+                            eff_visc(l)%vf(q)%sf(i, j, k) = eff_visc(l)%vf(q)%sf(i, j, k) - visc_stress(l)%vf(q)%sf(i, j, k)
+                        end do 
+                    end do
                 end do
             end do
         end do
@@ -1109,9 +971,9 @@ contains
                     !$acc loop seq
                     do l = 1, num_dims
                         div_eff_visc(l, i, j, k) = (eff_visc(l)%vf(1)%sf(i+1, j, k) - eff_visc(l)%vf(1)%sf(i-1, j, k))/(2._wp*dx(i)) &
-                                             + (eff_visc(l)%vf(2)%sf(i, j+1, k) - eff_visc(l)%vf(2)%sf(i, j-1, k))/(2._wp*dy(j)) & 
-                                             + (eff_visc(l)%vf(3)%sf(i, j, k+1) - eff_visc(l)%vf(3)%sf(i, j, k-1))/(2._wp*dz(k))
-                    end do
+                                                 + (eff_visc(l)%vf(2)%sf(i, j+1, k) - eff_visc(l)%vf(2)%sf(i, j-1, k))/(2._wp*dy(j)) & 
+                                                 + (eff_visc(l)%vf(3)%sf(i, j, k+1) - eff_visc(l)%vf(3)%sf(i, j, k-1))/(2._wp*dz(k))
+                    end do 
                 end do
             end do
         end do
@@ -1138,15 +1000,233 @@ contains
             do j = 0, n
                 do k = 0, p 
                     mag_int_mom_exch%sf(i, j, k) = sqrt(int_mom_exch(1)%sf(i, j, k)**2 & 
-                                                + int_mom_exch(2)%sf(i, j, k)**2 & 
-                                                + int_mom_exch(3)%sf(i, j, k)**2)
+                                                      + int_mom_exch(2)%sf(i, j, k)**2 & 
+                                                      + int_mom_exch(3)%sf(i, j, k)**2)
                 end do
             end do
         end do 
 
     end subroutine s_compute_interphase_momentum_exchange
 
+
+    !< Redistribute the complex field from z-slabs (data_cmplx_slabz) to y-slabs (data_cmplx_slaby) with a single MPI_Alltoall over MPI_COMM_WORLD
+    subroutine s_mpi_transpose_slabZ2Y
+        complex(c_double_complex), allocatable :: sendbuf(:), recvbuf(:) ! flat staging buffers holding one contiguous chunk per rank
+        integer :: dest_rank, src_rank
+        integer :: i, j, k
+
+        allocate(sendbuf(NxC*Nyloc*Nzloc*num_procs)) ! num_procs chunks of NxC*Nyloc*Nzloc entries each
+        allocate(recvbuf(NxC*Nyloc*Nzloc*num_procs)) ! assumes Ny = Nyloc*num_procs and Nz = Nzloc*num_procs - TODO confirm
+
+        !$acc parallel loop collapse(4) gang vector default(present) copy(sendbuf)
+        do dest_rank = 0, num_procs-1 ! pack: the chunk for dest_rank takes y-indices dest_rank*Nyloc+1 .. (dest_rank+1)*Nyloc of the local z-slab
+            do k = 1, Nzloc 
+                do j = 1, Nyloc
+                    do i = 1, NxC
+                        sendbuf(i + (j-1)*NxC + (k-1)*NxC*Nyloc + dest_rank*NxC*Nyloc*Nzloc) = data_cmplx_slabz(i, j+dest_rank*Nyloc, k) ! within a chunk: x fastest, then y, then z
+                    end do 
+                end do
+            end do
+        end do
+
+        call MPI_Alltoall(sendbuf, NxC*Nyloc*Nzloc, MPI_DOUBLE_COMPLEX, & 
+                          recvbuf, NxC*Nyloc*Nzloc, MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD, ierr) ! each rank sends one NxC*Nyloc*Nzloc chunk to every rank and receives one from every rank
+
+        !$acc parallel loop collapse(4) gang vector default(present) copy(recvbuf)
+        do src_rank = 0, num_procs-1 ! unpack: the chunk received from src_rank fills z-indices src_rank*Nzloc+1 .. (src_rank+1)*Nzloc of the local y-slab
+            do k = 1, Nzloc 
+                do j = 1, Nyloc
+                    do i = 1, NxC
+                        data_cmplx_slaby(i, j, k+src_rank*Nzloc) = recvbuf(i + (j-1)*NxC + (k-1)*NxC*Nyloc + src_rank*NxC*Nyloc*Nzloc) ! same chunk layout as used when packing
+                    end do 
+                end do
+            end do 
+        end do
+
+        deallocate(sendbuf, recvbuf) ! staging buffers live only for the duration of this call
+    end subroutine s_mpi_transpose_slabZ2Y
+
+    !< Redistribute the complex field from y-slabs (data_cmplx_slaby) back to z-slabs (data_cmplx_slabz) with a single MPI_Alltoall; inverse of s_mpi_transpose_slabZ2Y
+    subroutine s_mpi_transpose_slabY2Z 
+        complex(c_double_complex), allocatable :: sendbuf(:), recvbuf(:) ! flat staging buffers holding one contiguous chunk per rank
+        integer :: dest_rank, src_rank
+        integer :: i, j, k
+
+        allocate(sendbuf(NxC*Nyloc*Nzloc*num_procs)) ! num_procs chunks of NxC*Nyloc*Nzloc entries each
+        allocate(recvbuf(NxC*Nyloc*Nzloc*num_procs)) ! assumes Ny = Nyloc*num_procs and Nz = Nzloc*num_procs - TODO confirm
+
+        !$acc parallel loop collapse(4) gang vector default(present) copy(sendbuf)
+        do dest_rank = 0, num_procs-1 ! pack: the chunk for dest_rank takes z-indices dest_rank*Nzloc+1 .. (dest_rank+1)*Nzloc of the local y-slab
+            do k = 1, Nzloc 
+                do j = 1, Nyloc 
+                    do i = 1, NxC 
+                        sendbuf(i + (j-1)*NxC + (k-1)*NxC*Nyloc + dest_rank*NxC*Nyloc*Nzloc) = data_cmplx_slaby(i, j, k+dest_rank*Nzloc) ! within a chunk: x fastest, then y, then z
+                    end do 
+                end do 
+            end do 
+        end do
+
+        call MPI_Alltoall(sendbuf, NxC*Nyloc*Nzloc, MPI_DOUBLE_COMPLEX, & 
+                          recvbuf, NxC*Nyloc*Nzloc, MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD, ierr) ! each rank sends one NxC*Nyloc*Nzloc chunk to every rank and receives one from every rank
+
+        !$acc parallel loop collapse(4) gang vector default(present) copy(recvbuf) 
+        do src_rank = 0, num_procs-1 ! unpack: the chunk received from src_rank fills y-indices src_rank*Nyloc+1 .. (src_rank+1)*Nyloc of the local z-slab
+            do k = 1, Nzloc
+                do j = 1, Nyloc 
+                    do i = 1, NxC 
+                        data_cmplx_slabz(i, j+src_rank*Nyloc, k) = recvbuf(i + (j-1)*NxC + (k-1)*NxC*Nyloc + src_rank*NxC*Nyloc*Nzloc) ! same chunk layout as used when packing
+                    end do 
+                end do
+            end do 
+        end do
+        
+        deallocate(sendbuf, recvbuf) ! staging buffers live only for the duration of this call
+    end subroutine s_mpi_transpose_slabY2Z
+
+    !< Forward 3D FFT of the slab-decomposed real field as 1D transforms along x, then y, then z (after a z-slab to y-slab transpose); input: data_real_3D_slabz, output: data_cmplx_out1d
+    subroutine s_mpi_FFT_fwd
+        integer :: i, j, k
+
+        ! flatten the real 3D z-slab into the 1D buffer data_real_in1d (x fastest, then y, then local z)
+        !$acc parallel loop collapse(3) gang vector default(present)
+        do i = 1, Nx 
+            do j = 1, Ny 
+                do k = 1, Nzloc
+                    data_real_in1d(i + (j-1)*Nx + (k-1)*Nx*Ny) = data_real_3D_slabz(i, j, k)
+                end do 
+            end do 
+        end do
+
+        ! real-to-complex FFT along x (cuFFT on GPU builds, FFTW otherwise); NOTE(review): NxC is presumably Nx/2+1 - confirm against the plan setup
+#if defined(MFC_OpenACC)
+        ierr = cufftExecD2Z(plan_x_fwd_gpu, data_real_in1d, data_cmplx_out1d) ! NOTE(review): no host_data use_device region is visible here - confirm cuFFT receives device addresses
+#else
+        call fftw_execute_dft_r2c(plan_x_r2c_fwd, data_real_in1d, data_cmplx_out1d)
+#endif
+
+        ! reorder the complex 1D buffer from (x, y, z) to (y, x, z) so that y becomes the fastest index for the y transform
+        !$acc parallel loop collapse(3) gang vector default(present)
+        do i = 1, NxC
+            do j = 1, Ny 
+                do k = 1, Nzloc
+                    data_cmplx_out1dy(j + (i-1)*Ny + (k-1)*Ny*NxC) = data_cmplx_out1d(i + (j-1)*NxC + (k-1)*NxC*Ny)
+                end do 
+            end do 
+        end do
+
+        ! in-place complex-to-complex forward FFT along y
+#if defined(MFC_OpenACC)
+        ierr = cufftExecZ2Z(plan_y_gpu, data_cmplx_out1dy, data_cmplx_out1dy, CUFFT_FORWARD)
+#else
+        call fftw_execute_dft(plan_y_c2c_fwd, data_cmplx_out1dy, data_cmplx_out1dy)
+#endif 
+
+        ! unflatten the (y, x, z) 1D buffer into the 3D complex z-slab data_cmplx_slabz(x, y, z)
+        !$acc parallel loop collapse(3) gang vector default(present)
+        do i = 1, NxC 
+            do j = 1, Ny 
+                do k = 1, Nzloc
+                    data_cmplx_slabz(i, j, k) = data_cmplx_out1dy(j + (i-1)*Ny + (k-1)*Ny*NxC)
+                end do 
+            end do 
+        end do 
+
+        ! transpose z-slab to y-slab across ranks so that the full z extent (1..Nz) is local
+        call s_mpi_transpose_slabZ2Y 
+
+        ! flatten the 3D y-slab into data_cmplx_out1d ordered (z, x, y), z fastest, for the z transform
+        !$acc parallel loop collapse(3) gang vector default(present)
+        do i = 1, NxC 
+            do j = 1, Nyloc 
+                do k = 1, Nz
+                    data_cmplx_out1d(k + (i-1)*Nz + (j-1)*Nz*NxC) = data_cmplx_slaby(i, j, k)
+                end do 
+            end do 
+        end do
+
+        ! in-place complex-to-complex forward FFT along z
+#if defined(MFC_OpenACC)
+        ierr = cufftExecZ2Z(plan_z_gpu, data_cmplx_out1d, data_cmplx_out1d, CUFFT_FORWARD)
+#else
+        call fftw_execute_dft(plan_z_c2c_fwd, data_cmplx_out1d, data_cmplx_out1d)
+#endif
+
+        ! result is left in data_cmplx_out1d in (z, x, y) order over the local y-slab; s_mpi_FFT_bwd consumes this same layout
+    end subroutine s_mpi_FFT_fwd
+
+    !< Inverse 3D FFT, undoing s_mpi_FFT_fwd step by step (z, then y, then x); input: data_cmplx_out1d in (z, x, y) order, output: data_real_3D_slabz
+    subroutine s_mpi_FFT_bwd
+        integer :: i, j, k
+
+        ! in-place complex-to-complex inverse FFT along z on the (z, x, y) ordered buffer
+#if defined(MFC_OpenACC)
+        ierr = cufftExecZ2Z(plan_z_gpu, data_cmplx_out1d, data_cmplx_out1d, CUFFT_INVERSE)
+#else
+        call fftw_execute_dft(plan_z_c2c_bwd, data_cmplx_out1d, data_cmplx_out1d)
+#endif
+
+        ! unflatten the (z, x, y) 1D buffer into the 3D complex y-slab data_cmplx_slaby(x, y, z)
+        !$acc parallel loop collapse(3) gang vector default(present)
+        do i = 1, NxC 
+            do j = 1, Nyloc 
+                do k = 1, Nz 
+                    data_cmplx_slaby(i, j, k) = data_cmplx_out1d(k + (i-1)*Nz + (j-1)*Nz*NxC)
+                end do 
+            end do 
+        end do
+
+        ! transpose y-slab to z-slab across ranks so that the full y extent (1..Ny) is local
+        call s_mpi_transpose_slabY2Z
+
+        ! flatten the 3D z-slab into data_cmplx_out1dy ordered (y, x, z), y fastest, for the y transform
+        !$acc parallel loop collapse(3) gang vector default(present)
+        do i = 1, NxC 
+            do j = 1, Ny 
+                do k = 1, Nzloc
+                    data_cmplx_out1dy(j + (i-1)*Ny + (k-1)*Ny*NxC) = data_cmplx_slabz(i, j, k)
+                end do 
+            end do 
+        end do
+
+        ! in-place complex-to-complex inverse FFT along y
+#if defined(MFC_OpenACC)
+        ierr = cufftExecZ2Z(plan_y_gpu, data_cmplx_out1dy, data_cmplx_out1dy, CUFFT_INVERSE)
+#else
+        call fftw_execute_dft(plan_y_c2c_bwd, data_cmplx_out1dy, data_cmplx_out1dy)
+#endif
+
+        ! reorder the complex 1D buffer from (y, x, z) back to (x, y, z) so that x is the fastest index for the x transform
+        !$acc parallel loop collapse(3) gang vector default(present)
+        do i = 1, NxC 
+            do j = 1, Ny 
+                do k = 1, Nzloc
+                    data_cmplx_out1d(i + (j-1)*NxC + (k-1)*NxC*Ny) = data_cmplx_out1dy(j + (i-1)*Ny + (k-1)*Ny*NxC)
+                end do 
+            end do 
+        end do
+
+        ! complex-to-real inverse FFT along x (cuFFT on GPU builds, FFTW otherwise), writing data_real_in1d
+#if defined(MFC_OpenACC)
+        ierr = cufftExecZ2D(plan_x_bwd_gpu, data_cmplx_out1d, data_real_in1d)
+#else
+        call fftw_execute_dft_c2r(plan_x_c2r_bwd, data_cmplx_out1d, data_real_in1d)
+#endif
+
+        ! unflatten into the real 3D z-slab; NOTE(review): no 1/(Nx*Ny*Nz) scaling is applied in this routine and FFTW/cuFFT transforms are unnormalized, so the caller presumably rescales - confirm
+        !$acc parallel loop collapse(3) gang vector default(present)
+        do i = 1, Nx 
+            do j = 1, Ny 
+                do k = 1, Nzloc
+                    data_real_3D_slabz(i, j, k) = data_real_in1d(i + (j-1)*Nx + (k-1)*Nx*Ny)
+                end do 
+            end do 
+        end do
+
+    end subroutine s_mpi_FFT_bwd
+
     subroutine s_finalize_fftw_explicit_filter_module
+        integer :: i, j 
+
         @:DEALLOCATE(fluid_indicator_function%sf)
         @:DEALLOCATE(filtered_fluid_indicator_function%sf)
 
@@ -1155,6 +1235,27 @@ contains
         end do
         @:DEALLOCATE(q_cons_filtered)
 
+        do i = 1, num_dims
+            do j = 1, num_dims
+                @:DEALLOCATE(visc_stress(i)%vf(j)%sf)
+            end do 
+            @:DEALLOCATE(visc_stress(i)%vf)
+        end do
+        @:DEALLOCATE(visc_stress)
+
+        do i = 1, num_dims
+            do j = 1, num_dims
+                @:DEALLOCATE(pres_visc_stress(i)%vf(j)%sf)
+            end do
+            @:DEALLOCATE(pres_visc_stress(i)%vf)
+        end do
+        @:DEALLOCATE(pres_visc_stress)
+
+        do i = 1, num_dims
+            @:DEALLOCATE(div_pres_visc_stress(i)%sf)
+        end do
+        @:DEALLOCATE(div_pres_visc_stress)
+
         do i = 1, num_dims
             do j = 1, num_dims
                 @:DEALLOCATE(reynolds_stress(i)%vf(j)%sf)
diff --git a/src/simulation/p_main.fpp b/src/simulation/p_main.fpp
index 4c3ae9b62b..80b3e4ecf0 100644
--- a/src/simulation/p_main.fpp
+++ b/src/simulation/p_main.fpp
@@ -56,8 +56,11 @@ program p_main
     call s_initialize_gpu_vars()
     call nvtxEndRange
 
-    if (volume_filtering_momentum_eqn) call s_initialize_filtering_kernel()
     call s_initialize_fluid_indicator_function()
+    if (volume_filtering_momentum_eqn) then 
+        call s_initialize_filtering_kernel()
+        call s_initialize_filtered_fluid_indicator_function()
+    end if
 
     ! Setting the time-step iterator to the first time-step
     if (cfl_dt) then
diff --git a/toolchain/mfc/run/case_dicts.py b/toolchain/mfc/run/case_dicts.py
index 212d7a6cb6..2b46a4cb05 100644
--- a/toolchain/mfc/run/case_dicts.py
+++ b/toolchain/mfc/run/case_dicts.py
@@ -301,13 +301,13 @@ def analytic(self):
     'cont_damage_s': ParamType.REAL,
     'alpha_bar': ParamType.REAL,
     'compute_CD': ParamType.LOG,
-    'mu_visc': ParamType.REAL, 
     'u_inf_ref': ParamType.REAL,
     'rho_inf_ref': ParamType.REAL,
     'T_inf_ref': ParamType.REAL,
     'periodic_forcing': ParamType.LOG,
     'volume_filtering_momentum_eqn': ParamType.LOG,
     'compute_autocorrelation': ParamType.LOG,
+    't_step_stat_start': ParamType.INT,
 })
 
 for var in [ 'heatTransfer_model', 'massTransfer_model', 'pressure_corrector',

From e004d229ac09a49f4d5e1fed81917c04f51540b2 Mon Sep 17 00:00:00 2001
From: Conrad Delgado <conradd3@dt-login01.delta.ncsa.illinois.edu>
Date: Wed, 27 Aug 2025 12:46:55 -0500
Subject: [PATCH 07/30] gpu bug fix with periodic forcing

---
 runs/3d_1sphere_periodic/case.py             |   4 +-
 runs/3d_drag_test/case.py                    |   2 +-
 runs/phi01/case.py                           |  19 +++-
 src/common/m_mpi_common.fpp                  |  20 ++--
 src/post_process/m_data_input.f90            |  26 ++---
 src/post_process/m_start_up.f90              |   6 +-
 src/simulation/m_additional_forcing.fpp      | 110 ++++++++-----------
 src/simulation/m_compute_particle_forces.fpp |  10 +-
 src/simulation/m_compute_statistics.fpp      |  23 ++--
 src/simulation/m_data_output.fpp             |  14 +--
 src/simulation/m_global_parameters.fpp       |   8 +-
 src/simulation/m_mpi_proxy.fpp               |   6 +-
 src/simulation/m_start_up.fpp                |  79 +++++++------
 src/simulation/m_volume_filtering.fpp        |  51 +++++++--
 toolchain/mfc/run/case_dicts.py              |   3 +-
 toolchain/templates/delta.mako               |   2 +-
 16 files changed, 204 insertions(+), 179 deletions(-)

diff --git a/runs/3d_1sphere_periodic/case.py b/runs/3d_1sphere_periodic/case.py
index d8760b7909..05f7efc429 100644
--- a/runs/3d_1sphere_periodic/case.py
+++ b/runs/3d_1sphere_periodic/case.py
@@ -136,8 +136,8 @@
     # new case additions
     "periodic_forcing": "T",
     "periodic_ibs": "T",
-    #"compute_CD_vi": "F",
-    #"compute_CD_si": "F",
+    #"compute_particle_drag_vi": "F",
+    #"compute_particle_drag_si": "F",
     #"volume_filtering_momentum_eqn": "T",
 
     "u_inf_ref": v1,
diff --git a/runs/3d_drag_test/case.py b/runs/3d_drag_test/case.py
index 9a78272a88..00eb6a3c30 100644
--- a/runs/3d_drag_test/case.py
+++ b/runs/3d_drag_test/case.py
@@ -133,7 +133,7 @@
     "fluid_pp(1)%Re(1)": Re,
 
     # new case additions
-    "compute_CD": "T",
+    "compute_particle_drag": "T",
     "u_inf_ref": v1,
     "rho_inf_ref": rho,
     "T_inf_ref": T,
diff --git a/runs/phi01/case.py b/runs/phi01/case.py
index 1d5b26d462..3034d52b6d 100644
--- a/runs/phi01/case.py
+++ b/runs/phi01/case.py
@@ -2,6 +2,15 @@
 import math
 import numpy as np
 
+'''
+need to store
+full stats of unclosed term tensors (1, 2, 3, 4) - only at end time
+stats of flow quantities - only at end time
+flow quantities
+filtered fluid indicator function
+drag force on each particle
+'''
+
 Mu = 1.84e-05
 gam_a = 1.4
 R = 287.0
@@ -25,8 +34,8 @@
 #print('Kn = ' + str( np.sqrt(np.pi*gam_a/2)*(M/Re) )) # Kn < 0.01 = continuum flow
 
 dt = 4.0E-06
-Nt = 10
-t_save = 1
+Nt = 200
+t_save = 10
 
 Nx = 99
 Ny = 99
@@ -70,6 +79,7 @@
     "t_step_start": 0,
     "t_step_stop": Nt,  # 3000
     "t_step_save": t_save,  # 10
+    "t_step_stat_start": 50,
     # Simulation Algorithm Parameters
     # Only one patches are necessary, the air tube
     "num_patches": 1,
@@ -137,13 +147,13 @@
     # Fluids Physical Parameters
     "fluid_pp(1)%gamma": 1.0e00 / (gam_a - 1.0e00),  # 2.50(Not 1.40)
     "fluid_pp(1)%pi_inf": 0,
-    "fluid_pp(1)%Re(1)": Re,
+    "fluid_pp(1)%Re(1)": 1.0 / mu,
 
     # new case additions
     "periodic_forcing": "T",
     "periodic_ibs": "T",
-    "compute_CD": "F",
     "volume_filtering_momentum_eqn": "T",
+    "filter_width": 3.0*D/2,
 
     "u_inf_ref": v1,
     "rho_inf_ref": rho,
@@ -151,7 +161,6 @@
 
     "store_levelset": "F",
     "slab_domain_decomposition": "T", 
-    "compute_autocorrelation": "T",
     }
 
 case_dict.update(ib_dict)
diff --git a/src/common/m_mpi_common.fpp b/src/common/m_mpi_common.fpp
index d3dcab1ac7..c352412c75 100644
--- a/src/common/m_mpi_common.fpp
+++ b/src/common/m_mpi_common.fpp
@@ -174,9 +174,9 @@ contains
         type(scalar_field), &
             intent(in), optional :: beta
 
-        type(scalar_field), dimension(2:4), intent(in), optional :: stat_reynolds_stress
-        type(scalar_field), dimension(2:4), intent(in), optional :: stat_eff_visc
-        type(scalar_field), dimension(2:4), intent(in), optional :: stat_int_mom_exch
+        type(scalar_field), dimension(1:4), intent(in), optional :: stat_reynolds_stress
+        type(scalar_field), dimension(1:4), intent(in), optional :: stat_eff_visc
+        type(scalar_field), dimension(1:4), intent(in), optional :: stat_int_mom_exch
 
         integer, dimension(num_dims) :: sizes_glb, sizes_loc
         integer, dimension(1) :: airfoil_glb, airfoil_loc, airfoil_start
@@ -192,7 +192,7 @@ contains
         if (present(beta)) then
             alt_sys = sys_size + 1
         else if (present(stat_reynolds_stress) .and. present(stat_eff_visc) .and. present(stat_int_mom_exch)) then
-            alt_sys = sys_size + 9
+            alt_sys = sys_size + 12
         else
             alt_sys = sys_size
         end if
@@ -202,14 +202,14 @@ contains
         end do
         
         if (present(stat_reynolds_stress) .and. present(stat_eff_visc) .and. present(stat_int_mom_exch)) then 
-            do i = sys_size+1, sys_size+3
-                MPI_IO_DATA%var(i)%sf => stat_reynolds_stress(i-sys_size+1)%sf(0:m, 0:n, 0:p)
+            do i = sys_size+1, sys_size+4
+                MPI_IO_DATA%var(i)%sf => stat_reynolds_stress(i-sys_size)%sf(0:m, 0:n, 0:p)
             end do
-            do i = sys_size+4, sys_size+6
-                MPI_IO_DATA%var(i)%sf => stat_eff_visc(i-sys_size-2)%sf(0:m, 0:n, 0:p)
+            do i = sys_size+5, sys_size+8
+                MPI_IO_DATA%var(i)%sf => stat_eff_visc(i-sys_size-4)%sf(0:m, 0:n, 0:p)
             end do
-            do i = sys_size+7, sys_size+9 
-                MPI_IO_DATA%var(i)%sf => stat_int_mom_exch(i-sys_size-5)%sf(0:m, 0:n, 0:p)
+            do i = sys_size+9, sys_size+12 
+                MPI_IO_DATA%var(i)%sf => stat_int_mom_exch(i-sys_size-8)%sf(0:m, 0:n, 0:p)
             end do 
         end if
 
diff --git a/src/post_process/m_data_input.f90 b/src/post_process/m_data_input.f90
index 1efc1b97d3..d778eeb7fa 100644
--- a/src/post_process/m_data_input.f90
+++ b/src/post_process/m_data_input.f90
@@ -1344,7 +1344,7 @@ subroutine s_populate_filtered_variables_buffer_regions(q_particle)
                     q_particle%sf(-j, 0:n, 0:p) = &
                         q_particle%sf((m + 1) - j, 0:n, 0:p)
                 else
-                    do i = 2, 4
+                    do i = 1, 4
                         stat_reynolds_stress(i)%sf(-j, 0:n, 0:p) = &
                             stat_reynolds_stress(i)%sf((m + 1) - j, 0:n, 0:p)
                         stat_eff_visc(i)%sf(-j, 0:n, 0:p) = &
@@ -1375,7 +1375,7 @@ subroutine s_populate_filtered_variables_buffer_regions(q_particle)
                     q_particle%sf(m + j, 0:n, 0:p) = &
                         q_particle%sf(j - 1, 0:n, 0:p)
                 else
-                    do i = 2, 4
+                    do i = 1, 4
                         stat_reynolds_stress(i)%sf(m + j, 0:n, 0:p) = &
                             stat_reynolds_stress(i)%sf(j - 1, 0:n, 0:p)
                         stat_eff_visc(i)%sf(m + j, 0:n, 0:p) = &
@@ -1413,7 +1413,7 @@ subroutine s_populate_filtered_variables_buffer_regions(q_particle)
                         q_particle%sf(:, -j, 0:p) = &
                             q_particle%sf(:, (n + 1) - j, 0:p)
                     else
-                        do i = 2, 4
+                        do i = 1, 4
                             stat_reynolds_stress(i)%sf(:, -j, 0:p) = &
                                 stat_reynolds_stress(i)%sf(:, (n + 1) - j, 0:p)
                             stat_eff_visc(i)%sf(:, -j, 0:p) = &
@@ -1444,7 +1444,7 @@ subroutine s_populate_filtered_variables_buffer_regions(q_particle)
                         q_particle%sf(:, n + j, 0:p) = &
                             q_particle%sf(:, j - 1, 0:p)
                     else
-                        do i = 2, 4
+                        do i = 1, 4
                             stat_reynolds_stress(i)%sf(:, n + j, 0:p) = &
                                 stat_reynolds_stress(i)%sf(:, j - 1, 0:p)
                             stat_eff_visc(i)%sf(:, n + j, 0:p) = &
@@ -1482,7 +1482,7 @@ subroutine s_populate_filtered_variables_buffer_regions(q_particle)
                             q_particle%sf(:, :, -j) = &
                                 q_particle%sf(:, :, (p + 1) - j)
                         else
-                            do i = 2, 4
+                            do i = 1, 4
                                 stat_reynolds_stress(i)%sf(:, :, -j) = &
                                     stat_reynolds_stress(i)%sf(:, :, (p + 1) - j)
                                 stat_eff_visc(i)%sf(:, :, -j) = &
@@ -1514,7 +1514,7 @@ subroutine s_populate_filtered_variables_buffer_regions(q_particle)
                             q_particle%sf(:, :, p + j) = &
                                 q_particle%sf(:, :, j - 1)
                         else
-                            do i = 2, 4
+                            do i = 1, 4
                                 stat_reynolds_stress(i)%sf(:, :, p + j) = &
                                     stat_reynolds_stress(i)%sf(:, :, j - 1)
                                 stat_eff_visc(i)%sf(:, :, p + j) = &
@@ -1559,9 +1559,9 @@ subroutine s_initialize_data_input_module
         allocate (q_prim_vf(1:sys_size))
         if (bubbles_lagrange) allocate (q_particle(1))
 
-        if (q_filtered_wrt) allocate (stat_reynolds_stress(2:4))
-        if (q_filtered_wrt) allocate (stat_eff_visc(2:4))
-        if (q_filtered_wrt) allocate (stat_int_mom_exch(2:4))
+        if (q_filtered_wrt) allocate (stat_reynolds_stress(1:4))
+        if (q_filtered_wrt) allocate (stat_eff_visc(1:4))
+        if (q_filtered_wrt) allocate (stat_int_mom_exch(1:4))
 
         ! Allocating the parts of the conservative and primitive variables
         ! that do require the direct knowledge of the dimensionality of the
@@ -1601,7 +1601,7 @@ subroutine s_initialize_data_input_module
                 end if
 
                 if (q_filtered_wrt) then
-                    do i = 2, 4
+                    do i = 1, 4
                         allocate (stat_reynolds_stress(i)%sf(-buff_size:m + buff_size, &
                                                      -buff_size:n + buff_size, &
                                                      -buff_size:p + buff_size))
@@ -1707,15 +1707,15 @@ subroutine s_finalize_data_input_module
         end if
 
         if (q_filtered_wrt) then 
-            do i = 2, 4 
+            do i = 1, 4 
                 deallocate (stat_reynolds_stress(i)%sf)
             end do 
             deallocate(stat_reynolds_stress)
-            do i = 2, 4 
+            do i = 1, 4 
                 deallocate (stat_eff_visc(i)%sf)
             end do 
             deallocate(stat_eff_visc)
-            do i = 2, 4 
+            do i = 1, 4 
                 deallocate (stat_int_mom_exch(i)%sf)
             end do 
             deallocate(stat_int_mom_exch)
diff --git a/src/post_process/m_start_up.f90 b/src/post_process/m_start_up.f90
index b454764c3e..481181064c 100644
--- a/src/post_process/m_start_up.f90
+++ b/src/post_process/m_start_up.f90
@@ -328,21 +328,21 @@ subroutine s_save_data(t_step, varname, pres, c, H)
         ! Adding filtered quantities
         if (q_filtered_wrt) then
             ! filtered cons vars
-            do i = 2, 4
+            do i = 1, 4
                 q_sf = stat_reynolds_stress(i)%sf(x_beg:x_end, y_beg:y_end, z_beg:z_end)
                 write (varname, '(A,I0)') 'stat_reynolds_stresss', i
                 call s_write_variable_to_formatted_database_file(varname, t_step)
 
                 varname(:) = ' '
             end do
-            do i = 2, 4
+            do i = 1, 4
                 q_sf = stat_eff_visc(i)%sf(x_beg:x_end, y_beg:y_end, z_beg:z_end)
                 write (varname, '(A,I0)') 'stat_eff_viscs', i
                 call s_write_variable_to_formatted_database_file(varname, t_step)
 
                 varname(:) = ' '
             end do
-            do i = 2, 4
+            do i = 1, 4
                 q_sf = stat_int_mom_exch(i)%sf(x_beg:x_end, y_beg:y_end, z_beg:z_end)
                 write (varname, '(A,I0)') 'stat_int_mom_exchs', i
                 call s_write_variable_to_formatted_database_file(varname, t_step)
diff --git a/src/simulation/m_additional_forcing.fpp b/src/simulation/m_additional_forcing.fpp
index cc90cce4ef..c69ab97db1 100644
--- a/src/simulation/m_additional_forcing.fpp
+++ b/src/simulation/m_additional_forcing.fpp
@@ -17,29 +17,25 @@ module m_additional_forcing
  s_add_periodic_forcing, s_finalize_additional_forcing_module, & 
  s_compute_phase_average, s_compute_periodic_forcing;
 
-    real(wp), allocatable, dimension(:) :: q_bar ! 1:3 rho*u, 4 rho, 5 T
     type(scalar_field), allocatable, dimension(:) :: q_periodic_force
-    real(wp), allocatable, dimension(:) :: q_spatial_avg
-    real(wp), allocatable, dimension(:), public :: q_spatial_avg_glb ! 1:3 rho*u, 4 rho, 5 T
     real(wp) :: volfrac_phi
     integer :: N_x_total_glb
+    real(wp) :: spatial_rho, spatial_u
+    real(wp) :: phase_rho, phase_u
 
-    !$acc declare create(q_bar, q_periodic_force, q_spatial_avg, q_spatial_avg_glb, volfrac_phi, N_x_total_glb)
+    !$acc declare create(q_periodic_force, volfrac_phi, N_x_total_glb)
+    !$acc declare create(spatial_rho, spatial_u, phase_rho, phase_u)
 
 contains
 
     subroutine s_initialize_additional_forcing_module
         integer :: i
-        if (periodic_forcing) then 
-            @:ALLOCATE(q_bar(1:5))
-            @:ALLOCATE(q_periodic_force(1:8))
-            do i = 1, 8 
-                @:ALLOCATE(q_periodic_force(i)%sf(0:m, 0:n, 0:p))
-                @:ACC_SETUP_SFs(q_periodic_force(i))
-            end do
-            @:ALLOCATE(q_spatial_avg(1:5))
-            @:ALLOCATE(q_spatial_avg_glb(1:5))
-        end if
+
+        @:ALLOCATE(q_periodic_force(1:3))
+        do i = 1, 3
+            @:ALLOCATE(q_periodic_force(i)%sf(0:m, 0:n, 0:p))
+            @:ACC_SETUP_SFs(q_periodic_force(i))
+        end do
 
         volfrac_phi = num_ibs * 4._wp/3._wp * pi * patch_ib(1)%radius**3 / ((x_domain%end - x_domain%beg)*(y_domain%end - y_domain%beg)*(z_domain%end - z_domain%beg))
         !$acc update device(volfrac_phi)
@@ -57,60 +53,52 @@ contains
         do i = 0, m
             do j = 0, n
                 do k = 0, p
-                    rhs_vf(1)%sf(i, j, k) = rhs_vf(1)%sf(i, j, k) + q_periodic_force(7)%sf(i, j, k) * fluid_indicator_function%sf(i, j, k) ! continuity
-                    rhs_vf(2)%sf(i, j, k) = rhs_vf(2)%sf(i, j, k) + q_periodic_force(1)%sf(i, j, k) * fluid_indicator_function%sf(i, j, k) * fluid_indicator_function%sf(i, j, k) ! x momentum
-                    rhs_vf(5)%sf(i, j, k) = rhs_vf(5)%sf(i, j, k) + (q_periodic_force(4)%sf(i, j, k) + q_periodic_force(8)%sf(i, j, k)) * fluid_indicator_function%sf(i, j, k) ! energy
+                    rhs_vf(1)%sf(i, j, k) = rhs_vf(1)%sf(i, j, k) + q_periodic_force(1)%sf(i, j, k) * fluid_indicator_function%sf(i, j, k) ! continuity
+                    rhs_vf(2)%sf(i, j, k) = rhs_vf(2)%sf(i, j, k) + q_periodic_force(2)%sf(i, j, k) * fluid_indicator_function%sf(i, j, k) ! x momentum
+                    rhs_vf(5)%sf(i, j, k) = rhs_vf(5)%sf(i, j, k) + q_periodic_force(3)%sf(i, j, k) * fluid_indicator_function%sf(i, j, k) ! energy
                 end do
             end do
         end do
     end subroutine s_add_periodic_forcing
 
+    !< compute the space and time average of quantities
     subroutine s_compute_phase_average(q_cons_vf, t_step)
         type(scalar_field), dimension(sys_size), intent(in) :: q_cons_vf
         integer, intent(in) :: t_step
+        real(wp) :: spatial_rho_glb, spatial_u_glb
         integer :: i, j, k
 
-        !$acc loop seq
-        do i = 1, 5
-            q_spatial_avg(i) = 0._wp
-        end do
+        ! zero spatial averages
+        spatial_rho = 0._wp
+        spatial_u = 0._wp
+        !$acc update device(spatial_rho, spatial_u)
 
-        ! spatial average
-        !$acc parallel loop collapse(3) gang vector default(present) reduction(+:q_spatial_avg(:))
+        ! compute spatial averages
+        !$acc parallel loop collapse(3) gang vector default(present) reduction(+:spatial_rho, spatial_u)
         do i = 0, m 
             do j = 0, n 
                 do k = 0, p 
-                    q_spatial_avg(4) = q_spatial_avg(4) + q_cons_vf(1)%sf(i, j, k) * fluid_indicator_function%sf(i, j, k)
-                    q_spatial_avg(5) = q_spatial_avg(5) + (0.4_wp/287._wp * (q_cons_vf(5)%sf(i, j, k)/q_cons_vf(1)%sf(i, j, k) & 
-                                        - 0.5_wp * ((q_cons_vf(2)%sf(i, j, k)/q_cons_vf(1)%sf(i, j, k))**2 & 
-                                        + (q_cons_vf(3)%sf(i, j, k)/q_cons_vf(1)%sf(i, j, k))**2 & 
-                                        + (q_cons_vf(4)%sf(i, j, k)/q_cons_vf(1)%sf(i, j, k))**2))) * fluid_indicator_function%sf(i, j, k)
-                                        
-                    q_spatial_avg(1) = q_spatial_avg(1) + (q_cons_vf(2)%sf(i, j, k)) * fluid_indicator_function%sf(i, j, k)
-                    q_spatial_avg(2) = q_spatial_avg(2) + (q_cons_vf(3)%sf(i, j, k)) * fluid_indicator_function%sf(i, j, k)
-                    q_spatial_avg(3) = q_spatial_avg(3) + (q_cons_vf(4)%sf(i, j, k)) * fluid_indicator_function%sf(i, j, k)
+                    spatial_rho = spatial_rho + q_cons_vf(1)%sf(i, j, k) * fluid_indicator_function%sf(i, j, k) ! rho
+                    spatial_u = spatial_u + q_cons_vf(2)%sf(i, j, k) * fluid_indicator_function%sf(i, j, k) ! u
                 end do
             end do
         end do
 
-        !$acc update host(q_spatial_avg(:))
+        !$acc update host(spatial_rho, spatial_u)
 
-        do i = 1, 5 
-            call s_mpi_allreduce_sum(q_spatial_avg(i), q_spatial_avg_glb(i))
-        end do
+        ! reduction sum across entire domain
+        call s_mpi_allreduce_sum(spatial_rho, spatial_rho_glb)
+        call s_mpi_allreduce_sum(spatial_u, spatial_u_glb)
 
-        !$acc update device(q_spatial_avg_glb(:))
+        ! compute phase averages
+        phase_rho = phase_rho + (spatial_rho_glb / real(N_x_total_glb, wp) - phase_rho) / real(t_step, wp)
+        phase_u = phase_u + (spatial_u_glb / real(N_x_total_glb, wp) - phase_u) / real(t_step, wp)
+        !$acc update device(phase_rho, phase_u)
 
-        !$acc loop seq
-        do i = 1, 5 
-            q_spatial_avg_glb(i) = q_spatial_avg_glb(i) / real(N_x_total_glb, wp)
-        end do
+        if (proc_rank == 0) then 
+            print *, t_step, 'rho', phase_rho, 'rho*u', phase_u
+        end if
 
-        ! time average
-        !$acc loop seq
-        do i = 1, 5 
-            q_bar(i) = ( (q_spatial_avg_glb(i) + (t_step - 1._wp)*q_bar(i)) / t_step ) 
-        end do
     end subroutine s_compute_phase_average
 
     !< computes the periodic forcing terms described in Khalloufi and Capecelatro
@@ -123,21 +111,14 @@ contains
         do i = 0, m
             do j = 0, n
                 do k = 0, p
+                    ! f_rho
+                    q_periodic_force(1)%sf(i, j, k) = (rho_inf_ref - phase_rho/(1._wp - volfrac_phi)) / dt
+
                     ! f_u
-                    q_periodic_force(1)%sf(i, j, k) = (rho_inf_ref*u_inf_ref - q_bar(1)/(1._wp - volfrac_phi)) / dt
-                    q_periodic_force(2)%sf(i, j, k) = (rho_inf_ref*u_inf_ref - q_bar(2)/(1._wp - volfrac_phi)) / dt
-                    q_periodic_force(3)%sf(i, j, k) = (rho_inf_ref*u_inf_ref - q_bar(3)/(1._wp - volfrac_phi)) / dt
+                    q_periodic_force(2)%sf(i, j, k) = (rho_inf_ref*u_inf_ref - phase_u/(1._wp - volfrac_phi)) / dt
 
                     ! u*f_u
-                    q_periodic_force(4)%sf(i, j, k) = q_cons_vf(2)%sf(i, j, k)/q_cons_vf(1)%sf(i, j, k) * q_periodic_force(1)%sf(i, j, k)
-                    q_periodic_force(5)%sf(i, j, k) = q_cons_vf(3)%sf(i, j, k)/q_cons_vf(1)%sf(i, j, k) * q_periodic_force(2)%sf(i, j, k)
-                    q_periodic_force(6)%sf(i, j, k) = q_cons_vf(4)%sf(i, j, k)/q_cons_vf(1)%sf(i, j, k) * q_periodic_force(3)%sf(i, j, k)
-
-                    ! f_rho
-                    q_periodic_force(7)%sf(i, j, k) = (rho_inf_ref - q_bar(4)/(1._wp - volfrac_phi)) / dt
-
-                    ! f_T
-                    q_periodic_force(8)%sf(i, j, k) = (q_cons_vf(1)%sf(i, j, k) / 1.4_wp) * (T_inf_ref - q_bar(5)/(1._wp - volfrac_phi)) / dt
+                    q_periodic_force(3)%sf(i, j, k) = q_cons_vf(2)%sf(i, j, k)/q_cons_vf(1)%sf(i, j, k) * q_periodic_force(2)%sf(i, j, k)
                 end do 
             end do
         end do
@@ -145,15 +126,10 @@ contains
 
     subroutine s_finalize_additional_forcing_module
         integer :: i
-        if (periodic_forcing) then
-            @:DEALLOCATE(q_bar)
-            do i = 1, 8
-                @:DEALLOCATE(q_periodic_force(i)%sf)
-            end do
-            @:DEALLOCATE(q_periodic_force)
-            @:DEALLOCATE(q_spatial_avg)
-            @:DEALLOCATE(q_spatial_avg_glb)
-        end if
+        do i = 1, 3
+            @:DEALLOCATE(q_periodic_force(i)%sf)
+        end do
+        @:DEALLOCATE(q_periodic_force)
     end subroutine s_finalize_additional_forcing_module
 
 end module m_additional_forcing
\ No newline at end of file
diff --git a/src/simulation/m_compute_particle_forces.fpp b/src/simulation/m_compute_particle_forces.fpp
index 8a1ef5f092..9a2dbe8f09 100644
--- a/src/simulation/m_compute_particle_forces.fpp
+++ b/src/simulation/m_compute_particle_forces.fpp
@@ -9,6 +9,8 @@ module m_compute_particle_forces
 
     use m_mpi_proxy 
 
+    use m_volume_filtering
+
     implicit none
 
     private; public :: s_initialize_particle_forces_module, & 
@@ -21,9 +23,7 @@ module m_compute_particle_forces
 contains
     
     subroutine s_initialize_particle_forces_module
-        if (compute_CD) then
-            @:ALLOCATE(FD_calc(0:num_ibs))
-        end if
+        @:ALLOCATE(FD_calc(0:num_ibs))
 
     end subroutine s_initialize_particle_forces_module
 
@@ -63,9 +63,7 @@ contains
     end subroutine s_compute_drag_coefficient
 
     subroutine s_finalize_particle_forces_module
-        if (compute_CD) then 
-            @:DEALLOCATE(FD_calc)
-        end if
+        @:DEALLOCATE(FD_calc)
 
     end subroutine s_finalize_particle_forces_module
     
diff --git a/src/simulation/m_compute_statistics.fpp b/src/simulation/m_compute_statistics.fpp
index 93b8d6502d..9d574c72f7 100644
--- a/src/simulation/m_compute_statistics.fpp
+++ b/src/simulation/m_compute_statistics.fpp
@@ -9,6 +9,8 @@ module m_compute_statistics
 
     use m_additional_forcing
 
+    use m_nvtx
+
     implicit none
 
     private; public :: s_initialize_statistics_module, s_finalize_statistics_module, &
@@ -52,20 +54,20 @@ contains
             @:ACC_SETUP_SFs(Msn_int_mom_exch(i))
         end do
 
-        @:ALLOCATE(stat_reynolds_stress(2:4))
-        do i = 2, 4
+        @:ALLOCATE(stat_reynolds_stress(1:4))
+        do i = 1, 4
             @:ALLOCATE(stat_reynolds_stress(i)%sf(0:m, 0:n, 0:p))
             @:ACC_SETUP_SFs(stat_reynolds_stress(i))
         end do
 
-        @:ALLOCATE(stat_eff_visc(2:4))
-        do i = 2, 4
+        @:ALLOCATE(stat_eff_visc(1:4))
+        do i = 1, 4
             @:ALLOCATE(stat_eff_visc(i)%sf(0:m, 0:n, 0:p))
             @:ACC_SETUP_SFs(stat_eff_visc(i))
         end do
 
-        @:ALLOCATE(stat_int_mom_exch(2:4))
-        do i = 2, 4
+        @:ALLOCATE(stat_int_mom_exch(1:4))
+        do i = 1, 4
             @:ALLOCATE(stat_int_mom_exch(i)%sf(0:m, 0:n, 0:p))
             @:ACC_SETUP_SFs(stat_int_mom_exch(i))
         end do
@@ -125,7 +127,7 @@ contains
 
     subroutine s_compute_234_order_statistics(ns, Msn, q_stat)
         type(scalar_field), dimension(1:4), intent(in) :: Msn
-        type(scalar_field), dimension(2:4), intent(inout) :: q_stat
+        type(scalar_field), dimension(1:4), intent(inout) :: q_stat
 
         real(wp), intent(in) :: ns
         integer :: i, j, k
@@ -134,6 +136,7 @@ contains
         do i = 0, m 
             do j = 0, n 
                 do k = 0, p 
+                    q_stat(1)%sf(i, j, k) = Msn(1)%sf(i, j, k)
                     q_stat(2)%sf(i, j, k) = Msn(2)%sf(i, j, k) / (ns - 1._wp)
                     q_stat(3)%sf(i, j, k) = sqrt(ns - 1._wp) / (ns - 2._wp) * ns * Msn(3)%sf(i, j, k) / (Msn(2)%sf(i, j, k)**1.5)
                     q_stat(4)%sf(i, j, k) = (ns - 1._wp) / ((ns - 2._wp) * (ns - 3._wp)) * ((ns + 1._wp) * (ns * Msn(4)%sf(i, j, k) / (Msn(2)%sf(i, j, k)**2) - 3._wp) + 6._wp)
@@ -160,17 +163,17 @@ contains
         end do
         @:DEALLOCATE(Msn_int_mom_exch)
 
-        do i = 2, 4
+        do i = 1, 4
             @:DEALLOCATE(stat_reynolds_stress(i)%sf)
         end do
         @:DEALLOCATE(stat_reynolds_stress)
 
-        do i = 2, 4
+        do i = 1, 4
             @:DEALLOCATE(stat_eff_visc(i)%sf)
         end do
         @:DEALLOCATE(stat_eff_visc)
 
-        do i = 2, 4
+        do i = 1, 4
             @:DEALLOCATE(stat_int_mom_exch(i)%sf)
         end do
         @:DEALLOCATE(stat_int_mom_exch)
diff --git a/src/simulation/m_data_output.fpp b/src/simulation/m_data_output.fpp
index f43cebc798..fd783bef1f 100644
--- a/src/simulation/m_data_output.fpp
+++ b/src/simulation/m_data_output.fpp
@@ -94,9 +94,9 @@ contains
         type(scalar_field), &
             intent(inout), optional :: beta
 
-        type(scalar_field), dimension(2:4), intent(inout), optional :: stat_reynolds_stress
-        type(scalar_field), dimension(2:4), intent(inout), optional :: stat_eff_visc
-        type(scalar_field), dimension(2:4), intent(inout), optional :: stat_int_mom_exch
+        type(scalar_field), dimension(1:4), intent(inout), optional :: stat_reynolds_stress
+        type(scalar_field), dimension(1:4), intent(inout), optional :: stat_eff_visc
+        type(scalar_field), dimension(1:4), intent(inout), optional :: stat_int_mom_exch
 
         if (.not. parallel_io) then
             call s_write_serial_data_files(q_cons_vf, q_T_sf, q_prim_vf, t_step, beta)
@@ -796,9 +796,9 @@ contains
         type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf
         integer, intent(in) :: t_step
         type(scalar_field), intent(inout), optional :: beta
-        type(scalar_field), dimension(2:4), intent(inout), optional :: stat_reynolds_stress
-        type(scalar_field), dimension(2:4), intent(inout), optional :: stat_eff_visc
-        type(scalar_field), dimension(2:4), intent(inout), optional :: stat_int_mom_exch
+        type(scalar_field), dimension(1:4), intent(inout), optional :: stat_reynolds_stress
+        type(scalar_field), dimension(1:4), intent(inout), optional :: stat_eff_visc
+        type(scalar_field), dimension(1:4), intent(inout), optional :: stat_int_mom_exch
 
 #ifdef MFC_MPI
 
@@ -821,7 +821,7 @@ contains
         if (present(beta)) then
             alt_sys = sys_size + 1
         else if (present(stat_reynolds_stress) .and. present(stat_eff_visc) .and. present(stat_int_mom_exch)) then
-            alt_sys = sys_size + 9
+            alt_sys = sys_size + 12
         else
             alt_sys = sys_size
         end if
diff --git a/src/simulation/m_global_parameters.fpp b/src/simulation/m_global_parameters.fpp
index bcd8c74dec..db2eb1d298 100644
--- a/src/simulation/m_global_parameters.fpp
+++ b/src/simulation/m_global_parameters.fpp
@@ -502,7 +502,7 @@ module m_global_parameters
     !> @}
 
     logical :: periodic_ibs
-    logical :: compute_CD
+    logical :: compute_particle_drag
     real(wp) :: u_inf_ref !< reference freestream velocity
     real(wp) :: rho_inf_ref !< reference freestream density 
     real(wp) :: T_inf_ref !< reference freestream temperature
@@ -512,8 +512,9 @@ module m_global_parameters
     logical :: slab_domain_decomposition
     logical :: compute_autocorrelation
     integer :: t_step_stat_start
+    real(wp) :: filter_width
 
-    !$acc declare create(u_inf_ref, rho_inf_ref, T_inf_ref)
+    !$acc declare create(u_inf_ref, rho_inf_ref, T_inf_ref, filter_width)
 
 contains
 
@@ -791,7 +792,7 @@ contains
         #:endif
 
         periodic_ibs = .false.
-        compute_CD = .false.
+        compute_particle_drag = .false.
         u_inf_ref = dflt_real
         rho_inf_ref = dflt_real
         T_inf_ref = dflt_real
@@ -801,6 +802,7 @@ contains
         slab_domain_decomposition = .false.
         compute_autocorrelation = .false.
         t_step_stat_start = dflt_int
+        filter_width = dflt_real
 
     end subroutine s_assign_default_values_to_user_inputs
 
diff --git a/src/simulation/m_mpi_proxy.fpp b/src/simulation/m_mpi_proxy.fpp
index 730f5ead50..bb359a4bed 100644
--- a/src/simulation/m_mpi_proxy.fpp
+++ b/src/simulation/m_mpi_proxy.fpp
@@ -75,7 +75,7 @@ contains
             & 'bc_y%beg', 'bc_y%end', 'bc_z%beg', 'bc_z%end',  'fd_order',     &
             & 'num_probes', 'num_integrals', 'bubble_model', 'thermal',        &
             & 'R0_type', 'num_source', 'relax_model', 'num_ibs', 'n_start',    &
-            & 'num_bc_patches']
+            & 'num_bc_patches', 't_step_stat_start']
             call MPI_BCAST(${VAR}$, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr)
         #:endfor
 
@@ -92,7 +92,7 @@ contains
             & 'cfl_adap_dt', 'cfl_const_dt', 'cfl_dt', 'surface_tension',        &
             & 'viscous', 'shear_stress', 'bulk_stress', 'bubbles_lagrange',     &
             & 'hyperelasticity', 'rkck_adap_dt', 'bc_io', 'powell', 'cont_damage', &
-            & 'periodic_ibs', 'compute_CD', 'periodic_forcing', 'volume_filtering_momentum_eqn', & 
+            & 'periodic_ibs', 'compute_particle_drag', 'periodic_forcing', 'volume_filtering_momentum_eqn', & 
             & 'store_levelset', 'slab_domain_decomposition', 'compute_autocorrelation' ]
             call MPI_BCAST(${VAR}$, 1, MPI_LOGICAL, 0, MPI_COMM_WORLD, ierr)
         #:endfor
@@ -133,7 +133,7 @@ contains
             & 'z_domain%beg', 'z_domain%end', 'x_a', 'x_b', 'y_a', 'y_b', 'z_a', &
             & 'z_b', 't_stop', 't_save', 'cfl_target', 'rkck_tolerance', 'Bx0',  &
             & 'tau_star', 'cont_damage_s', 'alpha_bar', 'u_inf_ref',  & 
-            & 'rho_inf_ref', 'T_inf_ref', 't_step_stat_start' ]
+            & 'rho_inf_ref', 'T_inf_ref', 'filter_width' ]
             call MPI_BCAST(${VAR}$, 1, mpi_p, 0, MPI_COMM_WORLD, ierr)
         #:endfor
 
diff --git a/src/simulation/m_start_up.fpp b/src/simulation/m_start_up.fpp
index 5ac4e4dad6..c08a2ce7eb 100644
--- a/src/simulation/m_start_up.fpp
+++ b/src/simulation/m_start_up.fpp
@@ -189,9 +189,10 @@ contains
             rkck_adap_dt, rkck_tolerance, &
             hyperelasticity, R0ref, num_bc_patches, Bx0, powell, &
             cont_damage, tau_star, cont_damage_s, alpha_bar, & 
-            periodic_ibs, compute_CD, u_inf_ref, rho_inf_ref, T_inf_ref, & 
+            periodic_ibs, compute_particle_drag, u_inf_ref, rho_inf_ref, T_inf_ref, & 
             periodic_forcing, volume_filtering_momentum_eqn, store_levelset, & 
-            slab_domain_decomposition, compute_autocorrelation, t_step_stat_start
+            slab_domain_decomposition, compute_autocorrelation, t_step_stat_start, & 
+            filter_width
 
         ! Checking that an input file has been provided by the user. If it
         ! has, then the input file is read in, otherwise, simulation exits.
@@ -1320,6 +1321,42 @@ contains
 
         call s_compute_derived_variables(t_step)
 
+        ! ! Volume filter flow variables, compute unclosed terms and their statistics
+        ! if (volume_filtering_momentum_eqn) then 
+        !     if (t_step > t_step_stat_start) then  
+        !         call nvtxStartRange('VOLUME-FILTER-MOMENTUM-EQUATION')  
+        !         call s_volume_filter_momentum_eqn(q_cons_ts(1)%vf)
+        !         call nvtxEndRange
+
+        !         call nvtxStartRange('COMPUTE-STATISTICS')
+        !         call s_compute_statistics_momentum_unclosed_terms(t_step - t_step_stat_start, mag_reynolds_stress, mag_eff_visc, mag_int_mom_exch)
+        !         call nvtxEndRange
+
+        !         ! write(100, *) mag_reynolds_stress%sf(10, 10, 10)
+        !         ! write(101, *) stat_reynolds_stress(2)%sf(10, 10, 10), stat_reynolds_stress(3)%sf(10, 10, 10), stat_reynolds_stress(4)%sf(10, 10, 10)
+        !     end if
+
+        !     ! TEMPORARY, for v+v
+        !     ! if (t_step == 1) then 
+        !     !     open(unit=100, file='dat_reynolds_stress.txt', status='replace', action='write')
+        !     !     open(unit=101, file='stat_reynolds_stress.txt', status='replace', action='write')
+        !     ! end if
+        !     ! if (t_step == 999) then 
+        !     !     close(100)
+        !     !     close(101)
+        !     ! end if
+
+        !     call nvtxStartRange("COMPUTE-PARTICLE-FORCES")
+        !     call s_compute_particle_forces()
+        !     call nvtxEndRange
+        ! end if
+
+        ! if (periodic_forcing) then 
+        !     call nvtxStartRange("COMPUTE-PERIODIC-FORCING")
+        !     call s_compute_phase_average(q_cons_ts(1)%vf, t_step+1)
+        !     call s_compute_periodic_forcing(q_cons_ts(1)%vf)
+        !     call nvtxEndRange
+        ! end if
 
 #ifdef DEBUG
         print *, 'Computed derived vars'
@@ -1343,34 +1380,6 @@ contains
 
         if (relax) call s_infinite_relaxation_k(q_cons_ts(1)%vf)
 
-        ! Volume filter flow variables, compute unclosed terms and their statistics
-        if (volume_filtering_momentum_eqn) then 
-            call s_volume_filter_momentum_eqn(q_cons_ts(1)%vf)
-
-            if (t_step > t_step_stat_start) then    
-                call s_compute_statistics_momentum_unclosed_terms(t_step - t_step_stat_start, mag_reynolds_stress, mag_eff_visc, mag_int_mom_exch)
-
-                ! write(100, *) mag_reynolds_stress%sf(10, 10, 10)
-                ! write(101, *) stat_reynolds_stress(2)%sf(10, 10, 10), stat_reynolds_stress(3)%sf(10, 10, 10), stat_reynolds_stress(4)%sf(10, 10, 10)
-            end if
-
-            ! TEMPORARY, for v+v
-            ! if (t_step == 1) then 
-            !     open(unit=100, file='dat_reynolds_stress.txt', status='replace', action='write')
-            !     open(unit=101, file='stat_reynolds_stress.txt', status='replace', action='write')
-            ! end if
-            ! if (t_step == 999) then 
-            !     close(100)
-            !     close(101)
-            ! end if
-
-        end if
-
-        if (periodic_forcing) then 
-            call s_compute_phase_average(q_cons_ts(1)%vf, t_step+1)
-            call s_compute_periodic_forcing(q_cons_ts(1)%vf)
-        end if
-
         ! Time-stepping loop controls
 
         t_step = t_step + 1
@@ -1450,7 +1459,7 @@ contains
 
         call cpu_time(start)
         call nvtxStartRange("SAVE-DATA")
-        do i = 2, 4 
+        do i = 1, 4 
             !$acc update host(stat_reynolds_stress(i)%sf)
             !$acc update host(stat_eff_visc(i)%sf)
             !$acc update host(stat_int_mom_exch(i)%sf)
@@ -1607,7 +1616,7 @@ contains
 
         if (mhd .and. powell) call s_initialize_mhd_powell_module
 
-        call s_initialize_particle_forces_module()
+        if (compute_particle_drag) call s_initialize_particle_forces_module()
         if (periodic_forcing) call s_initialize_additional_forcing_module()
         if (volume_filtering_momentum_eqn) then 
             call s_initialize_fftw_explicit_filter_module()
@@ -1726,7 +1735,7 @@ contains
             !$acc update device(ib_markers%sf)
         end if
 
-        !$acc update device(u_inf_ref, rho_inf_ref, T_inf_ref)
+        !$acc update device(u_inf_ref, rho_inf_ref, T_inf_ref, filter_width)
 
     end subroutine s_initialize_gpu_vars
 
@@ -1756,8 +1765,8 @@ contains
         if (bodyForces) call s_finalize_body_forces_module()
         if (mhd .and. powell) call s_finalize_mhd_powell_module
 
-        call s_finalize_particle_forces_module()
-        call s_finalize_additional_forcing_module()
+        if (compute_particle_drag) call s_finalize_particle_forces_module()
+        if (periodic_forcing) call s_finalize_additional_forcing_module()
         if (volume_filtering_momentum_eqn) call s_finalize_fftw_explicit_filter_module
 
         ! Terminating MPI execution environment
diff --git a/src/simulation/m_volume_filtering.fpp b/src/simulation/m_volume_filtering.fpp
index fa44071328..c954f6eac6 100644
--- a/src/simulation/m_volume_filtering.fpp
+++ b/src/simulation/m_volume_filtering.fpp
@@ -30,7 +30,7 @@ module m_volume_filtering
  s_initialize_filtering_kernel, s_initialize_fluid_indicator_function, & 
  s_initialize_filtered_fluid_indicator_function, s_finalize_fftw_explicit_filter_module, & 
  s_apply_fftw_filter_cons, s_volume_filter_momentum_eqn, s_apply_fftw_filter_tensor, s_apply_fftw_filter_scalarfield, &
- s_compute_viscous_stress_tensor, s_compute_stress_tensor, s_compute_divergence_stress_tensor, &
+ s_compute_viscous_stress_tensor, s_compute_stress_tensor, s_compute_divergence_stress_tensor, s_compute_particle_forces, &
  s_mpi_transpose_slabZ2Y, s_mpi_transpose_slabY2Z, s_mpi_FFT_fwd, s_mpi_FFT_bwd, &
  s_setup_terms_filtering, s_compute_pseudo_turbulent_reynolds_stress, s_compute_effective_viscosity, s_compute_interphase_momentum_exchange
 
@@ -55,21 +55,25 @@ module m_volume_filtering
     type(scalar_field), allocatable, dimension(:) :: div_pres_visc_stress
     
     ! unclosed terms in volume filtered momentum equation
-    type(vector_field), allocatable, dimension(:) :: reynolds_stress
-    type(vector_field), allocatable, dimension(:) :: eff_visc
-    type(scalar_field), allocatable, dimension(:) :: int_mom_exch
+    type(vector_field), allocatable, dimension(:), public :: reynolds_stress
+    type(vector_field), allocatable, dimension(:), public :: eff_visc
+    type(scalar_field), allocatable, dimension(:), public :: int_mom_exch
 
     ! magnitude of unclosed terms in momentum equation
     type(scalar_field), public :: mag_reynolds_stress
     type(scalar_field), public :: mag_eff_visc
     type(scalar_field), public :: mag_int_mom_exch
 
+    ! 1/mu
     real(wp), allocatable, dimension(:, :) :: Res
 
+    ! x-,y-,z-direction forces on particles
+    real(wp), allocatable, dimension(:, :) :: particle_forces
+
     !$acc declare create(fluid_indicator_function, filtered_fluid_indicator_function, q_cons_filtered)
     !$acc declare create(visc_stress, pres_visc_stress, div_pres_visc_stress)
     !$acc declare create(reynolds_stress, eff_visc, int_mom_exch, mag_reynolds_stress, mag_eff_visc, mag_int_mom_exch)
-    !$acc declare create(Res)
+    !$acc declare create(Res, particle_forces)
 
 #if defined(MFC_OpenACC)
     ! GPU plans
@@ -213,9 +217,11 @@ contains
                     Res(i, j) = fluid_pp(Re_idx(i, j))%Re(i)
                 end do
             end do
-            !$acc update device(Res, Re_idx, Re_size)
+            !$acc update device(Res)
         end if
 
+        @:ALLOCATE(particle_forces(0:num_ibs, 3))
+
         !< global sizes 
         Nx = m_glb + 1
         Ny = n_glb + 1
@@ -339,7 +345,7 @@ contains
         integer :: i, j, k, idx
 
         ! gaussian filter
-        sigma_stddev = 3.0_dp * 0.05_dp
+        sigma_stddev = filter_width
 
         Lx = x_domain_end_glb - x_domain_beg_glb
         Ly = y_domain_end_glb - y_domain_beg_glb  
@@ -522,17 +528,13 @@ contains
         type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf
         integer :: i, j, k
 
-        call nvtxStartRange("FILTER-CONSERVATIVE-VARIABLES")
         call s_apply_fftw_filter_cons(q_cons_vf, q_cons_filtered)
-        call nvtxEndRange
 
-        call nvtxStartRange("COMPUTE-MOMENTUM-UNCLOSED-TERMS")
         call s_setup_terms_filtering(q_cons_vf, reynolds_stress, visc_stress, pres_visc_stress, div_pres_visc_stress)
         call s_apply_fftw_filter_tensor(reynolds_stress, visc_stress, eff_visc, div_pres_visc_stress, int_mom_exch)
         call s_compute_pseudo_turbulent_reynolds_stress(q_cons_filtered, reynolds_stress, mag_reynolds_stress)
         call s_compute_effective_viscosity(q_cons_filtered, eff_visc, visc_stress, mag_eff_visc)
         call s_compute_interphase_momentum_exchange(int_mom_exch, mag_int_mom_exch)
-        call nvtxEndRange
 
     end subroutine s_volume_filter_momentum_eqn
 
@@ -993,7 +995,7 @@ contains
         type(scalar_field), dimension(1:num_dims), intent(in) :: int_mom_exch
         type(scalar_field), intent(inout) :: mag_int_mom_exch
 
-        integer :: i, j, k, l, q, ii
+        integer :: i, j, k
 
         !$acc parallel loop collapse(3) gang vector default(present)
         do i = 0, m
@@ -1008,6 +1010,42 @@ contains
 
     end subroutine s_compute_interphase_momentum_exchange
 
+    !< computes the x-, y-, and z-direction forces on the particles: every local cell adds
+    !! div_pres_visc_stress times its volume to the row of particle_forces selected by
+    !! the cell's ib_markers value, so row 0 collects the cells whose marker is 0
+    !! NOTE(review): only this rank's cells (0:m, 0:n, 0:p) are summed and no MPI
+    !! reduction is done here; confirm whether the caller needs a global sum
+    subroutine s_compute_particle_forces
+        real(wp) :: dvol
+        integer :: i, j, k, l
+
+        ! clear the accumulators first: the allocation does not zero the array, so without
+        ! this the sums start from undefined values and carry over between calls
+        !$acc parallel loop collapse(2) gang vector default(present)
+        do l = 1, 3
+            do i = 0, num_ibs
+                particle_forces(i, l) = 0._wp
+            end do
+        end do
+
+        !$acc parallel loop collapse(3) gang vector default(present) private(dvol)
+        do i = 0, m
+            do j = 0, n
+                do k = 0, p
+                    dvol = dx(i) * dy(j) * dz(k)
+                    ! many cells map to the same row, hence the atomic updates
+                    !$acc atomic
+                    particle_forces(ib_markers%sf(i, j, k), 1) = particle_forces(ib_markers%sf(i, j, k), 1) + div_pres_visc_stress(1)%sf(i, j, k) * dvol
+                    !$acc atomic
+                    particle_forces(ib_markers%sf(i, j, k), 2) = particle_forces(ib_markers%sf(i, j, k), 2) + div_pres_visc_stress(2)%sf(i, j, k) * dvol
+                    !$acc atomic
+                    particle_forces(ib_markers%sf(i, j, k), 3) = particle_forces(ib_markers%sf(i, j, k), 3) + div_pres_visc_stress(3)%sf(i, j, k) * dvol
+                end do
+            end do
+        end do
+
+    end subroutine s_compute_particle_forces
+
 
     !< transpose domain from z-slabs to y-slabs on each processor
     subroutine s_mpi_transpose_slabZ2Y
@@ -1281,6 +1305,9 @@ contains
         @:DEALLOCATE(mag_eff_visc%sf)
         @:DEALLOCATE(mag_int_mom_exch%sf)
 
+        @:DEALLOCATE(Res)
+        @:DEALLOCATE(particle_forces)
+
         @:DEALLOCATE(data_real_in1d, data_cmplx_out1d, data_cmplx_out1dy)
         @:DEALLOCATE(cmplx_kernelG1d, real_kernelG_in)
         @:DEALLOCATE(data_real_3D_slabz, data_cmplx_slabz, data_cmplx_slaby)
diff --git a/toolchain/mfc/run/case_dicts.py b/toolchain/mfc/run/case_dicts.py
index 2b46a4cb05..f9bedd37af 100644
--- a/toolchain/mfc/run/case_dicts.py
+++ b/toolchain/mfc/run/case_dicts.py
@@ -300,7 +300,7 @@ def analytic(self):
     'tau_star': ParamType.REAL,
     'cont_damage_s': ParamType.REAL,
     'alpha_bar': ParamType.REAL,
-    'compute_CD': ParamType.LOG,
+    'compute_particle_drag': ParamType.LOG,
     'u_inf_ref': ParamType.REAL,
     'rho_inf_ref': ParamType.REAL,
     'T_inf_ref': ParamType.REAL,
@@ -308,6 +308,7 @@ def analytic(self):
     'volume_filtering_momentum_eqn': ParamType.LOG,
     'compute_autocorrelation': ParamType.LOG,
     't_step_stat_start': ParamType.INT,
+    'filter_width': ParamType.REAL,
 })
 
 for var in [ 'heatTransfer_model', 'massTransfer_model', 'pressure_corrector',
diff --git a/toolchain/templates/delta.mako b/toolchain/templates/delta.mako
index 694f22c457..52246fd334 100644
--- a/toolchain/templates/delta.mako
+++ b/toolchain/templates/delta.mako
@@ -16,7 +16,7 @@
 % endif
 % if gpu:
 #SBATCH --gpus-per-node=${tasks_per_node}
-#SBATCH --mem=208G
+#SBATCH --mem=240G
 #SBATCH --gpu-bind=closest
 % endif
 #SBATCH --output="${name}.out"

From 40efc90ccc2cf33e95600844784863782792e38a Mon Sep 17 00:00:00 2001
From: conradd3 <conradd3@illinois.edu>
Date: Wed, 27 Aug 2025 12:56:36 -0500
Subject: [PATCH 08/30] mpi data output bug for filtered q

---
 src/post_process/m_global_parameters.fpp | 8 ++++----
 src/simulation/m_global_parameters.fpp   | 8 ++++----
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/post_process/m_global_parameters.fpp b/src/post_process/m_global_parameters.fpp
index ec6a3ca3f9..37b396a583 100644
--- a/src/post_process/m_global_parameters.fpp
+++ b/src/post_process/m_global_parameters.fpp
@@ -781,9 +781,9 @@ contains
                 MPI_IO_DATA%var(i)%sf => null()
             end do
         else if (q_filtered_wrt) then
-            allocate (MPI_IO_DATA%view(1:sys_size+9))
-            allocate (MPI_IO_DATA%var(1:sys_size+9))
-            do i = 1, sys_size+9
+            allocate (MPI_IO_DATA%view(1:sys_size+12))
+            allocate (MPI_IO_DATA%var(1:sys_size+12))
+            do i = 1, sys_size+12
                 allocate (MPI_IO_DATA%var(i)%sf(0:m, 0:n, 0:p))
                 MPI_IO_DATA%var(i)%sf => null()
             end do
@@ -974,7 +974,7 @@ contains
             if (bubbles_lagrange) MPI_IO_DATA%var(sys_size + 1)%sf => null()
 
             if (q_filtered_wrt) then 
-                do i = sys_size+1, sys_size+9
+                do i = sys_size+1, sys_size+12
                     MPI_IO_DATA%var(i)%sf => null()
                 end do
             end if
diff --git a/src/simulation/m_global_parameters.fpp b/src/simulation/m_global_parameters.fpp
index db2eb1d298..6efe39d5d9 100644
--- a/src/simulation/m_global_parameters.fpp
+++ b/src/simulation/m_global_parameters.fpp
@@ -1158,8 +1158,8 @@ contains
             allocate (MPI_IO_DATA%view(1:sys_size + 1))
             allocate (MPI_IO_DATA%var(1:sys_size + 1))
         else if (volume_filtering_momentum_eqn) then 
-            allocate (MPI_IO_DATA%view(1:sys_size+9))
-            allocate (MPI_IO_DATA%var(1:sys_size+9))
+            allocate (MPI_IO_DATA%view(1:sys_size+12))
+            allocate (MPI_IO_DATA%var(1:sys_size+12))
         else
             allocate (MPI_IO_DATA%view(1:sys_size))
             allocate (MPI_IO_DATA%var(1:sys_size))
@@ -1180,7 +1180,7 @@ contains
                 MPI_IO_DATA%var(i)%sf => null()
             end do
         else if (volume_filtering_momentum_eqn) then 
-            do i = sys_size+1, sys_size+9
+            do i = sys_size+1, sys_size+12
                 allocate (MPI_IO_DATA%var(i)%sf(0:m, 0:n, 0:p))
                 MPI_IO_DATA%var(i)%sf => null()
             end do
@@ -1357,7 +1357,7 @@ contains
                     MPI_IO_DATA%var(i)%sf => null()
                 end do
             else if (volume_filtering_momentum_eqn) then 
-                do i = 1, sys_size+9
+                do i = 1, sys_size+12
                     MPI_IO_DATA%var(i)%sf => null()
                 end do
             else

From 30fb0e7406ffdb8e12035738c193fc75e8d9716b Mon Sep 17 00:00:00 2001
From: Conrad Delgado <conradd3@dt-login03.delta.ncsa.illinois.edu>
Date: Tue, 2 Sep 2025 14:28:36 -0500
Subject: [PATCH 09/30] gpu profiling blocks

---
 src/simulation/m_additional_forcing.fpp |  6 +--
 src/simulation/m_start_up.fpp           | 72 ++++++++++++-------------
 src/simulation/m_volume_filtering.fpp   | 29 ++++++++++
 3 files changed, 68 insertions(+), 39 deletions(-)

diff --git a/src/simulation/m_additional_forcing.fpp b/src/simulation/m_additional_forcing.fpp
index c69ab97db1..ae1d028330 100644
--- a/src/simulation/m_additional_forcing.fpp
+++ b/src/simulation/m_additional_forcing.fpp
@@ -95,9 +95,9 @@ contains
         phase_u = phase_u + (spatial_u_glb / real(N_x_total_glb, wp) - phase_u) / real(t_step, wp)
         !$acc update device(phase_rho, phase_u)
 
-        if (proc_rank == 0) then 
-            print *, t_step, 'rho', phase_rho, 'rho*u', phase_u
-        end if
+        ! if (proc_rank == 0) then 
+        !     print *, t_step, 'rho', phase_rho, 'rho*u', phase_u
+        ! end if
 
     end subroutine s_compute_phase_average
 
diff --git a/src/simulation/m_start_up.fpp b/src/simulation/m_start_up.fpp
index c08a2ce7eb..453632807a 100644
--- a/src/simulation/m_start_up.fpp
+++ b/src/simulation/m_start_up.fpp
@@ -1321,42 +1321,42 @@ contains
 
         call s_compute_derived_variables(t_step)
 
-        ! ! Volume filter flow variables, compute unclosed terms and their statistics
-        ! if (volume_filtering_momentum_eqn) then 
-        !     if (t_step > t_step_stat_start) then  
-        !         call nvtxStartRange('VOLUME-FILTER-MOMENTUM-EQUATION')  
-        !         call s_volume_filter_momentum_eqn(q_cons_ts(1)%vf)
-        !         call nvtxEndRange
-
-        !         call nvtxStartRange('COMPUTE-STATISTICS')
-        !         call s_compute_statistics_momentum_unclosed_terms(t_step - t_step_stat_start, mag_reynolds_stress, mag_eff_visc, mag_int_mom_exch)
-        !         call nvtxEndRange
-
-        !         ! write(100, *) mag_reynolds_stress%sf(10, 10, 10)
-        !         ! write(101, *) stat_reynolds_stress(2)%sf(10, 10, 10), stat_reynolds_stress(3)%sf(10, 10, 10), stat_reynolds_stress(4)%sf(10, 10, 10)
-        !     end if
-
-        !     ! TEMPORARY, for v+v
-        !     ! if (t_step == 1) then 
-        !     !     open(unit=100, file='dat_reynolds_stress.txt', status='replace', action='write')
-        !     !     open(unit=101, file='stat_reynolds_stress.txt', status='replace', action='write')
-        !     ! end if
-        !     ! if (t_step == 999) then 
-        !     !     close(100)
-        !     !     close(101)
-        !     ! end if
-
-        !     call nvtxStartRange("COMPUTE-PARTICLE-FORCES")
-        !     call s_compute_particle_forces()
-        !     call nvtxEndRange
-        ! end if
-
-        ! if (periodic_forcing) then 
-        !     call nvtxStartRange("COMPUTE-PERIODIC-FORCING")
-        !     call s_compute_phase_average(q_cons_ts(1)%vf, t_step+1)
-        !     call s_compute_periodic_forcing(q_cons_ts(1)%vf)
-        !     call nvtxEndRange
-        ! end if
+        ! Volume filter flow variables, compute unclosed terms and their statistics
+        if (volume_filtering_momentum_eqn) then 
+            if (t_step > t_step_stat_start) then  
+                call nvtxStartRange("VOLUME-FILTER-MOMENTUM-EQUATION")  
+                call s_volume_filter_momentum_eqn(q_cons_ts(1)%vf)
+                call nvtxEndRange
+
+                call nvtxStartRange("COMPUTE-STATISTICS")
+                call s_compute_statistics_momentum_unclosed_terms(t_step - t_step_stat_start, mag_reynolds_stress, mag_eff_visc, mag_int_mom_exch)
+                call nvtxEndRange
+
+                ! write(100, *) mag_reynolds_stress%sf(10, 10, 10)
+                ! write(101, *) stat_reynolds_stress(2)%sf(10, 10, 10), stat_reynolds_stress(3)%sf(10, 10, 10), stat_reynolds_stress(4)%sf(10, 10, 10)
+            end if
+
+            ! TEMPORARY, for v+v
+            ! if (t_step == 1) then 
+            !     open(unit=100, file='dat_reynolds_stress.txt', status='replace', action='write')
+            !     open(unit=101, file='stat_reynolds_stress.txt', status='replace', action='write')
+            ! end if
+            ! if (t_step == 999) then 
+            !     close(100)
+            !     close(101)
+            ! end if
+
+            call nvtxStartRange("COMPUTE-PARTICLE-FORCES")
+            call s_compute_particle_forces()
+            call nvtxEndRange
+        end if
+
+        if (periodic_forcing) then 
+            call nvtxStartRange("COMPUTE-PERIODIC-FORCING")
+            call s_compute_phase_average(q_cons_ts(1)%vf, t_step+1)
+            call s_compute_periodic_forcing(q_cons_ts(1)%vf)
+            call nvtxEndRange
+        end if
 
 #ifdef DEBUG
         print *, 'Computed derived vars'
diff --git a/src/simulation/m_volume_filtering.fpp b/src/simulation/m_volume_filtering.fpp
index c954f6eac6..a503412162 100644
--- a/src/simulation/m_volume_filtering.fpp
+++ b/src/simulation/m_volume_filtering.fpp
@@ -333,6 +333,10 @@ contains
                                                 cmplx_kernelG1d, onembed, 1, Nz, & 
                                                 FFTW_FORWARD, FFTW_MEASURE)
 #endif
+
+        ! file for particle forces; only rank 0 writes it, so only rank 0 may create/replace it
+        if (proc_rank == 0) open(unit=100, file='particle_force.bin', status='replace', form='unformatted', access='stream')
+
     end subroutine s_initialize_fftw_explicit_filter_module
 
     !< initialize the gaussian filtering kernel in real space and then compute its DFT
@@ -528,13 +532,23 @@ contains
         type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf
         integer :: i, j, k
 
+        call nvtxStartRange("FILTER-CONS-VARS")
         call s_apply_fftw_filter_cons(q_cons_vf, q_cons_filtered)
+        call nvtxEndRange
 
+        call nvtxStartRange("UNCLOSED-TERM-SETUP")
         call s_setup_terms_filtering(q_cons_vf, reynolds_stress, visc_stress, pres_visc_stress, div_pres_visc_stress)
+        call nvtxEndRange
+
+        call nvtxStartRange("FILTER-UNCLOSED-TERM-VARS")
         call s_apply_fftw_filter_tensor(reynolds_stress, visc_stress, eff_visc, div_pres_visc_stress, int_mom_exch)
+        call nvtxEndRange
+
+        call nvtxStartRange("COMPUTE-UNCLOSED-TERMS")
         call s_compute_pseudo_turbulent_reynolds_stress(q_cons_filtered, reynolds_stress, mag_reynolds_stress)
         call s_compute_effective_viscosity(q_cons_filtered, eff_visc, visc_stress, mag_eff_visc)
         call s_compute_interphase_momentum_exchange(int_mom_exch, mag_int_mom_exch)
+        call nvtxEndRange
 
     end subroutine s_volume_filter_momentum_eqn
 
@@ -1012,6 +1026,7 @@ contains
 
     ! computes x-,y-,z-direction forces on particles
     subroutine s_compute_particle_forces
+        real(wp), dimension(num_ibs, 3) :: force_glb
         real(wp) :: dvol
         integer :: i, j, k, l
 
@@ -1030,6 +1045,18 @@ contains
             end do 
         end do
 
+        ! reduce particle forces across processors
+        do i = 1, num_ibs
+            call s_mpi_allreduce_sum(particle_forces(i, 1), force_glb(i, 1))
+            call s_mpi_allreduce_sum(particle_forces(i, 2), force_glb(i, 2))
+            call s_mpi_allreduce_sum(particle_forces(i, 3), force_glb(i, 3))
+        end do
+
+        ! write particle forces to file
+        if (proc_rank == 0) then
+            write(100) force_glb
+        end if
+            
     end subroutine s_compute_particle_forces
 
 
@@ -1329,6 +1356,8 @@ contains
         call fftw_destroy_plan(plan_z_c2c_kernelG)
 #endif
 
+        close(100)
+
     end subroutine s_finalize_fftw_explicit_filter_module
 
 end module m_volume_filtering
\ No newline at end of file

From 0a35fda9720cb83ec031f97c7fa9a0cc0b33a6cd Mon Sep 17 00:00:00 2001
From: conradd3 <conradd3@illinois.edu>
Date: Tue, 2 Sep 2025 17:56:00 -0500
Subject: [PATCH 10/30] full tensor stats and data output

---
 runs/phi01/case.py                       |   7 +-
 src/common/m_mpi_common.fpp              |  39 ++-
 src/post_process/m_data_input.f90        | 331 ++++++-----------------
 src/post_process/m_global_parameters.fpp |   8 +-
 src/post_process/m_start_up.f90          |  57 ++--
 src/simulation/m_compute_statistics.fpp  | 214 +++++++++++----
 src/simulation/m_data_output.fpp         |  33 ++-
 src/simulation/m_global_parameters.fpp   |  10 +-
 src/simulation/m_mpi_proxy.fpp           |   3 +-
 src/simulation/m_start_up.fpp            |  34 ++-
 src/simulation/m_volume_filtering.fpp    | 154 +----------
 toolchain/mfc/run/case_dicts.py          |   1 +
 12 files changed, 373 insertions(+), 518 deletions(-)

diff --git a/runs/phi01/case.py b/runs/phi01/case.py
index 3034d52b6d..e47086a47e 100644
--- a/runs/phi01/case.py
+++ b/runs/phi01/case.py
@@ -34,8 +34,9 @@
 #print('Kn = ' + str( np.sqrt(np.pi*gam_a/2)*(M/Re) )) # Kn < 0.01 = continuum flow
 
 dt = 4.0E-06
-Nt = 200
-t_save = 10
+Nt = 20
+t_save = 1
+t_step_start_stats = 10
 
 Nx = 99
 Ny = 99
@@ -79,7 +80,7 @@
     "t_step_start": 0,
     "t_step_stop": Nt,  # 3000
     "t_step_save": t_save,  # 10
-    "t_step_stat_start": 50,
+    "t_step_stat_start": t_step_start_stats,
     # Simulation Algorithm Parameters
     # Only one patches are necessary, the air tube
     "num_patches": 1,
diff --git a/src/common/m_mpi_common.fpp b/src/common/m_mpi_common.fpp
index c352412c75..4eca64e8ad 100644
--- a/src/common/m_mpi_common.fpp
+++ b/src/common/m_mpi_common.fpp
@@ -153,7 +153,8 @@ contains
     !! @param levelset closest distance from every cell to the IB
     !! @param levelset_norm normalized vector from every cell to the closest point to the IB
     !! @param beta Eulerian void fraction from lagrangian bubbles
-    subroutine s_initialize_mpi_data(q_cons_vf, ib_markers, levelset, levelset_norm, beta, stat_reynolds_stress, stat_eff_visc, stat_int_mom_exch)
+    subroutine s_initialize_mpi_data(q_cons_vf, ib_markers, levelset, levelset_norm, beta, filtered_fluid_indicator_function, &
+                                     stat_reynolds_stress, stat_eff_visc, stat_int_mom_exch, stat_q_cons_filtered)
 
         type(scalar_field), &
             dimension(sys_size), &
@@ -174,9 +175,11 @@ contains
         type(scalar_field), &
             intent(in), optional :: beta
 
-        type(scalar_field), dimension(1:4), intent(in), optional :: stat_reynolds_stress
-        type(scalar_field), dimension(1:4), intent(in), optional :: stat_eff_visc
-        type(scalar_field), dimension(1:4), intent(in), optional :: stat_int_mom_exch
+        type(scalar_field), intent(in), optional :: filtered_fluid_indicator_function
+        type(vector_field), dimension(1:9), intent(in), optional :: stat_reynolds_stress
+        type(vector_field), dimension(1:9), intent(in), optional :: stat_eff_visc
+        type(vector_field), dimension(1:3), intent(in), optional :: stat_int_mom_exch
+        type(vector_field), dimension(1:sys_size), intent(in), optional :: stat_q_cons_filtered
 
         integer, dimension(num_dims) :: sizes_glb, sizes_loc
         integer, dimension(1) :: airfoil_glb, airfoil_loc, airfoil_start
@@ -192,7 +195,7 @@ contains
         if (present(beta)) then
             alt_sys = sys_size + 1
         else if (present(stat_reynolds_stress) .and. present(stat_eff_visc) .and. present(stat_int_mom_exch)) then
-            alt_sys = sys_size + 12
+            alt_sys = sys_size + 1 + 9*4 + 9*4 + 3*4 + 4*sys_size ! indicator + 4 stats each of: 9 reynolds stress, 9 eff. visc., 3 mom. exch., sys_size filtered cons. vars
         else
             alt_sys = sys_size
         end if
@@ -202,15 +205,27 @@ contains
         end do
         
         if (present(stat_reynolds_stress) .and. present(stat_eff_visc) .and. present(stat_int_mom_exch)) then 
-            do i = sys_size+1, sys_size+4
-                MPI_IO_DATA%var(i)%sf => stat_reynolds_stress(i-sys_size)%sf(0:m, 0:n, 0:p)
+            MPI_IO_DATA%var(sys_size+1)%sf => filtered_fluid_indicator_function%sf(0:m, 0:n, 0:p)
+            do i = 1, 9
+                do j = 1, 4
+                    MPI_IO_DATA%var(sys_size+1+(i-1)*4+j)%sf => stat_reynolds_stress(i)%vf(j)%sf(0:m, 0:n, 0:p)
+                end do
+            end do
+            do i = 1, 9
+                do j = 1, 4
+                    MPI_IO_DATA%var(sys_size+37+(i-1)*4+j)%sf => stat_eff_visc(i)%vf(j)%sf(0:m, 0:n, 0:p)
+                end do
             end do
-            do i = sys_size+5, sys_size+8
-                MPI_IO_DATA%var(i)%sf => stat_eff_visc(i-sys_size-4)%sf(0:m, 0:n, 0:p)
+            do i = 1, 3
+                do j = 1, 4
+                    MPI_IO_DATA%var(sys_size+73+(i-1)*4+j)%sf => stat_int_mom_exch(i)%vf(j)%sf(0:m, 0:n, 0:p)
+                end do
+            end do
+            do i = 1, sys_size
+                do j = 1, 4
+                    MPI_IO_DATA%var(sys_size+85+(i-1)*4+j)%sf => stat_q_cons_filtered(i)%vf(j)%sf(0:m, 0:n, 0:p)
+                end do
             end do
-            do i = sys_size+9, sys_size+12 
-                MPI_IO_DATA%var(i)%sf => stat_int_mom_exch(i-sys_size-8)%sf(0:m, 0:n, 0:p)
-            end do 
         end if
 
         if (present(beta)) then
diff --git a/src/post_process/m_data_input.f90 b/src/post_process/m_data_input.f90
index d778eeb7fa..7ac7a502dc 100644
--- a/src/post_process/m_data_input.f90
+++ b/src/post_process/m_data_input.f90
@@ -29,7 +29,6 @@ module m_data_input
  s_read_parallel_data_files, &
  s_populate_grid_variables_buffer_regions, &
  s_populate_conservative_variables_buffer_regions, &
- s_populate_filtered_variables_buffer_regions, &
  s_finalize_data_input_module
 
     abstract interface
@@ -61,9 +60,11 @@ end subroutine s_read_abstract_data_files
     ! type(scalar_field), public :: ib_markers !<
     type(integer_field), public :: ib_markers
 
-    type(scalar_field), allocatable, dimension(:), public :: stat_reynolds_stress
-    type(scalar_field), allocatable, dimension(:), public :: stat_eff_visc
-    type(scalar_field), allocatable, dimension(:), public :: stat_int_mom_exch
+    type(scalar_field), public :: filtered_fluid_indicator_function
+    type(vector_field), allocatable, dimension(:), public :: stat_reynolds_stress
+    type(vector_field), allocatable, dimension(:), public :: stat_eff_visc
+    type(vector_field), allocatable, dimension(:), public :: stat_int_mom_exch
+    type(vector_field), allocatable, dimension(:), public :: stat_q_cons_filtered
 
     procedure(s_read_abstract_data_files), pointer :: s_read_data_files => null()
 
@@ -301,8 +302,8 @@ subroutine s_read_parallel_data_files(t_step)
 
         if (bubbles_lagrange) then
             alt_sys = sys_size + 1
-        else if (q_filtered_wrt) then
-            alt_sys = sys_size + 9
+        else if (q_filtered_wrt .and. (t_step == 0 .or. t_step == t_step_stop)) then
+            alt_sys = sys_size + 1 + 9*4 + 9*4 + 3*4 + 4*sys_size ! filtered indicator, 4 stats each of: R_u (9), R_mu (9), F_imet (3), q_cons_filtered (sys_size)
         else
             alt_sys = sys_size
         end if
@@ -461,11 +462,13 @@ subroutine s_read_parallel_data_files(t_step)
 
                 ! Initialize MPI data I/O
                 if (ib) then
-                    if (q_filtered_wrt) then
+                    if (q_filtered_wrt .and. (t_step == 0 .or. t_step == t_step_stop)) then
                         call s_initialize_mpi_data(q_cons_vf, ib_markers, &
+                                                   filtered_fluid_indicator_function=filtered_fluid_indicator_function, &
                                                    stat_reynolds_stress=stat_reynolds_stress, & 
                                                    stat_eff_visc=stat_eff_visc, & 
-                                                   stat_int_mom_exch=stat_int_mom_exch)
+                                                   stat_int_mom_exch=stat_int_mom_exch, & 
+                                                   stat_q_cons_filtered=stat_q_cons_filtered)
                     else 
                         call s_initialize_mpi_data(q_cons_vf, ib_markers)
                     end if
@@ -500,7 +503,7 @@ subroutine s_read_parallel_data_files(t_step)
                         call MPI_FILE_READ_ALL(ifile, MPI_IO_DATA%var(i)%sf, data_size, &
                                                mpi_p, status, ierr)
                     end do
-                else if (q_filtered_wrt) then
+                else if (q_filtered_wrt .and. (t_step == 0 .or. t_step == t_step_stop)) then
                     do i = 1, alt_sys
                         var_MOK = int(i, MPI_OFFSET_KIND)
 
@@ -1328,229 +1331,11 @@ subroutine s_populate_conservative_variables_buffer_regions(q_particle)
 
     end subroutine s_populate_conservative_variables_buffer_regions
 
-    subroutine s_populate_filtered_variables_buffer_regions(q_particle)
-
-        type(scalar_field), intent(inout), optional :: q_particle
-
-        integer :: i, j, k !< Generic loop iterators
-
-        ! Populating Buffer Regions in the x-direction
-
-        ! Periodic BC at the beginning
-        if (bc_x%beg == BC_PERIODIC) then
-
-            do j = 1, buff_size
-                if (present(q_particle)) then
-                    q_particle%sf(-j, 0:n, 0:p) = &
-                        q_particle%sf((m + 1) - j, 0:n, 0:p)
-                else
-                    do i = 1, 4
-                        stat_reynolds_stress(i)%sf(-j, 0:n, 0:p) = &
-                            stat_reynolds_stress(i)%sf((m + 1) - j, 0:n, 0:p)
-                        stat_eff_visc(i)%sf(-j, 0:n, 0:p) = &
-                            stat_eff_visc(i)%sf((m + 1) - j, 0:n, 0:p)
-                        stat_int_mom_exch(i)%sf(-j, 0:n, 0:p) = &
-                            stat_int_mom_exch(i)%sf((m + 1) - j, 0:n, 0:p)
-                    end do
-                end if
-            end do
-
-            ! Processor BC at the beginning
-        else
-            if (present(q_particle)) then
-                call s_mpi_sendrecv_cons_vars_buffer_regions(q_cons_vf, &
-                                                             'beg', 'x', q_particle)
-            else
-                call s_mpi_sendrecv_cons_vars_buffer_regions(q_cons_vf, &
-                                                             'beg', 'x')
-            end if
-
-        end if
-
-        ! Perodic BC at the end
-        if (bc_x%end == BC_PERIODIC) then
-
-            do j = 1, buff_size
-                if (present(q_particle)) then
-                    q_particle%sf(m + j, 0:n, 0:p) = &
-                        q_particle%sf(j - 1, 0:n, 0:p)
-                else
-                    do i = 1, 4
-                        stat_reynolds_stress(i)%sf(m + j, 0:n, 0:p) = &
-                            stat_reynolds_stress(i)%sf(j - 1, 0:n, 0:p)
-                        stat_eff_visc(i)%sf(m + j, 0:n, 0:p) = &
-                            stat_eff_visc(i)%sf(j - 1, 0:n, 0:p)
-                        stat_int_mom_exch(i)%sf(m + j, 0:n, 0:p) = &
-                            stat_int_mom_exch(i)%sf(j - 1, 0:n, 0:p)
-                    end do
-                end if
-            end do
-
-            ! Processor BC at the end
-        else
-
-            if (present(q_particle)) then
-                call s_mpi_sendrecv_cons_vars_buffer_regions(q_cons_vf, &
-                                                             'end', 'x', q_particle)
-            else
-                call s_mpi_sendrecv_cons_vars_buffer_regions(q_cons_vf, &
-                                                             'end', 'x')
-            end if
-
-        end if
-
-        ! END: Populating Buffer Regions in the x-direction
-
-        ! Populating Buffer Regions in the y-direction
-
-        if (n > 0) then
-
-            ! Periodic BC at the beginning
-            if (bc_y%beg == BC_PERIODIC) then
-
-                do j = 1, buff_size
-                    if (present(q_particle)) then
-                        q_particle%sf(:, -j, 0:p) = &
-                            q_particle%sf(:, (n + 1) - j, 0:p)
-                    else
-                        do i = 1, 4
-                            stat_reynolds_stress(i)%sf(:, -j, 0:p) = &
-                                stat_reynolds_stress(i)%sf(:, (n + 1) - j, 0:p)
-                            stat_eff_visc(i)%sf(:, -j, 0:p) = &
-                                stat_eff_visc(i)%sf(:, (n + 1) - j, 0:p)
-                            stat_int_mom_exch(i)%sf(:, -j, 0:p) = &
-                                stat_int_mom_exch(i)%sf(:, (n + 1) - j, 0:p)
-                        end do
-                    end if
-                end do
-
-                ! Processor BC at the beginning
-            else
-                if (present(q_particle)) then
-                    call s_mpi_sendrecv_cons_vars_buffer_regions(q_cons_vf, &
-                                                                 'beg', 'y', q_particle)
-                else
-                    call s_mpi_sendrecv_cons_vars_buffer_regions(q_cons_vf, &
-                                                                 'beg', 'y')
-                end if
-
-            end if
-
-            ! Perodic BC at the end
-            if (bc_y%end == BC_PERIODIC) then
-
-                do j = 1, buff_size
-                    if (present(q_particle)) then
-                        q_particle%sf(:, n + j, 0:p) = &
-                            q_particle%sf(:, j - 1, 0:p)
-                    else
-                        do i = 1, 4
-                            stat_reynolds_stress(i)%sf(:, n + j, 0:p) = &
-                                stat_reynolds_stress(i)%sf(:, j - 1, 0:p)
-                            stat_eff_visc(i)%sf(:, n + j, 0:p) = &
-                                stat_eff_visc(i)%sf(:, j - 1, 0:p)
-                            stat_int_mom_exch(i)%sf(:, n + j, 0:p) = &
-                                stat_int_mom_exch(i)%sf(:, j - 1, 0:p)
-                        end do
-                    end if
-                end do
-
-                ! Processor BC at the end
-            else
-
-                if (present(q_particle)) then
-                    call s_mpi_sendrecv_cons_vars_buffer_regions(q_cons_vf, &
-                                                                 'end', 'y', q_particle)
-                else
-                    call s_mpi_sendrecv_cons_vars_buffer_regions(q_cons_vf, &
-                                                                 'end', 'y')
-                end if
-
-            end if
-
-            ! END: Populating Buffer Regions in the y-direction
-
-            ! Populating Buffer Regions in the z-direction
-
-            if (p > 0) then
-
-                ! Periodic BC at the beginning
-                if (bc_z%beg == BC_PERIODIC) then
-
-                    do j = 1, buff_size
-                        if (present(q_particle)) then
-                            q_particle%sf(:, :, -j) = &
-                                q_particle%sf(:, :, (p + 1) - j)
-                        else
-                            do i = 1, 4
-                                stat_reynolds_stress(i)%sf(:, :, -j) = &
-                                    stat_reynolds_stress(i)%sf(:, :, (p + 1) - j)
-                                stat_eff_visc(i)%sf(:, :, -j) = &
-                                    stat_eff_visc(i)%sf(:, :, (p + 1) - j)
-                                stat_int_mom_exch(i)%sf(:, :, -j) = &
-                                    stat_int_mom_exch(i)%sf(:, :, (p + 1) - j)
-                            end do
-                        end if
-                    end do
-
-                    ! Processor BC at the beginning
-                else
-
-                    if (present(q_particle)) then
-                        call s_mpi_sendrecv_cons_vars_buffer_regions(q_cons_vf, &
-                                                                     'beg', 'z', q_particle)
-                    else
-                        call s_mpi_sendrecv_cons_vars_buffer_regions(q_cons_vf, &
-                                                                     'beg', 'z')
-                    end if
-
-                end if
-
-                ! Perodic BC at the end
-                if (bc_z%end == BC_PERIODIC) then
-
-                    do j = 1, buff_size
-                        if (present(q_particle)) then
-                            q_particle%sf(:, :, p + j) = &
-                                q_particle%sf(:, :, j - 1)
-                        else
-                            do i = 1, 4
-                                stat_reynolds_stress(i)%sf(:, :, p + j) = &
-                                    stat_reynolds_stress(i)%sf(:, :, j - 1)
-                                stat_eff_visc(i)%sf(:, :, p + j) = &
-                                    stat_eff_visc(i)%sf(:, :, j - 1)
-                                stat_int_mom_exch(i)%sf(:, :, p + j) = &
-                                    stat_int_mom_exch(i)%sf(:, :, j - 1)
-                            end do
-                        end if
-                    end do
-
-                    ! Processor BC at the end
-                else
-
-                    if (present(q_particle)) then
-                        call s_mpi_sendrecv_cons_vars_buffer_regions(q_cons_vf, &
-                                                                     'end', 'z', q_particle)
-                    else
-                        call s_mpi_sendrecv_cons_vars_buffer_regions(q_cons_vf, &
-                                                                     'end', 'z')
-                    end if
-
-                end if
-
-            end if
-
-        end if
-
-        ! END: Populating Buffer Regions in the z-direction
-
-    end subroutine s_populate_filtered_variables_buffer_regions
-
     !>  Computation of parameters, allocation procedures, and/or
         !!      any other tasks needed to properly setup the module
     subroutine s_initialize_data_input_module
 
-        integer :: i !< Generic loop iterator
+        integer :: i, j !< Generic loop iterator
 
         ! Allocating the parts of the conservative and primitive variables
         ! that do not require the direct knowledge of the dimensionality of
@@ -1559,9 +1344,10 @@ subroutine s_initialize_data_input_module
         allocate (q_prim_vf(1:sys_size))
         if (bubbles_lagrange) allocate (q_particle(1))
 
-        if (q_filtered_wrt) allocate (stat_reynolds_stress(1:4))
-        if (q_filtered_wrt) allocate (stat_eff_visc(1:4))
-        if (q_filtered_wrt) allocate (stat_int_mom_exch(1:4))
+        if (q_filtered_wrt) allocate (stat_reynolds_stress(1:9))
+        if (q_filtered_wrt) allocate (stat_eff_visc(1:9))
+        if (q_filtered_wrt) allocate (stat_int_mom_exch(1:3))
+        if (q_filtered_wrt) allocate (stat_q_cons_filtered(1:sys_size))
 
         ! Allocating the parts of the conservative and primitive variables
         ! that do require the direct knowledge of the dimensionality of the
@@ -1601,16 +1387,42 @@ subroutine s_initialize_data_input_module
                 end if
 
                 if (q_filtered_wrt) then
-                    do i = 1, 4
-                        allocate (stat_reynolds_stress(i)%sf(-buff_size:m + buff_size, &
-                                                     -buff_size:n + buff_size, &
-                                                     -buff_size:p + buff_size))
-                        allocate (stat_eff_visc(i)%sf(-buff_size:m + buff_size, &
-                                                     -buff_size:n + buff_size, &
-                                                     -buff_size:p + buff_size))
-                        allocate (stat_int_mom_exch(i)%sf(-buff_size:m + buff_size, &
-                                                     -buff_size:n + buff_size, &
-                                                     -buff_size:p + buff_size))
+                    allocate (filtered_fluid_indicator_function%sf(-buff_size:m + buff_size, &
+                                                                   -buff_size:n + buff_size, &
+                                                                   -buff_size:p + buff_size))
+                    do i = 1, 9
+                        allocate (stat_reynolds_stress(i)%vf(1:4))
+                        allocate (stat_eff_visc(i)%vf(1:4))
+                    end do
+                    do i = 1, 9 
+                        do j = 1, 4 
+                            allocate (stat_reynolds_stress(i)%vf(j)%sf(-buff_size:m + buff_size, &
+                                                                       -buff_size:n + buff_size, &
+                                                                       -buff_size:p + buff_size))
+                            allocate (stat_eff_visc(i)%vf(j)%sf(-buff_size:m + buff_size, &
+                                                                -buff_size:n + buff_size, &
+                                                                -buff_size:p + buff_size))
+                        end do 
+                    end do
+                    do i = 1, 3 
+                        allocate (stat_int_mom_exch(i)%vf(1:4))
+                    end do
+                    do i = 1, 3 
+                        do j = 1, 4 
+                            allocate (stat_int_mom_exch(i)%vf(j)%sf(-buff_size:m + buff_size, &
+                                                                    -buff_size:n + buff_size, &
+                                                                    -buff_size:p + buff_size))
+                        end do 
+                    end do
+                    do i = 1, sys_size
+                        allocate (stat_q_cons_filtered(i)%vf(1:4))
+                    end do 
+                    do i = 1, sys_size
+                        do j = 1, 4 
+                            allocate (stat_q_cons_filtered(i)%vf(j)%sf(-buff_size:m + buff_size, &
+                                                                       -buff_size:n + buff_size, &
+                                                                       -buff_size:p + buff_size))
+                        end do 
                     end do
                 end if
                 
@@ -1682,7 +1494,7 @@ end subroutine s_initialize_data_input_module
     !> Deallocation procedures for the module
     subroutine s_finalize_data_input_module
 
-        integer :: i !< Generic loop iterator
+        integer :: i, j !< Generic loop iterator
 
         ! Deallocating the conservative and primitive variables
         do i = 1, sys_size
@@ -1707,18 +1519,35 @@ subroutine s_finalize_data_input_module
         end if
 
         if (q_filtered_wrt) then 
-            do i = 1, 4 
-                deallocate (stat_reynolds_stress(i)%sf)
-            end do 
+            deallocate (filtered_fluid_indicator_function%sf)
+            do i = 1, 9 
+                do j = 1, 4 
+                    deallocate (stat_reynolds_stress(i)%vf(j)%sf)
+                end do 
+                deallocate(stat_reynolds_stress(i)%vf)
+            end do
             deallocate(stat_reynolds_stress)
-            do i = 1, 4 
-                deallocate (stat_eff_visc(i)%sf)
-            end do 
+            do i = 1, 9 
+                do j = 1, 4 
+                    deallocate (stat_eff_visc(i)%vf(j)%sf)
+                end do 
+                deallocate(stat_eff_visc(i)%vf)
+            end do
             deallocate(stat_eff_visc)
-            do i = 1, 4 
-                deallocate (stat_int_mom_exch(i)%sf)
-            end do 
+            do i = 1, 3
+                do j = 1, 4 
+                    deallocate (stat_int_mom_exch(i)%vf(j)%sf)
+                end do 
+                deallocate(stat_int_mom_exch(i)%vf)
+            end do
             deallocate(stat_int_mom_exch)
+            do i = 1, sys_size
+                do j = 1, 4 
+                    deallocate (stat_q_cons_filtered(i)%vf(j)%sf)
+                end do 
+                deallocate(stat_q_cons_filtered(i)%vf)
+            end do
+            deallocate(stat_q_cons_filtered)
         end if
 
         s_read_data_files => null()
diff --git a/src/post_process/m_global_parameters.fpp b/src/post_process/m_global_parameters.fpp
index 37b396a583..1b7e452e9f 100644
--- a/src/post_process/m_global_parameters.fpp
+++ b/src/post_process/m_global_parameters.fpp
@@ -781,9 +781,9 @@ contains
                 MPI_IO_DATA%var(i)%sf => null()
             end do
         else if (q_filtered_wrt) then
-            allocate (MPI_IO_DATA%view(1:sys_size+12))
-            allocate (MPI_IO_DATA%var(1:sys_size+12))
-            do i = 1, sys_size+12
+            allocate (MPI_IO_DATA%view(1:sys_size+1+4*9+4*9+3*4+6*4))
+            allocate (MPI_IO_DATA%var (1:sys_size+1+4*9+4*9+3*4+6*4))
+            do i = 1, sys_size+1+4*9+4*9+3*4+6*4
                 allocate (MPI_IO_DATA%var(i)%sf(0:m, 0:n, 0:p))
                 MPI_IO_DATA%var(i)%sf => null()
             end do
@@ -974,7 +974,7 @@ contains
             if (bubbles_lagrange) MPI_IO_DATA%var(sys_size + 1)%sf => null()
 
             if (q_filtered_wrt) then 
-                do i = sys_size+1, sys_size+12
+                do i = sys_size+1, sys_size+1+4*9+4*9+3*4+6*4
                     MPI_IO_DATA%var(i)%sf => null()
                 end do
             end if
diff --git a/src/post_process/m_start_up.f90 b/src/post_process/m_start_up.f90
index 481181064c..59c3e9499d 100644
--- a/src/post_process/m_start_up.f90
+++ b/src/post_process/m_start_up.f90
@@ -180,7 +180,6 @@ subroutine s_perform_time_step(t_step)
         ! Populating the buffer regions of the conservative variables
         if (buff_size > 0) then
             call s_populate_conservative_variables_buffer_regions()
-            if (q_filtered_wrt) call s_populate_filtered_variables_buffer_regions()
             if (bubbles_lagrange) call s_populate_conservative_variables_buffer_regions(q_particle(1))
         end if
 
@@ -326,28 +325,50 @@ subroutine s_save_data(t_step, varname, pres, c, H)
         end do
 
         ! Adding filtered quantities
-        if (q_filtered_wrt) then
-            ! filtered cons vars
-            do i = 1, 4
-                q_sf = stat_reynolds_stress(i)%sf(x_beg:x_end, y_beg:y_end, z_beg:z_end)
-                write (varname, '(A,I0)') 'stat_reynolds_stresss', i
-                call s_write_variable_to_formatted_database_file(varname, t_step)
+        if (q_filtered_wrt .and. (t_step == 0 .or. t_step == t_step_stop)) then
+            ! filtered fluid indicator
+            q_sf = filtered_fluid_indicator_function%sf(x_beg:x_end, y_beg:y_end, z_beg:z_end)
+            write (varname, '(A)') 'filtered_fluid_indicator_function'
+            call s_write_variable_to_formatted_database_file(varname, t_step)
 
-                varname(:) = ' '
+            varname(:) = ' '
+
+            ! filtered vars stats
+            do i = 1, 9
+                do j = 1, 4 
+                    q_sf = stat_reynolds_stress(i)%vf(j)%sf(x_beg:x_end, y_beg:y_end, z_beg:z_end)
+                    write (varname, '(A,I0,A,I0)') 'stat_reynolds_stress', i, '_m', j
+                    call s_write_variable_to_formatted_database_file(varname, t_step)
+
+                    varname(:) = ' '
+                end do 
             end do
-            do i = 1, 4
-                q_sf = stat_eff_visc(i)%sf(x_beg:x_end, y_beg:y_end, z_beg:z_end)
-                write (varname, '(A,I0)') 'stat_eff_viscs', i
-                call s_write_variable_to_formatted_database_file(varname, t_step)
+            do i = 1, 9
+                do j = 1, 4 
+                    q_sf = stat_eff_visc(i)%vf(j)%sf(x_beg:x_end, y_beg:y_end, z_beg:z_end)
+                    write (varname, '(A,I0,A,I0)') 'stat_eff_visc', i, '_m', j
+                    call s_write_variable_to_formatted_database_file(varname, t_step)
 
-                varname(:) = ' '
+                    varname(:) = ' '
+                end do
             end do
-            do i = 1, 4
-                q_sf = stat_int_mom_exch(i)%sf(x_beg:x_end, y_beg:y_end, z_beg:z_end)
-                write (varname, '(A,I0)') 'stat_int_mom_exchs', i
-                call s_write_variable_to_formatted_database_file(varname, t_step)
+            do i = 1, 3 
+                do j = 1, 4 
+                    q_sf = stat_int_mom_exch(i)%vf(j)%sf(x_beg:x_end, y_beg:y_end, z_beg:z_end)
+                    write (varname, '(A,I0,A,I0)') 'stat_int_mom_exch', i, '_m', j
+                    call s_write_variable_to_formatted_database_file(varname, t_step)
 
-                varname(:) = ' '
+                    varname(:) = ' '
+                end do
+            end do
+            do i = 1, sys_size
+                do j = 1, 4 
+                    q_sf = stat_q_cons_filtered(i)%vf(j)%sf(x_beg:x_end, y_beg:y_end, z_beg:z_end)
+                    write (varname, '(A,I0,A,I0)') 'stat_q_cons_filtered', i, '_m', j
+                    call s_write_variable_to_formatted_database_file(varname, t_step)
+
+                    varname(:) = ' '
+                end do 
             end do
         end if
 
diff --git a/src/simulation/m_compute_statistics.fpp b/src/simulation/m_compute_statistics.fpp
index 9d574c72f7..1e1e4de29d 100644
--- a/src/simulation/m_compute_statistics.fpp
+++ b/src/simulation/m_compute_statistics.fpp
@@ -15,85 +15,148 @@ module m_compute_statistics
 
     private; public :: s_initialize_statistics_module, s_finalize_statistics_module, &
     s_compute_statistics_momentum_unclosed_terms, s_update_statistics, &
-    s_compute_234_order_statistics
+    s_compute_statistical_moments
  
     ! terms for computing 1st, 2nd, 3rd, and 4th order statistical moments
-    type(scalar_field), allocatable, dimension(:) :: Msn_reynolds_stress
-    type(scalar_field), allocatable, dimension(:) :: Msn_eff_visc
-    type(scalar_field), allocatable, dimension(:) :: Msn_int_mom_exch
+    type(vector_field), allocatable, dimension(:) :: Msn_reynolds_stress
+    type(vector_field), allocatable, dimension(:) :: Msn_eff_visc
+    type(vector_field), allocatable, dimension(:) :: Msn_int_mom_exch
+    type(vector_field), allocatable, dimension(:) :: Msn_q_cons_filtered
 
     ! 2nd, 3rd, and 4th statistical moments for unclosed terms in volume filtered momentum equation
-    type(scalar_field), allocatable, dimension(:), public :: stat_reynolds_stress
-    type(scalar_field), allocatable, dimension(:), public :: stat_eff_visc
-    type(scalar_field), allocatable, dimension(:), public :: stat_int_mom_exch
+    type(vector_field), allocatable, dimension(:), public :: stat_reynolds_stress
+    type(vector_field), allocatable, dimension(:), public :: stat_eff_visc
+    type(vector_field), allocatable, dimension(:), public :: stat_int_mom_exch
+    type(vector_field), allocatable, dimension(:), public :: stat_q_cons_filtered
 
-    !$acc declare create(Msn_reynolds_stress, Msn_eff_visc, Msn_int_mom_exch)
+    !$acc declare create(Msn_reynolds_stress, Msn_eff_visc, Msn_int_mom_exch, Msn_q_cons_filtered)
 
-    !$acc declare create(stat_reynolds_stress, stat_eff_visc, stat_int_mom_exch)
+    !$acc declare create(stat_reynolds_stress, stat_eff_visc, stat_int_mom_exch, stat_q_cons_filtered)
 
 contains
 
     subroutine s_initialize_statistics_module
-        integer :: i
+        integer :: i, j
 
-        @:ALLOCATE(Msn_reynolds_stress(1:4))
-        do i = 1, 4
-            @:ALLOCATE(Msn_reynolds_stress(i)%sf(0:m, 0:n, 0:p))
-            @:ACC_SETUP_SFs(Msn_reynolds_stress(i))
+        @:ALLOCATE(Msn_reynolds_stress(1:9))
+        do i = 1, 9
+            @:ALLOCATE(Msn_reynolds_stress(i)%vf(1:4))
+        end do
+        do i = 1, 9
+            do j = 1, 4
+                @:ALLOCATE(Msn_reynolds_stress(i)%vf(j)%sf(0:m, 0:n, 0:p))
+            end do
+            @:ACC_SETUP_VFs(Msn_reynolds_stress(i))
+        end do
+
+        @:ALLOCATE(Msn_eff_visc(1:9))
+        do i = 1, 9
+            @:ALLOCATE(Msn_eff_visc(i)%vf(1:4))
+        end do
+        do i = 1, 9
+            do j = 1, 4
+                @:ALLOCATE(Msn_eff_visc(i)%vf(j)%sf(0:m, 0:n, 0:p))
+            end do
+            @:ACC_SETUP_VFs(Msn_eff_visc(i))
         end do
 
-        @:ALLOCATE(Msn_eff_visc(1:4))
-        do i = 1, 4
-            @:ALLOCATE(Msn_eff_visc(i)%sf(0:m, 0:n, 0:p))
-            @:ACC_SETUP_SFs(Msn_eff_visc(i))
+        @:ALLOCATE(Msn_int_mom_exch(1:3))
+        do i = 1, 3
+            @:ALLOCATE(Msn_int_mom_exch(i)%vf(1:4))
+        end do
+        do i = 1, 3
+            do j = 1, 4
+                @:ALLOCATE(Msn_int_mom_exch(i)%vf(j)%sf(0:m, 0:n, 0:p))
+            end do
+            @:ACC_SETUP_VFs(Msn_int_mom_exch(i))
         end do
 
-        @:ALLOCATE(Msn_int_mom_exch(1:4))
-        do i = 1, 4
-            @:ALLOCATE(Msn_int_mom_exch(i)%sf(0:m, 0:n, 0:p))
-            @:ACC_SETUP_SFs(Msn_int_mom_exch(i))
+        @:ALLOCATE(Msn_q_cons_filtered(1:sys_size))
+        do i = 1, sys_size
+            @:ALLOCATE(Msn_q_cons_filtered(i)%vf(1:4))
+        end do 
+        do i = 1, sys_size
+            do j = 1, 4 
+                @:ALLOCATE(Msn_q_cons_filtered(i)%vf(j)%sf(0:m, 0:n, 0:p))
+            end do ! entry i now has all four moment fields; it alone is set up next
+            @:ACC_SETUP_VFs(Msn_q_cons_filtered(i))
         end do
 
-        @:ALLOCATE(stat_reynolds_stress(1:4))
-        do i = 1, 4
-            @:ALLOCATE(stat_reynolds_stress(i)%sf(0:m, 0:n, 0:p))
-            @:ACC_SETUP_SFs(stat_reynolds_stress(i))
+        @:ALLOCATE(stat_reynolds_stress(1:9))
+        do i = 1, 9
+            @:ALLOCATE(stat_reynolds_stress(i)%vf(1:4))
+        end do
+        do i = 1, 9
+            do j = 1, 4
+                @:ALLOCATE(stat_reynolds_stress(i)%vf(j)%sf(0:m, 0:n, 0:p))
+            end do
+            @:ACC_SETUP_VFs(stat_reynolds_stress(i))
         end do
 
-        @:ALLOCATE(stat_eff_visc(1:4))
-        do i = 1, 4
-            @:ALLOCATE(stat_eff_visc(i)%sf(0:m, 0:n, 0:p))
-            @:ACC_SETUP_SFs(stat_eff_visc(i))
+        @:ALLOCATE(stat_eff_visc(1:9))
+        do i = 1, 9
+            @:ALLOCATE(stat_eff_visc(i)%vf(1:4))
+        end do
+        do i = 1, 9
+            do j = 1, 4
+                @:ALLOCATE(stat_eff_visc(i)%vf(j)%sf(0:m, 0:n, 0:p))
+            end do
+            @:ACC_SETUP_VFs(stat_eff_visc(i))
         end do
 
-        @:ALLOCATE(stat_int_mom_exch(1:4))
-        do i = 1, 4
-            @:ALLOCATE(stat_int_mom_exch(i)%sf(0:m, 0:n, 0:p))
-            @:ACC_SETUP_SFs(stat_int_mom_exch(i))
+        @:ALLOCATE(stat_int_mom_exch(1:3))
+        do i = 1, 3
+            @:ALLOCATE(stat_int_mom_exch(i)%vf(1:4))
+        end do
+        do i = 1, 3
+            do j = 1, 4
+                @:ALLOCATE(stat_int_mom_exch(i)%vf(j)%sf(0:m, 0:n, 0:p))
+            end do
+            @:ACC_SETUP_VFs(stat_int_mom_exch(i))
+        end do
+
+        @:ALLOCATE(stat_q_cons_filtered(1:sys_size))
+        do i = 1, sys_size
+            @:ALLOCATE(stat_q_cons_filtered(i)%vf(1:4))
+        end do 
+        do i = 1, sys_size
+            do j = 1, 4 
+                @:ALLOCATE(stat_q_cons_filtered(i)%vf(j)%sf(0:m, 0:n, 0:p))
+            end do ! entry i now has all four moment fields; it alone is set up next
+            @:ACC_SETUP_VFs(stat_q_cons_filtered(i))
         end do
 
     end subroutine s_initialize_statistics_module
 
     subroutine s_compute_statistics_momentum_unclosed_terms(n_step, reynolds_stress, eff_visc, int_mom_exch)
-        type(scalar_field), intent(in) :: reynolds_stress 
-        type(scalar_field), intent(in) :: eff_visc
-        type(scalar_field), intent(in) :: int_mom_exch
+        type(vector_field), dimension(3), intent(in) :: reynolds_stress 
+        type(vector_field), dimension(3), intent(in) :: eff_visc
+        type(scalar_field), dimension(3), intent(in) :: int_mom_exch
         
         integer, intent(in) :: n_step
         real(wp) :: ns 
+        integer :: i, j
 
         ns = real(n_step, wp)
 
         ! update M1, M2, M3, M4
-        call s_update_statistics(ns, reynolds_stress, Msn_reynolds_stress)
-        call s_update_statistics(ns, eff_visc, Msn_eff_visc)
-        call s_update_statistics(ns, int_mom_exch, Msn_int_mom_exch)
+        do i = 1, 3
+            do j = 1, 3     
+                call s_update_statistics(ns, reynolds_stress(i)%vf(j), Msn_reynolds_stress((i-1)*3 + j)%vf)
+                call s_update_statistics(ns, eff_visc(i)%vf(j), Msn_eff_visc((i-1)*3 + j)%vf)
+            end do
+            call s_update_statistics(ns, int_mom_exch(i), Msn_int_mom_exch(i)%vf)
+        end do
 
-        ! compute 2nd, 3rd, 4th order statistical moments
+        ! compute 1st, 2nd, 3rd, 4th order statistical moments
         if (n_step > 3) then 
-            call s_compute_234_order_statistics(ns, Msn_reynolds_stress, stat_reynolds_stress) 
-            call s_compute_234_order_statistics(ns, Msn_eff_visc, stat_eff_visc) 
-            call s_compute_234_order_statistics(ns, Msn_int_mom_exch, stat_int_mom_exch)  
+            do i = 1, 3 
+                do j = 1, 3 
+                call s_compute_statistical_moments(ns, Msn_reynolds_stress((i-1)*3 + j)%vf, stat_reynolds_stress((i-1)*3 + j)%vf) 
+                call s_compute_statistical_moments(ns, Msn_eff_visc((i-1)*3 + j)%vf, stat_eff_visc((i-1)*3 + j)%vf) 
+                end do 
+                call s_compute_statistical_moments(ns, Msn_int_mom_exch(i)%vf, stat_int_mom_exch(i)%vf)  
+            end do
         end if
 
     end subroutine s_compute_statistics_momentum_unclosed_terms
@@ -125,7 +188,7 @@ contains
         
     end subroutine s_update_statistics
 
-    subroutine s_compute_234_order_statistics(ns, Msn, q_stat)
+    subroutine s_compute_statistical_moments(ns, Msn, q_stat)
         type(scalar_field), dimension(1:4), intent(in) :: Msn
         type(scalar_field), dimension(1:4), intent(inout) :: q_stat
 
@@ -144,40 +207,75 @@ contains
             end do 
         end do
 
-    end subroutine s_compute_234_order_statistics
+    end subroutine s_compute_statistical_moments
 
     subroutine s_finalize_statistics_module
         integer :: i, j
-        do i = 1, 4
-            @:DEALLOCATE(Msn_reynolds_stress(i)%sf)
+
+        do i = 1, 9
+            do j = 1, 4
+                @:DEALLOCATE(Msn_reynolds_stress(i)%vf(j)%sf)
+            end do
+            @:DEALLOCATE(Msn_reynolds_stress(i)%vf)
         end do
         @:DEALLOCATE(Msn_reynolds_stress)
 
-        do i = 1, 4
-            @:DEALLOCATE(Msn_eff_visc(i)%sf)
+        do i = 1, 9
+            do j = 1, 4
+                @:DEALLOCATE(Msn_eff_visc(i)%vf(j)%sf)
+            end do
+            @:DEALLOCATE(Msn_eff_visc(i)%vf)
         end do
         @:DEALLOCATE(Msn_eff_visc)
 
-        do i = 1, 4
-            @:DEALLOCATE(Msn_int_mom_exch(i)%sf)
+        do i = 1, 3
+            do j = 1, 4
+                @:DEALLOCATE(Msn_int_mom_exch(i)%vf(j)%sf)
+            end do
+            @:DEALLOCATE(Msn_int_mom_exch(i)%vf)
         end do
         @:DEALLOCATE(Msn_int_mom_exch)
 
-        do i = 1, 4
-            @:DEALLOCATE(stat_reynolds_stress(i)%sf)
+        do i = 1, sys_size
+            do j = 1, 4
+                @:DEALLOCATE(Msn_q_cons_filtered(i)%vf(j)%sf)
+            end do
+            @:DEALLOCATE(Msn_q_cons_filtered(i)%vf)
+        end do
+        @:DEALLOCATE(Msn_q_cons_filtered)
+
+        do i = 1, 9
+            do j = 1, 4
+                @:DEALLOCATE(stat_reynolds_stress(i)%vf(j)%sf)
+            end do
+            @:DEALLOCATE(stat_reynolds_stress(i)%vf)
         end do
         @:DEALLOCATE(stat_reynolds_stress)
 
-        do i = 1, 4
-            @:DEALLOCATE(stat_eff_visc(i)%sf)
+        do i = 1, 9
+            do j = 1, 4
+                @:DEALLOCATE(stat_eff_visc(i)%vf(j)%sf)
+            end do
+            @:DEALLOCATE(stat_eff_visc(i)%vf)
         end do
         @:DEALLOCATE(stat_eff_visc)
 
-        do i = 1, 4
-            @:DEALLOCATE(stat_int_mom_exch(i)%sf)
+        do i = 1, 3
+            do j = 1, 4
+                @:DEALLOCATE(stat_int_mom_exch(i)%vf(j)%sf)
+            end do
+            @:DEALLOCATE(stat_int_mom_exch(i)%vf)
         end do
         @:DEALLOCATE(stat_int_mom_exch)
 
+        do i = 1, sys_size
+            do j = 1, 4
+                @:DEALLOCATE(stat_q_cons_filtered(i)%vf(j)%sf)
+            end do
+            @:DEALLOCATE(stat_q_cons_filtered(i)%vf)
+        end do
+        @:DEALLOCATE(stat_q_cons_filtered)
+
     end subroutine s_finalize_statistics_module
 
 end module m_compute_statistics
diff --git a/src/simulation/m_data_output.fpp b/src/simulation/m_data_output.fpp
index fd783bef1f..840278d58f 100644
--- a/src/simulation/m_data_output.fpp
+++ b/src/simulation/m_data_output.fpp
@@ -76,7 +76,8 @@ contains
         !! @param q_cons_vf Conservative variables
         !! @param q_prim_vf Primitive variables
         !! @param t_step Current time step
-    subroutine s_write_data_files(q_cons_vf, q_T_sf, q_prim_vf, t_step, beta, stat_reynolds_stress, stat_eff_visc, stat_int_mom_exch)
+    subroutine s_write_data_files(q_cons_vf, q_T_sf, q_prim_vf, t_step, beta, filtered_fluid_indicator_function, &
+                                  stat_reynolds_stress, stat_eff_visc, stat_int_mom_exch, stat_q_cons_filtered)
 
         type(scalar_field), &
             dimension(sys_size), &
@@ -94,14 +95,17 @@ contains
         type(scalar_field), &
             intent(inout), optional :: beta
 
-        type(scalar_field), dimension(1:4), intent(inout), optional :: stat_reynolds_stress
-        type(scalar_field), dimension(1:4), intent(inout), optional :: stat_eff_visc
-        type(scalar_field), dimension(1:4), intent(inout), optional :: stat_int_mom_exch
+        type(scalar_field), intent(inout), optional :: filtered_fluid_indicator_function
+        type(vector_field), dimension(1:9), intent(inout), optional :: stat_reynolds_stress
+        type(vector_field), dimension(1:9), intent(inout), optional :: stat_eff_visc
+        type(vector_field), dimension(1:3), intent(inout), optional :: stat_int_mom_exch
+        type(vector_field), dimension(1:sys_size), intent(inout), optional :: stat_q_cons_filtered
 
         if (.not. parallel_io) then
             call s_write_serial_data_files(q_cons_vf, q_T_sf, q_prim_vf, t_step, beta)
         else
-            call s_write_parallel_data_files(q_cons_vf, q_prim_vf, t_step, beta, stat_reynolds_stress, stat_eff_visc, stat_int_mom_exch)
+            call s_write_parallel_data_files(q_cons_vf, q_prim_vf, t_step, beta, &
+                                             filtered_fluid_indicator_function, stat_reynolds_stress, stat_eff_visc, stat_int_mom_exch, stat_q_cons_filtered)
         end if
 
     end subroutine s_write_data_files
@@ -790,15 +794,18 @@ contains
         !!  @param q_prim_vf Cell-average primitive variables
         !!  @param t_step Current time-step
         !!  @param beta Eulerian void fraction from lagrangian bubbles
-    subroutine s_write_parallel_data_files(q_cons_vf, q_prim_vf, t_step, beta, stat_reynolds_stress, stat_eff_visc, stat_int_mom_exch)
+    subroutine s_write_parallel_data_files(q_cons_vf, q_prim_vf, t_step, beta, filtered_fluid_indicator_function, &
+                                           stat_reynolds_stress, stat_eff_visc, stat_int_mom_exch, stat_q_cons_filtered)
 
         type(scalar_field), dimension(sys_size), intent(in) :: q_cons_vf
         type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf
         integer, intent(in) :: t_step
         type(scalar_field), intent(inout), optional :: beta
-        type(scalar_field), dimension(1:4), intent(inout), optional :: stat_reynolds_stress
-        type(scalar_field), dimension(1:4), intent(inout), optional :: stat_eff_visc
-        type(scalar_field), dimension(1:4), intent(inout), optional :: stat_int_mom_exch
+        type(scalar_field), intent(inout), optional :: filtered_fluid_indicator_function
+        type(vector_field), dimension(1:9), intent(inout), optional :: stat_reynolds_stress
+        type(vector_field), dimension(1:9), intent(inout), optional :: stat_eff_visc
+        type(vector_field), dimension(1:3), intent(inout), optional :: stat_int_mom_exch
+        type(vector_field), dimension(1:sys_size), intent(inout), optional :: stat_q_cons_filtered
 
 #ifdef MFC_MPI
 
@@ -821,7 +828,7 @@ contains
         if (present(beta)) then
             alt_sys = sys_size + 1
         else if (present(stat_reynolds_stress) .and. present(stat_eff_visc) .and. present(stat_int_mom_exch)) then
-            alt_sys = sys_size + 12
+            alt_sys = sys_size + 1 + 9*4 + 9*4 + 3*4 + 6*4 ! 1 indicator + 4 moments x (9 + 9 + 3 + 6) fields; NOTE(review): the 6 looks like a hard-coded sys_size (stat_q_cons_filtered is dimension(1:sys_size)) - confirm
         else
             alt_sys = sys_size
         end if
@@ -906,8 +913,10 @@ contains
 
             if (ib) then
                 if (present(stat_reynolds_stress) .and. present(stat_eff_visc) .and. present(stat_int_mom_exch)) then
-                    call s_initialize_mpi_data(q_cons_vf, ib_markers, levelset, levelset_norm, & 
-                                               stat_reynolds_stress=stat_reynolds_stress, stat_eff_visc=stat_eff_visc, stat_int_mom_exch=stat_int_mom_exch)
+                    call s_initialize_mpi_data(q_cons_vf, ib_markers, levelset, levelset_norm, &
+                                               filtered_fluid_indicator_function=filtered_fluid_indicator_function, & 
+                                               stat_reynolds_stress=stat_reynolds_stress, stat_eff_visc=stat_eff_visc, &
+                                               stat_int_mom_exch=stat_int_mom_exch, stat_q_cons_filtered=stat_q_cons_filtered)
                 else
                     call s_initialize_mpi_data(q_cons_vf, ib_markers, levelset, levelset_norm)
                 end if
diff --git a/src/simulation/m_global_parameters.fpp b/src/simulation/m_global_parameters.fpp
index 6efe39d5d9..50590a26fe 100644
--- a/src/simulation/m_global_parameters.fpp
+++ b/src/simulation/m_global_parameters.fpp
@@ -513,6 +513,7 @@ module m_global_parameters
     logical :: compute_autocorrelation
     integer :: t_step_stat_start
     real(wp) :: filter_width
+    logical :: q_filtered_wrt
 
     !$acc declare create(u_inf_ref, rho_inf_ref, T_inf_ref, filter_width)
 
@@ -803,6 +804,7 @@ contains
         compute_autocorrelation = .false.
         t_step_stat_start = dflt_int
         filter_width = dflt_real
+        q_filtered_wrt = .false.
 
     end subroutine s_assign_default_values_to_user_inputs
 
@@ -1158,8 +1160,8 @@ contains
             allocate (MPI_IO_DATA%view(1:sys_size + 1))
             allocate (MPI_IO_DATA%var(1:sys_size + 1))
         else if (volume_filtering_momentum_eqn) then 
-            allocate (MPI_IO_DATA%view(1:sys_size+12))
-            allocate (MPI_IO_DATA%var(1:sys_size+12))
+            allocate (MPI_IO_DATA%view(1:sys_size+109)) ! 109 = 1 + 9*4 + 9*4 + 3*4 + 6*4 extra filtered-statistics slots
+            allocate (MPI_IO_DATA%var(1:sys_size+109)) ! NOTE(review): the 6*4 term presumably assumes sys_size == 6 - confirm
         else
             allocate (MPI_IO_DATA%view(1:sys_size))
             allocate (MPI_IO_DATA%var(1:sys_size))
@@ -1180,7 +1182,7 @@ contains
                 MPI_IO_DATA%var(i)%sf => null()
             end do
         else if (volume_filtering_momentum_eqn) then 
-            do i = sys_size+1, sys_size+12
+            do i = sys_size+1, sys_size+109
                 allocate (MPI_IO_DATA%var(i)%sf(0:m, 0:n, 0:p))
                 MPI_IO_DATA%var(i)%sf => null()
             end do
@@ -1357,7 +1359,7 @@ contains
                     MPI_IO_DATA%var(i)%sf => null()
                 end do
             else if (volume_filtering_momentum_eqn) then 
-                do i = 1, sys_size+12
+                do i = 1, sys_size+109
                     MPI_IO_DATA%var(i)%sf => null()
                 end do
             else
diff --git a/src/simulation/m_mpi_proxy.fpp b/src/simulation/m_mpi_proxy.fpp
index bb359a4bed..bac8259b81 100644
--- a/src/simulation/m_mpi_proxy.fpp
+++ b/src/simulation/m_mpi_proxy.fpp
@@ -93,7 +93,8 @@ contains
             & 'viscous', 'shear_stress', 'bulk_stress', 'bubbles_lagrange',     &
             & 'hyperelasticity', 'rkck_adap_dt', 'bc_io', 'powell', 'cont_damage', &
             & 'periodic_ibs', 'compute_particle_drag', 'periodic_forcing', 'volume_filtering_momentum_eqn', & 
-            & 'store_levelset', 'slab_domain_decomposition', 'compute_autocorrelation' ]
+            & 'store_levelset', 'slab_domain_decomposition', 'compute_autocorrelation', &
+            & 'q_filtered_wrt' ]
             call MPI_BCAST(${VAR}$, 1, MPI_LOGICAL, 0, MPI_COMM_WORLD, ierr)
         #:endfor
 
diff --git a/src/simulation/m_start_up.fpp b/src/simulation/m_start_up.fpp
index 453632807a..1ddf59c881 100644
--- a/src/simulation/m_start_up.fpp
+++ b/src/simulation/m_start_up.fpp
@@ -192,7 +192,7 @@ contains
             periodic_ibs, compute_particle_drag, u_inf_ref, rho_inf_ref, T_inf_ref, & 
             periodic_forcing, volume_filtering_momentum_eqn, store_levelset, & 
             slab_domain_decomposition, compute_autocorrelation, t_step_stat_start, & 
-            filter_width
+            filter_width, q_filtered_wrt
 
         ! Checking that an input file has been provided by the user. If it
         ! has, then the input file is read in, otherwise, simulation exits.
@@ -1329,7 +1329,7 @@ contains
                 call nvtxEndRange
 
                 call nvtxStartRange("COMPUTE-STATISTICS")
-                call s_compute_statistics_momentum_unclosed_terms(t_step - t_step_stat_start, mag_reynolds_stress, mag_eff_visc, mag_int_mom_exch)
+                call s_compute_statistics_momentum_unclosed_terms(t_step - t_step_stat_start, reynolds_stress, eff_visc, int_mom_exch)
                 call nvtxEndRange
 
                 ! write(100, *) mag_reynolds_stress%sf(10, 10, 10)
@@ -1459,11 +1459,25 @@ contains
 
         call cpu_time(start)
         call nvtxStartRange("SAVE-DATA")
-        do i = 1, 4 
-            !$acc update host(stat_reynolds_stress(i)%sf)
-            !$acc update host(stat_eff_visc(i)%sf)
-            !$acc update host(stat_int_mom_exch(i)%sf)
-        end do  
+        if ((q_filtered_wrt .or. volume_filtering_momentum_eqn) .and. (t_step == 0 .or. t_step == t_step_stop)) then
+            !$acc update host(filtered_fluid_indicator_function%sf)
+            do i = 1, 9 ! 9 entries, 4 moment fields each
+                do j = 1, 4
+                    !$acc update host(stat_reynolds_stress(i)%vf(j)%sf)
+                    !$acc update host(stat_eff_visc(i)%vf(j)%sf)
+                end do
+            end do
+            do i = 1, 3 ! 3 entries, 4 moment fields each
+                do j = 1, 4
+                    !$acc update host(stat_int_mom_exch(i)%vf(j)%sf)
+                end do
+            end do
+            do i = 1, sys_size ! sys_size entries, 4 moment fields each
+                do j = 1, 4
+                    !$acc update host(stat_q_cons_filtered(i)%vf(j)%sf)
+                end do
+            end do
+        end if ! guard also covers volume_filtering_momentum_eqn, which keys the s_write_data_files call using these arrays
         do i = 1, sys_size
             !$acc update host(q_cons_ts(1)%vf(i)%sf)
             do l = 0, p
@@ -1496,9 +1510,11 @@ contains
             call s_write_restart_lag_bubbles(save_count) !parallel
             if (lag_params%write_bubbles_stats) call s_write_lag_bubble_stats()
         else
-            if (volume_filtering_momentum_eqn) then
+            if (volume_filtering_momentum_eqn .and. (t_step == 0 .or. t_step == t_step_stop)) then
                 call s_write_data_files(q_cons_ts(1)%vf, q_T_sf, q_prim_vf, save_count, &
-                                        stat_reynolds_stress=stat_reynolds_stress, stat_eff_visc=stat_eff_visc, stat_int_mom_exch=stat_int_mom_exch)
+                                        filtered_fluid_indicator_function=filtered_fluid_indicator_function, &
+                                        stat_reynolds_stress=stat_reynolds_stress, stat_eff_visc=stat_eff_visc, &
+                                        stat_int_mom_exch=stat_int_mom_exch, stat_q_cons_filtered=stat_q_cons_filtered)
             else
                 call s_write_data_files(q_cons_ts(1)%vf, q_T_sf, q_prim_vf, save_count)
             end if
diff --git a/src/simulation/m_volume_filtering.fpp b/src/simulation/m_volume_filtering.fpp
index a503412162..7baf741244 100644
--- a/src/simulation/m_volume_filtering.fpp
+++ b/src/simulation/m_volume_filtering.fpp
@@ -545,9 +545,9 @@ contains
         call nvtxEndRange
 
         call nvtxStartRange("COMPUTE-UNCLOSED-TERMS")
-        call s_compute_pseudo_turbulent_reynolds_stress(q_cons_filtered, reynolds_stress, mag_reynolds_stress)
-        call s_compute_effective_viscosity(q_cons_filtered, eff_visc, visc_stress, mag_eff_visc)
-        call s_compute_interphase_momentum_exchange(int_mom_exch, mag_int_mom_exch)
+        call s_compute_pseudo_turbulent_reynolds_stress(q_cons_filtered, reynolds_stress)
+        call s_compute_effective_viscosity(q_cons_filtered, eff_visc, visc_stress)
+        call s_compute_interphase_momentum_exchange(int_mom_exch)
         call nvtxEndRange
 
     end subroutine s_volume_filter_momentum_eqn
@@ -624,7 +624,7 @@ contains
 
         integer :: i
 
-        do i = contxb, momxe
+        do i = 1, sys_size
             call s_apply_fftw_filter_scalarfield(filtered_fluid_indicator_function, .true., q_cons_vf(i), q_cons_filtered(i))
         end do 
 
@@ -810,11 +810,9 @@ contains
 
     end subroutine s_setup_terms_filtering
 
-    subroutine s_compute_pseudo_turbulent_reynolds_stress(q_cons_filtered, reynolds_stress, mag_reynolds_stress)
+    subroutine s_compute_pseudo_turbulent_reynolds_stress(q_cons_filtered, reynolds_stress)
         type(scalar_field), dimension(sys_size), intent(in) :: q_cons_filtered
         type(vector_field), dimension(1:num_dims), intent(inout) :: reynolds_stress
-        type(scalar_field), intent(inout) :: mag_reynolds_stress
-        real(wp), dimension(1:num_dims, 0:m, 0:n, 0:p) :: div_Ru
         integer :: i, j, k, l, q    
 
         !$acc parallel loop collapse(3) gang vector default(present)
@@ -833,81 +831,18 @@ contains
             end do
         end do
 
-        !$acc parallel loop collapse(3) gang vector default(present)
-        do i = 0, m
-            do j = 0, n
-                do k = 0, p  
-                    !$acc loop seq
-                    do l = 1, num_dims
-                        !$acc loop seq
-                        do q = 1, num_dims
-                            reynolds_stress(l)%vf(q)%sf(i, j, k) = reynolds_stress(l)%vf(q)%sf(i, j, k) * filtered_fluid_indicator_function%sf(i, j, k)
-                        end do 
-                    end do 
-                end do
-            end do 
-        end do
-
-        ! set boundary buffer zone values
-#ifdef MFC_MPI
-        do l = 1, num_dims 
-            do q = 1, num_dims
-                call s_populate_scalarfield_buffers(reynolds_stress(l)%vf(q))
-            end do 
-        end do
-#else
-        do l = 1, num_dims
-            do q = 1, num_dims
-                reynolds_stress(l)%vf(q)%sf(-buff_size:-1, :, :) = reynolds_stress(l)%vf(q)%sf(m-buff_size+1:m, :, :)
-                reynolds_stress(l)%vf(q)%sf(m+1:m+buff_size, :, :) = reynolds_stress(l)%vf(q)%sf(0:buff_size-1, :, :)
-
-                reynolds_stress(l)%vf(q)%sf(:, -buff_size:-1, :) = reynolds_stress(l)%vf(q)%sf(:, n-buff_size+1:n, :)
-                reynolds_stress(l)%vf(q)%sf(:, n+1:n+buff_size, :) = reynolds_stress(l)%vf(q)%sf(:, 0:buff_size-1, :)
-
-                reynolds_stress(l)%vf(q)%sf(:, :, -buff_size:-1) = reynolds_stress(l)%vf(q)%sf(:, :, p-buff_size+1:p)
-                reynolds_stress(l)%vf(q)%sf(:, :, p+1:p+buff_size) = reynolds_stress(l)%vf(q)%sf(:, :, 0:buff_size-1)
-            end do
-        end do
-#endif
-
-        ! div(Ru), using CD2 FD scheme 
-        !$acc parallel loop collapse(3) gang vector default(present) copy(div_Ru)
-        do i = 0, m
-            do j = 0, n 
-                do k = 0, p
-                    !$acc loop seq
-                    do l = 1, num_dims
-                        div_Ru(l, i, j, k) = (reynolds_stress(l)%vf(1)%sf(i+1, j, k) - reynolds_stress(l)%vf(1)%sf(i-1, j, k))/(2._wp*dx(i)) &
-                                           + (reynolds_stress(l)%vf(2)%sf(i, j+1, k) - reynolds_stress(l)%vf(2)%sf(i, j-1, k))/(2._wp*dy(j)) & 
-                                           + (reynolds_stress(l)%vf(3)%sf(i, j, k+1) - reynolds_stress(l)%vf(3)%sf(i, j, k-1))/(2._wp*dz(k))
-                    end do
-                end do
-            end do
-        end do
-
-        !$acc parallel loop collapse(3) gang vector default(present) copyin(div_Ru)
-        do i = 0, m
-            do j = 0, n
-                do k = 0, p 
-                    mag_reynolds_stress%sf(i, j, k) = sqrt(div_Ru(1, i, j, k)**2 + div_Ru(2, i, j, k)**2 + div_Ru(3, i, j, k)**2)
-                end do
-            end do
-        end do
-
     end subroutine s_compute_pseudo_turbulent_reynolds_stress
 
-    subroutine s_compute_effective_viscosity(q_cons_filtered, eff_visc, visc_stress, mag_eff_visc)
+    subroutine s_compute_effective_viscosity(q_cons_filtered, eff_visc, visc_stress)
         type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_filtered
         type(vector_field), dimension(1:num_dims), intent(inout) :: eff_visc
         type(vector_field), dimension(1:num_dims), intent(inout) :: visc_stress
-        type(scalar_field), intent(inout) :: mag_eff_visc
-        real(wp), dimension(1:num_dims, 0:m, 0:n, 0:p) :: div_eff_visc
 
         integer :: i, j, k, l, q
 
         ! set buffers for filtered momentum quantities and density
 #ifdef MFC_MPI
-        do i = 1, momxe 
+        do i = contxb, momxe 
             call s_populate_scalarfield_buffers(q_cons_filtered(i))
         end do
 #else
@@ -942,86 +877,13 @@ contains
             end do
         end do
 
-        !$acc parallel loop collapse(3) gang vector default(present)
-        do i = 0, m
-            do j = 0, n
-                do k = 0, p 
-                    !$acc loop seq
-                    do l = 1, num_dims
-                        !$acc loop seq
-                        do q = 1, num_dims
-                            eff_visc(l)%vf(q)%sf(i, j, k) = eff_visc(l)%vf(q)%sf(i, j, k) * filtered_fluid_indicator_function%sf(i, j, k)
-                        end do 
-                    end do 
-                end do
-            end do 
-        end do
-
-        ! set boundary buffer zone values
-#ifdef MFC_MPI
-        do l = 1, num_dims
-            do q = 1, num_dims
-                call s_populate_scalarfield_buffers(eff_visc(l)%vf(q))
-            end do
-        end do
-#else
-        do l = 1, num_dims
-            do q = 1, num_dims
-                eff_visc(l)%vf(q)%sf(-buff_size:-1, :, :) = eff_visc(l)%vf(q)%sf(m-buff_size+1:m, :, :)
-                eff_visc(l)%vf(q)%sf(m+1:m+buff_size, :, :) = eff_visc(l)%vf(q)%sf(0:buff_size-1, :, :)
-
-                eff_visc(l)%vf(q)%sf(:, -buff_size:-1, :) = eff_visc(l)%vf(q)%sf(:, n-buff_size+1:n, :)
-                eff_visc(l)%vf(q)%sf(:, n+1:n+buff_size, :) = eff_visc(l)%vf(q)%sf(:, 0:buff_size-1, :)
-
-                eff_visc(l)%vf(q)%sf(:, :, -buff_size:-1) = eff_visc(l)%vf(q)%sf(:, :, p-buff_size+1:p)
-                eff_visc(l)%vf(q)%sf(:, :, p+1:p+buff_size) = eff_visc(l)%vf(q)%sf(:, :, 0:buff_size-1)
-            end do
-        end do
-#endif
-
-        ! div(eff_visc), using CD2 FD scheme 
-        !$acc parallel loop collapse(3) gang vector default(present) copy(div_eff_visc)
-        do i = 0, m
-            do j = 0, n 
-                do k = 0, p
-                    !$acc loop seq
-                    do l = 1, num_dims
-                        div_eff_visc(l, i, j, k) = (eff_visc(l)%vf(1)%sf(i+1, j, k) - eff_visc(l)%vf(1)%sf(i-1, j, k))/(2._wp*dx(i)) &
-                                                 + (eff_visc(l)%vf(2)%sf(i, j+1, k) - eff_visc(l)%vf(2)%sf(i, j-1, k))/(2._wp*dy(j)) & 
-                                                 + (eff_visc(l)%vf(3)%sf(i, j, k+1) - eff_visc(l)%vf(3)%sf(i, j, k-1))/(2._wp*dz(k))
-                    end do 
-                end do
-            end do
-        end do
-
-        !$acc parallel loop collapse(3) gang vector default(present) copyin(div_eff_visc)
-        do i = 0, m
-            do j = 0, n
-                do k = 0, p 
-                    mag_eff_visc%sf(i, j, k) = sqrt(div_eff_visc(1, i, j, k)**2 + div_eff_visc(2, i, j, k)**2 + div_eff_visc(3, i, j, k)**2)
-                end do
-            end do
-        end do
-
     end subroutine s_compute_effective_viscosity
 
-    subroutine s_compute_interphase_momentum_exchange(int_mom_exch, mag_int_mom_exch)
+    subroutine s_compute_interphase_momentum_exchange(int_mom_exch)
         type(scalar_field), dimension(1:num_dims), intent(in) :: int_mom_exch
-        type(scalar_field), intent(inout) :: mag_int_mom_exch
 
         integer :: i, j, k
 
-        !$acc parallel loop collapse(3) gang vector default(present)
-        do i = 0, m
-            do j = 0, n
-                do k = 0, p 
-                    mag_int_mom_exch%sf(i, j, k) = sqrt(int_mom_exch(1)%sf(i, j, k)**2 & 
-                                                      + int_mom_exch(2)%sf(i, j, k)**2 & 
-                                                      + int_mom_exch(3)%sf(i, j, k)**2)
-                end do
-            end do
-        end do 
-
     end subroutine s_compute_interphase_momentum_exchange
 
     ! computes x-,y-,z-direction forces on particles
diff --git a/toolchain/mfc/run/case_dicts.py b/toolchain/mfc/run/case_dicts.py
index f9bedd37af..b8ac4ba7c7 100644
--- a/toolchain/mfc/run/case_dicts.py
+++ b/toolchain/mfc/run/case_dicts.py
@@ -309,6 +309,7 @@ def analytic(self):
     'compute_autocorrelation': ParamType.LOG,
     't_step_stat_start': ParamType.INT,
     'filter_width': ParamType.REAL,
+    'q_filtered_wrt': ParamType.LOG,
 })
 
 for var in [ 'heatTransfer_model', 'massTransfer_model', 'pressure_corrector',

From 8ab21bdb0a1f7f66f0928f37f921908e95cfe4b1 Mon Sep 17 00:00:00 2001
From: conradd3 <conradd3@illinois.edu>
Date: Wed, 3 Sep 2025 12:43:42 -0500
Subject: [PATCH 11/30] Add filtered pressure statistics

---
 runs/phi01/case.py                      |  6 +-
 src/common/m_mpi_common.fpp             | 10 +++-
 src/post_process/m_data_input.f90       | 22 +++++--
 src/post_process/m_start_up.f90         |  9 ++-
 src/simulation/m_compute_statistics.fpp | 77 ++++++++++++++++++-------
 src/simulation/m_data_output.fpp        | 18 ++++--
 src/simulation/m_start_up.fpp           | 10 +++-
 src/simulation/m_volume_filtering.fpp   | 65 +++++++++------------
 8 files changed, 137 insertions(+), 80 deletions(-)

diff --git a/runs/phi01/case.py b/runs/phi01/case.py
index e47086a47e..9751518117 100644
--- a/runs/phi01/case.py
+++ b/runs/phi01/case.py
@@ -34,9 +34,9 @@
 #print('Kn = ' + str( np.sqrt(np.pi*gam_a/2)*(M/Re) )) # Kn < 0.01 = continuum flow
 
 dt = 4.0E-06
-Nt = 20
-t_save = 1
-t_step_start_stats = 10
+Nt = 100
+t_save = 10
+t_step_start_stats = 50
 
 Nx = 99
 Ny = 99
diff --git a/src/common/m_mpi_common.fpp b/src/common/m_mpi_common.fpp
index 4eca64e8ad..2469f11348 100644
--- a/src/common/m_mpi_common.fpp
+++ b/src/common/m_mpi_common.fpp
@@ -154,7 +154,7 @@ contains
     !! @param levelset_norm normalized vector from every cell to the closest point to the IB
     !! @param beta Eulerian void fraction from lagrangian bubbles
     subroutine s_initialize_mpi_data(q_cons_vf, ib_markers, levelset, levelset_norm, beta, filtered_fluid_indicator_function, &
-                                     stat_reynolds_stress, stat_eff_visc, stat_int_mom_exch, stat_q_cons_filtered)
+                                     stat_reynolds_stress, stat_eff_visc, stat_int_mom_exch, stat_q_cons_filtered, stat_filtered_pressure)
 
         type(scalar_field), &
             dimension(sys_size), &
@@ -179,7 +179,8 @@ contains
         type(vector_field), dimension(1:9), intent(in), optional :: stat_reynolds_stress
         type(vector_field), dimension(1:9), intent(in), optional :: stat_eff_visc
         type(vector_field), dimension(1:3), intent(in), optional :: stat_int_mom_exch
-        type(vector_field), dimension(1:sys_size), intent(in), optional :: stat_q_cons_filtered
+        type(vector_field), dimension(1:sys_size-1), intent(in), optional :: stat_q_cons_filtered
+        type(scalar_field), dimension(1:4), intent(in), optional :: stat_filtered_pressure
 
         integer, dimension(num_dims) :: sizes_glb, sizes_loc
         integer, dimension(1) :: airfoil_glb, airfoil_loc, airfoil_start
@@ -221,11 +222,14 @@ contains
                     MPI_IO_DATA%var(sys_size+73+(i-1)*4+j)%sf => stat_int_mom_exch(i)%vf(j)%sf(0:m, 0:n, 0:p)
                 end do
             end do
-            do i = 1, sys_size
+            do i = 1, sys_size-1
                 do j = 1, 4
                     MPI_IO_DATA%var(sys_size+85+(i-1)*4+j)%sf => stat_q_cons_filtered(i)%vf(j)%sf(0:m, 0:n, 0:p)
                 end do
             end do
+            do i = 1, 4 
+                MPI_IO_DATA%var(sys_size+105+i)%sf => stat_filtered_pressure(i)%sf(0:m, 0:n, 0:p)
+            end do
         end if
 
         if (present(beta)) then
diff --git a/src/post_process/m_data_input.f90 b/src/post_process/m_data_input.f90
index 7ac7a502dc..d2b203d675 100644
--- a/src/post_process/m_data_input.f90
+++ b/src/post_process/m_data_input.f90
@@ -65,6 +65,7 @@ end subroutine s_read_abstract_data_files
     type(vector_field), allocatable, dimension(:), public :: stat_eff_visc
     type(vector_field), allocatable, dimension(:), public :: stat_int_mom_exch
     type(vector_field), allocatable, dimension(:), public :: stat_q_cons_filtered
+    type(scalar_field), allocatable, dimension(:), public :: stat_filtered_pressure
 
     procedure(s_read_abstract_data_files), pointer :: s_read_data_files => null()
 
@@ -468,7 +469,8 @@ subroutine s_read_parallel_data_files(t_step)
                                                    stat_reynolds_stress=stat_reynolds_stress, & 
                                                    stat_eff_visc=stat_eff_visc, & 
                                                    stat_int_mom_exch=stat_int_mom_exch, & 
-                                                   stat_q_cons_filtered=stat_q_cons_filtered)
+                                                   stat_q_cons_filtered=stat_q_cons_filtered, & 
+                                                   stat_filtered_pressure=stat_filtered_pressure)
                     else 
                         call s_initialize_mpi_data(q_cons_vf, ib_markers)
                     end if
@@ -1347,7 +1349,8 @@ subroutine s_initialize_data_input_module
         if (q_filtered_wrt) allocate (stat_reynolds_stress(1:9))
         if (q_filtered_wrt) allocate (stat_eff_visc(1:9))
         if (q_filtered_wrt) allocate (stat_int_mom_exch(1:3))
-        if (q_filtered_wrt) allocate (stat_q_cons_filtered(1:sys_size))
+        if (q_filtered_wrt) allocate (stat_q_cons_filtered(1:sys_size-1))
+        if (q_filtered_wrt) allocate (stat_filtered_pressure(1:4))
 
         ! Allocating the parts of the conservative and primitive variables
         ! that do require the direct knowledge of the dimensionality of the
@@ -1414,16 +1417,21 @@ subroutine s_initialize_data_input_module
                                                                     -buff_size:p + buff_size))
                         end do 
                     end do
-                    do i = 1, sys_size
+                    do i = 1, sys_size-1
                         allocate (stat_q_cons_filtered(i)%vf(1:4))
                     end do 
-                    do i = 1, sys_size
+                    do i = 1, sys_size-1
                         do j = 1, 4 
                             allocate (stat_q_cons_filtered(i)%vf(j)%sf(-buff_size:m + buff_size, &
                                                                        -buff_size:n + buff_size, &
                                                                        -buff_size:p + buff_size))
                         end do 
                     end do
+                    do i = 1, 4 
+                        allocate (stat_filtered_pressure(i)%sf(-buff_size:m + buff_size, &
+                                                               -buff_size:n + buff_size, &
+                                                               -buff_size:p + buff_size))
+                    end do
                 end if
                 
                 ! Simulation is 2D
@@ -1541,13 +1549,17 @@ subroutine s_finalize_data_input_module
                 deallocate(stat_int_mom_exch(i)%vf)
             end do
             deallocate(stat_int_mom_exch)
-            do i = 1, sys_size
+            do i = 1, sys_size-1
                 do j = 1, 4 
                     deallocate (stat_q_cons_filtered(i)%vf(j)%sf)
                 end do 
                 deallocate(stat_q_cons_filtered(i)%vf)
             end do
             deallocate(stat_q_cons_filtered)
+            do i = 1, 4 
+                deallocate(stat_filtered_pressure(i)%sf)
+            end do
+            deallocate(stat_filtered_pressure)
         end if
 
         s_read_data_files => null()
diff --git a/src/post_process/m_start_up.f90 b/src/post_process/m_start_up.f90
index 59c3e9499d..11d9c8fbf1 100644
--- a/src/post_process/m_start_up.f90
+++ b/src/post_process/m_start_up.f90
@@ -361,7 +361,7 @@ subroutine s_save_data(t_step, varname, pres, c, H)
                     varname(:) = ' '
                 end do
             end do
-            do i = 1, sys_size
+            do i = 1, sys_size-1
                 do j = 1, 4 
                     q_sf = stat_q_cons_filtered(i)%vf(j)%sf(x_beg:x_end, y_beg:y_end, z_beg:z_end)
                     write (varname, '(A,I0,A,I0)') 'stat_q_cons_filtered', i, '_m', j
@@ -370,6 +370,13 @@ subroutine s_save_data(t_step, varname, pres, c, H)
                     varname(:) = ' '
                 end do 
             end do
+            do i = 1, 4 
+                q_sf = stat_filtered_pressure(i)%sf(x_beg:x_end, y_beg:y_end, z_beg:z_end)
+                write (varname, '(A,I0)') 'stat_filtered_pressure_m', i
+                call s_write_variable_to_formatted_database_file(varname, t_step)
+
+                varname(:) = ' '
+            end do
         end if
 
         ! Adding the species' concentrations to the formatted database file
diff --git a/src/simulation/m_compute_statistics.fpp b/src/simulation/m_compute_statistics.fpp
index 1e1e4de29d..6f73d6d967 100644
--- a/src/simulation/m_compute_statistics.fpp
+++ b/src/simulation/m_compute_statistics.fpp
@@ -22,16 +22,18 @@ module m_compute_statistics
     type(vector_field), allocatable, dimension(:) :: Msn_eff_visc
     type(vector_field), allocatable, dimension(:) :: Msn_int_mom_exch
     type(vector_field), allocatable, dimension(:) :: Msn_q_cons_filtered
+    type(scalar_field), allocatable, dimension(:) :: Msn_filtered_pressure
 
-    ! 2nd, 3rd, and 4th statistical moments for unclosed terms in volume filtered momentum equation
+    ! 1st, 2nd, 3rd, and 4th statistical moments for unclosed terms in volume filtered momentum equation
     type(vector_field), allocatable, dimension(:), public :: stat_reynolds_stress
     type(vector_field), allocatable, dimension(:), public :: stat_eff_visc
     type(vector_field), allocatable, dimension(:), public :: stat_int_mom_exch
     type(vector_field), allocatable, dimension(:), public :: stat_q_cons_filtered
+    type(scalar_field), allocatable, dimension(:), public :: stat_filtered_pressure
 
-    !$acc declare create(Msn_reynolds_stress, Msn_eff_visc, Msn_int_mom_exch, Msn_q_cons_filtered)
+    !$acc declare create(Msn_reynolds_stress, Msn_eff_visc, Msn_int_mom_exch, Msn_q_cons_filtered, Msn_filtered_pressure)
 
-    !$acc declare create(stat_reynolds_stress, stat_eff_visc, stat_int_mom_exch, stat_q_cons_filtered)
+    !$acc declare create(stat_reynolds_stress, stat_eff_visc, stat_int_mom_exch, stat_q_cons_filtered, stat_filtered_pressure)
 
 contains
 
@@ -71,17 +73,23 @@ contains
             @:ACC_SETUP_VFs(Msn_int_mom_exch(i))
         end do
 
-        @:ALLOCATE(Msn_q_cons_filtered(1:sys_size))
-        do i = 1, sys_size
+        @:ALLOCATE(Msn_q_cons_filtered(1:sys_size-1))
+        do i = 1, sys_size-1
             @:ALLOCATE(Msn_q_cons_filtered(i)%vf(1:4))
         end do 
-        do i = 1, sys_size
+        do i = 1, sys_size-1
             do j = 1, 4 
                 @:ALLOCATE(Msn_q_cons_filtered(i)%vf(j)%sf(0:m, 0:n, 0:p))
             end do
             @:ACC_SETUP_VFs(Msn_q_cons_filtered)
         end do
 
+        @:ALLOCATE(Msn_filtered_pressure(1:4))
+        do i = 1, 4
+            @:ALLOCATE(Msn_filtered_pressure(i)%sf(0:m, 0:n, 0:p))
+            @:ACC_SETUP_SFs(Msn_filtered_pressure(i))
+        end do
+
         @:ALLOCATE(stat_reynolds_stress(1:9))
         do i = 1, 9
             @:ALLOCATE(stat_reynolds_stress(i)%vf(1:4))
@@ -115,29 +123,38 @@ contains
             @:ACC_SETUP_VFs(stat_int_mom_exch(i))
         end do
 
-        @:ALLOCATE(stat_q_cons_filtered(1:sys_size))
-        do i = 1, sys_size
+        @:ALLOCATE(stat_q_cons_filtered(1:sys_size-1))
+        do i = 1, sys_size-1
             @:ALLOCATE(stat_q_cons_filtered(i)%vf(1:4))
         end do 
-        do i = 1, sys_size
+        do i = 1, sys_size-1
             do j = 1, 4 
                 @:ALLOCATE(stat_q_cons_filtered(i)%vf(j)%sf(0:m, 0:n, 0:p))
             end do
             @:ACC_SETUP_VFs(stat_q_cons_filtered)
         end do
 
+        @:ALLOCATE(stat_filtered_pressure(1:4))
+        do i = 1, 4
+            @:ALLOCATE(stat_filtered_pressure(i)%sf(0:m, 0:n, 0:p))
+            @:ACC_SETUP_SFs(stat_filtered_pressure(i))
+        end do
+
     end subroutine s_initialize_statistics_module
 
-    subroutine s_compute_statistics_momentum_unclosed_terms(n_step, reynolds_stress, eff_visc, int_mom_exch)
-        type(vector_field), dimension(3), intent(in) :: reynolds_stress 
-        type(vector_field), dimension(3), intent(in) :: eff_visc
-        type(scalar_field), dimension(3), intent(in) :: int_mom_exch
-        
-        integer, intent(in) :: n_step
+    subroutine s_compute_statistics_momentum_unclosed_terms(t_step, t_step_stat_start, reynolds_stress, eff_visc, int_mom_exch, q_cons_filtered, filtered_pressure)
+        type(vector_field), dimension(1:3), intent(in) :: reynolds_stress 
+        type(vector_field), dimension(1:3), intent(in) :: eff_visc
+        type(scalar_field), dimension(1:3), intent(in) :: int_mom_exch
+        type(scalar_field), dimension(sys_size-1), intent(in) :: q_cons_filtered
+        type(scalar_field), intent(in) :: filtered_pressure
+        integer, intent(in) :: t_step
+        integer, intent(in) :: t_step_stat_start
+
         real(wp) :: ns 
         integer :: i, j
 
-        ns = real(n_step, wp)
+        ns = real(t_step - t_step_stat_start, wp)
 
         ! update M1, M2, M3, M4
         do i = 1, 3
@@ -147,16 +164,24 @@ contains
             end do
             call s_update_statistics(ns, int_mom_exch(i), Msn_int_mom_exch(i)%vf)
         end do
+        do i = 1, sys_size-1
+            call s_update_statistics(ns, q_cons_filtered(i), Msn_q_cons_filtered(i)%vf)
+        end do
+        call s_update_statistics(ns, filtered_pressure, Msn_filtered_pressure)
 
         ! compute 1st, 2nd, 3rd, 4th order statistical moments
-        if (n_step > 3) then 
+        if (t_step == t_step_stop-1) then ! only compute at final time
             do i = 1, 3 
                 do j = 1, 3 
-                call s_compute_statistical_moments(ns, Msn_reynolds_stress((i-1)*3 + j)%vf, stat_reynolds_stress((i-1)*3 + j)%vf) 
-                call s_compute_statistical_moments(ns, Msn_eff_visc((i-1)*3 + j)%vf, stat_eff_visc((i-1)*3 + j)%vf) 
+                    call s_compute_statistical_moments(ns, Msn_reynolds_stress((i-1)*3 + j)%vf, stat_reynolds_stress((i-1)*3 + j)%vf) 
+                    call s_compute_statistical_moments(ns, Msn_eff_visc((i-1)*3 + j)%vf, stat_eff_visc((i-1)*3 + j)%vf) 
                 end do 
                 call s_compute_statistical_moments(ns, Msn_int_mom_exch(i)%vf, stat_int_mom_exch(i)%vf)  
             end do
+            do i = 1, sys_size-1
+                call s_compute_statistical_moments(ns, Msn_q_cons_filtered(i)%vf, stat_q_cons_filtered(i)%vf)
+            end do
+            call s_compute_statistical_moments(ns, Msn_filtered_pressure, stat_filtered_pressure)
         end if
 
     end subroutine s_compute_statistics_momentum_unclosed_terms
@@ -236,7 +261,7 @@ contains
         end do
         @:DEALLOCATE(Msn_int_mom_exch)
 
-        do i = 1, sys_size
+        do i = 1, sys_size-1
             do j = 1, 4
                 @:DEALLOCATE(Msn_q_cons_filtered(i)%vf(j)%sf)
             end do
@@ -244,6 +269,11 @@ contains
         end do
         @:DEALLOCATE(Msn_q_cons_filtered)
 
+        do i = 1, 4 
+            @:DEALLOCATE(Msn_filtered_pressure(i)%sf)
+        end do
+        @:DEALLOCATE(Msn_filtered_pressure)
+
         do i = 1, 9
             do j = 1, 4
                 @:DEALLOCATE(stat_reynolds_stress(i)%vf(j)%sf)
@@ -268,7 +298,7 @@ contains
         end do
         @:DEALLOCATE(stat_int_mom_exch)
 
-        do i = 1, sys_size
+        do i = 1, sys_size-1
             do j = 1, 4
                 @:DEALLOCATE(stat_q_cons_filtered(i)%vf(j)%sf)
             end do
@@ -276,6 +306,11 @@ contains
         end do
         @:DEALLOCATE(stat_q_cons_filtered)
 
+        do i = 1, 4 
+            @:DEALLOCATE(stat_filtered_pressure(i)%sf)
+        end do
+        @:DEALLOCATE(stat_filtered_pressure)
+
     end subroutine s_finalize_statistics_module
 
 end module m_compute_statistics
diff --git a/src/simulation/m_data_output.fpp b/src/simulation/m_data_output.fpp
index 840278d58f..94a04b94d3 100644
--- a/src/simulation/m_data_output.fpp
+++ b/src/simulation/m_data_output.fpp
@@ -77,7 +77,8 @@ contains
         !! @param q_prim_vf Primitive variables
         !! @param t_step Current time step
     subroutine s_write_data_files(q_cons_vf, q_T_sf, q_prim_vf, t_step, beta, filtered_fluid_indicator_function, &
-                                  stat_reynolds_stress, stat_eff_visc, stat_int_mom_exch, stat_q_cons_filtered)
+                                  stat_reynolds_stress, stat_eff_visc, stat_int_mom_exch, &
+                                  stat_q_cons_filtered, stat_filtered_pressure)
 
         type(scalar_field), &
             dimension(sys_size), &
@@ -99,13 +100,15 @@ contains
         type(vector_field), dimension(1:9), intent(inout), optional :: stat_reynolds_stress
         type(vector_field), dimension(1:9), intent(inout), optional :: stat_eff_visc
         type(vector_field), dimension(1:3), intent(inout), optional :: stat_int_mom_exch
-        type(vector_field), dimension(1:sys_size), intent(inout), optional :: stat_q_cons_filtered
+        type(vector_field), dimension(1:sys_size-1), intent(inout), optional :: stat_q_cons_filtered
+        type(scalar_field), dimension(1:4), intent(inout), optional :: stat_filtered_pressure
 
         if (.not. parallel_io) then
             call s_write_serial_data_files(q_cons_vf, q_T_sf, q_prim_vf, t_step, beta)
         else
             call s_write_parallel_data_files(q_cons_vf, q_prim_vf, t_step, beta, &
-                                             filtered_fluid_indicator_function, stat_reynolds_stress, stat_eff_visc, stat_int_mom_exch, stat_q_cons_filtered)
+                                             filtered_fluid_indicator_function, stat_reynolds_stress, stat_eff_visc, stat_int_mom_exch, &
+                                             stat_q_cons_filtered, stat_filtered_pressure)
         end if
 
     end subroutine s_write_data_files
@@ -795,7 +798,8 @@ contains
         !!  @param t_step Current time-step
         !!  @param beta Eulerian void fraction from lagrangian bubbles
     subroutine s_write_parallel_data_files(q_cons_vf, q_prim_vf, t_step, beta, filtered_fluid_indicator_function, &
-                                           stat_reynolds_stress, stat_eff_visc, stat_int_mom_exch, stat_q_cons_filtered)
+                                           stat_reynolds_stress, stat_eff_visc, stat_int_mom_exch, & 
+                                           stat_q_cons_filtered, stat_filtered_pressure)
 
         type(scalar_field), dimension(sys_size), intent(in) :: q_cons_vf
         type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf
@@ -805,7 +809,8 @@ contains
         type(vector_field), dimension(1:9), intent(inout), optional :: stat_reynolds_stress
         type(vector_field), dimension(1:9), intent(inout), optional :: stat_eff_visc
         type(vector_field), dimension(1:3), intent(inout), optional :: stat_int_mom_exch
-        type(vector_field), dimension(1:sys_size), intent(inout), optional :: stat_q_cons_filtered
+        type(vector_field), dimension(1:sys_size-1), intent(inout), optional :: stat_q_cons_filtered
+        type(scalar_field), dimension(1:4), intent(inout), optional :: stat_filtered_pressure
 
 #ifdef MFC_MPI
 
@@ -916,7 +921,8 @@ contains
                     call s_initialize_mpi_data(q_cons_vf, ib_markers, levelset, levelset_norm, &
                                                filtered_fluid_indicator_function=filtered_fluid_indicator_function, & 
                                                stat_reynolds_stress=stat_reynolds_stress, stat_eff_visc=stat_eff_visc, &
-                                               stat_int_mom_exch=stat_int_mom_exch, stat_q_cons_filtered=stat_q_cons_filtered)
+                                               stat_int_mom_exch=stat_int_mom_exch, stat_q_cons_filtered=stat_q_cons_filtered, &
+                                               stat_filtered_pressure=stat_filtered_pressure)
                 else
                     call s_initialize_mpi_data(q_cons_vf, ib_markers, levelset, levelset_norm)
                 end if
diff --git a/src/simulation/m_start_up.fpp b/src/simulation/m_start_up.fpp
index 1ddf59c881..531f4145da 100644
--- a/src/simulation/m_start_up.fpp
+++ b/src/simulation/m_start_up.fpp
@@ -1329,7 +1329,7 @@ contains
                 call nvtxEndRange
 
                 call nvtxStartRange("COMPUTE-STATISTICS")
-                call s_compute_statistics_momentum_unclosed_terms(t_step - t_step_stat_start, reynolds_stress, eff_visc, int_mom_exch)
+                call s_compute_statistics_momentum_unclosed_terms(t_step, t_step_stat_start, reynolds_stress, eff_visc, int_mom_exch, q_cons_filtered, filtered_pressure)
                 call nvtxEndRange
 
                 ! write(100, *) mag_reynolds_stress%sf(10, 10, 10)
@@ -1472,11 +1472,14 @@ contains
                     !$acc update host(stat_int_mom_exch(i)%vf(j)%sf)
                 end do 
             end do
-            do i = 1, sys_size
+            do i = 1, sys_size-1
                 do j = 1, 4 
                     !$acc update host(stat_q_cons_filtered(i)%vf(j)%sf)
                 end do 
             end do
+            do i = 1, 4
+                !$acc update host(stat_filtered_pressure(i)%sf)
+            end do
         end if
         do i = 1, sys_size
             !$acc update host(q_cons_ts(1)%vf(i)%sf)
@@ -1514,7 +1517,8 @@ contains
                 call s_write_data_files(q_cons_ts(1)%vf, q_T_sf, q_prim_vf, save_count, &
                                         filtered_fluid_indicator_function=filtered_fluid_indicator_function, &
                                         stat_reynolds_stress=stat_reynolds_stress, stat_eff_visc=stat_eff_visc, &
-                                        stat_int_mom_exch=stat_int_mom_exch, stat_q_cons_filtered=stat_q_cons_filtered)
+                                        stat_int_mom_exch=stat_int_mom_exch, stat_q_cons_filtered=stat_q_cons_filtered, &
+                                        stat_filtered_pressure=stat_filtered_pressure)
             else
                 call s_write_data_files(q_cons_ts(1)%vf, q_T_sf, q_prim_vf, save_count)
             end if
diff --git a/src/simulation/m_volume_filtering.fpp b/src/simulation/m_volume_filtering.fpp
index 7baf741244..ec659a7905 100644
--- a/src/simulation/m_volume_filtering.fpp
+++ b/src/simulation/m_volume_filtering.fpp
@@ -45,7 +45,8 @@ module m_volume_filtering
     type(scalar_field), public :: filtered_fluid_indicator_function
 
     ! volume filtered conservative variables
-    type(scalar_field), allocatable, dimension(:) :: q_cons_filtered
+    type(scalar_field), allocatable, dimension(:), public :: q_cons_filtered
+    type(scalar_field), public :: filtered_pressure
 
     ! viscous and pressure+viscous stress tensors
     type(vector_field), allocatable, dimension(:) :: visc_stress
@@ -59,20 +60,15 @@ module m_volume_filtering
     type(vector_field), allocatable, dimension(:), public :: eff_visc
     type(scalar_field), allocatable, dimension(:), public :: int_mom_exch
 
-    ! magnitude of unclosed terms in momentum equation
-    type(scalar_field), public :: mag_reynolds_stress
-    type(scalar_field), public :: mag_eff_visc
-    type(scalar_field), public :: mag_int_mom_exch
-
     ! 1/mu
     real(wp), allocatable, dimension(:, :) :: Res
 
     ! x-,y-,z-direction forces on particles
     real(wp), allocatable, dimension(:, :) :: particle_forces
 
-    !$acc declare create(fluid_indicator_function, filtered_fluid_indicator_function, q_cons_filtered)
+    !$acc declare create(fluid_indicator_function, filtered_fluid_indicator_function, q_cons_filtered, filtered_pressure)
     !$acc declare create(visc_stress, pres_visc_stress, div_pres_visc_stress)
-    !$acc declare create(reynolds_stress, eff_visc, int_mom_exch, mag_reynolds_stress, mag_eff_visc, mag_int_mom_exch)
+    !$acc declare create(reynolds_stress, eff_visc, int_mom_exch)
     !$acc declare create(Res, particle_forces)
 
 #if defined(MFC_OpenACC)
@@ -116,14 +112,19 @@ contains
         integer :: i, j, k
         integer :: size_n(1), inembed(1), onembed(1)
         
-        @:ALLOCATE(q_cons_filtered(1:sys_size))
-        do i = 1, sys_size
+        @:ALLOCATE(q_cons_filtered(1:sys_size-1))
+        do i = 1, sys_size-1
             @:ALLOCATE(q_cons_filtered(i)%sf(idwbuff(1)%beg:idwbuff(1)%end, &
                 idwbuff(2)%beg:idwbuff(2)%end, &
                 idwbuff(3)%beg:idwbuff(3)%end))
             @:ACC_SETUP_SFs(q_cons_filtered(i))
         end do
 
+        @:ALLOCATE(filtered_pressure%sf(idwbuff(1)%beg:idwbuff(1)%end, &
+            idwbuff(2)%beg:idwbuff(2)%end, &
+            idwbuff(3)%beg:idwbuff(3)%end))
+        @:ACC_SETUP_SFs(filtered_pressure)
+
         @:ALLOCATE(visc_stress(1:num_dims))
         do i = 1, num_dims
             @:ALLOCATE(visc_stress(i)%vf(1:num_dims))
@@ -192,21 +193,6 @@ contains
             @:ACC_SETUP_SFs(int_mom_exch(i))
         end do
 
-        @:ALLOCATE(mag_reynolds_stress%sf(idwbuff(1)%beg:idwbuff(1)%end, &
-            idwbuff(2)%beg:idwbuff(2)%end, &
-            idwbuff(3)%beg:idwbuff(3)%end))
-        @:ACC_SETUP_SFs(mag_reynolds_stress)
-
-        @:ALLOCATE(mag_eff_visc%sf(idwbuff(1)%beg:idwbuff(1)%end, &
-            idwbuff(2)%beg:idwbuff(2)%end, &
-            idwbuff(3)%beg:idwbuff(3)%end))
-        @:ACC_SETUP_SFs(mag_eff_visc)
-
-        @:ALLOCATE(mag_int_mom_exch%sf(idwbuff(1)%beg:idwbuff(1)%end, &
-            idwbuff(2)%beg:idwbuff(2)%end, &
-            idwbuff(3)%beg:idwbuff(3)%end))
-        @:ACC_SETUP_SFs(mag_int_mom_exch)
-
         if (viscous) then
             @:ALLOCATE(Res(1:2, 1:maxval(Re_size)))
         end if
@@ -537,11 +523,12 @@ contains
         call nvtxEndRange
 
         call nvtxStartRange("UNCLOSED-TERM-SETUP")
-        call s_setup_terms_filtering(q_cons_vf, reynolds_stress, visc_stress, pres_visc_stress, div_pres_visc_stress)
+        call s_setup_terms_filtering(q_cons_vf, reynolds_stress, visc_stress, pres_visc_stress, div_pres_visc_stress, filtered_pressure)
         call nvtxEndRange
 
         call nvtxStartRange("FILTER-UNCLOSED-TERM-VARS")
         call s_apply_fftw_filter_tensor(reynolds_stress, visc_stress, eff_visc, div_pres_visc_stress, int_mom_exch)
+        call s_apply_fftw_filter_scalarfield(filtered_fluid_indicator_function, .true., filtered_pressure)
         call nvtxEndRange
 
         call nvtxStartRange("COMPUTE-UNCLOSED-TERMS")
@@ -620,11 +607,11 @@ contains
     !< apply the gaussian filter to the conservative variables and compute their filtered components
     subroutine s_apply_fftw_filter_cons(q_cons_vf, q_cons_filtered)
         type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf
-        type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_filtered
+        type(scalar_field), dimension(sys_size-1), intent(inout) :: q_cons_filtered
 
         integer :: i
 
-        do i = 1, sys_size
+        do i = 1, sys_size-1
             call s_apply_fftw_filter_scalarfield(filtered_fluid_indicator_function, .true., q_cons_vf(i), q_cons_filtered(i))
         end do 
 
@@ -701,10 +688,11 @@ contains
 
     end subroutine s_compute_viscous_stress_tensor
     
-    subroutine s_compute_stress_tensor(pres_visc_stress, visc_stress, q_cons_vf)
+    subroutine s_compute_stress_tensor(pres_visc_stress, visc_stress, q_cons_vf, filtered_pressure)
         type(vector_field), dimension(num_dims), intent(inout) :: pres_visc_stress
         type(vector_field), dimension(num_dims), intent(in) :: visc_stress
         type(scalar_field), dimension(sys_size), intent(in) :: q_cons_vf
+        type(scalar_field), intent(inout) :: filtered_pressure
         real(wp) :: pressure
         integer :: i, j, k
 
@@ -724,6 +712,8 @@ contains
                     pres_visc_stress(3)%vf(1)%sf(i, j, k) = - visc_stress(3)%vf(1)%sf(i, j, k)
                     pres_visc_stress(3)%vf(2)%sf(i, j, k) = - visc_stress(3)%vf(2)%sf(i, j, k)
                     pres_visc_stress(3)%vf(3)%sf(i, j, k) = pressure - visc_stress(3)%vf(3)%sf(i, j, k)
+
+                    filtered_pressure%sf(i, j, k) = pressure
                 end do 
             end do 
         end do 
@@ -758,12 +748,13 @@ contains
     end subroutine s_compute_divergence_stress_tensor
 
     !< setup for calculation of unclosed terms in volume filtered momentum eqn
-    subroutine s_setup_terms_filtering(q_cons_vf, reynolds_stress, visc_stress, pres_visc_stress, div_pres_visc_stress)
+    subroutine s_setup_terms_filtering(q_cons_vf, reynolds_stress, visc_stress, pres_visc_stress, div_pres_visc_stress, filtered_pressure)
         type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf
         type(vector_field), dimension(1:num_dims), intent(inout) :: reynolds_stress
         type(vector_field), dimension(1:num_dims), intent(inout) :: visc_stress
         type(vector_field), dimension(1:num_dims), intent(inout) :: pres_visc_stress
         type(scalar_field), dimension(1:num_dims), intent(inout) :: div_pres_visc_stress
+        type(scalar_field), intent(inout) :: filtered_pressure
 
         integer :: i, j, k, l, q
 
@@ -804,14 +795,14 @@ contains
         ! effective viscosity setup, return viscous stress tensor
         call s_compute_viscous_stress_tensor(visc_stress, q_cons_vf)
 
-        call s_compute_stress_tensor(pres_visc_stress, visc_stress, q_cons_vf)
+        call s_compute_stress_tensor(pres_visc_stress, visc_stress, q_cons_vf, filtered_pressure)
 
         call s_compute_divergence_stress_tensor(div_pres_visc_stress, pres_visc_stress)
 
     end subroutine s_setup_terms_filtering
 
     subroutine s_compute_pseudo_turbulent_reynolds_stress(q_cons_filtered, reynolds_stress)
-        type(scalar_field), dimension(sys_size), intent(in) :: q_cons_filtered
+        type(scalar_field), dimension(sys_size-1), intent(in) :: q_cons_filtered
         type(vector_field), dimension(1:num_dims), intent(inout) :: reynolds_stress
         integer :: i, j, k, l, q    
 
@@ -834,7 +825,7 @@ contains
     end subroutine s_compute_pseudo_turbulent_reynolds_stress
 
     subroutine s_compute_effective_viscosity(q_cons_filtered, eff_visc, visc_stress)
-        type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_filtered
+        type(scalar_field), dimension(1:sys_size-1), intent(inout) :: q_cons_filtered
         type(vector_field), dimension(1:num_dims), intent(inout) :: eff_visc
         type(vector_field), dimension(1:num_dims), intent(inout) :: visc_stress
 
@@ -1143,11 +1134,13 @@ contains
         @:DEALLOCATE(fluid_indicator_function%sf)
         @:DEALLOCATE(filtered_fluid_indicator_function%sf)
 
-        do i = 1, sys_size
+        do i = 1, sys_size-1
             @:DEALLOCATE(q_cons_filtered(i)%sf)
         end do
         @:DEALLOCATE(q_cons_filtered)
 
+        @:DEALLOCATE(filtered_pressure%sf)
+
         do i = 1, num_dims
             do j = 1, num_dims
                 @:DEALLOCATE(visc_stress(i)%vf(j)%sf)
@@ -1190,10 +1183,6 @@ contains
         end do
         @:DEALLOCATE(int_mom_exch)
 
-        @:DEALLOCATE(mag_reynolds_stress%sf)
-        @:DEALLOCATE(mag_eff_visc%sf)
-        @:DEALLOCATE(mag_int_mom_exch%sf)
-
         @:DEALLOCATE(Res)
         @:DEALLOCATE(particle_forces)
 

From f1b883968096de53685beaec15d2fd83c314def2 Mon Sep 17 00:00:00 2001
From: conradd3 <conradd3@illinois.edu>
Date: Wed, 3 Sep 2025 17:07:27 -0500
Subject: [PATCH 12/30] updated pressure calc in vol filter and cleanup

---
 src/simulation/m_additional_forcing.fpp |  22 ++--
 src/simulation/m_start_up.fpp           |   9 +-
 src/simulation/m_volume_filtering.fpp   | 128 +++++++++---------------
 3 files changed, 58 insertions(+), 101 deletions(-)

diff --git a/src/simulation/m_additional_forcing.fpp b/src/simulation/m_additional_forcing.fpp
index ae1d028330..b3b6807b55 100644
--- a/src/simulation/m_additional_forcing.fpp
+++ b/src/simulation/m_additional_forcing.fpp
@@ -15,7 +15,7 @@ module m_additional_forcing
 
     private; public :: s_initialize_additional_forcing_module, & 
  s_add_periodic_forcing, s_finalize_additional_forcing_module, & 
- s_compute_phase_average, s_compute_periodic_forcing;
+ s_compute_periodic_forcing
 
     type(scalar_field), allocatable, dimension(:) :: q_periodic_force
     real(wp) :: volfrac_phi
@@ -61,8 +61,8 @@ contains
         end do
     end subroutine s_add_periodic_forcing
 
-    !< compute the space and time average of quantities
-    subroutine s_compute_phase_average(q_cons_vf, t_step)
+    !< compute the space and time average of quantities, compute the periodic forcing terms described in Khalloufi and Capecelatro
+    subroutine s_compute_periodic_forcing(q_cons_vf, t_step)
         type(scalar_field), dimension(sys_size), intent(in) :: q_cons_vf
         integer, intent(in) :: t_step
         real(wp) :: spatial_rho_glb, spatial_u_glb
@@ -95,18 +95,7 @@ contains
         phase_u = phase_u + (spatial_u_glb / real(N_x_total_glb, wp) - phase_u) / real(t_step, wp)
         !$acc update device(phase_rho, phase_u)
 
-        ! if (proc_rank == 0) then 
-        !     print *, t_step, 'rho', phase_rho, 'rho*u', phase_u
-        ! end if
-
-    end subroutine s_compute_phase_average
-
-    !< computes the periodic forcing terms described in Khalloufi and Capecelatro
-    subroutine s_compute_periodic_forcing(q_cons_vf)
-        type(scalar_field), dimension(sys_size), intent(in) :: q_cons_vf
-
-        integer :: i, j, k
-
+        ! compute periodic forcing terms for mass, momentum, energy
         !$acc parallel loop collapse(3) gang vector default(present)
         do i = 0, m
             do j = 0, n
@@ -122,6 +111,7 @@ contains
                 end do 
             end do
         end do
+
     end subroutine s_compute_periodic_forcing
 
     subroutine s_finalize_additional_forcing_module
@@ -132,4 +122,4 @@ contains
         @:DEALLOCATE(q_periodic_force)
     end subroutine s_finalize_additional_forcing_module
 
-end module m_additional_forcing
\ No newline at end of file
+end module m_additional_forcing
diff --git a/src/simulation/m_start_up.fpp b/src/simulation/m_start_up.fpp
index 531f4145da..0d65666498 100644
--- a/src/simulation/m_start_up.fpp
+++ b/src/simulation/m_start_up.fpp
@@ -1324,8 +1324,8 @@ contains
         ! Volume filter flow variables, compute unclosed terms and their statistics
         if (volume_filtering_momentum_eqn) then 
             if (t_step > t_step_stat_start) then  
-                call nvtxStartRange("VOLUME-FILTER-MOMENTUM-EQUATION")  
-                call s_volume_filter_momentum_eqn(q_cons_ts(1)%vf)
+                call nvtxStartRange("VOLUME-FILTERED-MOMENTUM-EQUATION")  
+                call s_volume_filter_momentum_eqn(q_cons_ts(1)%vf, q_prim_vf)
                 call nvtxEndRange
 
                 call nvtxStartRange("COMPUTE-STATISTICS")
@@ -1346,15 +1346,16 @@ contains
             !     close(101)
             ! end if
 
+            ! Compute explicit x-, y-, z- forces on each particle
             call nvtxStartRange("COMPUTE-PARTICLE-FORCES")
             call s_compute_particle_forces()
             call nvtxEndRange
         end if
 
+        ! Compute terms to force a constant mass flow rate in fully periodic domain
         if (periodic_forcing) then 
             call nvtxStartRange("COMPUTE-PERIODIC-FORCING")
-            call s_compute_phase_average(q_cons_ts(1)%vf, t_step+1)
-            call s_compute_periodic_forcing(q_cons_ts(1)%vf)
+            call s_compute_periodic_forcing(q_cons_ts(1)%vf, t_step+1)
             call nvtxEndRange
         end if
 
diff --git a/src/simulation/m_volume_filtering.fpp b/src/simulation/m_volume_filtering.fpp
index ec659a7905..c1317d624e 100644
--- a/src/simulation/m_volume_filtering.fpp
+++ b/src/simulation/m_volume_filtering.fpp
@@ -29,10 +29,10 @@ module m_volume_filtering
     private; public :: s_initialize_fftw_explicit_filter_module, &
  s_initialize_filtering_kernel, s_initialize_fluid_indicator_function, & 
  s_initialize_filtered_fluid_indicator_function, s_finalize_fftw_explicit_filter_module, & 
- s_apply_fftw_filter_cons, s_volume_filter_momentum_eqn, s_apply_fftw_filter_tensor, s_apply_fftw_filter_scalarfield, &
+ s_volume_filter_momentum_eqn, s_apply_fftw_filter_scalarfield, &
  s_compute_viscous_stress_tensor, s_compute_stress_tensor, s_compute_divergence_stress_tensor, s_compute_particle_forces, &
  s_mpi_transpose_slabZ2Y, s_mpi_transpose_slabY2Z, s_mpi_FFT_fwd, s_mpi_FFT_bwd, &
- s_setup_terms_filtering, s_compute_pseudo_turbulent_reynolds_stress, s_compute_effective_viscosity, s_compute_interphase_momentum_exchange
+ s_setup_terms_filtering, s_compute_pseudo_turbulent_reynolds_stress, s_compute_effective_viscosity
 
 #if !defined(MFC_OpenACC)
     include 'fftw3.f03'
@@ -514,27 +514,40 @@ contains
     end subroutine s_initialize_filtered_fluid_indicator_function
 
     !< calculate the unclosed terms present in the volume filtered momentum equation
-    subroutine s_volume_filter_momentum_eqn(q_cons_vf)
+    subroutine s_volume_filter_momentum_eqn(q_cons_vf, q_prim_vf)
         type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf
+        type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf
         integer :: i, j, k
 
         call nvtxStartRange("FILTER-CONS-VARS")
-        call s_apply_fftw_filter_cons(q_cons_vf, q_cons_filtered)
+        do i = 1, sys_size-1
+            call s_apply_fftw_filter_scalarfield(filtered_fluid_indicator_function, .true., q_cons_vf(i), q_cons_filtered(i))
+        end do 
+        call s_apply_fftw_filter_scalarfield(filtered_fluid_indicator_function, .true., q_prim_vf(E_idx), filtered_pressure)
         call nvtxEndRange
 
-        call nvtxStartRange("UNCLOSED-TERM-SETUP")
-        call s_setup_terms_filtering(q_cons_vf, reynolds_stress, visc_stress, pres_visc_stress, div_pres_visc_stress, filtered_pressure)
-        call nvtxEndRange
+        call nvtxStartRange("COMPUTE-UNCLOSED-TERMS")
+        call s_setup_terms_filtering(q_cons_vf, q_prim_vf, reynolds_stress, visc_stress, pres_visc_stress, div_pres_visc_stress)
 
-        call nvtxStartRange("FILTER-UNCLOSED-TERM-VARS")
-        call s_apply_fftw_filter_tensor(reynolds_stress, visc_stress, eff_visc, div_pres_visc_stress, int_mom_exch)
-        call s_apply_fftw_filter_scalarfield(filtered_fluid_indicator_function, .true., filtered_pressure)
-        call nvtxEndRange
+        ! pseudo turbulent reynolds stress
+        do i = 1, num_dims 
+            do j = 1, num_dims
+                call s_apply_fftw_filter_scalarfield(filtered_fluid_indicator_function, .true., reynolds_stress(i)%vf(j))
+            end do
+        end do 
+        ! effective viscosity
+        do i = 1, num_dims 
+            do j = 1, num_dims
+                call s_apply_fftw_filter_scalarfield(filtered_fluid_indicator_function, .true., visc_stress(i)%vf(j), eff_visc(i)%vf(j))
+            end do
+        end do 
+        ! interphase momentum exchange
+        do i = 1, num_dims
+            call s_apply_fftw_filter_scalarfield(filtered_fluid_indicator_function, .false., div_pres_visc_stress(i), int_mom_exch(i))
+        end do 
 
-        call nvtxStartRange("COMPUTE-UNCLOSED-TERMS")
         call s_compute_pseudo_turbulent_reynolds_stress(q_cons_filtered, reynolds_stress)
         call s_compute_effective_viscosity(q_cons_filtered, eff_visc, visc_stress)
-        call s_compute_interphase_momentum_exchange(int_mom_exch)
         call nvtxEndRange
 
     end subroutine s_volume_filter_momentum_eqn
@@ -569,7 +582,9 @@ contains
             end do
         end if
 
+        call nvtxStartRange("FORWARD-3D-FFT")
         call s_mpi_FFT_fwd 
+        call nvtxEndRange
 
         !$acc parallel loop collapse(3) gang vector default(present)
         do i = 1, NxC 
@@ -580,7 +595,9 @@ contains
             end do 
         end do
 
+        call nvtxStartRange("BACKWARD-3D-FFT")
         call s_mpi_FFT_bwd
+        call nvtxEndRange
 
         if (present(q_temp_out)) then 
             !$acc parallel loop collapse(3) gang vector default(present)
@@ -604,50 +621,6 @@ contains
 
     end subroutine s_apply_fftw_filter_scalarfield
 
-    !< apply the gaussian filter to the conservative variables and compute their filtered components
-    subroutine s_apply_fftw_filter_cons(q_cons_vf, q_cons_filtered)
-        type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf
-        type(scalar_field), dimension(sys_size-1), intent(inout) :: q_cons_filtered
-
-        integer :: i
-
-        do i = 1, sys_size-1
-            call s_apply_fftw_filter_scalarfield(filtered_fluid_indicator_function, .true., q_cons_vf(i), q_cons_filtered(i))
-        end do 
-
-    end subroutine s_apply_fftw_filter_cons
-
-    !< apply the gaussian filter to the requisite tensors to compute unclosed terms of interest
-    subroutine s_apply_fftw_filter_tensor(reynolds_stress, visc_stress, eff_visc, div_pres_visc_stress, int_mom_exch)
-        type(vector_field), dimension(1:num_dims), intent(inout) :: reynolds_stress
-        type(vector_field), dimension(1:num_dims), intent(inout) :: visc_stress
-        type(vector_field), dimension(1:num_dims), intent(inout) :: eff_visc
-        type(scalar_field), dimension(1:num_dims), intent(inout) :: div_pres_visc_stress
-        type(scalar_field), dimension(1:num_dims), intent(inout) :: int_mom_exch
-
-        integer :: i, j
-
-        ! pseudo turbulent reynolds stress
-        do i = 1, num_dims 
-            do j = 1, num_dims
-                call s_apply_fftw_filter_scalarfield(filtered_fluid_indicator_function, .true., reynolds_stress(i)%vf(j))
-            end do
-        end do 
-
-        ! effective viscosity
-        do i = 1, num_dims 
-            do j = 1, num_dims
-                call s_apply_fftw_filter_scalarfield(filtered_fluid_indicator_function, .true., visc_stress(i)%vf(j), eff_visc(i)%vf(j))
-            end do
-        end do 
-
-        ! interphase momentum exchange
-        do i = 1, num_dims
-            call s_apply_fftw_filter_scalarfield(filtered_fluid_indicator_function, .false., div_pres_visc_stress(i), int_mom_exch(i))
-        end do 
-
-    end subroutine s_apply_fftw_filter_tensor
-
     ! compute viscous stress tensor
     subroutine s_compute_viscous_stress_tensor(visc_stress, q_cons_vf)
         type(vector_field), dimension(num_dims), intent(inout) :: visc_stress 
@@ -688,11 +661,11 @@ contains
 
     end subroutine s_compute_viscous_stress_tensor
     
-    subroutine s_compute_stress_tensor(pres_visc_stress, visc_stress, q_cons_vf, filtered_pressure)
+    subroutine s_compute_stress_tensor(pres_visc_stress, visc_stress, q_cons_vf, q_prim_vf)
         type(vector_field), dimension(num_dims), intent(inout) :: pres_visc_stress
         type(vector_field), dimension(num_dims), intent(in) :: visc_stress
         type(scalar_field), dimension(sys_size), intent(in) :: q_cons_vf
-        type(scalar_field), intent(inout) :: filtered_pressure
+        type(scalar_field), dimension(sys_size), intent(in) :: q_prim_vf
         real(wp) :: pressure
         integer :: i, j, k
 
@@ -700,20 +673,15 @@ contains
         do i = 0, m 
             do j = 0, n 
                 do k = 0, p
-                    pressure = (q_cons_vf(E_idx)%sf(i, j, k) - 0.5_wp * (q_cons_vf(momxb)%sf(i, j, k)**2 + q_cons_vf(momxb+1)%sf(i, j, k)**2 + q_cons_vf(momxb+2)%sf(i, j, k)**2) &
-                             / q_cons_vf(contxb)%sf(i, j, k) - pi_infs(1) - qvs(1)) / (gammas(1))
-
-                    pres_visc_stress(1)%vf(1)%sf(i, j, k) = pressure - visc_stress(1)%vf(1)%sf(i, j, k)
+                    pres_visc_stress(1)%vf(1)%sf(i, j, k) = q_prim_vf(E_idx)%sf(i, j, k) - visc_stress(1)%vf(1)%sf(i, j, k)
                     pres_visc_stress(1)%vf(2)%sf(i, j, k) = - visc_stress(1)%vf(2)%sf(i, j, k) 
                     pres_visc_stress(1)%vf(3)%sf(i, j, k) = - visc_stress(1)%vf(3)%sf(i, j, k)
                     pres_visc_stress(2)%vf(1)%sf(i, j, k) = - visc_stress(2)%vf(1)%sf(i, j, k)
-                    pres_visc_stress(2)%vf(2)%sf(i, j, k) = pressure - visc_stress(2)%vf(2)%sf(i, j, k) 
+                    pres_visc_stress(2)%vf(2)%sf(i, j, k) = q_prim_vf(E_idx)%sf(i, j, k) - visc_stress(2)%vf(2)%sf(i, j, k) 
                     pres_visc_stress(2)%vf(3)%sf(i, j, k) = - visc_stress(2)%vf(3)%sf(i, j, k)
                     pres_visc_stress(3)%vf(1)%sf(i, j, k) = - visc_stress(3)%vf(1)%sf(i, j, k)
                     pres_visc_stress(3)%vf(2)%sf(i, j, k) = - visc_stress(3)%vf(2)%sf(i, j, k)
-                    pres_visc_stress(3)%vf(3)%sf(i, j, k) = pressure - visc_stress(3)%vf(3)%sf(i, j, k)
-
-                    filtered_pressure%sf(i, j, k) = pressure
+                    pres_visc_stress(3)%vf(3)%sf(i, j, k) = q_prim_vf(E_idx)%sf(i, j, k) - visc_stress(3)%vf(3)%sf(i, j, k)
                 end do 
             end do 
         end do 
@@ -748,13 +716,13 @@ contains
     end subroutine s_compute_divergence_stress_tensor
 
     !< setup for calculation of unclosed terms in volume filtered momentum eqn
-    subroutine s_setup_terms_filtering(q_cons_vf, reynolds_stress, visc_stress, pres_visc_stress, div_pres_visc_stress, filtered_pressure)
+    subroutine s_setup_terms_filtering(q_cons_vf, q_prim_vf, reynolds_stress, visc_stress, pres_visc_stress, div_pres_visc_stress)
         type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf
+        type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf
         type(vector_field), dimension(1:num_dims), intent(inout) :: reynolds_stress
         type(vector_field), dimension(1:num_dims), intent(inout) :: visc_stress
         type(vector_field), dimension(1:num_dims), intent(inout) :: pres_visc_stress
         type(scalar_field), dimension(1:num_dims), intent(inout) :: div_pres_visc_stress
-        type(scalar_field), intent(inout) :: filtered_pressure
 
         integer :: i, j, k, l, q
 
@@ -767,7 +735,7 @@ contains
                     do l = 1, num_dims
                         !$acc loop seq
                         do q = 1, num_dims
-                            reynolds_stress(l)%vf(q)%sf(i, j, k) = (q_cons_vf(momxb-1+l)%sf(i, j, k) * q_cons_vf(momxb-1+q)%sf(i, j, k)) / q_cons_vf(1)%sf(i, j, k) ! (rho*u x rho*u)/rho = rho*(u x u) 
+                            reynolds_stress(l)%vf(q)%sf(i, j, k) = q_cons_vf(1)%sf(i, j, k) * (q_prim_vf(momxb-1+l)%sf(i, j, k) * q_prim_vf(momxb-1+q)%sf(i, j, k)) ! rho*(u x u) 
                         end do
                     end do
                 end do
@@ -776,11 +744,11 @@ contains
 
         ! set density and momentum buffers
 #ifdef MFC_MPI
-        do i = 1, momxe 
+        do i = contxb, momxe 
             call s_populate_scalarfield_buffers(q_cons_vf(i))
         end do
 #else
-        do i = 1, momxe
+        do i = contxb, momxe
             q_cons_vf(i)%sf(-buff_size:-1, :, :) = q_cons_vf(i)%sf(m-buff_size+1:m, :, :)
             q_cons_vf(i)%sf(m+1:m+buff_size, :, :) = q_cons_vf(i)%sf(0:buff_size-1, :, :)
 
@@ -795,8 +763,9 @@ contains
         ! effective viscosity setup, return viscous stress tensor
         call s_compute_viscous_stress_tensor(visc_stress, q_cons_vf)
 
-        call s_compute_stress_tensor(pres_visc_stress, visc_stress, q_cons_vf, filtered_pressure)
+        call s_compute_stress_tensor(pres_visc_stress, visc_stress, q_cons_vf, q_prim_vf)
 
+        ! interphase momentum exchange term setup
         call s_compute_divergence_stress_tensor(div_pres_visc_stress, pres_visc_stress)
 
     end subroutine s_setup_terms_filtering
@@ -837,7 +806,7 @@ contains
             call s_populate_scalarfield_buffers(q_cons_filtered(i))
         end do
 #else
-        do i = 1, momxe
+        do i = contxb, momxe
             q_cons_filtered(i)%sf(-buff_size:-1, :, :) = q_cons_filtered(i)%sf(m-buff_size+1:m, :, :)
             q_cons_filtered(i)%sf(m+1:m+buff_size, :, :) = q_cons_filtered(i)%sf(0:buff_size-1, :, :)
 
@@ -870,13 +839,6 @@ contains
 
     end subroutine s_compute_effective_viscosity
 
-    subroutine s_compute_interphase_momentum_exchange(int_mom_exch)
-        type(scalar_field), dimension(1:num_dims), intent(in) :: int_mom_exch
-
-        integer :: i, j, k
-
-    end subroutine s_compute_interphase_momentum_exchange
-
     ! computes x-,y-,z-direction forces on particles
     subroutine s_compute_particle_forces
         real(wp), dimension(num_ibs, 3) :: force_glb
@@ -1036,7 +998,9 @@ contains
         end do 
 
         ! transpose z-slab to y-slab
+        call nvtxStartRange("SLAB-MPI-TRANSPOSE-Z2Y")
         call s_mpi_transpose_slabZ2Y 
+        call nvtxEndRange
 
         ! 3D y-slab -> 1D z, x, y
         !$acc parallel loop collapse(3) gang vector default(present)
@@ -1080,7 +1044,9 @@ contains
         end do
 
         ! transpose y-slab to z-slab
+        call nvtxStartRange("SLAB-MPI-TRANSPOSE-Y2Z")
         call s_mpi_transpose_slabY2Z
+        call nvtxEndRange
 
         ! 3D z-slab -> 1D y, x, z
         !$acc parallel loop collapse(3) gang vector default(present)

From b85bbf065a25487b888c155ef4cf5bcb48f60716 Mon Sep 17 00:00:00 2001
From: Conrad Delgado <conradd3@dt-login04.delta.ncsa.illinois.edu>
Date: Thu, 4 Sep 2025 11:04:58 -0500
Subject: [PATCH 13/30] pointer reference gpu bug in viscous stress tensor calc

---
 src/simulation/m_volume_filtering.fpp | 100 +++++++++++++++++---------
 1 file changed, 67 insertions(+), 33 deletions(-)

diff --git a/src/simulation/m_volume_filtering.fpp b/src/simulation/m_volume_filtering.fpp
index c1317d624e..fea8a22811 100644
--- a/src/simulation/m_volume_filtering.fpp
+++ b/src/simulation/m_volume_filtering.fpp
@@ -622,42 +622,76 @@ contains
     end subroutine s_apply_fftw_filter_scalarfield
 
     ! compute viscous stress tensor
-    subroutine s_compute_viscous_stress_tensor(visc_stress, q_cons_vf)
+    subroutine s_compute_viscous_stress_tensor(visc_stress, q_prim_vf, q_cons_filtered)
         type(vector_field), dimension(num_dims), intent(inout) :: visc_stress 
-        type(scalar_field), dimension(sys_size), intent(in) :: q_cons_vf
+        type(scalar_field), dimension(sys_size), intent(in), optional :: q_prim_vf
+        type(scalar_field), dimension(sys_size-1), intent(in), optional :: q_cons_filtered
         real(wp) :: dudx, dudy, dudz, dvdx, dvdy, dvdz, dwdx, dwdy, dwdz ! spatial velocity derivatives
         integer :: i, j, k 
 
-        !$acc parallel loop collapse(3) gang vector default(present) private(dudx, dudy, dudz, dvdx, dvdy, dvdz, dwdx, dwdy, dwdz)
-        do i = 0, m 
-            do j = 0, n 
-                do k = 0, p
-                    ! velocity gradients, local to each process
-                    dudx = ( q_cons_vf(2)%sf(i+1, j, k)/q_cons_vf(1)%sf(i+1, j, k) - q_cons_vf(2)%sf(i-1, j, k)/q_cons_vf(1)%sf(i-1, j, k) ) / (dx(i-1) + dx(i+1))
-                    dudy = ( q_cons_vf(2)%sf(i, j+1, k)/q_cons_vf(1)%sf(i, j+1, k) - q_cons_vf(2)%sf(i, j-1, k)/q_cons_vf(1)%sf(i, j-1, k) ) / (dy(j-1) + dy(j+1))
-                    dudz = ( q_cons_vf(2)%sf(i, j, k+1)/q_cons_vf(1)%sf(i, j, k+1) - q_cons_vf(2)%sf(i, j, k-1)/q_cons_vf(1)%sf(i, j, k-1) ) / (dz(k-1) + dz(k+1))
-
-                    dvdx = ( q_cons_vf(3)%sf(i+1, j, k)/q_cons_vf(1)%sf(i+1, j, k) - q_cons_vf(3)%sf(i-1, j, k)/q_cons_vf(1)%sf(i-1, j, k) ) / (dx(i-1) + dx(i+1))
-                    dvdy = ( q_cons_vf(3)%sf(i, j+1, k)/q_cons_vf(1)%sf(i, j+1, k) - q_cons_vf(3)%sf(i, j-1, k)/q_cons_vf(1)%sf(i, j-1, k) ) / (dy(j-1) + dy(j+1))
-                    dvdz = ( q_cons_vf(3)%sf(i, j, k+1)/q_cons_vf(1)%sf(i, j, k+1) - q_cons_vf(3)%sf(i, j, k-1)/q_cons_vf(1)%sf(i, j, k-1) ) / (dz(k-1) + dz(k+1))
-
-                    dwdx = ( q_cons_vf(4)%sf(i+1, j, k)/q_cons_vf(1)%sf(i+1, j, k) - q_cons_vf(4)%sf(i-1, j, k)/q_cons_vf(1)%sf(i-1, j, k) ) / (dx(i-1) + dx(i+1))
-                    dwdy = ( q_cons_vf(4)%sf(i, j+1, k)/q_cons_vf(1)%sf(i, j+1, k) - q_cons_vf(4)%sf(i, j-1, k)/q_cons_vf(1)%sf(i, j-1, k) ) / (dy(j-1) + dy(j+1))
-                    dwdz = ( q_cons_vf(4)%sf(i, j, k+1)/q_cons_vf(1)%sf(i, j, k+1) - q_cons_vf(4)%sf(i, j, k-1)/q_cons_vf(1)%sf(i, j, k-1) ) / (dz(k-1) + dz(k+1))
-
-                    ! viscous stress tensor, visc_stress(row, column)
-                    visc_stress(1)%vf(1)%sf(i, j, k) = (4._wp/3._wp * dudx - 2._wp/3._wp * (dvdy + dwdz)) / Res(1, 1)
-                    visc_stress(1)%vf(2)%sf(i, j, k) = (dudy + dvdx) / Res(1, 1)
-                    visc_stress(1)%vf(3)%sf(i, j, k) = (dudz + dwdx) / Res(1, 1)
-                    visc_stress(2)%vf(1)%sf(i, j, k) = (dvdx + dudy) / Res(1, 1)
-                    visc_stress(2)%vf(2)%sf(i, j, k) = (4._wp/3._wp * dvdy - 2._wp/3._wp * (dudx + dwdz)) / Res(1, 1)
-                    visc_stress(2)%vf(3)%sf(i, j, k) = (dvdz + dwdy) / Res(1, 1)
-                    visc_stress(3)%vf(1)%sf(i, j, k) = (dwdx + dudz) / Res(1, 1)
-                    visc_stress(3)%vf(2)%sf(i, j, k) = (dwdy + dvdz) / Res(1, 1)
-                    visc_stress(3)%vf(3)%sf(i, j, k) = (4._wp/3._wp * dwdz - 2._wp/3._wp * (dudx + dvdy)) / Res(1, 1)
+        if (present(q_prim_vf)) then
+            !$acc parallel loop collapse(3) gang vector default(present) private(dudx, dudy, dudz, dvdx, dvdy, dvdz, dwdx, dwdy, dwdz)
+            do i = 0, m 
+                do j = 0, n 
+                    do k = 0, p
+                        ! velocity gradients, local to each process
+                        dudx = ( q_prim_vf(2)%sf(i+1, j, k) - q_prim_vf(2)%sf(i-1, j, k) ) / (dx(i-1) + dx(i+1))
+                        dudy = ( q_prim_vf(2)%sf(i, j+1, k) - q_prim_vf(2)%sf(i, j-1, k) ) / (dy(j-1) + dy(j+1))
+                        dudz = ( q_prim_vf(2)%sf(i, j, k+1) - q_prim_vf(2)%sf(i, j, k-1) ) / (dz(k-1) + dz(k+1))
+
+                        dvdx = ( q_prim_vf(3)%sf(i+1, j, k) - q_prim_vf(3)%sf(i-1, j, k) ) / (dx(i-1) + dx(i+1))
+                        dvdy = ( q_prim_vf(3)%sf(i, j+1, k) - q_prim_vf(3)%sf(i, j-1, k) ) / (dy(j-1) + dy(j+1))
+                        dvdz = ( q_prim_vf(3)%sf(i, j, k+1) - q_prim_vf(3)%sf(i, j, k-1) ) / (dz(k-1) + dz(k+1))
+
+                        dwdx = ( q_prim_vf(4)%sf(i+1, j, k) - q_prim_vf(4)%sf(i-1, j, k) ) / (dx(i-1) + dx(i+1))
+                        dwdy = ( q_prim_vf(4)%sf(i, j+1, k) - q_prim_vf(4)%sf(i, j-1, k) ) / (dy(j-1) + dy(j+1))
+                        dwdz = ( q_prim_vf(4)%sf(i, j, k+1) - q_prim_vf(4)%sf(i, j, k-1) ) / (dz(k-1) + dz(k+1))
+
+                        ! viscous stress tensor, visc_stress(row, column)
+                        visc_stress(1)%vf(1)%sf(i, j, k) = (4._wp/3._wp * dudx - 2._wp/3._wp * (dvdy + dwdz)) / Res(1, 1)
+                        visc_stress(1)%vf(2)%sf(i, j, k) = (dudy + dvdx) / Res(1, 1)
+                        visc_stress(1)%vf(3)%sf(i, j, k) = (dudz + dwdx) / Res(1, 1)
+                        visc_stress(2)%vf(1)%sf(i, j, k) = (dvdx + dudy) / Res(1, 1)
+                        visc_stress(2)%vf(2)%sf(i, j, k) = (4._wp/3._wp * dvdy - 2._wp/3._wp * (dudx + dwdz)) / Res(1, 1)
+                        visc_stress(2)%vf(3)%sf(i, j, k) = (dvdz + dwdy) / Res(1, 1)
+                        visc_stress(3)%vf(1)%sf(i, j, k) = (dwdx + dudz) / Res(1, 1)
+                        visc_stress(3)%vf(2)%sf(i, j, k) = (dwdy + dvdz) / Res(1, 1)
+                        visc_stress(3)%vf(3)%sf(i, j, k) = (4._wp/3._wp * dwdz - 2._wp/3._wp * (dudx + dvdy)) / Res(1, 1)
+                    end do 
                 end do 
-            end do 
-        end do
+            end do
+        else if (present(q_cons_filtered)) then
+            !$acc parallel loop collapse(3) gang vector default(present) private(dudx, dudy, dudz, dvdx, dvdy, dvdz, dwdx, dwdy, dwdz)
+            do i = 0, m 
+                do j = 0, n 
+                    do k = 0, p
+                        ! velocity gradients, local to each process
+                        dudx = ( q_cons_filtered(2)%sf(i+1, j, k)/q_cons_filtered(1)%sf(i+1, j, k) - q_cons_filtered(2)%sf(i-1, j, k)/q_cons_filtered(1)%sf(i-1, j, k) ) / (dx(i-1) + dx(i+1))
+                        dudy = ( q_cons_filtered(2)%sf(i, j+1, k)/q_cons_filtered(1)%sf(i, j+1, k) - q_cons_filtered(2)%sf(i, j-1, k)/q_cons_filtered(1)%sf(i, j-1, k) ) / (dy(j-1) + dy(j+1))
+                        dudz = ( q_cons_filtered(2)%sf(i, j, k+1)/q_cons_filtered(1)%sf(i, j, k+1) - q_cons_filtered(2)%sf(i, j, k-1)/q_cons_filtered(1)%sf(i, j, k-1) ) / (dz(k-1) + dz(k+1))
+
+                        dvdx = ( q_cons_filtered(3)%sf(i+1, j, k)/q_cons_filtered(1)%sf(i+1, j, k) - q_cons_filtered(3)%sf(i-1, j, k)/q_cons_filtered(1)%sf(i-1, j, k) ) / (dx(i-1) + dx(i+1))
+                        dvdy = ( q_cons_filtered(3)%sf(i, j+1, k)/q_cons_filtered(1)%sf(i, j+1, k) - q_cons_filtered(3)%sf(i, j-1, k)/q_cons_filtered(1)%sf(i, j-1, k) ) / (dy(j-1) + dy(j+1))
+                        dvdz = ( q_cons_filtered(3)%sf(i, j, k+1)/q_cons_filtered(1)%sf(i, j, k+1) - q_cons_filtered(3)%sf(i, j, k-1)/q_cons_filtered(1)%sf(i, j, k-1) ) / (dz(k-1) + dz(k+1))
+
+                        dwdx = ( q_cons_filtered(4)%sf(i+1, j, k)/q_cons_filtered(1)%sf(i+1, j, k) - q_cons_filtered(4)%sf(i-1, j, k)/q_cons_filtered(1)%sf(i-1, j, k) ) / (dx(i-1) + dx(i+1))
+                        dwdy = ( q_cons_filtered(4)%sf(i, j+1, k)/q_cons_filtered(1)%sf(i, j+1, k) - q_cons_filtered(4)%sf(i, j-1, k)/q_cons_filtered(1)%sf(i, j-1, k) ) / (dy(j-1) + dy(j+1))
+                        dwdz = ( q_cons_filtered(4)%sf(i, j, k+1)/q_cons_filtered(1)%sf(i, j, k+1) - q_cons_filtered(4)%sf(i, j, k-1)/q_cons_filtered(1)%sf(i, j, k-1) ) / (dz(k-1) + dz(k+1))
+
+                        ! viscous stress tensor, visc_stress(row, column)
+                        visc_stress(1)%vf(1)%sf(i, j, k) = (4._wp/3._wp * dudx - 2._wp/3._wp * (dvdy + dwdz)) / Res(1, 1)
+                        visc_stress(1)%vf(2)%sf(i, j, k) = (dudy + dvdx) / Res(1, 1)
+                        visc_stress(1)%vf(3)%sf(i, j, k) = (dudz + dwdx) / Res(1, 1)
+                        visc_stress(2)%vf(1)%sf(i, j, k) = (dvdx + dudy) / Res(1, 1)
+                        visc_stress(2)%vf(2)%sf(i, j, k) = (4._wp/3._wp * dvdy - 2._wp/3._wp * (dudx + dwdz)) / Res(1, 1)
+                        visc_stress(2)%vf(3)%sf(i, j, k) = (dvdz + dwdy) / Res(1, 1)
+                        visc_stress(3)%vf(1)%sf(i, j, k) = (dwdx + dudz) / Res(1, 1)
+                        visc_stress(3)%vf(2)%sf(i, j, k) = (dwdy + dvdz) / Res(1, 1)
+                        visc_stress(3)%vf(3)%sf(i, j, k) = (4._wp/3._wp * dwdz - 2._wp/3._wp * (dudx + dvdy)) / Res(1, 1)
+                    end do 
+                end do 
+            end do
+        end if
 
     end subroutine s_compute_viscous_stress_tensor
     
@@ -761,7 +795,7 @@ contains
 #endif
         
         ! effective viscosity setup, return viscous stress tensor
-        call s_compute_viscous_stress_tensor(visc_stress, q_cons_vf)
+        call s_compute_viscous_stress_tensor(visc_stress, q_prim_vf=q_prim_vf)
 
         call s_compute_stress_tensor(pres_visc_stress, visc_stress, q_cons_vf, q_prim_vf)
 
@@ -819,7 +853,7 @@ contains
 #endif
 
         ! calculate stress tensor with filtered quantities 
-        call s_compute_viscous_stress_tensor(visc_stress, q_cons_filtered)
+        call s_compute_viscous_stress_tensor(visc_stress, q_cons_filtered=q_cons_filtered)
 
         ! calculate eff_visc
         !$acc parallel loop collapse(3) gang vector default(present)

From aff2ca46f05bf209b0ae7f3c2cbe356b37f938ad Mon Sep 17 00:00:00 2001
From: conradd3 <conradd3@illinois.edu>
Date: Fri, 5 Sep 2025 13:52:01 -0500
Subject: [PATCH 14/30] bug fix causes 1d tests to fail

---
 src/simulation/m_start_up.fpp      | 12 +++++-------
 src/simulation/m_time_steppers.fpp |  2 --
 src/simulation/p_main.fpp          |  2 +-
 3 files changed, 6 insertions(+), 10 deletions(-)

diff --git a/src/simulation/m_start_up.fpp b/src/simulation/m_start_up.fpp
index 0d65666498..c593da603d 100644
--- a/src/simulation/m_start_up.fpp
+++ b/src/simulation/m_start_up.fpp
@@ -89,8 +89,6 @@ module m_start_up
 
     use m_mhd
 
-    use m_compute_particle_forces
-
     use m_additional_forcing 
 
     use m_volume_filtering
@@ -1347,9 +1345,11 @@ contains
             ! end if
 
             ! Compute explicit x-, y-, z- forces on each particle
-            call nvtxStartRange("COMPUTE-PARTICLE-FORCES")
-            call s_compute_particle_forces()
-            call nvtxEndRange
+            if (compute_particle_drag) then
+                call nvtxStartRange("COMPUTE-PARTICLE-FORCES")
+                call s_compute_particle_forces()
+                call nvtxEndRange
+            end if
         end if
 
         ! Compute terms to force a constant mass flow rate in fully periodic domain
@@ -1637,7 +1637,6 @@ contains
 
         if (mhd .and. powell) call s_initialize_mhd_powell_module
 
-        if (compute_particle_drag) call s_initialize_particle_forces_module()
         if (periodic_forcing) call s_initialize_additional_forcing_module()
         if (volume_filtering_momentum_eqn) then 
             call s_initialize_fftw_explicit_filter_module()
@@ -1786,7 +1785,6 @@ contains
         if (bodyForces) call s_finalize_body_forces_module()
         if (mhd .and. powell) call s_finalize_mhd_powell_module
 
-        if (compute_particle_drag) call s_finalize_particle_forces_module()
         if (periodic_forcing) call s_finalize_additional_forcing_module()
         if (volume_filtering_momentum_eqn) call s_finalize_fftw_explicit_filter_module
 
diff --git a/src/simulation/m_time_steppers.fpp b/src/simulation/m_time_steppers.fpp
index 5132efbb23..93d2c91724 100644
--- a/src/simulation/m_time_steppers.fpp
+++ b/src/simulation/m_time_steppers.fpp
@@ -671,8 +671,6 @@ contains
 
         real(wp) :: start, finish
 
-        integer :: n_step
-
         ! Stage 1 of 3
 
         if (.not. adap_dt) then
diff --git a/src/simulation/p_main.fpp b/src/simulation/p_main.fpp
index 80b3e4ecf0..9cd571b2ac 100644
--- a/src/simulation/p_main.fpp
+++ b/src/simulation/p_main.fpp
@@ -56,7 +56,7 @@ program p_main
     call s_initialize_gpu_vars()
     call nvtxEndRange
 
-    call s_initialize_fluid_indicator_function()
+    if (volume_filtering_momentum_eqn .or. periodic_forcing) call s_initialize_fluid_indicator_function()
     if (volume_filtering_momentum_eqn) then 
         call s_initialize_filtering_kernel()
         call s_initialize_filtered_fluid_indicator_function()

From 622a0a60db5dabe7445c3047f934768a8d007f30 Mon Sep 17 00:00:00 2001
From: Conrad Delgado <conraddelgado@Conrads-MacBook-Air-6.local>
Date: Sun, 7 Sep 2025 22:21:33 -0500
Subject: [PATCH 15/30] particle drag force bug fix, accumulation

---
 runs/phi01/sphere_array_locations.txt | 380 +++++++++++++-------------
 src/simulation/m_volume_filtering.fpp |  25 +-
 voronoi/gen_voronoi_3D.py             |   2 +-
 3 files changed, 211 insertions(+), 196 deletions(-)

diff --git a/runs/phi01/sphere_array_locations.txt b/runs/phi01/sphere_array_locations.txt
index cb062253cc..047707ef90 100644
--- a/runs/phi01/sphere_array_locations.txt
+++ b/runs/phi01/sphere_array_locations.txt
@@ -1,190 +1,190 @@
--2.269415855407714844e-01 -1.414051055908203125e-01 3.922535181045532227e-01
-4.000198841094970703e-01 2.981948852539062500e-02 -2.832174301147460938e-01
--3.220155239105224609e-01 -3.898024559020996094e-01 -3.041059970855712891e-01
-2.814270257949829102e-01 -7.608795166015625000e-02 -1.437755823135375977e-01
--2.728327512741088867e-01 4.227894544601440430e-01 3.520679473876953125e-02
--4.947633743286132812e-01 -4.232151508331298828e-01 -2.972397804260253906e-01
--1.808261871337890625e-02 2.877434492111206055e-01 -2.310247421264648438e-01
-3.818988800048828125e-01 3.529353141784667969e-01 1.727198362350463867e-01
--2.346787452697753906e-01 2.829644680023193359e-01 1.594238281250000000e-01
--4.887726306915283203e-01 4.662406444549560547e-02 7.227540016174316406e-02
--2.048213481903076172e-01 4.885343313217163086e-01 -2.821706533432006836e-01
--4.693455696105957031e-01 4.566423892974853516e-01 1.360166072845458984e-02
--2.810692787170410156e-02 3.964089155197143555e-01 2.224528789520263672e-01
--4.457854032516479492e-01 2.029451131820678711e-01 -2.691650390625000000e-01
--4.315460920333862305e-01 3.888773918151855469e-02 -4.190684556961059570e-01
-4.569005966186523438e-01 4.780390262603759766e-01 -1.672872304916381836e-01
-4.523042440414428711e-01 2.975084781646728516e-01 -5.123972892761230469e-03
-1.155309677124023438e-01 3.826811313629150391e-01 3.157733678817749023e-01
--3.529649972915649414e-01 3.223993778228759766e-01 3.534083366394042969e-01
--1.699209213256835938e-03 -3.757699728012084961e-01 4.251234531402587891e-01
-3.104512691497802734e-01 3.631212711334228516e-01 3.740961551666259766e-01
-3.886995315551757812e-01 -4.476237297058105469e-01 3.331944942474365234e-01
--8.131015300750732422e-02 3.511540889739990234e-01 6.623625755310058594e-02
-5.544662475585937500e-03 2.087895870208740234e-01 -4.609942436218261719e-03
--2.697887420654296875e-01 7.647264003753662109e-02 1.385573148727416992e-01
--4.056740999221801758e-01 -2.304553985595703125e-03 -2.276074886322021484e-01
--3.986057043075561523e-01 -8.398652076721191406e-02 8.779549598693847656e-02
-1.455659866333007812e-01 -5.315554141998291016e-02 3.587335348129272461e-01
--3.624105453491210938e-02 -1.932673454284667969e-01 3.783030509948730469e-01
-2.404289245605468750e-01 2.313592433929443359e-01 -9.129595756530761719e-02
-4.290236234664916992e-01 -2.806437015533447266e-01 -3.928461074829101562e-01
-3.948264122009277344e-01 1.061335802078247070e-01 -1.345469951629638672e-01
-4.199941158294677734e-01 -5.409121513366699219e-02 -4.431722164154052734e-01
--1.276044845581054688e-01 5.453205108642578125e-02 4.209027290344238281e-01
-2.240920066833496094e-01 5.745470523834228516e-02 -2.274198532104492188e-01
-3.475044965744018555e-01 -1.186680793762207031e-02 3.881464004516601562e-01
--1.399791240692138672e-02 -5.303645133972167969e-02 3.192350864410400391e-01
-3.149266242980957031e-01 -4.960085153579711914e-01 4.852926731109619141e-01
-1.159789562225341797e-01 7.240676879882812500e-02 -2.081871032714843750e-03
-3.457980155944824219e-01 -4.685097932815551758e-01 1.311070919036865234e-01
--3.134734630584716797e-01 -1.447633504867553711e-01 2.294397354125976562e-01
--2.322396039962768555e-01 4.453787803649902344e-01 2.214672565460205078e-01
-7.549452781677246094e-02 2.149226665496826172e-01 1.942512989044189453e-01
-4.877026081085205078e-01 9.565687179565429688e-02 4.446644783020019531e-01
-2.452219724655151367e-01 -1.041567325592041016e-02 -4.420824050903320312e-01
-3.802477121353149414e-01 -2.260215282440185547e-01 -6.829130649566650391e-02
-4.026585817337036133e-01 -9.730875492095947266e-02 5.328035354614257812e-02
--1.340943574905395508e-01 -2.988189458847045898e-01 4.915304183959960938e-01
-1.499507427215576172e-01 -1.232669353485107422e-01 -3.215692043304443359e-01
-7.229900360107421875e-02 1.496689319610595703e-01 -1.584017276763916016e-01
-4.887890815734863281e-02 -2.996931076049804688e-01 -6.179094314575195312e-02
-3.264107704162597656e-01 1.829891204833984375e-01 4.166131019592285156e-01
-3.418397903442382812e-01 -3.681684732437133789e-01 -1.888689994812011719e-01
--1.746954917907714844e-01 3.889560699462890625e-03 2.538719177246093750e-01
--1.082150936126708984e-01 -1.183983087539672852e-01 -4.667383432388305664e-01
-4.464948177337646484e-02 7.829546928405761719e-02 -4.987317323684692383e-01
-2.724659442901611328e-01 3.989661931991577148e-01 -2.271368503570556641e-01
-2.325954437255859375e-01 2.180564403533935547e-01 7.740092277526855469e-02
-4.475378990173339844e-01 8.053278923034667969e-02 2.720277309417724609e-01
-2.500159740447998047e-01 1.361670494079589844e-01 -4.378540515899658203e-01
--8.050751686096191406e-02 2.042385339736938477e-01 4.733436107635498047e-01
-6.334328651428222656e-02 3.953868150711059570e-01 -1.099604368209838867e-01
--6.584823131561279297e-02 4.609835147857666016e-01 -2.351213693618774414e-01
--3.965889215469360352e-01 2.626715898513793945e-01 -4.403696060180664062e-01
--4.123499393463134766e-01 4.679954051971435547e-01 -4.630439281463623047e-01
--3.268948793411254883e-01 -2.706754207611083984e-01 4.083846807479858398e-01
-4.519817829132080078e-01 -4.413130283355712891e-01 -4.950367212295532227e-01
-1.736700534820556641e-01 -4.334635734558105469e-01 3.858578205108642578e-01
--2.476015090942382812e-01 -8.808064460754394531e-02 -3.171390295028686523e-01
-1.416424512863159180e-01 6.130337715148925781e-03 1.613216400146484375e-01
-1.161313056945800781e-01 -8.472347259521484375e-02 -6.638598442077636719e-02
-6.862294673919677734e-02 7.571196556091308594e-02 -3.263452053070068359e-01
--2.883186340332031250e-01 1.637139320373535156e-01 -1.617478132247924805e-01
-4.712302684783935547e-01 -1.252410411834716797e-01 2.302359342575073242e-01
--3.321516513824462891e-02 -3.931099176406860352e-01 -1.693089008331298828e-01
-4.347057342529296875e-01 3.060367107391357422e-01 -1.781182289123535156e-01
-4.378421306610107422e-01 -2.324944734573364258e-01 4.174745082855224609e-01
-1.022851467132568359e-02 -1.360912322998046875e-01 6.093466281890869141e-02
-1.258683204650878906e-01 -2.447234392166137695e-01 3.956108093261718750e-01
--1.879813671112060547e-01 3.079674243927001953e-01 3.408046960830688477e-01
--3.804820775985717773e-01 3.240450620651245117e-01 -1.224457025527954102e-01
-4.557719230651855469e-01 -3.179004192352294922e-01 2.294783592224121094e-01
--1.324630975723266602e-01 -2.825807332992553711e-01 2.794981002807617188e-02
--9.088420867919921875e-02 -4.938784837722778320e-01 -4.559993743896484375e-01
-4.321286678314208984e-01 1.908559799194335938e-01 -4.160747528076171875e-01
-4.761004447937011719e-01 -3.449964523315429688e-02 -9.512662887573242188e-02
--3.295238018035888672e-01 -4.874784946441650391e-01 3.628075122833251953e-01
--3.269430398941040039e-01 4.961208105087280273e-01 -1.530282497406005859e-01
-1.903204917907714844e-01 4.334928989410400391e-01 1.328067779541015625e-01
--1.938850879669189453e-01 -3.347861766815185547e-01 3.228425979614257812e-01
--7.716512680053710938e-02 -1.792883872985839844e-01 -1.214803457260131836e-01
-2.945523262023925781e-01 4.375331401824951172e-01 -4.941940307617187500e-02
-2.805604934692382812e-01 3.923368453979492188e-02 3.594279289245605469e-03
--3.963446617126464844e-02 4.087066650390625000e-02 1.291446685791015625e-01
-3.017591238021850586e-01 -4.672487974166870117e-01 -3.370153903961181641e-01
--5.923175811767578125e-02 -1.029053926467895508e-01 -2.954306602478027344e-01
--4.299471378326416016e-01 1.944204568862915039e-01 1.885912418365478516e-01
-1.226736307144165039e-01 -4.231331348419189453e-01 -4.431772232055664062e-01
--1.630305051803588867e-01 1.654865741729736328e-01 1.177084445953369141e-02
--2.820068597793579102e-01 -1.914020776748657227e-01 4.649567604064941406e-02
--1.803944110870361328e-01 -5.573785305023193359e-02 3.654372692108154297e-02
-3.560798168182373047e-01 -2.656357288360595703e-01 1.175208091735839844e-01
-4.641888141632080078e-01 3.300178050994873047e-01 4.690952301025390625e-01
--3.651157617568969727e-01 4.143847227096557617e-01 -3.058776855468750000e-01
-4.892169237136840820e-01 4.351882934570312500e-01 3.436787128448486328e-01
-1.252651214599609375e-02 1.140588521957397461e-01 3.147521018981933594e-01
-2.564185857772827148e-01 4.870939254760742188e-01 2.839587926864624023e-01
-2.440360784530639648e-01 2.740068435668945312e-01 2.384872436523437500e-01
--1.093761920928955078e-01 2.005448341369628906e-01 2.263984680175781250e-01
-2.751414775848388672e-01 3.257715702056884766e-01 -4.216753244400024414e-01
--2.344570159912109375e-01 3.708822727203369141e-01 -4.901626110076904297e-01
-3.321342468261718750e-01 2.205178737640380859e-01 -2.770333290100097656e-01
--3.562602996826171875e-01 2.268432378768920898e-01 3.148293495178222656e-02
--2.453712224960327148e-01 -3.159594535827636719e-01 -1.403638124465942383e-01
--4.225530624389648438e-01 1.738572120666503906e-01 4.009822607040405273e-01
--2.291325330734252930e-01 3.076609373092651367e-01 -2.942405939102172852e-01
--4.163160324096679688e-01 -1.362502574920654297e-01 -4.328134059906005859e-01
-1.602690219879150391e-01 4.211304187774658203e-01 4.947811365127563477e-01
-1.699512004852294922e-01 -3.455421924591064453e-01 1.857841014862060547e-01
-1.917399168014526367e-01 -2.274444103240966797e-01 -1.499438285827636719e-01
-5.063652992248535156e-02 -7.577204704284667969e-02 -4.671556949615478516e-01
-1.856522560119628906e-01 1.085456609725952148e-01 3.598620891571044922e-01
-2.133283615112304688e-01 -1.748585700988769531e-01 5.385351181030273438e-02
-1.607365608215332031e-01 2.551939487457275391e-01 -2.725876569747924805e-01
-4.380518198013305664e-01 2.549636363983154297e-01 2.876336574554443359e-01
--2.457389831542968750e-01 -4.205622673034667969e-01 -4.621033668518066406e-01
--4.958317279815673828e-01 -4.657427072525024414e-01 1.988265514373779297e-01
-6.845688819885253906e-02 2.681604623794555664e-01 -4.308686256408691406e-01
--4.200789928436279297e-01 3.732924461364746094e-01 1.710724830627441406e-01
-3.544092178344726562e-02 -3.218197822570800781e-01 8.597135543823242188e-02
--5.194902420043945312e-02 1.222956180572509766e-02 -5.136704444885253906e-02
-1.391673088073730469e-01 2.476061582565307617e-01 4.182490110397338867e-01
--1.033620834350585938e-01 3.683781623840332031e-02 -3.891081809997558594e-01
--4.138703346252441406e-01 -3.311948776245117188e-01 -4.624009132385253906e-01
--9.261775016784667969e-02 1.478457450866699219e-01 -1.957361698150634766e-01
-2.608032226562500000e-01 -1.573407649993896484e-01 4.948087930679321289e-01
--1.243667602539062500e-01 -4.962480068206787109e-01 3.667256832122802734e-01
--4.454655647277832031e-01 -2.705636024475097656e-01 1.070654392242431641e-01
--4.106376171112060547e-01 -1.618578433990478516e-01 -6.648111343383789062e-02
-3.302078247070312500e-01 -2.219557762145996094e-02 1.648344993591308594e-01
--1.774271726608276367e-01 3.244402408599853516e-01 -9.372758865356445312e-02
-2.811634540557861328e-01 1.279127597808837891e-01 2.315803766250610352e-01
-2.449696063995361328e-01 -3.595451116561889648e-01 -6.689429283142089844e-03
-5.237126350402832031e-02 -2.531653642654418945e-01 -4.304802417755126953e-01
--2.635989189147949219e-01 -2.267163991928100586e-01 -4.170490503311157227e-01
--2.721209526062011719e-01 1.574560403823852539e-01 2.993257045745849609e-01
--3.956274986267089844e-01 2.191853523254394531e-02 2.550070285797119141e-01
--1.563029289245605469e-01 -2.704749107360839844e-01 -2.991802692413330078e-01
-7.597208023071289062e-02 -1.699868440628051758e-01 2.227045297622680664e-01
--3.653595447540283203e-01 -4.391734600067138672e-01 1.462922096252441406e-01
-1.705410480499267578e-01 -4.559497833251953125e-01 -1.512272357940673828e-01
--1.343528032302856445e-01 -1.545200347900390625e-01 2.051105499267578125e-01
-5.652284622192382812e-02 -3.860473632812500000e-02 -1.806387901306152344e-01
-7.492136955261230469e-02 -4.894123077392578125e-01 3.830230236053466797e-02
-2.993867397308349609e-01 -3.184000253677368164e-01 2.854095697402954102e-01
-9.030818939208984375e-02 4.506881237030029297e-01 -3.190367221832275391e-01
-1.546680927276611328e-01 -3.337359428405761719e-01 -2.724964618682861328e-01
-1.143584251403808594e-01 3.319869041442871094e-01 3.964900970458984375e-02
-3.128879070281982422e-01 -1.711206436157226562e-01 -2.891231775283813477e-01
-3.134812116622924805e-01 -3.195825815200805664e-01 4.452092647552490234e-01
--4.257751703262329102e-01 -3.556568622589111328e-01 3.161740303039550781e-01
--4.604424238204956055e-01 1.566462516784667969e-01 -7.651758193969726562e-02
--4.535093307495117188e-01 -1.047830581665039062e-01 3.779829740524291992e-01
--7.651937007904052734e-02 3.510303497314453125e-01 -4.015958309173583984e-01
--5.069994926452636719e-02 -3.019337654113769531e-01 2.270703315734863281e-01
-4.413585662841796875e-01 3.929922580718994141e-01 -3.560695648193359375e-01
-2.530579566955566406e-01 -3.169052600860595703e-01 -4.228000640869140625e-01
--6.997537612915039062e-02 1.933835744857788086e-01 -3.526034355163574219e-01
--1.785504817962646484e-01 1.803159713745117188e-03 -1.765856742858886719e-01
--1.506757736206054688e-02 3.296717405319213867e-01 4.055316448211669922e-01
--4.429192543029785156e-01 -3.453509807586669922e-01 -1.209781169891357422e-01
--2.643674612045288086e-01 1.182488203048706055e-01 -3.157637119293212891e-01
-4.684782028198242188e-02 -4.617999792098999023e-01 2.509958744049072266e-01
--3.250834941864013672e-01 2.819657325744628906e-03 4.174815416336059570e-01
--3.355050086975097656e-02 -4.035353660583496094e-01 -3.605549335479736328e-01
--3.662085533142089844e-01 -2.316244840621948242e-01 -2.756531238555908203e-01
--2.576720714569091797e-01 -1.255595684051513672e-02 -4.626390933990478516e-01
--3.275632858276367188e-01 2.991151809692382812e-02 -4.782438278198242188e-02
-4.056546688079833984e-01 1.594020128250122070e-01 7.798624038696289062e-02
--2.715262174606323242e-01 -3.173813819885253906e-01 1.938788890838623047e-01
-3.270006179809570312e-02 -2.296169996261596680e-01 -2.338488101959228516e-01
--1.381781101226806641e-01 -4.450683593750000000e-01 1.390277147293090820e-01
-4.581812620162963867e-01 -4.004166126251220703e-01 5.525112152099609375e-03
--2.281215190887451172e-01 -1.310509443283081055e-01 -1.401650905609130859e-01
--2.425242662429809570e-01 1.733251810073852539e-01 -4.973032474517822266e-01
--1.258821487426757812e-01 -4.724828004837036133e-01 -5.991733074188232422e-02
-4.821944236755371094e-01 -1.722755432128906250e-01 -2.475099563598632812e-01
-2.750682830810546875e-02 4.665797948837280273e-01 4.664119482040405273e-01
--3.053290843963623047e-01 -3.777220249176025391e-01 2.397775650024414062e-03
-2.908480167388916016e-01 -1.594734191894531250e-01 2.671622037887573242e-01
+-8.877599239349365234e-02 1.935560703277587891e-01 -6.486654281616210938e-02
+-3.341052532196044922e-01 4.142935276031494141e-01 -4.567451477050781250e-01
+2.565863132476806641e-01 -4.949223995208740234e-02 -4.442641735076904297e-01
+3.103950023651123047e-01 -2.099078893661499023e-01 -4.642441272735595703e-01
+-3.521966934204101562e-02 -1.745276451110839844e-01 -3.202521800994873047e-01
+-1.949143409729003906e-02 -1.775810718536376953e-01 -3.603804111480712891e-02
+-1.835894584655761719e-01 3.262339830398559570e-01 -3.085057735443115234e-01
+-1.445159912109375000e-01 1.513528823852539062e-01 -2.023205757141113281e-01
+-4.898538589477539062e-01 -4.509705305099487305e-01 -1.682095527648925781e-01
+3.143328428268432617e-01 4.728571176528930664e-01 1.526627540588378906e-01
+1.280879974365234375e-01 1.239399909973144531e-01 -3.574787378311157227e-01
+-1.123933792114257812e-01 -3.207942247390747070e-01 9.310150146484375000e-02
+-1.386029720306396484e-01 -1.205575466156005859e-02 2.014696598052978516e-01
+-2.808933258056640625e-01 3.925647735595703125e-01 2.450205087661743164e-01
+4.294252395629882812e-02 2.894115447998046875e-01 -2.536165714263916016e-02
+1.801455020904541016e-01 5.933284759521484375e-02 4.247887134552001953e-01
+1.872421503067016602e-01 3.063344955444335938e-02 8.561480045318603516e-02
+2.484493255615234375e-01 -4.173127412796020508e-01 3.008729219436645508e-01
+8.203792572021484375e-02 1.318891048431396484e-01 -1.190292835235595703e-02
+-4.555282592773437500e-01 -3.696656227111816406e-01 2.237200736999511719e-02
+-1.931151151657104492e-01 5.374908447265625000e-02 5.545830726623535156e-02
+-6.292748451232910156e-02 1.790912151336669922e-01 1.174246072769165039e-01
+-2.316267490386962891e-01 -4.075572490692138672e-01 4.597637653350830078e-01
+-3.437596559524536133e-01 4.005973339080810547e-01 -2.290433645248413086e-01
+-1.910818815231323242e-01 -4.736427068710327148e-01 -2.076803445816040039e-01
+-4.528397321701049805e-01 7.907927036285400391e-02 3.940449953079223633e-01
+1.893968582153320312e-01 4.864903688430786133e-01 -3.449935913085937500e-01
+7.300472259521484375e-02 -3.667246103286743164e-01 3.762015104293823242e-01
+-1.821663379669189453e-01 -4.775607585906982422e-02 3.386561870574951172e-01
+5.136466026306152344e-02 4.852104187011718750e-01 -4.752502441406250000e-01
+-3.295025825500488281e-01 -5.519819259643554688e-02 5.781412124633789062e-02
+4.343043565750122070e-01 2.689909934997558594e-01 3.341940641403198242e-01
+-3.969779014587402344e-01 -2.916865348815917969e-01 -1.138211488723754883e-01
+-4.619355201721191406e-01 2.032375335693359375e-02 -1.161878108978271484e-01
+7.124900817871093750e-03 1.223111152648925781e-02 4.087531566619873047e-01
+-3.908715248107910156e-01 1.400717496871948242e-01 2.354013919830322266e-02
+-1.070375442504882812e-01 3.122891187667846680e-01 2.600712776184082031e-01
+4.667922258377075195e-01 -2.228868007659912109e-01 2.890402078628540039e-01
+9.751558303833007812e-03 3.652515411376953125e-01 1.688425540924072266e-01
+-7.598793506622314453e-02 1.410543918609619141e-02 -6.586468219757080078e-02
+-3.012117147445678711e-01 -1.333975791931152344e-02 -2.475223541259765625e-01
+1.425679922103881836e-01 -1.594284772872924805e-01 4.271366596221923828e-01
+-3.488619327545166016e-01 3.043293952941894531e-01 1.312527656555175781e-01
+1.347296237945556641e-01 -2.548012733459472656e-01 2.497346401214599609e-01
+1.558208465576171875e-01 -1.695448160171508789e-01 8.221673965454101562e-02
+2.994102239608764648e-01 -2.616212368011474609e-01 3.708097934722900391e-01
+4.749594926834106445e-01 4.012154340744018555e-01 -1.113747358322143555e-01
+4.658288955688476562e-01 -2.405116558074951172e-01 -4.019365310668945312e-01
+-4.477721452713012695e-01 1.802740097045898438e-01 2.297303676605224609e-01
+2.828998565673828125e-01 3.781812191009521484e-01 -4.897345304489135742e-01
+-1.556029319763183594e-01 -1.499896049499511719e-01 -1.702260971069335938e-01
+-2.203900814056396484e-01 4.228965044021606445e-01 3.943344354629516602e-01
+-7.529938220977783203e-02 -4.034370183944702148e-01 -4.895013570785522461e-01
+-2.633322477340698242e-01 2.260003089904785156e-01 3.617374897003173828e-01
+-2.043257951736450195e-01 -2.201197147369384766e-01 4.399769306182861328e-01
+2.097340822219848633e-01 -3.915596008300781250e-02 -2.276867628097534180e-01
+-1.167770624160766602e-01 4.129269123077392578e-01 -4.588322639465332031e-01
+3.195565938949584961e-01 2.821329832077026367e-01 2.030262947082519531e-01
+4.332208633422851562e-02 2.999825477600097656e-01 -2.426314353942871094e-01
+-2.900393009185791016e-01 7.278752326965332031e-02 3.351804018020629883e-01
+-3.045821189880371094e-02 -1.478650569915771484e-01 3.491390943527221680e-01
+-2.793753147125244141e-02 -1.773738861083984375e-01 1.675630807876586914e-01
+-3.188729286193847656e-01 -4.904426336288452148e-01 -6.549203395843505859e-02
+-4.071967601776123047e-01 -1.066761016845703125e-01 -4.441113471984863281e-01
+4.105618000030517578e-01 -3.848595619201660156e-01 1.863635778427124023e-01
+-1.051111221313476562e-01 -7.725274562835693359e-02 -4.898943901062011719e-01
+3.737279176712036133e-01 1.056033372879028320e-01 4.786680936813354492e-01
+2.511825561523437500e-01 -3.347592353820800781e-01 1.227176189422607422e-01
+-3.208853006362915039e-01 -1.442481279373168945e-01 -9.813189506530761719e-02
+3.365310430526733398e-01 -4.063715934753417969e-01 -4.750763177871704102e-01
+-3.066674470901489258e-01 -2.005393505096435547e-01 -2.603935003280639648e-01
+4.633438587188720703e-02 -3.628603219985961914e-01 -3.448045253753662109e-01
+-1.228909492492675781e-01 4.968223571777343750e-01 1.755017042160034180e-01
+4.529950618743896484e-01 -4.122850894927978516e-01 3.542938232421875000e-01
+3.015396595001220703e-01 6.062459945678710938e-02 -5.255222320556640625e-02
+7.875204086303710938e-02 -3.220939636230468750e-01 2.097034454345703125e-02
+-3.075191974639892578e-01 -4.913786649703979492e-01 1.174443960189819336e-01
+-2.157187461853027344e-01 -1.293109655380249023e-01 -3.813669681549072266e-01
+-2.569644451141357422e-01 -4.775856733322143555e-01 -3.842570781707763672e-01
+3.374536037445068359e-01 2.595454454421997070e-01 -1.862519979476928711e-01
+-2.484831809997558594e-01 1.898849010467529297e-02 -1.008712053298950195e-01
+-3.550199270248413086e-01 -3.802776336669921875e-03 2.112603187561035156e-01
+-4.047393798828125000e-02 -3.331716060638427734e-01 -1.580150127410888672e-01
+2.301404476165771484e-01 1.020783185958862305e-01 2.300353050231933594e-01
+-4.886188507080078125e-01 -4.335124492645263672e-01 -3.716624975204467773e-01
+3.109852075576782227e-01 -3.871500492095947266e-02 1.583197116851806641e-01
+4.864922761917114258e-01 -2.506246566772460938e-01 4.611170291900634766e-01
+4.114500284194946289e-01 -2.497513294219970703e-01 8.945560455322265625e-02
+-2.041511535644531250e-01 -3.061387538909912109e-01 -1.002895832061767578e-01
+-3.356888294219970703e-01 -2.898548841476440430e-01 -4.294934272766113281e-01
+6.349623203277587891e-02 -4.237914085388183594e-01 1.809575557708740234e-01
+1.638014316558837891e-01 -3.412141799926757812e-01 -4.808696508407592773e-01
+4.292991161346435547e-01 -7.350444793701171875e-02 4.452385902404785156e-01
+-2.837867736816406250e-01 2.394533157348632812e-02 -4.843814373016357422e-01
+-2.125334739685058594e-01 1.921176910400390625e-01 -2.379369735717773438e-02
+1.759276390075683594e-01 4.892826080322265625e-01 4.419517517089843750e-01
+-4.233963489532470703e-01 7.077014446258544922e-02 -3.061563968658447266e-01
+-3.712041378021240234e-01 4.946417212486267090e-01 3.635656833648681641e-01
+-4.665093421936035156e-01 4.070787429809570312e-01 -3.274630308151245117e-01
+3.692833185195922852e-01 -8.178091049194335938e-02 -1.193681955337524414e-01
+6.124496459960937500e-03 -2.011668682098388672e-02 8.408391475677490234e-02
+-1.337385177612304688e-02 -2.435498237609863281e-01 -4.735767841339111328e-01
+2.590975761413574219e-01 -3.270063400268554688e-01 -5.099523067474365234e-02
+3.800438642501831055e-01 4.123662710189819336e-01 -3.175902366638183594e-01
+2.355668544769287109e-01 2.839933633804321289e-01 -3.255009651184082031e-01
+-4.340230226516723633e-01 -4.109045267105102539e-01 4.977314472198486328e-01
+2.350783348083496094e-02 -7.954597473144531250e-02 -2.089430093765258789e-01
+2.528522014617919922e-01 2.231028079986572266e-01 -4.818900823593139648e-01
+3.285017013549804688e-01 -1.968045234680175781e-01 2.016012668609619141e-01
+3.276336193084716797e-01 3.824212551116943359e-01 -2.195405960083007812e-02
+4.347554445266723633e-01 -1.944565773010253906e-02 -3.952792882919311523e-01
+-2.355787754058837891e-01 2.512185573577880859e-01 -4.705796241760253906e-01
+2.304534912109375000e-01 2.335491180419921875e-01 3.436188697814941406e-01
+4.291563034057617188e-01 2.084137201309204102e-01 -3.515939712524414062e-01
+4.610210657119750977e-01 2.877938747406005859e-01 9.413146972656250000e-02
+3.239741325378417969e-01 4.200505018234252930e-01 3.377312421798706055e-01
+-4.339945316314697266e-01 -1.799043416976928711e-01 1.667797565460205078e-01
+4.162905216217041016e-01 -2.838604450225830078e-01 -1.204760074615478516e-01
+4.708716869354248047e-01 4.452165365219116211e-01 4.702655076980590820e-01
+3.935134410858154297e-01 -4.494274854660034180e-01 -1.000511646270751953e-02
+-3.325940370559692383e-01 -3.989632129669189453e-01 -2.595729827880859375e-01
+-4.726890325546264648e-01 -1.577985286712646484e-01 -2.004265785217285156e-02
+-2.578830718994140625e-01 1.816778182983398438e-01 1.800514459609985352e-01
+2.873079776763916016e-01 -1.582661867141723633e-01 1.000881195068359375e-03
+1.284685134887695312e-01 -2.347108125686645508e-01 -1.527856588363647461e-01
+-4.975929260253906250e-01 4.154947996139526367e-01 2.424190044403076172e-01
+1.319632530212402344e-01 2.181564569473266602e-01 1.456822156906127930e-01
+4.251360893249511719e-02 5.486690998077392578e-02 2.446963787078857422e-01
+5.265474319458007812e-03 -4.930623769760131836e-01 1.795315742492675781e-02
+3.435378074645996094e-01 -1.437039375305175781e-01 -2.955729961395263672e-01
+-1.589361429214477539e-01 3.439151048660278320e-01 -1.269352436065673828e-01
+-2.996790409088134766e-01 -2.977983951568603516e-01 5.047678947448730469e-02
+1.387677192687988281e-01 -4.051816463470458984e-02 -6.590497493743896484e-02
+-4.859859943389892578e-01 4.686148166656494141e-01 6.054759025573730469e-02
+3.058031797409057617e-01 -4.722125530242919922e-01 -1.649188995361328125e-01
+3.712953329086303711e-01 -3.612419366836547852e-01 -2.953444719314575195e-01
+-2.350592613220214844e-01 1.253683567047119141e-01 -3.582476377487182617e-01
+-4.282865524291992188e-01 -3.783413171768188477e-01 1.956710815429687500e-01
+-1.545268297195434570e-01 -3.127627372741699219e-01 -3.272031545639038086e-01
+2.250815629959106445e-01 -3.367059230804443359e-01 -2.811298370361328125e-01
+-5.611097812652587891e-02 2.276177406311035156e-01 -3.761705160140991211e-01
+1.843569278717041016e-01 3.698165416717529297e-01 -1.461877822875976562e-01
+-3.651070594787597656e-01 3.224494457244873047e-01 -4.502046108245849609e-02
+-1.052534580230712891e-01 1.394950151443481445e-01 3.169180154800415039e-01
+-7.266819477081298828e-02 -3.203969001770019531e-01 3.076763153076171875e-01
+-1.534210443496704102e-01 -1.421678066253662109e-02 -2.598439455032348633e-01
+4.644811153411865234e-01 -2.855896949768066406e-02 6.111550331115722656e-02
+1.615400314331054688e-01 4.353706836700439453e-01 2.680056095123291016e-01
+-4.789991378784179688e-01 -2.737338542938232422e-01 -2.684531211853027344e-01
+-4.801630973815917969e-01 -1.131765842437744141e-01 -2.253174781799316406e-01
+4.725518226623535156e-01 2.924776077270507812e-01 -4.712775945663452148e-01
+3.934500217437744141e-01 6.538939476013183594e-02 -2.147150039672851562e-01
+5.674338340759277344e-02 1.684566736221313477e-01 4.750093221664428711e-01
+-3.127444982528686523e-01 1.864537000656127930e-01 -1.828011274337768555e-01
+-6.377077102661132812e-02 3.063268661499023438e-01 4.461523294448852539e-01
+-2.393376827239990234e-01 -2.101924419403076172e-01 2.160568237304687500e-01
+-4.714767932891845703e-01 2.386778593063354492e-01 -1.962506771087646484e-01
+-4.175131320953369141e-01 1.262202262878417969e-01 -4.906876087188720703e-01
+1.526114940643310547e-01 -1.855427026748657227e-01 -3.443827629089355469e-01
+6.579875946044921875e-02 -4.886317253112792969e-02 -4.445745944976806641e-01
+1.098661422729492188e-01 3.471816778182983398e-01 4.010045528411865234e-01
+2.641906738281250000e-01 -2.310740947723388672e-01 -1.801049709320068359e-01
+2.215981483459472656e-02 1.125121116638183594e-01 -2.007805109024047852e-01
+4.692313671112060547e-01 -3.348422050476074219e-02 2.421901226043701172e-01
+3.015110492706298828e-01 -7.356131076812744141e-02 3.514482975006103516e-01
+-3.965770006179809570e-01 2.962644100189208984e-01 3.929857015609741211e-01
+1.106926202774047852e-01 -4.377689361572265625e-01 -1.675007343292236328e-01
+1.297621726989746094e-01 -8.046376705169677734e-02 2.488052845001220703e-01
+1.898322105407714844e-01 1.719188690185546875e-01 -1.696370840072631836e-01
+4.060682058334350586e-01 1.258714199066162109e-01 1.274476051330566406e-01
+1.603732109069824219e-01 3.966591358184814453e-01 6.766164302825927734e-02
+5.054616928100585938e-02 2.127890586853027344e-01 3.031399250030517578e-01
+-1.690447330474853516e-01 -1.416635513305664062e-01 3.728961944580078125e-02
+-1.341120004653930664e-01 1.080242395401000977e-01 4.635136127471923828e-01
+-2.457776069641113281e-01 -3.851659297943115234e-01 2.513883113861083984e-01
+-1.634557247161865234e-01 -4.583904743194580078e-01 -2.824854850769042969e-02
+-1.784324645996093750e-03 4.497978687286376953e-01 -1.161942481994628906e-01
+4.503953456878662109e-01 1.885429620742797852e-01 -4.877877235412597656e-02
+2.600491046905517578e-01 2.236571311950683594e-01 2.091717720031738281e-02
+-3.822712898254394531e-01 2.547247409820556641e-01 -3.687927722930908203e-01
+-3.667194843292236328e-01 -1.171383857727050781e-01 3.846424818038940430e-01
+-3.668913841247558594e-01 -2.955377101898193359e-01 3.536789417266845703e-01
+2.957736253738403320e-01 8.799576759338378906e-02 -3.451507091522216797e-01
+-1.604117155075073242e-01 3.587515354156494141e-01 5.187714099884033203e-02
+1.919094324111938477e-01 -4.781463146209716797e-01 4.655241966247558594e-03
+-3.640174865722656250e-02 4.754726886749267578e-01 -2.942006587982177734e-01
+-6.335353851318359375e-02 4.125511646270751953e-02 -3.732511997222900391e-01
+9.152126312255859375e-02 3.327772617340087891e-01 -4.209873676300048828e-01
+-5.436992645263671875e-02 4.926524162292480469e-01 3.434299230575561523e-01
+3.771104812622070312e-01 9.526658058166503906e-02 2.973334789276123047e-01
diff --git a/src/simulation/m_volume_filtering.fpp b/src/simulation/m_volume_filtering.fpp
index fea8a22811..02a2819a25 100644
--- a/src/simulation/m_volume_filtering.fpp
+++ b/src/simulation/m_volume_filtering.fpp
@@ -321,7 +321,9 @@ contains
 #endif
 
         ! file for particle forces
-        open(unit=100, file='particle_force.bin', status='replace', form='unformatted', access='stream')
+        if (proc_rank == 0) then
+            open(unit=100, file='particle_force.bin', status='replace', form='unformatted', access='stream', action='write')
+        end if
 
     end subroutine s_initialize_fftw_explicit_filter_module
 
@@ -879,21 +881,27 @@ contains
         real(wp) :: dvol
         integer :: i, j, k, l
 
+        ! zero particle forces
+        particle_forces = 0.0_wp
+        !$acc update device(particle_forces)
+
         !$acc parallel loop collapse(3) gang vector default(present) private(dvol)
         do i = 0, m 
             do j = 0, n 
                 do k = 0, p
                     dvol = dx(i) * dy(j) * dz(k)
                     !$acc atomic
-                    particle_forces(ib_markers%sf(i, j, k), 1) = particle_forces(ib_markers%sf(i, j, k), 1) + div_pres_visc_stress(1)%sf(i, j, k) * dvol
+                    particle_forces(ib_markers%sf(i, j, k), 1) = particle_forces(ib_markers%sf(i, j, k), 1) - div_pres_visc_stress(1)%sf(i, j, k) * dvol
                     !$acc atomic
-                    particle_forces(ib_markers%sf(i, j, k), 2) = particle_forces(ib_markers%sf(i, j, k), 2) + div_pres_visc_stress(2)%sf(i, j, k) * dvol
+                    particle_forces(ib_markers%sf(i, j, k), 2) = particle_forces(ib_markers%sf(i, j, k), 2) - div_pres_visc_stress(2)%sf(i, j, k) * dvol
                     !$acc atomic
-                    particle_forces(ib_markers%sf(i, j, k), 3) = particle_forces(ib_markers%sf(i, j, k), 3) + div_pres_visc_stress(3)%sf(i, j, k) * dvol
+                    particle_forces(ib_markers%sf(i, j, k), 3) = particle_forces(ib_markers%sf(i, j, k), 3) - div_pres_visc_stress(3)%sf(i, j, k) * dvol
                 end do 
             end do 
         end do
 
+        !$acc update host(particle_forces)
+
         ! reduce particle forces across processors
         do i = 1, num_ibs
             call s_mpi_allreduce_sum(particle_forces(i, 1), force_glb(i, 1))
@@ -901,6 +909,11 @@ contains
             call s_mpi_allreduce_sum(particle_forces(i, 3), force_glb(i, 3))
         end do
 
+        if (proc_rank == 0) then
+            print *, 'force', force_glb(1, 1)
+            print *, 'C_D', 2._wp * force_glb(1, 1) / (rho_inf_ref * u_inf_ref**2 * pi * patch_ib(1)%radius**2)
+        end if
+        
         ! write particle forces to file
         if (proc_rank == 0) then
             write(100) force_glb
@@ -1207,7 +1220,9 @@ contains
         call fftw_destroy_plan(plan_z_c2c_kernelG)
 #endif
 
-        close(100)
+        if (proc_rank == 0) then
+            close(100)
+        end if
 
     end subroutine s_finalize_fftw_explicit_filter_module
 
diff --git a/voronoi/gen_voronoi_3D.py b/voronoi/gen_voronoi_3D.py
index c56a02fb8e..ecb08eb36c 100644
--- a/voronoi/gen_voronoi_3D.py
+++ b/voronoi/gen_voronoi_3D.py
@@ -73,7 +73,7 @@ def lloyd_relaxation_3d(initial_points, box, w=1, iterations=10):
     initial_points = np.stack((x_i, y_i, z_i), axis=1)
     box = freud.box.Box.cube(L)
     
-    relaxed_points = lloyd_relaxation_3d(initial_points, box, iterations=30)
+    relaxed_points = lloyd_relaxation_3d(initial_points, box, iterations=40)
     print(np.shape(relaxed_points))
 
     np.savetxt(output_dir+'/sphere_array_locations.txt', relaxed_points)

From 39f84d1878b57406b2c58d55337b63e49a4ddf36 Mon Sep 17 00:00:00 2001
From: Conrad Delgado <conraddelgado@Conrads-MacBook-Air-6.local>
Date: Sat, 13 Sep 2025 17:07:18 -0500
Subject: [PATCH 16/30] updated interphase momentum exch calc

---
 src/simulation/m_volume_filtering.fpp | 106 +++++++++++++++++++++++---
 src/simulation/p_main.fpp             |   1 +
 2 files changed, 97 insertions(+), 10 deletions(-)

diff --git a/src/simulation/m_volume_filtering.fpp b/src/simulation/m_volume_filtering.fpp
index 02a2819a25..419b07db81 100644
--- a/src/simulation/m_volume_filtering.fpp
+++ b/src/simulation/m_volume_filtering.fpp
@@ -28,8 +28,8 @@ module m_volume_filtering
 
     private; public :: s_initialize_fftw_explicit_filter_module, &
  s_initialize_filtering_kernel, s_initialize_fluid_indicator_function, & 
- s_initialize_filtered_fluid_indicator_function, s_finalize_fftw_explicit_filter_module, & 
- s_volume_filter_momentum_eqn, s_apply_fftw_filter_scalarfield, &
+ s_initialize_filtered_fluid_indicator_function, s_initialize_fluid_indicator_gradient, &
+ s_finalize_fftw_explicit_filter_module, s_volume_filter_momentum_eqn, s_apply_fftw_filter_scalarfield, &
  s_compute_viscous_stress_tensor, s_compute_stress_tensor, s_compute_divergence_stress_tensor, s_compute_particle_forces, &
  s_mpi_transpose_slabZ2Y, s_mpi_transpose_slabY2Z, s_mpi_FFT_fwd, s_mpi_FFT_bwd, &
  s_setup_terms_filtering, s_compute_pseudo_turbulent_reynolds_stress, s_compute_effective_viscosity
@@ -43,6 +43,7 @@ module m_volume_filtering
     ! fluid indicator function (1 = fluid, 0 = otherwise)
     type(scalar_field), public :: fluid_indicator_function
     type(scalar_field), public :: filtered_fluid_indicator_function
+    type(scalar_field), allocatable, dimension(:) :: grad_fluid_indicator
 
     ! volume filtered conservative variables
     type(scalar_field), allocatable, dimension(:), public :: q_cons_filtered
@@ -66,7 +67,8 @@ module m_volume_filtering
     ! x-,y-,z-direction forces on particles
     real(wp), allocatable, dimension(:, :) :: particle_forces
 
-    !$acc declare create(fluid_indicator_function, filtered_fluid_indicator_function, q_cons_filtered, filtered_pressure)
+    !$acc declare create(fluid_indicator_function, filtered_fluid_indicator_function, grad_fluid_indicator)
+    !$acc declare create(q_cons_filtered, filtered_pressure)
     !$acc declare create(visc_stress, pres_visc_stress, div_pres_visc_stress)
     !$acc declare create(reynolds_stress, eff_visc, int_mom_exch)
     !$acc declare create(Res, particle_forces)
@@ -456,14 +458,14 @@ contains
     subroutine s_initialize_fluid_indicator_function 
         integer :: i, j, k 
 
-        @:ALLOCATE(fluid_indicator_function%sf(0:m, 0:n, 0:p))
+        @:ALLOCATE(fluid_indicator_function%sf(-1:m+1, -1:n+1, -1:p+1))
         @:ACC_SETUP_SFs(fluid_indicator_function)
 
         ! define fluid indicator function
         !$acc parallel loop collapse(3) gang vector default(present)
-        do i = 0, m
-            do j = 0, n 
-                do k = 0, p
+        do i = -1, m+1
+            do j = -1, n+1 
+                do k = -1, p+1
                     if (ib_markers%sf(i, j, k) == 0) then 
                         fluid_indicator_function%sf(i, j, k) = 1.0_dp
                     else 
@@ -515,6 +517,36 @@ contains
 
     end subroutine s_initialize_filtered_fluid_indicator_function
 
+    !< allocate grad_fluid_indicator(1:3) and fill it with central differences of fluid_indicator_function
+    subroutine s_initialize_fluid_indicator_gradient
+        integer :: i, j, k ! loop indices; i is also the component index in the allocation loop
+        ! one scalar field per direction (1: x, 2: y, 3: z), each on cells (0:m, 0:n, 0:p)
+        @:ALLOCATE(grad_fluid_indicator(1:3))
+        do i = 1, 3
+            @:ALLOCATE(grad_fluid_indicator(i)%sf(0:m, 0:n, 0:p))
+            @:ACC_SETUP_SFs(grad_fluid_indicator(i))
+        end do
+        ! stencil reads fluid_indicator_function and x_cc/y_cc/z_cc at indices -1 and m+1 (n+1, p+1)
+        !$acc parallel loop collapse(3) gang vector default(present)
+        do i = 0, m
+            do j = 0, n
+                do k = 0, p 
+                    grad_fluid_indicator(1)%sf(i, j, k) = (fluid_indicator_function%sf(i+1, j, k) - &
+                                                           fluid_indicator_function%sf(i-1, j, k)) / & 
+                                                           (x_cc(i+1) - x_cc(i-1))
+                    grad_fluid_indicator(2)%sf(i, j, k) = (fluid_indicator_function%sf(i, j+1, k) - &
+                                                           fluid_indicator_function%sf(i, j-1, k)) / & 
+                                                           (y_cc(j+1) - y_cc(j-1))
+                    grad_fluid_indicator(3)%sf(i, j, k) = (fluid_indicator_function%sf(i, j, k+1) - &
+                                                           fluid_indicator_function%sf(i, j, k-1)) / & 
+                                                           (z_cc(k+1) - z_cc(k-1))
+                end do 
+            end do 
+        end do
+
+    end subroutine s_initialize_fluid_indicator_gradient
+
+
     !< calculate the unclosed terms present in the volume filtered momentum equation
     subroutine s_volume_filter_momentum_eqn(q_cons_vf, q_prim_vf)
         type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf
@@ -544,9 +576,7 @@ contains
             end do
         end do 
         ! interphase momentum exchange
-        do i = 1, num_dims
-            call s_apply_fftw_filter_scalarfield(filtered_fluid_indicator_function, .false., div_pres_visc_stress(i), int_mom_exch(i))
-        end do 
+        call s_compute_interphase_momentum_exchange(filtered_fluid_indicator_function, grad_fluid_indicator, pres_visc_stress, int_mom_exch)
 
         call s_compute_pseudo_turbulent_reynolds_stress(q_cons_filtered, reynolds_stress)
         call s_compute_effective_viscosity(q_cons_filtered, eff_visc, visc_stress)
@@ -875,6 +905,58 @@ contains
 
     end subroutine s_compute_effective_viscosity
 
+    subroutine s_compute_interphase_momentum_exchange(filtered_fluid_indicator_function, grad_fluid_indicator, pres_visc_stress, int_mom_exch)
+        type(scalar_field), intent(in) :: filtered_fluid_indicator_function ! not referenced in the body of this routine
+        type(scalar_field), dimension(1:3), intent(in) :: grad_fluid_indicator ! one field per direction a = 1..3
+        type(vector_field), dimension(1:3), intent(in) :: pres_visc_stress ! pres_visc_stress(a)%vf(l) is summed over a below
+        type(scalar_field), dimension(1:3), intent(inout) :: int_mom_exch ! output: one filtered field per component l
+        ! purpose: for each l, contract pres_visc_stress with grad_fluid_indicator, then apply the spectral filter
+        integer :: i, j, k, l
+        ! i, j, k index cells (or spectral entries in the kernel product); l is the component being computed
+        ! loop over the x-, y-, z- components (l) of the exchange term
+        do l = 1, 3
+            ! real-space input, shifted to 1-based indices: sum over a of pres_visc_stress(a)%vf(l) * grad_fluid_indicator(a)
+            !$acc parallel loop collapse(3) gang vector default(present)
+            do i = 0, m 
+                do j = 0, n 
+                    do k = 0, p
+                        data_real_3D_slabz(i+1, j+1, k+1) = pres_visc_stress(1)%vf(l)%sf(i, j, k) * grad_fluid_indicator(1)%sf(i, j, k) & 
+                                                          + pres_visc_stress(2)%vf(l)%sf(i, j, k) * grad_fluid_indicator(2)%sf(i, j, k) & 
+                                                          + pres_visc_stress(3)%vf(l)%sf(i, j, k) * grad_fluid_indicator(3)%sf(i, j, k)
+                    end do 
+                end do
+            end do
+            ! presumably transforms data_real_3D_slabz into data_cmplx_out1d, which is what the next loop reads
+            call nvtxStartRange("FORWARD-3D-FFT")
+            call s_mpi_FFT_fwd 
+            call nvtxEndRange
+
+            ! convolution with filtering kernel: pointwise product with cmplx_kernelG1d in spectral space
+            !$acc parallel loop collapse(3) gang vector default(present)
+            do i = 1, NxC 
+                do j = 1, Nyloc 
+                    do k = 1, Nz 
+                        data_cmplx_out1d(k + (i-1)*Nz + (j-1)*Nz*NxC) = data_cmplx_out1d(k + (i-1)*Nz + (j-1)*Nz*NxC) * cmplx_kernelG1d(k + (i-1)*Nz + (j-1)*Nz*NxC)
+                    end do 
+                end do 
+            end do
+
+            call nvtxStartRange("BACKWARD-3D-FFT")
+            call s_mpi_FFT_bwd
+            call nvtxEndRange
+            ! copy back into int_mom_exch(l), dividing by Nx*Ny*Nz (presumably the FFT round-trip normalization)
+            !$acc parallel loop collapse(3) gang vector default(present)
+            do i = 0, m
+                do j = 0, n
+                    do k = 0, p
+                        int_mom_exch(l)%sf(i, j, k) = data_real_3D_slabz(i+1, j+1, k+1) / (real(Nx*Ny*Nz, dp))
+                    end do 
+                end do 
+            end do
+        end do ! end component loop
+        ! NOTE(review): Nx*Ny*Nz is multiplied before real(); confirm it cannot overflow its integer kind on large grids
+    end subroutine s_compute_interphase_momentum_exchange
+
     ! computes x-,y-,z-direction forces on particles
     subroutine s_compute_particle_forces
         real(wp), dimension(num_ibs, 3) :: force_glb
@@ -1146,6 +1228,10 @@ contains
 
         @:DEALLOCATE(fluid_indicator_function%sf)
         @:DEALLOCATE(filtered_fluid_indicator_function%sf)
+        do i = 1, 3 
+            @:DEALLOCATE(grad_fluid_indicator(i)%sf)
+        end do
+        @:DEALLOCATE(grad_fluid_indicator)
 
         do i = 1, sys_size-1
             @:DEALLOCATE(q_cons_filtered(i)%sf)
diff --git a/src/simulation/p_main.fpp b/src/simulation/p_main.fpp
index 9cd571b2ac..ccfa9cca4f 100644
--- a/src/simulation/p_main.fpp
+++ b/src/simulation/p_main.fpp
@@ -60,6 +60,7 @@ program p_main
     if (volume_filtering_momentum_eqn) then 
         call s_initialize_filtering_kernel()
         call s_initialize_filtered_fluid_indicator_function()
+        call s_initialize_fluid_indicator_gradient()
     end if
 
     ! Setting the time-step iterator to the first time-step

From def36a980e7d2efb6a7a095427cf03fe93d9baba Mon Sep 17 00:00:00 2001
From: Conrad Delgado <conradd3@dt-login02.delta.ncsa.illinois.edu>
Date: Mon, 15 Sep 2025 13:28:22 -0500
Subject: [PATCH 17/30] removed print statements for runs

---
 src/simulation/m_volume_filtering.fpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/simulation/m_volume_filtering.fpp b/src/simulation/m_volume_filtering.fpp
index 419b07db81..24fc6f6445 100644
--- a/src/simulation/m_volume_filtering.fpp
+++ b/src/simulation/m_volume_filtering.fpp
@@ -991,10 +991,10 @@ contains
             call s_mpi_allreduce_sum(particle_forces(i, 3), force_glb(i, 3))
         end do
 
-        if (proc_rank == 0) then
-            print *, 'force', force_glb(1, 1)
-            print *, 'C_D', 2._wp * force_glb(1, 1) / (rho_inf_ref * u_inf_ref**2 * pi * patch_ib(1)%radius**2)
-        end if
+        ! if (proc_rank == 0) then
+        !     print *, 'force', force_glb(1, 1)
+        !     print *, 'C_D', 2._wp * force_glb(1, 1) / (rho_inf_ref * u_inf_ref**2 * pi * patch_ib(1)%radius**2)
+        ! end if
         
         ! write particle forces to file
         if (proc_rank == 0) then

From ee8d2596a46829f916bc83b378d0c26191c5b221 Mon Sep 17 00:00:00 2001
From: Conrad Delgado <168050190+conraddelgado@users.noreply.github.com>
Date: Wed, 17 Sep 2025 16:36:48 -0500
Subject: [PATCH 18/30] Delete examples/3D_ibm_sphere_periodic directory

---
 examples/3D_ibm_sphere_periodic/case.py | 107 ------------------------
 1 file changed, 107 deletions(-)
 delete mode 100644 examples/3D_ibm_sphere_periodic/case.py

diff --git a/examples/3D_ibm_sphere_periodic/case.py b/examples/3D_ibm_sphere_periodic/case.py
deleted file mode 100644
index 41938f69fd..0000000000
--- a/examples/3D_ibm_sphere_periodic/case.py
+++ /dev/null
@@ -1,107 +0,0 @@
-import json
-import math
-
-Mu = 1.84e-05
-gam_a = 1.4
-
-D = 0.1
-
-# Configuring case dictionary
-print(
-    json.dumps(
-        {
-            # Logistics
-            "run_time_info": "T",
-            # Computational Domain Parameters
-            # x direction
-            "x_domain%beg": -5 * D,
-            "x_domain%end": 5.0 * D,
-            # y direction
-            "y_domain%beg": -2.5 * D,
-            "y_domain%end": 2.5 * D,
-            # z direction
-            "z_domain%beg": -2.5 * D,
-            "z_domain%end": 2.5 * D,
-            "cyl_coord": "F",
-            "m": 99,
-            "n": 99,
-            "p": 99,
-            "dt": 1.0e-6,
-            "t_step_start": 0,
-            "t_step_stop": 200,  # 3000
-            "t_step_save": 10,  # 10
-            # Simulation Algorithm Parameters
-            # Only one patches are necessary, the air tube
-            "num_patches": 1,
-            # Use the 5 equation model
-            "model_eqns": 2,
-            # 6 equations model does not need the K \div(u) term
-            "alt_soundspeed": "F",
-            # One fluids: air
-            "num_fluids": 1,
-            # time step
-            "mpp_lim": "F",
-            # Correct errors when computing speed of sound
-            "mixture_err": "T",
-            # Use TVD RK3 for time marching
-            "time_stepper": 3,
-            # Reconstruct the primitive variables to minimize spurious
-            # Use WENO5
-            "weno_order": 5,
-            "weno_eps": 1.0e-16,
-            "weno_Re_flux": "T",
-            "weno_avg": "T",
-            "avg_state": 2,
-            "mapped_weno": "T",
-            "null_weights": "F",
-            "mp_weno": "T",
-            "riemann_solver": 2,
-            "wave_speeds": 1,
-            # Periodic BCs
-            "bc_x%beg": -1,
-            "bc_x%end": -1,
-            "bc_y%beg": -1,
-            "bc_y%end": -1,
-            "bc_z%beg": -1,
-            "bc_z%end": -1,
-            # Set IB to True and add 1 patch
-            "ib": "T",
-            "num_ibs": 1,
-            "viscous": "T",
-            # Formatted Database Files Structure Parameters
-            "format": 1,
-            "precision": 2,
-            "prim_vars_wrt": "T",
-            "E_wrt": "T",
-            "parallel_io": "T",
-            # Patch: Constant Tube filled with air
-            # Specify the cylindrical air tube grid geometry
-            "patch_icpp(1)%geometry": 9,
-            "patch_icpp(1)%x_centroid": 0.0,
-            # Uniform medium density, centroid is at the center of the domain
-            "patch_icpp(1)%y_centroid": 0.0,
-            "patch_icpp(1)%z_centroid": 0.0,
-            "patch_icpp(1)%length_x": 10 * D,
-            "patch_icpp(1)%length_y": 5 * D,
-            "patch_icpp(1)%length_z": 5 * D,
-            # Specify the patch primitive variables
-            "patch_icpp(1)%vel(1)": 527.2e00,
-            "patch_icpp(1)%vel(2)": 0.0e00,
-            "patch_icpp(1)%vel(3)": 0.0e00,
-            "patch_icpp(1)%pres": 10918.2549,
-            "patch_icpp(1)%alpha_rho(1)": 0.2199,
-            "patch_icpp(1)%alpha(1)": 1.0e00,
-            # Patch: Sphere Immersed Boundary
-            "patch_ib(1)%geometry": 8,
-            "patch_ib(1)%x_centroid": -3.0e-3,
-            "patch_ib(1)%y_centroid": 0.0,
-            "patch_ib(1)%z_centroid": 0.0,
-            "patch_ib(1)%radius": D / 2,
-            "patch_ib(1)%slip": "T",
-            # Fluids Physical Parameters
-            "fluid_pp(1)%gamma": 1.0e00 / (gam_a - 1.0e00),  # 2.50(Not 1.40)
-            "fluid_pp(1)%pi_inf": 0,
-            "fluid_pp(1)%Re(1)": 7535533.2,
-        }
-    )
-)

From a0d20155c41466aa8b10869538b95672abb0679e Mon Sep 17 00:00:00 2001
From: conradd3 <conradd3@illinois.edu>
Date: Mon, 22 Sep 2025 13:29:25 -0500
Subject: [PATCH 19/30] batched mpiAlltoAll for tensors

---
 src/simulation/m_volume_filtering.fpp | 352 +++++++++++++++++++++++---
 1 file changed, 317 insertions(+), 35 deletions(-)

diff --git a/src/simulation/m_volume_filtering.fpp b/src/simulation/m_volume_filtering.fpp
index 24fc6f6445..d3981dd55a 100644
--- a/src/simulation/m_volume_filtering.fpp
+++ b/src/simulation/m_volume_filtering.fpp
@@ -29,9 +29,9 @@ module m_volume_filtering
     private; public :: s_initialize_fftw_explicit_filter_module, &
  s_initialize_filtering_kernel, s_initialize_fluid_indicator_function, & 
  s_initialize_filtered_fluid_indicator_function, s_initialize_fluid_indicator_gradient, &
- s_finalize_fftw_explicit_filter_module, s_volume_filter_momentum_eqn, s_apply_fftw_filter_scalarfield, &
+ s_finalize_fftw_explicit_filter_module, s_volume_filter_momentum_eqn, s_apply_fftw_filter_scalarfield, s_filter_tensor_field, &
  s_compute_viscous_stress_tensor, s_compute_stress_tensor, s_compute_divergence_stress_tensor, s_compute_particle_forces, &
- s_mpi_transpose_slabZ2Y, s_mpi_transpose_slabY2Z, s_mpi_FFT_fwd, s_mpi_FFT_bwd, &
+ s_mpi_transpose_slabZ2Y, s_mpi_transpose_slabY2Z, s_mpi_transpose_slabZ2Y_tensor, s_mpi_transpose_slabY2Z_tensor, s_mpi_FFT_fwd, s_mpi_FFT_bwd, &
  s_setup_terms_filtering, s_compute_pseudo_turbulent_reynolds_stress, s_compute_effective_viscosity
 
 #if !defined(MFC_OpenACC)
@@ -94,6 +94,8 @@ module m_volume_filtering
 
     ! 3D arrays for slab transposes
     complex(c_double_complex), allocatable :: data_cmplx_slabz(:, :, :), data_cmplx_slaby(:, :, :)
+    ! 4D arrays (tensor component index first) for slab transposes of tensor quantities
+    complex(c_double_complex), allocatable :: data_cmplx_slabz_tensor(:, :, :, :), data_cmplx_slaby_tensor(:, :, :, :)
 
     ! input/output array for FFT routine
     real(c_double), allocatable :: data_real_3D_slabz(:, :, :)
@@ -105,7 +107,12 @@ module m_volume_filtering
     complex(c_double_complex), allocatable :: cmplx_kernelG1d(:)
 
     !$acc declare create(Nx, Ny, Nz, NxC, Nyloc, Nzloc)
-    !$acc declare create(data_real_in1d, data_cmplx_out1d, data_cmplx_out1dy, data_cmplx_slabz, data_cmplx_slaby, data_real_3D_slabz, real_kernelG_in, cmplx_kernelG1d)
+    !$acc declare create(data_real_in1d, data_cmplx_out1d, data_cmplx_out1dy)
+    !$acc declare create(data_cmplx_slabz, data_cmplx_slaby, data_cmplx_slabz_tensor, data_cmplx_slaby_tensor, data_real_3D_slabz, real_kernelG_in, cmplx_kernelG1d)
+
+    ! buffers for data transpose
+    complex(c_double_complex), allocatable :: sendbuf_sf(:), recvbuf_sf(:)
+    complex(c_double_complex), allocatable :: sendbuf_tensor(:), recvbuf_tensor(:)
 
 contains
 
@@ -232,6 +239,13 @@ contains
         @:ALLOCATE(data_real_3D_slabz(Nx, Ny, Nzloc))
         @:ALLOCATE(data_cmplx_slabz(NxC, Ny, Nzloc))
         @:ALLOCATE(data_cmplx_slaby(NxC, Nyloc, Nz))
+        @:ALLOCATE(data_cmplx_slabz_tensor(9, NxC, Ny, Nzloc))
+        @:ALLOCATE(data_cmplx_slaby_tensor(9, NxC, Nyloc, Nz))
+
+        allocate(sendbuf_sf(NxC*Nyloc*Nzloc*num_procs))
+        allocate(recvbuf_sf(NxC*Nyloc*Nzloc*num_procs))
+        allocate(sendbuf_tensor(9*NxC*Nyloc*Nzloc*num_procs))
+        allocate(recvbuf_tensor(9*NxC*Nyloc*Nzloc*num_procs))
 
 #if defined(MFC_OpenACC)
         !< GPU FFT plans
@@ -564,17 +578,19 @@ contains
         call s_setup_terms_filtering(q_cons_vf, q_prim_vf, reynolds_stress, visc_stress, pres_visc_stress, div_pres_visc_stress)
 
         ! pseudo turbulent reynolds stress
-        do i = 1, num_dims 
-            do j = 1, num_dims
-                call s_apply_fftw_filter_scalarfield(filtered_fluid_indicator_function, .true., reynolds_stress(i)%vf(j))
-            end do
-        end do 
+        ! do i = 1, num_dims 
+        !     do j = 1, num_dims
+        !         call s_apply_fftw_filter_scalarfield(filtered_fluid_indicator_function, .true., reynolds_stress(i)%vf(j))
+        !     end do
+        ! end do 
+        call s_filter_tensor_field(reynolds_stress)
         ! effective viscosity
-        do i = 1, num_dims 
-            do j = 1, num_dims
-                call s_apply_fftw_filter_scalarfield(filtered_fluid_indicator_function, .true., visc_stress(i)%vf(j), eff_visc(i)%vf(j))
-            end do
-        end do 
+        ! do i = 1, num_dims 
+        !     do j = 1, num_dims
+        !         call s_apply_fftw_filter_scalarfield(filtered_fluid_indicator_function, .true., visc_stress(i)%vf(j), eff_visc(i)%vf(j))
+        !     end do
+        ! end do 
+        call s_filter_tensor_field(visc_stress, eff_visc)
         ! interphase momentum exchange
         call s_compute_interphase_momentum_exchange(filtered_fluid_indicator_function, grad_fluid_indicator, pres_visc_stress, int_mom_exch)
 
@@ -1006,78 +1022,340 @@ contains
 
     !< transpose domain from z-slabs to y-slabs on each processor
     subroutine s_mpi_transpose_slabZ2Y
-        complex(c_double_complex), allocatable :: sendbuf(:), recvbuf(:)
         integer :: dest_rank, src_rank
         integer :: i, j, k
 
-        allocate(sendbuf(NxC*Nyloc*Nzloc*num_procs))
-        allocate(recvbuf(NxC*Nyloc*Nzloc*num_procs))
-
-        !$acc parallel loop collapse(4) gang vector default(present) copy(sendbuf)
+        !$acc parallel loop collapse(4) gang vector default(present) copy(sendbuf_sf)
         do dest_rank = 0, num_procs-1
             do k = 1, Nzloc 
                 do j = 1, Nyloc
                     do i = 1, NxC
-                        sendbuf(i + (j-1)*NxC + (k-1)*NxC*Nyloc + dest_rank*NxC*Nyloc*Nzloc) = data_cmplx_slabz(i, j+dest_rank*Nyloc, k)
+                        sendbuf_sf(i + (j-1)*NxC + (k-1)*NxC*Nyloc + dest_rank*NxC*Nyloc*Nzloc) = data_cmplx_slabz(i, j+dest_rank*Nyloc, k)
                     end do 
                 end do
             end do
         end do
 
-        call MPI_Alltoall(sendbuf, NxC*Nyloc*Nzloc, MPI_DOUBLE_COMPLEX, & 
-                          recvbuf, NxC*Nyloc*Nzloc, MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD, ierr)
+        call MPI_Alltoall(sendbuf_sf, NxC*Nyloc*Nzloc, MPI_DOUBLE_COMPLEX, & 
+                          recvbuf_sf, NxC*Nyloc*Nzloc, MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD, ierr)
 
-        !$acc parallel loop collapse(4) gang vector default(present) copy(recvbuf)
+        !$acc parallel loop collapse(4) gang vector default(present) copy(recvbuf_sf)
         do src_rank = 0, num_procs-1
             do k = 1, Nzloc 
                 do j = 1, Nyloc
                     do i = 1, NxC
-                        data_cmplx_slaby(i, j, k+src_rank*Nzloc) = recvbuf(i + (j-1)*NxC + (k-1)*NxC*Nyloc + src_rank*NxC*Nyloc*Nzloc)
+                        data_cmplx_slaby(i, j, k+src_rank*Nzloc) = recvbuf_sf(i + (j-1)*NxC + (k-1)*NxC*Nyloc + src_rank*NxC*Nyloc*Nzloc)
                     end do 
                 end do
             end do 
         end do
 
-        deallocate(sendbuf, recvbuf)
     end subroutine s_mpi_transpose_slabZ2Y
 
     !< transpose domain from y-slabs to z-slabs on each processor
     subroutine s_mpi_transpose_slabY2Z 
-        complex(c_double_complex), allocatable :: sendbuf(:), recvbuf(:)
         integer :: dest_rank, src_rank
         integer :: i, j, k
 
-        allocate(sendbuf(NxC*Nyloc*Nzloc*num_procs))
-        allocate(recvbuf(NxC*Nyloc*Nzloc*num_procs))
-
-        !$acc parallel loop collapse(4) gang vector default(present) copy(sendbuf)
+        !$acc parallel loop collapse(4) gang vector default(present) copy(sendbuf_sf)
         do dest_rank = 0, num_procs-1
             do k = 1, Nzloc 
                 do j = 1, Nyloc 
                     do i = 1, NxC 
-                        sendbuf(i + (j-1)*NxC + (k-1)*NxC*Nyloc + dest_rank*NxC*Nyloc*Nzloc) = data_cmplx_slaby(i, j, k+dest_rank*Nzloc)
+                        sendbuf_sf(i + (j-1)*NxC + (k-1)*NxC*Nyloc + dest_rank*NxC*Nyloc*Nzloc) = data_cmplx_slaby(i, j, k+dest_rank*Nzloc)
                     end do 
                 end do 
             end do 
         end do
 
-        call MPI_Alltoall(sendbuf, NxC*Nyloc*Nzloc, MPI_DOUBLE_COMPLEX, & 
-                          recvbuf, NxC*Nyloc*Nzloc, MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD, ierr)
+        call MPI_Alltoall(sendbuf_sf, NxC*Nyloc*Nzloc, MPI_DOUBLE_COMPLEX, & 
+                          recvbuf_sf, NxC*Nyloc*Nzloc, MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD, ierr)
 
-        !$acc parallel loop collapse(4) gang vector default(present) copy(recvbuf) 
+        !$acc parallel loop collapse(4) gang vector default(present) copy(recvbuf_sf) 
         do src_rank = 0, num_procs-1
             do k = 1, Nzloc
                 do j = 1, Nyloc 
                     do i = 1, NxC 
-                        data_cmplx_slabz(i, j+src_rank*Nyloc, k) = recvbuf(i + (j-1)*NxC + (k-1)*NxC*Nyloc + src_rank*NxC*Nyloc*Nzloc)
+                        data_cmplx_slabz(i, j+src_rank*Nyloc, k) = recvbuf_sf(i + (j-1)*NxC + (k-1)*NxC*Nyloc + src_rank*NxC*Nyloc*Nzloc)
                     end do 
                 end do
             end do 
         end do
         
-        deallocate(sendbuf, recvbuf)
     end subroutine s_mpi_transpose_slabY2Z
 
+    !< transpose data_cmplx_slabz_tensor (z-slabs) into data_cmplx_slaby_tensor (y-slabs), all 9 components in one MPI_Alltoall
+    subroutine s_mpi_transpose_slabZ2Y_tensor
+        integer :: dest_rank, src_rank ! rank each packed chunk is sent to / received from
+        integer :: i, j, k, l ! i, j, k: local x, y, z indices; l: tensor component 1..9
+        ! pack: the chunk for dest_rank takes y-rows dest_rank*Nyloc+1 .. (dest_rank+1)*Nyloc, with l varying fastest
+        !$acc parallel loop collapse(5) gang vector default(present) copy(sendbuf_tensor)
+        do dest_rank = 0, num_procs-1
+            do k = 1, Nzloc 
+                do j = 1, Nyloc
+                    do i = 1, NxC
+                        do l = 1, 9
+                            sendbuf_tensor(l + (i-1)*9 + (j-1)*9*NxC + (k-1)*9*NxC*Nyloc + dest_rank*9*NxC*Nyloc*Nzloc) = data_cmplx_slabz_tensor(l, i, j+dest_rank*Nyloc, k)
+                        end do 
+                    end do
+                end do
+            end do
+        end do 
+        ! every rank exchanges equal chunks of 9*NxC*Nyloc*Nzloc complex values with every other rank
+        call MPI_Alltoall(sendbuf_tensor, 9*NxC*Nyloc*Nzloc, MPI_DOUBLE_COMPLEX, & 
+                          recvbuf_tensor, 9*NxC*Nyloc*Nzloc, MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD, ierr)
+        ! unpack: the chunk from src_rank fills z-planes src_rank*Nzloc+1 .. (src_rank+1)*Nzloc of the y-slab array
+        !$acc parallel loop collapse(5) gang vector default(present) copy(recvbuf_tensor)
+        do src_rank = 0, num_procs-1
+            do k = 1, Nzloc 
+                do j = 1, Nyloc
+                    do i = 1, NxC
+                        do l = 1, 9
+                            data_cmplx_slaby_tensor(l, i, j, k+src_rank*Nzloc) = recvbuf_tensor(l + (i-1)*9 + (j-1)*9*NxC + (k-1)*9*NxC*Nyloc + src_rank*9*NxC*Nyloc*Nzloc)
+                        end do 
+                    end do
+                end do 
+            end do
+        end do
+
+    end subroutine s_mpi_transpose_slabZ2Y_tensor
+
+    !< transpose data_cmplx_slaby_tensor (y-slabs) back into data_cmplx_slabz_tensor (z-slabs), all 9 components in one MPI_Alltoall
+    subroutine s_mpi_transpose_slabY2Z_tensor
+        integer :: dest_rank, src_rank ! rank each packed chunk is sent to / received from
+        integer :: i, j, k, l ! i, j, k: local x, y, z indices; l: tensor component 1..9
+        ! pack: the chunk for dest_rank takes z-planes dest_rank*Nzloc+1 .. (dest_rank+1)*Nzloc, with l varying fastest
+        !$acc parallel loop collapse(5) gang vector default(present) copy(sendbuf_tensor)
+        do dest_rank = 0, num_procs-1
+            do k = 1, Nzloc 
+                do j = 1, Nyloc 
+                    do i = 1, NxC 
+                        do l = 1, 9
+                            sendbuf_tensor(l + (i-1)*9 + (j-1)*9*NxC + (k-1)*9*NxC*Nyloc + dest_rank*9*NxC*Nyloc*Nzloc) = data_cmplx_slaby_tensor(l, i, j, k+dest_rank*Nzloc)
+                        end do 
+                    end do 
+                end do 
+            end do
+        end do
+        ! every rank exchanges equal chunks of 9*NxC*Nyloc*Nzloc complex values with every other rank
+        call MPI_Alltoall(sendbuf_tensor, 9*NxC*Nyloc*Nzloc, MPI_DOUBLE_COMPLEX, & 
+                          recvbuf_tensor, 9*NxC*Nyloc*Nzloc, MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD, ierr)
+        ! unpack: the chunk from src_rank fills y-rows src_rank*Nyloc+1 .. (src_rank+1)*Nyloc of the z-slab array
+        !$acc parallel loop collapse(5) gang vector default(present) copy(recvbuf_tensor) 
+        do src_rank = 0, num_procs-1
+            do k = 1, Nzloc
+                do j = 1, Nyloc 
+                    do i = 1, NxC 
+                        do l = 1, 9
+                            data_cmplx_slabz_tensor(l, i, j+src_rank*Nyloc, k) = recvbuf_tensor(l + (i-1)*9 + (j-1)*9*NxC + (k-1)*9*NxC*Nyloc + src_rank*9*NxC*Nyloc*Nzloc)
+                        end do 
+                    end do
+                end do 
+            end do
+        end do
+        
+    end subroutine s_mpi_transpose_slabY2Z_tensor
+
+
+
+    !< filter all 9 components of a 3x3 tensor field via FFTs, weighting by the fluid indicator and batching the MPI transpose
+    subroutine s_filter_tensor_field(q_tensor_in, q_tensor_out)
+        type(vector_field), dimension(3), intent(inout) :: q_tensor_in
+        type(vector_field), dimension(3), intent(inout), optional :: q_tensor_out
+        integer :: i, j, k, l, q
+
+        ! ===== forward FFT =====
+        ! outer tensor element loop
+        do l = 1, 3
+            do q = 1, 3
+
+                !$acc parallel loop collapse(3)
+                do i = 0, m 
+                    do j = 0, n 
+                        do k = 0, p 
+                            data_real_3D_slabz(i+1, j+1, k+1) = q_tensor_in(l)%vf(q)%sf(i, j, k) * fluid_indicator_function%sf(i, j, k)
+                        end do 
+                    end do 
+                end do
+
+                ! 3D z-slab -> 1D x, y, z
+                !$acc parallel loop collapse(3) gang vector default(present)
+                do i = 1, Nx 
+                    do j = 1, Ny 
+                        do k = 1, Nzloc
+                            data_real_in1d(i + (j-1)*Nx + (k-1)*Nx*Ny) = data_real_3D_slabz(i, j, k)
+                        end do 
+                    end do 
+                end do
+        
+                ! X FFT
+#if defined(MFC_OpenACC)
+                ierr = cufftExecD2Z(plan_x_fwd_gpu, data_real_in1d, data_cmplx_out1d)
+#else
+                call fftw_execute_dft_r2c(plan_x_r2c_fwd, data_real_in1d, data_cmplx_out1d)
+#endif
+        
+                ! 1D x, y, z -> 1D y, x, z (CMPLX)
+                !$acc parallel loop collapse(3) gang vector default(present)
+                do i = 1, NxC
+                    do j = 1, Ny 
+                        do k = 1, Nzloc
+                            data_cmplx_out1dy(j + (i-1)*Ny + (k-1)*Ny*NxC) = data_cmplx_out1d(i + (j-1)*NxC + (k-1)*NxC*Ny)
+                        end do 
+                    end do 
+                end do
+        
+                ! Y FFT 
+#if defined(MFC_OpenACC)
+                ierr = cufftExecZ2Z(plan_y_gpu, data_cmplx_out1dy, data_cmplx_out1dy, CUFFT_FORWARD)
+#else
+                call fftw_execute_dft(plan_y_c2c_fwd, data_cmplx_out1dy, data_cmplx_out1dy)
+#endif 
+        
+                ! 1D y, x, z -> 3D z-slab
+                !$acc parallel loop collapse(3) gang vector default(present)
+                do i = 1, NxC 
+                    do j = 1, Ny 
+                        do k = 1, Nzloc
+                            data_cmplx_slabz_tensor((l-1)*3 + q, i, j, k) = data_cmplx_out1dy(j + (i-1)*Ny + (k-1)*Ny*NxC)
+                        end do 
+                    end do 
+                end do 
+                ! pack data_cmplx_slabz_tensor for MPI transpose
+            end do
+        end do 
+
+        ! tensor MPI data transpose
+        call s_mpi_transpose_slabZ2Y_tensor
+
+        ! outer tensor element loop
+        do l = 1, 3
+            do q = 1, 3
+                ! 3D y-slab -> 1D z, x, y
+                !$acc parallel loop collapse(3) gang vector default(present)
+                do i = 1, NxC 
+                    do j = 1, Nyloc 
+                        do k = 1, Nz
+                            data_cmplx_out1d(k + (i-1)*Nz + (j-1)*Nz*NxC) = data_cmplx_slaby_tensor((l-1)*3 + q, i, j, k)
+                        end do 
+                    end do 
+                end do
+
+                ! Z FFT
+#if defined(MFC_OpenACC)
+                ierr = cufftExecZ2Z(plan_z_gpu, data_cmplx_out1d, data_cmplx_out1d, CUFFT_FORWARD)
+#else
+                call fftw_execute_dft(plan_z_c2c_fwd, data_cmplx_out1d, data_cmplx_out1d)
+#endif
+                
+                ! convolution with filtering kernel in Fourier space
+                !$acc parallel loop collapse(3) gang vector default(present)
+                do i = 1, NxC 
+                    do j = 1, Nyloc 
+                        do k = 1, Nz 
+                            data_cmplx_out1d(k + (i-1)*Nz + (j-1)*Nz*NxC) = data_cmplx_out1d(k + (i-1)*Nz + (j-1)*Nz*NxC) * cmplx_kernelG1d(k + (i-1)*Nz + (j-1)*Nz*NxC)
+                        end do 
+                    end do 
+                end do
+
+                ! ===== begin backward FFT =====
+                ! Z inv FFT 
+#if defined(MFC_OpenACC)
+                ierr = cufftExecZ2Z(plan_z_gpu, data_cmplx_out1d, data_cmplx_out1d, CUFFT_INVERSE)
+#else
+                call fftw_execute_dft(plan_z_c2c_bwd, data_cmplx_out1d, data_cmplx_out1d)
+#endif
+
+                ! 1D z, x, y -> 3D y-slab
+                !$acc parallel loop collapse(3) gang vector default(present)
+                do i = 1, NxC 
+                    do j = 1, Nyloc 
+                        do k = 1, Nz 
+                            data_cmplx_slaby_tensor((l-1)*3 + q, i, j, k) = data_cmplx_out1d(k + (i-1)*Nz + (j-1)*Nz*NxC)
+                        end do 
+                    end do 
+                end do
+                ! pack data_cmplx_slaby_tensor for MPI transpose
+            end do
+        end do
+
+        call s_mpi_transpose_slabY2Z_tensor
+
+        ! outer tensor element loop
+        do l = 1, 3
+            do q = 1, 3
+                
+                ! 3D z-slab -> 1D y, x, z
+                !$acc parallel loop collapse(3) gang vector default(present)
+                do i = 1, NxC 
+                    do j = 1, Ny 
+                        do k = 1, Nzloc
+                            data_cmplx_out1dy(j + (i-1)*Ny + (k-1)*Ny*NxC) = data_cmplx_slabz_tensor((l-1)*3 + q, i, j, k)
+                        end do 
+                    end do 
+                end do
+
+                ! Y inv FFT 
+#if defined(MFC_OpenACC)
+                ierr = cufftExecZ2Z(plan_y_gpu, data_cmplx_out1dy, data_cmplx_out1dy, CUFFT_INVERSE)
+#else
+                call fftw_execute_dft(plan_y_c2c_bwd, data_cmplx_out1dy, data_cmplx_out1dy)
+#endif
+
+                ! 1D y, x, z -> 1D x, y, z 
+                !$acc parallel loop collapse(3) gang vector default(present)
+                do i = 1, NxC 
+                    do j = 1, Ny 
+                        do k = 1, Nzloc
+                            data_cmplx_out1d(i + (j-1)*NxC + (k-1)*NxC*Ny) = data_cmplx_out1dy(j + (i-1)*Ny + (k-1)*Ny*NxC)
+                        end do 
+                    end do 
+                end do
+
+                ! X inv FFT
+#if defined(MFC_OpenACC)
+                ierr = cufftExecZ2D(plan_x_bwd_gpu, data_cmplx_out1d, data_real_in1d)
+#else
+                call fftw_execute_dft_c2r(plan_x_c2r_bwd, data_cmplx_out1d, data_real_in1d)
+#endif
+
+                ! 1D x, y, z -> 3D z-slab
+                !$acc parallel loop collapse(3) gang vector default(present)
+                do i = 1, Nx 
+                    do j = 1, Ny 
+                        do k = 1, Nzloc
+                            data_real_3D_slabz(i, j, k) = data_real_in1d(i + (j-1)*Nx + (k-1)*Nx*Ny)
+                        end do 
+                    end do 
+                end do
+
+                if (present(q_tensor_out)) then 
+                    !$acc parallel loop collapse(3) gang vector default(present)
+                    do i = 0, m
+                        do j = 0, n
+                            do k = 0, p
+                                q_tensor_out(l)%vf(q)%sf(i, j, k) = data_real_3D_slabz(i+1, j+1, k+1) / (real(Nx*Ny*Nz, dp) * filtered_fluid_indicator_function%sf(i, j, k))
+                            end do 
+                        end do 
+                    end do
+                else
+                    !$acc parallel loop collapse(3) gang vector default(present)
+                    do i = 0, m
+                        do j = 0, n
+                            do k = 0, p
+                                q_tensor_in(l)%vf(q)%sf(i, j, k) = data_real_3D_slabz(i+1, j+1, k+1) / (real(Nx*Ny*Nz, dp) * filtered_fluid_indicator_function%sf(i, j, k))
+                            end do 
+                        end do 
+                    end do
+                end if
+
+            end do
+        end do
+
+    end subroutine s_filter_tensor_field
+
+
+
     !< compute forward FFT, input: data_real_3D_slabz, output: data_cmplx_out1d
     subroutine s_mpi_FFT_fwd
         integer :: i, j, k
@@ -1288,6 +1566,10 @@ contains
         @:DEALLOCATE(data_real_in1d, data_cmplx_out1d, data_cmplx_out1dy)
         @:DEALLOCATE(cmplx_kernelG1d, real_kernelG_in)
         @:DEALLOCATE(data_real_3D_slabz, data_cmplx_slabz, data_cmplx_slaby)
+        @:DEALLOCATE(data_cmplx_slabz_tensor, data_cmplx_slaby_tensor)
+        
+        deallocate(sendbuf_sf, recvbuf_sf)
+        deallocate(sendbuf_tensor, recvbuf_tensor)
 
 #if defined(MFC_OpenACC)
         ierr = cufftDestroy(plan_x_fwd_gpu)

From 4dfe3ccf5a15aad226bf73a4e5d16e3f5638121f Mon Sep 17 00:00:00 2001
From: conradd3 <conradd3@illinois.edu>
Date: Thu, 25 Sep 2025 12:20:42 -0500
Subject: [PATCH 20/30] gpu data allocation

---
 src/simulation/m_volume_filtering.fpp | 46 +++++++++++++++++++--------
 1 file changed, 32 insertions(+), 14 deletions(-)

diff --git a/src/simulation/m_volume_filtering.fpp b/src/simulation/m_volume_filtering.fpp
index d3981dd55a..a2c0c0efac 100644
--- a/src/simulation/m_volume_filtering.fpp
+++ b/src/simulation/m_volume_filtering.fpp
@@ -114,6 +114,8 @@ module m_volume_filtering
     complex(c_double_complex), allocatable :: sendbuf_sf(:), recvbuf_sf(:)
     complex(c_double_complex), allocatable :: sendbuf_tensor(:), recvbuf_tensor(:)
 
+    !$acc declare create(sendbuf_sf, recvbuf_sf, sendbuf_tensor, recvbuf_tensor)
+
 contains
 
     !< create fft plans to be used for explicit filtering of data 
@@ -242,10 +244,10 @@ contains
         @:ALLOCATE(data_cmplx_slabz_tensor(9, NxC, Ny, Nzloc))
         @:ALLOCATE(data_cmplx_slaby_tensor(9, NxC, Nyloc, Nz))
 
-        allocate(sendbuf_sf(NxC*Nyloc*Nzloc*num_procs))
-        allocate(recvbuf_sf(NxC*Nyloc*Nzloc*num_procs))
-        allocate(sendbuf_tensor(9*NxC*Nyloc*Nzloc*num_procs))
-        allocate(recvbuf_tensor(9*NxC*Nyloc*Nzloc*num_procs))
+        @:ALLOCATE(sendbuf_sf(NxC*Nyloc*Nzloc*num_procs))
+        @:ALLOCATE(recvbuf_sf(NxC*Nyloc*Nzloc*num_procs))
+        @:ALLOCATE(sendbuf_tensor(9*NxC*Nyloc*Nzloc*num_procs))
+        @:ALLOCATE(recvbuf_tensor(9*NxC*Nyloc*Nzloc*num_procs))
 
 #if defined(MFC_OpenACC)
         !< GPU FFT plans
@@ -1025,7 +1027,7 @@ contains
         integer :: dest_rank, src_rank
         integer :: i, j, k
 
-        !$acc parallel loop collapse(4) gang vector default(present) copy(sendbuf_sf)
+        !$acc parallel loop collapse(4) gang vector default(present)
         do dest_rank = 0, num_procs-1
             do k = 1, Nzloc 
                 do j = 1, Nyloc
@@ -1036,10 +1038,14 @@ contains
             end do
         end do
 
+        !$acc update host(sendbuf_sf)
+
         call MPI_Alltoall(sendbuf_sf, NxC*Nyloc*Nzloc, MPI_DOUBLE_COMPLEX, & 
                           recvbuf_sf, NxC*Nyloc*Nzloc, MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD, ierr)
 
-        !$acc parallel loop collapse(4) gang vector default(present) copy(recvbuf_sf)
+        !$acc update device(recvbuf_sf)
+
+        !$acc parallel loop collapse(4) gang vector default(present) 
         do src_rank = 0, num_procs-1
             do k = 1, Nzloc 
                 do j = 1, Nyloc
@@ -1057,7 +1063,7 @@ contains
         integer :: dest_rank, src_rank
         integer :: i, j, k
 
-        !$acc parallel loop collapse(4) gang vector default(present) copy(sendbuf_sf)
+        !$acc parallel loop collapse(4) gang vector default(present) 
         do dest_rank = 0, num_procs-1
             do k = 1, Nzloc 
                 do j = 1, Nyloc 
@@ -1068,10 +1074,14 @@ contains
             end do 
         end do
 
+        !$acc update host(sendbuf_sf)
+
         call MPI_Alltoall(sendbuf_sf, NxC*Nyloc*Nzloc, MPI_DOUBLE_COMPLEX, & 
                           recvbuf_sf, NxC*Nyloc*Nzloc, MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD, ierr)
 
-        !$acc parallel loop collapse(4) gang vector default(present) copy(recvbuf_sf) 
+        !$acc update device(recvbuf_sf)
+
+        !$acc parallel loop collapse(4) gang vector default(present) 
         do src_rank = 0, num_procs-1
             do k = 1, Nzloc
                 do j = 1, Nyloc 
@@ -1089,7 +1099,7 @@ contains
         integer :: dest_rank, src_rank
         integer :: i, j, k, l
 
-        !$acc parallel loop collapse(5) gang vector default(present) copy(sendbuf_tensor)
+        !$acc parallel loop collapse(5) gang vector default(present)
         do dest_rank = 0, num_procs-1
             do k = 1, Nzloc 
                 do j = 1, Nyloc
@@ -1102,10 +1112,14 @@ contains
             end do
         end do 
 
+        !$acc update host(sendbuf_tensor)
+
         call MPI_Alltoall(sendbuf_tensor, 9*NxC*Nyloc*Nzloc, MPI_DOUBLE_COMPLEX, & 
                           recvbuf_tensor, 9*NxC*Nyloc*Nzloc, MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD, ierr)
 
-        !$acc parallel loop collapse(5) gang vector default(present) copy(recvbuf_tensor)
+        !$acc update device(recvbuf_tensor)
+
+        !$acc parallel loop collapse(5) gang vector default(present) 
         do src_rank = 0, num_procs-1
             do k = 1, Nzloc 
                 do j = 1, Nyloc
@@ -1125,7 +1139,7 @@ contains
         integer :: dest_rank, src_rank
         integer :: i, j, k, l
 
-        !$acc parallel loop collapse(5) gang vector default(present) copy(sendbuf_tensor)
+        !$acc parallel loop collapse(5) gang vector default(present) 
         do dest_rank = 0, num_procs-1
             do k = 1, Nzloc 
                 do j = 1, Nyloc 
@@ -1138,10 +1152,14 @@ contains
             end do
         end do
 
+        !$acc update host(sendbuf_tensor)
+
         call MPI_Alltoall(sendbuf_tensor, 9*NxC*Nyloc*Nzloc, MPI_DOUBLE_COMPLEX, & 
                           recvbuf_tensor, 9*NxC*Nyloc*Nzloc, MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD, ierr)
 
-        !$acc parallel loop collapse(5) gang vector default(present) copy(recvbuf_tensor) 
+        !$acc update device(recvbuf_tensor)
+
+        !$acc parallel loop collapse(5) gang vector default(present) 
         do src_rank = 0, num_procs-1
             do k = 1, Nzloc
                 do j = 1, Nyloc 
@@ -1568,8 +1586,8 @@ contains
         @:DEALLOCATE(data_real_3D_slabz, data_cmplx_slabz, data_cmplx_slaby)
         @:DEALLOCATE(data_cmplx_slabz_tensor, data_cmplx_slaby_tensor)
         
-        deallocate(sendbuf_sf, recvbuf_sf)
-        deallocate(sendbuf_tensor, recvbuf_tensor)
+        @:DEALLOCATE(sendbuf_sf, recvbuf_sf)
+        @:DEALLOCATE(sendbuf_tensor, recvbuf_tensor)
 
 #if defined(MFC_OpenACC)
         ierr = cufftDestroy(plan_x_fwd_gpu)

From 3e6aed61f128ebffce0c1de82a32bdf0d78bd3c3 Mon Sep 17 00:00:00 2001
From: Conrad Delgado <conradd3@dt-login02.delta.ncsa.illinois.edu>
Date: Thu, 25 Sep 2025 15:20:48 -0500
Subject: [PATCH 21/30] batched mpiAlltoall for cons vars and tensors

---
 src/simulation/m_volume_filtering.fpp | 281 +++++++++++++++++++++++++-
 1 file changed, 274 insertions(+), 7 deletions(-)

diff --git a/src/simulation/m_volume_filtering.fpp b/src/simulation/m_volume_filtering.fpp
index a2c0c0efac..5d487c9122 100644
--- a/src/simulation/m_volume_filtering.fpp
+++ b/src/simulation/m_volume_filtering.fpp
@@ -29,9 +29,10 @@ module m_volume_filtering
     private; public :: s_initialize_fftw_explicit_filter_module, &
  s_initialize_filtering_kernel, s_initialize_fluid_indicator_function, & 
  s_initialize_filtered_fluid_indicator_function, s_initialize_fluid_indicator_gradient, &
- s_finalize_fftw_explicit_filter_module, s_volume_filter_momentum_eqn, s_apply_fftw_filter_scalarfield, s_filter_tensor_field, &
+ s_finalize_fftw_explicit_filter_module, s_volume_filter_momentum_eqn, s_apply_fftw_filter_scalarfield, s_filter_tensor_field, s_filter_cons_vars, &
  s_compute_viscous_stress_tensor, s_compute_stress_tensor, s_compute_divergence_stress_tensor, s_compute_particle_forces, &
- s_mpi_transpose_slabZ2Y, s_mpi_transpose_slabY2Z, s_mpi_transpose_slabZ2Y_tensor, s_mpi_transpose_slabY2Z_tensor, s_mpi_FFT_fwd, s_mpi_FFT_bwd, &
+ s_mpi_transpose_slabZ2Y, s_mpi_transpose_slabY2Z, s_mpi_transpose_slabZ2Y_tensor, s_mpi_transpose_slabY2Z_tensor, & 
+ s_mpi_transpose_slabZ2Y_cons, s_mpi_transpose_slabY2Z_cons, s_mpi_FFT_fwd, s_mpi_FFT_bwd, &
  s_setup_terms_filtering, s_compute_pseudo_turbulent_reynolds_stress, s_compute_effective_viscosity
 
 #if !defined(MFC_OpenACC)
@@ -96,6 +97,8 @@ module m_volume_filtering
     complex(c_double_complex), allocatable :: data_cmplx_slabz(:, :, :), data_cmplx_slaby(:, :, :)
     ! 3D arrays for slab transposes of tensor quantities
     complex(c_double_complex), allocatable :: data_cmplx_slabz_tensor(:, :, :, :), data_cmplx_slaby_tensor(:, :, :, :)
+    ! 4D arrays (variable index, x, y, z) for slab transposes of conserved variables
+    complex(c_double_complex), allocatable :: data_cmplx_slabz_cons(:, :, :, :), data_cmplx_slaby_cons(:, :, :, :)
 
     ! input/output array for FFT routine
     real(c_double), allocatable :: data_real_3D_slabz(:, :, :)
@@ -113,8 +116,9 @@ module m_volume_filtering
     ! buffers for data transpose
     complex(c_double_complex), allocatable :: sendbuf_sf(:), recvbuf_sf(:)
     complex(c_double_complex), allocatable :: sendbuf_tensor(:), recvbuf_tensor(:)
+    complex(c_double_complex), allocatable :: sendbuf_cons(:), recvbuf_cons(:)
 
-    !$acc declare create(sendbuf_sf, recvbuf_sf, sendbuf_tensor, recvbuf_tensor)
+    !$acc declare create(sendbuf_sf, recvbuf_sf, sendbuf_tensor, recvbuf_tensor, sendbuf_cons, recvbuf_cons)
 
 contains
 
@@ -243,11 +247,15 @@ contains
         @:ALLOCATE(data_cmplx_slaby(NxC, Nyloc, Nz))
         @:ALLOCATE(data_cmplx_slabz_tensor(9, NxC, Ny, Nzloc))
         @:ALLOCATE(data_cmplx_slaby_tensor(9, NxC, Nyloc, Nz))
+        @:ALLOCATE(data_cmplx_slabz_cons(4, NxC, Ny, Nzloc))
+        @:ALLOCATE(data_cmplx_slaby_cons(4, NxC, Nyloc, Nz))
 
         @:ALLOCATE(sendbuf_sf(NxC*Nyloc*Nzloc*num_procs))
         @:ALLOCATE(recvbuf_sf(NxC*Nyloc*Nzloc*num_procs))
         @:ALLOCATE(sendbuf_tensor(9*NxC*Nyloc*Nzloc*num_procs))
         @:ALLOCATE(recvbuf_tensor(9*NxC*Nyloc*Nzloc*num_procs))
+        @:ALLOCATE(sendbuf_cons(5*NxC*Nyloc*Nzloc*num_procs))
+        @:ALLOCATE(recvbuf_cons(5*NxC*Nyloc*Nzloc*num_procs))
 
 #if defined(MFC_OpenACC)
         !< GPU FFT plans
@@ -570,9 +578,10 @@ contains
         integer :: i, j, k
 
         call nvtxStartRange("FILTER-CONS-VARS")
-        do i = 1, sys_size-1
-            call s_apply_fftw_filter_scalarfield(filtered_fluid_indicator_function, .true., q_cons_vf(i), q_cons_filtered(i))
-        end do 
+        call s_filter_cons_vars(q_cons_vf, q_cons_filtered)
+        ! do i = 1, sys_size-1
+        !     call s_apply_fftw_filter_scalarfield(filtered_fluid_indicator_function, .true., q_cons_vf(i), q_cons_filtered(i))
+        ! end do 
         call s_apply_fftw_filter_scalarfield(filtered_fluid_indicator_function, .true., q_prim_vf(E_idx), filtered_pressure)
         call nvtxEndRange
 
@@ -1174,7 +1183,85 @@ contains
         
     end subroutine s_mpi_transpose_slabY2Z_tensor
 
+    !< transpose domain from z-slabs to y-slabs on each processor for batched 5 element conserved variables
+    subroutine s_mpi_transpose_slabZ2Y_cons
+        integer :: dest_rank, src_rank
+        integer :: i, j, k, l
+
+        !$acc parallel loop collapse(5) gang vector default(present)
+        do dest_rank = 0, num_procs-1
+            do k = 1, Nzloc 
+                do j = 1, Nyloc
+                    do i = 1, NxC
+                        do l = 1, 5
+                            sendbuf_cons(l + (i-1)*5 + (j-1)*5*NxC + (k-1)*5*NxC*Nyloc + dest_rank*5*NxC*Nyloc*Nzloc) = data_cmplx_slabz_cons(l, i, j+dest_rank*Nyloc, k)
+                        end do 
+                    end do
+                end do
+            end do
+        end do 
+
+        !$acc update host(sendbuf_cons)
+
+        call MPI_Alltoall(sendbuf_cons, 5*NxC*Nyloc*Nzloc, MPI_DOUBLE_COMPLEX, & 
+                          recvbuf_cons, 5*NxC*Nyloc*Nzloc, MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD, ierr)
+
+        !$acc update device(recvbuf_cons)
+
+        !$acc parallel loop collapse(5) gang vector default(present) 
+        do src_rank = 0, num_procs-1
+            do k = 1, Nzloc 
+                do j = 1, Nyloc
+                    do i = 1, NxC
+                        do l = 1, 5
+                            data_cmplx_slaby_cons(l, i, j, k+src_rank*Nzloc) = recvbuf_cons(l + (i-1)*5 + (j-1)*5*NxC + (k-1)*5*NxC*Nyloc + src_rank*5*NxC*Nyloc*Nzloc)
+                        end do 
+                    end do
+                end do 
+            end do
+        end do
+
+    end subroutine s_mpi_transpose_slabZ2Y_cons
+
+    !< transpose domain from y-slabs to z-slabs on each processor for batched 4 element conserved variables
+    subroutine s_mpi_transpose_slabY2Z_cons
+        integer :: dest_rank, src_rank
+        integer :: i, j, k, l
+
+        !$acc parallel loop collapse(5) gang vector default(present) 
+        do dest_rank = 0, num_procs-1
+            do k = 1, Nzloc 
+                do j = 1, Nyloc 
+                    do i = 1, NxC 
+                        do l = 1, 5
+                            sendbuf_cons(l + (i-1)*5 + (j-1)*5*NxC + (k-1)*5*NxC*Nyloc + dest_rank*5*NxC*Nyloc*Nzloc) = data_cmplx_slaby_cons(l, i, j, k+dest_rank*Nzloc)
+                        end do 
+                    end do 
+                end do 
+            end do
+        end do
+
+        !$acc update host(sendbuf_cons)
+
+        call MPI_Alltoall(sendbuf_cons, 5*NxC*Nyloc*Nzloc, MPI_DOUBLE_COMPLEX, & 
+                          recvbuf_cons, 5*NxC*Nyloc*Nzloc, MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD, ierr)
 
+        !$acc update device(recvbuf_cons)
+
+        !$acc parallel loop collapse(5) gang vector default(present) 
+        do src_rank = 0, num_procs-1
+            do k = 1, Nzloc
+                do j = 1, Nyloc 
+                    do i = 1, NxC 
+                        do l = 1, 5
+                            data_cmplx_slabz_cons(l, i, j+src_rank*Nyloc, k) = recvbuf_cons(l + (i-1)*5 + (j-1)*5*NxC + (k-1)*5*NxC*Nyloc + src_rank*5*NxC*Nyloc*Nzloc)
+                        end do 
+                    end do
+                end do 
+            end do
+        end do
+        
+    end subroutine s_mpi_transpose_slabY2Z_cons
 
     !< compute forward FFT, input: data_real_3D_slabz, output: data_cmplx_out1d
     subroutine s_filter_tensor_field(q_tensor_in, q_tensor_out)
@@ -1372,7 +1459,185 @@ contains
 
     end subroutine s_filter_tensor_field
 
+    !< volume-filter 5 conserved variables (FFT convolution with the kernel), result in q_cons_filtered
+    subroutine s_filter_cons_vars(q_cons_vf, q_cons_filtered)
+        type(scalar_field), dimension(5), intent(inout) :: q_cons_vf
+        type(scalar_field), dimension(5), intent(inout) :: q_cons_filtered
+        integer :: i, j, k, l
+
+        ! ===== forward FFT =====
+        ! outer element loop
+        do l = 1, 5
+
+            !$acc parallel loop collapse(3)
+            do i = 0, m 
+                do j = 0, n 
+                    do k = 0, p 
+                        data_real_3D_slabz(i+1, j+1, k+1) = q_cons_vf(l)%sf(i, j, k) * fluid_indicator_function%sf(i, j, k)
+                    end do 
+                end do 
+            end do
+
+            ! 3D z-slab -> 1D x, y, z
+            !$acc parallel loop collapse(3) gang vector default(present)
+            do i = 1, Nx 
+                do j = 1, Ny 
+                    do k = 1, Nzloc
+                        data_real_in1d(i + (j-1)*Nx + (k-1)*Nx*Ny) = data_real_3D_slabz(i, j, k)
+                    end do 
+                end do 
+            end do
+    
+            ! X FFT
+#if defined(MFC_OpenACC)
+            ierr = cufftExecD2Z(plan_x_fwd_gpu, data_real_in1d, data_cmplx_out1d)
+#else
+            call fftw_execute_dft_r2c(plan_x_r2c_fwd, data_real_in1d, data_cmplx_out1d)
+#endif
+    
+            ! 1D x, y, z -> 1D y, x, z (CMPLX)
+            !$acc parallel loop collapse(3) gang vector default(present)
+            do i = 1, NxC
+                do j = 1, Ny 
+                    do k = 1, Nzloc
+                        data_cmplx_out1dy(j + (i-1)*Ny + (k-1)*Ny*NxC) = data_cmplx_out1d(i + (j-1)*NxC + (k-1)*NxC*Ny)
+                    end do 
+                end do 
+            end do
+    
+            ! Y FFT 
+#if defined(MFC_OpenACC)
+            ierr = cufftExecZ2Z(plan_y_gpu, data_cmplx_out1dy, data_cmplx_out1dy, CUFFT_FORWARD)
+#else
+            call fftw_execute_dft(plan_y_c2c_fwd, data_cmplx_out1dy, data_cmplx_out1dy)
+#endif 
+    
+            ! 1D y, x, z -> 3D z-slab
+            !$acc parallel loop collapse(3) gang vector default(present)
+            do i = 1, NxC 
+                do j = 1, Ny 
+                    do k = 1, Nzloc
+                        data_cmplx_slabz_cons(l, i, j, k) = data_cmplx_out1dy(j + (i-1)*Ny + (k-1)*Ny*NxC)
+                    end do 
+                end do 
+            end do 
+            ! pack data_cmplx_slabz_cons for MPI transpose
+        end do 
+
+        ! cons vars MPI data transpose
+        call s_mpi_transpose_slabZ2Y_cons
+
+        ! outer element loop
+        do l = 1, 5
+
+            ! 3D y-slab -> 1D z, x, y
+            !$acc parallel loop collapse(3) gang vector default(present)
+            do i = 1, NxC 
+                do j = 1, Nyloc 
+                    do k = 1, Nz
+                        data_cmplx_out1d(k + (i-1)*Nz + (j-1)*Nz*NxC) = data_cmplx_slaby_cons(l, i, j, k)
+                    end do 
+                end do 
+            end do
+
+            ! Z FFT
+#if defined(MFC_OpenACC)
+            ierr = cufftExecZ2Z(plan_z_gpu, data_cmplx_out1d, data_cmplx_out1d, CUFFT_FORWARD)
+#else
+            call fftw_execute_dft(plan_z_c2c_fwd, data_cmplx_out1d, data_cmplx_out1d)
+#endif
+            
+            ! convolution with filtering kernel in Fourier space
+            !$acc parallel loop collapse(3) gang vector default(present)
+            do i = 1, NxC 
+                do j = 1, Nyloc 
+                    do k = 1, Nz 
+                        data_cmplx_out1d(k + (i-1)*Nz + (j-1)*Nz*NxC) = data_cmplx_out1d(k + (i-1)*Nz + (j-1)*Nz*NxC) * cmplx_kernelG1d(k + (i-1)*Nz + (j-1)*Nz*NxC)
+                    end do 
+                end do 
+            end do
+
+            ! ===== begin backward FFT =====
+            ! Z inv FFT 
+#if defined(MFC_OpenACC)
+            ierr = cufftExecZ2Z(plan_z_gpu, data_cmplx_out1d, data_cmplx_out1d, CUFFT_INVERSE)
+#else
+            call fftw_execute_dft(plan_z_c2c_bwd, data_cmplx_out1d, data_cmplx_out1d)
+#endif
+
+            ! 1D z, x, y -> 3D y-slab
+            !$acc parallel loop collapse(3) gang vector default(present)
+            do i = 1, NxC 
+                do j = 1, Nyloc 
+                    do k = 1, Nz 
+                        data_cmplx_slaby_cons(l, i, j, k) = data_cmplx_out1d(k + (i-1)*Nz + (j-1)*Nz*NxC)
+                    end do 
+                end do 
+            end do
+            ! pack data_cmplx_slaby_cons for MPI transpose
+        end do
+
+        call s_mpi_transpose_slabY2Z_cons
+
+        ! outer element loop
+        do l = 1, 5
+            
+            ! 3D z-slab -> 1D y, x, z
+            !$acc parallel loop collapse(3) gang vector default(present)
+            do i = 1, NxC 
+                do j = 1, Ny 
+                    do k = 1, Nzloc
+                        data_cmplx_out1dy(j + (i-1)*Ny + (k-1)*Ny*NxC) = data_cmplx_slabz_cons(l, i, j, k)
+                    end do 
+                end do 
+            end do
+
+            ! Y inv FFT 
+#if defined(MFC_OpenACC)
+            ierr = cufftExecZ2Z(plan_y_gpu, data_cmplx_out1dy, data_cmplx_out1dy, CUFFT_INVERSE)
+#else
+            call fftw_execute_dft(plan_y_c2c_bwd, data_cmplx_out1dy, data_cmplx_out1dy)
+#endif
+
+            ! 1D y, x, z -> 1D x, y, z 
+            !$acc parallel loop collapse(3) gang vector default(present)
+            do i = 1, NxC 
+                do j = 1, Ny 
+                    do k = 1, Nzloc
+                        data_cmplx_out1d(i + (j-1)*NxC + (k-1)*NxC*Ny) = data_cmplx_out1dy(j + (i-1)*Ny + (k-1)*Ny*NxC)
+                    end do 
+                end do 
+            end do
+
+            ! X inv FFT
+#if defined(MFC_OpenACC)
+            ierr = cufftExecZ2D(plan_x_bwd_gpu, data_cmplx_out1d, data_real_in1d)
+#else
+            call fftw_execute_dft_c2r(plan_x_c2r_bwd, data_cmplx_out1d, data_real_in1d)
+#endif
+
+            ! 1D x, y, z -> 3D z-slab
+            !$acc parallel loop collapse(3) gang vector default(present)
+            do i = 1, Nx 
+                do j = 1, Ny 
+                    do k = 1, Nzloc
+                        data_real_3D_slabz(i, j, k) = data_real_in1d(i + (j-1)*Nx + (k-1)*Nx*Ny)
+                    end do 
+                end do 
+            end do
+
+            !$acc parallel loop collapse(3) gang vector default(present)
+            do i = 0, m
+                do j = 0, n
+                    do k = 0, p
+                        q_cons_filtered(l)%sf(i, j, k) = data_real_3D_slabz(i+1, j+1, k+1) / (real(Nx*Ny*Nz, dp) * filtered_fluid_indicator_function%sf(i, j, k))
+                    end do 
+                end do 
+            end do
+
+        end do
 
+    end subroutine s_filter_cons_vars
 
     !< compute forward FFT, input: data_real_3D_slabz, output: data_cmplx_out1d
     subroutine s_mpi_FFT_fwd
@@ -1585,9 +1850,11 @@ contains
         @:DEALLOCATE(cmplx_kernelG1d, real_kernelG_in)
         @:DEALLOCATE(data_real_3D_slabz, data_cmplx_slabz, data_cmplx_slaby)
         @:DEALLOCATE(data_cmplx_slabz_tensor, data_cmplx_slaby_tensor)
+        @:DEALLOCATE(data_cmplx_slabz_cons, data_cmplx_slaby_cons)
         
         @:DEALLOCATE(sendbuf_sf, recvbuf_sf)
         @:DEALLOCATE(sendbuf_tensor, recvbuf_tensor)
+        @:DEALLOCATE(sendbuf_cons, recvbuf_cons)
 
 #if defined(MFC_OpenACC)
         ierr = cufftDestroy(plan_x_fwd_gpu)
@@ -1612,4 +1879,4 @@ contains
 
     end subroutine s_finalize_fftw_explicit_filter_module
 
-end module m_volume_filtering
\ No newline at end of file
+end module m_volume_filtering

From 3e81245ed27d41dcc27e5fb4c41b8b407ee18018 Mon Sep 17 00:00:00 2001
From: Conrad Delgado <conradd3@dt-login01.delta.ncsa.illinois.edu>
Date: Sat, 27 Sep 2025 23:30:49 -0500
Subject: [PATCH 22/30] starting runs

---
 src/simulation/m_start_up.fpp         |  2 +-
 src/simulation/m_volume_filtering.fpp | 17 +++++++++--------
 2 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/src/simulation/m_start_up.fpp b/src/simulation/m_start_up.fpp
index c593da603d..d83d73e9d3 100644
--- a/src/simulation/m_start_up.fpp
+++ b/src/simulation/m_start_up.fpp
@@ -1322,7 +1322,7 @@ contains
         ! Volume filter flow variables, compute unclosed terms and their statistics
         if (volume_filtering_momentum_eqn) then 
             if (t_step > t_step_stat_start) then  
-                call nvtxStartRange("VOLUME-FILTERED-MOMENTUM-EQUATION")  
+                call nvtxStartRange("VOLUME-FILTER-MOMENTUM-EQUATION")  
                 call s_volume_filter_momentum_eqn(q_cons_ts(1)%vf, q_prim_vf)
                 call nvtxEndRange
 
diff --git a/src/simulation/m_volume_filtering.fpp b/src/simulation/m_volume_filtering.fpp
index 5d487c9122..66844526ba 100644
--- a/src/simulation/m_volume_filtering.fpp
+++ b/src/simulation/m_volume_filtering.fpp
@@ -111,7 +111,8 @@ module m_volume_filtering
 
     !$acc declare create(Nx, Ny, Nz, NxC, Nyloc, Nzloc)
     !$acc declare create(data_real_in1d, data_cmplx_out1d, data_cmplx_out1dy)
-    !$acc declare create(data_cmplx_slabz, data_cmplx_slaby, data_cmplx_slabz_tensor, data_cmplx_slaby_tensor, data_real_3D_slabz, real_kernelG_in, cmplx_kernelG1d)
+    !$acc declare create(data_cmplx_slabz, data_cmplx_slaby, data_cmplx_slabz_tensor, data_cmplx_slaby_tensor, data_cmplx_slabz_cons, data_cmplx_slaby_cons)
+    !$acc declare create(data_real_3D_slabz, real_kernelG_in, cmplx_kernelG1d)
 
     ! buffers for data transpose
     complex(c_double_complex), allocatable :: sendbuf_sf(:), recvbuf_sf(:)
@@ -247,8 +248,8 @@ contains
         @:ALLOCATE(data_cmplx_slaby(NxC, Nyloc, Nz))
         @:ALLOCATE(data_cmplx_slabz_tensor(9, NxC, Ny, Nzloc))
         @:ALLOCATE(data_cmplx_slaby_tensor(9, NxC, Nyloc, Nz))
-        @:ALLOCATE(data_cmplx_slabz_cons(4, NxC, Ny, Nzloc))
-        @:ALLOCATE(data_cmplx_slaby_cons(4, NxC, Nyloc, Nz))
+        @:ALLOCATE(data_cmplx_slabz_cons(5, NxC, Ny, Nzloc))
+        @:ALLOCATE(data_cmplx_slaby_cons(5, NxC, Nyloc, Nz))
 
         @:ALLOCATE(sendbuf_sf(NxC*Nyloc*Nzloc*num_procs))
         @:ALLOCATE(recvbuf_sf(NxC*Nyloc*Nzloc*num_procs))
@@ -578,10 +579,10 @@ contains
         integer :: i, j, k
 
         call nvtxStartRange("FILTER-CONS-VARS")
-        call s_filter_cons_vars(q_cons_vf, q_cons_filtered)
-        ! do i = 1, sys_size-1
-        !     call s_apply_fftw_filter_scalarfield(filtered_fluid_indicator_function, .true., q_cons_vf(i), q_cons_filtered(i))
-        ! end do 
+        !call s_filter_cons_vars(q_cons_vf, q_cons_filtered)
+        do i = 1, sys_size-1
+            call s_apply_fftw_filter_scalarfield(filtered_fluid_indicator_function, .true., q_cons_vf(i), q_cons_filtered(i))
+        end do 
         call s_apply_fftw_filter_scalarfield(filtered_fluid_indicator_function, .true., q_prim_vf(E_idx), filtered_pressure)
         call nvtxEndRange
 
@@ -1223,7 +1224,7 @@ contains
 
     end subroutine s_mpi_transpose_slabZ2Y_cons
 
-    !< transpose domain from y-slabs to z-slabs on each processor for batched 4 element conserved variables
+    !< transpose domain from y-slabs to z-slabs on each processor for batched 5 element conserved variables
     subroutine s_mpi_transpose_slabY2Z_cons
         integer :: dest_rank, src_rank
         integer :: i, j, k, l

From 303ffab5ca64a8609eb8fc6b62790fff2ec2c6a2 Mon Sep 17 00:00:00 2001
From: conradd3 <conradd3@illinois.edu>
Date: Wed, 1 Oct 2025 14:32:33 -0500
Subject: [PATCH 23/30] single precision alltoall

---
 src/simulation/m_volume_filtering.fpp | 30 +++++++++++++--------------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/src/simulation/m_volume_filtering.fpp b/src/simulation/m_volume_filtering.fpp
index 66844526ba..9c1661844e 100644
--- a/src/simulation/m_volume_filtering.fpp
+++ b/src/simulation/m_volume_filtering.fpp
@@ -115,9 +115,9 @@ module m_volume_filtering
     !$acc declare create(data_real_3D_slabz, real_kernelG_in, cmplx_kernelG1d)
 
     ! buffers for data transpose
-    complex(c_double_complex), allocatable :: sendbuf_sf(:), recvbuf_sf(:)
-    complex(c_double_complex), allocatable :: sendbuf_tensor(:), recvbuf_tensor(:)
-    complex(c_double_complex), allocatable :: sendbuf_cons(:), recvbuf_cons(:)
+    complex(c_float_complex), allocatable :: sendbuf_sf(:), recvbuf_sf(:)
+    complex(c_float_complex), allocatable :: sendbuf_tensor(:), recvbuf_tensor(:)
+    complex(c_float_complex), allocatable :: sendbuf_cons(:), recvbuf_cons(:)
 
     !$acc declare create(sendbuf_sf, recvbuf_sf, sendbuf_tensor, recvbuf_tensor, sendbuf_cons, recvbuf_cons)
 
@@ -1050,8 +1050,8 @@ contains
 
         !$acc update host(sendbuf_sf)
 
-        call MPI_Alltoall(sendbuf_sf, NxC*Nyloc*Nzloc, MPI_DOUBLE_COMPLEX, & 
-                          recvbuf_sf, NxC*Nyloc*Nzloc, MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD, ierr)
+        call MPI_Alltoall(sendbuf_sf, NxC*Nyloc*Nzloc, MPI_COMPLEX, & 
+                          recvbuf_sf, NxC*Nyloc*Nzloc, MPI_COMPLEX, MPI_COMM_WORLD, ierr)
 
         !$acc update device(recvbuf_sf)
 
@@ -1086,8 +1086,8 @@ contains
 
         !$acc update host(sendbuf_sf)
 
-        call MPI_Alltoall(sendbuf_sf, NxC*Nyloc*Nzloc, MPI_DOUBLE_COMPLEX, & 
-                          recvbuf_sf, NxC*Nyloc*Nzloc, MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD, ierr)
+        call MPI_Alltoall(sendbuf_sf, NxC*Nyloc*Nzloc, MPI_COMPLEX, & 
+                          recvbuf_sf, NxC*Nyloc*Nzloc, MPI_COMPLEX, MPI_COMM_WORLD, ierr)
 
         !$acc update device(recvbuf_sf)
 
@@ -1124,8 +1124,8 @@ contains
 
         !$acc update host(sendbuf_tensor)
 
-        call MPI_Alltoall(sendbuf_tensor, 9*NxC*Nyloc*Nzloc, MPI_DOUBLE_COMPLEX, & 
-                          recvbuf_tensor, 9*NxC*Nyloc*Nzloc, MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD, ierr)
+        call MPI_Alltoall(sendbuf_tensor, 9*NxC*Nyloc*Nzloc, MPI_COMPLEX, & 
+                          recvbuf_tensor, 9*NxC*Nyloc*Nzloc, MPI_COMPLEX, MPI_COMM_WORLD, ierr)
 
         !$acc update device(recvbuf_tensor)
 
@@ -1164,8 +1164,8 @@ contains
 
         !$acc update host(sendbuf_tensor)
 
-        call MPI_Alltoall(sendbuf_tensor, 9*NxC*Nyloc*Nzloc, MPI_DOUBLE_COMPLEX, & 
-                          recvbuf_tensor, 9*NxC*Nyloc*Nzloc, MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD, ierr)
+        call MPI_Alltoall(sendbuf_tensor, 9*NxC*Nyloc*Nzloc, MPI_COMPLEX, & 
+                          recvbuf_tensor, 9*NxC*Nyloc*Nzloc, MPI_COMPLEX, MPI_COMM_WORLD, ierr)
 
         !$acc update device(recvbuf_tensor)
 
@@ -1204,8 +1204,8 @@ contains
 
         !$acc update host(sendbuf_cons)
 
-        call MPI_Alltoall(sendbuf_cons, 5*NxC*Nyloc*Nzloc, MPI_DOUBLE_COMPLEX, & 
-                          recvbuf_cons, 5*NxC*Nyloc*Nzloc, MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD, ierr)
+        call MPI_Alltoall(sendbuf_cons, 5*NxC*Nyloc*Nzloc, MPI_COMPLEX, & 
+                          recvbuf_cons, 5*NxC*Nyloc*Nzloc, MPI_COMPLEX, MPI_COMM_WORLD, ierr)
 
         !$acc update device(recvbuf_cons)
 
@@ -1244,8 +1244,8 @@ contains
 
         !$acc update host(sendbuf_cons)
 
-        call MPI_Alltoall(sendbuf_cons, 5*NxC*Nyloc*Nzloc, MPI_DOUBLE_COMPLEX, & 
-                          recvbuf_cons, 5*NxC*Nyloc*Nzloc, MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD, ierr)
+        call MPI_Alltoall(sendbuf_cons, 5*NxC*Nyloc*Nzloc, MPI_COMPLEX, & 
+                          recvbuf_cons, 5*NxC*Nyloc*Nzloc, MPI_COMPLEX, MPI_COMM_WORLD, ierr)
 
         !$acc update device(recvbuf_cons)
 

From 8ac983847c0297a84493d33ca63cb430c448b28f Mon Sep 17 00:00:00 2001
From: Conrad Delgado <conradd3@dt-login04.delta.ncsa.illinois.edu>
Date: Wed, 8 Oct 2025 18:00:10 -0500
Subject: [PATCH 24/30] 24 batch alltoall

---
 src/simulation/m_volume_filtering.fpp | 756 ++++++++++++++------------
 1 file changed, 409 insertions(+), 347 deletions(-)

diff --git a/src/simulation/m_volume_filtering.fpp b/src/simulation/m_volume_filtering.fpp
index 9c1661844e..3cf904d84d 100644
--- a/src/simulation/m_volume_filtering.fpp
+++ b/src/simulation/m_volume_filtering.fpp
@@ -29,10 +29,10 @@ module m_volume_filtering
     private; public :: s_initialize_fftw_explicit_filter_module, &
  s_initialize_filtering_kernel, s_initialize_fluid_indicator_function, & 
  s_initialize_filtered_fluid_indicator_function, s_initialize_fluid_indicator_gradient, &
- s_finalize_fftw_explicit_filter_module, s_volume_filter_momentum_eqn, s_apply_fftw_filter_scalarfield, s_filter_tensor_field, s_filter_cons_vars, &
+ s_finalize_fftw_explicit_filter_module, s_volume_filter_momentum_eqn, s_apply_fftw_filter_scalarfield, s_filter_batch, &
  s_compute_viscous_stress_tensor, s_compute_stress_tensor, s_compute_divergence_stress_tensor, s_compute_particle_forces, &
- s_mpi_transpose_slabZ2Y, s_mpi_transpose_slabY2Z, s_mpi_transpose_slabZ2Y_tensor, s_mpi_transpose_slabY2Z_tensor, & 
- s_mpi_transpose_slabZ2Y_cons, s_mpi_transpose_slabY2Z_cons, s_mpi_FFT_fwd, s_mpi_FFT_bwd, &
+ s_mpi_transpose_slabZ2Y, s_mpi_transpose_slabY2Z, s_mpi_transpose_slabZ2Y_batch, s_mpi_transpose_slabY2Z_batch, & 
+ s_mpi_FFT_fwd, s_mpi_FFT_bwd, &
  s_setup_terms_filtering, s_compute_pseudo_turbulent_reynolds_stress, s_compute_effective_viscosity
 
 #if !defined(MFC_OpenACC)
@@ -94,11 +94,9 @@ module m_volume_filtering
     complex(c_double_complex), allocatable :: data_cmplx_out1dy(:)
 
     ! 3D arrays for slab transposes
-    complex(c_double_complex), allocatable :: data_cmplx_slabz(:, :, :), data_cmplx_slaby(:, :, :)
+    complex(c_float_complex), allocatable :: data_cmplx_slabz(:, :, :), data_cmplx_slaby(:, :, :)
     ! 3D arrays for slab transposes of tensor quantities
-    complex(c_double_complex), allocatable :: data_cmplx_slabz_tensor(:, :, :, :), data_cmplx_slaby_tensor(:, :, :, :)
-    ! 3D arrays for slab transpose of conserved variables
-    complex(c_double_complex), allocatable :: data_cmplx_slabz_cons(:, :, :, :), data_cmplx_slaby_cons(:, :, :, :)
+    complex(c_float_complex), allocatable :: data_cmplx_slabz_batch(:, :, :, :), data_cmplx_slaby_batch(:, :, :, :)
 
     ! input/output array for FFT routine
     real(c_double), allocatable :: data_real_3D_slabz(:, :, :)
@@ -111,15 +109,14 @@ module m_volume_filtering
 
     !$acc declare create(Nx, Ny, Nz, NxC, Nyloc, Nzloc)
     !$acc declare create(data_real_in1d, data_cmplx_out1d, data_cmplx_out1dy)
-    !$acc declare create(data_cmplx_slabz, data_cmplx_slaby, data_cmplx_slabz_tensor, data_cmplx_slaby_tensor, data_cmplx_slabz_cons, data_cmplx_slaby_cons)
+    !$acc declare create(data_cmplx_slabz, data_cmplx_slaby, data_cmplx_slabz_batch, data_cmplx_slaby_batch)
     !$acc declare create(data_real_3D_slabz, real_kernelG_in, cmplx_kernelG1d)
 
     ! buffers for data transpose
     complex(c_float_complex), allocatable :: sendbuf_sf(:), recvbuf_sf(:)
-    complex(c_float_complex), allocatable :: sendbuf_tensor(:), recvbuf_tensor(:)
-    complex(c_float_complex), allocatable :: sendbuf_cons(:), recvbuf_cons(:)
+    complex(c_float_complex), allocatable :: sendbuf_batch(:), recvbuf_batch(:)
 
-    !$acc declare create(sendbuf_sf, recvbuf_sf, sendbuf_tensor, recvbuf_tensor, sendbuf_cons, recvbuf_cons)
+    !$acc declare create(sendbuf_sf, recvbuf_sf, sendbuf_batch, recvbuf_batch)
 
 contains
 
@@ -246,17 +243,13 @@ contains
         @:ALLOCATE(data_real_3D_slabz(Nx, Ny, Nzloc))
         @:ALLOCATE(data_cmplx_slabz(NxC, Ny, Nzloc))
         @:ALLOCATE(data_cmplx_slaby(NxC, Nyloc, Nz))
-        @:ALLOCATE(data_cmplx_slabz_tensor(9, NxC, Ny, Nzloc))
-        @:ALLOCATE(data_cmplx_slaby_tensor(9, NxC, Nyloc, Nz))
-        @:ALLOCATE(data_cmplx_slabz_cons(5, NxC, Ny, Nzloc))
-        @:ALLOCATE(data_cmplx_slaby_cons(5, NxC, Nyloc, Nz))
+        @:ALLOCATE(data_cmplx_slabz_batch(24, NxC, Ny, Nzloc))
+        @:ALLOCATE(data_cmplx_slaby_batch(24, NxC, Nyloc, Nz))
 
         @:ALLOCATE(sendbuf_sf(NxC*Nyloc*Nzloc*num_procs))
         @:ALLOCATE(recvbuf_sf(NxC*Nyloc*Nzloc*num_procs))
-        @:ALLOCATE(sendbuf_tensor(9*NxC*Nyloc*Nzloc*num_procs))
-        @:ALLOCATE(recvbuf_tensor(9*NxC*Nyloc*Nzloc*num_procs))
-        @:ALLOCATE(sendbuf_cons(5*NxC*Nyloc*Nzloc*num_procs))
-        @:ALLOCATE(recvbuf_cons(5*NxC*Nyloc*Nzloc*num_procs))
+        @:ALLOCATE(sendbuf_batch(24*NxC*Nyloc*Nzloc*num_procs))
+        @:ALLOCATE(recvbuf_batch(24*NxC*Nyloc*Nzloc*num_procs))
 
 #if defined(MFC_OpenACC)
         !< GPU FFT plans
@@ -578,37 +571,15 @@ contains
         type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf
         integer :: i, j, k
 
-        call nvtxStartRange("FILTER-CONS-VARS")
-        !call s_filter_cons_vars(q_cons_vf, q_cons_filtered)
-        do i = 1, sys_size-1
-            call s_apply_fftw_filter_scalarfield(filtered_fluid_indicator_function, .true., q_cons_vf(i), q_cons_filtered(i))
-        end do 
-        call s_apply_fftw_filter_scalarfield(filtered_fluid_indicator_function, .true., q_prim_vf(E_idx), filtered_pressure)
-        call nvtxEndRange
-
-        call nvtxStartRange("COMPUTE-UNCLOSED-TERMS")
         call s_setup_terms_filtering(q_cons_vf, q_prim_vf, reynolds_stress, visc_stress, pres_visc_stress, div_pres_visc_stress)
 
-        ! pseudo turbulent reynolds stress
-        ! do i = 1, num_dims 
-        !     do j = 1, num_dims
-        !         call s_apply_fftw_filter_scalarfield(filtered_fluid_indicator_function, .true., reynolds_stress(i)%vf(j))
-        !     end do
-        ! end do 
-        call s_filter_tensor_field(reynolds_stress)
-        ! effective viscosity
-        ! do i = 1, num_dims 
-        !     do j = 1, num_dims
-        !         call s_apply_fftw_filter_scalarfield(filtered_fluid_indicator_function, .true., visc_stress(i)%vf(j), eff_visc(i)%vf(j))
-        !     end do
-        ! end do 
-        call s_filter_tensor_field(visc_stress, eff_visc)
+        call s_filter_batch(q_cons_vf, q_cons_filtered, q_prim_vf(E_idx), filtered_pressure, reynolds_stress, visc_stress, eff_visc)
+
         ! interphase momentum exchange
         call s_compute_interphase_momentum_exchange(filtered_fluid_indicator_function, grad_fluid_indicator, pres_visc_stress, int_mom_exch)
 
         call s_compute_pseudo_turbulent_reynolds_stress(q_cons_filtered, reynolds_stress)
         call s_compute_effective_viscosity(q_cons_filtered, eff_visc, visc_stress)
-        call nvtxEndRange
 
     end subroutine s_volume_filter_momentum_eqn
 
@@ -1104,8 +1075,8 @@ contains
         
     end subroutine s_mpi_transpose_slabY2Z
 
-    !< transpose domain from z-slabs to y-slabs on each processor for batched 9 element tensors
-    subroutine s_mpi_transpose_slabZ2Y_tensor
+    !< transpose domain from z-slabs to y-slabs on each processor for the batch of 24 complex fields
+    subroutine s_mpi_transpose_slabZ2Y_batch
         integer :: dest_rank, src_rank
         integer :: i, j, k, l
 
@@ -1114,38 +1085,38 @@ contains
             do k = 1, Nzloc 
                 do j = 1, Nyloc
                     do i = 1, NxC
-                        do l = 1, 9
-                            sendbuf_tensor(l + (i-1)*9 + (j-1)*9*NxC + (k-1)*9*NxC*Nyloc + dest_rank*9*NxC*Nyloc*Nzloc) = data_cmplx_slabz_tensor(l, i, j+dest_rank*Nyloc, k)
+                        do l = 1, 24
+                            sendbuf_batch(l + (i-1)*24 + (j-1)*24*NxC + (k-1)*24*NxC*Nyloc + dest_rank*24*NxC*Nyloc*Nzloc) = data_cmplx_slabz_batch(l, i, j+dest_rank*Nyloc, k)
                         end do 
                     end do
                 end do
             end do
         end do 
 
-        !$acc update host(sendbuf_tensor)
+        !$acc update host(sendbuf_batch)
 
-        call MPI_Alltoall(sendbuf_tensor, 9*NxC*Nyloc*Nzloc, MPI_COMPLEX, & 
-                          recvbuf_tensor, 9*NxC*Nyloc*Nzloc, MPI_COMPLEX, MPI_COMM_WORLD, ierr)
+        call MPI_Alltoall(sendbuf_batch, 24*NxC*Nyloc*Nzloc, MPI_COMPLEX, & 
+                          recvbuf_batch, 24*NxC*Nyloc*Nzloc, MPI_COMPLEX, MPI_COMM_WORLD, ierr)
 
-        !$acc update device(recvbuf_tensor)
+        !$acc update device(recvbuf_batch)
 
         !$acc parallel loop collapse(5) gang vector default(present) 
         do src_rank = 0, num_procs-1
             do k = 1, Nzloc 
                 do j = 1, Nyloc
                     do i = 1, NxC
-                        do l = 1, 9
-                            data_cmplx_slaby_tensor(l, i, j, k+src_rank*Nzloc) = recvbuf_tensor(l + (i-1)*9 + (j-1)*9*NxC + (k-1)*9*NxC*Nyloc + src_rank*9*NxC*Nyloc*Nzloc)
+                        do l = 1, 24
+                            data_cmplx_slaby_batch(l, i, j, k+src_rank*Nzloc) = recvbuf_batch(l + (i-1)*24 + (j-1)*24*NxC + (k-1)*24*NxC*Nyloc + src_rank*24*NxC*Nyloc*Nzloc)
                         end do 
                     end do
                 end do 
             end do
         end do
 
-    end subroutine s_mpi_transpose_slabZ2Y_tensor
+    end subroutine s_mpi_transpose_slabZ2Y_batch
 
-    !< transpose domain from y-slabs to z-slabs on each processor for batched 9 element tensors
-    subroutine s_mpi_transpose_slabY2Z_tensor
+    !< transpose domain from y-slabs to z-slabs on each processor for the batch of 24 complex fields
+    subroutine s_mpi_transpose_slabY2Z_batch
         integer :: dest_rank, src_rank
         integer :: i, j, k, l
 
@@ -1154,137 +1125,197 @@ contains
             do k = 1, Nzloc 
                 do j = 1, Nyloc 
                     do i = 1, NxC 
-                        do l = 1, 9
-                            sendbuf_tensor(l + (i-1)*9 + (j-1)*9*NxC + (k-1)*9*NxC*Nyloc + dest_rank*9*NxC*Nyloc*Nzloc) = data_cmplx_slaby_tensor(l, i, j, k+dest_rank*Nzloc)
+                        do l = 1, 24
+                            sendbuf_batch(l + (i-1)*24 + (j-1)*24*NxC + (k-1)*24*NxC*Nyloc + dest_rank*24*NxC*Nyloc*Nzloc) = data_cmplx_slaby_batch(l, i, j, k+dest_rank*Nzloc)
                         end do 
                     end do 
                 end do 
             end do
         end do
 
-        !$acc update host(sendbuf_tensor)
+        !$acc update host(sendbuf_batch)
 
-        call MPI_Alltoall(sendbuf_tensor, 9*NxC*Nyloc*Nzloc, MPI_COMPLEX, & 
-                          recvbuf_tensor, 9*NxC*Nyloc*Nzloc, MPI_COMPLEX, MPI_COMM_WORLD, ierr)
+        call MPI_Alltoall(sendbuf_batch, 24*NxC*Nyloc*Nzloc, MPI_COMPLEX, & 
+                          recvbuf_batch, 24*NxC*Nyloc*Nzloc, MPI_COMPLEX, MPI_COMM_WORLD, ierr)
 
-        !$acc update device(recvbuf_tensor)
+        !$acc update device(recvbuf_batch)
 
         !$acc parallel loop collapse(5) gang vector default(present) 
         do src_rank = 0, num_procs-1
             do k = 1, Nzloc
                 do j = 1, Nyloc 
                     do i = 1, NxC 
-                        do l = 1, 9
-                            data_cmplx_slabz_tensor(l, i, j+src_rank*Nyloc, k) = recvbuf_tensor(l + (i-1)*9 + (j-1)*9*NxC + (k-1)*9*NxC*Nyloc + src_rank*9*NxC*Nyloc*Nzloc)
+                        do l = 1, 24
+                            data_cmplx_slabz_batch(l, i, j+src_rank*Nyloc, k) = recvbuf_batch(l + (i-1)*24 + (j-1)*24*NxC + (k-1)*24*NxC*Nyloc + src_rank*24*NxC*Nyloc*Nzloc)
                         end do 
                     end do
                 end do 
             end do
         end do
         
-    end subroutine s_mpi_transpose_slabY2Z_tensor
-
-    !< transpose domain from z-slabs to y-slabs on each processor for batched 5 element conserved variables
-    subroutine s_mpi_transpose_slabZ2Y_cons
-        integer :: dest_rank, src_rank
-        integer :: i, j, k, l
-
-        !$acc parallel loop collapse(5) gang vector default(present)
-        do dest_rank = 0, num_procs-1
-            do k = 1, Nzloc 
-                do j = 1, Nyloc
-                    do i = 1, NxC
-                        do l = 1, 5
-                            sendbuf_cons(l + (i-1)*5 + (j-1)*5*NxC + (k-1)*5*NxC*Nyloc + dest_rank*5*NxC*Nyloc*Nzloc) = data_cmplx_slabz_cons(l, i, j+dest_rank*Nyloc, k)
-                        end do 
-                    end do
-                end do
-            end do
-        end do 
-
-        !$acc update host(sendbuf_cons)
+    end subroutine s_mpi_transpose_slabY2Z_batch
 
-        call MPI_Alltoall(sendbuf_cons, 5*NxC*Nyloc*Nzloc, MPI_COMPLEX, & 
-                          recvbuf_cons, 5*NxC*Nyloc*Nzloc, MPI_COMPLEX, MPI_COMM_WORLD, ierr)
 
-        !$acc update device(recvbuf_cons)
+    !< batched filter of 24 fields: slots 1-5 conserved variables, 6 pressure, 7-15 reynolds stress, 16-24 viscous stress
+    subroutine s_filter_batch(q_cons_vf, q_cons_filtered, pressure, filtered_pressure, reynolds_stress, visc_stress, eff_visc)
+        type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf
+        type(scalar_field), dimension(5), intent(inout) :: q_cons_filtered
+        type(scalar_field), intent(inout) :: pressure
+        type(scalar_field), intent(inout) :: filtered_pressure
+        type(vector_field), dimension(3), intent(inout) :: reynolds_stress
+        type(vector_field), dimension(3), intent(inout) :: visc_stress
+        type(vector_field), dimension(3), intent(inout) :: eff_visc
+        integer :: i, j, k, l, q
 
-        !$acc parallel loop collapse(5) gang vector default(present) 
-        do src_rank = 0, num_procs-1
-            do k = 1, Nzloc 
-                do j = 1, Nyloc
-                    do i = 1, NxC
-                        do l = 1, 5
-                            data_cmplx_slaby_cons(l, i, j, k+src_rank*Nzloc) = recvbuf_cons(l + (i-1)*5 + (j-1)*5*NxC + (k-1)*5*NxC*Nyloc + src_rank*5*NxC*Nyloc*Nzloc)
-                        end do 
-                    end do
+        ! cons vars
+        do l = 1, 5
+            !$acc parallel loop collapse(3)
+            do i = 0, m 
+                do j = 0, n 
+                    do k = 0, p 
+                        data_real_3D_slabz(i+1, j+1, k+1) = q_cons_vf(l)%sf(i, j, k) * fluid_indicator_function%sf(i, j, k)
+                    end do 
                 end do 
             end do
-        end do
-
-    end subroutine s_mpi_transpose_slabZ2Y_cons
-
-    !< transpose domain from y-slabs to z-slabs on each processor for batched 5 element conserved variables
-    subroutine s_mpi_transpose_slabY2Z_cons
-        integer :: dest_rank, src_rank
-        integer :: i, j, k, l
-
-        !$acc parallel loop collapse(5) gang vector default(present) 
-        do dest_rank = 0, num_procs-1
-            do k = 1, Nzloc 
-                do j = 1, Nyloc 
-                    do i = 1, NxC 
-                        do l = 1, 5
-                            sendbuf_cons(l + (i-1)*5 + (j-1)*5*NxC + (k-1)*5*NxC*Nyloc + dest_rank*5*NxC*Nyloc*Nzloc) = data_cmplx_slaby_cons(l, i, j, k+dest_rank*Nzloc)
-                        end do 
+            !$acc parallel loop collapse(3) gang vector default(present)
+            do i = 1, Nx 
+                do j = 1, Ny 
+                    do k = 1, Nzloc
+                        data_real_in1d(i + (j-1)*Nx + (k-1)*Nx*Ny) = data_real_3D_slabz(i, j, k)
+                    end do 
+                end do 
+            end do
+#if defined(MFC_OpenACC)
+            ierr = cufftExecD2Z(plan_x_fwd_gpu, data_real_in1d, data_cmplx_out1d)
+#else
+            call fftw_execute_dft_r2c(plan_x_r2c_fwd, data_real_in1d, data_cmplx_out1d)
+#endif
+            !$acc parallel loop collapse(3) gang vector default(present)
+            do i = 1, NxC
+                do j = 1, Ny 
+                    do k = 1, Nzloc
+                        data_cmplx_out1dy(j + (i-1)*Ny + (k-1)*Ny*NxC) = data_cmplx_out1d(i + (j-1)*NxC + (k-1)*NxC*Ny)
                     end do 
                 end do 
             end do
+#if defined(MFC_OpenACC)
+            ierr = cufftExecZ2Z(plan_y_gpu, data_cmplx_out1dy, data_cmplx_out1dy, CUFFT_FORWARD)
+#else
+            call fftw_execute_dft(plan_y_c2c_fwd, data_cmplx_out1dy, data_cmplx_out1dy)
+#endif 
+            !$acc parallel loop collapse(3) gang vector default(present)
+            do i = 1, NxC 
+                do j = 1, Ny 
+                    do k = 1, Nzloc
+                        data_cmplx_slabz_batch(l, i, j, k) = data_cmplx_out1dy(j + (i-1)*Ny + (k-1)*Ny*NxC)
+                    end do 
+                end do 
+            end do 
         end do
 
-        !$acc update host(sendbuf_cons)
-
-        call MPI_Alltoall(sendbuf_cons, 5*NxC*Nyloc*Nzloc, MPI_COMPLEX, & 
-                          recvbuf_cons, 5*NxC*Nyloc*Nzloc, MPI_COMPLEX, MPI_COMM_WORLD, ierr)
-
-        !$acc update device(recvbuf_cons)
+        ! pressure
+        !$acc parallel loop collapse(3)
+        do i = 0, m 
+            do j = 0, n 
+                do k = 0, p 
+                    data_real_3D_slabz(i+1, j+1, k+1) = pressure%sf(i, j, k) * fluid_indicator_function%sf(i, j, k)
+                end do 
+            end do 
+        end do
+        !$acc parallel loop collapse(3) gang vector default(present)
+        do i = 1, Nx 
+            do j = 1, Ny 
+                do k = 1, Nzloc
+                    data_real_in1d(i + (j-1)*Nx + (k-1)*Nx*Ny) = data_real_3D_slabz(i, j, k)
+                end do 
+            end do 
+        end do
+#if defined(MFC_OpenACC)
+        ierr = cufftExecD2Z(plan_x_fwd_gpu, data_real_in1d, data_cmplx_out1d)
+#else
+        call fftw_execute_dft_r2c(plan_x_r2c_fwd, data_real_in1d, data_cmplx_out1d)
+#endif
+        !$acc parallel loop collapse(3) gang vector default(present)
+        do i = 1, NxC
+            do j = 1, Ny 
+                do k = 1, Nzloc
+                    data_cmplx_out1dy(j + (i-1)*Ny + (k-1)*Ny*NxC) = data_cmplx_out1d(i + (j-1)*NxC + (k-1)*NxC*Ny)
+                end do 
+            end do 
+        end do
+#if defined(MFC_OpenACC)
+        ierr = cufftExecZ2Z(plan_y_gpu, data_cmplx_out1dy, data_cmplx_out1dy, CUFFT_FORWARD)
+#else
+        call fftw_execute_dft(plan_y_c2c_fwd, data_cmplx_out1dy, data_cmplx_out1dy)
+#endif 
+        !$acc parallel loop collapse(3) gang vector default(present)
+        do i = 1, NxC 
+            do j = 1, Ny 
+                do k = 1, Nzloc
+                    data_cmplx_slabz_batch(6, i, j, k) = data_cmplx_out1dy(j + (i-1)*Ny + (k-1)*Ny*NxC)
+                end do 
+            end do 
+        end do 
 
-        !$acc parallel loop collapse(5) gang vector default(present) 
-        do src_rank = 0, num_procs-1
-            do k = 1, Nzloc
-                do j = 1, Nyloc 
-                    do i = 1, NxC 
-                        do l = 1, 5
-                            data_cmplx_slabz_cons(l, i, j+src_rank*Nyloc, k) = recvbuf_cons(l + (i-1)*5 + (j-1)*5*NxC + (k-1)*5*NxC*Nyloc + src_rank*5*NxC*Nyloc*Nzloc)
+        ! reynolds stress
+        do l = 1, 3
+            do q = 1, 3
+                !$acc parallel loop collapse(3)
+                do i = 0, m 
+                    do j = 0, n 
+                        do k = 0, p 
+                            data_real_3D_slabz(i+1, j+1, k+1) = reynolds_stress(l)%vf(q)%sf(i, j, k) * fluid_indicator_function%sf(i, j, k)
                         end do 
-                    end do
+                    end do 
+                end do
+                !$acc parallel loop collapse(3) gang vector default(present)
+                do i = 1, Nx 
+                    do j = 1, Ny 
+                        do k = 1, Nzloc
+                            data_real_in1d(i + (j-1)*Nx + (k-1)*Nx*Ny) = data_real_3D_slabz(i, j, k)
+                        end do 
+                    end do 
+                end do
+#if defined(MFC_OpenACC)
+                ierr = cufftExecD2Z(plan_x_fwd_gpu, data_real_in1d, data_cmplx_out1d)
+#else
+                call fftw_execute_dft_r2c(plan_x_r2c_fwd, data_real_in1d, data_cmplx_out1d)
+#endif
+                !$acc parallel loop collapse(3) gang vector default(present)
+                do i = 1, NxC
+                    do j = 1, Ny 
+                        do k = 1, Nzloc
+                            data_cmplx_out1dy(j + (i-1)*Ny + (k-1)*Ny*NxC) = data_cmplx_out1d(i + (j-1)*NxC + (k-1)*NxC*Ny)
+                        end do 
+                    end do 
+                end do
+#if defined(MFC_OpenACC)
+                ierr = cufftExecZ2Z(plan_y_gpu, data_cmplx_out1dy, data_cmplx_out1dy, CUFFT_FORWARD)
+#else
+                call fftw_execute_dft(plan_y_c2c_fwd, data_cmplx_out1dy, data_cmplx_out1dy)
+#endif 
+                !$acc parallel loop collapse(3) gang vector default(present)
+                do i = 1, NxC 
+                    do j = 1, Ny 
+                        do k = 1, Nzloc
+                            data_cmplx_slabz_batch(6 + 3*(l-1) + q, i, j, k) = data_cmplx_out1dy(j + (i-1)*Ny + (k-1)*Ny*NxC)
+                        end do 
+                    end do 
                 end do 
             end do
         end do
-        
-    end subroutine s_mpi_transpose_slabY2Z_cons
 
-    !< compute forward FFT, input: data_real_3D_slabz, output: data_cmplx_out1d
-    subroutine s_filter_tensor_field(q_tensor_in, q_tensor_out)
-        type(vector_field), dimension(3), intent(inout) :: q_tensor_in
-        type(vector_field), dimension(3), intent(inout), optional :: q_tensor_out
-        integer :: i, j, k, l, q
-
-        ! ===== forward FFT =====
-        ! outer tensor element loop
+        ! effective viscosity
         do l = 1, 3
             do q = 1, 3
-
                 !$acc parallel loop collapse(3)
                 do i = 0, m 
                     do j = 0, n 
                         do k = 0, p 
-                            data_real_3D_slabz(i+1, j+1, k+1) = q_tensor_in(l)%vf(q)%sf(i, j, k) * fluid_indicator_function%sf(i, j, k)
+                            data_real_3D_slabz(i+1, j+1, k+1) = visc_stress(l)%vf(q)%sf(i, j, k) * fluid_indicator_function%sf(i, j, k)
                         end do 
                     end do 
                 end do
-
-                ! 3D z-slab -> 1D x, y, z
                 !$acc parallel loop collapse(3) gang vector default(present)
                 do i = 1, Nx 
                     do j = 1, Ny 
@@ -1293,15 +1324,11 @@ contains
                         end do 
                     end do 
                 end do
-        
-                ! X FFT
 #if defined(MFC_OpenACC)
                 ierr = cufftExecD2Z(plan_x_fwd_gpu, data_real_in1d, data_cmplx_out1d)
 #else
                 call fftw_execute_dft_r2c(plan_x_r2c_fwd, data_real_in1d, data_cmplx_out1d)
 #endif
-        
-                ! 1D x, y, z -> 1D y, x, z (CMPLX)
                 !$acc parallel loop collapse(3) gang vector default(present)
                 do i = 1, NxC
                     do j = 1, Ny 
@@ -1310,51 +1337,116 @@ contains
                         end do 
                     end do 
                 end do
-        
-                ! Y FFT 
 #if defined(MFC_OpenACC)
                 ierr = cufftExecZ2Z(plan_y_gpu, data_cmplx_out1dy, data_cmplx_out1dy, CUFFT_FORWARD)
 #else
                 call fftw_execute_dft(plan_y_c2c_fwd, data_cmplx_out1dy, data_cmplx_out1dy)
 #endif 
-        
-                ! 1D y, x, z -> 3D z-slab
                 !$acc parallel loop collapse(3) gang vector default(present)
                 do i = 1, NxC 
                     do j = 1, Ny 
                         do k = 1, Nzloc
-                            data_cmplx_slabz_tensor((l-1)*3 + q, i, j, k) = data_cmplx_out1dy(j + (i-1)*Ny + (k-1)*Ny*NxC)
+                            data_cmplx_slabz_batch(15 + 3*(l-1) + q, i, j, k) = data_cmplx_out1dy(j + (i-1)*Ny + (k-1)*Ny*NxC)
                         end do 
                     end do 
                 end do 
-                ! pack data_cmplx_slabz_tensor for MPI tranpose
             end do
-        end do 
+        end do
+
+
+        call s_mpi_transpose_slabZ2Y_batch
+
+
+        ! cons vars
+        do l = 1, 5
+            !$acc parallel loop collapse(3) gang vector default(present)
+            do i = 1, NxC 
+                do j = 1, Nyloc 
+                    do k = 1, Nz
+                        data_cmplx_out1d(k + (i-1)*Nz + (j-1)*Nz*NxC) = data_cmplx_slaby_batch(l, i, j, k)
+                    end do 
+                end do 
+            end do
+#if defined(MFC_OpenACC)
+            ierr = cufftExecZ2Z(plan_z_gpu, data_cmplx_out1d, data_cmplx_out1d, CUFFT_FORWARD)
+#else
+            call fftw_execute_dft(plan_z_c2c_fwd, data_cmplx_out1d, data_cmplx_out1d)
+#endif
+            !$acc parallel loop collapse(3) gang vector default(present)
+            do i = 1, NxC 
+                do j = 1, Nyloc 
+                    do k = 1, Nz 
+                        data_cmplx_out1d(k + (i-1)*Nz + (j-1)*Nz*NxC) = data_cmplx_out1d(k + (i-1)*Nz + (j-1)*Nz*NxC) * cmplx_kernelG1d(k + (i-1)*Nz + (j-1)*Nz*NxC)
+                    end do 
+                end do 
+            end do
+#if defined(MFC_OpenACC)
+            ierr = cufftExecZ2Z(plan_z_gpu, data_cmplx_out1d, data_cmplx_out1d, CUFFT_INVERSE)
+#else
+            call fftw_execute_dft(plan_z_c2c_bwd, data_cmplx_out1d, data_cmplx_out1d)
+#endif
+            !$acc parallel loop collapse(3) gang vector default(present)
+            do i = 1, NxC 
+                do j = 1, Nyloc 
+                    do k = 1, Nz 
+                        data_cmplx_slaby_batch(l, i, j, k) = data_cmplx_out1d(k + (i-1)*Nz + (j-1)*Nz*NxC)
+                    end do 
+                end do 
+            end do
+        end do
 
-        ! tensor MPI data transpose
-        call s_mpi_transpose_slabZ2Y_tensor
+        ! pressure
+        !$acc parallel loop collapse(3) gang vector default(present)
+        do i = 1, NxC 
+            do j = 1, Nyloc 
+                do k = 1, Nz
+                    data_cmplx_out1d(k + (i-1)*Nz + (j-1)*Nz*NxC) = data_cmplx_slaby_batch(6, i, j, k)
+                end do 
+            end do 
+        end do
+#if defined(MFC_OpenACC)
+        ierr = cufftExecZ2Z(plan_z_gpu, data_cmplx_out1d, data_cmplx_out1d, CUFFT_FORWARD)
+#else
+        call fftw_execute_dft(plan_z_c2c_fwd, data_cmplx_out1d, data_cmplx_out1d)
+#endif
+        !$acc parallel loop collapse(3) gang vector default(present)
+        do i = 1, NxC 
+            do j = 1, Nyloc 
+                do k = 1, Nz 
+                    data_cmplx_out1d(k + (i-1)*Nz + (j-1)*Nz*NxC) = data_cmplx_out1d(k + (i-1)*Nz + (j-1)*Nz*NxC) * cmplx_kernelG1d(k + (i-1)*Nz + (j-1)*Nz*NxC)
+                end do 
+            end do 
+        end do
+#if defined(MFC_OpenACC)
+        ierr = cufftExecZ2Z(plan_z_gpu, data_cmplx_out1d, data_cmplx_out1d, CUFFT_INVERSE)
+#else
+        call fftw_execute_dft(plan_z_c2c_bwd, data_cmplx_out1d, data_cmplx_out1d)
+#endif
+        !$acc parallel loop collapse(3) gang vector default(present)
+        do i = 1, NxC 
+            do j = 1, Nyloc 
+                do k = 1, Nz 
+                    data_cmplx_slaby_batch(6, i, j, k) = data_cmplx_out1d(k + (i-1)*Nz + (j-1)*Nz*NxC)
+                end do 
+            end do 
+        end do
 
-        ! outer tensor element loop
+        ! reynolds stress
         do l = 1, 3
             do q = 1, 3
-                ! 3D y-slab -> 1D z, x, y
                 !$acc parallel loop collapse(3) gang vector default(present)
                 do i = 1, NxC 
                     do j = 1, Nyloc 
                         do k = 1, Nz
-                            data_cmplx_out1d(k + (i-1)*Nz + (j-1)*Nz*NxC) = data_cmplx_slaby_tensor((l-1)*3 + q, i, j, k)
+                            data_cmplx_out1d(k + (i-1)*Nz + (j-1)*Nz*NxC) = data_cmplx_slaby_batch(6 + 3*(l-1) + q, i, j, k)
                         end do 
                     end do 
                 end do
-
-                ! Z FFT
 #if defined(MFC_OpenACC)
                 ierr = cufftExecZ2Z(plan_z_gpu, data_cmplx_out1d, data_cmplx_out1d, CUFFT_FORWARD)
 #else
                 call fftw_execute_dft(plan_z_c2c_fwd, data_cmplx_out1d, data_cmplx_out1d)
 #endif
-                
-                ! convolution with filtering kernel in Fourier space
                 !$acc parallel loop collapse(3) gang vector default(present)
                 do i = 1, NxC 
                     do j = 1, Nyloc 
@@ -1363,282 +1455,254 @@ contains
                         end do 
                     end do 
                 end do
-
-                ! ===== begin backward FFT =====
-                ! Z inv FFT 
 #if defined(MFC_OpenACC)
                 ierr = cufftExecZ2Z(plan_z_gpu, data_cmplx_out1d, data_cmplx_out1d, CUFFT_INVERSE)
 #else
                 call fftw_execute_dft(plan_z_c2c_bwd, data_cmplx_out1d, data_cmplx_out1d)
 #endif
-
-                ! 1D z, x, y -> 3D y-slab
                 !$acc parallel loop collapse(3) gang vector default(present)
                 do i = 1, NxC 
                     do j = 1, Nyloc 
                         do k = 1, Nz 
-                            data_cmplx_slaby_tensor((l-1)*3 + q, i, j, k) = data_cmplx_out1d(k + (i-1)*Nz + (j-1)*Nz*NxC)
+                            data_cmplx_slaby_batch(6 + 3*(l-1) + q, i, j, k) = data_cmplx_out1d(k + (i-1)*Nz + (j-1)*Nz*NxC)
                         end do 
                     end do 
                 end do
-                ! pack data_cmplx_slaby_tensor for MPI tranpose
             end do
         end do
 
-        call s_mpi_transpose_slabY2Z_tensor
-
-        ! outer tensor element loop
+        ! effective viscosity
         do l = 1, 3
             do q = 1, 3
-                
-                ! 3D z-slab -> 1D y, x, z
                 !$acc parallel loop collapse(3) gang vector default(present)
                 do i = 1, NxC 
-                    do j = 1, Ny 
-                        do k = 1, Nzloc
-                            data_cmplx_out1dy(j + (i-1)*Ny + (k-1)*Ny*NxC) = data_cmplx_slabz_tensor((l-1)*3 + q, i, j, k)
+                    do j = 1, Nyloc 
+                        do k = 1, Nz
+                            data_cmplx_out1d(k + (i-1)*Nz + (j-1)*Nz*NxC) = data_cmplx_slaby_batch(15 + 3*(l-1) + q, i, j, k)
                         end do 
                     end do 
                 end do
-
-                ! Y inv FFT 
 #if defined(MFC_OpenACC)
-                ierr = cufftExecZ2Z(plan_y_gpu, data_cmplx_out1dy, data_cmplx_out1dy, CUFFT_INVERSE)
+                ierr = cufftExecZ2Z(plan_z_gpu, data_cmplx_out1d, data_cmplx_out1d, CUFFT_FORWARD)
 #else
-                call fftw_execute_dft(plan_y_c2c_bwd, data_cmplx_out1dy, data_cmplx_out1dy)
+                call fftw_execute_dft(plan_z_c2c_fwd, data_cmplx_out1d, data_cmplx_out1d)
 #endif
-
-                ! 1D y, x, z -> 1D x, y, z 
                 !$acc parallel loop collapse(3) gang vector default(present)
                 do i = 1, NxC 
-                    do j = 1, Ny 
-                        do k = 1, Nzloc
-                            data_cmplx_out1d(i + (j-1)*NxC + (k-1)*NxC*Ny) = data_cmplx_out1dy(j + (i-1)*Ny + (k-1)*Ny*NxC)
+                    do j = 1, Nyloc 
+                        do k = 1, Nz 
+                            data_cmplx_out1d(k + (i-1)*Nz + (j-1)*Nz*NxC) = data_cmplx_out1d(k + (i-1)*Nz + (j-1)*Nz*NxC) * cmplx_kernelG1d(k + (i-1)*Nz + (j-1)*Nz*NxC)
                         end do 
                     end do 
                 end do
-
-                ! X inv FFT
 #if defined(MFC_OpenACC)
-                ierr = cufftExecZ2D(plan_x_bwd_gpu, data_cmplx_out1d, data_real_in1d)
+                ierr = cufftExecZ2Z(plan_z_gpu, data_cmplx_out1d, data_cmplx_out1d, CUFFT_INVERSE)
 #else
-                call fftw_execute_dft_c2r(plan_x_c2r_bwd, data_cmplx_out1d, data_real_in1d)
+                call fftw_execute_dft(plan_z_c2c_bwd, data_cmplx_out1d, data_cmplx_out1d)
 #endif
-
-                ! 1D x, y, z -> 3D z-slab
                 !$acc parallel loop collapse(3) gang vector default(present)
-                do i = 1, Nx 
-                    do j = 1, Ny 
-                        do k = 1, Nzloc
-                            data_real_3D_slabz(i, j, k) = data_real_in1d(i + (j-1)*Nx + (k-1)*Nx*Ny)
+                do i = 1, NxC 
+                    do j = 1, Nyloc 
+                        do k = 1, Nz 
+                            data_cmplx_slaby_batch(15 + 3*(l-1) + q, i, j, k) = data_cmplx_out1d(k + (i-1)*Nz + (j-1)*Nz*NxC)
                         end do 
                     end do 
                 end do
-
-                if (present(q_tensor_out)) then 
-                    !$acc parallel loop collapse(3) gang vector default(present)
-                    do i = 0, m
-                        do j = 0, n
-                            do k = 0, p
-                                q_tensor_out(l)%vf(q)%sf(i, j, k) = data_real_3D_slabz(i+1, j+1, k+1) / (real(Nx*Ny*Nz, dp) * filtered_fluid_indicator_function%sf(i, j, k))
-                            end do 
-                        end do 
-                    end do
-                else
-                    !$acc parallel loop collapse(3) gang vector default(present)
-                    do i = 0, m
-                        do j = 0, n
-                            do k = 0, p
-                                q_tensor_in(l)%vf(q)%sf(i, j, k) = data_real_3D_slabz(i+1, j+1, k+1) / (real(Nx*Ny*Nz, dp) * filtered_fluid_indicator_function%sf(i, j, k))
-                            end do 
-                        end do 
-                    end do
-                end if
-
             end do
         end do
 
-    end subroutine s_filter_tensor_field
 
-    !< compute forward FFT, input: data_real_3D_slabz, output: data_cmplx_out1d
-    subroutine s_filter_cons_vars(q_cons_vf, q_cons_filtered)
-        type(scalar_field), dimension(5), intent(inout) :: q_cons_vf
-        type(scalar_field), dimension(5), intent(inout) :: q_cons_filtered
-        integer :: i, j, k, l
+        call s_mpi_transpose_slabY2Z_batch
 
-        ! ===== forward FFT =====
-        ! outer element loop
-        do l = 1, 5
 
-            !$acc parallel loop collapse(3)
-            do i = 0, m 
-                do j = 0, n 
-                    do k = 0, p 
-                        data_real_3D_slabz(i+1, j+1, k+1) = q_cons_vf(l)%sf(i, j, k) * fluid_indicator_function%sf(i, j, k)
-                    end do 
-                end do 
-            end do
-
-            ! 3D z-slab -> 1D x, y, z
+        ! cons vars 
+        do l = 1, 5              
             !$acc parallel loop collapse(3) gang vector default(present)
-            do i = 1, Nx 
+            do i = 1, NxC 
                 do j = 1, Ny 
                     do k = 1, Nzloc
-                        data_real_in1d(i + (j-1)*Nx + (k-1)*Nx*Ny) = data_real_3D_slabz(i, j, k)
+                        data_cmplx_out1dy(j + (i-1)*Ny + (k-1)*Ny*NxC) = data_cmplx_slabz_batch(l, i, j, k)
                     end do 
                 end do 
             end do
-    
-            ! X FFT
 #if defined(MFC_OpenACC)
-            ierr = cufftExecD2Z(plan_x_fwd_gpu, data_real_in1d, data_cmplx_out1d)
+            ierr = cufftExecZ2Z(plan_y_gpu, data_cmplx_out1dy, data_cmplx_out1dy, CUFFT_INVERSE)
 #else
-            call fftw_execute_dft_r2c(plan_x_r2c_fwd, data_real_in1d, data_cmplx_out1d)
+            call fftw_execute_dft(plan_y_c2c_bwd, data_cmplx_out1dy, data_cmplx_out1dy)
 #endif
-    
-            ! 1D x, y, z -> 1D y, x, z (CMPLX)
             !$acc parallel loop collapse(3) gang vector default(present)
-            do i = 1, NxC
+            do i = 1, NxC 
                 do j = 1, Ny 
                     do k = 1, Nzloc
-                        data_cmplx_out1dy(j + (i-1)*Ny + (k-1)*Ny*NxC) = data_cmplx_out1d(i + (j-1)*NxC + (k-1)*NxC*Ny)
+                        data_cmplx_out1d(i + (j-1)*NxC + (k-1)*NxC*Ny) = data_cmplx_out1dy(j + (i-1)*Ny + (k-1)*Ny*NxC)
                     end do 
                 end do 
             end do
-    
-            ! Y FFT 
 #if defined(MFC_OpenACC)
-            ierr = cufftExecZ2Z(plan_y_gpu, data_cmplx_out1dy, data_cmplx_out1dy, CUFFT_FORWARD)
+            ierr = cufftExecZ2D(plan_x_bwd_gpu, data_cmplx_out1d, data_real_in1d)
 #else
-            call fftw_execute_dft(plan_y_c2c_fwd, data_cmplx_out1dy, data_cmplx_out1dy)
-#endif 
-    
-            ! 1D y, x, z -> 3D z-slab
+            call fftw_execute_dft_c2r(plan_x_c2r_bwd, data_cmplx_out1d, data_real_in1d)
+#endif
             !$acc parallel loop collapse(3) gang vector default(present)
-            do i = 1, NxC 
+            do i = 1, Nx 
                 do j = 1, Ny 
                     do k = 1, Nzloc
-                        data_cmplx_slabz_cons(l, i, j, k) = data_cmplx_out1dy(j + (i-1)*Ny + (k-1)*Ny*NxC)
+                        data_real_3D_slabz(i, j, k) = data_real_in1d(i + (j-1)*Nx + (k-1)*Nx*Ny)
                     end do 
                 end do 
-            end do 
-            ! pack data_cmplx_slabz_cons for MPI tranpose
-        end do 
-
-        ! cons vars MPI data transpose
-        call s_mpi_transpose_slabZ2Y_cons
-
-        ! outer element loop
-        do l = 1, 5
-
-            ! 3D y-slab -> 1D z, x, y
+            end do
             !$acc parallel loop collapse(3) gang vector default(present)
-            do i = 1, NxC 
-                do j = 1, Nyloc 
-                    do k = 1, Nz
-                        data_cmplx_out1d(k + (i-1)*Nz + (j-1)*Nz*NxC) = data_cmplx_slaby_cons(l, i, j, k)
+            do i = 0, m
+                do j = 0, n
+                    do k = 0, p
+                        q_cons_filtered(l)%sf(i, j, k) = data_real_3D_slabz(i+1, j+1, k+1) / (real(Nx*Ny*Nz, dp) * filtered_fluid_indicator_function%sf(i, j, k))
                     end do 
                 end do 
             end do
+        end do
 
-            ! Z FFT
+        ! pressure
+        !$acc parallel loop collapse(3) gang vector default(present)
+        do i = 1, NxC 
+            do j = 1, Ny 
+                do k = 1, Nzloc
+                    data_cmplx_out1dy(j + (i-1)*Ny + (k-1)*Ny*NxC) = data_cmplx_slabz_batch(6, i, j, k)
+                end do 
+            end do 
+        end do
 #if defined(MFC_OpenACC)
-            ierr = cufftExecZ2Z(plan_z_gpu, data_cmplx_out1d, data_cmplx_out1d, CUFFT_FORWARD)
+        ierr = cufftExecZ2Z(plan_y_gpu, data_cmplx_out1dy, data_cmplx_out1dy, CUFFT_INVERSE)
 #else
-            call fftw_execute_dft(plan_z_c2c_fwd, data_cmplx_out1d, data_cmplx_out1d)
+        call fftw_execute_dft(plan_y_c2c_bwd, data_cmplx_out1dy, data_cmplx_out1dy)
 #endif
-            
-            ! convolution with filtering kernel in Fourier space
-            !$acc parallel loop collapse(3) gang vector default(present)
-            do i = 1, NxC 
-                do j = 1, Nyloc 
-                    do k = 1, Nz 
-                        data_cmplx_out1d(k + (i-1)*Nz + (j-1)*Nz*NxC) = data_cmplx_out1d(k + (i-1)*Nz + (j-1)*Nz*NxC) * cmplx_kernelG1d(k + (i-1)*Nz + (j-1)*Nz*NxC)
-                    end do 
+        !$acc parallel loop collapse(3) gang vector default(present)
+        do i = 1, NxC 
+            do j = 1, Ny 
+                do k = 1, Nzloc
+                    data_cmplx_out1d(i + (j-1)*NxC + (k-1)*NxC*Ny) = data_cmplx_out1dy(j + (i-1)*Ny + (k-1)*Ny*NxC)
                 end do 
-            end do
-
-            ! ===== begin backward FFT =====
-            ! Z inv FFT 
+            end do 
+        end do
 #if defined(MFC_OpenACC)
-            ierr = cufftExecZ2Z(plan_z_gpu, data_cmplx_out1d, data_cmplx_out1d, CUFFT_INVERSE)
+        ierr = cufftExecZ2D(plan_x_bwd_gpu, data_cmplx_out1d, data_real_in1d)
 #else
-            call fftw_execute_dft(plan_z_c2c_bwd, data_cmplx_out1d, data_cmplx_out1d)
+        call fftw_execute_dft_c2r(plan_x_c2r_bwd, data_cmplx_out1d, data_real_in1d)
 #endif
-
-            ! 1D z, x, y -> 3D y-slab
-            !$acc parallel loop collapse(3) gang vector default(present)
-            do i = 1, NxC 
-                do j = 1, Nyloc 
-                    do k = 1, Nz 
-                        data_cmplx_slaby_cons(l, i, j, k) = data_cmplx_out1d(k + (i-1)*Nz + (j-1)*Nz*NxC)
-                    end do 
+        !$acc parallel loop collapse(3) gang vector default(present)
+        do i = 1, Nx 
+            do j = 1, Ny 
+                do k = 1, Nzloc
+                    data_real_3D_slabz(i, j, k) = data_real_in1d(i + (j-1)*Nx + (k-1)*Nx*Ny)
                 end do 
-            end do
-            ! pack data_cmplx_slaby_cons for MPI tranpose
+            end do 
         end do
-
-        call s_mpi_transpose_slabY2Z_cons
-
-        ! outer element loop
-        do l = 1, 5
-            
-            ! 3D z-slab -> 1D y, x, z
-            !$acc parallel loop collapse(3) gang vector default(present)
-            do i = 1, NxC 
-                do j = 1, Ny 
-                    do k = 1, Nzloc
-                        data_cmplx_out1dy(j + (i-1)*Ny + (k-1)*Ny*NxC) = data_cmplx_slabz_cons(l, i, j, k)
-                    end do 
+        !$acc parallel loop collapse(3) gang vector default(present)
+        do i = 0, m
+            do j = 0, n
+                do k = 0, p
+                    filtered_pressure%sf(i, j, k) = data_real_3D_slabz(i+1, j+1, k+1) / (real(Nx*Ny*Nz, dp) * filtered_fluid_indicator_function%sf(i, j, k))
                 end do 
-            end do
+            end do 
+        end do
 
-            ! Y inv FFT 
+        ! reynolds stress
+        do l = 1, 3
+            do q = 1, 3
+                !$acc parallel loop collapse(3) gang vector default(present)
+                do i = 1, NxC 
+                    do j = 1, Ny 
+                        do k = 1, Nzloc
+                            data_cmplx_out1dy(j + (i-1)*Ny + (k-1)*Ny*NxC) = data_cmplx_slabz_batch(6 + 3*(l-1) + q, i, j, k)
+                        end do 
+                    end do 
+                end do
 #if defined(MFC_OpenACC)
-            ierr = cufftExecZ2Z(plan_y_gpu, data_cmplx_out1dy, data_cmplx_out1dy, CUFFT_INVERSE)
+                ierr = cufftExecZ2Z(plan_y_gpu, data_cmplx_out1dy, data_cmplx_out1dy, CUFFT_INVERSE)
 #else
-            call fftw_execute_dft(plan_y_c2c_bwd, data_cmplx_out1dy, data_cmplx_out1dy)
+                call fftw_execute_dft(plan_y_c2c_bwd, data_cmplx_out1dy, data_cmplx_out1dy)
 #endif
-
-            ! 1D y, x, z -> 1D x, y, z 
-            !$acc parallel loop collapse(3) gang vector default(present)
-            do i = 1, NxC 
-                do j = 1, Ny 
-                    do k = 1, Nzloc
-                        data_cmplx_out1d(i + (j-1)*NxC + (k-1)*NxC*Ny) = data_cmplx_out1dy(j + (i-1)*Ny + (k-1)*Ny*NxC)
+                !$acc parallel loop collapse(3) gang vector default(present)
+                do i = 1, NxC 
+                    do j = 1, Ny 
+                        do k = 1, Nzloc
+                            data_cmplx_out1d(i + (j-1)*NxC + (k-1)*NxC*Ny) = data_cmplx_out1dy(j + (i-1)*Ny + (k-1)*Ny*NxC)
+                        end do 
                     end do 
-                end do 
-            end do
-
-            ! X inv FFT
+                end do
 #if defined(MFC_OpenACC)
-            ierr = cufftExecZ2D(plan_x_bwd_gpu, data_cmplx_out1d, data_real_in1d)
+                ierr = cufftExecZ2D(plan_x_bwd_gpu, data_cmplx_out1d, data_real_in1d)
 #else
-            call fftw_execute_dft_c2r(plan_x_c2r_bwd, data_cmplx_out1d, data_real_in1d)
+                call fftw_execute_dft_c2r(plan_x_c2r_bwd, data_cmplx_out1d, data_real_in1d)
 #endif
-
-            ! 1D x, y, z -> 3D z-slab
-            !$acc parallel loop collapse(3) gang vector default(present)
-            do i = 1, Nx 
-                do j = 1, Ny 
-                    do k = 1, Nzloc
-                        data_real_3D_slabz(i, j, k) = data_real_in1d(i + (j-1)*Nx + (k-1)*Nx*Ny)
+                !$acc parallel loop collapse(3) gang vector default(present)
+                do i = 1, Nx 
+                    do j = 1, Ny 
+                        do k = 1, Nzloc
+                            data_real_3D_slabz(i, j, k) = data_real_in1d(i + (j-1)*Nx + (k-1)*Nx*Ny)
+                        end do 
                     end do 
-                end do 
+                end do
+                !$acc parallel loop collapse(3) gang vector default(present)
+                do i = 0, m
+                    do j = 0, n
+                        do k = 0, p
+                            reynolds_stress(l)%vf(q)%sf(i, j, k) = data_real_3D_slabz(i+1, j+1, k+1) / (real(Nx*Ny*Nz, dp) * filtered_fluid_indicator_function%sf(i, j, k))
+                        end do 
+                    end do 
+                end do
             end do
+        end do
 
-            !$acc parallel loop collapse(3) gang vector default(present)
-            do i = 0, m
-                do j = 0, n
-                    do k = 0, p
-                        q_cons_filtered(l)%sf(i, j, k) = data_real_3D_slabz(i+1, j+1, k+1) / (real(Nx*Ny*Nz, dp) * filtered_fluid_indicator_function%sf(i, j, k))
+        ! effective viscosity
+        do l = 1, 3
+            do q = 1, 3
+                !$acc parallel loop collapse(3) gang vector default(present)
+                do i = 1, NxC 
+                    do j = 1, Ny 
+                        do k = 1, Nzloc
+                            data_cmplx_out1dy(j + (i-1)*Ny + (k-1)*Ny*NxC) = data_cmplx_slabz_batch(15 + 3*(l-1) + q, i, j, k)
+                        end do 
                     end do 
-                end do 
+                end do
+#if defined(MFC_OpenACC)
+                ierr = cufftExecZ2Z(plan_y_gpu, data_cmplx_out1dy, data_cmplx_out1dy, CUFFT_INVERSE)
+#else
+                call fftw_execute_dft(plan_y_c2c_bwd, data_cmplx_out1dy, data_cmplx_out1dy)
+#endif
+                !$acc parallel loop collapse(3) gang vector default(present)
+                do i = 1, NxC 
+                    do j = 1, Ny 
+                        do k = 1, Nzloc
+                            data_cmplx_out1d(i + (j-1)*NxC + (k-1)*NxC*Ny) = data_cmplx_out1dy(j + (i-1)*Ny + (k-1)*Ny*NxC)
+                        end do 
+                    end do 
+                end do
+#if defined(MFC_OpenACC)
+                ierr = cufftExecZ2D(plan_x_bwd_gpu, data_cmplx_out1d, data_real_in1d)
+#else
+                call fftw_execute_dft_c2r(plan_x_c2r_bwd, data_cmplx_out1d, data_real_in1d)
+#endif
+                !$acc parallel loop collapse(3) gang vector default(present)
+                do i = 1, Nx 
+                    do j = 1, Ny 
+                        do k = 1, Nzloc
+                            data_real_3D_slabz(i, j, k) = data_real_in1d(i + (j-1)*Nx + (k-1)*Nx*Ny)
+                        end do 
+                    end do 
+                end do
+                !$acc parallel loop collapse(3) gang vector default(present)
+                do i = 0, m
+                    do j = 0, n
+                        do k = 0, p
+                            eff_visc(l)%vf(q)%sf(i, j, k) = data_real_3D_slabz(i+1, j+1, k+1) / (real(Nx*Ny*Nz, dp) * filtered_fluid_indicator_function%sf(i, j, k))
+                        end do 
+                    end do 
+                end do
             end do
-
         end do
 
-    end subroutine s_filter_cons_vars
+    end subroutine s_filter_batch
+
 
     !< compute forward FFT, input: data_real_3D_slabz, output: data_cmplx_out1d
     subroutine s_mpi_FFT_fwd
@@ -1850,12 +1914,10 @@ contains
         @:DEALLOCATE(data_real_in1d, data_cmplx_out1d, data_cmplx_out1dy)
         @:DEALLOCATE(cmplx_kernelG1d, real_kernelG_in)
         @:DEALLOCATE(data_real_3D_slabz, data_cmplx_slabz, data_cmplx_slaby)
-        @:DEALLOCATE(data_cmplx_slabz_tensor, data_cmplx_slaby_tensor)
-        @:DEALLOCATE(data_cmplx_slabz_cons, data_cmplx_slaby_cons)
+        @:DEALLOCATE(data_cmplx_slabz_batch, data_cmplx_slaby_batch)
         
         @:DEALLOCATE(sendbuf_sf, recvbuf_sf)
-        @:DEALLOCATE(sendbuf_tensor, recvbuf_tensor)
-        @:DEALLOCATE(sendbuf_cons, recvbuf_cons)
+        @:DEALLOCATE(sendbuf_batch, recvbuf_batch)
 
 #if defined(MFC_OpenACC)
         ierr = cufftDestroy(plan_x_fwd_gpu)

From f5731937f622169b40daa7e210c7616bcb5ff1ee Mon Sep 17 00:00:00 2001
From: Conrad Delgado <conradd3@dt-login04.delta.ncsa.illinois.edu>
Date: Wed, 8 Oct 2025 18:17:26 -0500
Subject: [PATCH 25/30] energy forcing

---
 src/simulation/m_additional_forcing.fpp | 26 ++++++++++++++-----------
 src/simulation/m_global_parameters.fpp  |  6 +++---
 src/simulation/m_mpi_proxy.fpp          |  2 +-
 src/simulation/m_start_up.fpp           |  4 ++--
 toolchain/mfc/run/case_dicts.py         |  2 +-
 5 files changed, 22 insertions(+), 18 deletions(-)

diff --git a/src/simulation/m_additional_forcing.fpp b/src/simulation/m_additional_forcing.fpp
index b3b6807b55..a99dc186a6 100644
--- a/src/simulation/m_additional_forcing.fpp
+++ b/src/simulation/m_additional_forcing.fpp
@@ -20,11 +20,11 @@ module m_additional_forcing
     type(scalar_field), allocatable, dimension(:) :: q_periodic_force
     real(wp) :: volfrac_phi
     integer :: N_x_total_glb
-    real(wp) :: spatial_rho, spatial_u
-    real(wp) :: phase_rho, phase_u
+    real(wp) :: spatial_rho, spatial_u, spatial_E
+    real(wp) :: phase_rho, phase_u, phase_E
 
     !$acc declare create(q_periodic_force, volfrac_phi, N_x_total_glb)
-    !$acc declare create(spatial_rho, spatial_u, phase_rho, phase_u)
+    !$acc declare create(spatial_rho, spatial_u, spatial_E, phase_rho, phase_u, phase_E)
 
 contains
 
@@ -65,35 +65,39 @@ contains
     subroutine s_compute_periodic_forcing(q_cons_vf, t_step)
         type(scalar_field), dimension(sys_size), intent(in) :: q_cons_vf
         integer, intent(in) :: t_step
-        real(wp) :: spatial_rho_glb, spatial_u_glb
+        real(wp) :: spatial_rho_glb, spatial_u_glb, spatial_E_glb
         integer :: i, j, k
 
         ! zero spatial averages
         spatial_rho = 0._wp
         spatial_u = 0._wp
-        !$acc update device(spatial_rho, spatial_u)
+        spatial_E = 0._wp
+        !$acc update device(spatial_rho, spatial_u, spatial_E)
 
         ! compute spatial averages
-        !$acc parallel loop collapse(3) gang vector default(present) reduction(+:spatial_rho, spatial_u)
+        !$acc parallel loop collapse(3) gang vector default(present) reduction(+:spatial_rho, spatial_u, spatial_E)
         do i = 0, m 
             do j = 0, n 
                 do k = 0, p 
                     spatial_rho = spatial_rho + q_cons_vf(1)%sf(i, j, k) * fluid_indicator_function%sf(i, j, k) ! rho
-                    spatial_u = spatial_u + q_cons_vf(2)%sf(i, j, k) * fluid_indicator_function%sf(i, j, k) ! u
+                    spatial_u = spatial_u + q_cons_vf(2)%sf(i, j, k) * fluid_indicator_function%sf(i, j, k) ! rho*u
+                    spatial_E = spatial_E + q_cons_vf(5)%sf(i, j, k)* fluid_indicator_function%sf(i, j, k) ! E 
                 end do
             end do
         end do
 
-        !$acc update host(spatial_rho, spatial_u)
+        !$acc update host(spatial_rho, spatial_u, spatial_E)
 
         ! reduction sum across entire domain
         call s_mpi_allreduce_sum(spatial_rho, spatial_rho_glb)
         call s_mpi_allreduce_sum(spatial_u, spatial_u_glb)
-
+        call s_mpi_allreduce_sum(spatial_E, spatial_E_glb)
+        
         ! compute phase averages
         phase_rho = phase_rho + (spatial_rho_glb / real(N_x_total_glb, wp) - phase_rho) / real(t_step, wp)
         phase_u = phase_u + (spatial_u_glb / real(N_x_total_glb, wp) - phase_u) / real(t_step, wp)
-        !$acc update device(phase_rho, phase_u)
+        phase_E = phase_E + (spatial_E_glb / real(N_x_total_glb, wp) - phase_E) / real(t_step, wp)
+        !$acc update device(phase_rho, phase_u, phase_E)
 
         ! compute periodic forcing terms for mass, momentum, energy
         !$acc parallel loop collapse(3) gang vector default(present)
@@ -107,7 +111,7 @@ contains
                     q_periodic_force(2)%sf(i, j, k) = (rho_inf_ref*u_inf_ref - phase_u/(1._wp - volfrac_phi)) / dt
 
                     ! u*f_u
-                    q_periodic_force(3)%sf(i, j, k) = q_cons_vf(2)%sf(i, j, k)/q_cons_vf(1)%sf(i, j, k) * q_periodic_force(2)%sf(i, j, k)
+                    q_periodic_force(3)%sf(i, j, k) = (P_inf_ref*gammas(1) + 0.5_wp*rho_inf_ref*u_inf_ref**2 - phase_E/(1._wp - volfrac_phi)) / dt
                 end do 
             end do
         end do
diff --git a/src/simulation/m_global_parameters.fpp b/src/simulation/m_global_parameters.fpp
index 50590a26fe..276c3287fd 100644
--- a/src/simulation/m_global_parameters.fpp
+++ b/src/simulation/m_global_parameters.fpp
@@ -505,7 +505,7 @@ module m_global_parameters
     logical :: compute_particle_drag
     real(wp) :: u_inf_ref !< reference freestream velocity
     real(wp) :: rho_inf_ref !< reference freestream density 
-    real(wp) :: T_inf_ref !< reference freestream temperature
+    real(wp) :: P_inf_ref !< reference freestream pressure
     logical :: periodic_forcing
     logical :: volume_filtering_momentum_eqn
     logical :: store_levelset
@@ -515,7 +515,7 @@ module m_global_parameters
     real(wp) :: filter_width
     logical :: q_filtered_wrt
 
-    !$acc declare create(u_inf_ref, rho_inf_ref, T_inf_ref, filter_width)
+    !$acc declare create(u_inf_ref, rho_inf_ref, P_inf_ref, filter_width)
 
 contains
 
@@ -796,7 +796,7 @@ contains
         compute_particle_drag = .false.
         u_inf_ref = dflt_real
         rho_inf_ref = dflt_real
-        T_inf_ref = dflt_real
+        P_inf_ref = dflt_real
         periodic_forcing = .false.
         volume_filtering_momentum_eqn = .false.
         store_levelset = .true.
diff --git a/src/simulation/m_mpi_proxy.fpp b/src/simulation/m_mpi_proxy.fpp
index bac8259b81..6af2a0363e 100644
--- a/src/simulation/m_mpi_proxy.fpp
+++ b/src/simulation/m_mpi_proxy.fpp
@@ -134,7 +134,7 @@ contains
             & 'z_domain%beg', 'z_domain%end', 'x_a', 'x_b', 'y_a', 'y_b', 'z_a', &
             & 'z_b', 't_stop', 't_save', 'cfl_target', 'rkck_tolerance', 'Bx0',  &
             & 'tau_star', 'cont_damage_s', 'alpha_bar', 'u_inf_ref',  & 
-            & 'rho_inf_ref', 'T_inf_ref', 'filter_width' ]
+            & 'rho_inf_ref', 'P_inf_ref', 'filter_width' ]
             call MPI_BCAST(${VAR}$, 1, mpi_p, 0, MPI_COMM_WORLD, ierr)
         #:endfor
 
diff --git a/src/simulation/m_start_up.fpp b/src/simulation/m_start_up.fpp
index d83d73e9d3..ae76d86d40 100644
--- a/src/simulation/m_start_up.fpp
+++ b/src/simulation/m_start_up.fpp
@@ -187,7 +187,7 @@ contains
             rkck_adap_dt, rkck_tolerance, &
             hyperelasticity, R0ref, num_bc_patches, Bx0, powell, &
             cont_damage, tau_star, cont_damage_s, alpha_bar, & 
-            periodic_ibs, compute_particle_drag, u_inf_ref, rho_inf_ref, T_inf_ref, & 
+            periodic_ibs, compute_particle_drag, u_inf_ref, rho_inf_ref, P_inf_ref, & 
             periodic_forcing, volume_filtering_momentum_eqn, store_levelset, & 
             slab_domain_decomposition, compute_autocorrelation, t_step_stat_start, & 
             filter_width, q_filtered_wrt
@@ -1755,7 +1755,7 @@ contains
             !$acc update device(ib_markers%sf)
         end if
 
-        !$acc update device(u_inf_ref, rho_inf_ref, T_inf_ref, filter_width)
+        !$acc update device(u_inf_ref, rho_inf_ref, P_inf_ref, filter_width)
 
     end subroutine s_initialize_gpu_vars
 
diff --git a/toolchain/mfc/run/case_dicts.py b/toolchain/mfc/run/case_dicts.py
index b8ac4ba7c7..47357896ec 100644
--- a/toolchain/mfc/run/case_dicts.py
+++ b/toolchain/mfc/run/case_dicts.py
@@ -303,7 +303,7 @@ def analytic(self):
     'compute_particle_drag': ParamType.LOG,
     'u_inf_ref': ParamType.REAL,
     'rho_inf_ref': ParamType.REAL,
-    'T_inf_ref': ParamType.REAL,
+    'P_inf_ref': ParamType.REAL,
     'periodic_forcing': ParamType.LOG,
     'volume_filtering_momentum_eqn': ParamType.LOG,
     'compute_autocorrelation': ParamType.LOG,

From d3142618269ca9bbf996cd95b4326df3a326ff95 Mon Sep 17 00:00:00 2001
From: conradd3 <conradd3@illinois.edu>
Date: Tue, 21 Oct 2025 11:52:05 -0500
Subject: [PATCH 26/30] div stress tensor fix

---
 src/simulation/m_volume_filtering.fpp | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/src/simulation/m_volume_filtering.fpp b/src/simulation/m_volume_filtering.fpp
index 3cf904d84d..f2d31374fb 100644
--- a/src/simulation/m_volume_filtering.fpp
+++ b/src/simulation/m_volume_filtering.fpp
@@ -764,16 +764,16 @@ contains
             do j = 0, n 
                 do k = 0, p
                     div_stress_tensor(1)%sf(i, j, k) = (stress_tensor(1)%vf(1)%sf(i+1, j, k) - stress_tensor(1)%vf(1)%sf(i-1, j, k)) / (dx(i-1) + dx(i+1)) &
-                                                     + (stress_tensor(2)%vf(1)%sf(i, j+1, k) - stress_tensor(2)%vf(1)%sf(i, j-1, k)) / (dy(j-1) + dy(j+1)) &
-                                                     + (stress_tensor(3)%vf(1)%sf(i, j, k+1) - stress_tensor(3)%vf(1)%sf(i, j, k-1)) / (dz(k-1) + dz(k+1))
+                                                     + (stress_tensor(1)%vf(2)%sf(i, j+1, k) - stress_tensor(1)%vf(2)%sf(i, j-1, k)) / (dy(j-1) + dy(j+1)) &
+                                                     + (stress_tensor(1)%vf(3)%sf(i, j, k+1) - stress_tensor(1)%vf(3)%sf(i, j, k-1)) / (dz(k-1) + dz(k+1))
 
-                    div_stress_tensor(2)%sf(i, j, k) = (stress_tensor(1)%vf(2)%sf(i+1, j, k) - stress_tensor(1)%vf(2)%sf(i-1, j, k)) / (dx(i-1) + dx(i+1)) & 
+                    div_stress_tensor(2)%sf(i, j, k) = (stress_tensor(2)%vf(1)%sf(i+1, j, k) - stress_tensor(2)%vf(1)%sf(i-1, j, k)) / (dx(i-1) + dx(i+1)) & 
                                                      + (stress_tensor(2)%vf(2)%sf(i, j+1, k) - stress_tensor(2)%vf(2)%sf(i, j-1, k)) / (dy(j-1) + dy(j+1)) & 
-                                                     + (stress_tensor(3)%vf(2)%sf(i, j, k+1) - stress_tensor(3)%vf(2)%sf(i, j, k-1)) / (dz(k-1) + dz(k+1))
+                                                     + (stress_tensor(2)%vf(3)%sf(i, j, k+1) - stress_tensor(2)%vf(3)%sf(i, j, k-1)) / (dz(k-1) + dz(k+1))
 
-                    div_stress_tensor(3)%sf(i, j, k) = (stress_tensor(1)%vf(3)%sf(i+1, j, k) - stress_tensor(1)%vf(3)%sf(i-1, j, k)) / (dx(i-1) + dx(i+1)) & 
-                                                     + (stress_tensor(2)%vf(3)%sf(i, j+1, k) - stress_tensor(2)%vf(3)%sf(i, j-1, k)) / (dy(j-1) + dy(j+1)) & 
-                                                     + (stress_tensor(3)%vf(3)%sf(i, j, k+1) - stress_tensor(3)%vf(3)%sf(i, j ,k-1)) / (dz(k-1) + dz(k+1))
+                    div_stress_tensor(3)%sf(i, j, k) = (stress_tensor(3)%vf(1)%sf(i+1, j, k) - stress_tensor(3)%vf(1)%sf(i-1, j, k)) / (dx(i-1) + dx(i+1)) & 
+                                                     + (stress_tensor(3)%vf(2)%sf(i, j+1, k) - stress_tensor(3)%vf(2)%sf(i, j-1, k)) / (dy(j-1) + dy(j+1)) & 
+                                                     + (stress_tensor(3)%vf(3)%sf(i, j, k+1) - stress_tensor(3)%vf(3)%sf(i, j, k-1)) / (dz(k-1) + dz(k+1))
                 end do 
             end do 
         end do
@@ -972,11 +972,11 @@ contains
                 do k = 0, p
                     dvol = dx(i) * dy(j) * dz(k)
                     !$acc atomic
-                    particle_forces(ib_markers%sf(i, j, k), 1) = particle_forces(ib_markers%sf(i, j, k), 1) - div_pres_visc_stress(1)%sf(i, j, k) * dvol
+                    particle_forces(ib_markers%sf(i, j, k), 1) = particle_forces(ib_markers%sf(i, j, k), 1) - (div_pres_visc_stress(1)%sf(i, j, k) * dvol)
                     !$acc atomic
-                    particle_forces(ib_markers%sf(i, j, k), 2) = particle_forces(ib_markers%sf(i, j, k), 2) - div_pres_visc_stress(2)%sf(i, j, k) * dvol
+                    particle_forces(ib_markers%sf(i, j, k), 2) = particle_forces(ib_markers%sf(i, j, k), 2) - (div_pres_visc_stress(2)%sf(i, j, k) * dvol)
                     !$acc atomic
-                    particle_forces(ib_markers%sf(i, j, k), 3) = particle_forces(ib_markers%sf(i, j, k), 3) - div_pres_visc_stress(3)%sf(i, j, k) * dvol
+                    particle_forces(ib_markers%sf(i, j, k), 3) = particle_forces(ib_markers%sf(i, j, k), 3) - (div_pres_visc_stress(3)%sf(i, j, k) * dvol)
                 end do 
             end do 
         end do
@@ -998,6 +998,7 @@ contains
         ! write particle forces to file
         if (proc_rank == 0) then
             write(100) force_glb
+            flush(100)
         end if
             
     end subroutine s_compute_particle_forces

From 646831b8795838eb28720e8dba0dd7ff605b7a04 Mon Sep 17 00:00:00 2001
From: conradd3 <conradd3@illinois.edu>
Date: Tue, 4 Nov 2025 19:49:00 -0600
Subject: [PATCH 27/30] src/common

---
 src/post_process/m_data_input.f90        |  8 ++---
 src/post_process/m_global_parameters.fpp | 10 +++----
 src/simulation/m_data_output.fpp         | 18 +++++------
 src/simulation/m_ibm.fpp                 | 38 ++++++++++++------------
 src/simulation/m_volume_filtering.fpp    | 14 ++++-----
 5 files changed, 44 insertions(+), 44 deletions(-)

diff --git a/src/post_process/m_data_input.f90 b/src/post_process/m_data_input.f90
index 6b90daec08..72c200a15c 100644
--- a/src/post_process/m_data_input.f90
+++ b/src/post_process/m_data_input.f90
@@ -674,7 +674,7 @@ impure subroutine s_read_parallel_filtered_data(t_step, m_MOK, n_MOK, p_MOK, WP_
 
             ! Initialize MPI data I/O
 
-            call s_initialize_mpi_data_filtered(filtered_fluid_indicator_function, & 
+            call s_initialize_mpi_data_filtered(filtered_fluid_indicator_function, &
                                                 stat_q_cons_filtered, stat_filtered_pressure, &
                                                 stat_reynolds_stress, stat_eff_visc, stat_int_mom_exch)
 
@@ -691,7 +691,7 @@ impure subroutine s_read_parallel_filtered_data(t_step, m_MOK, n_MOK, p_MOK, WP_
             WP_MOK = int(8._wp, MPI_OFFSET_KIND)
             MOK = int(1._wp, MPI_OFFSET_KIND)
             str_MOK = int(name_len, MPI_OFFSET_KIND)
-            NVARS_MOK = int(alt_sys, MPI_OFFSET_KIND) 
+            NVARS_MOK = int(alt_sys, MPI_OFFSET_KIND)
 
             call s_setup_mpi_io_params(data_size, m_MOK, n_MOK, p_MOK, WP_MOK, MOK, str_MOK, NVARS_MOK)
 
@@ -703,9 +703,9 @@ impure subroutine s_read_parallel_filtered_data(t_step, m_MOK, n_MOK, p_MOK, WP_
                 disp = m_MOK*max(MOK, n_MOK)*max(MOK, p_MOK)*WP_MOK*(var_MOK - 1)
 
                 call MPI_FILE_SET_VIEW(ifile, disp, mpi_p, MPI_IO_DATA%view(i), &
-                                        'native', mpi_info_int, ierr)
+                                       'native', mpi_info_int, ierr)
                 call MPI_FILE_READ_ALL(ifile, MPI_IO_DATA%var(i)%sf, data_size, &
-                                        mpi_p, status, ierr)
+                                       mpi_p, status, ierr)
             end do
 
             call s_mpi_barrier()
diff --git a/src/post_process/m_global_parameters.fpp b/src/post_process/m_global_parameters.fpp
index 0108979c0a..930123a6d9 100644
--- a/src/post_process/m_global_parameters.fpp
+++ b/src/post_process/m_global_parameters.fpp
@@ -842,9 +842,9 @@ contains
 
 #ifdef MFC_MPI
         if (q_filtered_wrt) then
-            allocate (MPI_IO_DATA%view(1:sys_size+1+4*9+4*9+3*4+6*4))
-            allocate (MPI_IO_DATA%var (1:sys_size+1+4*9+4*9+3*4+6*4))
-            do i = 1, sys_size+1+4*9+4*9+3*4+6*4
+            allocate (MPI_IO_DATA%view(1:sys_size + 1 + 4*9 + 4*9 + 3*4 + 6*4))
+            allocate (MPI_IO_DATA%var(1:sys_size + 1 + 4*9 + 4*9 + 3*4 + 6*4))
+            do i = 1, sys_size + 1 + 4*9 + 4*9 + 3*4 + 6*4
                 allocate (MPI_IO_DATA%var(i)%sf(0:m, 0:n, 0:p))
                 MPI_IO_DATA%var(i)%sf => null()
             end do
@@ -1034,8 +1034,8 @@ contains
                 MPI_IO_DATA%var(i)%sf => null()
             end do
 
-            if (q_filtered_wrt) then 
-                do i = sys_size+1, sys_size+1+4*9+4*9+3*4+6*4
+            if (q_filtered_wrt) then
+                do i = sys_size + 1, sys_size + 1 + 4*9 + 4*9 + 3*4 + 6*4
                     MPI_IO_DATA%var(i)%sf => null()
                 end do
             end if
diff --git a/src/simulation/m_data_output.fpp b/src/simulation/m_data_output.fpp
index 8e2eeff299..c2568ab415 100644
--- a/src/simulation/m_data_output.fpp
+++ b/src/simulation/m_data_output.fpp
@@ -82,9 +82,9 @@ contains
         !! @param q_cons_vf Conservative variables
         !! @param q_prim_vf Primitive variables
         !! @param t_step Current time step
-    impure subroutine s_write_data_files(q_cons_vf, q_T_sf, q_prim_vf, t_step, bc_type, beta, & 
-                                         filtered_fluid_indicator_function, & 
-                                         stat_q_cons_filtered, stat_filtered_pressure, & 
+    impure subroutine s_write_data_files(q_cons_vf, q_T_sf, q_prim_vf, t_step, bc_type, beta, &
+                                         filtered_fluid_indicator_function, &
+                                         stat_q_cons_filtered, stat_filtered_pressure, &
                                          stat_reynolds_stress, stat_eff_visc, stat_int_mom_exch)
 
         type(scalar_field), &
@@ -117,9 +117,9 @@ contains
         if (.not. parallel_io) then
             call s_write_serial_data_files(q_cons_vf, q_T_sf, q_prim_vf, t_step, bc_type, beta)
         else
-            call s_write_parallel_data_files(q_cons_vf, t_step, bc_type, beta, & 
-                                             filtered_fluid_indicator_function, & 
-                                             stat_q_cons_filtered, stat_filtered_pressure, & 
+            call s_write_parallel_data_files(q_cons_vf, t_step, bc_type, beta, &
+                                             filtered_fluid_indicator_function, &
+                                             stat_q_cons_filtered, stat_filtered_pressure, &
                                              stat_reynolds_stress, stat_eff_visc, stat_int_mom_exch)
         end if
 
@@ -798,7 +798,7 @@ contains
         !!  @param beta Eulerian void fraction from lagrangian bubbles
     impure subroutine s_write_parallel_data_files(q_cons_vf, t_step, bc_type, beta, &
                                                   filtered_fluid_indicator_function, &
-                                                  stat_q_cons_filtered, stat_filtered_pressure, & 
+                                                  stat_q_cons_filtered, stat_filtered_pressure, &
                                                   stat_reynolds_stress, stat_eff_visc, stat_int_mom_exch)
 
         type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf
@@ -957,8 +957,8 @@ contains
                 call s_initialize_mpi_data(q_cons_vf, ib_markers, levelset, levelset_norm)
                 if (q_filtered_wrt .and. (t_step == 0 .or. t_step == t_step_stop)) then
                     call s_initialize_mpi_data_filtered(filtered_fluid_indicator_function, &
-                                                        stat_q_cons_filtered, stat_filtered_pressure, & 
-                                                        stat_reynolds_stress, stat_eff_visc, stat_int_mom_exch)                    
+                                                        stat_q_cons_filtered, stat_filtered_pressure, &
+                                                        stat_reynolds_stress, stat_eff_visc, stat_int_mom_exch)
                 end if
             elseif (present(beta)) then
                 call s_initialize_mpi_data(q_cons_vf, beta=beta)
diff --git a/src/simulation/m_ibm.fpp b/src/simulation/m_ibm.fpp
index 559c2088e4..7413fa48e9 100644
--- a/src/simulation/m_ibm.fpp
+++ b/src/simulation/m_ibm.fpp
@@ -430,7 +430,7 @@ contains
 
             ! Calculate and store the precise location of the image point
             patch_id = gp%ib_patch_id
-            if (store_levelset) then 
+            if (store_levelset) then
                 dist = abs(levelset%sf(i, j, k, patch_id))
                 norm(:) = levelset_norm%sf(i, j, k, patch_id, :)
             else ! compute levelset and levelset_norm on the fly
@@ -446,54 +446,54 @@ contains
                 if (periodic_ibs) then
                     if ((x_centroid - x_domain_beg_glb) <= radius) then
                         x_pcen = x_domain_end_glb + (x_centroid - x_domain_beg_glb)
-                    else if ((x_domain_end_glb - x_centroid) <= radius) then 
+                    else if ((x_domain_end_glb - x_centroid) <= radius) then
                         x_pcen = x_domain_beg_glb - (x_domain_end_glb - x_centroid)
-                    else 
+                    else
                         x_pcen = x_centroid
                     end if
                     if ((y_centroid - y_domain_beg_glb) <= radius) then
                         y_pcen = y_domain_end_glb + (y_centroid - y_domain_beg_glb)
-                    else if ((y_domain_end_glb - y_centroid) <= radius) then 
+                    else if ((y_domain_end_glb - y_centroid) <= radius) then
                         y_pcen = y_domain_beg_glb - (y_domain_end_glb - y_centroid)
-                    else 
+                    else
                         y_pcen = y_centroid
                     end if
                     if ((z_centroid - z_domain_beg_glb) <= radius) then
                         z_pcen = z_domain_end_glb + (z_centroid - z_domain_beg_glb)
-                    else if ((z_domain_end_glb - z_centroid) <= radius) then 
+                    else if ((z_domain_end_glb - z_centroid) <= radius) then
                         z_pcen = z_domain_beg_glb - (z_domain_end_glb - z_centroid)
-                    else 
+                    else
                         z_pcen = z_centroid
                     end if
-                    dist_vec_per(1, 1) = x_cc(i) - x_pcen 
+                    dist_vec_per(1, 1) = x_cc(i) - x_pcen
                     dist_vec_per(1, 2) = y_cc(j) - y_pcen
                     dist_vec_per(1, 3) = z_cc(k) - z_pcen
                     dist_per(1) = sqrt(sum(dist_vec_per(1, :)**2))
-                    if (dist_per(1) < dist_calc) then    
+                    if (dist_per(1) < dist_calc) then
                         dist_calc = dist_per(1)
                         dist_vec = dist_vec_per(1, :)
-                    end if 
-                    dist_vec_per(2, 1) = x_cc(i) - x_pcen 
+                    end if
+                    dist_vec_per(2, 1) = x_cc(i) - x_pcen
                     dist_vec_per(2, 2) = y_cc(j) - y_centroid
                     dist_vec_per(2, 3) = z_cc(k) - z_pcen
                     dist_per(2) = sqrt(sum(dist_vec_per(2, :)**2))
-                    if (dist_per(2) < dist_calc) then    
+                    if (dist_per(2) < dist_calc) then
                         dist_calc = dist_per(2)
                         dist_vec = dist_vec_per(2, :)
                     end if
-                    dist_vec_per(3, 1) = x_cc(i) - x_pcen 
+                    dist_vec_per(3, 1) = x_cc(i) - x_pcen
                     dist_vec_per(3, 2) = y_cc(j) - y_pcen
                     dist_vec_per(3, 3) = z_cc(k) - z_centroid
                     dist_per(3) = sqrt(sum(dist_vec_per(3, :)**2))
-                    if (dist_per(3) < dist_calc) then    
+                    if (dist_per(3) < dist_calc) then
                         dist_calc = dist_per(3)
                         dist_vec = dist_vec_per(3, :)
                     end if
-                    dist_vec_per(4, 1) = x_cc(i) - x_pcen 
+                    dist_vec_per(4, 1) = x_cc(i) - x_pcen
                     dist_vec_per(4, 2) = y_cc(j) - y_centroid
                     dist_vec_per(4, 3) = z_cc(k) - z_centroid
                     dist_per(4) = sqrt(sum(dist_vec_per(4, :)**2))
-                    if (dist_per(4) < dist_calc) then    
+                    if (dist_per(4) < dist_calc) then
                         dist_calc = dist_per(4)
                         dist_vec = dist_vec_per(4, :)
                     end if
@@ -501,7 +501,7 @@ contains
                     dist_vec_per(5, 2) = y_cc(j) - y_pcen
                     dist_vec_per(5, 3) = z_cc(k) - z_pcen
                     dist_per(5) = sqrt(sum(dist_vec_per(5, :)**2))
-                    if (dist_per(5) < dist_calc) then    
+                    if (dist_per(5) < dist_calc) then
                         dist_calc = dist_per(5)
                         dist_vec = dist_vec_per(5, :)
                     end if
@@ -509,7 +509,7 @@ contains
                     dist_vec_per(6, 2) = y_cc(j) - y_pcen
                     dist_vec_per(6, 3) = z_cc(k) - z_centroid
                     dist_per(6) = sqrt(sum(dist_vec_per(6, :)**2))
-                    if (dist_per(6) < dist_calc) then    
+                    if (dist_per(6) < dist_calc) then
                         dist_calc = dist_per(6)
                         dist_vec = dist_vec_per(6, :)
                     end if
@@ -517,7 +517,7 @@ contains
                     dist_vec_per(7, 2) = y_cc(j) - y_centroid
                     dist_vec_per(7, 3) = z_cc(k) - z_pcen
                     dist_per(7) = sqrt(sum(dist_vec_per(7, :)**2))
-                    if (dist_per(7) < dist_calc) then    
+                    if (dist_per(7) < dist_calc) then
                         dist_calc = dist_per(7)
                         dist_vec = dist_vec_per(7, :)
                     end if
diff --git a/src/simulation/m_volume_filtering.fpp b/src/simulation/m_volume_filtering.fpp
index 63909895e9..2db1eb8517 100644
--- a/src/simulation/m_volume_filtering.fpp
+++ b/src/simulation/m_volume_filtering.fpp
@@ -338,9 +338,9 @@ contains
 
         ! file for particle forces
         if (compute_particle_drag) then
-          if (proc_rank == 0) then
-              open (unit=100, file='particle_force.bin', status='replace', form='unformatted', access='stream', action='write')
-          end if
+            if (proc_rank == 0) then
+                open (unit=100, file='particle_force.bin', status='replace', form='unformatted', access='stream', action='write')
+            end if
         end if
 
     end subroutine s_initialize_fftw_explicit_filter_module
@@ -1915,10 +1915,10 @@ contains
         call fftw_destroy_plan(plan_z_c2c_kernelG)
 #endif
 
-        if (compute_particle_drag) then 
-          if (proc_rank == 0) then
-              close (100)
-          end if
+        if (compute_particle_drag) then
+            if (proc_rank == 0) then
+                close (100)
+            end if
         end if
 
     end subroutine s_finalize_fftw_explicit_filter_module

From 0a4d4e5c17aba91890ed44157b1ef4627dac3358 Mon Sep 17 00:00:00 2001
From: conradd3 <conradd3@illinois.edu>
Date: Tue, 4 Nov 2025 19:49:37 -0600
Subject: [PATCH 28/30] formatting

---
 src/common/m_boundary_common.fpp |  2 +-
 src/common/m_mpi_common.fpp      | 24 ++++++++++++------------
 2 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/src/common/m_boundary_common.fpp b/src/common/m_boundary_common.fpp
index 73dd1bafa5..4c01b10f22 100644
--- a/src/common/m_boundary_common.fpp
+++ b/src/common/m_boundary_common.fpp
@@ -35,7 +35,7 @@ module m_boundary_common
 
     private; public :: s_initialize_boundary_common_module, &
  s_populate_variables_buffers, &
- s_populate_scalarfield_buffers, & 
+ s_populate_scalarfield_buffers, &
  s_create_mpi_types, &
  s_populate_capillary_buffers, &
  s_populate_F_igr_buffers, &
diff --git a/src/common/m_mpi_common.fpp b/src/common/m_mpi_common.fpp
index 79f1e41556..fd7f002884 100644
--- a/src/common/m_mpi_common.fpp
+++ b/src/common/m_mpi_common.fpp
@@ -98,14 +98,14 @@ contains
 
 #ifdef MFC_SIMULATION
         if (volume_filtering_momentum_eqn) then
-          halo_size_sf = nint(-1._wp + 1._wp*buff_size* &
-                                         & (m + 2*buff_size + 1)* &
-                                         & (n + 2*buff_size + 1)* &
-                                         & (p + 2*buff_size + 1)/ &
-                                         & (cells_bounds%mnp_min + 2*buff_size + 1))
-          allocate (buff_send_scalarfield(0:halo_size_sf), buff_recv_scalarfield(0:halo_size_sf))
-          $:GPU_ENTER_DATA(create='[capture:buff_send_scalarfield]')
-          $:GPU_ENTER_DATA(create='[capture:buff_recv_scalarfield]')
+            halo_size_sf = nint(-1._wp + 1._wp*buff_size* &
+                                           & (m + 2*buff_size + 1)* &
+                                           & (n + 2*buff_size + 1)* &
+                                           & (p + 2*buff_size + 1)/ &
+                                           & (cells_bounds%mnp_min + 2*buff_size + 1))
+            allocate (buff_send_scalarfield(0:halo_size_sf), buff_recv_scalarfield(0:halo_size_sf))
+            $:GPU_ENTER_DATA(create='[capture:buff_send_scalarfield]')
+            $:GPU_ENTER_DATA(create='[capture:buff_recv_scalarfield]')
         end if
 #endif
 #endif
@@ -308,7 +308,7 @@ contains
     !! @param stat_reynolds_stress 1-4 order statistics of reynolds stress tensor
     !! @param stat_eff_visc 1-4 order statistics of unclosed effective viscosity tensor
     !! @param stat_int_mom_exch 1-4 order statistics of interphase momentum exchange vector
-    impure subroutine s_initialize_mpi_data_filtered(filtered_fluid_indicator_function, & 
+    impure subroutine s_initialize_mpi_data_filtered(filtered_fluid_indicator_function, &
                                                      stat_q_cons_filtered, stat_filtered_pressure, &
                                                      stat_reynolds_stress, stat_eff_visc, stat_int_mom_exch)
 
@@ -1536,13 +1536,13 @@ contains
 
                     end do
 
-                ! Decompose domain into z-slabs
-                else if (slab_domain_decomposition) then 
+                    ! Decompose domain into z-slabs
+                else if (slab_domain_decomposition) then
                     num_procs_x = 1
                     num_procs_y = 1
                     num_procs_z = num_procs
                     ierr = -1
-                    if (mod((p+1), num_procs_z) == 0) then 
+                    if (mod((p + 1), num_procs_z) == 0) then
                         ierr = 0
                     end if
                 else

From 2d90828a0f5dd05eeba40ab56e46fc9a419ef287 Mon Sep 17 00:00:00 2001
From: conradd3 <conradd3@illinois.edu>
Date: Thu, 13 Nov 2025 14:36:06 -0600
Subject: [PATCH 29/30] periodic ib bug, gp selection alg

---
 src/common/m_mpi_common.fpp           |   6 ++
 src/simulation/m_ibm.fpp              | 142 +++++++++++++++++++++++---
 src/simulation/m_volume_filtering.fpp |  38 +++----
 3 files changed, 151 insertions(+), 35 deletions(-)

diff --git a/src/common/m_mpi_common.fpp b/src/common/m_mpi_common.fpp
index fd7f002884..62429f300d 100644
--- a/src/common/m_mpi_common.fpp
+++ b/src/common/m_mpi_common.fpp
@@ -51,7 +51,9 @@ module m_mpi_common
     real(wp), private, allocatable, dimension(:), target :: buff_recv_scalarfield
     !! This variable is utilized to receive and unpack the buffer of any scalar field from neighboring processors
 
+#ifndef __NVCOMPILER_GPU_UNIFIED_MEM
     $:GPU_DECLARE(create='[buff_send_scalarfield, buff_recv_scalarfield]')
+#endif
 
 contains
 
@@ -103,9 +105,13 @@ contains
                                            & (n + 2*buff_size + 1)* &
                                            & (p + 2*buff_size + 1)/ &
                                            & (cells_bounds%mnp_min + 2*buff_size + 1))
+#ifndef __NVCOMPILER_GPU_UNIFIED_MEM                          
+            @:ALLOCATE(buff_send_scalarfield(0:halo_size_sf), buff_recv_scalarfield(0:halo_size_sf))
+#else
             allocate (buff_send_scalarfield(0:halo_size_sf), buff_recv_scalarfield(0:halo_size_sf))
             $:GPU_ENTER_DATA(create='[capture:buff_send_scalarfield]')
             $:GPU_ENTER_DATA(create='[capture:buff_recv_scalarfield]')
+#endif
         end if
 #endif
 #endif
diff --git a/src/simulation/m_ibm.fpp b/src/simulation/m_ibm.fpp
index 7413fa48e9..33812d17d7 100644
--- a/src/simulation/m_ibm.fpp
+++ b/src/simulation/m_ibm.fpp
@@ -147,6 +147,7 @@ contains
     end subroutine s_ibm_setup
 
     subroutine s_populate_ib_buffers()
+      integer :: j, k, l
 
         #:for DIRC, DIRI in [('x', 1), ('y', 2), ('z', 3)]
             #:for LOCC, LOCI in [('beg', -1), ('end', 1)]
@@ -156,6 +157,77 @@ contains
             #:endfor
         #:endfor
 
+        if (periodic_ibs) then 
+            ! Population of Buffers in x-direction
+            do l = 0, p
+                do k = 0, n
+                    if (bc_x%beg == BC_PERIODIC) then 
+                        do j = 1, buff_size
+                            ib_markers%sf(-j, k, l) = &
+                            ib_markers%sf(m - (j - 1), k, l)
+                        end do
+                    end if
+                end do
+            end do
+
+            do l = 0, p
+                do k = 0, n
+                    if (bc_x%end == BC_PERIODIC) then 
+                        do j = 1, buff_size
+                              ib_markers%sf(m + j, k, l) = &
+                              ib_markers%sf(j - 1, k, l)
+                          end do
+                    end if
+                end do
+            end do
+            
+            ! Population of Buffers in y-direction
+            do l = 0, p
+                do k = -buff_size, m + buff_size
+                    if (bc_y%beg == BC_PERIODIC) then 
+                        do j = 1, buff_size
+                            ib_markers%sf(k, -j, l) = &
+                            ib_markers%sf(k, n - (j - 1), l)
+                        end do
+                    end if
+                end do
+            end do
+
+            do l = 0, p
+                do k = -buff_size, m + buff_size
+                    if (bc_y%end == BC_PERIODIC) then 
+                        do j = 1, buff_size
+                            ib_markers%sf(k, n + j, l) = &
+                            ib_markers%sf(k, j - 1, l)
+                        end do
+                    end if
+                end do
+            end do
+
+            ! Population of Buffers in z-direction
+            do l = -buff_size, n + buff_size
+                do k = -buff_size, m + buff_size
+                    if (bc_z%beg == BC_PERIODIC) then
+                        do j = 1, buff_size
+                            ib_markers%sf(k, l, -j) = &
+                            ib_markers%sf(k, l, p - (j - 1))
+                        end do
+                    end if
+                end do
+            end do
+
+            do l = -buff_size, n + buff_size
+                do k = -buff_size, m + buff_size
+                    if (bc_z%end == BC_PERIODIC) then 
+                        do j = 1, buff_size
+                            ib_markers%sf(k, l, p + j) = &
+                            ib_markers%sf(k, l, j - 1)
+                        end do
+                    end if
+                end do
+            end do
+        end if
+
     end subroutine s_populate_ib_buffers
 
     !>  Subroutine that updates the conservative variables at the ghost points
@@ -529,6 +601,7 @@ contains
                     norm(:) = dist_vec(:)/dist_calc
                 end if
             end if ! end store_levelset if statement
+
             ghost_points_in(q)%ip_loc(:) = physical_loc(:) + 2*dist*norm(:)
 
             ! Find the closest grid point to the image point
@@ -537,13 +610,13 @@ contains
                 ! s_cc points to the dim array we need
                 if (dim == 1) then
                     s_cc => x_cc
-                    bound = m + buff_size - 1
+                    bound = m + buff_size 
                 elseif (dim == 2) then
                     s_cc => y_cc
-                    bound = n + buff_size - 1
+                    bound = n + buff_size 
                 else
                     s_cc => z_cc
-                    bound = p + buff_size - 1
+                    bound = p + buff_size 
                 end if
 
                 if (f_approx_equal(norm(dim), 0._wp)) then
@@ -562,7 +635,10 @@ contains
                                .or. temp_loc > s_cc(index + 1)))
                         index = index + dir
                         if (index < -buff_size .or. index > bound) then
-                            print *, "temp_loc=", temp_loc, " s_cc(index)=", s_cc(index), " s_cc(index+1)=", s_cc(index + 1)
+                            print *, "proc_rank=", proc_rank, "temp_loc=", temp_loc, " index=", index, "ib=", patch_id, "dim", dim, "dir", dir 
+                            print *, i, j, k, physical_loc, ghost_points_in(q)%ip_loc(:)
+                            print *, x_centroid, y_centroid, z_centroid
+                            print *, norm, dist
                             print *, "Increase buff_size further in m_helper_basic (currently set to a minimum of 10)"
                             error stop "Increase buff_size"
                         end if
@@ -590,6 +666,9 @@ contains
             :: subsection_2D
         integer, dimension(2*gp_layers + 1, 2*gp_layers + 1, 2*gp_layers + 1) &
             :: subsection_3D
+        integer, dimension(2*gp_layers + 1) :: subsection_x
+        integer, dimension(2*gp_layers + 1) :: subsection_y
+        integer, dimension(2*gp_layers + 1) :: subsection_z
         integer :: i, j, k!< Iterator variables
 
         num_gps_out = 0
@@ -611,14 +690,18 @@ contains
                 else
                     do k = 0, p
                         if (ib_markers%sf(i, j, k) /= 0) then
-                            subsection_3D = ib_markers%sf( &
-                                            i - gp_layers:i + gp_layers, &
-                                            j - gp_layers:j + gp_layers, &
-                                            k - gp_layers:k + gp_layers)
-                            if (any(subsection_3D == 0)) then
+                            ! An IB cell counts as a ghost point only when a fluid cell (marker == 0)
+                            ! lies within gp_layers of it along the x-, y- or z-axis; diagonals are ignored
+                            subsection_x = ib_markers%sf(i - gp_layers:i + gp_layers, j, k)
+                            subsection_y = ib_markers%sf(i, j - gp_layers:j + gp_layers, k)
+                            subsection_z = ib_markers%sf(i, j, k - gp_layers:k + gp_layers)
+
+                            if (any(subsection_x == 0) .or. &
+                                any(subsection_y == 0) .or. &
+                                any(subsection_z == 0)) then
                                 num_gps_out = num_gps_out + 1
                             else
                                 num_inner_gps_out = num_inner_gps_out + 1
                             end if
                         end if
                     end do
@@ -637,6 +720,9 @@ contains
             :: subsection_2D
         integer, dimension(2*gp_layers + 1, 2*gp_layers + 1, 2*gp_layers + 1) &
             :: subsection_3D
+        integer, dimension(2*gp_layers + 1) :: subsection_x
+        integer, dimension(2*gp_layers + 1) :: subsection_y
+        integer, dimension(2*gp_layers + 1) :: subsection_z
         integer :: i, j, k !< Iterator variables
         integer :: count, count_i
         integer :: patch_id
@@ -693,11 +779,15 @@ contains
                     ! 3D
                     do k = 0, p
                         if (ib_markers%sf(i, j, k) /= 0) then
-                            subsection_3D = ib_markers%sf( &
-                                            i - gp_layers:i + gp_layers, &
-                                            j - gp_layers:j + gp_layers, &
-                                            k - gp_layers:k + gp_layers)
-                            if (any(subsection_3D == 0)) then
+                            ! An IB cell is stored as a ghost point only when a fluid cell (marker == 0)
+                            ! lies within gp_layers of it along the x-, y- or z-axis; diagonals are ignored
+                            subsection_x = ib_markers%sf(i - gp_layers:i + gp_layers, j, k)
+                            subsection_y = ib_markers%sf(i, j - gp_layers:j + gp_layers, k)
+                            subsection_z = ib_markers%sf(i, j, k - gp_layers:k + gp_layers)
+
+                            if (any(subsection_x == 0) .or. &
+                                any(subsection_y == 0) .or. &
+                                any(subsection_z == 0)) then
                                 ghost_points_in(count)%loc = [i, j, k]
                                 patch_id = ib_markers%sf(i, j, k)
                                 ghost_points_in(count)%ib_patch_id = &
diff --git a/src/simulation/m_volume_filtering.fpp b/src/simulation/m_volume_filtering.fpp
index 2db1eb8517..94f9804955 100644
--- a/src/simulation/m_volume_filtering.fpp
+++ b/src/simulation/m_volume_filtering.fpp
@@ -347,11 +347,11 @@ contains
 
     !< initialize the gaussian filtering kernel in real space and then compute its DFT
     subroutine s_initialize_filtering_kernel
-        real(dp) :: sigma_stddev
-        real(dp) :: Lx, Ly, Lz
-        real(dp) :: x_r, y_r, z_r
-        real(dp) :: r2
-        real(dp) :: G_norm_int, G_norm_int_glb
+        real(wp) :: sigma_stddev
+        real(wp) :: Lx, Ly, Lz
+        real(wp) :: x_r, y_r, z_r
+        real(wp) :: r2
+        real(wp) :: G_norm_int, G_norm_int_glb
         integer :: i, j, k
 
         ! gaussian filter
@@ -361,7 +361,7 @@ contains
         Ly = y_domain_end_glb - y_domain_beg_glb
         Lz = z_domain_end_glb - z_domain_beg_glb
 
-        G_norm_int = 0.0_dp
+        G_norm_int = 0.0_wp
 
         $:GPU_PARALLEL_LOOP(collapse=3, reduction='[[G_norm_int]]', reductionOp='[+]', copyin='[Lx, Ly, Lz, sigma_stddev]', private='[x_r, y_r, z_r, r2]')
         do i = 0, m
@@ -373,7 +373,7 @@ contains
 
                     r2 = x_r**2 + y_r**2 + z_r**2
 
-                    real_kernelG_in(i + 1, j + 1, k + 1) = exp(-r2/(2.0_dp*sigma_stddev**2))
+                    real_kernelG_in(i + 1, j + 1, k + 1) = exp(-r2/(2.0_wp*sigma_stddev**2))
 
                     G_norm_int = G_norm_int + real_kernelG_in(i + 1, j + 1, k + 1)*dx(i)*dy(j)*dz(k)
                 end do
@@ -462,7 +462,7 @@ contains
         do i = 1, NxC
             do j = 1, Nyloc
                 do k = 1, Nz
-                    cmplx_kernelG1d(k + (i - 1)*Nz + (j - 1)*Nz*NxC) = cmplx_kernelG1d(k + (i - 1)*Nz + (j - 1)*Nz*NxC)/(real(Nx*Ny*Nz, dp))
+                    cmplx_kernelG1d(k + (i - 1)*Nz + (j - 1)*Nz*NxC) = cmplx_kernelG1d(k + (i - 1)*Nz + (j - 1)*Nz*NxC)/(real(Nx*Ny*Nz, wp))
                 end do
             end do
         end do
@@ -486,9 +486,9 @@ contains
             do j = 0, n
                 do k = 0, p
                     if (ib_markers%sf(i, j, k) == 0) then
-                        fluid_indicator_function%sf(i, j, k) = 1.0_dp
+                        fluid_indicator_function%sf(i, j, k) = 1.0_wp
                     else
-                        fluid_indicator_function%sf(i, j, k) = 0.0_dp
+                        fluid_indicator_function%sf(i, j, k) = 0.0_wp
                     end if
                 end do
             end do
@@ -531,7 +531,7 @@ contains
         do i = 1, Nx
             do j = 1, Ny
                 do k = 1, Nzloc
-                    filtered_fluid_indicator_function%sf(i - 1, j - 1, k - 1) = data_real_3D_slabz(i, j, k)/(real(Nx*Ny*Nz, dp))
+                    filtered_fluid_indicator_function%sf(i - 1, j - 1, k - 1) = data_real_3D_slabz(i, j, k)/(real(Nx*Ny*Nz, wp))
                 end do
             end do
         end do
@@ -616,7 +616,7 @@ contains
             do i = 0, m
                 do j = 0, n
                     do k = 0, p
-                        data_real_3D_slabz(i + 1, j + 1, k + 1) = q_temp_in%sf(i, j, k)*(1.0_dp - fluid_indicator_function%sf(i, j, k))
+                        data_real_3D_slabz(i + 1, j + 1, k + 1) = q_temp_in%sf(i, j, k)*(1.0_wp - fluid_indicator_function%sf(i, j, k))
                     end do
                 end do
             end do
@@ -644,7 +644,7 @@ contains
             do i = 0, m
                 do j = 0, n
                     do k = 0, p
-                        q_temp_out%sf(i, j, k) = data_real_3D_slabz(i + 1, j + 1, k + 1)/(real(Nx*Ny*Nz, dp)*filtered_fluid_indicator_function%sf(i, j, k))
+                        q_temp_out%sf(i, j, k) = data_real_3D_slabz(i + 1, j + 1, k + 1)/(real(Nx*Ny*Nz, wp)*filtered_fluid_indicator_function%sf(i, j, k))
                     end do
                 end do
             end do
@@ -653,7 +653,7 @@ contains
             do i = 0, m
                 do j = 0, n
                     do k = 0, p
-                        q_temp_in%sf(i, j, k) = data_real_3D_slabz(i + 1, j + 1, k + 1)/(real(Nx*Ny*Nz, dp)*filtered_fluid_indicator_function%sf(i, j, k))
+                        q_temp_in%sf(i, j, k) = data_real_3D_slabz(i + 1, j + 1, k + 1)/(real(Nx*Ny*Nz, wp)*filtered_fluid_indicator_function%sf(i, j, k))
                     end do
                 end do
             end do
@@ -938,7 +938,7 @@ contains
             do i = 0, m
                 do j = 0, n
                     do k = 0, p
-                        int_mom_exch(l)%sf(i, j, k) = data_real_3D_slabz(i + 1, j + 1, k + 1)/(real(Nx*Ny*Nz, dp))
+                        int_mom_exch(l)%sf(i, j, k) = data_real_3D_slabz(i + 1, j + 1, k + 1)/(real(Nx*Ny*Nz, wp))
                     end do
                 end do
             end do
@@ -1535,7 +1535,7 @@ contains
             do i = 0, m
                 do j = 0, n
                     do k = 0, p
-                        q_cons_filtered(l)%sf(i, j, k) = data_real_3D_slabz(i + 1, j + 1, k + 1)/(real(Nx*Ny*Nz, dp)*filtered_fluid_indicator_function%sf(i, j, k))
+                        q_cons_filtered(l)%sf(i, j, k) = data_real_3D_slabz(i + 1, j + 1, k + 1)/(real(Nx*Ny*Nz, wp)*filtered_fluid_indicator_function%sf(i, j, k))
                     end do
                 end do
             end do
@@ -1580,7 +1580,7 @@ contains
         do i = 0, m
             do j = 0, n
                 do k = 0, p
-                    filtered_pressure%sf(i, j, k) = data_real_3D_slabz(i + 1, j + 1, k + 1)/(real(Nx*Ny*Nz, dp)*filtered_fluid_indicator_function%sf(i, j, k))
+                    filtered_pressure%sf(i, j, k) = data_real_3D_slabz(i + 1, j + 1, k + 1)/(real(Nx*Ny*Nz, wp)*filtered_fluid_indicator_function%sf(i, j, k))
                 end do
             end do
         end do
@@ -1626,7 +1626,7 @@ contains
                 do i = 0, m
                     do j = 0, n
                         do k = 0, p
-                            reynolds_stress(l)%vf(q)%sf(i, j, k) = data_real_3D_slabz(i + 1, j + 1, k + 1)/(real(Nx*Ny*Nz, dp)*filtered_fluid_indicator_function%sf(i, j, k))
+                            reynolds_stress(l)%vf(q)%sf(i, j, k) = data_real_3D_slabz(i + 1, j + 1, k + 1)/(real(Nx*Ny*Nz, wp)*filtered_fluid_indicator_function%sf(i, j, k))
                         end do
                     end do
                 end do
@@ -1674,7 +1674,7 @@ contains
                 do i = 0, m
                     do j = 0, n
                         do k = 0, p
-                            eff_visc(l)%vf(q)%sf(i, j, k) = data_real_3D_slabz(i + 1, j + 1, k + 1)/(real(Nx*Ny*Nz, dp)*filtered_fluid_indicator_function%sf(i, j, k))
+                            eff_visc(l)%vf(q)%sf(i, j, k) = data_real_3D_slabz(i + 1, j + 1, k + 1)/(real(Nx*Ny*Nz, wp)*filtered_fluid_indicator_function%sf(i, j, k))
                         end do
                     end do
                 end do

From 9c6d3fa99165e2a7745fca817ee8ec5b43fde509 Mon Sep 17 00:00:00 2001
From: conradd3 <conradd3@illinois.edu>
Date: Thu, 13 Nov 2025 16:03:36 -0600
Subject: [PATCH 30/30] gpu ib buff populate bug fix

---
 runs/phi01/case.py          |  59 +++++++++--------
 src/common/m_mpi_common.fpp |   2 +-
 src/simulation/m_ibm.fpp    | 126 +++++++++++++++++-------------------
 3 files changed, 90 insertions(+), 97 deletions(-)

diff --git a/runs/phi01/case.py b/runs/phi01/case.py
index 9751518117..c67369d9c7 100644
--- a/runs/phi01/case.py
+++ b/runs/phi01/case.py
@@ -2,45 +2,34 @@
 import math
 import numpy as np
 
-'''
-need to store
-full stats of unclosed term tensors (1, 2, 3, 4) - only at end time
-stats of flow quantities - only at end time
-flow quantities
-filtered fluid indicator function
-drag force on each particle
-'''
-
-Mu = 1.84e-05
+
 gam_a = 1.4
-R = 287.0
 
 D = 0.1
+L = 10 * D
 
-P = 101325 # Pa
-rho = 1.225 # kg/m^3
-
-T = P/(rho*R)
-
-M = 1.2
+M = 0.8
 Re = 1500.0
-v1 = M*(gam_a*P/rho)**(1.0/2.0)
 
-mu = rho*v1*D/Re # dynamic viscosity for current case
+P = 101325
+rho = 1.225
+
+v1 = M * np.sqrt(gam_a * P / rho) 
+mu = rho * v1 * D / Re
 
 #print('mu: ', mu)
 #print('v1: ', v1)
 #print('rho: ', rho)
 #print('Kn = ' + str( np.sqrt(np.pi*gam_a/2)*(M/Re) )) # Kn < 0.01 = continuum flow
 
-dt = 4.0E-06
-Nt = 100
-t_save = 10
-t_step_start_stats = 50
+dt = 5.0E-06
+Nt = 200  # alternative: int(1 * L / v1 / dt)
+t_save = Nt//5
+t_step_start_stats = Nt//2
 
-Nx = 99
-Ny = 99
-Nz = 99
+Nx = 199
+Ny = Nx
+Nz = Ny
 
 # load initial sphere locations
 sphere_loc = np.loadtxt('sphere_array_locations.txt')
@@ -58,6 +47,15 @@
         f"patch_ib({i+1})%slip": "F",
         })
 
+# ib_dict.update({
+#     f"patch_ib({1})%geometry": 8,
+#     f"patch_ib({1})%x_centroid": sphere_loc[20, 0],
+#     f"patch_ib({1})%y_centroid": sphere_loc[20, 1],
+#     f"patch_ib({1})%z_centroid": sphere_loc[20, 2],
+#     f"patch_ib({1})%radius": D / 2,
+#     f"patch_ib({1})%slip": "F",
+#     })
+
 # Configuring case dictionary
 case_dict = {
     # Logistics
@@ -78,8 +76,8 @@
     "p": Nz,
     "dt": dt,
     "t_step_start": 0,
-    "t_step_stop": Nt,  # 3000
-    "t_step_save": t_save,  # 10
+    "t_step_stop": Nt,  
+    "t_step_save": t_save,  
     "t_step_stat_start": t_step_start_stats,
     # Simulation Algorithm Parameters
     # Only one patches are necessary, the air tube
@@ -154,11 +152,12 @@
     "periodic_forcing": "T",
     "periodic_ibs": "T",
     "volume_filtering_momentum_eqn": "T",
-    "filter_width": 3.0*D/2,
+    "filter_width": 3.0*D/2 * np.sqrt(2/(9*np.pi)),
+    "compute_particle_drag": "T",
 
     "u_inf_ref": v1,
     "rho_inf_ref": rho,
-    "T_inf_ref": T,
+    "P_inf_ref": P,
 
     "store_levelset": "F",
     "slab_domain_decomposition": "T", 
diff --git a/src/common/m_mpi_common.fpp b/src/common/m_mpi_common.fpp
index 62429f300d..c5f926c51c 100644
--- a/src/common/m_mpi_common.fpp
+++ b/src/common/m_mpi_common.fpp
@@ -105,7 +105,7 @@ contains
                                            & (n + 2*buff_size + 1)* &
                                            & (p + 2*buff_size + 1)/ &
                                            & (cells_bounds%mnp_min + 2*buff_size + 1))
-#ifndef __NVCOMPILER_GPU_UNIFIED_MEM                          
+#ifndef __NVCOMPILER_GPU_UNIFIED_MEM
             @:ALLOCATE(buff_send_scalarfield(0:halo_size_sf), buff_recv_scalarfield(0:halo_size_sf))
 #else
             allocate (buff_send_scalarfield(0:halo_size_sf), buff_recv_scalarfield(0:halo_size_sf))
diff --git a/src/simulation/m_ibm.fpp b/src/simulation/m_ibm.fpp
index 33812d17d7..444b4ab8d5 100644
--- a/src/simulation/m_ibm.fpp
+++ b/src/simulation/m_ibm.fpp
@@ -147,7 +147,7 @@ contains
     end subroutine s_ibm_setup
 
     subroutine s_populate_ib_buffers()
-      integer :: j, k, l
+        integer :: j, k, l
 
         #:for DIRC, DIRI in [('x', 1), ('y', 2), ('z', 3)]
             #:for LOCC, LOCI in [('beg', -1), ('end', 1)]
@@ -157,75 +157,81 @@ contains
             #:endfor
         #:endfor
 
-        if (periodic_ibs) then 
+        if (periodic_ibs) then
             ! Population of Buffers in x-direction
-            do l = 0, p
-                do k = 0, n
-                    if (bc_x%beg == BC_PERIODIC) then 
+            if (bc_x%beg == BC_PERIODIC) then
+                $:GPU_PARALLEL_LOOP(collapse=3)
+                do l = 0, p
+                    do k = 0, n
                         do j = 1, buff_size
                             ib_markers%sf(-j, k, l) = &
-                            ib_markers%sf(m - (j - 1), k, l)
+                                ib_markers%sf(m - (j - 1), k, l)
                         end do
-                    end if
+                    end do
                 end do
-            end do
+            end if
 
-            do l = 0, p
-                do k = 0, n
-                    if (bc_x%end == BC_PERIODIC) then 
+            if (bc_x%end == BC_PERIODIC) then
+                $:GPU_PARALLEL_LOOP(collapse=3)
+                do l = 0, p
+                    do k = 0, n
                         do j = 1, buff_size
-                              ib_markers%sf(m + j, k, l) = &
-                              ib_markers%sf(j - 1, k, l)
-                          end do
-                    end if
+                            ib_markers%sf(m + j, k, l) = &
+                                ib_markers%sf(j - 1, k, l)
+                        end do
+                    end do
                 end do
-            end do
-            
+            end if
+
             ! Population of Buffers in y-direction
-            do l = 0, p
-                do k = -buff_size, m + buff_size
-                    if (bc_y%beg == BC_PERIODIC) then 
+            if (bc_y%beg == BC_PERIODIC) then
+                $:GPU_PARALLEL_LOOP(collapse=3)
+                do l = 0, p
+                    do k = -buff_size, m + buff_size
                         do j = 1, buff_size
                             ib_markers%sf(k, -j, l) = &
-                            ib_markers%sf(k, n - (j - 1), l)
+                                ib_markers%sf(k, n - (j - 1), l)
                         end do
-                    end if
+                    end do
                 end do
-            end do
+            end if
 
-            do l = 0, p
-                do k = -buff_size, m + buff_size
-                    if (bc_y%end == BC_PERIODIC) then 
+            if (bc_y%end == BC_PERIODIC) then
+                $:GPU_PARALLEL_LOOP(collapse=3)
+                do l = 0, p
+                    do k = -buff_size, m + buff_size
                         do j = 1, buff_size
                             ib_markers%sf(k, n + j, l) = &
-                            ib_markers%sf(k, j - 1, l)
+                                ib_markers%sf(k, j - 1, l)
                         end do
-                    end if
+                    end do
                 end do
-            end do
+            end if
 
             ! Population of Buffers in z-direction
-            do l = -buff_size, n + buff_size
-                do k = -buff_size, m + buff_size
-                    if (bc_z%beg == BC_PERIODIC) then
+            if (bc_z%beg == BC_PERIODIC) then
+                $:GPU_PARALLEL_LOOP(collapse=3)
+                do l = -buff_size, n + buff_size
+                    do k = -buff_size, m + buff_size
                         do j = 1, buff_size
                             ib_markers%sf(k, l, -j) = &
-                            ib_markers%sf(k, l, p - (j - 1))
+                                ib_markers%sf(k, l, p - (j - 1))
                         end do
-                    end if
+                    end do
                 end do
-            end do
+            end if
 
-            do l = -buff_size, n + buff_size
-                do k = -buff_size, m + buff_size
-                    if (bc_z%end == BC_PERIODIC) then 
+            if (bc_z%end == BC_PERIODIC) then
+                $:GPU_PARALLEL_LOOP(collapse=3)
+                do l = -buff_size, n + buff_size
+                    do k = -buff_size, m + buff_size
                         do j = 1, buff_size
                             ib_markers%sf(k, l, p + j) = &
-                            ib_markers%sf(k, l, j - 1)
+                                ib_markers%sf(k, l, j - 1)
                         end do
-                    end if
+                    end do
                 end do
-            end do
+            end if
         end if
 
     end subroutine s_populate_ib_buffers
@@ -610,13 +616,13 @@ contains
                 ! s_cc points to the dim array we need
                 if (dim == 1) then
                     s_cc => x_cc
-                    bound = m + buff_size 
+                    bound = m + buff_size
                 elseif (dim == 2) then
                     s_cc => y_cc
-                    bound = n + buff_size 
+                    bound = n + buff_size
                 else
                     s_cc => z_cc
-                    bound = p + buff_size 
+                    bound = p + buff_size
                 end if
 
                 if (f_approx_equal(norm(dim), 0._wp)) then
@@ -635,10 +641,7 @@ contains
                                .or. temp_loc > s_cc(index + 1)))
                         index = index + dir
                         if (index < -buff_size .or. index > bound) then
-                            print *, "proc_rank=", proc_rank, "temp_loc=", temp_loc, " index=", index, "ib=", patch_id, "dim", dim, "dir", dir 
-                            print *, i, j, k, physical_loc, ghost_points_in(q)%ip_loc(:)
-                            print *, x_centroid, y_centroid, z_centroid
-                            print *, norm, dist
+                            print *, "proc_rank=", proc_rank, "temp_loc=", temp_loc, " index=", index, "ib=", patch_id, "dim", dim, "dir", dir, "i, j, k", i, j, k
                             print *, "Increase buff_size further in m_helper_basic (currently set to a minimum of 10)"
                             error stop "Increase buff_size"
                         end if
@@ -703,13 +706,12 @@ contains
                             subsection_x = ib_markers%sf(i - gp_layers:i + gp_layers, j, k)
                             subsection_y = ib_markers%sf(i, j - gp_layers:j + gp_layers, k)
                             subsection_z = ib_markers%sf(i, j, k - gp_layers:k + gp_layers)
-
-                            if (any(subsection_x == 0) .or. & 
-                                any(subsection_y == 0) .or. & 
-                                any(subsection_z == 0)) then 
-                                  num_gps_out = num_gps_out + 1
+                            if (any(subsection_x == 0) .or. &
+                                any(subsection_y == 0) .or. &
+                                any(subsection_z == 0)) then
+                                num_gps_out = num_gps_out + 1
                             else
-                                  num_inner_gps_out = num_inner_gps_out + 1
+                                num_inner_gps_out = num_inner_gps_out + 1
                             end if
                         end if
                     end do
@@ -791,23 +793,15 @@ contains
                             !                 i - gp_layers:i + gp_layers, &
                             !                 j - gp_layers:j + gp_layers, &
                             !                 k - gp_layers:k + gp_layers)
+                            ! if (any(subsection_3D == 0)) then
 
                             subsection_x = ib_markers%sf(i - gp_layers:i + gp_layers, j, k)
                             subsection_y = ib_markers%sf(i, j - gp_layers:j + gp_layers, k)
                             subsection_z = ib_markers%sf(i, j, k - gp_layers:k + gp_layers)
+                            if (any(subsection_x == 0) .or. &
+                                any(subsection_y == 0) .or. &
+                                any(subsection_z == 0)) then
 
-                            if (any(subsection_x == 0) .or. & 
-                                any(subsection_y == 0) .or. & 
-                                any(subsection_z == 0)) then 
-                                
-                                if (i==  7  .and.    j==    26     .and. k==      0) then 
-                                    print *, 'HERE' 
-                                    print *, 'x', subsection_x, 'y', subsection_y, 'z', subsection_z
-                                    print *, proc_rank, ib_markers%sf(7, 26, -1)
-                                end if
-
-
-                            ! if (any(subsection_3D == 0)) then
                                 ghost_points_in(count)%loc = [i, j, k]
                                 patch_id = ib_markers%sf(i, j, k)
                                 ghost_points_in(count)%ib_patch_id = &