diff --git a/runs/3d_1sphere_filtering/case.py b/runs/3d_1sphere_filtering/case.py
new file mode 100644
index 0000000000..c6d138c110
--- /dev/null
+++ b/runs/3d_1sphere_filtering/case.py
@@ -0,0 +1,154 @@
+import json
+import math
+import numpy as np
+
+Mu = 1.84e-05
+gam_a = 1.4
+R = 287.0
+
+D = 0.1
+
+P = 101325  # Pa
+rho = 1.225  # kg/m^3
+
+T = P/(rho*R)
+
+M = 1.2
+Re = 1500.0
+v1 = M*(gam_a*P/rho)**(1.0/2.0)
+
+mu = rho*v1*D/Re  # dynamic viscosity for current case
+
+#print('mu: ', mu)
+#print('v1: ', v1)
+#print('rho: ', rho)
+#print('Kn = ' + str( np.sqrt(np.pi*gam_a/2)*(M/Re) ))  # Kn < 0.01 = continuum flow
+
+dt = 4.0E-06
+Nt = 1000
+t_save = 10
+
+Nx = 63
+Ny = 63
+Nz = 63
+
+# immersed boundary dictionary
+ib_dict = {}
+ib_dict.update({
+    "patch_ib(1)%geometry": 8,
+    "patch_ib(1)%x_centroid": 0.0,
+    "patch_ib(1)%y_centroid": 0.0,
+    "patch_ib(1)%z_centroid": 0.0,
+    "patch_ib(1)%radius": D / 2,
+    "patch_ib(1)%slip": "F",
+})
+
+# Configuring case dictionary
+case_dict = {
+    # Logistics
+    "run_time_info": "T",
+    # Computational Domain Parameters
+    # x direction
+    "x_domain%beg": -5.0 * D,
+    "x_domain%end": 5.0 * D,
+    # y direction
+    "y_domain%beg": -5.0 * D,
+    "y_domain%end": 5.0 * D,
+    # z direction
+    "z_domain%beg": -5.0 * D,
+    "z_domain%end": 5.0 * D,
+    "cyl_coord": "F",
+    "m": Nx,
+    "n": Ny,
+    "p": Nz,
+    "dt": dt,
+    "t_step_start": 0,
+    "t_step_stop": Nt,  # 3000
+    "t_step_save": t_save,  # 10
+    "t_step_stat_start": 100,
+    # Simulation Algorithm Parameters
+    # Only one patch is necessary, the background air
+    "num_patches": 1,
+    # Use the 5-equation model
+    "model_eqns": 2,
+    # The 6-equation model does not need the K \div(u) term
+    "alt_soundspeed": "F",
+    # One fluid: air
+    "num_fluids": 1,
+    # Mixture physical parameter limits
+    "mpp_lim": "F",
+    # Correct errors when computing speed of sound
+    "mixture_err": "T",
+    # Use TVD RK3 for time marching
+    "time_stepper": 3,
+    # Reconstruct the primitive variables to minimize spurious oscillations
+    # Use WENO5
+    "weno_order": 5,
+    "weno_eps": 1.0e-14,
+    "weno_Re_flux": "T",
+    "weno_avg": "T",
+    "avg_state": 2,
+    "mapped_weno": "T",
+    "null_weights": "F",
+    "mp_weno": "T",
+    "riemann_solver": 2,
+    "low_Mach": 1,
+    "wave_speeds": 1,
+    # periodic bc
+    "bc_x%beg": -1,
+    "bc_x%end": -1,
+    "bc_y%beg": -1,
+    "bc_y%end": -1,
+    "bc_z%beg": -1,
+    "bc_z%end": -1,
+    # Set IB to True and add 1 patch
+    "ib": "T",
+    "num_ibs": 1,
+    "viscous": "T",
+    # Formatted Database Files Structure Parameters
+    "format": 1,
+    "precision": 2,
+    "prim_vars_wrt": "T",
+    "E_wrt": "T",
+    "q_filtered_wrt": "T",
+    "parallel_io": "T",
+    # Patch: background cuboid filled with air
+    # Specify the cuboidal background patch geometry
+    "patch_icpp(1)%geometry": 9,
+    "patch_icpp(1)%x_centroid": 0.0,
+    # Uniform medium density, centroid is at the center of the domain
+    "patch_icpp(1)%y_centroid": 0.0,
+    "patch_icpp(1)%z_centroid": 0.0,
+    "patch_icpp(1)%length_x": 10 * D,
+    "patch_icpp(1)%length_y": 10 * D,
+    "patch_icpp(1)%length_z": 10 * D,
+    # Specify the patch primitive variables
+    "patch_icpp(1)%vel(1)": v1,
+    "patch_icpp(1)%vel(2)": 0.0e00,
+    "patch_icpp(1)%vel(3)": 0.0e00,
+    "patch_icpp(1)%pres": P,
+    "patch_icpp(1)%alpha_rho(1)": rho,
+    "patch_icpp(1)%alpha(1)": 1.0e00,
+    # Patch: Sphere Immersed Boundary
+    # Fluids Physical Parameters
+    "fluid_pp(1)%gamma": 1.0e00 / (gam_a - 1.0e00),  # 2.5 = 1/(gamma - 1), not gamma = 1.4
+    "fluid_pp(1)%pi_inf": 0,
+    "fluid_pp(1)%Re(1)": 1.0 / mu,
+
+    # new case additions
+    "periodic_forcing": "T",
+    "periodic_ibs": "T",
+    "volume_filtering_momentum_eqn": "T",
+
+    "u_inf_ref": v1,
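+    # Free-stream reference state: these mirror v1, rho, and T computed above.
+    # (Assumption: the new *_inf_ref inputs set the scales used by the periodic
+    # forcing and the filtered statistics; they are not documented in this diff.)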
+ "rho_inf_ref": rho, + "T_inf_ref": T, + + "store_levelset": "F", + "slab_domain_decomposition": "T", + "compute_autocorrelation": "T", + } + +case_dict.update(ib_dict) + +print(json.dumps(case_dict)) diff --git a/runs/3d_1sphere_periodic/case.py b/runs/3d_1sphere_periodic/case.py new file mode 100644 index 0000000000..05f7efc429 --- /dev/null +++ b/runs/3d_1sphere_periodic/case.py @@ -0,0 +1,153 @@ +import json +import math +import numpy as np + +Mu = 1.84e-05 +gam_a = 1.4 +R = 287.0 + +D = 0.1 + +P = 101325 # Pa +rho = 1.225 # kg/m^3 + +T = P/(rho*R) + +M = 1.2 +Re = 1500.0 +v1 = M*(gam_a*P/rho)**(1.0/2.0) + +mu = rho*v1*D/Re # dynamic viscosity for current case + +#print('mu: ', mu) +#print('v1: ', v1) +#print('rho: ', rho) +#print('Kn = ' + str( np.sqrt(np.pi*gam_a/2)*(M/Re) )) # Kn < 0.01 = continuum flow + +dt = 4.0E-06 +Nt = 100 +t_save = 10 + +Nx = 63 +Ny = 63 +Nz = 63 + +# immersed boundary dictionary +ib_dict = {} +ib_dict.update({ + f"patch_ib({1})%geometry": 8, + f"patch_ib({1})%x_centroid": 0.0, + f"patch_ib({1})%y_centroid": 0.0, + f"patch_ib({1})%z_centroid": 0.0, + f"patch_ib({1})%radius": D / 2, + f"patch_ib({1})%slip": "F", + }) + +# Configuring case dictionary +case_dict = { + # Logistics + "run_time_info": "T", + # Computational Domain Parameters + # x direction + "x_domain%beg": -5.0 * D, + "x_domain%end": 5.0 * D, + # y direction + "y_domain%beg": -5.0 * D, + "y_domain%end": 5.0 * D, + # z direction + "z_domain%beg": -5.0 * D, + "z_domain%end": 5.0 * D, + "cyl_coord": "F", + "m": Nx, + "n": Ny, + "p": Nz, + "dt": dt, + "t_step_start": 0, + "t_step_stop": Nt, # 3000 + "t_step_save": t_save, # 10 + # Simulation Algorithm Parameters + # Only one patches are necessary, the air tube + "num_patches": 1, + # Use the 5 equation model + "model_eqns": 2, + # 6 equations model does not need the K \div(u) term + "alt_soundspeed": "F", + # One fluids: air + "num_fluids": 1, + # time step + "mpp_lim": "F", + # Correct errors when computing speed of sound + "mixture_err": "T", + # Use TVD RK3 for time marching + "time_stepper": 3, + # Reconstruct the primitive variables to minimize spurious + # Use WENO5 + "weno_order": 5, + "weno_eps": 1.0e-14, + "weno_Re_flux": "T", + "weno_avg": "T", + "avg_state": 2, + "mapped_weno": "T", + "null_weights": "F", + "mp_weno": "T", + "riemann_solver": 2, + "low_Mach": 1, + "wave_speeds": 1, + # periodic bc + "bc_x%beg": -1, + "bc_x%end": -1, + "bc_y%beg": -1, + "bc_y%end": -1, + "bc_z%beg": -1, + "bc_z%end": -1, + # Set IB to True and add 1 patch + "ib": "T", + "num_ibs": 1, + "viscous": "T", + # Formatted Database Files Structure Parameters + "format": 1, + "precision": 2, + "prim_vars_wrt": "T", + "E_wrt": "T", + #"q_filtered_wrt": "T", + "parallel_io": "T", + + "patch_icpp(1)%geometry": 9, + "patch_icpp(1)%x_centroid": 0.0, + # Uniform medium density, centroid is at the center of the domain + "patch_icpp(1)%y_centroid": 0.0, + "patch_icpp(1)%z_centroid": 0.0, + "patch_icpp(1)%length_x": 10 * D, + "patch_icpp(1)%length_y": 10 * D, + "patch_icpp(1)%length_z": 10 * D, + # Specify the patch primitive variables + "patch_icpp(1)%vel(1)": v1, + "patch_icpp(1)%vel(2)": 0.0e00, + "patch_icpp(1)%vel(3)": 0.0e00, + "patch_icpp(1)%pres": P, + "patch_icpp(1)%alpha_rho(1)": rho, + "patch_icpp(1)%alpha(1)": 1.0e00, + # Patch: Sphere Immersed Boundary + # Fluids Physical Parameters + "fluid_pp(1)%gamma": 1.0e00 / (gam_a - 1.0e00), # 2.50(Not 1.40) + "fluid_pp(1)%pi_inf": 0, + "fluid_pp(1)%Re(1)": Re, + + # new case additions + "periodic_forcing": "T", + 
"periodic_ibs": "T", + #"compute_particle_drag_vi": "F", + #"compute_particle_drag_si": "F", + #"volume_filtering_momentum_eqn": "T", + + "u_inf_ref": v1, + "rho_inf_ref": rho, + "T_inf_ref": T, + + "store_levelset": "F", + "slab_domain_decomposition": "T", + } + +case_dict.update(ib_dict) + +print(json.dumps(case_dict)) diff --git a/runs/3d_drag_test/case.py b/runs/3d_drag_test/case.py new file mode 100644 index 0000000000..00eb6a3c30 --- /dev/null +++ b/runs/3d_drag_test/case.py @@ -0,0 +1,144 @@ +import json +import math +import numpy as np + +Mu = 1.84e-05 +gam_a = 1.4 +R = 287.0 + +D = 0.1 + +P = 101325 # Pa +rho = 1.225 # kg/m^3 + +T = P/(rho*R) + +M = 1.2 +Re = 1500.0 +v1 = M*(gam_a*P/rho)**(1.0/2.0) + +mu = rho*v1*D/Re # dynamic viscosity for current case + +#print('mu: ', mu) +#print('v1: ', v1) +#print('rho: ', rho) +#print('Kn = ' + str( np.sqrt(np.pi*gam_a/2)*(M/Re) )) # Kn < 0.01 = continuum flow + +dt = 4.0E-06 +Nt = 100 +t_save = 1 + +Nx = 99 +Ny = 99 +Nz = 99 + +# immersed boundary dictionary +ib_dict = {} +ib_dict.update({ + f"patch_ib({1})%geometry": 8, + f"patch_ib({1})%x_centroid": 0.0, + f"patch_ib({1})%y_centroid": 0.0, + f"patch_ib({1})%z_centroid": 0.0, + f"patch_ib({1})%radius": D / 2, + f"patch_ib({1})%slip": "F", + }) + +# Configuring case dictionary +case_dict = { + # Logistics + "run_time_info": "T", + # Computational Domain Parameters + # x direction + "x_domain%beg": -5.0 * D, + "x_domain%end": 5.0 * D, + # y direction + "y_domain%beg": -5.0 * D, + "y_domain%end": 5.0 * D, + # z direction + "z_domain%beg": -5.0 * D, + "z_domain%end": 5.0 * D, + "cyl_coord": "F", + "m": Nx, + "n": Ny, + "p": Nz, + "dt": dt, + "t_step_start": 0, + "t_step_stop": Nt, # 3000 + "t_step_save": t_save, # 10 + # Simulation Algorithm Parameters + # Only one patches are necessary, the air tube + "num_patches": 1, + # Use the 5 equation model + "model_eqns": 2, + # 6 equations model does not need the K \div(u) term + "alt_soundspeed": "F", + # One fluids: air + "num_fluids": 1, + # time step + "mpp_lim": "F", + # Correct errors when computing speed of sound + "mixture_err": "T", + # Use TVD RK3 for time marching + "time_stepper": 3, + # Reconstruct the primitive variables to minimize spurious + # Use WENO5 + "weno_order": 5, + "weno_eps": 1.0e-14, + "weno_Re_flux": "T", + "weno_avg": "T", + "avg_state": 2, + "mapped_weno": "T", + "null_weights": "F", + "mp_weno": "T", + "riemann_solver": 2, + "low_Mach": 1, + "wave_speeds": 1, + # ghost cell extrapolation + "bc_x%beg": -3, + "bc_x%end": -3, + "bc_y%beg": -3, + "bc_y%end": -3, + "bc_z%beg": -3, + "bc_z%end": -3, + # Set IB to True and add 1 patch + "ib": "T", + "num_ibs": 1, + "viscous": "T", + # Formatted Database Files Structure Parameters + "format": 1, + "precision": 2, + "prim_vars_wrt": "T", + "E_wrt": "T", + "parallel_io": "T", + + "patch_icpp(1)%geometry": 9, + "patch_icpp(1)%x_centroid": 0.0, + # Uniform medium density, centroid is at the center of the domain + "patch_icpp(1)%y_centroid": 0.0, + "patch_icpp(1)%z_centroid": 0.0, + "patch_icpp(1)%length_x": 10 * D, + "patch_icpp(1)%length_y": 10 * D, + "patch_icpp(1)%length_z": 10 * D, + # Specify the patch primitive variables + "patch_icpp(1)%vel(1)": v1, + "patch_icpp(1)%vel(2)": 0.0e00, + "patch_icpp(1)%vel(3)": 0.0e00, + "patch_icpp(1)%pres": P, + "patch_icpp(1)%alpha_rho(1)": rho, + "patch_icpp(1)%alpha(1)": 1.0e00, + # Patch: Sphere Immersed Boundary + # Fluids Physical Parameters + "fluid_pp(1)%gamma": 1.0e00 / (gam_a - 1.0e00), # 2.50(Not 1.40) + "fluid_pp(1)%pi_inf": 0, + 
"fluid_pp(1)%Re(1)": Re, + + # new case additions + "compute_particle_drag": "T", + "u_inf_ref": v1, + "rho_inf_ref": rho, + "T_inf_ref": T, + } + +case_dict.update(ib_dict) + +print(json.dumps(case_dict)) diff --git a/runs/3d_periodic_ibs_test/centered/case.py b/runs/3d_periodic_ibs_test/centered/case.py new file mode 100644 index 0000000000..a3199b3933 --- /dev/null +++ b/runs/3d_periodic_ibs_test/centered/case.py @@ -0,0 +1,146 @@ +import json +import math +import numpy as np + +Mu = 1.84e-05 +gam_a = 1.4 +R = 287.0 + +D = 0.1 + +P = 101325 # Pa +rho = 1.225 # kg/m^3 + +T = P/(rho*R) + +M = 1.2 +Re = 1500.0 +v1 = M*np.sqrt((gam_a*P/rho)) + +mu = rho*v1*D/Re # dynamic viscosity for current case + +#print('mu: ', mu) +#print('v1: ', v1) +#print('rho: ', rho) +#print('Kn = ' + str( np.sqrt(np.pi*gam_a/2)*(M/Re) )) # Kn < 0.01 = continuum flow + +dt = 4.0E-06 +Nt = 100 +t_save = 10 + +Nx = 63 +Ny = 63 +Nz = 63 + +# immersed boundary dictionary +ib_dict = {} +ib_dict.update({ + f"patch_ib({1})%geometry": 8, + f"patch_ib({1})%x_centroid": 0.0, + f"patch_ib({1})%y_centroid": 0.0, + f"patch_ib({1})%z_centroid": 0.0, + f"patch_ib({1})%radius": D / 2, + f"patch_ib({1})%slip": "F", + }) + +# Configuring case dictionary +case_dict = { + # Logistics + "run_time_info": "T", + # Computational Domain Parameters + # x direction + "x_domain%beg": -5.0 * D, + "x_domain%end": 5.0 * D, + # y direction + "y_domain%beg": -5.0 * D, + "y_domain%end": 5.0 * D, + # z direction + "z_domain%beg": -5.0 * D, + "z_domain%end": 5.0 * D, + "cyl_coord": "F", + "m": Nx, + "n": Ny, + "p": Nz, + "dt": dt, + "t_step_start": 0, + "t_step_stop": Nt, # 3000 + "t_step_save": t_save, # 10 + # Simulation Algorithm Parameters + # Only one patches are necessary, the air tube + "num_patches": 1, + # Use the 5 equation model + "model_eqns": 2, + # 6 equations model does not need the K \div(u) term + "alt_soundspeed": "F", + # One fluids: air + "num_fluids": 1, + # time step + "mpp_lim": "F", + # Correct errors when computing speed of sound + "mixture_err": "T", + # Use TVD RK3 for time marching + "time_stepper": 3, + # Reconstruct the primitive variables to minimize spurious + # Use WENO5 + "weno_order": 5, + "weno_eps": 1.0e-14, + "weno_Re_flux": "T", + "weno_avg": "T", + "avg_state": 2, + "mapped_weno": "T", + "null_weights": "F", + "mp_weno": "T", + "riemann_solver": 2, + "low_Mach": 1, + "wave_speeds": 1, + # periodic bc + "bc_x%beg": -1, + "bc_x%end": -1, + "bc_y%beg": -1, + "bc_y%end": -1, + "bc_z%beg": -1, + "bc_z%end": -1, + # Set IB to True and add 1 patch + "ib": "T", + "num_ibs": 1, + "viscous": "T", + # Formatted Database Files Structure Parameters + "format": 1, + "precision": 2, + "prim_vars_wrt": "T", + "E_wrt": "T", + "parallel_io": "T", + + "patch_icpp(1)%geometry": 9, + "patch_icpp(1)%x_centroid": 0.0, + # Uniform medium density, centroid is at the center of the domain + "patch_icpp(1)%y_centroid": 0.0, + "patch_icpp(1)%z_centroid": 0.0, + "patch_icpp(1)%length_x": 10 * D, + "patch_icpp(1)%length_y": 10 * D, + "patch_icpp(1)%length_z": 10 * D, + # Specify the patch primitive variables + "patch_icpp(1)%vel(1)": v1, + "patch_icpp(1)%vel(2)": 0.0e00, + "patch_icpp(1)%vel(3)": 0.0e00, + "patch_icpp(1)%pres": P, + "patch_icpp(1)%alpha_rho(1)": rho, + "patch_icpp(1)%alpha(1)": 1.0e00, + # Patch: Sphere Immersed Boundary + # Fluids Physical Parameters + "fluid_pp(1)%gamma": 1.0e00 / (gam_a - 1.0e00), # 2.50(Not 1.40) + "fluid_pp(1)%pi_inf": 0, + "fluid_pp(1)%Re(1)": 1.0 / mu, + + # new case additions + "periodic_forcing": 
"T", + "periodic_ibs": "T", + + "u_inf_ref": v1, + "rho_inf_ref": rho, + "T_inf_ref": T, + } + +case_dict.update(ib_dict) + +print(json.dumps(case_dict)) diff --git a/runs/3d_periodic_ibs_test/off-center/case.py b/runs/3d_periodic_ibs_test/off-center/case.py new file mode 100644 index 0000000000..ecd3d7c9f5 --- /dev/null +++ b/runs/3d_periodic_ibs_test/off-center/case.py @@ -0,0 +1,146 @@ +import json +import math +import numpy as np + +Mu = 1.84e-05 +gam_a = 1.4 +R = 287.0 + +D = 0.1 + +P = 101325 # Pa +rho = 1.225 # kg/m^3 + +T = P/(rho*R) + +M = 1.2 +Re = 1500.0 +v1 = M*(gam_a*P/rho)**(1.0/2.0) + +mu = rho*v1*D/Re # dynamic viscosity for current case + +#print('mu: ', mu) +#print('v1: ', v1) +#print('rho: ', rho) +#print('Kn = ' + str( np.sqrt(np.pi*gam_a/2)*(M/Re) )) # Kn < 0.01 = continuum flow + +dt = 4.0E-06 +Nt = 100 +t_save = 10 + +Nx = 63 +Ny = 63 +Nz = 63 + +# immersed boundary dictionary +ib_dict = {} +ib_dict.update({ + f"patch_ib({1})%geometry": 8, + f"patch_ib({1})%x_centroid": 15.0 * D, + f"patch_ib({1})%y_centroid": 15.0 * D, + f"patch_ib({1})%z_centroid": 15.0 * D, + f"patch_ib({1})%radius": D / 2, + f"patch_ib({1})%slip": "F", + }) + +# Configuring case dictionary +case_dict = { + # Logistics + "run_time_info": "T", + # Computational Domain Parameters + # x direction + "x_domain%beg": 5.0 * D, + "x_domain%end": 15.0 * D, + # y direction + "y_domain%beg": 5.0 * D, + "y_domain%end": 15.0 * D, + # z direction + "z_domain%beg": 5.0 * D, + "z_domain%end": 15.0 * D, + "cyl_coord": "F", + "m": Nx, + "n": Ny, + "p": Nz, + "dt": dt, + "t_step_start": 0, + "t_step_stop": Nt, # 3000 + "t_step_save": t_save, # 10 + # Simulation Algorithm Parameters + # Only one patches are necessary, the air tube + "num_patches": 1, + # Use the 5 equation model + "model_eqns": 2, + # 6 equations model does not need the K \div(u) term + "alt_soundspeed": "F", + # One fluids: air + "num_fluids": 1, + # time step + "mpp_lim": "F", + # Correct errors when computing speed of sound + "mixture_err": "T", + # Use TVD RK3 for time marching + "time_stepper": 3, + # Reconstruct the primitive variables to minimize spurious + # Use WENO5 + "weno_order": 5, + "weno_eps": 1.0e-14, + "weno_Re_flux": "T", + "weno_avg": "T", + "avg_state": 2, + "mapped_weno": "T", + "null_weights": "F", + "mp_weno": "T", + "riemann_solver": 2, + "low_Mach": 1, + "wave_speeds": 1, + # periodic bc + "bc_x%beg": -1, + "bc_x%end": -1, + "bc_y%beg": -1, + "bc_y%end": -1, + "bc_z%beg": -1, + "bc_z%end": -1, + # Set IB to True and add 1 patch + "ib": "T", + "num_ibs": 1, + "viscous": "T", + # Formatted Database Files Structure Parameters + "format": 1, + "precision": 2, + "prim_vars_wrt": "T", + "E_wrt": "T", + "parallel_io": "T", + + "patch_icpp(1)%geometry": 9, + "patch_icpp(1)%x_centroid": 10.0*D, + # Uniform medium density, centroid is at the center of the domain + "patch_icpp(1)%y_centroid": 10.0*D, + "patch_icpp(1)%z_centroid": 10.0*D, + "patch_icpp(1)%length_x": 10 * D, + "patch_icpp(1)%length_y": 10 * D, + "patch_icpp(1)%length_z": 10 * D, + # Specify the patch primitive variables + "patch_icpp(1)%vel(1)": v1, + "patch_icpp(1)%vel(2)": 0.0e00, + "patch_icpp(1)%vel(3)": 0.0e00, + "patch_icpp(1)%pres": P, + "patch_icpp(1)%alpha_rho(1)": rho, + "patch_icpp(1)%alpha(1)": 1.0e00, + # Patch: Sphere Immersed Boundary + # Fluids Physical Parameters + "fluid_pp(1)%gamma": 1.0e00 / (gam_a - 1.0e00), # 2.50(Not 1.40) + "fluid_pp(1)%pi_inf": 0, + "fluid_pp(1)%Re(1)": 1.0 / mu, + + # new case additions + "periodic_forcing": "T", + 
"periodic_ibs": "T", + + "u_inf_ref": v1, + "rho_inf_ref": rho, + "T_inf_ref": T, + } + +case_dict.update(ib_dict) + +print(json.dumps(case_dict)) diff --git a/runs/phi01/case.py b/runs/phi01/case.py new file mode 100644 index 0000000000..c67369d9c7 --- /dev/null +++ b/runs/phi01/case.py @@ -0,0 +1,168 @@ +import json +import math +import numpy as np + + +gam_a = 1.4 + +D = 0.1 +L = 10 * D + +M = 0.8 +Re = 1500.0 + +P = 101325 +rho = 1.225 + +v1 = M * np.sqrt(gam_a * P / rho) +mu = rho * v1 * D / Re + +#print('mu: ', mu) +#print('v1: ', v1) +#print('rho: ', rho) +#print('Kn = ' + str( np.sqrt(np.pi*gam_a/2)*(M/Re) )) # Kn < 0.01 = continuum flow + +dt = 5.0E-06 +Nt = 200 #int(1 * L / v1 / dt) +t_save = Nt//5 +t_step_start_stats = Nt//2 + +Nx = 199 +Ny = Nx +Nz = Ny + +# load initial sphere locations +sphere_loc = np.loadtxt('sphere_array_locations.txt') +N_sphere = len(sphere_loc) + +# immersed boundary dictionary +ib_dict = {} +for i in range(N_sphere): + ib_dict.update({ + f"patch_ib({i+1})%geometry": 8, + f"patch_ib({i+1})%x_centroid": sphere_loc[i, 0], + f"patch_ib({i+1})%y_centroid": sphere_loc[i, 1], + f"patch_ib({i+1})%z_centroid": sphere_loc[i, 2], + f"patch_ib({i+1})%radius": D / 2, + f"patch_ib({i+1})%slip": "F", + }) + +# ib_dict.update({ +# f"patch_ib({1})%geometry": 8, +# f"patch_ib({1})%x_centroid": sphere_loc[20, 0], +# f"patch_ib({1})%y_centroid": sphere_loc[20, 1], +# f"patch_ib({1})%z_centroid": sphere_loc[20, 2], +# f"patch_ib({1})%radius": D / 2, +# f"patch_ib({1})%slip": "F", +# }) + +# Configuring case dictionary +case_dict = { + # Logistics + "run_time_info": "T", + # Computational Domain Parameters + # x direction + "x_domain%beg": -5.0 * D, + "x_domain%end": 5.0 * D, + # y direction + "y_domain%beg": -5.0 * D, + "y_domain%end": 5.0 * D, + # z direction + "z_domain%beg": -5.0 * D, + "z_domain%end": 5.0 * D, + "cyl_coord": "F", + "m": Nx, + "n": Ny, + "p": Nz, + "dt": dt, + "t_step_start": 0, + "t_step_stop": Nt, + "t_step_save": t_save, + "t_step_stat_start": t_step_start_stats, + # Simulation Algorithm Parameters + # Only one patches are necessary, the air tube + "num_patches": 1, + # Use the 5 equation model + "model_eqns": 2, + # 6 equations model does not need the K \div(u) term + "alt_soundspeed": "F", + # One fluids: air + "num_fluids": 1, + # time step + "mpp_lim": "F", + # Correct errors when computing speed of sound + "mixture_err": "T", + # Use TVD RK3 for time marching + "time_stepper": 3, + # Reconstruct the primitive variables to minimize spurious + # Use WENO5 + "weno_order": 5, + "weno_eps": 1.0e-14, + "weno_Re_flux": "T", + "weno_avg": "T", + "avg_state": 2, + "mapped_weno": "T", + "null_weights": "F", + "mp_weno": "T", + "riemann_solver": 2, + "low_Mach": 1, + "wave_speeds": 1, + # periodic bc + "bc_x%beg": -1, + "bc_x%end": -1, + "bc_y%beg": -1, + "bc_y%end": -1, + "bc_z%beg": -1, + "bc_z%end": -1, + # Set IB to True and add 1 patch + "ib": "T", + "num_ibs": N_sphere, + "viscous": "T", + # Formatted Database Files Structure Parameters + "format": 1, + "precision": 2, + "prim_vars_wrt": "T", + "E_wrt": "T", + "q_filtered_wrt": "T", + "parallel_io": "T", + # Patch: Constant Tube filled with air + # Specify the cylindrical air tube grid geometry + "patch_icpp(1)%geometry": 9, + "patch_icpp(1)%x_centroid": 0.0, + # Uniform medium density, centroid is at the center of the domain + "patch_icpp(1)%y_centroid": 0.0, + "patch_icpp(1)%z_centroid": 0.0, + "patch_icpp(1)%length_x": 10 * D, + "patch_icpp(1)%length_y": 10 * D, + "patch_icpp(1)%length_z": 10 * D, 
+ # Specify the patch primitive variables + "patch_icpp(1)%vel(1)": v1, + "patch_icpp(1)%vel(2)": 0.0e00, + "patch_icpp(1)%vel(3)": 0.0e00, + "patch_icpp(1)%pres": P, + "patch_icpp(1)%alpha_rho(1)": rho, + "patch_icpp(1)%alpha(1)": 1.0e00, + # Patch: Sphere Immersed Boundary + # Fluids Physical Parameters + "fluid_pp(1)%gamma": 1.0e00 / (gam_a - 1.0e00), # 2.50(Not 1.40) + "fluid_pp(1)%pi_inf": 0, + "fluid_pp(1)%Re(1)": 1.0 / mu, + + # new case additions + "periodic_forcing": "T", + "periodic_ibs": "T", + "volume_filtering_momentum_eqn": "T", + "filter_width": 3.0*D/2 * np.sqrt(2/(9*np.pi)), + "compute_particle_drag": "T", + + "u_inf_ref": v1, + "rho_inf_ref": rho, + "P_inf_ref": P, + + "store_levelset": "F", + "slab_domain_decomposition": "T", + } + +case_dict.update(ib_dict) + +print(json.dumps(case_dict)) diff --git a/runs/phi01/sphere_array_locations.txt b/runs/phi01/sphere_array_locations.txt new file mode 100644 index 0000000000..047707ef90 --- /dev/null +++ b/runs/phi01/sphere_array_locations.txt @@ -0,0 +1,190 @@ +-8.877599239349365234e-02 1.935560703277587891e-01 -6.486654281616210938e-02 +-3.341052532196044922e-01 4.142935276031494141e-01 -4.567451477050781250e-01 +2.565863132476806641e-01 -4.949223995208740234e-02 -4.442641735076904297e-01 +3.103950023651123047e-01 -2.099078893661499023e-01 -4.642441272735595703e-01 +-3.521966934204101562e-02 -1.745276451110839844e-01 -3.202521800994873047e-01 +-1.949143409729003906e-02 -1.775810718536376953e-01 -3.603804111480712891e-02 +-1.835894584655761719e-01 3.262339830398559570e-01 -3.085057735443115234e-01 +-1.445159912109375000e-01 1.513528823852539062e-01 -2.023205757141113281e-01 +-4.898538589477539062e-01 -4.509705305099487305e-01 -1.682095527648925781e-01 +3.143328428268432617e-01 4.728571176528930664e-01 1.526627540588378906e-01 +1.280879974365234375e-01 1.239399909973144531e-01 -3.574787378311157227e-01 +-1.123933792114257812e-01 -3.207942247390747070e-01 9.310150146484375000e-02 +-1.386029720306396484e-01 -1.205575466156005859e-02 2.014696598052978516e-01 +-2.808933258056640625e-01 3.925647735595703125e-01 2.450205087661743164e-01 +4.294252395629882812e-02 2.894115447998046875e-01 -2.536165714263916016e-02 +1.801455020904541016e-01 5.933284759521484375e-02 4.247887134552001953e-01 +1.872421503067016602e-01 3.063344955444335938e-02 8.561480045318603516e-02 +2.484493255615234375e-01 -4.173127412796020508e-01 3.008729219436645508e-01 +8.203792572021484375e-02 1.318891048431396484e-01 -1.190292835235595703e-02 +-4.555282592773437500e-01 -3.696656227111816406e-01 2.237200736999511719e-02 +-1.931151151657104492e-01 5.374908447265625000e-02 5.545830726623535156e-02 +-6.292748451232910156e-02 1.790912151336669922e-01 1.174246072769165039e-01 +-2.316267490386962891e-01 -4.075572490692138672e-01 4.597637653350830078e-01 +-3.437596559524536133e-01 4.005973339080810547e-01 -2.290433645248413086e-01 +-1.910818815231323242e-01 -4.736427068710327148e-01 -2.076803445816040039e-01 +-4.528397321701049805e-01 7.907927036285400391e-02 3.940449953079223633e-01 +1.893968582153320312e-01 4.864903688430786133e-01 -3.449935913085937500e-01 +7.300472259521484375e-02 -3.667246103286743164e-01 3.762015104293823242e-01 +-1.821663379669189453e-01 -4.775607585906982422e-02 3.386561870574951172e-01 +5.136466026306152344e-02 4.852104187011718750e-01 -4.752502441406250000e-01 +-3.295025825500488281e-01 -5.519819259643554688e-02 5.781412124633789062e-02 +4.343043565750122070e-01 2.689909934997558594e-01 3.341940641403198242e-01 +-3.969779014587402344e-01 
-2.916865348815917969e-01 -1.138211488723754883e-01 +-4.619355201721191406e-01 2.032375335693359375e-02 -1.161878108978271484e-01 +7.124900817871093750e-03 1.223111152648925781e-02 4.087531566619873047e-01 +-3.908715248107910156e-01 1.400717496871948242e-01 2.354013919830322266e-02 +-1.070375442504882812e-01 3.122891187667846680e-01 2.600712776184082031e-01 +4.667922258377075195e-01 -2.228868007659912109e-01 2.890402078628540039e-01 +9.751558303833007812e-03 3.652515411376953125e-01 1.688425540924072266e-01 +-7.598793506622314453e-02 1.410543918609619141e-02 -6.586468219757080078e-02 +-3.012117147445678711e-01 -1.333975791931152344e-02 -2.475223541259765625e-01 +1.425679922103881836e-01 -1.594284772872924805e-01 4.271366596221923828e-01 +-3.488619327545166016e-01 3.043293952941894531e-01 1.312527656555175781e-01 +1.347296237945556641e-01 -2.548012733459472656e-01 2.497346401214599609e-01 +1.558208465576171875e-01 -1.695448160171508789e-01 8.221673965454101562e-02 +2.994102239608764648e-01 -2.616212368011474609e-01 3.708097934722900391e-01 +4.749594926834106445e-01 4.012154340744018555e-01 -1.113747358322143555e-01 +4.658288955688476562e-01 -2.405116558074951172e-01 -4.019365310668945312e-01 +-4.477721452713012695e-01 1.802740097045898438e-01 2.297303676605224609e-01 +2.828998565673828125e-01 3.781812191009521484e-01 -4.897345304489135742e-01 +-1.556029319763183594e-01 -1.499896049499511719e-01 -1.702260971069335938e-01 +-2.203900814056396484e-01 4.228965044021606445e-01 3.943344354629516602e-01 +-7.529938220977783203e-02 -4.034370183944702148e-01 -4.895013570785522461e-01 +-2.633322477340698242e-01 2.260003089904785156e-01 3.617374897003173828e-01 +-2.043257951736450195e-01 -2.201197147369384766e-01 4.399769306182861328e-01 +2.097340822219848633e-01 -3.915596008300781250e-02 -2.276867628097534180e-01 +-1.167770624160766602e-01 4.129269123077392578e-01 -4.588322639465332031e-01 +3.195565938949584961e-01 2.821329832077026367e-01 2.030262947082519531e-01 +4.332208633422851562e-02 2.999825477600097656e-01 -2.426314353942871094e-01 +-2.900393009185791016e-01 7.278752326965332031e-02 3.351804018020629883e-01 +-3.045821189880371094e-02 -1.478650569915771484e-01 3.491390943527221680e-01 +-2.793753147125244141e-02 -1.773738861083984375e-01 1.675630807876586914e-01 +-3.188729286193847656e-01 -4.904426336288452148e-01 -6.549203395843505859e-02 +-4.071967601776123047e-01 -1.066761016845703125e-01 -4.441113471984863281e-01 +4.105618000030517578e-01 -3.848595619201660156e-01 1.863635778427124023e-01 +-1.051111221313476562e-01 -7.725274562835693359e-02 -4.898943901062011719e-01 +3.737279176712036133e-01 1.056033372879028320e-01 4.786680936813354492e-01 +2.511825561523437500e-01 -3.347592353820800781e-01 1.227176189422607422e-01 +-3.208853006362915039e-01 -1.442481279373168945e-01 -9.813189506530761719e-02 +3.365310430526733398e-01 -4.063715934753417969e-01 -4.750763177871704102e-01 +-3.066674470901489258e-01 -2.005393505096435547e-01 -2.603935003280639648e-01 +4.633438587188720703e-02 -3.628603219985961914e-01 -3.448045253753662109e-01 +-1.228909492492675781e-01 4.968223571777343750e-01 1.755017042160034180e-01 +4.529950618743896484e-01 -4.122850894927978516e-01 3.542938232421875000e-01 +3.015396595001220703e-01 6.062459945678710938e-02 -5.255222320556640625e-02 +7.875204086303710938e-02 -3.220939636230468750e-01 2.097034454345703125e-02 +-3.075191974639892578e-01 -4.913786649703979492e-01 1.174443960189819336e-01 +-2.157187461853027344e-01 -1.293109655380249023e-01 -3.813669681549072266e-01 
+-2.569644451141357422e-01 -4.775856733322143555e-01 -3.842570781707763672e-01 +3.374536037445068359e-01 2.595454454421997070e-01 -1.862519979476928711e-01 +-2.484831809997558594e-01 1.898849010467529297e-02 -1.008712053298950195e-01 +-3.550199270248413086e-01 -3.802776336669921875e-03 2.112603187561035156e-01 +-4.047393798828125000e-02 -3.331716060638427734e-01 -1.580150127410888672e-01 +2.301404476165771484e-01 1.020783185958862305e-01 2.300353050231933594e-01 +-4.886188507080078125e-01 -4.335124492645263672e-01 -3.716624975204467773e-01 +3.109852075576782227e-01 -3.871500492095947266e-02 1.583197116851806641e-01 +4.864922761917114258e-01 -2.506246566772460938e-01 4.611170291900634766e-01 +4.114500284194946289e-01 -2.497513294219970703e-01 8.945560455322265625e-02 +-2.041511535644531250e-01 -3.061387538909912109e-01 -1.002895832061767578e-01 +-3.356888294219970703e-01 -2.898548841476440430e-01 -4.294934272766113281e-01 +6.349623203277587891e-02 -4.237914085388183594e-01 1.809575557708740234e-01 +1.638014316558837891e-01 -3.412141799926757812e-01 -4.808696508407592773e-01 +4.292991161346435547e-01 -7.350444793701171875e-02 4.452385902404785156e-01 +-2.837867736816406250e-01 2.394533157348632812e-02 -4.843814373016357422e-01 +-2.125334739685058594e-01 1.921176910400390625e-01 -2.379369735717773438e-02 +1.759276390075683594e-01 4.892826080322265625e-01 4.419517517089843750e-01 +-4.233963489532470703e-01 7.077014446258544922e-02 -3.061563968658447266e-01 +-3.712041378021240234e-01 4.946417212486267090e-01 3.635656833648681641e-01 +-4.665093421936035156e-01 4.070787429809570312e-01 -3.274630308151245117e-01 +3.692833185195922852e-01 -8.178091049194335938e-02 -1.193681955337524414e-01 +6.124496459960937500e-03 -2.011668682098388672e-02 8.408391475677490234e-02 +-1.337385177612304688e-02 -2.435498237609863281e-01 -4.735767841339111328e-01 +2.590975761413574219e-01 -3.270063400268554688e-01 -5.099523067474365234e-02 +3.800438642501831055e-01 4.123662710189819336e-01 -3.175902366638183594e-01 +2.355668544769287109e-01 2.839933633804321289e-01 -3.255009651184082031e-01 +-4.340230226516723633e-01 -4.109045267105102539e-01 4.977314472198486328e-01 +2.350783348083496094e-02 -7.954597473144531250e-02 -2.089430093765258789e-01 +2.528522014617919922e-01 2.231028079986572266e-01 -4.818900823593139648e-01 +3.285017013549804688e-01 -1.968045234680175781e-01 2.016012668609619141e-01 +3.276336193084716797e-01 3.824212551116943359e-01 -2.195405960083007812e-02 +4.347554445266723633e-01 -1.944565773010253906e-02 -3.952792882919311523e-01 +-2.355787754058837891e-01 2.512185573577880859e-01 -4.705796241760253906e-01 +2.304534912109375000e-01 2.335491180419921875e-01 3.436188697814941406e-01 +4.291563034057617188e-01 2.084137201309204102e-01 -3.515939712524414062e-01 +4.610210657119750977e-01 2.877938747406005859e-01 9.413146972656250000e-02 +3.239741325378417969e-01 4.200505018234252930e-01 3.377312421798706055e-01 +-4.339945316314697266e-01 -1.799043416976928711e-01 1.667797565460205078e-01 +4.162905216217041016e-01 -2.838604450225830078e-01 -1.204760074615478516e-01 +4.708716869354248047e-01 4.452165365219116211e-01 4.702655076980590820e-01 +3.935134410858154297e-01 -4.494274854660034180e-01 -1.000511646270751953e-02 +-3.325940370559692383e-01 -3.989632129669189453e-01 -2.595729827880859375e-01 +-4.726890325546264648e-01 -1.577985286712646484e-01 -2.004265785217285156e-02 +-2.578830718994140625e-01 1.816778182983398438e-01 1.800514459609985352e-01 +2.873079776763916016e-01 -1.582661867141723633e-01 
1.000881195068359375e-03 +1.284685134887695312e-01 -2.347108125686645508e-01 -1.527856588363647461e-01 +-4.975929260253906250e-01 4.154947996139526367e-01 2.424190044403076172e-01 +1.319632530212402344e-01 2.181564569473266602e-01 1.456822156906127930e-01 +4.251360893249511719e-02 5.486690998077392578e-02 2.446963787078857422e-01 +5.265474319458007812e-03 -4.930623769760131836e-01 1.795315742492675781e-02 +3.435378074645996094e-01 -1.437039375305175781e-01 -2.955729961395263672e-01 +-1.589361429214477539e-01 3.439151048660278320e-01 -1.269352436065673828e-01 +-2.996790409088134766e-01 -2.977983951568603516e-01 5.047678947448730469e-02 +1.387677192687988281e-01 -4.051816463470458984e-02 -6.590497493743896484e-02 +-4.859859943389892578e-01 4.686148166656494141e-01 6.054759025573730469e-02 +3.058031797409057617e-01 -4.722125530242919922e-01 -1.649188995361328125e-01 +3.712953329086303711e-01 -3.612419366836547852e-01 -2.953444719314575195e-01 +-2.350592613220214844e-01 1.253683567047119141e-01 -3.582476377487182617e-01 +-4.282865524291992188e-01 -3.783413171768188477e-01 1.956710815429687500e-01 +-1.545268297195434570e-01 -3.127627372741699219e-01 -3.272031545639038086e-01 +2.250815629959106445e-01 -3.367059230804443359e-01 -2.811298370361328125e-01 +-5.611097812652587891e-02 2.276177406311035156e-01 -3.761705160140991211e-01 +1.843569278717041016e-01 3.698165416717529297e-01 -1.461877822875976562e-01 +-3.651070594787597656e-01 3.224494457244873047e-01 -4.502046108245849609e-02 +-1.052534580230712891e-01 1.394950151443481445e-01 3.169180154800415039e-01 +-7.266819477081298828e-02 -3.203969001770019531e-01 3.076763153076171875e-01 +-1.534210443496704102e-01 -1.421678066253662109e-02 -2.598439455032348633e-01 +4.644811153411865234e-01 -2.855896949768066406e-02 6.111550331115722656e-02 +1.615400314331054688e-01 4.353706836700439453e-01 2.680056095123291016e-01 +-4.789991378784179688e-01 -2.737338542938232422e-01 -2.684531211853027344e-01 +-4.801630973815917969e-01 -1.131765842437744141e-01 -2.253174781799316406e-01 +4.725518226623535156e-01 2.924776077270507812e-01 -4.712775945663452148e-01 +3.934500217437744141e-01 6.538939476013183594e-02 -2.147150039672851562e-01 +5.674338340759277344e-02 1.684566736221313477e-01 4.750093221664428711e-01 +-3.127444982528686523e-01 1.864537000656127930e-01 -1.828011274337768555e-01 +-6.377077102661132812e-02 3.063268661499023438e-01 4.461523294448852539e-01 +-2.393376827239990234e-01 -2.101924419403076172e-01 2.160568237304687500e-01 +-4.714767932891845703e-01 2.386778593063354492e-01 -1.962506771087646484e-01 +-4.175131320953369141e-01 1.262202262878417969e-01 -4.906876087188720703e-01 +1.526114940643310547e-01 -1.855427026748657227e-01 -3.443827629089355469e-01 +6.579875946044921875e-02 -4.886317253112792969e-02 -4.445745944976806641e-01 +1.098661422729492188e-01 3.471816778182983398e-01 4.010045528411865234e-01 +2.641906738281250000e-01 -2.310740947723388672e-01 -1.801049709320068359e-01 +2.215981483459472656e-02 1.125121116638183594e-01 -2.007805109024047852e-01 +4.692313671112060547e-01 -3.348422050476074219e-02 2.421901226043701172e-01 +3.015110492706298828e-01 -7.356131076812744141e-02 3.514482975006103516e-01 +-3.965770006179809570e-01 2.962644100189208984e-01 3.929857015609741211e-01 +1.106926202774047852e-01 -4.377689361572265625e-01 -1.675007343292236328e-01 +1.297621726989746094e-01 -8.046376705169677734e-02 2.488052845001220703e-01 +1.898322105407714844e-01 1.719188690185546875e-01 -1.696370840072631836e-01 +4.060682058334350586e-01 
1.258714199066162109e-01 1.274476051330566406e-01 +1.603732109069824219e-01 3.966591358184814453e-01 6.766164302825927734e-02 +5.054616928100585938e-02 2.127890586853027344e-01 3.031399250030517578e-01 +-1.690447330474853516e-01 -1.416635513305664062e-01 3.728961944580078125e-02 +-1.341120004653930664e-01 1.080242395401000977e-01 4.635136127471923828e-01 +-2.457776069641113281e-01 -3.851659297943115234e-01 2.513883113861083984e-01 +-1.634557247161865234e-01 -4.583904743194580078e-01 -2.824854850769042969e-02 +-1.784324645996093750e-03 4.497978687286376953e-01 -1.161942481994628906e-01 +4.503953456878662109e-01 1.885429620742797852e-01 -4.877877235412597656e-02 +2.600491046905517578e-01 2.236571311950683594e-01 2.091717720031738281e-02 +-3.822712898254394531e-01 2.547247409820556641e-01 -3.687927722930908203e-01 +-3.667194843292236328e-01 -1.171383857727050781e-01 3.846424818038940430e-01 +-3.668913841247558594e-01 -2.955377101898193359e-01 3.536789417266845703e-01 +2.957736253738403320e-01 8.799576759338378906e-02 -3.451507091522216797e-01 +-1.604117155075073242e-01 3.587515354156494141e-01 5.187714099884033203e-02 +1.919094324111938477e-01 -4.781463146209716797e-01 4.655241966247558594e-03 +-3.640174865722656250e-02 4.754726886749267578e-01 -2.942006587982177734e-01 +-6.335353851318359375e-02 4.125511646270751953e-02 -3.732511997222900391e-01 +9.152126312255859375e-02 3.327772617340087891e-01 -4.209873676300048828e-01 +-5.436992645263671875e-02 4.926524162292480469e-01 3.434299230575561523e-01 +3.771104812622070312e-01 9.526658058166503906e-02 2.973334789276123047e-01 diff --git a/src/common/m_boundary_common.fpp b/src/common/m_boundary_common.fpp index f89278a86d..4c01b10f22 100644 --- a/src/common/m_boundary_common.fpp +++ b/src/common/m_boundary_common.fpp @@ -35,6 +35,7 @@ module m_boundary_common private; public :: s_initialize_boundary_common_module, & s_populate_variables_buffers, & + s_populate_scalarfield_buffers, & s_create_mpi_types, & s_populate_capillary_buffers, & s_populate_F_igr_buffers, & @@ -276,6 +277,109 @@ contains end subroutine s_populate_variables_buffers + !> The purpose of this procedure is to populate the buffers + !! of any scalarfield variable, currently only with periodic boundary conditions + impure subroutine s_populate_scalarfield_buffers(bc_type, q_temp) + + type(scalar_field), intent(inout) :: q_temp + type(integer_field), dimension(1:num_dims, -1:1), intent(in) :: bc_type + + integer :: k, l + + ! Population of Buffers in x-direction + if (bc_x%beg >= 0) then + call s_mpi_sendrecv_variables_buffers_scalarfield(q_temp, 1, -1) + else + $:GPU_PARALLEL_LOOP(collapse=2) + do l = 0, p + do k = 0, n + select case (int(bc_type(1, -1)%sf(0, k, l))) + case (BC_PERIODIC) + call s_periodic_scalarfield(q_temp, 1, -1, k, l) + end select + end do + end do + end if + + if (bc_x%end >= 0) then + call s_mpi_sendrecv_variables_buffers_scalarfield(q_temp, 1, 1) + else + $:GPU_PARALLEL_LOOP(collapse=2) + do l = 0, p + do k = 0, n + select case (int(bc_type(1, 1)%sf(0, k, l))) + case (BC_PERIODIC) + call s_periodic_scalarfield(q_temp, 1, 1, k, l) + end select + end do + end do + end if + + ! 
Population of Buffers in y-direction + if (n == 0) return + + if (bc_y%beg >= 0) then + call s_mpi_sendrecv_variables_buffers_scalarfield(q_temp, 2, -1) + else + $:GPU_PARALLEL_LOOP(collapse=2) + do l = 0, p + do k = -buff_size, m + buff_size + select case (int(bc_type(2, -1)%sf(k, 0, l))) + case (BC_PERIODIC) + call s_periodic_scalarfield(q_temp, 2, -1, k, l) + end select + end do + end do + end if + + if (bc_y%end >= 0) then + call s_mpi_sendrecv_variables_buffers_scalarfield(q_temp, 2, 1) + else + $:GPU_PARALLEL_LOOP(collapse=2) + do l = 0, p + do k = -buff_size, m + buff_size + select case (int(bc_type(2, 1)%sf(k, 0, l))) + case (BC_PERIODIC) + call s_periodic_scalarfield(q_temp, 2, 1, k, l) + end select + end do + end do + end if + + ! Population of Buffers in z-direction + + if (p == 0) return + if (bc_z%beg >= 0) then + call s_mpi_sendrecv_variables_buffers_scalarfield(q_temp, 3, -1) + else + $:GPU_PARALLEL_LOOP(collapse=2) + do l = -buff_size, n + buff_size + do k = -buff_size, m + buff_size + select case (int(bc_type(3, -1)%sf(k, l, 0))) + case (BC_PERIODIC) + call s_periodic_scalarfield(q_temp, 3, -1, k, l) + end select + end do + end do + end if + + if (bc_z%end >= 0) then + call s_mpi_sendrecv_variables_buffers_scalarfield(q_temp, 3, 1) + else + $:GPU_PARALLEL_LOOP(collapse=2) + do l = -buff_size, n + buff_size + do k = -buff_size, m + buff_size + select case (int(bc_type(3, 1)%sf(k, l, 0))) + case (BC_PERIODIC) + call s_periodic_scalarfield(q_temp, 3, 1, k, l) + end select + end do + end do + end if + ! END: Population of Buffers in z-direction + + end subroutine s_populate_scalarfield_buffers + pure subroutine s_ghost_cell_extrapolation(q_prim_vf, bc_dir, bc_loc, k, l) $:GPU_ROUTINE(function_name='s_ghost_cell_extrapolation', & & parallelism='[seq]', cray_inline=True) @@ -736,6 +840,61 @@ contains end subroutine s_periodic + !< Populate periodic boundaries for arbitrary scalarfield quantity + pure subroutine s_periodic_scalarfield(q_temp, bc_dir, bc_loc, k, l) + $:GPU_ROUTINE(parallelism='[seq]') + type(scalar_field), intent(inout) :: q_temp + integer, intent(in) :: bc_dir, bc_loc + integer, intent(in) :: k, l + + integer :: j, q + + if (bc_dir == 1) then !< x-direction + if (bc_loc == -1) then !< bc_x%beg + do j = 1, buff_size + q_temp%sf(-j, k, l) = & + q_temp%sf(m - (j - 1), k, l) + end do + + else !< bc_x%end + do j = 1, buff_size + q_temp%sf(m + j, k, l) = & + q_temp%sf(j - 1, k, l) + end do + end if + + elseif (bc_dir == 2) then !< y-direction + if (bc_loc == -1) then !< bc_y%beg + do j = 1, buff_size + q_temp%sf(k, -j, l) = & + q_temp%sf(k, n - (j - 1), l) + end do + + else !< bc_y%end + do j = 1, buff_size + q_temp%sf(k, n + j, l) = & + q_temp%sf(k, j - 1, l) + end do + end if + + elseif (bc_dir == 3) then !< z-direction + if (bc_loc == -1) then !< bc_z%beg + do j = 1, buff_size + q_temp%sf(k, l, -j) = & + q_temp%sf(k, l, p - (j - 1)) + end do + + else !< bc_z%end + do j = 1, buff_size + q_temp%sf(k, l, p + j) = & + q_temp%sf(k, l, j - 1) + end do + + end if + end if + + end subroutine s_periodic_scalarfield + pure subroutine s_axis(q_prim_vf, pb_in, mv_in, k, l) $:GPU_ROUTINE(parallelism='[seq]') type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf diff --git a/src/common/m_checker_common.fpp b/src/common/m_checker_common.fpp index 1ec314536a..0a0d81e748 100644 --- a/src/common/m_checker_common.fpp +++ b/src/common/m_checker_common.fpp @@ -184,6 +184,11 @@ contains @:PROHIBIT(ib .and. n <= 0, "Immersed Boundaries do not work in 1D") @:PROHIBIT(ib .and. 
(num_ibs <= 0 .or. num_ibs > num_patches_max), "num_ibs must be between 1 and num_patches_max") @:PROHIBIT((.not. ib) .and. num_ibs > 0, "num_ibs is set, but ib is not enabled") + #:for X in ['x', 'y', 'z'] + #:for BOUND in ['beg', 'end'] + @:PROHIBIT(periodic_ibs .and. bc_${X}$%${BOUND}$ /= BC_PERIODIC, "periodic ibs requires periodic BCs, bc_${X}$%${BOUND}$ must = -1") + #:endfor + #:endfor end subroutine s_check_inputs_ibm #endif diff --git a/src/common/m_compute_levelset.fpp b/src/common/m_compute_levelset.fpp index cc473ea544..f6a7097533 100644 --- a/src/common/m_compute_levelset.fpp +++ b/src/common/m_compute_levelset.fpp @@ -442,7 +442,7 @@ contains end subroutine s_cuboid_levelset - pure subroutine s_sphere_levelset(ib_patch_id, levelset, levelset_norm) + impure subroutine s_sphere_levelset(ib_patch_id, levelset, levelset_norm) type(levelset_field), intent(INOUT), optional :: levelset type(levelset_norm_field), intent(INOUT), optional :: levelset_norm @@ -452,13 +452,50 @@ contains real(wp) :: x_centroid, y_centroid, z_centroid real(wp), dimension(3) :: dist_vec + real(wp) :: x_domain_beg, x_domain_end, y_domain_beg, y_domain_end, z_domain_beg, z_domain_end + real(wp) :: x_pcen, y_pcen, z_pcen !< periodically projected centroids of sphere + real(wp), dimension(7, 3) :: dist_vec_per + real(wp), dimension(7) :: dist_per + integer :: i, j, k !< Loop index variables + integer :: ierr radius = patch_ib(ib_patch_id)%radius x_centroid = patch_ib(ib_patch_id)%x_centroid y_centroid = patch_ib(ib_patch_id)%y_centroid z_centroid = patch_ib(ib_patch_id)%z_centroid + call s_mpi_allreduce_min(x_domain%beg, x_domain_beg) + call s_mpi_allreduce_max(x_domain%end, x_domain_end) + call s_mpi_allreduce_min(y_domain%beg, y_domain_beg) + call s_mpi_allreduce_max(y_domain%end, y_domain_end) + call s_mpi_allreduce_min(z_domain%beg, z_domain_beg) + call s_mpi_allreduce_max(z_domain%end, z_domain_end) + + if (periodic_ibs) then + if ((x_centroid - x_domain_beg) <= radius) then + x_pcen = x_domain_end + (x_centroid - x_domain_beg) + else if ((x_domain_end - x_centroid) <= radius) then + x_pcen = x_domain_beg - (x_domain_end - x_centroid) + else + x_pcen = x_centroid + end if + if ((y_centroid - y_domain_beg) <= radius) then + y_pcen = y_domain_end + (y_centroid - y_domain_beg) + else if ((y_domain_end - y_centroid) <= radius) then + y_pcen = y_domain_beg - (y_domain_end - y_centroid) + else + y_pcen = y_centroid + end if + if ((z_centroid - z_domain_beg) <= radius) then + z_pcen = z_domain_end + (z_centroid - z_domain_beg) + else if ((z_domain_end - z_centroid) <= radius) then + z_pcen = z_domain_beg - (z_domain_end - z_centroid) + else + z_pcen = z_centroid + end if + end if + do i = 0, m do j = 0, n do k = 0, p @@ -466,6 +503,73 @@ contains dist_vec(2) = y_cc(j) - y_centroid dist_vec(3) = z_cc(k) - z_centroid dist = sqrt(sum(dist_vec**2)) + + ! 
all permutations of periodically projected ib + if (periodic_ibs) then + dist_vec_per(1, 1) = x_cc(i) - x_pcen + dist_vec_per(1, 2) = y_cc(j) - y_pcen + dist_vec_per(1, 3) = z_cc(k) - z_pcen + dist_per(1) = sqrt(sum(dist_vec_per(1, :)**2)) + if (dist_per(1) < dist) then + dist = dist_per(1) + dist_vec = dist_vec_per(1, :) + end if + + dist_vec_per(2, 1) = x_cc(i) - x_pcen + dist_vec_per(2, 2) = y_cc(j) - y_centroid + dist_vec_per(2, 3) = z_cc(k) - z_pcen + dist_per(2) = sqrt(sum(dist_vec_per(2, :)**2)) + if (dist_per(2) < dist) then + dist = dist_per(2) + dist_vec = dist_vec_per(2, :) + end if + + dist_vec_per(3, 1) = x_cc(i) - x_pcen + dist_vec_per(3, 2) = y_cc(j) - y_pcen + dist_vec_per(3, 3) = z_cc(k) - z_centroid + dist_per(3) = sqrt(sum(dist_vec_per(3, :)**2)) + if (dist_per(3) < dist) then + dist = dist_per(3) + dist_vec = dist_vec_per(3, :) + end if + + dist_vec_per(4, 1) = x_cc(i) - x_pcen + dist_vec_per(4, 2) = y_cc(j) - y_centroid + dist_vec_per(4, 3) = z_cc(k) - z_centroid + dist_per(4) = sqrt(sum(dist_vec_per(4, :)**2)) + if (dist_per(4) < dist) then + dist = dist_per(4) + dist_vec = dist_vec_per(4, :) + end if + + dist_vec_per(5, 1) = x_cc(i) - x_centroid + dist_vec_per(5, 2) = y_cc(j) - y_pcen + dist_vec_per(5, 3) = z_cc(k) - z_pcen + dist_per(5) = sqrt(sum(dist_vec_per(5, :)**2)) + if (dist_per(5) < dist) then + dist = dist_per(5) + dist_vec = dist_vec_per(5, :) + end if + + dist_vec_per(6, 1) = x_cc(i) - x_centroid + dist_vec_per(6, 2) = y_cc(j) - y_pcen + dist_vec_per(6, 3) = z_cc(k) - z_centroid + dist_per(6) = sqrt(sum(dist_vec_per(6, :)**2)) + if (dist_per(6) < dist) then + dist = dist_per(6) + dist_vec = dist_vec_per(6, :) + end if + + dist_vec_per(7, 1) = x_cc(i) - x_centroid + dist_vec_per(7, 2) = y_cc(j) - y_centroid + dist_vec_per(7, 3) = z_cc(k) - z_pcen + dist_per(7) = sqrt(sum(dist_vec_per(7, :)**2)) + if (dist_per(7) < dist) then + dist = dist_per(7) + dist_vec = dist_vec_per(7, :) + end if + end if + levelset%sf(i, j, k, ib_patch_id) = dist - radius if (f_approx_equal(dist, 0._wp)) then levelset_norm%sf(i, j, k, ib_patch_id, :) = (/1, 0, 0/) diff --git a/src/common/m_constants.fpp b/src/common/m_constants.fpp index 114286f53b..d4d59b728a 100644 --- a/src/common/m_constants.fpp +++ b/src/common/m_constants.fpp @@ -21,7 +21,7 @@ module m_constants integer, parameter :: fourier_rings = 5 !< Fourier filter ring limit integer, parameter :: num_fluids_max = 10 !< Maximum number of fluids in the simulation integer, parameter :: num_probes_max = 10 !< Maximum number of flow probes in the simulation - integer, parameter :: num_patches_max = 10 + integer, parameter :: num_patches_max = 1000 integer, parameter :: num_bc_patches_max = 10 integer, parameter :: pathlen_max = 400 integer, parameter :: nnode = 4 !< Number of QBMM nodes diff --git a/src/common/m_ib_patches.fpp b/src/common/m_ib_patches.fpp index de1b1f1908..1070858111 100644 --- a/src/common/m_ib_patches.fpp +++ b/src/common/m_ib_patches.fpp @@ -557,6 +557,8 @@ contains ! Generic loop iterators integer :: i, j, k real(wp) :: radius + real(wp) :: x_domain_beg, x_domain_end, y_domain_beg, y_domain_end, z_domain_beg, z_domain_end + real(wp) :: x_pcen, y_pcen, z_pcen !! Variables to initialize the pressure field that corresponds to the !! bubble-collapse test case found in Tiwari et al. (2013) @@ -577,6 +579,38 @@ contains ! and verifying whether the current patch has permission to write to ! that cell. If both queries check out, the primitive variables of ! the current patch are assigned to this cell. 
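+        ! Each rank stores only its local sub-domain bounds, so min/max
+        ! allreduces recover the global domain extents needed below to
+        ! project sphere centroids across the periodic boundaries.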
+ call s_mpi_allreduce_min(x_domain%beg, x_domain_beg) + call s_mpi_allreduce_max(x_domain%end, x_domain_end) + call s_mpi_allreduce_min(y_domain%beg, y_domain_beg) + call s_mpi_allreduce_max(y_domain%end, y_domain_end) + call s_mpi_allreduce_min(z_domain%beg, z_domain_beg) + call s_mpi_allreduce_max(z_domain%end, z_domain_end) + + ! periodically projected sphere centroid + if (periodic_ibs) then + if ((x_centroid - x_domain_beg) <= radius) then + x_pcen = x_domain_end + (x_centroid - x_domain_beg) + else if ((x_domain_end - x_centroid) <= radius) then + x_pcen = x_domain_beg - (x_domain_end - x_centroid) + else + x_pcen = x_centroid + end if + if ((y_centroid - y_domain_beg) <= radius) then + y_pcen = y_domain_end + (y_centroid - y_domain_beg) + else if ((y_domain_end - y_centroid) <= radius) then + y_pcen = y_domain_beg - (y_domain_end - y_centroid) + else + y_pcen = y_centroid + end if + if ((z_centroid - z_domain_beg) <= radius) then + z_pcen = z_domain_end + (z_centroid - z_domain_beg) + else if ((z_domain_end - z_centroid) <= radius) then + z_pcen = z_domain_beg - (z_domain_end - z_centroid) + else + z_pcen = z_centroid + end if + end if + do k = 0, p do j = 0, n do i = 0, m @@ -592,6 +626,34 @@ contains + (cart_z - z_centroid)**2 <= radius**2)) then ib_markers_sf(i, j, k) = patch_id end if + + if (periodic_ibs) then ! check every permutation of the projected cell location + if (((x_cc(i) - x_pcen)**2 & + + (cart_y - y_pcen)**2 & + + (cart_z - z_pcen)**2 <= radius**2) & + .or. ((x_cc(i) - x_pcen)**2 & + + (cart_y - y_centroid)**2 & + + (cart_z - z_centroid)**2 <= radius**2) & + .or. ((x_cc(i) - x_pcen)**2 & + + (cart_y - y_pcen)**2 & + + (cart_z - z_centroid)**2 <= radius**2) & + .or. ((x_cc(i) - x_pcen)**2 & + + (cart_y - y_centroid)**2 & + + (cart_z - z_pcen)**2 <= radius**2) & + .or. ((x_cc(i) - x_centroid)**2 & + + (cart_y - y_pcen)**2 & + + (cart_z - z_centroid)**2 <= radius**2) & + .or. ((x_cc(i) - x_centroid)**2 & + + (cart_y - y_pcen)**2 & + + (cart_z - z_pcen)**2 <= radius**2) & + .or. ((x_cc(i) - x_centroid)**2 & + + (cart_y - y_centroid)**2 & + + (cart_z - z_pcen)**2 <= radius**2)) & + then + ! Updating the patch identities bookkeeping variable + ib_markers_sf(i, j, k) = patch_id + end if + end if end do end do end do diff --git a/src/common/m_mpi_common.fpp b/src/common/m_mpi_common.fpp index 0c705e908f..c5f926c51c 100644 --- a/src/common/m_mpi_common.fpp +++ b/src/common/m_mpi_common.fpp @@ -45,13 +45,23 @@ module m_mpi_common integer :: halo_size $:GPU_DECLARE(create='[halo_size]') + real(wp), private, allocatable, dimension(:), target :: buff_send_scalarfield + !! This variable is utilized to pack and send the buffer of any scalar field to neighboring processors + + real(wp), private, allocatable, dimension(:), target :: buff_recv_scalarfield + !! This variable is utilized to receive and unpack the buffer of any scalar field from neighboring processors + +#ifndef __NVCOMPILER_GPU_UNIFIED_MEM + $:GPU_DECLARE(create='[buff_send_scalarfield, buff_recv_scalarfield]') +#endif + contains !> The computation of parameters, the allocation of memory, !! the association of pointers and/or the execution of any !! other procedures that are necessary to setup the module. impure subroutine s_initialize_mpi_common_module - + integer :: halo_size_sf #ifdef MFC_MPI ! Allocating buff_send/recv and. Please note that for the sake of ! 
simplicity, both variables are provided sufficient storage to hold @@ -87,6 +97,23 @@ contains $:GPU_ENTER_DATA(create='[capture:buff_send]') $:GPU_ENTER_DATA(create='[capture:buff_recv]') #endif + +#ifdef MFC_SIMULATION + if (volume_filtering_momentum_eqn) then + halo_size_sf = nint(-1._wp + 1._wp*buff_size* & + & (m + 2*buff_size + 1)* & + & (n + 2*buff_size + 1)* & + & (p + 2*buff_size + 1)/ & + & (cells_bounds%mnp_min + 2*buff_size + 1)) +#ifndef __NVCOMPILER_GPU_UNIFIED_MEM + @:ALLOCATE(buff_send_scalarfield(0:halo_size_sf), buff_recv_scalarfield(0:halo_size_sf)) +#else + allocate (buff_send_scalarfield(0:halo_size_sf), buff_recv_scalarfield(0:halo_size_sf)) + $:GPU_ENTER_DATA(create='[capture:buff_send_scalarfield]') + $:GPU_ENTER_DATA(create='[capture:buff_recv_scalarfield]') +#endif + end if +#endif #endif end subroutine s_initialize_mpi_common_module @@ -207,14 +234,18 @@ contains #ifdef MFC_PRE_PROCESS MPI_IO_IB_DATA%var%sf => ib_markers%sf - MPI_IO_levelset_DATA%var%sf => levelset%sf - MPI_IO_levelsetnorm_DATA%var%sf => levelset_norm%sf + if (store_levelset) then + MPI_IO_levelset_DATA%var%sf => levelset%sf + MPI_IO_levelsetnorm_DATA%var%sf => levelset_norm%sf + end if #else MPI_IO_IB_DATA%var%sf => ib_markers%sf(0:m, 0:n, 0:p) #ifndef MFC_POST_PROCESS - MPI_IO_levelset_DATA%var%sf => levelset%sf(0:m, 0:n, 0:p, 1:num_ibs) - MPI_IO_levelsetnorm_DATA%var%sf => levelset_norm%sf(0:m, 0:n, 0:p, 1:num_ibs, 1:3) + if (store_levelset) then + MPI_IO_levelset_DATA%var%sf => levelset%sf(0:m, 0:n, 0:p, 1:num_ibs) + MPI_IO_levelsetnorm_DATA%var%sf => levelset_norm%sf(0:m, 0:n, 0:p, 1:num_ibs, 1:3) + end if #endif #endif @@ -277,6 +308,81 @@ contains end subroutine s_initialize_mpi_data + !! @param filtered_fluid_indicator_function volume filtered fluid indicator function + !! @param stat_q_cons_filtered 1-4 order statistical moments of volume filtered conservative variables + !! @param stat_filtered_pressure 1-4 order statistical moments of volume filtered pressure + !! @param stat_reynolds_stress 1-4 order statistics of reynolds stress tensor + !! @param stat_eff_visc 1-4 order statistics of unclosed effective viscosity tensor + !! @param stat_int_mom_exch 1-4 order statistics of interphase momentum exchange vector + impure subroutine s_initialize_mpi_data_filtered(filtered_fluid_indicator_function, & + stat_q_cons_filtered, stat_filtered_pressure, & + stat_reynolds_stress, stat_eff_visc, stat_int_mom_exch) + + type(scalar_field), intent(in) :: filtered_fluid_indicator_function + type(vector_field), dimension(5), intent(in) :: stat_q_cons_filtered + type(scalar_field), dimension(1:4), intent(in) :: stat_filtered_pressure + type(vector_field), dimension(1:9), intent(in) :: stat_reynolds_stress + type(vector_field), dimension(1:9), intent(in) :: stat_eff_visc + type(vector_field), dimension(1:3), intent(in) :: stat_int_mom_exch + + integer, dimension(num_dims) :: sizes_glb, sizes_loc + +#ifdef MFC_MPI + + ! Generic loop iterator + integer :: i, j + integer :: ierr !< Generic flag used to identify and report MPI errors + + !total system size + integer :: alt_sys + + alt_sys = sys_size + 1 + 9*4 + 9*4 + 3*4 + 6*4 ! 
109 + + MPI_IO_DATA%var(sys_size + 1)%sf => filtered_fluid_indicator_function%sf(0:m, 0:n, 0:p) + do i = 1, 9 + do j = 1, 4 + MPI_IO_DATA%var(sys_size + 1 + (i - 1)*4 + j)%sf => stat_reynolds_stress(i)%vf(j)%sf(0:m, 0:n, 0:p) + end do + end do + do i = 1, 9 + do j = 1, 4 + MPI_IO_DATA%var(sys_size + 37 + (i - 1)*4 + j)%sf => stat_eff_visc(i)%vf(j)%sf(0:m, 0:n, 0:p) + end do + end do + do i = 1, 3 + do j = 1, 4 + MPI_IO_DATA%var(sys_size + 73 + (i - 1)*4 + j)%sf => stat_int_mom_exch(i)%vf(j)%sf(0:m, 0:n, 0:p) + end do + end do + do i = 1, sys_size - 1 + do j = 1, 4 + MPI_IO_DATA%var(sys_size + 85 + (i - 1)*4 + j)%sf => stat_q_cons_filtered(i)%vf(j)%sf(0:m, 0:n, 0:p) + end do + end do + do i = 1, 4 + MPI_IO_DATA%var(sys_size + 105 + i)%sf => stat_filtered_pressure(i)%sf(0:m, 0:n, 0:p) + end do + + ! Define global(g) and local(l) sizes for flow variables + sizes_glb(1) = m_glb + 1; sizes_loc(1) = m + 1 + if (n > 0) then + sizes_glb(2) = n_glb + 1; sizes_loc(2) = n + 1 + if (p > 0) then + sizes_glb(3) = p_glb + 1; sizes_loc(3) = p + 1 + end if + end if + + ! Define the view for each variable + do i = sys_size, alt_sys + call MPI_TYPE_CREATE_SUBARRAY(num_dims, sizes_glb, sizes_loc, start_idx, & + MPI_ORDER_FORTRAN, mpi_p, MPI_IO_DATA%view(i), ierr) + call MPI_TYPE_COMMIT(MPI_IO_DATA%view(i), ierr) + end do + +#endif + + end subroutine s_initialize_mpi_data_filtered + !! @param q_cons_vf Conservative variables subroutine s_initialize_mpi_data_ds(q_cons_vf) @@ -1119,6 +1225,225 @@ contains end subroutine s_mpi_sendrecv_variables_buffers + !> The goal of this procedure is to populate the buffers of + !! any scalarfield variable by communicating + !! with the neighboring processors. + !! @param q_comm scalarfield variable + !! @param mpi_dir MPI communication coordinate direction + !! @param pbc_loc Processor boundary condition (PBC) location + subroutine s_mpi_sendrecv_variables_buffers_scalarfield(q_comm, & + mpi_dir, & + pbc_loc) + + type(scalar_field), intent(inout) :: q_comm + integer, intent(in) :: mpi_dir, pbc_loc + + integer :: i, j, k, l, r, q !< Generic loop iterators + + integer :: buffer_counts(1:3), buffer_count + + type(int_bounds_info) :: boundary_conditions(1:3) + integer :: beg_end(1:2), grid_dims(1:3) + integer :: dst_proc, src_proc, recv_tag, send_tag + + logical :: beg_end_geq_0 + + integer :: pack_offset, unpack_offset + +#ifdef MFC_MPI + integer :: ierr !< Generic flag used to identify and report MPI errors + + call nvtxStartRange("RHS-COMM-PACKBUF") + + buffer_counts = (/ & + buff_size*(n + 1)*(p + 1), & + buff_size*(m + 2*buff_size + 1)*(p + 1), & + buff_size*(m + 2*buff_size + 1)*(n + 2*buff_size + 1) & + /) + + buffer_count = buffer_counts(mpi_dir) + boundary_conditions = (/bc_x, bc_y, bc_z/) + beg_end = (/boundary_conditions(mpi_dir)%beg, boundary_conditions(mpi_dir)%end/) + beg_end_geq_0 = beg_end(max(pbc_loc, 0) - pbc_loc + 1) >= 0 + + ! Implements: + ! pbc_loc bc_x >= 0 -> [send/recv]_tag [dst/src]_proc + ! -1 (=0) 0 -> [1,0] [0,0] | 0 0 [1,0] [beg,beg] + ! -1 (=0) 1 -> [0,0] [1,0] | 0 1 [0,0] [end,beg] + ! +1 (=1) 0 -> [0,1] [1,1] | 1 0 [0,1] [end,end] + ! +1 (=1) 1 -> [1,1] [0,1] | 1 1 [1,1] [beg,end] + + send_tag = f_logical_to_int(.not. 
f_xor(beg_end_geq_0, pbc_loc == 1))
+        recv_tag = f_logical_to_int(pbc_loc == 1)
+
+        dst_proc = beg_end(1 + f_logical_to_int(f_xor(pbc_loc == 1, beg_end_geq_0)))
+        src_proc = beg_end(1 + f_logical_to_int(pbc_loc == 1))
+
+        grid_dims = (/m, n, p/)
+
+        pack_offset = 0
+        if (f_xor(pbc_loc == 1, beg_end_geq_0)) then
+            pack_offset = grid_dims(mpi_dir) - buff_size + 1
+        end if
+
+        unpack_offset = 0
+        if (pbc_loc == 1) then
+            unpack_offset = grid_dims(mpi_dir) + buff_size + 1
+        end if
+
+        ! Pack Buffer to Send
+        #:for mpi_dir in [1, 2, 3]
+            if (mpi_dir == ${mpi_dir}$) then
+                #:if mpi_dir == 1
+                    ! Packing buffer along x
+                    $:GPU_PARALLEL_LOOP(collapse=3,private='[r]')
+                    do l = 0, p
+                        do k = 0, n
+                            do j = 0, buff_size - 1
+                                r = (j + buff_size*(k + (n + 1)*l))
+                                buff_send_scalarfield(r) = q_comm%sf(j + pack_offset, k, l)
+                            end do
+                        end do
+                    end do
+
+                #:elif mpi_dir == 2
+                    ! Packing buffer along y
+                    $:GPU_PARALLEL_LOOP(collapse=3,private='[r]')
+                    do l = 0, p
+                        do k = 0, buff_size - 1
+                            do j = -buff_size, m + buff_size
+                                r = ((j + buff_size) + (m + 2*buff_size + 1)* &
+                                     (k + buff_size*l))
+                                buff_send_scalarfield(r) = q_comm%sf(j, k + pack_offset, l)
+                            end do
+                        end do
+                    end do
+
+                #:else
+                    ! Packing buffer along z
+                    $:GPU_PARALLEL_LOOP(collapse=3,private='[r]')
+                    do l = 0, buff_size - 1
+                        do k = -buff_size, n + buff_size
+                            do j = -buff_size, m + buff_size
+                                r = ((j + buff_size) + (m + 2*buff_size + 1)* &
+                                     ((k + buff_size) + (n + 2*buff_size + 1)*l))
+                                buff_send_scalarfield(r) = q_comm%sf(j, k, l + pack_offset)
+                            end do
+                        end do
+                    end do
+
+                #:endif
+            end if
+        #:endfor
+        call nvtxEndRange ! Packbuf
+
+        ! Send/Recv
+#ifdef MFC_SIMULATION
+        #:for rdma_mpi in [False, True]
+            if (rdma_mpi .eqv. ${'.true.' if rdma_mpi else '.false.'}$) then
+                #:if rdma_mpi
+                    #:call GPU_HOST_DATA(use_device='[buff_send_scalarfield, buff_recv_scalarfield]')
+                        call nvtxStartRange("RHS-COMM-SENDRECV-RDMA")
+
+                        call MPI_SENDRECV( &
+                            buff_send_scalarfield, buffer_count, mpi_p, dst_proc, send_tag, &
+                            buff_recv_scalarfield, buffer_count, mpi_p, src_proc, recv_tag, &
+                            MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
+
+                        call nvtxEndRange ! RHS-MPI-SENDRECV-(NO)-RDMA
+
+                    #:endcall GPU_HOST_DATA
+                    $:GPU_WAIT()
+                #:else
+                    call nvtxStartRange("RHS-COMM-DEV2HOST")
+                    $:GPU_UPDATE(host='[buff_send_scalarfield]')
+                    call nvtxEndRange
+                    call nvtxStartRange("RHS-COMM-SENDRECV-NO-RDMA")
+
+                    call MPI_SENDRECV( &
+                        buff_send_scalarfield, buffer_count, mpi_p, dst_proc, send_tag, &
+                        buff_recv_scalarfield, buffer_count, mpi_p, src_proc, recv_tag, &
+                        MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
+
+                    call nvtxEndRange ! RHS-MPI-SENDRECV-(NO)-RDMA
+
+                    call nvtxStartRange("RHS-COMM-HOST2DEV")
+                    $:GPU_UPDATE(device='[buff_recv_scalarfield]')
+                    call nvtxEndRange
+                #:endif
+            end if
+        #:endfor
+#else
+        call MPI_SENDRECV( &
+            buff_send_scalarfield, buffer_count, mpi_p, dst_proc, send_tag, &
+            buff_recv_scalarfield, buffer_count, mpi_p, src_proc, recv_tag, &
+            MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
+#endif
+
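+        ! Note (editorial): the scalar-field halo buffers reuse the flattened
+        ! packing order of buff_send/buff_recv in s_mpi_sendrecv_variables_buffers,
+        ! minus the variable index, so one exchange moves at most
+        ! halo_size_sf + 1 values for a single field.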
+        ! Unpack Received Buffer
+        call nvtxStartRange("RHS-COMM-UNPACKBUF")
+        #:for mpi_dir in [1, 2, 3]
+            if (mpi_dir == ${mpi_dir}$) then
+                #:if mpi_dir == 1
+                    ! Unpacking buffer along x
+                    $:GPU_PARALLEL_LOOP(collapse=3,private='[r]')
+                    do l = 0, p
+                        do k = 0, n
+                            do j = -buff_size, -1
+                                r = (j + buff_size*((k + 1) + (n + 1)*l))
+                                q_comm%sf(j + unpack_offset, k, l) = buff_recv_scalarfield(r)
+#if defined(__INTEL_COMPILER)
+                                if (ieee_is_nan(q_comm%sf(j + unpack_offset, k, l))) then
+                                    print *, "Error", j, k, l
+                                    error stop "NaN(s) in recv"
+                                end if
+#endif
+                            end do
+                        end do
+                    end do
+
+                #:elif mpi_dir == 2
+                    ! Unpacking buffer along y
+                    $:GPU_PARALLEL_LOOP(collapse=3,private='[r]')
+                    do l = 0, p
+                        do k = -buff_size, -1
+                            do j = -buff_size, m + buff_size
+                                r = ((j + buff_size) + (m + 2*buff_size + 1)* &
+                                     ((k + buff_size) + buff_size*l))
+                                q_comm%sf(j, k + unpack_offset, l) = buff_recv_scalarfield(r)
+#if defined(__INTEL_COMPILER)
+                                if (ieee_is_nan(q_comm%sf(j, k + unpack_offset, l))) then
+                                    print *, "Error", j, k, l
+                                    error stop "NaN(s) in recv"
+                                end if
+#endif
+                            end do
+                        end do
+                    end do
+
+                #:else
+                    ! Unpacking buffer along z
+                    $:GPU_PARALLEL_LOOP(collapse=3,private='[r]')
+                    do l = -buff_size, -1
+                        do k = -buff_size, n + buff_size
+                            do j = -buff_size, m + buff_size
+                                r = ((j + buff_size) + (m + 2*buff_size + 1)* &
+                                     ((k + buff_size) + (n + 2*buff_size + 1)* &
+                                      (l + buff_size)))
+                                q_comm%sf(j, k, l + unpack_offset) = buff_recv_scalarfield(r)
+#if defined(__INTEL_COMPILER)
+                                if (ieee_is_nan(q_comm%sf(j, k, l + unpack_offset))) then
+                                    print *, "Error", j, k, l
+                                    error stop "NaN(s) in recv"
+                                end if
+#endif
+                            end do
+                        end do
+                    end do
+
+                #:endif
+            end if
+        #:endfor
+        call nvtxEndRange
+#endif
+
+    end subroutine s_mpi_sendrecv_variables_buffers_scalarfield
+
     !> The purpose of this procedure is to optimally decompose
     !! the computational domain among the available processors.
     !! This is performed by attempting to award each processor,
@@ -1216,6 +1541,16 @@ contains
                 end if
             end do
+
+        ! Decompose domain into z-slabs; (p + 1) must divide evenly among the ranks
+        else if (slab_domain_decomposition) then
+            num_procs_x = 1
+            num_procs_y = 1
+            num_procs_z = num_procs
+            ierr = -1
+            if (mod((p + 1), num_procs_z) == 0) then
+                ierr = 0
+            end if
 else if (cyl_coord .and. p > 0) then
@@ -1849,6 +2184,12 @@ contains
 #ifdef MFC_MPI
         deallocate (buff_send, buff_recv)
+#ifdef MFC_SIMULATION
+        if (volume_filtering_momentum_eqn) then
+            @:DEALLOCATE(buff_send_scalarfield)
+            @:DEALLOCATE(buff_recv_scalarfield)
+        end if
+#endif
 #endif
 end subroutine s_finalize_mpi_common_module
diff --git a/src/post_process/m_data_input.f90 b/src/post_process/m_data_input.f90
index de1517e079..72c200a15c 100644
--- a/src/post_process/m_data_input.f90
+++ b/src/post_process/m_data_input.f90
@@ -64,6 +64,13 @@ end subroutine s_read_abstract_data_files
 ! 
type(scalar_field), public :: ib_markers !< type(integer_field), public :: ib_markers + type(scalar_field), public :: filtered_fluid_indicator_function + type(vector_field), allocatable, dimension(:), public :: stat_reynolds_stress + type(vector_field), allocatable, dimension(:), public :: stat_eff_visc + type(vector_field), allocatable, dimension(:), public :: stat_int_mom_exch + type(vector_field), allocatable, dimension(:), public :: stat_q_cons_filtered + type(scalar_field), allocatable, dimension(:), public :: stat_filtered_pressure + procedure(s_read_abstract_data_files), pointer :: s_read_data_files => null() contains @@ -218,6 +225,58 @@ impure subroutine s_allocate_field_arrays(local_start_idx, end_x, end_y, end_z) end subroutine s_allocate_field_arrays + !> Helper subroutine to allocate filtered variable arrays for given dimensionality + !! @param start_idx Starting index for allocation + !! @param end_x, end_y, end_z End indices for each dimension + impure subroutine s_allocate_filtered_arrays(local_start_idx, end_x, end_y, end_z) + + integer, intent(in) :: local_start_idx, end_x, end_y, end_z + integer :: i, j + + allocate (filtered_fluid_indicator_function%sf(local_start_idx:end_x, local_start_idx:end_y, local_start_idx:end_z)) + + do i = 1, sys_size - 1 + allocate (stat_q_cons_filtered(i)%vf(1:4)) + end do + do i = 1, sys_size - 1 + do j = 1, 4 + allocate (stat_q_cons_filtered(i)%vf(j)%sf(local_start_idx:end_x, local_start_idx:end_y, local_start_idx:end_z)) + end do + end do + + do i = 1, 4 + allocate (stat_filtered_pressure(i)%sf(local_start_idx:end_x, local_start_idx:end_y, local_start_idx:end_z)) + end do + + do i = 1, 9 + allocate (stat_reynolds_stress(i)%vf(1:4)) + end do + do i = 1, 9 + do j = 1, 4 + allocate (stat_reynolds_stress(i)%vf(j)%sf(local_start_idx:end_x, local_start_idx:end_y, local_start_idx:end_z)) + end do + end do + + do i = 1, 9 + allocate (stat_eff_visc(i)%vf(1:4)) + end do + do i = 1, 9 + do j = 1, 4 + allocate (stat_eff_visc(i)%vf(j)%sf(local_start_idx:end_x, local_start_idx:end_y, local_start_idx:end_z)) + end do + end do + + do i = 1, 3 + allocate (stat_int_mom_exch(i)%vf(1:4)) + end do + do i = 1, 3 + do j = 1, 4 + allocate (stat_int_mom_exch(i)%vf(j)%sf(local_start_idx:end_x, local_start_idx:end_y, local_start_idx:end_z)) + end do + end do + + end subroutine s_allocate_filtered_arrays + !> This subroutine is called at each time-step that has to !! be post-processed in order to read the raw data files !! present in the corresponding time-step directory and to @@ -449,6 +508,10 @@ impure subroutine s_read_parallel_data_files(t_step) call s_read_parallel_conservative_data(t_step, m_MOK, n_MOK, p_MOK, WP_MOK, MOK, str_MOK, NVARS_MOK) + if (q_filtered_wrt .and. (t_step == 0 .or. t_step == t_step_stop)) then + call s_read_parallel_filtered_data(t_step, m_MOK, n_MOK, p_MOK, WP_MOK, MOK, str_MOK, NVARS_MOK) + end if + deallocate (x_cb_glb, y_cb_glb, z_cb_glb) if (bc_io) then @@ -581,6 +644,81 @@ impure subroutine s_read_parallel_conservative_data(t_step, m_MOK, n_MOK, p_MOK, end subroutine s_read_parallel_conservative_data #endif +#ifdef MFC_MPI + !> Helper subroutine to read parallel volume filtered data + !! @param t_step Current time-step + !! @param m_MOK, n_MOK, p_MOK MPI offset kinds for dimensions + !! 
@param WP_MOK, MOK, str_MOK, NVARS_MOK Other MPI offset kinds + impure subroutine s_read_parallel_filtered_data(t_step, m_MOK, n_MOK, p_MOK, WP_MOK, MOK, str_MOK, NVARS_MOK) + + integer, intent(in) :: t_step + integer(KIND=MPI_OFFSET_KIND), intent(inout) :: m_MOK, n_MOK, p_MOK + integer(KIND=MPI_OFFSET_KIND), intent(inout) :: WP_MOK, MOK, str_MOK, NVARS_MOK + + integer :: ifile, ierr, data_size + integer, dimension(MPI_STATUS_SIZE) :: status + integer(KIND=MPI_OFFSET_KIND) :: disp, var_MOK + character(LEN=path_len + 2*name_len) :: file_loc + logical :: file_exist + character(len=10) :: t_step_string + integer :: alt_sys + integer :: i + + ! Open the file to read volume filtered variables + write (file_loc, '(I0,A)') t_step, '.dat' + file_loc = trim(case_dir)//'/restart_data'//trim(mpiiofs)//trim(file_loc) + inquire (FILE=trim(file_loc), EXIST=file_exist) + + if (file_exist) then + call MPI_FILE_OPEN(MPI_COMM_WORLD, file_loc, MPI_MODE_RDONLY, mpi_info_int, ifile, ierr) + + ! Initialize MPI data I/O + + call s_initialize_mpi_data_filtered(filtered_fluid_indicator_function, & + stat_q_cons_filtered, stat_filtered_pressure, & + stat_reynolds_stress, stat_eff_visc, stat_int_mom_exch) + + ! Size of local arrays + data_size = (m + 1)*(n + 1)*(p + 1) + + ! Total size of system + alt_sys = sys_size + 1 + 9*4 + 9*4 + 3*4 + 6*4 ! 109, filtered indicator, stats of: R_u, R_mu, F_imet, q_cons_filtered + + ! Resize some integers so MPI can read even the biggest file + m_MOK = int(m_glb + 1, MPI_OFFSET_KIND) + n_MOK = int(n_glb + 1, MPI_OFFSET_KIND) + p_MOK = int(p_glb + 1, MPI_OFFSET_KIND) + WP_MOK = int(8._wp, MPI_OFFSET_KIND) + MOK = int(1._wp, MPI_OFFSET_KIND) + str_MOK = int(name_len, MPI_OFFSET_KIND) + NVARS_MOK = int(alt_sys, MPI_OFFSET_KIND) + + call s_setup_mpi_io_params(data_size, m_MOK, n_MOK, p_MOK, WP_MOK, MOK, str_MOK, NVARS_MOK) + + ! Read the data for each variable + do i = sys_size, alt_sys + var_MOK = int(i, MPI_OFFSET_KIND) + + ! Initial displacement to skip at beginning of file + disp = m_MOK*max(MOK, n_MOK)*max(MOK, p_MOK)*WP_MOK*(var_MOK - 1) + + call MPI_FILE_SET_VIEW(ifile, disp, mpi_p, MPI_IO_DATA%view(i), & + 'native', mpi_info_int, ierr) + call MPI_FILE_READ_ALL(ifile, MPI_IO_DATA%var(i)%sf, data_size, & + mpi_p, status, ierr) + end do + + call s_mpi_barrier() + call MPI_FILE_CLOSE(ifile, ierr) + + call s_read_ib_data_files(trim(case_dir)//'/restart_data'//trim(mpiiofs)) + else + call s_mpi_abort('File '//trim(file_loc)//' is missing. Exiting.') + end if + + end subroutine s_read_parallel_filtered_data +#endif + !> Computation of parameters, allocation procedures, and/or !! any other tasks needed to properly setup the module impure subroutine s_initialize_data_input_module @@ -594,10 +732,20 @@ impure subroutine s_initialize_data_input_module allocate (q_prim_vf(1:sys_size)) allocate (q_cons_temp(1:sys_size)) + if (q_filtered_wrt) then + allocate (stat_q_cons_filtered(1:sys_size - 1)) + allocate (stat_filtered_pressure(1:4)) + allocate (stat_reynolds_stress(1:9)) + allocate (stat_eff_visc(1:9)) + allocate (stat_int_mom_exch(1:3)) + end if + ! Allocating the parts of the conservative and primitive variables ! that do require the direct knowledge of the dimensionality of ! the simulation using helper subroutine + if (q_filtered_wrt) call s_allocate_filtered_arrays(-buff_size, m + buff_size, n + buff_size, p + buff_size) + ! Simulation is at least 2D if (n > 0) then ! 
Simulation is 3D @@ -642,7 +790,7 @@ end subroutine s_initialize_data_input_module !> Deallocation procedures for the module impure subroutine s_finalize_data_input_module - integer :: i !< Generic loop iterator + integer :: i, j !< Generic loop iterator ! Deallocating the conservative and primitive variables do i = 1, sys_size @@ -677,6 +825,47 @@ impure subroutine s_finalize_data_input_module s_read_data_files => null() + if (q_filtered_wrt) then + deallocate (filtered_fluid_indicator_function%sf) + + do i = 1, sys_size - 1 + do j = 1, 4 + deallocate (stat_q_cons_filtered(i)%vf(j)%sf) + end do + deallocate (stat_q_cons_filtered(i)%vf) + end do + deallocate (stat_q_cons_filtered) + + do i = 1, 4 + deallocate (stat_filtered_pressure(i)%sf) + end do + deallocate (stat_filtered_pressure) + + do i = 1, 9 + do j = 1, 4 + deallocate (stat_reynolds_stress(i)%vf(j)%sf) + end do + deallocate (stat_reynolds_stress(i)%vf) + end do + deallocate (stat_reynolds_stress) + + do i = 1, 9 + do j = 1, 4 + deallocate (stat_eff_visc(i)%vf(j)%sf) + end do + deallocate (stat_eff_visc(i)%vf) + end do + deallocate (stat_eff_visc) + + do i = 1, 3 + do j = 1, 4 + deallocate (stat_int_mom_exch(i)%vf(j)%sf) + end do + deallocate (stat_int_mom_exch(i)%vf) + end do + deallocate (stat_int_mom_exch) + end if + end subroutine s_finalize_data_input_module end module m_data_input diff --git a/src/post_process/m_global_parameters.fpp b/src/post_process/m_global_parameters.fpp index 2274e1152f..930123a6d9 100644 --- a/src/post_process/m_global_parameters.fpp +++ b/src/post_process/m_global_parameters.fpp @@ -348,6 +348,10 @@ module m_global_parameters real(wp) :: Bx0 !< Constant magnetic field in the x-direction (1D) real(wp) :: wall_time, wall_time_avg !< Wall time measurements + logical :: periodic_ibs !< Periodic immersed boundaries + logical :: store_levelset !< Store immersed boundary levelset info + logical :: slab_domain_decomposition !< MPI domain decomposition into slabs + logical :: q_filtered_wrt !< write FFT filtered quantities contains @@ -517,6 +521,11 @@ contains ! MHD Bx0 = dflt_real + periodic_ibs = .false. + store_levelset = .true. + slab_domain_decomposition = .false. + q_filtered_wrt = .false. 
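+
+        ! Usage sketch (illustrative, not exhaustive): these logicals arrive via
+        ! post_process.inp, typically generated from the python case dictionary, e.g.
+        !     "q_filtered_wrt"            : "T",  # read/visualize filtered statistics
+        !     "store_levelset"            : "F",  # skip levelset I/O, recompute on the fly
+        !     "slab_domain_decomposition" : "T",  # decompose the MPI domain into z-slabs
+        !     "periodic_ibs"              : "T",  # immersed boundaries wrap periodically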
+ end subroutine s_assign_default_values_to_user_inputs !> Computation of parameters, allocation procedures, and/or @@ -832,16 +841,25 @@ contains chemxe = species_idx%end #ifdef MFC_MPI - allocate (MPI_IO_DATA%view(1:sys_size)) - allocate (MPI_IO_DATA%var(1:sys_size)) - do i = 1, sys_size - if (down_sample) then - allocate (MPI_IO_DATA%var(i)%sf(-1:m + 1, -1:n + 1, -1:p + 1)) - else + if (q_filtered_wrt) then + allocate (MPI_IO_DATA%view(1:sys_size + 1 + 4*9 + 4*9 + 3*4 + 6*4)) + allocate (MPI_IO_DATA%var(1:sys_size + 1 + 4*9 + 4*9 + 3*4 + 6*4)) + do i = 1, sys_size + 1 + 4*9 + 4*9 + 3*4 + 6*4 allocate (MPI_IO_DATA%var(i)%sf(0:m, 0:n, 0:p)) - end if - MPI_IO_DATA%var(i)%sf => null() - end do + MPI_IO_DATA%var(i)%sf => null() + end do + else + allocate (MPI_IO_DATA%view(1:sys_size)) + allocate (MPI_IO_DATA%var(1:sys_size)) + do i = 1, sys_size + if (down_sample) then + allocate (MPI_IO_DATA%var(i)%sf(-1:m + 1, -1:n + 1, -1:p + 1)) + else + allocate (MPI_IO_DATA%var(i)%sf(0:m, 0:n, 0:p)) + end if + MPI_IO_DATA%var(i)%sf => null() + end do + end if if (ib) allocate (MPI_IO_IB_DATA%var%sf(0:m, 0:n, 0:p)) #endif @@ -1016,6 +1034,12 @@ contains MPI_IO_DATA%var(i)%sf => null() end do + if (q_filtered_wrt) then + do i = sys_size + 1, sys_size + 1 + 4*9 + 4*9 + 3*4 + 6*4 + MPI_IO_DATA%var(i)%sf => null() + end do + end if + deallocate (MPI_IO_DATA%var) deallocate (MPI_IO_DATA%view) end if diff --git a/src/post_process/m_mpi_proxy.fpp b/src/post_process/m_mpi_proxy.fpp index 8d72568d3e..7d8f15ee5e 100644 --- a/src/post_process/m_mpi_proxy.fpp +++ b/src/post_process/m_mpi_proxy.fpp @@ -105,7 +105,8 @@ contains & 'adv_n', 'ib', 'cfl_adap_dt', 'cfl_const_dt', 'cfl_dt', & & 'surface_tension', 'hyperelasticity', 'bubbles_lagrange', & & 'output_partial_domain', 'relativity', 'cont_damage', 'bc_io', & - & 'down_sample','fft_wrt' ] + & 'down_sample','fft_wrt', 'periodic_ibs', 'store_levelset', & + & 'slab_domain_decomposition', 'q_filtered_wrt' ] call MPI_BCAST(${VAR}$, 1, MPI_LOGICAL, 0, MPI_COMM_WORLD, ierr) #:endfor diff --git a/src/post_process/m_start_up.fpp b/src/post_process/m_start_up.fpp index 151f25ccee..94892f1ced 100644 --- a/src/post_process/m_start_up.fpp +++ b/src/post_process/m_start_up.fpp @@ -118,7 +118,9 @@ contains lag_id_wrt, lag_pos_wrt, lag_pos_prev_wrt, lag_vel_wrt, & lag_rad_wrt, lag_rvel_wrt, lag_r0_wrt, lag_rmax_wrt, & lag_rmin_wrt, lag_dphidt_wrt, lag_pres_wrt, lag_mv_wrt, & - lag_mg_wrt, lag_betaT_wrt, lag_betaC_wrt + lag_mg_wrt, lag_betaT_wrt, lag_betaC_wrt, & + periodic_ibs, store_levelset, slab_domain_decomposition, & + q_filtered_wrt ! Inquiring the status of the post_process.inp file file_loc = 'post_process.inp' @@ -374,6 +376,61 @@ contains end if end do + ! Adding filtered quantities + if (q_filtered_wrt .and. (t_step == 0 .or. t_step == t_step_stop)) then + ! filtered fluid indicator + q_sf = filtered_fluid_indicator_function%sf(x_beg:x_end, y_beg:y_end, z_beg:z_end) + write (varname, '(A)') 'filtered_fluid_indicator_function' + call s_write_variable_to_formatted_database_file(varname, t_step) + + varname(:) = ' ' + + ! 
filtered vars stats + do i = 1, 9 + do j = 1, 4 + q_sf = stat_reynolds_stress(i)%vf(j)%sf(x_beg:x_end, y_beg:y_end, z_beg:z_end) + write (varname, '(A,I0,A,I0)') 'stat_reynolds_stress', i, '_m', j + call s_write_variable_to_formatted_database_file(varname, t_step) + + varname(:) = ' ' + end do + end do + do i = 1, 9 + do j = 1, 4 + q_sf = stat_eff_visc(i)%vf(j)%sf(x_beg:x_end, y_beg:y_end, z_beg:z_end) + write (varname, '(A,I0,A,I0)') 'stat_eff_visc', i, '_m', j + call s_write_variable_to_formatted_database_file(varname, t_step) + + varname(:) = ' ' + end do + end do + do i = 1, 3 + do j = 1, 4 + q_sf = stat_int_mom_exch(i)%vf(j)%sf(x_beg:x_end, y_beg:y_end, z_beg:z_end) + write (varname, '(A,I0,A,I0)') 'stat_int_mom_exch', i, '_m', j + call s_write_variable_to_formatted_database_file(varname, t_step) + + varname(:) = ' ' + end do + end do + do i = 1, sys_size - 1 + do j = 1, 4 + q_sf = stat_q_cons_filtered(i)%vf(j)%sf(x_beg:x_end, y_beg:y_end, z_beg:z_end) + write (varname, '(A,I0,A,I0)') 'stat_q_cons_filtered', i, '_m', j + call s_write_variable_to_formatted_database_file(varname, t_step) + + varname(:) = ' ' + end do + end do + do i = 1, 4 + q_sf = stat_filtered_pressure(i)%sf(x_beg:x_end, y_beg:y_end, z_beg:z_end) + write (varname, '(A,I0)') 'stat_filtered_pressure_m', i + call s_write_variable_to_formatted_database_file(varname, t_step) + + varname(:) = ' ' + end do + end if + ! Adding the species' concentrations to the formatted database file if (chemistry) then do i = 1, num_species diff --git a/src/pre_process/m_data_output.fpp b/src/pre_process/m_data_output.fpp index fb26a14184..f856d85b21 100644 --- a/src/pre_process/m_data_output.fpp +++ b/src/pre_process/m_data_output.fpp @@ -825,45 +825,47 @@ contains call MPI_FILE_CLOSE(ifile, ierr) - ! Levelset - write (file_loc, '(A)') 'levelset.dat' - file_loc = trim(restart_dir)//trim(mpiiofs)//trim(file_loc) - inquire (FILE=trim(file_loc), EXIST=file_exist) - if (file_exist .and. proc_rank == 0) then - call MPI_FILE_DELETE(file_loc, mpi_info_int, ierr) - end if - call MPI_FILE_OPEN(MPI_COMM_WORLD, file_loc, ior(MPI_MODE_WRONLY, MPI_MODE_CREATE), & - mpi_info_int, ifile, ierr) + if (store_levelset) then + ! Levelset + write (file_loc, '(A)') 'levelset.dat' + file_loc = trim(restart_dir)//trim(mpiiofs)//trim(file_loc) + inquire (FILE=trim(file_loc), EXIST=file_exist) + if (file_exist .and. proc_rank == 0) then + call MPI_FILE_DELETE(file_loc, mpi_info_int, ierr) + end if + call MPI_FILE_OPEN(MPI_COMM_WORLD, file_loc, ior(MPI_MODE_WRONLY, MPI_MODE_CREATE), & + mpi_info_int, ifile, ierr) - ! Initial displacement to skip at beginning of file - disp = 0 + ! Initial displacement to skip at beginning of file + disp = 0 - call MPI_FILE_SET_VIEW(ifile, disp, mpi_p, MPI_IO_levelset_DATA%view, & - 'native', mpi_info_int, ierr) - call MPI_FILE_WRITE_ALL(ifile, MPI_IO_levelset_DATA%var%sf, data_size*num_ibs, & - mpi_p, status, ierr) + call MPI_FILE_SET_VIEW(ifile, disp, mpi_p, MPI_IO_levelset_DATA%view, & + 'native', mpi_info_int, ierr) + call MPI_FILE_WRITE_ALL(ifile, MPI_IO_levelset_DATA%var%sf, data_size*num_ibs, & + mpi_p, status, ierr) - call MPI_FILE_CLOSE(ifile, ierr) + call MPI_FILE_CLOSE(ifile, ierr) - ! Levelset Norm - write (file_loc, '(A)') 'levelset_norm.dat' - file_loc = trim(restart_dir)//trim(mpiiofs)//trim(file_loc) - inquire (FILE=trim(file_loc), EXIST=file_exist) - if (file_exist .and. 
proc_rank == 0) then - call MPI_FILE_DELETE(file_loc, mpi_info_int, ierr) - end if - call MPI_FILE_OPEN(MPI_COMM_WORLD, file_loc, ior(MPI_MODE_WRONLY, MPI_MODE_CREATE), & - mpi_info_int, ifile, ierr) + ! Levelset Norm + write (file_loc, '(A)') 'levelset_norm.dat' + file_loc = trim(restart_dir)//trim(mpiiofs)//trim(file_loc) + inquire (FILE=trim(file_loc), EXIST=file_exist) + if (file_exist .and. proc_rank == 0) then + call MPI_FILE_DELETE(file_loc, mpi_info_int, ierr) + end if + call MPI_FILE_OPEN(MPI_COMM_WORLD, file_loc, ior(MPI_MODE_WRONLY, MPI_MODE_CREATE), & + mpi_info_int, ifile, ierr) - ! Initial displacement to skip at beginning of file - disp = 0 + ! Initial displacement to skip at beginning of file + disp = 0 - call MPI_FILE_SET_VIEW(ifile, disp, mpi_p, MPI_IO_levelsetnorm_DATA%view, & - 'native', mpi_info_int, ierr) - call MPI_FILE_WRITE_ALL(ifile, MPI_IO_levelsetnorm_DATA%var%sf, data_size*num_ibs*3, & - mpi_p, status, ierr) + call MPI_FILE_SET_VIEW(ifile, disp, mpi_p, MPI_IO_levelsetnorm_DATA%view, & + 'native', mpi_info_int, ierr) + call MPI_FILE_WRITE_ALL(ifile, MPI_IO_levelsetnorm_DATA%var%sf, data_size*num_ibs*3, & + mpi_p, status, ierr) - call MPI_FILE_CLOSE(ifile, ierr) + call MPI_FILE_CLOSE(ifile, ierr) + end if end if if (ib) then diff --git a/src/pre_process/m_global_parameters.fpp b/src/pre_process/m_global_parameters.fpp index 00d979fd7b..972c57cfc5 100644 --- a/src/pre_process/m_global_parameters.fpp +++ b/src/pre_process/m_global_parameters.fpp @@ -289,6 +289,9 @@ module m_global_parameters !! to the next time-step. logical :: fft_wrt + logical :: periodic_ibs + logical :: store_levelset + logical :: slab_domain_decomposition contains @@ -576,6 +579,10 @@ contains Bx0 = dflt_real + periodic_ibs = .false. + store_levelset = .true. + slab_domain_decomposition = .false. + end subroutine s_assign_default_values_to_user_inputs !> Computation of parameters, allocation procedures, and/or diff --git a/src/pre_process/m_mpi_proxy.fpp b/src/pre_process/m_mpi_proxy.fpp index b45526164c..a67b5b44e4 100644 --- a/src/pre_process/m_mpi_proxy.fpp +++ b/src/pre_process/m_mpi_proxy.fpp @@ -59,7 +59,8 @@ contains & 'cfl_const_dt', 'cfl_dt', 'surface_tension', & & 'hyperelasticity', 'pre_stress', 'elliptic_smoothing', 'viscous',& & 'bubbles_lagrange', 'bc_io', 'mhd', 'relativity', 'cont_damage', & - & 'igr', 'down_sample','fft_wrt' ] + & 'igr', 'down_sample','fft_wrt', & + & 'periodic_ibs', 'store_levelset', 'slab_domain_decomposition' ] call MPI_BCAST(${VAR}$, 1, MPI_LOGICAL, 0, MPI_COMM_WORLD, ierr) #:endfor call MPI_BCAST(fluid_rho(1), num_fluids_max, MPI_LOGICAL, 0, MPI_COMM_WORLD, ierr) diff --git a/src/pre_process/m_start_up.fpp b/src/pre_process/m_start_up.fpp index 2129ae74ec..555d43acd6 100644 --- a/src/pre_process/m_start_up.fpp +++ b/src/pre_process/m_start_up.fpp @@ -152,7 +152,8 @@ contains elliptic_smoothing, elliptic_smoothing_iters, & viscous, bubbles_lagrange, bc_x, bc_y, bc_z, num_bc_patches, & patch_bc, Bx0, relativity, cont_damage, igr, igr_order, & - down_sample, recon_type, muscl_order, fft_wrt + down_sample, recon_type, muscl_order, fft_wrt, & + periodic_ibs, store_levelset, slab_domain_decomposition ! 
Inquiring the status of the pre_process.inp file file_loc = 'pre_process.inp' diff --git a/src/simulation/m_additional_forcing.fpp b/src/simulation/m_additional_forcing.fpp new file mode 100644 index 0000000000..a6b391cc67 --- /dev/null +++ b/src/simulation/m_additional_forcing.fpp @@ -0,0 +1,181 @@ +#:include 'macros.fpp' + +module m_additional_forcing + use m_derived_types + + use m_global_parameters + + use m_ibm + + use m_mpi_proxy + + use m_volume_filtering + + implicit none + + private; public :: s_initialize_additional_forcing_module, & + s_finalize_additional_forcing_module, s_compute_periodic_forcing + + type(scalar_field), allocatable, dimension(:) :: q_periodic_force + real(wp) :: volfrac_phi + integer :: N_x_total_glb + real(wp) :: avg_coeff + real(wp) :: spatial_rho, spatial_u, spatial_eps + real(wp), allocatable, dimension(:) :: rho_window, u_window, eps_window + real(wp) :: sum_rho, sum_u, sum_eps + real(wp) :: phase_rho, phase_u, phase_eps + real(wp) :: tau_dt + integer :: forcing_window, window_loc, window_fill + + $:GPU_DECLARE(create='[q_periodic_force, volfrac_phi, N_x_total_glb, avg_coeff, tau_dt]') + $:GPU_DECLARE(create='[spatial_rho, spatial_u, spatial_eps, phase_rho, phase_u, phase_eps]') + +contains + + subroutine s_initialize_additional_forcing_module + integer :: i + + @:ALLOCATE(q_periodic_force(1:3)) + do i = 1, 3 + @:ALLOCATE(q_periodic_force(i)%sf(0:m, 0:n, 0:p)) + @:ACC_SETUP_SFs(q_periodic_force(i)) + end do + + volfrac_phi = num_ibs*4._wp/3._wp*pi*patch_ib(1)%radius**3/((x_domain%end - x_domain%beg)*(y_domain%end - y_domain%beg)*(z_domain%end - z_domain%beg)) + $:GPU_UPDATE(device='[volfrac_phi]') + + N_x_total_glb = (m_glb + 1)*(n_glb + 1)*(p_glb + 1) + $:GPU_UPDATE(device='[N_x_total_glb]') + + avg_coeff = 1._wp/(real(N_x_total_glb, wp)*(1._wp - volfrac_phi)) + $:GPU_UPDATE(device='[avg_coeff]') + + forcing_window = 1 ! forcing time window size + window_loc = 0 + window_fill = 0 + + @:ALLOCATE(rho_window(forcing_window)) + @:ALLOCATE(u_window(forcing_window)) + @:ALLOCATE(eps_window(forcing_window)) + + rho_window = 0.0_wp + u_window = 0.0_wp + eps_window = 0.0_wp + + sum_rho = 0.0_wp + sum_u = 0.0_wp + sum_eps = 0.0_wp + + phase_rho = 0._wp + phase_u = 0._wp + phase_eps = 0._wp + + tau_dt = 1._wp/(0.5_wp*dt) + $:GPU_UPDATE(device='[tau_dt]') + + end subroutine s_initialize_additional_forcing_module + + !< compute the space and time average of quantities, compute the periodic forcing terms described in Khalloufi and Capecelatro + subroutine s_compute_periodic_forcing(rhs_vf, q_cons_vf, q_prim_vf, t_step) + type(scalar_field), dimension(sys_size), intent(in) :: rhs_vf + type(scalar_field), dimension(sys_size), intent(in) :: q_cons_vf + type(scalar_field), dimension(sys_size), intent(in) :: q_prim_vf + integer, intent(in) :: t_step + real(wp) :: spatial_rho_glb, spatial_u_glb, spatial_eps_glb + integer :: i, j, k + + ! zero spatial averages + spatial_rho = 0._wp + spatial_u = 0._wp + spatial_eps = 0._wp + $:GPU_UPDATE(device='[spatial_rho, spatial_u, spatial_eps]') + + ! compute spatial averages + $:GPU_PARALLEL_LOOP(collapse=3, reduction='[[spatial_rho, spatial_u, spatial_eps]]', reductionOp='[+]') + do i = 0, m + do j = 0, n + do k = 0, p + spatial_rho = spatial_rho + (q_cons_vf(1)%sf(i, j, k)*fluid_indicator_function%sf(i, j, k)) ! rho + spatial_u = spatial_u + (q_cons_vf(2)%sf(i, j, k)*fluid_indicator_function%sf(i, j, k)) ! 
rho*u + spatial_eps = spatial_eps + ((q_cons_vf(5)%sf(i, j, k) - 0.5_wp*( & + q_cons_vf(2)%sf(i, j, k)**2 + & + q_cons_vf(3)%sf(i, j, k)**2 + & + q_cons_vf(4)%sf(i, j, k)**2)/q_cons_vf(1)%sf(i, j, k))*fluid_indicator_function%sf(i, j, k)) ! rho*e + end do + end do + end do + + $:GPU_UPDATE(host='[spatial_rho, spatial_u, spatial_eps]') + + ! reduction sum across entire domain + call s_mpi_allreduce_sum(spatial_rho, spatial_rho_glb) + call s_mpi_allreduce_sum(spatial_u, spatial_u_glb) + call s_mpi_allreduce_sum(spatial_eps, spatial_eps_glb) + + spatial_rho_glb = spatial_rho_glb*avg_coeff + spatial_u_glb = spatial_u_glb*avg_coeff + spatial_eps_glb = spatial_eps_glb*avg_coeff + + ! update time average window location + window_loc = 1 + mod(t_step - 1, forcing_window) + + ! update time average sum + sum_rho = sum_rho - rho_window(window_loc) + spatial_rho_glb + sum_u = sum_u - u_window(window_loc) + spatial_u_glb + sum_eps = sum_eps - eps_window(window_loc) + spatial_eps_glb + + ! update window arrays + rho_window(window_loc) = spatial_rho_glb + u_window(window_loc) = spatial_u_glb + eps_window(window_loc) = spatial_eps_glb + + ! update number of time samples + if (window_fill < forcing_window) window_fill = window_fill + 1 + + ! compute phase averages + phase_rho = sum_rho/real(window_fill, wp) + phase_u = sum_u/real(window_fill, wp) + phase_eps = sum_eps/real(window_fill, wp) + $:GPU_UPDATE(device='[phase_rho, phase_u, phase_eps]') + + ! compute periodic forcing terms for mass, momentum, energy + $:GPU_PARALLEL_LOOP(collapse=3) + do i = 0, m + do j = 0, n + do k = 0, p + ! f_rho + q_periodic_force(1)%sf(i, j, k) = (rho_inf_ref - phase_rho)*tau_dt + + ! f_u + q_periodic_force(2)%sf(i, j, k) = (rho_inf_ref*u_inf_ref - phase_u)*tau_dt + + ! f_E + q_periodic_force(3)%sf(i, j, k) = (P_inf_ref*gammas(1) - phase_eps)*tau_dt & + + q_cons_vf(2)%sf(i, j, k)*q_periodic_force(2)%sf(i, j, k)/q_cons_vf(1)%sf(i, j, k) + end do + end do + end do + + ! add the forcing terms to the RHS + $:GPU_PARALLEL_LOOP(collapse=3) + do i = 0, m + do j = 0, n + do k = 0, p + rhs_vf(1)%sf(i, j, k) = rhs_vf(1)%sf(i, j, k) + q_periodic_force(1)%sf(i, j, k)*fluid_indicator_function%sf(i, j, k) ! continuity + rhs_vf(2)%sf(i, j, k) = rhs_vf(2)%sf(i, j, k) + q_periodic_force(2)%sf(i, j, k)*fluid_indicator_function%sf(i, j, k) ! x momentum + rhs_vf(5)%sf(i, j, k) = rhs_vf(5)%sf(i, j, k) + q_periodic_force(3)%sf(i, j, k)*fluid_indicator_function%sf(i, j, k) ! energy + end do + end do + end do + + end subroutine s_compute_periodic_forcing + + subroutine s_finalize_additional_forcing_module + integer :: i + do i = 1, 3 + @:DEALLOCATE(q_periodic_force(i)%sf) + end do + @:DEALLOCATE(q_periodic_force) + end subroutine s_finalize_additional_forcing_module + +end module m_additional_forcing diff --git a/src/simulation/m_checker.fpp b/src/simulation/m_checker.fpp index da82e0f37b..c0ca8e851a 100644 --- a/src/simulation/m_checker.fpp +++ b/src/simulation/m_checker.fpp @@ -401,6 +401,14 @@ contains impure subroutine s_check_inputs_misc @:PROHIBIT(probe_wrt .and. fd_order == dflt_int, "fd_order must be specified for probe_wrt") @:PROHIBIT(integral_wrt .and. (.not. bubbles_euler)) + #:for X in ['x', 'y', 'z'] + #:for BOUND in ['beg', 'end'] + @:PROHIBIT(periodic_forcing .and. bc_${X}$%${BOUND}$ /= BC_PERIODIC, & + "Periodic forcing requires all BCs to be periodic") + @:PROHIBIT(volume_filtering_momentum_eqn .and. 
bc_${X}$%${BOUND}$ /= BC_PERIODIC, &
+                "Explicit filtering of flow data requires all BCs to be periodic due to the Fourier transform")
+        #:endfor
+    #:endfor
 end subroutine s_check_inputs_misc
 impure subroutine s_check_inputs_mhd
diff --git a/src/simulation/m_compute_statistics.fpp b/src/simulation/m_compute_statistics.fpp
new file mode 100644
index 0000000000..a8ef036fa4
--- /dev/null
+++ b/src/simulation/m_compute_statistics.fpp
@@ -0,0 +1,316 @@
+#:include 'macros.fpp'
+
+module m_compute_statistics
+    use m_derived_types
+
+    use m_global_parameters
+
+    use m_mpi_proxy
+
+    use m_additional_forcing
+
+    use m_nvtx
+
+    implicit none
+
+    private; public :: s_initialize_statistics_module, s_finalize_statistics_module, &
+ s_compute_statistics_momentum_unclosed_terms, s_update_statistics, &
+ s_compute_statistical_moments
+
+    ! running accumulators for computing 1st, 2nd, 3rd, and 4th order statistical moments
+    type(vector_field), allocatable, dimension(:) :: Msn_reynolds_stress
+    type(vector_field), allocatable, dimension(:) :: Msn_eff_visc
+    type(vector_field), allocatable, dimension(:) :: Msn_int_mom_exch
+    type(vector_field), allocatable, dimension(:) :: Msn_q_cons_filtered
+    type(scalar_field), allocatable, dimension(:) :: Msn_filtered_pressure
+
+    ! 1st, 2nd, 3rd, and 4th statistical moments of the unclosed terms in the volume-filtered momentum equation
+    type(vector_field), allocatable, dimension(:), public :: stat_reynolds_stress
+    type(vector_field), allocatable, dimension(:), public :: stat_eff_visc
+    type(vector_field), allocatable, dimension(:), public :: stat_int_mom_exch
+    type(vector_field), allocatable, dimension(:), public :: stat_q_cons_filtered
+    type(scalar_field), allocatable, dimension(:), public :: stat_filtered_pressure
+
+    $:GPU_DECLARE(create='[Msn_reynolds_stress, Msn_eff_visc, Msn_int_mom_exch, Msn_q_cons_filtered, Msn_filtered_pressure]')
+
+    $:GPU_DECLARE(create='[stat_reynolds_stress, stat_eff_visc, stat_int_mom_exch, stat_q_cons_filtered, stat_filtered_pressure]')
+
+contains
+
+    subroutine s_initialize_statistics_module
+        integer :: i, j
+
+        @:ALLOCATE(Msn_reynolds_stress(1:9))
+        do i = 1, 9
+            @:ALLOCATE(Msn_reynolds_stress(i)%vf(1:4))
+        end do
+        do i = 1, 9
+            do j = 1, 4
+                @:ALLOCATE(Msn_reynolds_stress(i)%vf(j)%sf(0:m, 0:n, 0:p))
+            end do
+            @:ACC_SETUP_VFs(Msn_reynolds_stress(i))
+        end do
+
+        @:ALLOCATE(Msn_eff_visc(1:9))
+        do i = 1, 9
+            @:ALLOCATE(Msn_eff_visc(i)%vf(1:4))
+        end do
+        do i = 1, 9
+            do j = 1, 4
+                @:ALLOCATE(Msn_eff_visc(i)%vf(j)%sf(0:m, 0:n, 0:p))
+            end do
+            @:ACC_SETUP_VFs(Msn_eff_visc(i))
+        end do
+
+        @:ALLOCATE(Msn_int_mom_exch(1:3))
+        do i = 1, 3
+            @:ALLOCATE(Msn_int_mom_exch(i)%vf(1:4))
+        end do
+        do i = 1, 3
+            do j = 1, 4
+                @:ALLOCATE(Msn_int_mom_exch(i)%vf(j)%sf(0:m, 0:n, 0:p))
+            end do
+            @:ACC_SETUP_VFs(Msn_int_mom_exch(i))
+        end do
+
+        @:ALLOCATE(Msn_q_cons_filtered(1:sys_size-1))
+        do i = 1, sys_size - 1
+            @:ALLOCATE(Msn_q_cons_filtered(i)%vf(1:4))
+        end do
+        do i = 1, sys_size - 1
+            do j = 1, 4
+                @:ALLOCATE(Msn_q_cons_filtered(i)%vf(j)%sf(0:m, 0:n, 0:p))
+            end do
+            @:ACC_SETUP_VFs(Msn_q_cons_filtered(i))
+        end do
+
+        @:ALLOCATE(Msn_filtered_pressure(1:4))
+        do i = 1, 4
+            @:ALLOCATE(Msn_filtered_pressure(i)%sf(0:m, 0:n, 0:p))
+            @:ACC_SETUP_SFs(Msn_filtered_pressure(i))
+        end do
+
+        @:ALLOCATE(stat_reynolds_stress(1:9))
+        do i = 1, 9
+            @:ALLOCATE(stat_reynolds_stress(i)%vf(1:4))
+        end do
+        do i = 1, 9
+            do j = 1, 4
+                @:ALLOCATE(stat_reynolds_stress(i)%vf(j)%sf(0:m, 0:n, 0:p))
+            end do
+            @:ACC_SETUP_VFs(stat_reynolds_stress(i))
+        end do
+
+        @:ALLOCATE(stat_eff_visc(1:9))
+        do i = 1, 9
+            @:ALLOCATE(stat_eff_visc(i)%vf(1:4))
+        end do
+        do i = 1, 9
+            do j = 1, 4
+                @:ALLOCATE(stat_eff_visc(i)%vf(j)%sf(0:m, 0:n, 0:p))
+            end do
+            @:ACC_SETUP_VFs(stat_eff_visc(i))
+        end do
+
+        @:ALLOCATE(stat_int_mom_exch(1:3))
+        do i = 1, 3
+            @:ALLOCATE(stat_int_mom_exch(i)%vf(1:4))
+        end do
+        do i = 1, 3
+            do j = 1, 4
+                @:ALLOCATE(stat_int_mom_exch(i)%vf(j)%sf(0:m, 0:n, 0:p))
+            end do
+            @:ACC_SETUP_VFs(stat_int_mom_exch(i))
+        end do
+
+        @:ALLOCATE(stat_q_cons_filtered(1:sys_size-1))
+        do i = 1, sys_size - 1
+            @:ALLOCATE(stat_q_cons_filtered(i)%vf(1:4))
+        end do
+        do i = 1, sys_size - 1
+            do j = 1, 4
+                @:ALLOCATE(stat_q_cons_filtered(i)%vf(j)%sf(0:m, 0:n, 0:p))
+            end do
+            @:ACC_SETUP_VFs(stat_q_cons_filtered(i))
+        end do
+
+        @:ALLOCATE(stat_filtered_pressure(1:4))
+        do i = 1, 4
+            @:ALLOCATE(stat_filtered_pressure(i)%sf(0:m, 0:n, 0:p))
+            @:ACC_SETUP_SFs(stat_filtered_pressure(i))
+        end do
+
+    end subroutine s_initialize_statistics_module
+
+    subroutine s_compute_statistics_momentum_unclosed_terms(t_step, t_step_stat_start, reynolds_stress, eff_visc, int_mom_exch, q_cons_filtered, filtered_pressure)
+        type(vector_field), dimension(1:3), intent(in) :: reynolds_stress
+        type(vector_field), dimension(1:3), intent(in) :: eff_visc
+        type(scalar_field), dimension(1:3), intent(in) :: int_mom_exch
+        type(scalar_field), dimension(sys_size - 1), intent(in) :: q_cons_filtered
+        type(scalar_field), intent(in) :: filtered_pressure
+        integer, intent(in) :: t_step
+        integer, intent(in) :: t_step_stat_start
+
+        real(wp) :: ns
+        integer :: i, j
+
+        ! number of samples accumulated so far (assumes t_step > t_step_stat_start)
+        ns = real(t_step - t_step_stat_start, wp)
+
+        ! update M1, M2, M3, M4
+        do i = 1, 3
+            do j = 1, 3
+                call s_update_statistics(ns, reynolds_stress(i)%vf(j), Msn_reynolds_stress((i - 1)*3 + j)%vf)
+                call s_update_statistics(ns, eff_visc(i)%vf(j), Msn_eff_visc((i - 1)*3 + j)%vf)
+            end do
+            call s_update_statistics(ns, int_mom_exch(i), Msn_int_mom_exch(i)%vf)
+        end do
+        do i = 1, sys_size - 1
+            call s_update_statistics(ns, q_cons_filtered(i), Msn_q_cons_filtered(i)%vf)
+        end do
+        call s_update_statistics(ns, filtered_pressure, Msn_filtered_pressure)
+
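+        ! The Msn(1:4) arrays hold one-pass (Welford/Pebay-type) central-moment
+        ! accumulators; with delta = x - M1 at sample n:
+        !     M1 <- M1 + delta/n
+        !     M4 <- M4 + delta_f*delta_n2*(n**2 - 3*n + 3) + 6*delta_n2*M2 - 4*delta_n*M3
+        !     M3 <- M3 + delta_f*delta_n*(n - 2) - 3*delta_n*M2
+        !     M2 <- M2 + delta_f
+        ! where delta_n = delta/n, delta_n2 = delta_n**2, and
+        ! delta_f = delta*delta_n*(n - 1), as implemented in s_update_statistics below.
+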
+        ! compute 1st, 2nd, 3rd, 4th order statistical moments
+        if (t_step == t_step_stop - 1) then ! only compute at final time
+            do i = 1, 3
+                do j = 1, 3
+                    call s_compute_statistical_moments(ns, Msn_reynolds_stress((i - 1)*3 + j)%vf, stat_reynolds_stress((i - 1)*3 + j)%vf)
+                    call s_compute_statistical_moments(ns, Msn_eff_visc((i - 1)*3 + j)%vf, stat_eff_visc((i - 1)*3 + j)%vf)
+                end do
+                call s_compute_statistical_moments(ns, Msn_int_mom_exch(i)%vf, stat_int_mom_exch(i)%vf)
+            end do
+            do i = 1, sys_size - 1
+                call s_compute_statistical_moments(ns, Msn_q_cons_filtered(i)%vf, stat_q_cons_filtered(i)%vf)
+            end do
+            call s_compute_statistical_moments(ns, Msn_filtered_pressure, stat_filtered_pressure)
+        end if
+
+    end subroutine s_compute_statistics_momentum_unclosed_terms
+
+    subroutine s_update_statistics(ns, q_temp, Msn)
+        type(scalar_field), intent(in) :: q_temp
+        type(scalar_field), dimension(1:4), intent(inout) :: Msn
+
+        real(wp), intent(in) :: ns
+        real(wp) :: delta, delta_n, delta_n2, delta_f
+        integer :: i, j, k
+
+        ! delta is measured from the pre-update mean; M4 and M3 are advanced
+        ! before M2 because their updates use the previous-step M2 (and M3)
+        $:GPU_PARALLEL_LOOP(collapse=3, copyin='[ns]', private='[delta, delta_n, delta_n2, delta_f]')
+        do i = 0, m
+            do j = 0, n
+                do k = 0, p
+                    delta = q_temp%sf(i, j, k) - Msn(1)%sf(i, j, k)
+                    delta_n = delta/ns
+                    delta_n2 = delta_n**2
+                    delta_f = delta*delta_n*(ns - 1._wp)
+
+                    Msn(1)%sf(i, j, k) = Msn(1)%sf(i, j, k) + delta_n
+                    Msn(4)%sf(i, j, k) = Msn(4)%sf(i, j, k) + delta_f*delta_n2*(ns**2 - 3._wp*ns + 3._wp) + 6._wp*delta_n2*Msn(2)%sf(i, j, k) - 4._wp*delta_n*Msn(3)%sf(i, j, k)
+                    Msn(3)%sf(i, j, k) = Msn(3)%sf(i, j, k) + delta_f*delta_n*(ns - 2._wp) - 3._wp*delta_n*Msn(2)%sf(i, j, k)
+                    Msn(2)%sf(i, j, k) = Msn(2)%sf(i, j, k) + delta_f
+                end do
+            end do
+        end do
+
+    end subroutine s_update_statistics
+
+    subroutine s_compute_statistical_moments(ns, Msn, q_stat)
+        type(scalar_field), dimension(1:4), intent(in) :: Msn
+        type(scalar_field), dimension(1:4), intent(inout) :: q_stat
+
+        real(wp), intent(in) :: ns
+        integer :: i, j, k
+
+        ! convert the accumulators to mean, unbiased variance, and bias-corrected
+        ! (adjusted) skewness and excess kurtosis; meaningful for ns >= 4
+        $:GPU_PARALLEL_LOOP(collapse=3, copyin='[ns]')
+        do i = 0, m
+            do j = 0, n
+                do k = 0, p
+                    q_stat(1)%sf(i, j, k) = Msn(1)%sf(i, j, k)
+                    q_stat(2)%sf(i, j, k) = Msn(2)%sf(i, j, k)/(ns - 1._wp)
+                    q_stat(3)%sf(i, j, k) = sqrt(ns - 1._wp)/(ns - 2._wp)*ns*Msn(3)%sf(i, j, k)/(Msn(2)%sf(i, j, k)**1.5_wp)
+                    q_stat(4)%sf(i, j, k) = (ns - 1._wp)/((ns - 2._wp)*(ns - 3._wp))*((ns + 1._wp)*(ns*Msn(4)%sf(i, j, k)/(Msn(2)%sf(i, j, k)**2) - 3._wp) + 6._wp)
+                end do
+            end do
+        end do
+
+    end subroutine s_compute_statistical_moments
+
+    subroutine s_finalize_statistics_module
+        integer :: i, j
+
+        do i = 1, 9
+            do j = 1, 4
+                @:DEALLOCATE(Msn_reynolds_stress(i)%vf(j)%sf)
+            end do
+            @:DEALLOCATE(Msn_reynolds_stress(i)%vf)
+        end do
+        @:DEALLOCATE(Msn_reynolds_stress)
+
+        do i = 1, 9
+            do j = 1, 4
+                @:DEALLOCATE(Msn_eff_visc(i)%vf(j)%sf)
+            end do
+            @:DEALLOCATE(Msn_eff_visc(i)%vf)
+        end do
+        @:DEALLOCATE(Msn_eff_visc)
+
+        do i = 1, 3
+            do j = 1, 4
+                @:DEALLOCATE(Msn_int_mom_exch(i)%vf(j)%sf)
+            end do
+            @:DEALLOCATE(Msn_int_mom_exch(i)%vf)
+        end do
+        @:DEALLOCATE(Msn_int_mom_exch)
+
+        do i = 1, sys_size - 1
+            do j = 1, 4
+                @:DEALLOCATE(Msn_q_cons_filtered(i)%vf(j)%sf)
+            end do
+            @:DEALLOCATE(Msn_q_cons_filtered(i)%vf)
+        end do
+        @:DEALLOCATE(Msn_q_cons_filtered)
+
+        do i = 1, 4
+            @:DEALLOCATE(Msn_filtered_pressure(i)%sf)
+        end do
+        @:DEALLOCATE(Msn_filtered_pressure)
+
+        do i = 1, 9
+            do j = 1, 4
+                @:DEALLOCATE(stat_reynolds_stress(i)%vf(j)%sf)
+            end do
+            @:DEALLOCATE(stat_reynolds_stress(i)%vf)
+        end do
+        @:DEALLOCATE(stat_reynolds_stress)
+
+        do i = 1, 9
+            do j = 1, 4
+                @:DEALLOCATE(stat_eff_visc(i)%vf(j)%sf)
+            end do
+            @:DEALLOCATE(stat_eff_visc(i)%vf)
+        end do
+ 
@:DEALLOCATE(stat_eff_visc) + + do i = 1, 3 + do j = 1, 4 + @:DEALLOCATE(stat_int_mom_exch(i)%vf(j)%sf) + end do + @:DEALLOCATE(stat_int_mom_exch(i)%vf) + end do + @:DEALLOCATE(stat_int_mom_exch) + + do i = 1, sys_size - 1 + do j = 1, 4 + @:DEALLOCATE(stat_q_cons_filtered(i)%vf(j)%sf) + end do + @:DEALLOCATE(stat_q_cons_filtered(i)%vf) + end do + @:DEALLOCATE(stat_q_cons_filtered) + + do i = 1, 4 + @:DEALLOCATE(stat_filtered_pressure(i)%sf) + end do + @:DEALLOCATE(stat_filtered_pressure) + + end subroutine s_finalize_statistics_module + +end module m_compute_statistics diff --git a/src/simulation/m_data_output.fpp b/src/simulation/m_data_output.fpp index 9e46bdd775..c2568ab415 100644 --- a/src/simulation/m_data_output.fpp +++ b/src/simulation/m_data_output.fpp @@ -82,7 +82,10 @@ contains !! @param q_cons_vf Conservative variables !! @param q_prim_vf Primitive variables !! @param t_step Current time step - impure subroutine s_write_data_files(q_cons_vf, q_T_sf, q_prim_vf, t_step, bc_type, beta) + impure subroutine s_write_data_files(q_cons_vf, q_T_sf, q_prim_vf, t_step, bc_type, beta, & + filtered_fluid_indicator_function, & + stat_q_cons_filtered, stat_filtered_pressure, & + stat_reynolds_stress, stat_eff_visc, stat_int_mom_exch) type(scalar_field), & dimension(sys_size), & @@ -104,10 +107,20 @@ contains dimension(1:num_dims, -1:1), & intent(in) :: bc_type + type(scalar_field), intent(inout), optional :: filtered_fluid_indicator_function + type(vector_field), dimension(1:9), intent(inout), optional :: stat_reynolds_stress + type(vector_field), dimension(1:9), intent(inout), optional :: stat_eff_visc + type(vector_field), dimension(1:3), intent(inout), optional :: stat_int_mom_exch + type(vector_field), dimension(1:5), intent(inout), optional :: stat_q_cons_filtered + type(scalar_field), dimension(1:4), intent(inout), optional :: stat_filtered_pressure + if (.not. parallel_io) then call s_write_serial_data_files(q_cons_vf, q_T_sf, q_prim_vf, t_step, bc_type, beta) else - call s_write_parallel_data_files(q_cons_vf, t_step, bc_type, beta) + call s_write_parallel_data_files(q_cons_vf, t_step, bc_type, beta, & + filtered_fluid_indicator_function, & + stat_q_cons_filtered, stat_filtered_pressure, & + stat_reynolds_stress, stat_eff_visc, stat_int_mom_exch) end if end subroutine s_write_data_files @@ -783,11 +796,21 @@ contains !! @param q_cons_vf Cell-average conservative variables !! @param t_step Current time-step !! 
@param beta Eulerian void fraction from lagrangian bubbles
-    impure subroutine s_write_parallel_data_files(q_cons_vf, t_step, bc_type, beta)
+    impure subroutine s_write_parallel_data_files(q_cons_vf, t_step, bc_type, beta, &
+                                                  filtered_fluid_indicator_function, &
+                                                  stat_q_cons_filtered, stat_filtered_pressure, &
+                                                  stat_reynolds_stress, stat_eff_visc, stat_int_mom_exch)
 type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf
 integer, intent(in) :: t_step
 type(scalar_field), intent(inout), optional :: beta
+        type(scalar_field), intent(inout), optional :: filtered_fluid_indicator_function
+        type(vector_field), dimension(1:sys_size - 1), intent(inout), optional :: stat_q_cons_filtered
+        type(scalar_field), dimension(1:4), intent(inout), optional :: stat_filtered_pressure
+        type(vector_field), dimension(1:9), intent(inout), optional :: stat_reynolds_stress
+        type(vector_field), dimension(1:9), intent(inout), optional :: stat_eff_visc
+        type(vector_field), dimension(1:3), intent(inout), optional :: stat_int_mom_exch
+
 type(integer_field), &
     dimension(1:num_dims, -1:1), &
     intent(in) :: bc_type
@@ -823,6 +846,8 @@ contains
         if (present(beta)) then
             alt_sys = sys_size + 1
+        else if (q_filtered_wrt .and. (t_step == 0 .or. t_step == t_step_stop)) then
+            alt_sys = sys_size + 1 + 9*4 + 9*4 + 3*4 + 6*4 ! 109 extra filtered/statistics fields
         else
             alt_sys = sys_size
         end if
@@ -930,6 +955,11 @@ contains
         if (ib) then
             call s_initialize_mpi_data(q_cons_vf, ib_markers, levelset, levelset_norm)
+            if (q_filtered_wrt .and. (t_step == 0 .or. t_step == t_step_stop)) then
+                call s_initialize_mpi_data_filtered(filtered_fluid_indicator_function, &
+                                                    stat_q_cons_filtered, stat_filtered_pressure, &
+                                                    stat_reynolds_stress, stat_eff_visc, stat_int_mom_exch)
+            end if
         elseif (present(beta)) then
             call s_initialize_mpi_data(q_cons_vf, beta=beta)
         else
@@ -984,6 +1014,18 @@ contains
                         mpi_p, status, ierr)
                 end do
             end if
+        else if (volume_filtering_momentum_eqn) then
+            do i = 1, alt_sys
+                var_MOK = int(i, MPI_OFFSET_KIND)
+
+                ! Initial displacement to skip at beginning of file
+                disp = m_MOK*max(MOK, n_MOK)*max(MOK, p_MOK)*WP_MOK*(var_MOK - 1)
+
+                call MPI_FILE_SET_VIEW(ifile, disp, mpi_p, MPI_IO_DATA%view(i), &
+                                       'native', mpi_info_int, ierr)
+                call MPI_FILE_WRITE_ALL(ifile, MPI_IO_DATA%var(i)%sf, data_size, &
+                                        mpi_p, status, ierr)
+            end do
         else
             do i = 1, sys_size !TODO: check if correct (sys_size
                 var_MOK = int(i, MPI_OFFSET_KIND)
diff --git a/src/simulation/m_global_parameters.fpp b/src/simulation/m_global_parameters.fpp
index b5fb9a35a3..bf12f8b3b4 100644
--- a/src/simulation/m_global_parameters.fpp
+++ b/src/simulation/m_global_parameters.fpp
@@ -549,6 +549,21 @@ module m_global_parameters
     $:GPU_DECLARE(create='[tau_star,cont_damage_s,alpha_bar]')
     !> @}
+    logical :: periodic_ibs
+    logical :: compute_particle_drag
+    real(wp) :: u_inf_ref !< reference freestream velocity
+    real(wp) :: rho_inf_ref !< reference freestream density
+    real(wp) :: P_inf_ref !< reference freestream pressure
+    logical :: periodic_forcing
+    logical :: volume_filtering_momentum_eqn
+    logical :: store_levelset
+    logical :: slab_domain_decomposition
+    integer :: t_step_stat_start
+    real(wp) :: filter_width
+    logical :: q_filtered_wrt
+
+    $:GPU_DECLARE(create='[u_inf_ref, rho_inf_ref, P_inf_ref, filter_width]')
+
 contains
 !> Assigns default values to the user inputs before reading
@@ -835,6 +850,19 @@ contains
         relativity = .false.
 #:endif
+        periodic_ibs = .false.
+        compute_particle_drag = .false.
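+
+        ! Reference free-stream state used by the periodic forcing source terms;
+        ! dflt_real marks these as unset until the case file provides values.
+        ! A hypothetical case-dictionary sketch, mirroring the other real-valued
+        ! inputs (names as defined above; values are case-dependent):
+        !     "u_inf_ref"    : v1,    # free-stream velocity
+        !     "rho_inf_ref"  : rho,   # free-stream density
+        !     "P_inf_ref"    : P,     # free-stream pressure
+        !     "filter_width" : ...,   # filtering kernel width, set per case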
+ u_inf_ref = dflt_real + rho_inf_ref = dflt_real + P_inf_ref = dflt_real + periodic_forcing = .false. + volume_filtering_momentum_eqn = .false. + store_levelset = .true. + slab_domain_decomposition = .false. + t_step_stat_start = dflt_int + filter_width = dflt_real + q_filtered_wrt = .false. + end subroutine s_assign_default_values_to_user_inputs !> The computation of parameters, the allocation of memory, @@ -1203,6 +1231,9 @@ contains elseif (bubbles_lagrange) then allocate (MPI_IO_DATA%view(1:sys_size + 1)) allocate (MPI_IO_DATA%var(1:sys_size + 1)) + else if (volume_filtering_momentum_eqn) then + allocate (MPI_IO_DATA%view(1:sys_size + 109)) + allocate (MPI_IO_DATA%var(1:sys_size + 109)) else allocate (MPI_IO_DATA%view(1:sys_size)) allocate (MPI_IO_DATA%var(1:sys_size)) @@ -1224,6 +1255,11 @@ contains allocate (MPI_IO_DATA%var(i)%sf(0:m, 0:n, 0:p)) MPI_IO_DATA%var(i)%sf => null() end do + else if (volume_filtering_momentum_eqn) then + do i = sys_size + 1, sys_size + 109 + allocate (MPI_IO_DATA%var(i)%sf(0:m, 0:n, 0:p)) + MPI_IO_DATA%var(i)%sf => null() + end do end if ! Configuring the WENO average flag that will be used to regulate @@ -1425,6 +1461,10 @@ contains do i = 1, sys_size + 1 MPI_IO_DATA%var(i)%sf => null() end do + else if (volume_filtering_momentum_eqn) then + do i = 1, sys_size + 109 + MPI_IO_DATA%var(i)%sf => null() + end do else do i = 1, sys_size MPI_IO_DATA%var(i)%sf => null() diff --git a/src/simulation/m_ibm.fpp b/src/simulation/m_ibm.fpp index 6fe4932574..444b4ab8d5 100644 --- a/src/simulation/m_ibm.fpp +++ b/src/simulation/m_ibm.fpp @@ -54,6 +54,9 @@ module m_ibm logical :: moving_immersed_boundary_flag + real(wp) :: x_domain_beg_glb, x_domain_end_glb, y_domain_beg_glb, y_domain_end_glb, z_domain_beg_glb, z_domain_end_glb !< global domain beginning/end + $:GPU_DECLARE(create='[x_domain_beg_glb, x_domain_end_glb, y_domain_beg_glb, y_domain_end_glb, z_domain_beg_glb, z_domain_end_glb]') + contains !> Allocates memory for the variables in the IBM module @@ -90,6 +93,14 @@ contains integer :: i, j, k integer :: max_num_gps, max_num_inner_gps + call s_mpi_allreduce_min(x_domain%beg, x_domain_beg_glb) + call s_mpi_allreduce_max(x_domain%end, x_domain_end_glb) + call s_mpi_allreduce_min(y_domain%beg, y_domain_beg_glb) + call s_mpi_allreduce_max(y_domain%end, y_domain_end_glb) + call s_mpi_allreduce_min(z_domain%beg, z_domain_beg_glb) + call s_mpi_allreduce_max(z_domain%end, z_domain_end_glb) + $:GPU_UPDATE(device='[x_domain_beg_glb, x_domain_end_glb, y_domain_beg_glb, y_domain_end_glb, z_domain_beg_glb, z_domain_end_glb]') + moving_immersed_boundary_flag = .false. do i = 1, num_ibs if (patch_ib(i)%moving_ibm /= 0) then @@ -136,6 +147,7 @@ contains end subroutine s_ibm_setup subroutine s_populate_ib_buffers() + integer :: j, k, l #:for DIRC, DIRI in [('x', 1), ('y', 2), ('z', 3)] #:for LOCC, LOCI in [('beg', -1), ('end', 1)] @@ -145,6 +157,83 @@ contains #:endfor #:endfor + if (periodic_ibs) then + ! Population of Buffers in x-direction + if (bc_x%beg == BC_PERIODIC) then + $:GPU_PARALLEL_LOOP(collapse=3) + do l = 0, p + do k = 0, n + do j = 1, buff_size + ib_markers%sf(-j, k, l) = & + ib_markers%sf(m - (j - 1), k, l) + end do + end do + end do + end if + + if (bc_x%end == BC_PERIODIC) then + $:GPU_PARALLEL_LOOP(collapse=3) + do l = 0, p + do k = 0, n + do j = 1, buff_size + ib_markers%sf(m + j, k, l) = & + ib_markers%sf(j - 1, k, l) + end do + end do + end do + end if + + ! 
Population of Buffers in y-direction + if (bc_y%beg == BC_PERIODIC) then + $:GPU_PARALLEL_LOOP(collapse=3) + do l = 0, p + do k = -buff_size, m + buff_size + do j = 1, buff_size + ib_markers%sf(k, -j, l) = & + ib_markers%sf(k, n - (j - 1), l) + end do + end do + end do + end if + + if (bc_y%end == BC_PERIODIC) then + $:GPU_PARALLEL_LOOP(collapse=3) + do l = 0, p + do k = -buff_size, m + buff_size + do j = 1, buff_size + ib_markers%sf(k, n + j, l) = & + ib_markers%sf(k, j - 1, l) + end do + end do + end do + end if + + ! Population of Buffers in z-direction + if (bc_z%beg == BC_PERIODIC) then + $:GPU_PARALLEL_LOOP(collapse=3) + do l = -buff_size, n + buff_size + do k = -buff_size, m + buff_size + do j = 1, buff_size + ib_markers%sf(k, l, -j) = & + ib_markers%sf(k, l, p - (j - 1)) + end do + end do + end do + end if + + if (bc_z%end == BC_PERIODIC) then + $:GPU_PARALLEL_LOOP(collapse=3) + do l = -buff_size, n + buff_size + do k = -buff_size, m + buff_size + do j = 1, buff_size + ib_markers%sf(k, l, p + j) = & + ib_markers%sf(k, l, j - 1) + end do + end do + end do + end if + end if + end subroutine s_populate_ib_buffers !> Subroutine that updates the conservative variables at the ghost points @@ -397,6 +486,13 @@ contains integer :: dir integer :: index + real(wp) :: radius, x_centroid, y_centroid, z_centroid + real(wp) :: x_pcen, y_pcen, z_pcen + real(wp) :: dist_calc + real(wp), dimension(3) :: dist_vec + real(wp), dimension(7, 3) :: dist_vec_per + real(wp), dimension(7) :: dist_per + do q = 1, num_gps gp = ghost_points_in(q) i = gp%loc(1) @@ -412,8 +508,106 @@ contains ! Calculate and store the precise location of the image point patch_id = gp%ib_patch_id - dist = abs(levelset_in%sf(i, j, k, patch_id)) - norm(:) = levelset_norm_in%sf(i, j, k, patch_id, :) + if (store_levelset) then + dist = abs(levelset%sf(i, j, k, patch_id)) + norm(:) = levelset_norm%sf(i, j, k, patch_id, :) + else ! compute levelset and levelset_norm on the fly + radius = patch_ib(patch_id)%radius + x_centroid = patch_ib(patch_id)%x_centroid + y_centroid = patch_ib(patch_id)%y_centroid + z_centroid = patch_ib(patch_id)%z_centroid + dist_vec(1) = x_cc(i) - x_centroid + dist_vec(2) = y_cc(j) - y_centroid + dist_vec(3) = z_cc(k) - z_centroid + dist_calc = sqrt(sum(dist_vec**2)) + ! 
all permutations of periodically projected ib + if (periodic_ibs) then + if ((x_centroid - x_domain_beg_glb) <= radius) then + x_pcen = x_domain_end_glb + (x_centroid - x_domain_beg_glb) + else if ((x_domain_end_glb - x_centroid) <= radius) then + x_pcen = x_domain_beg_glb - (x_domain_end_glb - x_centroid) + else + x_pcen = x_centroid + end if + if ((y_centroid - y_domain_beg_glb) <= radius) then + y_pcen = y_domain_end_glb + (y_centroid - y_domain_beg_glb) + else if ((y_domain_end_glb - y_centroid) <= radius) then + y_pcen = y_domain_beg_glb - (y_domain_end_glb - y_centroid) + else + y_pcen = y_centroid + end if + if ((z_centroid - z_domain_beg_glb) <= radius) then + z_pcen = z_domain_end_glb + (z_centroid - z_domain_beg_glb) + else if ((z_domain_end_glb - z_centroid) <= radius) then + z_pcen = z_domain_beg_glb - (z_domain_end_glb - z_centroid) + else + z_pcen = z_centroid + end if + dist_vec_per(1, 1) = x_cc(i) - x_pcen + dist_vec_per(1, 2) = y_cc(j) - y_pcen + dist_vec_per(1, 3) = z_cc(k) - z_pcen + dist_per(1) = sqrt(sum(dist_vec_per(1, :)**2)) + if (dist_per(1) < dist_calc) then + dist_calc = dist_per(1) + dist_vec = dist_vec_per(1, :) + end if + dist_vec_per(2, 1) = x_cc(i) - x_pcen + dist_vec_per(2, 2) = y_cc(j) - y_centroid + dist_vec_per(2, 3) = z_cc(k) - z_pcen + dist_per(2) = sqrt(sum(dist_vec_per(2, :)**2)) + if (dist_per(2) < dist_calc) then + dist_calc = dist_per(2) + dist_vec = dist_vec_per(2, :) + end if + dist_vec_per(3, 1) = x_cc(i) - x_pcen + dist_vec_per(3, 2) = y_cc(j) - y_pcen + dist_vec_per(3, 3) = z_cc(k) - z_centroid + dist_per(3) = sqrt(sum(dist_vec_per(3, :)**2)) + if (dist_per(3) < dist_calc) then + dist_calc = dist_per(3) + dist_vec = dist_vec_per(3, :) + end if + dist_vec_per(4, 1) = x_cc(i) - x_pcen + dist_vec_per(4, 2) = y_cc(j) - y_centroid + dist_vec_per(4, 3) = z_cc(k) - z_centroid + dist_per(4) = sqrt(sum(dist_vec_per(4, :)**2)) + if (dist_per(4) < dist_calc) then + dist_calc = dist_per(4) + dist_vec = dist_vec_per(4, :) + end if + dist_vec_per(5, 1) = x_cc(i) - x_centroid + dist_vec_per(5, 2) = y_cc(j) - y_pcen + dist_vec_per(5, 3) = z_cc(k) - z_pcen + dist_per(5) = sqrt(sum(dist_vec_per(5, :)**2)) + if (dist_per(5) < dist_calc) then + dist_calc = dist_per(5) + dist_vec = dist_vec_per(5, :) + end if + dist_vec_per(6, 1) = x_cc(i) - x_centroid + dist_vec_per(6, 2) = y_cc(j) - y_pcen + dist_vec_per(6, 3) = z_cc(k) - z_centroid + dist_per(6) = sqrt(sum(dist_vec_per(6, :)**2)) + if (dist_per(6) < dist_calc) then + dist_calc = dist_per(6) + dist_vec = dist_vec_per(6, :) + end if + dist_vec_per(7, 1) = x_cc(i) - x_centroid + dist_vec_per(7, 2) = y_cc(j) - y_centroid + dist_vec_per(7, 3) = z_cc(k) - z_pcen + dist_per(7) = sqrt(sum(dist_vec_per(7, :)**2)) + if (dist_per(7) < dist_calc) then + dist_calc = dist_per(7) + dist_vec = dist_vec_per(7, :) + end if + end if + dist = abs(dist_calc - radius) + if (dist_calc == 0) then + norm(:) = (/1, 0, 0/) + else + norm(:) = dist_vec(:)/dist_calc + end if + end if ! end store_levelset if statement + ghost_points_in(q)%ip_loc(:) = physical_loc(:) + 2*dist*norm(:) ! Find the closest grid point to the image point @@ -422,13 +616,13 @@ contains ! s_cc points to the dim array we need if (dim == 1) then s_cc => x_cc - bound = m + buff_size - 1 + bound = m + buff_size elseif (dim == 2) then s_cc => y_cc - bound = n + buff_size - 1 + bound = n + buff_size else s_cc => z_cc - bound = p + buff_size - 1 + bound = p + buff_size end if if (f_approx_equal(norm(dim), 0._wp)) then @@ -447,7 +641,7 @@ contains .or. 
temp_loc > s_cc(index + 1))) index = index + dir if (index < -buff_size .or. index > bound) then - print *, "temp_loc=", temp_loc, " s_cc(index)=", s_cc(index), " s_cc(index+1)=", s_cc(index + 1) + print *, "proc_rank=", proc_rank, "temp_loc=", temp_loc, " index=", index, "ib=", patch_id, "dim", dim, "dir", dir, "i, j, k", i, j, k print *, "Increase buff_size further in m_helper_basic (currently set to a minimum of 10)" error stop "Increase buff_size" end if @@ -475,6 +669,9 @@ contains :: subsection_2D integer, dimension(2*gp_layers + 1, 2*gp_layers + 1, 2*gp_layers + 1) & :: subsection_3D + integer, dimension(2*gp_layers + 1) :: subsection_x + integer, dimension(2*gp_layers + 1) :: subsection_y + integer, dimension(2*gp_layers + 1) :: subsection_z integer :: i, j, k!< Iterator variables num_gps_out = 0 @@ -496,11 +693,22 @@ contains else do k = 0, p if (ib_markers%sf(i, j, k) /= 0) then - subsection_3D = ib_markers%sf( & - i - gp_layers:i + gp_layers, & - j - gp_layers:j + gp_layers, & - k - gp_layers:k + gp_layers) - if (any(subsection_3D == 0)) then + ! subsection_3D = ib_markers%sf( & + ! i - gp_layers:i + gp_layers, & + ! j - gp_layers:j + gp_layers, & + ! k - gp_layers:k + gp_layers) + ! if (any(subsection_3D == 0)) then + ! num_gps_out = num_gps_out + 1 + ! else + ! num_inner_gps_out = num_inner_gps_out + 1 + ! end if + + subsection_x = ib_markers%sf(i - gp_layers:i + gp_layers, j, k) + subsection_y = ib_markers%sf(i, j - gp_layers:j + gp_layers, k) + subsection_z = ib_markers%sf(i, j, k - gp_layers:k + gp_layers) + if (any(subsection_x == 0) .or. & + any(subsection_y == 0) .or. & + any(subsection_z == 0)) then num_gps_out = num_gps_out + 1 else num_inner_gps_out = num_inner_gps_out + 1 @@ -522,6 +730,9 @@ contains :: subsection_2D integer, dimension(2*gp_layers + 1, 2*gp_layers + 1, 2*gp_layers + 1) & :: subsection_3D + integer, dimension(2*gp_layers + 1) :: subsection_x + integer, dimension(2*gp_layers + 1) :: subsection_y + integer, dimension(2*gp_layers + 1) :: subsection_z integer :: i, j, k !< Iterator variables integer :: count, count_i integer :: patch_id @@ -578,11 +789,19 @@ contains ! 3D do k = 0, p if (ib_markers%sf(i, j, k) /= 0) then - subsection_3D = ib_markers%sf( & - i - gp_layers:i + gp_layers, & - j - gp_layers:j + gp_layers, & - k - gp_layers:k + gp_layers) - if (any(subsection_3D == 0)) then + ! subsection_3D = ib_markers%sf( & + ! i - gp_layers:i + gp_layers, & + ! j - gp_layers:j + gp_layers, & + ! k - gp_layers:k + gp_layers) + ! if (any(subsection_3D == 0)) then + + subsection_x = ib_markers%sf(i - gp_layers:i + gp_layers, j, k) + subsection_y = ib_markers%sf(i, j - gp_layers:j + gp_layers, k) + subsection_z = ib_markers%sf(i, j, k - gp_layers:k + gp_layers) + if (any(subsection_x == 0) .or. & + any(subsection_y == 0) .or. 
& + any(subsection_z == 0)) then + ghost_points_in(count)%loc = [i, j, k] patch_id = ib_markers%sf(i, j, k) ghost_points_in(count)%ib_patch_id = & @@ -952,8 +1171,10 @@ contains impure subroutine s_finalize_ibm_module() @:DEALLOCATE(ib_markers%sf) - @:DEALLOCATE(levelset%sf) - @:DEALLOCATE(levelset_norm%sf) + if (store_levelset) then + @:DEALLOCATE(levelset%sf) + @:DEALLOCATE(levelset_norm%sf) + end if end subroutine s_finalize_ibm_module diff --git a/src/simulation/m_mpi_proxy.fpp b/src/simulation/m_mpi_proxy.fpp index 069cd9a8d6..7d21482e6a 100644 --- a/src/simulation/m_mpi_proxy.fpp +++ b/src/simulation/m_mpi_proxy.fpp @@ -100,7 +100,7 @@ contains & 'num_probes', 'num_integrals', 'bubble_model', 'thermal', & & 'num_source', 'relax_model', 'num_ibs', 'n_start', & & 'num_bc_patches', 'num_igr_iters', 'num_igr_warm_start_iters', & - & 'adap_dt_max_iters' ] + & 'adap_dt_max_iters', 't_step_stat_start' ] call MPI_BCAST(${VAR}$, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr) #:endfor @@ -116,7 +116,10 @@ contains & 'bc_z%grcbc_in', 'bc_z%grcbc_out', 'bc_z%grcbc_vel_out', & & 'cfl_adap_dt', 'cfl_const_dt', 'cfl_dt', 'surface_tension', & & 'shear_stress', 'bulk_stress', 'bubbles_lagrange', & - & 'hyperelasticity', 'down_sample', 'int_comp','fft_wrt' ] + & 'hyperelasticity', 'down_sample', 'int_comp','fft_wrt', & + & 'periodic_ibs', 'compute_particle_drag', 'periodic_forcing', & + & 'volume_filtering_momentum_eqn', 'store_levelset', & + & 'slab_domain_decomposition', 'q_filtered_wrt' ] call MPI_BCAST(${VAR}$, 1, MPI_LOGICAL, 0, MPI_COMM_WORLD, ierr) #:endfor @@ -156,7 +159,8 @@ contains & 'z_domain%beg', 'z_domain%end', 'x_a', 'x_b', 'y_a', 'y_b', 'z_a', & & 'z_b', 't_stop', 't_save', 'cfl_target', 'Bx0', 'alf_factor', & & 'tau_star', 'cont_damage_s', 'alpha_bar', 'adap_dt_tol', & - & 'ic_eps', 'ic_beta' ] + & 'ic_eps', 'ic_beta', 'u_inf_ref', 'rho_inf_ref', 'P_inf_ref', & + & 'filter_width' ] call MPI_BCAST(${VAR}$, 1, mpi_p, 0, MPI_COMM_WORLD, ierr) #:endfor @@ -202,8 +206,8 @@ contains end do do i = 1, num_ibs - #:for VAR in [ 'radius', 'length_x', 'length_y', & - & 'x_centroid', 'y_centroid', 'c', 'm', 'p', 't', 'theta', 'slip'] + #:for VAR in [ 'radius', 'length_x', 'length_y', 'length_z', & + & 'x_centroid', 'y_centroid', 'z_centroid', 'c', 'm', 'p', 't', 'theta', 'slip'] call MPI_BCAST(patch_ib(i)%${VAR}$, 1, mpi_p, 0, MPI_COMM_WORLD, ierr) #:endfor #:for VAR in ['vel', 'angular_vel', 'angles'] diff --git a/src/simulation/m_rhs.fpp b/src/simulation/m_rhs.fpp index cdcb418075..af74e780f3 100644 --- a/src/simulation/m_rhs.fpp +++ b/src/simulation/m_rhs.fpp @@ -71,6 +71,8 @@ module m_rhs use m_pressure_relaxation + use m_additional_forcing + implicit none private; public :: s_initialize_rhs_module, & @@ -1028,6 +1030,12 @@ contains if (cont_damage) call s_compute_damage_state(q_cons_qp%vf, rhs_vf) + if (periodic_forcing) then + call nvtxStartRange("COMPUTE-PERIODIC-FORCING") + call s_compute_periodic_forcing(rhs_vf, q_cons_vf, q_prim_vf, t_step + 1) + call nvtxEndRange + end if + ! END: Additional physics and source terms if (run_time_info .or. probe_wrt .or. ib .or.
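The ghost-point search above (both the counting pass and the assembly pass) replaces the full (2*gp_layers+1)^3 cube scan with three axis-aligned 1D windows. The cheaper test can classify a cell differently when its only exposure to fluid is diagonal, as this toy NumPy demonstration shows:

import numpy as np

g = 2  # gp_layers
mk = np.ones((9, 9, 9), dtype=int)   # 1 = solid, 0 = fluid
mk[0, 0, 0] = 0                      # a single fluid cell on a cube corner

i = j = k = 2                        # candidate cell, diagonal to that corner
cube = mk[i-g:i+g+1, j-g:j+g+1, k-g:k+g+1]
axes = (mk[i-g:i+g+1, j, k], mk[i, j-g:j+g+1, k], mk[i, j, k-g:k+g+1])

print(np.any(cube == 0))                  # True: cube test flags a ghost point
print(any(np.any(a == 0) for a in axes))  # False: axis test calls it inner

Whether that corner case matters depends on gp_layers relative to how finely the spheres are resolved; for smooth, well-resolved geometries the set of cells where the two tests disagree is small, and the axis test cuts the per-cell work from (2g+1)^3 to 3(2g+1) reads.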
bubbles_lagrange) then diff --git a/src/simulation/m_start_up.fpp b/src/simulation/m_start_up.fpp index 91c9dd29eb..1de82fd02c 100644 --- a/src/simulation/m_start_up.fpp +++ b/src/simulation/m_start_up.fpp @@ -95,6 +95,12 @@ module m_start_up use m_igr + use m_additional_forcing + + use m_volume_filtering + + use m_compute_statistics + implicit none private; public :: s_read_input_file, & @@ -189,7 +195,11 @@ contains cont_damage, tau_star, cont_damage_s, alpha_bar, & alf_factor, num_igr_iters, num_igr_warm_start_iters, & int_comp, ic_eps, ic_beta, nv_uvm_out_of_core, & - nv_uvm_igr_temps_on_gpu, nv_uvm_pref_gpu, down_sample, fft_wrt + nv_uvm_igr_temps_on_gpu, nv_uvm_pref_gpu, down_sample, fft_wrt, & + periodic_ibs, compute_particle_drag, u_inf_ref, rho_inf_ref, P_inf_ref, & + periodic_forcing, volume_filtering_momentum_eqn, store_levelset, & + slab_domain_decomposition, t_step_stat_start, & + filter_width, q_filtered_wrt ! Checking that an input file has been provided by the user. If it ! has, then the input file is read in, otherwise, simulation exits. @@ -445,33 +455,35 @@ contains call s_mpi_abort(trim(file_path)//' is missing. Exiting.') end if - ! Read Levelset - write (file_path, '(A)') & - trim(t_step_dir)//'/levelset.dat' - inquire (FILE=trim(file_path), EXIST=file_exist) - if (file_exist) then - open (2, FILE=trim(file_path), & - FORM='unformatted', & - ACTION='read', & - STATUS='old') - read (2) levelset%sf(0:m, 0:n, 0:p, 1:num_ibs); close (2) - ! print*, 'check', STL_levelset(106, 50, 0, 1) - else - call s_mpi_abort(trim(file_path)//' is missing. Exiting.') - end if + if (store_levelset) then + ! Read Levelset + write (file_path, '(A)') & + trim(t_step_dir)//'/levelset.dat' + inquire (FILE=trim(file_path), EXIST=file_exist) + if (file_exist) then + open (2, FILE=trim(file_path), & + FORM='unformatted', & + ACTION='read', & + STATUS='old') + read (2) levelset%sf(0:m, 0:n, 0:p, 1:num_ibs); close (2) + ! print*, 'check', STL_levelset(106, 50, 0, 1) + else + call s_mpi_abort(trim(file_path)//' is missing. Exiting.') + end if - ! Read Levelset Norm - write (file_path, '(A)') & - trim(t_step_dir)//'/levelset_norm.dat' - inquire (FILE=trim(file_path), EXIST=file_exist) - if (file_exist) then - open (2, FILE=trim(file_path), & - FORM='unformatted', & - ACTION='read', & - STATUS='old') - read (2) levelset_norm%sf(0:m, 0:n, 0:p, 1:num_ibs, 1:3); close (2) - else - call s_mpi_abort(trim(file_path)//' is missing. Exiting.') + ! Read Levelset Norm + write (file_path, '(A)') & + trim(t_step_dir)//'/levelset_norm.dat' + inquire (FILE=trim(file_path), EXIST=file_exist) + if (file_exist) then + open (2, FILE=trim(file_path), & + FORM='unformatted', & + ACTION='read', & + STATUS='old') + read (2) levelset_norm%sf(0:m, 0:n, 0:p, 1:num_ibs, 1:3); close (2) + else + call s_mpi_abort(trim(file_path)//' is missing. Exiting.') + end if end if do i = 1, num_ibs @@ -740,44 +752,46 @@ contains call s_mpi_abort('File '//trim(file_loc)//' is missing. Exiting.') end if - ! Read Levelset - write (file_loc, '(A)') 'levelset.dat' - file_loc = trim(case_dir)//'/restart_data'//trim(mpiiofs)//trim(file_loc) - inquire (FILE=trim(file_loc), EXIST=file_exist) + if (store_levelset) then + ! 
Read Levelset + write (file_loc, '(A)') 'levelset.dat' + file_loc = trim(case_dir)//'/restart_data'//trim(mpiiofs)//trim(file_loc) + inquire (FILE=trim(file_loc), EXIST=file_exist) - if (file_exist) then + if (file_exist) then - call MPI_FILE_OPEN(MPI_COMM_WORLD, file_loc, MPI_MODE_RDONLY, mpi_info_int, ifile, ierr) + call MPI_FILE_OPEN(MPI_COMM_WORLD, file_loc, MPI_MODE_RDONLY, mpi_info_int, ifile, ierr) - disp = 0 + disp = 0 - call MPI_FILE_SET_VIEW(ifile, disp, mpi_p, MPI_IO_levelset_DATA%view, & - 'native', mpi_info_int, ierr) - call MPI_FILE_READ(ifile, MPI_IO_levelset_DATA%var%sf, data_size * num_ibs, & - mpi_p, status, ierr) + call MPI_FILE_SET_VIEW(ifile, disp, mpi_p, MPI_IO_levelset_DATA%view, & + 'native', mpi_info_int, ierr) + call MPI_FILE_READ(ifile, MPI_IO_levelset_DATA%var%sf, data_size * num_ibs, & + mpi_p, status, ierr) - else - call s_mpi_abort('File '//trim(file_loc)//' is missing. Exiting.') - end if + else + call s_mpi_abort('File '//trim(file_loc)//' is missing. Exiting.') + end if - ! Read Levelset Norm - write (file_loc, '(A)') 'levelset_norm.dat' - file_loc = trim(case_dir)//'/restart_data'//trim(mpiiofs)//trim(file_loc) - inquire (FILE=trim(file_loc), EXIST=file_exist) + ! Read Levelset Norm + write (file_loc, '(A)') 'levelset_norm.dat' + file_loc = trim(case_dir)//'/restart_data'//trim(mpiiofs)//trim(file_loc) + inquire (FILE=trim(file_loc), EXIST=file_exist) - if (file_exist) then + if (file_exist) then - call MPI_FILE_OPEN(MPI_COMM_WORLD, file_loc, MPI_MODE_RDONLY, mpi_info_int, ifile, ierr) + call MPI_FILE_OPEN(MPI_COMM_WORLD, file_loc, MPI_MODE_RDONLY, mpi_info_int, ifile, ierr) - disp = 0 + disp = 0 - call MPI_FILE_SET_VIEW(ifile, disp, mpi_p, MPI_IO_levelsetnorm_DATA%view, & - 'native', mpi_info_int, ierr) - call MPI_FILE_READ(ifile, MPI_IO_levelsetnorm_DATA%var%sf, data_size * num_ibs * 3, & - mpi_p, status, ierr) + call MPI_FILE_SET_VIEW(ifile, disp, mpi_p, MPI_IO_levelsetnorm_DATA%view, & + 'native', mpi_info_int, ierr) + call MPI_FILE_READ(ifile, MPI_IO_levelsetnorm_DATA%var%sf, data_size * num_ibs * 3, & + mpi_p, status, ierr) - else - call s_mpi_abort('File '//trim(file_loc)//' is missing. Exiting.') + else + call s_mpi_abort('File '//trim(file_loc)//' is missing. Exiting.') + end if end if end if @@ -889,44 +903,46 @@ contains call s_mpi_abort('File '//trim(file_loc)//' is missing. Exiting.') end if - ! Read Levelset - write (file_loc, '(A)') 'levelset.dat' - file_loc = trim(case_dir)//'/restart_data'//trim(mpiiofs)//trim(file_loc) - inquire (FILE=trim(file_loc), EXIST=file_exist) + if (store_levelset) then + ! Read Levelset + write (file_loc, '(A)') 'levelset.dat' + file_loc = trim(case_dir)//'/restart_data'//trim(mpiiofs)//trim(file_loc) + inquire (FILE=trim(file_loc), EXIST=file_exist) - if (file_exist) then + if (file_exist) then - call MPI_FILE_OPEN(MPI_COMM_WORLD, file_loc, MPI_MODE_RDONLY, mpi_info_int, ifile, ierr) + call MPI_FILE_OPEN(MPI_COMM_WORLD, file_loc, MPI_MODE_RDONLY, mpi_info_int, ifile, ierr) - disp = 0 + disp = 0 - call MPI_FILE_SET_VIEW(ifile, disp, mpi_p, MPI_IO_levelset_DATA%view, & - 'native', mpi_info_int, ierr) - call MPI_FILE_READ(ifile, MPI_IO_levelset_DATA%var%sf, data_size * num_ibs, & - mpi_p, status, ierr) + call MPI_FILE_SET_VIEW(ifile, disp, mpi_p, MPI_IO_levelset_DATA%view, & + 'native', mpi_info_int, ierr) + call MPI_FILE_READ(ifile, MPI_IO_levelset_DATA%var%sf, data_size * num_ibs, & + mpi_p, status, ierr) - else - call s_mpi_abort('File '//trim(file_loc)//' is missing. 
Exiting.') - end if + else + call s_mpi_abort('File '//trim(file_loc)//' is missing. Exiting.') + end if - ! Read Levelset Norm - write (file_loc, '(A)') 'levelset_norm.dat' - file_loc = trim(case_dir)//'/restart_data'//trim(mpiiofs)//trim(file_loc) - inquire (FILE=trim(file_loc), EXIST=file_exist) + ! Read Levelset Norm + write (file_loc, '(A)') 'levelset_norm.dat' + file_loc = trim(case_dir)//'/restart_data'//trim(mpiiofs)//trim(file_loc) + inquire (FILE=trim(file_loc), EXIST=file_exist) - if (file_exist) then + if (file_exist) then - call MPI_FILE_OPEN(MPI_COMM_WORLD, file_loc, MPI_MODE_RDONLY, mpi_info_int, ifile, ierr) + call MPI_FILE_OPEN(MPI_COMM_WORLD, file_loc, MPI_MODE_RDONLY, mpi_info_int, ifile, ierr) - disp = 0 + disp = 0 - call MPI_FILE_SET_VIEW(ifile, disp, mpi_p, MPI_IO_levelsetnorm_DATA%view, & - 'native', mpi_info_int, ierr) - call MPI_FILE_READ(ifile, MPI_IO_levelsetnorm_DATA%var%sf, data_size * num_ibs * 3, & - mpi_p, status, ierr) + call MPI_FILE_SET_VIEW(ifile, disp, mpi_p, MPI_IO_levelsetnorm_DATA%view, & + 'native', mpi_info_int, ierr) + call MPI_FILE_READ(ifile, MPI_IO_levelsetnorm_DATA%var%sf, data_size * num_ibs * 3, & + mpi_p, status, ierr) - else - call s_mpi_abort('File '//trim(file_loc)//' is missing. Exiting.') + else + call s_mpi_abort('File '//trim(file_loc)//' is missing. Exiting.') + end if end if end if @@ -1134,6 +1150,25 @@ contains call s_compute_derived_variables(t_step) + ! Volume filter flow variables, compute unclosed terms and their statistics + if (volume_filtering_momentum_eqn) then + if (t_step > t_step_stat_start) then + call nvtxStartRange("VOLUME-FILTER-MOMENTUM-EQUATION") + call s_volume_filter_momentum_eqn(q_cons_ts(1)%vf, q_prim_vf, bc_type) + call nvtxEndRange + + call nvtxStartRange("COMPUTE-STATISTICS") + call s_compute_statistics_momentum_unclosed_terms(t_step, t_step_stat_start, reynolds_stress, eff_visc, int_mom_exch, q_cons_filtered, filtered_pressure) + call nvtxEndRange + + ! Compute explicit x-, y-, z- forces on each particle + if (compute_particle_drag) then + call nvtxStartRange("COMPUTE-PARTICLE-FORCES") + call s_compute_particle_forces() + call nvtxEndRange + end if + end if + end if #ifdef DEBUG print *, 'Computed derived vars' @@ -1226,6 +1261,28 @@ contains call cpu_time(start) call nvtxStartRange("SAVE-DATA") + if (q_filtered_wrt .and. (t_step == 0 .or. t_step == t_step_stop)) then + $:GPU_UPDATE(host='[filtered_fluid_indicator_function%sf]') + do i = 1, 9 + do j = 1, 4 + $:GPU_UPDATE(host='[stat_reynolds_stress(i)%vf(j)%sf]') + $:GPU_UPDATE(host='[stat_eff_visc(i)%vf(j)%sf]') + end do + end do + do i = 1, 3 + do j = 1, 4 + $:GPU_UPDATE(host='[stat_int_mom_exch(i)%vf(j)%sf]') + end do + end do + do i = 1, sys_size-1 + do j = 1, 4 + $:GPU_UPDATE(host='[stat_q_cons_filtered(i)%vf(j)%sf]') + end do + end do + do i = 1, 4 + $:GPU_UPDATE(host='[stat_filtered_pressure(i)%sf]') + end do + end if do i = 1, sys_size $:GPU_UPDATE(host='[q_cons_ts(1)%vf(i)%sf]') do l = 0, p @@ -1266,6 +1323,11 @@ contains call s_write_restart_lag_bubbles(save_count) !parallel if (lag_params%write_bubbles_stats) call s_write_lag_bubble_stats() + else if (volume_filtering_momentum_eqn .and. (t_step == 0 .or. 
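The hook above volume-filters the state once t_step passes t_step_stat_start and hands the unclosed terms to the statistics module. The four device-resident slots per field (the j = 1, 4 update loops) suggest running statistical moments, but their exact definition lives in m_compute_statistics, which is not part of this patch. Purely as an illustration of that online-accumulation pattern (an assumption, not the module's actual contents), a two-slot running mean/variance in NumPy looks like:

import numpy as np

rng = np.random.default_rng(0)
shape = (4, 4, 4)
sums = np.zeros((2,) + shape)   # slot 0: sum(q), slot 1: sum(q**2)
nsamp = 0

for t_step in range(1000):
    q = rng.normal(size=shape)  # stand-in for one filtered field at this step
    sums[0] += q
    sums[1] += q**2
    nsamp += 1

mean = sums[0] / nsamp
var = sums[1] / nsamp - mean**2
print(mean.mean(), var.mean())  # ~0 and ~1 for unit-normal samples

Accumulating per-cell power sums on the device and only copying them back at t_step == 0 or t_step_stop, as the GPU_UPDATE blocks above do, keeps the statistics out of the host-device traffic on every step.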
t_step == t_step_stop)) then + call s_write_data_files(q_cons_ts(1)%vf, q_T_sf, q_prim_vf, save_count, bc_type, & + filtered_fluid_indicator_function=filtered_fluid_indicator_function, & + stat_q_cons_filtered=stat_q_cons_filtered, stat_filtered_pressure=stat_filtered_pressure, & + stat_reynolds_stress=stat_reynolds_stress, stat_eff_visc=stat_eff_visc, stat_int_mom_exch=stat_int_mom_exch) else call s_write_data_files(q_cons_ts(1)%vf, q_T_sf, q_prim_vf, save_count, bc_type) end if @@ -1393,6 +1455,12 @@ contains if (mhd .and. powell) call s_initialize_mhd_powell_module + if (periodic_forcing) call s_initialize_additional_forcing_module() + if (volume_filtering_momentum_eqn) then + call s_initialize_fftw_explicit_filter_module() + call s_initialize_statistics_module() + end if + end subroutine s_initialize_modules impure subroutine s_initialize_mpi_domain @@ -1517,6 +1585,7 @@ contains end if $:GPU_UPDATE(device='[igr, igr_order]') + $:GPU_UPDATE(device='[u_inf_ref, rho_inf_ref, P_inf_ref, filter_width]') end subroutine s_initialize_gpu_vars @@ -1555,6 +1624,9 @@ contains if (bodyForces) call s_finalize_body_forces_module() if (mhd .and. powell) call s_finalize_mhd_powell_module + if (periodic_forcing) call s_finalize_additional_forcing_module() + if (volume_filtering_momentum_eqn) call s_finalize_fftw_explicit_filter_module + ! Terminating MPI execution environment call s_mpi_finalize() end subroutine s_finalize_modules diff --git a/src/simulation/m_time_steppers.fpp b/src/simulation/m_time_steppers.fpp index d272c842aa..4651f484d2 100644 --- a/src/simulation/m_time_steppers.fpp +++ b/src/simulation/m_time_steppers.fpp @@ -46,6 +46,10 @@ module m_time_steppers use m_body_forces + use m_volume_filtering + + use m_compute_statistics + implicit none type(vector_field), allocatable, dimension(:) :: q_cons_ts !< @@ -57,7 +61,7 @@ module m_time_steppers type(scalar_field), allocatable, dimension(:) :: rhs_vf !< !! Cell-average RHS variables at the current time-stage - type(integer_field), allocatable, dimension(:, :) :: bc_type !< + type(integer_field), allocatable, dimension(:, :), public :: bc_type !< !! 
Boundary condition identifiers type(vector_field), allocatable, dimension(:) :: q_prim_ts !< diff --git a/src/simulation/m_volume_filtering.fpp b/src/simulation/m_volume_filtering.fpp new file mode 100644 index 0000000000..94f9804955 --- /dev/null +++ b/src/simulation/m_volume_filtering.fpp @@ -0,0 +1,1926 @@ +#:include 'macros.fpp' + +module m_volume_filtering + + use, intrinsic :: iso_c_binding + + use m_derived_types !< Definitions of the derived types + + use m_global_parameters !< Definitions of the global parameters + + use m_mpi_proxy !< Message passing interface (MPI) module proxy + + use m_ibm + + use m_boundary_common + + use m_nvtx + +#ifdef MFC_MPI + use mpi !< Message passing interface (MPI) module +#endif + +#if defined(MFC_OpenACC) && defined(__PGI) + use cufft +#endif + + implicit none + + private; public :: s_initialize_fftw_explicit_filter_module, & + s_initialize_filtering_kernel, s_initialize_fluid_indicator_function, & + s_initialize_filtered_fluid_indicator_function, s_initialize_fluid_indicator_gradient, & + s_finalize_fftw_explicit_filter_module, s_volume_filter_momentum_eqn, s_apply_fftw_filter_scalarfield, s_filter_batch, & + s_compute_viscous_stress_tensor, s_compute_stress_tensor, s_compute_divergence_stress_tensor, s_compute_particle_forces, & + s_mpi_transpose_slabZ2Y, s_mpi_transpose_slabY2Z, s_mpi_transpose_slabZ2Y_batch, s_mpi_transpose_slabY2Z_batch, & + s_mpi_FFT_fwd, s_mpi_FFT_bwd, s_setup_terms_filtering, s_compute_pseudo_turbulent_reynolds_stress, s_compute_effective_viscosity + +#if !defined(MFC_OpenACC) + include 'fftw3.f03' +#endif + + integer :: ierr + + ! fluid indicator function (1 = fluid, 0 = otherwise) + type(scalar_field), public :: fluid_indicator_function + type(scalar_field), public :: filtered_fluid_indicator_function + type(scalar_field), allocatable, dimension(:) :: grad_fluid_indicator + + ! volume filtered conservative variables + type(scalar_field), allocatable, dimension(:), public :: q_cons_filtered + type(scalar_field), public :: filtered_pressure + + ! viscous and pressure+viscous stress tensors + type(vector_field), allocatable, dimension(:) :: visc_stress + type(vector_field), allocatable, dimension(:) :: pres_visc_stress + + ! divergence of stress tensor + type(scalar_field), allocatable, dimension(:) :: div_pres_visc_stress + + ! unclosed terms in volume filtered momentum equation + type(vector_field), allocatable, dimension(:), public :: reynolds_stress + type(vector_field), allocatable, dimension(:), public :: eff_visc + type(scalar_field), allocatable, dimension(:), public :: int_mom_exch + + ! 1/mu + real(wp), allocatable, dimension(:, :) :: Res + + ! x-,y-,z-direction forces on particles + real(wp), allocatable, dimension(:, :) :: particle_forces + + $:GPU_DECLARE(create='[fluid_indicator_function, filtered_fluid_indicator_function, grad_fluid_indicator]') + $:GPU_DECLARE(create='[q_cons_filtered, filtered_pressure, visc_stress, pres_visc_stress, div_pres_visc_stress]') + $:GPU_DECLARE(create='[reynolds_stress, eff_visc, int_mom_exch, Res, particle_forces]') + +#if defined(MFC_OpenACC) + ! GPU plans + integer :: plan_x_fwd_gpu, plan_x_bwd_gpu, plan_y_gpu, plan_z_gpu +#else + ! CPU plans + type(c_ptr) :: plan_x_r2c_fwd, plan_x_c2r_bwd + type(c_ptr) :: plan_y_c2c_fwd, plan_y_c2c_bwd + type(c_ptr) :: plan_z_c2c_fwd, plan_z_c2c_bwd + type(c_ptr) :: plan_x_r2c_kernelG, plan_y_c2c_kernelG, plan_z_c2c_kernelG +#endif + + ! domain size information (global, complex, local) + integer :: Nx, Ny, Nz, NxC, Nyloc, Nzloc + + ! 
1D real and complex vectors for FFT routines + real(c_double), allocatable :: data_real_in1d(:) + complex(c_double_complex), allocatable :: data_cmplx_out1d(:) + complex(c_double_complex), allocatable :: data_cmplx_out1dy(:) + + ! 3D arrays for slab transposes + complex(c_float_complex), allocatable :: data_cmplx_slabz(:, :, :), data_cmplx_slaby(:, :, :) + ! 3D arrays for slab transposes of tensor quantities + complex(c_float_complex), allocatable :: data_cmplx_slabz_batch(:, :, :, :), data_cmplx_slaby_batch(:, :, :, :) + + ! input/output array for FFT routine + real(c_double), allocatable :: data_real_3D_slabz(:, :, :) + + ! filtering kernel in physical space + real(c_double), allocatable :: real_kernelG_in(:, :, :) + + ! FFT of filtering kernel + complex(c_double_complex), allocatable :: cmplx_kernelG1d(:) + + $:GPU_DECLARE(create='[Nx, Ny, Nz, NxC, Nyloc, Nzloc]') + $:GPU_DECLARE(create='[data_real_in1d, data_cmplx_out1d, data_cmplx_out1dy, data_cmplx_slabz, data_cmplx_slaby, data_cmplx_slabz_batch, data_cmplx_slaby_batch]') + $:GPU_DECLARE(create='[data_real_3D_slabz, real_kernelG_in, cmplx_kernelG1d]') + + ! buffers for data transpose + complex(c_float_complex), allocatable :: sendbuf_sf(:), recvbuf_sf(:) + complex(c_float_complex), allocatable :: sendbuf_batch(:), recvbuf_batch(:) + + $:GPU_DECLARE(create='[sendbuf_sf, recvbuf_sf, sendbuf_batch, recvbuf_batch]') + +contains + + !< create fft plans to be used for explicit filtering of data + subroutine s_initialize_fftw_explicit_filter_module + integer :: i, j, k + integer :: size_n(1), inembed(1), onembed(1) + + @:ALLOCATE(q_cons_filtered(1:sys_size-1)) + do i = 1, sys_size - 1 + @:ALLOCATE(q_cons_filtered(i)%sf(idwbuff(1)%beg:idwbuff(1)%end, & + idwbuff(2)%beg:idwbuff(2)%end, & + idwbuff(3)%beg:idwbuff(3)%end)) + @:ACC_SETUP_SFs(q_cons_filtered(i)) + end do + + @:ALLOCATE(filtered_pressure%sf(idwbuff(1)%beg:idwbuff(1)%end, & + idwbuff(2)%beg:idwbuff(2)%end, & + idwbuff(3)%beg:idwbuff(3)%end)) + @:ACC_SETUP_SFs(filtered_pressure) + + @:ALLOCATE(visc_stress(1:num_dims)) + do i = 1, num_dims + @:ALLOCATE(visc_stress(i)%vf(1:num_dims)) + end do + do i = 1, num_dims + do j = 1, num_dims + @:ALLOCATE(visc_stress(i)%vf(j)%sf(idwbuff(1)%beg:idwbuff(1)%end, & + idwbuff(2)%beg:idwbuff(2)%end, & + idwbuff(3)%beg:idwbuff(3)%end)) + end do + @:ACC_SETUP_VFs(visc_stress(i)) + end do + + @:ALLOCATE(pres_visc_stress(1:num_dims)) + do i = 1, num_dims + @:ALLOCATE(pres_visc_stress(i)%vf(1:num_dims)) + end do + do i = 1, num_dims + do j = 1, num_dims + @:ALLOCATE(pres_visc_stress(i)%vf(j)%sf(idwbuff(1)%beg:idwbuff(1)%end, & + idwbuff(2)%beg:idwbuff(2)%end, & + idwbuff(3)%beg:idwbuff(3)%end)) + end do + @:ACC_SETUP_VFs(pres_visc_stress(i)) + end do + + @:ALLOCATE(div_pres_visc_stress(1:num_dims)) + do i = 1, num_dims + @:ALLOCATE(div_pres_visc_stress(i)%sf(idwbuff(1)%beg:idwbuff(1)%end, & + idwbuff(2)%beg:idwbuff(2)%end, & + idwbuff(3)%beg:idwbuff(3)%end)) + @:ACC_SETUP_SFs(div_pres_visc_stress(i)) + end do + + @:ALLOCATE(reynolds_stress(1:num_dims)) + do i = 1, num_dims + @:ALLOCATE(reynolds_stress(i)%vf(1:num_dims)) + end do + do i = 1, num_dims + do j = 1, num_dims + @:ALLOCATE(reynolds_stress(i)%vf(j)%sf(idwbuff(1)%beg:idwbuff(1)%end, & + idwbuff(2)%beg:idwbuff(2)%end, & + idwbuff(3)%beg:idwbuff(3)%end)) + end do + @:ACC_SETUP_VFs(reynolds_stress(i)) + end do + + @:ALLOCATE(eff_visc(1:num_dims)) + do i = 1, num_dims + @:ALLOCATE(eff_visc(i)%vf(1:num_dims)) + end do + do i = 1, num_dims + do j = 1, num_dims + 
@:ALLOCATE(eff_visc(i)%vf(j)%sf(idwbuff(1)%beg:idwbuff(1)%end, & + idwbuff(2)%beg:idwbuff(2)%end, & + idwbuff(3)%beg:idwbuff(3)%end)) + end do + @:ACC_SETUP_VFs(eff_visc(i)) + end do + + @:ALLOCATE(int_mom_exch(1:num_dims)) + do i = 1, num_dims + @:ALLOCATE(int_mom_exch(i)%sf(idwbuff(1)%beg:idwbuff(1)%end, & + idwbuff(2)%beg:idwbuff(2)%end, & + idwbuff(3)%beg:idwbuff(3)%end)) + @:ACC_SETUP_SFs(int_mom_exch(i)) + end do + + if (viscous) then + @:ALLOCATE(Res(1:2, 1:maxval(Re_size))) + end if + + if (viscous) then + do i = 1, 2 + do j = 1, Re_size(i) + Res(i, j) = fluid_pp(Re_idx(i, j))%Re(i) + end do + end do + $:GPU_UPDATE(device='[Res]') + end if + + @:ALLOCATE(particle_forces(0:num_ibs, 3)) + + !< global sizes + Nx = m_glb + 1 + Ny = n_glb + 1 + Nz = p_glb + 1 + + !< complex size + NxC = Nx/2 + 1 + + !< local sizes on each processor + Nyloc = Ny/num_procs + Nzloc = p + 1 + + $:GPU_UPDATE(device='[Nx, Ny, Nz, NxC, Nyloc, Nzloc]') + + @:ALLOCATE(data_real_in1d(Nx*Ny*Nzloc)) + @:ALLOCATE(data_cmplx_out1d(NxC*Ny*Nz/num_procs)) + @:ALLOCATE(data_cmplx_out1dy(NxC*Ny*Nz/num_procs)) + @:ALLOCATE(cmplx_kernelG1d(NxC*Nyloc*Nz)) + @:ALLOCATE(real_kernelG_in(Nx, Ny, Nzloc)) + @:ALLOCATE(data_real_3D_slabz(Nx, Ny, Nzloc)) + @:ALLOCATE(data_cmplx_slabz(NxC, Ny, Nzloc)) + @:ALLOCATE(data_cmplx_slaby(NxC, Nyloc, Nz)) + @:ALLOCATE(data_cmplx_slabz_batch(24, NxC, Ny, Nzloc)) + @:ALLOCATE(data_cmplx_slaby_batch(24, NxC, Nyloc, Nz)) + + @:ALLOCATE(sendbuf_sf(NxC*Nyloc*Nzloc*num_procs)) + @:ALLOCATE(recvbuf_sf(NxC*Nyloc*Nzloc*num_procs)) + @:ALLOCATE(sendbuf_batch(24*NxC*Nyloc*Nzloc*num_procs)) + @:ALLOCATE(recvbuf_batch(24*NxC*Nyloc*Nzloc*num_procs)) + +#if defined(MFC_OpenACC) + !< GPU FFT plans + !< X - plans + size_n(1) = Nx + inembed(1) = Nx + onembed(1) = NxC + ierr = cufftPlanMany(plan_x_fwd_gpu, 1, size_n, inembed, 1, Nx, onembed, 1, NxC, CUFFT_D2Z, Ny*Nzloc) + size_n(1) = Nx + inembed(1) = NxC + onembed(1) = Nx + ierr = cufftPlanMany(plan_x_bwd_gpu, 1, size_n, inembed, 1, NxC, onembed, 1, Nx, CUFFT_Z2D, Ny*Nzloc) + !< Y - plans + size_n(1) = Ny + inembed(1) = Ny + onembed(1) = Ny + ierr = cufftPlanMany(plan_y_gpu, 1, size_n, inembed, 1, Ny, onembed, 1, Ny, CUFFT_Z2Z, NxC*Nzloc) + !< Z - plans + size_n(1) = Nz + inembed(1) = Nz + onembed(1) = Nz + ierr = cufftPlanMany(plan_z_gpu, 1, size_n, inembed, 1, Nz, onembed, 1, Nz, CUFFT_Z2Z, NxC*Nyloc) +#else + !< CPU FFT plans + !< X - direction plans + size_n(1) = Nx + inembed(1) = Nx + onembed(1) = NxC + plan_x_r2c_fwd = fftw_plan_many_dft_r2c(1, size_n, Ny*Nzloc, & ! rank, n, howmany + data_real_in1d, inembed, 1, Nx, & ! in, inembed, istride, idist + data_cmplx_out1d, onembed, 1, NxC, & ! out, onembed, ostride, odist + FFTW_MEASURE) ! 
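The batched 1D plans above assemble a distributed 3D FFT in stages: a real-to-complex transform along x, a complex transform along y on the z-slab, a slab transpose, then a complex transform along z on the y-slab. On a single rank the staging collapses to the following NumPy check (the transpose disappears, but the axis ordering is the same):

import numpy as np

rng = np.random.default_rng(1)
f = rng.standard_normal((16, 12, 8))             # (Nx, Ny, Nz)

staged = np.fft.rfft(f, axis=0)                  # r2c along x -> (NxC, Ny, Nz)
staged = np.fft.fft(staged, axis=1)              # c2c along y on the z-slab
staged = np.fft.fft(staged, axis=2)              # c2c along z (after Z->Y transpose)

direct = np.fft.fftn(f)[: f.shape[0] // 2 + 1]   # full 3D FFT, keep kx >= 0
print(np.allclose(staged, direct))               # True

Planning many 1D transforms at once (the `howmany` arguments Ny*Nzloc and NxC*Nyloc) is what lets FFTW and cuFFT batch over all pencils of a slab in a single plan execution.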
sign, flags + size_n(1) = Nx + inembed(1) = NxC + onembed(1) = Nx + plan_x_c2r_bwd = fftw_plan_many_dft_c2r(1, size_n, Ny*Nzloc, & + data_cmplx_out1d, inembed, 1, NxC, & + data_real_in1d, onembed, 1, Nx, & + FFTW_MEASURE) + !< Y - direction plans + size_n(1) = Ny + inembed(1) = Ny + onembed(1) = Ny + plan_y_c2c_fwd = fftw_plan_many_dft(1, size_n, NxC*Nzloc, & + data_cmplx_out1dy, inembed, 1, Ny, & + data_cmplx_out1dy, onembed, 1, Ny, & + FFTW_FORWARD, FFTW_MEASURE) + plan_y_c2c_bwd = fftw_plan_many_dft(1, size_n, NxC*Nzloc, & + data_cmplx_out1dy, inembed, 1, Ny, & + data_cmplx_out1dy, onembed, 1, Ny, & + FFTW_BACKWARD, FFTW_MEASURE) + !< Z - direction plans + size_n(1) = Nz + inembed(1) = Nz + onembed(1) = Nz + plan_z_c2c_fwd = fftw_plan_many_dft(1, size_n, NxC*Nyloc, & + data_cmplx_out1d, inembed, 1, Nz, & + data_cmplx_out1d, onembed, 1, Nz, & + FFTW_FORWARD, FFTW_MEASURE) + plan_z_c2c_bwd = fftw_plan_many_dft(1, size_n, NxC*Nyloc, & + data_cmplx_out1d, inembed, 1, Nz, & + data_cmplx_out1d, onembed, 1, Nz, & + FFTW_BACKWARD, FFTW_MEASURE) + ! forward plans for filtering kernel + ! X kernel plan + size_n(1) = Nx + inembed(1) = Nx + onembed(1) = NxC + plan_x_r2c_kernelG = fftw_plan_many_dft_r2c(1, size_n, Ny*Nzloc, & + data_real_in1d, inembed, 1, Nx, & + cmplx_kernelG1d, onembed, 1, NxC, & + FFTW_MEASURE) + ! Y kernel plan + size_n(1) = Ny + inembed(1) = Ny + onembed(1) = Ny + plan_y_c2c_kernelG = fftw_plan_many_dft(1, size_n, NxC*Nzloc, & + data_cmplx_out1dy, inembed, 1, Ny, & + data_cmplx_out1dy, onembed, 1, Ny, & + FFTW_FORWARD, FFTW_MEASURE) + ! Z kernel plan + size_n(1) = Nz + inembed(1) = Nz + onembed(1) = Nz + plan_z_c2c_kernelG = fftw_plan_many_dft(1, size_n, NxC*Nyloc, & + cmplx_kernelG1d, inembed, 1, Nz, & + cmplx_kernelG1d, onembed, 1, Nz, & + FFTW_FORWARD, FFTW_MEASURE) +#endif + + ! file for particle forces + if (compute_particle_drag) then + if (proc_rank == 0) then + open (unit=100, file='particle_force.bin', status='replace', form='unformatted', access='stream', action='write') + end if + end if + + end subroutine s_initialize_fftw_explicit_filter_module + + !< initialize the gaussian filtering kernel in real space and then compute its DFT + subroutine s_initialize_filtering_kernel + real(wp) :: sigma_stddev + real(wp) :: Lx, Ly, Lz + real(wp) :: x_r, y_r, z_r + real(wp) :: r2 + real(wp) :: G_norm_int, G_norm_int_glb + integer :: i, j, k + + ! gaussian filter + sigma_stddev = filter_width + + Lx = x_domain_end_glb - x_domain_beg_glb + Ly = y_domain_end_glb - y_domain_beg_glb + Lz = z_domain_end_glb - z_domain_beg_glb + + G_norm_int = 0.0_wp + + $:GPU_PARALLEL_LOOP(collapse=3, reduction='[[G_norm_int]]', reductionOp='[+]', copyin='[Lx, Ly, Lz, sigma_stddev]', private='[x_r, y_r, z_r, r2]') + do i = 0, m + do j = 0, n + do k = 0, p + x_r = min(abs(x_cc(i) - x_domain_beg_glb), Lx - abs(x_cc(i) - x_domain_beg_glb)) + y_r = min(abs(y_cc(j) - y_domain_beg_glb), Ly - abs(y_cc(j) - y_domain_beg_glb)) + z_r = min(abs(z_cc(k) - z_domain_beg_glb), Lz - abs(z_cc(k) - z_domain_beg_glb)) + + r2 = x_r**2 + y_r**2 + z_r**2 + + real_kernelG_in(i + 1, j + 1, k + 1) = exp(-r2/(2.0_wp*sigma_stddev**2)) + + G_norm_int = G_norm_int + real_kernelG_in(i + 1, j + 1, k + 1)*dx(i)*dy(j)*dz(k) + end do + end do + end do + + call s_mpi_allreduce_sum(G_norm_int, G_norm_int_glb) + + ! FFT of kernel + ! 
normalize kernel + $:GPU_PARALLEL_LOOP(collapse=3, copyin='[G_norm_int_glb]') + do i = 1, Nx + do j = 1, Ny + do k = 1, Nzloc + data_real_3D_slabz(i, j, k) = real_kernelG_in(i, j, k)/G_norm_int_glb + end do + end do + end do + + ! 3D z-slab -> 1D x, y, z + $:GPU_PARALLEL_LOOP(collapse=3) + do i = 1, Nx + do j = 1, Ny + do k = 1, Nzloc + data_real_in1d(i + (j - 1)*Nx + (k - 1)*Nx*Ny) = data_real_3D_slabz(i, j, k) + end do + end do + end do + + ! X FFT +#if defined(MFC_OpenACC) + ierr = cufftExecD2Z(plan_x_fwd_gpu, data_real_in1d, cmplx_kernelG1d) +#else + call fftw_execute_dft_r2c(plan_x_r2c_kernelG, data_real_in1d, cmplx_kernelG1d) +#endif + + ! 1D x, y, z -> 1D y, x, z (CMPLX) + $:GPU_PARALLEL_LOOP(collapse=3) + do i = 1, NxC + do j = 1, Ny + do k = 1, Nzloc + data_cmplx_out1dy(j + (i - 1)*Ny + (k - 1)*Ny*NxC) = cmplx_kernelG1d(i + (j - 1)*NxC + (k - 1)*NxC*Ny) + end do + end do + end do + + ! Y FFT +#if defined(MFC_OpenACC) + ierr = cufftExecZ2Z(plan_y_gpu, data_cmplx_out1dy, data_cmplx_out1dy, CUFFT_FORWARD) +#else + call fftw_execute_dft(plan_y_c2c_kernelG, data_cmplx_out1dy, data_cmplx_out1dy) +#endif + + ! 1D y, x, z -> 3D z-slab + $:GPU_PARALLEL_LOOP(collapse=3) + do i = 1, NxC + do j = 1, Ny + do k = 1, Nzloc + data_cmplx_slabz(i, j, k) = data_cmplx_out1dy(j + (i - 1)*Ny + (k - 1)*Ny*NxC) + end do + end do + end do + + ! transpose z-slab to y-slab + call s_mpi_transpose_slabZ2Y + + ! 3D y-slab -> 1D z, x, y + $:GPU_PARALLEL_LOOP(collapse=3) + do i = 1, NxC + do j = 1, Nyloc + do k = 1, Nz + cmplx_kernelG1d(k + (i - 1)*Nz + (j - 1)*Nz*NxC) = data_cmplx_slaby(i, j, k) + end do + end do + end do + + ! Z FFT +#if defined(MFC_OpenACC) + ierr = cufftExecZ2Z(plan_z_gpu, cmplx_kernelG1d, cmplx_kernelG1d, CUFFT_FORWARD) +#else + call fftw_execute_dft(plan_z_c2c_kernelG, cmplx_kernelG1d, cmplx_kernelG1d) +#endif + + ! normalize FFT + $:GPU_PARALLEL_LOOP(collapse=3) + do i = 1, NxC + do j = 1, Nyloc + do k = 1, Nz + cmplx_kernelG1d(k + (i - 1)*Nz + (j - 1)*Nz*NxC) = cmplx_kernelG1d(k + (i - 1)*Nz + (j - 1)*Nz*NxC)/(real(Nx*Ny*Nz, wp)) + end do + end do + end do + + ! return cmplx_kernelG1d: 1D z, x, y + end subroutine s_initialize_filtering_kernel + + !< initialize fluid indicator function and filtered fluid indicator function + subroutine s_initialize_fluid_indicator_function(bc_type) + type(integer_field), dimension(1:num_dims, -1:1), intent(in) :: bc_type + integer :: i, j, k + + @:ALLOCATE(fluid_indicator_function%sf(idwbuff(1)%beg:idwbuff(1)%end, & + idwbuff(2)%beg:idwbuff(2)%end, & + idwbuff(3)%beg:idwbuff(3)%end)) + @:ACC_SETUP_SFs(fluid_indicator_function) + + ! define fluid indicator function + $:GPU_PARALLEL_LOOP(collapse=3) + do i = 0, m + do j = 0, n + do k = 0, p + if (ib_markers%sf(i, j, k) == 0) then + fluid_indicator_function%sf(i, j, k) = 1.0_wp + else + fluid_indicator_function%sf(i, j, k) = 0.0_wp + end if + end do + end do + end do + + call s_populate_scalarfield_buffers(bc_type, fluid_indicator_function) + + end subroutine s_initialize_fluid_indicator_function + + subroutine s_initialize_filtered_fluid_indicator_function + integer :: i, j, k + + @:ALLOCATE(filtered_fluid_indicator_function%sf(0:m, 0:n, 0:p)) + @:ACC_SETUP_SFs(filtered_fluid_indicator_function) + + ! 
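The kernel pass above samples a periodic Gaussian at the cell centers (using per-axis minimum-image distances to the domain corner, so no spectral phase shift is needed), normalizes it to unit integral over the domain, and stores its forward transform so that every later filtering pass is a pointwise spectral product, per the convolution theorem. A single-rank, uniform-grid NumPy sketch of the same construction:

import numpy as np

N, L, sigma = 32, 1.0, 0.06
dx = L / N
x = (np.arange(N) + 0.5) * dx                 # cell centers measured from beg
r = np.minimum(x, L - x)                      # periodic distance to the corner
G = np.exp(-np.add.outer(np.add.outer(r**2, r**2), r**2) / (2 * sigma**2))
G /= G.sum() * dx**3                          # unit integral, like G_norm_int_glb
Ghat = np.fft.rfftn(G)                        # reusable spectral kernel

f = np.random.default_rng(2).standard_normal((N, N, N))
filtered = np.fft.irfftn(np.fft.rfftn(f) * Ghat, s=f.shape) * dx**3
print(np.isclose(f.mean(), filtered.mean()))  # True: unit-integral kernel preserves the mean

Mean preservation is the property the normalization by G_norm_int_glb buys; in the Fortran the cell-volume quadrature factor is not applied explicitly but cancels wherever a filtered quantity is divided by the identically-filtered indicator function.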
filter fluid indicator function + $:GPU_PARALLEL_LOOP(collapse=3) + do i = 1, Nx + do j = 1, Ny + do k = 1, Nzloc + data_real_3D_slabz(i, j, k) = fluid_indicator_function%sf(i - 1, j - 1, k - 1) + end do + end do + end do + + call s_mpi_FFT_fwd + + $:GPU_PARALLEL_LOOP(collapse=3) + do i = 1, NxC + do j = 1, Nyloc + do k = 1, Nz + data_cmplx_out1d(k + (i - 1)*Nz + (j - 1)*Nz*NxC) = data_cmplx_out1d(k + (i - 1)*Nz + (j - 1)*Nz*NxC)*cmplx_kernelG1d(k + (i - 1)*Nz + (j - 1)*Nz*NxC) + end do + end do + end do + + call s_mpi_FFT_bwd + + $:GPU_PARALLEL_LOOP(collapse=3) + do i = 1, Nx + do j = 1, Ny + do k = 1, Nzloc + filtered_fluid_indicator_function%sf(i - 1, j - 1, k - 1) = data_real_3D_slabz(i, j, k)/(real(Nx*Ny*Nz, wp)) + end do + end do + end do + + end subroutine s_initialize_filtered_fluid_indicator_function + + subroutine s_initialize_fluid_indicator_gradient + real(wp), dimension(-4:4) :: fd_coeffs + integer :: i, j, k, l + + fd_coeffs(-4) = 1._wp/280._wp + fd_coeffs(-3) = -4._wp/105._wp + fd_coeffs(-2) = 1._wp/5._wp + fd_coeffs(-1) = -4._wp/5._wp + fd_coeffs(0) = 0._wp + fd_coeffs(1) = 4._wp/5._wp + fd_coeffs(2) = -1._wp/5._wp + fd_coeffs(3) = 4._wp/105._wp + fd_coeffs(4) = -1._wp/280._wp + + @:ALLOCATE(grad_fluid_indicator(1:3)) + do i = 1, 3 + @:ALLOCATE(grad_fluid_indicator(i)%sf(0:m, 0:n, 0:p)) + @:ACC_SETUP_SFs(grad_fluid_indicator(i)) + end do + + $:GPU_PARALLEL_LOOP(collapse=3, copyin='[fd_coeffs]') + do i = 0, m + do j = 0, n + do k = 0, p + $:GPU_LOOP(parallelism='[seq]') + do l = -4, 4 + grad_fluid_indicator(1)%sf(i, j, k) = grad_fluid_indicator(1)%sf(i, j, k) + fd_coeffs(l)*fluid_indicator_function%sf(i + l, j, k)/dx(i) + grad_fluid_indicator(2)%sf(i, j, k) = grad_fluid_indicator(2)%sf(i, j, k) + fd_coeffs(l)*fluid_indicator_function%sf(i, j + l, k)/dy(j) + grad_fluid_indicator(3)%sf(i, j, k) = grad_fluid_indicator(3)%sf(i, j, k) + fd_coeffs(l)*fluid_indicator_function%sf(i, j, k + l)/dz(k) + end do + end do + end do + end do + + end subroutine s_initialize_fluid_indicator_gradient + + !< calculate the unclosed terms present in the volume filtered momentum equation + subroutine s_volume_filter_momentum_eqn(q_cons_vf, q_prim_vf, bc_type) + type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf + type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf + type(integer_field), dimension(1:num_dims, -1:1), intent(in) :: bc_type + integer :: i, j, k + + call s_setup_terms_filtering(q_cons_vf, q_prim_vf, reynolds_stress, visc_stress, pres_visc_stress, div_pres_visc_stress, bc_type) + + call s_filter_batch(q_cons_vf, q_cons_filtered, q_prim_vf(E_idx), filtered_pressure, reynolds_stress, visc_stress, eff_visc) + + call s_compute_interphase_momentum_exchange(filtered_fluid_indicator_function, grad_fluid_indicator, pres_visc_stress, int_mom_exch) + + call s_compute_pseudo_turbulent_reynolds_stress(q_cons_filtered, reynolds_stress) + call s_compute_effective_viscosity(q_cons_filtered, eff_visc, visc_stress, bc_type) + + end subroutine s_volume_filter_momentum_eqn + + !< applies the gaussian filter to an arbitrary scalar field + subroutine s_apply_fftw_filter_scalarfield(filtered_fluid_indicator_function, fluid_quantity, q_temp_in, q_temp_out) + type(scalar_field), intent(in) :: filtered_fluid_indicator_function + type(scalar_field), intent(inout) :: q_temp_in + type(scalar_field), intent(inout), optional :: q_temp_out + + logical, intent(in) :: fluid_quantity !< whether or not convolution integral is over V_f or V_p^(i) - integral over fluid volume or particle 
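The indicator gradient above uses the standard 8th-order central-difference stencil (the fd_coeffs values). One caveat: the accumulation loop reads grad_fluid_indicator before any value has been assigned, so unless @:ALLOCATE zero-fills (Fortran allocation alone does not), the arrays should be explicitly zeroed after allocation. The periodic NumPy equivalent below does that zeroing and verifies the order of accuracy:

import numpy as np

c = np.array([1/280, -4/105, 1/5, -4/5, 0, 4/5, -1/5, 4/105, -1/280])

def ddx_periodic(f, dx, axis):
    """8th-order central first derivative on a periodic uniform grid."""
    out = np.zeros_like(f)                     # explicit zero-initialization
    for l, cl in zip(range(-4, 5), c):
        out += cl * np.roll(f, -l, axis=axis)  # f(i + l) via periodic shift
    return out / dx

N = 64
x = np.arange(N) * 2 * np.pi / N
f = np.sin(x)[:, None, None] * np.ones((N, N, N))
err = np.abs(ddx_periodic(f, 2 * np.pi / N, 0) - np.cos(x)[:, None, None]).max()
print(err)  # ~1e-11: consistent with 8th-order accuracy at this resolution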
volume + + integer :: i, j, k + + if (fluid_quantity) then + $:GPU_PARALLEL_LOOP(collapse=3) + do i = 0, m + do j = 0, n + do k = 0, p + data_real_3D_slabz(i + 1, j + 1, k + 1) = q_temp_in%sf(i, j, k)*fluid_indicator_function%sf(i, j, k) + end do + end do + end do + else + $:GPU_PARALLEL_LOOP(collapse=3) + do i = 0, m + do j = 0, n + do k = 0, p + data_real_3D_slabz(i + 1, j + 1, k + 1) = q_temp_in%sf(i, j, k)*(1.0_wp - fluid_indicator_function%sf(i, j, k)) + end do + end do + end do + end if + + call nvtxStartRange("FORWARD-3D-FFT") + call s_mpi_FFT_fwd + call nvtxEndRange + + $:GPU_PARALLEL_LOOP(collapse=3) + do i = 1, NxC + do j = 1, Nyloc + do k = 1, Nz + data_cmplx_out1d(k + (i - 1)*Nz + (j - 1)*Nz*NxC) = data_cmplx_out1d(k + (i - 1)*Nz + (j - 1)*Nz*NxC)*cmplx_kernelG1d(k + (i - 1)*Nz + (j - 1)*Nz*NxC) + end do + end do + end do + + call nvtxStartRange("BACKWARD-3D-FFT") + call s_mpi_FFT_bwd + call nvtxEndRange + + if (present(q_temp_out)) then + $:GPU_PARALLEL_LOOP(collapse=3) + do i = 0, m + do j = 0, n + do k = 0, p + q_temp_out%sf(i, j, k) = data_real_3D_slabz(i + 1, j + 1, k + 1)/(real(Nx*Ny*Nz, wp)*filtered_fluid_indicator_function%sf(i, j, k)) + end do + end do + end do + else + $:GPU_PARALLEL_LOOP(collapse=3) + do i = 0, m + do j = 0, n + do k = 0, p + q_temp_in%sf(i, j, k) = data_real_3D_slabz(i + 1, j + 1, k + 1)/(real(Nx*Ny*Nz, wp)*filtered_fluid_indicator_function%sf(i, j, k)) + end do + end do + end do + end if + + end subroutine s_apply_fftw_filter_scalarfield + + ! compute viscous stress tensor + subroutine s_compute_viscous_stress_tensor(visc_stress, q_prim_vf, q_cons_filtered) + type(vector_field), dimension(num_dims), intent(inout) :: visc_stress + type(scalar_field), dimension(sys_size), intent(in), optional :: q_prim_vf + type(scalar_field), dimension(sys_size - 1), intent(in), optional :: q_cons_filtered + real(wp) :: dudx, dudy, dudz, dvdx, dvdy, dvdz, dwdx, dwdy, dwdz ! spatial velocity derivatives + integer :: i, j, k + + if (present(q_prim_vf)) then + $:GPU_PARALLEL_LOOP(collapse=3, private='[dudx, dudy, dudz, dvdx, dvdy, dvdz, dwdx, dwdy, dwdz]') + do i = 0, m + do j = 0, n + do k = 0, p + ! velocity gradients, local to each process + dudx = (q_prim_vf(2)%sf(i + 1, j, k) - q_prim_vf(2)%sf(i - 1, j, k))/(2._wp*dx(i)) + dudy = (q_prim_vf(2)%sf(i, j + 1, k) - q_prim_vf(2)%sf(i, j - 1, k))/(2._wp*dy(j)) + dudz = (q_prim_vf(2)%sf(i, j, k + 1) - q_prim_vf(2)%sf(i, j, k - 1))/(2._wp*dz(k)) + + dvdx = (q_prim_vf(3)%sf(i + 1, j, k) - q_prim_vf(3)%sf(i - 1, j, k))/(2._wp*dx(i)) + dvdy = (q_prim_vf(3)%sf(i, j + 1, k) - q_prim_vf(3)%sf(i, j - 1, k))/(2._wp*dy(j)) + dvdz = (q_prim_vf(3)%sf(i, j, k + 1) - q_prim_vf(3)%sf(i, j, k - 1))/(2._wp*dz(k)) + + dwdx = (q_prim_vf(4)%sf(i + 1, j, k) - q_prim_vf(4)%sf(i - 1, j, k))/(2._wp*dx(i)) + dwdy = (q_prim_vf(4)%sf(i, j + 1, k) - q_prim_vf(4)%sf(i, j - 1, k))/(2._wp*dy(j)) + dwdz = (q_prim_vf(4)%sf(i, j, k + 1) - q_prim_vf(4)%sf(i, j, k - 1))/(2._wp*dz(k)) + + ! 
viscous stress tensor, visc_stress(row, column) + visc_stress(1)%vf(1)%sf(i, j, k) = (4._wp/3._wp*dudx - 2._wp/3._wp*(dvdy + dwdz))/Res(1, 1) + visc_stress(1)%vf(2)%sf(i, j, k) = (dudy + dvdx)/Res(1, 1) + visc_stress(1)%vf(3)%sf(i, j, k) = (dudz + dwdx)/Res(1, 1) + visc_stress(2)%vf(1)%sf(i, j, k) = (dvdx + dudy)/Res(1, 1) + visc_stress(2)%vf(2)%sf(i, j, k) = (4._wp/3._wp*dvdy - 2._wp/3._wp*(dudx + dwdz))/Res(1, 1) + visc_stress(2)%vf(3)%sf(i, j, k) = (dvdz + dwdy)/Res(1, 1) + visc_stress(3)%vf(1)%sf(i, j, k) = (dwdx + dudz)/Res(1, 1) + visc_stress(3)%vf(2)%sf(i, j, k) = (dwdy + dvdz)/Res(1, 1) + visc_stress(3)%vf(3)%sf(i, j, k) = (4._wp/3._wp*dwdz - 2._wp/3._wp*(dudx + dvdy))/Res(1, 1) + end do + end do + end do + else if (present(q_cons_filtered)) then + $:GPU_PARALLEL_LOOP(collapse=3, private='[dudx, dudy, dudz, dvdx, dvdy, dvdz, dwdx, dwdy, dwdz]') + do i = 0, m + do j = 0, n + do k = 0, p + ! velocity gradients, local to each process + dudx = (q_cons_filtered(2)%sf(i + 1, j, k)/q_cons_filtered(1)%sf(i + 1, j, k) - q_cons_filtered(2)%sf(i - 1, j, k)/q_cons_filtered(1)%sf(i - 1, j, k))/(2._wp*dx(i)) + dudy = (q_cons_filtered(2)%sf(i, j + 1, k)/q_cons_filtered(1)%sf(i, j + 1, k) - q_cons_filtered(2)%sf(i, j - 1, k)/q_cons_filtered(1)%sf(i, j - 1, k))/(2._wp*dy(j)) + dudz = (q_cons_filtered(2)%sf(i, j, k + 1)/q_cons_filtered(1)%sf(i, j, k + 1) - q_cons_filtered(2)%sf(i, j, k - 1)/q_cons_filtered(1)%sf(i, j, k - 1))/(2._wp*dz(k)) + + dvdx = (q_cons_filtered(3)%sf(i + 1, j, k)/q_cons_filtered(1)%sf(i + 1, j, k) - q_cons_filtered(3)%sf(i - 1, j, k)/q_cons_filtered(1)%sf(i - 1, j, k))/(2._wp*dx(i)) + dvdy = (q_cons_filtered(3)%sf(i, j + 1, k)/q_cons_filtered(1)%sf(i, j + 1, k) - q_cons_filtered(3)%sf(i, j - 1, k)/q_cons_filtered(1)%sf(i, j - 1, k))/(2._wp*dy(j)) + dvdz = (q_cons_filtered(3)%sf(i, j, k + 1)/q_cons_filtered(1)%sf(i, j, k + 1) - q_cons_filtered(3)%sf(i, j, k - 1)/q_cons_filtered(1)%sf(i, j, k - 1))/(2._wp*dz(k)) + + dwdx = (q_cons_filtered(4)%sf(i + 1, j, k)/q_cons_filtered(1)%sf(i + 1, j, k) - q_cons_filtered(4)%sf(i - 1, j, k)/q_cons_filtered(1)%sf(i - 1, j, k))/(2._wp*dx(i)) + dwdy = (q_cons_filtered(4)%sf(i, j + 1, k)/q_cons_filtered(1)%sf(i, j + 1, k) - q_cons_filtered(4)%sf(i, j - 1, k)/q_cons_filtered(1)%sf(i, j - 1, k))/(2._wp*dy(j)) + dwdz = (q_cons_filtered(4)%sf(i, j, k + 1)/q_cons_filtered(1)%sf(i, j, k + 1) - q_cons_filtered(4)%sf(i, j, k - 1)/q_cons_filtered(1)%sf(i, j, k - 1))/(2._wp*dz(k)) + + ! 
viscous stress tensor, visc_stress(row, column) + visc_stress(1)%vf(1)%sf(i, j, k) = (4._wp/3._wp*dudx - 2._wp/3._wp*(dvdy + dwdz))/Res(1, 1) + visc_stress(1)%vf(2)%sf(i, j, k) = (dudy + dvdx)/Res(1, 1) + visc_stress(1)%vf(3)%sf(i, j, k) = (dudz + dwdx)/Res(1, 1) + visc_stress(2)%vf(1)%sf(i, j, k) = (dvdx + dudy)/Res(1, 1) + visc_stress(2)%vf(2)%sf(i, j, k) = (4._wp/3._wp*dvdy - 2._wp/3._wp*(dudx + dwdz))/Res(1, 1) + visc_stress(2)%vf(3)%sf(i, j, k) = (dvdz + dwdy)/Res(1, 1) + visc_stress(3)%vf(1)%sf(i, j, k) = (dwdx + dudz)/Res(1, 1) + visc_stress(3)%vf(2)%sf(i, j, k) = (dwdy + dvdz)/Res(1, 1) + visc_stress(3)%vf(3)%sf(i, j, k) = (4._wp/3._wp*dwdz - 2._wp/3._wp*(dudx + dvdy))/Res(1, 1) + end do + end do + end do + end if + + end subroutine s_compute_viscous_stress_tensor + + subroutine s_compute_stress_tensor(pres_visc_stress, visc_stress, q_prim_vf) + type(vector_field), dimension(num_dims), intent(inout) :: pres_visc_stress + type(vector_field), dimension(num_dims), intent(in) :: visc_stress + type(scalar_field), dimension(sys_size), intent(in) :: q_prim_vf + integer :: i, j, k + + $:GPU_PARALLEL_LOOP(collapse=3) + do i = 0, m + do j = 0, n + do k = 0, p + pres_visc_stress(1)%vf(1)%sf(i, j, k) = q_prim_vf(E_idx)%sf(i, j, k) - visc_stress(1)%vf(1)%sf(i, j, k) + pres_visc_stress(1)%vf(2)%sf(i, j, k) = -visc_stress(1)%vf(2)%sf(i, j, k) + pres_visc_stress(1)%vf(3)%sf(i, j, k) = -visc_stress(1)%vf(3)%sf(i, j, k) + pres_visc_stress(2)%vf(1)%sf(i, j, k) = -visc_stress(2)%vf(1)%sf(i, j, k) + pres_visc_stress(2)%vf(2)%sf(i, j, k) = q_prim_vf(E_idx)%sf(i, j, k) - visc_stress(2)%vf(2)%sf(i, j, k) + pres_visc_stress(2)%vf(3)%sf(i, j, k) = -visc_stress(2)%vf(3)%sf(i, j, k) + pres_visc_stress(3)%vf(1)%sf(i, j, k) = -visc_stress(3)%vf(1)%sf(i, j, k) + pres_visc_stress(3)%vf(2)%sf(i, j, k) = -visc_stress(3)%vf(2)%sf(i, j, k) + pres_visc_stress(3)%vf(3)%sf(i, j, k) = q_prim_vf(E_idx)%sf(i, j, k) - visc_stress(3)%vf(3)%sf(i, j, k) + end do + end do + end do + + end subroutine s_compute_stress_tensor + + !< compute the divergence of the pressure-viscous stress tensor + subroutine s_compute_divergence_stress_tensor(div_stress_tensor, stress_tensor) + type(scalar_field), dimension(num_dims), intent(inout) :: div_stress_tensor + type(vector_field), dimension(num_dims), intent(in) :: stress_tensor + integer :: i, j, k + + $:GPU_PARALLEL_LOOP(collapse=3) + do i = 0, m + do j = 0, n + do k = 0, p + div_stress_tensor(1)%sf(i, j, k) = (stress_tensor(1)%vf(1)%sf(i + 1, j, k) - stress_tensor(1)%vf(1)%sf(i - 1, j, k))/(2._wp*dx(i)) & + + (stress_tensor(1)%vf(2)%sf(i, j + 1, k) - stress_tensor(1)%vf(2)%sf(i, j - 1, k))/(2._wp*dy(j)) & + + (stress_tensor(1)%vf(3)%sf(i, j, k + 1) - stress_tensor(1)%vf(3)%sf(i, j, k - 1))/(2._wp*dz(k)) + + div_stress_tensor(2)%sf(i, j, k) = (stress_tensor(2)%vf(1)%sf(i + 1, j, k) - stress_tensor(2)%vf(1)%sf(i - 1, j, k))/(2._wp*dx(i)) & + + (stress_tensor(2)%vf(2)%sf(i, j + 1, k) - stress_tensor(2)%vf(2)%sf(i, j - 1, k))/(2._wp*dy(j)) & + + (stress_tensor(2)%vf(3)%sf(i, j, k + 1) - stress_tensor(2)%vf(3)%sf(i, j, k - 1))/(2._wp*dz(k)) + + div_stress_tensor(3)%sf(i, j, k) = (stress_tensor(3)%vf(1)%sf(i + 1, j, k) - stress_tensor(3)%vf(1)%sf(i - 1, j, k))/(2._wp*dx(i)) & + + (stress_tensor(3)%vf(2)%sf(i, j + 1, k) - stress_tensor(3)%vf(2)%sf(i, j - 1, k))/(2._wp*dy(j)) & + + (stress_tensor(3)%vf(3)%sf(i, j, k + 1) - stress_tensor(3)%vf(3)%sf(i, j, k - 1))/(2._wp*dz(k)) + end do + end do + end do + + end subroutine s_compute_divergence_stress_tensor + + !< setup for calculation of unclosed 
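The nine assignments in each branch above are the compressible Newtonian stress under the Stokes hypothesis, tau = mu*(grad u + (grad u)^T) - (2/3)*mu*(div u)*I, with the 1/Res(1, 1) factor playing the role of mu, followed by a row-wise divergence; everything uses 2nd-order central differences. A vectorized periodic NumPy sketch (uniform grid assumed):

import numpy as np

def central(f, dx, axis):
    return (np.roll(f, -1, axis) - np.roll(f, 1, axis)) / (2 * dx)

N, dx, mu = 32, 1.0 / 32, 1.0e-3
rng = np.random.default_rng(3)
u = rng.standard_normal((3, N, N, N))          # velocity components

g = np.array([[central(u[a], dx, b) for b in range(3)] for a in range(3)])
div = np.trace(g)                              # du/dx + dv/dy + dw/dz
tau = mu * (g + g.transpose(1, 0, 2, 3, 4))    # symmetric part, times 2*mu
for a in range(3):
    tau[a, a] -= 2.0 / 3.0 * mu * div          # Stokes-hypothesis trace term

# row-wise divergence, as in s_compute_divergence_stress_tensor
div_tau = np.stack([sum(central(tau[a, b], dx, b) for b in range(3))
                    for a in range(3)])
print(np.allclose(tau, tau.transpose(1, 0, 2, 3, 4)))  # True: tau is symmetric
print(tau.shape, div_tau.shape)                        # (3, 3, N, N, N) (3, N, N, N)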
terms in volume filtered momentum eqn + subroutine s_setup_terms_filtering(q_cons_vf, q_prim_vf, reynolds_stress, visc_stress, pres_visc_stress, div_pres_visc_stress, bc_type) + type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf + type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf + type(vector_field), dimension(1:num_dims), intent(inout) :: reynolds_stress + type(vector_field), dimension(1:num_dims), intent(inout) :: visc_stress + type(vector_field), dimension(1:num_dims), intent(inout) :: pres_visc_stress + type(scalar_field), dimension(1:num_dims), intent(inout) :: div_pres_visc_stress + type(integer_field), dimension(1:num_dims, -1:1), intent(in) :: bc_type + + integer :: i, j, k, l, q + + ! pseudo turbulent reynolds stress setup + $:GPU_PARALLEL_LOOP(collapse=3) + do i = 0, m + do j = 0, n + do k = 0, p + $:GPU_LOOP(parallelism='[seq]') + do l = 1, num_dims + $:GPU_LOOP(parallelism='[seq]') + do q = 1, num_dims + reynolds_stress(l)%vf(q)%sf(i, j, k) = q_cons_vf(1)%sf(i, j, k)*(q_prim_vf(momxb - 1 + l)%sf(i, j, k)*q_prim_vf(momxb - 1 + q)%sf(i, j, k)) ! rho*(u x u) + end do + end do + end do + end do + end do + + ! set density and momentum buffers + do i = contxb, momxe + call s_populate_scalarfield_buffers(bc_type, q_prim_vf(i)) + end do + + ! effective viscosity setup, return viscous stress tensor + call s_compute_viscous_stress_tensor(visc_stress, q_prim_vf=q_prim_vf) + + call s_compute_stress_tensor(pres_visc_stress, visc_stress, q_prim_vf) + + ! set stress tensor buffers for taking divergence + do i = 1, 3 + do j = 1, 3 + call s_populate_scalarfield_buffers(bc_type, pres_visc_stress(i)%vf(j)) + end do + end do + + ! interphase momentum exchange term setup + call s_compute_divergence_stress_tensor(div_pres_visc_stress, pres_visc_stress) + + end subroutine s_setup_terms_filtering + + subroutine s_compute_pseudo_turbulent_reynolds_stress(q_cons_filtered, reynolds_stress) + type(scalar_field), dimension(sys_size - 1), intent(in) :: q_cons_filtered + type(vector_field), dimension(1:num_dims), intent(inout) :: reynolds_stress + integer :: i, j, k, l, q + + $:GPU_PARALLEL_LOOP(collapse=3) + do i = 0, m + do j = 0, n + do k = 0, p + $:GPU_LOOP(parallelism='[seq]') + do l = 1, num_dims + $:GPU_LOOP(parallelism='[seq]') + do q = 1, num_dims + reynolds_stress(l)%vf(q)%sf(i, j, k) = reynolds_stress(l)%vf(q)%sf(i, j, k) & + - (q_cons_filtered(momxb - 1 + l)%sf(i, j, k)*q_cons_filtered(momxb - 1 + q)%sf(i, j, k)/q_cons_filtered(1)%sf(i, j, k)) + end do + end do + end do + end do + end do + + end subroutine s_compute_pseudo_turbulent_reynolds_stress + + subroutine s_compute_effective_viscosity(q_cons_filtered, eff_visc, visc_stress, bc_type) + type(scalar_field), dimension(1:sys_size - 1), intent(inout) :: q_cons_filtered + type(vector_field), dimension(1:num_dims), intent(inout) :: eff_visc + type(vector_field), dimension(1:num_dims), intent(inout) :: visc_stress + type(integer_field), dimension(1:num_dims, -1:1), intent(in) :: bc_type + + integer :: i, j, k, l, q + + ! set buffers for filtered momentum quantities and density + do i = contxb, momxe + call s_populate_scalarfield_buffers(bc_type, q_cons_filtered(i)) + end do + + ! calculate stress tensor with filtered quantities + call s_compute_viscous_stress_tensor(visc_stress, q_cons_filtered=q_cons_filtered) + + ! 
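s_compute_pseudo_turbulent_reynolds_stress above completes a filtered-product-minus-product-of-filtered decomposition, R_lq = bar(rho*u_l*u_q) - bar(rho*u_l)*bar(rho*u_q)/bar(rho), i.e. a Favre (density-weighted) splitting: the first term is seeded before filtering in s_setup_terms_filtering and the second is subtracted after. With a simple periodic box filter standing in for the Gaussian, the same assembly in NumPy:

import numpy as np

def bar(f, w=1):
    """Periodic (2w+1)^3 box filter as a stand-in for the Gaussian."""
    out = np.zeros_like(f)
    for sx in range(-w, w + 1):
        for sy in range(-w, w + 1):
            for sz in range(-w, w + 1):
                out += np.roll(f, (sx, sy, sz), axis=(0, 1, 2))
    return out / (2 * w + 1) ** 3

rng = np.random.default_rng(4)
N = 32
rho = 1.0 + 0.1 * rng.random((N, N, N))
u = rng.standard_normal((3, N, N, N))

R = np.empty((3, 3, N, N, N))
for l in range(3):
    for q in range(3):
        R[l, q] = bar(rho * u[l] * u[q]) - bar(rho * u[l]) * bar(rho * u[q]) / bar(rho)

print(np.allclose(R[0, 1], R[1, 0]))  # True: the tensor is symmetric by construction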
calculate eff_visc + $:GPU_PARALLEL_LOOP(collapse=3) + do i = 0, m + do j = 0, n + do k = 0, p + $:GPU_LOOP(parallelism='[seq]') + do l = 1, num_dims + $:GPU_LOOP(parallelism='[seq]') + do q = 1, num_dims + eff_visc(l)%vf(q)%sf(i, j, k) = eff_visc(l)%vf(q)%sf(i, j, k) - visc_stress(l)%vf(q)%sf(i, j, k) + end do + end do + end do + end do + end do + + end subroutine s_compute_effective_viscosity + + subroutine s_compute_interphase_momentum_exchange(filtered_fluid_indicator_function, grad_fluid_indicator, pres_visc_stress, int_mom_exch) + type(scalar_field), intent(in) :: filtered_fluid_indicator_function + type(scalar_field), dimension(1:3), intent(in) :: grad_fluid_indicator + type(vector_field), dimension(1:3), intent(in) :: pres_visc_stress + type(scalar_field), dimension(1:3), intent(inout) :: int_mom_exch + + integer :: i, j, k, l + + ! x-, y-, z- component loop + do l = 1, 3 + + $:GPU_PARALLEL_LOOP(collapse=3) + do i = 0, m + do j = 0, n + do k = 0, p + data_real_3D_slabz(i + 1, j + 1, k + 1) = pres_visc_stress(l)%vf(1)%sf(i, j, k)*grad_fluid_indicator(1)%sf(i, j, k) & + + pres_visc_stress(l)%vf(2)%sf(i, j, k)*grad_fluid_indicator(2)%sf(i, j, k) & + + pres_visc_stress(l)%vf(3)%sf(i, j, k)*grad_fluid_indicator(3)%sf(i, j, k) + end do + end do + end do + + call nvtxStartRange("FORWARD-3D-FFT") + call s_mpi_FFT_fwd + call nvtxEndRange + + ! convolution with filtering kernel + $:GPU_PARALLEL_LOOP(collapse=3) + do i = 1, NxC + do j = 1, Nyloc + do k = 1, Nz + data_cmplx_out1d(k + (i - 1)*Nz + (j - 1)*Nz*NxC) = data_cmplx_out1d(k + (i - 1)*Nz + (j - 1)*Nz*NxC)*cmplx_kernelG1d(k + (i - 1)*Nz + (j - 1)*Nz*NxC) + end do + end do + end do + + call nvtxStartRange("BACKWARD-3D-FFT") + call s_mpi_FFT_bwd + call nvtxEndRange + + $:GPU_PARALLEL_LOOP(collapse=3) + do i = 0, m + do j = 0, n + do k = 0, p + int_mom_exch(l)%sf(i, j, k) = data_real_3D_slabz(i + 1, j + 1, k + 1)/(real(Nx*Ny*Nz, wp)) + end do + end do + end do + end do ! end component loop + + end subroutine s_compute_interphase_momentum_exchange + + ! computes x-,y-,z-direction forces on particles + subroutine s_compute_particle_forces + real(wp), dimension(num_ibs, 3) :: force_glb + real(wp) :: dvol + integer :: i, j, k, l + + ! zero particle forces + particle_forces = 0.0_wp + $:GPU_UPDATE(device='[particle_forces]') + + $:GPU_PARALLEL_LOOP(collapse=3, private='[dvol]') + do i = 0, m + do j = 0, n + do k = 0, p + dvol = dx(i)*dy(j)*dz(k) + $:GPU_ATOMIC(atomic='update') + particle_forces(ib_markers%sf(i, j, k), 1) = particle_forces(ib_markers%sf(i, j, k), 1) - (div_pres_visc_stress(1)%sf(i, j, k)*dvol) + $:GPU_ATOMIC(atomic='update') + particle_forces(ib_markers%sf(i, j, k), 2) = particle_forces(ib_markers%sf(i, j, k), 2) - (div_pres_visc_stress(2)%sf(i, j, k)*dvol) + $:GPU_ATOMIC(atomic='update') + particle_forces(ib_markers%sf(i, j, k), 3) = particle_forces(ib_markers%sf(i, j, k), 3) - (div_pres_visc_stress(3)%sf(i, j, k)*dvol) + end do + end do + end do + + $:GPU_UPDATE(host='[particle_forces]') + + ! reduce particle forces across processors + do i = 1, num_ibs + call s_mpi_allreduce_sum(particle_forces(i, 1), force_glb(i, 1)) + call s_mpi_allreduce_sum(particle_forces(i, 2), force_glb(i, 2)) + call s_mpi_allreduce_sum(particle_forces(i, 3), force_glb(i, 3)) + end do + + ! 
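The atomic updates in s_compute_particle_forces above bin -div(sigma)*dvol by each cell's immersed-boundary marker (marker 0 is the carrier fluid, so row 0 is a discard bin) before the per-particle allreduce across ranks. np.add.at is the serial analogue of those atomic adds, as in this NumPy sketch:

import numpy as np

rng = np.random.default_rng(5)
N, num_ibs, dvol = 16, 2, (1.0 / 16) ** 3
markers = rng.integers(0, num_ibs + 1, size=(N, N, N))   # 0..num_ibs per cell
div_sigma = rng.standard_normal((3, N, N, N))            # stand-in divergence field

forces = np.zeros((num_ibs + 1, 3))                      # row 0: fluid (discarded)
for c in range(3):
    np.add.at(forces[:, c], markers.ravel(), -div_sigma[c].ravel() * dvol)

print(forces[1:])   # x-, y-, z-force on each particle

Binning every cell unconditionally and discarding row 0 avoids a branch in the hot loop, which is the same trade the GPU_ATOMIC version above makes.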
write particle forces to file + if (proc_rank == 0) then + write (100) force_glb + flush (100) + end if + + end subroutine s_compute_particle_forces + + !< transpose domain from z-slabs to y-slabs on each processor + subroutine s_mpi_transpose_slabZ2Y + integer :: dest_rank, src_rank + integer :: i, j, k + + $:GPU_PARALLEL_LOOP(collapse=4) + do dest_rank = 0, num_procs - 1 + do k = 1, Nzloc + do j = 1, Nyloc + do i = 1, NxC + sendbuf_sf(i + (j - 1)*NxC + (k - 1)*NxC*Nyloc + dest_rank*NxC*Nyloc*Nzloc) = data_cmplx_slabz(i, j + dest_rank*Nyloc, k) + end do + end do + end do + end do + + $:GPU_UPDATE(host='[sendbuf_sf]') + + call MPI_Alltoall(sendbuf_sf, NxC*Nyloc*Nzloc, MPI_COMPLEX, & + recvbuf_sf, NxC*Nyloc*Nzloc, MPI_COMPLEX, MPI_COMM_WORLD, ierr) + + $:GPU_UPDATE(device='[recvbuf_sf]') + + $:GPU_PARALLEL_LOOP(collapse=4) + do src_rank = 0, num_procs - 1 + do k = 1, Nzloc + do j = 1, Nyloc + do i = 1, NxC + data_cmplx_slaby(i, j, k + src_rank*Nzloc) = recvbuf_sf(i + (j - 1)*NxC + (k - 1)*NxC*Nyloc + src_rank*NxC*Nyloc*Nzloc) + end do + end do + end do + end do + + end subroutine s_mpi_transpose_slabZ2Y + + !< transpose domain from y-slabs to z-slabs on each processor + subroutine s_mpi_transpose_slabY2Z + integer :: dest_rank, src_rank + integer :: i, j, k + + $:GPU_PARALLEL_LOOP(collapse=4) + do dest_rank = 0, num_procs - 1 + do k = 1, Nzloc + do j = 1, Nyloc + do i = 1, NxC + sendbuf_sf(i + (j - 1)*NxC + (k - 1)*NxC*Nyloc + dest_rank*NxC*Nyloc*Nzloc) = data_cmplx_slaby(i, j, k + dest_rank*Nzloc) + end do + end do + end do + end do + + $:GPU_UPDATE(host='[sendbuf_sf]') + + call MPI_Alltoall(sendbuf_sf, NxC*Nyloc*Nzloc, MPI_COMPLEX, & + recvbuf_sf, NxC*Nyloc*Nzloc, MPI_COMPLEX, MPI_COMM_WORLD, ierr) + + $:GPU_UPDATE(device='[recvbuf_sf]') + + $:GPU_PARALLEL_LOOP(collapse=4) + do src_rank = 0, num_procs - 1 + do k = 1, Nzloc + do j = 1, Nyloc + do i = 1, NxC + data_cmplx_slabz(i, j + src_rank*Nyloc, k) = recvbuf_sf(i + (j - 1)*NxC + (k - 1)*NxC*Nyloc + src_rank*NxC*Nyloc*Nzloc) + end do + end do + end do + end do + + end subroutine s_mpi_transpose_slabY2Z + + !< transpose domain from z-slabs to y-slabs on each processor for batched 24 element tensors + subroutine s_mpi_transpose_slabZ2Y_batch + integer :: dest_rank, src_rank + integer :: i, j, k, l + + $:GPU_PARALLEL_LOOP(collapse=5) + do dest_rank = 0, num_procs - 1 + do k = 1, Nzloc + do j = 1, Nyloc + do i = 1, NxC + do l = 1, 24 + sendbuf_batch(l + (i - 1)*24 + (j - 1)*24*NxC + (k - 1)*24*NxC*Nyloc + dest_rank*24*NxC*Nyloc*Nzloc) = data_cmplx_slabz_batch(l, i, j + dest_rank*Nyloc, k) + end do + end do + end do + end do + end do + + $:GPU_UPDATE(host='[sendbuf_batch]') + + call MPI_Alltoall(sendbuf_batch, 24*NxC*Nyloc*Nzloc, MPI_COMPLEX, & + recvbuf_batch, 24*NxC*Nyloc*Nzloc, MPI_COMPLEX, MPI_COMM_WORLD, ierr) + + $:GPU_UPDATE(device='[recvbuf_batch]') + + $:GPU_PARALLEL_LOOP(collapse=5) + do src_rank = 0, num_procs - 1 + do k = 1, Nzloc + do j = 1, Nyloc + do i = 1, NxC + do l = 1, 24 + data_cmplx_slaby_batch(l, i, j, k + src_rank*Nzloc) = recvbuf_batch(l + (i - 1)*24 + (j - 1)*24*NxC + (k - 1)*24*NxC*Nyloc + src_rank*24*NxC*Nyloc*Nzloc) + end do + end do + end do + end do + end do + + end subroutine s_mpi_transpose_slabZ2Y_batch + + !< transpose domain from y-slabs to z-slabs on each processor for batched 24 element tensors + subroutine s_mpi_transpose_slabY2Z_batch + integer :: dest_rank, src_rank + integer :: i, j, k, l + + $:GPU_PARALLEL_LOOP(collapse=5) + do dest_rank = 0, num_procs - 1 + do k = 1, Nzloc + do j = 1, Nyloc + do i = 
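Each slab transpose above packs per-destination contiguous blocks, exchanges them with a single MPI_Alltoall, and unpacks into the complementary slab; the batched variants fuse 24 fields into one exchange to amortize latency. (Note the exchange arrays are single-precision complex, c_float_complex/MPI_COMPLEX, while the FFT work arrays are double precision, presumably to halve transpose bandwidth.) A serial NumPy emulation of the pack/exchange/unpack round trip for P ranks:

import numpy as np

P, NxC, Ny, Nz = 4, 5, 8, 8
Nyloc, Nzloc = Ny // P, Nz // P
rng = np.random.default_rng(6)
glb = rng.standard_normal((NxC, Ny, Nz)) + 1j * rng.standard_normal((NxC, Ny, Nz))

zslab = [glb[:, :, r * Nzloc:(r + 1) * Nzloc] for r in range(P)]  # input layout

# pack: on rank r, block d holds the y-range owned by destination rank d
send = [[zslab[r][:, d * Nyloc:(d + 1) * Nyloc, :] for d in range(P)]
        for r in range(P)]
# MPI_Alltoall: block d sent by rank s arrives as block s on rank d
recv = [[send[s][d] for s in range(P)] for d in range(P)]
# unpack: received blocks stack along z, in source-rank order
yslab = [np.concatenate(recv[d], axis=2) for d in range(P)]

print(all(np.allclose(yslab[r], glb[:, r * Nyloc:(r + 1) * Nyloc, :])
          for r in range(P)))  # True: Z-slabs transposed into Y-slabs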
1, NxC + do l = 1, 24 + sendbuf_batch(l + (i - 1)*24 + (j - 1)*24*NxC + (k - 1)*24*NxC*Nyloc + dest_rank*24*NxC*Nyloc*Nzloc) = data_cmplx_slaby_batch(l, i, j, k + dest_rank*Nzloc) + end do + end do + end do + end do + end do + + $:GPU_UPDATE(host='[sendbuf_batch]') + + call MPI_Alltoall(sendbuf_batch, 24*NxC*Nyloc*Nzloc, MPI_COMPLEX, & + recvbuf_batch, 24*NxC*Nyloc*Nzloc, MPI_COMPLEX, MPI_COMM_WORLD, ierr) + + $:GPU_UPDATE(device='[recvbuf_batch]') + + $:GPU_PARALLEL_LOOP(collapse=5) + do src_rank = 0, num_procs - 1 + do k = 1, Nzloc + do j = 1, Nyloc + do i = 1, NxC + do l = 1, 24 + data_cmplx_slabz_batch(l, i, j + src_rank*Nyloc, k) = recvbuf_batch(l + (i - 1)*24 + (j - 1)*24*NxC + (k - 1)*24*NxC*Nyloc + src_rank*24*NxC*Nyloc*Nzloc) + end do + end do + end do + end do + end do + + end subroutine s_mpi_transpose_slabY2Z_batch + + !< compute forward FFT, input: data_real_3D_slabz, output: data_cmplx_out1d + subroutine s_filter_batch(q_cons_vf, q_cons_filtered, pressure, filtered_pressure, reynolds_stress, visc_stress, eff_visc) + type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf + type(scalar_field), dimension(5), intent(inout) :: q_cons_filtered + type(scalar_field), intent(inout) :: pressure + type(scalar_field), intent(inout) :: filtered_pressure + type(vector_field), dimension(3), intent(inout) :: reynolds_stress + type(vector_field), dimension(3), intent(inout) :: visc_stress + type(vector_field), dimension(3), intent(inout) :: eff_visc + integer :: i, j, k, l, q + + ! cons vars + do l = 1, 5 + $:GPU_PARALLEL_LOOP(collapse=3) + do i = 0, m + do j = 0, n + do k = 0, p + data_real_3D_slabz(i + 1, j + 1, k + 1) = q_cons_vf(l)%sf(i, j, k)*fluid_indicator_function%sf(i, j, k) + end do + end do + end do + $:GPU_PARALLEL_LOOP(collapse=3) + do i = 1, Nx + do j = 1, Ny + do k = 1, Nzloc + data_real_in1d(i + (j - 1)*Nx + (k - 1)*Nx*Ny) = data_real_3D_slabz(i, j, k) + end do + end do + end do +#if defined(MFC_OpenACC) + ierr = cufftExecD2Z(plan_x_fwd_gpu, data_real_in1d, data_cmplx_out1d) +#else + call fftw_execute_dft_r2c(plan_x_r2c_fwd, data_real_in1d, data_cmplx_out1d) +#endif + $:GPU_PARALLEL_LOOP(collapse=3) + do i = 1, NxC + do j = 1, Ny + do k = 1, Nzloc + data_cmplx_out1dy(j + (i - 1)*Ny + (k - 1)*Ny*NxC) = data_cmplx_out1d(i + (j - 1)*NxC + (k - 1)*NxC*Ny) + end do + end do + end do +#if defined(MFC_OpenACC) + ierr = cufftExecZ2Z(plan_y_gpu, data_cmplx_out1dy, data_cmplx_out1dy, CUFFT_FORWARD) +#else + call fftw_execute_dft(plan_y_c2c_fwd, data_cmplx_out1dy, data_cmplx_out1dy) +#endif + $:GPU_PARALLEL_LOOP(collapse=3) + do i = 1, NxC + do j = 1, Ny + do k = 1, Nzloc + data_cmplx_slabz_batch(l, i, j, k) = data_cmplx_out1dy(j + (i - 1)*Ny + (k - 1)*Ny*NxC) + end do + end do + end do + end do + + ! 
pressure + $:GPU_PARALLEL_LOOP(collapse=3) + do i = 0, m + do j = 0, n + do k = 0, p + data_real_3D_slabz(i + 1, j + 1, k + 1) = pressure%sf(i, j, k)*fluid_indicator_function%sf(i, j, k) + end do + end do + end do + $:GPU_PARALLEL_LOOP(collapse=3) + do i = 1, Nx + do j = 1, Ny + do k = 1, Nzloc + data_real_in1d(i + (j - 1)*Nx + (k - 1)*Nx*Ny) = data_real_3D_slabz(i, j, k) + end do + end do + end do +#if defined(MFC_OpenACC) + ierr = cufftExecD2Z(plan_x_fwd_gpu, data_real_in1d, data_cmplx_out1d) +#else + call fftw_execute_dft_r2c(plan_x_r2c_fwd, data_real_in1d, data_cmplx_out1d) +#endif + $:GPU_PARALLEL_LOOP(collapse=3) + do i = 1, NxC + do j = 1, Ny + do k = 1, Nzloc + data_cmplx_out1dy(j + (i - 1)*Ny + (k - 1)*Ny*NxC) = data_cmplx_out1d(i + (j - 1)*NxC + (k - 1)*NxC*Ny) + end do + end do + end do +#if defined(MFC_OpenACC) + ierr = cufftExecZ2Z(plan_y_gpu, data_cmplx_out1dy, data_cmplx_out1dy, CUFFT_FORWARD) +#else + call fftw_execute_dft(plan_y_c2c_fwd, data_cmplx_out1dy, data_cmplx_out1dy) +#endif + $:GPU_PARALLEL_LOOP(collapse=3) + do i = 1, NxC + do j = 1, Ny + do k = 1, Nzloc + data_cmplx_slabz_batch(6, i, j, k) = data_cmplx_out1dy(j + (i - 1)*Ny + (k - 1)*Ny*NxC) + end do + end do + end do + + ! reynolds stress + do l = 1, 3 + do q = 1, 3 + $:GPU_PARALLEL_LOOP(collapse=3) + do i = 0, m + do j = 0, n + do k = 0, p + data_real_3D_slabz(i + 1, j + 1, k + 1) = reynolds_stress(l)%vf(q)%sf(i, j, k)*fluid_indicator_function%sf(i, j, k) + end do + end do + end do + $:GPU_PARALLEL_LOOP(collapse=3) + do i = 1, Nx + do j = 1, Ny + do k = 1, Nzloc + data_real_in1d(i + (j - 1)*Nx + (k - 1)*Nx*Ny) = data_real_3D_slabz(i, j, k) + end do + end do + end do +#if defined(MFC_OpenACC) + ierr = cufftExecD2Z(plan_x_fwd_gpu, data_real_in1d, data_cmplx_out1d) +#else + call fftw_execute_dft_r2c(plan_x_r2c_fwd, data_real_in1d, data_cmplx_out1d) +#endif + $:GPU_PARALLEL_LOOP(collapse=3) + do i = 1, NxC + do j = 1, Ny + do k = 1, Nzloc + data_cmplx_out1dy(j + (i - 1)*Ny + (k - 1)*Ny*NxC) = data_cmplx_out1d(i + (j - 1)*NxC + (k - 1)*NxC*Ny) + end do + end do + end do +#if defined(MFC_OpenACC) + ierr = cufftExecZ2Z(plan_y_gpu, data_cmplx_out1dy, data_cmplx_out1dy, CUFFT_FORWARD) +#else + call fftw_execute_dft(plan_y_c2c_fwd, data_cmplx_out1dy, data_cmplx_out1dy) +#endif + $:GPU_PARALLEL_LOOP(collapse=3) + do i = 1, NxC + do j = 1, Ny + do k = 1, Nzloc + data_cmplx_slabz_batch(6 + 3*(l - 1) + q, i, j, k) = data_cmplx_out1dy(j + (i - 1)*Ny + (k - 1)*Ny*NxC) + end do + end do + end do + end do + end do + + ! 
effective viscosity + do l = 1, 3 + do q = 1, 3 + $:GPU_PARALLEL_LOOP(collapse=3) + do i = 0, m + do j = 0, n + do k = 0, p + data_real_3D_slabz(i + 1, j + 1, k + 1) = visc_stress(l)%vf(q)%sf(i, j, k)*fluid_indicator_function%sf(i, j, k) + end do + end do + end do + $:GPU_PARALLEL_LOOP(collapse=3) + do i = 1, Nx + do j = 1, Ny + do k = 1, Nzloc + data_real_in1d(i + (j - 1)*Nx + (k - 1)*Nx*Ny) = data_real_3D_slabz(i, j, k) + end do + end do + end do +#if defined(MFC_OpenACC) + ierr = cufftExecD2Z(plan_x_fwd_gpu, data_real_in1d, data_cmplx_out1d) +#else + call fftw_execute_dft_r2c(plan_x_r2c_fwd, data_real_in1d, data_cmplx_out1d) +#endif + $:GPU_PARALLEL_LOOP(collapse=3) + do i = 1, NxC + do j = 1, Ny + do k = 1, Nzloc + data_cmplx_out1dy(j + (i - 1)*Ny + (k - 1)*Ny*NxC) = data_cmplx_out1d(i + (j - 1)*NxC + (k - 1)*NxC*Ny) + end do + end do + end do +#if defined(MFC_OpenACC) + ierr = cufftExecZ2Z(plan_y_gpu, data_cmplx_out1dy, data_cmplx_out1dy, CUFFT_FORWARD) +#else + call fftw_execute_dft(plan_y_c2c_fwd, data_cmplx_out1dy, data_cmplx_out1dy) +#endif + $:GPU_PARALLEL_LOOP(collapse=3) + do i = 1, NxC + do j = 1, Ny + do k = 1, Nzloc + data_cmplx_slabz_batch(15 + 3*(l - 1) + q, i, j, k) = data_cmplx_out1dy(j + (i - 1)*Ny + (k - 1)*Ny*NxC) + end do + end do + end do + end do + end do + + call s_mpi_transpose_slabZ2Y_batch + + ! cons vars + do l = 1, 5 + $:GPU_PARALLEL_LOOP(collapse=3) + do i = 1, NxC + do j = 1, Nyloc + do k = 1, Nz + data_cmplx_out1d(k + (i - 1)*Nz + (j - 1)*Nz*NxC) = data_cmplx_slaby_batch(l, i, j, k) + end do + end do + end do +#if defined(MFC_OpenACC) + ierr = cufftExecZ2Z(plan_z_gpu, data_cmplx_out1d, data_cmplx_out1d, CUFFT_FORWARD) +#else + call fftw_execute_dft(plan_z_c2c_fwd, data_cmplx_out1d, data_cmplx_out1d) +#endif + $:GPU_PARALLEL_LOOP(collapse=3) + do i = 1, NxC + do j = 1, Nyloc + do k = 1, Nz + data_cmplx_out1d(k + (i - 1)*Nz + (j - 1)*Nz*NxC) = data_cmplx_out1d(k + (i - 1)*Nz + (j - 1)*Nz*NxC)*cmplx_kernelG1d(k + (i - 1)*Nz + (j - 1)*Nz*NxC) + end do + end do + end do +#if defined(MFC_OpenACC) + ierr = cufftExecZ2Z(plan_z_gpu, data_cmplx_out1d, data_cmplx_out1d, CUFFT_INVERSE) +#else + call fftw_execute_dft(plan_z_c2c_bwd, data_cmplx_out1d, data_cmplx_out1d) +#endif + $:GPU_PARALLEL_LOOP(collapse=3) + do i = 1, NxC + do j = 1, Nyloc + do k = 1, Nz + data_cmplx_slaby_batch(l, i, j, k) = data_cmplx_out1d(k + (i - 1)*Nz + (j - 1)*Nz*NxC) + end do + end do + end do + end do + + ! pressure + $:GPU_PARALLEL_LOOP(collapse=3) + do i = 1, NxC + do j = 1, Nyloc + do k = 1, Nz + data_cmplx_out1d(k + (i - 1)*Nz + (j - 1)*Nz*NxC) = data_cmplx_slaby_batch(6, i, j, k) + end do + end do + end do +#if defined(MFC_OpenACC) + ierr = cufftExecZ2Z(plan_z_gpu, data_cmplx_out1d, data_cmplx_out1d, CUFFT_FORWARD) +#else + call fftw_execute_dft(plan_z_c2c_fwd, data_cmplx_out1d, data_cmplx_out1d) +#endif + $:GPU_PARALLEL_LOOP(collapse=3) + do i = 1, NxC + do j = 1, Nyloc + do k = 1, Nz + data_cmplx_out1d(k + (i - 1)*Nz + (j - 1)*Nz*NxC) = data_cmplx_out1d(k + (i - 1)*Nz + (j - 1)*Nz*NxC)*cmplx_kernelG1d(k + (i - 1)*Nz + (j - 1)*Nz*NxC) + end do + end do + end do +#if defined(MFC_OpenACC) + ierr = cufftExecZ2Z(plan_z_gpu, data_cmplx_out1d, data_cmplx_out1d, CUFFT_INVERSE) +#else + call fftw_execute_dft(plan_z_c2c_bwd, data_cmplx_out1d, data_cmplx_out1d) +#endif + $:GPU_PARALLEL_LOOP(collapse=3) + do i = 1, NxC + do j = 1, Nyloc + do k = 1, Nz + data_cmplx_slaby_batch(6, i, j, k) = data_cmplx_out1d(k + (i - 1)*Nz + (j - 1)*Nz*NxC) + end do + end do + end do + + ! 
reynolds stress + do l = 1, 3 + do q = 1, 3 + $:GPU_PARALLEL_LOOP(collapse=3) + do i = 1, NxC + do j = 1, Nyloc + do k = 1, Nz + data_cmplx_out1d(k + (i - 1)*Nz + (j - 1)*Nz*NxC) = data_cmplx_slaby_batch(6 + 3*(l - 1) + q, i, j, k) + end do + end do + end do +#if defined(MFC_OpenACC) + ierr = cufftExecZ2Z(plan_z_gpu, data_cmplx_out1d, data_cmplx_out1d, CUFFT_FORWARD) +#else + call fftw_execute_dft(plan_z_c2c_fwd, data_cmplx_out1d, data_cmplx_out1d) +#endif + $:GPU_PARALLEL_LOOP(collapse=3) + do i = 1, NxC + do j = 1, Nyloc + do k = 1, Nz + data_cmplx_out1d(k + (i - 1)*Nz + (j - 1)*Nz*NxC) = data_cmplx_out1d(k + (i - 1)*Nz + (j - 1)*Nz*NxC)*cmplx_kernelG1d(k + (i - 1)*Nz + (j - 1)*Nz*NxC) + end do + end do + end do +#if defined(MFC_OpenACC) + ierr = cufftExecZ2Z(plan_z_gpu, data_cmplx_out1d, data_cmplx_out1d, CUFFT_INVERSE) +#else + call fftw_execute_dft(plan_z_c2c_bwd, data_cmplx_out1d, data_cmplx_out1d) +#endif + $:GPU_PARALLEL_LOOP(collapse=3) + do i = 1, NxC + do j = 1, Nyloc + do k = 1, Nz + data_cmplx_slaby_batch(6 + 3*(l - 1) + q, i, j, k) = data_cmplx_out1d(k + (i - 1)*Nz + (j - 1)*Nz*NxC) + end do + end do + end do + end do + end do + + ! effective viscosity + do l = 1, 3 + do q = 1, 3 + $:GPU_PARALLEL_LOOP(collapse=3) + do i = 1, NxC + do j = 1, Nyloc + do k = 1, Nz + data_cmplx_out1d(k + (i - 1)*Nz + (j - 1)*Nz*NxC) = data_cmplx_slaby_batch(15 + 3*(l - 1) + q, i, j, k) + end do + end do + end do +#if defined(MFC_OpenACC) + ierr = cufftExecZ2Z(plan_z_gpu, data_cmplx_out1d, data_cmplx_out1d, CUFFT_FORWARD) +#else + call fftw_execute_dft(plan_z_c2c_fwd, data_cmplx_out1d, data_cmplx_out1d) +#endif + $:GPU_PARALLEL_LOOP(collapse=3) + do i = 1, NxC + do j = 1, Nyloc + do k = 1, Nz + data_cmplx_out1d(k + (i - 1)*Nz + (j - 1)*Nz*NxC) = data_cmplx_out1d(k + (i - 1)*Nz + (j - 1)*Nz*NxC)*cmplx_kernelG1d(k + (i - 1)*Nz + (j - 1)*Nz*NxC) + end do + end do + end do +#if defined(MFC_OpenACC) + ierr = cufftExecZ2Z(plan_z_gpu, data_cmplx_out1d, data_cmplx_out1d, CUFFT_INVERSE) +#else + call fftw_execute_dft(plan_z_c2c_bwd, data_cmplx_out1d, data_cmplx_out1d) +#endif + $:GPU_PARALLEL_LOOP(collapse=3) + do i = 1, NxC + do j = 1, Nyloc + do k = 1, Nz + data_cmplx_slaby_batch(15 + 3*(l - 1) + q, i, j, k) = data_cmplx_out1d(k + (i - 1)*Nz + (j - 1)*Nz*NxC) + end do + end do + end do + end do + end do + + call s_mpi_transpose_slabY2Z_batch + + ! 
cons vars + do l = 1, 5 + $:GPU_PARALLEL_LOOP(collapse=3) + do i = 1, NxC + do j = 1, Ny + do k = 1, Nzloc + data_cmplx_out1dy(j + (i - 1)*Ny + (k - 1)*Ny*NxC) = data_cmplx_slabz_batch(l, i, j, k) + end do + end do + end do +#if defined(MFC_OpenACC) + ierr = cufftExecZ2Z(plan_y_gpu, data_cmplx_out1dy, data_cmplx_out1dy, CUFFT_INVERSE) +#else + call fftw_execute_dft(plan_y_c2c_bwd, data_cmplx_out1dy, data_cmplx_out1dy) +#endif + $:GPU_PARALLEL_LOOP(collapse=3) + do i = 1, NxC + do j = 1, Ny + do k = 1, Nzloc + data_cmplx_out1d(i + (j - 1)*NxC + (k - 1)*NxC*Ny) = data_cmplx_out1dy(j + (i - 1)*Ny + (k - 1)*Ny*NxC) + end do + end do + end do +#if defined(MFC_OpenACC) + ierr = cufftExecZ2D(plan_x_bwd_gpu, data_cmplx_out1d, data_real_in1d) +#else + call fftw_execute_dft_c2r(plan_x_c2r_bwd, data_cmplx_out1d, data_real_in1d) +#endif + $:GPU_PARALLEL_LOOP(collapse=3) + do i = 1, Nx + do j = 1, Ny + do k = 1, Nzloc + data_real_3D_slabz(i, j, k) = data_real_in1d(i + (j - 1)*Nx + (k - 1)*Nx*Ny) + end do + end do + end do + $:GPU_PARALLEL_LOOP(collapse=3) + do i = 0, m + do j = 0, n + do k = 0, p + q_cons_filtered(l)%sf(i, j, k) = data_real_3D_slabz(i + 1, j + 1, k + 1)/(real(Nx*Ny*Nz, wp)*filtered_fluid_indicator_function%sf(i, j, k)) + end do + end do + end do + end do + + ! pressure + $:GPU_PARALLEL_LOOP(collapse=3) + do i = 1, NxC + do j = 1, Ny + do k = 1, Nzloc + data_cmplx_out1dy(j + (i - 1)*Ny + (k - 1)*Ny*NxC) = data_cmplx_slabz_batch(6, i, j, k) + end do + end do + end do +#if defined(MFC_OpenACC) + ierr = cufftExecZ2Z(plan_y_gpu, data_cmplx_out1dy, data_cmplx_out1dy, CUFFT_INVERSE) +#else + call fftw_execute_dft(plan_y_c2c_bwd, data_cmplx_out1dy, data_cmplx_out1dy) +#endif + $:GPU_PARALLEL_LOOP(collapse=3) + do i = 1, NxC + do j = 1, Ny + do k = 1, Nzloc + data_cmplx_out1d(i + (j - 1)*NxC + (k - 1)*NxC*Ny) = data_cmplx_out1dy(j + (i - 1)*Ny + (k - 1)*Ny*NxC) + end do + end do + end do +#if defined(MFC_OpenACC) + ierr = cufftExecZ2D(plan_x_bwd_gpu, data_cmplx_out1d, data_real_in1d) +#else + call fftw_execute_dft_c2r(plan_x_c2r_bwd, data_cmplx_out1d, data_real_in1d) +#endif + $:GPU_PARALLEL_LOOP(collapse=3) + do i = 1, Nx + do j = 1, Ny + do k = 1, Nzloc + data_real_3D_slabz(i, j, k) = data_real_in1d(i + (j - 1)*Nx + (k - 1)*Nx*Ny) + end do + end do + end do + $:GPU_PARALLEL_LOOP(collapse=3) + do i = 0, m + do j = 0, n + do k = 0, p + filtered_pressure%sf(i, j, k) = data_real_3D_slabz(i + 1, j + 1, k + 1)/(real(Nx*Ny*Nz, wp)*filtered_fluid_indicator_function%sf(i, j, k)) + end do + end do + end do + + ! 
reynolds stress + do l = 1, 3 + do q = 1, 3 + $:GPU_PARALLEL_LOOP(collapse=3) + do i = 1, NxC + do j = 1, Ny + do k = 1, Nzloc + data_cmplx_out1dy(j + (i - 1)*Ny + (k - 1)*Ny*NxC) = data_cmplx_slabz_batch(6 + 3*(l - 1) + q, i, j, k) + end do + end do + end do +#if defined(MFC_OpenACC) + ierr = cufftExecZ2Z(plan_y_gpu, data_cmplx_out1dy, data_cmplx_out1dy, CUFFT_INVERSE) +#else + call fftw_execute_dft(plan_y_c2c_bwd, data_cmplx_out1dy, data_cmplx_out1dy) +#endif + $:GPU_PARALLEL_LOOP(collapse=3) + do i = 1, NxC + do j = 1, Ny + do k = 1, Nzloc + data_cmplx_out1d(i + (j - 1)*NxC + (k - 1)*NxC*Ny) = data_cmplx_out1dy(j + (i - 1)*Ny + (k - 1)*Ny*NxC) + end do + end do + end do +#if defined(MFC_OpenACC) + ierr = cufftExecZ2D(plan_x_bwd_gpu, data_cmplx_out1d, data_real_in1d) +#else + call fftw_execute_dft_c2r(plan_x_c2r_bwd, data_cmplx_out1d, data_real_in1d) +#endif + $:GPU_PARALLEL_LOOP(collapse=3) + do i = 1, Nx + do j = 1, Ny + do k = 1, Nzloc + data_real_3D_slabz(i, j, k) = data_real_in1d(i + (j - 1)*Nx + (k - 1)*Nx*Ny) + end do + end do + end do + $:GPU_PARALLEL_LOOP(collapse=3) + do i = 0, m + do j = 0, n + do k = 0, p + reynolds_stress(l)%vf(q)%sf(i, j, k) = data_real_3D_slabz(i + 1, j + 1, k + 1)/(real(Nx*Ny*Nz, wp)*filtered_fluid_indicator_function%sf(i, j, k)) + end do + end do + end do + end do + end do + + ! effective viscosity + do l = 1, 3 + do q = 1, 3 + $:GPU_PARALLEL_LOOP(collapse=3) + do i = 1, NxC + do j = 1, Ny + do k = 1, Nzloc + data_cmplx_out1dy(j + (i - 1)*Ny + (k - 1)*Ny*NxC) = data_cmplx_slabz_batch(15 + 3*(l - 1) + q, i, j, k) + end do + end do + end do +#if defined(MFC_OpenACC) + ierr = cufftExecZ2Z(plan_y_gpu, data_cmplx_out1dy, data_cmplx_out1dy, CUFFT_INVERSE) +#else + call fftw_execute_dft(plan_y_c2c_bwd, data_cmplx_out1dy, data_cmplx_out1dy) +#endif + $:GPU_PARALLEL_LOOP(collapse=3) + do i = 1, NxC + do j = 1, Ny + do k = 1, Nzloc + data_cmplx_out1d(i + (j - 1)*NxC + (k - 1)*NxC*Ny) = data_cmplx_out1dy(j + (i - 1)*Ny + (k - 1)*Ny*NxC) + end do + end do + end do +#if defined(MFC_OpenACC) + ierr = cufftExecZ2D(plan_x_bwd_gpu, data_cmplx_out1d, data_real_in1d) +#else + call fftw_execute_dft_c2r(plan_x_c2r_bwd, data_cmplx_out1d, data_real_in1d) +#endif + $:GPU_PARALLEL_LOOP(collapse=3) + do i = 1, Nx + do j = 1, Ny + do k = 1, Nzloc + data_real_3D_slabz(i, j, k) = data_real_in1d(i + (j - 1)*Nx + (k - 1)*Nx*Ny) + end do + end do + end do + $:GPU_PARALLEL_LOOP(collapse=3) + do i = 0, m + do j = 0, n + do k = 0, p + eff_visc(l)%vf(q)%sf(i, j, k) = data_real_3D_slabz(i + 1, j + 1, k + 1)/(real(Nx*Ny*Nz, wp)*filtered_fluid_indicator_function%sf(i, j, k)) + end do + end do + end do + end do + end do + + end subroutine s_filter_batch + + !< compute forward FFT, input: data_real_3D_slabz, output: data_cmplx_out1d + subroutine s_mpi_FFT_fwd + integer :: i, j, k + + ! 3D z-slab -> 1D x, y, z + $:GPU_PARALLEL_LOOP(collapse=3) + do i = 1, Nx + do j = 1, Ny + do k = 1, Nzloc + data_real_in1d(i + (j - 1)*Nx + (k - 1)*Nx*Ny) = data_real_3D_slabz(i, j, k) + end do + end do + end do + + ! X FFT +#if defined(MFC_OpenACC) + ierr = cufftExecD2Z(plan_x_fwd_gpu, data_real_in1d, data_cmplx_out1d) +#else + call fftw_execute_dft_r2c(plan_x_r2c_fwd, data_real_in1d, data_cmplx_out1d) +#endif + + ! 1D x, y, z -> 1D y, x, z (CMPLX) + $:GPU_PARALLEL_LOOP(collapse=3) + do i = 1, NxC + do j = 1, Ny + do k = 1, Nzloc + data_cmplx_out1dy(j + (i - 1)*Ny + (k - 1)*Ny*NxC) = data_cmplx_out1d(i + (j - 1)*NxC + (k - 1)*NxC*Ny) + end do + end do + end do + + ! 
Y FFT +#if defined(MFC_OpenACC) + ierr = cufftExecZ2Z(plan_y_gpu, data_cmplx_out1dy, data_cmplx_out1dy, CUFFT_FORWARD) +#else + call fftw_execute_dft(plan_y_c2c_fwd, data_cmplx_out1dy, data_cmplx_out1dy) +#endif + + ! 1D y, x, z -> 3D z-slab + $:GPU_PARALLEL_LOOP(collapse=3) + do i = 1, NxC + do j = 1, Ny + do k = 1, Nzloc + data_cmplx_slabz(i, j, k) = data_cmplx_out1dy(j + (i - 1)*Ny + (k - 1)*Ny*NxC) + end do + end do + end do + + ! transpose z-slab to y-slab + call nvtxStartRange("SLAB-MPI-TRANSPOSE-Z2Y") + call s_mpi_transpose_slabZ2Y + call nvtxEndRange + + ! 3D y-slab -> 1D z, x, y + $:GPU_PARALLEL_LOOP(collapse=3) + do i = 1, NxC + do j = 1, Nyloc + do k = 1, Nz + data_cmplx_out1d(k + (i - 1)*Nz + (j - 1)*Nz*NxC) = data_cmplx_slaby(i, j, k) + end do + end do + end do + + ! Z FFT +#if defined(MFC_OpenACC) + ierr = cufftExecZ2Z(plan_z_gpu, data_cmplx_out1d, data_cmplx_out1d, CUFFT_FORWARD) +#else + call fftw_execute_dft(plan_z_c2c_fwd, data_cmplx_out1d, data_cmplx_out1d) +#endif + + ! return data_cmplx_out1d: 1D z, x, y + end subroutine s_mpi_FFT_fwd + + !< compute inverse FFT, input: data_cmplx_out1d, output: data_real_3D_slabz + subroutine s_mpi_FFT_bwd + integer :: i, j, k + + ! Z inv FFT +#if defined(MFC_OpenACC) + ierr = cufftExecZ2Z(plan_z_gpu, data_cmplx_out1d, data_cmplx_out1d, CUFFT_INVERSE) +#else + call fftw_execute_dft(plan_z_c2c_bwd, data_cmplx_out1d, data_cmplx_out1d) +#endif + + ! 1D z, x, y -> 3D y-slab + $:GPU_PARALLEL_LOOP(collapse=3) + do i = 1, NxC + do j = 1, Nyloc + do k = 1, Nz + data_cmplx_slaby(i, j, k) = data_cmplx_out1d(k + (i - 1)*Nz + (j - 1)*Nz*NxC) + end do + end do + end do + + ! transpose y-slab to z-slab + call nvtxStartRange("SLAB-MPI-TRANSPOSE-Y2Z") + call s_mpi_transpose_slabY2Z + call nvtxEndRange + + ! 3D z-slab -> 1D y, x, z + $:GPU_PARALLEL_LOOP(collapse=3) + do i = 1, NxC + do j = 1, Ny + do k = 1, Nzloc + data_cmplx_out1dy(j + (i - 1)*Ny + (k - 1)*Ny*NxC) = data_cmplx_slabz(i, j, k) + end do + end do + end do + + ! Y inv FFT +#if defined(MFC_OpenACC) + ierr = cufftExecZ2Z(plan_y_gpu, data_cmplx_out1dy, data_cmplx_out1dy, CUFFT_INVERSE) +#else + call fftw_execute_dft(plan_y_c2c_bwd, data_cmplx_out1dy, data_cmplx_out1dy) +#endif + + ! 1D y, x, z -> 1D x, y, z + $:GPU_PARALLEL_LOOP(collapse=3) + do i = 1, NxC + do j = 1, Ny + do k = 1, Nzloc + data_cmplx_out1d(i + (j - 1)*NxC + (k - 1)*NxC*Ny) = data_cmplx_out1dy(j + (i - 1)*Ny + (k - 1)*Ny*NxC) + end do + end do + end do + + ! X inv FFT +#if defined(MFC_OpenACC) + ierr = cufftExecZ2D(plan_x_bwd_gpu, data_cmplx_out1d, data_real_in1d) +#else + call fftw_execute_dft_c2r(plan_x_c2r_bwd, data_cmplx_out1d, data_real_in1d) +#endif + + ! 
1D x, y, z -> 3D z-slab + $:GPU_PARALLEL_LOOP(collapse=3) + do i = 1, Nx + do j = 1, Ny + do k = 1, Nzloc + data_real_3D_slabz(i, j, k) = data_real_in1d(i + (j - 1)*Nx + (k - 1)*Nx*Ny) + end do + end do + end do + + end subroutine s_mpi_FFT_bwd + + subroutine s_finalize_fftw_explicit_filter_module + integer :: i, j + + @:DEALLOCATE(fluid_indicator_function%sf) + @:DEALLOCATE(filtered_fluid_indicator_function%sf) + do i = 1, 3 + @:DEALLOCATE(grad_fluid_indicator(i)%sf) + end do + @:DEALLOCATE(grad_fluid_indicator) + + do i = 1, sys_size - 1 + @:DEALLOCATE(q_cons_filtered(i)%sf) + end do + @:DEALLOCATE(q_cons_filtered) + + @:DEALLOCATE(filtered_pressure%sf) + + do i = 1, num_dims + do j = 1, num_dims + @:DEALLOCATE(visc_stress(i)%vf(j)%sf) + end do + @:DEALLOCATE(visc_stress(i)%vf) + end do + @:DEALLOCATE(visc_stress) + + do i = 1, num_dims + do j = 1, num_dims + @:DEALLOCATE(pres_visc_stress(i)%vf(j)%sf) + end do + @:DEALLOCATE(pres_visc_stress(i)%vf) + end do + @:DEALLOCATE(pres_visc_stress) + + do i = 1, num_dims + @:DEALLOCATE(div_pres_visc_stress(i)%sf) + end do + @:DEALLOCATE(div_pres_visc_stress) + + do i = 1, num_dims + do j = 1, num_dims + @:DEALLOCATE(reynolds_stress(i)%vf(j)%sf) + end do + @:DEALLOCATE(reynolds_stress(i)%vf) + end do + @:DEALLOCATE(reynolds_stress) + + do i = 1, num_dims + do j = 1, num_dims + @:DEALLOCATE(eff_visc(i)%vf(j)%sf) + end do + @:DEALLOCATE(eff_visc(i)%vf) + end do + @:DEALLOCATE(eff_visc) + + do i = 1, num_dims + @:DEALLOCATE(int_mom_exch(i)%sf) + end do + @:DEALLOCATE(int_mom_exch) + + @:DEALLOCATE(Res) + @:DEALLOCATE(particle_forces) + + @:DEALLOCATE(data_real_in1d, data_cmplx_out1d, data_cmplx_out1dy) + @:DEALLOCATE(cmplx_kernelG1d, real_kernelG_in) + @:DEALLOCATE(data_real_3D_slabz, data_cmplx_slabz, data_cmplx_slaby) + @:DEALLOCATE(data_cmplx_slabz_batch, data_cmplx_slaby_batch) + + @:DEALLOCATE(sendbuf_sf, recvbuf_sf) + @:DEALLOCATE(sendbuf_batch, recvbuf_batch) + +#if defined(MFC_OpenACC) + ierr = cufftDestroy(plan_x_fwd_gpu) + ierr = cufftDestroy(plan_x_bwd_gpu) + ierr = cufftDestroy(plan_y_gpu) + ierr = cufftDestroy(plan_z_gpu) +#else + call fftw_destroy_plan(plan_x_r2c_fwd) + call fftw_destroy_plan(plan_x_c2r_bwd) + call fftw_destroy_plan(plan_y_c2c_fwd) + call fftw_destroy_plan(plan_y_c2c_bwd) + call fftw_destroy_plan(plan_z_c2c_fwd) + call fftw_destroy_plan(plan_z_c2c_bwd) + call fftw_destroy_plan(plan_x_r2c_kernelG) + call fftw_destroy_plan(plan_y_c2c_kernelG) + call fftw_destroy_plan(plan_z_c2c_kernelG) +#endif + + if (compute_particle_drag) then + if (proc_rank == 0) then + close (100) + end if + end if + + end subroutine s_finalize_fftw_explicit_filter_module + +end module m_volume_filtering diff --git a/src/simulation/p_main.fpp b/src/simulation/p_main.fpp index 29cb3b8281..dcd11525eb 100644 --- a/src/simulation/p_main.fpp +++ b/src/simulation/p_main.fpp @@ -22,6 +22,8 @@ program p_main use m_nvtx + use m_volume_filtering + implicit none integer :: t_step !< Iterator for the time-stepping loop @@ -54,6 +56,13 @@ program p_main call s_initialize_gpu_vars() call nvtxEndRange + if (volume_filtering_momentum_eqn .or. periodic_forcing) call s_initialize_fluid_indicator_function(bc_type) + if (volume_filtering_momentum_eqn) then + call s_initialize_filtering_kernel() + call s_initialize_filtered_fluid_indicator_function() + call s_initialize_fluid_indicator_gradient() + end if + ! 
Setting the time-step iterator to the first time-step + if (cfl_dt) then + t_step = 0 diff --git a/toolchain/mfc/run/case_dicts.py b/toolchain/mfc/run/case_dicts.py index 80eb30cc34..4fd05780b6 100644 --- a/toolchain/mfc/run/case_dicts.py +++ b/toolchain/mfc/run/case_dicts.py @@ -66,6 +66,9 @@ def analytic(self): 'down_sample': ParamType.LOG, 'recon_type': ParamType.INT, 'muscl_order': ParamType.INT, + 'periodic_ibs': ParamType.LOG, + 'store_levelset': ParamType.LOG, + 'slab_domain_decomposition': ParamType.LOG, } PRE_PROCESS = COMMON.copy() @@ -109,7 +112,7 @@ def analytic(self): 'fft_wrt': ParamType.LOG, }) -for ib_id in range(1, 10+1): +for ib_id in range(1, 1000+1): for real_attr, ty in [("geometry", ParamType.INT), ("radius", ParamType.REAL), ("theta", ParamType.REAL), ("slip", ParamType.LOG), ("c", ParamType.REAL), ("p", ParamType.REAL), @@ -324,6 +327,15 @@ def analytic(self): 'nv_uvm_igr_temps_on_gpu': ParamType.INT, 'nv_uvm_pref_gpu': ParamType.LOG, 'fft_wrt': ParamType.LOG, + 'compute_particle_drag': ParamType.LOG, + 'u_inf_ref': ParamType.REAL, + 'rho_inf_ref': ParamType.REAL, + 'P_inf_ref': ParamType.REAL, + 'periodic_forcing': ParamType.LOG, + 'volume_filtering_momentum_eqn': ParamType.LOG, + 't_step_stat_start': ParamType.INT, + 'filter_width': ParamType.REAL, + 'q_filtered_wrt': ParamType.LOG, }) for var in [ 'heatTransfer_model', 'massTransfer_model', 'pressure_corrector', @@ -343,7 +355,7 @@ def analytic(self): for var in [ 'gamma_method' ]: SIMULATION[f'chem_params%{var}'] = ParamType.INT -for ib_id in range(1, 10+1): +for ib_id in range(1, 1000+1): for real_attr, ty in [("geometry", ParamType.INT), ("radius", ParamType.REAL), ("theta", ParamType.REAL), ("slip", ParamType.LOG), ("c", ParamType.REAL), ("p", ParamType.REAL), @@ -489,6 +501,7 @@ def analytic(self): 'lag_mg_wrt': ParamType.LOG, 'lag_betaT_wrt': ParamType.LOG, 'lag_betaC_wrt': ParamType.LOG, + 'q_filtered_wrt': ParamType.LOG, }) for cmp in ["x", "y", "z"]: diff --git a/toolchain/templates/delta.mako b/toolchain/templates/delta.mako index 694f22c457..52246fd334 100644 --- a/toolchain/templates/delta.mako +++ b/toolchain/templates/delta.mako @@ -16,7 +16,7 @@ % endif % if gpu: #SBATCH --gpus-per-node=${tasks_per_node} -#SBATCH --mem=208G +#SBATCH --mem=240G #SBATCH --gpu-bind=closest % endif #SBATCH --output="${name}.out" diff --git a/voronoi/gen_voronoi_2D.py b/voronoi/gen_voronoi_2D.py new file mode 100644 index 0000000000..73beb4b8d7 --- /dev/null +++ b/voronoi/gen_voronoi_2D.py @@ -0,0 +1,99 @@ +import numpy as np +import matplotlib +import matplotlib.pyplot as plt +import freud + + +# lloyd relaxation +def compute_simplex_centroid(simplex_vertices): + # v1, v2, v3 are the x, y, z coordinate columns of the three vertices of each simplex + v1 = simplex_vertices[:, :, 0] + v2 = simplex_vertices[:, :, 1] + v3 = simplex_vertices[:, :, 2] + + v1_mean = np.mean(v1, axis=1) + v2_mean = np.mean(v2, axis=1) + v3_mean = np.mean(v3, axis=1) + + simplex_centroids = np.array([v1_mean, v2_mean, v3_mean]) + + return simplex_centroids + +def compute_simplex_area(simplex_vertices): + v1 = simplex_vertices[:, :, 0] + v2 = simplex_vertices[:, :, 1] + v3 = simplex_vertices[:, :, 2] + + area = 0.5 * np.linalg.norm( np.cross(v2 - v1, v3 - v1), axis=1 ) + + return area + +def lloyd_relaxation_2d(initial_points, box, w=1.0, iterations=20): + points = np.copy(initial_points) # work on a copy so the caller's array is not mutated in place + + for _ in range(iterations): + voro = freud.locality.Voronoi() + voro_data = voro.compute((box, points)) # recompute the diagram from the current (not the initial) points + vertices = voro_data.polytopes + + for i in range(len(points)): + n = len(vertices[i]) + + simplex_vertices = np.array( [(points[i, :], vertices[i][j-1], vertices[i][j]) for j in range(n)] ) + + simplex_centroids = compute_simplex_centroid(simplex_vertices) + simplex_areas = compute_simplex_area(simplex_vertices) + + centroid = (1/np.sum(simplex_areas)) * (np.sum(simplex_centroids*simplex_areas, axis=1)) + + dist = centroid - points[i, :] + + points[i, :] += w * dist + + points = box.wrap(points) + + return points
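For reference, the weighted-centroid step above fans each Voronoi cell into triangles anchored at its seed and area-averages the triangle centroids. Note that compute_simplex_centroid and compute_simplex_area slice coordinate columns rather than vertices; this gives the same centroid, and (because the 2x2 minors of a matrix and its transpose agree) the same area as the usual vertex form. A standalone sketch with a hypothetical square cell and off-center seed:

import numpy as np

# area-weighted centroid via a triangle fan, mirroring lloyd_relaxation_2d
seed = np.array([0.2, 0.1, 0.0])
cell = np.array([[1, 1, 0], [-1, 1, 0], [-1, -1, 0], [1, -1, 0]], dtype=float)

n = len(cell)
tris = np.array([(seed, cell[j-1], cell[j]) for j in range(n)])  # fan around the seed
cents = tris.mean(axis=1)                                        # triangle centroids
areas = 0.5*np.linalg.norm(np.cross(tris[:, 1] - tris[:, 0],
                                    tris[:, 2] - tris[:, 0]), axis=1)
centroid = (areas[:, None]*cents).sum(axis=0)/areas.sum()
print(centroid)   # ~ (0, 0, 0): the cell centroid, independent of the seed location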
if (__name__ == '__main__'): + print('running 2D...') + + # setup + phi = 0.4 + D = 0.1 + L = 10*D + + N = int( 4*phi*L**2 / (np.pi*D**2) ) + print(f'volume fraction phi: {phi}, number of circles: {N}') + + x_i = L/2 * np.random.uniform(-1, 1, N) + y_i = L/2 * np.random.uniform(-1, 1, N) + z_i = np.zeros(N) # freud expects 3D coordinates; z = 0 in a 2D box + + initial_points = np.stack((x_i, y_i, z_i), axis=1) + + box = freud.box.Box.square(L) + voro = freud.locality.Voronoi() + + voro.compute((box, initial_points)) + + # plot initial distribution + plt.figure() + ax = plt.gca() + voro.plot(ax=ax, cmap='RdBu') + ax.scatter(initial_points[:, 0], initial_points[:, 1], s=5, c='k') + plt.show() + plt.close() + + # calculate relaxed points + relaxed_points = lloyd_relaxation_2d(initial_points, box, w=1.5, iterations=25) + voro.compute((box, relaxed_points)) + + # plot relaxed distribution + plt.figure() + ax = plt.gca() + voro.plot(ax=ax, cmap='RdBu') + ax.scatter(relaxed_points[:, 0], relaxed_points[:, 1], s=5, c='k') + plt.show() + plt.close()
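Both Voronoi scripts size the array from a target packing fraction: N = 4 phi L^2 / (pi D^2) circles in 2D and N = 6 phi L^3 / (pi D^3) spheres in 3D, with the int() truncation pulling the realized fraction slightly below target. A quick check with the values used in these scripts:

import numpy as np

# particle counts implied by the target packing fractions above
D = 0.1
L = 10*D

phi2 = 0.4                              # 2D script
N2 = int(4*phi2*L**2/(np.pi*D**2))      # phi = N*(pi D^2/4)/L^2  ->  50 circles
phi3 = 0.1                              # 3D script
N3 = int(6*phi3*L**3/(np.pi*D**3))      # phi = N*(pi D^3/6)/L^3  ->  190 spheres

print(N2, N3)
print(N3*(np.pi*D**3/6)/L**3)           # realized 3D fraction ~0.0995 after truncation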
diff --git a/voronoi/gen_voronoi_3D.py b/voronoi/gen_voronoi_3D.py new file mode 100644 index 0000000000..ecb08eb36c --- /dev/null +++ b/voronoi/gen_voronoi_3D.py @@ -0,0 +1,98 @@ +import os +import numpy as np +import matplotlib +import matplotlib.pyplot as plt +import freud + + +# lloyd relaxation +def compute_tetrahedron_centroid(tetrahedron_vertices): + return np.mean(tetrahedron_vertices, axis=0) + +def compute_tetrahedron_volume(tetrahedron_vertices): + v0, v1, v2, v3 = tetrahedron_vertices + matrix = np.vstack([v1 - v0, v2 - v0, v3 - v0]).T + volume = np.abs(np.linalg.det(matrix)) / 6 + + return volume + +def lloyd_relaxation_3d(initial_points, box, w=1, iterations=10): + points = np.copy(initial_points) # work on a copy so the caller's array (and the 'initial points' plot below) is not mutated + + for _ in range(iterations): + voro = freud.locality.Voronoi() + voro_data = voro.compute((box, points)) + vertices = voro_data.polytopes + + for i in range(len(points)): + n = len(vertices[i]) + + # approximate decomposition of the cell: a fan of tetrahedra over consecutive vertex triples + tetrahedra = [] + for j in range(n): + tetrahedra.append([points[i, :], vertices[i][j], vertices[i][(j+1) % n], vertices[i][(j+2) % n]]) + + centroids = np.array([compute_tetrahedron_centroid(t) for t in tetrahedra]) + volumes = np.array([compute_tetrahedron_volume(t) for t in tetrahedra]) + + weighted_centroid = np.sum(centroids * volumes[:, np.newaxis], axis=0) + total_volume = np.sum(volumes) + + if total_volume > 1.0e-12: + centroid = weighted_centroid / total_volume + dist = centroid - points[i, :] + + points[i, :] += w * dist + + points = box.wrap(points) + + return points + +if (__name__ == '__main__'): + print('running 3D...') + + # setup + phi = 0.1 + str_phi = '01' + + D = 0.1 + L = 10*D + + output_dir = '../runs/phi'+str_phi + if not os.path.exists(output_dir): + os.mkdir(output_dir) + + N_sphere = int( 6*phi*L**3 / (np.pi*D**3) ) + print(f'volume fraction phi: {phi}, number of spheres: {N_sphere}') + print(f'actual phi value: {N_sphere*4/3*np.pi*(D/2)**3/(L**3)}') + + x_i = L/2 * np.random.uniform(-1, 1, N_sphere) + y_i = L/2 * np.random.uniform(-1, 1, N_sphere) + z_i = L/2 * np.random.uniform(-1, 1, N_sphere) + + initial_points = np.stack((x_i, y_i, z_i), axis=1) + box = freud.box.Box.cube(L) + + relaxed_points = lloyd_relaxation_3d(initial_points, box, iterations=40) + print(np.shape(relaxed_points)) + + np.savetxt(output_dir+'/sphere_array_locations.txt', relaxed_points) + + # check that no spheres are overlapping (each pair tested once) + for i in range(N_sphere): + for j in range(i + 1, N_sphere): + dist = np.sqrt((relaxed_points[i, 0] - relaxed_points[j, 0])**2 + (relaxed_points[i, 1] - relaxed_points[j, 1])**2 + (relaxed_points[i, 2] - relaxed_points[j, 2])**2) + if (dist <= 1.05*D): + print(f'spheres overlapping, dist={dist}, spheres #: {i}, {j}') + print(f'locations: ({relaxed_points[i, :]}), ({relaxed_points[j, :]})') + + fig = plt.figure(figsize=(10,5)) + ax1 = fig.add_subplot(121, projection='3d') + ax1.scatter(initial_points[:, 0], initial_points[:, 1], initial_points[:, 2], color='blue', s=10) + ax1.set_title('initial points') + ax2 = fig.add_subplot(122, projection='3d') + ax2.scatter(relaxed_points[:, 0], relaxed_points[:, 1], relaxed_points[:, 2], color='red', s=10) + ax2.set_title('relaxed points') + plt.show() + plt.close()
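One caveat on the overlap check above: it measures plain Euclidean distance, so near-contacts across the periodic boundary are missed even though box.wrap treats the domain as periodic. A minimum-image variant (a sketch, assuming the same cubic box of side L) would catch those:

import numpy as np

# minimum-image overlap check for a periodic cube of side L, complementing the
# Euclidean check in gen_voronoi_3D.py, which ignores boundary images
def check_overlaps(points, L, D, tol=1.05):
    n = len(points)
    for i in range(n):
        d = points[i+1:] - points[i]
        d -= L*np.round(d/L)                 # wrap each component into [-L/2, L/2]
        dist = np.linalg.norm(d, axis=1)
        for j in np.nonzero(dist <= tol*D)[0]:
            print(f'overlap: spheres {i} and {i+1+j}, dist={dist[j]:.4g}')

# e.g. check_overlaps(relaxed_points, L=1.0, D=0.1)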