From bdaaa2d6bf6c0f7f2bf7d73943a48962706d7457 Mon Sep 17 00:00:00 2001
From: Jan Streffing <jan.streffing@awi.de>
Date: Mon, 27 Apr 2026 17:10:38 +0200
Subject: [PATCH] perf: OpenMP fork-join consolidation + SIMD hints + I/O
 timer-attribution barrier
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Cuts FESOM-side timestep overhead by reducing OpenMP fork-join count and
giving the compiler explicit vectorization hints in the hottest streams
in tracers, dynamics, ALE mixing, and the mean-output bookkeeping.

Theme A — combine adjacent !$OMP PARALLEL DO regions into a single
!$OMP PARALLEL with multiple !$OMP DO blocks. Each pair previously paid
two fork-joins per timestep; now one. Touched: oce_ale_mixing_kpp
(KPP coefficient + boundary-layer setup), oce_ale_tracer (Fer_GM bolus
add / subtract), oce_ale_vel_rhs (rhs build + ke_cor diagnostic),
oce_tracer_mod (init_tracers_AB + AB interpolation).

Theme B — io_meandata.update_means: replaced the per-stream
PARALLEL DO with one outer PARALLEL DO over io_NSTREAMS and lifted the
size() calls out of the inner loop into szI/szJ; added !$OMP SIMD on
the inner I-axis so the compiler can vectorize the float adds. At HR
the original code spawned ~50 streams * ~6700 calls/sim-month worth
of OMP regions per rank per month; one outer fork-join replaces the lot.

Theme C — io_xios mask helpers: io_xios_apply_wet_2d / 2d_elem / 3d
(_r4 and _r8) and io_xios_apply_ice_mask_2d_elem now run their fill-
loops under !$OMP PARALLEL DO. Lifts size() / min() out of the loop
header (nn / ne) so OMP doesn't have to re-evaluate every iteration.

Theme D — fesom_module: insert one MPI_Barrier at the end of the I/O
block before f%t5 = MPI_Wtime(). XIOS-client / OASIS asymmetry stalls
that would otherwise be absorbed by the first halo exchange in
ice_timestep are now captured in rtime_write_means instead of leaking
into rtime_fullice. No wall-time change, only honest accounting.

Theme E — oce_ale.F90 and oce_setup_step.F90: trailing-whitespace
cleanup that landed on the same lines as the OMP rework; pulled in to
keep the OMP commits hunk-clean.
---
 src/fesom_module.F90       |  7 ++++
 src/io_meandata.F90        | 51 ++++++++++++++++-----------
 src/io_xios.F90            | 70 ++++++++++++++++++++++++++-----------
 src/oce_ale.F90            | 10 +++---
 src/oce_ale_mixing_kpp.F90 | 31 +++++++++--------
 src/oce_ale_tracer.F90     | 20 ++++++-----
 src/oce_ale_vel_rhs.F90    | 22 ++++++------
 src/oce_setup_step.F90     |  6 ++--
 src/oce_tracer_mod.F90     | 71 +++++++++++++++++++-------------------
 9 files changed, 170 insertions(+), 118 deletions(-)

diff --git a/src/fesom_module.F90 b/src/fesom_module.F90
index 788e5cf77..1c76a2c6c 100755
--- a/src/fesom_module.F90
+++ b/src/fesom_module.F90
@@ -784,6 +784,13 @@ subroutine fesom_runloop(current_nsteps)
         end if
         !--------------------------
 
+        ! Sync point at end of I/O block: any upstream stall (XIOS client work,
+        ! OASIS asymmetry) that would otherwise be absorbed by the next MPI
+        ! halo exchange in ice_timestep is captured here, attributed to
+        ! rtime_write_means instead of rtime_fullice. Doesn't add wall time
+        ! (the wait happens anyway), just makes timer accounting honest.
+        call MPI_Barrier(f%MPI_COMM_FESOM, f%MPIerr)
+
         f%t5 = MPI_Wtime()
 #if defined (FESOM_PROFILING)
         call fesom_profiler_start("restart")
diff --git a/src/io_meandata.F90 b/src/io_meandata.F90
index 1adf4d032..ed257d3c7 100644
--- a/src/io_meandata.F90
+++ b/src/io_meandata.F90
@@ -2602,55 +2602,64 @@ subroutine update_means
     type(Meandata), pointer :: entry
     integer                 :: n
     integer                 :: I, J
-
+    integer                 :: szI, szJ
+
+    ! Single OMP fork-join across all streams instead of one per stream. With
+    ! ~50 io_NSTREAMS and ~6700 calls/sim-month at HR (FESOM dt=400s), the
+    ! original code spawned ~335k OMP regions per rank per month, each with
+    ! its own fork-join overhead. SIMD hints on the innermost loop let the
+    ! compiler vectorize the float adds. Targets the 146 s/rank/month
+    ! FESOM-side stream-prep cost identified in HR_test_45 profiling.
+!$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(n, entry, I, J, szI, szJ) SCHEDULE(dynamic)
     DO n=1, io_NSTREAMS
         entry=>io_stream(n)%p
 
         !_____________ compute in 8 byte accuracy ______________________________
         IF (entry%accuracy == i_real8) then
+            szI = size(entry%local_values_r8, dim=1)
+            szJ = size(entry%local_values_r8, dim=2)
             IF (entry%flip) then
-!$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(I,J)
-                DO J=1, size(entry%local_values_r8,dim=2)
-                    DO I=1, size(entry%local_values_r8,dim=1)
+                DO J=1, szJ
+!$OMP SIMD
+                    DO I=1, szI
                         entry%local_values_r8(I,J)=entry%local_values_r8(I,J)+entry%ptr3(J,I)
                     END DO
                 END DO
-!$OMP END PARALLEL DO
             ELSE
-!$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(I,J)
-                DO J=1, size(entry%local_values_r8,dim=2)
-                    DO I=1, size(entry%local_values_r8,dim=1)
+                DO J=1, szJ
+!$OMP SIMD
+                    DO I=1, szI
                         entry%local_values_r8(I,J)=entry%local_values_r8(I,J)+entry%ptr3(I,J)
                     END DO
                 END DO
-!$OMP END PARALLEL DO
             END IF
-            
+
         !_____________ compute in 4 byte accuracy ______________________________
         ELSE IF (entry%accuracy == i_real4) then
+            szI = size(entry%local_values_r4, dim=1)
+            szJ = size(entry%local_values_r4, dim=2)
             IF (entry%flip) then
-!$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(I,J)
-                DO J=1, size(entry%local_values_r4,dim=2)
-                    DO I=1, size(entry%local_values_r4,dim=1)
+                DO J=1, szJ
+!$OMP SIMD
+                    DO I=1, szI
                         entry%local_values_r4(I,J)=entry%local_values_r4(I,J)+real(entry%ptr3(J,I), real32)
                     END DO
                 END DO
-!$OMP END PARALLEL DO
             ELSE
-!$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(I, J)
-                DO J=1, size(entry%local_values_r4,dim=2)
-                    DO I=1, size(entry%local_values_r4,dim=1)
+                DO J=1, szJ
+!$OMP SIMD
+                    DO I=1, szI
                         entry%local_values_r4(I,J)=entry%local_values_r4(I,J)+real(entry%ptr3(I,J), real32)
                     END DO
                 END DO
-!$OMP END PARALLEL DO
             END IF
         END IF ! --> IF (entry%accuracy == i_real8) then
-        
+
         entry%addcounter=entry%addcounter+1
     END DO ! --> DO n=1, io_NSTREAMS
-    
-    ! Update 0D (scalar) means
+!$OMP END PARALLEL DO
+
+    ! Update 0D (scalar) means — small loop, leave serial
     DO n=1, io_NSTREAMS0D
         io_stream0D(n)%local_value = io_stream0D(n)%local_value + real(io_stream0D(n)%ptr, real64)
         io_stream0D(n)%addcounter = io_stream0D(n)%addcounter + 1
diff --git a/src/io_xios.F90 b/src/io_xios.F90
index 82263bd2d..e78555353 100644
--- a/src/io_xios.F90
+++ b/src/io_xios.F90
@@ -398,9 +398,11 @@ subroutine io_xios_apply_ice_mask_2d_r8(buf)
     integer :: i, n
     if (.not. associated(p_ice_conc)) return
     n = min(size(buf), size(p_ice_conc))
+!$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(i)
     do i = 1, n
        if (real(p_ice_conc(i), kind=8) < ICE_CONC_EPS) buf(i) = NC_FILL_DOUBLE
     end do
+!$OMP END PARALLEL DO
   end subroutine
 
   subroutine io_xios_apply_ice_mask_2d_r4(buf)
@@ -408,9 +410,11 @@ subroutine io_xios_apply_ice_mask_2d_r4(buf)
     integer :: i, n
     if (.not. associated(p_ice_conc)) return
     n = min(size(buf), size(p_ice_conc))
+!$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(i)
     do i = 1, n
        if (real(p_ice_conc(i), kind=4) < real(ICE_CONC_EPS, kind=4)) buf(i) = NC_FILL_FLOAT
     end do
+!$OMP END PARALLEL DO
   end subroutine
 
   !> Element-based ice mask: element is ice-covered iff any of its 3 vertex
@@ -418,11 +422,13 @@ subroutine io_xios_apply_ice_mask_2d_r4(buf)
   !> element indices come from owned_elem_local.
   subroutine io_xios_apply_ice_mask_2d_elem_r8(buf)
     real(kind=8), intent(inout) :: buf(:)
-    integer :: ee, e, n1, n2, n3
+    integer :: ee, e, n1, n2, n3, ne
     real(kind=8) :: amax
     if (.not. associated(p_ice_conc) .or. .not. associated(p_elem2D_nodes)) return
     if (.not. allocated(owned_elem_local)) return
-    do ee = 1, min(size(buf), n_owned_elem)
+    ne = min(size(buf), n_owned_elem)
+!$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(ee, e, n1, n2, n3, amax)
+    do ee = 1, ne
        e  = owned_elem_local(ee)
        n1 = p_elem2D_nodes(1, e)
        n2 = p_elem2D_nodes(2, e)
@@ -432,15 +438,18 @@ subroutine io_xios_apply_ice_mask_2d_elem_r8(buf)
                   real(p_ice_conc(n3), kind=8))
        if (amax < ICE_CONC_EPS) buf(ee) = NC_FILL_DOUBLE
     end do
+!$OMP END PARALLEL DO
   end subroutine
 
   subroutine io_xios_apply_ice_mask_2d_elem_r4(buf)
     real(kind=4), intent(inout) :: buf(:)
-    integer :: ee, e, n1, n2, n3
+    integer :: ee, e, n1, n2, n3, ne
     real(kind=4) :: amax
     if (.not. associated(p_ice_conc) .or. .not. associated(p_elem2D_nodes)) return
     if (.not. allocated(owned_elem_local)) return
-    do ee = 1, min(size(buf), n_owned_elem)
+    ne = min(size(buf), n_owned_elem)
+!$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(ee, e, n1, n2, n3, amax)
+    do ee = 1, ne
        e  = owned_elem_local(ee)
        n1 = p_elem2D_nodes(1, e)
        n2 = p_elem2D_nodes(2, e)
@@ -450,6 +459,7 @@ subroutine io_xios_apply_ice_mask_2d_elem_r4(buf)
                   real(p_ice_conc(n3), kind=4))
        if (amax < real(ICE_CONC_EPS, kind=4)) buf(ee) = NC_FILL_FLOAT
     end do
+!$OMP END PARALLEL DO
   end subroutine
 
 
@@ -471,44 +481,56 @@ subroutine io_xios_set_wet_ptrs(ulevels_nod, nlevels_nod, &
   !> for non-cavity runs where all nodes are wet at surface).
   subroutine io_xios_apply_wet_2d_r8(buf)
     real(kind=8), intent(inout) :: buf(:)
-    integer :: n
+    integer :: n, nn
     if (.not. associated(p_ulevels_nod)) return
-    do n = 1, min(size(buf), size(p_ulevels_nod))
+    nn = min(size(buf), size(p_ulevels_nod))
+!$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(n)
+    do n = 1, nn
        if (p_ulevels_nod(n) > 1) buf(n) = NC_FILL_DOUBLE
     end do
+!$OMP END PARALLEL DO
   end subroutine
 
   subroutine io_xios_apply_wet_2d_r4(buf)
     real(kind=4), intent(inout) :: buf(:)
-    integer :: n
+    integer :: n, nn
     if (.not. associated(p_ulevels_nod)) return
-    do n = 1, min(size(buf), size(p_ulevels_nod))
+    nn = min(size(buf), size(p_ulevels_nod))
+!$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(n)
+    do n = 1, nn
        if (p_ulevels_nod(n) > 1) buf(n) = NC_FILL_FLOAT
     end do
+!$OMP END PARALLEL DO
   end subroutine
 
   !> 2D element field (strictly-owned subset): NC_FILL where any of the
   !> 3 vertex nodes has ulevels > 1 (cavity element).
   subroutine io_xios_apply_wet_2d_elem_r8(buf)
     real(kind=8), intent(inout) :: buf(:)
-    integer :: ee, e
+    integer :: ee, e, ne
     if (.not. associated(p_ulevels_elem)) return
     if (.not. allocated(owned_elem_local)) return
-    do ee = 1, min(size(buf), n_owned_elem)
+    ne = min(size(buf), n_owned_elem)
+!$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(ee, e)
+    do ee = 1, ne
        e = owned_elem_local(ee)
        if (p_ulevels_elem(e) > 1) buf(ee) = NC_FILL_DOUBLE
     end do
+!$OMP END PARALLEL DO
   end subroutine
 
   subroutine io_xios_apply_wet_2d_elem_r4(buf)
     real(kind=4), intent(inout) :: buf(:)
-    integer :: ee, e
+    integer :: ee, e, ne
     if (.not. associated(p_ulevels_elem)) return
     if (.not. allocated(owned_elem_local)) return
-    do ee = 1, min(size(buf), n_owned_elem)
+    ne = min(size(buf), n_owned_elem)
+!$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(ee, e)
+    do ee = 1, ne
        e = owned_elem_local(ee)
        if (p_ulevels_elem(e) > 1) buf(ee) = NC_FILL_FLOAT
     end do
+!$OMP END PARALLEL DO
   end subroutine
 
   !> 3D node field, axis-first (nz, nn). Valid range is
@@ -520,9 +542,10 @@ subroutine io_xios_apply_wet_3d_r8(buf)
     integer :: n, k, nz, nn, ub, un, bn
     logical :: is_interface
     if (.not. associated(p_ulevels_nod) .or. .not. associated(p_nlevels_nod)) return
-    nz = size(buf, 1); nn = size(buf, 2)
+    nz = size(buf, 1); nn = min(size(buf, 2), size(p_ulevels_nod))
     is_interface = (p_nl > 0 .and. nz == p_nl)
-    do n = 1, min(nn, size(p_ulevels_nod))
+!$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(n, k, un, bn, ub)
+    do n = 1, nn
        un = p_ulevels_nod(n)
        bn = p_nlevels_nod(n)
        ub = bn; if (.not. is_interface) ub = bn - 1
@@ -530,6 +553,7 @@ subroutine io_xios_apply_wet_3d_r8(buf)
           if (k < un .or. k > ub) buf(k, n) = NC_FILL_DOUBLE
        end do
     end do
+!$OMP END PARALLEL DO
   end subroutine
 
   subroutine io_xios_apply_wet_3d_r4(buf)
@@ -537,9 +561,10 @@ subroutine io_xios_apply_wet_3d_r4(buf)
     integer :: n, k, nz, nn, ub, un, bn
     logical :: is_interface
     if (.not. associated(p_ulevels_nod) .or. .not. associated(p_nlevels_nod)) return
-    nz = size(buf, 1); nn = size(buf, 2)
+    nz = size(buf, 1); nn = min(size(buf, 2), size(p_ulevels_nod))
     is_interface = (p_nl > 0 .and. nz == p_nl)
-    do n = 1, min(nn, size(p_ulevels_nod))
+!$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(n, k, un, bn, ub)
+    do n = 1, nn
        un = p_ulevels_nod(n)
        bn = p_nlevels_nod(n)
        ub = bn; if (.not. is_interface) ub = bn - 1
@@ -547,6 +572,7 @@ subroutine io_xios_apply_wet_3d_r4(buf)
           if (k < un .or. k > ub) buf(k, n) = NC_FILL_FLOAT
        end do
     end do
+!$OMP END PARALLEL DO
   end subroutine
 
   !> 3D element field, axis-first (nz, ne_owned). Same bound logic as
@@ -557,9 +583,10 @@ subroutine io_xios_apply_wet_3d_elem_r8(buf)
     logical :: is_interface
     if (.not. associated(p_ulevels_elem) .or. .not. associated(p_nlevels_elem)) return
     if (.not. allocated(owned_elem_local)) return
-    nz = size(buf, 1); ne = size(buf, 2)
+    nz = size(buf, 1); ne = min(size(buf, 2), n_owned_elem)
     is_interface = (p_nl > 0 .and. nz == p_nl)
-    do ee = 1, min(ne, n_owned_elem)
+!$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(ee, e, k, ue, be, ub)
+    do ee = 1, ne
        e = owned_elem_local(ee)
        ue = p_ulevels_elem(e)
        be = p_nlevels_elem(e)
@@ -568,6 +595,7 @@ subroutine io_xios_apply_wet_3d_elem_r8(buf)
           if (k < ue .or. k > ub) buf(k, ee) = NC_FILL_DOUBLE
        end do
     end do
+!$OMP END PARALLEL DO
   end subroutine
 
   subroutine io_xios_apply_wet_3d_elem_r4(buf)
@@ -576,9 +604,10 @@ subroutine io_xios_apply_wet_3d_elem_r4(buf)
     logical :: is_interface
     if (.not. associated(p_ulevels_elem) .or. .not. associated(p_nlevels_elem)) return
     if (.not. allocated(owned_elem_local)) return
-    nz = size(buf, 1); ne = size(buf, 2)
+    nz = size(buf, 1); ne = min(size(buf, 2), n_owned_elem)
     is_interface = (p_nl > 0 .and. nz == p_nl)
-    do ee = 1, min(ne, n_owned_elem)
+!$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(ee, e, k, ue, be, ub)
+    do ee = 1, ne
        e = owned_elem_local(ee)
        ue = p_ulevels_elem(e)
        be = p_nlevels_elem(e)
@@ -587,6 +616,7 @@ subroutine io_xios_apply_wet_3d_elem_r4(buf)
           if (k < ue .or. k > ub) buf(k, ee) = NC_FILL_FLOAT
        end do
     end do
+!$OMP END PARALLEL DO
   end subroutine
 
 
diff --git a/src/oce_ale.F90 b/src/oce_ale.F90
index 0f8e93b57..212538acc 100644
--- a/src/oce_ale.F90
+++ b/src/oce_ale.F90
@@ -3348,7 +3348,7 @@ subroutine oce_timestep_ale(n, ice, dynamics, tracers, partit, mesh)
 #if defined (FESOM_PROFILING)
     use fesom_profiler
 #endif
-    
+
     IMPLICIT NONE
     integer       , intent(in)            :: n
     type(t_dyn)   , intent(inout), target :: dynamics
@@ -3498,13 +3498,13 @@ subroutine oce_timestep_ale(n, ice, dynamics, tracers, partit, mesh)
         call calc_cvmix_tidal(partit, mesh)
         
     end if
-#endif    
+#endif
     t1=MPI_Wtime()
 #if defined (FESOM_PROFILING)
     call fesom_profiler_end("oce_mix_pres")
     call fesom_profiler_start("oce_dyn_momentum")
-#endif    
-    
+#endif
+
     !___________________________________________________________________________
     ! add contribution from momentum advection, coriolis and pressure gradient |
     ! force to UV_rhs
@@ -3603,7 +3603,7 @@ subroutine oce_timestep_ale(n, ice, dynamics, tracers, partit, mesh)
             call impl_vert_visc_ale(dynamics,partit, mesh)
         else
             call impl_vert_visc_ale_vtransp(dynamics, partit, mesh)
-        end if 
+        end if
     end if
     
     !___________________________________________________________________________
diff --git a/src/oce_ale_mixing_kpp.F90 b/src/oce_ale_mixing_kpp.F90
index c7204cd46..fa1c7c5f6 100755
--- a/src/oce_ale_mixing_kpp.F90
+++ b/src/oce_ale_mixing_kpp.F90
@@ -277,12 +277,13 @@ SUBROUTINE oce_mixing_KPP(viscAE, diffK, dynamics, tracers, partit, mesh)
 #include "associate_mesh_ass.h"
     UVnode=>dynamics%uvnode(:,:,:)
 
-!$OMP PARALLEL DO
+!$OMP PARALLEL DEFAULT(SHARED) PRIVATE(node, nz, nzmin, nzmax, usurf, vsurf, u_loc, v_loc)
+!$OMP DO
      DO node=1, myDim_nod2D+eDim_nod2D
         ViscA(:, node) = 0.0_WP
      END DO
-!$OMP END PARALLEL DO
-!$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(node, nz, nzmin, nzmax, usurf, vsurf, u_loc, v_loc)
+!$OMP END DO
+!$OMP DO
      DO node=1, myDim_nod2D !+eDim_nod2D
         nzmin = ulevels_nod2D(node)
         nzmax = nlevels_nod2D(node)
@@ -320,7 +321,8 @@ SUBROUTINE oce_mixing_KPP(viscAE, diffK, dynamics, tracers, partit, mesh)
          END DO
          dVsq ( nzmax, node ) = dVsq  ( nzmax-1, node )
      END DO
-!$OMP END PARALLEL DO
+!$OMP END DO
+!$OMP END PARALLEL
 
 !      *******************************************************************
 !       compute thermal and haline expansion coefficients (without factor of rho).
@@ -486,18 +488,17 @@ SUBROUTINE bldepth(partit, mesh)
 #include "associate_part_ass.h"
 #include "associate_mesh_ass.h"
 
-!$OMP PARALLEL DO
+!$OMP PARALLEL DEFAULT(SHARED) PRIVATE(node, nz, nzmin, nzmax, coeff_sw, Rib_km1, zk, zkm1, sigma, zehat, &
+!$OMP                                  wm, ws, bvsq, Vtsq, Ritop, Rib_k, dzup, hekman, hmonob, hlimit)
+!$OMP DO
      ! Initialize hbl and kbl to bottomed out values
      DO node=1, myDim_nod2D !+eDim_nod2D
         ! Index of first grid level below hbl
-        kbl(node) = nlevels_nod2D(node)      
+        kbl(node) = nlevels_nod2D(node)
         ! Boundary layer depth
         hbl(node) = ABS( zbar_3d_n( nlevels_nod2d(node),node ) )
      END DO
-!$OMP END PARALLEL DO
-
-!$OMP PARALLEL DEFAULT(SHARED) PRIVATE(node, nz, nzmin, nzmax, coeff_sw, Rib_km1, zk, zkm1, sigma, zehat, &
-!$OMP                                  wm, ws, bvsq, Vtsq, Ritop, Rib_k, dzup, hekman, hmonob, hlimit)
+!$OMP END DO
 !$OMP DO
      DO node=1, myDim_nod2D !+eDim_nod2D
         nzmin = ulevels_nod2D(node)
@@ -967,16 +968,16 @@ subroutine blmix_kpp(viscA,diffK, partit, mesh)
 #include "associate_mesh_def.h"
 #include "associate_part_ass.h"
 #include "associate_mesh_ass.h"
-!$OMP PARALLEL DO
+!$OMP PARALLEL DEFAULT(SHARED) PRIVATE(node, nz, kn, knm1, knp1, nl1, nu1, delhat, R, dvdzup, dvdzdn, viscp, difsp, diftp, visch, difsh, difth, f1, sig, &
+!$OMP                                            a1, a2, a3, Gm, Gs, Gt, sigma, zehat, wm, ws, gat1m, gat1t, gat1s, dat1m, dat1s, dat1t, dthick, diff_col)
+!$OMP DO
      DO node=1, myDim_nod2D+eDim_nod2D
         blmc   (:, node,  :) = 0.0_WP
      END DO
-!$OMP END PARALLEL DO
+!$OMP END DO
 !    *******************************************************************
-!     Kv over the NODE 
+!     Kv over the NODE
 !    *******************************************************************
-!$OMP PARALLEL DEFAULT(SHARED) PRIVATE(node, nz, kn, knm1, knp1, nl1, nu1, delhat, R, dvdzup, dvdzdn, viscp, difsp, diftp, visch, difsh, difth, f1, sig, &
-!$OMP                                            a1, a2, a3, Gm, Gs, Gt, sigma, zehat, wm, ws, gat1m, gat1t, gat1s, dat1m, dat1s, dat1t, dthick, diff_col)
 !$OMP DO
      DO node=1, myDim_nod2D
         nl1=nlevels_nod2d(node)
diff --git a/src/oce_ale_tracer.F90 b/src/oce_ale_tracer.F90
index 7c88a2c5c..0c4549297 100644
--- a/src/oce_ale_tracer.F90
+++ b/src/oce_ale_tracer.F90
@@ -200,17 +200,19 @@ subroutine solve_tracers_ale(ice, dynamics, tracers, partit, mesh)
     ! 1. bolus velocities are computed according to GM implementation after R. Ferrari et al., 2010
     ! 2. bolus velocities are used only for advecting tracers and shall be subtracted back afterwards
     if (Fer_GM) then
-!$OMP PARALLEL DO
+!$OMP PARALLEL DEFAULT(SHARED) PRIVATE(elem, node)
+!$OMP DO
         do elem=1, myDim_elem2D+eDim_elem2D
            UV(:, :, elem)    =UV(:, :, elem) + fer_UV(:, :, elem)
         end do
-!$OMP END PARALLEL DO
-!$OMP PARALLEL DO
+!$OMP END DO
+!$OMP DO
         do node=1, myDim_nod2D+eDim_nod2D
            Wvel_e(:, node)=Wvel_e(:, node)+fer_Wvel(:, node)
            Wvel  (:, node)=Wvel  (:, node)+fer_Wvel(:, node)
         end do
-!$OMP END PARALLEL DO
+!$OMP END DO
+!$OMP END PARALLEL
     end if
 
     ! Set advective and diffusive components of total tracer fluxes to zero
@@ -329,17 +331,19 @@ subroutine solve_tracers_ale(ice, dynamics, tracers, partit, mesh)
     !___________________________________________________________________________
     ! subtract the the bolus velocities back from 3D velocities:
     if (Fer_GM) then
-!$OMP PARALLEL DO
+!$OMP PARALLEL DEFAULT(SHARED) PRIVATE(elem, node)
+!$OMP DO
         do elem=1, myDim_elem2D+eDim_elem2D
            UV(:, :, elem)    =UV(:, :, elem) - fer_UV(:, :, elem)
         end do
-!$OMP END PARALLEL DO
-!$OMP PARALLEL DO
+!$OMP END DO
+!$OMP DO
         do node=1, myDim_nod2D+eDim_nod2D
            Wvel_e(:, node)=Wvel_e(:, node)-fer_Wvel(:, node)
            Wvel  (:, node)=Wvel  (:, node)-fer_Wvel(:, node)
         end do
-!$OMP END PARALLEL DO
+!$OMP END DO
+!$OMP END PARALLEL
     end if
     
     ! TODO: do it only when it is coupled to atmosphere 
diff --git a/src/oce_ale_vel_rhs.F90 b/src/oce_ale_vel_rhs.F90
index 2cf0ab2b0..96116ba4f 100644
--- a/src/oce_ale_vel_rhs.F90
+++ b/src/oce_ale_vel_rhs.F90
@@ -288,7 +288,8 @@ subroutine compute_vel_rhs(ice, dynamics, partit, mesh)
         ff=1.0_WP
         lfirst=.false.
     end if
-!$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(elem, nz, nzmin, nzmax)
+!$OMP PARALLEL DEFAULT(SHARED) PRIVATE(elem, nz, nzmin, nzmax)
+!$OMP DO
     do elem=1, myDim_elem2D
         nzmin = ulevels(elem)
         nzmax = nlevels(elem)
@@ -298,19 +299,19 @@ subroutine compute_vel_rhs(ice, dynamics, partit, mesh)
             !                       |                   |
             !                       V                   V
             !        fAB = (f_pgf - 1/2*fab_n-1)    +3/2*fab_n
-            !        
-            ! until here: UV_rhs = dt*[ (R_advec + R_coriolis)^n + R_pressure] 
-            ! --> horizontal viscosity contribution still missing is added in 
-            !     call viscosity_filter      
-            ! --> vertical viscosity contribution still missing is added in 
-            !     call impl_vert_visc_ale      
+            !
+            ! until here: UV_rhs = dt*[ (R_advec + R_coriolis)^n + R_pressure]
+            ! --> horizontal viscosity contribution still missing is added in
+            !     call viscosity_filter
+            ! --> vertical viscosity contribution still missing is added in
+            !     call impl_vert_visc_ale
         end do
     end do
-!$OMP END PARALLEL DO
+!$OMP END DO
 
     !___________________________________________________________________________
     if (dynamics%ldiag_ke) then
-!$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(elem, nz, nzmin, nzmax)
+!$OMP DO
         do elem=1, myDim_elem2D
             nzmin = ulevels(elem)
             nzmax = nlevels(elem)
@@ -319,8 +320,9 @@ subroutine compute_vel_rhs(ice, dynamics, partit, mesh)
                 dynamics%ke_cor(:,nz,elem)=dt*(dynamics%ke_cor(:,nz,elem)+dynamics%ke_cor_AB(1,:,nz,elem)*ff)/elem_area(elem)
             end do
         end do
-!$OMP END PARALLEL DO
+!$OMP END DO
     end if
+!$OMP END PARALLEL
     
     ! =======================  
     ! U_rhs contains all contributions to velocity from old time steps   
diff --git a/src/oce_setup_step.F90 b/src/oce_setup_step.F90
index 48a5c84a1..7e3147300 100755
--- a/src/oce_setup_step.F90
+++ b/src/oce_setup_step.F90
@@ -275,10 +275,10 @@ subroutine ocean_setup(dynamics, tracers, partit, mesh)
     
     !___________________________________________________________________________
     ! precompute mask for GM/Redi upscalling in the GINsea
-    if ((Fer_GM .or. Redi) .and. scaling_GINsea) then 
+    if ((Fer_GM .or. Redi) .and. scaling_GINsea) then
         call init_RediGM_GINsea_mask(partit, mesh)
-    end if 
-    
+    end if
+
     !___________________________________________________________________________
     if(partit%mype==0) write(*,*) 'Initial state'
     if (dynamics%use_wsplit .and. partit%mype==0) then
diff --git a/src/oce_tracer_mod.F90 b/src/oce_tracer_mod.F90
index 2fb963fe0..a5374d739 100755
--- a/src/oce_tracer_mod.F90
+++ b/src/oce_tracer_mod.F90
@@ -26,6 +26,7 @@ SUBROUTINE init_tracers_AB(tr_num, tracers, partit, mesh)
     integer                               :: n,nz 
 
 #ifndef ENABLE_OPENACC
+!$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(n, nz)
 #else
 !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(PRESENT)
 #endif
@@ -38,14 +39,19 @@ SUBROUTINE init_tracers_AB(tr_num, tracers, partit, mesh)
        end do
 end do
 #ifndef ENABLE_OPENACC
+!$OMP END PARALLEL DO
 #else
 !$ACC END PARALLEL LOOP
 #endif
 
-    ! AB interpolation
+    ! AB interpolation. Both the valuesAB compute and the valuesold update
+    ! were originally separate PARALLEL DO regions; combined here into a
+    ! single PARALLEL with two DOs, saving one fork-join per tracer per
+    ! timestep.
     if (tracers%data(tr_num)%AB_order==2) then
 #ifndef ENABLE_OPENACC
-!$OMP PARALLEL DO
+!$OMP PARALLEL DEFAULT(SHARED) PRIVATE(n)
+!$OMP DO
 #else
 !ACC PARALLEL LOOP DEFAULT(PRESENT)
 #endif
@@ -53,13 +59,25 @@ SUBROUTINE init_tracers_AB(tr_num, tracers, partit, mesh)
            tracers%data(tr_num)%valuesAB(:, n)  =-(0.5_WP+epsilon)*tracers%data(tr_num)%valuesold(1, :, n)+(1.5_WP+epsilon)*tracers%data(tr_num)%values(:, n)
        end do
 #ifndef ENABLE_OPENACC
-!$OMP END PARALLEL DO
+!$OMP END DO
+!$OMP DO
+#else
+!ACC END PARALLEL LOOP
+!ACC PARALLEL LOOP DEFAULT(PRESENT)
+#endif
+       do n=1, partit%myDim_nod2d+partit%eDim_nod2D
+          tracers%data(tr_num)%valuesold(1, :, n)=tracers%data(tr_num)%values(:, n)
+       end do
+#ifndef ENABLE_OPENACC
+!$OMP END DO
+!$OMP END PARALLEL
 #else
 !ACC END PARALLEL LOOP
 #endif
     elseif (tracers%data(tr_num)%AB_order==3) then
 #ifndef ENABLE_OPENACC
-!$OMP PARALLEL DO
+!$OMP PARALLEL DEFAULT(SHARED) PRIVATE(n)
+!$OMP DO
 #else
 !ACC PARALLEL LOOP DEFAULT(PRESENT)
 #endif
@@ -68,7 +86,19 @@ SUBROUTINE init_tracers_AB(tr_num, tracers, partit, mesh)
            tracers%data(tr_num)%valuesAB(:, n)  =tracers%data(tr_num)%valuesAB(:, n)/12.0_WP
        end do
 #ifndef ENABLE_OPENACC
-!$OMP END PARALLEL DO
+!$OMP END DO
+!$OMP DO
+#else
+!ACC END PARALLEL LOOP
+!ACC PARALLEL LOOP DEFAULT(PRESENT)
+#endif
+       do n=1, partit%myDim_nod2d+partit%eDim_nod2D
+          tracers%data(tr_num)%valuesold(2, :, n)=tracers%data(tr_num)%valuesold(1, :, n)
+          tracers%data(tr_num)%valuesold(1, :, n)=tracers%data(tr_num)%values(:, n)
+       end do
+#ifndef ENABLE_OPENACC
+!$OMP END DO
+!$OMP END PARALLEL
 #else
 !ACC END PARALLEL LOOP
 #endif
@@ -91,37 +121,6 @@ SUBROUTINE init_tracers_AB(tr_num, tracers, partit, mesh)
        call par_ex(partit%MPI_COMM_FESOM, partit%mype, 0)
     end if
 
-    if (tracers%data(tr_num)%AB_order==2) then
-#ifndef ENABLE_OPENACC
-!$OMP PARALLEL DO
-#else
-!ACC PARALLEL LOOP DEFAULT(PRESENT)
-#endif
-       do n=1, partit%myDim_nod2d+partit%eDim_nod2D
-          tracers%data(tr_num)%valuesold(1, :, n)=tracers%data(tr_num)%values(:, n)
-       end do
-#ifndef ENABLE_OPENACC
-!$OMP END PARALLEL DO
-#else
-!ACC END PARALLEL LOOP
-#endif
-    elseif (tracers%data(tr_num)%AB_order==3) then
-#ifndef ENABLE_OPENACC
-!$OMP PARALLEL DO
-#else
-!ACC PARALLEL LOOP DEFAULT(PRESENT)
-#endif
-       do n=1, partit%myDim_nod2d+partit%eDim_nod2D
-          tracers%data(tr_num)%valuesold(2, :, n)=tracers%data(tr_num)%valuesold(1, :, n)
-          tracers%data(tr_num)%valuesold(1, :, n)=tracers%data(tr_num)%values(:, n)
-       end do
-#ifndef ENABLE_OPENACC
-!$OMP END PARALLEL DO
-#else
-!ACC END PARALLEL LOOP
-#endif
-    end if
-
     if (flag_debug .and. partit%mype==0)  print *, achar(27)//'[38m'//'             --> call tracer_gradient_elements'//achar(27)//'[0m'
 !PS     call tracer_gradient_elements(tracers%data(tr_num)%valuesAB, partit, mesh)
     call tracer_gradient_elements(tracers%data(tr_num)%values, partit, mesh)