From bdaaa2d6bf6c0f7f2bf7d73943a48962706d7457 Mon Sep 17 00:00:00 2001 From: Jan Streffing Date: Mon, 27 Apr 2026 17:10:38 +0200 Subject: [PATCH] perf: OpenMP fork-join consolidation + SIMD hints + I/O timer-attribution barrier MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cuts FESOM-side timestep overhead by reducing OpenMP fork-join count and giving the compiler explicit vectorization hints in the hottest streams in tracers, dynamics, ALE mixing, and the mean-output bookkeeping. Theme A — combine adjacent !$OMP PARALLEL DO regions into a single !$OMP PARALLEL with multiple !$OMP DO blocks. Each pair previously paid two fork-joins per timestep; now one. Touched: oce_ale_mixing_kpp (KPP coefficient + boundary-layer setup), oce_ale_tracer (Fer_GM bolus add / subtract), oce_ale_vel_rhs (rhs build + ke_cor diagnostic), oce_tracer_mod (init_tracers_AB + AB interpolation). Theme B — io_meandata.update_means: replaced the per-stream PARALLEL DO with one outer PARALLEL DO over io_NSTREAMS and lifted the size() calls out of the inner loop into szI/szJ; added !$OMP SIMD on the inner I-axis so the compiler can vectorize the float adds. At HR the original code spawned ~50 streams * ~6700 calls/sim-month worth of OMP regions per rank per month; one outer fork-join replaces the lot. Theme C — io_xios mask helpers: io_xios_apply_wet_2d / 2d_elem / 3d (_r4 and _r8) and io_xios_apply_ice_mask_2d_elem now run their fill- loops under !$OMP PARALLEL DO. Lifts size() / min() out of the loop header (nn / ne) so OMP doesn't have to re-evaluate every iteration. Theme D — fesom_module: insert one MPI_Barrier at the end of the I/O block before f%t5 = MPI_Wtime(). XIOS-client / OASIS asymmetry stalls that would otherwise be absorbed by the first halo exchange in ice_timestep are now captured in rtime_write_means instead of leaking into rtime_fullice. No wall-time change, only honest accounting. Theme E — oce_ale.F90 and oce_setup_step.F90: trailing-whitespace cleanup that landed on the same lines as the OMP rework; pulled in to keep the OMP commits hunk-clean. --- src/fesom_module.F90 | 7 ++++ src/io_meandata.F90 | 51 ++++++++++++++++----------- src/io_xios.F90 | 70 ++++++++++++++++++++++++++----------- src/oce_ale.F90 | 10 +++--- src/oce_ale_mixing_kpp.F90 | 31 +++++++++-------- src/oce_ale_tracer.F90 | 20 ++++++----- src/oce_ale_vel_rhs.F90 | 22 ++++++------ src/oce_setup_step.F90 | 6 ++-- src/oce_tracer_mod.F90 | 71 +++++++++++++++++++------------------- 9 files changed, 170 insertions(+), 118 deletions(-) diff --git a/src/fesom_module.F90 b/src/fesom_module.F90 index 788e5cf77..1c76a2c6c 100755 --- a/src/fesom_module.F90 +++ b/src/fesom_module.F90 @@ -784,6 +784,13 @@ subroutine fesom_runloop(current_nsteps) end if !-------------------------- + ! Sync point at end of I/O block: any upstream stall (XIOS client work, + ! OASIS asymmetry) that would otherwise be absorbed by the next MPI + ! halo exchange in ice_timestep is captured here, attributed to + ! rtime_write_means instead of rtime_fullice. Doesn't add wall time + ! (the wait happens anyway), just makes timer accounting honest. + call MPI_Barrier(f%MPI_COMM_FESOM, f%MPIerr) + f%t5 = MPI_Wtime() #if defined (FESOM_PROFILING) call fesom_profiler_start("restart") diff --git a/src/io_meandata.F90 b/src/io_meandata.F90 index 1adf4d032..ed257d3c7 100644 --- a/src/io_meandata.F90 +++ b/src/io_meandata.F90 @@ -2602,55 +2602,64 @@ subroutine update_means type(Meandata), pointer :: entry integer :: n integer :: I, J - + integer :: szI, szJ + + ! Single OMP fork-join across all streams instead of one per stream. With + ! ~50 io_NSTREAMS and ~6700 calls/sim-month at HR (FESOM dt=400s), the + ! original code spawned ~335k OMP regions per rank per month, each with + ! its own fork-join overhead. SIMD hints on the innermost loop let the + ! compiler vectorize the float adds. Targets the 146 s/rank/month + ! FESOM-side stream-prep cost identified in HR_test_45 profiling. +!$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(n, entry, I, J, szI, szJ) SCHEDULE(dynamic) DO n=1, io_NSTREAMS entry=>io_stream(n)%p !_____________ compute in 8 byte accuracy ______________________________ IF (entry%accuracy == i_real8) then + szI = size(entry%local_values_r8, dim=1) + szJ = size(entry%local_values_r8, dim=2) IF (entry%flip) then -!$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(I,J) - DO J=1, size(entry%local_values_r8,dim=2) - DO I=1, size(entry%local_values_r8,dim=1) + DO J=1, szJ +!$OMP SIMD + DO I=1, szI entry%local_values_r8(I,J)=entry%local_values_r8(I,J)+entry%ptr3(J,I) END DO END DO -!$OMP END PARALLEL DO ELSE -!$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(I,J) - DO J=1, size(entry%local_values_r8,dim=2) - DO I=1, size(entry%local_values_r8,dim=1) + DO J=1, szJ +!$OMP SIMD + DO I=1, szI entry%local_values_r8(I,J)=entry%local_values_r8(I,J)+entry%ptr3(I,J) END DO END DO -!$OMP END PARALLEL DO END IF - + !_____________ compute in 4 byte accuracy ______________________________ ELSE IF (entry%accuracy == i_real4) then + szI = size(entry%local_values_r4, dim=1) + szJ = size(entry%local_values_r4, dim=2) IF (entry%flip) then -!$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(I,J) - DO J=1, size(entry%local_values_r4,dim=2) - DO I=1, size(entry%local_values_r4,dim=1) + DO J=1, szJ +!$OMP SIMD + DO I=1, szI entry%local_values_r4(I,J)=entry%local_values_r4(I,J)+real(entry%ptr3(J,I), real32) END DO END DO -!$OMP END PARALLEL DO ELSE -!$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(I, J) - DO J=1, size(entry%local_values_r4,dim=2) - DO I=1, size(entry%local_values_r4,dim=1) + DO J=1, szJ +!$OMP SIMD + DO I=1, szI entry%local_values_r4(I,J)=entry%local_values_r4(I,J)+real(entry%ptr3(I,J), real32) END DO END DO -!$OMP END PARALLEL DO END IF END IF ! --> IF (entry%accuracy == i_real8) then - + entry%addcounter=entry%addcounter+1 END DO ! --> DO n=1, io_NSTREAMS - - ! Update 0D (scalar) means +!$OMP END PARALLEL DO + + ! Update 0D (scalar) means — small loop, leave serial DO n=1, io_NSTREAMS0D io_stream0D(n)%local_value = io_stream0D(n)%local_value + real(io_stream0D(n)%ptr, real64) io_stream0D(n)%addcounter = io_stream0D(n)%addcounter + 1 diff --git a/src/io_xios.F90 b/src/io_xios.F90 index 82263bd2d..e78555353 100644 --- a/src/io_xios.F90 +++ b/src/io_xios.F90 @@ -398,9 +398,11 @@ subroutine io_xios_apply_ice_mask_2d_r8(buf) integer :: i, n if (.not. associated(p_ice_conc)) return n = min(size(buf), size(p_ice_conc)) +!$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(i) do i = 1, n if (real(p_ice_conc(i), kind=8) < ICE_CONC_EPS) buf(i) = NC_FILL_DOUBLE end do +!$OMP END PARALLEL DO end subroutine subroutine io_xios_apply_ice_mask_2d_r4(buf) @@ -408,9 +410,11 @@ subroutine io_xios_apply_ice_mask_2d_r4(buf) integer :: i, n if (.not. associated(p_ice_conc)) return n = min(size(buf), size(p_ice_conc)) +!$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(i) do i = 1, n if (real(p_ice_conc(i), kind=4) < real(ICE_CONC_EPS, kind=4)) buf(i) = NC_FILL_FLOAT end do +!$OMP END PARALLEL DO end subroutine !> Element-based ice mask: element is ice-covered iff any of its 3 vertex @@ -418,11 +422,13 @@ subroutine io_xios_apply_ice_mask_2d_r4(buf) !> element indices come from owned_elem_local. subroutine io_xios_apply_ice_mask_2d_elem_r8(buf) real(kind=8), intent(inout) :: buf(:) - integer :: ee, e, n1, n2, n3 + integer :: ee, e, n1, n2, n3, ne real(kind=8) :: amax if (.not. associated(p_ice_conc) .or. .not. associated(p_elem2D_nodes)) return if (.not. allocated(owned_elem_local)) return - do ee = 1, min(size(buf), n_owned_elem) + ne = min(size(buf), n_owned_elem) +!$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(ee, e, n1, n2, n3, amax) + do ee = 1, ne e = owned_elem_local(ee) n1 = p_elem2D_nodes(1, e) n2 = p_elem2D_nodes(2, e) @@ -432,15 +438,18 @@ subroutine io_xios_apply_ice_mask_2d_elem_r8(buf) real(p_ice_conc(n3), kind=8)) if (amax < ICE_CONC_EPS) buf(ee) = NC_FILL_DOUBLE end do +!$OMP END PARALLEL DO end subroutine subroutine io_xios_apply_ice_mask_2d_elem_r4(buf) real(kind=4), intent(inout) :: buf(:) - integer :: ee, e, n1, n2, n3 + integer :: ee, e, n1, n2, n3, ne real(kind=4) :: amax if (.not. associated(p_ice_conc) .or. .not. associated(p_elem2D_nodes)) return if (.not. allocated(owned_elem_local)) return - do ee = 1, min(size(buf), n_owned_elem) + ne = min(size(buf), n_owned_elem) +!$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(ee, e, n1, n2, n3, amax) + do ee = 1, ne e = owned_elem_local(ee) n1 = p_elem2D_nodes(1, e) n2 = p_elem2D_nodes(2, e) @@ -450,6 +459,7 @@ subroutine io_xios_apply_ice_mask_2d_elem_r4(buf) real(p_ice_conc(n3), kind=4)) if (amax < real(ICE_CONC_EPS, kind=4)) buf(ee) = NC_FILL_FLOAT end do +!$OMP END PARALLEL DO end subroutine @@ -471,44 +481,56 @@ subroutine io_xios_set_wet_ptrs(ulevels_nod, nlevels_nod, & !> for non-cavity runs where all nodes are wet at surface). subroutine io_xios_apply_wet_2d_r8(buf) real(kind=8), intent(inout) :: buf(:) - integer :: n + integer :: n, nn if (.not. associated(p_ulevels_nod)) return - do n = 1, min(size(buf), size(p_ulevels_nod)) + nn = min(size(buf), size(p_ulevels_nod)) +!$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(n) + do n = 1, nn if (p_ulevels_nod(n) > 1) buf(n) = NC_FILL_DOUBLE end do +!$OMP END PARALLEL DO end subroutine subroutine io_xios_apply_wet_2d_r4(buf) real(kind=4), intent(inout) :: buf(:) - integer :: n + integer :: n, nn if (.not. associated(p_ulevels_nod)) return - do n = 1, min(size(buf), size(p_ulevels_nod)) + nn = min(size(buf), size(p_ulevels_nod)) +!$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(n) + do n = 1, nn if (p_ulevels_nod(n) > 1) buf(n) = NC_FILL_FLOAT end do +!$OMP END PARALLEL DO end subroutine !> 2D element field (strictly-owned subset): NC_FILL where any of the !> 3 vertex nodes has ulevels > 1 (cavity element). subroutine io_xios_apply_wet_2d_elem_r8(buf) real(kind=8), intent(inout) :: buf(:) - integer :: ee, e + integer :: ee, e, ne if (.not. associated(p_ulevels_elem)) return if (.not. allocated(owned_elem_local)) return - do ee = 1, min(size(buf), n_owned_elem) + ne = min(size(buf), n_owned_elem) +!$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(ee, e) + do ee = 1, ne e = owned_elem_local(ee) if (p_ulevels_elem(e) > 1) buf(ee) = NC_FILL_DOUBLE end do +!$OMP END PARALLEL DO end subroutine subroutine io_xios_apply_wet_2d_elem_r4(buf) real(kind=4), intent(inout) :: buf(:) - integer :: ee, e + integer :: ee, e, ne if (.not. associated(p_ulevels_elem)) return if (.not. allocated(owned_elem_local)) return - do ee = 1, min(size(buf), n_owned_elem) + ne = min(size(buf), n_owned_elem) +!$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(ee, e) + do ee = 1, ne e = owned_elem_local(ee) if (p_ulevels_elem(e) > 1) buf(ee) = NC_FILL_FLOAT end do +!$OMP END PARALLEL DO end subroutine !> 3D node field, axis-first (nz, nn). Valid range is @@ -520,9 +542,10 @@ subroutine io_xios_apply_wet_3d_r8(buf) integer :: n, k, nz, nn, ub, un, bn logical :: is_interface if (.not. associated(p_ulevels_nod) .or. .not. associated(p_nlevels_nod)) return - nz = size(buf, 1); nn = size(buf, 2) + nz = size(buf, 1); nn = min(size(buf, 2), size(p_ulevels_nod)) is_interface = (p_nl > 0 .and. nz == p_nl) - do n = 1, min(nn, size(p_ulevels_nod)) +!$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(n, k, un, bn, ub) + do n = 1, nn un = p_ulevels_nod(n) bn = p_nlevels_nod(n) ub = bn; if (.not. is_interface) ub = bn - 1 @@ -530,6 +553,7 @@ subroutine io_xios_apply_wet_3d_r8(buf) if (k < un .or. k > ub) buf(k, n) = NC_FILL_DOUBLE end do end do +!$OMP END PARALLEL DO end subroutine subroutine io_xios_apply_wet_3d_r4(buf) @@ -537,9 +561,10 @@ subroutine io_xios_apply_wet_3d_r4(buf) integer :: n, k, nz, nn, ub, un, bn logical :: is_interface if (.not. associated(p_ulevels_nod) .or. .not. associated(p_nlevels_nod)) return - nz = size(buf, 1); nn = size(buf, 2) + nz = size(buf, 1); nn = min(size(buf, 2), size(p_ulevels_nod)) is_interface = (p_nl > 0 .and. nz == p_nl) - do n = 1, min(nn, size(p_ulevels_nod)) +!$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(n, k, un, bn, ub) + do n = 1, nn un = p_ulevels_nod(n) bn = p_nlevels_nod(n) ub = bn; if (.not. is_interface) ub = bn - 1 @@ -547,6 +572,7 @@ subroutine io_xios_apply_wet_3d_r4(buf) if (k < un .or. k > ub) buf(k, n) = NC_FILL_FLOAT end do end do +!$OMP END PARALLEL DO end subroutine !> 3D element field, axis-first (nz, ne_owned). Same bound logic as @@ -557,9 +583,10 @@ subroutine io_xios_apply_wet_3d_elem_r8(buf) logical :: is_interface if (.not. associated(p_ulevels_elem) .or. .not. associated(p_nlevels_elem)) return if (.not. allocated(owned_elem_local)) return - nz = size(buf, 1); ne = size(buf, 2) + nz = size(buf, 1); ne = min(size(buf, 2), n_owned_elem) is_interface = (p_nl > 0 .and. nz == p_nl) - do ee = 1, min(ne, n_owned_elem) +!$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(ee, e, k, ue, be, ub) + do ee = 1, ne e = owned_elem_local(ee) ue = p_ulevels_elem(e) be = p_nlevels_elem(e) @@ -568,6 +595,7 @@ subroutine io_xios_apply_wet_3d_elem_r8(buf) if (k < ue .or. k > ub) buf(k, ee) = NC_FILL_DOUBLE end do end do +!$OMP END PARALLEL DO end subroutine subroutine io_xios_apply_wet_3d_elem_r4(buf) @@ -576,9 +604,10 @@ subroutine io_xios_apply_wet_3d_elem_r4(buf) logical :: is_interface if (.not. associated(p_ulevels_elem) .or. .not. associated(p_nlevels_elem)) return if (.not. allocated(owned_elem_local)) return - nz = size(buf, 1); ne = size(buf, 2) + nz = size(buf, 1); ne = min(size(buf, 2), n_owned_elem) is_interface = (p_nl > 0 .and. nz == p_nl) - do ee = 1, min(ne, n_owned_elem) +!$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(ee, e, k, ue, be, ub) + do ee = 1, ne e = owned_elem_local(ee) ue = p_ulevels_elem(e) be = p_nlevels_elem(e) @@ -587,6 +616,7 @@ subroutine io_xios_apply_wet_3d_elem_r4(buf) if (k < ue .or. k > ub) buf(k, ee) = NC_FILL_FLOAT end do end do +!$OMP END PARALLEL DO end subroutine diff --git a/src/oce_ale.F90 b/src/oce_ale.F90 index 0f8e93b57..212538acc 100644 --- a/src/oce_ale.F90 +++ b/src/oce_ale.F90 @@ -3348,7 +3348,7 @@ subroutine oce_timestep_ale(n, ice, dynamics, tracers, partit, mesh) #if defined (FESOM_PROFILING) use fesom_profiler #endif - + IMPLICIT NONE integer , intent(in) :: n type(t_dyn) , intent(inout), target :: dynamics @@ -3498,13 +3498,13 @@ subroutine oce_timestep_ale(n, ice, dynamics, tracers, partit, mesh) call calc_cvmix_tidal(partit, mesh) end if -#endif +#endif t1=MPI_Wtime() #if defined (FESOM_PROFILING) call fesom_profiler_end("oce_mix_pres") call fesom_profiler_start("oce_dyn_momentum") -#endif - +#endif + !___________________________________________________________________________ ! add contribution from momentum advection, coriolis and pressure gradient | ! force to UV_rhs @@ -3603,7 +3603,7 @@ subroutine oce_timestep_ale(n, ice, dynamics, tracers, partit, mesh) call impl_vert_visc_ale(dynamics,partit, mesh) else call impl_vert_visc_ale_vtransp(dynamics, partit, mesh) - end if + end if end if !___________________________________________________________________________ diff --git a/src/oce_ale_mixing_kpp.F90 b/src/oce_ale_mixing_kpp.F90 index c7204cd46..fa1c7c5f6 100755 --- a/src/oce_ale_mixing_kpp.F90 +++ b/src/oce_ale_mixing_kpp.F90 @@ -277,12 +277,13 @@ SUBROUTINE oce_mixing_KPP(viscAE, diffK, dynamics, tracers, partit, mesh) #include "associate_mesh_ass.h" UVnode=>dynamics%uvnode(:,:,:) -!$OMP PARALLEL DO +!$OMP PARALLEL DEFAULT(SHARED) PRIVATE(node, nz, nzmin, nzmax, usurf, vsurf, u_loc, v_loc) +!$OMP DO DO node=1, myDim_nod2D+eDim_nod2D ViscA(:, node) = 0.0_WP END DO -!$OMP END PARALLEL DO -!$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(node, nz, nzmin, nzmax, usurf, vsurf, u_loc, v_loc) +!$OMP END DO +!$OMP DO DO node=1, myDim_nod2D !+eDim_nod2D nzmin = ulevels_nod2D(node) nzmax = nlevels_nod2D(node) @@ -320,7 +321,8 @@ SUBROUTINE oce_mixing_KPP(viscAE, diffK, dynamics, tracers, partit, mesh) END DO dVsq ( nzmax, node ) = dVsq ( nzmax-1, node ) END DO -!$OMP END PARALLEL DO +!$OMP END DO +!$OMP END PARALLEL ! ******************************************************************* ! compute thermal and haline expansion coefficients (without factor of rho). @@ -486,18 +488,17 @@ SUBROUTINE bldepth(partit, mesh) #include "associate_part_ass.h" #include "associate_mesh_ass.h" -!$OMP PARALLEL DO +!$OMP PARALLEL DEFAULT(SHARED) PRIVATE(node, nz, nzmin, nzmax, coeff_sw, Rib_km1, zk, zkm1, sigma, zehat, & +!$OMP wm, ws, bvsq, Vtsq, Ritop, Rib_k, dzup, hekman, hmonob, hlimit) +!$OMP DO ! Initialize hbl and kbl to bottomed out values DO node=1, myDim_nod2D !+eDim_nod2D ! Index of first grid level below hbl - kbl(node) = nlevels_nod2D(node) + kbl(node) = nlevels_nod2D(node) ! Boundary layer depth hbl(node) = ABS( zbar_3d_n( nlevels_nod2d(node),node ) ) END DO -!$OMP END PARALLEL DO - -!$OMP PARALLEL DEFAULT(SHARED) PRIVATE(node, nz, nzmin, nzmax, coeff_sw, Rib_km1, zk, zkm1, sigma, zehat, & -!$OMP wm, ws, bvsq, Vtsq, Ritop, Rib_k, dzup, hekman, hmonob, hlimit) +!$OMP END DO !$OMP DO DO node=1, myDim_nod2D !+eDim_nod2D nzmin = ulevels_nod2D(node) @@ -967,16 +968,16 @@ subroutine blmix_kpp(viscA,diffK, partit, mesh) #include "associate_mesh_def.h" #include "associate_part_ass.h" #include "associate_mesh_ass.h" -!$OMP PARALLEL DO +!$OMP PARALLEL DEFAULT(SHARED) PRIVATE(node, nz, kn, knm1, knp1, nl1, nu1, delhat, R, dvdzup, dvdzdn, viscp, difsp, diftp, visch, difsh, difth, f1, sig, & +!$OMP a1, a2, a3, Gm, Gs, Gt, sigma, zehat, wm, ws, gat1m, gat1t, gat1s, dat1m, dat1s, dat1t, dthick, diff_col) +!$OMP DO DO node=1, myDim_nod2D+eDim_nod2D blmc (:, node, :) = 0.0_WP END DO -!$OMP END PARALLEL DO +!$OMP END DO ! ******************************************************************* -! Kv over the NODE +! Kv over the NODE ! ******************************************************************* -!$OMP PARALLEL DEFAULT(SHARED) PRIVATE(node, nz, kn, knm1, knp1, nl1, nu1, delhat, R, dvdzup, dvdzdn, viscp, difsp, diftp, visch, difsh, difth, f1, sig, & -!$OMP a1, a2, a3, Gm, Gs, Gt, sigma, zehat, wm, ws, gat1m, gat1t, gat1s, dat1m, dat1s, dat1t, dthick, diff_col) !$OMP DO DO node=1, myDim_nod2D nl1=nlevels_nod2d(node) diff --git a/src/oce_ale_tracer.F90 b/src/oce_ale_tracer.F90 index 7c88a2c5c..0c4549297 100644 --- a/src/oce_ale_tracer.F90 +++ b/src/oce_ale_tracer.F90 @@ -200,17 +200,19 @@ subroutine solve_tracers_ale(ice, dynamics, tracers, partit, mesh) ! 1. bolus velocities are computed according to GM implementation after R. Ferrari et al., 2010 ! 2. bolus velocities are used only for advecting tracers and shall be subtracted back afterwards if (Fer_GM) then -!$OMP PARALLEL DO +!$OMP PARALLEL DEFAULT(SHARED) PRIVATE(elem, node) +!$OMP DO do elem=1, myDim_elem2D+eDim_elem2D UV(:, :, elem) =UV(:, :, elem) + fer_UV(:, :, elem) end do -!$OMP END PARALLEL DO -!$OMP PARALLEL DO +!$OMP END DO +!$OMP DO do node=1, myDim_nod2D+eDim_nod2D Wvel_e(:, node)=Wvel_e(:, node)+fer_Wvel(:, node) Wvel (:, node)=Wvel (:, node)+fer_Wvel(:, node) end do -!$OMP END PARALLEL DO +!$OMP END DO +!$OMP END PARALLEL end if ! Set advective and diffusive components of total tracer fluxes to zero @@ -329,17 +331,19 @@ subroutine solve_tracers_ale(ice, dynamics, tracers, partit, mesh) !___________________________________________________________________________ ! subtract the the bolus velocities back from 3D velocities: if (Fer_GM) then -!$OMP PARALLEL DO +!$OMP PARALLEL DEFAULT(SHARED) PRIVATE(elem, node) +!$OMP DO do elem=1, myDim_elem2D+eDim_elem2D UV(:, :, elem) =UV(:, :, elem) - fer_UV(:, :, elem) end do -!$OMP END PARALLEL DO -!$OMP PARALLEL DO +!$OMP END DO +!$OMP DO do node=1, myDim_nod2D+eDim_nod2D Wvel_e(:, node)=Wvel_e(:, node)-fer_Wvel(:, node) Wvel (:, node)=Wvel (:, node)-fer_Wvel(:, node) end do -!$OMP END PARALLEL DO +!$OMP END DO +!$OMP END PARALLEL end if ! TODO: do it only when it is coupled to atmosphere diff --git a/src/oce_ale_vel_rhs.F90 b/src/oce_ale_vel_rhs.F90 index 2cf0ab2b0..96116ba4f 100644 --- a/src/oce_ale_vel_rhs.F90 +++ b/src/oce_ale_vel_rhs.F90 @@ -288,7 +288,8 @@ subroutine compute_vel_rhs(ice, dynamics, partit, mesh) ff=1.0_WP lfirst=.false. end if -!$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(elem, nz, nzmin, nzmax) +!$OMP PARALLEL DEFAULT(SHARED) PRIVATE(elem, nz, nzmin, nzmax) +!$OMP DO do elem=1, myDim_elem2D nzmin = ulevels(elem) nzmax = nlevels(elem) @@ -298,19 +299,19 @@ subroutine compute_vel_rhs(ice, dynamics, partit, mesh) ! | | ! V V ! fAB = (f_pgf - 1/2*fab_n-1) +3/2*fab_n - ! - ! until here: UV_rhs = dt*[ (R_advec + R_coriolis)^n + R_pressure] - ! --> horizontal viscosity contribution still missing is added in - ! call viscosity_filter - ! --> vertical viscosity contribution still missing is added in - ! call impl_vert_visc_ale + ! + ! until here: UV_rhs = dt*[ (R_advec + R_coriolis)^n + R_pressure] + ! --> horizontal viscosity contribution still missing is added in + ! call viscosity_filter + ! --> vertical viscosity contribution still missing is added in + ! call impl_vert_visc_ale end do end do -!$OMP END PARALLEL DO +!$OMP END DO !___________________________________________________________________________ if (dynamics%ldiag_ke) then -!$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(elem, nz, nzmin, nzmax) +!$OMP DO do elem=1, myDim_elem2D nzmin = ulevels(elem) nzmax = nlevels(elem) @@ -319,8 +320,9 @@ subroutine compute_vel_rhs(ice, dynamics, partit, mesh) dynamics%ke_cor(:,nz,elem)=dt*(dynamics%ke_cor(:,nz,elem)+dynamics%ke_cor_AB(1,:,nz,elem)*ff)/elem_area(elem) end do end do -!$OMP END PARALLEL DO +!$OMP END DO end if +!$OMP END PARALLEL ! ======================= ! U_rhs contains all contributions to velocity from old time steps diff --git a/src/oce_setup_step.F90 b/src/oce_setup_step.F90 index 48a5c84a1..7e3147300 100755 --- a/src/oce_setup_step.F90 +++ b/src/oce_setup_step.F90 @@ -275,10 +275,10 @@ subroutine ocean_setup(dynamics, tracers, partit, mesh) !___________________________________________________________________________ ! precompute mask for GM/Redi upscalling in the GINsea - if ((Fer_GM .or. Redi) .and. scaling_GINsea) then + if ((Fer_GM .or. Redi) .and. scaling_GINsea) then call init_RediGM_GINsea_mask(partit, mesh) - end if - + end if + !___________________________________________________________________________ if(partit%mype==0) write(*,*) 'Initial state' if (dynamics%use_wsplit .and. partit%mype==0) then diff --git a/src/oce_tracer_mod.F90 b/src/oce_tracer_mod.F90 index 2fb963fe0..a5374d739 100755 --- a/src/oce_tracer_mod.F90 +++ b/src/oce_tracer_mod.F90 @@ -26,6 +26,7 @@ SUBROUTINE init_tracers_AB(tr_num, tracers, partit, mesh) integer :: n,nz #ifndef ENABLE_OPENACC +!$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(n, nz) #else !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(PRESENT) #endif @@ -38,14 +39,19 @@ SUBROUTINE init_tracers_AB(tr_num, tracers, partit, mesh) end do end do #ifndef ENABLE_OPENACC +!$OMP END PARALLEL DO #else !$ACC END PARALLEL LOOP #endif - ! AB interpolation + ! AB interpolation. Both the valuesAB compute and the valuesold update + ! were originally separate PARALLEL DO regions; combined here into a + ! single PARALLEL with two DOs, saving one fork-join per tracer per + ! timestep. if (tracers%data(tr_num)%AB_order==2) then #ifndef ENABLE_OPENACC -!$OMP PARALLEL DO +!$OMP PARALLEL DEFAULT(SHARED) PRIVATE(n) +!$OMP DO #else !ACC PARALLEL LOOP DEFAULT(PRESENT) #endif @@ -53,13 +59,25 @@ SUBROUTINE init_tracers_AB(tr_num, tracers, partit, mesh) tracers%data(tr_num)%valuesAB(:, n) =-(0.5_WP+epsilon)*tracers%data(tr_num)%valuesold(1, :, n)+(1.5_WP+epsilon)*tracers%data(tr_num)%values(:, n) end do #ifndef ENABLE_OPENACC -!$OMP END PARALLEL DO +!$OMP END DO +!$OMP DO +#else +!ACC END PARALLEL LOOP +!ACC PARALLEL LOOP DEFAULT(PRESENT) +#endif + do n=1, partit%myDim_nod2d+partit%eDim_nod2D + tracers%data(tr_num)%valuesold(1, :, n)=tracers%data(tr_num)%values(:, n) + end do +#ifndef ENABLE_OPENACC +!$OMP END DO +!$OMP END PARALLEL #else !ACC END PARALLEL LOOP #endif elseif (tracers%data(tr_num)%AB_order==3) then #ifndef ENABLE_OPENACC -!$OMP PARALLEL DO +!$OMP PARALLEL DEFAULT(SHARED) PRIVATE(n) +!$OMP DO #else !ACC PARALLEL LOOP DEFAULT(PRESENT) #endif @@ -68,7 +86,19 @@ SUBROUTINE init_tracers_AB(tr_num, tracers, partit, mesh) tracers%data(tr_num)%valuesAB(:, n) =tracers%data(tr_num)%valuesAB(:, n)/12.0_WP end do #ifndef ENABLE_OPENACC -!$OMP END PARALLEL DO +!$OMP END DO +!$OMP DO +#else +!ACC END PARALLEL LOOP +!ACC PARALLEL LOOP DEFAULT(PRESENT) +#endif + do n=1, partit%myDim_nod2d+partit%eDim_nod2D + tracers%data(tr_num)%valuesold(2, :, n)=tracers%data(tr_num)%valuesold(1, :, n) + tracers%data(tr_num)%valuesold(1, :, n)=tracers%data(tr_num)%values(:, n) + end do +#ifndef ENABLE_OPENACC +!$OMP END DO +!$OMP END PARALLEL #else !ACC END PARALLEL LOOP #endif @@ -91,37 +121,6 @@ SUBROUTINE init_tracers_AB(tr_num, tracers, partit, mesh) call par_ex(partit%MPI_COMM_FESOM, partit%mype, 0) end if - if (tracers%data(tr_num)%AB_order==2) then -#ifndef ENABLE_OPENACC -!$OMP PARALLEL DO -#else -!ACC PARALLEL LOOP DEFAULT(PRESENT) -#endif - do n=1, partit%myDim_nod2d+partit%eDim_nod2D - tracers%data(tr_num)%valuesold(1, :, n)=tracers%data(tr_num)%values(:, n) - end do -#ifndef ENABLE_OPENACC -!$OMP END PARALLEL DO -#else -!ACC END PARALLEL LOOP -#endif - elseif (tracers%data(tr_num)%AB_order==3) then -#ifndef ENABLE_OPENACC -!$OMP PARALLEL DO -#else -!ACC PARALLEL LOOP DEFAULT(PRESENT) -#endif - do n=1, partit%myDim_nod2d+partit%eDim_nod2D - tracers%data(tr_num)%valuesold(2, :, n)=tracers%data(tr_num)%valuesold(1, :, n) - tracers%data(tr_num)%valuesold(1, :, n)=tracers%data(tr_num)%values(:, n) - end do -#ifndef ENABLE_OPENACC -!$OMP END PARALLEL DO -#else -!ACC END PARALLEL LOOP -#endif - end if - if (flag_debug .and. partit%mype==0) print *, achar(27)//'[38m'//' --> call tracer_gradient_elements'//achar(27)//'[0m' !PS call tracer_gradient_elements(tracers%data(tr_num)%valuesAB, partit, mesh) call tracer_gradient_elements(tracers%data(tr_num)%values, partit, mesh)