Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions src/fesom_module.F90
Original file line number Diff line number Diff line change
Expand Up @@ -784,6 +784,13 @@ subroutine fesom_runloop(current_nsteps)
end if
!--------------------------

! Sync point at end of I/O block: any upstream stall (XIOS client work,
! OASIS asymmetry) that would otherwise be absorbed by the next MPI
! halo exchange in ice_timestep is captured here, attributed to
! rtime_write_means instead of rtime_fullice. Doesn't add wall time
! (the wait happens anyway), just makes timer accounting honest.
call MPI_Barrier(f%MPI_COMM_FESOM, f%MPIerr)

f%t5 = MPI_Wtime()
#if defined (FESOM_PROFILING)
call fesom_profiler_start("restart")
Expand Down
51 changes: 30 additions & 21 deletions src/io_meandata.F90
Original file line number Diff line number Diff line change
Expand Up @@ -2602,55 +2602,64 @@ subroutine update_means
type(Meandata), pointer :: entry
integer :: n
integer :: I, J

integer :: szI, szJ

! Single OMP fork-join across all streams instead of one per stream. With
! ~50 io_NSTREAMS and ~6700 calls/sim-month at HR (FESOM dt=400s), the
! original code spawned ~335k OMP regions per rank per month, each with
! its own fork-join overhead. SIMD hints on the innermost loop let the
! compiler vectorize the float adds. Targets the 146 s/rank/month
! FESOM-side stream-prep cost identified in HR_test_45 profiling.
!$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(n, entry, I, J, szI, szJ) SCHEDULE(dynamic)
DO n=1, io_NSTREAMS
entry=>io_stream(n)%p

!_____________ compute in 8 byte accuracy ______________________________
IF (entry%accuracy == i_real8) then
szI = size(entry%local_values_r8, dim=1)
szJ = size(entry%local_values_r8, dim=2)
IF (entry%flip) then
!$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(I,J)
DO J=1, size(entry%local_values_r8,dim=2)
DO I=1, size(entry%local_values_r8,dim=1)
DO J=1, szJ
!$OMP SIMD
DO I=1, szI
entry%local_values_r8(I,J)=entry%local_values_r8(I,J)+entry%ptr3(J,I)
END DO
END DO
!$OMP END PARALLEL DO
ELSE
!$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(I,J)
DO J=1, size(entry%local_values_r8,dim=2)
DO I=1, size(entry%local_values_r8,dim=1)
DO J=1, szJ
!$OMP SIMD
DO I=1, szI
entry%local_values_r8(I,J)=entry%local_values_r8(I,J)+entry%ptr3(I,J)
END DO
END DO
!$OMP END PARALLEL DO
END IF

!_____________ compute in 4 byte accuracy ______________________________
ELSE IF (entry%accuracy == i_real4) then
szI = size(entry%local_values_r4, dim=1)
szJ = size(entry%local_values_r4, dim=2)
IF (entry%flip) then
!$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(I,J)
DO J=1, size(entry%local_values_r4,dim=2)
DO I=1, size(entry%local_values_r4,dim=1)
DO J=1, szJ
!$OMP SIMD
DO I=1, szI
entry%local_values_r4(I,J)=entry%local_values_r4(I,J)+real(entry%ptr3(J,I), real32)
END DO
END DO
!$OMP END PARALLEL DO
ELSE
!$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(I, J)
DO J=1, size(entry%local_values_r4,dim=2)
DO I=1, size(entry%local_values_r4,dim=1)
DO J=1, szJ
!$OMP SIMD
DO I=1, szI
entry%local_values_r4(I,J)=entry%local_values_r4(I,J)+real(entry%ptr3(I,J), real32)
END DO
END DO
!$OMP END PARALLEL DO
END IF
END IF ! --> IF (entry%accuracy == i_real8) then

entry%addcounter=entry%addcounter+1
END DO ! --> DO n=1, io_NSTREAMS

! Update 0D (scalar) means
!$OMP END PARALLEL DO

! Update 0D (scalar) means — small loop, leave serial
DO n=1, io_NSTREAMS0D
io_stream0D(n)%local_value = io_stream0D(n)%local_value + real(io_stream0D(n)%ptr, real64)
io_stream0D(n)%addcounter = io_stream0D(n)%addcounter + 1
Expand Down
70 changes: 50 additions & 20 deletions src/io_xios.F90
Original file line number Diff line number Diff line change
Expand Up @@ -398,31 +398,37 @@ subroutine io_xios_apply_ice_mask_2d_r8(buf)
integer :: i, n
if (.not. associated(p_ice_conc)) return
n = min(size(buf), size(p_ice_conc))
!$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(i)
do i = 1, n
if (real(p_ice_conc(i), kind=8) < ICE_CONC_EPS) buf(i) = NC_FILL_DOUBLE
end do
!$OMP END PARALLEL DO
end subroutine

subroutine io_xios_apply_ice_mask_2d_r4(buf)
real(kind=4), intent(inout) :: buf(:)
integer :: i, n
if (.not. associated(p_ice_conc)) return
n = min(size(buf), size(p_ice_conc))
!$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(i)
do i = 1, n
if (real(p_ice_conc(i), kind=4) < real(ICE_CONC_EPS, kind=4)) buf(i) = NC_FILL_FLOAT
end do
!$OMP END PARALLEL DO
end subroutine

!> Element-based ice mask: element is ice-covered iff any of its 3 vertex
!> nodes has a_ice >= eps. buf is indexed by owned-element order; local
!> element indices come from owned_elem_local.
subroutine io_xios_apply_ice_mask_2d_elem_r8(buf)
real(kind=8), intent(inout) :: buf(:)
integer :: ee, e, n1, n2, n3
integer :: ee, e, n1, n2, n3, ne
real(kind=8) :: amax
if (.not. associated(p_ice_conc) .or. .not. associated(p_elem2D_nodes)) return
if (.not. allocated(owned_elem_local)) return
do ee = 1, min(size(buf), n_owned_elem)
ne = min(size(buf), n_owned_elem)
!$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(ee, e, n1, n2, n3, amax)
do ee = 1, ne
e = owned_elem_local(ee)
n1 = p_elem2D_nodes(1, e)
n2 = p_elem2D_nodes(2, e)
Expand All @@ -432,15 +438,18 @@ subroutine io_xios_apply_ice_mask_2d_elem_r8(buf)
real(p_ice_conc(n3), kind=8))
if (amax < ICE_CONC_EPS) buf(ee) = NC_FILL_DOUBLE
end do
!$OMP END PARALLEL DO
end subroutine

subroutine io_xios_apply_ice_mask_2d_elem_r4(buf)
real(kind=4), intent(inout) :: buf(:)
integer :: ee, e, n1, n2, n3
integer :: ee, e, n1, n2, n3, ne
real(kind=4) :: amax
if (.not. associated(p_ice_conc) .or. .not. associated(p_elem2D_nodes)) return
if (.not. allocated(owned_elem_local)) return
do ee = 1, min(size(buf), n_owned_elem)
ne = min(size(buf), n_owned_elem)
!$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(ee, e, n1, n2, n3, amax)
do ee = 1, ne
e = owned_elem_local(ee)
n1 = p_elem2D_nodes(1, e)
n2 = p_elem2D_nodes(2, e)
Expand All @@ -450,6 +459,7 @@ subroutine io_xios_apply_ice_mask_2d_elem_r4(buf)
real(p_ice_conc(n3), kind=4))
if (amax < real(ICE_CONC_EPS, kind=4)) buf(ee) = NC_FILL_FLOAT
end do
!$OMP END PARALLEL DO
end subroutine


Expand All @@ -471,44 +481,56 @@ subroutine io_xios_set_wet_ptrs(ulevels_nod, nlevels_nod, &
!> for non-cavity runs where all nodes are wet at surface).
subroutine io_xios_apply_wet_2d_r8(buf)
real(kind=8), intent(inout) :: buf(:)
integer :: n
integer :: n, nn
if (.not. associated(p_ulevels_nod)) return
do n = 1, min(size(buf), size(p_ulevels_nod))
nn = min(size(buf), size(p_ulevels_nod))
!$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(n)
do n = 1, nn
if (p_ulevels_nod(n) > 1) buf(n) = NC_FILL_DOUBLE
end do
!$OMP END PARALLEL DO
end subroutine

subroutine io_xios_apply_wet_2d_r4(buf)
real(kind=4), intent(inout) :: buf(:)
integer :: n
integer :: n, nn
if (.not. associated(p_ulevels_nod)) return
do n = 1, min(size(buf), size(p_ulevels_nod))
nn = min(size(buf), size(p_ulevels_nod))
!$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(n)
do n = 1, nn
if (p_ulevels_nod(n) > 1) buf(n) = NC_FILL_FLOAT
end do
!$OMP END PARALLEL DO
end subroutine

!> 2D element field (strictly-owned subset): NC_FILL where any of the
!> 3 vertex nodes has ulevels > 1 (cavity element).
subroutine io_xios_apply_wet_2d_elem_r8(buf)
real(kind=8), intent(inout) :: buf(:)
integer :: ee, e
integer :: ee, e, ne
if (.not. associated(p_ulevels_elem)) return
if (.not. allocated(owned_elem_local)) return
do ee = 1, min(size(buf), n_owned_elem)
ne = min(size(buf), n_owned_elem)
!$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(ee, e)
do ee = 1, ne
e = owned_elem_local(ee)
if (p_ulevels_elem(e) > 1) buf(ee) = NC_FILL_DOUBLE
end do
!$OMP END PARALLEL DO
end subroutine

subroutine io_xios_apply_wet_2d_elem_r4(buf)
real(kind=4), intent(inout) :: buf(:)
integer :: ee, e
integer :: ee, e, ne
if (.not. associated(p_ulevels_elem)) return
if (.not. allocated(owned_elem_local)) return
do ee = 1, min(size(buf), n_owned_elem)
ne = min(size(buf), n_owned_elem)
!$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(ee, e)
do ee = 1, ne
e = owned_elem_local(ee)
if (p_ulevels_elem(e) > 1) buf(ee) = NC_FILL_FLOAT
end do
!$OMP END PARALLEL DO
end subroutine

!> 3D node field, axis-first (nz, nn). Valid range is
Expand All @@ -520,33 +542,37 @@ subroutine io_xios_apply_wet_3d_r8(buf)
integer :: n, k, nz, nn, ub, un, bn
logical :: is_interface
if (.not. associated(p_ulevels_nod) .or. .not. associated(p_nlevels_nod)) return
nz = size(buf, 1); nn = size(buf, 2)
nz = size(buf, 1); nn = min(size(buf, 2), size(p_ulevels_nod))
is_interface = (p_nl > 0 .and. nz == p_nl)
do n = 1, min(nn, size(p_ulevels_nod))
!$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(n, k, un, bn, ub)
do n = 1, nn
un = p_ulevels_nod(n)
bn = p_nlevels_nod(n)
ub = bn; if (.not. is_interface) ub = bn - 1
do k = 1, nz
if (k < un .or. k > ub) buf(k, n) = NC_FILL_DOUBLE
end do
end do
!$OMP END PARALLEL DO
end subroutine

subroutine io_xios_apply_wet_3d_r4(buf)
real(kind=4), intent(inout) :: buf(:,:)
integer :: n, k, nz, nn, ub, un, bn
logical :: is_interface
if (.not. associated(p_ulevels_nod) .or. .not. associated(p_nlevels_nod)) return
nz = size(buf, 1); nn = size(buf, 2)
nz = size(buf, 1); nn = min(size(buf, 2), size(p_ulevels_nod))
is_interface = (p_nl > 0 .and. nz == p_nl)
do n = 1, min(nn, size(p_ulevels_nod))
!$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(n, k, un, bn, ub)
do n = 1, nn
un = p_ulevels_nod(n)
bn = p_nlevels_nod(n)
ub = bn; if (.not. is_interface) ub = bn - 1
do k = 1, nz
if (k < un .or. k > ub) buf(k, n) = NC_FILL_FLOAT
end do
end do
!$OMP END PARALLEL DO
end subroutine

!> 3D element field, axis-first (nz, ne_owned). Same bound logic as
Expand All @@ -557,9 +583,10 @@ subroutine io_xios_apply_wet_3d_elem_r8(buf)
logical :: is_interface
if (.not. associated(p_ulevels_elem) .or. .not. associated(p_nlevels_elem)) return
if (.not. allocated(owned_elem_local)) return
nz = size(buf, 1); ne = size(buf, 2)
nz = size(buf, 1); ne = min(size(buf, 2), n_owned_elem)
is_interface = (p_nl > 0 .and. nz == p_nl)
do ee = 1, min(ne, n_owned_elem)
!$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(ee, e, k, ue, be, ub)
do ee = 1, ne
e = owned_elem_local(ee)
ue = p_ulevels_elem(e)
be = p_nlevels_elem(e)
Expand All @@ -568,6 +595,7 @@ subroutine io_xios_apply_wet_3d_elem_r8(buf)
if (k < ue .or. k > ub) buf(k, ee) = NC_FILL_DOUBLE
end do
end do
!$OMP END PARALLEL DO
end subroutine

subroutine io_xios_apply_wet_3d_elem_r4(buf)
Expand All @@ -576,9 +604,10 @@ subroutine io_xios_apply_wet_3d_elem_r4(buf)
logical :: is_interface
if (.not. associated(p_ulevels_elem) .or. .not. associated(p_nlevels_elem)) return
if (.not. allocated(owned_elem_local)) return
nz = size(buf, 1); ne = size(buf, 2)
nz = size(buf, 1); ne = min(size(buf, 2), n_owned_elem)
is_interface = (p_nl > 0 .and. nz == p_nl)
do ee = 1, min(ne, n_owned_elem)
!$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(ee, e, k, ue, be, ub)
do ee = 1, ne
e = owned_elem_local(ee)
ue = p_ulevels_elem(e)
be = p_nlevels_elem(e)
Expand All @@ -587,6 +616,7 @@ subroutine io_xios_apply_wet_3d_elem_r4(buf)
if (k < ue .or. k > ub) buf(k, ee) = NC_FILL_FLOAT
end do
end do
!$OMP END PARALLEL DO
end subroutine


Expand Down
10 changes: 5 additions & 5 deletions src/oce_ale.F90
Original file line number Diff line number Diff line change
Expand Up @@ -3348,7 +3348,7 @@ subroutine oce_timestep_ale(n, ice, dynamics, tracers, partit, mesh)
#if defined (FESOM_PROFILING)
use fesom_profiler
#endif

IMPLICIT NONE
integer , intent(in) :: n
type(t_dyn) , intent(inout), target :: dynamics
Expand Down Expand Up @@ -3498,13 +3498,13 @@ subroutine oce_timestep_ale(n, ice, dynamics, tracers, partit, mesh)
call calc_cvmix_tidal(partit, mesh)

end if
#endif
#endif
t1=MPI_Wtime()
#if defined (FESOM_PROFILING)
call fesom_profiler_end("oce_mix_pres")
call fesom_profiler_start("oce_dyn_momentum")
#endif
#endif

!___________________________________________________________________________
! add contribution from momentum advection, coriolis and pressure gradient |
! force to UV_rhs
Expand Down Expand Up @@ -3603,7 +3603,7 @@ subroutine oce_timestep_ale(n, ice, dynamics, tracers, partit, mesh)
call impl_vert_visc_ale(dynamics,partit, mesh)
else
call impl_vert_visc_ale_vtransp(dynamics, partit, mesh)
end if
end if
end if

!___________________________________________________________________________
Expand Down
Loading
Loading