From a30267aa66cd9bd0701a27ed25f03313d99c9fe8 Mon Sep 17 00:00:00 2001 From: Ludovic Raess Date: Sat, 9 May 2026 16:46:09 +0200 Subject: [PATCH 1/5] Catch and clear spurious hip error --- src/blas/rocBLAS.jl | 4 ++++ src/dnn/MIOpen.jl | 4 ++++ src/hip/error.jl | 15 +++++++++++++++ src/sparse/rocSPARSE.jl | 6 ++++++ 4 files changed, 29 insertions(+) diff --git a/src/blas/rocBLAS.jl b/src/blas/rocBLAS.jl index fde133ae6..fdc7bef0d 100644 --- a/src/blas/rocBLAS.jl +++ b/src/blas/rocBLAS.jl @@ -32,6 +32,10 @@ end function create_handle() AMDGPU.functional(:rocblas) || error("rocBLAS is not available") + # Consume any sticky HIP error from prior GPU work in this context. + # See rocSPARSE.create_handle for the rationale. + HIP.clear_last_error() + handle_ref = Ref{rocblas_handle}() @check rocblas_create_handle(handle_ref) handle_ref[] diff --git a/src/dnn/MIOpen.jl b/src/dnn/MIOpen.jl index b593ff0e8..af8b10c38 100644 --- a/src/dnn/MIOpen.jl +++ b/src/dnn/MIOpen.jl @@ -66,6 +66,10 @@ end function create_handle()::miopenHandle_t AMDGPU.functional(:MIOpen) || error("MIOpen is not available") + # Consume any sticky HIP error from prior GPU work in this context. + # See rocSPARSE.create_handle for the rationale. + HIP.clear_last_error() + handle = Ref{miopenHandle_t}() miopenCreate(handle) handle[] diff --git a/src/hip/error.jl b/src/hip/error.jl index d9a084720..bac2dd5c4 100644 --- a/src/hip/error.jl +++ b/src/hip/error.jl @@ -145,3 +145,18 @@ function check(err::hipError_t) throw(HIPError(err)) end end + +""" + clear_last_error() + +Consume any sticky HIP error on the current context without throwing. + +Some HIP operations (e.g. `hipDeviceSynchronize`) surface errors that were set +by previous GPU work (e.g. a kernel exception). These errors persist on the +context until consumed. Call this before creating library handles to prevent +stale errors from causing spurious failures in unrelated operations. +""" +function clear_last_error() + @gcsafe_ccall libhip.hipGetLastError()::hipError_t + return +end diff --git a/src/sparse/rocSPARSE.jl b/src/sparse/rocSPARSE.jl index 1ca043101..0a818451f 100644 --- a/src/sparse/rocSPARSE.jl +++ b/src/sparse/rocSPARSE.jl @@ -27,6 +27,12 @@ include("librocsparse_deprecated.jl") function create_handle() AMDGPU.functional(:rocsparse) || error("rocSPARSE is not available") + # Consume any sticky HIP error from prior GPU work in this context. + # rocsparse_create_handle internally calls hipDeviceSynchronize which + # surfaces pending errors (e.g. hipErrorLaunchFailure from a prior kernel + # exception) and returns rocsparse_status_internal_error as a result. + HIP.clear_last_error() + handle_ref = Ref{rocsparse_handle}() rocsparse_create_handle(handle_ref) handle_ref[] From 40f6467af32dd68fc28c985d0bf0b0533d556325 Mon Sep 17 00:00:00 2001 From: Ludovic Raess Date: Sat, 9 May 2026 20:18:11 +0200 Subject: [PATCH 2/5] Fix rand --- src/rand/rocRAND.jl | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/rand/rocRAND.jl b/src/rand/rocRAND.jl index 4a665d836..97be15704 100644 --- a/src/rand/rocRAND.jl +++ b/src/rand/rocRAND.jl @@ -30,6 +30,10 @@ lib_state() = library_state( :rocRAND, RNG, IDLE_RNGS, () -> RNG(), r -> return, # RNG destroys itself in finalizer. (nh, s) -> begin + # Consume any sticky HIP error from prior GPU work in this context. + # rocrand_initialize_generator (called inside seed!) internally syncs + # and will surface pending errors as ROCRAND_STATUS_LAUNCH_FAILURE. + HIP.clear_last_error() rocrand_set_stream(nh.handle, s) Random.seed!(nh) end) From 4baadb2c8334c404fa6e82499ac662c55a6afc75 Mon Sep 17 00:00:00 2001 From: Ludovic Raess Date: Mon, 11 May 2026 08:52:13 +0200 Subject: [PATCH 3/5] tmp fix until #907 lands --- test/device/hostcall.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/device/hostcall.jl b/test/device/hostcall.jl index b37c1a9ce..4d9c329c7 100644 --- a/test/device/hostcall.jl +++ b/test/device/hostcall.jl @@ -39,7 +39,7 @@ end RB = ROCArray(zeros(Float32, 1)) dref = Ref{Bool}(false) - @test_logs (:error, "HostCall error") begin + @test_logs (:error, "HostCall error") match_mode=:any begin hc = HostCallHolder(Nothing, Tuple{}) do error("Some error") dref[] = true From 20d55a2732a8f872013e1165287d04e94f66eb23 Mon Sep 17 00:00:00 2001 From: Ludovic Raess Date: Mon, 11 May 2026 10:33:31 +0200 Subject: [PATCH 4/5] Further tweaks --- src/blas/rocBLAS.jl | 11 ++++++----- src/dnn/MIOpen.jl | 11 ++++++----- src/rand/rocRAND.jl | 11 ++++++----- src/sparse/rocSPARSE.jl | 14 +++++++------- 4 files changed, 25 insertions(+), 22 deletions(-) diff --git a/src/blas/rocBLAS.jl b/src/blas/rocBLAS.jl index fdc7bef0d..a7d0d4c3d 100644 --- a/src/blas/rocBLAS.jl +++ b/src/blas/rocBLAS.jl @@ -32,10 +32,6 @@ end function create_handle() AMDGPU.functional(:rocblas) || error("rocBLAS is not available") - # Consume any sticky HIP error from prior GPU work in this context. - # See rocSPARSE.create_handle for the rationale. - HIP.clear_last_error() - handle_ref = Ref{rocblas_handle}() @check rocblas_create_handle(handle_ref) handle_ref[] @@ -55,7 +51,12 @@ function lib_state() (nh, s) -> rocblas_set_stream(nh, s)) end -handle() = lib_state().handle +function handle() + # Consume any sticky HIP error from prior GPU work in this context before + # any rocblas call. See rocSPARSE.handle for the rationale. + HIP.clear_last_error() + return lib_state().handle +end stream() = lib_state().stream end diff --git a/src/dnn/MIOpen.jl b/src/dnn/MIOpen.jl index af8b10c38..eca71e6d0 100644 --- a/src/dnn/MIOpen.jl +++ b/src/dnn/MIOpen.jl @@ -66,10 +66,6 @@ end function create_handle()::miopenHandle_t AMDGPU.functional(:MIOpen) || error("MIOpen is not available") - # Consume any sticky HIP error from prior GPU work in this context. - # See rocSPARSE.create_handle for the rationale. - HIP.clear_last_error() - handle = Ref{miopenHandle_t}() miopenCreate(handle) handle[] @@ -87,7 +83,12 @@ lib_state() = library_state( create_handle, destroy_handle!, (nh, s) -> miopenSetStream(nh, s)) -handle() = lib_state().handle +function handle() + # Consume any sticky HIP error from prior GPU work in this context before + # any MIOpen call. See rocSPARSE.handle for the rationale. + HIP.clear_last_error() + return lib_state().handle +end stream() = lib_state().stream include("descriptors.jl") diff --git a/src/rand/rocRAND.jl b/src/rand/rocRAND.jl index 97be15704..89395e692 100644 --- a/src/rand/rocRAND.jl +++ b/src/rand/rocRAND.jl @@ -30,15 +30,16 @@ lib_state() = library_state( :rocRAND, RNG, IDLE_RNGS, () -> RNG(), r -> return, # RNG destroys itself in finalizer. (nh, s) -> begin - # Consume any sticky HIP error from prior GPU work in this context. - # rocrand_initialize_generator (called inside seed!) internally syncs - # and will surface pending errors as ROCRAND_STATUS_LAUNCH_FAILURE. - HIP.clear_last_error() rocrand_set_stream(nh.handle, s) Random.seed!(nh) end) -handle() = lib_state().handle +function handle() + # Consume any sticky HIP error from prior GPU work in this context before + # any rocrand call. See rocSPARSE.handle for the rationale. + HIP.clear_last_error() + return lib_state().handle +end stream() = lib_state().stream end diff --git a/src/sparse/rocSPARSE.jl b/src/sparse/rocSPARSE.jl index 0a818451f..892a0fe60 100644 --- a/src/sparse/rocSPARSE.jl +++ b/src/sparse/rocSPARSE.jl @@ -27,12 +27,6 @@ include("librocsparse_deprecated.jl") function create_handle() AMDGPU.functional(:rocsparse) || error("rocSPARSE is not available") - # Consume any sticky HIP error from prior GPU work in this context. - # rocsparse_create_handle internally calls hipDeviceSynchronize which - # surfaces pending errors (e.g. hipErrorLaunchFailure from a prior kernel - # exception) and returns rocsparse_status_internal_error as a result. - HIP.clear_last_error() - handle_ref = Ref{rocsparse_handle}() rocsparse_create_handle(handle_ref) handle_ref[] @@ -44,7 +38,13 @@ lib_state() = library_state( :rocSPARSE, rocsparse_handle, IDLE_HANDLES, create_handle, rocsparse_destroy_handle, rocsparse_set_stream) -handle() = lib_state().handle +function handle() + # Consume any sticky HIP error from prior GPU work in this context before + # any rocsparse call. rocsparse operations internally synchronize and will + # surface a pending hipErrorLaunchFailure as rocsparse_status_internal_error. + HIP.clear_last_error() + return lib_state().handle +end stream() = lib_state().stream function version() From f22f91a94f258a6a138c1cb82f0f5b15df0615c0 Mon Sep 17 00:00:00 2001 From: Ludovic Raess Date: Tue, 12 May 2026 19:00:24 +0200 Subject: [PATCH 5/5] track cleared err --- src/hip/error.jl | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/hip/error.jl b/src/hip/error.jl index bac2dd5c4..32a679342 100644 --- a/src/hip/error.jl +++ b/src/hip/error.jl @@ -157,6 +157,9 @@ context until consumed. Call this before creating library handles to prevent stale errors from causing spurious failures in unrelated operations. """ function clear_last_error() - @gcsafe_ccall libhip.hipGetLastError()::hipError_t + err = @gcsafe_ccall libhip.hipGetLastError()::hipError_t + if err != hipSuccess + @debug "Cleared sticky HIP error before library call" error=HIPError(err) + end return end