Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion src/blas/rocBLAS.jl
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,12 @@ function lib_state()
(nh, s) -> rocblas_set_stream(nh, s))
end

handle() = lib_state().handle
function handle()
    # Drain any HIP error left pending by earlier GPU work in this context so
    # that it is not surfaced by the upcoming rocblas call. The rationale is
    # documented on rocSPARSE.handle.
    HIP.clear_last_error()
    state = lib_state()
    return state.handle
end
# Return the `stream` field of the state produced by `lib_state()`.
function stream()
    return lib_state().stream
end

end
7 changes: 6 additions & 1 deletion src/dnn/MIOpen.jl
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,12 @@ lib_state() = library_state(
create_handle, destroy_handle!,
(nh, s) -> miopenSetStream(nh, s))

handle() = lib_state().handle
function handle()
    # Drain any HIP error left pending by earlier GPU work in this context so
    # that it is not surfaced by the upcoming MIOpen call. The rationale is
    # documented on rocSPARSE.handle.
    HIP.clear_last_error()
    state = lib_state()
    return state.handle
end
# Return the `stream` field of the state produced by `lib_state()`.
function stream()
    return lib_state().stream
end

include("descriptors.jl")
Expand Down
18 changes: 18 additions & 0 deletions src/hip/error.jl
Original file line number Diff line number Diff line change
Expand Up @@ -145,3 +145,21 @@ function check(err::hipError_t)
throw(HIPError(err))
end
end

"""
clear_last_error()

Consume any sticky HIP error on the current context without throwing.

Some HIP operations (e.g. `hipDeviceSynchronize`) surface errors that were set
by previous GPU work (e.g. a kernel exception). These errors persist on the
context until consumed. Call this before creating library handles to prevent
stale errors from causing spurious failures in unrelated operations.
"""
function clear_last_error()
    # Reading the last error is what consumes it. The result is deliberately
    # not passed to `check`, so nothing is thrown from here.
    status = @gcsafe_ccall libhip.hipGetLastError()::hipError_t

    # Nothing was pending: return without logging.
    status == hipSuccess && return

    # A pending error was discarded; leave a trace at debug level only.
    @debug "Cleared sticky HIP error before library call" error=HIPError(status)
    return
end
7 changes: 6 additions & 1 deletion src/rand/rocRAND.jl
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,12 @@ lib_state() = library_state(
Random.seed!(nh)
end)

handle() = lib_state().handle
function handle()
    # Drain any HIP error left pending by earlier GPU work in this context so
    # that it is not surfaced by the upcoming rocrand call. The rationale is
    # documented on rocSPARSE.handle.
    HIP.clear_last_error()
    state = lib_state()
    return state.handle
end
# Return the `stream` field of the state produced by `lib_state()`.
function stream()
    return lib_state().stream
end

end
8 changes: 7 additions & 1 deletion src/sparse/rocSPARSE.jl
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,13 @@ lib_state() = library_state(
:rocSPARSE, rocsparse_handle, IDLE_HANDLES,
create_handle, rocsparse_destroy_handle, rocsparse_set_stream)

handle() = lib_state().handle
function handle()
    # Drain any HIP error left pending by earlier GPU work in this context
    # before any rocsparse call is made. rocsparse operations synchronize
    # internally, so a pending hipErrorLaunchFailure would otherwise come back
    # as rocsparse_status_internal_error.
    HIP.clear_last_error()
    state = lib_state()
    return state.handle
end
# Return the `stream` field of the state produced by `lib_state()`.
function stream()
    return lib_state().stream
end

function version()
Expand Down
2 changes: 1 addition & 1 deletion test/device/hostcall.jl
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ end
RB = ROCArray(zeros(Float32, 1))
dref = Ref{Bool}(false)

@test_logs (:error, "HostCall error") begin
@test_logs (:error, "HostCall error") match_mode=:any begin
hc = HostCallHolder(Nothing, Tuple{}) do
error("Some error")
dref[] = true
Expand Down