Skip to content

Commit e941598

Browse files
committed
fix(uvm): skip GPU semaphore reads during surprise removal
1 parent fc889e9 commit e941598

File tree

2 files changed

+28
-4
lines changed

2 files changed

+28
-4
lines changed

kernel-open/nvidia-uvm/uvm_gpu_semaphore.c

Lines changed: 15 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -27,6 +27,7 @@
2727
#include "uvm_kvmalloc.h"
2828
#include "uvm_channel.h" // For UVM_GPU_SEMAPHORE_MAX_JUMP
2929
#include "uvm_conf_computing.h"
30+
#include "uvm_gpu_isr.h"
3031

3132
#define UVM_SEMAPHORE_SIZE 4
3233
#define UVM_SEMAPHORE_PAGE_SIZE PAGE_SIZE
@@ -822,10 +823,18 @@ static NvU64 update_completed_value_locked(uvm_gpu_tracking_semaphore_t *trackin
822823
NvU64 uvm_gpu_tracking_semaphore_update_completed_value(uvm_gpu_tracking_semaphore_t *tracking_semaphore)
823824
{
824825
NvU64 completed;
826+
uvm_gpu_t *gpu = tracking_semaphore->semaphore.page->pool->gpu;
825827

826828
// Check that the GPU which owns the semaphore is still present
827829
UVM_ASSERT(tracking_semaphore_check_gpu(tracking_semaphore));
828830

831+
// If the GPU is not accessible (surprise removed), return the cached
832+
// completed value without reading from GPU memory. Reading from GPU
833+
// memory after surprise removal returns garbage values that cause
834+
// assertion failures.
835+
if (!uvm_parent_gpu_is_accessible(gpu->parent))
836+
return atomic64_read(&tracking_semaphore->completed_value);
837+
829838
if (tracking_semaphore_uses_mutex(tracking_semaphore))
830839
uvm_mutex_lock(&tracking_semaphore->m_lock);
831840
else
@@ -844,10 +853,16 @@ NvU64 uvm_gpu_tracking_semaphore_update_completed_value(uvm_gpu_tracking_semapho
844853
bool uvm_gpu_tracking_semaphore_is_value_completed(uvm_gpu_tracking_semaphore_t *tracking_sem, NvU64 value)
845854
{
846855
NvU64 completed = atomic64_read(&tracking_sem->completed_value);
856+
uvm_gpu_t *gpu = tracking_sem->semaphore.page->pool->gpu;
847857

848858
// Check that the GPU which owns the semaphore is still present
849859
UVM_ASSERT(tracking_semaphore_check_gpu(tracking_sem));
850860

861+
// If the GPU is not accessible, consider all values completed to avoid
862+
// spinning forever waiting for a GPU that's gone.
863+
if (!uvm_parent_gpu_is_accessible(gpu->parent))
864+
return true;
865+
851866
if (completed >= value) {
852867
// atomic64_read() doesn't imply any memory barriers and we need all
853868
// subsequent memory accesses in this thread to be ordered after the

src/nvidia/src/kernel/rmapi/nv_gpu_ops.c

Lines changed: 13 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -809,10 +809,19 @@ NV_STATUS nvGpuOpsDestroySession(struct gpuSession *session)
809809
if (!session)
810810
return NV_OK;
811811

812-
// Sanity Check: There should not be any attached devices with the session!
813-
NV_ASSERT(!session->devices);
814-
// Sanity Check: If there are no devices, there should also be no p2p Info!
815-
NV_ASSERT(!session->p2pInfo);
812+
// During surprise removal (GPU lost), devices may not have been properly
813+
// detached. In normal operation, these assertions catch programming errors.
814+
// When the GPU is lost, we log and continue to avoid blocking cleanup.
815+
if (session->devices)
816+
{
817+
NV_PRINTF(LEVEL_WARNING,
818+
"Destroying session with devices still attached (GPU may be lost)\n");
819+
}
820+
if (session->p2pInfo)
821+
{
822+
NV_PRINTF(LEVEL_WARNING,
823+
"Destroying session with p2p info still present (GPU may be lost)\n");
824+
}
816825

817826
// freeing session will free everything under it
818827
pRmApi->Free(pRmApi, session->handle, session->handle);

0 commit comments

Comments
 (0)