2727#include "uvm_kvmalloc.h"
2828#include "uvm_channel.h" // For UVM_GPU_SEMAPHORE_MAX_JUMP
2929#include "uvm_conf_computing.h"
30+ #include "uvm_gpu_isr.h"
3031
3132#define UVM_SEMAPHORE_SIZE 4
3233#define UVM_SEMAPHORE_PAGE_SIZE PAGE_SIZE
@@ -822,10 +823,18 @@ static NvU64 update_completed_value_locked(uvm_gpu_tracking_semaphore_t *trackin
822823NvU64 uvm_gpu_tracking_semaphore_update_completed_value (uvm_gpu_tracking_semaphore_t * tracking_semaphore )
823824{
824825 NvU64 completed ;
826+ uvm_gpu_t * gpu = tracking_semaphore -> semaphore .page -> pool -> gpu ;
825827
826828 // Check that the GPU which owns the semaphore is still present
827829 UVM_ASSERT (tracking_semaphore_check_gpu (tracking_semaphore ));
828830
831+ // If the GPU is not accessible (surprise removed), return the cached
832+ // completed value without reading from GPU memory. Reading from GPU
833+ // memory after surprise removal returns garbage values that cause
834+ // assertion failures.
835+ if (!uvm_parent_gpu_is_accessible (gpu -> parent ))
836+ return atomic64_read (& tracking_semaphore -> completed_value );
837+
829838 if (tracking_semaphore_uses_mutex (tracking_semaphore ))
830839 uvm_mutex_lock (& tracking_semaphore -> m_lock );
831840 else
@@ -844,10 +853,16 @@ NvU64 uvm_gpu_tracking_semaphore_update_completed_value(uvm_gpu_tracking_semapho
844853bool uvm_gpu_tracking_semaphore_is_value_completed (uvm_gpu_tracking_semaphore_t * tracking_sem , NvU64 value )
845854{
846855 NvU64 completed = atomic64_read (& tracking_sem -> completed_value );
856+ uvm_gpu_t * gpu = tracking_sem -> semaphore .page -> pool -> gpu ;
847857
848858 // Check that the GPU which owns the semaphore is still present
849859 UVM_ASSERT (tracking_semaphore_check_gpu (tracking_sem ));
850860
861+ // If the GPU is not accessible, consider all values completed to avoid
862+ // spinning forever waiting for a GPU that's gone.
863+ if (!uvm_parent_gpu_is_accessible (gpu -> parent ))
864+ return true;
865+
851866 if (completed >= value ) {
852867 // atomic64_read() doesn't imply any memory barriers and we need all
853868 // subsequent memory accesses in this thread to be ordered after the
0 commit comments