Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
c9bb69c
Fix: Thunderbolt eGPU hot-plug/unplug kernel crash support
bdandy Dec 11, 2025
91eae20
fix(thunderbolt): add nvInvalidateDeviceReferences to fix module unlo…
bdandy Dec 11, 2025
9392240
fix(thunderbolt): add GPU reconnect fixes for 590
bdandy Dec 12, 2025
aedb5e7
fix(thunderbolt): check gpuLost in FreeDeviceReference
bdandy Dec 12, 2025
816d70a
fix(thunderbolt): check GPU accessibility before HAL calls in UVM ISR
bdandy Dec 12, 2025
5427a2b
fix(uvm): add GPU accessibility check in fault_buffer_flush
bdandy Dec 12, 2025
06d0b20
fix(uvm): add GPU accessibility checks to all fault/counter service h…
bdandy Dec 12, 2025
b62d7f9
fix(uvm): skip PMA free when GPU is not accessible
bdandy Dec 12, 2025
b8de6e7
fix(uvm): skip PMA callback unregistration when GPU not accessible
bdandy Dec 12, 2025
3c33b0f
fix(uvm): skip RM calls in fault_buffer_deinit when GPU not accessible
bdandy Dec 12, 2025
f4e9c5d
fix(uvm): add GPU accessibility checks to high-risk deinit paths
bdandy Dec 12, 2025
6b0ac94
fix(nvidia-drm): skip nvKms calls during surprise removal in GEM/FB free
bdandy Dec 12, 2025
98c06f6
fix(uvm): check device_p2p_initialised before accessing pci_dev in de…
bdandy Dec 12, 2025
3aab5cd
fix(uvm): skip RM calls during cleanup when GPU is surprise removed
bdandy Dec 12, 2025
f235437
fix(uvm): skip GPU semaphore reads during surprise removal
bdandy Dec 12, 2025
ebac140
nvkms-dma: fix struct member references for 590.48.01
bdandy Dec 19, 2025
247d755
nvkms-evo3: fix struct initialization and add minimal eGPU protection
bdandy Dec 19, 2025
c44fe64
nv-pci: add missing wait_iterations variable declarations
bdandy Dec 19, 2025
bc93285
nv-pci: add timeout to usage_count wait for eGPU hotplug
bdandy Dec 19, 2025
5a12cfb
nv-pci: add missing nv-rsync due to merge conflict
bdandy Dec 19, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions kernel-open/common/inc/nvkms-kapi.h
Original file line number Diff line number Diff line change
Expand Up @@ -656,6 +656,17 @@ struct NvKmsKapiFunctionsTable {
*/
void (*freeDevice)(struct NvKmsKapiDevice *device);

/*!
* Frees a device during surprise removal (e.g., Thunderbolt eGPU unplug).
* This skips all hardware access and only releases kernel resources.
* Use this instead of freeDevice() when the GPU hardware is no longer
* accessible to avoid page faults and hangs.
*
* \param [in] device A device returned by allocateDevice().
* This function is a no-op if device is not valid.
*/
void (*freeDeviceForSurpriseRemoval)(struct NvKmsKapiDevice *device);

/*!
* Grab ownership of device, ownership is required to do modeset.
*
Expand Down
84 changes: 84 additions & 0 deletions kernel-open/nvidia-drm/nvidia-drm-drv.c
Original file line number Diff line number Diff line change
Expand Up @@ -852,6 +852,46 @@ static void nv_drm_dev_unload(struct drm_device *dev)
return;
}

/*
* During surprise removal (e.g., Thunderbolt eGPU hot-unplug),
* the GPU hardware is no longer accessible. Skip NVKMS calls that
* would access hardware to prevent page faults and crashes.
* Use freeDeviceForSurpriseRemoval which only releases kernel resources
* without attempting any hardware access.
*/
if (nv_dev->inSurpriseRemoval) {
NV_DRM_DEV_LOG_INFO(nv_dev,
"Surprise removal detected, skipping hardware access");

/* Wake up any processes waiting on flip events */
wake_up_all(&nv_dev->flip_event_wq);

cancel_delayed_work_sync(&nv_dev->hotplug_event_work);
mutex_lock(&nv_dev->lock);

atomic_set(&nv_dev->enable_event_handling, false);
drm_kms_helper_poll_fini(dev);
drm_mode_config_cleanup(dev);

pDevice = nv_dev->pDevice;
nv_dev->pDevice = NULL;

mutex_unlock(&nv_dev->lock);

/*
* Use freeDeviceForSurpriseRemoval instead of freeDevice.
* This skips KmsFreeDevice() and RmFreeDevice() which would try
* to access GPU hardware via ioctls/RM API calls and cause
* page faults since the GPU memory is unmapped.
* It only calls nvkms_close_gpu() to release the GPU reference
* count, allowing the eGPU to be re-initialized when reconnected.
*/
if (pDevice != NULL) {
nvKms->freeDeviceForSurpriseRemoval(pDevice);
}
return;
}

/* Release modeset ownership if fbdev is enabled */

#if defined(NV_DRM_FBDEV_AVAILABLE)
Expand Down Expand Up @@ -2167,6 +2207,28 @@ static void nv_drm_dev_destroy(struct nv_drm_device *nv_dev)
nv_drm_free(nv_dev);
}

/*
 * Resolve the struct pci_dev that backs a DRM device.
 *
 * Two kernel layouts are handled:
 *  - NV_DRM_DEVICE_HAS_PDEV defined: struct drm_device still carries a
 *    'pdev' member, which is returned directly.
 *  - otherwise: the PCI device is derived from the parent struct device,
 *    but only when that parent sits on the PCI bus (the original note
 *    attributes the removal of drm_device.pdev to kernels 5.14+).
 *
 * \param [in] dev  DRM device to inspect; may be NULL.
 *
 * \return The associated PCI device, or NULL when 'dev' is NULL, has no
 *         parent device, or the parent is not a PCI device.
 */
static struct pci_dev *nv_drm_get_pci_dev(struct drm_device *dev)
{
#if defined(NV_DRM_DEVICE_HAS_PDEV)
    return (dev != NULL) ? dev->pdev : NULL;
#else
    /* Nothing to resolve without a DRM device and its parent device. */
    if (dev == NULL || dev->dev == NULL) {
        return NULL;
    }

    /* to_pci_dev() is only meaningful for devices on the PCI bus. */
    if (dev->dev->bus != &pci_bus_type) {
        return NULL;
    }

    return to_pci_dev(dev->dev);
#endif
}

/*
* Unregister a single NVIDIA DRM device.
*/
Expand All @@ -2175,7 +2237,29 @@ void nv_drm_remove(NvU32 gpuId)
struct nv_drm_device *nv_dev = nv_drm_find_and_remove_device(gpuId);

if (nv_dev) {
struct pci_dev *pdev;

NV_DRM_DEV_LOG_INFO(nv_dev, "Removing device");

/*
* Check if this is a surprise removal (hot-unplug) by testing
* if the PCI channel is offline. This happens when:
* - Thunderbolt eGPU is physically disconnected
* - GPU falls off the bus unexpectedly
*
* For normal driver unload (rmmod), the PCI channel remains online.
* We only skip NVKMS hardware access during surprise removal.
*/
pdev = nv_drm_get_pci_dev(nv_dev->dev);
if (pdev != NULL && pci_channel_offline(pdev)) {
NV_DRM_DEV_LOG_INFO(nv_dev,
"PCI channel offline - surprise removal detected");
nv_dev->inSurpriseRemoval = NV_TRUE;

/* Wake up any processes waiting on flip events */
wake_up_all(&nv_dev->flip_event_wq);
}

drm_dev_unplug(nv_dev->dev);
nv_drm_dev_destroy(nv_dev);
}
Expand Down
12 changes: 9 additions & 3 deletions kernel-open/nvidia-drm/nvidia-drm-fb.c
Original file line number Diff line number Diff line change
Expand Up @@ -61,9 +61,15 @@ static void nv_drm_framebuffer_destroy(struct drm_framebuffer *fb)

drm_framebuffer_cleanup(fb);

/* Free NvKmsKapiSurface associated with this framebuffer object */

nvKms->destroySurface(nv_dev->pDevice, nv_fb->pSurface);
/*
* Only call nvKms->destroySurface if pDevice is valid and device is not
* in surprise removal. During hot-unplug, nvidia_modeset internal state
* may be corrupted before this destructor runs from delayed_fput.
*/
if (nv_dev->pDevice != NULL && !nv_dev->inSurpriseRemoval) {
/* Free NvKmsKapiSurface associated with this framebuffer object */
nvKms->destroySurface(nv_dev->pDevice, nv_fb->pSurface);
}

__nv_drm_framebuffer_free(nv_fb);
}
Expand Down
13 changes: 13 additions & 0 deletions kernel-open/nvidia-drm/nvidia-drm-fence.c
Original file line number Diff line number Diff line change
Expand Up @@ -212,6 +212,19 @@ static void __nv_drm_prime_fence_context_destroy(
struct nv_drm_prime_fence_context *nv_prime_fence_context =
to_nv_prime_fence_context(nv_fence_context);

/*
* Skip nvKms calls if device is being surprise-removed.
* The nvidia_modeset internal state may be corrupted.
*/
if (nv_dev->pDevice == NULL || nv_dev->inSurpriseRemoval) {
/* Force signal pending fences and free */
spin_lock(&nv_prime_fence_context->lock);
nv_drm_gem_prime_force_fence_signal(nv_prime_fence_context);
spin_unlock(&nv_prime_fence_context->lock);
nv_drm_free(nv_fence_context);
return;
}

/*
* Free channel event before destroying the fence context, otherwise event
* callback continue to get called.
Expand Down
8 changes: 7 additions & 1 deletion kernel-open/nvidia-drm/nvidia-drm-gem-dma-buf.c
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,13 @@ void __nv_drm_gem_dma_buf_free(struct nv_drm_gem_object *nv_gem)
struct nv_drm_device *nv_dev = nv_gem->nv_dev;
struct nv_drm_gem_dma_buf *nv_dma_buf = to_nv_dma_buf(nv_gem);

if (nv_dma_buf->base.pMemory) {
/*
* Only call nvKms->freeMemory if pDevice is valid and device is not
* in surprise removal. During hot-unplug, nvidia_modeset internal state
* may be corrupted before this destructor runs from delayed_fput.
*/
if (nv_dma_buf->base.pMemory && nv_dev->pDevice != NULL &&
!nv_dev->inSurpriseRemoval) {
/* Free NvKmsKapiMemory handle associated with this gem object */
nvKms->freeMemory(nv_dev->pDevice, nv_dma_buf->base.pMemory);
}
Expand Down
11 changes: 11 additions & 0 deletions kernel-open/nvidia-drm/nvidia-drm-gem-nvkms-memory.c
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,17 @@ static void __nv_drm_gem_nvkms_memory_free(struct nv_drm_gem_object *nv_gem)
struct nv_drm_gem_nvkms_memory *nv_nvkms_memory =
to_nv_nvkms_memory(nv_gem);

/*
* Skip nvKms calls if pDevice is NULL or inSurpriseRemoval is set.
* During hot-unplug, the nvidia_modeset internal state (semaphores,
* memory handles) may be corrupted or freed before this destructor
* runs from delayed_fput. The memory resources are gone with the GPU.
*/
if (nv_dev->pDevice == NULL || nv_dev->inSurpriseRemoval) {
nv_drm_free(nv_nvkms_memory);
return;
}

if (nv_nvkms_memory->physically_mapped) {
if (nv_nvkms_memory->pWriteCombinedIORemapAddress != NULL) {
iounmap(nv_nvkms_memory->pWriteCombinedIORemapAddress);
Expand Down
15 changes: 15 additions & 0 deletions kernel-open/nvidia-drm/nvidia-drm-priv.h
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,21 @@ struct nv_drm_device {
NvBool subOwnershipGranted;
NvBool hasFramebufferConsole;

/*
* Set to NV_TRUE for external GPUs (e.g., Thunderbolt/USB4 eGPU).
* External GPUs use the fast removal path to avoid hangs during
* both surprise removal and "safe" software-initiated disconnect.
*/
NvBool isExternalGpu;

/*
* Set to NV_TRUE when the device is being removed due to
* surprise removal (e.g., Thunderbolt eGPU hot-unplug).
* When set, NVKMS operations that would access GPU hardware
* are skipped to prevent crashes from accessing unmapped memory.
*/
NvBool inSurpriseRemoval;

struct drm_property *nv_out_fence_property;
struct drm_property *nv_input_colorspace_property;

Expand Down
21 changes: 21 additions & 0 deletions kernel-open/nvidia-modeset/nvidia-modeset-linux.c
Original file line number Diff line number Diff line change
Expand Up @@ -1208,6 +1208,27 @@ void nvkms_close_gpu(NvU32 gpuId)
__rm_ops.free_stack(stack);
}

/*
 * Report a GPU as lost (surprise removal) to the NVKMS core.
 *
 * Forwards gpuId unchanged to nvKmsGpuLost() and then issues a write
 * memory barrier. No lock is taken and nothing is returned.
 *
 * gpuId: identifier of the GPU that is no longer reachable.
 */
void nvkms_gpu_lost(NvU32 gpuId)
{
/*
 * Mark the GPU as lost in NVKMS. Per the declaration of this function,
 * the intent is to prevent further hardware access and to cancel pending
 * timers that might touch the removed GPU; the implementation of
 * nvKmsGpuLost() is not visible here.
 *
 * NOTE: nvkms_lock is intentionally NOT taken here. The stated reason is
 * that this function may be called from contexts that already hold the
 * lock (e.g., during module unload). The design relies on gpuLost being a
 * simple boolean flag, so that a racing operation will either:
 * 1. See gpuLost=TRUE and bail out early
 * 2. See gpuLost=FALSE but hit the 0xFFFFFFFF check when reading hardware
 *
 * NOTE(review): both the "already holds the lock" callers and the
 * 0xFFFFFFFF fallback live outside this function - confirm they exist on
 * every path that can race with this call.
 */
nvKmsGpuLost(gpuId);

/*
 * Order the stores performed by nvKmsGpuLost() before any stores this CPU
 * issues afterwards. smp_wmb() is an ordering primitive only: it does not
 * make the gpuLost write visible to other CPUs any sooner, and the
 * ordering is only observable by readers that use a matching read
 * barrier. NOTE(review): confirm the gpuLost readers pair with this.
 */
smp_wmb();
}

NvU32 nvkms_enumerate_gpus(nv_gpu_info_t *gpu_info)
{
return __rm_ops.enumerate_gpus(gpu_info);
Expand Down
6 changes: 6 additions & 0 deletions kernel-open/nvidia-modeset/nvidia-modeset-os-interface.h
Original file line number Diff line number Diff line change
Expand Up @@ -310,6 +310,12 @@ void* nvkms_get_per_open_data(int fd);
NvBool nvkms_open_gpu(NvU32 gpuId);
void nvkms_close_gpu(NvU32 gpuId);

/*!
* Mark a GPU as lost (surprise removal, e.g., Thunderbolt eGPU unplug).
* This prevents hardware access and cancels pending timers.
*/
void nvkms_gpu_lost(NvU32 gpuId);


/*!
* Enumerate nvidia gpus.
Expand Down
2 changes: 2 additions & 0 deletions kernel-open/nvidia-modeset/nvkms.h
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,8 @@ void nvKmsModuleUnload(void);
void nvKmsSuspend(NvU32 gpuId);
void nvKmsResume(NvU32 gpuId);

void nvKmsGpuLost(NvU32 gpuId);

void nvKmsGetProcFiles(const nvkms_procfs_file_t **ppProcFiles);

NvBool nvKmsReadConf(const char *buff, size_t size,
Expand Down
20 changes: 15 additions & 5 deletions kernel-open/nvidia-uvm/uvm_channel.c
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@
#include "uvm_common.h"
#include "uvm_global.h"
#include "uvm_hal.h"
#include "uvm_gpu.h"
#include "uvm_gpu_isr.h"
#include "uvm_procfs.h"
#include "uvm_push.h"
#include "uvm_gpu_semaphore.h"
Expand Down Expand Up @@ -2310,10 +2312,14 @@ static void channel_destroy(uvm_channel_pool_t *pool, uvm_channel_t *channel)
free_conf_computing_buffers(channel);
}

if (uvm_channel_is_proxy(channel))
uvm_rm_locked_call_void(nvUvmInterfacePagingChannelDestroy(channel->proxy.handle));
else
uvm_rm_locked_call_void(nvUvmInterfaceChannelDestroy(channel->handle));
// Skip RM calls if GPU has been surprise removed. Calling RM with stale
// handles will result in NV_ERR_INVALID_OBJECT_HANDLE errors.
if (uvm_parent_gpu_is_accessible(pool->manager->gpu->parent)) {
if (uvm_channel_is_proxy(channel))
uvm_rm_locked_call_void(nvUvmInterfacePagingChannelDestroy(channel->proxy.handle));
else
uvm_rm_locked_call_void(nvUvmInterfaceChannelDestroy(channel->handle));
}

uvm_gpu_tracking_semaphore_free(&channel->tracking_sem);

Expand Down Expand Up @@ -2657,7 +2663,11 @@ static void tsg_destroy(uvm_channel_pool_t *pool, uvmGpuTsgHandle tsg_handle)
{
UVM_ASSERT(pool->num_tsgs > 0);

uvm_rm_locked_call_void(nvUvmInterfaceTsgDestroy(tsg_handle));
// Skip RM call if GPU has been surprise removed. Calling RM with stale
// handles will result in NV_ERR_INVALID_OBJECT_HANDLE errors.
if (uvm_parent_gpu_is_accessible(pool->manager->gpu->parent))
uvm_rm_locked_call_void(nvUvmInterfaceTsgDestroy(tsg_handle));

pool->num_tsgs--;
}

Expand Down
30 changes: 21 additions & 9 deletions kernel-open/nvidia-uvm/uvm_gpu.c
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@
#include "uvm_linux.h"
#include "uvm_mmu.h"
#include "uvm_kvmalloc.h"
#include "uvm_gpu_isr.h"

#define UVM_PROC_GPUS_PEER_DIR_NAME "peers"

Expand Down Expand Up @@ -1362,7 +1363,8 @@ static NV_STATUS configure_address_space(uvm_gpu_t *gpu)

static void deconfigure_address_space(uvm_gpu_t *gpu)
{
if (gpu->rm_address_space_moved_to_page_tree)
// Skip RM call if GPU is not accessible (e.g., hot-unplugged).
if (gpu->rm_address_space_moved_to_page_tree && uvm_parent_gpu_is_accessible(gpu->parent))
uvm_rm_locked_call_void(nvUvmInterfaceUnsetPageDirectory(gpu->rm_address_space));

if (gpu->address_space_tree.root)
Expand Down Expand Up @@ -1780,6 +1782,10 @@ static void remove_gpu_from_parent_gpu(uvm_gpu_t *gpu)

static void deinit_parent_gpu(uvm_parent_gpu_t *parent_gpu)
{
// Check GPU accessibility before pci_dev is cleared.
// If the GPU was hot-unplugged, skip RM calls that would crash.
bool gpu_accessible = uvm_parent_gpu_is_accessible(parent_gpu);

// All channels should have been removed before the retained count went to 0
UVM_ASSERT(uvm_rb_tree_empty(&parent_gpu->instance_ptr_table));
UVM_ASSERT(uvm_rb_tree_empty(&parent_gpu->tsg_table));
Expand All @@ -1805,7 +1811,9 @@ static void deinit_parent_gpu(uvm_parent_gpu_t *parent_gpu)
if (parent_gpu->rm_info.isSimulated)
--g_uvm_global.num_simulated_devices;

if (parent_gpu->rm_device != 0)
// Skip RM call if GPU was not accessible (e.g., hot-unplugged).
// The nvidia module's internal state is corrupted when the GPU is gone.
if (parent_gpu->rm_device != 0 && gpu_accessible)
uvm_rm_locked_call_void(nvUvmInterfaceDeviceDestroy(parent_gpu->rm_device));

uvm_parent_gpu_kref_put(parent_gpu);
Expand Down Expand Up @@ -1848,16 +1856,20 @@ static void deinit_gpu(uvm_gpu_t *gpu)

uvm_pmm_gpu_deinit(&gpu->pmm);

if (gpu->rm_address_space != 0)
uvm_rm_locked_call_void(nvUvmInterfaceAddressSpaceDestroy(gpu->rm_address_space));

deinit_procfs_dirs(gpu);
// Skip RM calls if GPU is not accessible (e.g., hot-unplugged).
// The nvidia module's internal state is corrupted when the GPU is gone.
if (uvm_parent_gpu_is_accessible(gpu->parent)) {
if (gpu->rm_address_space != 0)
uvm_rm_locked_call_void(nvUvmInterfaceAddressSpaceDestroy(gpu->rm_address_space));

if (gpu->parent->smc.enabled) {
if (gpu->smc.rm_device != 0)
uvm_rm_locked_call_void(nvUvmInterfaceDeviceDestroy(gpu->smc.rm_device));
if (gpu->parent->smc.enabled) {
if (gpu->smc.rm_device != 0)
uvm_rm_locked_call_void(nvUvmInterfaceDeviceDestroy(gpu->smc.rm_device));
}
}

deinit_procfs_dirs(gpu);

gpu->magic = 0;
}

Expand Down
Loading