diff --git a/kernel-open/common/inc/nvkms-kapi.h b/kernel-open/common/inc/nvkms-kapi.h index 4a65977031..4f9c421b3f 100644 --- a/kernel-open/common/inc/nvkms-kapi.h +++ b/kernel-open/common/inc/nvkms-kapi.h @@ -656,6 +656,17 @@ struct NvKmsKapiFunctionsTable { */ void (*freeDevice)(struct NvKmsKapiDevice *device); + /*! + * Frees a device during surprise removal (e.g., Thunderbolt eGPU unplug). + * This skips all hardware access and only releases kernel resources. + * Use this instead of freeDevice() when the GPU hardware is no longer + * accessible to avoid page faults and hangs. + * + * \param [in] device A device returned by allocateDevice(). + * This function is a no-op if device is not valid. + */ + void (*freeDeviceForSurpriseRemoval)(struct NvKmsKapiDevice *device); + /*! * Grab ownership of device, ownership is required to do modeset. * diff --git a/kernel-open/nvidia-drm/nvidia-drm-drv.c b/kernel-open/nvidia-drm/nvidia-drm-drv.c index e9ef77c948..3b3e39fec8 100644 --- a/kernel-open/nvidia-drm/nvidia-drm-drv.c +++ b/kernel-open/nvidia-drm/nvidia-drm-drv.c @@ -852,6 +852,46 @@ static void nv_drm_dev_unload(struct drm_device *dev) return; } + /* + * During surprise removal (e.g., Thunderbolt eGPU hot-unplug), + * the GPU hardware is no longer accessible. Skip NVKMS calls that + * would access hardware to prevent page faults and crashes. + * Use freeDeviceForSurpriseRemoval which only releases kernel resources + * without attempting any hardware access. 
+ */ + if (nv_dev->inSurpriseRemoval) { + NV_DRM_DEV_LOG_INFO(nv_dev, + "Surprise removal detected, skipping hardware access"); + + /* Wake up any processes waiting on flip events */ + wake_up_all(&nv_dev->flip_event_wq); + + cancel_delayed_work_sync(&nv_dev->hotplug_event_work); + mutex_lock(&nv_dev->lock); + + atomic_set(&nv_dev->enable_event_handling, false); + drm_kms_helper_poll_fini(dev); + drm_mode_config_cleanup(dev); + + pDevice = nv_dev->pDevice; + nv_dev->pDevice = NULL; + + mutex_unlock(&nv_dev->lock); + + /* + * Use freeDeviceForSurpriseRemoval instead of freeDevice. + * This skips KmsFreeDevice() and RmFreeDevice() which would try + * to access GPU hardware via ioctls/RM API calls and cause + * page faults since the GPU memory is unmapped. + * It only calls nvkms_close_gpu() to release the GPU reference + * count, allowing the eGPU to be re-initialized when reconnected. + */ + if (pDevice != NULL) { + nvKms->freeDeviceForSurpriseRemoval(pDevice); + } + return; + } + /* Release modeset ownership if fbdev is enabled */ #if defined(NV_DRM_FBDEV_AVAILABLE) @@ -2167,6 +2207,28 @@ static void nv_drm_dev_destroy(struct nv_drm_device *nv_dev) nv_drm_free(nv_dev); } +/* + * Helper to get PCI device from DRM device, handling both old and new kernels. + * Returns NULL if not a PCI device or device not available. + */ +static struct pci_dev *nv_drm_get_pci_dev(struct drm_device *dev) +{ + if (dev == NULL) { + return NULL; + } + +#if defined(NV_DRM_DEVICE_HAS_PDEV) + return dev->pdev; +#else + /* On newer kernels (5.14+), drm_device.pdev was removed. + * Get PCI device from the parent device. */ + if (dev->dev != NULL && dev->dev->bus == &pci_bus_type) { + return to_pci_dev(dev->dev); + } + return NULL; +#endif +} + /* * Unregister a single NVIDIA DRM device. 
*/ @@ -2175,7 +2237,29 @@ void nv_drm_remove(NvU32 gpuId) struct nv_drm_device *nv_dev = nv_drm_find_and_remove_device(gpuId); if (nv_dev) { + struct pci_dev *pdev; + NV_DRM_DEV_LOG_INFO(nv_dev, "Removing device"); + + /* + * Check if this is a surprise removal (hot-unplug) by testing + * if the PCI channel is offline. This happens when: + * - Thunderbolt eGPU is physically disconnected + * - GPU falls off the bus unexpectedly + * + * For normal driver unload (rmmod), the PCI channel remains online. + * We only skip NVKMS hardware access during surprise removal. + */ + pdev = nv_drm_get_pci_dev(nv_dev->dev); + if (pdev != NULL && pci_channel_offline(pdev)) { + NV_DRM_DEV_LOG_INFO(nv_dev, + "PCI channel offline - surprise removal detected"); + nv_dev->inSurpriseRemoval = NV_TRUE; + + /* Wake up any processes waiting on flip events */ + wake_up_all(&nv_dev->flip_event_wq); + } + drm_dev_unplug(nv_dev->dev); nv_drm_dev_destroy(nv_dev); } diff --git a/kernel-open/nvidia-drm/nvidia-drm-fb.c b/kernel-open/nvidia-drm/nvidia-drm-fb.c index 8c0a45757a..353fc70272 100644 --- a/kernel-open/nvidia-drm/nvidia-drm-fb.c +++ b/kernel-open/nvidia-drm/nvidia-drm-fb.c @@ -61,9 +61,15 @@ static void nv_drm_framebuffer_destroy(struct drm_framebuffer *fb) drm_framebuffer_cleanup(fb); - /* Free NvKmsKapiSurface associated with this framebuffer object */ - - nvKms->destroySurface(nv_dev->pDevice, nv_fb->pSurface); + /* + * Only call nvKms->destroySurface if pDevice is valid and device is not + * in surprise removal. During hot-unplug, nvidia_modeset internal state + * may be corrupted before this destructor runs from delayed_fput. 
+ */ + if (nv_dev->pDevice != NULL && !nv_dev->inSurpriseRemoval) { + /* Free NvKmsKapiSurface associated with this framebuffer object */ + nvKms->destroySurface(nv_dev->pDevice, nv_fb->pSurface); + } __nv_drm_framebuffer_free(nv_fb); } diff --git a/kernel-open/nvidia-drm/nvidia-drm-fence.c b/kernel-open/nvidia-drm/nvidia-drm-fence.c index 7af1ed7f13..d8fcf6edfb 100644 --- a/kernel-open/nvidia-drm/nvidia-drm-fence.c +++ b/kernel-open/nvidia-drm/nvidia-drm-fence.c @@ -212,6 +212,19 @@ static void __nv_drm_prime_fence_context_destroy( struct nv_drm_prime_fence_context *nv_prime_fence_context = to_nv_prime_fence_context(nv_fence_context); + /* + * Skip nvKms calls if device is being surprise-removed. + * The nvidia_modeset internal state may be corrupted. + */ + if (nv_dev->pDevice == NULL || nv_dev->inSurpriseRemoval) { + /* Force signal pending fences and free */ + spin_lock(&nv_prime_fence_context->lock); + nv_drm_gem_prime_force_fence_signal(nv_prime_fence_context); + spin_unlock(&nv_prime_fence_context->lock); + nv_drm_free(nv_fence_context); + return; + } + /* * Free channel event before destroying the fence context, otherwise event * callback continue to get called. diff --git a/kernel-open/nvidia-drm/nvidia-drm-gem-dma-buf.c b/kernel-open/nvidia-drm/nvidia-drm-gem-dma-buf.c index 163a8ecf63..fdddfd98f9 100644 --- a/kernel-open/nvidia-drm/nvidia-drm-gem-dma-buf.c +++ b/kernel-open/nvidia-drm/nvidia-drm-gem-dma-buf.c @@ -43,7 +43,13 @@ void __nv_drm_gem_dma_buf_free(struct nv_drm_gem_object *nv_gem) struct nv_drm_device *nv_dev = nv_gem->nv_dev; struct nv_drm_gem_dma_buf *nv_dma_buf = to_nv_dma_buf(nv_gem); - if (nv_dma_buf->base.pMemory) { + /* + * Only call nvKms->freeMemory if pDevice is valid and device is not + * in surprise removal. During hot-unplug, nvidia_modeset internal state + * may be corrupted before this destructor runs from delayed_fput. 
+ */ + if (nv_dma_buf->base.pMemory && nv_dev->pDevice != NULL && + !nv_dev->inSurpriseRemoval) { /* Free NvKmsKapiMemory handle associated with this gem object */ nvKms->freeMemory(nv_dev->pDevice, nv_dma_buf->base.pMemory); } diff --git a/kernel-open/nvidia-drm/nvidia-drm-gem-nvkms-memory.c b/kernel-open/nvidia-drm/nvidia-drm-gem-nvkms-memory.c index 6b92c7532a..da2e8cbe5f 100644 --- a/kernel-open/nvidia-drm/nvidia-drm-gem-nvkms-memory.c +++ b/kernel-open/nvidia-drm/nvidia-drm-gem-nvkms-memory.c @@ -42,6 +42,17 @@ static void __nv_drm_gem_nvkms_memory_free(struct nv_drm_gem_object *nv_gem) struct nv_drm_gem_nvkms_memory *nv_nvkms_memory = to_nv_nvkms_memory(nv_gem); + /* + * Skip nvKms calls if pDevice is NULL or inSurpriseRemoval is set. + * During hot-unplug, the nvidia_modeset internal state (semaphores, + * memory handles) may be corrupted or freed before this destructor + * runs from delayed_fput. The memory resources are gone with the GPU. + */ + if (nv_dev->pDevice == NULL || nv_dev->inSurpriseRemoval) { + nv_drm_free(nv_nvkms_memory); + return; + } + if (nv_nvkms_memory->physically_mapped) { if (nv_nvkms_memory->pWriteCombinedIORemapAddress != NULL) { iounmap(nv_nvkms_memory->pWriteCombinedIORemapAddress); diff --git a/kernel-open/nvidia-drm/nvidia-drm-priv.h b/kernel-open/nvidia-drm/nvidia-drm-priv.h index 88c74a069d..7b44e326df 100644 --- a/kernel-open/nvidia-drm/nvidia-drm-priv.h +++ b/kernel-open/nvidia-drm/nvidia-drm-priv.h @@ -148,6 +148,21 @@ struct nv_drm_device { NvBool subOwnershipGranted; NvBool hasFramebufferConsole; + /* + * Set to NV_TRUE for external GPUs (e.g., Thunderbolt/USB4 eGPU). + * External GPUs use the fast removal path to avoid hangs during + * both surprise removal and "safe" software-initiated disconnect. + */ + NvBool isExternalGpu; + + /* + * Set to NV_TRUE when the device is being removed due to + * surprise removal (e.g., Thunderbolt eGPU hot-unplug). 
+ * When set, NVKMS operations that would access GPU hardware + * are skipped to prevent crashes from accessing unmapped memory. + */ + NvBool inSurpriseRemoval; + struct drm_property *nv_out_fence_property; struct drm_property *nv_input_colorspace_property; diff --git a/kernel-open/nvidia-modeset/nvidia-modeset-linux.c b/kernel-open/nvidia-modeset/nvidia-modeset-linux.c index e81ac023ad..88b6be31d7 100644 --- a/kernel-open/nvidia-modeset/nvidia-modeset-linux.c +++ b/kernel-open/nvidia-modeset/nvidia-modeset-linux.c @@ -1208,6 +1208,27 @@ void nvkms_close_gpu(NvU32 gpuId) __rm_ops.free_stack(stack); } +void nvkms_gpu_lost(NvU32 gpuId) +{ + /* + * Mark the GPU as lost in NVKMS. This prevents hardware access + * and cancels pending timers that might try to access the removed GPU. + * + * NOTE: We intentionally do NOT take nvkms_lock here because this function + * may be called from contexts that already hold the lock (e.g., during + * module unload). The gpuLost flag is a simple boolean that can be safely + * written without a lock - any racing operation will either: + * 1. See gpuLost=TRUE and bail out early + * 2. See gpuLost=FALSE but hit the 0xFFFFFFFF check when reading hardware + * + * A memory barrier ensures the write is visible to other CPUs promptly. + */ + nvKmsGpuLost(gpuId); + + /* Ensure gpuLost write is visible to other CPUs */ + smp_wmb(); +} + NvU32 nvkms_enumerate_gpus(nv_gpu_info_t *gpu_info) { return __rm_ops.enumerate_gpus(gpu_info); diff --git a/kernel-open/nvidia-modeset/nvidia-modeset-os-interface.h b/kernel-open/nvidia-modeset/nvidia-modeset-os-interface.h index d4d656e766..8c7294e6c2 100644 --- a/kernel-open/nvidia-modeset/nvidia-modeset-os-interface.h +++ b/kernel-open/nvidia-modeset/nvidia-modeset-os-interface.h @@ -310,6 +310,12 @@ void* nvkms_get_per_open_data(int fd); NvBool nvkms_open_gpu(NvU32 gpuId); void nvkms_close_gpu(NvU32 gpuId); +/*! + * Mark a GPU as lost (surprise removal, e.g., Thunderbolt eGPU unplug). 
+ * This prevents hardware access and cancels pending timers. + */ +void nvkms_gpu_lost(NvU32 gpuId); + /*! * Enumerate nvidia gpus. diff --git a/kernel-open/nvidia-modeset/nvkms.h b/kernel-open/nvidia-modeset/nvkms.h index d350ef7564..668fa8c27b 100644 --- a/kernel-open/nvidia-modeset/nvkms.h +++ b/kernel-open/nvidia-modeset/nvkms.h @@ -88,6 +88,8 @@ void nvKmsModuleUnload(void); void nvKmsSuspend(NvU32 gpuId); void nvKmsResume(NvU32 gpuId); +void nvKmsGpuLost(NvU32 gpuId); + void nvKmsGetProcFiles(const nvkms_procfs_file_t **ppProcFiles); NvBool nvKmsReadConf(const char *buff, size_t size, diff --git a/kernel-open/nvidia-uvm/uvm_channel.c b/kernel-open/nvidia-uvm/uvm_channel.c index 93eed89f01..e7e71e0556 100644 --- a/kernel-open/nvidia-uvm/uvm_channel.c +++ b/kernel-open/nvidia-uvm/uvm_channel.c @@ -27,6 +27,8 @@ #include "uvm_common.h" #include "uvm_global.h" #include "uvm_hal.h" +#include "uvm_gpu.h" +#include "uvm_gpu_isr.h" #include "uvm_procfs.h" #include "uvm_push.h" #include "uvm_gpu_semaphore.h" @@ -2310,10 +2312,14 @@ static void channel_destroy(uvm_channel_pool_t *pool, uvm_channel_t *channel) free_conf_computing_buffers(channel); } - if (uvm_channel_is_proxy(channel)) - uvm_rm_locked_call_void(nvUvmInterfacePagingChannelDestroy(channel->proxy.handle)); - else - uvm_rm_locked_call_void(nvUvmInterfaceChannelDestroy(channel->handle)); + // Skip RM calls if GPU has been surprise removed. Calling RM with stale + // handles will result in NV_ERR_INVALID_OBJECT_HANDLE errors. 
+ if (uvm_parent_gpu_is_accessible(pool->manager->gpu->parent)) { + if (uvm_channel_is_proxy(channel)) + uvm_rm_locked_call_void(nvUvmInterfacePagingChannelDestroy(channel->proxy.handle)); + else + uvm_rm_locked_call_void(nvUvmInterfaceChannelDestroy(channel->handle)); + } uvm_gpu_tracking_semaphore_free(&channel->tracking_sem); @@ -2657,7 +2663,11 @@ static void tsg_destroy(uvm_channel_pool_t *pool, uvmGpuTsgHandle tsg_handle) { UVM_ASSERT(pool->num_tsgs > 0); - uvm_rm_locked_call_void(nvUvmInterfaceTsgDestroy(tsg_handle)); + // Skip RM call if GPU has been surprise removed. Calling RM with stale + // handles will result in NV_ERR_INVALID_OBJECT_HANDLE errors. + if (uvm_parent_gpu_is_accessible(pool->manager->gpu->parent)) + uvm_rm_locked_call_void(nvUvmInterfaceTsgDestroy(tsg_handle)); + pool->num_tsgs--; } diff --git a/kernel-open/nvidia-uvm/uvm_gpu.c b/kernel-open/nvidia-uvm/uvm_gpu.c index 265a2c46c0..6f6a5ffee0 100644 --- a/kernel-open/nvidia-uvm/uvm_gpu.c +++ b/kernel-open/nvidia-uvm/uvm_gpu.c @@ -45,6 +45,7 @@ #include "uvm_linux.h" #include "uvm_mmu.h" #include "uvm_kvmalloc.h" +#include "uvm_gpu_isr.h" #define UVM_PROC_GPUS_PEER_DIR_NAME "peers" @@ -1362,7 +1363,8 @@ static NV_STATUS configure_address_space(uvm_gpu_t *gpu) static void deconfigure_address_space(uvm_gpu_t *gpu) { - if (gpu->rm_address_space_moved_to_page_tree) + // Skip RM call if GPU is not accessible (e.g., hot-unplugged). + if (gpu->rm_address_space_moved_to_page_tree && uvm_parent_gpu_is_accessible(gpu->parent)) uvm_rm_locked_call_void(nvUvmInterfaceUnsetPageDirectory(gpu->rm_address_space)); if (gpu->address_space_tree.root) @@ -1780,6 +1782,10 @@ static void remove_gpu_from_parent_gpu(uvm_gpu_t *gpu) static void deinit_parent_gpu(uvm_parent_gpu_t *parent_gpu) { + // Check GPU accessibility before pci_dev is cleared. + // If the GPU was hot-unplugged, skip RM calls that would crash. 
+ bool gpu_accessible = uvm_parent_gpu_is_accessible(parent_gpu); + // All channels should have been removed before the retained count went to 0 UVM_ASSERT(uvm_rb_tree_empty(&parent_gpu->instance_ptr_table)); UVM_ASSERT(uvm_rb_tree_empty(&parent_gpu->tsg_table)); @@ -1805,7 +1811,9 @@ static void deinit_parent_gpu(uvm_parent_gpu_t *parent_gpu) if (parent_gpu->rm_info.isSimulated) --g_uvm_global.num_simulated_devices; - if (parent_gpu->rm_device != 0) + // Skip RM call if GPU was not accessible (e.g., hot-unplugged). + // The nvidia module's internal state is corrupted when the GPU is gone. + if (parent_gpu->rm_device != 0 && gpu_accessible) uvm_rm_locked_call_void(nvUvmInterfaceDeviceDestroy(parent_gpu->rm_device)); uvm_parent_gpu_kref_put(parent_gpu); @@ -1848,16 +1856,20 @@ static void deinit_gpu(uvm_gpu_t *gpu) uvm_pmm_gpu_deinit(&gpu->pmm); - if (gpu->rm_address_space != 0) - uvm_rm_locked_call_void(nvUvmInterfaceAddressSpaceDestroy(gpu->rm_address_space)); - - deinit_procfs_dirs(gpu); + // Skip RM calls if GPU is not accessible (e.g., hot-unplugged). + // The nvidia module's internal state is corrupted when the GPU is gone. 
+ if (uvm_parent_gpu_is_accessible(gpu->parent)) { + if (gpu->rm_address_space != 0) + uvm_rm_locked_call_void(nvUvmInterfaceAddressSpaceDestroy(gpu->rm_address_space)); - if (gpu->parent->smc.enabled) { - if (gpu->smc.rm_device != 0) - uvm_rm_locked_call_void(nvUvmInterfaceDeviceDestroy(gpu->smc.rm_device)); + if (gpu->parent->smc.enabled) { + if (gpu->smc.rm_device != 0) + uvm_rm_locked_call_void(nvUvmInterfaceDeviceDestroy(gpu->smc.rm_device)); + } } + deinit_procfs_dirs(gpu); + gpu->magic = 0; } diff --git a/kernel-open/nvidia-uvm/uvm_gpu_access_counters.c b/kernel-open/nvidia-uvm/uvm_gpu_access_counters.c index a906cb8c77..f4469cee99 100644 --- a/kernel-open/nvidia-uvm/uvm_gpu_access_counters.c +++ b/kernel-open/nvidia-uvm/uvm_gpu_access_counters.c @@ -25,6 +25,7 @@ #include "uvm_gpu_access_counters.h" #include "uvm_global.h" #include "uvm_api.h" +#include "uvm_gpu_isr.h" #include "uvm_gpu.h" #include "uvm_hal.h" #include "uvm_kvmalloc.h" @@ -505,11 +506,15 @@ void uvm_parent_gpu_deinit_access_counters(uvm_parent_gpu_t *parent_gpu, NvU32 n } if (access_counters && access_counters->rm_info.accessCntrBufferHandle) { - NV_STATUS status = uvm_rm_locked_call(nvUvmInterfaceDestroyAccessCntrInfo(parent_gpu->rm_device, - &access_counters->rm_info)); uvm_access_counter_service_batch_context_t *batch_context = &access_counters->batch_service_context; - UVM_ASSERT(status == NV_OK); + // Skip RM call if GPU is not accessible (e.g., hot-unplugged). + // The nvidia module's internal state is corrupted when the GPU is gone. 
+ if (uvm_parent_gpu_is_accessible(parent_gpu)) { + NV_STATUS status = uvm_rm_locked_call(nvUvmInterfaceDestroyAccessCntrInfo(parent_gpu->rm_device, + &access_counters->rm_info)); + UVM_ASSERT(status == NV_OK); + } access_counters->rm_info.accessCntrBufferHandle = 0; uvm_kvfree(batch_context->notification_cache); @@ -593,9 +598,12 @@ static void access_counters_yield_ownership(uvm_parent_gpu_t *parent_gpu, NvU32 if (status != NV_OK) UVM_ASSERT(status == uvm_global_get_status()); - status = uvm_rm_locked_call(nvUvmInterfaceDisableAccessCntr(parent_gpu->rm_device, - &access_counters->rm_info)); - UVM_ASSERT(status == NV_OK); + // Skip RM call if GPU is not accessible (e.g., hot-unplugged). + if (uvm_parent_gpu_is_accessible(parent_gpu)) { + status = uvm_rm_locked_call(nvUvmInterfaceDisableAccessCntr(parent_gpu->rm_device, + &access_counters->rm_info)); + UVM_ASSERT(status == NV_OK); + } } // Increment the refcount of access counter enablement. If this is the first @@ -1766,6 +1774,11 @@ void uvm_service_access_counters(uvm_access_counter_buffer_t *access_counters) { NV_STATUS status = NV_OK; uvm_access_counter_service_batch_context_t *batch_context; + uvm_parent_gpu_t *parent_gpu = access_counters->parent_gpu; + + // Check if GPU is still accessible (e.g., not hot-unplugged) + if (!uvm_parent_gpu_is_accessible(parent_gpu)) + return; batch_context = &access_counters->batch_service_context; diff --git a/kernel-open/nvidia-uvm/uvm_gpu_isr.c index b2834b58c0..dc4e31cbfd 100644 --- a/kernel-open/nvidia-uvm/uvm_gpu_isr.c +++ b/kernel-open/nvidia-uvm/uvm_gpu_isr.c @@ -29,6 +29,7 @@ #include "uvm_gpu_access_counters.h" #include "uvm_gpu_non_replayable_faults.h" #include "uvm_thread_context.h" +#include <linux/pci.h> // Level-based vs pulse-based interrupts // ===================================== static void non_replayable_faults_isr_bottom_half_entry(void *args); // half, only. 
static void access_counters_isr_bottom_half_entry(void *args); +// Check if GPU hardware is accessible (not hot-unplugged). +// This must be called before any HAL function that accesses GPU registers. +bool uvm_parent_gpu_is_accessible(uvm_parent_gpu_t *parent_gpu) +{ + // If pci_dev is NULL, the GPU has been unregistered + if (parent_gpu->pci_dev == NULL) + return false; + + // Check if PCI channel is offline (surprise removal/hot-unplug) + if (pci_channel_offline(parent_gpu->pci_dev)) + return false; + + return true; +} + // Increments the reference count tracking whether replayable page fault // interrupts should be disabled. The caller is guaranteed that replayable page // faults are disabled upon return. Interrupts might already be disabled prior @@ -881,7 +897,9 @@ static void uvm_parent_gpu_replayable_faults_intr_disable(uvm_parent_gpu_t *pare { uvm_assert_spinlock_locked(&parent_gpu->isr.interrupts_lock); - if (parent_gpu->isr.replayable_faults.handling && parent_gpu->isr.replayable_faults.disable_intr_ref_count == 0) + if (parent_gpu->isr.replayable_faults.handling && + parent_gpu->isr.replayable_faults.disable_intr_ref_count == 0 && + uvm_parent_gpu_is_accessible(parent_gpu)) parent_gpu->fault_buffer_hal->disable_replayable_faults(parent_gpu); ++parent_gpu->isr.replayable_faults.disable_intr_ref_count; @@ -893,7 +911,9 @@ static void uvm_parent_gpu_replayable_faults_intr_enable(uvm_parent_gpu_t *paren UVM_ASSERT(parent_gpu->isr.replayable_faults.disable_intr_ref_count > 0); --parent_gpu->isr.replayable_faults.disable_intr_ref_count; - if (parent_gpu->isr.replayable_faults.handling && parent_gpu->isr.replayable_faults.disable_intr_ref_count == 0) + if (parent_gpu->isr.replayable_faults.handling && + parent_gpu->isr.replayable_faults.disable_intr_ref_count == 0 && + uvm_parent_gpu_is_accessible(parent_gpu)) parent_gpu->fault_buffer_hal->enable_replayable_faults(parent_gpu); } @@ -910,7 +930,8 @@ void 
uvm_access_counters_intr_disable(uvm_access_counter_buffer_t *access_counte // (disable_intr_ref_count > 0), so the check always returns false when the // race occurs if (parent_gpu->isr.access_counters[notif_buf_index].handling_ref_count > 0 && - parent_gpu->isr.access_counters[notif_buf_index].disable_intr_ref_count == 0) { + parent_gpu->isr.access_counters[notif_buf_index].disable_intr_ref_count == 0 && + uvm_parent_gpu_is_accessible(parent_gpu)) { parent_gpu->access_counter_buffer_hal->disable_access_counter_notifications(access_counters); } @@ -929,7 +950,8 @@ void uvm_access_counters_intr_enable(uvm_access_counter_buffer_t *access_counter --parent_gpu->isr.access_counters[notif_buf_index].disable_intr_ref_count; if (parent_gpu->isr.access_counters[notif_buf_index].handling_ref_count > 0 && - parent_gpu->isr.access_counters[notif_buf_index].disable_intr_ref_count == 0) { + parent_gpu->isr.access_counters[notif_buf_index].disable_intr_ref_count == 0 && + uvm_parent_gpu_is_accessible(parent_gpu)) { parent_gpu->access_counter_buffer_hal->enable_access_counter_notifications(access_counters); } } diff --git a/kernel-open/nvidia-uvm/uvm_gpu_isr.h b/kernel-open/nvidia-uvm/uvm_gpu_isr.h index 2a5f22bb53..a17884d3ee 100644 --- a/kernel-open/nvidia-uvm/uvm_gpu_isr.h +++ b/kernel-open/nvidia-uvm/uvm_gpu_isr.h @@ -198,4 +198,9 @@ void uvm_access_counters_intr_enable(uvm_access_counter_buffer_t *access_counter // g_uvm_global.global_lock is held so that the returned pointer remains valid. uvm_gpu_t *uvm_parent_gpu_find_first_valid_gpu(uvm_parent_gpu_t *parent_gpu); +// Check if GPU hardware is accessible (not hot-unplugged). +// This must be called before any HAL function that accesses GPU registers. +// Returns false if pci_dev is NULL or PCI channel is offline. 
+bool uvm_parent_gpu_is_accessible(uvm_parent_gpu_t *parent_gpu); + #endif // __UVM_GPU_ISR_H__ diff --git a/kernel-open/nvidia-uvm/uvm_gpu_non_replayable_faults.c b/kernel-open/nvidia-uvm/uvm_gpu_non_replayable_faults.c index ddb32aa804..f0867689d7 100644 --- a/kernel-open/nvidia-uvm/uvm_gpu_non_replayable_faults.c +++ b/kernel-open/nvidia-uvm/uvm_gpu_non_replayable_faults.c @@ -24,6 +24,7 @@ #include "uvm_common.h" #include "uvm_api.h" #include "uvm_gpu_non_replayable_faults.h" +#include "uvm_gpu_isr.h" #include "uvm_gpu.h" #include "uvm_hal.h" #include "uvm_lock.h" @@ -778,6 +779,11 @@ void uvm_parent_gpu_service_non_replayable_fault_buffer(uvm_parent_gpu_t *parent { NvU32 cached_faults; + // Check if GPU is still accessible before servicing faults. + // After hot-unplug, accessing GPU registers would cause a crash. + if (!uvm_parent_gpu_is_accessible(parent_gpu)) + return; + // If this handler is modified to handle fewer than all of the outstanding // faults, then special handling will need to be added to uvm_suspend() // to guarantee that fault processing has completed before control is diff --git a/kernel-open/nvidia-uvm/uvm_gpu_replayable_faults.c b/kernel-open/nvidia-uvm/uvm_gpu_replayable_faults.c index 1c557cab6d..8efadc6ae1 100644 --- a/kernel-open/nvidia-uvm/uvm_gpu_replayable_faults.c +++ b/kernel-open/nvidia-uvm/uvm_gpu_replayable_faults.c @@ -27,6 +27,7 @@ #include "uvm_linux.h" #include "uvm_global.h" #include "uvm_gpu_replayable_faults.h" +#include "uvm_gpu_isr.h" #include "uvm_hal.h" #include "uvm_kvmalloc.h" #include "uvm_tools.h" @@ -305,11 +306,15 @@ void uvm_parent_gpu_fault_buffer_deinit(uvm_parent_gpu_t *parent_gpu) fault_buffer_deinit_replayable_faults(parent_gpu); if (parent_gpu->fault_buffer.rm_info.faultBufferHandle) { - status = uvm_rm_locked_call(nvUvmInterfaceOwnPageFaultIntr(parent_gpu->rm_device, NV_FALSE)); - UVM_ASSERT(status == NV_OK); - - uvm_rm_locked_call_void(nvUvmInterfaceDestroyFaultInfo(parent_gpu->rm_device, - 
&parent_gpu->fault_buffer.rm_info)); + // Skip RM calls if GPU is not accessible (e.g., hot-unplugged). + // The nvidia module's internal state is corrupted when the GPU is gone. + if (uvm_parent_gpu_is_accessible(parent_gpu)) { + status = uvm_rm_locked_call(nvUvmInterfaceOwnPageFaultIntr(parent_gpu->rm_device, NV_FALSE)); + UVM_ASSERT(status == NV_OK); + + uvm_rm_locked_call_void(nvUvmInterfaceDestroyFaultInfo(parent_gpu->rm_device, + &parent_gpu->fault_buffer.rm_info)); + } parent_gpu->fault_buffer.rm_info.faultBufferHandle = 0; } @@ -677,9 +682,21 @@ NV_STATUS uvm_gpu_fault_buffer_flush(uvm_gpu_t *gpu) UVM_ASSERT(gpu->parent->replayable_faults_supported); + // Check if GPU hardware is still accessible before attempting to flush. + // After hot-unplug, the GPU registers are no longer mapped and accessing + // them would cause a page fault crash. + if (!uvm_parent_gpu_is_accessible(gpu->parent)) + return NV_ERR_GPU_IS_LOST; + // Disables replayable fault interrupts and fault servicing uvm_parent_gpu_replayable_faults_isr_lock(gpu->parent); + // Re-check after acquiring the lock in case GPU was removed concurrently + if (!uvm_parent_gpu_is_accessible(gpu->parent)) { + uvm_parent_gpu_replayable_faults_isr_unlock(gpu->parent); + return NV_ERR_GPU_IS_LOST; + } + status = fault_buffer_flush_locked(gpu->parent, gpu, UVM_GPU_BUFFER_FLUSH_MODE_WAIT_UPDATE_PUT, @@ -2914,6 +2931,11 @@ void uvm_parent_gpu_service_replayable_faults(uvm_parent_gpu_t *parent_gpu) UVM_ASSERT(parent_gpu->replayable_faults_supported); + // Check if GPU is still accessible before servicing faults. + // After hot-unplug, accessing GPU registers would cause a crash. 
+ if (!uvm_parent_gpu_is_accessible(parent_gpu)) + return; + uvm_tracker_init(&batch_context->tracker); // Process all faults in the buffer diff --git a/kernel-open/nvidia-uvm/uvm_gpu_semaphore.c b/kernel-open/nvidia-uvm/uvm_gpu_semaphore.c index 478c1aafd9..dae09f31ce 100644 --- a/kernel-open/nvidia-uvm/uvm_gpu_semaphore.c +++ b/kernel-open/nvidia-uvm/uvm_gpu_semaphore.c @@ -27,6 +27,7 @@ #include "uvm_kvmalloc.h" #include "uvm_channel.h" // For UVM_GPU_SEMAPHORE_MAX_JUMP #include "uvm_conf_computing.h" +#include "uvm_gpu_isr.h" #define UVM_SEMAPHORE_SIZE 4 #define UVM_SEMAPHORE_PAGE_SIZE PAGE_SIZE @@ -822,10 +823,18 @@ static NvU64 update_completed_value_locked(uvm_gpu_tracking_semaphore_t *trackin NvU64 uvm_gpu_tracking_semaphore_update_completed_value(uvm_gpu_tracking_semaphore_t *tracking_semaphore) { NvU64 completed; + uvm_gpu_t *gpu = tracking_semaphore->semaphore.page->pool->gpu; // Check that the GPU which owns the semaphore is still present UVM_ASSERT(tracking_semaphore_check_gpu(tracking_semaphore)); + // If the GPU is not accessible (surprise removed), return the cached + // completed value without reading from GPU memory. Reading from GPU + // memory after surprise removal returns garbage values that cause + // assertion failures. 
+ if (!uvm_parent_gpu_is_accessible(gpu->parent)) + return atomic64_read(&tracking_semaphore->completed_value); + if (tracking_semaphore_uses_mutex(tracking_semaphore)) uvm_mutex_lock(&tracking_semaphore->m_lock); else @@ -844,10 +853,16 @@ NvU64 uvm_gpu_tracking_semaphore_update_completed_value(uvm_gpu_tracking_semapho bool uvm_gpu_tracking_semaphore_is_value_completed(uvm_gpu_tracking_semaphore_t *tracking_sem, NvU64 value) { NvU64 completed = atomic64_read(&tracking_sem->completed_value); + uvm_gpu_t *gpu = tracking_sem->semaphore.page->pool->gpu; // Check that the GPU which owns the semaphore is still present UVM_ASSERT(tracking_semaphore_check_gpu(tracking_sem)); + // If the GPU is not accessible, consider all values completed to avoid + // spinning forever waiting for a GPU that's gone. + if (!uvm_parent_gpu_is_accessible(gpu->parent)) + return true; + if (completed >= value) { // atomic64_read() doesn't imply any memory barriers and we need all // subsequent memory accesses in this thread to be ordered after the diff --git a/kernel-open/nvidia-uvm/uvm_pmm_gpu.c b/kernel-open/nvidia-uvm/uvm_pmm_gpu.c index 97ff13dcdd..ab63e3f904 100644 --- a/kernel-open/nvidia-uvm/uvm_pmm_gpu.c +++ b/kernel-open/nvidia-uvm/uvm_pmm_gpu.c @@ -166,6 +166,7 @@ #include "nv_uvm_interface.h" #include "uvm_api.h" #include "uvm_gpu.h" +#include "uvm_gpu_isr.h" #include "uvm_pmm_gpu.h" #include "uvm_mem.h" #include "uvm_mmu.h" @@ -2066,6 +2067,14 @@ void free_root_chunk(uvm_pmm_gpu_t *pmm, uvm_gpu_root_chunk_t *root_chunk, free_ if (chunk->is_zero) flags |= UVM_PMA_FREE_IS_ZERO; + // Skip PMA free if GPU is not accessible (e.g., hot-unplugged). + // Calling into the nvidia module with a gone GPU causes hangs + // due to corrupted locks. 
+ if (!uvm_parent_gpu_is_accessible(gpu->parent)) { + uvm_up_read(&pmm->pma_lock); + return; + } + nvUvmInterfacePmaFreePages(pmm->pma, &chunk->address, 1, UVM_CHUNK_SIZE_MAX, flags); uvm_up_read(&pmm->pma_lock); @@ -3360,7 +3369,9 @@ void uvm_pmm_gpu_device_p2p_init(uvm_parent_gpu_t *parent_gpu) void uvm_pmm_gpu_device_p2p_deinit(uvm_parent_gpu_t *parent_gpu) { - if (parent_gpu->device_p2p_initialised && !uvm_parent_gpu_is_coherent(parent_gpu)) { + // Check device_p2p_initialised first before accessing pci_dev. + // During partial GPU init/deinit, pci_dev may be NULL or P2P was never initialized. + if (parent_gpu->device_p2p_initialised && !uvm_parent_gpu_is_coherent(parent_gpu) && parent_gpu->pci_dev != NULL) { struct page *p2p_page = pfn_to_page(pci_resource_start(parent_gpu->pci_dev, uvm_device_p2p_static_bar(parent_gpu)) >> PAGE_SHIFT); @@ -3565,7 +3576,10 @@ void uvm_pmm_gpu_deinit(uvm_pmm_gpu_t *pmm) UVM_ASSERT(uvm_pmm_gpu_check_orphan_pages(pmm)); release_free_root_chunks(pmm); - if (gpu->mem_info.size != 0 && gpu_supports_pma_eviction(gpu)) + // Skip unregistering callbacks if GPU is not accessible (hot-unplugged). + // The nvidia module's internal state is corrupted when the GPU is gone. 
+ if (gpu->mem_info.size != 0 && gpu_supports_pma_eviction(gpu) && + uvm_parent_gpu_is_accessible(gpu->parent)) nvUvmInterfacePmaUnregisterEvictionCallbacks(pmm->pma); // TODO: Bug 1766184: Handle ECC/RC diff --git a/kernel-open/nvidia-uvm/uvm_rm_mem.c b/kernel-open/nvidia-uvm/uvm_rm_mem.c index 756080fb24..767f9e8d8a 100644 --- a/kernel-open/nvidia-uvm/uvm_rm_mem.c +++ b/kernel-open/nvidia-uvm/uvm_rm_mem.c @@ -23,6 +23,7 @@ #include "uvm_rm_mem.h" #include "uvm_gpu.h" +#include "uvm_gpu_isr.h" #include "uvm_global.h" #include "uvm_kvmalloc.h" #include "uvm_linux.h" @@ -298,8 +299,11 @@ void uvm_rm_mem_unmap_cpu(uvm_rm_mem_t *rm_mem) if (!uvm_rm_mem_mapped_on_cpu(rm_mem)) return; - uvm_rm_locked_call_void(nvUvmInterfaceMemoryCpuUnMap(rm_mem->gpu_owner->rm_address_space, - uvm_rm_mem_get_cpu_va(rm_mem))); + // Skip RM call if GPU has been surprise removed. Calling RM with stale + // handles will result in NV_ERR_INVALID_OBJECT_HANDLE errors. + if (uvm_parent_gpu_is_accessible(rm_mem->gpu_owner->parent)) + uvm_rm_locked_call_void(nvUvmInterfaceMemoryCpuUnMap(rm_mem->gpu_owner->rm_address_space, + uvm_rm_mem_get_cpu_va(rm_mem))); rm_mem_clear_cpu_va(rm_mem); } @@ -355,7 +359,12 @@ static void rm_mem_unmap_gpu(uvm_rm_mem_t *rm_mem, uvm_gpu_t *gpu) rm_mem_unmap_gpu_proxy(rm_mem, gpu); va = uvm_rm_mem_get_gpu_uvm_va(rm_mem, gpu); - uvm_rm_locked_call_void(nvUvmInterfaceMemoryFree(gpu->rm_address_space, va)); + + // Skip RM call if GPU has been surprise removed. Calling RM with stale + // handles will result in NV_ERR_INVALID_OBJECT_HANDLE errors. 
+ if (uvm_parent_gpu_is_accessible(gpu->parent)) + uvm_rm_locked_call_void(nvUvmInterfaceMemoryFree(gpu->rm_address_space, va)); + rm_mem_clear_gpu_va(rm_mem, gpu); } diff --git a/kernel-open/nvidia-uvm/uvm_user_channel.c b/kernel-open/nvidia-uvm/uvm_user_channel.c index a85a645035..634c245ab6 100644 --- a/kernel-open/nvidia-uvm/uvm_user_channel.c +++ b/kernel-open/nvidia-uvm/uvm_user_channel.c @@ -32,6 +32,7 @@ #include "uvm_kvmalloc.h" #include "uvm_api.h" #include "uvm_gpu.h" +#include "uvm_gpu_isr.h" #include "uvm_tracker.h" #include "uvm_map_external.h" #include "nv_uvm_interface.h" @@ -782,6 +783,14 @@ void uvm_user_channel_stop(uvm_user_channel_t *user_channel) // write mode. uvm_assert_rwsem_locked_read(&va_space->lock); + // Skip RM call if GPU has been surprise removed. Calling RM with stale + // client handles will result in repeated NV_ERR_INVALID_OBJECT_HANDLE + // errors during teardown. + if (!uvm_parent_gpu_is_accessible(user_channel->gpu->parent)) { + atomic_set(&user_channel->is_bound, 0); + return; + } + // TODO: Bug 1737765. This doesn't stop the user from putting the // channel back on the runlist, which could put stale instance // pointers back in the fault buffer. @@ -854,7 +863,9 @@ void uvm_user_channel_destroy_detached(uvm_user_channel_t *user_channel) uvm_kvfree(user_channel->resources); } - if (user_channel->rm_retained_channel) + // Skip RM call if GPU has been surprise removed. Calling RM with stale + // handles will result in NV_ERR_INVALID_OBJECT_HANDLE errors. 
+ if (user_channel->rm_retained_channel && uvm_parent_gpu_is_accessible(user_channel->gpu->parent)) uvm_rm_locked_call_void(nvUvmInterfaceReleaseChannel(user_channel->rm_retained_channel)); uvm_user_channel_release(user_channel); diff --git a/kernel-open/nvidia-uvm/uvm_va_space.c b/kernel-open/nvidia-uvm/uvm_va_space.c index f5ff7b46c7..6abd2af9be 100644 --- a/kernel-open/nvidia-uvm/uvm_va_space.c +++ b/kernel-open/nvidia-uvm/uvm_va_space.c @@ -32,6 +32,7 @@ #include "uvm_tools.h" #include "uvm_thread_context.h" #include "uvm_hal.h" +#include "uvm_gpu_isr.h" #include "uvm_map_external.h" #include "uvm_ats.h" #include "uvm_gpu_access_counters.h" @@ -1436,6 +1437,13 @@ void uvm_gpu_va_space_unset_page_dir(uvm_gpu_va_space_t *gpu_va_space) if (gpu_va_space->did_set_page_directory) { NV_STATUS status; + // Skip RM call if GPU has been surprise removed. Calling RM with stale + // handles will result in NV_ERR_INVALID_OBJECT_HANDLE errors. + if (!uvm_parent_gpu_is_accessible(gpu_va_space->gpu->parent)) { + gpu_va_space->did_set_page_directory = false; + return; + } + status = uvm_rm_locked_call(nvUvmInterfaceUnsetPageDirectory(gpu_va_space->duped_gpu_va_space)); UVM_ASSERT_MSG(status == NV_OK, "nvUvmInterfaceUnsetPageDirectory() failed: %s, GPU %s\n", @@ -1487,7 +1495,9 @@ static void destroy_gpu_va_space(uvm_gpu_va_space_t *gpu_va_space) if (gpu_va_space->page_tables.root) uvm_page_tree_deinit(&gpu_va_space->page_tables); - if (gpu_va_space->duped_gpu_va_space) + // Skip RM call if GPU has been surprise removed. Calling RM with stale + // handles will result in NV_ERR_INVALID_OBJECT_HANDLE errors. 
+ if (gpu_va_space->duped_gpu_va_space && uvm_parent_gpu_is_accessible(gpu_va_space->gpu->parent)) uvm_rm_locked_call_void(nvUvmInterfaceAddressSpaceDestroy(gpu_va_space->duped_gpu_va_space)); // If the state is DEAD, then this GPU VA space is tracked in diff --git a/kernel-open/nvidia/nv-acpi.c b/kernel-open/nvidia/nv-acpi.c index d0fe94a039..ee0ae99506 100644 --- a/kernel-open/nvidia/nv-acpi.c +++ b/kernel-open/nvidia/nv-acpi.c @@ -252,12 +252,28 @@ static void nv_acpi_notify_event(acpi_handle handle, u32 event_type, void *data) { nv_acpi_t *pNvAcpiObject = data; nv_state_t *nvl = pNvAcpiObject->notifier_data; + nv_state_t *nv; + + if (nvl == NULL) + return; + + nv = NV_STATE_PTR(nvl); + if (nv == NULL) + return; + + /* + * Check if we're in surprise removal before processing ACPI events. + * This can happen during Thunderbolt eGPU hot-unplug where the device + * is being removed but ACPI events are still being delivered. + */ + if (nv->flags & NV_FLAG_IN_SURPRISE_REMOVAL) + return; /* * Function to handle device specific ACPI events such as display hotplug, * GPS and D-notifier events. */ - rm_acpi_notify(pNvAcpiObject->sp, NV_STATE_PTR(nvl), event_type); + rm_acpi_notify(pNvAcpiObject->sp, nv, event_type); } void nv_acpi_register_notifier(nv_linux_state_t *nvl) diff --git a/kernel-open/nvidia/nv-i2c.c b/kernel-open/nvidia/nv-i2c.c index a0f61ed22d..4da6bfca11 100644 --- a/kernel-open/nvidia/nv-i2c.c +++ b/kernel-open/nvidia/nv-i2c.c @@ -44,6 +44,15 @@ static int nv_i2c_algo_master_xfer(struct i2c_adapter *adapter, struct i2c_msg m #endif ; + /* + * Check if the GPU is in surprise removal (e.g., Thunderbolt unplug). + * If so, return immediately to avoid hanging on RPC calls to GSP. 
+ */ + if (nv_check_gpu_state(nv) != NV_OK) + { + return -ENODEV; + } + rc = nv_kmem_cache_alloc_stack(&sp); if (rc != 0) { @@ -93,6 +102,15 @@ static int nv_i2c_algo_smbus_xfer( NV_STATUS rmStatus = NV_OK; nvidia_stack_t *sp = NULL; + /* + * Check if the GPU is in surprise removal (e.g., Thunderbolt unplug). + * If so, return immediately to avoid hanging on RPC calls to GSP. + */ + if (nv_check_gpu_state(nv) != NV_OK) + { + return -ENODEV; + } + rc = nv_kmem_cache_alloc_stack(&sp); if (rc != 0) { @@ -196,6 +214,15 @@ static u32 nv_i2c_algo_functionality(struct i2c_adapter *adapter) u32 ret = I2C_FUNC_I2C; nvidia_stack_t *sp = NULL; + /* + * Check if the GPU is in surprise removal (e.g., Thunderbolt unplug). + * If so, return 0 to indicate no functionality available. + */ + if (nv_check_gpu_state(nv) != NV_OK) + { + return 0; + } + if (nv_kmem_cache_alloc_stack(&sp) != 0) { return 0; diff --git a/kernel-open/nvidia/nv-pci.c b/kernel-open/nvidia/nv-pci.c index 2767134e8f..1d54b2b963 100644 --- a/kernel-open/nvidia/nv-pci.c +++ b/kernel-open/nvidia/nv-pci.c @@ -27,6 +27,7 @@ #include "nv-msi.h" #include "nv-hypervisor.h" #include "nv-reg.h" +#include "nv-rsync.h" #if defined(NV_VGPU_KVM_BUILD) #include "nv-vgpu-vfio-interface.h" @@ -2179,22 +2180,34 @@ nv_pci_remove(struct pci_dev *pci_dev) /* * Sanity check: A removed device shouldn't have a non-zero usage_count. * For eGPU, fall off the bus along with clients active is a valid scenario. - * Hence skipping the sanity check for eGPU. + * We still wait for a short time to allow in-progress close operations + * to complete, but with a timeout to prevent hangs. 
*/ - if ((atomic64_read(&nvl->usage_count) != 0) && !(nv->is_external_gpu)) + if (atomic64_read(&nvl->usage_count) != 0) { + /* + * For external GPU: wait up to 5 seconds (10 iterations * 500ms) + * For internal GPU: wait up to 60 seconds (120 iterations * 500ms) + * This prevents indefinite hangs while still allowing time for + * graceful cleanup of in-progress operations. + */ + int max_wait_iterations = nv->is_external_gpu ? 10 : 120; + int wait_iterations = 0; + nv_printf(NV_DBG_ERRORS, - "NVRM: Attempting to remove device %04x:%02x:%02x.%x with non-zero usage count!\n", + "NVRM: Attempting to remove device %04x:%02x:%02x.%x with non-zero usage count (%lld)%s\n", NV_PCI_DOMAIN_NUMBER(pci_dev), NV_PCI_BUS_NUMBER(pci_dev), - NV_PCI_SLOT_NUMBER(pci_dev), PCI_FUNC(pci_dev->devfn)); + NV_PCI_SLOT_NUMBER(pci_dev), PCI_FUNC(pci_dev->devfn), + atomic64_read(&nvl->usage_count), + nv->is_external_gpu ? " (external GPU)" : ""); /* * We can't return from this function without corrupting state, so we wait for - * the usage count to go to zero. + * the usage count to go to zero, but with a timeout. */ - while (atomic64_read(&nvl->usage_count) != 0) + while ((atomic64_read(&nvl->usage_count) != 0) && + (wait_iterations < max_wait_iterations)) { - /* * While waiting, release the locks so that other threads can make * forward progress. 
@@ -2203,6 +2216,7 @@ nv_pci_remove(struct pci_dev *pci_dev) UNLOCK_NV_LINUX_DEVICES(); os_delay(500); + wait_iterations++; /* Re-acquire the locks before checking again */ LOCK_NV_LINUX_DEVICES(); @@ -2221,10 +2235,32 @@ nv_pci_remove(struct pci_dev *pci_dev) down(&nvl->ldata_lock); } - nv_printf(NV_DBG_ERRORS, - "NVRM: Continuing with GPU removal for device %04x:%02x:%02x.%x\n", - NV_PCI_DOMAIN_NUMBER(pci_dev), NV_PCI_BUS_NUMBER(pci_dev), - NV_PCI_SLOT_NUMBER(pci_dev), PCI_FUNC(pci_dev->devfn)); + if (atomic64_read(&nvl->usage_count) != 0) + { + nv_printf(NV_DBG_ERRORS, + "NVRM: Timeout waiting for usage count on device %04x:%02x:%02x.%x (remaining: %lld). Forcing removal.\n", + NV_PCI_DOMAIN_NUMBER(pci_dev), NV_PCI_BUS_NUMBER(pci_dev), + NV_PCI_SLOT_NUMBER(pci_dev), PCI_FUNC(pci_dev->devfn), + atomic64_read(&nvl->usage_count)); + /* + * Force the surprise removal flag so that any remaining + * close operations will take the fast-path. + */ + nv->flags |= NV_FLAG_IN_SURPRISE_REMOVAL; + + /* + * Mark that we had a surprise removal so rsync cleanup + * warnings are suppressed during module unload. 
+ */ + nv_set_rsync_had_surprise_removal(); + } + else + { + nv_printf(NV_DBG_ERRORS, + "NVRM: Continuing with GPU removal for device %04x:%02x:%02x.%x\n", + NV_PCI_DOMAIN_NUMBER(pci_dev), NV_PCI_BUS_NUMBER(pci_dev), + NV_PCI_SLOT_NUMBER(pci_dev), PCI_FUNC(pci_dev->devfn)); + } } rm_check_for_gpu_surprise_removal(sp, nv); diff --git a/kernel-open/nvidia/nv-rsync.c b/kernel-open/nvidia/nv-rsync.c index 88863dab68..f98c52c8e6 100644 --- a/kernel-open/nvidia/nv-rsync.c +++ b/kernel-open/nvidia/nv-rsync.c @@ -31,6 +31,7 @@ void nv_init_rsync_info( ) { g_rsync_info.relaxed_ordering_mode = NV_FALSE; + g_rsync_info.had_surprise_removal = NV_FALSE; g_rsync_info.usage_count = 0; g_rsync_info.data = NULL; NV_INIT_MUTEX(&g_rsync_info.lock); @@ -40,9 +41,17 @@ void nv_destroy_rsync_info( void ) { - WARN_ON(g_rsync_info.data); - WARN_ON(g_rsync_info.usage_count); - WARN_ON(g_rsync_info.relaxed_ordering_mode); + /* + * After GPU surprise removal (e.g., Thunderbolt eGPU hot-unplug), + * these may not have been properly cleaned up. Skip warnings in + * that case since the cleanup failure is expected. + */ + if (!g_rsync_info.had_surprise_removal) + { + WARN_ON(g_rsync_info.data); + WARN_ON(g_rsync_info.usage_count); + WARN_ON(g_rsync_info.relaxed_ordering_mode); + } } int nv_get_rsync_info( @@ -100,6 +109,18 @@ void nv_put_rsync_info( up(&g_rsync_info.lock); } +/* + * Mark that a GPU surprise removal occurred. This is used to suppress + * warnings about unclean rsync state during module unload, since the + * cleanup may be incomplete after forced removal. 
+ */ +void nv_set_rsync_had_surprise_removal( + void +) +{ + g_rsync_info.had_surprise_removal = NV_TRUE; +} + int nv_register_rsync_driver( int (*get_relaxed_ordering_mode)(int *mode, void *data), void (*put_relaxed_ordering_mode)(int mode, void *data), diff --git a/kernel-open/nvidia/nv-rsync.h b/kernel-open/nvidia/nv-rsync.h index cc0e1a2e51..c1cadefb75 100644 --- a/kernel-open/nvidia/nv-rsync.h +++ b/kernel-open/nvidia/nv-rsync.h @@ -31,6 +31,7 @@ typedef struct nv_rsync_info struct semaphore lock; uint32_t usage_count; NvBool relaxed_ordering_mode; + NvBool had_surprise_removal; int (*get_relaxed_ordering_mode)(int *mode, void *data); void (*put_relaxed_ordering_mode)(int mode, void *data); void (*wait_for_rsync)(struct pci_dev *gpu, void *data); @@ -41,6 +42,7 @@ void nv_init_rsync_info(void); void nv_destroy_rsync_info(void); int nv_get_rsync_info(void); void nv_put_rsync_info(void); +void nv_set_rsync_had_surprise_removal(void); int nv_register_rsync_driver( int (*get_relaxed_ordering_mode)(int *mode, void *data), void (*put_relaxed_ordering_mode)(int mode, void *data), diff --git a/kernel-open/nvidia/nv.c b/kernel-open/nvidia/nv.c index 9ad14f1d91..3abef3c9b3 100644 --- a/kernel-open/nvidia/nv.c +++ b/kernel-open/nvidia/nv.c @@ -2191,14 +2191,49 @@ nvidia_close_callback( static void nvidia_close_deferred(void *data) { nv_linux_file_private_t *nvlfp = data; + nv_linux_state_t *nvl = nvlfp->nvptr; + nv_state_t *nv = nvl ? NV_STATE_PTR(nvl) : NULL; + NvBool got_lock = NV_FALSE; + NvBool in_surprise_removal = NV_FALSE; nv_wait_open_complete(nvlfp); - down_read(&nv_system_pm_lock); + /* + * Check if we're in surprise removal before trying to acquire the lock. + * If the device is being removed (e.g., Thunderbolt unplug), we should + * not block on the PM lock as it may be held by the removal path. 
+ */ + if (nv != NULL) + { + in_surprise_removal = NV_IS_DEVICE_IN_SURPRISE_REMOVAL(nv); + } + + if (in_surprise_removal) + { + /* + * For surprise removal, try to acquire the lock but don't block. + * If we can't get it, proceed without it - cleanup will be minimal + * anyway since the hardware is gone. + */ + got_lock = down_read_trylock(&nv_system_pm_lock); + if (!got_lock) + { + nv_printf(NV_DBG_INFO, + "NVRM: Surprise removal - proceeding with close without PM lock\n"); + } + } + else + { + down_read(&nv_system_pm_lock); + got_lock = NV_TRUE; + } nvidia_close_callback(nvlfp); - up_read(&nv_system_pm_lock); + if (got_lock) + { + up_read(&nv_system_pm_lock); + } } int @@ -2209,6 +2244,9 @@ nvidia_close( { int rc; nv_linux_file_private_t *nvlfp = NV_GET_LINUX_FILE_PRIVATE(file); + nv_linux_state_t *nvl; + nv_state_t *nv; + NvBool in_surprise_removal = NV_FALSE; nv_printf(NV_DBG_INFO, "NVRM: nvidia_close on GPU with minor number %d\n", @@ -2221,10 +2259,44 @@ nvidia_close( NV_SET_FILE_PRIVATE(file, NULL); + /* + * Check if the device is in surprise removal (e.g., Thunderbolt unplug). + * If so, we should not block waiting for the PM lock as it may be held + * by the removal path, causing a deadlock. + */ + nvl = nvlfp->nvptr; + if (nvl != NULL) + { + nv = NV_STATE_PTR(nvl); + in_surprise_removal = NV_IS_DEVICE_IN_SURPRISE_REMOVAL(nv); + } + rc = nv_wait_open_complete_interruptible(nvlfp); if (rc == 0) { - rc = nv_down_read_interruptible(&nv_system_pm_lock); + if (in_surprise_removal) + { + /* + * For surprise removal, try to acquire the lock but don't block. + * If we can't get it, defer the close to a worker thread that + * will handle it properly. + */ + if (down_read_trylock(&nv_system_pm_lock)) + { + nvidia_close_callback(nvlfp); + up_read(&nv_system_pm_lock); + return 0; + } + /* + * Couldn't get the lock - fall through to defer the close. + * Set rc to indicate we need to defer. 
+ */ + rc = -EAGAIN; + } + else + { + rc = nv_down_read_interruptible(&nv_system_pm_lock); + } } if (rc == 0) @@ -5202,15 +5274,25 @@ int nvidia_dev_get(NvU32 gpu_id, nvidia_stack_t *sp) void nvidia_dev_put(NvU32 gpu_id, nvidia_stack_t *sp) { nv_linux_state_t *nvl; + nv_state_t *nv; + NV_STATUS status; /* Takes nvl->ldata_lock */ nvl = find_gpu_id(gpu_id); if (!nvl) return; - nv_close_device(NV_STATE_PTR(nvl), sp); + nv = NV_STATE_PTR(nvl); - WARN_ON(rm_set_external_kernel_client_count(sp, NV_STATE_PTR(nvl), NV_FALSE) != NV_OK); + nv_close_device(nv, sp); + + /* + * During surprise removal (e.g., Thunderbolt eGPU hot-unplug), + * this may fail because the GPU is already gone. Don't warn in + * that case - it's expected. + */ + status = rm_set_external_kernel_client_count(sp, nv, NV_FALSE); + WARN_ON((status != NV_OK) && !NV_IS_DEVICE_IN_SURPRISE_REMOVAL(nv)); up(&nvl->ldata_lock); } diff --git a/src/nvidia-modeset/include/nvkms-private.h b/src/nvidia-modeset/include/nvkms-private.h index 31c76081cd..df468bc90e 100644 --- a/src/nvidia-modeset/include/nvkms-private.h +++ b/src/nvidia-modeset/include/nvkms-private.h @@ -35,6 +35,10 @@ struct NvKmsPerOpenDev *nvAllocPerOpenDev(struct NvKmsPerOpen *pOpen, void nvRevokeDevice(NVDevEvoPtr pDevEvo); +void nvInvalidateDeviceReferences(NVDevEvoPtr pDevEvo); + +NvBool nvReinitializeGlobalClientAfterGpuLost(void); + void nvFreePerOpenDev(struct NvKmsPerOpen *pOpen, struct NvKmsPerOpenDev *pOpenDev); diff --git a/src/nvidia-modeset/include/nvkms-types.h b/src/nvidia-modeset/include/nvkms-types.h index 1bc0d328f9..c6d5542274 100644 --- a/src/nvidia-modeset/include/nvkms-types.h +++ b/src/nvidia-modeset/include/nvkms-types.h @@ -1156,6 +1156,13 @@ typedef struct _NVEvoDevRec { */ NvBool supportsVblankSemControl : 1; + /* + * Indicates the GPU has been lost (e.g., Thunderbolt/eGPU hot-unplug). + * When set, any operations that would access GPU hardware should be + * skipped to avoid kernel crashes. 
+ */ + NvBool gpuLost : 1; + nvkms_timer_handle_t *postFlipIMPTimer; nvkms_timer_handle_t *consoleRestoreTimer; diff --git a/src/nvidia-modeset/kapi/interface/nvkms-kapi.h b/src/nvidia-modeset/kapi/interface/nvkms-kapi.h index 4a65977031..4f9c421b3f 100644 --- a/src/nvidia-modeset/kapi/interface/nvkms-kapi.h +++ b/src/nvidia-modeset/kapi/interface/nvkms-kapi.h @@ -656,6 +656,17 @@ struct NvKmsKapiFunctionsTable { */ void (*freeDevice)(struct NvKmsKapiDevice *device); + /*! + * Frees a device during surprise removal (e.g., Thunderbolt eGPU unplug). + * This skips all hardware access and only releases kernel resources. + * Use this instead of freeDevice() when the GPU hardware is no longer + * accessible to avoid page faults and hangs. + * + * \param [in] device A device returned by allocateDevice(). + * This function is a no-op if device is not valid. + */ + void (*freeDeviceForSurpriseRemoval)(struct NvKmsKapiDevice *device); + /*! * Grab ownership of device, ownership is required to do modeset. * diff --git a/src/nvidia-modeset/kapi/src/nvkms-kapi.c b/src/nvidia-modeset/kapi/src/nvkms-kapi.c index fdf921f1b2..bf7258ba9d 100644 --- a/src/nvidia-modeset/kapi/src/nvkms-kapi.c +++ b/src/nvidia-modeset/kapi/src/nvkms-kapi.c @@ -635,6 +635,78 @@ static void FreeDevice(struct NvKmsKapiDevice *device) nvKmsKapiFree(device); } +/* + * FreeDeviceForSurpriseRemoval - Free device without hardware access. + * + * This is used for Thunderbolt eGPU hot-unplug or other surprise removal + * scenarios where the GPU hardware is no longer accessible. We skip all + * hardware operations (NVKMS ioctls, RM API calls) that would cause page + * faults or hangs when trying to access unmapped GPU memory. + * + * We only: + * 1. Mark GPU as lost to prevent hardware access + * 2. Release the GPU reference count (nvkms_close_gpu) + * 3. 
Clean up kernel memory resources (handle allocator, semaphore, device struct) + * + * We skip: + * - KmsFreeDevice() - would call nvkms_ioctl_from_kapi() which accesses hardware + * - RmFreeDevice() - would call nvRmApiFree() which accesses hardware + * + * The hardware resources will be cleaned up when the GPU is physically + * removed from the system. + */ +static void FreeDeviceForSurpriseRemoval(struct NvKmsKapiDevice *device) +{ + if (device == NULL) { + return; + } + + /* + * Mark the GPU as lost in NVKMS. This sets the gpuLost flag to prevent + * any hardware access, and cancels pending timers that might try to + * access the removed GPU. + */ + nvkms_gpu_lost(device->gpuId); + + /* + * Clear device handles to prevent any stale references. + * Don't call nvRmApiFree() as that would access hardware. + */ + device->hKmsDevice = 0; + device->hKmsDisp = 0; + device->hRmSubDevice = 0; + device->hRmDevice = 0; + device->hRmClient = 0; + device->smgGpuInstSubscriptionHdl = 0; + device->smgComputeInstSubscriptionHdl = 0; + + /* + * Tear down the handle allocator - this only frees kernel memory + * (bitmaps), no hardware access. + */ + nvTearDownUnixRmHandleAllocator(&device->handleAllocator); + device->deviceInstance = 0; + + /* + * Clear pKmsOpen - we can't call nvkms_close_from_kapi() as that + * would try to access hardware through nvKmsClose(). The popen + * structure will be leaked, but this only happens during surprise + * removal which is an abnormal condition. 
+ */ + device->pKmsOpen = NULL; + + /* Lower the reference count of gpu - this is safe, no hardware access */ + nvkms_close_gpu(device->gpuId); + + /* Free kernel memory resources */ + if (device->pSema != NULL) { + nvkms_sema_free(device->pSema); + device->pSema = NULL; + } + + nvKmsKapiFree(device); +} + NvBool nvKmsKapiAllocateSystemMemory(struct NvKmsKapiDevice *device, NvU32 hRmHandle, enum NvKmsSurfaceMemoryLayout layout, @@ -4013,6 +4085,7 @@ NvBool nvKmsKapiGetFunctionsTableInternal funcsTable->allocateDevice = AllocateDevice; funcsTable->freeDevice = FreeDevice; + funcsTable->freeDeviceForSurpriseRemoval = FreeDeviceForSurpriseRemoval; funcsTable->grabOwnership = GrabOwnership; funcsTable->releaseOwnership = ReleaseOwnership; diff --git a/src/nvidia-modeset/os-interface/include/nvidia-modeset-os-interface.h b/src/nvidia-modeset/os-interface/include/nvidia-modeset-os-interface.h index d4d656e766..8c7294e6c2 100644 --- a/src/nvidia-modeset/os-interface/include/nvidia-modeset-os-interface.h +++ b/src/nvidia-modeset/os-interface/include/nvidia-modeset-os-interface.h @@ -310,6 +310,12 @@ void* nvkms_get_per_open_data(int fd); NvBool nvkms_open_gpu(NvU32 gpuId); void nvkms_close_gpu(NvU32 gpuId); +/*! + * Mark a GPU as lost (surprise removal, e.g., Thunderbolt eGPU unplug). + * This prevents hardware access and cancels pending timers. + */ +void nvkms_gpu_lost(NvU32 gpuId); + /*! * Enumerate nvidia gpus. 
diff --git a/src/nvidia-modeset/os-interface/include/nvkms.h b/src/nvidia-modeset/os-interface/include/nvkms.h
index d350ef7564..668fa8c27b 100644
--- a/src/nvidia-modeset/os-interface/include/nvkms.h
+++ b/src/nvidia-modeset/os-interface/include/nvkms.h
@@ -88,6 +88,8 @@ void nvKmsModuleUnload(void);
 void nvKmsSuspend(NvU32 gpuId);
 void nvKmsResume(NvU32 gpuId);
 
+void nvKmsGpuLost(NvU32 gpuId);
+
 void nvKmsGetProcFiles(const nvkms_procfs_file_t **ppProcFiles);
 
 NvBool nvKmsReadConf(const char *buff, size_t size,
diff --git a/src/nvidia-modeset/src/nvkms-console-restore.c b/src/nvidia-modeset/src/nvkms-console-restore.c
index 0c6cc5b296..2cfb595889 100644
--- a/src/nvidia-modeset/src/nvkms-console-restore.c
+++ b/src/nvidia-modeset/src/nvkms-console-restore.c
@@ -765,6 +765,11 @@ NvBool nvEvoRestoreConsole(NVDevEvoPtr pDevEvo, const NvBool allowMST)
                                 pDevEvo->fbConsoleSurfaceHandle);
     struct NvKmsSetModeParams *params;
 
+    /* Skip if GPU has been lost (e.g., Thunderbolt unplug) */
+    if (pDevEvo->gpuLost) {
+        goto done;
+    }
+
     /*
      * If this function fails to restore a console then NVKMS frees
      * and reallocates the core channel, to attempt the console
diff --git a/src/nvidia-modeset/src/nvkms-dma.c b/src/nvidia-modeset/src/nvkms-dma.c
index a6c0b57b62..ef9e29195d 100644
--- a/src/nvidia-modeset/src/nvkms-dma.c
+++ b/src/nvidia-modeset/src/nvkms-dma.c
@@ -27,6 +27,7 @@
 #include "nvkms-utils.h"
 #include "nvkms-rmapi.h"
 #include "class/cl917d.h" // NV917DDispControlDma, NV917D_DMA_*
+#include <ctrl/ctrl0080/ctrl0080dma.h> // NV0080_CTRL_CMD_DMA_FLUSH
 #include "nvos.h"
 
 #define NV_DMA_PUSHER_CHASE_PAD 5
@@ -37,7 +38,18 @@ static void EvoCoreKickoff(NVDmaBufferEvoPtr push_buffer, NvU32 putOffset);
 void nvDmaKickoffEvo(NVEvoChannelPtr pChannel)
 {
     NVDmaBufferEvoPtr p = &pChannel->pb;
-    NvU32 putOffset = (NvU32)((char *)p->buffer - (char *)p->base);
+    NvU32 putOffset;
+
+    /*
+     * Skip DMA kickoff if the GPU has been lost (e.g., Thunderbolt eGPU
+     * surprise removal).
Attempting to access DMA control registers when + * the GPU is gone will crash the kernel. + */ + if (p->pDevEvo == NULL || p->pDevEvo->gpuLost) { + return; + } + + putOffset = (NvU32)((char *)p->buffer - (char *)p->base); if (p->put_offset == putOffset) { return; @@ -48,11 +60,20 @@ void nvDmaKickoffEvo(NVEvoChannelPtr pChannel) static void EvoCoreKickoff(NVDmaBufferEvoPtr push_buffer, NvU32 putOffset) { + NVDevEvoPtr pDevEvo = push_buffer->pDevEvo; int i; nvAssert(putOffset % 4 == 0); nvAssert(putOffset <= push_buffer->offset_max); + /* + * Defense-in-depth: check gpuLost again. The caller should have already + * checked this, but verify to avoid writing to invalid mapped memory. + */ + if (pDevEvo == NULL || pDevEvo->gpuLost) { + return; + } + #if NVCPU_IS_X86_64 __asm__ __volatile__ ("sfence\n\t" : : : "memory"); #elif NVCPU_IS_FAMILY_ARM @@ -110,8 +131,23 @@ NvBool nvEvoPollForEmptyChannel(NVEvoChannelPtr pChannel, NvU32 sd, { NVDmaBufferEvoPtr push_buffer = &pChannel->pb; + /* Return early if GPU is lost to avoid accessing invalid registers. */ + if (push_buffer->pDevEvo == NULL || push_buffer->pDevEvo->gpuLost) { + return FALSE; + } + do { - if (EvoCoreReadGet(push_buffer, sd) == push_buffer->put_offset) { + NvU32 getOffset = EvoCoreReadGet(push_buffer, sd); + + /* + * Check for GPU removal: reading 0xFFFFFFFF typically indicates + * the device has been removed from the bus. + */ + if (getOffset == 0xFFFFFFFF) { + return FALSE; + } + + if (getOffset == push_buffer->put_offset) { break; } @@ -132,6 +168,21 @@ void nvEvoMakeRoom(NVEvoChannelPtr pChannel, NvU32 count) NvU32 putOffset; NvU64 startTime = 0; const NvU64 timeout = 5000000; /* 5 seconds */ + /* + * Maximum number of consecutive timeouts before we give up. + * This prevents infinite hangs when the GPU is removed (e.g., Thunderbolt + * unplug). After 5 timeouts (25 seconds), we assume the GPU is gone. 
+ */ + const NvU32 maxTimeoutCount = 5; + NvU32 timeoutCount = 0; + + /* + * Skip if the GPU has been lost. No point trying to make room in a + * push buffer for a GPU that's no longer there. + */ + if (push_buffer->pDevEvo == NULL || push_buffer->pDevEvo->gpuLost) { + return; + } putOffset = (NvU32) ((char *)push_buffer->buffer - (char *)push_buffer->base); @@ -146,6 +197,16 @@ void nvEvoMakeRoom(NVEvoChannelPtr pChannel, NvU32 count) while (1) { getOffset = EvoReadGetOffset(push_buffer, TRUE); + /* + * Check for GPU removal: reading 0xFFFFFFFF from PCI config space + * typically indicates the device has been removed from the bus. + */ + if (getOffset == 0xFFFFFFFF) { + nvEvoLogDev(push_buffer->pDevEvo, EVO_LOG_ERROR, + "GPU appears to have been removed (read 0xFFFFFFFF)"); + break; + } + if (putOffset >= getOffset) { push_buffer->fifo_free_count = (push_buffer->offset_max - putOffset) >> 2; @@ -179,16 +240,25 @@ void nvEvoMakeRoom(NVEvoChannelPtr pChannel, NvU32 count) } /* - * If we have been waiting too long, print an error message. There - * isn't much we can do as currently structured, so just reset - * startTime. + * If we have been waiting too long, print an error message. + * After too many consecutive timeouts, give up to prevent + * infinite hangs during GPU surprise removal. 
*/ if (nvExceedsTimeoutUSec(push_buffer->pDevEvo, &startTime, timeout)) { + timeoutCount++; nvEvoLogDev(push_buffer->pDevEvo, EVO_LOG_ERROR, "Error while waiting for GPU progress: " - "0x%08x:%d %d:%d:%d:%d", + "0x%08x:%d %d:%d:%d:%d (timeout %d/%d)", pChannel->hwclass, pChannel->instance, - count, push_buffer->fifo_free_count, getOffset, putOffset); + count, push_buffer->fifo_free_count, getOffset, putOffset, + timeoutCount, maxTimeoutCount); + + if (timeoutCount >= maxTimeoutCount) { + nvEvoLogDev(push_buffer->pDevEvo, EVO_LOG_ERROR, + "GPU not responding after %d timeouts, assuming removed", + timeoutCount); + break; + } startTime = 0; } @@ -217,8 +287,16 @@ void nvWriteEvoCoreNotifier( { NVDevEvoPtr pDevEvo = pDispEvo->pDevEvo; const NvU32 sd = pDispEvo->displayOwner; - NVEvoDmaPtr pSubChannel = &pDevEvo->core->notifiersDma[sd]; - volatile NvU32 *pNotifiers = pSubChannel->cpuAddress; + NVEvoDmaPtr pSubChannel; + volatile NvU32 *pNotifiers; + + /* Skip if GPU is lost to avoid writing to invalid memory. */ + if (pDevEvo->gpuLost || pDevEvo->core == NULL) { + return; + } + + pSubChannel = &pDevEvo->core->notifiersDma[sd]; + pNotifiers = pSubChannel->cpuAddress; EvoWriteNotifier(pNotifiers + offset, value); } @@ -230,10 +308,24 @@ static NvBool EvoCheckNotifier(const NVDispEvoRec *pDispEvo, { const NvU32 sd = pDispEvo->displayOwner; NVDevEvoPtr pDevEvo = pDispEvo->pDevEvo; - NVEvoDmaPtr pSubChannel = &pDevEvo->core->notifiersDma[sd]; - NVDmaBufferEvoPtr p = &pDevEvo->core->pb; + NVEvoDmaPtr pSubChannel; + NVDmaBufferEvoPtr p; volatile NvU32 *pNotifier; NvU64 startTime = 0; + /* + * Maximum number of timeout cycles before giving up. + * Prevents infinite hangs during GPU surprise removal. + */ + const NvU32 maxTimeoutCount = 5; + NvU32 timeoutCount = 0; + + /* Return early if GPU is lost to avoid accessing invalid memory. 
*/ + if (pDevEvo->gpuLost || pDevEvo->core == NULL) { + return FALSE; + } + + pSubChannel = &pDevEvo->core->notifiersDma[sd]; + p = &pDevEvo->core->pb; pNotifier = pSubChannel->cpuAddress; @@ -245,6 +337,17 @@ static NvBool EvoCheckNotifier(const NVDispEvoRec *pDispEvo, const NvU32 val = *pNotifier; const NvU32 done_mask = DRF_SHIFTMASK(done_extent_bit:done_base_bit); const NvU32 done_val = done_value << done_base_bit; + NvU32 getOffset; + + /* + * Check for GPU removal: reading 0xFFFFFFFF typically indicates + * the device has been removed from the bus. + */ + if (val == 0xFFFFFFFF) { + nvEvoLogDisp(pDispEvo, EVO_LOG_WARN, + "GPU appears removed (notifier read 0xFFFFFFFF)"); + return FALSE; + } if ((val & done_mask) == done_val) { return TRUE; @@ -257,14 +360,39 @@ static NvBool EvoCheckNotifier(const NVDispEvoRec *pDispEvo, if (nvExceedsTimeoutUSec( pDevEvo, &startTime, - NV_EVO_NOTIFIER_SHORT_TIMEOUT_USEC) && - (p->put_offset == EvoCoreReadGet(p, sd))) + NV_EVO_NOTIFIER_SHORT_TIMEOUT_USEC)) { - nvEvoLogDisp(pDispEvo, EVO_LOG_WARN, - "Lost display notification (%d:0x%08x); " - "continuing.", sd, val); - EvoWriteNotifier(pNotifier, done_value << done_base_bit); - return TRUE; + getOffset = EvoCoreReadGet(p, sd); + + /* + * Check for GPU removal in get offset as well. + */ + if (getOffset == 0xFFFFFFFF) { + nvEvoLogDisp(pDispEvo, EVO_LOG_WARN, + "GPU appears removed (GET read 0xFFFFFFFF)"); + return FALSE; + } + + if (p->put_offset == getOffset) + { + nvEvoLogDisp(pDispEvo, EVO_LOG_WARN, + "Lost display notification (%d:0x%08x); " + "continuing.", sd, val); + EvoWriteNotifier(pNotifier, done_value << done_base_bit); + return TRUE; + } + + /* + * Count timeouts. After too many, assume GPU is gone. 
+ */ + timeoutCount++; + if (timeoutCount >= maxTimeoutCount) { + nvEvoLogDisp(pDispEvo, EVO_LOG_ERROR, + "GPU not responding after %d timeouts (%d:0x%08x)", + timeoutCount, sd, val); + return FALSE; + } + startTime = 0; } nvkms_yield(); diff --git a/src/nvidia-modeset/src/nvkms-event.c b/src/nvidia-modeset/src/nvkms-event.c index 7b15bab850..414a2e1015 100644 --- a/src/nvidia-modeset/src/nvkms-event.c +++ b/src/nvidia-modeset/src/nvkms-event.c @@ -61,6 +61,11 @@ nvHandleHotplugEventDeferredWork(void *dataPtr, NvU32 dataU32) NVDpyEvoPtr pDpyEvo; NVDevEvoPtr pDevEvo = pDispEvo->pDevEvo; + /* Skip hardware access if GPU has been lost (e.g., Thunderbolt unplug) */ + if (pDevEvo->gpuLost) { + return; + } + // Get the hotplug state. hotplugParams.subDeviceInstance = pDispEvo->displayOwner; diff --git a/src/nvidia-modeset/src/nvkms-evo.c b/src/nvidia-modeset/src/nvkms-evo.c index fdd1df7ca5..0d1f54d211 100644 --- a/src/nvidia-modeset/src/nvkms-evo.c +++ b/src/nvidia-modeset/src/nvkms-evo.c @@ -8835,6 +8835,28 @@ NvBool nvFreeDevEvo(NVDevEvoPtr pDevEvo) return FALSE; } + /* + * If the GPU was lost (surprise removal), skip all hardware-related + * cleanup. Just free software resources and remove from device list. + * + * NOTE: We do NOT call nvFreePerOpenDev() here because the pNvKmsOpenDev + * is still in the global open list. It will be properly cleaned up during + * module unload when nvKmsClose iterates through all open handles. + * Calling nvFreePerOpenDev here would cause a double-free crash. + */ + if (pDevEvo->gpuLost) { + nvEvoLogDev(pDevEvo, EVO_LOG_INFO, + "Freeing device after GPU lost, skipping hardware cleanup"); + + /* + * Invalidate all pOpenDev references to this device before freeing it. + * This ensures nvKmsClose won't try to access the freed pDevEvo. 
+ */ + nvInvalidateDeviceReferences(pDevEvo); + + goto free_software_resources; + } + if (pDevEvo->pDifrState) { nvRmUnregisterDIFREventHandler(pDevEvo); nvDIFRFree(pDevEvo->pDifrState); @@ -8874,19 +8896,43 @@ NvBool nvFreeDevEvo(NVDevEvoPtr pDevEvo) nvRmDestroyDisplays(pDevEvo); - nvkms_free_timer(pDevEvo->consoleRestoreTimer); - pDevEvo->consoleRestoreTimer = NULL; +free_software_resources: + { + NvBool wasGpuLost = pDevEvo->gpuLost; + + nvkms_free_timer(pDevEvo->consoleRestoreTimer); + pDevEvo->consoleRestoreTimer = NULL; - nvPreallocFree(pDevEvo); + nvPreallocFree(pDevEvo); - nvRmFreeDeviceEvo(pDevEvo); + /* + * Skip RM device cleanup if GPU is lost - handles are already invalid + * and RM API calls will fail. + */ + if (!pDevEvo->gpuLost) { + nvRmFreeDeviceEvo(pDevEvo); + } - nvListDel(&pDevEvo->devListEntry); + nvListDel(&pDevEvo->devListEntry); - nvkms_free_ref_ptr(pDevEvo->ref_ptr); + nvkms_free_ref_ptr(pDevEvo->ref_ptr); - nvFree(pDevEvo); - return TRUE; + nvFree(pDevEvo); + + /* + * NOTE: We intentionally do NOT call nvKmsReinitializeGlobalClient() + * here even if the device list is empty. The global client handle + * is still referenced by open handles (pNvKmsOpenDev) that will be + * cleaned up during module unload by nvKmsClose(). Reinitializing + * the client here would corrupt those handles and cause a crash. + * + * If the user reconnects the GPU before unloading the module, it will + * work because AllocDevice checks for stale gpuLost devices and cleans + * them up. The global client doesn't need to be reinitialized for that. 
+ */
+
+        return TRUE;
+    }
 }
 
 static void AssignNumberOfApiHeads(NVDevEvoRec *pDevEvo)
 {
diff --git a/src/nvidia-modeset/src/nvkms-evo1.c b/src/nvidia-modeset/src/nvkms-evo1.c
index cf2bbac618..1347956901 100644
--- a/src/nvidia-modeset/src/nvkms-evo1.c
+++ b/src/nvidia-modeset/src/nvkms-evo1.c
@@ -32,9 +32,416 @@
 #include "nvkms-evo1.h"
 #include "nvkms-prealloc.h"
 #include "nvkms-utils.h"
+#include "nvos.h"
 
 #include "hdmi_spec.h"
 
+#include <ctrl/ctrl5070/ctrl5070chnc.h> // NV5070_CTRL_CMD_IS_MODE_POSSIBLE_PARAMS
+
+/*!
+ * Initialize head-independent IMP param fields.
+ *
+ * Initializes an NV5070_CTRL_CMD_IS_MODE_POSSIBLE_PARAMS structure.
+ * IMP users should call this once, followed by per-head calls to
+ * AssignPerHeadImpParams().
+ *
+ * \param pImp[in]  A pointer to a param structure.
+ */
+static void InitImpParams(NV5070_CTRL_CMD_IS_MODE_POSSIBLE_PARAMS *pImp)
+{
+    int i;
+
+    nvkms_memset(pImp, 0, sizeof(*pImp));
+
+    /* Initialize to not possible. */
+    pImp->IsPossible = NV5070_CTRL_CMD_IS_MODE_POSSIBLE_IS_POSSIBLE_NO;
+
+    /* Set all heads to inactive. */
+    for (i = 0; i < NV5070_CTRL_CMD_MAX_HEADS; i++) {
+        pImp->Head[i].HeadActive =
+            NV5070_CTRL_CMD_IS_MODE_POSSIBLE_HEAD_ACTIVE_NO;
+    }
+
+    /* Set all ORs to no owner. */
+    for (i = 0; i < NV5070_CTRL_CMD_MAX_DACS; i++) {
+        pImp->Dac[i].owner = NV5070_CTRL_CMD_OR_OWNER_NONE;
+    }
+
+    pImp->bUseSorOwnerMask = TRUE;
+    for (i = 0; i < NV5070_CTRL_CMD_MAX_SORS; i++) {
+        pImp->Sor[i].ownerMask = NV5070_CTRL_CMD_SOR_OWNER_MASK_NONE;
+    }
+
+    for (i = 0; i < NV5070_CTRL_CMD_MAX_PIORS; i++) {
+        pImp->Pior[i].owner = NV5070_CTRL_CMD_OR_OWNER_NONE;
+    }
+}
+
+/*!
+ * Initialize head-specific IMP param fields.
+ *
+ * Initialize the portion of the NV5070_CTRL_CMD_IS_MODE_POSSIBLE_PARAMS
+ * structure that applies to a specific head, and the OR driven by
+ * that head.
+ *
+ * The param structure should be initialized by InitImpParams()
+ * before calling this per-head function.
+ *
+ * \param[out] pImp      The param structure to initialize.
+ * \param[in] pTimings The rastering timings and viewport configuration. + * \param[in] pUsage The usage bounds that will be used for this head. + * \param[in] head The number of the head that will be driven. + * \param[in] orNumber The number of the OR driven by the head. + * \param[in] orType The type of the OR driven by the head. + */ +static void AssignPerHeadImpParams(const NVDevEvoRec *pDevEvo, + NV5070_CTRL_CMD_IS_MODE_POSSIBLE_PARAMS *pImp, + const NVHwModeTimingsEvo *pTimings, + const enum nvKmsPixelDepth pixelDepth, + const struct NvKmsUsageBounds *pUsage, + const int head, + const int orNumber, + const int orType) +{ + const NVHwModeViewPortEvo *pViewPort = &pTimings->viewPort; + NvU64 overlayFormats = 0; + NvU32 protocol; + + nvkms_memset(&pImp->Head[head], 0, sizeof(pImp->Head[head])); + + nvAssert(head < NV5070_CTRL_CMD_MAX_HEADS); + pImp->Head[head].HeadActive = TRUE; + + nvAssert(orType == NV0073_CTRL_SPECIFIC_OR_TYPE_NONE || + orNumber != NV_INVALID_OR); + + /* raster timings */ + + pImp->Head[head].PixelClock.Frequency = pTimings->pixelClock; + + pImp->Head[head].PixelClock.Adj1000Div1001 = + NV5070_CTRL_CMD_IS_MODE_POSSIBLE_PIXEL_CLOCK_ADJ1000DIV1001_NO; + + pImp->Head[head].RasterSize.Width = pTimings->rasterSize.x; + pImp->Head[head].RasterSize.Height = pTimings->rasterSize.y; + pImp->Head[head].RasterBlankStart.X = pTimings->rasterBlankStart.x; + pImp->Head[head].RasterBlankStart.Y = pTimings->rasterBlankStart.y; + pImp->Head[head].RasterBlankEnd.X = pTimings->rasterBlankEnd.x; + pImp->Head[head].RasterBlankEnd.Y = pTimings->rasterBlankEnd.y; + pImp->Head[head].RasterVertBlank2.YStart = pTimings->rasterVertBlank2Start; + pImp->Head[head].RasterVertBlank2.YEnd = pTimings->rasterVertBlank2End; + pImp->Head[head].Control.Structure = + pTimings->interlaced ? 
+ NV5070_CTRL_CMD_IS_MODE_POSSIBLE_CONTROL_STRUCTURE_INTERLACED : + NV5070_CTRL_CMD_IS_MODE_POSSIBLE_CONTROL_STRUCTURE_PROGRESSIVE; + + if (orType == NV0073_CTRL_SPECIFIC_OR_TYPE_DAC) { + nvAssert(orNumber < ARRAY_LEN(pImp->Dac)); + nvAssert(pImp->Dac[orNumber].owner == NV5070_CTRL_CMD_OR_OWNER_NONE); + pImp->Dac[orNumber].owner = NV5070_CTRL_CMD_OR_OWNER_HEAD(head); + nvAssert(pTimings->protocol == NVKMS_PROTOCOL_DAC_RGB); + pImp->Dac[orNumber].protocol = NV5070_CTRL_CMD_DAC_PROTOCOL_RGB_CRT; + } else if (orType == NV0073_CTRL_SPECIFIC_OR_TYPE_SOR) { + nvAssert(orNumber < ARRAY_LEN(pImp->Sor)); + pImp->Sor[orNumber].ownerMask |= NV5070_CTRL_CMD_SOR_OWNER_MASK_HEAD(head); + switch (pTimings->protocol) { + default: + nvAssert(!"Unknown protocol"); + /* fall through */ + case NVKMS_PROTOCOL_SOR_LVDS_CUSTOM: + protocol = NV5070_CTRL_CMD_SOR_PROTOCOL_LVDS_CUSTOM; + break; + case NVKMS_PROTOCOL_SOR_SINGLE_TMDS_A: + protocol = NV5070_CTRL_CMD_SOR_PROTOCOL_SINGLE_TMDS_A; + break; + case NVKMS_PROTOCOL_SOR_SINGLE_TMDS_B: + protocol = NV5070_CTRL_CMD_SOR_PROTOCOL_SINGLE_TMDS_B; + break; + case NVKMS_PROTOCOL_SOR_DUAL_TMDS: + protocol = NV5070_CTRL_CMD_SOR_PROTOCOL_DUAL_TMDS; + break; + case NVKMS_PROTOCOL_SOR_DP_A: + protocol = NV5070_CTRL_CMD_SOR_PROTOCOL_DP_A; + break; + case NVKMS_PROTOCOL_SOR_DP_B: + protocol = NV5070_CTRL_CMD_SOR_PROTOCOL_DP_B; + break; + } + pImp->Sor[orNumber].protocol = protocol; + pImp->Sor[orNumber].pixelReplicateMode = + NV5070_CTRL_IS_MODE_POSSIBLE_PIXEL_REPLICATE_MODE_OFF; + } else if (orType == NV0073_CTRL_SPECIFIC_OR_TYPE_PIOR) { + nvAssert(orNumber < ARRAY_LEN(pImp->Pior)); + nvAssert(pImp->Pior[orNumber].owner == NV5070_CTRL_CMD_OR_OWNER_NONE); + pImp->Pior[orNumber].owner = NV5070_CTRL_CMD_OR_OWNER_HEAD(head); + switch (pTimings->protocol) { + default: + nvAssert(!"Unknown protocol"); + /* fall through */ + case NVKMS_PROTOCOL_PIOR_EXT_TMDS_ENC: + protocol = NV5070_CTRL_CMD_PIOR_PROTOCOL_EXT_TMDS_ENC; + break; + } + 
pImp->Pior[orNumber].protocol = protocol; + } else { + nvAssert(orType == NV0073_CTRL_SPECIFIC_OR_TYPE_NONE); + } + + /* viewport out */ + + pImp->Head[head].OutputScaler.VerticalTaps = + NVEvoScalerTapsToNum(pViewPort->vTaps); + + pImp->Head[head].OutputScaler.HorizontalTaps = + NVEvoScalerTapsToNum(pViewPort->hTaps); + + pImp->Head[head].ViewportSizeOut.Width = pViewPort->out.width; + pImp->Head[head].ViewportSizeOut.Height = pViewPort->out.height; + + pImp->Head[head].ViewportSizeOutMin.Width = + pImp->Head[head].ViewportSizeOut.Width; + + pImp->Head[head].ViewportSizeOutMin.Height = + pImp->Head[head].ViewportSizeOut.Height; + + pImp->Head[head].ViewportSizeOutMax.Width = + pImp->Head[head].ViewportSizeOut.Width; + + pImp->Head[head].ViewportSizeOutMax.Height = + pImp->Head[head].ViewportSizeOut.Height; + + /* viewport in */ + + pImp->Head[head].ViewportSizeIn.Width = pViewPort->in.width; + pImp->Head[head].ViewportSizeIn.Height = pViewPort->in.height; + + /* + * The actual format doesn't really matter, since RM just + * converts it back to bits per pixel for its IMP calculation anyway. The + * hardware doesn't have a "usage bound" for core -- changing the format + * of the core surface will always incur a supervisor interrupt and rerun + * IMP (XXX if we change the core surface as part of a flip to one of a + * different depth, should we force the pre/post IMP update path?). + * + * EVO2 hal uses surfaces of the same format in the core and base channels, + * see needToReprogramCoreSurface() in nvkms-evo2.c. 
+ */ + if (pUsage->layer[NVKMS_MAIN_LAYER].usable) { + if (pUsage->layer[NVKMS_MAIN_LAYER].supportedSurfaceMemoryFormats & + NVKMS_SURFACE_MEMORY_FORMATS_RGB_PACKED8BPP) { + pImp->Head[head].Params.Format = + NV5070_CTRL_CMD_IS_MODE_POSSIBLE_PARAMS_FORMAT_RF16_GF16_BF16_AF16; + } else if (pUsage->layer[NVKMS_MAIN_LAYER].supportedSurfaceMemoryFormats & + NVKMS_SURFACE_MEMORY_FORMATS_RGB_PACKED4BPP) { + pImp->Head[head].Params.Format = + NV5070_CTRL_CMD_IS_MODE_POSSIBLE_PARAMS_FORMAT_A8R8G8B8; + } else if (pUsage->layer[NVKMS_MAIN_LAYER].supportedSurfaceMemoryFormats & + NVKMS_SURFACE_MEMORY_FORMATS_RGB_PACKED2BPP) { + pImp->Head[head].Params.Format = + NV5070_CTRL_CMD_IS_MODE_POSSIBLE_PARAMS_FORMAT_R5G6B5; + } else if (pUsage->layer[NVKMS_MAIN_LAYER].supportedSurfaceMemoryFormats & + NVKMS_SURFACE_MEMORY_FORMATS_RGB_PACKED1BPP) { + pImp->Head[head].Params.Format = + NV5070_CTRL_CMD_IS_MODE_POSSIBLE_PARAMS_FORMAT_I8; + } else { /* default to RGB 4BPP */ + nvAssert(!"Unknown core format"); + pImp->Head[head].Params.Format = + NV5070_CTRL_CMD_IS_MODE_POSSIBLE_PARAMS_FORMAT_A8R8G8B8; + } + } else { + pImp->Head[head].Params.Format = + NV5070_CTRL_CMD_IS_MODE_POSSIBLE_PARAMS_FORMAT_A8R8G8B8; + } + + pImp->Head[head].Params.SuperSample = + NV5070_CTRL_CMD_IS_MODE_POSSIBLE_PARAMS_SUPER_SAMPLE_X1AA; + + /* base usage bounds */ + + if (pUsage->layer[NVKMS_MAIN_LAYER].usable) { + pImp->Head[head].BaseUsageBounds.Usable = + NV5070_CTRL_CMD_IS_MODE_POSSIBLE_BASE_USAGE_BOUNDS_USABLE_YES; + + if (pUsage->layer[NVKMS_MAIN_LAYER].supportedSurfaceMemoryFormats & + NVKMS_SURFACE_MEMORY_FORMATS_RGB_PACKED8BPP) { + pImp->Head[head].BaseUsageBounds.PixelDepth = + NV5070_CTRL_CMD_IS_MODE_POSSIBLE_BASE_USAGE_BOUNDS_PIXEL_DEPTH_64; + } else if (pUsage->layer[NVKMS_MAIN_LAYER].supportedSurfaceMemoryFormats & + NVKMS_SURFACE_MEMORY_FORMATS_RGB_PACKED4BPP) { + pImp->Head[head].BaseUsageBounds.PixelDepth = + NV5070_CTRL_CMD_IS_MODE_POSSIBLE_BASE_USAGE_BOUNDS_PIXEL_DEPTH_32; + } else if 
(pUsage->layer[NVKMS_MAIN_LAYER].supportedSurfaceMemoryFormats & + NVKMS_SURFACE_MEMORY_FORMATS_RGB_PACKED2BPP) { + pImp->Head[head].BaseUsageBounds.PixelDepth = + NV5070_CTRL_CMD_IS_MODE_POSSIBLE_BASE_USAGE_BOUNDS_PIXEL_DEPTH_16; + } else if (pUsage->layer[NVKMS_MAIN_LAYER].supportedSurfaceMemoryFormats & + NVKMS_SURFACE_MEMORY_FORMATS_RGB_PACKED1BPP) { + pImp->Head[head].BaseUsageBounds.PixelDepth = + NV5070_CTRL_CMD_IS_MODE_POSSIBLE_BASE_USAGE_BOUNDS_PIXEL_DEPTH_8; + } else { /* default to RGB 8BPP */ + nvAssert(!"Unknown base channel usage bound format"); + pImp->Head[head].BaseUsageBounds.PixelDepth = + NV5070_CTRL_CMD_IS_MODE_POSSIBLE_BASE_USAGE_BOUNDS_PIXEL_DEPTH_64; + } + + pImp->Head[head].BaseUsageBounds.SuperSample = + NV5070_CTRL_CMD_IS_MODE_POSSIBLE_BASE_USAGE_BOUNDS_SUPER_SAMPLE_X1AA; + } else { + pImp->Head[head].BaseUsageBounds.Usable = + NV5070_CTRL_CMD_IS_MODE_POSSIBLE_BASE_USAGE_BOUNDS_USABLE_NO; + } + + /* overlay usage bounds */ + + pImp->Head[head].OverlayUsageBounds.Usable = + pUsage->layer[NVKMS_OVERLAY_LAYER].usable + ? NV5070_CTRL_CMD_IS_MODE_POSSIBLE_OVERLAY_USAGE_BOUNDS_USABLE_YES + : NV5070_CTRL_CMD_IS_MODE_POSSIBLE_OVERLAY_USAGE_BOUNDS_USABLE_NO; + + overlayFormats = pUsage->layer[NVKMS_OVERLAY_LAYER].usable ? 
+ pUsage->layer[NVKMS_OVERLAY_LAYER].supportedSurfaceMemoryFormats : + NVKMS_SURFACE_MEMORY_FORMATS_RGB_PACKED2BPP; + + if (overlayFormats & NVKMS_SURFACE_MEMORY_FORMATS_RGB_PACKED4BPP) { + pImp->Head[head].OverlayUsageBounds.PixelDepth = + NV5070_CTRL_CMD_IS_MODE_POSSIBLE_OVERLAY_USAGE_BOUNDS_PIXEL_DEPTH_32; + } else if (overlayFormats & NVKMS_SURFACE_MEMORY_FORMATS_RGB_PACKED2BPP) { + pImp->Head[head].OverlayUsageBounds.PixelDepth = + NV5070_CTRL_CMD_IS_MODE_POSSIBLE_OVERLAY_USAGE_BOUNDS_PIXEL_DEPTH_16; + } else { + nvAssert(!"Unknown overlay channel usage bound format"); + pImp->Head[head].OverlayUsageBounds.PixelDepth = + NV5070_CTRL_CMD_IS_MODE_POSSIBLE_OVERLAY_USAGE_BOUNDS_PIXEL_DEPTH_32; + } + + /* pixel depth */ + + switch (pixelDepth) { + case NVKMS_PIXEL_DEPTH_18_444: + pImp->Head[head].outputResourcePixelDepthBPP = + NV5070_CTRL_IS_MODE_POSSIBLE_OUTPUT_RESOURCE_PIXEL_DEPTH_BPP_18_444; + break; + case NVKMS_PIXEL_DEPTH_24_444: + pImp->Head[head].outputResourcePixelDepthBPP = + NV5070_CTRL_IS_MODE_POSSIBLE_OUTPUT_RESOURCE_PIXEL_DEPTH_BPP_24_444; + break; + case NVKMS_PIXEL_DEPTH_30_444: + pImp->Head[head].outputResourcePixelDepthBPP = + NV5070_CTRL_IS_MODE_POSSIBLE_OUTPUT_RESOURCE_PIXEL_DEPTH_BPP_30_444; + break; + case NVKMS_PIXEL_DEPTH_16_422: + pImp->Head[head].outputResourcePixelDepthBPP = + NV5070_CTRL_IS_MODE_POSSIBLE_OUTPUT_RESOURCE_PIXEL_DEPTH_BPP_16_422; + break; + case NVKMS_PIXEL_DEPTH_20_422: + pImp->Head[head].outputResourcePixelDepthBPP = + NV5070_CTRL_IS_MODE_POSSIBLE_OUTPUT_RESOURCE_PIXEL_DEPTH_BPP_20_422; + break; + } +} + +void nvEvo1IsModePossible(NVDispEvoPtr pDispEvo, + const NVEvoIsModePossibleDispInput *pInput, + NVEvoIsModePossibleDispOutput *pOutput) +{ + NVDevEvoPtr pDevEvo = pDispEvo->pDevEvo; + NV5070_CTRL_CMD_IS_MODE_POSSIBLE_PARAMS *pImp = + nvPreallocGet(pDevEvo, PREALLOC_TYPE_IMP_PARAMS, sizeof(*pImp)); + NvBool result = FALSE; + NvU32 head; + NvU32 ret; + + InitImpParams(pImp); + + pImp->RequestedOperation = + 
NV5070_CTRL_CMD_IS_MODE_POSSIBLE_REQUESTED_OPERATION_QUERY; + + for (head = 0; head < NVKMS_MAX_HEADS_PER_DISP; head++) { + if (pInput->head[head].pTimings == NULL) { + continue; + } + + AssignPerHeadImpParams(pDevEvo, pImp, + pInput->head[head].pTimings, + pInput->head[head].pixelDepth, + pInput->head[head].pUsage, + head, + pInput->head[head].orIndex, + pInput->head[head].orType); + } + + pImp->base.subdeviceIndex = pDispEvo->displayOwner; + + if (pInput->requireBootClocks) { + // XXX TODO: IMP requires lock pin information if pstate information is + // requested. For now, just assume no locking. + pImp->MinPState = NV5070_CTRL_IS_MODE_POSSIBLE_NEED_MIN_PSTATE; + } + + for (head = 0; head < NVKMS_MAX_HEADS_PER_DISP; head++) { + pImp->Head[head].displayId[0] = pInput->head[head].displayId; + } + + ret = nvRmApiControl(nvEvoGlobal.clientHandle, + pDevEvo->displayHandle, + NV5070_CTRL_CMD_IS_MODE_POSSIBLE, + pImp, sizeof(*pImp)); + + if (ret != NV_OK || !pImp->IsPossible || + (pInput->requireBootClocks && + // P8 = "boot clocks" + (pImp->MinPState < NV5070_CTRL_IS_MODE_POSSIBLE_PSTATES_P8 && + // XXX TODO: With PStates 3.0, only a "v-pstate" is returned in + // impParams.minPerfLevel. We need to correlate that with "boot + // clocks" somehow. 
+ pImp->MinPState != NV5070_CTRL_IS_MODE_POSSIBLE_PSTATES_UNDEFINED))) { + goto done; + } + + result = TRUE; + +done: + nvPreallocRelease(pDevEvo, PREALLOC_TYPE_IMP_PARAMS); + pOutput->possible = result; +} + +void nvEvo1PrePostIMP(NVDispEvoPtr pDispEvo, NvBool isPre) +{ + NVDevEvoPtr pDevEvo = pDispEvo->pDevEvo; + NV5070_CTRL_CMD_IS_MODE_POSSIBLE_PARAMS *pImp = + nvPreallocGet(pDevEvo, PREALLOC_TYPE_IMP_PARAMS, sizeof(*pImp)); + NvU32 ret; + + if (isPre) { + /* + * Sync the core channel for pre-modeset IMP to ensure that the state + * cache reflects all of the methods we've pushed + */ + ret = nvRMSyncEvoChannel(pDevEvo, pDevEvo->core, __LINE__); + if (!ret) { + nvAssert(!"nvRMSyncEvoChannel failed during PreModesetIMP"); + } + } + + nvkms_memset(pImp, 0, sizeof(*pImp)); + + pImp->RequestedOperation = isPre ? + NV5070_CTRL_CMD_IS_MODE_POSSIBLE_REQUESTED_OPERATION_PRE_MODESET_USE_SC : + NV5070_CTRL_CMD_IS_MODE_POSSIBLE_REQUESTED_OPERATION_POST_MODESET_USE_SC; + + pImp->base.subdeviceIndex = pDispEvo->displayOwner; + + ret = nvRmApiControl(nvEvoGlobal.clientHandle, + pDevEvo->displayHandle, + NV5070_CTRL_CMD_IS_MODE_POSSIBLE, + pImp, sizeof(*pImp)); + if ((ret != NVOS_STATUS_SUCCESS) || !pImp->IsPossible) { + nvAssert(!"NV5070_CTRL_CMD_IS_MODE_POSSIBLE failed"); + } + + nvPreallocRelease(pDevEvo, PREALLOC_TYPE_IMP_PARAMS); +} + /*! * Return the value to use for HEAD_SET_STORAGE_PITCH. 
* @@ -67,6 +474,91 @@ NvU32 nvEvoGetHeadSetStoragePitchValue(const NVDevEvoRec *pDevEvo, return pitch; } +static NvBool GetChannelState(NVDevEvoPtr pDevEvo, + NVEvoChannelPtr pChan, + NvU32 sd, + NvU32 *result) +{ + NV5070_CTRL_CMD_GET_CHANNEL_INFO_PARAMS info = { }; + NvU32 ret; + + info.base.subdeviceIndex = sd; + info.channelClass = pChan->hwclass; + info.channelInstance = pChan->instance; + + ret = nvRmApiControl(nvEvoGlobal.clientHandle, + pDevEvo->displayHandle, + NV5070_CTRL_CMD_GET_CHANNEL_INFO, + &info, sizeof(info)); + if (ret != NVOS_STATUS_SUCCESS) { + /* + * When the GPU is lost (e.g., Thunderbolt/eGPU hot-unplug), + * suppress the error log to avoid flooding dmesg. The callers + * will handle the failure appropriately. + */ + if (ret != NVOS_STATUS_ERROR_GPU_IS_LOST) { + nvEvoLogDev(pDevEvo, EVO_LOG_ERROR, + "Failed to query display engine channel state: 0x%08x:%d:%d:0x%08x", + pChan->hwclass, pChan->instance, sd, ret); + } + return FALSE; + } + + *result = info.channelState; + + return TRUE; +} + +NvBool nvEvo1IsChannelIdle(NVDevEvoPtr pDevEvo, + NVEvoChannelPtr pChan, + NvU32 sd, + NvBool *result) +{ + NvU32 channelState; + + if (!GetChannelState(pDevEvo, pChan, sd, &channelState)) { + return FALSE; + } + + *result = (channelState == NV5070_CTRL_GET_CHANNEL_INFO_STATE_IDLE); + + return TRUE; +} + +/* + * Result is false if an EVO channel is either one of NO_METHOD_PENDING or + * UNCONNECTED, true o.w. + * + * NO_METHOD_PENDING is a mask for EMPTY | WRTIDLE | IDLE. + * + * If NVKMS hasn't grabbed the channel, it can be seen as UNCONNECTED. 
+ */ +NvBool nvEvo1IsChannelMethodPending(NVDevEvoPtr pDevEvo, + NVEvoChannelPtr pChan, + NvU32 sd, + NvBool *result) +{ + NvU32 channelState; + + if (!GetChannelState(pDevEvo, pChan, sd, &channelState)) { + return FALSE; + } + + *result = !(channelState & + (NV5070_CTRL_GET_CHANNEL_INFO_STATE_NO_METHOD_PENDING | + NV5070_CTRL_GET_CHANNEL_INFO_STATE_UNCONNECTED)); + + return TRUE; +} + +void nvEvo1SetDscParams(const NVDispEvoRec *pDispEvo, + const NvU32 head, + const NVDscInfoEvoRec *pDscInfo, + const enum nvKmsPixelDepth pixelDepth) +{ + nvAssert(pDscInfo->type == NV_DSC_INFO_EVO_TYPE_DISABLED); +} + /* * The 'type' the timing library writes into the NVT_INFOFRAME_HEADER * structure is not the same as the protocol values that hardware diff --git a/src/nvidia-modeset/src/nvkms-evo3.c b/src/nvidia-modeset/src/nvkms-evo3.c index a82a2bdfd4..e3826e2783 100644 --- a/src/nvidia-modeset/src/nvkms-evo3.c +++ b/src/nvidia-modeset/src/nvkms-evo3.c @@ -6391,9 +6391,16 @@ static NvBool GetChannelState(NVDevEvoPtr pDevEvo, NVC370_CTRL_CMD_GET_CHANNEL_INFO, &info, sizeof(info)); if (ret != NVOS_STATUS_SUCCESS) { - nvEvoLogDev(pDevEvo, EVO_LOG_ERROR, - "Failed to query display engine channel state: 0x%08x:%d:%d:0x%08x", - pChan->hwclass, pChan->instance, sd, ret); + /* + * When the GPU is lost (e.g., Thunderbolt/eGPU hot-unplug), + * suppress the error log to avoid flooding dmesg. The callers + * will handle the failure appropriately. 
+ */ + if (ret != NVOS_STATUS_ERROR_GPU_IS_LOST) { + nvEvoLogDev(pDevEvo, EVO_LOG_ERROR, + "Failed to query display engine channel state: 0x%08x:%d:%d:0x%08x", + pChan->hwclass, pChan->instance, sd, ret); + } return FALSE; } diff --git a/src/nvidia-modeset/src/nvkms-hw-flip.c b/src/nvidia-modeset/src/nvkms-hw-flip.c index 8163d67cd7..d433bf1b8d 100644 --- a/src/nvidia-modeset/src/nvkms-hw-flip.c +++ b/src/nvidia-modeset/src/nvkms-hw-flip.c @@ -2561,6 +2561,11 @@ static void LowerDispBandwidth(void *dataPtr, NvU32 dataU32) NvU32 head; NvBool ret; + /* Skip if GPU has been lost (e.g., Thunderbolt unplug) */ + if (pDevEvo->gpuLost) { + return; + } + guaranteedAndCurrent = nvCalloc(1, sizeof(*guaranteedAndCurrent) * NVKMS_MAX_HEADS_PER_DISP); if (guaranteedAndCurrent == NULL) { @@ -2748,6 +2753,11 @@ TryToDoPostFlipIMP(void *dataPtr, NvU32 dataU32) pDevEvo->postFlipIMPTimer = NULL; + /* Skip if GPU has been lost (e.g., Thunderbolt unplug) */ + if (pDevEvo->gpuLost) { + return; + } + FOR_ALL_EVO_DISPLAYS(pDispEvo, sd, pDevEvo) { NVEvoUpdateState updateState = { }; NvBool update = FALSE; diff --git a/src/nvidia-modeset/src/nvkms-rm.c b/src/nvidia-modeset/src/nvkms-rm.c index 3f1297bd98..d257cbfb54 100644 --- a/src/nvidia-modeset/src/nvkms-rm.c +++ b/src/nvidia-modeset/src/nvkms-rm.c @@ -2207,6 +2207,11 @@ NVDpyIdList nvRmGetConnectedDpys(const NVDispEvoRec *pDispEvo, NVDevEvoPtr pDevEvo = pDispEvo->pDevEvo; NvU32 ret; + /* Skip hardware access if GPU has been lost (e.g., Thunderbolt unplug) */ + if (pDevEvo->gpuLost) { + return nvEmptyDpyIdList(); + } + params.subDeviceInstance = pDispEvo->displayOwner; params.displayMask = nvDpyIdListToNvU32(dpyIdList); params.flags = @@ -3215,6 +3220,15 @@ NvBool nvRMSyncEvoChannel( { NvBool ret = TRUE; + /* + * Skip channel sync if the GPU has been lost (e.g., Thunderbolt eGPU + * surprise removal). The DMA control registers are invalid and would + * cause a crash. 
+ */ + if (pDevEvo->gpuLost) { + return FALSE; + } + if (pChannel) { NvU32 sd; diff --git a/src/nvidia-modeset/src/nvkms.c b/src/nvidia-modeset/src/nvkms.c index ddb5eb1430..3613acfb9f 100644 --- a/src/nvidia-modeset/src/nvkms.c +++ b/src/nvidia-modeset/src/nvkms.c @@ -1386,6 +1386,34 @@ static NvBool AllocDevice(struct NvKmsPerOpen *pOpen, pDevEvo = nvFindDevEvoByDeviceId(pParams->request.deviceId); + /* + * If we found an existing device that was marked as lost (e.g., from a + * previous Thunderbolt surprise removal), we need to clean it up before + * allocating a new device for the reconnected GPU. + */ + if (pDevEvo != NULL && pDevEvo->gpuLost) { + nvEvoLogDev(pDevEvo, EVO_LOG_INFO, + "Cleaning up stale device from previous surprise removal"); + /* + * Force cleanup of the stale device. Set allocRefCnt to 1 so that + * nvFreeDevEvo will actually free it. + */ + pDevEvo->allocRefCnt = 1; + nvFreeDevEvo(pDevEvo); + pDevEvo = NULL; + + /* + * After cleaning up a gpuLost device, reinitialize the global RM + * client handle. RM may have invalidated internal state when the + * GPU was lost, causing subsequent API calls to fail with + * NV_ERR_INVALID_OBJECT_HANDLE. + */ + if (!nvReinitializeGlobalClientAfterGpuLost()) { + pParams->reply.status = NVKMS_ALLOC_DEVICE_STATUS_FATAL_ERROR; + return FALSE; + } + } + if (pDevEvo == NULL) { pDevEvo = nvAllocDevEvo(&pParams->request, &pParams->reply.status); if (pDevEvo == NULL) { @@ -1604,6 +1632,24 @@ static void DisableRemainingVblankSemControls( static void FreeDeviceReference(struct NvKmsPerOpen *pOpen, struct NvKmsPerOpenDev *pOpenDev) { + NVDevEvoPtr pDevEvo = pOpenDev->pDevEvo; + + /* + * If pDevEvo is NULL, the device was already freed due to GPU loss + * (surprise removal). In this case, skip all hardware-related cleanup + * and just free the software structures. 
+ * + * Also check if the device is marked as gpuLost - this can happen if + * nvInvalidateDeviceReferences hasn't been called yet (e.g., during + * concurrent cleanup) or if there's a race between GPU loss detection + * and this close path. + */ + if (pDevEvo == NULL || pDevEvo->gpuLost) { + pOpenDev->pDevEvo = NULL; + nvFreePerOpenDev(pOpen, pOpenDev); + return; + } + /* Disable all client-owned vblank sync objects that still exist. */ DisableRemainingVblankSyncObjects(pOpen, pOpenDev); @@ -5260,6 +5306,31 @@ void nvRevokeDevice(NVDevEvoPtr pDevEvo) } } +/* + * Invalidate all pOpenDev references to a device. + * Called when GPU is lost to ensure nvKmsClose doesn't access freed pDevEvo. + * This sets pOpenDev->pDevEvo to NULL for all open handles. + */ +void nvInvalidateDeviceReferences(NVDevEvoPtr pDevEvo) +{ + struct NvKmsPerOpen *pOpen; + struct NvKmsPerOpenDev *pOpenDev; + NvKmsGenericHandle dev; + + if (pDevEvo == NULL) { + return; + } + + nvListForEachEntry(pOpen, &perOpenIoctlList, perOpenIoctlListEntry) { + FOR_ALL_POINTERS_IN_EVO_API_HANDLES(&pOpen->ioctl.devHandles, + pOpenDev, dev) { + if (pOpenDev->pDevEvo == pDevEvo) { + pOpenDev->pDevEvo = NULL; + } + } + } +} + /*! * Open callback. 
* @@ -6248,6 +6319,40 @@ static void FreeGlobalState(void) nvClearDpyOverrides(); } +NvBool nvReinitializeGlobalClientAfterGpuLost(void) +{ + NvU32 ret; + + /* Only reinitialize if we have a client handle */ + if (nvEvoGlobal.clientHandle == 0) { + return TRUE; + } + + nvEvoLog(EVO_LOG_INFO, "Reinitializing global client after GPU lost"); + + /* Free the old client handle */ + nvRmApiFree(nvEvoGlobal.clientHandle, nvEvoGlobal.clientHandle, + nvEvoGlobal.clientHandle); + nvEvoGlobal.clientHandle = 0; + + /* Allocate a new client handle */ + ret = nvRmApiAlloc(NV01_NULL_OBJECT, + NV01_NULL_OBJECT, + NV01_NULL_OBJECT, + NV01_ROOT, + &nvEvoGlobal.clientHandle); + + if (ret != NVOS_STATUS_SUCCESS) { + nvEvoLog(EVO_LOG_ERROR, "Failed to reinitialize global client"); + return FALSE; + } + + /* Update RM context */ + nvEvoGlobal.rmSmgContext.clientHandle = nvEvoGlobal.clientHandle; + + return TRUE; +} + /* * Wrappers to help SMG access NvKmsKAPI's RM context. */ @@ -6343,6 +6448,11 @@ static void ConsoleRestoreTimerFired(void *dataPtr, NvU32 dataU32) { NVDevEvoPtr pDevEvo = dataPtr; + /* Skip if GPU has been lost (e.g., Thunderbolt unplug) */ + if (pDevEvo->gpuLost) { + return; + } + if (pDevEvo->modesetOwner == NULL && pDevEvo->handleConsoleHotplugs) { pDevEvo->skipConsoleRestore = FALSE; nvEvoRestoreConsole(pDevEvo, TRUE /* allowMST */); @@ -6836,6 +6946,43 @@ void nvKmsResume(NvU32 gpuId) } } +/*! + * Mark a GPU as lost (e.g., Thunderbolt/eGPU hot-unplug). + * + * This prevents any hardware access attempts that would cause kernel crashes. + * The device's timers are cancelled and the gpuLost flag is set so that + * subsequent operations bail out early. 
+ */ +void nvKmsGpuLost(NvU32 gpuId) +{ + NVDevEvoPtr pDevEvo; + NvU32 i; + + FOR_ALL_EVO_DEVS(pDevEvo) { + for (i = 0; i < ARRAY_LEN(pDevEvo->openedGpuIds); i++) { + if (pDevEvo->openedGpuIds[i] == gpuId) { + nvEvoLogDev(pDevEvo, EVO_LOG_INFO, + "GPU lost (surprise removal), disabling hardware access"); + + /* Mark device as lost to prevent hardware access */ + pDevEvo->gpuLost = TRUE; + + /* Cancel timers that might try to access hardware */ + nvkms_free_timer(pDevEvo->consoleRestoreTimer); + pDevEvo->consoleRestoreTimer = NULL; + + nvkms_free_timer(pDevEvo->postFlipIMPTimer); + pDevEvo->postFlipIMPTimer = NULL; + + nvkms_free_timer(pDevEvo->lowerDispBandwidthTimer); + pDevEvo->lowerDispBandwidthTimer = NULL; + + return; + } + } + } +} + static void ServiceOneDeferredRequestFifo( NVDevEvoPtr pDevEvo, NVDeferredRequestFifoRec *pDeferredRequestFifo) diff --git a/src/nvidia/arch/nvalloc/unix/src/osapi.c b/src/nvidia/arch/nvalloc/unix/src/osapi.c index f5db7a0e90..1da6be2e54 100644 --- a/src/nvidia/arch/nvalloc/unix/src/osapi.c +++ b/src/nvidia/arch/nvalloc/unix/src/osapi.c @@ -308,6 +308,17 @@ void RmLogGpuCrash(OBJGPU *pGpu) { NvBool bGpuIsLost, bGpuIsConnected; + NvBool bIsExternalGpu = pGpu->getProperty(pGpu, PDB_PROP_GPU_IS_EXTERNAL_GPU); + + // + // For external GPUs (eGPUs) that have been disconnected, skip the crash + // dump entirely. The GPU is simply gone and attempting to save crash data + // will just produce noise in the logs. + // + if (bIsExternalGpu && pGpu->getProperty(pGpu, PDB_PROP_GPU_IS_LOST)) + { + return; + } // // Re-evaluate whether or not the GPU is accessible. This could be called @@ -4277,7 +4288,30 @@ void NV_API_CALL rm_power_source_change_event( OBJGPU *pGpu = gpumgrGetGpu(0); if (pGpu != NULL) { + // + // Check if the GPU is lost or inaccessible before proceeding. + // This can happen during hot-unplug (e.g., Thunderbolt eGPU removal) + // where ACPI events may still be delivered after the GPU is gone. 
+ // + if (pGpu->getProperty(pGpu, PDB_PROP_GPU_IS_LOST) || + !pGpu->getProperty(pGpu, PDB_PROP_GPU_IS_CONNECTED)) + { + rmapiLockRelease(); + goto done; + } + nv = NV_GET_NV_STATE(pGpu); + + // + // For external GPUs (Thunderbolt eGPU), check if we're in surprise + // removal before proceeding with power state changes. + // + if (nv->flags & NV_FLAG_IN_SURPRISE_REMOVAL) + { + rmapiLockRelease(); + goto done; + } + if ((rmStatus = os_ref_dynamic_power(nv, NV_DYNAMIC_PM_FINE)) == NV_OK) { @@ -4297,6 +4331,7 @@ void NV_API_CALL rm_power_source_change_event( } } +done: if (rmStatus != NV_OK) { NV_PRINTF(LEVEL_ERROR, @@ -5858,7 +5893,30 @@ void NV_API_CALL rm_acpi_nvpcf_notify( OBJGPU *pGpu = gpumgrGetGpu(0); if (pGpu != NULL) { + // + // Check if the GPU is lost or inaccessible before proceeding. + // This can happen during hot-unplug (e.g., Thunderbolt eGPU removal) + // where ACPI events may still be delivered after the GPU is gone. + // + if (pGpu->getProperty(pGpu, PDB_PROP_GPU_IS_LOST) || + !pGpu->getProperty(pGpu, PDB_PROP_GPU_IS_CONNECTED)) + { + rmapiLockRelease(); + goto done_nvpcf; + } + nv_state_t *nv = NV_GET_NV_STATE(pGpu); + + // + // For external GPUs (Thunderbolt eGPU), check if we're in surprise + // removal before proceeding with power state changes. 
+ // + if (nv->flags & NV_FLAG_IN_SURPRISE_REMOVAL) + { + rmapiLockRelease(); + goto done_nvpcf; + } + if ((rmStatus = os_ref_dynamic_power(nv, NV_DYNAMIC_PM_FINE)) == NV_OK) { @@ -5870,6 +5928,7 @@ void NV_API_CALL rm_acpi_nvpcf_notify( rmapiLockRelease(); } +done_nvpcf: threadStateFree(&threadState, THREAD_STATE_FLAGS_NONE); NV_EXIT_RM_RUNTIME(sp,fp); } diff --git a/src/nvidia/arch/nvalloc/unix/src/osinit.c b/src/nvidia/arch/nvalloc/unix/src/osinit.c index 44185ad78c..09e12451ad 100644 --- a/src/nvidia/arch/nvalloc/unix/src/osinit.c +++ b/src/nvidia/arch/nvalloc/unix/src/osinit.c @@ -356,18 +356,33 @@ osHandleGpuLost pmc_boot_0 = NV_PRIV_REG_RD32(nv->regs->map_u, NV_PMC_BOOT_0); if (pmc_boot_0 != nvp->pmc_boot_0) { + NvBool bIsExternalGpu = pGpu->getProperty(pGpu, PDB_PROP_GPU_IS_EXTERNAL_GPU); + // // This doesn't support PEX Reset and Recovery yet. // This will help to prevent accessing registers of a GPU // which has fallen off the bus. // - nvErrorLog_va((void *)pGpu, ROBUST_CHANNEL_GPU_HAS_FALLEN_OFF_THE_BUS, - "GPU has fallen off the bus."); + // For external GPUs (eGPUs), this is an expected condition during + // hot-unplug, so we keep logging minimal to avoid noise. 
+ // + if (!bIsExternalGpu) + { + nvErrorLog_va((void *)pGpu, ROBUST_CHANNEL_GPU_HAS_FALLEN_OFF_THE_BUS, + "GPU has fallen off the bus."); + } gpuNotifySubDeviceEvent(pGpu, NV2080_NOTIFIERS_GPU_UNAVAILABLE, NULL, 0, ROBUST_CHANNEL_GPU_HAS_FALLEN_OFF_THE_BUS, 0); - NV_DEV_PRINTF(NV_DBG_ERRORS, nv, "GPU has fallen off the bus.\n"); + if (bIsExternalGpu) + { + NV_DEV_PRINTF(NV_DBG_WARNINGS, nv, "External GPU disconnected.\n"); + } + else + { + NV_DEV_PRINTF(NV_DBG_ERRORS, nv, "GPU has fallen off the bus.\n"); + } if (pGpu->boardInfo != NULL && pGpu->boardInfo->serialNumber[0] != '\0') { @@ -2479,13 +2494,28 @@ void RmShutdownAdapter( if (nvp->flags & NV_INIT_FLAG_GPU_STATE_LOAD) { rmStatus = gpuStateUnload(pGpu, GPU_STATE_DEFAULT); - NV_ASSERT(rmStatus == NV_OK); + // + // During surprise removal (e.g., Thunderbolt eGPU hot-unplug), + // this may fail. Log but don't assert since we're tearing down anyway. + // + if (rmStatus != NV_OK) + { + NV_PRINTF(LEVEL_WARNING, + "gpuStateUnload failed during teardown: 0x%x\n", rmStatus); + } } if (nvp->flags & NV_INIT_FLAG_GPU_STATE) { rmStatus = gpuStateDestroy(pGpu); - NV_ASSERT(rmStatus == NV_OK); + // + // During surprise removal, this may fail. Log but don't assert. + // + if (rmStatus != NV_OK) + { + NV_PRINTF(LEVEL_WARNING, + "gpuStateDestroy failed during teardown: 0x%x\n", rmStatus); + } } if (IS_DCE_CLIENT(pGpu)) @@ -2639,7 +2669,14 @@ void RmDisableAdapter( if (nvp->flags & NV_INIT_FLAG_GPU_STATE_LOAD) { rmStatus = gpuStateUnload(pGpu, GPU_STATE_DEFAULT); - NV_ASSERT(rmStatus == NV_OK); + // + // During surprise removal, this may fail. Log but don't assert. 
+ // + if (rmStatus != NV_OK) + { + NV_PRINTF(LEVEL_WARNING, + "gpuStateUnload failed during eGPU teardown: 0x%x\n", rmStatus); + } nvp->flags &= ~NV_INIT_FLAG_GPU_STATE_LOAD; } diff --git a/src/nvidia/src/kernel/core/thread_state.c b/src/nvidia/src/kernel/core/thread_state.c index 10f73e3e48..e1fdabf22d 100644 --- a/src/nvidia/src/kernel/core/thread_state.c +++ b/src/nvidia/src/kernel/core/thread_state.c @@ -407,7 +407,10 @@ static NV_STATUS _threadNodeCheckTimeout(OBJGPU *pGpu, THREAD_STATE_NODE *pThrea { if (!API_GPU_ATTACHED_SANITY_CHECK(pGpu)) { - NV_PRINTF(LEVEL_ERROR, "API_GPU_ATTACHED_SANITY_CHECK failed!\n"); + // + // Don't log error during surprise removal - this is expected + // when GPU is hot-unplugged (e.g., Thunderbolt eGPU). + // return NV_ERR_TIMEOUT; } } diff --git a/src/nvidia/src/kernel/diagnostics/nv_debug_dump.c b/src/nvidia/src/kernel/diagnostics/nv_debug_dump.c index d76f255a13..7d028da946 100644 --- a/src/nvidia/src/kernel/diagnostics/nv_debug_dump.c +++ b/src/nvidia/src/kernel/diagnostics/nv_debug_dump.c @@ -220,6 +220,17 @@ nvdDoEngineDump_IMPL NVD_ENGINE_CALLBACK *pEngineCallback; NV_STATUS nvStatus = NV_OK; + // + // Skip engine dumps for expected external GPU surprise removal. + // Engine dump attempts will fail with GPU_IS_LOST errors which + // are expected and just add noise to the log. + // + if (pGpu->getProperty(pGpu, PDB_PROP_GPU_IS_EXTERNAL_GPU) && + pGpu->getProperty(pGpu, PDB_PROP_GPU_IS_LOST)) + { + return NV_ERR_GPU_IS_LOST; + } + NV_CHECK_OK_OR_RETURN(LEVEL_ERROR, prbEncNestedStart(pPrbEnc, NVDEBUG_NVDUMP_GPU_INFO)); @@ -263,6 +274,17 @@ nvdDumpAllEngines_IMPL NVD_ENGINE_CALLBACK *pEngineCallback; NV_STATUS nvStatus = NV_OK; + // + // Skip engine dumps for expected external GPU surprise removal. + // Engine dump attempts will fail with GPU_IS_LOST errors which + // are expected and just add noise to the log. 
+ // + if (pGpu->getProperty(pGpu, PDB_PROP_GPU_IS_EXTERNAL_GPU) && + pGpu->getProperty(pGpu, PDB_PROP_GPU_IS_LOST)) + { + return NV_ERR_GPU_IS_LOST; + } + NV_CHECK_OK_OR_RETURN(LEVEL_ERROR, prbEncNestedStart(pPrbEnc, NVDEBUG_NVDUMP_GPU_INFO)); diff --git a/src/nvidia/src/kernel/gpu/disp/kern_disp.c b/src/nvidia/src/kernel/gpu/disp/kern_disp.c index d061de8aec..d52293b999 100644 --- a/src/nvidia/src/kernel/gpu/disp/kern_disp.c +++ b/src/nvidia/src/kernel/gpu/disp/kern_disp.c @@ -308,7 +308,7 @@ kdispDestroyCommonHandle_IMPL rmStatus = pRmApi->FreeWithSecInfo(pRmApi, pKernelDisplay->hInternalClient, pKernelDisplay->hDispCommonHandle, RMAPI_ALLOC_FLAGS_NONE, &pRmApi->defaultSecInfo); - NV_ASSERT(rmStatus == NV_OK); + NV_ASSERT((rmStatus == NV_OK) || (rmStatus == NV_ERR_GPU_IN_FULLCHIP_RESET) || (rmStatus == NV_ERR_GPU_IS_LOST)); rmapiutilFreeClientAndDeviceHandles(pRmApi, &pKernelDisplay->hInternalClient, &pKernelDisplay->hInternalDevice, diff --git a/src/nvidia/src/kernel/gpu/falcon/arch/turing/kernel_falcon_tu102.c b/src/nvidia/src/kernel/gpu/falcon/arch/turing/kernel_falcon_tu102.c index 8b828fc69c..4cf70bef2c 100644 --- a/src/nvidia/src/kernel/gpu/falcon/arch/turing/kernel_falcon_tu102.c +++ b/src/nvidia/src/kernel/gpu/falcon/arch/turing/kernel_falcon_tu102.c @@ -184,8 +184,13 @@ kflcnReset_TU102 NV_ASSERT_OK_OR_RETURN(kflcnPreResetWait_HAL(pGpu, pKernelFlcn)); NV_ASSERT_OK(kflcnResetHw(pGpu, pKernelFlcn)); status = kflcnWaitForResetToFinish_HAL(pGpu, pKernelFlcn); - NV_ASSERT_OR_RETURN((status == NV_OK) || (status == NV_ERR_GPU_IN_FULLCHIP_RESET), status); - if (status == NV_ERR_GPU_IN_FULLCHIP_RESET) + // + // During surprise removal, this may return NV_ERR_TIMEOUT in addition to + // NV_ERR_GPU_IS_LOST. Both are acceptable during teardown. 
+ //
+ NV_ASSERT_OR_RETURN((status == NV_OK) || (status == NV_ERR_GPU_IN_FULLCHIP_RESET) ||
+ (status == NV_ERR_GPU_IS_LOST) || (status == NV_ERR_TIMEOUT), status);
+ if (status != NV_OK)
return status;
kflcnSwitchToFalcon_HAL(pGpu, pKernelFlcn);
kflcnRegWrite_HAL(pGpu, pKernelFlcn, NV_PFALCON_FALCON_RM,
diff --git a/src/nvidia/src/kernel/gpu/gpu.c b/src/nvidia/src/kernel/gpu/gpu.c
index 51a48bad08..2329e5f37a 100644
--- a/src/nvidia/src/kernel/gpu/gpu.c
+++ b/src/nvidia/src/kernel/gpu/gpu.c
@@ -5180,6 +5180,17 @@ gpuSetDisconnectedProperties_IMPL
OBJGPU *pGpu
)
{
+ //
+ // Log GPU disconnection once. This is expected during Thunderbolt eGPU
+ // hot-unplug but should be noted for debugging purposes.
+ //
+ if (!pGpu->getProperty(pGpu, PDB_PROP_GPU_IS_LOST))
+ {
+ NV_PRINTF(LEVEL_NOTICE,
+ "GPU 0x%x marked as disconnected/lost\n",
+ pGpu->gpuInstance);
+ }
+
pGpu->setProperty(pGpu, PDB_PROP_GPU_IS_LOST, NV_TRUE);
pGpu->setProperty(pGpu, PDB_PROP_GPU_IS_CONNECTED, NV_FALSE);
pGpu->setProperty(pGpu, PDB_PROP_GPU_IN_PM_CODEPATH, NV_FALSE);
diff --git a/src/nvidia/src/kernel/gpu/gpu_user_shared_data.c b/src/nvidia/src/kernel/gpu/gpu_user_shared_data.c
index e5284448e1..f35f81e421 100644
--- a/src/nvidia/src/kernel/gpu/gpu_user_shared_data.c
+++ b/src/nvidia/src/kernel/gpu/gpu_user_shared_data.c
@@ -242,12 +242,15 @@ _gpushareddataDestroyGsp
params.bInit = NV_FALSE;
- // Free Memdesc on GSP-side
- NV_CHECK_OK(status, LEVEL_ERROR,
- pRmApi->Control(pRmApi, pGpu->hInternalClient,
- pGpu->hInternalSubdevice,
- NV2080_CTRL_CMD_INTERNAL_INIT_USER_SHARED_DATA,
- &params, sizeof(params)));
+ // Free Memdesc on GSP-side - ignore GPU_IS_LOST during surprise removal
+ status = pRmApi->Control(pRmApi, pGpu->hInternalClient,
+ pGpu->hInternalSubdevice,
+ NV2080_CTRL_CMD_INTERNAL_INIT_USER_SHARED_DATA,
+ &params, sizeof(params));
+ if ((status != NV_OK) && (status != NV_ERR_GPU_IS_LOST))
+ {
+ NV_PRINTF(LEVEL_ERROR, "Failed to free user shared data on GSP: 0x%x\n", status);
+ }
}

static 
NV_STATUS
@@ -460,9 +463,9 @@ _gpushareddataSendDataPollRpc
NV2080_CTRL_CMD_INTERNAL_USER_SHARED_DATA_SET_DATA_POLL,
&params, sizeof(params));
NV_CHECK_OR_RETURN(LEVEL_ERROR,
- (status == NV_OK) || (status == NV_ERR_GPU_IN_FULLCHIP_RESET),
+ (status == NV_OK) || (status == NV_ERR_GPU_IN_FULLCHIP_RESET) || (status == NV_ERR_GPU_IS_LOST),
status);
- if (status == NV_ERR_GPU_IN_FULLCHIP_RESET)
+ if ((status == NV_ERR_GPU_IN_FULLCHIP_RESET) || (status == NV_ERR_GPU_IS_LOST))
return status;
pGpu->userSharedData.lastPolledDataMask = polledDataMask;
pGpu->userSharedData.pollingIntervalMs = pollingIntervalMs;
diff --git a/src/nvidia/src/kernel/gpu/gr/fecs_event_list.c b/src/nvidia/src/kernel/gpu/gr/fecs_event_list.c
index 566a656c9b..5aacfdca7e 100644
--- a/src/nvidia/src/kernel/gpu/gr/fecs_event_list.c
+++ b/src/nvidia/src/kernel/gpu/gr/fecs_event_list.c
@@ -1620,8 +1620,8 @@ fecsBufferDisableHw
NV2080_CTRL_CMD_INTERNAL_GR_GET_FECS_TRACE_HW_ENABLE,
&getHwEnableParams,
sizeof(getHwEnableParams));
- NV_ASSERT_OR_RETURN_VOID((status == NV_OK) || (status == NV_ERR_GPU_IN_FULLCHIP_RESET));
- if (status == NV_ERR_GPU_IN_FULLCHIP_RESET)
+ NV_ASSERT_OR_RETURN_VOID((status == NV_OK) || (status == NV_ERR_GPU_IN_FULLCHIP_RESET) || (status == NV_ERR_GPU_IS_LOST));
+ if ((status == NV_ERR_GPU_IN_FULLCHIP_RESET) || (status == NV_ERR_GPU_IS_LOST))
return;

if (getHwEnableParams.bEnable)
@@ -1636,7 +1636,7 @@
NV2080_CTRL_CMD_INTERNAL_GR_SET_FECS_TRACE_HW_ENABLE,
&setHwEnableParams,
sizeof(setHwEnableParams));
- NV_ASSERT_OR_RETURN_VOID((status == NV_OK) || (status == NV_ERR_GPU_IN_FULLCHIP_RESET));
+ NV_ASSERT_OR_RETURN_VOID((status == NV_OK) || (status == NV_ERR_GPU_IN_FULLCHIP_RESET) || (status == NV_ERR_GPU_IS_LOST));
}
}
diff --git a/src/nvidia/src/kernel/gpu/gr/kernel_graphics.c b/src/nvidia/src/kernel/gpu/gr/kernel_graphics.c
index ceaae2c99c..81e4271a40 100644
--- a/src/nvidia/src/kernel/gpu/gr/kernel_graphics.c
+++ b/src/nvidia/src/kernel/gpu/gr/kernel_graphics.c
@@ -2613,7 +2613,7 @@ void kgraphicsFreeGlobalCtxBuffers_IMPL { NV_STATUS status; status = kmemsysCacheOp_HAL(pGpu, pKernelMemorySystem, NULL, FB_CACHE_VIDEO_MEMORY, FB_CACHE_EVICT); - NV_ASSERT((status == NV_OK) || (status == NV_ERR_GPU_IN_FULLCHIP_RESET)); + NV_ASSERT((status == NV_OK) || (status == NV_ERR_GPU_IN_FULLCHIP_RESET) || (status == NV_ERR_GPU_IS_LOST)); } } diff --git a/src/nvidia/src/kernel/gpu/gsp/arch/ampere/kernel_gsp_falcon_ga102.c b/src/nvidia/src/kernel/gpu/gsp/arch/ampere/kernel_gsp_falcon_ga102.c index 19d415a775..e1b8d9d069 100644 --- a/src/nvidia/src/kernel/gpu/gsp/arch/ampere/kernel_gsp_falcon_ga102.c +++ b/src/nvidia/src/kernel/gpu/gsp/arch/ampere/kernel_gsp_falcon_ga102.c @@ -179,6 +179,10 @@ kgspExecuteHsFalcon_GA102 NvU32 data = 0; NvU32 dmaCmd; + // Check for surprise removal (e.g., Thunderbolt eGPU hot-unplug) + if (pGpu->getProperty(pGpu, PDB_PROP_GPU_IS_LOST)) + return NV_ERR_GPU_IS_LOST; + NV_ASSERT_OR_RETURN(pFlcnUcode != NULL, NV_ERR_INVALID_ARGUMENT); NV_ASSERT_OR_RETURN(pKernelFlcn != NULL, NV_ERR_INVALID_STATE); diff --git a/src/nvidia/src/kernel/gpu/gsp/arch/turing/kernel_gsp_booter_tu102.c b/src/nvidia/src/kernel/gpu/gsp/arch/turing/kernel_gsp_booter_tu102.c index 20f84418a0..d1ddf81865 100644 --- a/src/nvidia/src/kernel/gpu/gsp/arch/turing/kernel_gsp_booter_tu102.c +++ b/src/nvidia/src/kernel/gpu/gsp/arch/turing/kernel_gsp_booter_tu102.c @@ -145,6 +145,10 @@ kgspExecuteBooterUnloadIfNeeded_TU102 if (API_GPU_IN_RESET_SANITY_CHECK(pGpu)) return NV_ERR_GPU_IN_FULLCHIP_RESET; + // Check for surprise removal (e.g., Thunderbolt eGPU hot-unplug) + if (pGpu->getProperty(pGpu, PDB_PROP_GPU_IS_LOST)) + return NV_ERR_GPU_IS_LOST; + // skip actually executing Booter Unload if WPR2 is not up if (!kgspIsWpr2Up_HAL(pGpu, pKernelGsp)) { @@ -155,7 +159,16 @@ kgspExecuteBooterUnloadIfNeeded_TU102 NV_PRINTF(LEVEL_INFO, "executing Booter Unload\n"); NV_ASSERT_OR_RETURN(pKernelGsp->pBooterUnloadUcode != NULL, NV_ERR_INVALID_STATE); - 
NV_ASSERT_OK(kflcnReset_HAL(pGpu, staticCast(pKernelSec2, KernelFalcon))); + // Falcon reset may timeout during surprise removal - don't assert + status = kflcnReset_HAL(pGpu, staticCast(pKernelSec2, KernelFalcon)); + if ((status != NV_OK) && (status != NV_ERR_TIMEOUT) && (status != NV_ERR_GPU_IS_LOST)) + { + NV_ASSERT(0); + } + if (status != NV_OK) + { + return status; + } // SR code if (sysmemAddrOfSuspendResumeData != 0) diff --git a/src/nvidia/src/kernel/gpu/gsp/arch/turing/kernel_gsp_tu102.c b/src/nvidia/src/kernel/gpu/gsp/arch/turing/kernel_gsp_tu102.c index 71d1de4e9a..fccfae7dca 100644 --- a/src/nvidia/src/kernel/gpu/gsp/arch/turing/kernel_gsp_tu102.c +++ b/src/nvidia/src/kernel/gpu/gsp/arch/turing/kernel_gsp_tu102.c @@ -636,7 +636,18 @@ kgspTeardown_TU102 // Reset GSP so we can load FWSEC-SB status = kflcnReset_HAL(pGpu, staticCast(pKernelGsp, KernelFalcon)); - NV_ASSERT((status == NV_OK) || (status == NV_ERR_GPU_IN_FULLCHIP_RESET)); + // + // During surprise removal, this may return NV_ERR_TIMEOUT in addition to + // NV_ERR_GPU_IS_LOST. Both are acceptable during teardown. 
+ // + NV_ASSERT((status == NV_OK) || (status == NV_ERR_GPU_IN_FULLCHIP_RESET) || + (status == NV_ERR_GPU_IS_LOST) || (status == NV_ERR_TIMEOUT)); + + // Skip remaining hardware operations if GPU is lost/timeout - can't talk to it anyway + if (status != NV_OK) + { + goto skip_fwsec; + } // Invoke FWSEC-SB to put back PreOsApps during driver unload status = kgspPrepareForFwsecSb_HAL(pGpu, pKernelGsp, pKernelGsp->pFwsecUcode, &preparedCmd); @@ -648,7 +659,7 @@ kgspTeardown_TU102 else { status = kgspExecuteFwsec_HAL(pGpu, pKernelGsp, &preparedCmd); - if ((status != NV_OK) && (status != NV_ERR_GPU_IN_FULLCHIP_RESET)) + if ((status != NV_OK) && (status != NV_ERR_GPU_IN_FULLCHIP_RESET) && (status != NV_ERR_GPU_IS_LOST)) { NV_PRINTF(LEVEL_ERROR, "failed to execute FWSEC-SB for PreOsApps during driver unload: 0x%x\n", status); NV_ASSERT_FAILED("FWSEC-SB failed"); @@ -656,6 +667,8 @@ kgspTeardown_TU102 } } +skip_fwsec: + // Execute Booter Unload status = kgspExecuteBooterUnloadIfNeeded_HAL(pGpu, pKernelGsp, _kgspGetBooterUnloadArgs(pKernelGsp, unloadMode)); diff --git a/src/nvidia/src/kernel/gpu/gsp/kernel_gsp.c b/src/nvidia/src/kernel/gpu/gsp/kernel_gsp.c index be52b51333..fc337d4429 100644 --- a/src/nvidia/src/kernel/gpu/gsp/kernel_gsp.c +++ b/src/nvidia/src/kernel/gpu/gsp/kernel_gsp.c @@ -149,6 +149,8 @@ static NV_STATUS _kgspRpcRecvPoll(OBJGPU *, OBJRPC *, NvU32, NvU32); static NV_STATUS _kgspRpcDrainEvents(OBJGPU *, KernelGsp *, NvU32, NvU32, KernelGspRpcEventHandlerContext); static void _kgspRpcIncrementTimeoutCountAndRateLimitPrints(OBJGPU *, OBJRPC *); +static NvBool _kgspIsExternalGpuSurpriseRemoval(OBJGPU *); + static NV_STATUS _kgspAllocSimAccessBuffer(OBJGPU *pGpu, KernelGsp *pKernelGsp); static void _kgspFreeSimAccessBuffer(OBJGPU *pGpu, KernelGsp *pKernelGsp); @@ -306,11 +308,13 @@ _kgspRpcSanityCheck(OBJGPU *pGpu, KernelGsp *pKernelGsp, OBJRPC *pRpc) pGpu->getProperty(pGpu, PDB_PROP_GPU_IS_LOST)) { NV_PRINTF(LEVEL_INFO, "GPU lost, skipping RPC\n"); + 
pRpc->bQuietPrints = NV_TRUE; return NV_ERR_GPU_IS_LOST; } if (osIsGpuShutdown(pGpu)) { NV_PRINTF(LEVEL_INFO, "GPU shutdown, skipping RPC\n"); + pRpc->bQuietPrints = NV_TRUE; return NV_ERR_GPU_IS_LOST; } if (!gpuIsGpuFullPowerForPmResume(pGpu)) @@ -2029,6 +2033,20 @@ kgspLogRpcDebugInfoToProtobuf prbEncNestedEnd(pProtobufData); } +/*! + * Check if this is an expected external GPU surprise removal. + * Used to suppress noisy debug output during normal eGPU hot-unplug. + */ +static NvBool +_kgspIsExternalGpuSurpriseRemoval +( + OBJGPU *pGpu +) +{ + return pGpu->getProperty(pGpu, PDB_PROP_GPU_IS_EXTERNAL_GPU) && + pGpu->getProperty(pGpu, PDB_PROP_GPU_IS_LOST); +} + void kgspLogRpcDebugInfo ( @@ -2044,6 +2062,15 @@ kgspLogRpcDebugInfo NvU64 activeData[2]; const NvU32 rpcEntriesToLog = (RPC_HISTORY_DEPTH > 8) ? 8 : RPC_HISTORY_DEPTH; + // + // Suppress detailed RPC debug output for expected external GPU surprise removal. + // This keeps the log clean during normal Thunderbolt eGPU hot-unplug. + // + if (_kgspIsExternalGpuSurpriseRemoval(pGpu)) + { + return; + } + _kgspGetActiveRpcDebugData(pRpc, pMsgHdr->function, &activeData[0], &activeData[1]); NV_ERROR_LOG_DATA(pGpu, errorNum, @@ -2096,6 +2123,15 @@ _kgspCheckSlowRpc NV_ASSERT_OR_RETURN_VOID(tsFreqUs > 0); + // + // Suppress slow RPC warnings for expected external GPU surprise removal. + // During normal Thunderbolt eGPU hot-unplug, slow/stalled RPCs are expected. + // + if (_kgspIsExternalGpuSurpriseRemoval(pGpu)) + { + return; + } + duration = (pHistoryEntry->ts_end - pHistoryEntry->ts_start) / tsFreqUs; if (duration > SLOW_RPC_THRESHOLD_US) @@ -2145,7 +2181,15 @@ _kgspLogXid119 KernelGsp *pKernelGsp = GPU_GET_KERNEL_GSP(pGpu); KernelFalcon *pKernelFlcn = staticCast(pKernelGsp, KernelFalcon); - if (pRpc->timeoutCount == 1) + // + // Suppress Xid 119 logging for expected external GPU surprise removal. + // During normal Thunderbolt eGPU hot-unplug, RPC timeouts are expected. 
+ // + if (_kgspIsExternalGpuSurpriseRemoval(pGpu)) + { + return; + } + { NV_PRINTF(LEVEL_ERROR, "********************************* GSP Timeout **********************************\n"); diff --git a/src/nvidia/src/kernel/gpu/intr/intr.c b/src/nvidia/src/kernel/gpu/intr/intr.c index 86e1991855..c7a2f89bb5 100644 --- a/src/nvidia/src/kernel/gpu/intr/intr.c +++ b/src/nvidia/src/kernel/gpu/intr/intr.c @@ -119,6 +119,18 @@ intrServiceStall_IMPL(OBJGPU *pGpu, Intr *pIntr) if (!RMCFG_FEATURE_PLATFORM_GSP) { + // + // Check if GPU is already known to be lost/detached before doing any + // register reads. This prevents log spam during surprise removal + // (e.g., Thunderbolt eGPU hot-unplug). + // + if (!API_GPU_ATTACHED_SANITY_CHECK(pGpu) || + API_GPU_IN_RESET_SANITY_CHECK(pGpu) || + pGpu->getProperty(pGpu, PDB_PROP_GPU_IS_LOST)) + { + goto exit; + } + // // If the GPU is off the BUS or surprise removed during servicing DPC for ISRs // we wont know about GPU state until after we start processing DPCs for every @@ -134,18 +146,17 @@ intrServiceStall_IMPL(OBJGPU *pGpu, Intr *pIntr) if (regReadValue == GPU_REG_VALUE_INVALID) { - NV_PRINTF(LEVEL_ERROR, - "Failed GPU reg read : 0x%x. Check whether GPU is present on the bus\n", - regReadValue); - } - - if (!API_GPU_ATTACHED_SANITY_CHECK(pGpu)) - { - goto exit; - } - - if (API_GPU_IN_RESET_SANITY_CHECK(pGpu)) - { + // + // GPU has been surprise removed. Mark it as lost and return early. + // Log once when first detected. + // + if (!pGpu->getProperty(pGpu, PDB_PROP_GPU_IS_LOST)) + { + NV_PRINTF(LEVEL_WARNING, + "GPU 0x%x surprise removed (reg read returned 0xFFFFFFFF)\n", + pGpu->gpuInstance); + pGpu->setProperty(pGpu, PDB_PROP_GPU_IS_LOST, NV_TRUE); + } goto exit; } } @@ -1555,6 +1566,18 @@ _intrServiceStallCommonCheckBegin if (!RMCFG_FEATURE_PLATFORM_GSP) { + // + // Check if GPU is already known to be lost/detached before doing any + // register reads. 
This prevents log spam during surprise removal + // (e.g., Thunderbolt eGPU hot-unplug). + // + if (!API_GPU_ATTACHED_SANITY_CHECK(pGpu) || + API_GPU_IN_RESET_SANITY_CHECK(pGpu) || + pGpu->getProperty(pGpu, PDB_PROP_GPU_IS_LOST)) + { + return NV_ERR_GPU_IS_LOST; + } + // // If the GPU is off the BUS or surprise removed during servicing DPC for ISRs // we wont know about GPU state until after we start processing DPCs for every @@ -1570,14 +1593,17 @@ _intrServiceStallCommonCheckBegin if (regReadValue == GPU_REG_VALUE_INVALID) { - NV_PRINTF(LEVEL_ERROR, - "Failed GPU reg read : 0x%x. Check whether GPU is present on the bus\n", - regReadValue); - } - - // Dont service interrupts if GPU is surprise removed - if (!API_GPU_ATTACHED_SANITY_CHECK(pGpu) || API_GPU_IN_RESET_SANITY_CHECK(pGpu)) - { + // + // GPU has been surprise removed. Mark it as lost and return early. + // Log once when first detected. + // + if (!pGpu->getProperty(pGpu, PDB_PROP_GPU_IS_LOST)) + { + NV_PRINTF(LEVEL_WARNING, + "GPU 0x%x surprise removed (reg read returned 0xFFFFFFFF)\n", + pGpu->gpuInstance); + pGpu->setProperty(pGpu, PDB_PROP_GPU_IS_LOST, NV_TRUE); + } return NV_ERR_GPU_IS_LOST; } } @@ -1635,7 +1661,16 @@ intrServiceStallList_IMPL NvBool bPending; CALL_CONTEXT *pOldContext = NULL; - NV_ASSERT_OK_OR_ELSE(status, _intrServiceStallCommonCheckBegin(pGpu, pIntr, &pOldContext), return); + // + // Don't use NV_ASSERT_OK_OR_ELSE here - NV_ERR_GPU_IS_LOST is expected + // during surprise removal (e.g., Thunderbolt eGPU hot-unplug) and + // should not spam the logs with assertion messages. 
+ // + status = _intrServiceStallCommonCheckBegin(pGpu, pIntr, &pOldContext); + if (status != NV_OK) + { + return; + } do { @@ -1688,7 +1723,16 @@ intrServiceStallSingle_IMPL bitVectorClrAll(&engines); bitVectorSet(&engines, engIdx); - NV_ASSERT_OK_OR_ELSE(status, _intrServiceStallCommonCheckBegin(pGpu, pIntr, &pOldContext), return); + // + // Don't use NV_ASSERT_OK_OR_ELSE here - NV_ERR_GPU_IS_LOST is expected + // during surprise removal (e.g., Thunderbolt eGPU hot-unplug) and + // should not spam the logs with assertion messages. + // + status = _intrServiceStallCommonCheckBegin(pGpu, pIntr, &pOldContext); + if (status != NV_OK) + { + return; + } do { diff --git a/src/nvidia/src/kernel/gpu/mem_mgr/ce_utils.c b/src/nvidia/src/kernel/gpu/mem_mgr/ce_utils.c index 6f25f6bf53..20a8d62fb3 100644 --- a/src/nvidia/src/kernel/gpu/mem_mgr/ce_utils.c +++ b/src/nvidia/src/kernel/gpu/mem_mgr/ce_utils.c @@ -343,10 +343,17 @@ ceutilsDestruct_IMPL // process all callbacks while CeUtils is fully functional _ceutilsProcessCompletionCallbacks(pCeUtils); portSyncSpinlockAcquire(pCeUtils->pCallbackLock); - NV_ASSERT(listCount(&pCeUtils->completionCallbacks) == 0); + // During surprise removal, callbacks may not complete cleanly - skip assertion if GPU is lost + if (!pGpu->getProperty(pGpu, PDB_PROP_GPU_IS_LOST)) + { + NV_ASSERT(listCount(&pCeUtils->completionCallbacks) == 0); + } portSyncSpinlockRelease(pCeUtils->pCallbackLock); // make sure no new work was queued from callbacks - NV_ASSERT(pCeUtils->lastCompletedPayload == lastSubmittedPayload); + if (!pGpu->getProperty(pGpu, PDB_PROP_GPU_IS_LOST)) + { + NV_ASSERT(pCeUtils->lastCompletedPayload == lastSubmittedPayload); + } if ((pChannel->bClientUserd) && (pChannel->pControlGPFifo != NULL)) { diff --git a/src/nvidia/src/kernel/gpu/mem_mgr/vaspace_api.c b/src/nvidia/src/kernel/gpu/mem_mgr/vaspace_api.c index 86b62f1b1e..32c96523e9 100644 --- a/src/nvidia/src/kernel/gpu/mem_mgr/vaspace_api.c +++ 
b/src/nvidia/src/kernel/gpu/mem_mgr/vaspace_api.c @@ -570,7 +570,7 @@ vaspaceapiDestruct_IMPL(VaSpaceApi *pVaspaceApi) if ((IS_VIRTUAL(pGpu) || IS_GSP_CLIENT(pGpu)) && !bBar1VA && !bFlaVA) { NV_RM_RPC_FREE(pGpu, hClient, hParent, hVASpace, status); - NV_ASSERT((status == NV_OK) || (status == NV_ERR_GPU_IN_FULLCHIP_RESET)); + NV_ASSERT((status == NV_OK) || (status == NV_ERR_GPU_IN_FULLCHIP_RESET) || (status == NV_ERR_GPU_IS_LOST)); } NV_PRINTF(LEVEL_INFO, diff --git a/src/nvidia/src/kernel/mem_mgr/mem.c b/src/nvidia/src/kernel/mem_mgr/mem.c index 706faa9021..e0dc48016b 100644 --- a/src/nvidia/src/kernel/mem_mgr/mem.c +++ b/src/nvidia/src/kernel/mem_mgr/mem.c @@ -175,7 +175,7 @@ memDestruct_IMPL if (pMemory->bRpcAlloc && (IS_VIRTUAL(pGpu) || IS_FW_CLIENT(pGpu))) { NV_RM_RPC_FREE(pGpu, hClient, hParent, hMemory, status); - NV_ASSERT((status == NV_OK) || (status == NV_ERR_GPU_IN_FULLCHIP_RESET)); + NV_ASSERT((status == NV_OK) || (status == NV_ERR_GPU_IN_FULLCHIP_RESET) || (status == NV_ERR_GPU_IS_LOST)); } } diff --git a/src/nvidia/src/kernel/rmapi/nv_gpu_ops.c b/src/nvidia/src/kernel/rmapi/nv_gpu_ops.c index 6a155fd597..23a594b92a 100644 --- a/src/nvidia/src/kernel/rmapi/nv_gpu_ops.c +++ b/src/nvidia/src/kernel/rmapi/nv_gpu_ops.c @@ -809,10 +809,19 @@ NV_STATUS nvGpuOpsDestroySession(struct gpuSession *session) if (!session) return NV_OK; - // Sanity Check: There should not be any attached devices with the session! - NV_ASSERT(!session->devices); - // Sanity Check: If there are no devices, there should also be no p2p Info! - NV_ASSERT(!session->p2pInfo); + // During surprise removal (GPU lost), devices may not have been properly + // detached. In normal operation, these assertions catch programming errors. + // When the GPU is lost, we log and continue to avoid blocking cleanup. 
+ if (session->devices) + { + NV_PRINTF(LEVEL_WARNING, + "Destroying session with devices still attached (GPU may be lost)\n"); + } + if (session->p2pInfo) + { + NV_PRINTF(LEVEL_WARNING, + "Destroying session with p2p info still present (GPU may be lost)\n"); + } // freeing session will free everything under it pRmApi->Free(pRmApi, session->handle, session->handle); diff --git a/src/nvidia/src/kernel/vgpu/rpc.c b/src/nvidia/src/kernel/vgpu/rpc.c index 35673a0e33..3ca3ab710b 100644 --- a/src/nvidia/src/kernel/vgpu/rpc.c +++ b/src/nvidia/src/kernel/vgpu/rpc.c @@ -1872,6 +1872,16 @@ static NV_STATUS _issueRpcAndWait(OBJGPU *pGpu, OBJRPC *pRpc) NvU32 expectedFunc = vgpu_rpc_message_header_v->function; NvU32 expectedSequence = 0; + // + // Suppress RPC error logging for expected external GPU surprise removal. + // During normal Thunderbolt eGPU hot-unplug, RPC failures are expected. + // + if (pGpu->getProperty(pGpu, PDB_PROP_GPU_IS_EXTERNAL_GPU) && + pGpu->getProperty(pGpu, PDB_PROP_GPU_IS_LOST)) + { + pRpc->bQuietPrints = NV_TRUE; + } + status = rpcSendMessage(pGpu, pRpc, &expectedSequence); if (status != NV_OK) { diff --git a/src/nvidia/src/libraries/resserv/src/rs_client.c b/src/nvidia/src/libraries/resserv/src/rs_client.c index 62d9738eef..f18afa0dec 100644 --- a/src/nvidia/src/libraries/resserv/src/rs_client.c +++ b/src/nvidia/src/libraries/resserv/src/rs_client.c @@ -841,7 +841,7 @@ clientFreeResource_IMPL _refRemoveAllDependencies(pResourceRef); status = serverFreeResourceRpcUnderLock(pServer, pParams); - NV_ASSERT((status == NV_OK) || (status == NV_ERR_GPU_IN_FULLCHIP_RESET)); + NV_ASSERT((status == NV_OK) || (status == NV_ERR_GPU_IN_FULLCHIP_RESET) || (status == NV_ERR_GPU_IS_LOST)); // NV_PRINTF(LEVEL_INFO, "hClient %x: Freeing hResource: %x\n", // pClient->hClient, pResourceRef->hResource); diff --git a/src/nvidia/src/libraries/resserv/src/rs_server.c b/src/nvidia/src/libraries/resserv/src/rs_server.c index fee31554ca..fbd3d0bb2f 100644 --- 
a/src/nvidia/src/libraries/resserv/src/rs_server.c +++ b/src/nvidia/src/libraries/resserv/src/rs_server.c @@ -256,7 +256,7 @@ NV_STATUS serverFreeResourceTreeUnderLock(RsServer *pServer, RS_RES_FREE_PARAMS goto done; status = clientFreeResource(pResourceRef->pClient, pServer, pFreeParams); - NV_ASSERT((status == NV_OK) || (status == NV_ERR_GPU_IN_FULLCHIP_RESET)); + NV_ASSERT((status == NV_OK) || (status == NV_ERR_GPU_IN_FULLCHIP_RESET) || (status == NV_ERR_GPU_IS_LOST)); serverResLock_Epilogue(pServer, LOCK_ACCESS_WRITE, pLockInfo, &releaseFlags); } @@ -1372,7 +1372,7 @@ serverFreeResourceTree freeParams.bInvalidateOnly = bInvalidateOnly; freeParams.pSecInfo = pParams->pSecInfo; status = serverFreeResourceTreeUnderLock(pServer, &freeParams); - NV_ASSERT((status == NV_OK) || (status == NV_ERR_GPU_IN_FULLCHIP_RESET)); + NV_ASSERT((status == NV_OK) || (status == NV_ERR_GPU_IN_FULLCHIP_RESET) || (status == NV_ERR_GPU_IS_LOST)); if (pServer->bDebugFreeList) {