From 77f4c28e319bbbd6156a15f6f0563d2d641e0c1c Mon Sep 17 00:00:00 2001 From: l000064 Date: Tue, 26 May 2026 14:12:40 +0800 Subject: [PATCH 1/2] fix cuda init Signed-off-by: lyquid --- src/cuda/memory.c | 4 ++++ src/include/libvgpu.h | 3 ++- src/libvgpu.c | 6 +++++- src/multiprocess/multiprocess_memory_limit.c | 1 - 4 files changed, 11 insertions(+), 3 deletions(-) diff --git a/src/cuda/memory.c b/src/cuda/memory.c index 00857f30..f98d2a92 100755 --- a/src/cuda/memory.c +++ b/src/cuda/memory.c @@ -3,6 +3,7 @@ #include "allocator/allocator.h" #include "include/libcuda_hook.h" +#include "include/libvgpu.h" #include "include/memory_limit.h" extern int pidfound; @@ -555,6 +556,7 @@ CUresult cuMipmappedArrayDestroy(CUmipmappedArray hMipmappedArray) { CUresult cuLaunchKernel ( CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void** kernelParams, void** extra ){ ENSURE_RUNNING(); + ensure_post_init(); pre_launch_kernel(); if (pidfound==1){ rate_limiter(gridDimX * gridDimY * gridDimZ, @@ -566,6 +568,7 @@ CUresult cuLaunchKernel ( CUfunction f, unsigned int gridDimX, unsigned int gr CUresult cuLaunchKernelEx(const CUlaunchConfig *config, CUfunction f, void **kernelParams, void **extra) { ENSURE_RUNNING(); + ensure_post_init(); pre_launch_kernel(); if (pidfound==1){ rate_limiter(config->gridDimX * config->gridDimY * config->gridDimZ, @@ -577,6 +580,7 @@ CUresult cuLaunchKernelEx(const CUlaunchConfig *config, CUfunction f, void **ker CUresult cuLaunchCooperativeKernel ( CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void** kernelParams ){ ENSURE_RUNNING(); + ensure_post_init(); pre_launch_kernel(); CUresult res = 
CUDA_OVERRIDE_CALL(cuda_library_entry,cuLaunchCooperativeKernel,f,gridDimX,gridDimY,gridDimZ,blockDimX,blockDimY,blockDimZ,sharedMemBytes,hStream,kernelParams); return res; diff --git a/src/include/libvgpu.h b/src/include/libvgpu.h index e596497e..781b5c4c 100755 --- a/src/include/libvgpu.h +++ b/src/include/libvgpu.h @@ -67,5 +67,6 @@ typedef void* (*fp_dlsym)(void*, const char*); nvmlReturn_t set_task_pid(); int map_cuda_visible_devices(); +void ensure_post_init(); -#endif // __LIBVGPU_GLIBC_H__ \ No newline at end of file +#endif // __LIBVGPU_GLIBC_H__ diff --git a/src/libvgpu.c b/src/libvgpu.c index befc16c3..e36248e7 100644 --- a/src/libvgpu.c +++ b/src/libvgpu.c @@ -919,6 +919,10 @@ void postInit(){ init_utilization_watcher(); } +void ensure_post_init(){ + pthread_once(&post_cuinit_flag, (void(*) (void))postInit); +} + CUresult cuInit(unsigned int Flags){ LOG_INFO("Into cuInit"); pthread_once(&pre_cuinit_flag,(void(*)(void))preInit); @@ -928,6 +932,6 @@ CUresult cuInit(unsigned int Flags){ LOG_ERROR("cuInit failed:%d",res); return res; } - pthread_once(&post_cuinit_flag, (void(*) (void))postInit); + ensure_post_init(); return CUDA_SUCCESS; } diff --git a/src/multiprocess/multiprocess_memory_limit.c b/src/multiprocess/multiprocess_memory_limit.c index fce713ba..6e458c01 100755 --- a/src/multiprocess/multiprocess_memory_limit.c +++ b/src/multiprocess/multiprocess_memory_limit.c @@ -411,7 +411,6 @@ int init_gpu_device_utilization(){ 0, memory_order_relaxed); atomic_store_explicit(&region_info.shared_region->procs[i].monitorused[dev], 0, memory_order_relaxed); - break; } } return 1; From 03e8938630262e3928e22ea59e0944d0d46b2cc6 Mon Sep 17 00:00:00 2001 From: l000064 Date: Tue, 26 May 2026 15:38:00 +0800 Subject: [PATCH 2/2] fix cuda init for subprocess Signed-off-by: lyquid --- src/include/libvgpu.h | 2 +- src/libvgpu.c | 10 +++++++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/src/include/libvgpu.h b/src/include/libvgpu.h index 
781b5c4c..1092a7b0 100755 --- a/src/include/libvgpu.h +++ b/src/include/libvgpu.h @@ -69,4 +69,4 @@ nvmlReturn_t set_task_pid(); int map_cuda_visible_devices(); void ensure_post_init(); -#endif // __LIBVGPU_GLIBC_H__ +#endif // SRC_INCLUDE_LIBVGPU_H_ diff --git a/src/libvgpu.c b/src/libvgpu.c index e36248e7..a5011bbd 100644 --- a/src/libvgpu.c +++ b/src/libvgpu.c @@ -19,6 +19,7 @@ extern void initial_virtual_map(void); extern int set_host_pid(int hostpid); extern void allocator_init(void); void preInit(); +void childReinitPostInit(); char *(*real_realpath)(const char *path, char *resolved_path); void *vgpulib; @@ -884,6 +885,7 @@ void preInit(){ load_cuda_libraries(); //nvmlInit(); ENSURE_INITIALIZED(); + pthread_atfork(NULL, NULL, childReinitPostInit); } void postInit(){ @@ -919,7 +921,13 @@ void postInit(){ init_utilization_watcher(); } -void ensure_post_init(){ +void childReinitPostInit() { + LOG_DEBUG("Reset postInit state after fork"); + post_cuinit_flag = PTHREAD_ONCE_INIT; + pidfound = 0; +} + +void ensure_post_init() { pthread_once(&post_cuinit_flag, (void(*) (void))postInit); }