From cd0cbb4327b8728edf149c918c0d48705e3f28f5 Mon Sep 17 00:00:00 2001 From: Ben Ashbaugh Date: Mon, 19 Jan 2026 16:56:56 -0800 Subject: [PATCH 1/3] initial version of capture replay with SVM offsets --- intercept/CMakeLists.txt | 1 + intercept/scripts/run.py | 47 ++++++++++++++++++++++++++---------- intercept/src/intercept.cpp | 48 ++++++++++++++++++++----------------- intercept/src/intercept.h | 9 ++++++- 4 files changed, 69 insertions(+), 36 deletions(-) diff --git a/intercept/CMakeLists.txt b/intercept/CMakeLists.txt index 457753d6..11e81e7e 100644 --- a/intercept/CMakeLists.txt +++ b/intercept/CMakeLists.txt @@ -45,6 +45,7 @@ set(CLINTERCEPT_RESOURCE_FILES kernels/precompiled_kernels.cl resource/clIntercept.rc resource/clIntercept_resource.h + scripts/run.py "${CMAKE_CURRENT_BINARY_DIR}/git_version.rc2" ) source_group(Resources FILES diff --git a/intercept/scripts/run.py b/intercept/scripts/run.py index e7494396..44c14fdd 100644 --- a/intercept/scripts/run.py +++ b/intercept/scripts/run.py @@ -14,6 +14,13 @@ import argparse from collections import defaultdict +def get_svm_arg_offset(idx: Int): + fileName = f"./SVM_Arg_Offset_{idx}.txt" + with open(fileName) as offsetFile: + offset = int(offsetFile.read()) + return offset + return 0 + def get_image_metadata(idx: int): fileName = f"./Image_MetaData_{idx}.txt" with open(fileName) as metadata: @@ -45,7 +52,7 @@ def sampler_from_string(ctx, sampler_descr): cl.filter_mode.NEAREST return cl.Sampler(ctx, False, addressing_mode, filter_mode) -def replay(repetitions): +def replay(repetitions, use_svm = False): # Read the enqueue number from the file with open('./enqueueNumber.txt') as file: enqueue_number = file.read().splitlines()[0] @@ -67,7 +74,7 @@ def replay(repetitions): start = buffer.find("_Arg_") idx = int(re.findall(r'\d+', buffer[start:])[0]) buffer_idx.append(idx) - input_buffers[idx] = np.fromfile(buffer, dtype='uint8').tobytes() + input_buffers[idx] = np.fromfile(buffer, dtype='uint8') input_buffer_ptrs[arguments[idx]].append(idx) output_buffers[idx] = np.empty_like(input_buffers[idx]) @@ -80,7 +87,7 @@ def replay(repetitions): start = image.find("_Arg_") idx = int(re.findall(r'\d+', image[start:])[0]) image_idx.append(idx) - input_images[idx] = np.fromfile(image, dtype='uint8').tobytes() + input_images[idx] = np.fromfile(image, dtype='uint8') input_images_ptrs[arguments[idx]].append(idx) output_images[idx] = np.empty_like(input_images[idx]) @@ -123,16 +130,22 @@ def replay(repetitions): for idx in list(samplers): del arguments[idx] - mf = cl.mem_flags - gpu_buffers = {} + gpu_svm = {} for idxs in input_buffer_ptrs.values(): - gpu_buffers[tuple(idxs)] = cl.Buffer(ctx, mf.COPY_HOST_PTR, hostbuf=input_buffers[idxs[0]]) + if use_svm: + svm = cl.SVM(cl.csvm_empty_like(ctx, input_buffers[idxs[0]])) + cl.enqueue_copy(queue, svm, input_buffers[idxs[0]]) + gpu_svm[tuple(idxs)] = svm + else: + buf = cl.Buffer(ctx, cl.mem_flags.COPY_HOST_PTR, hostbuf=input_buffers[idxs[0]].tobytes()) + gpu_buffers[tuple(idxs)] = buf gpu_images = {} for idx in image_idx: format, shape = get_image_metadata(idx) - gpu_images[idx] = cl.Image(ctx, mf.COPY_HOST_PTR, format, shape, hostbuf=input_images[idx]) + img = cl.Image(ctx, cl.mem_flags.COPY_HOST_PTR, format, shape, hostbuf=input_images[idx].tobytes()) + gpu_images[idx] = img with open("buildOptions.txt", 'r') as file: options = [line.rstrip() for line in file] @@ -176,6 +189,11 @@ def replay(repetitions): for pos, buffer in gpu_buffers.items(): for idx in pos: kernel.set_arg(idx, buffer) + + for pos, svm in gpu_svm.items(): + for idx in pos: + offset = get_svm_arg_offset(idx) + kernel.set_arg(idx, cl.SVM(svm.mem[offset:])) for pos, image in gpu_images.items(): kernel.set_arg(pos, image) @@ -208,11 +226,12 @@ def replay(repetitions): cl.enqueue_nd_range_kernel(queue, kernel, gws, lws, gwo) for pos in gpu_buffers.keys(): - if len(pos) == 1: - cl.enqueue_copy(queue, output_buffers[pos[0]], gpu_buffers[pos]) - else: - for idx in range(len(pos)): - cl.enqueue_copy(queue, output_buffers[pos[idx]], gpu_buffers[pos]) + for idx in range(len(pos)): + cl.enqueue_copy(queue, output_buffers[pos[idx]], gpu_buffers[pos]) + + for pos in gpu_svm.keys(): + for idx in range(len(pos)): + cl.enqueue_copy(queue, output_buffers[pos[idx]], gpu_svm[pos]) for pos in gpu_images.keys(): cl.enqueue_copy(queue, output_images[pos], gpu_images[pos], region=shape, origin=(0,0,0)) @@ -296,6 +315,8 @@ def validate(): parser = argparse.ArgumentParser(description='Script to replay and validate captured kernels') parser.add_argument('-r', '--repetitions', type=int, dest='repetitions', default=1, help='How often the kernel should be enqueued') +parser.add_argument('-m', '--svm', action='store_true', dest='svm', default=False, + help='Use SVM when replaying the captured kernel') parser.add_argument('-s', '--skipreplay', action='store_true', dest='skip', default=False, help='Skip replaying the captured kernel and do not dump data') parser.add_argument('-v', '--validate', action='store_true', dest='validate', default=False, @@ -305,7 +326,7 @@ def validate(): if args.skip: print("Skipping replay of the captured kernel.") else: - replay(args.repetitions) + replay(args.repetitions, args.svm) if args.validate: validate() diff --git a/intercept/src/intercept.cpp b/intercept/src/intercept.cpp index 88ba2e87..010bfcbf 100644 --- a/intercept/src/intercept.cpp +++ b/intercept/src/intercept.cpp @@ -7900,7 +7900,8 @@ void CLIntercept::setKernelArg( if( m_MemAllocNumberMap.find(mem) != m_MemAllocNumberMap.end() ) { CArgMemMap& argMemMap = m_KernelArgMemMap[ kernel ]; - argMemMap[ arg_index ] = mem; + SMemInfo& memInfo = argMemMap[ arg_index ]; + memInfo.Value = mem; } } @@ -7954,15 +7955,13 @@ void CLIntercept::setKernelArgSVMPointer( if( arg >= startPtr && arg < endPtr ) { CArgMemMap& argMemMap = m_KernelArgMemMap[ kernel ]; - argMemMap[ arg_index ] = startPtr; - } + SMemInfo& memInfo = argMemMap[ arg_index ]; + memInfo.Value = startPtr; + memInfo.Offset = (const char*)arg - (const char*)startPtr; - // Currently, only pointers to the start of an SVM allocation are supported for - // capture and replay. - if( arg == startPtr ) - { + // Save the base of the SVM allocation for capture and replay. CArgDataMap& argDataMap = m_KernelArgDataMap[kernel]; - const uint8_t* pRawArgData = reinterpret_cast(&arg); + const uint8_t* pRawArgData = reinterpret_cast(&startPtr); argDataMap[ arg_index ] = std::vector( pRawArgData, pRawArgData + sizeof(void*) ); } @@ -7994,15 +7993,13 @@ void CLIntercept::setKernelArgUSMPointer( if( arg >= startPtr && arg < endPtr ) { CArgMemMap& argMemMap = m_KernelArgMemMap[ kernel ]; - argMemMap[ arg_index ] = startPtr; - } + SMemInfo& memInfo = argMemMap[ arg_index ]; + memInfo.Value = startPtr; + memInfo.Offset = (const char*)arg - (const char*)startPtr; - // Currently, only pointers to the start of an SVM allocation are supported for - // capture and replay. - if( arg == startPtr ) - { + // Save the base of the USM allocation for capture and replay. CArgDataMap& argDataMap = m_KernelArgDataMap[kernel]; - const uint8_t* pRawArgData = reinterpret_cast(&arg); + const uint8_t* pRawArgData = reinterpret_cast(&startPtr); argDataMap[ arg_index ] = std::vector( pRawArgData, pRawArgData + sizeof(void*) ); } @@ -8197,7 +8194,8 @@ void CLIntercept::dumpCaptureReplayKernelArguments( for( const auto& arg : argMemMap ) { const auto index = arg.first; - const auto value = (cl_mem)arg.second; + const auto value = (cl_mem)arg.second.Value; + const auto offset = arg.second.Offset; if( m_ImageInfoMap.find( value ) != m_ImageInfoMap.end() ) { const SImageInfo& info = m_ImageInfoMap[ value ]; @@ -8213,6 +8211,12 @@ void CLIntercept::dumpCaptureReplayKernelArguments( << info.Format.image_channel_order << '\n' << static_cast(info.ImageType); } + else + { + std::string fileName{dumpDirectory + "SVM_Arg_Offset_" + std::to_string(index) + ".txt"}; + std::ofstream out{fileName}; + out << offset << '\n'; + } } const auto& samplerValues = m_KernelArgSamplerMap[kernel]; @@ -8282,7 +8286,7 @@ void CLIntercept::dumpBuffersForKernel( CLI_C_ASSERT( sizeof(void*) == sizeof(cl_mem) ); cl_uint arg_index = (*i).first; - void* allocation = (void*)(*i).second; + const void* allocation = (*i).second.Value; cl_mem memobj = (cl_mem)allocation; ++i; @@ -8392,7 +8396,7 @@ void CLIntercept::dumpBuffersForKernel( command_queue, CL_TRUE, CL_MAP_READ, - allocation, + (void*)allocation, size, 0, NULL, @@ -8420,7 +8424,7 @@ void CLIntercept::dumpBuffersForKernel( dispatch().clEnqueueSVMUnmap( command_queue, - allocation, + (void*)allocation, 0, NULL, NULL ); @@ -8530,7 +8534,7 @@ void CLIntercept::dumpImagesForKernel( CLI_C_ASSERT( sizeof(void*) == sizeof(cl_mem) ); cl_uint arg_index = (*i).first; - cl_mem memobj = (cl_mem)(*i).second; + cl_mem memobj = (cl_mem)(*i).second.Value; ++i; @@ -8666,7 +8670,7 @@ void CLIntercept::injectBuffersForKernel( CLI_C_ASSERT( sizeof(void*) == sizeof(cl_mem) ); cl_uint arg_index = (*i).first; - void* allocation = (void*)(*i).second; + void* allocation = (void*)(*i).second.Value; cl_mem memobj = (cl_mem)allocation; ++i; @@ -8861,7 +8865,7 @@ void CLIntercept::injectImagesForKernel( CLI_C_ASSERT( sizeof(void*) == sizeof(cl_mem) ); cl_uint arg_index = (*i).first; - cl_mem memobj = (cl_mem)(*i).second; + cl_mem memobj = (cl_mem)(*i).second.Value; ++i; diff --git a/intercept/src/intercept.h b/intercept/src/intercept.h index abfe9e60..98b6cb8e 100644 --- a/intercept/src/intercept.h +++ b/intercept/src/intercept.h @@ -1314,7 +1314,14 @@ class CLIntercept typedef std::map< cl_mem, SImageInfo > CImageInfoMap; CImageInfoMap m_ImageInfoMap; - typedef std::map< cl_uint, const void* > CArgMemMap; + struct SMemInfo + { + // Note: "Value" is either a cl_mem or the base of an SVM/USM allocation. + const void* Value = NULL; + size_t Offset = 0; + }; + + typedef std::map< cl_uint, SMemInfo > CArgMemMap; typedef std::map< cl_kernel, CArgMemMap > CKernelArgMemMap; CKernelArgMemMap m_KernelArgMemMap; From 2a199023d698d627fc486bd535fe14dae46765d7 Mon Sep 17 00:00:00 2001 From: Ben Ashbaugh Date: Mon, 19 Jan 2026 17:35:43 -0800 Subject: [PATCH 2/3] tidy up the replay and validate script --- intercept/scripts/run.py | 53 +++++++++++++++++-------------------- intercept/src/intercept.cpp | 5 +--- 2 files changed, 26 insertions(+), 32 deletions(-) diff --git a/intercept/scripts/run.py b/intercept/scripts/run.py index 44c14fdd..1131755a 100644 --- a/intercept/scripts/run.py +++ b/intercept/scripts/run.py @@ -14,13 +14,6 @@ import argparse from collections import defaultdict -def get_svm_arg_offset(idx: Int): - fileName = f"./SVM_Arg_Offset_{idx}.txt" - with open(fileName) as offsetFile: - offset = int(offsetFile.read()) - return offset - return 0 - def get_image_metadata(idx: int): fileName = f"./Image_MetaData_{idx}.txt" with open(fileName) as metadata: @@ -60,50 +53,54 @@ def replay(repetitions, use_svm = False): padded_enqueue_num = str(enqueue_number).rjust(4, "0") arguments = {} - argument_files = gl.glob("./Argument*.bin") - for argument in argument_files: - idx = int(re.findall(r'\d+', argument)[0]) - arguments[idx] = np.fromfile(argument, dtype='uint8').tobytes() + for fileName in gl.glob("./Argument*.bin"): + idx = int(re.findall(r'\d+', fileName)[0]) + arguments[idx] = np.fromfile(fileName, dtype='uint8').tobytes() + + svm_arg_offsets = {} + for fileName in gl.glob("./SVM_Arg_Offset*.txt"): + idx = int(re.findall(r'\d+', fileName)[0]) + with open(fileName) as file: + svm_arg_offsets[idx] = int(file.read()) + if use_svm is False and svm_arg_offsets[idx] != 0: + print("Non-zero SVM arg offset found, forcing SVM replay.") + use_svm = True buffer_idx = [] input_buffers = {} output_buffers = {} - buffer_files = gl.glob("./Pre/Enqueue_" + padded_enqueue_num + "*.bin") input_buffer_ptrs = defaultdict(list) - for buffer in buffer_files: - start = buffer.find("_Arg_") - idx = int(re.findall(r'\d+', buffer[start:])[0]) + for fileName in gl.glob("./Pre/Enqueue_" + padded_enqueue_num + "*.bin"): + start = fileName.find("_Arg_") + idx = int(re.findall(r'\d+', fileName[start:])[0]) buffer_idx.append(idx) - input_buffers[idx] = np.fromfile(buffer, dtype='uint8') + input_buffers[idx] = np.fromfile(fileName, dtype='uint8') input_buffer_ptrs[arguments[idx]].append(idx) output_buffers[idx] = np.empty_like(input_buffers[idx]) image_idx = [] input_images = {} output_images = {} - image_files = gl.glob("./Pre/Enqueue_" + padded_enqueue_num + "*.raw") input_images_ptrs = defaultdict(list) - for image in image_files: - start = image.find("_Arg_") - idx = int(re.findall(r'\d+', image[start:])[0]) + for fileName in gl.glob("./Pre/Enqueue_" + padded_enqueue_num + "*.raw"): + start = fileName.find("_Arg_") + idx = int(re.findall(r'\d+', fileName[start:])[0]) image_idx.append(idx) - input_images[idx] = np.fromfile(image, dtype='uint8') + input_images[idx] = np.fromfile(fileName, dtype='uint8') input_images_ptrs[arguments[idx]].append(idx) output_images[idx] = np.empty_like(input_images[idx]) local_sizes = {} - local_files = gl.glob("./Local*.txt") - for local in local_files: - with open(local) as file: + for fileName in gl.glob("./Local*.txt"): + idx = int(re.findall(r'\d+', fileName)[0]) + with open(fileName) as file: size = int(file.read()) - local_sizes[int(re.findall(r'\d+', local)[0])] = size + local_sizes[idx] = size # Check if we have pointer aliasing for the buffers tmp_args = [] for idx in buffer_idx: tmp_args.append(arguments[idx]) - - # Check if all input pointer addresses are unique if len(tmp_args) != len(set(tmp_args)): print("Some of the buffers are aliasing, we will replicate this behavior.") @@ -192,7 +189,7 @@ def replay(repetitions, use_svm = False): for pos, svm in gpu_svm.items(): for idx in pos: - offset = get_svm_arg_offset(idx) + offset = svm_arg_offsets[idx] kernel.set_arg(idx, cl.SVM(svm.mem[offset:])) for pos, image in gpu_images.items(): diff --git a/intercept/src/intercept.cpp b/intercept/src/intercept.cpp index 010bfcbf..6af66c33 100644 --- a/intercept/src/intercept.cpp +++ b/intercept/src/intercept.cpp @@ -7959,7 +7959,6 @@ void CLIntercept::setKernelArgSVMPointer( memInfo.Value = startPtr; memInfo.Offset = (const char*)arg - (const char*)startPtr; - // Save the base of the SVM allocation for capture and replay. CArgDataMap& argDataMap = m_KernelArgDataMap[kernel]; const uint8_t* pRawArgData = reinterpret_cast(&startPtr); argDataMap[ arg_index ] = std::vector( @@ -7997,7 +7996,6 @@ void CLIntercept::setKernelArgUSMPointer( memInfo.Value = startPtr; memInfo.Offset = (const char*)arg - (const char*)startPtr; - // Save the base of the USM allocation for capture and replay. CArgDataMap& argDataMap = m_KernelArgDataMap[kernel]; const uint8_t* pRawArgData = reinterpret_cast(&startPtr); argDataMap[ arg_index ] = std::vector( @@ -8195,7 +8193,6 @@ void CLIntercept::dumpCaptureReplayKernelArguments( { const auto index = arg.first; const auto value = (cl_mem)arg.second.Value; - const auto offset = arg.second.Offset; if( m_ImageInfoMap.find( value ) != m_ImageInfoMap.end() ) { const SImageInfo& info = m_ImageInfoMap[ value ]; @@ -8215,7 +8212,7 @@ void CLIntercept::dumpCaptureReplayKernelArguments( { std::string fileName{dumpDirectory + "SVM_Arg_Offset_" + std::to_string(index) + ".txt"}; std::ofstream out{fileName}; - out << offset << '\n'; + out << arg.second.Offset << '\n'; } } From b61e5e8c11a378256f910e5e8c6b701ce898709d Mon Sep 17 00:00:00 2001 From: Ben Ashbaugh Date: Mon, 19 Jan 2026 21:15:46 -0800 Subject: [PATCH 3/3] one more minor replay script improvement --- intercept/scripts/run.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/intercept/scripts/run.py b/intercept/scripts/run.py index 1131755a..33e734ba 100644 --- a/intercept/scripts/run.py +++ b/intercept/scripts/run.py @@ -45,7 +45,7 @@ def sampler_from_string(ctx, sampler_descr): cl.filter_mode.NEAREST return cl.Sampler(ctx, False, addressing_mode, filter_mode) -def replay(repetitions, use_svm = False): +def replay(repetitions = 1, use_svm = False): # Read the enqueue number from the file with open('./enqueueNumber.txt') as file: enqueue_number = file.read().splitlines()[0]