From ecb2d562f90c316e7f4e2168ec0faf95dc018727 Mon Sep 17 00:00:00 2001 From: Lukasz Dorau Date: Tue, 3 Mar 2026 19:06:26 +0000 Subject: [PATCH 1/9] [UR] Add urKernelGetSuggestedLocalWorkSizeWithArgs API to UR spec Signed-off-by: Lukasz Dorau --- unified-runtime/scripts/core/kernel.yml | 47 +++++++++++++++++++++++ unified-runtime/scripts/core/registry.yml | 3 ++ 2 files changed, 50 insertions(+) diff --git a/unified-runtime/scripts/core/kernel.yml b/unified-runtime/scripts/core/kernel.yml index a420a8568f3e5..70ba4ebf76c7f 100644 --- a/unified-runtime/scripts/core/kernel.yml +++ b/unified-runtime/scripts/core/kernel.yml @@ -602,6 +602,53 @@ returns: - $X_RESULT_ERROR_UNSUPPORTED_FEATURE --- #-------------------------------------------------------------------------- type: function +desc: "Get the suggested local work size for a kernel and set args at kernel launch time." +class: $xKernel +name: GetSuggestedLocalWorkSizeWithArgs +ordinal: "0" +details: + - "Query a suggested local work size for a kernel given a global size for each dimension." + - "The application may call this function from simultaneous threads for the same context." 
+params: + - type: $x_kernel_handle_t + name: hKernel + desc: | + [in] handle of the kernel + - type: $x_queue_handle_t + name: hQueue + desc: | + [in] handle of the queue object + - type: uint32_t + name: numWorkDim + desc: | + [in] number of dimensions, from 1 to 3, to specify the global + and work-group work-items + - type: const size_t* + name: pGlobalWorkOffset + desc: | + [in] pointer to an array of numWorkDim unsigned values that specify + the offset used to calculate the global ID of a work-item + - type: const size_t* + name: pGlobalWorkSize + desc: | + [in] pointer to an array of numWorkDim unsigned values that specify + the number of global work-items in workDim that will execute the + kernel function + - type: uint32_t + name: numArgs + desc: "[in] Number of entries in pArgs" + - type: "const $x_exp_kernel_arg_properties_t*" + name: pArgs + desc: "[in][optional][range(0, numArgs)] pointer to a list of kernel arg properties." + - type: size_t* + name: pSuggestedLocalWorkSize + desc: | + [out] pointer to an array of numWorkDim unsigned values that specify + suggested local work size that will contain the result of the query +returns: + - $X_RESULT_ERROR_UNSUPPORTED_FEATURE +--- #-------------------------------------------------------------------------- +type: function desc: "Query the maximum number of work groups for a cooperative kernel" class: $xKernel name: SuggestMaxCooperativeGroupCount diff --git a/unified-runtime/scripts/core/registry.yml b/unified-runtime/scripts/core/registry.yml index 8762563e9a569..93757c2dae27b 100644 --- a/unified-runtime/scripts/core/registry.yml +++ b/unified-runtime/scripts/core/registry.yml @@ -523,6 +523,9 @@ etors: - name: ENQUEUE_TIMESTAMP_RECORDING_EXP desc: Enumerator for $xEnqueueTimestampRecordingExp value: '223' +- name: KERNEL_GET_SUGGESTED_LOCAL_WORK_SIZE_WITH_ARGS + desc: Enumerator for $xKernelGetSuggestedLocalWorkSizeWithArgs + value: '224' - name: KERNEL_GET_SUGGESTED_LOCAL_WORK_SIZE desc: Enumerator for 
$xKernelGetSuggestedLocalWorkSize value: '225' From b2331e94686c7e7fe391eb3362d7c6ba810658eb Mon Sep 17 00:00:00 2001 From: Lukasz Dorau Date: Wed, 4 Mar 2026 08:16:46 +0000 Subject: [PATCH 2/9] [UR] Add urKernelGetSuggestedLocalWorkSizeWithArgs API (generated UR code) Commit output of make generate. Signed-off-by: Lukasz Dorau --- .../include/unified-runtime/ur_api.h | 66 +++++++++++++++++ .../include/unified-runtime/ur_api_funcs.def | 1 + .../include/unified-runtime/ur_ddi.h | 9 +++ .../include/unified-runtime/ur_print.h | 13 ++++ .../include/unified-runtime/ur_print.hpp | 68 +++++++++++++++++ .../level_zero/ur_interface_loader.cpp | 2 + .../level_zero/ur_interface_loader.hpp | 5 ++ .../source/adapters/mock/ur_mockddi.cpp | 70 ++++++++++++++++++ .../loader/layers/tracing/ur_trcddi.cpp | 67 +++++++++++++++++ .../loader/layers/validation/ur_valddi.cpp | 74 +++++++++++++++++++ unified-runtime/source/loader/loader.def.in | 2 + unified-runtime/source/loader/loader.map.in | 2 + unified-runtime/source/loader/ur_ldrddi.cpp | 41 ++++++++++ unified-runtime/source/loader/ur_libapi.cpp | 61 +++++++++++++++ unified-runtime/source/loader/ur_print.cpp | 9 +++ unified-runtime/source/ur_api.cpp | 52 +++++++++++++ 16 files changed, 542 insertions(+) diff --git a/unified-runtime/include/unified-runtime/ur_api.h b/unified-runtime/include/unified-runtime/ur_api.h index d8b040fa4e763..6e750d761549d 100644 --- a/unified-runtime/include/unified-runtime/ur_api.h +++ b/unified-runtime/include/unified-runtime/ur_api.h @@ -377,6 +377,8 @@ typedef enum ur_function_t { UR_FUNCTION_COMMAND_BUFFER_GET_INFO_EXP = 221, /// Enumerator for ::urEnqueueTimestampRecordingExp UR_FUNCTION_ENQUEUE_TIMESTAMP_RECORDING_EXP = 223, + /// Enumerator for ::urKernelGetSuggestedLocalWorkSizeWithArgs + UR_FUNCTION_KERNEL_GET_SUGGESTED_LOCAL_WORK_SIZE_WITH_ARGS = 224, /// Enumerator for ::urKernelGetSuggestedLocalWorkSize UR_FUNCTION_KERNEL_GET_SUGGESTED_LOCAL_WORK_SIZE = 225, /// Enumerator for 
::urBindlessImagesImportExternalMemoryExp @@ -9501,6 +9503,55 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSize( /// suggested local work size that will contain the result of the query size_t *pSuggestedLocalWorkSize); +/////////////////////////////////////////////////////////////////////////////// +/// @brief Get the suggested local work size for a kernel and set args at kernel +/// launch time. +/// +/// @details +/// - Query a suggested local work size for a kernel given a global size for +/// each dimension. +/// - The application may call this function from simultaneous threads for +/// the same context. +/// +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_UNINITIALIZED +/// - ::UR_RESULT_ERROR_DEVICE_LOST +/// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC +/// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE +/// + `NULL == hKernel` +/// + `NULL == hQueue` +/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +/// + `NULL == pGlobalWorkOffset` +/// + `NULL == pGlobalWorkSize` +/// + `NULL == pSuggestedLocalWorkSize` +/// - ::UR_RESULT_ERROR_INVALID_ENUMERATION +/// + `NULL != pArgs && ::UR_EXP_KERNEL_ARG_TYPE_SAMPLER < pArgs->type` +/// - ::UR_RESULT_ERROR_UNSUPPORTED_FEATURE +UR_APIEXPORT ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSizeWithArgs( + /// [in] handle of the kernel + ur_kernel_handle_t hKernel, + /// [in] handle of the queue object + ur_queue_handle_t hQueue, + /// [in] number of dimensions, from 1 to 3, to specify the global + /// and work-group work-items + uint32_t numWorkDim, + /// [in] pointer to an array of numWorkDim unsigned values that specify + /// the offset used to calculate the global ID of a work-item + const size_t *pGlobalWorkOffset, + /// [in] pointer to an array of numWorkDim unsigned values that specify + /// the number of global work-items in workDim that will execute the + /// kernel function + const size_t *pGlobalWorkSize, + /// [in] Number of entries in pArgs + uint32_t numArgs, + /// 
[in][optional][range(0, numArgs)] pointer to a list of kernel arg + /// properties. + const ur_exp_kernel_arg_properties_t *pArgs, + /// [out] pointer to an array of numWorkDim unsigned values that specify + /// suggested local work size that will contain the result of the query + size_t *pSuggestedLocalWorkSize); + /////////////////////////////////////////////////////////////////////////////// /// @brief Query the maximum number of work groups for a cooperative kernel /// @@ -14580,6 +14631,21 @@ typedef struct ur_kernel_get_suggested_local_work_size_params_t { size_t **ppSuggestedLocalWorkSize; } ur_kernel_get_suggested_local_work_size_params_t; +/////////////////////////////////////////////////////////////////////////////// +/// @brief Function parameters for urKernelGetSuggestedLocalWorkSizeWithArgs +/// @details Each entry is a pointer to the parameter passed to the function; +/// allowing the callback the ability to modify the parameter's value +typedef struct ur_kernel_get_suggested_local_work_size_with_args_params_t { + ur_kernel_handle_t *phKernel; + ur_queue_handle_t *phQueue; + uint32_t *pnumWorkDim; + const size_t **ppGlobalWorkOffset; + const size_t **ppGlobalWorkSize; + uint32_t *pnumArgs; + const ur_exp_kernel_arg_properties_t **ppArgs; + size_t **ppSuggestedLocalWorkSize; +} ur_kernel_get_suggested_local_work_size_with_args_params_t; + /////////////////////////////////////////////////////////////////////////////// /// @brief Function parameters for urKernelSetArgValue /// @details Each entry is a pointer to the parameter passed to the function; diff --git a/unified-runtime/include/unified-runtime/ur_api_funcs.def b/unified-runtime/include/unified-runtime/ur_api_funcs.def index 06a3efa6bc437..3659aee433432 100644 --- a/unified-runtime/include/unified-runtime/ur_api_funcs.def +++ b/unified-runtime/include/unified-runtime/ur_api_funcs.def @@ -72,6 +72,7 @@ _UR_API(urKernelRelease) _UR_API(urKernelGetNativeHandle) _UR_API(urKernelCreateWithNativeHandle) 
_UR_API(urKernelGetSuggestedLocalWorkSize) +_UR_API(urKernelGetSuggestedLocalWorkSizeWithArgs) _UR_API(urKernelSetArgValue) _UR_API(urKernelSetArgLocal) _UR_API(urKernelSetArgPointer) diff --git a/unified-runtime/include/unified-runtime/ur_ddi.h b/unified-runtime/include/unified-runtime/ur_ddi.h index da3747e385a9d..3c381bc98dcda 100644 --- a/unified-runtime/include/unified-runtime/ur_ddi.h +++ b/unified-runtime/include/unified-runtime/ur_ddi.h @@ -521,6 +521,13 @@ typedef ur_result_t(UR_APICALL *ur_pfnKernelGetSuggestedLocalWorkSize_t)( ur_kernel_handle_t, ur_queue_handle_t, uint32_t, const size_t *, const size_t *, size_t *); +/////////////////////////////////////////////////////////////////////////////// +/// @brief Function-pointer for urKernelGetSuggestedLocalWorkSizeWithArgs +typedef ur_result_t( + UR_APICALL *ur_pfnKernelGetSuggestedLocalWorkSizeWithArgs_t)( + ur_kernel_handle_t, ur_queue_handle_t, uint32_t, const size_t *, + const size_t *, uint32_t, const ur_exp_kernel_arg_properties_t *, size_t *); + /////////////////////////////////////////////////////////////////////////////// /// @brief Function-pointer for urKernelSetArgValue typedef ur_result_t(UR_APICALL *ur_pfnKernelSetArgValue_t)( @@ -580,6 +587,8 @@ typedef struct ur_kernel_dditable_t { ur_pfnKernelGetNativeHandle_t pfnGetNativeHandle; ur_pfnKernelCreateWithNativeHandle_t pfnCreateWithNativeHandle; ur_pfnKernelGetSuggestedLocalWorkSize_t pfnGetSuggestedLocalWorkSize; + ur_pfnKernelGetSuggestedLocalWorkSizeWithArgs_t + pfnGetSuggestedLocalWorkSizeWithArgs; ur_pfnKernelSetArgValue_t pfnSetArgValue; ur_pfnKernelSetArgLocal_t pfnSetArgLocal; ur_pfnKernelSetArgPointer_t pfnSetArgPointer; diff --git a/unified-runtime/include/unified-runtime/ur_print.h b/unified-runtime/include/unified-runtime/ur_print.h index 881027a92a77e..ea22e1e4c7783 100644 --- a/unified-runtime/include/unified-runtime/ur_print.h +++ b/unified-runtime/include/unified-runtime/ur_print.h @@ -2169,6 +2169,19 @@ 
urPrintKernelGetSuggestedLocalWorkSizeParams( const struct ur_kernel_get_suggested_local_work_size_params_t *params, char *buffer, const size_t buff_size, size_t *out_size); +/////////////////////////////////////////////////////////////////////////////// +/// @brief Print ur_kernel_get_suggested_local_work_size_with_args_params_t +/// struct +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_INVALID_SIZE +/// - `buff_size < out_size` +UR_APIEXPORT ur_result_t UR_APICALL +urPrintKernelGetSuggestedLocalWorkSizeWithArgsParams( + const struct ur_kernel_get_suggested_local_work_size_with_args_params_t + *params, + char *buffer, const size_t buff_size, size_t *out_size); + /////////////////////////////////////////////////////////////////////////////// /// @brief Print ur_kernel_set_arg_value_params_t struct /// @returns diff --git a/unified-runtime/include/unified-runtime/ur_print.hpp b/unified-runtime/include/unified-runtime/ur_print.hpp index e1d30d0d1f3b3..75a37a9236df6 100644 --- a/unified-runtime/include/unified-runtime/ur_print.hpp +++ b/unified-runtime/include/unified-runtime/ur_print.hpp @@ -1165,6 +1165,9 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_function_t value) { case UR_FUNCTION_ENQUEUE_TIMESTAMP_RECORDING_EXP: os << "UR_FUNCTION_ENQUEUE_TIMESTAMP_RECORDING_EXP"; break; + case UR_FUNCTION_KERNEL_GET_SUGGESTED_LOCAL_WORK_SIZE_WITH_ARGS: + os << "UR_FUNCTION_KERNEL_GET_SUGGESTED_LOCAL_WORK_SIZE_WITH_ARGS"; + break; case UR_FUNCTION_KERNEL_GET_SUGGESTED_LOCAL_WORK_SIZE: os << "UR_FUNCTION_KERNEL_GET_SUGGESTED_LOCAL_WORK_SIZE"; break; @@ -14923,6 +14926,67 @@ operator<<(std::ostream &os, [[maybe_unused]] const struct return os; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Print operator for the +/// ur_kernel_get_suggested_local_work_size_with_args_params_t type +/// @returns +/// std::ostream & +inline std::ostream & +operator<<(std::ostream &os, [[maybe_unused]] const struct + 
ur_kernel_get_suggested_local_work_size_with_args_params_t *params) { + + os << ".hKernel = "; + + ur::details::printPtr(os, *(params->phKernel)); + + os << ", "; + os << ".hQueue = "; + + ur::details::printPtr(os, *(params->phQueue)); + + os << ", "; + os << ".numWorkDim = "; + + os << *(params->pnumWorkDim); + + os << ", "; + os << ".pGlobalWorkOffset = "; + + ur::details::printPtr(os, *(params->ppGlobalWorkOffset)); + + os << ", "; + os << ".pGlobalWorkSize = "; + + ur::details::printPtr(os, *(params->ppGlobalWorkSize)); + + os << ", "; + os << ".numArgs = "; + + os << *(params->pnumArgs); + + os << ", "; + os << ".pArgs = "; + ur::details::printPtr(os, reinterpret_cast<const void *>(*(params->ppArgs))); + if (*(params->ppArgs) != NULL) { + os << " {"; + for (size_t i = 0; i < *params->pnumArgs; ++i) { + if (i != 0) { + os << ", "; + } + + os << (*(params->ppArgs))[i]; + } + os << "}"; + } + + os << ", "; + os << ".pSuggestedLocalWorkSize = "; + + ur::details::printPtr(os, *(params->ppSuggestedLocalWorkSize)); + + return os; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Print operator for the ur_kernel_set_arg_value_params_t type /// @returns @@ -22582,6 +22646,10 @@ inline ur_result_t UR_APICALL printFunctionParams(std::ostream &os, os << (const struct ur_kernel_get_suggested_local_work_size_params_t *) params; } break; + case UR_FUNCTION_KERNEL_GET_SUGGESTED_LOCAL_WORK_SIZE_WITH_ARGS: { + os << (const struct + ur_kernel_get_suggested_local_work_size_with_args_params_t *)params; + } break; case UR_FUNCTION_KERNEL_SET_ARG_VALUE: { os << (const struct ur_kernel_set_arg_value_params_t *)params; } break; diff --git a/unified-runtime/source/adapters/level_zero/ur_interface_loader.cpp b/unified-runtime/source/adapters/level_zero/ur_interface_loader.cpp index e1fcdc4dd5739..e1bfdb7120bc0 100644 --- a/unified-runtime/source/adapters/level_zero/ur_interface_loader.cpp +++ 
b/unified-runtime/source/adapters/level_zero/ur_interface_loader.cpp @@ -315,6 +315,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urGetKernelProcAddrTable( ur::level_zero::urKernelCreateWithNativeHandle; pDdiTable->pfnGetSuggestedLocalWorkSize = ur::level_zero::urKernelGetSuggestedLocalWorkSize; + pDdiTable->pfnGetSuggestedLocalWorkSizeWithArgs = + ur::level_zero::urKernelGetSuggestedLocalWorkSizeWithArgs; pDdiTable->pfnSetArgValue = ur::level_zero::urKernelSetArgValue; pDdiTable->pfnSetArgLocal = ur::level_zero::urKernelSetArgLocal; pDdiTable->pfnSetArgPointer = ur::level_zero::urKernelSetArgPointer; diff --git a/unified-runtime/source/adapters/level_zero/ur_interface_loader.hpp b/unified-runtime/source/adapters/level_zero/ur_interface_loader.hpp index f64c404ec3c9a..e906015b67855 100644 --- a/unified-runtime/source/adapters/level_zero/ur_interface_loader.hpp +++ b/unified-runtime/source/adapters/level_zero/ur_interface_loader.hpp @@ -483,6 +483,11 @@ ur_result_t urKernelGetSuggestedLocalWorkSize(ur_kernel_handle_t hKernel, const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, size_t *pSuggestedLocalWorkSize); +ur_result_t urKernelGetSuggestedLocalWorkSizeWithArgs( + ur_kernel_handle_t hKernel, ur_queue_handle_t hQueue, uint32_t numWorkDim, + const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, + uint32_t numArgs, const ur_exp_kernel_arg_properties_t *pArgs, + size_t *pSuggestedLocalWorkSize); ur_result_t urKernelSuggestMaxCooperativeGroupCount( ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, uint32_t workDim, const size_t *pLocalWorkSize, size_t dynamicSharedMemorySize, diff --git a/unified-runtime/source/adapters/mock/ur_mockddi.cpp b/unified-runtime/source/adapters/mock/ur_mockddi.cpp index 1603b657a459a..f4e553891f027 100644 --- a/unified-runtime/source/adapters/mock/ur_mockddi.cpp +++ b/unified-runtime/source/adapters/mock/ur_mockddi.cpp @@ -7473,6 +7473,73 @@ __urdlllocal ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSize( 
return exceptionToResult(std::current_exception()); } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urKernelGetSuggestedLocalWorkSizeWithArgs +__urdlllocal ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSizeWithArgs( + /// [in] handle of the kernel + ur_kernel_handle_t hKernel, + /// [in] handle of the queue object + ur_queue_handle_t hQueue, + /// [in] number of dimensions, from 1 to 3, to specify the global + /// and work-group work-items + uint32_t numWorkDim, + /// [in] pointer to an array of numWorkDim unsigned values that specify + /// the offset used to calculate the global ID of a work-item + const size_t *pGlobalWorkOffset, + /// [in] pointer to an array of numWorkDim unsigned values that specify + /// the number of global work-items in workDim that will execute the + /// kernel function + const size_t *pGlobalWorkSize, + /// [in] Number of entries in pArgs + uint32_t numArgs, + /// [in][optional][range(0, numArgs)] pointer to a list of kernel arg + /// properties. 
+ const ur_exp_kernel_arg_properties_t *pArgs, + /// [out] pointer to an array of numWorkDim unsigned values that specify + /// suggested local work size that will contain the result of the query + size_t *pSuggestedLocalWorkSize) try { + ur_result_t result = UR_RESULT_SUCCESS; + + ur_kernel_get_suggested_local_work_size_with_args_params_t params = { + &hKernel, &hQueue, &numWorkDim, &pGlobalWorkOffset, + &pGlobalWorkSize, &numArgs, &pArgs, &pSuggestedLocalWorkSize}; + + auto beforeCallback = reinterpret_cast<ur_mock_callback_t>( + mock::getCallbacks().get_before_callback( + "urKernelGetSuggestedLocalWorkSizeWithArgs")); + if (beforeCallback) { + result = beforeCallback(&params); + if (result != UR_RESULT_SUCCESS) { + return result; + } + } + + auto replaceCallback = reinterpret_cast<ur_mock_callback_t>( + mock::getCallbacks().get_replace_callback( + "urKernelGetSuggestedLocalWorkSizeWithArgs")); + if (replaceCallback) { + result = replaceCallback(&params); + } else { + + result = UR_RESULT_SUCCESS; + } + + if (result != UR_RESULT_SUCCESS) { + return result; + } + + auto afterCallback = reinterpret_cast<ur_mock_callback_t>( + mock::getCallbacks().get_after_callback( + "urKernelGetSuggestedLocalWorkSizeWithArgs")); + if (afterCallback) { + return afterCallback(&params); + } + + return result; +} catch (...) 
{ + return exceptionToResult(std::current_exception()); +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Intercept function for urKernelSuggestMaxCooperativeGroupCount __urdlllocal ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCount( @@ -13664,6 +13731,9 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelProcAddrTable( pDdiTable->pfnGetSuggestedLocalWorkSize = driver::urKernelGetSuggestedLocalWorkSize; + pDdiTable->pfnGetSuggestedLocalWorkSizeWithArgs = + driver::urKernelGetSuggestedLocalWorkSizeWithArgs; + pDdiTable->pfnSetArgValue = driver::urKernelSetArgValue; pDdiTable->pfnSetArgLocal = driver::urKernelSetArgLocal; diff --git a/unified-runtime/source/loader/layers/tracing/ur_trcddi.cpp b/unified-runtime/source/loader/layers/tracing/ur_trcddi.cpp index 0e873718facba..11ccfa1c4610b 100644 --- a/unified-runtime/source/loader/layers/tracing/ur_trcddi.cpp +++ b/unified-runtime/source/loader/layers/tracing/ur_trcddi.cpp @@ -6224,6 +6224,68 @@ __urdlllocal ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSize( return result; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urKernelGetSuggestedLocalWorkSizeWithArgs +__urdlllocal ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSizeWithArgs( + /// [in] handle of the kernel + ur_kernel_handle_t hKernel, + /// [in] handle of the queue object + ur_queue_handle_t hQueue, + /// [in] number of dimensions, from 1 to 3, to specify the global + /// and work-group work-items + uint32_t numWorkDim, + /// [in] pointer to an array of numWorkDim unsigned values that specify + /// the offset used to calculate the global ID of a work-item + const size_t *pGlobalWorkOffset, + /// [in] pointer to an array of numWorkDim unsigned values that specify + /// the number of global work-items in workDim that will execute the + /// kernel function + const size_t *pGlobalWorkSize, + /// [in] Number of entries in pArgs 
+ uint32_t numArgs, + /// [in][optional][range(0, numArgs)] pointer to a list of kernel arg + /// properties. + const ur_exp_kernel_arg_properties_t *pArgs, + /// [out] pointer to an array of numWorkDim unsigned values that specify + /// suggested local work size that will contain the result of the query + size_t *pSuggestedLocalWorkSize) { + auto pfnGetSuggestedLocalWorkSizeWithArgs = + getContext()->urDdiTable.Kernel.pfnGetSuggestedLocalWorkSizeWithArgs; + + if (nullptr == pfnGetSuggestedLocalWorkSizeWithArgs) + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + + ur_kernel_get_suggested_local_work_size_with_args_params_t params = { + &hKernel, &hQueue, &numWorkDim, &pGlobalWorkOffset, + &pGlobalWorkSize, &numArgs, &pArgs, &pSuggestedLocalWorkSize}; + uint64_t instance = getContext()->notify_begin( + UR_FUNCTION_KERNEL_GET_SUGGESTED_LOCAL_WORK_SIZE_WITH_ARGS, + "urKernelGetSuggestedLocalWorkSizeWithArgs", &params); + + auto &logger = getContext()->logger; + UR_LOG_L(logger, INFO, " ---> urKernelGetSuggestedLocalWorkSizeWithArgs\n"); + + ur_result_t result = pfnGetSuggestedLocalWorkSizeWithArgs( + hKernel, hQueue, numWorkDim, pGlobalWorkOffset, pGlobalWorkSize, numArgs, + pArgs, pSuggestedLocalWorkSize); + + getContext()->notify_end( + UR_FUNCTION_KERNEL_GET_SUGGESTED_LOCAL_WORK_SIZE_WITH_ARGS, + "urKernelGetSuggestedLocalWorkSizeWithArgs", &params, &result, instance); + + if (logger.getLevel() <= UR_LOGGER_LEVEL_INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_KERNEL_GET_SUGGESTED_LOCAL_WORK_SIZE_WITH_ARGS, + &params); + UR_LOG_L(logger, INFO, + " <--- urKernelGetSuggestedLocalWorkSizeWithArgs({}) -> {};\n", + args_str.str(), result); + } + + return result; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Intercept function for urKernelSuggestMaxCooperativeGroupCount __urdlllocal ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCount( @@ -11841,6 +11903,11 @@ __urdlllocal 
ur_result_t UR_APICALL urGetKernelProcAddrTable( pDdiTable->pfnGetSuggestedLocalWorkSize = ur_tracing_layer::urKernelGetSuggestedLocalWorkSize; + dditable.pfnGetSuggestedLocalWorkSizeWithArgs = + pDdiTable->pfnGetSuggestedLocalWorkSizeWithArgs; + pDdiTable->pfnGetSuggestedLocalWorkSizeWithArgs = + ur_tracing_layer::urKernelGetSuggestedLocalWorkSizeWithArgs; + dditable.pfnSetArgValue = pDdiTable->pfnSetArgValue; pDdiTable->pfnSetArgValue = ur_tracing_layer::urKernelSetArgValue; diff --git a/unified-runtime/source/loader/layers/validation/ur_valddi.cpp b/unified-runtime/source/loader/layers/validation/ur_valddi.cpp index 89adf47a79943..cc28eb9c34ff0 100644 --- a/unified-runtime/source/loader/layers/validation/ur_valddi.cpp +++ b/unified-runtime/source/loader/layers/validation/ur_valddi.cpp @@ -6887,6 +6887,75 @@ __urdlllocal ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSize( return result; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urKernelGetSuggestedLocalWorkSizeWithArgs +__urdlllocal ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSizeWithArgs( + /// [in] handle of the kernel + ur_kernel_handle_t hKernel, + /// [in] handle of the queue object + ur_queue_handle_t hQueue, + /// [in] number of dimensions, from 1 to 3, to specify the global + /// and work-group work-items + uint32_t numWorkDim, + /// [in] pointer to an array of numWorkDim unsigned values that specify + /// the offset used to calculate the global ID of a work-item + const size_t *pGlobalWorkOffset, + /// [in] pointer to an array of numWorkDim unsigned values that specify + /// the number of global work-items in workDim that will execute the + /// kernel function + const size_t *pGlobalWorkSize, + /// [in] Number of entries in pArgs + uint32_t numArgs, + /// [in][optional][range(0, numArgs)] pointer to a list of kernel arg + /// properties. 
+ const ur_exp_kernel_arg_properties_t *pArgs, + /// [out] pointer to an array of numWorkDim unsigned values that specify + /// suggested local work size that will contain the result of the query + size_t *pSuggestedLocalWorkSize) { + auto pfnGetSuggestedLocalWorkSizeWithArgs = + getContext()->urDdiTable.Kernel.pfnGetSuggestedLocalWorkSizeWithArgs; + + if (nullptr == pfnGetSuggestedLocalWorkSizeWithArgs) { + return UR_RESULT_ERROR_UNINITIALIZED; + } + + if (getContext()->enableParameterValidation) { + if (NULL == pGlobalWorkOffset) + return UR_RESULT_ERROR_INVALID_NULL_POINTER; + + if (NULL == pGlobalWorkSize) + return UR_RESULT_ERROR_INVALID_NULL_POINTER; + + if (NULL == pSuggestedLocalWorkSize) + return UR_RESULT_ERROR_INVALID_NULL_POINTER; + + if (NULL == hKernel) + return UR_RESULT_ERROR_INVALID_NULL_HANDLE; + + if (NULL == hQueue) + return UR_RESULT_ERROR_INVALID_NULL_HANDLE; + + if (NULL != pArgs && UR_EXP_KERNEL_ARG_TYPE_SAMPLER < pArgs->type) + return UR_RESULT_ERROR_INVALID_ENUMERATION; + } + + if (getContext()->enableLifetimeValidation && + !getContext()->refCountContext->isReferenceValid(hKernel)) { + URLOG_CTX_INVALID_REFERENCE(hKernel); + } + + if (getContext()->enableLifetimeValidation && + !getContext()->refCountContext->isReferenceValid(hQueue)) { + URLOG_CTX_INVALID_REFERENCE(hQueue); + } + + ur_result_t result = pfnGetSuggestedLocalWorkSizeWithArgs( + hKernel, hQueue, numWorkDim, pGlobalWorkOffset, pGlobalWorkSize, numArgs, + pArgs, pSuggestedLocalWorkSize); + + return result; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Intercept function for urKernelSuggestMaxCooperativeGroupCount __urdlllocal ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCount( @@ -12625,6 +12694,11 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelProcAddrTable( pDdiTable->pfnGetSuggestedLocalWorkSize = ur_validation_layer::urKernelGetSuggestedLocalWorkSize; + dditable.pfnGetSuggestedLocalWorkSizeWithArgs = + 
pDdiTable->pfnGetSuggestedLocalWorkSizeWithArgs; + pDdiTable->pfnGetSuggestedLocalWorkSizeWithArgs = + ur_validation_layer::urKernelGetSuggestedLocalWorkSizeWithArgs; + dditable.pfnSetArgValue = pDdiTable->pfnSetArgValue; pDdiTable->pfnSetArgValue = ur_validation_layer::urKernelSetArgValue; diff --git a/unified-runtime/source/loader/loader.def.in b/unified-runtime/source/loader/loader.def.in index a9a1325e97d12..3bc3e9d2957aa 100644 --- a/unified-runtime/source/loader/loader.def.in +++ b/unified-runtime/source/loader/loader.def.in @@ -157,6 +157,7 @@ EXPORTS urKernelGetNativeHandle urKernelGetSubGroupInfo urKernelGetSuggestedLocalWorkSize + urKernelGetSuggestedLocalWorkSizeWithArgs urKernelRelease urKernelRetain urKernelSetArgLocal @@ -417,6 +418,7 @@ EXPORTS urPrintKernelGetNativeHandleParams urPrintKernelGetSubGroupInfoParams urPrintKernelGetSuggestedLocalWorkSizeParams + urPrintKernelGetSuggestedLocalWorkSizeWithArgsParams urPrintKernelGroupInfo urPrintKernelInfo urPrintKernelLaunchClusterProperty diff --git a/unified-runtime/source/loader/loader.map.in b/unified-runtime/source/loader/loader.map.in index 465978fdb5f8e..3d6e0e7b9db43 100644 --- a/unified-runtime/source/loader/loader.map.in +++ b/unified-runtime/source/loader/loader.map.in @@ -157,6 +157,7 @@ urKernelGetNativeHandle; urKernelGetSubGroupInfo; urKernelGetSuggestedLocalWorkSize; + urKernelGetSuggestedLocalWorkSizeWithArgs; urKernelRelease; urKernelRetain; urKernelSetArgLocal; @@ -417,6 +418,7 @@ urPrintKernelGetNativeHandleParams; urPrintKernelGetSubGroupInfoParams; urPrintKernelGetSuggestedLocalWorkSizeParams; + urPrintKernelGetSuggestedLocalWorkSizeWithArgsParams; urPrintKernelGroupInfo; urPrintKernelInfo; urPrintKernelLaunchClusterProperty; diff --git a/unified-runtime/source/loader/ur_ldrddi.cpp b/unified-runtime/source/loader/ur_ldrddi.cpp index 0439bb0d95456..5263fafcf1661 100644 --- a/unified-runtime/source/loader/ur_ldrddi.cpp +++ b/unified-runtime/source/loader/ur_ldrddi.cpp @@ -3595,6 
+3595,45 @@ __urdlllocal ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSize( pSuggestedLocalWorkSize); } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urKernelGetSuggestedLocalWorkSizeWithArgs +__urdlllocal ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSizeWithArgs( + /// [in] handle of the kernel + ur_kernel_handle_t hKernel, + /// [in] handle of the queue object + ur_queue_handle_t hQueue, + /// [in] number of dimensions, from 1 to 3, to specify the global + /// and work-group work-items + uint32_t numWorkDim, + /// [in] pointer to an array of numWorkDim unsigned values that specify + /// the offset used to calculate the global ID of a work-item + const size_t *pGlobalWorkOffset, + /// [in] pointer to an array of numWorkDim unsigned values that specify + /// the number of global work-items in workDim that will execute the + /// kernel function + const size_t *pGlobalWorkSize, + /// [in] Number of entries in pArgs + uint32_t numArgs, + /// [in][optional][range(0, numArgs)] pointer to a list of kernel arg + /// properties. 
+ const ur_exp_kernel_arg_properties_t *pArgs, + /// [out] pointer to an array of numWorkDim unsigned values that specify + /// suggested local work size that will contain the result of the query + size_t *pSuggestedLocalWorkSize) { + + auto *dditable = *reinterpret_cast<ur_dditable_t **>(hKernel); + + auto *pfnGetSuggestedLocalWorkSizeWithArgs = + dditable->Kernel.pfnGetSuggestedLocalWorkSizeWithArgs; + if (nullptr == pfnGetSuggestedLocalWorkSizeWithArgs) + return UR_RESULT_ERROR_UNINITIALIZED; + + // forward to device-platform + return pfnGetSuggestedLocalWorkSizeWithArgs( + hKernel, hQueue, numWorkDim, pGlobalWorkOffset, pGlobalWorkSize, numArgs, + pArgs, pSuggestedLocalWorkSize); +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Intercept function for urKernelSuggestMaxCooperativeGroupCount __urdlllocal ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCount( @@ -7004,6 +7043,8 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelProcAddrTable( ur_loader::urKernelCreateWithNativeHandle; pDdiTable->pfnGetSuggestedLocalWorkSize = ur_loader::urKernelGetSuggestedLocalWorkSize; + pDdiTable->pfnGetSuggestedLocalWorkSizeWithArgs = + ur_loader::urKernelGetSuggestedLocalWorkSizeWithArgs; pDdiTable->pfnSetArgValue = ur_loader::urKernelSetArgValue; pDdiTable->pfnSetArgLocal = ur_loader::urKernelSetArgLocal; pDdiTable->pfnSetArgPointer = ur_loader::urKernelSetArgPointer; diff --git a/unified-runtime/source/loader/ur_libapi.cpp b/unified-runtime/source/loader/ur_libapi.cpp index 2c0989155dc0c..2a1050ef459d6 100644 --- a/unified-runtime/source/loader/ur_libapi.cpp +++ b/unified-runtime/source/loader/ur_libapi.cpp @@ -6893,6 +6893,67 @@ ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSize( return exceptionToResult(std::current_exception()); } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Get the suggested local work size for a kernel and set args at kernel +/// launch time. 
+/// +/// @details +/// - Query a suggested local work size for a kernel given a global size for +/// each dimension. +/// - The application may call this function from simultaneous threads for +/// the same context. +/// +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_UNINITIALIZED +/// - ::UR_RESULT_ERROR_DEVICE_LOST +/// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC +/// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE +/// + `NULL == hKernel` +/// + `NULL == hQueue` +/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +/// + `NULL == pGlobalWorkOffset` +/// + `NULL == pGlobalWorkSize` +/// + `NULL == pSuggestedLocalWorkSize` +/// - ::UR_RESULT_ERROR_INVALID_ENUMERATION +/// + `NULL != pArgs && ::UR_EXP_KERNEL_ARG_TYPE_SAMPLER < pArgs->type` +/// - ::UR_RESULT_ERROR_UNSUPPORTED_FEATURE +ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSizeWithArgs( + /// [in] handle of the kernel + ur_kernel_handle_t hKernel, + /// [in] handle of the queue object + ur_queue_handle_t hQueue, + /// [in] number of dimensions, from 1 to 3, to specify the global + /// and work-group work-items + uint32_t numWorkDim, + /// [in] pointer to an array of numWorkDim unsigned values that specify + /// the offset used to calculate the global ID of a work-item + const size_t *pGlobalWorkOffset, + /// [in] pointer to an array of numWorkDim unsigned values that specify + /// the number of global work-items in workDim that will execute the + /// kernel function + const size_t *pGlobalWorkSize, + /// [in] Number of entries in pArgs + uint32_t numArgs, + /// [in][optional][range(0, numArgs)] pointer to a list of kernel arg + /// properties. 
+ const ur_exp_kernel_arg_properties_t *pArgs, + /// [out] pointer to an array of numWorkDim unsigned values that specify + /// suggested local work size that will contain the result of the query + size_t *pSuggestedLocalWorkSize) try { + auto pfnGetSuggestedLocalWorkSizeWithArgs = + ur_lib::getContext() + ->urDdiTable.Kernel.pfnGetSuggestedLocalWorkSizeWithArgs; + if (nullptr == pfnGetSuggestedLocalWorkSizeWithArgs) + return UR_RESULT_ERROR_UNINITIALIZED; + + return pfnGetSuggestedLocalWorkSizeWithArgs( + hKernel, hQueue, numWorkDim, pGlobalWorkOffset, pGlobalWorkSize, numArgs, + pArgs, pSuggestedLocalWorkSize); +} catch (...) { + return exceptionToResult(std::current_exception()); +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Query the maximum number of work groups for a cooperative kernel /// diff --git a/unified-runtime/source/loader/ur_print.cpp b/unified-runtime/source/loader/ur_print.cpp index f45fceb05c676..aa043bffc1c2e 100644 --- a/unified-runtime/source/loader/ur_print.cpp +++ b/unified-runtime/source/loader/ur_print.cpp @@ -2264,6 +2264,15 @@ ur_result_t urPrintKernelGetSuggestedLocalWorkSizeParams( return str_copy(&ss, buffer, buff_size, out_size); } +ur_result_t urPrintKernelGetSuggestedLocalWorkSizeWithArgsParams( + const struct ur_kernel_get_suggested_local_work_size_with_args_params_t + *params, + char *buffer, const size_t buff_size, size_t *out_size) { + std::stringstream ss; + ss << params; + return str_copy(&ss, buffer, buff_size, out_size); +} + ur_result_t urPrintKernelSetArgValueParams( const struct ur_kernel_set_arg_value_params_t *params, char *buffer, const size_t buff_size, size_t *out_size) { diff --git a/unified-runtime/source/ur_api.cpp b/unified-runtime/source/ur_api.cpp index 44702034644e8..8c3d8d35bf329 100644 --- a/unified-runtime/source/ur_api.cpp +++ b/unified-runtime/source/ur_api.cpp @@ -6065,6 +6065,58 @@ ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSize( return result; 
} +/////////////////////////////////////////////////////////////////////////////// +/// @brief Get the suggested local work size for a kernel and set args at kernel +/// launch time. +/// +/// @details +/// - Query a suggested local work size for a kernel given a global size for +/// each dimension. +/// - The application may call this function from simultaneous threads for +/// the same context. +/// +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_UNINITIALIZED +/// - ::UR_RESULT_ERROR_DEVICE_LOST +/// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC +/// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE +/// + `NULL == hKernel` +/// + `NULL == hQueue` +/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +/// + `NULL == pGlobalWorkOffset` +/// + `NULL == pGlobalWorkSize` +/// + `NULL == pSuggestedLocalWorkSize` +/// - ::UR_RESULT_ERROR_INVALID_ENUMERATION +/// + `NULL != pArgs && ::UR_EXP_KERNEL_ARG_TYPE_SAMPLER < pArgs->type` +/// - ::UR_RESULT_ERROR_UNSUPPORTED_FEATURE +ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSizeWithArgs( + /// [in] handle of the kernel + ur_kernel_handle_t hKernel, + /// [in] handle of the queue object + ur_queue_handle_t hQueue, + /// [in] number of dimensions, from 1 to 3, to specify the global + /// and work-group work-items + uint32_t numWorkDim, + /// [in] pointer to an array of numWorkDim unsigned values that specify + /// the offset used to calculate the global ID of a work-item + const size_t *pGlobalWorkOffset, + /// [in] pointer to an array of numWorkDim unsigned values that specify + /// the number of global work-items in workDim that will execute the + /// kernel function + const size_t *pGlobalWorkSize, + /// [in] Number of entries in pArgs + uint32_t numArgs, + /// [in][optional][range(0, numArgs)] pointer to a list of kernel arg + /// properties. 
+ const ur_exp_kernel_arg_properties_t *pArgs, + /// [out] pointer to an array of numWorkDim unsigned values that specify + /// suggested local work size that will contain the result of the query + size_t *pSuggestedLocalWorkSize) { + ur_result_t result = UR_RESULT_SUCCESS; + return result; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Query the maximum number of work groups for a cooperative kernel /// From 56678d5da39ee17db9e4c3fcd4503ba4c5ec179f Mon Sep 17 00:00:00 2001 From: Lukasz Dorau Date: Wed, 4 Mar 2026 14:33:19 +0000 Subject: [PATCH 3/9] [UR] Add implementation of urKernelGetSuggestedLocalWorkSizeWithArgs() API urKernelGetSuggestedLocalWorkSizeWithArgs() only calls urKernelGetSuggestedLocalWorkSize() for now, because its full implementation requires more changes in the sanitizers' layer, which will be implemented in the following pull requests. Signed-off-by: Lukasz Dorau --- unified-runtime/source/adapters/cuda/kernel.cpp | 11 +++++++++++ unified-runtime/source/adapters/hip/kernel.cpp | 11 +++++++++++ .../source/adapters/level_zero/kernel.cpp | 11 +++++++++++ .../source/adapters/level_zero/v2/kernel.cpp | 11 +++++++++++ .../source/adapters/native_cpu/kernel.cpp | 12 ++++++++++++ unified-runtime/source/adapters/offload/kernel.cpp | 7 +++++++ unified-runtime/source/adapters/opencl/kernel.cpp | 11 +++++++++++ 7 files changed, 74 insertions(+) diff --git a/unified-runtime/source/adapters/cuda/kernel.cpp b/unified-runtime/source/adapters/cuda/kernel.cpp index 41c0873807995..1550457fa13c6 100644 --- a/unified-runtime/source/adapters/cuda/kernel.cpp +++ b/unified-runtime/source/adapters/cuda/kernel.cpp @@ -445,6 +445,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSize( return UR_RESULT_SUCCESS; } +UR_APIEXPORT ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSizeWithArgs( + ur_kernel_handle_t hKernel, ur_queue_handle_t hQueue, uint32_t workDim, + const size_t *pGlobalWorkOffset, 
const size_t *pGlobalWorkSize, + [[maybe_unused]] uint32_t numArgs, + [[maybe_unused]] const ur_exp_kernel_arg_properties_t *pArgs, + size_t *pSuggestedLocalWorkSize) { + return urKernelGetSuggestedLocalWorkSize(hKernel, hQueue, workDim, + pGlobalWorkOffset, pGlobalWorkSize, + pSuggestedLocalWorkSize); +} + UR_APIEXPORT ur_result_t UR_APICALL urKernelSetSpecializationConstants( ur_kernel_handle_t, uint32_t, const ur_specialization_constant_info_t *) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; diff --git a/unified-runtime/source/adapters/hip/kernel.cpp b/unified-runtime/source/adapters/hip/kernel.cpp index 980449020d125..1cbb798c6b8d2 100644 --- a/unified-runtime/source/adapters/hip/kernel.cpp +++ b/unified-runtime/source/adapters/hip/kernel.cpp @@ -373,3 +373,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSize( pSuggestedLocalWorkSize); return UR_RESULT_SUCCESS; } + +UR_APIEXPORT ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSizeWithArgs( + ur_kernel_handle_t hKernel, ur_queue_handle_t hQueue, uint32_t workDim, + const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, + [[maybe_unused]] uint32_t numArgs, + [[maybe_unused]] const ur_exp_kernel_arg_properties_t *pArgs, + size_t *pSuggestedLocalWorkSize) { + return urKernelGetSuggestedLocalWorkSize(hKernel, hQueue, workDim, + pGlobalWorkOffset, pGlobalWorkSize, + pSuggestedLocalWorkSize); +} diff --git a/unified-runtime/source/adapters/level_zero/kernel.cpp b/unified-runtime/source/adapters/level_zero/kernel.cpp index 4f5d72c0d88c7..ca8c7f67bc63c 100644 --- a/unified-runtime/source/adapters/level_zero/kernel.cpp +++ b/unified-runtime/source/adapters/level_zero/kernel.cpp @@ -56,6 +56,17 @@ ur_result_t urKernelGetSuggestedLocalWorkSize( return UR_RESULT_SUCCESS; } +ur_result_t urKernelGetSuggestedLocalWorkSizeWithArgs( + ur_kernel_handle_t hKernel, ur_queue_handle_t hQueue, uint32_t workDim, + const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, + [[maybe_unused]] 
uint32_t numArgs, + [[maybe_unused]] const ur_exp_kernel_arg_properties_t *pArgs, + size_t *pSuggestedLocalWorkSize) { + return ur::level_zero::urKernelGetSuggestedLocalWorkSize( + hKernel, hQueue, workDim, pGlobalWorkOffset, pGlobalWorkSize, + pSuggestedLocalWorkSize); +} + ur_result_t urKernelSetArgValueHelper( ur_kernel_handle_t Kernel, /// [in] argument index in range [0, num args - 1] diff --git a/unified-runtime/source/adapters/level_zero/v2/kernel.cpp b/unified-runtime/source/adapters/level_zero/v2/kernel.cpp index 5f4a51f7f2ef5..54eb0862d9201 100644 --- a/unified-runtime/source/adapters/level_zero/v2/kernel.cpp +++ b/unified-runtime/source/adapters/level_zero/v2/kernel.cpp @@ -719,6 +719,17 @@ ur_result_t urKernelGetSuggestedLocalWorkSize( return UR_RESULT_SUCCESS; } +ur_result_t urKernelGetSuggestedLocalWorkSizeWithArgs( + ur_kernel_handle_t hKernel, ur_queue_handle_t hQueue, uint32_t workDim, + const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, + [[maybe_unused]] uint32_t numArgs, + [[maybe_unused]] const ur_exp_kernel_arg_properties_t *pArgs, + size_t *pSuggestedLocalWorkSize) { + return ur::level_zero::urKernelGetSuggestedLocalWorkSize( + hKernel, hQueue, workDim, pGlobalWorkOffset, pGlobalWorkSize, + pSuggestedLocalWorkSize); +} + ur_result_t urKernelSuggestMaxCooperativeGroupCount( ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, uint32_t workDim, const size_t *pLocalWorkSize, size_t dynamicSharedMemorySize, diff --git a/unified-runtime/source/adapters/native_cpu/kernel.cpp b/unified-runtime/source/adapters/native_cpu/kernel.cpp index 021a95bdadc5c..9fe7e3cbefaed 100644 --- a/unified-runtime/source/adapters/native_cpu/kernel.cpp +++ b/unified-runtime/source/adapters/native_cpu/kernel.cpp @@ -267,6 +267,18 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSize( return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } +UR_APIEXPORT ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSizeWithArgs( + [[maybe_unused]] 
ur_kernel_handle_t hKernel, + [[maybe_unused]] ur_queue_handle_t hQueue, + [[maybe_unused]] uint32_t workDim, + [[maybe_unused]] const size_t *pGlobalWorkOffset, + [[maybe_unused]] const size_t *pGlobalWorkSize, + [[maybe_unused]] uint32_t numArgs, + [[maybe_unused]] const ur_exp_kernel_arg_properties_t *pArgs, + [[maybe_unused]] size_t *pSuggestedLocalWorkSize) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + UR_APIEXPORT ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCount( [[maybe_unused]] ur_kernel_handle_t hKernel, [[maybe_unused]] ur_device_handle_t hDevice, diff --git a/unified-runtime/source/adapters/offload/kernel.cpp b/unified-runtime/source/adapters/offload/kernel.cpp index ae155e2ad607c..5eada6fda135c 100644 --- a/unified-runtime/source/adapters/offload/kernel.cpp +++ b/unified-runtime/source/adapters/offload/kernel.cpp @@ -152,6 +152,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSize( return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } +UR_APIEXPORT ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSizeWithArgs( + ur_kernel_handle_t, ur_queue_handle_t, uint32_t, const size_t *, + const size_t *, uint32_t, const ur_exp_kernel_arg_properties_t *, + size_t *) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgSampler( ur_kernel_handle_t, uint32_t, const ur_kernel_arg_sampler_properties_t *, ur_sampler_handle_t) { diff --git a/unified-runtime/source/adapters/opencl/kernel.cpp b/unified-runtime/source/adapters/opencl/kernel.cpp index b673f9743766c..2ccc58077f3fb 100644 --- a/unified-runtime/source/adapters/opencl/kernel.cpp +++ b/unified-runtime/source/adapters/opencl/kernel.cpp @@ -529,6 +529,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSize( return UR_RESULT_SUCCESS; } +UR_APIEXPORT ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSizeWithArgs( + ur_kernel_handle_t hKernel, ur_queue_handle_t hQueue, uint32_t workDim, + const size_t 
*pGlobalWorkOffset, const size_t *pGlobalWorkSize, + [[maybe_unused]] uint32_t numArgs, + [[maybe_unused]] const ur_exp_kernel_arg_properties_t *pArgs, + size_t *pSuggestedLocalWorkSize) { + return urKernelGetSuggestedLocalWorkSize(hKernel, hQueue, workDim, + pGlobalWorkOffset, pGlobalWorkSize, + pSuggestedLocalWorkSize); +} + UR_APIEXPORT ur_result_t UR_APICALL urKernelSetSpecializationConstants( ur_kernel_handle_t, uint32_t, const ur_specialization_constant_info_t *) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; From dcea2b1461f0f0205d0c63db47691b64335694eb Mon Sep 17 00:00:00 2001 From: Lukasz Dorau Date: Thu, 5 Mar 2026 09:18:53 +0000 Subject: [PATCH 4/9] [UR] Use urKernelGetSuggestedLocalWorkSizeWithArgs() in sanitizers Use urKernelGetSuggestedLocalWorkSizeWithArgs() in sanitizers' intercept functions for urEnqueueKernelLaunchWithArgsExp(). Signed-off-by: Lukasz Dorau --- .../loader/layers/sanitizer/asan/asan_ddi.cpp | 6 +++-- .../sanitizer/asan/asan_interceptor.cpp | 19 ++++++++------ .../sanitizer/asan/asan_interceptor.hpp | 7 +++-- .../loader/layers/sanitizer/msan/msan_ddi.cpp | 6 +++-- .../sanitizer/msan/msan_interceptor.cpp | 20 ++++++++------ .../sanitizer/msan/msan_interceptor.hpp | 7 +++-- .../loader/layers/sanitizer/tsan/tsan_ddi.cpp | 6 +++-- .../sanitizer/tsan/tsan_interceptor.cpp | 26 ++++++++++--------- .../sanitizer/tsan/tsan_interceptor.hpp | 7 +++-- 9 files changed, 64 insertions(+), 40 deletions(-) diff --git a/unified-runtime/source/loader/layers/sanitizer/asan/asan_ddi.cpp b/unified-runtime/source/loader/layers/sanitizer/asan/asan_ddi.cpp index e273680ea75d5..a15040f33a385 100644 --- a/unified-runtime/source/loader/layers/sanitizer/asan/asan_ddi.cpp +++ b/unified-runtime/source/loader/layers/sanitizer/asan/asan_ddi.cpp @@ -539,7 +539,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunch( pLocalWorkSize, pGlobalWorkOffset, workDim); UR_CALL(LaunchInfo.Data.syncToDevice(hQueue)); - 
UR_CALL(getAsanInterceptor()->preLaunchKernel(hKernel, hQueue, LaunchInfo)); + UR_CALL(getAsanInterceptor()->preLaunchKernel(hKernel, hQueue, LaunchInfo, 0, + nullptr)); ur_result_t UrRes = getContext()->urDdiTable.Enqueue.pfnKernelLaunch( hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, @@ -1805,7 +1806,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchWithArgsExp( pLocalWorkSize, pGlobalWorkOffset, workDim); UR_CALL(LaunchInfo.Data.syncToDevice(hQueue)); - UR_CALL(getAsanInterceptor()->preLaunchKernel(hKernel, hQueue, LaunchInfo)); + UR_CALL(getAsanInterceptor()->preLaunchKernel(hKernel, hQueue, LaunchInfo, + numArgs, pArgs)); UR_CALL(getContext()->urDdiTable.EnqueueExp.pfnKernelLaunchWithArgsExp( hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, diff --git a/unified-runtime/source/loader/layers/sanitizer/asan/asan_interceptor.cpp b/unified-runtime/source/loader/layers/sanitizer/asan/asan_interceptor.cpp index 4288ca38236f7..234fb4358ad10 100644 --- a/unified-runtime/source/loader/layers/sanitizer/asan/asan_interceptor.cpp +++ b/unified-runtime/source/loader/layers/sanitizer/asan/asan_interceptor.cpp @@ -246,9 +246,9 @@ ur_result_t AsanInterceptor::releaseMemory(ur_context_handle_t Context, return UR_RESULT_SUCCESS; } -ur_result_t AsanInterceptor::preLaunchKernel(ur_kernel_handle_t Kernel, - ur_queue_handle_t Queue, - LaunchInfo &LaunchInfo) { +ur_result_t AsanInterceptor::preLaunchKernel( + ur_kernel_handle_t Kernel, ur_queue_handle_t Queue, LaunchInfo &LaunchInfo, + uint32_t numArgs, const ur_exp_kernel_arg_properties_t *pArgs) { auto Context = GetContext(Queue); auto Device = GetDevice(Queue); auto ContextInfo = getContextInfo(Context); @@ -271,7 +271,7 @@ ur_result_t AsanInterceptor::preLaunchKernel(ur_kernel_handle_t Kernel, UR_CALL(updateShadowMemory(DeviceInfo, InternalQueue)); UR_CALL(prepareLaunch(ContextInfo, DeviceInfo, InternalQueue, Kernel, - LaunchInfo)); + LaunchInfo, numArgs, pArgs)); 
UR_CALL(getContext()->urDdiTable.Queue.pfnFinish(InternalQueue)); @@ -724,7 +724,8 @@ AsanInterceptor::getMemBuffer(ur_mem_handle_t MemHandle) { ur_result_t AsanInterceptor::prepareLaunch( std::shared_ptr &ContextInfo, std::shared_ptr &DeviceInfo, ur_queue_handle_t Queue, - ur_kernel_handle_t Kernel, LaunchInfo &LaunchInfo) { + ur_kernel_handle_t Kernel, LaunchInfo &LaunchInfo, uint32_t numArgs, + const ur_exp_kernel_arg_properties_t *pArgs) { auto &KernelInfo = getOrCreateKernelInfo(Kernel); std::shared_lock Guard(KernelInfo.Mutex); @@ -800,9 +801,11 @@ ur_result_t AsanInterceptor::prepareLaunch( if (LaunchInfo.LocalWorkSize.empty()) { LaunchInfo.LocalWorkSize.resize(LaunchInfo.WorkDim); - auto URes = getContext()->urDdiTable.Kernel.pfnGetSuggestedLocalWorkSize( - Kernel, Queue, LaunchInfo.WorkDim, LaunchInfo.GlobalWorkOffset.data(), - LaunchInfo.GlobalWorkSize, LaunchInfo.LocalWorkSize.data()); + auto URes = + getContext()->urDdiTable.Kernel.pfnGetSuggestedLocalWorkSizeWithArgs( + Kernel, Queue, LaunchInfo.WorkDim, + LaunchInfo.GlobalWorkOffset.data(), LaunchInfo.GlobalWorkSize, + numArgs, pArgs, LaunchInfo.LocalWorkSize.data()); if (URes != UR_RESULT_SUCCESS) { if (URes != UR_RESULT_ERROR_UNSUPPORTED_FEATURE) { return URes; diff --git a/unified-runtime/source/loader/layers/sanitizer/asan/asan_interceptor.hpp b/unified-runtime/source/loader/layers/sanitizer/asan/asan_interceptor.hpp index fe262f955b6a7..ba27ec4d3e7df 100644 --- a/unified-runtime/source/loader/layers/sanitizer/asan/asan_interceptor.hpp +++ b/unified-runtime/source/loader/layers/sanitizer/asan/asan_interceptor.hpp @@ -312,7 +312,9 @@ class AsanInterceptor { ur_result_t unregisterProgram(ur_program_handle_t Program); ur_result_t preLaunchKernel(ur_kernel_handle_t Kernel, - ur_queue_handle_t Queue, LaunchInfo &LaunchInfo); + ur_queue_handle_t Queue, LaunchInfo &LaunchInfo, + uint32_t numArgs, + const ur_exp_kernel_arg_properties_t *pArgs); ur_result_t postLaunchKernel(ur_kernel_handle_t Kernel, 
ur_queue_handle_t Queue, LaunchInfo &LaunchInfo); @@ -394,7 +396,8 @@ class AsanInterceptor { ur_result_t prepareLaunch(std::shared_ptr &ContextInfo, std::shared_ptr &DeviceInfo, ur_queue_handle_t Queue, ur_kernel_handle_t Kernel, - LaunchInfo &LaunchInfo); + LaunchInfo &LaunchInfo, uint32_t numArgs, + const ur_exp_kernel_arg_properties_t *pArgs); ur_result_t registerDeviceGlobals(ur_program_handle_t Program); ur_result_t registerSpirKernels(ur_program_handle_t Program); diff --git a/unified-runtime/source/loader/layers/sanitizer/msan/msan_ddi.cpp b/unified-runtime/source/loader/layers/sanitizer/msan/msan_ddi.cpp index 00eda07ce10da..fb1d04ae0dd23 100644 --- a/unified-runtime/source/loader/layers/sanitizer/msan/msan_ddi.cpp +++ b/unified-runtime/source/loader/layers/sanitizer/msan/msan_ddi.cpp @@ -503,7 +503,8 @@ ur_result_t urEnqueueKernelLaunch( LaunchInfo LaunchInfo(GetContext(hQueue), GetDevice(hQueue), pGlobalWorkSize, pLocalWorkSize, pGlobalWorkOffset, workDim); - UR_CALL(getMsanInterceptor()->preLaunchKernel(hKernel, hQueue, LaunchInfo)); + UR_CALL(getMsanInterceptor()->preLaunchKernel(hKernel, hQueue, LaunchInfo, 0, + nullptr)); UR_CALL(getContext()->urDdiTable.Enqueue.pfnKernelLaunch( hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, @@ -1946,7 +1947,8 @@ ur_result_t urEnqueueKernelLaunchWithArgsExp( LaunchInfo LaunchInfo(GetContext(hQueue), GetDevice(hQueue), pGlobalWorkSize, pLocalWorkSize, pGlobalWorkOffset, workDim); - UR_CALL(getMsanInterceptor()->preLaunchKernel(hKernel, hQueue, LaunchInfo)); + UR_CALL(getMsanInterceptor()->preLaunchKernel(hKernel, hQueue, LaunchInfo, + numArgs, pArgs)); UR_CALL(getContext()->urDdiTable.EnqueueExp.pfnKernelLaunchWithArgsExp( hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, diff --git a/unified-runtime/source/loader/layers/sanitizer/msan/msan_interceptor.cpp b/unified-runtime/source/loader/layers/sanitizer/msan/msan_interceptor.cpp index a16c78c552722..3cce8d460129f 100644 --- 
a/unified-runtime/source/loader/layers/sanitizer/msan/msan_interceptor.cpp +++ b/unified-runtime/source/loader/layers/sanitizer/msan/msan_interceptor.cpp @@ -135,9 +135,9 @@ ur_result_t MsanInterceptor::releaseMemory(ur_context_handle_t Context, return getContext()->urDdiTable.USM.pfnFree(Context, Ptr); } -ur_result_t MsanInterceptor::preLaunchKernel(ur_kernel_handle_t Kernel, - ur_queue_handle_t Queue, - LaunchInfo &LaunchInfo) { +ur_result_t MsanInterceptor::preLaunchKernel( + ur_kernel_handle_t Kernel, ur_queue_handle_t Queue, LaunchInfo &LaunchInfo, + uint32_t numArgs, const ur_exp_kernel_arg_properties_t *pArgs) { auto Context = GetContext(Queue); auto Device = GetDevice(Queue); auto ContextInfo = getContextInfo(Context); @@ -149,7 +149,8 @@ ur_result_t MsanInterceptor::preLaunchKernel(ur_kernel_handle_t Kernel, return UR_RESULT_ERROR_INVALID_QUEUE; } - UR_CALL(prepareLaunch(DeviceInfo, InternalQueue, Kernel, LaunchInfo)); + UR_CALL(prepareLaunch(DeviceInfo, InternalQueue, Kernel, LaunchInfo, numArgs, + pArgs)); return UR_RESULT_SUCCESS; } @@ -458,7 +459,8 @@ MsanInterceptor::getMemBuffer(ur_mem_handle_t MemHandle) { ur_result_t MsanInterceptor::prepareLaunch( std::shared_ptr &DeviceInfo, ur_queue_handle_t Queue, - ur_kernel_handle_t Kernel, LaunchInfo &LaunchInfo) { + ur_kernel_handle_t Kernel, LaunchInfo &LaunchInfo, uint32_t numArgs, + const ur_exp_kernel_arg_properties_t *pArgs) { auto Program = GetProgram(Kernel); // Set membuffer arguments @@ -509,9 +511,11 @@ ur_result_t MsanInterceptor::prepareLaunch( if (LaunchInfo.LocalWorkSize.empty()) { LaunchInfo.LocalWorkSize.resize(LaunchInfo.WorkDim); - auto URes = getContext()->urDdiTable.Kernel.pfnGetSuggestedLocalWorkSize( - Kernel, Queue, LaunchInfo.WorkDim, LaunchInfo.GlobalWorkOffset.data(), - LaunchInfo.GlobalWorkSize, LaunchInfo.LocalWorkSize.data()); + auto URes = + getContext()->urDdiTable.Kernel.pfnGetSuggestedLocalWorkSizeWithArgs( + Kernel, Queue, LaunchInfo.WorkDim, + 
LaunchInfo.GlobalWorkOffset.data(), LaunchInfo.GlobalWorkSize, + numArgs, pArgs, LaunchInfo.LocalWorkSize.data()); if (URes != UR_RESULT_SUCCESS) { if (URes != UR_RESULT_ERROR_UNSUPPORTED_FEATURE) { return URes; diff --git a/unified-runtime/source/loader/layers/sanitizer/msan/msan_interceptor.hpp b/unified-runtime/source/loader/layers/sanitizer/msan/msan_interceptor.hpp index 574bf4f8013f2..10a32fa200f61 100644 --- a/unified-runtime/source/loader/layers/sanitizer/msan/msan_interceptor.hpp +++ b/unified-runtime/source/loader/layers/sanitizer/msan/msan_interceptor.hpp @@ -295,7 +295,9 @@ class MsanInterceptor { ur_result_t unregisterProgram(ur_program_handle_t Program); ur_result_t preLaunchKernel(ur_kernel_handle_t Kernel, - ur_queue_handle_t Queue, LaunchInfo &LaunchInfo); + ur_queue_handle_t Queue, LaunchInfo &LaunchInfo, + uint32_t numArgs, + const ur_exp_kernel_arg_properties_t *pArgs); ur_result_t postLaunchKernel(ur_kernel_handle_t Kernel, ur_queue_handle_t Queue, LaunchInfo &LaunchInfo); @@ -358,7 +360,8 @@ class MsanInterceptor { /// Initialize Global Variables & Kernel Name at first Launch ur_result_t prepareLaunch(std::shared_ptr &DeviceInfo, ur_queue_handle_t Queue, ur_kernel_handle_t Kernel, - LaunchInfo &LaunchInfo); + LaunchInfo &LaunchInfo, uint32_t numArgs, + const ur_exp_kernel_arg_properties_t *pArgs); ur_result_t allocShadowMemory(ur_context_handle_t Context, std::shared_ptr &DeviceInfo); diff --git a/unified-runtime/source/loader/layers/sanitizer/tsan/tsan_ddi.cpp b/unified-runtime/source/loader/layers/sanitizer/tsan/tsan_ddi.cpp index a5d7db6ad72cb..02a38935ed0c2 100644 --- a/unified-runtime/source/loader/layers/sanitizer/tsan/tsan_ddi.cpp +++ b/unified-runtime/source/loader/layers/sanitizer/tsan/tsan_ddi.cpp @@ -1347,7 +1347,8 @@ ur_result_t urEnqueueKernelLaunch( LaunchInfo LaunchInfo(GetContext(hQueue), GetDevice(hQueue), pGlobalWorkSize, pLocalWorkSize, pGlobalWorkOffset, workDim); - UR_CALL(getTsanInterceptor()->preLaunchKernel(hKernel, 
hQueue, LaunchInfo)); + UR_CALL(getTsanInterceptor()->preLaunchKernel(hKernel, hQueue, LaunchInfo, 0, + nullptr)); UR_CALL(getContext()->urDdiTable.Enqueue.pfnKernelLaunch( hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, @@ -1472,7 +1473,8 @@ ur_result_t urEnqueueKernelLaunchWithArgsExp( LaunchInfo LaunchInfo(GetContext(hQueue), GetDevice(hQueue), pGlobalWorkSize, pLocalWorkSize, pGlobalWorkOffset, workDim); - UR_CALL(getTsanInterceptor()->preLaunchKernel(hKernel, hQueue, LaunchInfo)); + UR_CALL(getTsanInterceptor()->preLaunchKernel(hKernel, hQueue, LaunchInfo, + numArgs, pArgs)); UR_CALL(getContext()->urDdiTable.EnqueueExp.pfnKernelLaunchWithArgsExp( hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, diff --git a/unified-runtime/source/loader/layers/sanitizer/tsan/tsan_interceptor.cpp b/unified-runtime/source/loader/layers/sanitizer/tsan/tsan_interceptor.cpp index 90904e404850a..b7c9e58383a23 100644 --- a/unified-runtime/source/loader/layers/sanitizer/tsan/tsan_interceptor.cpp +++ b/unified-runtime/source/loader/layers/sanitizer/tsan/tsan_interceptor.cpp @@ -336,15 +336,16 @@ TsanInterceptor::getMemBuffer(ur_mem_handle_t MemHandle) { return nullptr; } -ur_result_t TsanInterceptor::preLaunchKernel(ur_kernel_handle_t Kernel, - ur_queue_handle_t Queue, - LaunchInfo &LaunchInfo) { +ur_result_t TsanInterceptor::preLaunchKernel( + ur_kernel_handle_t Kernel, ur_queue_handle_t Queue, LaunchInfo &LaunchInfo, + uint32_t numArgs, const ur_exp_kernel_arg_properties_t *pArgs) { auto CI = getContextInfo(GetContext(Queue)); auto DI = getDeviceInfo(GetDevice(Queue)); ur_queue_handle_t InternalQueue = CI->getInternalQueue(DI->Handle); - UR_CALL(prepareLaunch(CI, DI, InternalQueue, Kernel, LaunchInfo)); + UR_CALL( + prepareLaunch(CI, DI, InternalQueue, Kernel, LaunchInfo, numArgs, pArgs)); UR_CALL(updateShadowMemory(DI, Kernel, InternalQueue)); @@ -373,11 +374,10 @@ ur_result_t TsanInterceptor::postLaunchKernel(ur_kernel_handle_t Kernel, return 
UR_RESULT_SUCCESS; } -ur_result_t TsanInterceptor::prepareLaunch(std::shared_ptr &, - std::shared_ptr &DI, - ur_queue_handle_t Queue, - ur_kernel_handle_t Kernel, - LaunchInfo &LaunchInfo) { +ur_result_t TsanInterceptor::prepareLaunch( + std::shared_ptr &, std::shared_ptr &DI, + ur_queue_handle_t Queue, ur_kernel_handle_t Kernel, LaunchInfo &LaunchInfo, + uint32_t numArgs, const ur_exp_kernel_arg_properties_t *pArgs) { // Set membuffer arguments auto &KernelInfo = getKernelInfo(Kernel); { @@ -399,9 +399,11 @@ ur_result_t TsanInterceptor::prepareLaunch(std::shared_ptr &, // Get suggested local work size if user doesn't determine it. if (LaunchInfo.LocalWorkSize.empty()) { LaunchInfo.LocalWorkSize.resize(LaunchInfo.WorkDim); - auto URes = getContext()->urDdiTable.Kernel.pfnGetSuggestedLocalWorkSize( - Kernel, Queue, LaunchInfo.WorkDim, LaunchInfo.GlobalWorkOffset.data(), - LaunchInfo.GlobalWorkSize, LaunchInfo.LocalWorkSize.data()); + auto URes = + getContext()->urDdiTable.Kernel.pfnGetSuggestedLocalWorkSizeWithArgs( + Kernel, Queue, LaunchInfo.WorkDim, + LaunchInfo.GlobalWorkOffset.data(), LaunchInfo.GlobalWorkSize, + numArgs, pArgs, LaunchInfo.LocalWorkSize.data()); if (URes != UR_RESULT_SUCCESS) { if (URes != UR_RESULT_ERROR_UNSUPPORTED_FEATURE) { return URes; diff --git a/unified-runtime/source/loader/layers/sanitizer/tsan/tsan_interceptor.hpp b/unified-runtime/source/loader/layers/sanitizer/tsan/tsan_interceptor.hpp index d454993dae6a1..16c618351f149 100644 --- a/unified-runtime/source/loader/layers/sanitizer/tsan/tsan_interceptor.hpp +++ b/unified-runtime/source/loader/layers/sanitizer/tsan/tsan_interceptor.hpp @@ -274,7 +274,9 @@ class TsanInterceptor { } ur_result_t preLaunchKernel(ur_kernel_handle_t Kernel, - ur_queue_handle_t Queue, LaunchInfo &LaunchInfo); + ur_queue_handle_t Queue, LaunchInfo &LaunchInfo, + uint32_t numArgs, + const ur_exp_kernel_arg_properties_t *pArgs); ur_result_t postLaunchKernel(ur_kernel_handle_t Kernel, ur_queue_handle_t Queue, 
LaunchInfo &LaunchInfo); @@ -313,7 +315,8 @@ class TsanInterceptor { ur_result_t prepareLaunch(std::shared_ptr &CI, std::shared_ptr &DI, ur_queue_handle_t Queue, ur_kernel_handle_t Kernel, - LaunchInfo &LaunchInfo); + LaunchInfo &LaunchInfo, uint32_t numArgs, + const ur_exp_kernel_arg_properties_t *pArgs); ur_result_t registerDeviceGlobals(ur_program_handle_t Program); From fa6e7dfd4b866a0760441e511ca3119de654bdbe Mon Sep 17 00:00:00 2001 From: Lukasz Dorau Date: Tue, 17 Mar 2026 10:35:44 +0000 Subject: [PATCH 5/9] Fixes after review #1 --- .../source/adapters/cuda/ur_interface_loader.cpp | 2 ++ .../source/adapters/hip/ur_interface_loader.cpp | 2 ++ .../source/adapters/native_cpu/ur_interface_loader.cpp | 2 ++ .../source/adapters/offload/ur_interface_loader.cpp | 2 ++ .../source/adapters/opencl/ur_interface_loader.cpp | 2 ++ .../loader/layers/sanitizer/asan/asan_interceptor.cpp | 7 ++++--- 6 files changed, 14 insertions(+), 3 deletions(-) diff --git a/unified-runtime/source/adapters/cuda/ur_interface_loader.cpp b/unified-runtime/source/adapters/cuda/ur_interface_loader.cpp index 0da25b3507997..80068cb3ddf7a 100644 --- a/unified-runtime/source/adapters/cuda/ur_interface_loader.cpp +++ b/unified-runtime/source/adapters/cuda/ur_interface_loader.cpp @@ -141,6 +141,8 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelProcAddrTable( pDdiTable->pfnSetExecInfo = urKernelSetExecInfo; pDdiTable->pfnSetSpecializationConstants = urKernelSetSpecializationConstants; pDdiTable->pfnGetSuggestedLocalWorkSize = urKernelGetSuggestedLocalWorkSize; + pDdiTable->pfnGetSuggestedLocalWorkSizeWithArgs = + urKernelGetSuggestedLocalWorkSizeWithArgs; pDdiTable->pfnSuggestMaxCooperativeGroupCount = urKernelSuggestMaxCooperativeGroupCount; return UR_RESULT_SUCCESS; diff --git a/unified-runtime/source/adapters/hip/ur_interface_loader.cpp b/unified-runtime/source/adapters/hip/ur_interface_loader.cpp index 56ffca3ae7094..8e7c481f60947 100644 --- 
a/unified-runtime/source/adapters/hip/ur_interface_loader.cpp +++ b/unified-runtime/source/adapters/hip/ur_interface_loader.cpp @@ -141,6 +141,8 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelProcAddrTable( pDdiTable->pfnSetExecInfo = urKernelSetExecInfo; pDdiTable->pfnSetSpecializationConstants = urKernelSetSpecializationConstants; pDdiTable->pfnGetSuggestedLocalWorkSize = urKernelGetSuggestedLocalWorkSize; + pDdiTable->pfnGetSuggestedLocalWorkSizeWithArgs = + urKernelGetSuggestedLocalWorkSizeWithArgs; pDdiTable->pfnSuggestMaxCooperativeGroupCount = urKernelSuggestMaxCooperativeGroupCount; return UR_RESULT_SUCCESS; diff --git a/unified-runtime/source/adapters/native_cpu/ur_interface_loader.cpp b/unified-runtime/source/adapters/native_cpu/ur_interface_loader.cpp index 78fe857721aa6..556d0c106f229 100644 --- a/unified-runtime/source/adapters/native_cpu/ur_interface_loader.cpp +++ b/unified-runtime/source/adapters/native_cpu/ur_interface_loader.cpp @@ -141,6 +141,8 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelProcAddrTable( pDdiTable->pfnSetExecInfo = urKernelSetExecInfo; pDdiTable->pfnSetSpecializationConstants = urKernelSetSpecializationConstants; pDdiTable->pfnGetSuggestedLocalWorkSize = urKernelGetSuggestedLocalWorkSize; + pDdiTable->pfnGetSuggestedLocalWorkSizeWithArgs = + urKernelGetSuggestedLocalWorkSizeWithArgs; pDdiTable->pfnSuggestMaxCooperativeGroupCount = urKernelSuggestMaxCooperativeGroupCount; return UR_RESULT_SUCCESS; diff --git a/unified-runtime/source/adapters/offload/ur_interface_loader.cpp b/unified-runtime/source/adapters/offload/ur_interface_loader.cpp index 46db10a7325bd..f65781361e35e 100644 --- a/unified-runtime/source/adapters/offload/ur_interface_loader.cpp +++ b/unified-runtime/source/adapters/offload/ur_interface_loader.cpp @@ -140,6 +140,8 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelProcAddrTable( pDdiTable->pfnSetExecInfo = urKernelSetExecInfo; pDdiTable->pfnSetSpecializationConstants = urKernelSetSpecializationConstants; 
pDdiTable->pfnGetSuggestedLocalWorkSize = urKernelGetSuggestedLocalWorkSize; + pDdiTable->pfnGetSuggestedLocalWorkSizeWithArgs = + urKernelGetSuggestedLocalWorkSizeWithArgs; return UR_RESULT_SUCCESS; } diff --git a/unified-runtime/source/adapters/opencl/ur_interface_loader.cpp b/unified-runtime/source/adapters/opencl/ur_interface_loader.cpp index c753ce498cb40..a9c0166255d50 100644 --- a/unified-runtime/source/adapters/opencl/ur_interface_loader.cpp +++ b/unified-runtime/source/adapters/opencl/ur_interface_loader.cpp @@ -142,6 +142,8 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelProcAddrTable( pDdiTable->pfnSetExecInfo = urKernelSetExecInfo; pDdiTable->pfnSetSpecializationConstants = urKernelSetSpecializationConstants; pDdiTable->pfnGetSuggestedLocalWorkSize = urKernelGetSuggestedLocalWorkSize; + pDdiTable->pfnGetSuggestedLocalWorkSizeWithArgs = + urKernelGetSuggestedLocalWorkSizeWithArgs; pDdiTable->pfnSuggestMaxCooperativeGroupCount = urKernelSuggestMaxCooperativeGroupCount; return UR_RESULT_SUCCESS; diff --git a/unified-runtime/source/loader/layers/sanitizer/asan/asan_interceptor.cpp b/unified-runtime/source/loader/layers/sanitizer/asan/asan_interceptor.cpp index 234fb4358ad10..b0cc5f1904afd 100644 --- a/unified-runtime/source/loader/layers/sanitizer/asan/asan_interceptor.cpp +++ b/unified-runtime/source/loader/layers/sanitizer/asan/asan_interceptor.cpp @@ -724,8 +724,8 @@ AsanInterceptor::getMemBuffer(ur_mem_handle_t MemHandle) { ur_result_t AsanInterceptor::prepareLaunch( std::shared_ptr &ContextInfo, std::shared_ptr &DeviceInfo, ur_queue_handle_t Queue, - ur_kernel_handle_t Kernel, LaunchInfo &LaunchInfo, uint32_t numArgs, - const ur_exp_kernel_arg_properties_t *pArgs) { + ur_kernel_handle_t Kernel, LaunchInfo &LaunchInfo, uint32_t, + const ur_exp_kernel_arg_properties_t *) { auto &KernelInfo = getOrCreateKernelInfo(Kernel); std::shared_lock Guard(KernelInfo.Mutex); @@ -805,7 +805,8 @@ ur_result_t AsanInterceptor::prepareLaunch( 
getContext()->urDdiTable.Kernel.pfnGetSuggestedLocalWorkSizeWithArgs( Kernel, Queue, LaunchInfo.WorkDim, LaunchInfo.GlobalWorkOffset.data(), LaunchInfo.GlobalWorkSize, - numArgs, pArgs, LaunchInfo.LocalWorkSize.data()); + ArgNums, KernelInfo.ArgProps.data(), + LaunchInfo.LocalWorkSize.data()); if (URes != UR_RESULT_SUCCESS) { if (URes != UR_RESULT_ERROR_UNSUPPORTED_FEATURE) { return URes; From 3c873c6775366261797732bfa025a569da002b22 Mon Sep 17 00:00:00 2001 From: Lukasz Dorau Date: Wed, 18 Mar 2026 15:23:25 +0000 Subject: [PATCH 6/9] Remove extra args --- .../loader/layers/sanitizer/asan/asan_ddi.cpp | 6 ++--- .../sanitizer/asan/asan_interceptor.cpp | 11 +++++----- .../sanitizer/asan/asan_interceptor.hpp | 7 ++---- .../loader/layers/sanitizer/msan/msan_ddi.cpp | 6 ++--- .../sanitizer/msan/msan_interceptor.cpp | 16 +++++++------- .../sanitizer/msan/msan_interceptor.hpp | 7 ++---- .../loader/layers/sanitizer/tsan/tsan_ddi.cpp | 6 ++--- .../sanitizer/tsan/tsan_interceptor.cpp | 22 ++++++++++--------- .../sanitizer/tsan/tsan_interceptor.hpp | 7 ++---- 9 files changed, 37 insertions(+), 51 deletions(-) diff --git a/unified-runtime/source/loader/layers/sanitizer/asan/asan_ddi.cpp b/unified-runtime/source/loader/layers/sanitizer/asan/asan_ddi.cpp index a15040f33a385..e273680ea75d5 100644 --- a/unified-runtime/source/loader/layers/sanitizer/asan/asan_ddi.cpp +++ b/unified-runtime/source/loader/layers/sanitizer/asan/asan_ddi.cpp @@ -539,8 +539,7 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunch( pLocalWorkSize, pGlobalWorkOffset, workDim); UR_CALL(LaunchInfo.Data.syncToDevice(hQueue)); - UR_CALL(getAsanInterceptor()->preLaunchKernel(hKernel, hQueue, LaunchInfo, 0, - nullptr)); + UR_CALL(getAsanInterceptor()->preLaunchKernel(hKernel, hQueue, LaunchInfo)); ur_result_t UrRes = getContext()->urDdiTable.Enqueue.pfnKernelLaunch( hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, @@ -1806,8 +1805,7 @@ __urdlllocal ur_result_t UR_APICALL 
urEnqueueKernelLaunchWithArgsExp( pLocalWorkSize, pGlobalWorkOffset, workDim); UR_CALL(LaunchInfo.Data.syncToDevice(hQueue)); - UR_CALL(getAsanInterceptor()->preLaunchKernel(hKernel, hQueue, LaunchInfo, - numArgs, pArgs)); + UR_CALL(getAsanInterceptor()->preLaunchKernel(hKernel, hQueue, LaunchInfo)); UR_CALL(getContext()->urDdiTable.EnqueueExp.pfnKernelLaunchWithArgsExp( hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, diff --git a/unified-runtime/source/loader/layers/sanitizer/asan/asan_interceptor.cpp b/unified-runtime/source/loader/layers/sanitizer/asan/asan_interceptor.cpp index b0cc5f1904afd..62d21d6e1bf66 100644 --- a/unified-runtime/source/loader/layers/sanitizer/asan/asan_interceptor.cpp +++ b/unified-runtime/source/loader/layers/sanitizer/asan/asan_interceptor.cpp @@ -246,9 +246,9 @@ ur_result_t AsanInterceptor::releaseMemory(ur_context_handle_t Context, return UR_RESULT_SUCCESS; } -ur_result_t AsanInterceptor::preLaunchKernel( - ur_kernel_handle_t Kernel, ur_queue_handle_t Queue, LaunchInfo &LaunchInfo, - uint32_t numArgs, const ur_exp_kernel_arg_properties_t *pArgs) { +ur_result_t AsanInterceptor::preLaunchKernel(ur_kernel_handle_t Kernel, + ur_queue_handle_t Queue, + LaunchInfo &LaunchInfo) { auto Context = GetContext(Queue); auto Device = GetDevice(Queue); auto ContextInfo = getContextInfo(Context); @@ -271,7 +271,7 @@ ur_result_t AsanInterceptor::preLaunchKernel( UR_CALL(updateShadowMemory(DeviceInfo, InternalQueue)); UR_CALL(prepareLaunch(ContextInfo, DeviceInfo, InternalQueue, Kernel, - LaunchInfo, numArgs, pArgs)); + LaunchInfo)); UR_CALL(getContext()->urDdiTable.Queue.pfnFinish(InternalQueue)); @@ -724,8 +724,7 @@ AsanInterceptor::getMemBuffer(ur_mem_handle_t MemHandle) { ur_result_t AsanInterceptor::prepareLaunch( std::shared_ptr &ContextInfo, std::shared_ptr &DeviceInfo, ur_queue_handle_t Queue, - ur_kernel_handle_t Kernel, LaunchInfo &LaunchInfo, uint32_t, - const ur_exp_kernel_arg_properties_t *) { + ur_kernel_handle_t Kernel, 
LaunchInfo &LaunchInfo) { auto &KernelInfo = getOrCreateKernelInfo(Kernel); std::shared_lock Guard(KernelInfo.Mutex); diff --git a/unified-runtime/source/loader/layers/sanitizer/asan/asan_interceptor.hpp b/unified-runtime/source/loader/layers/sanitizer/asan/asan_interceptor.hpp index ba27ec4d3e7df..fe262f955b6a7 100644 --- a/unified-runtime/source/loader/layers/sanitizer/asan/asan_interceptor.hpp +++ b/unified-runtime/source/loader/layers/sanitizer/asan/asan_interceptor.hpp @@ -312,9 +312,7 @@ class AsanInterceptor { ur_result_t unregisterProgram(ur_program_handle_t Program); ur_result_t preLaunchKernel(ur_kernel_handle_t Kernel, - ur_queue_handle_t Queue, LaunchInfo &LaunchInfo, - uint32_t numArgs, - const ur_exp_kernel_arg_properties_t *pArgs); + ur_queue_handle_t Queue, LaunchInfo &LaunchInfo); ur_result_t postLaunchKernel(ur_kernel_handle_t Kernel, ur_queue_handle_t Queue, LaunchInfo &LaunchInfo); @@ -396,8 +394,7 @@ class AsanInterceptor { ur_result_t prepareLaunch(std::shared_ptr &ContextInfo, std::shared_ptr &DeviceInfo, ur_queue_handle_t Queue, ur_kernel_handle_t Kernel, - LaunchInfo &LaunchInfo, uint32_t numArgs, - const ur_exp_kernel_arg_properties_t *pArgs); + LaunchInfo &LaunchInfo); ur_result_t registerDeviceGlobals(ur_program_handle_t Program); ur_result_t registerSpirKernels(ur_program_handle_t Program); diff --git a/unified-runtime/source/loader/layers/sanitizer/msan/msan_ddi.cpp b/unified-runtime/source/loader/layers/sanitizer/msan/msan_ddi.cpp index fb1d04ae0dd23..00eda07ce10da 100644 --- a/unified-runtime/source/loader/layers/sanitizer/msan/msan_ddi.cpp +++ b/unified-runtime/source/loader/layers/sanitizer/msan/msan_ddi.cpp @@ -503,8 +503,7 @@ ur_result_t urEnqueueKernelLaunch( LaunchInfo LaunchInfo(GetContext(hQueue), GetDevice(hQueue), pGlobalWorkSize, pLocalWorkSize, pGlobalWorkOffset, workDim); - UR_CALL(getMsanInterceptor()->preLaunchKernel(hKernel, hQueue, LaunchInfo, 0, - nullptr)); + UR_CALL(getMsanInterceptor()->preLaunchKernel(hKernel, 
hQueue, LaunchInfo)); UR_CALL(getContext()->urDdiTable.Enqueue.pfnKernelLaunch( hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, @@ -1947,8 +1946,7 @@ ur_result_t urEnqueueKernelLaunchWithArgsExp( LaunchInfo LaunchInfo(GetContext(hQueue), GetDevice(hQueue), pGlobalWorkSize, pLocalWorkSize, pGlobalWorkOffset, workDim); - UR_CALL(getMsanInterceptor()->preLaunchKernel(hKernel, hQueue, LaunchInfo, - numArgs, pArgs)); + UR_CALL(getMsanInterceptor()->preLaunchKernel(hKernel, hQueue, LaunchInfo)); UR_CALL(getContext()->urDdiTable.EnqueueExp.pfnKernelLaunchWithArgsExp( hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, diff --git a/unified-runtime/source/loader/layers/sanitizer/msan/msan_interceptor.cpp b/unified-runtime/source/loader/layers/sanitizer/msan/msan_interceptor.cpp index 3cce8d460129f..b9b1cdab08241 100644 --- a/unified-runtime/source/loader/layers/sanitizer/msan/msan_interceptor.cpp +++ b/unified-runtime/source/loader/layers/sanitizer/msan/msan_interceptor.cpp @@ -135,9 +135,9 @@ ur_result_t MsanInterceptor::releaseMemory(ur_context_handle_t Context, return getContext()->urDdiTable.USM.pfnFree(Context, Ptr); } -ur_result_t MsanInterceptor::preLaunchKernel( - ur_kernel_handle_t Kernel, ur_queue_handle_t Queue, LaunchInfo &LaunchInfo, - uint32_t numArgs, const ur_exp_kernel_arg_properties_t *pArgs) { +ur_result_t MsanInterceptor::preLaunchKernel(ur_kernel_handle_t Kernel, + ur_queue_handle_t Queue, + LaunchInfo &LaunchInfo) { auto Context = GetContext(Queue); auto Device = GetDevice(Queue); auto ContextInfo = getContextInfo(Context); @@ -149,8 +149,7 @@ ur_result_t MsanInterceptor::preLaunchKernel( return UR_RESULT_ERROR_INVALID_QUEUE; } - UR_CALL(prepareLaunch(DeviceInfo, InternalQueue, Kernel, LaunchInfo, numArgs, - pArgs)); + UR_CALL(prepareLaunch(DeviceInfo, InternalQueue, Kernel, LaunchInfo)); return UR_RESULT_SUCCESS; } @@ -459,8 +458,7 @@ MsanInterceptor::getMemBuffer(ur_mem_handle_t MemHandle) { ur_result_t 
MsanInterceptor::prepareLaunch( std::shared_ptr &DeviceInfo, ur_queue_handle_t Queue, - ur_kernel_handle_t Kernel, LaunchInfo &LaunchInfo, uint32_t numArgs, - const ur_exp_kernel_arg_properties_t *pArgs) { + ur_kernel_handle_t Kernel, LaunchInfo &LaunchInfo) { auto Program = GetProgram(Kernel); // Set membuffer arguments @@ -511,11 +509,13 @@ ur_result_t MsanInterceptor::prepareLaunch( if (LaunchInfo.LocalWorkSize.empty()) { LaunchInfo.LocalWorkSize.resize(LaunchInfo.WorkDim); + auto ArgNums = GetKernelNumArgs(Kernel); auto URes = getContext()->urDdiTable.Kernel.pfnGetSuggestedLocalWorkSizeWithArgs( Kernel, Queue, LaunchInfo.WorkDim, LaunchInfo.GlobalWorkOffset.data(), LaunchInfo.GlobalWorkSize, - numArgs, pArgs, LaunchInfo.LocalWorkSize.data()); + ArgNums, KernelInfo.ArgProps.data(), + LaunchInfo.LocalWorkSize.data()); if (URes != UR_RESULT_SUCCESS) { if (URes != UR_RESULT_ERROR_UNSUPPORTED_FEATURE) { return URes; diff --git a/unified-runtime/source/loader/layers/sanitizer/msan/msan_interceptor.hpp b/unified-runtime/source/loader/layers/sanitizer/msan/msan_interceptor.hpp index 10a32fa200f61..574bf4f8013f2 100644 --- a/unified-runtime/source/loader/layers/sanitizer/msan/msan_interceptor.hpp +++ b/unified-runtime/source/loader/layers/sanitizer/msan/msan_interceptor.hpp @@ -295,9 +295,7 @@ class MsanInterceptor { ur_result_t unregisterProgram(ur_program_handle_t Program); ur_result_t preLaunchKernel(ur_kernel_handle_t Kernel, - ur_queue_handle_t Queue, LaunchInfo &LaunchInfo, - uint32_t numArgs, - const ur_exp_kernel_arg_properties_t *pArgs); + ur_queue_handle_t Queue, LaunchInfo &LaunchInfo); ur_result_t postLaunchKernel(ur_kernel_handle_t Kernel, ur_queue_handle_t Queue, LaunchInfo &LaunchInfo); @@ -360,8 +358,7 @@ class MsanInterceptor { /// Initialize Global Variables & Kernel Name at first Launch ur_result_t prepareLaunch(std::shared_ptr &DeviceInfo, ur_queue_handle_t Queue, ur_kernel_handle_t Kernel, - LaunchInfo &LaunchInfo, uint32_t numArgs, - const 
ur_exp_kernel_arg_properties_t *pArgs); + LaunchInfo &LaunchInfo); ur_result_t allocShadowMemory(ur_context_handle_t Context, std::shared_ptr &DeviceInfo); diff --git a/unified-runtime/source/loader/layers/sanitizer/tsan/tsan_ddi.cpp b/unified-runtime/source/loader/layers/sanitizer/tsan/tsan_ddi.cpp index 02a38935ed0c2..a5d7db6ad72cb 100644 --- a/unified-runtime/source/loader/layers/sanitizer/tsan/tsan_ddi.cpp +++ b/unified-runtime/source/loader/layers/sanitizer/tsan/tsan_ddi.cpp @@ -1347,8 +1347,7 @@ ur_result_t urEnqueueKernelLaunch( LaunchInfo LaunchInfo(GetContext(hQueue), GetDevice(hQueue), pGlobalWorkSize, pLocalWorkSize, pGlobalWorkOffset, workDim); - UR_CALL(getTsanInterceptor()->preLaunchKernel(hKernel, hQueue, LaunchInfo, 0, - nullptr)); + UR_CALL(getTsanInterceptor()->preLaunchKernel(hKernel, hQueue, LaunchInfo)); UR_CALL(getContext()->urDdiTable.Enqueue.pfnKernelLaunch( hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, @@ -1473,8 +1472,7 @@ ur_result_t urEnqueueKernelLaunchWithArgsExp( LaunchInfo LaunchInfo(GetContext(hQueue), GetDevice(hQueue), pGlobalWorkSize, pLocalWorkSize, pGlobalWorkOffset, workDim); - UR_CALL(getTsanInterceptor()->preLaunchKernel(hKernel, hQueue, LaunchInfo, - numArgs, pArgs)); + UR_CALL(getTsanInterceptor()->preLaunchKernel(hKernel, hQueue, LaunchInfo)); UR_CALL(getContext()->urDdiTable.EnqueueExp.pfnKernelLaunchWithArgsExp( hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, diff --git a/unified-runtime/source/loader/layers/sanitizer/tsan/tsan_interceptor.cpp b/unified-runtime/source/loader/layers/sanitizer/tsan/tsan_interceptor.cpp index b7c9e58383a23..e4e6ce0df6360 100644 --- a/unified-runtime/source/loader/layers/sanitizer/tsan/tsan_interceptor.cpp +++ b/unified-runtime/source/loader/layers/sanitizer/tsan/tsan_interceptor.cpp @@ -336,16 +336,15 @@ TsanInterceptor::getMemBuffer(ur_mem_handle_t MemHandle) { return nullptr; } -ur_result_t TsanInterceptor::preLaunchKernel( - ur_kernel_handle_t Kernel, 
ur_queue_handle_t Queue, LaunchInfo &LaunchInfo, - uint32_t numArgs, const ur_exp_kernel_arg_properties_t *pArgs) { +ur_result_t TsanInterceptor::preLaunchKernel(ur_kernel_handle_t Kernel, + ur_queue_handle_t Queue, + LaunchInfo &LaunchInfo) { auto CI = getContextInfo(GetContext(Queue)); auto DI = getDeviceInfo(GetDevice(Queue)); ur_queue_handle_t InternalQueue = CI->getInternalQueue(DI->Handle); - UR_CALL( - prepareLaunch(CI, DI, InternalQueue, Kernel, LaunchInfo, numArgs, pArgs)); + UR_CALL(prepareLaunch(CI, DI, InternalQueue, Kernel, LaunchInfo)); UR_CALL(updateShadowMemory(DI, Kernel, InternalQueue)); @@ -374,10 +373,11 @@ ur_result_t TsanInterceptor::postLaunchKernel(ur_kernel_handle_t Kernel, return UR_RESULT_SUCCESS; } -ur_result_t TsanInterceptor::prepareLaunch( - std::shared_ptr &, std::shared_ptr &DI, - ur_queue_handle_t Queue, ur_kernel_handle_t Kernel, LaunchInfo &LaunchInfo, - uint32_t numArgs, const ur_exp_kernel_arg_properties_t *pArgs) { +ur_result_t TsanInterceptor::prepareLaunch(std::shared_ptr &, + std::shared_ptr &DI, + ur_queue_handle_t Queue, + ur_kernel_handle_t Kernel, + LaunchInfo &LaunchInfo) { // Set membuffer arguments auto &KernelInfo = getKernelInfo(Kernel); { @@ -399,11 +399,13 @@ ur_result_t TsanInterceptor::prepareLaunch( // Get suggested local work size if user doesn't determine it. 
if (LaunchInfo.LocalWorkSize.empty()) { LaunchInfo.LocalWorkSize.resize(LaunchInfo.WorkDim); + auto ArgNums = GetKernelNumArgs(Kernel); auto URes = getContext()->urDdiTable.Kernel.pfnGetSuggestedLocalWorkSizeWithArgs( Kernel, Queue, LaunchInfo.WorkDim, LaunchInfo.GlobalWorkOffset.data(), LaunchInfo.GlobalWorkSize, - numArgs, pArgs, LaunchInfo.LocalWorkSize.data()); + ArgNums, KernelInfo.ArgProps.data(), + LaunchInfo.LocalWorkSize.data()); if (URes != UR_RESULT_SUCCESS) { if (URes != UR_RESULT_ERROR_UNSUPPORTED_FEATURE) { return URes; diff --git a/unified-runtime/source/loader/layers/sanitizer/tsan/tsan_interceptor.hpp b/unified-runtime/source/loader/layers/sanitizer/tsan/tsan_interceptor.hpp index 16c618351f149..d454993dae6a1 100644 --- a/unified-runtime/source/loader/layers/sanitizer/tsan/tsan_interceptor.hpp +++ b/unified-runtime/source/loader/layers/sanitizer/tsan/tsan_interceptor.hpp @@ -274,9 +274,7 @@ class TsanInterceptor { } ur_result_t preLaunchKernel(ur_kernel_handle_t Kernel, - ur_queue_handle_t Queue, LaunchInfo &LaunchInfo, - uint32_t numArgs, - const ur_exp_kernel_arg_properties_t *pArgs); + ur_queue_handle_t Queue, LaunchInfo &LaunchInfo); ur_result_t postLaunchKernel(ur_kernel_handle_t Kernel, ur_queue_handle_t Queue, LaunchInfo &LaunchInfo); @@ -315,8 +313,7 @@ class TsanInterceptor { ur_result_t prepareLaunch(std::shared_ptr &CI, std::shared_ptr &DI, ur_queue_handle_t Queue, ur_kernel_handle_t Kernel, - LaunchInfo &LaunchInfo, uint32_t numArgs, - const ur_exp_kernel_arg_properties_t *pArgs); + LaunchInfo &LaunchInfo); ur_result_t registerDeviceGlobals(ur_program_handle_t Program); From e83a3a7e44b582924afe1aa4f6e1e732589b6193 Mon Sep 17 00:00:00 2001 From: Lukasz Dorau Date: Wed, 18 Mar 2026 15:51:57 +0000 Subject: [PATCH 7/9] Fix UR spec --- unified-runtime/include/unified-runtime/ur_api.h | 8 ++++---- unified-runtime/include/unified-runtime/ur_print.hpp | 6 +++--- unified-runtime/scripts/core/kernel.yml | 4 +++- 
unified-runtime/scripts/core/registry.yml | 8 ++++---- .../source/loader/layers/validation/ur_valddi.cpp | 3 +++ unified-runtime/source/loader/ur_libapi.cpp | 4 ++-- unified-runtime/source/ur_api.cpp | 4 ++-- 7 files changed, 21 insertions(+), 16 deletions(-) diff --git a/unified-runtime/include/unified-runtime/ur_api.h b/unified-runtime/include/unified-runtime/ur_api.h index 6e750d761549d..f4042ec192b79 100644 --- a/unified-runtime/include/unified-runtime/ur_api.h +++ b/unified-runtime/include/unified-runtime/ur_api.h @@ -377,8 +377,6 @@ typedef enum ur_function_t { UR_FUNCTION_COMMAND_BUFFER_GET_INFO_EXP = 221, /// Enumerator for ::urEnqueueTimestampRecordingExp UR_FUNCTION_ENQUEUE_TIMESTAMP_RECORDING_EXP = 223, - /// Enumerator for ::urKernelGetSuggestedLocalWorkSizeWithArgs - UR_FUNCTION_KERNEL_GET_SUGGESTED_LOCAL_WORK_SIZE_WITH_ARGS = 224, /// Enumerator for ::urKernelGetSuggestedLocalWorkSize UR_FUNCTION_KERNEL_GET_SUGGESTED_LOCAL_WORK_SIZE = 225, /// Enumerator for ::urBindlessImagesImportExternalMemoryExp @@ -517,6 +515,8 @@ typedef enum ur_function_t { UR_FUNCTION_ENQUEUE_HOST_TASK_EXP = 309, /// Enumerator for ::urCommandBufferAppendKernelLaunchWithArgsExp UR_FUNCTION_COMMAND_BUFFER_APPEND_KERNEL_LAUNCH_WITH_ARGS_EXP = 310, + /// Enumerator for ::urKernelGetSuggestedLocalWorkSizeWithArgs + UR_FUNCTION_KERNEL_GET_SUGGESTED_LOCAL_WORK_SIZE_WITH_ARGS = 311, /// @cond UR_FUNCTION_FORCE_UINT32 = 0x7fffffff /// @endcond @@ -9504,8 +9504,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSize( size_t *pSuggestedLocalWorkSize); /////////////////////////////////////////////////////////////////////////////// -/// @brief Get the suggested local work size for a kernel and set args at kernel -/// launch time. +/// @brief Set kernel args and get the suggested local work size for a kernel. 
/// /// @details /// - Query a suggested local work size for a kernel given a global size for @@ -9525,6 +9524,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSize( /// + `NULL == pGlobalWorkOffset` /// + `NULL == pGlobalWorkSize` /// + `NULL == pSuggestedLocalWorkSize` +/// + `pArgs == NULL && numArgs > 0` /// - ::UR_RESULT_ERROR_INVALID_ENUMERATION /// + `NULL != pArgs && ::UR_EXP_KERNEL_ARG_TYPE_SAMPLER < pArgs->type` /// - ::UR_RESULT_ERROR_UNSUPPORTED_FEATURE diff --git a/unified-runtime/include/unified-runtime/ur_print.hpp b/unified-runtime/include/unified-runtime/ur_print.hpp index 75a37a9236df6..35736578de1a6 100644 --- a/unified-runtime/include/unified-runtime/ur_print.hpp +++ b/unified-runtime/include/unified-runtime/ur_print.hpp @@ -1165,9 +1165,6 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_function_t value) { case UR_FUNCTION_ENQUEUE_TIMESTAMP_RECORDING_EXP: os << "UR_FUNCTION_ENQUEUE_TIMESTAMP_RECORDING_EXP"; break; - case UR_FUNCTION_KERNEL_GET_SUGGESTED_LOCAL_WORK_SIZE_WITH_ARGS: - os << "UR_FUNCTION_KERNEL_GET_SUGGESTED_LOCAL_WORK_SIZE_WITH_ARGS"; - break; case UR_FUNCTION_KERNEL_GET_SUGGESTED_LOCAL_WORK_SIZE: os << "UR_FUNCTION_KERNEL_GET_SUGGESTED_LOCAL_WORK_SIZE"; break; @@ -1376,6 +1373,9 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_function_t value) { case UR_FUNCTION_COMMAND_BUFFER_APPEND_KERNEL_LAUNCH_WITH_ARGS_EXP: os << "UR_FUNCTION_COMMAND_BUFFER_APPEND_KERNEL_LAUNCH_WITH_ARGS_EXP"; break; + case UR_FUNCTION_KERNEL_GET_SUGGESTED_LOCAL_WORK_SIZE_WITH_ARGS: + os << "UR_FUNCTION_KERNEL_GET_SUGGESTED_LOCAL_WORK_SIZE_WITH_ARGS"; + break; default: os << "unknown enumerator"; break; diff --git a/unified-runtime/scripts/core/kernel.yml b/unified-runtime/scripts/core/kernel.yml index 70ba4ebf76c7f..6a0ab3600ca61 100644 --- a/unified-runtime/scripts/core/kernel.yml +++ b/unified-runtime/scripts/core/kernel.yml @@ -602,7 +602,7 @@ returns: - $X_RESULT_ERROR_UNSUPPORTED_FEATURE --- 
#-------------------------------------------------------------------------- type: function -desc: "Get the suggested local work size for a kernel and set args at kernel launch time." +desc: "Set kernel args and get the suggested local work size for a kernel." class: $xKernel name: GetSuggestedLocalWorkSizeWithArgs ordinal: "0" @@ -646,6 +646,8 @@ params: [out] pointer to an array of numWorkDim unsigned values that specify suggested local work size that will contain the result of the query returns: + - $X_RESULT_ERROR_INVALID_NULL_POINTER: + - "`pArgs == NULL && numArgs > 0`" - $X_RESULT_ERROR_UNSUPPORTED_FEATURE --- #-------------------------------------------------------------------------- type: function diff --git a/unified-runtime/scripts/core/registry.yml b/unified-runtime/scripts/core/registry.yml index 93757c2dae27b..288469a2f0d37 100644 --- a/unified-runtime/scripts/core/registry.yml +++ b/unified-runtime/scripts/core/registry.yml @@ -523,9 +523,6 @@ etors: - name: ENQUEUE_TIMESTAMP_RECORDING_EXP desc: Enumerator for $xEnqueueTimestampRecordingExp value: '223' -- name: KERNEL_GET_SUGGESTED_LOCAL_WORK_SIZE_WITH_ARGS - desc: Enumerator for $xKernelGetSuggestedLocalWorkSizeWithArgs - value: '224' - name: KERNEL_GET_SUGGESTED_LOCAL_WORK_SIZE desc: Enumerator for $xKernelGetSuggestedLocalWorkSize value: '225' @@ -733,7 +730,10 @@ etors: - name: COMMAND_BUFFER_APPEND_KERNEL_LAUNCH_WITH_ARGS_EXP desc: Enumerator for $xCommandBufferAppendKernelLaunchWithArgsExp value: '310' -max_id: '310' +- name: KERNEL_GET_SUGGESTED_LOCAL_WORK_SIZE_WITH_ARGS + desc: Enumerator for $xKernelGetSuggestedLocalWorkSizeWithArgs + value: '311' +max_id: '311' --- type: enum desc: Defines structure types diff --git a/unified-runtime/source/loader/layers/validation/ur_valddi.cpp b/unified-runtime/source/loader/layers/validation/ur_valddi.cpp index cc28eb9c34ff0..66c7e1c48d8b3 100644 --- a/unified-runtime/source/loader/layers/validation/ur_valddi.cpp +++ 
b/unified-runtime/source/loader/layers/validation/ur_valddi.cpp @@ -6929,6 +6929,9 @@ __urdlllocal ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSizeWithArgs( if (NULL == pSuggestedLocalWorkSize) return UR_RESULT_ERROR_INVALID_NULL_POINTER; + if (pArgs == NULL && numArgs > 0) + return UR_RESULT_ERROR_INVALID_NULL_POINTER; + if (NULL == hKernel) return UR_RESULT_ERROR_INVALID_NULL_HANDLE; diff --git a/unified-runtime/source/loader/ur_libapi.cpp b/unified-runtime/source/loader/ur_libapi.cpp index 2a1050ef459d6..4ff02f2a429a1 100644 --- a/unified-runtime/source/loader/ur_libapi.cpp +++ b/unified-runtime/source/loader/ur_libapi.cpp @@ -6894,8 +6894,7 @@ ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSize( } /////////////////////////////////////////////////////////////////////////////// -/// @brief Get the suggested local work size for a kernel and set args at kernel -/// launch time. +/// @brief Set kernel args and get the suggested local work size for a kernel. /// /// @details /// - Query a suggested local work size for a kernel given a global size for @@ -6915,6 +6914,7 @@ ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSize( /// + `NULL == pGlobalWorkOffset` /// + `NULL == pGlobalWorkSize` /// + `NULL == pSuggestedLocalWorkSize` +/// + `pArgs == NULL && numArgs > 0` /// - ::UR_RESULT_ERROR_INVALID_ENUMERATION /// + `NULL != pArgs && ::UR_EXP_KERNEL_ARG_TYPE_SAMPLER < pArgs->type` /// - ::UR_RESULT_ERROR_UNSUPPORTED_FEATURE diff --git a/unified-runtime/source/ur_api.cpp b/unified-runtime/source/ur_api.cpp index 8c3d8d35bf329..a8928e80c4753 100644 --- a/unified-runtime/source/ur_api.cpp +++ b/unified-runtime/source/ur_api.cpp @@ -6066,8 +6066,7 @@ ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSize( } /////////////////////////////////////////////////////////////////////////////// -/// @brief Get the suggested local work size for a kernel and set args at kernel -/// launch time. 
+/// @brief Set kernel args and get the suggested local work size for a kernel. /// /// @details /// - Query a suggested local work size for a kernel given a global size for @@ -6087,6 +6086,7 @@ ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSize( /// + `NULL == pGlobalWorkOffset` /// + `NULL == pGlobalWorkSize` /// + `NULL == pSuggestedLocalWorkSize` +/// + `pArgs == NULL && numArgs > 0` /// - ::UR_RESULT_ERROR_INVALID_ENUMERATION /// + `NULL != pArgs && ::UR_EXP_KERNEL_ARG_TYPE_SAMPLER < pArgs->type` /// - ::UR_RESULT_ERROR_UNSUPPORTED_FEATURE From 3a84c1ebf9611f5d7c68c45d2724053514663609 Mon Sep 17 00:00:00 2001 From: Lukasz Dorau Date: Fri, 6 Mar 2026 08:54:54 +0000 Subject: [PATCH 8/9] [UR] Implement urKernelGetSuggestedLocalWorkSizeWithArgs() for OpenCL Signed-off-by: Lukasz Dorau --- .../source/adapters/opencl/kernel.cpp | 48 ++++++++++++++++++- 1 file changed, 46 insertions(+), 2 deletions(-) diff --git a/unified-runtime/source/adapters/opencl/kernel.cpp b/unified-runtime/source/adapters/opencl/kernel.cpp index 2ccc58077f3fb..10a7f9055dfa8 100644 --- a/unified-runtime/source/adapters/opencl/kernel.cpp +++ b/unified-runtime/source/adapters/opencl/kernel.cpp @@ -532,9 +532,53 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSize( UR_APIEXPORT ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSizeWithArgs( ur_kernel_handle_t hKernel, ur_queue_handle_t hQueue, uint32_t workDim, const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, - [[maybe_unused]] uint32_t numArgs, - [[maybe_unused]] const ur_exp_kernel_arg_properties_t *pArgs, + uint32_t numArgs, const ur_exp_kernel_arg_properties_t *pArgs, size_t *pSuggestedLocalWorkSize) { + + clSetKernelArgMemPointerINTEL_fn SetKernelArgMemPointerPtr = nullptr; + UR_RETURN_ON_FAILURE( + cl_ext::getExtFuncFromContext( + hKernel->Context->CLContext, + ur::cl::getAdapter()->fnCache.clSetKernelArgMemPointerINTELCache, + cl_ext::SetKernelArgMemPointerName, &SetKernelArgMemPointerPtr)); + + 
for (uint32_t i = 0; i < numArgs; i++) { + switch (pArgs[i].type) { + case UR_EXP_KERNEL_ARG_TYPE_LOCAL: + CL_RETURN_ON_FAILURE(clSetKernelArg(hKernel->CLKernel, + static_cast(pArgs[i].index), + pArgs[i].size, nullptr)); + break; + case UR_EXP_KERNEL_ARG_TYPE_VALUE: + CL_RETURN_ON_FAILURE(clSetKernelArg(hKernel->CLKernel, + static_cast(pArgs[i].index), + pArgs[i].size, pArgs[i].value.value)); + break; + case UR_EXP_KERNEL_ARG_TYPE_MEM_OBJ: { + cl_mem mem = pArgs[i].value.memObjTuple.hMem + ? pArgs[i].value.memObjTuple.hMem->CLMemory + : nullptr; + CL_RETURN_ON_FAILURE(clSetKernelArg(hKernel->CLKernel, + static_cast(pArgs[i].index), + pArgs[i].size, &mem)); + break; + } + case UR_EXP_KERNEL_ARG_TYPE_POINTER: + CL_RETURN_ON_FAILURE(SetKernelArgMemPointerPtr( + hKernel->CLKernel, static_cast(pArgs[i].index), + pArgs[i].value.pointer)); + break; + case UR_EXP_KERNEL_ARG_TYPE_SAMPLER: { + CL_RETURN_ON_FAILURE(clSetKernelArg( + hKernel->CLKernel, static_cast(pArgs[i].index), + pArgs[i].size, &pArgs[i].value.sampler->CLSampler)); + break; + } + default: + return UR_RESULT_ERROR_INVALID_ENUMERATION; + } + } + return urKernelGetSuggestedLocalWorkSize(hKernel, hQueue, workDim, pGlobalWorkOffset, pGlobalWorkSize, pSuggestedLocalWorkSize); From bdbe295c5959fbe603d4eab3151b87fbbf473def Mon Sep 17 00:00:00 2001 From: Lukasz Dorau Date: Thu, 19 Mar 2026 08:41:51 +0000 Subject: [PATCH 9/9] [UR] Do not set kernel args in intercept functions for urEnqueueKernelLaunchWithArgsExp() Signed-off-by: Lukasz Dorau --- .../loader/layers/sanitizer/asan/asan_ddi.cpp | 57 ------------------- 1 file changed, 57 deletions(-) diff --git a/unified-runtime/source/loader/layers/sanitizer/asan/asan_ddi.cpp b/unified-runtime/source/loader/layers/sanitizer/asan/asan_ddi.cpp index e273680ea75d5..dc4f01005fa50 100644 --- a/unified-runtime/source/loader/layers/sanitizer/asan/asan_ddi.cpp +++ b/unified-runtime/source/loader/layers/sanitizer/asan/asan_ddi.cpp @@ -1744,63 +1744,6 @@ __urdlllocal ur_result_t 
UR_APICALL urEnqueueKernelLaunchWithArgsExp( std::memcpy(KernelInfo.ArgProps.data(), pArgs, numArgs * sizeof(ur_exp_kernel_arg_properties_t)); - // We need to set all the args now rather than letting LaunchWithArgs handle - // them. This is because some implementations of - // urKernelGetSuggestedLocalWorkSize, which is used in preLaunchKernel, rely - // on all the args being set. - for (uint32_t ArgPropIndex = 0; ArgPropIndex < numArgs; ArgPropIndex++) { - switch (pArgs[ArgPropIndex].type) { - case UR_EXP_KERNEL_ARG_TYPE_LOCAL: { - UR_CALL(ur_sanitizer_layer::asan::urKernelSetArgLocal( - hKernel, pArgs[ArgPropIndex].index, pArgs[ArgPropIndex].size, - nullptr)); - KernelInfo.ArgProps[ArgPropIndex].size = - KernelInfo.LocalArgs[ArgPropIndex].SizeWithRedZone; - break; - } - case UR_EXP_KERNEL_ARG_TYPE_POINTER: { - UR_CALL(ur_sanitizer_layer::asan::urKernelSetArgPointer( - hKernel, pArgs[ArgPropIndex].index, nullptr, - pArgs[ArgPropIndex].value.pointer)); - break; - } - case UR_EXP_KERNEL_ARG_TYPE_VALUE: { - UR_CALL(ur_sanitizer_layer::asan::urKernelSetArgValue( - hKernel, pArgs[ArgPropIndex].index, pArgs[ArgPropIndex].size, nullptr, - pArgs[ArgPropIndex].value.value)); - break; - } - case UR_EXP_KERNEL_ARG_TYPE_MEM_OBJ: { - ur_kernel_arg_mem_obj_properties_t Properties = { - UR_STRUCTURE_TYPE_KERNEL_ARG_MEM_OBJ_PROPERTIES, nullptr, - pArgs[ArgPropIndex].value.memObjTuple.flags}; - UR_CALL(ur_sanitizer_layer::asan::urKernelSetArgMemObj( - hKernel, pArgs[ArgPropIndex].index, &Properties, - pArgs[ArgPropIndex].value.memObjTuple.hMem)); - if (std::shared_ptr MemBuffer = - getAsanInterceptor()->getMemBuffer( - pArgs[ArgPropIndex].value.memObjTuple.hMem)) { - char *Handle = nullptr; - UR_CALL(MemBuffer->getHandle(GetDevice(hQueue), Handle)); - KernelInfo.ArgProps[ArgPropIndex].type = - ur_exp_kernel_arg_type_t::UR_EXP_KERNEL_ARG_TYPE_POINTER; - KernelInfo.ArgProps[ArgPropIndex].value.pointer = Handle; - } - break; - } - case UR_EXP_KERNEL_ARG_TYPE_SAMPLER: { - auto 
pfnKernelSetArgSampler = - getContext()->urDdiTable.Kernel.pfnSetArgSampler; - UR_CALL(pfnKernelSetArgSampler(hKernel, pArgs[ArgPropIndex].index, - nullptr, - pArgs[ArgPropIndex].value.sampler)); - break; - } - default: - return UR_RESULT_ERROR_INVALID_ENUMERATION; - } - } - LaunchInfo LaunchInfo(GetContext(hQueue), GetDevice(hQueue), pGlobalWorkSize, pLocalWorkSize, pGlobalWorkOffset, workDim); UR_CALL(LaunchInfo.Data.syncToDevice(hQueue));