diff --git a/unified-runtime/include/unified-runtime/ur_api.h b/unified-runtime/include/unified-runtime/ur_api.h
index d8b040fa4e763..f4042ec192b79 100644
--- a/unified-runtime/include/unified-runtime/ur_api.h
+++ b/unified-runtime/include/unified-runtime/ur_api.h
@@ -515,6 +515,8 @@ typedef enum ur_function_t {
   UR_FUNCTION_ENQUEUE_HOST_TASK_EXP = 309,
   /// Enumerator for ::urCommandBufferAppendKernelLaunchWithArgsExp
   UR_FUNCTION_COMMAND_BUFFER_APPEND_KERNEL_LAUNCH_WITH_ARGS_EXP = 310,
+  /// Enumerator for ::urKernelGetSuggestedLocalWorkSizeWithArgs
+  UR_FUNCTION_KERNEL_GET_SUGGESTED_LOCAL_WORK_SIZE_WITH_ARGS = 311,
   /// @cond
   UR_FUNCTION_FORCE_UINT32 = 0x7fffffff
   /// @endcond
@@ -9501,6 +9503,55 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSize(
     /// suggested local work size that will contain the result of the query
     size_t *pSuggestedLocalWorkSize);
 
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Set kernel args and get the suggested local work size for a kernel.
+///
+/// @details
+///     - Query a suggested local work size for a kernel given a global size for
+///     each dimension; adapters may first apply pArgs to hKernel.
+///     - The application may call this function from simultaneous threads for
+///     the same context.
+///
+/// @returns
+///     - ::UR_RESULT_SUCCESS
+///     - ::UR_RESULT_ERROR_UNINITIALIZED
+///     - ::UR_RESULT_ERROR_DEVICE_LOST
+///     - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC
+///     - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE
+///         + `NULL == hKernel`
+///         + `NULL == hQueue`
+///     - ::UR_RESULT_ERROR_INVALID_NULL_POINTER
+///         + `NULL == pGlobalWorkOffset`
+///         + `NULL == pGlobalWorkSize`
+///         + `NULL == pSuggestedLocalWorkSize`
+///         + `pArgs == NULL && numArgs > 0`
+///     - ::UR_RESULT_ERROR_INVALID_ENUMERATION
+///         + `NULL != pArgs && ::UR_EXP_KERNEL_ARG_TYPE_SAMPLER < pArgs->type`
+///     - ::UR_RESULT_ERROR_UNSUPPORTED_FEATURE
+UR_APIEXPORT ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSizeWithArgs(
+    /// [in] handle of the kernel
+    ur_kernel_handle_t hKernel,
+    /// [in] handle of the queue object
+    ur_queue_handle_t hQueue,
+    /// [in] number of dimensions, from 1 to 3, to specify the global
+    /// and work-group work-items
+    uint32_t numWorkDim,
+    /// [in] pointer to an array of numWorkDim unsigned values that specify
+    /// the offset used to calculate the global ID of a work-item
+    const size_t *pGlobalWorkOffset,
+    /// [in] pointer to an array of numWorkDim unsigned values that specify
+    /// the number of global work-items in workDim that will execute the
+    /// kernel function
+    const size_t *pGlobalWorkSize,
+    /// [in] Number of entries in pArgs
+    uint32_t numArgs,
+    /// [in][optional][range(0, numArgs)] pointer to a list of kernel arg
+    /// properties.
+    const ur_exp_kernel_arg_properties_t *pArgs,
+    /// [out] pointer to an array of numWorkDim unsigned values that specify
+    /// suggested local work size that will contain the result of the query
+    size_t *pSuggestedLocalWorkSize);
+
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Query the maximum number of work groups for a cooperative kernel
 ///
@@ -14580,6 +14631,21 @@ typedef struct ur_kernel_get_suggested_local_work_size_params_t {
   size_t **ppSuggestedLocalWorkSize;
 } ur_kernel_get_suggested_local_work_size_params_t;
 
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Function parameters for urKernelGetSuggestedLocalWorkSizeWithArgs
+/// @details Each entry is a pointer to the parameter passed to the function;
+///     allowing the callback the ability to modify the parameter's value
+typedef struct ur_kernel_get_suggested_local_work_size_with_args_params_t {
+  ur_kernel_handle_t *phKernel;
+  ur_queue_handle_t *phQueue;
+  uint32_t *pnumWorkDim;
+  const size_t **ppGlobalWorkOffset;
+  const size_t **ppGlobalWorkSize;
+  uint32_t *pnumArgs;
+  const ur_exp_kernel_arg_properties_t **ppArgs;
+  size_t **ppSuggestedLocalWorkSize;
+} ur_kernel_get_suggested_local_work_size_with_args_params_t;
+
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Function parameters for urKernelSetArgValue
 /// @details Each entry is a pointer to the parameter passed to the function;
diff --git a/unified-runtime/include/unified-runtime/ur_api_funcs.def b/unified-runtime/include/unified-runtime/ur_api_funcs.def
index 06a3efa6bc437..3659aee433432 100644
--- a/unified-runtime/include/unified-runtime/ur_api_funcs.def
+++ b/unified-runtime/include/unified-runtime/ur_api_funcs.def
@@ -72,6 +72,7 @@ _UR_API(urKernelRelease)
 _UR_API(urKernelGetNativeHandle)
 _UR_API(urKernelCreateWithNativeHandle)
 _UR_API(urKernelGetSuggestedLocalWorkSize)
+_UR_API(urKernelGetSuggestedLocalWorkSizeWithArgs)
 _UR_API(urKernelSetArgValue)
 _UR_API(urKernelSetArgLocal)
 _UR_API(urKernelSetArgPointer)
diff --git a/unified-runtime/include/unified-runtime/ur_ddi.h b/unified-runtime/include/unified-runtime/ur_ddi.h
index da3747e385a9d..3c381bc98dcda 100644
--- a/unified-runtime/include/unified-runtime/ur_ddi.h
+++ b/unified-runtime/include/unified-runtime/ur_ddi.h
@@ -521,6 +521,13 @@ typedef ur_result_t(UR_APICALL *ur_pfnKernelGetSuggestedLocalWorkSize_t)(
     ur_kernel_handle_t, ur_queue_handle_t, uint32_t, const size_t *,
     const size_t *, size_t *);
 
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Function-pointer for urKernelGetSuggestedLocalWorkSizeWithArgs
+typedef ur_result_t(
+    UR_APICALL *ur_pfnKernelGetSuggestedLocalWorkSizeWithArgs_t)(
+    ur_kernel_handle_t, ur_queue_handle_t, uint32_t, const size_t *,
+    const size_t *, uint32_t, const ur_exp_kernel_arg_properties_t *, size_t *);
+
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Function-pointer for urKernelSetArgValue
 typedef ur_result_t(UR_APICALL *ur_pfnKernelSetArgValue_t)(
@@ -580,6 +587,8 @@ typedef struct ur_kernel_dditable_t {
   ur_pfnKernelGetNativeHandle_t pfnGetNativeHandle;
   ur_pfnKernelCreateWithNativeHandle_t pfnCreateWithNativeHandle;
   ur_pfnKernelGetSuggestedLocalWorkSize_t pfnGetSuggestedLocalWorkSize;
+  ur_pfnKernelGetSuggestedLocalWorkSizeWithArgs_t
+      pfnGetSuggestedLocalWorkSizeWithArgs;
   ur_pfnKernelSetArgValue_t pfnSetArgValue;
   ur_pfnKernelSetArgLocal_t pfnSetArgLocal;
   ur_pfnKernelSetArgPointer_t pfnSetArgPointer;
diff --git a/unified-runtime/include/unified-runtime/ur_print.h b/unified-runtime/include/unified-runtime/ur_print.h
index 881027a92a77e..ea22e1e4c7783 100644
--- a/unified-runtime/include/unified-runtime/ur_print.h
+++ b/unified-runtime/include/unified-runtime/ur_print.h
@@ -2169,6 +2169,19 @@ urPrintKernelGetSuggestedLocalWorkSizeParams(
     const struct ur_kernel_get_suggested_local_work_size_params_t *params,
     char *buffer, const size_t buff_size, size_t *out_size);
 
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Print ur_kernel_get_suggested_local_work_size_with_args_params_t
+/// struct
+/// @returns
+///     - ::UR_RESULT_SUCCESS
+///     - ::UR_RESULT_ERROR_INVALID_SIZE
+///         - `buff_size < out_size`
+UR_APIEXPORT ur_result_t UR_APICALL
+urPrintKernelGetSuggestedLocalWorkSizeWithArgsParams(
+    const struct ur_kernel_get_suggested_local_work_size_with_args_params_t
+        *params,
+    char *buffer, const size_t buff_size, size_t *out_size);
+
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Print ur_kernel_set_arg_value_params_t struct
 /// @returns
diff --git a/unified-runtime/include/unified-runtime/ur_print.hpp b/unified-runtime/include/unified-runtime/ur_print.hpp
index e1d30d0d1f3b3..35736578de1a6 100644
--- a/unified-runtime/include/unified-runtime/ur_print.hpp
+++ b/unified-runtime/include/unified-runtime/ur_print.hpp
@@ -1373,6 +1373,9 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_function_t value) {
   case UR_FUNCTION_COMMAND_BUFFER_APPEND_KERNEL_LAUNCH_WITH_ARGS_EXP:
     os << "UR_FUNCTION_COMMAND_BUFFER_APPEND_KERNEL_LAUNCH_WITH_ARGS_EXP";
     break;
+  case UR_FUNCTION_KERNEL_GET_SUGGESTED_LOCAL_WORK_SIZE_WITH_ARGS:
+    os << "UR_FUNCTION_KERNEL_GET_SUGGESTED_LOCAL_WORK_SIZE_WITH_ARGS";
+    break;
   default:
     os << "unknown enumerator";
     break;
@@ -14923,6 +14926,67 @@ operator<<(std::ostream &os, [[maybe_unused]] const struct
   return os;
 }
 
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Print operator for the
+/// ur_kernel_get_suggested_local_work_size_with_args_params_t type
+/// @returns
+///     std::ostream &
+inline std::ostream &
+operator<<(std::ostream &os, [[maybe_unused]] const struct
+           ur_kernel_get_suggested_local_work_size_with_args_params_t *params) {
+
+  os << ".hKernel = ";
+
+  ur::details::printPtr(os, *(params->phKernel));
+
+  os << ", ";
+  os << ".hQueue = ";
+
+  ur::details::printPtr(os, *(params->phQueue));
+
+  os << ", ";
+  os << ".numWorkDim = ";
+
+  os << *(params->pnumWorkDim);
+
+  os << ", ";
+  os << ".pGlobalWorkOffset = ";
+
+  ur::details::printPtr(os, *(params->ppGlobalWorkOffset));
+
+  os << ", ";
+  os << ".pGlobalWorkSize = ";
+
+  ur::details::printPtr(os, *(params->ppGlobalWorkSize));
+
+  os << ", ";
+  os << ".numArgs = ";
+
+  os << *(params->pnumArgs);
+
+  os << ", ";
+  os << ".pArgs = ";
+  ur::details::printPtr(os, reinterpret_cast<const void *>(*(params->ppArgs)));
+  if (*(params->ppArgs) != NULL) {
+    os << " {";
+    for (size_t i = 0; i < *params->pnumArgs; ++i) {
+      if (i != 0) {
+        os << ", ";
+      }
+
+      os << (*(params->ppArgs))[i];
+    }
+    os << "}";
+  }
+
+  os << ", ";
+  os << ".pSuggestedLocalWorkSize = ";
+
+  ur::details::printPtr(os, *(params->ppSuggestedLocalWorkSize));
+
+  return os;
+}
+
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Print operator for the ur_kernel_set_arg_value_params_t type
 /// @returns
@@ -22582,6 +22646,10 @@ inline ur_result_t UR_APICALL printFunctionParams(std::ostream &os,
     os << (const struct ur_kernel_get_suggested_local_work_size_params_t *)
         params;
   } break;
+  case UR_FUNCTION_KERNEL_GET_SUGGESTED_LOCAL_WORK_SIZE_WITH_ARGS: {
+    os << (const struct
+               ur_kernel_get_suggested_local_work_size_with_args_params_t *)params;
+  } break;
   case UR_FUNCTION_KERNEL_SET_ARG_VALUE: {
     os << (const struct ur_kernel_set_arg_value_params_t *)params;
   } break;
diff --git a/unified-runtime/scripts/core/kernel.yml b/unified-runtime/scripts/core/kernel.yml
index a420a8568f3e5..6a0ab3600ca61 100644
--- a/unified-runtime/scripts/core/kernel.yml
+++ b/unified-runtime/scripts/core/kernel.yml
@@ -602,6 +602,55 @@ returns:
     - $X_RESULT_ERROR_UNSUPPORTED_FEATURE
 --- #--------------------------------------------------------------------------
 type: function
+desc: "Set kernel args and get the suggested local work size for a kernel."
+class: $xKernel
+name: GetSuggestedLocalWorkSizeWithArgs
+ordinal: "0"
+details:
+    - "Query a suggested local work size for a kernel given a global size for each dimension; adapters may first apply pArgs to hKernel."
+    - "The application may call this function from simultaneous threads for the same context."
+params:
+    - type: $x_kernel_handle_t
+      name: hKernel
+      desc: |
+        [in] handle of the kernel
+    - type: $x_queue_handle_t
+      name: hQueue
+      desc: |
+        [in] handle of the queue object
+    - type: uint32_t
+      name: numWorkDim
+      desc: |
+        [in] number of dimensions, from 1 to 3, to specify the global
+        and work-group work-items
+    - type: const size_t*
+      name: pGlobalWorkOffset
+      desc: |
+        [in] pointer to an array of numWorkDim unsigned values that specify
+        the offset used to calculate the global ID of a work-item
+    - type: const size_t*
+      name: pGlobalWorkSize
+      desc: |
+        [in] pointer to an array of numWorkDim unsigned values that specify
+        the number of global work-items in workDim that will execute the
+        kernel function
+    - type: uint32_t
+      name: numArgs
+      desc: "[in] Number of entries in pArgs"
+    - type: "const $x_exp_kernel_arg_properties_t*"
+      name: pArgs
+      desc: "[in][optional][range(0, numArgs)] pointer to a list of kernel arg properties."
+    - type: size_t*
+      name: pSuggestedLocalWorkSize
+      desc: |
+        [out] pointer to an array of numWorkDim unsigned values that specify
+        suggested local work size that will contain the result of the query
+returns:
+    - $X_RESULT_ERROR_INVALID_NULL_POINTER:
+        - "`pArgs == NULL && numArgs > 0`"
+    - $X_RESULT_ERROR_UNSUPPORTED_FEATURE
+--- #--------------------------------------------------------------------------
+type: function
 desc: "Query the maximum number of work groups for a cooperative kernel"
 class: $xKernel
 name: SuggestMaxCooperativeGroupCount
diff --git a/unified-runtime/scripts/core/registry.yml b/unified-runtime/scripts/core/registry.yml
index 8762563e9a569..288469a2f0d37 100644
--- a/unified-runtime/scripts/core/registry.yml
+++ b/unified-runtime/scripts/core/registry.yml
@@ -730,7 +730,10 @@ etors:
 - name: COMMAND_BUFFER_APPEND_KERNEL_LAUNCH_WITH_ARGS_EXP
   desc: Enumerator for $xCommandBufferAppendKernelLaunchWithArgsExp
   value: '310'
-max_id: '310'
+- name: KERNEL_GET_SUGGESTED_LOCAL_WORK_SIZE_WITH_ARGS
+  desc: Enumerator for $xKernelGetSuggestedLocalWorkSizeWithArgs
+  value: '311'
+max_id: '311'
 ---
 type: enum
 desc: Defines structure types
diff --git a/unified-runtime/source/adapters/cuda/kernel.cpp b/unified-runtime/source/adapters/cuda/kernel.cpp
index 41c0873807995..1550457fa13c6 100644
--- a/unified-runtime/source/adapters/cuda/kernel.cpp
+++ b/unified-runtime/source/adapters/cuda/kernel.cpp
@@ -445,6 +445,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSize(
   return UR_RESULT_SUCCESS;
 }
 
+UR_APIEXPORT ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSizeWithArgs(
+    ur_kernel_handle_t hKernel, ur_queue_handle_t hQueue, uint32_t workDim,
+    const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
+    [[maybe_unused]] uint32_t numArgs,
+    [[maybe_unused]] const ur_exp_kernel_arg_properties_t *pArgs,
+    size_t *pSuggestedLocalWorkSize) {
+  return urKernelGetSuggestedLocalWorkSize(hKernel, hQueue, workDim,
+                                           pGlobalWorkOffset, pGlobalWorkSize,
+                                           pSuggestedLocalWorkSize);
+}
+
 UR_APIEXPORT ur_result_t UR_APICALL urKernelSetSpecializationConstants(
     ur_kernel_handle_t, uint32_t, const ur_specialization_constant_info_t *) {
   return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
diff --git a/unified-runtime/source/adapters/cuda/ur_interface_loader.cpp b/unified-runtime/source/adapters/cuda/ur_interface_loader.cpp
index 0da25b3507997..80068cb3ddf7a 100644
--- a/unified-runtime/source/adapters/cuda/ur_interface_loader.cpp
+++ b/unified-runtime/source/adapters/cuda/ur_interface_loader.cpp
@@ -141,6 +141,8 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelProcAddrTable(
   pDdiTable->pfnSetExecInfo = urKernelSetExecInfo;
   pDdiTable->pfnSetSpecializationConstants = urKernelSetSpecializationConstants;
   pDdiTable->pfnGetSuggestedLocalWorkSize = urKernelGetSuggestedLocalWorkSize;
+  pDdiTable->pfnGetSuggestedLocalWorkSizeWithArgs =
+      urKernelGetSuggestedLocalWorkSizeWithArgs;
   pDdiTable->pfnSuggestMaxCooperativeGroupCount =
       urKernelSuggestMaxCooperativeGroupCount;
   return UR_RESULT_SUCCESS;
diff --git a/unified-runtime/source/adapters/hip/kernel.cpp b/unified-runtime/source/adapters/hip/kernel.cpp
index 980449020d125..1cbb798c6b8d2 100644
--- a/unified-runtime/source/adapters/hip/kernel.cpp
+++ b/unified-runtime/source/adapters/hip/kernel.cpp
@@ -373,3 +373,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSize(
                     pSuggestedLocalWorkSize);
   return UR_RESULT_SUCCESS;
 }
+
+UR_APIEXPORT ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSizeWithArgs(
+    ur_kernel_handle_t hKernel, ur_queue_handle_t hQueue, uint32_t workDim,
+    const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
+    [[maybe_unused]] uint32_t numArgs,
+    [[maybe_unused]] const ur_exp_kernel_arg_properties_t *pArgs,
+    size_t *pSuggestedLocalWorkSize) {
+  return urKernelGetSuggestedLocalWorkSize(hKernel, hQueue, workDim,
+                                           pGlobalWorkOffset, pGlobalWorkSize,
+                                           pSuggestedLocalWorkSize);
+}
diff --git 
a/unified-runtime/source/adapters/hip/ur_interface_loader.cpp b/unified-runtime/source/adapters/hip/ur_interface_loader.cpp
index 56ffca3ae7094..8e7c481f60947 100644
--- a/unified-runtime/source/adapters/hip/ur_interface_loader.cpp
+++ b/unified-runtime/source/adapters/hip/ur_interface_loader.cpp
@@ -141,6 +141,8 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelProcAddrTable(
   pDdiTable->pfnSetExecInfo = urKernelSetExecInfo;
   pDdiTable->pfnSetSpecializationConstants = urKernelSetSpecializationConstants;
   pDdiTable->pfnGetSuggestedLocalWorkSize = urKernelGetSuggestedLocalWorkSize;
+  pDdiTable->pfnGetSuggestedLocalWorkSizeWithArgs =
+      urKernelGetSuggestedLocalWorkSizeWithArgs;
   pDdiTable->pfnSuggestMaxCooperativeGroupCount =
       urKernelSuggestMaxCooperativeGroupCount;
   return UR_RESULT_SUCCESS;
diff --git a/unified-runtime/source/adapters/level_zero/kernel.cpp b/unified-runtime/source/adapters/level_zero/kernel.cpp
index 4f5d72c0d88c7..ca8c7f67bc63c 100644
--- a/unified-runtime/source/adapters/level_zero/kernel.cpp
+++ b/unified-runtime/source/adapters/level_zero/kernel.cpp
@@ -56,6 +56,17 @@ ur_result_t urKernelGetSuggestedLocalWorkSize(
   return UR_RESULT_SUCCESS;
 }
 
+ur_result_t urKernelGetSuggestedLocalWorkSizeWithArgs(
+    ur_kernel_handle_t hKernel, ur_queue_handle_t hQueue, uint32_t workDim,
+    const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
+    [[maybe_unused]] uint32_t numArgs,
+    [[maybe_unused]] const ur_exp_kernel_arg_properties_t *pArgs,
+    size_t *pSuggestedLocalWorkSize) {
+  return ur::level_zero::urKernelGetSuggestedLocalWorkSize(
+      hKernel, hQueue, workDim, pGlobalWorkOffset, pGlobalWorkSize,
+      pSuggestedLocalWorkSize);
+}
+
 ur_result_t urKernelSetArgValueHelper(
     ur_kernel_handle_t Kernel,
     /// [in] argument index in range [0, num args - 1]
diff --git a/unified-runtime/source/adapters/level_zero/ur_interface_loader.cpp b/unified-runtime/source/adapters/level_zero/ur_interface_loader.cpp
index e1fcdc4dd5739..e1bfdb7120bc0 100644
--- a/unified-runtime/source/adapters/level_zero/ur_interface_loader.cpp
+++ b/unified-runtime/source/adapters/level_zero/ur_interface_loader.cpp
@@ -315,6 +315,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urGetKernelProcAddrTable(
       ur::level_zero::urKernelCreateWithNativeHandle;
   pDdiTable->pfnGetSuggestedLocalWorkSize =
       ur::level_zero::urKernelGetSuggestedLocalWorkSize;
+  pDdiTable->pfnGetSuggestedLocalWorkSizeWithArgs =
+      ur::level_zero::urKernelGetSuggestedLocalWorkSizeWithArgs;
   pDdiTable->pfnSetArgValue = ur::level_zero::urKernelSetArgValue;
   pDdiTable->pfnSetArgLocal = ur::level_zero::urKernelSetArgLocal;
   pDdiTable->pfnSetArgPointer = ur::level_zero::urKernelSetArgPointer;
diff --git a/unified-runtime/source/adapters/level_zero/ur_interface_loader.hpp b/unified-runtime/source/adapters/level_zero/ur_interface_loader.hpp
index f64c404ec3c9a..e906015b67855 100644
--- a/unified-runtime/source/adapters/level_zero/ur_interface_loader.hpp
+++ b/unified-runtime/source/adapters/level_zero/ur_interface_loader.hpp
@@ -483,6 +483,11 @@ ur_result_t urKernelGetSuggestedLocalWorkSize(ur_kernel_handle_t hKernel,
                                               const size_t *pGlobalWorkOffset,
                                               const size_t *pGlobalWorkSize,
                                               size_t *pSuggestedLocalWorkSize);
+ur_result_t urKernelGetSuggestedLocalWorkSizeWithArgs(
+    ur_kernel_handle_t hKernel, ur_queue_handle_t hQueue, uint32_t numWorkDim,
+    const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
+    uint32_t numArgs, const ur_exp_kernel_arg_properties_t *pArgs,
+    size_t *pSuggestedLocalWorkSize);
 ur_result_t urKernelSuggestMaxCooperativeGroupCount(
     ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, uint32_t workDim,
     const size_t *pLocalWorkSize, size_t dynamicSharedMemorySize,
diff --git a/unified-runtime/source/adapters/level_zero/v2/kernel.cpp b/unified-runtime/source/adapters/level_zero/v2/kernel.cpp
index 5f4a51f7f2ef5..54eb0862d9201 100644
--- a/unified-runtime/source/adapters/level_zero/v2/kernel.cpp
+++ b/unified-runtime/source/adapters/level_zero/v2/kernel.cpp
@@ -719,6 +719,17 @@ ur_result_t urKernelGetSuggestedLocalWorkSize(
   return UR_RESULT_SUCCESS;
 }
 
+ur_result_t urKernelGetSuggestedLocalWorkSizeWithArgs(
+    ur_kernel_handle_t hKernel, ur_queue_handle_t hQueue, uint32_t workDim,
+    const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
+    [[maybe_unused]] uint32_t numArgs,
+    [[maybe_unused]] const ur_exp_kernel_arg_properties_t *pArgs,
+    size_t *pSuggestedLocalWorkSize) {
+  return ur::level_zero::urKernelGetSuggestedLocalWorkSize(
+      hKernel, hQueue, workDim, pGlobalWorkOffset, pGlobalWorkSize,
+      pSuggestedLocalWorkSize);
+}
+
 ur_result_t urKernelSuggestMaxCooperativeGroupCount(
     ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, uint32_t workDim,
     const size_t *pLocalWorkSize, size_t dynamicSharedMemorySize,
diff --git a/unified-runtime/source/adapters/mock/ur_mockddi.cpp b/unified-runtime/source/adapters/mock/ur_mockddi.cpp
index 1603b657a459a..f4e553891f027 100644
--- a/unified-runtime/source/adapters/mock/ur_mockddi.cpp
+++ b/unified-runtime/source/adapters/mock/ur_mockddi.cpp
@@ -7473,6 +7473,73 @@ __urdlllocal ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSize(
   return exceptionToResult(std::current_exception());
 }
 
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Intercept function for urKernelGetSuggestedLocalWorkSizeWithArgs
+__urdlllocal ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSizeWithArgs(
+    /// [in] handle of the kernel
+    ur_kernel_handle_t hKernel,
+    /// [in] handle of the queue object
+    ur_queue_handle_t hQueue,
+    /// [in] number of dimensions, from 1 to 3, to specify the global
+    /// and work-group work-items
+    uint32_t numWorkDim,
+    /// [in] pointer to an array of numWorkDim unsigned values that specify
+    /// the offset used to calculate the global ID of a work-item
+    const size_t *pGlobalWorkOffset,
+    /// [in] pointer to an array of numWorkDim unsigned values that specify
+    /// the number of global work-items in workDim that will execute the
+    /// kernel function
+    const size_t *pGlobalWorkSize,
+    /// [in] Number of entries in pArgs
+    uint32_t numArgs,
+    /// [in][optional][range(0, numArgs)] pointer to a list of kernel arg
+    /// properties.
+    const ur_exp_kernel_arg_properties_t *pArgs,
+    /// [out] pointer to an array of numWorkDim unsigned values that specify
+    /// suggested local work size that will contain the result of the query
+    size_t *pSuggestedLocalWorkSize) try {
+  ur_result_t result = UR_RESULT_SUCCESS;
+
+  ur_kernel_get_suggested_local_work_size_with_args_params_t params = {
+      &hKernel,         &hQueue,  &numWorkDim, &pGlobalWorkOffset,
+      &pGlobalWorkSize, &numArgs, &pArgs,      &pSuggestedLocalWorkSize};
+
+  auto beforeCallback = reinterpret_cast<ur_mock_callback_t>(
+      mock::getCallbacks().get_before_callback(
+          "urKernelGetSuggestedLocalWorkSizeWithArgs"));
+  if (beforeCallback) {
+    result = beforeCallback(&params);
+    if (result != UR_RESULT_SUCCESS) {
+      return result;
+    }
+  }
+
+  auto replaceCallback = reinterpret_cast<ur_mock_callback_t>(
+      mock::getCallbacks().get_replace_callback(
+          "urKernelGetSuggestedLocalWorkSizeWithArgs"));
+  if (replaceCallback) {
+    result = replaceCallback(&params);
+  } else {
+
+    result = UR_RESULT_SUCCESS;
+  }
+
+  if (result != UR_RESULT_SUCCESS) {
+    return result;
+  }
+
+  auto afterCallback = reinterpret_cast<ur_mock_callback_t>(
+      mock::getCallbacks().get_after_callback(
+          "urKernelGetSuggestedLocalWorkSizeWithArgs"));
+  if (afterCallback) {
+    return afterCallback(&params);
+  }
+
+  return result;
+} catch (...) {
+  return exceptionToResult(std::current_exception());
+}
+
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Intercept function for urKernelSuggestMaxCooperativeGroupCount
 __urdlllocal ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCount(
@@ -13664,6 +13731,9 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelProcAddrTable(
   pDdiTable->pfnGetSuggestedLocalWorkSize =
       driver::urKernelGetSuggestedLocalWorkSize;
 
+  pDdiTable->pfnGetSuggestedLocalWorkSizeWithArgs =
+      driver::urKernelGetSuggestedLocalWorkSizeWithArgs;
+
   pDdiTable->pfnSetArgValue = driver::urKernelSetArgValue;
 
   pDdiTable->pfnSetArgLocal = driver::urKernelSetArgLocal;
diff --git a/unified-runtime/source/adapters/native_cpu/kernel.cpp b/unified-runtime/source/adapters/native_cpu/kernel.cpp
index 021a95bdadc5c..9fe7e3cbefaed 100644
--- a/unified-runtime/source/adapters/native_cpu/kernel.cpp
+++ b/unified-runtime/source/adapters/native_cpu/kernel.cpp
@@ -267,6 +267,18 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSize(
   return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
 }
 
+UR_APIEXPORT ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSizeWithArgs(
+    [[maybe_unused]] ur_kernel_handle_t hKernel,
+    [[maybe_unused]] ur_queue_handle_t hQueue,
+    [[maybe_unused]] uint32_t workDim,
+    [[maybe_unused]] const size_t *pGlobalWorkOffset,
+    [[maybe_unused]] const size_t *pGlobalWorkSize,
+    [[maybe_unused]] uint32_t numArgs,
+    [[maybe_unused]] const ur_exp_kernel_arg_properties_t *pArgs,
+    [[maybe_unused]] size_t *pSuggestedLocalWorkSize) {
+  return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
+}
+
 UR_APIEXPORT ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCount(
     [[maybe_unused]] ur_kernel_handle_t hKernel,
     [[maybe_unused]] ur_device_handle_t hDevice,
diff --git a/unified-runtime/source/adapters/native_cpu/ur_interface_loader.cpp b/unified-runtime/source/adapters/native_cpu/ur_interface_loader.cpp
index 78fe857721aa6..556d0c106f229 100644
--- 
a/unified-runtime/source/adapters/native_cpu/ur_interface_loader.cpp
+++ b/unified-runtime/source/adapters/native_cpu/ur_interface_loader.cpp
@@ -141,6 +141,8 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelProcAddrTable(
   pDdiTable->pfnSetExecInfo = urKernelSetExecInfo;
   pDdiTable->pfnSetSpecializationConstants = urKernelSetSpecializationConstants;
   pDdiTable->pfnGetSuggestedLocalWorkSize = urKernelGetSuggestedLocalWorkSize;
+  pDdiTable->pfnGetSuggestedLocalWorkSizeWithArgs =
+      urKernelGetSuggestedLocalWorkSizeWithArgs;
   pDdiTable->pfnSuggestMaxCooperativeGroupCount =
       urKernelSuggestMaxCooperativeGroupCount;
   return UR_RESULT_SUCCESS;
diff --git a/unified-runtime/source/adapters/offload/kernel.cpp b/unified-runtime/source/adapters/offload/kernel.cpp
index ae155e2ad607c..5eada6fda135c 100644
--- a/unified-runtime/source/adapters/offload/kernel.cpp
+++ b/unified-runtime/source/adapters/offload/kernel.cpp
@@ -152,6 +152,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSize(
   return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
 }
 
+UR_APIEXPORT ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSizeWithArgs(
+    ur_kernel_handle_t, ur_queue_handle_t, uint32_t, const size_t *,
+    const size_t *, uint32_t, const ur_exp_kernel_arg_properties_t *,
+    size_t *) {
+  return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
+}
+
 UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgSampler(
     ur_kernel_handle_t, uint32_t, const ur_kernel_arg_sampler_properties_t *,
     ur_sampler_handle_t) {
diff --git a/unified-runtime/source/adapters/offload/ur_interface_loader.cpp b/unified-runtime/source/adapters/offload/ur_interface_loader.cpp
index 46db10a7325bd..f65781361e35e 100644
--- a/unified-runtime/source/adapters/offload/ur_interface_loader.cpp
+++ b/unified-runtime/source/adapters/offload/ur_interface_loader.cpp
@@ -140,6 +140,8 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelProcAddrTable(
   pDdiTable->pfnSetExecInfo = urKernelSetExecInfo;
   pDdiTable->pfnSetSpecializationConstants = urKernelSetSpecializationConstants;
   pDdiTable->pfnGetSuggestedLocalWorkSize = urKernelGetSuggestedLocalWorkSize;
+  pDdiTable->pfnGetSuggestedLocalWorkSizeWithArgs =
+      urKernelGetSuggestedLocalWorkSizeWithArgs;
 
   return UR_RESULT_SUCCESS;
 }
diff --git a/unified-runtime/source/adapters/opencl/kernel.cpp b/unified-runtime/source/adapters/opencl/kernel.cpp
index b673f9743766c..10a7f9055dfa8 100644
--- a/unified-runtime/source/adapters/opencl/kernel.cpp
+++ b/unified-runtime/source/adapters/opencl/kernel.cpp
@@ -529,6 +529,68 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSize(
   return UR_RESULT_SUCCESS;
 }
 
+// Applies each entry of pArgs to hKernel through the OpenCL argument-setting
+// entry points, then forwards the query to urKernelGetSuggestedLocalWorkSize.
+UR_APIEXPORT ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSizeWithArgs(
+    ur_kernel_handle_t hKernel, ur_queue_handle_t hQueue, uint32_t workDim,
+    const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
+    uint32_t numArgs, const ur_exp_kernel_arg_properties_t *pArgs,
+    size_t *pSuggestedLocalWorkSize) {
+  // The API contract reports `pArgs == NULL && numArgs > 0` as
+  // UR_RESULT_ERROR_INVALID_NULL_POINTER; check it before indexing pArgs.
+  if (numArgs > 0 && pArgs == nullptr) {
+    return UR_RESULT_ERROR_INVALID_NULL_POINTER;
+  }
+
+  clSetKernelArgMemPointerINTEL_fn SetKernelArgMemPointerPtr = nullptr;
+  UR_RETURN_ON_FAILURE(
+      cl_ext::getExtFuncFromContext<clSetKernelArgMemPointerINTEL_fn>(
+          hKernel->Context->CLContext,
+          ur::cl::getAdapter()->fnCache.clSetKernelArgMemPointerINTELCache,
+          cl_ext::SetKernelArgMemPointerName, &SetKernelArgMemPointerPtr));
+
+  for (uint32_t i = 0; i < numArgs; i++) {
+    switch (pArgs[i].type) {
+    case UR_EXP_KERNEL_ARG_TYPE_LOCAL:
+      CL_RETURN_ON_FAILURE(clSetKernelArg(hKernel->CLKernel,
+                                          static_cast<cl_uint>(pArgs[i].index),
+                                          pArgs[i].size, nullptr));
+      break;
+    case UR_EXP_KERNEL_ARG_TYPE_VALUE:
+      CL_RETURN_ON_FAILURE(clSetKernelArg(hKernel->CLKernel,
+                                          static_cast<cl_uint>(pArgs[i].index),
+                                          pArgs[i].size, pArgs[i].value.value));
+      break;
+    case UR_EXP_KERNEL_ARG_TYPE_MEM_OBJ: {
+      cl_mem mem = pArgs[i].value.memObjTuple.hMem
+                       ? pArgs[i].value.memObjTuple.hMem->CLMemory
+                       : nullptr;
+      CL_RETURN_ON_FAILURE(clSetKernelArg(hKernel->CLKernel,
+                                          static_cast<cl_uint>(pArgs[i].index),
+                                          pArgs[i].size, &mem));
+      break;
+    }
+    case UR_EXP_KERNEL_ARG_TYPE_POINTER:
+      CL_RETURN_ON_FAILURE(SetKernelArgMemPointerPtr(
+          hKernel->CLKernel, static_cast<cl_uint>(pArgs[i].index),
+          pArgs[i].value.pointer));
+      break;
+    case UR_EXP_KERNEL_ARG_TYPE_SAMPLER: {
+      CL_RETURN_ON_FAILURE(clSetKernelArg(
+          hKernel->CLKernel, static_cast<cl_uint>(pArgs[i].index),
+          pArgs[i].size, &pArgs[i].value.sampler->CLSampler));
+      break;
+    }
+    default:
+      return UR_RESULT_ERROR_INVALID_ENUMERATION;
+    }
+  }
+
+  return urKernelGetSuggestedLocalWorkSize(hKernel, hQueue, workDim,
+                                           pGlobalWorkOffset, pGlobalWorkSize,
+                                           pSuggestedLocalWorkSize);
+}
+
 UR_APIEXPORT ur_result_t UR_APICALL urKernelSetSpecializationConstants(
     ur_kernel_handle_t, uint32_t, const ur_specialization_constant_info_t *) {
   return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
diff --git a/unified-runtime/source/adapters/opencl/ur_interface_loader.cpp b/unified-runtime/source/adapters/opencl/ur_interface_loader.cpp
index c753ce498cb40..a9c0166255d50 100644
--- a/unified-runtime/source/adapters/opencl/ur_interface_loader.cpp
+++ b/unified-runtime/source/adapters/opencl/ur_interface_loader.cpp
@@ -142,6 +142,8 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelProcAddrTable(
   pDdiTable->pfnSetExecInfo = urKernelSetExecInfo;
   pDdiTable->pfnSetSpecializationConstants = urKernelSetSpecializationConstants;
   pDdiTable->pfnGetSuggestedLocalWorkSize = urKernelGetSuggestedLocalWorkSize;
+  pDdiTable->pfnGetSuggestedLocalWorkSizeWithArgs =
+      urKernelGetSuggestedLocalWorkSizeWithArgs;
   pDdiTable->pfnSuggestMaxCooperativeGroupCount =
       urKernelSuggestMaxCooperativeGroupCount;
   return UR_RESULT_SUCCESS;
diff --git a/unified-runtime/source/loader/layers/sanitizer/asan/asan_ddi.cpp b/unified-runtime/source/loader/layers/sanitizer/asan/asan_ddi.cpp
index e273680ea75d5..dc4f01005fa50 100644
--- 
a/unified-runtime/source/loader/layers/sanitizer/asan/asan_ddi.cpp
+++ b/unified-runtime/source/loader/layers/sanitizer/asan/asan_ddi.cpp
@@ -1744,63 +1744,6 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchWithArgsExp(
   std::memcpy(KernelInfo.ArgProps.data(), pArgs,
               numArgs * sizeof(ur_exp_kernel_arg_properties_t));
 
-  // We need to set all the args now rather than letting LaunchWithArgs handle
-  // them. This is because some implementations of
-  // urKernelGetSuggestedLocalWorkSize, which is used in preLaunchKernel, rely
-  // on all the args being set.
-  for (uint32_t ArgPropIndex = 0; ArgPropIndex < numArgs; ArgPropIndex++) {
-    switch (pArgs[ArgPropIndex].type) {
-    case UR_EXP_KERNEL_ARG_TYPE_LOCAL: {
-      UR_CALL(ur_sanitizer_layer::asan::urKernelSetArgLocal(
-          hKernel, pArgs[ArgPropIndex].index, pArgs[ArgPropIndex].size,
-          nullptr));
-      KernelInfo.ArgProps[ArgPropIndex].size =
-          KernelInfo.LocalArgs[ArgPropIndex].SizeWithRedZone;
-      break;
-    }
-    case UR_EXP_KERNEL_ARG_TYPE_POINTER: {
-      UR_CALL(ur_sanitizer_layer::asan::urKernelSetArgPointer(
-          hKernel, pArgs[ArgPropIndex].index, nullptr,
-          pArgs[ArgPropIndex].value.pointer));
-      break;
-    }
-    case UR_EXP_KERNEL_ARG_TYPE_VALUE: {
-      UR_CALL(ur_sanitizer_layer::asan::urKernelSetArgValue(
-          hKernel, pArgs[ArgPropIndex].index, pArgs[ArgPropIndex].size, nullptr,
-          pArgs[ArgPropIndex].value.value));
-      break;
-    }
-    case UR_EXP_KERNEL_ARG_TYPE_MEM_OBJ: {
-      ur_kernel_arg_mem_obj_properties_t Properties = {
-          UR_STRUCTURE_TYPE_KERNEL_ARG_MEM_OBJ_PROPERTIES, nullptr,
-          pArgs[ArgPropIndex].value.memObjTuple.flags};
-      UR_CALL(ur_sanitizer_layer::asan::urKernelSetArgMemObj(
-          hKernel, pArgs[ArgPropIndex].index, &Properties,
-          pArgs[ArgPropIndex].value.memObjTuple.hMem));
-      if (std::shared_ptr<MemBuffer> MemBuffer =
-              getAsanInterceptor()->getMemBuffer(
-                  pArgs[ArgPropIndex].value.memObjTuple.hMem)) {
-        char *Handle = nullptr;
-        UR_CALL(MemBuffer->getHandle(GetDevice(hQueue), Handle));
-        KernelInfo.ArgProps[ArgPropIndex].type =
-            ur_exp_kernel_arg_type_t::UR_EXP_KERNEL_ARG_TYPE_POINTER;
-        KernelInfo.ArgProps[ArgPropIndex].value.pointer = Handle;
-      }
-      break;
-    }
-    case UR_EXP_KERNEL_ARG_TYPE_SAMPLER: {
-      auto pfnKernelSetArgSampler =
-          getContext()->urDdiTable.Kernel.pfnSetArgSampler;
-      UR_CALL(pfnKernelSetArgSampler(hKernel, pArgs[ArgPropIndex].index,
-                                     nullptr,
-                                     pArgs[ArgPropIndex].value.sampler));
-      break;
-    }
-    default:
-      return UR_RESULT_ERROR_INVALID_ENUMERATION;
-    }
-  }
-
   LaunchInfo LaunchInfo(GetContext(hQueue), GetDevice(hQueue), pGlobalWorkSize,
                         pLocalWorkSize, pGlobalWorkOffset, workDim);
   UR_CALL(LaunchInfo.Data.syncToDevice(hQueue));
diff --git a/unified-runtime/source/loader/layers/sanitizer/asan/asan_interceptor.cpp b/unified-runtime/source/loader/layers/sanitizer/asan/asan_interceptor.cpp
index 4288ca38236f7..62d21d6e1bf66 100644
--- a/unified-runtime/source/loader/layers/sanitizer/asan/asan_interceptor.cpp
+++ b/unified-runtime/source/loader/layers/sanitizer/asan/asan_interceptor.cpp
@@ -800,9 +800,14 @@ ur_result_t AsanInterceptor::prepareLaunch(
 
   if (LaunchInfo.LocalWorkSize.empty()) {
     LaunchInfo.LocalWorkSize.resize(LaunchInfo.WorkDim);
-    auto URes = getContext()->urDdiTable.Kernel.pfnGetSuggestedLocalWorkSize(
-        Kernel, Queue, LaunchInfo.WorkDim, LaunchInfo.GlobalWorkOffset.data(),
-        LaunchInfo.GlobalWorkSize, LaunchInfo.LocalWorkSize.data());
+    // The count must match the ArgProps array actually passed; the kernel's
+    // declared argument count can exceed it and make the adapter read past it.
+    auto URes =
+        getContext()->urDdiTable.Kernel.pfnGetSuggestedLocalWorkSizeWithArgs(
+            Kernel, Queue, LaunchInfo.WorkDim,
+            LaunchInfo.GlobalWorkOffset.data(), LaunchInfo.GlobalWorkSize,
+            static_cast<uint32_t>(KernelInfo.ArgProps.size()),
+            KernelInfo.ArgProps.data(), LaunchInfo.LocalWorkSize.data());
     if (URes != UR_RESULT_SUCCESS) {
       if (URes != UR_RESULT_ERROR_UNSUPPORTED_FEATURE) {
         return URes;
diff --git a/unified-runtime/source/loader/layers/sanitizer/msan/msan_interceptor.cpp b/unified-runtime/source/loader/layers/sanitizer/msan/msan_interceptor.cpp
index a16c78c552722..b9b1cdab08241 100644
--- 
a/unified-runtime/source/loader/layers/sanitizer/msan/msan_interceptor.cpp +++ b/unified-runtime/source/loader/layers/sanitizer/msan/msan_interceptor.cpp @@ -509,9 +509,13 @@ ur_result_t MsanInterceptor::prepareLaunch( if (LaunchInfo.LocalWorkSize.empty()) { LaunchInfo.LocalWorkSize.resize(LaunchInfo.WorkDim); - auto URes = getContext()->urDdiTable.Kernel.pfnGetSuggestedLocalWorkSize( - Kernel, Queue, LaunchInfo.WorkDim, LaunchInfo.GlobalWorkOffset.data(), - LaunchInfo.GlobalWorkSize, LaunchInfo.LocalWorkSize.data()); + auto ArgNums = GetKernelNumArgs(Kernel); + auto URes = + getContext()->urDdiTable.Kernel.pfnGetSuggestedLocalWorkSizeWithArgs( + Kernel, Queue, LaunchInfo.WorkDim, + LaunchInfo.GlobalWorkOffset.data(), LaunchInfo.GlobalWorkSize, + ArgNums, KernelInfo.ArgProps.data(), + LaunchInfo.LocalWorkSize.data()); if (URes != UR_RESULT_SUCCESS) { if (URes != UR_RESULT_ERROR_UNSUPPORTED_FEATURE) { return URes; diff --git a/unified-runtime/source/loader/layers/sanitizer/tsan/tsan_interceptor.cpp b/unified-runtime/source/loader/layers/sanitizer/tsan/tsan_interceptor.cpp index 90904e404850a..e4e6ce0df6360 100644 --- a/unified-runtime/source/loader/layers/sanitizer/tsan/tsan_interceptor.cpp +++ b/unified-runtime/source/loader/layers/sanitizer/tsan/tsan_interceptor.cpp @@ -399,9 +399,13 @@ ur_result_t TsanInterceptor::prepareLaunch(std::shared_ptr &, // Get suggested local work size if user doesn't determine it. 
if (LaunchInfo.LocalWorkSize.empty()) { LaunchInfo.LocalWorkSize.resize(LaunchInfo.WorkDim); - auto URes = getContext()->urDdiTable.Kernel.pfnGetSuggestedLocalWorkSize( - Kernel, Queue, LaunchInfo.WorkDim, LaunchInfo.GlobalWorkOffset.data(), - LaunchInfo.GlobalWorkSize, LaunchInfo.LocalWorkSize.data()); + auto ArgNums = GetKernelNumArgs(Kernel); + auto URes = + getContext()->urDdiTable.Kernel.pfnGetSuggestedLocalWorkSizeWithArgs( + Kernel, Queue, LaunchInfo.WorkDim, + LaunchInfo.GlobalWorkOffset.data(), LaunchInfo.GlobalWorkSize, + ArgNums, KernelInfo.ArgProps.data(), + LaunchInfo.LocalWorkSize.data()); if (URes != UR_RESULT_SUCCESS) { if (URes != UR_RESULT_ERROR_UNSUPPORTED_FEATURE) { return URes; diff --git a/unified-runtime/source/loader/layers/tracing/ur_trcddi.cpp b/unified-runtime/source/loader/layers/tracing/ur_trcddi.cpp index 0e873718facba..11ccfa1c4610b 100644 --- a/unified-runtime/source/loader/layers/tracing/ur_trcddi.cpp +++ b/unified-runtime/source/loader/layers/tracing/ur_trcddi.cpp @@ -6224,6 +6224,68 @@ __urdlllocal ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSize( return result; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urKernelGetSuggestedLocalWorkSizeWithArgs +__urdlllocal ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSizeWithArgs( + /// [in] handle of the kernel + ur_kernel_handle_t hKernel, + /// [in] handle of the queue object + ur_queue_handle_t hQueue, + /// [in] number of dimensions, from 1 to 3, to specify the global + /// and work-group work-items + uint32_t numWorkDim, + /// [in] pointer to an array of numWorkDim unsigned values that specify + /// the offset used to calculate the global ID of a work-item + const size_t *pGlobalWorkOffset, + /// [in] pointer to an array of numWorkDim unsigned values that specify + /// the number of global work-items in workDim that will execute the + /// kernel function + const size_t *pGlobalWorkSize, + /// [in] 
Number of entries in pArgs + uint32_t numArgs, + /// [in][optional][range(0, numArgs)] pointer to a list of kernel arg + /// properties. + const ur_exp_kernel_arg_properties_t *pArgs, + /// [out] pointer to an array of numWorkDim unsigned values that specify + /// suggested local work size that will contain the result of the query + size_t *pSuggestedLocalWorkSize) { + auto pfnGetSuggestedLocalWorkSizeWithArgs = + getContext()->urDdiTable.Kernel.pfnGetSuggestedLocalWorkSizeWithArgs; + + if (nullptr == pfnGetSuggestedLocalWorkSizeWithArgs) + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + + ur_kernel_get_suggested_local_work_size_with_args_params_t params = { + &hKernel, &hQueue, &numWorkDim, &pGlobalWorkOffset, + &pGlobalWorkSize, &numArgs, &pArgs, &pSuggestedLocalWorkSize}; + uint64_t instance = getContext()->notify_begin( + UR_FUNCTION_KERNEL_GET_SUGGESTED_LOCAL_WORK_SIZE_WITH_ARGS, + "urKernelGetSuggestedLocalWorkSizeWithArgs", ¶ms); + + auto &logger = getContext()->logger; + UR_LOG_L(logger, INFO, " ---> urKernelGetSuggestedLocalWorkSizeWithArgs\n"); + + ur_result_t result = pfnGetSuggestedLocalWorkSizeWithArgs( + hKernel, hQueue, numWorkDim, pGlobalWorkOffset, pGlobalWorkSize, numArgs, + pArgs, pSuggestedLocalWorkSize); + + getContext()->notify_end( + UR_FUNCTION_KERNEL_GET_SUGGESTED_LOCAL_WORK_SIZE_WITH_ARGS, + "urKernelGetSuggestedLocalWorkSizeWithArgs", ¶ms, &result, instance); + + if (logger.getLevel() <= UR_LOGGER_LEVEL_INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_KERNEL_GET_SUGGESTED_LOCAL_WORK_SIZE_WITH_ARGS, + ¶ms); + UR_LOG_L(logger, INFO, + " <--- urKernelGetSuggestedLocalWorkSizeWithArgs({}) -> {};\n", + args_str.str(), result); + } + + return result; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Intercept function for urKernelSuggestMaxCooperativeGroupCount __urdlllocal ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCount( @@ -11841,6 
+11903,11 @@ __urdlllocal ur_result_t UR_APICALL urGetKernelProcAddrTable( pDdiTable->pfnGetSuggestedLocalWorkSize = ur_tracing_layer::urKernelGetSuggestedLocalWorkSize; + dditable.pfnGetSuggestedLocalWorkSizeWithArgs = + pDdiTable->pfnGetSuggestedLocalWorkSizeWithArgs; + pDdiTable->pfnGetSuggestedLocalWorkSizeWithArgs = + ur_tracing_layer::urKernelGetSuggestedLocalWorkSizeWithArgs; + dditable.pfnSetArgValue = pDdiTable->pfnSetArgValue; pDdiTable->pfnSetArgValue = ur_tracing_layer::urKernelSetArgValue; diff --git a/unified-runtime/source/loader/layers/validation/ur_valddi.cpp b/unified-runtime/source/loader/layers/validation/ur_valddi.cpp index 89adf47a79943..66c7e1c48d8b3 100644 --- a/unified-runtime/source/loader/layers/validation/ur_valddi.cpp +++ b/unified-runtime/source/loader/layers/validation/ur_valddi.cpp @@ -6887,6 +6887,78 @@ __urdlllocal ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSize( return result; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urKernelGetSuggestedLocalWorkSizeWithArgs +__urdlllocal ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSizeWithArgs( + /// [in] handle of the kernel + ur_kernel_handle_t hKernel, + /// [in] handle of the queue object + ur_queue_handle_t hQueue, + /// [in] number of dimensions, from 1 to 3, to specify the global + /// and work-group work-items + uint32_t numWorkDim, + /// [in] pointer to an array of numWorkDim unsigned values that specify + /// the offset used to calculate the global ID of a work-item + const size_t *pGlobalWorkOffset, + /// [in] pointer to an array of numWorkDim unsigned values that specify + /// the number of global work-items in workDim that will execute the + /// kernel function + const size_t *pGlobalWorkSize, + /// [in] Number of entries in pArgs + uint32_t numArgs, + /// [in][optional][range(0, numArgs)] pointer to a list of kernel arg + /// properties. 
+ const ur_exp_kernel_arg_properties_t *pArgs, + /// [out] pointer to an array of numWorkDim unsigned values that specify + /// suggested local work size that will contain the result of the query + size_t *pSuggestedLocalWorkSize) { + auto pfnGetSuggestedLocalWorkSizeWithArgs = + getContext()->urDdiTable.Kernel.pfnGetSuggestedLocalWorkSizeWithArgs; + + if (nullptr == pfnGetSuggestedLocalWorkSizeWithArgs) { + return UR_RESULT_ERROR_UNINITIALIZED; + } + + if (getContext()->enableParameterValidation) { + if (NULL == pGlobalWorkOffset) + return UR_RESULT_ERROR_INVALID_NULL_POINTER; + + if (NULL == pGlobalWorkSize) + return UR_RESULT_ERROR_INVALID_NULL_POINTER; + + if (NULL == pSuggestedLocalWorkSize) + return UR_RESULT_ERROR_INVALID_NULL_POINTER; + + if (pArgs == NULL && numArgs > 0) + return UR_RESULT_ERROR_INVALID_NULL_POINTER; + + if (NULL == hKernel) + return UR_RESULT_ERROR_INVALID_NULL_HANDLE; + + if (NULL == hQueue) + return UR_RESULT_ERROR_INVALID_NULL_HANDLE; + + if (NULL != pArgs && UR_EXP_KERNEL_ARG_TYPE_SAMPLER < pArgs->type) + return UR_RESULT_ERROR_INVALID_ENUMERATION; + } + + if (getContext()->enableLifetimeValidation && + !getContext()->refCountContext->isReferenceValid(hKernel)) { + URLOG_CTX_INVALID_REFERENCE(hKernel); + } + + if (getContext()->enableLifetimeValidation && + !getContext()->refCountContext->isReferenceValid(hQueue)) { + URLOG_CTX_INVALID_REFERENCE(hQueue); + } + + ur_result_t result = pfnGetSuggestedLocalWorkSizeWithArgs( + hKernel, hQueue, numWorkDim, pGlobalWorkOffset, pGlobalWorkSize, numArgs, + pArgs, pSuggestedLocalWorkSize); + + return result; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Intercept function for urKernelSuggestMaxCooperativeGroupCount __urdlllocal ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCount( @@ -12625,6 +12697,11 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelProcAddrTable( pDdiTable->pfnGetSuggestedLocalWorkSize = 
ur_validation_layer::urKernelGetSuggestedLocalWorkSize; + dditable.pfnGetSuggestedLocalWorkSizeWithArgs = + pDdiTable->pfnGetSuggestedLocalWorkSizeWithArgs; + pDdiTable->pfnGetSuggestedLocalWorkSizeWithArgs = + ur_validation_layer::urKernelGetSuggestedLocalWorkSizeWithArgs; + dditable.pfnSetArgValue = pDdiTable->pfnSetArgValue; pDdiTable->pfnSetArgValue = ur_validation_layer::urKernelSetArgValue; diff --git a/unified-runtime/source/loader/loader.def.in b/unified-runtime/source/loader/loader.def.in index a9a1325e97d12..3bc3e9d2957aa 100644 --- a/unified-runtime/source/loader/loader.def.in +++ b/unified-runtime/source/loader/loader.def.in @@ -157,6 +157,7 @@ EXPORTS urKernelGetNativeHandle urKernelGetSubGroupInfo urKernelGetSuggestedLocalWorkSize + urKernelGetSuggestedLocalWorkSizeWithArgs urKernelRelease urKernelRetain urKernelSetArgLocal @@ -417,6 +418,7 @@ EXPORTS urPrintKernelGetNativeHandleParams urPrintKernelGetSubGroupInfoParams urPrintKernelGetSuggestedLocalWorkSizeParams + urPrintKernelGetSuggestedLocalWorkSizeWithArgsParams urPrintKernelGroupInfo urPrintKernelInfo urPrintKernelLaunchClusterProperty diff --git a/unified-runtime/source/loader/loader.map.in b/unified-runtime/source/loader/loader.map.in index 465978fdb5f8e..3d6e0e7b9db43 100644 --- a/unified-runtime/source/loader/loader.map.in +++ b/unified-runtime/source/loader/loader.map.in @@ -157,6 +157,7 @@ urKernelGetNativeHandle; urKernelGetSubGroupInfo; urKernelGetSuggestedLocalWorkSize; + urKernelGetSuggestedLocalWorkSizeWithArgs; urKernelRelease; urKernelRetain; urKernelSetArgLocal; @@ -417,6 +418,7 @@ urPrintKernelGetNativeHandleParams; urPrintKernelGetSubGroupInfoParams; urPrintKernelGetSuggestedLocalWorkSizeParams; + urPrintKernelGetSuggestedLocalWorkSizeWithArgsParams; urPrintKernelGroupInfo; urPrintKernelInfo; urPrintKernelLaunchClusterProperty; diff --git a/unified-runtime/source/loader/ur_ldrddi.cpp b/unified-runtime/source/loader/ur_ldrddi.cpp index 0439bb0d95456..5263fafcf1661 100644 --- 
a/unified-runtime/source/loader/ur_ldrddi.cpp +++ b/unified-runtime/source/loader/ur_ldrddi.cpp @@ -3595,6 +3595,45 @@ __urdlllocal ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSize( pSuggestedLocalWorkSize); } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urKernelGetSuggestedLocalWorkSizeWithArgs +__urdlllocal ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSizeWithArgs( + /// [in] handle of the kernel + ur_kernel_handle_t hKernel, + /// [in] handle of the queue object + ur_queue_handle_t hQueue, + /// [in] number of dimensions, from 1 to 3, to specify the global + /// and work-group work-items + uint32_t numWorkDim, + /// [in] pointer to an array of numWorkDim unsigned values that specify + /// the offset used to calculate the global ID of a work-item + const size_t *pGlobalWorkOffset, + /// [in] pointer to an array of numWorkDim unsigned values that specify + /// the number of global work-items in workDim that will execute the + /// kernel function + const size_t *pGlobalWorkSize, + /// [in] Number of entries in pArgs + uint32_t numArgs, + /// [in][optional][range(0, numArgs)] pointer to a list of kernel arg + /// properties. 
+ const ur_exp_kernel_arg_properties_t *pArgs, + /// [out] pointer to an array of numWorkDim unsigned values that specify + /// suggested local work size that will contain the result of the query + size_t *pSuggestedLocalWorkSize) { + + auto *dditable = *reinterpret_cast(hKernel); + + auto *pfnGetSuggestedLocalWorkSizeWithArgs = + dditable->Kernel.pfnGetSuggestedLocalWorkSizeWithArgs; + if (nullptr == pfnGetSuggestedLocalWorkSizeWithArgs) + return UR_RESULT_ERROR_UNINITIALIZED; + + // forward to device-platform + return pfnGetSuggestedLocalWorkSizeWithArgs( + hKernel, hQueue, numWorkDim, pGlobalWorkOffset, pGlobalWorkSize, numArgs, + pArgs, pSuggestedLocalWorkSize); +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Intercept function for urKernelSuggestMaxCooperativeGroupCount __urdlllocal ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCount( @@ -7004,6 +7043,8 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelProcAddrTable( ur_loader::urKernelCreateWithNativeHandle; pDdiTable->pfnGetSuggestedLocalWorkSize = ur_loader::urKernelGetSuggestedLocalWorkSize; + pDdiTable->pfnGetSuggestedLocalWorkSizeWithArgs = + ur_loader::urKernelGetSuggestedLocalWorkSizeWithArgs; pDdiTable->pfnSetArgValue = ur_loader::urKernelSetArgValue; pDdiTable->pfnSetArgLocal = ur_loader::urKernelSetArgLocal; pDdiTable->pfnSetArgPointer = ur_loader::urKernelSetArgPointer; diff --git a/unified-runtime/source/loader/ur_libapi.cpp b/unified-runtime/source/loader/ur_libapi.cpp index 2c0989155dc0c..4ff02f2a429a1 100644 --- a/unified-runtime/source/loader/ur_libapi.cpp +++ b/unified-runtime/source/loader/ur_libapi.cpp @@ -6893,6 +6893,67 @@ ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSize( return exceptionToResult(std::current_exception()); } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Set kernel args and get the suggested local work size for a kernel. 
+/// +/// @details +/// - Query a suggested local work size for a kernel given a global size for +/// each dimension. +/// - The application may call this function from simultaneous threads for +/// the same context. +/// +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_UNINITIALIZED +/// - ::UR_RESULT_ERROR_DEVICE_LOST +/// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC +/// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE +/// + `NULL == hKernel` +/// + `NULL == hQueue` +/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +/// + `NULL == pGlobalWorkOffset` +/// + `NULL == pGlobalWorkSize` +/// + `NULL == pSuggestedLocalWorkSize` +/// + `pArgs == NULL && numArgs > 0` +/// - ::UR_RESULT_ERROR_INVALID_ENUMERATION +/// + `NULL != pArgs && ::UR_EXP_KERNEL_ARG_TYPE_SAMPLER < pArgs->type` +/// - ::UR_RESULT_ERROR_UNSUPPORTED_FEATURE +ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSizeWithArgs( + /// [in] handle of the kernel + ur_kernel_handle_t hKernel, + /// [in] handle of the queue object + ur_queue_handle_t hQueue, + /// [in] number of dimensions, from 1 to 3, to specify the global + /// and work-group work-items + uint32_t numWorkDim, + /// [in] pointer to an array of numWorkDim unsigned values that specify + /// the offset used to calculate the global ID of a work-item + const size_t *pGlobalWorkOffset, + /// [in] pointer to an array of numWorkDim unsigned values that specify + /// the number of global work-items in workDim that will execute the + /// kernel function + const size_t *pGlobalWorkSize, + /// [in] Number of entries in pArgs + uint32_t numArgs, + /// [in][optional][range(0, numArgs)] pointer to a list of kernel arg + /// properties. 
+ const ur_exp_kernel_arg_properties_t *pArgs, + /// [out] pointer to an array of numWorkDim unsigned values that specify + /// suggested local work size that will contain the result of the query + size_t *pSuggestedLocalWorkSize) try { + auto pfnGetSuggestedLocalWorkSizeWithArgs = + ur_lib::getContext() + ->urDdiTable.Kernel.pfnGetSuggestedLocalWorkSizeWithArgs; + if (nullptr == pfnGetSuggestedLocalWorkSizeWithArgs) + return UR_RESULT_ERROR_UNINITIALIZED; + + return pfnGetSuggestedLocalWorkSizeWithArgs( + hKernel, hQueue, numWorkDim, pGlobalWorkOffset, pGlobalWorkSize, numArgs, + pArgs, pSuggestedLocalWorkSize); +} catch (...) { + return exceptionToResult(std::current_exception()); +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Query the maximum number of work groups for a cooperative kernel /// diff --git a/unified-runtime/source/loader/ur_print.cpp b/unified-runtime/source/loader/ur_print.cpp index f45fceb05c676..aa043bffc1c2e 100644 --- a/unified-runtime/source/loader/ur_print.cpp +++ b/unified-runtime/source/loader/ur_print.cpp @@ -2264,6 +2264,15 @@ ur_result_t urPrintKernelGetSuggestedLocalWorkSizeParams( return str_copy(&ss, buffer, buff_size, out_size); } +ur_result_t urPrintKernelGetSuggestedLocalWorkSizeWithArgsParams( + const struct ur_kernel_get_suggested_local_work_size_with_args_params_t + *params, + char *buffer, const size_t buff_size, size_t *out_size) { + std::stringstream ss; + ss << params; + return str_copy(&ss, buffer, buff_size, out_size); +} + ur_result_t urPrintKernelSetArgValueParams( const struct ur_kernel_set_arg_value_params_t *params, char *buffer, const size_t buff_size, size_t *out_size) { diff --git a/unified-runtime/source/ur_api.cpp b/unified-runtime/source/ur_api.cpp index 44702034644e8..a8928e80c4753 100644 --- a/unified-runtime/source/ur_api.cpp +++ b/unified-runtime/source/ur_api.cpp @@ -6065,6 +6065,58 @@ ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSize( return result; 
} +/////////////////////////////////////////////////////////////////////////////// +/// @brief Set kernel args and get the suggested local work size for a kernel. +/// +/// @details +/// - Query a suggested local work size for a kernel given a global size for +/// each dimension. +/// - The application may call this function from simultaneous threads for +/// the same context. +/// +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_UNINITIALIZED +/// - ::UR_RESULT_ERROR_DEVICE_LOST +/// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC +/// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE +/// + `NULL == hKernel` +/// + `NULL == hQueue` +/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +/// + `NULL == pGlobalWorkOffset` +/// + `NULL == pGlobalWorkSize` +/// + `NULL == pSuggestedLocalWorkSize` +/// + `pArgs == NULL && numArgs > 0` +/// - ::UR_RESULT_ERROR_INVALID_ENUMERATION +/// + `NULL != pArgs && ::UR_EXP_KERNEL_ARG_TYPE_SAMPLER < pArgs->type` +/// - ::UR_RESULT_ERROR_UNSUPPORTED_FEATURE +ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSizeWithArgs( + /// [in] handle of the kernel + ur_kernel_handle_t hKernel, + /// [in] handle of the queue object + ur_queue_handle_t hQueue, + /// [in] number of dimensions, from 1 to 3, to specify the global + /// and work-group work-items + uint32_t numWorkDim, + /// [in] pointer to an array of numWorkDim unsigned values that specify + /// the offset used to calculate the global ID of a work-item + const size_t *pGlobalWorkOffset, + /// [in] pointer to an array of numWorkDim unsigned values that specify + /// the number of global work-items in workDim that will execute the + /// kernel function + const size_t *pGlobalWorkSize, + /// [in] Number of entries in pArgs + uint32_t numArgs, + /// [in][optional][range(0, numArgs)] pointer to a list of kernel arg + /// properties. 
+ const ur_exp_kernel_arg_properties_t *pArgs, + /// [out] pointer to an array of numWorkDim unsigned values that specify + /// suggested local work size that will contain the result of the query + size_t *pSuggestedLocalWorkSize) { + ur_result_t result = UR_RESULT_SUCCESS; + return result; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Query the maximum number of work groups for a cooperative kernel ///