diff --git a/unified-runtime/include/unified-runtime/ur_api.h b/unified-runtime/include/unified-runtime/ur_api.h
index d8b040fa4e763..f4042ec192b79 100644
--- a/unified-runtime/include/unified-runtime/ur_api.h
+++ b/unified-runtime/include/unified-runtime/ur_api.h
@@ -515,6 +515,8 @@ typedef enum ur_function_t {
   UR_FUNCTION_ENQUEUE_HOST_TASK_EXP = 309,
   /// Enumerator for ::urCommandBufferAppendKernelLaunchWithArgsExp
   UR_FUNCTION_COMMAND_BUFFER_APPEND_KERNEL_LAUNCH_WITH_ARGS_EXP = 310,
+  /// Enumerator for ::urKernelGetSuggestedLocalWorkSizeWithArgs
+  UR_FUNCTION_KERNEL_GET_SUGGESTED_LOCAL_WORK_SIZE_WITH_ARGS = 311,
   /// @cond
   UR_FUNCTION_FORCE_UINT32 = 0x7fffffff
   /// @endcond
@@ -9501,6 +9503,55 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSize(
     /// suggested local work size that will contain the result of the query
     size_t *pSuggestedLocalWorkSize);
 
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Set kernel args and get the suggested local work size for a kernel.
+///
+/// @details
+///     - Query a suggested local work size for a kernel given a global size for
+///       each dimension.
+///     - The application may call this function from simultaneous threads for
+///       the same context.
+///
+/// @returns
+///     - ::UR_RESULT_SUCCESS
+///     - ::UR_RESULT_ERROR_UNINITIALIZED
+///     - ::UR_RESULT_ERROR_DEVICE_LOST
+///     - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC
+///     - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE
+///         + `NULL == hKernel`
+///         + `NULL == hQueue`
+///     - ::UR_RESULT_ERROR_INVALID_NULL_POINTER
+///         + `NULL == pGlobalWorkOffset`
+///         + `NULL == pGlobalWorkSize`
+///         + `NULL == pSuggestedLocalWorkSize`
+///         + `pArgs == NULL && numArgs > 0`
+///     - ::UR_RESULT_ERROR_INVALID_ENUMERATION
+///         + `NULL != pArgs && ::UR_EXP_KERNEL_ARG_TYPE_SAMPLER < pArgs->type`
+///     - ::UR_RESULT_ERROR_UNSUPPORTED_FEATURE
+UR_APIEXPORT ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSizeWithArgs(
+    /// [in] handle of the kernel
+    ur_kernel_handle_t hKernel,
+    /// [in] handle of the queue object
+    ur_queue_handle_t hQueue,
+    /// [in] number of dimensions, from 1 to 3, to specify the global
+    /// and work-group work-items
+    uint32_t numWorkDim,
+    /// [in] pointer to an array of numWorkDim unsigned values that specify
+    /// the offset used to calculate the global ID of a work-item
+    const size_t *pGlobalWorkOffset,
+    /// [in] pointer to an array of numWorkDim unsigned values that specify
+    /// the number of global work-items in workDim that will execute the
+    /// kernel function
+    const size_t *pGlobalWorkSize,
+    /// [in] Number of entries in pArgs
+    uint32_t numArgs,
+    /// [in][optional][range(0, numArgs)] pointer to a list of kernel arg
+    /// properties.
+    const ur_exp_kernel_arg_properties_t *pArgs,
+    /// [out] pointer to an array of numWorkDim unsigned values that specify
+    /// suggested local work size that will contain the result of the query
+    size_t *pSuggestedLocalWorkSize);
+
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Query the maximum number of work groups for a cooperative kernel
 ///
@@ -14580,6 +14631,21 @@ typedef struct ur_kernel_get_suggested_local_work_size_params_t {
   size_t **ppSuggestedLocalWorkSize;
 } ur_kernel_get_suggested_local_work_size_params_t;
 
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Function parameters for urKernelGetSuggestedLocalWorkSizeWithArgs
+/// @details Each entry is a pointer to the parameter passed to the function;
+///     allowing the callback the ability to modify the parameter's value
+typedef struct ur_kernel_get_suggested_local_work_size_with_args_params_t {
+  ur_kernel_handle_t *phKernel;
+  ur_queue_handle_t *phQueue;
+  uint32_t *pnumWorkDim;
+  const size_t **ppGlobalWorkOffset;
+  const size_t **ppGlobalWorkSize;
+  uint32_t *pnumArgs;
+  const ur_exp_kernel_arg_properties_t **ppArgs;
+  size_t **ppSuggestedLocalWorkSize;
+} ur_kernel_get_suggested_local_work_size_with_args_params_t;
+
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Function parameters for urKernelSetArgValue
 /// @details Each entry is a pointer to the parameter passed to the function;
diff --git a/unified-runtime/include/unified-runtime/ur_api_funcs.def b/unified-runtime/include/unified-runtime/ur_api_funcs.def
index 06a3efa6bc437..3659aee433432 100644
--- a/unified-runtime/include/unified-runtime/ur_api_funcs.def
+++ b/unified-runtime/include/unified-runtime/ur_api_funcs.def
@@ -72,6 +72,7 @@ _UR_API(urKernelRelease)
 _UR_API(urKernelGetNativeHandle)
 _UR_API(urKernelCreateWithNativeHandle)
 _UR_API(urKernelGetSuggestedLocalWorkSize)
+_UR_API(urKernelGetSuggestedLocalWorkSizeWithArgs)
 _UR_API(urKernelSetArgValue)
 _UR_API(urKernelSetArgLocal)
 _UR_API(urKernelSetArgPointer)
diff --git a/unified-runtime/include/unified-runtime/ur_ddi.h b/unified-runtime/include/unified-runtime/ur_ddi.h
index da3747e385a9d..3c381bc98dcda 100644
--- a/unified-runtime/include/unified-runtime/ur_ddi.h
+++ b/unified-runtime/include/unified-runtime/ur_ddi.h
@@ -521,6 +521,13 @@ typedef ur_result_t(UR_APICALL *ur_pfnKernelGetSuggestedLocalWorkSize_t)(
     ur_kernel_handle_t, ur_queue_handle_t, uint32_t, const size_t *,
     const size_t *, size_t *);
 
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Function-pointer for urKernelGetSuggestedLocalWorkSizeWithArgs
+typedef ur_result_t(
+    UR_APICALL *ur_pfnKernelGetSuggestedLocalWorkSizeWithArgs_t)(
+    ur_kernel_handle_t, ur_queue_handle_t, uint32_t, const size_t *,
+    const size_t *, uint32_t, const ur_exp_kernel_arg_properties_t *, size_t *);
+
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Function-pointer for urKernelSetArgValue
 typedef ur_result_t(UR_APICALL *ur_pfnKernelSetArgValue_t)(
@@ -580,6 +587,8 @@ typedef struct ur_kernel_dditable_t {
   ur_pfnKernelGetNativeHandle_t pfnGetNativeHandle;
   ur_pfnKernelCreateWithNativeHandle_t pfnCreateWithNativeHandle;
   ur_pfnKernelGetSuggestedLocalWorkSize_t pfnGetSuggestedLocalWorkSize;
+  ur_pfnKernelGetSuggestedLocalWorkSizeWithArgs_t
+      pfnGetSuggestedLocalWorkSizeWithArgs;
   ur_pfnKernelSetArgValue_t pfnSetArgValue;
   ur_pfnKernelSetArgLocal_t pfnSetArgLocal;
   ur_pfnKernelSetArgPointer_t pfnSetArgPointer;
diff --git a/unified-runtime/include/unified-runtime/ur_print.h b/unified-runtime/include/unified-runtime/ur_print.h
index 881027a92a77e..ea22e1e4c7783 100644
--- a/unified-runtime/include/unified-runtime/ur_print.h
+++ b/unified-runtime/include/unified-runtime/ur_print.h
@@ -2169,6 +2169,19 @@ urPrintKernelGetSuggestedLocalWorkSizeParams(
     const struct ur_kernel_get_suggested_local_work_size_params_t *params,
     char *buffer, const size_t buff_size, size_t *out_size);
 
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Print ur_kernel_get_suggested_local_work_size_with_args_params_t
+/// struct
+/// @returns
+///     - ::UR_RESULT_SUCCESS
+///     - ::UR_RESULT_ERROR_INVALID_SIZE
+///         - `buff_size < out_size`
+UR_APIEXPORT ur_result_t UR_APICALL
+urPrintKernelGetSuggestedLocalWorkSizeWithArgsParams(
+    const struct ur_kernel_get_suggested_local_work_size_with_args_params_t
+        *params,
+    char *buffer, const size_t buff_size, size_t *out_size);
+
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Print ur_kernel_set_arg_value_params_t struct
 /// @returns
diff --git a/unified-runtime/include/unified-runtime/ur_print.hpp b/unified-runtime/include/unified-runtime/ur_print.hpp
index e1d30d0d1f3b3..35736578de1a6 100644
--- a/unified-runtime/include/unified-runtime/ur_print.hpp
+++ b/unified-runtime/include/unified-runtime/ur_print.hpp
@@ -1373,6 +1373,9 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_function_t value) {
   case UR_FUNCTION_COMMAND_BUFFER_APPEND_KERNEL_LAUNCH_WITH_ARGS_EXP:
     os << "UR_FUNCTION_COMMAND_BUFFER_APPEND_KERNEL_LAUNCH_WITH_ARGS_EXP";
     break;
+  case UR_FUNCTION_KERNEL_GET_SUGGESTED_LOCAL_WORK_SIZE_WITH_ARGS:
+    os << "UR_FUNCTION_KERNEL_GET_SUGGESTED_LOCAL_WORK_SIZE_WITH_ARGS";
+    break;
   default:
     os << "unknown enumerator";
     break;
@@ -14923,6 +14926,67 @@ operator<<(std::ostream &os, [[maybe_unused]] const struct
   return os;
 }
 
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Print operator for the
+/// ur_kernel_get_suggested_local_work_size_with_args_params_t type
+/// @returns
+///     std::ostream &
+inline std::ostream &
+operator<<(std::ostream &os, [[maybe_unused]] const struct
+           ur_kernel_get_suggested_local_work_size_with_args_params_t *params) {
+
+  os << ".hKernel = ";
+
+  ur::details::printPtr(os, *(params->phKernel));
+
+  os << ", ";
+  os << ".hQueue = ";
+
+  ur::details::printPtr(os, *(params->phQueue));
+
+  os << ", ";
+  os << ".numWorkDim = ";
+
+  os << *(params->pnumWorkDim);
+
+  os << ", ";
+  os << ".pGlobalWorkOffset = ";
+
+  ur::details::printPtr(os, *(params->ppGlobalWorkOffset));
+
+  os << ", ";
+  os << ".pGlobalWorkSize = ";
+
+  ur::details::printPtr(os, *(params->ppGlobalWorkSize));
+
+  os << ", ";
+  os << ".numArgs = ";
+
+  os << *(params->pnumArgs);
+
+  os << ", ";
+  os << ".pArgs = ";
+  ur::details::printPtr(os, reinterpret_cast<const void *>(*(params->ppArgs)));
+  if (*(params->ppArgs) != NULL) {
+    os << " {";
+    for (size_t i = 0; i < *params->pnumArgs; ++i) {
+      if (i != 0) {
+        os << ", ";
+      }
+
+      os << (*(params->ppArgs))[i];
+    }
+    os << "}";
+  }
+
+  os << ", ";
+  os << ".pSuggestedLocalWorkSize = ";
+
+  ur::details::printPtr(os, *(params->ppSuggestedLocalWorkSize));
+
+  return os;
+}
+
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Print operator for the ur_kernel_set_arg_value_params_t type
 /// @returns
@@ -22582,6 +22646,10 @@ inline ur_result_t UR_APICALL printFunctionParams(std::ostream &os,
     os << (const struct ur_kernel_get_suggested_local_work_size_params_t *)
             params;
   } break;
+  case UR_FUNCTION_KERNEL_GET_SUGGESTED_LOCAL_WORK_SIZE_WITH_ARGS: {
+    os << (const struct
+           ur_kernel_get_suggested_local_work_size_with_args_params_t *)params;
+  } break;
   case UR_FUNCTION_KERNEL_SET_ARG_VALUE: {
     os << (const struct ur_kernel_set_arg_value_params_t *)params;
   } break;
diff --git a/unified-runtime/scripts/core/kernel.yml b/unified-runtime/scripts/core/kernel.yml
index a420a8568f3e5..6a0ab3600ca61 100644
--- a/unified-runtime/scripts/core/kernel.yml
+++ b/unified-runtime/scripts/core/kernel.yml
@@ -602,6 +602,55 @@ returns:
     - $X_RESULT_ERROR_UNSUPPORTED_FEATURE
 --- #--------------------------------------------------------------------------
 type: function
+desc: "Set kernel args and get the suggested local work size for a kernel."
+class: $xKernel
+name: GetSuggestedLocalWorkSizeWithArgs
+ordinal: "0"
+details:
+    - "Query a suggested local work size for a kernel given a global size for each dimension."
+    - "The application may call this function from simultaneous threads for the same context."
+params:
+    - type: $x_kernel_handle_t
+      name: hKernel
+      desc: |
+            [in] handle of the kernel
+    - type: $x_queue_handle_t
+      name: hQueue
+      desc: |
+            [in] handle of the queue object
+    - type: uint32_t
+      name: numWorkDim
+      desc: |
+            [in] number of dimensions, from 1 to 3, to specify the global
+            and work-group work-items
+    - type: const size_t*
+      name: pGlobalWorkOffset
+      desc: |
+            [in] pointer to an array of numWorkDim unsigned values that specify
+            the offset used to calculate the global ID of a work-item
+    - type: const size_t*
+      name: pGlobalWorkSize
+      desc: |
+            [in] pointer to an array of numWorkDim unsigned values that specify
+            the number of global work-items in workDim that will execute the
+            kernel function
+    - type: uint32_t
+      name: numArgs
+      desc: "[in] Number of entries in pArgs"
+    - type: "const $x_exp_kernel_arg_properties_t*"
+      name: pArgs
+      desc: "[in][optional][range(0, numArgs)] pointer to a list of kernel arg properties."
+    - type: size_t*
+      name: pSuggestedLocalWorkSize
+      desc: |
+            [out] pointer to an array of numWorkDim unsigned values that specify
+            suggested local work size that will contain the result of the query
+returns:
+    - $X_RESULT_ERROR_INVALID_NULL_POINTER:
+        - "`pArgs == NULL && numArgs > 0`"
+    - $X_RESULT_ERROR_UNSUPPORTED_FEATURE
+--- #--------------------------------------------------------------------------
+type: function
 desc: "Query the maximum number of work groups for a cooperative kernel"
 class: $xKernel
 name: SuggestMaxCooperativeGroupCount
diff --git a/unified-runtime/scripts/core/registry.yml b/unified-runtime/scripts/core/registry.yml
index 8762563e9a569..288469a2f0d37 100644
--- a/unified-runtime/scripts/core/registry.yml
+++ b/unified-runtime/scripts/core/registry.yml
@@ -730,7 +730,10 @@ etors:
 - name: COMMAND_BUFFER_APPEND_KERNEL_LAUNCH_WITH_ARGS_EXP
   desc: Enumerator for $xCommandBufferAppendKernelLaunchWithArgsExp
   value: '310'
-max_id: '310'
+- name: KERNEL_GET_SUGGESTED_LOCAL_WORK_SIZE_WITH_ARGS
+  desc: Enumerator for $xKernelGetSuggestedLocalWorkSizeWithArgs
+  value: '311'
+max_id: '311'
 ---
 type: enum
 desc: Defines structure types
diff --git a/unified-runtime/source/adapters/cuda/kernel.cpp b/unified-runtime/source/adapters/cuda/kernel.cpp
index 41c0873807995..1550457fa13c6 100644
--- a/unified-runtime/source/adapters/cuda/kernel.cpp
+++ b/unified-runtime/source/adapters/cuda/kernel.cpp
@@ -445,6 +445,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSize(
   return UR_RESULT_SUCCESS;
 }
 
+UR_APIEXPORT ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSizeWithArgs(
+    ur_kernel_handle_t hKernel, ur_queue_handle_t hQueue, uint32_t workDim,
+    const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
+    uint32_t /*numArgs*/, const ur_exp_kernel_arg_properties_t * /*pArgs*/,
+    size_t *pSuggestedLocalWorkSize) {
+  // numArgs and pArgs are not consulted; the plain query handles the request.
+  return urKernelGetSuggestedLocalWorkSize(hKernel, hQueue, workDim,
+                                           pGlobalWorkOffset, pGlobalWorkSize,
+                                           pSuggestedLocalWorkSize);
+}
+
 UR_APIEXPORT ur_result_t UR_APICALL urKernelSetSpecializationConstants(
     ur_kernel_handle_t, uint32_t, const ur_specialization_constant_info_t *) {
   return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
diff --git a/unified-runtime/source/adapters/cuda/ur_interface_loader.cpp b/unified-runtime/source/adapters/cuda/ur_interface_loader.cpp
index 0da25b3507997..80068cb3ddf7a 100644
--- a/unified-runtime/source/adapters/cuda/ur_interface_loader.cpp
+++ b/unified-runtime/source/adapters/cuda/ur_interface_loader.cpp
@@ -141,6 +141,8 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelProcAddrTable(
   pDdiTable->pfnSetExecInfo = urKernelSetExecInfo;
   pDdiTable->pfnSetSpecializationConstants = urKernelSetSpecializationConstants;
   pDdiTable->pfnGetSuggestedLocalWorkSize = urKernelGetSuggestedLocalWorkSize;
+  pDdiTable->pfnGetSuggestedLocalWorkSizeWithArgs =
+      urKernelGetSuggestedLocalWorkSizeWithArgs;
   pDdiTable->pfnSuggestMaxCooperativeGroupCount =
       urKernelSuggestMaxCooperativeGroupCount;
   return UR_RESULT_SUCCESS;
diff --git a/unified-runtime/source/adapters/hip/kernel.cpp b/unified-runtime/source/adapters/hip/kernel.cpp
index 980449020d125..1cbb798c6b8d2 100644
--- a/unified-runtime/source/adapters/hip/kernel.cpp
+++ b/unified-runtime/source/adapters/hip/kernel.cpp
@@ -373,3 +373,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSize(
             pSuggestedLocalWorkSize);
   return UR_RESULT_SUCCESS;
 }
+
+UR_APIEXPORT ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSizeWithArgs(
+    ur_kernel_handle_t hKernel, ur_queue_handle_t hQueue, uint32_t workDim,
+    const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
+    uint32_t /*numArgs*/, const ur_exp_kernel_arg_properties_t * /*pArgs*/,
+    size_t *pSuggestedLocalWorkSize) {
+  // numArgs and pArgs are not consulted; the plain query handles the request.
+  return urKernelGetSuggestedLocalWorkSize(hKernel, hQueue, workDim,
+                                           pGlobalWorkOffset, pGlobalWorkSize,
+                                           pSuggestedLocalWorkSize);
+}
diff --git a/unified-runtime/source/adapters/hip/ur_interface_loader.cpp b/unified-runtime/source/adapters/hip/ur_interface_loader.cpp
index 56ffca3ae7094..8e7c481f60947 100644
--- a/unified-runtime/source/adapters/hip/ur_interface_loader.cpp
+++ b/unified-runtime/source/adapters/hip/ur_interface_loader.cpp
@@ -141,6 +141,8 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelProcAddrTable(
   pDdiTable->pfnSetExecInfo = urKernelSetExecInfo;
   pDdiTable->pfnSetSpecializationConstants = urKernelSetSpecializationConstants;
   pDdiTable->pfnGetSuggestedLocalWorkSize = urKernelGetSuggestedLocalWorkSize;
+  pDdiTable->pfnGetSuggestedLocalWorkSizeWithArgs =
+      urKernelGetSuggestedLocalWorkSizeWithArgs;
   pDdiTable->pfnSuggestMaxCooperativeGroupCount =
       urKernelSuggestMaxCooperativeGroupCount;
   return UR_RESULT_SUCCESS;
diff --git a/unified-runtime/source/adapters/level_zero/kernel.cpp b/unified-runtime/source/adapters/level_zero/kernel.cpp
index 4f5d72c0d88c7..ca8c7f67bc63c 100644
--- a/unified-runtime/source/adapters/level_zero/kernel.cpp
+++ b/unified-runtime/source/adapters/level_zero/kernel.cpp
@@ -56,6 +56,17 @@ ur_result_t urKernelGetSuggestedLocalWorkSize(
   return UR_RESULT_SUCCESS;
 }
 
+ur_result_t urKernelGetSuggestedLocalWorkSizeWithArgs(
+    ur_kernel_handle_t hKernel, ur_queue_handle_t hQueue, uint32_t workDim,
+    const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
+    uint32_t /*numArgs*/, const ur_exp_kernel_arg_properties_t * /*pArgs*/,
+    size_t *pSuggestedLocalWorkSize) {
+  // NOTE(review): numArgs/pArgs are ignored here; confirm no args are needed.
+  return ur::level_zero::urKernelGetSuggestedLocalWorkSize(
+      hKernel, hQueue, workDim, pGlobalWorkOffset, pGlobalWorkSize,
+      pSuggestedLocalWorkSize);
+}
+
 ur_result_t urKernelSetArgValueHelper(
     ur_kernel_handle_t Kernel,
     /// [in] argument index in range [0, num args - 1]
diff --git a/unified-runtime/source/adapters/level_zero/ur_interface_loader.cpp b/unified-runtime/source/adapters/level_zero/ur_interface_loader.cpp
index e1fcdc4dd5739..e1bfdb7120bc0 100644
--- a/unified-runtime/source/adapters/level_zero/ur_interface_loader.cpp
+++ b/unified-runtime/source/adapters/level_zero/ur_interface_loader.cpp
@@ -315,6 +315,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urGetKernelProcAddrTable(
       ur::level_zero::urKernelCreateWithNativeHandle;
   pDdiTable->pfnGetSuggestedLocalWorkSize =
       ur::level_zero::urKernelGetSuggestedLocalWorkSize;
+  pDdiTable->pfnGetSuggestedLocalWorkSizeWithArgs =
+      ur::level_zero::urKernelGetSuggestedLocalWorkSizeWithArgs;
   pDdiTable->pfnSetArgValue = ur::level_zero::urKernelSetArgValue;
   pDdiTable->pfnSetArgLocal = ur::level_zero::urKernelSetArgLocal;
   pDdiTable->pfnSetArgPointer = ur::level_zero::urKernelSetArgPointer;
diff --git a/unified-runtime/source/adapters/level_zero/ur_interface_loader.hpp b/unified-runtime/source/adapters/level_zero/ur_interface_loader.hpp
index f64c404ec3c9a..e906015b67855 100644
--- a/unified-runtime/source/adapters/level_zero/ur_interface_loader.hpp
+++ b/unified-runtime/source/adapters/level_zero/ur_interface_loader.hpp
@@ -483,6 +483,11 @@ ur_result_t urKernelGetSuggestedLocalWorkSize(ur_kernel_handle_t hKernel,
                                               const size_t *pGlobalWorkOffset,
                                               const size_t *pGlobalWorkSize,
                                               size_t *pSuggestedLocalWorkSize);
+ur_result_t urKernelGetSuggestedLocalWorkSizeWithArgs(
+    ur_kernel_handle_t hKernel, ur_queue_handle_t hQueue, uint32_t numWorkDim,
+    const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
+    uint32_t numArgs, const ur_exp_kernel_arg_properties_t *pArgs,
+    size_t *pSuggestedLocalWorkSize);
 ur_result_t urKernelSuggestMaxCooperativeGroupCount(
     ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, uint32_t workDim,
     const size_t *pLocalWorkSize, size_t dynamicSharedMemorySize,
diff --git a/unified-runtime/source/adapters/level_zero/v2/kernel.cpp b/unified-runtime/source/adapters/level_zero/v2/kernel.cpp
index 5f4a51f7f2ef5..54eb0862d9201 100644
--- a/unified-runtime/source/adapters/level_zero/v2/kernel.cpp
+++ b/unified-runtime/source/adapters/level_zero/v2/kernel.cpp
@@ -719,6 +719,17 @@ ur_result_t urKernelGetSuggestedLocalWorkSize(
   return UR_RESULT_SUCCESS;
 }
 
+ur_result_t urKernelGetSuggestedLocalWorkSizeWithArgs(
+    ur_kernel_handle_t hKernel, ur_queue_handle_t hQueue, uint32_t workDim,
+    const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
+    uint32_t /*numArgs*/, const ur_exp_kernel_arg_properties_t * /*pArgs*/,
+    size_t *pSuggestedLocalWorkSize) {
+  // NOTE(review): numArgs/pArgs are ignored here; confirm no args are needed.
+  return ur::level_zero::urKernelGetSuggestedLocalWorkSize(
+      hKernel, hQueue, workDim, pGlobalWorkOffset, pGlobalWorkSize,
+      pSuggestedLocalWorkSize);
+}
+
 ur_result_t urKernelSuggestMaxCooperativeGroupCount(
     ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, uint32_t workDim,
     const size_t *pLocalWorkSize, size_t dynamicSharedMemorySize,
diff --git a/unified-runtime/source/adapters/mock/ur_mockddi.cpp b/unified-runtime/source/adapters/mock/ur_mockddi.cpp
index 1603b657a459a..f4e553891f027 100644
--- a/unified-runtime/source/adapters/mock/ur_mockddi.cpp
+++ b/unified-runtime/source/adapters/mock/ur_mockddi.cpp
@@ -7473,6 +7473,73 @@ __urdlllocal ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSize(
   return exceptionToResult(std::current_exception());
 }
 
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Intercept function for urKernelGetSuggestedLocalWorkSizeWithArgs
+__urdlllocal ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSizeWithArgs(
+    /// [in] handle of the kernel
+    ur_kernel_handle_t hKernel,
+    /// [in] handle of the queue object
+    ur_queue_handle_t hQueue,
+    /// [in] number of dimensions, from 1 to 3, to specify the global
+    /// and work-group work-items
+    uint32_t numWorkDim,
+    /// [in] pointer to an array of numWorkDim unsigned values that specify
+    /// the offset used to calculate the global ID of a work-item
+    const size_t *pGlobalWorkOffset,
+    /// [in] pointer to an array of numWorkDim unsigned values that specify
+    /// the number of global work-items in workDim that will execute the
+    /// kernel function
+    const size_t *pGlobalWorkSize,
+    /// [in] Number of entries in pArgs
+    uint32_t numArgs,
+    /// [in][optional][range(0, numArgs)] pointer to a list of kernel arg
+    /// properties.
+    const ur_exp_kernel_arg_properties_t *pArgs,
+    /// [out] pointer to an array of numWorkDim unsigned values that specify
+    /// suggested local work size that will contain the result of the query
+    size_t *pSuggestedLocalWorkSize) try {
+  ur_result_t result = UR_RESULT_SUCCESS;
+
+  ur_kernel_get_suggested_local_work_size_with_args_params_t params = {
+      &hKernel,         &hQueue,  &numWorkDim, &pGlobalWorkOffset,
+      &pGlobalWorkSize, &numArgs, &pArgs,      &pSuggestedLocalWorkSize};
+
+  auto beforeCallback = reinterpret_cast<ur_mock_callback_t>(
+      mock::getCallbacks().get_before_callback(
+          "urKernelGetSuggestedLocalWorkSizeWithArgs"));
+  if (beforeCallback) {
+    result = beforeCallback(&params);
+    if (result != UR_RESULT_SUCCESS) {
+      return result;
+    }
+  }
+
+  auto replaceCallback = reinterpret_cast<ur_mock_callback_t>(
+      mock::getCallbacks().get_replace_callback(
+          "urKernelGetSuggestedLocalWorkSizeWithArgs"));
+  if (replaceCallback) {
+    result = replaceCallback(&params);
+  } else {
+
+    result = UR_RESULT_SUCCESS;
+  }
+
+  if (result != UR_RESULT_SUCCESS) {
+    return result;
+  }
+
+  auto afterCallback = reinterpret_cast<ur_mock_callback_t>(
+      mock::getCallbacks().get_after_callback(
+          "urKernelGetSuggestedLocalWorkSizeWithArgs"));
+  if (afterCallback) {
+    return afterCallback(&params);
+  }
+
+  return result;
+} catch (...) {
+  return exceptionToResult(std::current_exception());
+}
+
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Intercept function for urKernelSuggestMaxCooperativeGroupCount
 __urdlllocal ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCount(
@@ -13664,6 +13731,9 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelProcAddrTable(
   pDdiTable->pfnGetSuggestedLocalWorkSize =
       driver::urKernelGetSuggestedLocalWorkSize;
 
+  pDdiTable->pfnGetSuggestedLocalWorkSizeWithArgs =
+      driver::urKernelGetSuggestedLocalWorkSizeWithArgs;
+
   pDdiTable->pfnSetArgValue = driver::urKernelSetArgValue;
 
   pDdiTable->pfnSetArgLocal = driver::urKernelSetArgLocal;
diff --git a/unified-runtime/source/adapters/native_cpu/kernel.cpp b/unified-runtime/source/adapters/native_cpu/kernel.cpp
index 021a95bdadc5c..9fe7e3cbefaed 100644
--- a/unified-runtime/source/adapters/native_cpu/kernel.cpp
+++ b/unified-runtime/source/adapters/native_cpu/kernel.cpp
@@ -267,6 +267,18 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSize(
   return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
 }
 
+// Suggested local work size queries are not implemented by the Native CPU
+// adapter: all arguments are ignored and the call always reports
+// UR_RESULT_ERROR_UNSUPPORTED_FEATURE.
+UR_APIEXPORT ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSizeWithArgs(
+    ur_kernel_handle_t /*hKernel*/, ur_queue_handle_t /*hQueue*/,
+    uint32_t /*workDim*/, const size_t * /*pGlobalWorkOffset*/,
+    const size_t * /*pGlobalWorkSize*/, uint32_t /*numArgs*/,
+    const ur_exp_kernel_arg_properties_t * /*pArgs*/,
+    size_t * /*pSuggestedLocalWorkSize*/) {
+  return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
+}
+
 UR_APIEXPORT ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCount(
     [[maybe_unused]] ur_kernel_handle_t hKernel,
     [[maybe_unused]] ur_device_handle_t hDevice,
diff --git a/unified-runtime/source/adapters/native_cpu/ur_interface_loader.cpp b/unified-runtime/source/adapters/native_cpu/ur_interface_loader.cpp
index 78fe857721aa6..556d0c106f229 100644
--- a/unified-runtime/source/adapters/native_cpu/ur_interface_loader.cpp
+++ b/unified-runtime/source/adapters/native_cpu/ur_interface_loader.cpp
@@ -141,6 +141,8 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelProcAddrTable(
   pDdiTable->pfnSetExecInfo = urKernelSetExecInfo;
   pDdiTable->pfnSetSpecializationConstants = urKernelSetSpecializationConstants;
   pDdiTable->pfnGetSuggestedLocalWorkSize = urKernelGetSuggestedLocalWorkSize;
+  pDdiTable->pfnGetSuggestedLocalWorkSizeWithArgs =
+      urKernelGetSuggestedLocalWorkSizeWithArgs;
   pDdiTable->pfnSuggestMaxCooperativeGroupCount =
       urKernelSuggestMaxCooperativeGroupCount;
   return UR_RESULT_SUCCESS;
diff --git a/unified-runtime/source/adapters/offload/kernel.cpp b/unified-runtime/source/adapters/offload/kernel.cpp
index ae155e2ad607c..5eada6fda135c 100644
--- a/unified-runtime/source/adapters/offload/kernel.cpp
+++ b/unified-runtime/source/adapters/offload/kernel.cpp
@@ -152,6 +152,18 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSize(
   return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
 }
 
+// Suggested local work size queries are not implemented by the offload
+// adapter: all arguments are ignored and the call always reports
+// UR_RESULT_ERROR_UNSUPPORTED_FEATURE.
+UR_APIEXPORT ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSizeWithArgs(
+    ur_kernel_handle_t /*hKernel*/, ur_queue_handle_t /*hQueue*/,
+    uint32_t /*workDim*/, const size_t * /*pGlobalWorkOffset*/,
+    const size_t * /*pGlobalWorkSize*/, uint32_t /*numArgs*/,
+    const ur_exp_kernel_arg_properties_t * /*pArgs*/,
+    size_t * /*pSuggestedLocalWorkSize*/) {
+  return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
+}
+
 UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgSampler(
     ur_kernel_handle_t, uint32_t, const ur_kernel_arg_sampler_properties_t *,
     ur_sampler_handle_t) {
diff --git a/unified-runtime/source/adapters/offload/ur_interface_loader.cpp b/unified-runtime/source/adapters/offload/ur_interface_loader.cpp
index 46db10a7325bd..f65781361e35e 100644
--- a/unified-runtime/source/adapters/offload/ur_interface_loader.cpp
+++ b/unified-runtime/source/adapters/offload/ur_interface_loader.cpp
@@ -140,6 +140,8 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelProcAddrTable(
   pDdiTable->pfnSetExecInfo = urKernelSetExecInfo;
   pDdiTable->pfnSetSpecializationConstants = urKernelSetSpecializationConstants;
   pDdiTable->pfnGetSuggestedLocalWorkSize = urKernelGetSuggestedLocalWorkSize;
+  pDdiTable->pfnGetSuggestedLocalWorkSizeWithArgs =
+      urKernelGetSuggestedLocalWorkSizeWithArgs;
   return UR_RESULT_SUCCESS;
 }
 
diff --git a/unified-runtime/source/adapters/opencl/kernel.cpp b/unified-runtime/source/adapters/opencl/kernel.cpp
index b673f9743766c..10a7f9055dfa8 100644
--- a/unified-runtime/source/adapters/opencl/kernel.cpp
+++ b/unified-runtime/source/adapters/opencl/kernel.cpp
@@ -529,6 +529,61 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSize(
   return UR_RESULT_SUCCESS;
 }
 
+// Sets every entry of pArgs on the OpenCL kernel (stopping at the first
+// failure), then answers the query via urKernelGetSuggestedLocalWorkSize.
+UR_APIEXPORT ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSizeWithArgs(
+    ur_kernel_handle_t hKernel, ur_queue_handle_t hQueue, uint32_t workDim,
+    const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
+    uint32_t numArgs, const ur_exp_kernel_arg_properties_t *pArgs,
+    size_t *pSuggestedLocalWorkSize) {
+  if (numArgs > 0 && !pArgs)
+    return UR_RESULT_ERROR_INVALID_NULL_POINTER;
+  // Resolved on first use: only USM pointer args need this extension.
+  clSetKernelArgMemPointerINTEL_fn SetKernelArgMemPointerPtr = nullptr;
+  const auto Kernel = hKernel->CLKernel;
+  for (uint32_t i = 0; i < numArgs; i++) {
+    const ur_exp_kernel_arg_properties_t &Arg = pArgs[i];
+    const auto Idx = static_cast<cl_uint>(Arg.index);
+    switch (Arg.type) {
+    case UR_EXP_KERNEL_ARG_TYPE_LOCAL:
+      CL_RETURN_ON_FAILURE(clSetKernelArg(Kernel, Idx, Arg.size, nullptr));
+      break;
+    case UR_EXP_KERNEL_ARG_TYPE_VALUE:
+      CL_RETURN_ON_FAILURE(
+          clSetKernelArg(Kernel, Idx, Arg.size, Arg.value.value));
+      break;
+    case UR_EXP_KERNEL_ARG_TYPE_MEM_OBJ: {
+      const auto MemHandle = Arg.value.memObjTuple.hMem;
+      cl_mem mem = MemHandle ? MemHandle->CLMemory : nullptr;
+      CL_RETURN_ON_FAILURE(clSetKernelArg(Kernel, Idx, Arg.size, &mem));
+      break;
+    }
+    case UR_EXP_KERNEL_ARG_TYPE_POINTER:
+      if (!SetKernelArgMemPointerPtr) {
+        auto &FnCache = ur::cl::getAdapter()->fnCache;
+        UR_RETURN_ON_FAILURE(
+            cl_ext::getExtFuncFromContext<clSetKernelArgMemPointerINTEL_fn>(
+                hKernel->Context->CLContext,
+                FnCache.clSetKernelArgMemPointerINTELCache,
+                cl_ext::SetKernelArgMemPointerName,
+                &SetKernelArgMemPointerPtr));
+      }
+      CL_RETURN_ON_FAILURE(
+          SetKernelArgMemPointerPtr(Kernel, Idx, Arg.value.pointer));
+      break;
+    case UR_EXP_KERNEL_ARG_TYPE_SAMPLER:
+      CL_RETURN_ON_FAILURE(clSetKernelArg(Kernel, Idx, Arg.size,
+                                          &Arg.value.sampler->CLSampler));
+      break;
+    default:
+      return UR_RESULT_ERROR_INVALID_ENUMERATION;
+    }
+  }
+  return urKernelGetSuggestedLocalWorkSize(hKernel, hQueue, workDim,
+                                           pGlobalWorkOffset, pGlobalWorkSize,
+                                           pSuggestedLocalWorkSize);
+}
+
 UR_APIEXPORT ur_result_t UR_APICALL urKernelSetSpecializationConstants(
     ur_kernel_handle_t, uint32_t, const ur_specialization_constant_info_t *) {
   return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
diff --git a/unified-runtime/source/adapters/opencl/ur_interface_loader.cpp b/unified-runtime/source/adapters/opencl/ur_interface_loader.cpp
index c753ce498cb40..a9c0166255d50 100644
--- a/unified-runtime/source/adapters/opencl/ur_interface_loader.cpp
+++ b/unified-runtime/source/adapters/opencl/ur_interface_loader.cpp
@@ -142,6 +142,8 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelProcAddrTable(
   pDdiTable->pfnSetExecInfo = urKernelSetExecInfo;
   pDdiTable->pfnSetSpecializationConstants = urKernelSetSpecializationConstants;
   pDdiTable->pfnGetSuggestedLocalWorkSize = urKernelGetSuggestedLocalWorkSize;
+  pDdiTable->pfnGetSuggestedLocalWorkSizeWithArgs =
+      urKernelGetSuggestedLocalWorkSizeWithArgs;
   pDdiTable->pfnSuggestMaxCooperativeGroupCount =
       urKernelSuggestMaxCooperativeGroupCount;
   return UR_RESULT_SUCCESS;
diff --git a/unified-runtime/source/loader/layers/sanitizer/asan/asan_ddi.cpp b/unified-runtime/source/loader/layers/sanitizer/asan/asan_ddi.cpp
index e273680ea75d5..dc4f01005fa50 100644
--- a/unified-runtime/source/loader/layers/sanitizer/asan/asan_ddi.cpp
+++ b/unified-runtime/source/loader/layers/sanitizer/asan/asan_ddi.cpp
@@ -1744,63 +1744,6 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchWithArgsExp(
   std::memcpy(KernelInfo.ArgProps.data(), pArgs,
               numArgs * sizeof(ur_exp_kernel_arg_properties_t));
 
-  // We need to set all the args now rather than letting LaunchWithArgs handle
-  // them. This is because some implementations of
-  // urKernelGetSuggestedLocalWorkSize, which is used in preLaunchKernel, rely
-  // on all the args being set.
-  for (uint32_t ArgPropIndex = 0; ArgPropIndex < numArgs; ArgPropIndex++) {
-    switch (pArgs[ArgPropIndex].type) {
-    case UR_EXP_KERNEL_ARG_TYPE_LOCAL: {
-      UR_CALL(ur_sanitizer_layer::asan::urKernelSetArgLocal(
-          hKernel, pArgs[ArgPropIndex].index, pArgs[ArgPropIndex].size,
-          nullptr));
-      KernelInfo.ArgProps[ArgPropIndex].size =
-          KernelInfo.LocalArgs[ArgPropIndex].SizeWithRedZone;
-      break;
-    }
-    case UR_EXP_KERNEL_ARG_TYPE_POINTER: {
-      UR_CALL(ur_sanitizer_layer::asan::urKernelSetArgPointer(
-          hKernel, pArgs[ArgPropIndex].index, nullptr,
-          pArgs[ArgPropIndex].value.pointer));
-      break;
-    }
-    case UR_EXP_KERNEL_ARG_TYPE_VALUE: {
-      UR_CALL(ur_sanitizer_layer::asan::urKernelSetArgValue(
-          hKernel, pArgs[ArgPropIndex].index, pArgs[ArgPropIndex].size, nullptr,
-          pArgs[ArgPropIndex].value.value));
-      break;
-    }
-    case UR_EXP_KERNEL_ARG_TYPE_MEM_OBJ: {
-      ur_kernel_arg_mem_obj_properties_t Properties = {
-          UR_STRUCTURE_TYPE_KERNEL_ARG_MEM_OBJ_PROPERTIES, nullptr,
-          pArgs[ArgPropIndex].value.memObjTuple.flags};
-      UR_CALL(ur_sanitizer_layer::asan::urKernelSetArgMemObj(
-          hKernel, pArgs[ArgPropIndex].index, &Properties,
-          pArgs[ArgPropIndex].value.memObjTuple.hMem));
-      if (std::shared_ptr<MemBuffer> MemBuffer =
-              getAsanInterceptor()->getMemBuffer(
-                  pArgs[ArgPropIndex].value.memObjTuple.hMem)) {
-        char *Handle = nullptr;
-        UR_CALL(MemBuffer->getHandle(GetDevice(hQueue), Handle));
-        KernelInfo.ArgProps[ArgPropIndex].type =
-            ur_exp_kernel_arg_type_t::UR_EXP_KERNEL_ARG_TYPE_POINTER;
-        KernelInfo.ArgProps[ArgPropIndex].value.pointer = Handle;
-      }
-      break;
-    }
-    case UR_EXP_KERNEL_ARG_TYPE_SAMPLER: {
-      auto pfnKernelSetArgSampler =
-          getContext()->urDdiTable.Kernel.pfnSetArgSampler;
-      UR_CALL(pfnKernelSetArgSampler(hKernel, pArgs[ArgPropIndex].index,
-                                     nullptr,
-                                     pArgs[ArgPropIndex].value.sampler));
-      break;
-    }
-    default:
-      return UR_RESULT_ERROR_INVALID_ENUMERATION;
-    }
-  }
-
   LaunchInfo LaunchInfo(GetContext(hQueue), GetDevice(hQueue), pGlobalWorkSize,
                         pLocalWorkSize, pGlobalWorkOffset, workDim);
   UR_CALL(LaunchInfo.Data.syncToDevice(hQueue));
diff --git a/unified-runtime/source/loader/layers/sanitizer/asan/asan_interceptor.cpp b/unified-runtime/source/loader/layers/sanitizer/asan/asan_interceptor.cpp
index 4288ca38236f7..62d21d6e1bf66 100644
--- a/unified-runtime/source/loader/layers/sanitizer/asan/asan_interceptor.cpp
+++ b/unified-runtime/source/loader/layers/sanitizer/asan/asan_interceptor.cpp
@@ -800,9 +800,12 @@ ur_result_t AsanInterceptor::prepareLaunch(
 
   if (LaunchInfo.LocalWorkSize.empty()) {
     LaunchInfo.LocalWorkSize.resize(LaunchInfo.WorkDim);
-    auto URes = getContext()->urDdiTable.Kernel.pfnGetSuggestedLocalWorkSize(
-        Kernel, Queue, LaunchInfo.WorkDim, LaunchInfo.GlobalWorkOffset.data(),
-        LaunchInfo.GlobalWorkSize, LaunchInfo.LocalWorkSize.data());
+    auto URes =
+        getContext()->urDdiTable.Kernel.pfnGetSuggestedLocalWorkSizeWithArgs(
+            Kernel, Queue, LaunchInfo.WorkDim,
+            LaunchInfo.GlobalWorkOffset.data(), LaunchInfo.GlobalWorkSize,
+            ArgNums, KernelInfo.ArgProps.data(),
+            LaunchInfo.LocalWorkSize.data());
     if (URes != UR_RESULT_SUCCESS) {
       if (URes != UR_RESULT_ERROR_UNSUPPORTED_FEATURE) {
         return URes;
diff --git a/unified-runtime/source/loader/layers/sanitizer/msan/msan_interceptor.cpp b/unified-runtime/source/loader/layers/sanitizer/msan/msan_interceptor.cpp
index a16c78c552722..b9b1cdab08241 100644
--- a/unified-runtime/source/loader/layers/sanitizer/msan/msan_interceptor.cpp
+++ b/unified-runtime/source/loader/layers/sanitizer/msan/msan_interceptor.cpp
@@ -509,9 +509,13 @@ ur_result_t MsanInterceptor::prepareLaunch(
 
   if (LaunchInfo.LocalWorkSize.empty()) {
     LaunchInfo.LocalWorkSize.resize(LaunchInfo.WorkDim);
-    auto URes = getContext()->urDdiTable.Kernel.pfnGetSuggestedLocalWorkSize(
-        Kernel, Queue, LaunchInfo.WorkDim, LaunchInfo.GlobalWorkOffset.data(),
-        LaunchInfo.GlobalWorkSize, LaunchInfo.LocalWorkSize.data());
+    auto ArgNums = GetKernelNumArgs(Kernel);
+    auto URes =
+        getContext()->urDdiTable.Kernel.pfnGetSuggestedLocalWorkSizeWithArgs(
+            Kernel, Queue, LaunchInfo.WorkDim,
+            LaunchInfo.GlobalWorkOffset.data(), LaunchInfo.GlobalWorkSize,
+            ArgNums, KernelInfo.ArgProps.data(),
+            LaunchInfo.LocalWorkSize.data());
     if (URes != UR_RESULT_SUCCESS) {
       if (URes != UR_RESULT_ERROR_UNSUPPORTED_FEATURE) {
         return URes;
diff --git a/unified-runtime/source/loader/layers/sanitizer/tsan/tsan_interceptor.cpp b/unified-runtime/source/loader/layers/sanitizer/tsan/tsan_interceptor.cpp
index 90904e404850a..e4e6ce0df6360 100644
--- a/unified-runtime/source/loader/layers/sanitizer/tsan/tsan_interceptor.cpp
+++ b/unified-runtime/source/loader/layers/sanitizer/tsan/tsan_interceptor.cpp
@@ -399,9 +399,13 @@ ur_result_t TsanInterceptor::prepareLaunch(std::shared_ptr<ContextInfo> &,
   // Get suggested local work size if user doesn't determine it.
   if (LaunchInfo.LocalWorkSize.empty()) {
     LaunchInfo.LocalWorkSize.resize(LaunchInfo.WorkDim);
-    auto URes = getContext()->urDdiTable.Kernel.pfnGetSuggestedLocalWorkSize(
-        Kernel, Queue, LaunchInfo.WorkDim, LaunchInfo.GlobalWorkOffset.data(),
-        LaunchInfo.GlobalWorkSize, LaunchInfo.LocalWorkSize.data());
+    auto ArgNums = GetKernelNumArgs(Kernel);
+    auto URes =
+        getContext()->urDdiTable.Kernel.pfnGetSuggestedLocalWorkSizeWithArgs(
+            Kernel, Queue, LaunchInfo.WorkDim,
+            LaunchInfo.GlobalWorkOffset.data(), LaunchInfo.GlobalWorkSize,
+            ArgNums, KernelInfo.ArgProps.data(),
+            LaunchInfo.LocalWorkSize.data());
     if (URes != UR_RESULT_SUCCESS) {
       if (URes != UR_RESULT_ERROR_UNSUPPORTED_FEATURE) {
         return URes;
diff --git a/unified-runtime/source/loader/layers/tracing/ur_trcddi.cpp b/unified-runtime/source/loader/layers/tracing/ur_trcddi.cpp
index 0e873718facba..11ccfa1c4610b 100644
--- a/unified-runtime/source/loader/layers/tracing/ur_trcddi.cpp
+++ b/unified-runtime/source/loader/layers/tracing/ur_trcddi.cpp
@@ -6224,6 +6224,68 @@ __urdlllocal ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSize(
   return result;
 }
 
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Intercept function for urKernelGetSuggestedLocalWorkSizeWithArgs
+__urdlllocal ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSizeWithArgs(
+    /// [in] handle of the kernel
+    ur_kernel_handle_t hKernel,
+    /// [in] handle of the queue object
+    ur_queue_handle_t hQueue,
+    /// [in] number of dimensions, from 1 to 3, to specify the global
+    /// and work-group work-items
+    uint32_t numWorkDim,
+    /// [in] pointer to an array of numWorkDim unsigned values that specify
+    /// the offset used to calculate the global ID of a work-item
+    const size_t *pGlobalWorkOffset,
+    /// [in] pointer to an array of numWorkDim unsigned values that specify
+    /// the number of global work-items in workDim that will execute the
+    /// kernel function
+    const size_t *pGlobalWorkSize,
+    /// [in] Number of entries in pArgs
+    uint32_t numArgs,
+    /// [in][optional][range(0, numArgs)] pointer to a list of kernel arg
+    /// properties.
+    const ur_exp_kernel_arg_properties_t *pArgs,
+    /// [out] pointer to an array of numWorkDim unsigned values that specify
+    /// suggested local work size that will contain the result of the query
+    size_t *pSuggestedLocalWorkSize) {
+  auto pfnGetSuggestedLocalWorkSizeWithArgs =
+      getContext()->urDdiTable.Kernel.pfnGetSuggestedLocalWorkSizeWithArgs;
+  // No entry point stored in the DDI table: nothing to trace or forward to.
+  if (nullptr == pfnGetSuggestedLocalWorkSizeWithArgs)
+    return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
+  // Addresses of all arguments, shared by notify_begin/notify_end and printer.
+  ur_kernel_get_suggested_local_work_size_with_args_params_t params = {
+      &hKernel,         &hQueue,  &numWorkDim, &pGlobalWorkOffset,
+      &pGlobalWorkSize, &numArgs, &pArgs,      &pSuggestedLocalWorkSize};
+  uint64_t instance = getContext()->notify_begin(
+      UR_FUNCTION_KERNEL_GET_SUGGESTED_LOCAL_WORK_SIZE_WITH_ARGS,
+      "urKernelGetSuggestedLocalWorkSizeWithArgs", &params);
+
+  auto &logger = getContext()->logger;
+  UR_LOG_L(logger, INFO, "   ---> urKernelGetSuggestedLocalWorkSizeWithArgs\n");
+  // Forward the call with the caller's arguments unchanged.
+  ur_result_t result = pfnGetSuggestedLocalWorkSizeWithArgs(
+      hKernel, hQueue, numWorkDim, pGlobalWorkOffset, pGlobalWorkSize, numArgs,
+      pArgs, pSuggestedLocalWorkSize);
+  // Pair the end notification with notify_begin through `instance`.
+  getContext()->notify_end(
+      UR_FUNCTION_KERNEL_GET_SUGGESTED_LOCAL_WORK_SIZE_WITH_ARGS,
+      "urKernelGetSuggestedLocalWorkSizeWithArgs", &params, &result, instance);
+  // Build the parameter string only if the logger level is INFO or lower.
+  if (logger.getLevel() <= UR_LOGGER_LEVEL_INFO) {
+    std::ostringstream args_str;
+    ur::extras::printFunctionParams(
+        args_str, UR_FUNCTION_KERNEL_GET_SUGGESTED_LOCAL_WORK_SIZE_WITH_ARGS,
+        &params);
+    UR_LOG_L(logger, INFO,
+             "   <--- urKernelGetSuggestedLocalWorkSizeWithArgs({}) -> {};\n",
+             args_str.str(), result);
+  }
+
+  return result;
+}
+
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Intercept function for urKernelSuggestMaxCooperativeGroupCount
 __urdlllocal ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCount(
@@ -11841,6 +11903,11 @@ __urdlllocal ur_result_t UR_APICALL urGetKernelProcAddrTable(
   pDdiTable->pfnGetSuggestedLocalWorkSize =
       ur_tracing_layer::urKernelGetSuggestedLocalWorkSize;
 
+  dditable.pfnGetSuggestedLocalWorkSizeWithArgs =
+      pDdiTable->pfnGetSuggestedLocalWorkSizeWithArgs;
+  pDdiTable->pfnGetSuggestedLocalWorkSizeWithArgs =
+      ur_tracing_layer::urKernelGetSuggestedLocalWorkSizeWithArgs;
+
   dditable.pfnSetArgValue = pDdiTable->pfnSetArgValue;
   pDdiTable->pfnSetArgValue = ur_tracing_layer::urKernelSetArgValue;
 
diff --git a/unified-runtime/source/loader/layers/validation/ur_valddi.cpp b/unified-runtime/source/loader/layers/validation/ur_valddi.cpp
index 89adf47a79943..66c7e1c48d8b3 100644
--- a/unified-runtime/source/loader/layers/validation/ur_valddi.cpp
+++ b/unified-runtime/source/loader/layers/validation/ur_valddi.cpp
@@ -6887,6 +6887,78 @@ __urdlllocal ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSize(
   return result;
 }
 
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Intercept function for urKernelGetSuggestedLocalWorkSizeWithArgs
+/// @details When enabled, validates the parameters and the kernel/queue
+/// references, then forwards the call to the entry point in the DDI table.
+__urdlllocal ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSizeWithArgs(
+    /// [in] handle of the kernel
+    ur_kernel_handle_t hKernel,
+    /// [in] handle of the queue object
+    ur_queue_handle_t hQueue,
+    /// [in] number of dimensions, from 1 to 3, to specify the global
+    /// and work-group work-items
+    uint32_t numWorkDim,
+    /// [in] pointer to an array of numWorkDim unsigned values that specify
+    /// the offset used to calculate the global ID of a work-item
+    const size_t *pGlobalWorkOffset,
+    /// [in] pointer to an array of numWorkDim unsigned values that specify
+    /// the number of global work-items in workDim that will execute the
+    /// kernel function
+    const size_t *pGlobalWorkSize,
+    /// [in] Number of entries in pArgs
+    uint32_t numArgs,
+    /// [in][optional][range(0, numArgs)] pointer to a list of kernel arg
+    /// properties.
+    const ur_exp_kernel_arg_properties_t *pArgs,
+    /// [out] pointer to an array of numWorkDim unsigned values that specify
+    /// suggested local work size that will contain the result of the query
+    size_t *pSuggestedLocalWorkSize) {
+  auto pfnGetSuggestedLocalWorkSizeWithArgs =
+      getContext()->urDdiTable.Kernel.pfnGetSuggestedLocalWorkSizeWithArgs;
+
+  if (nullptr == pfnGetSuggestedLocalWorkSizeWithArgs) {
+    return UR_RESULT_ERROR_UNINITIALIZED;
+  }
+
+  if (getContext()->enableParameterValidation) {
+    if (NULL == pGlobalWorkOffset)
+      return UR_RESULT_ERROR_INVALID_NULL_POINTER;
+    if (NULL == pGlobalWorkSize)
+      return UR_RESULT_ERROR_INVALID_NULL_POINTER;
+    if (NULL == pSuggestedLocalWorkSize)
+      return UR_RESULT_ERROR_INVALID_NULL_POINTER;
+    if (pArgs == NULL && numArgs > 0)
+      return UR_RESULT_ERROR_INVALID_NULL_POINTER;
+
+    if (NULL == hKernel)
+      return UR_RESULT_ERROR_INVALID_NULL_HANDLE;
+    if (NULL == hQueue)
+      return UR_RESULT_ERROR_INVALID_NULL_HANDLE;
+
+    // Check the type of every entry, not only pArgs[0]. Bounding the loop by
+    // numArgs also means no entry is read when the list is empty.
+    for (uint32_t i = 0; NULL != pArgs && i < numArgs; ++i) {
+      if (UR_EXP_KERNEL_ARG_TYPE_SAMPLER < pArgs[i].type)
+        return UR_RESULT_ERROR_INVALID_ENUMERATION;
+    }
+  }
+
+  if (getContext()->enableLifetimeValidation &&
+      !getContext()->refCountContext->isReferenceValid(hKernel)) {
+    URLOG_CTX_INVALID_REFERENCE(hKernel);
+  }
+
+  if (getContext()->enableLifetimeValidation &&
+      !getContext()->refCountContext->isReferenceValid(hQueue)) {
+    URLOG_CTX_INVALID_REFERENCE(hQueue);
+  }
+
+  return pfnGetSuggestedLocalWorkSizeWithArgs(
+      hKernel, hQueue, numWorkDim, pGlobalWorkOffset, pGlobalWorkSize, numArgs,
+      pArgs, pSuggestedLocalWorkSize);
+}
+
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Intercept function for urKernelSuggestMaxCooperativeGroupCount
 __urdlllocal ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCount(
@@ -12625,6 +12697,11 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelProcAddrTable(
   pDdiTable->pfnGetSuggestedLocalWorkSize =
       ur_validation_layer::urKernelGetSuggestedLocalWorkSize;
 
+  dditable.pfnGetSuggestedLocalWorkSizeWithArgs =
+      pDdiTable->pfnGetSuggestedLocalWorkSizeWithArgs;
+  pDdiTable->pfnGetSuggestedLocalWorkSizeWithArgs =
+      ur_validation_layer::urKernelGetSuggestedLocalWorkSizeWithArgs;
+
   dditable.pfnSetArgValue = pDdiTable->pfnSetArgValue;
   pDdiTable->pfnSetArgValue = ur_validation_layer::urKernelSetArgValue;
 
diff --git a/unified-runtime/source/loader/loader.def.in b/unified-runtime/source/loader/loader.def.in
index a9a1325e97d12..3bc3e9d2957aa 100644
--- a/unified-runtime/source/loader/loader.def.in
+++ b/unified-runtime/source/loader/loader.def.in
@@ -157,6 +157,7 @@ EXPORTS
 	urKernelGetNativeHandle
 	urKernelGetSubGroupInfo
 	urKernelGetSuggestedLocalWorkSize
+	urKernelGetSuggestedLocalWorkSizeWithArgs
 	urKernelRelease
 	urKernelRetain
 	urKernelSetArgLocal
@@ -417,6 +418,7 @@ EXPORTS
 	urPrintKernelGetNativeHandleParams
 	urPrintKernelGetSubGroupInfoParams
 	urPrintKernelGetSuggestedLocalWorkSizeParams
+	urPrintKernelGetSuggestedLocalWorkSizeWithArgsParams
 	urPrintKernelGroupInfo
 	urPrintKernelInfo
 	urPrintKernelLaunchClusterProperty
diff --git a/unified-runtime/source/loader/loader.map.in b/unified-runtime/source/loader/loader.map.in
index 465978fdb5f8e..3d6e0e7b9db43 100644
--- a/unified-runtime/source/loader/loader.map.in
+++ b/unified-runtime/source/loader/loader.map.in
@@ -157,6 +157,7 @@
 		urKernelGetNativeHandle;
 		urKernelGetSubGroupInfo;
 		urKernelGetSuggestedLocalWorkSize;
+		urKernelGetSuggestedLocalWorkSizeWithArgs;
 		urKernelRelease;
 		urKernelRetain;
 		urKernelSetArgLocal;
@@ -417,6 +418,7 @@
 		urPrintKernelGetNativeHandleParams;
 		urPrintKernelGetSubGroupInfoParams;
 		urPrintKernelGetSuggestedLocalWorkSizeParams;
+		urPrintKernelGetSuggestedLocalWorkSizeWithArgsParams;
 		urPrintKernelGroupInfo;
 		urPrintKernelInfo;
 		urPrintKernelLaunchClusterProperty;
diff --git a/unified-runtime/source/loader/ur_ldrddi.cpp b/unified-runtime/source/loader/ur_ldrddi.cpp
index 0439bb0d95456..5263fafcf1661 100644
--- a/unified-runtime/source/loader/ur_ldrddi.cpp
+++ b/unified-runtime/source/loader/ur_ldrddi.cpp
@@ -3595,6 +3595,45 @@ __urdlllocal ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSize(
                                       pSuggestedLocalWorkSize);
 }
 
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Intercept function for urKernelGetSuggestedLocalWorkSizeWithArgs
+__urdlllocal ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSizeWithArgs(
+    /// [in] handle of the kernel
+    ur_kernel_handle_t hKernel,
+    /// [in] handle of the queue object
+    ur_queue_handle_t hQueue,
+    /// [in] number of dimensions, from 1 to 3, to specify the global
+    /// and work-group work-items
+    uint32_t numWorkDim,
+    /// [in] pointer to an array of numWorkDim unsigned values that specify
+    /// the offset used to calculate the global ID of a work-item
+    const size_t *pGlobalWorkOffset,
+    /// [in] pointer to an array of numWorkDim unsigned values that specify
+    /// the number of global work-items in workDim that will execute the
+    /// kernel function
+    const size_t *pGlobalWorkSize,
+    /// [in] Number of entries in pArgs
+    uint32_t numArgs,
+    /// [in][optional][range(0, numArgs)] pointer to a list of kernel arg
+    /// properties.
+    const ur_exp_kernel_arg_properties_t *pArgs,
+    /// [out] pointer to an array of numWorkDim unsigned values that specify
+    /// suggested local work size that will contain the result of the query
+    size_t *pSuggestedLocalWorkSize) {
+  // Read the ur_dditable_t pointer stored at the start of the hKernel object.
+  auto *dditable = *reinterpret_cast<ur_dditable_t **>(hKernel);
+  // NOTE(review): hKernel is dereferenced above without a null check.
+  auto *pfnGetSuggestedLocalWorkSizeWithArgs =
+      dditable->Kernel.pfnGetSuggestedLocalWorkSizeWithArgs;
+  if (nullptr == pfnGetSuggestedLocalWorkSizeWithArgs)
+    return UR_RESULT_ERROR_UNINITIALIZED;
+
+  // forward to the entry point taken from the handle's dditable
+  return pfnGetSuggestedLocalWorkSizeWithArgs(
+      hKernel, hQueue, numWorkDim, pGlobalWorkOffset, pGlobalWorkSize, numArgs,
+      pArgs, pSuggestedLocalWorkSize);
+}
+
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Intercept function for urKernelSuggestMaxCooperativeGroupCount
 __urdlllocal ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCount(
@@ -7004,6 +7043,8 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelProcAddrTable(
           ur_loader::urKernelCreateWithNativeHandle;
       pDdiTable->pfnGetSuggestedLocalWorkSize =
           ur_loader::urKernelGetSuggestedLocalWorkSize;
+      pDdiTable->pfnGetSuggestedLocalWorkSizeWithArgs =
+          ur_loader::urKernelGetSuggestedLocalWorkSizeWithArgs;
       pDdiTable->pfnSetArgValue = ur_loader::urKernelSetArgValue;
       pDdiTable->pfnSetArgLocal = ur_loader::urKernelSetArgLocal;
       pDdiTable->pfnSetArgPointer = ur_loader::urKernelSetArgPointer;
diff --git a/unified-runtime/source/loader/ur_libapi.cpp b/unified-runtime/source/loader/ur_libapi.cpp
index 2c0989155dc0c..4ff02f2a429a1 100644
--- a/unified-runtime/source/loader/ur_libapi.cpp
+++ b/unified-runtime/source/loader/ur_libapi.cpp
@@ -6893,6 +6893,67 @@ ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSize(
   return exceptionToResult(std::current_exception());
 }
 
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Set kernel args and get the suggested local work size for a kernel.
+///
+/// @details
+///     - Query a suggested local work size for a kernel given a global size for
+///       each dimension.
+///     - The application may call this function from simultaneous threads for
+///       the same context.
+///
+/// @returns
+///     - ::UR_RESULT_SUCCESS
+///     - ::UR_RESULT_ERROR_UNINITIALIZED
+///     - ::UR_RESULT_ERROR_DEVICE_LOST
+///     - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC
+///     - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE
+///         + `NULL == hKernel`
+///         + `NULL == hQueue`
+///     - ::UR_RESULT_ERROR_INVALID_NULL_POINTER
+///         + `NULL == pGlobalWorkOffset`
+///         + `NULL == pGlobalWorkSize`
+///         + `NULL == pSuggestedLocalWorkSize`
+///         + `pArgs == NULL && numArgs > 0`
+///     - ::UR_RESULT_ERROR_INVALID_ENUMERATION
+///         + `NULL != pArgs && ::UR_EXP_KERNEL_ARG_TYPE_SAMPLER < pArgs->type`
+///     - ::UR_RESULT_ERROR_UNSUPPORTED_FEATURE
+ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSizeWithArgs(
+    /// [in] handle of the kernel
+    ur_kernel_handle_t hKernel,
+    /// [in] handle of the queue object
+    ur_queue_handle_t hQueue,
+    /// [in] number of dimensions, from 1 to 3, to specify the global
+    /// and work-group work-items
+    uint32_t numWorkDim,
+    /// [in] pointer to an array of numWorkDim unsigned values that specify
+    /// the offset used to calculate the global ID of a work-item
+    const size_t *pGlobalWorkOffset,
+    /// [in] pointer to an array of numWorkDim unsigned values that specify
+    /// the number of global work-items in workDim that will execute the
+    /// kernel function
+    const size_t *pGlobalWorkSize,
+    /// [in] Number of entries in pArgs
+    uint32_t numArgs,
+    /// [in][optional][range(0, numArgs)] pointer to a list of kernel arg
+    /// properties.
+    const ur_exp_kernel_arg_properties_t *pArgs,
+    /// [out] pointer to an array of numWorkDim unsigned values that specify
+    /// suggested local work size that will contain the result of the query
+    size_t *pSuggestedLocalWorkSize) try {
+  auto pfnGetSuggestedLocalWorkSizeWithArgs =
+      ur_lib::getContext()
+          ->urDdiTable.Kernel.pfnGetSuggestedLocalWorkSizeWithArgs;
+  if (nullptr == pfnGetSuggestedLocalWorkSizeWithArgs)
+    return UR_RESULT_ERROR_UNINITIALIZED;
+  // Dispatch through the entry point in the library context's DDI table.
+  return pfnGetSuggestedLocalWorkSizeWithArgs(
+      hKernel, hQueue, numWorkDim, pGlobalWorkOffset, pGlobalWorkSize, numArgs,
+      pArgs, pSuggestedLocalWorkSize);
+} catch (...) { // any exception is turned into a result code below
+  return exceptionToResult(std::current_exception());
+}
+
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Query the maximum number of work groups for a cooperative kernel
 ///
diff --git a/unified-runtime/source/loader/ur_print.cpp b/unified-runtime/source/loader/ur_print.cpp
index f45fceb05c676..aa043bffc1c2e 100644
--- a/unified-runtime/source/loader/ur_print.cpp
+++ b/unified-runtime/source/loader/ur_print.cpp
@@ -2264,6 +2264,15 @@ ur_result_t urPrintKernelGetSuggestedLocalWorkSizeParams(
   return str_copy(&ss, buffer, buff_size, out_size);
 }
 
+ur_result_t urPrintKernelGetSuggestedLocalWorkSizeWithArgsParams(
+    const struct ur_kernel_get_suggested_local_work_size_with_args_params_t
+        *params,
+    char *buffer, const size_t buff_size, size_t *out_size) {
+  std::stringstream ss;
+  ss << params; // NOTE(review): relies on an operator<< for this pointer type
+  return str_copy(&ss, buffer, buff_size, out_size);
+}
+
 ur_result_t urPrintKernelSetArgValueParams(
     const struct ur_kernel_set_arg_value_params_t *params, char *buffer,
     const size_t buff_size, size_t *out_size) {
diff --git a/unified-runtime/source/ur_api.cpp b/unified-runtime/source/ur_api.cpp
index 44702034644e8..a8928e80c4753 100644
--- a/unified-runtime/source/ur_api.cpp
+++ b/unified-runtime/source/ur_api.cpp
@@ -6065,6 +6065,58 @@ ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSize(
   return result;
 }
 
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Set kernel args and get the suggested local work size for a kernel.
+///
+/// @details
+///     - Query a suggested local work size for a kernel given a global size for
+///       each dimension.
+///     - The application may call this function from simultaneous threads for
+///       the same context.
+///
+/// @returns
+///     - ::UR_RESULT_SUCCESS
+///     - ::UR_RESULT_ERROR_UNINITIALIZED
+///     - ::UR_RESULT_ERROR_DEVICE_LOST
+///     - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC
+///     - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE
+///         + `NULL == hKernel`
+///         + `NULL == hQueue`
+///     - ::UR_RESULT_ERROR_INVALID_NULL_POINTER
+///         + `NULL == pGlobalWorkOffset`
+///         + `NULL == pGlobalWorkSize`
+///         + `NULL == pSuggestedLocalWorkSize`
+///         + `pArgs == NULL && numArgs > 0`
+///     - ::UR_RESULT_ERROR_INVALID_ENUMERATION
+///         + `NULL != pArgs && ::UR_EXP_KERNEL_ARG_TYPE_SAMPLER < pArgs->type`
+///     - ::UR_RESULT_ERROR_UNSUPPORTED_FEATURE
+ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSizeWithArgs(
+    /// [in] handle of the kernel
+    ur_kernel_handle_t hKernel,
+    /// [in] handle of the queue object
+    ur_queue_handle_t hQueue,
+    /// [in] number of dimensions, from 1 to 3, to specify the global
+    /// and work-group work-items
+    uint32_t numWorkDim,
+    /// [in] pointer to an array of numWorkDim unsigned values that specify
+    /// the offset used to calculate the global ID of a work-item
+    const size_t *pGlobalWorkOffset,
+    /// [in] pointer to an array of numWorkDim unsigned values that specify
+    /// the number of global work-items in workDim that will execute the
+    /// kernel function
+    const size_t *pGlobalWorkSize,
+    /// [in] Number of entries in pArgs
+    uint32_t numArgs,
+    /// [in][optional][range(0, numArgs)] pointer to a list of kernel arg
+    /// properties.
+    const ur_exp_kernel_arg_properties_t *pArgs,
+    /// [out] pointer to an array of numWorkDim unsigned values that specify
+    /// suggested local work size that will contain the result of the query
+    size_t *pSuggestedLocalWorkSize) {
+  ur_result_t result = UR_RESULT_SUCCESS; // stub: arguments are unused
+  return result;
+}
+
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Query the maximum number of work groups for a cooperative kernel
 ///