diff --git a/sycl/include/sycl/detail/nd_range_view.hpp b/sycl/include/sycl/detail/nd_range_view.hpp index effc7a1bc8fe4..c69f94e1ed9e8 100644 --- a/sycl/include/sycl/detail/nd_range_view.hpp +++ b/sycl/include/sycl/detail/nd_range_view.hpp @@ -29,6 +29,10 @@ class nd_range_view { nd_range_view &operator=(const nd_range_view &Desc) = default; nd_range_view &operator=(nd_range_view &&Desc) = default; + template + nd_range_view(sycl::range &N) + : MGlobalSize(&(N[0])), MDims(size_t(Dims_)) {} + template nd_range_view(sycl::nd_range &ExecutionRange) : MGlobalSize(&(ExecutionRange.globalSize[0])), diff --git a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp index 864ea780083fe..8dbb1ab87e6bf 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp @@ -192,10 +192,26 @@ template void parallel_for(queue Q, range Range, const KernelType &KernelObj, ReductionsT &&...Reductions) { - submit(std::move(Q), [&](handler &CGH) { - parallel_for(CGH, Range, KernelObj, - std::forward(Reductions)...); - }); + using LambdaArgType = + sycl::detail::lambda_arg_type>; + using TransformedArgType = std::conditional_t< + std::is_integral::value && Dimensions == 1, + item, + typename detail::TransformUserItemType::type>; + + // TODO The handler-less path does not support reductions, and + // kernel functions with the kernel_handler type argument yet. + if constexpr (sizeof...(ReductionsT) == 0 && + !(detail::KernelLambdaHasKernelHandlerArgT< + KernelType, TransformedArgType>::value)) { + detail::submit_kernel_direct_parallel_for(std::move(Q), Range, + KernelObj); + } else { + submit(std::move(Q), [&](handler &CGH) { + parallel_for(CGH, Range, KernelObj, + std::forward(Reductions)...); + }); + } } template void parallel_for(queue Q, launch_config, Properties> Config, const KernelType &KernelObj, ReductionsT &&...Reductions) { - submit(std::move(Q), [&](handler &CGH) { - parallel_for(CGH, Config, KernelObj, - std::forward(Reductions)...); - }); + using LambdaArgType = + sycl::detail::lambda_arg_type>; + using TransformedArgType = std::conditional_t< + std::is_integral::value && Dimensions == 1, + item, + typename detail::TransformUserItemType::type>; + + // TODO The handler-less path does not support reductions, and + // kernel functions with the kernel_handler type argument yet. + if constexpr (sizeof...(ReductionsT) == 0 && + !(detail::KernelLambdaHasKernelHandlerArgT< + KernelType, TransformedArgType>::value)) { + ext::oneapi::experimental::detail::LaunchConfigAccess, + Properties> + LaunchConfigAccess(Config); + + detail::submit_kernel_direct_parallel_for( + std::move(Q), LaunchConfigAccess.getRange(), KernelObj, {}, + LaunchConfigAccess.getProperties()); + } else { + submit(std::move(Q), [&](handler &CGH) { + parallel_for(CGH, Config, KernelObj, + std::forward(Reductions)...); + }); + } } template diff --git a/sycl/include/sycl/handler.hpp b/sycl/include/sycl/handler.hpp index 1d904a5b81ad2..9db974fd0bfbc 100644 --- a/sycl/include/sycl/handler.hpp +++ b/sycl/include/sycl/handler.hpp @@ -279,6 +279,13 @@ template bool range_size_fits_in_size_t(const range &r) { return true; } +template struct TransformUserItemType { + using type = std::conditional_t< + std::is_convertible_v, LambdaArgType>, nd_item, + std::conditional_t, LambdaArgType>, + item, LambdaArgType>>; +}; + } // namespace detail /// Command group handler class. @@ -778,13 +785,6 @@ class __SYCL_EXPORT handler { device get_device() const; - template struct TransformUserItemType { - using type = std::conditional_t< - std::is_convertible_v, LambdaArgType>, nd_item, - std::conditional_t, LambdaArgType>, - item, LambdaArgType>>; - }; - /// Defines and invokes a SYCL kernel function for the specified range. /// /// The SYCL kernel function is defined as a lambda function or a named @@ -823,7 +823,7 @@ class __SYCL_EXPORT handler { // sycl::item/sycl::nd_item to transport item information using TransformedArgType = std::conditional_t< std::is_integral::value && Dims == 1, item, - typename TransformUserItemType::type>; + typename detail::TransformUserItemType::type>; static_assert(!std::is_same_v>, "Kernel argument cannot have a sycl::nd_item type in " @@ -1349,7 +1349,7 @@ class __SYCL_EXPORT handler { using LambdaArgType = sycl::detail::lambda_arg_type>; using TransformedArgType = std::conditional_t< std::is_integral::value && Dims == 1, item, - typename TransformUserItemType::type>; + typename detail::TransformUserItemType::type>; wrap_kernel(KernelFunc, {} /*Props*/, NumWorkItems, WorkItemOffset); } diff --git a/sycl/include/sycl/khr/free_function_commands.hpp b/sycl/include/sycl/khr/free_function_commands.hpp index 66ac24ed8736f..f08353e2997c0 100644 --- a/sycl/include/sycl/khr/free_function_commands.hpp +++ b/sycl/include/sycl/khr/free_function_commands.hpp @@ -54,19 +54,53 @@ template void launch(const queue &q, range<1> r, const KernelType &k, const sycl::detail::code_location &codeLoc = sycl::detail::code_location::current()) { - submit(q, [&](handler &h) { launch(h, r, k); }, codeLoc); + using LambdaArgType = sycl::detail::lambda_arg_type>; + using TransformedArgType = std::conditional_t< + std::is_integral::value, item<1>, + typename detail::TransformUserItemType<1, LambdaArgType>::type>; + + // TODO The handler-less path does not support kernel functions with the + // kernel_handler type argument yet. + if constexpr (!(detail::KernelLambdaHasKernelHandlerArgT< + KernelType, TransformedArgType>::value)) { + detail::submit_kernel_direct_parallel_for(q, r, k); + } else { + submit(q, [&](handler &h) { launch(h, r, k); }, codeLoc); + } } template void launch(const queue &q, range<2> r, const KernelType &k, const sycl::detail::code_location &codeLoc = sycl::detail::code_location::current()) { - submit(q, [&](handler &h) { launch(h, r, k); }, codeLoc); + using LambdaArgType = sycl::detail::lambda_arg_type>; + using TransformedArgType = + typename detail::TransformUserItemType<2, LambdaArgType>::type; + + // TODO The handler-less path does not support kernel functions with the + // kernel_handler type argument yet. + if constexpr (!(detail::KernelLambdaHasKernelHandlerArgT< + KernelType, TransformedArgType>::value)) { + detail::submit_kernel_direct_parallel_for(q, r, k); + } else { + submit(q, [&](handler &h) { launch(h, r, k); }, codeLoc); + } } template void launch(const queue &q, range<3> r, const KernelType &k, const sycl::detail::code_location &codeLoc = sycl::detail::code_location::current()) { - submit(q, [&](handler &h) { launch(h, r, k); }, codeLoc); + using LambdaArgType = sycl::detail::lambda_arg_type>; + using TransformedArgType = + typename detail::TransformUserItemType<3, LambdaArgType>::type; + + // TODO The handler-less path does not support kernel functions with the + // kernel_handler type argument yet. + if constexpr (!(detail::KernelLambdaHasKernelHandlerArgT< + KernelType, TransformedArgType>::value)) { + detail::submit_kernel_direct_parallel_for(q, r, k); + } else { + submit(q, [&](handler &h) { launch(h, r, k); }, codeLoc); + } } template diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index b48e488b5fc36..ab4d4ea78fa04 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -24,6 +24,7 @@ #include #include #include // for OwnerLessBase +#include // for range rounding utils #include // for device #include // for device_selector #include // for event @@ -126,8 +127,8 @@ class __SYCL_EXPORT SubmissionInfo { } // namespace _V1 -template auto submit_kernel_direct( @@ -135,77 +136,7 @@ auto submit_kernel_direct( KernelTypeUniversalRef &&KernelFunc, sycl::span DepEvents, const PropertiesT &ExtraProps = ext::oneapi::experimental::empty_properties_t{}, - const detail::code_location &CodeLoc = detail::code_location::current()) { - detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); - - using KernelType = std::decay_t; - - using NameT = - typename detail::get_kernel_name_t::name; - - detail::KernelWrapper::wrap(KernelFunc); - - HostKernelRef - HostKernel(std::forward(KernelFunc)); - - // Instantiating the kernel on the host improves debugging. - // Passing this pointer to another translation unit prevents optimization. -#ifndef NDEBUG - // TODO: call library to prevent dropping call due to optimization. - (void) - detail::GetInstantiateKernelOnHostPtr(); -#endif - - detail::DeviceKernelInfo *DeviceKernelInfoPtr = - &detail::getDeviceKernelInfo(); - constexpr auto Info = detail::CompileTimeKernelInfo; - - assert(Info.Name != std::string_view{} && "Kernel must have a name!"); - - static_assert( - Info.Name == std::string_view{} || sizeof(KernelType) == Info.KernelSize, - "Unexpected kernel lambda size. This can be caused by an " - "external host compiler producing a lambda with an " - "unexpected layout. This is a limitation of the compiler." - "In many cases the difference is related to capturing constexpr " - "variables. In such cases removing constexpr specifier aligns the " - "captures between the host compiler and the device compiler." - "\n" - "In case of MSVC, passing " - "-fsycl-host-compiler-options='/std:c++latest' " - "might also help."); - - detail::KernelPropertyHolderStructTy ParsedProperties; - if constexpr (ext::oneapi::experimental::detail::HasKernelPropertiesGetMethod< - const KernelType &>::value) { - // Merge properties via get() and manually specified properties. - // get() method is used for specifying kernel properties but properties - // passed via launch_config (ExtraProps) should be kernel launch properties. - // They are mutually exclusive, so there should not be any conflict when - // merging properties. merge_properties() throws if there's a conflict. - auto MergedProps = - sycl::ext::oneapi::experimental::detail::merge_properties( - ExtraProps, - KernelFunc.get(ext::oneapi::experimental::properties_tag{})); - - ParsedProperties = extractKernelProperties(MergedProps); - } else { - ParsedProperties = extractKernelProperties(ExtraProps); - } - - if constexpr (EventNeeded) { - return submit_kernel_direct_with_event_impl( - Queue, RangeView, HostKernel, DeviceKernelInfoPtr, DepEvents, - ParsedProperties, TlsCodeLocCapture.query(), - TlsCodeLocCapture.isToplevel()); - } else { - submit_kernel_direct_without_event_impl( - Queue, RangeView, HostKernel, DeviceKernelInfoPtr, DepEvents, - ParsedProperties, TlsCodeLocCapture.query(), - TlsCodeLocCapture.isToplevel()); - } -} + const detail::code_location &CodeLoc = detail::code_location::current()); template Range, KernelTypeUniversalRef &&KernelFunc, sycl::span DepEvents = {}, const PropertiesT &Props = ext::oneapi::experimental::empty_properties_t{}, - const detail::code_location &CodeLoc = detail::code_location::current()) { - - using KernelType = std::decay_t; - - using LambdaArgType = - sycl::detail::lambda_arg_type>; - static_assert( - std::is_convertible_v, LambdaArgType>, - "Kernel argument of a sycl::parallel_for with sycl::nd_range " - "must be either sycl::nd_item or be convertible from sycl::nd_item"); - using TransformedArgType = sycl::nd_item; - -#ifndef __SYCL_DEVICE_ONLY__ - detail::checkValueRange(Range); -#endif + const detail::code_location &CodeLoc = detail::code_location::current()); - return submit_kernel_direct( - Queue, detail::nd_range_view(Range), - std::forward(KernelFunc), DepEvents, Props, - CodeLoc); -} +template +auto submit_kernel_direct_parallel_for( + const queue &Queue, range Range, KernelTypeUniversalRef &&KernelFunc, + sycl::span DepEvents = {}, + const PropertiesT &Props = ext::oneapi::experimental::empty_properties_t{}, + const detail::code_location &CodeLoc = detail::code_location::current()); template DepEvents = {}, const PropertiesT &Props = ext::oneapi::experimental::empty_properties_t{}, - const detail::code_location &CodeLoc = detail::code_location::current()) { - - return submit_kernel_direct( - Queue, detail::nd_range_view(), - std::forward(KernelFunc), DepEvents, Props, - CodeLoc); -} + const detail::code_location &CodeLoc = detail::code_location::current()); } // namespace detail @@ -3813,11 +3723,27 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { RestT &&...Rest) { constexpr detail::code_location CodeLoc = getCodeLocation(); detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); - return submit( - [&](handler &CGH) { - CGH.template parallel_for(Range, Properties, Rest...); - }, - TlsCodeLocCapture.query()); + using KernelType = + std::decay_t>; + using LambdaArgType = sycl::detail::lambda_arg_type>; + using TransformedArgType = std::conditional_t< + std::is_integral::value && Dims == 1, item, + typename detail::TransformUserItemType::type>; + + // TODO The handler-less path does not support reductions, and + // kernel functions with the kernel_handler type argument yet. + if constexpr (sizeof...(RestT) == 1 && + !(detail::KernelLambdaHasKernelHandlerArgT< + KernelType, TransformedArgType>::value)) { + return detail::submit_kernel_direct_parallel_for( + *this, Range, Rest..., {}, Properties, TlsCodeLocCapture.query()); + } else { + return submit( + [&](handler &CGH) { + CGH.template parallel_for(Range, Properties, Rest...); + }, + TlsCodeLocCapture.query()); + } } /// parallel_for_impl with a kernel represented as a lambda + range that @@ -3847,12 +3773,29 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { RestT &&...Rest) { constexpr detail::code_location CodeLoc = getCodeLocation(); detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); - return submit( - [&](handler &CGH) { - CGH.depends_on(DepEvent); - CGH.template parallel_for(Range, Properties, Rest...); - }, - TlsCodeLocCapture.query()); + using KernelType = + std::decay_t>; + using LambdaArgType = sycl::detail::lambda_arg_type>; + using TransformedArgType = std::conditional_t< + std::is_integral::value && Dims == 1, item, + typename detail::TransformUserItemType::type>; + + // TODO The handler-less path does not support reductions, and + // kernel functions with the kernel_handler type argument yet. + if constexpr (sizeof...(RestT) == 1 && + !(detail::KernelLambdaHasKernelHandlerArgT< + KernelType, TransformedArgType>::value)) { + return detail::submit_kernel_direct_parallel_for( + *this, Range, Rest..., sycl::span(&DepEvent, 1), + Properties, TlsCodeLocCapture.query()); + } else { + return submit( + [&](handler &CGH) { + CGH.depends_on(DepEvent); + CGH.template parallel_for(Range, Properties, Rest...); + }, + TlsCodeLocCapture.query()); + } } /// parallel_for_impl with a kernel represented as a lambda + range that @@ -3884,12 +3827,29 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { PropertiesT Properties, RestT &&...Rest) { constexpr detail::code_location CodeLoc = getCodeLocation(); detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); - return submit( - [&](handler &CGH) { - CGH.depends_on(DepEvents); - CGH.template parallel_for(Range, Properties, Rest...); - }, - TlsCodeLocCapture.query()); + using KernelType = + std::decay_t>; + using LambdaArgType = sycl::detail::lambda_arg_type>; + using TransformedArgType = std::conditional_t< + std::is_integral::value && Dims == 1, item, + typename detail::TransformUserItemType::type>; + + // TODO The handler-less path does not support reductions, and + // kernel functions with the kernel_handler type argument yet. + if constexpr (sizeof...(RestT) == 1 && + !(detail::KernelLambdaHasKernelHandlerArgT< + KernelType, TransformedArgType>::value)) { + return detail::submit_kernel_direct_parallel_for( + *this, Range, Rest..., DepEvents, Properties, + TlsCodeLocCapture.query()); + } else { + return submit( + [&](handler &CGH) { + CGH.depends_on(DepEvents); + CGH.template parallel_for(Range, Properties, Rest...); + }, + TlsCodeLocCapture.query()); + } } /// parallel_for_impl version with a kernel represented as a lambda + range @@ -3931,6 +3891,232 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { } }; +namespace detail { + +template +auto submit_kernel_direct(const queue &Queue, + const detail::nd_range_view &RangeView, + KernelTypeUniversalRef &&KernelFunc, + sycl::span DepEvents, + const PropertiesT &ExtraProps, + const detail::code_location &CodeLoc) { + detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); + + using KernelType = std::decay_t; + + detail::KernelWrapper::wrap(KernelFunc); + + // Instantiating the kernel on the host improves debugging. + // Passing this pointer to another translation unit prevents optimization. +#ifndef NDEBUG + // TODO: call library to prevent dropping call due to optimization. + (void) + detail::GetInstantiateKernelOnHostPtr(); +#endif + + detail::DeviceKernelInfo *DeviceKernelInfoPtr = + &detail::getDeviceKernelInfo(); + constexpr auto Info = detail::CompileTimeKernelInfo; + + assert(Info.Name != std::string_view{} && "Kernel must have a name!"); + + static_assert( + Info.Name == std::string_view{} || sizeof(KernelType) == Info.KernelSize, + "Unexpected kernel lambda size. This can be caused by an " + "external host compiler producing a lambda with an " + "unexpected layout. This is a limitation of the compiler." + "In many cases the difference is related to capturing constexpr " + "variables. In such cases removing constexpr specifier aligns the " + "captures between the host compiler and the device compiler." + "\n" + "In case of MSVC, passing " + "-fsycl-host-compiler-options='/std:c++latest' " + "might also help."); + + detail::KernelPropertyHolderStructTy ParsedProperties; + if constexpr (ext::oneapi::experimental::detail::HasKernelPropertiesGetMethod< + const KernelType &>::value) { + // Merge properties via get() and manually specified properties. + // get() method is used for specifying kernel properties but properties + // passed via launch_config (ExtraProps) should be kernel launch properties. + // They are mutually exclusive, so there should not be any conflict when + // merging properties. merge_properties() throws if there's a conflict. + auto MergedProps = + sycl::ext::oneapi::experimental::detail::merge_properties( + ExtraProps, + KernelFunc.get(ext::oneapi::experimental::properties_tag{})); + + ParsedProperties = extractKernelProperties(MergedProps); + } else { + ParsedProperties = extractKernelProperties(ExtraProps); + } + + HostKernelRef + HostKernel(std::forward(KernelFunc)); + + if constexpr (EventNeeded) { + return submit_kernel_direct_with_event_impl( + Queue, RangeView, HostKernel, DeviceKernelInfoPtr, DepEvents, + ParsedProperties, TlsCodeLocCapture.query(), + TlsCodeLocCapture.isToplevel()); + } else { + submit_kernel_direct_without_event_impl( + Queue, RangeView, HostKernel, DeviceKernelInfoPtr, DepEvents, + ParsedProperties, TlsCodeLocCapture.query(), + TlsCodeLocCapture.isToplevel()); + } +} + +template +auto submit_kernel_direct_parallel_for(const queue &Queue, nd_range Range, + KernelTypeUniversalRef &&KernelFunc, + sycl::span DepEvents, + const PropertiesT &Props, + const detail::code_location &CodeLoc) { + + using KernelType = std::decay_t; + using NameT = + typename detail::get_kernel_name_t::name; + + using LambdaArgType = + sycl::detail::lambda_arg_type>; + static_assert( + std::is_convertible_v, LambdaArgType>, + "Kernel argument of a sycl::parallel_for with sycl::nd_range " + "must be either sycl::nd_item or be convertible from sycl::nd_item"); + using TransformedArgType = sycl::nd_item; + +#ifndef __SYCL_DEVICE_ONLY__ + detail::checkValueRange(Range); +#endif + + return submit_kernel_direct( + Queue, detail::nd_range_view(Range), + std::forward(KernelFunc), DepEvents, Props, + CodeLoc); +} + +template +auto submit_kernel_direct_parallel_for(const queue &Queue, range Range, + KernelTypeUniversalRef &&KernelFunc, + sycl::span DepEvents, + const PropertiesT &Props, + const detail::code_location &CodeLoc) { + +#ifndef __SYCL_DEVICE_ONLY__ + if (!range_size_fits_in_size_t(Range)) + throw sycl::exception(make_error_code(errc::runtime), + "The total number of work-items in " + "a range must fit within size_t"); +#endif + + using KernelType = std::decay_t; + using NameT = + typename detail::get_kernel_name_t::name; + using LambdaArgType = sycl::detail::lambda_arg_type>; + + // If 1D kernel argument is an integral type, convert it to sycl::item<1> + // If user type is convertible from sycl::item/sycl::nd_item, use + // sycl::item/sycl::nd_item to transport item information + using TransformedArgType = std::conditional_t< + std::is_integral::value && Dims == 1, item, + typename detail::TransformUserItemType::type>; + + static_assert(!std::is_same_v>, + "Kernel argument cannot have a sycl::nd_item type in " + "sycl::parallel_for with sycl::range"); + + static_assert(std::is_convertible_v, LambdaArgType> || + std::is_convertible_v, LambdaArgType>, + "sycl::parallel_for(sycl::range) kernel must have the " + "first argument of sycl::item type, or of a type which is " + "implicitly convertible from sycl::item"); + + using RefLambdaArgType = std::add_lvalue_reference_t; + static_assert( + (std::is_invocable_v), + "SYCL kernel lambda/functor has an unexpected signature, it should be " + "invocable with sycl::item"); + + // Range rounding can be disabled by the user. + // Range rounding is supported only for newer SYCL standards. +#if !defined(__SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING__) && \ + SYCL_LANGUAGE_VERSION >= 202012L + auto [RoundedRange, HasRoundedRange] = + detail::getRoundedRange(Range, Queue.get_device()); + if (HasRoundedRange) { + using NameWT = typename detail::get_kernel_wrapper_name_t::name; + auto Wrapper = + detail::getRangeRoundedKernelLambda( + KernelFunc, Range); + + using KTypeWrapper = decltype(Wrapper); + using KName = std::conditional_t::value, + KTypeWrapper, NameWT>; +#ifndef __SYCL_DEVICE_ONLY__ + // We are executing over the rounded range, but there are still + // items/ids that are constructed in the range rounded + // kernel, use items/ids in the user range, which means that + // __SYCL_ASSUME_INT can still be violated. So check the bounds + // of the user range, instead of the rounded range. + detail::checkValueRange(Range); +#endif + return submit_kernel_direct( + Queue, detail::nd_range_view(RoundedRange), std::move(Wrapper), + DepEvents, Props, CodeLoc); + } else +#endif // !__SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING__ && + // SYCL_LANGUAGE_VERSION >= 202012L + { +#ifndef __SYCL_FORCE_PARALLEL_FOR_RANGE_ROUNDING__ +#ifndef __SYCL_DEVICE_ONLY__ + detail::checkValueRange(Range); +#endif + return submit_kernel_direct( + Queue, detail::nd_range_view(Range), + std::forward(KernelFunc), DepEvents, Props, + CodeLoc); + +#else + (void)Range; + (void)Props; + (void)KernelFunc; +#endif // __SYCL_FORCE_PARALLEL_FOR_RANGE_ROUNDING__ + } +} + +template +auto submit_kernel_direct_single_task(const queue &Queue, + KernelTypeUniversalRef &&KernelFunc, + sycl::span DepEvents, + const PropertiesT &Props, + const detail::code_location &CodeLoc) { + + using KernelType = std::decay_t; + using NameT = + typename detail::get_kernel_name_t::name; + + return submit_kernel_direct( + Queue, detail::nd_range_view(), + std::forward(KernelFunc), DepEvents, Props, + CodeLoc); +} +} // namespace detail + } // namespace _V1 } // namespace sycl diff --git a/sycl/source/detail/ndrange_desc.hpp b/sycl/source/detail/ndrange_desc.hpp index 1a18aa97a31f2..825383851d6aa 100644 --- a/sycl/source/detail/ndrange_desc.hpp +++ b/sycl/source/detail/ndrange_desc.hpp @@ -33,9 +33,13 @@ class NDRDescT { NDRDescT(const NDRDescT &Desc) = default; NDRDescT(NDRDescT &&Desc) = default; - NDRDescT(const detail::nd_range_view &NDRangeView) : Dims{NDRangeView.MDims} { + NDRDescT(const detail::nd_range_view &NDRangeView, + bool SetNumWorkGroups = false) + : Dims{NDRangeView.MDims} { if (!NDRangeView.MGlobalSize) { init(); + } else if (!NDRangeView.MLocalSize) { + init(&(NDRangeView.MGlobalSize[0]), SetNumWorkGroups); } else { init(NDRangeView.MGlobalSize, NDRangeView.MLocalSize, NDRangeView.MOffset); @@ -44,19 +48,7 @@ class NDRDescT { template NDRDescT(sycl::range N, bool SetNumWorkGroups) : Dims{size_t(Dims_)} { - if (SetNumWorkGroups) { - for (size_t I = 0; I < Dims_; ++I) { - NumWorkGroups[I] = N[I]; - } - } else { - for (size_t I = 0; I < Dims_; ++I) { - GlobalSize[I] = N[I]; - } - - for (size_t I = Dims_; I < 3; ++I) { - GlobalSize[I] = 1; - } - } + init(&(N[0]), SetNumWorkGroups); } template @@ -109,6 +101,22 @@ class NDRDescT { size_t Dims = 0; private: + void init(const size_t *N, bool SetNumWorkGroups) { + if (SetNumWorkGroups) { + for (size_t I = 0; I < Dims; ++I) { + NumWorkGroups[I] = N[I]; + } + } else { + for (size_t I = 0; I < Dims; ++I) { + GlobalSize[I] = N[I]; + } + + for (size_t I = Dims; I < 3; ++I) { + GlobalSize[I] = 1; + } + } + } + void init(const size_t *NumWorkItems, const size_t *LocalSizes, const size_t *Offset) { for (size_t I = 0; I < Dims; ++I) { diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 9737d225048f2..120d05ed8597f 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -643,6 +643,8 @@ detail::EventImplPtr queue_impl::submit_direct( std::unique_lock Lock(MMutex); const bool inOrder = isInOrder(); + NestedCallsTracker tracker; + // Sync with an external event std::optional ExternalEvent = popExternalEvent(); if (ExternalEvent) { diff --git a/sycl/test-e2e/Basic/test_num_kernel_copies.cpp b/sycl/test-e2e/Basic/test_num_kernel_copies.cpp index 82f8477a10962..d0770a56696be 100644 --- a/sycl/test-e2e/Basic/test_num_kernel_copies.cpp +++ b/sycl/test-e2e/Basic/test_num_kernel_copies.cpp @@ -23,7 +23,8 @@ int main(int argc, char **argv) { kernel<0> krn0; q.parallel_for(sycl::range<1>{1}, krn0); - assert(copy_count == 1); + // The kernel is copied on the scheduler-based path only + assert(copy_count == 0); assert(move_count == 0); copy_count = 0; diff --git a/sycl/test/include_deps/sycl_detail_core.hpp.cpp b/sycl/test/include_deps/sycl_detail_core.hpp.cpp index 94a598a1cc4d6..6d001584e1665 100644 --- a/sycl/test/include_deps/sycl_detail_core.hpp.cpp +++ b/sycl/test/include_deps/sycl_detail_core.hpp.cpp @@ -99,19 +99,21 @@ // CHECK-NEXT: detail/id_queries_fit_in_int.hpp // CHECK-NEXT: detail/nd_range_view.hpp // CHECK-NEXT: detail/optional.hpp +// CHECK-NEXT: detail/range_rounding.hpp // CHECK-NEXT: device.hpp // CHECK-NEXT: detail/string_view.hpp // CHECK-NEXT: detail/util.hpp // CHECK-NEXT: device_selector.hpp // CHECK-NEXT: kernel_bundle_enums.hpp +// CHECK-NEXT: ext/oneapi/kernel_properties/properties.hpp +// CHECK-NEXT: ext/oneapi/properties/properties.hpp +// CHECK-NEXT: ext/oneapi/properties/property_utils.hpp +// CHECK-NEXT: ext/oneapi/properties/property.hpp +// CHECK-NEXT: ext/oneapi/properties/property_value.hpp // CHECK-NEXT: event.hpp // CHECK-NEXT: exception_list.hpp // CHECK-NEXT: ext/oneapi/device_global/device_global.hpp // CHECK-NEXT: ext/oneapi/device_global/properties.hpp -// CHECK-NEXT: ext/oneapi/properties/property.hpp -// CHECK-NEXT: ext/oneapi/properties/property_value.hpp -// CHECK-NEXT: ext/oneapi/properties/properties.hpp -// CHECK-NEXT: ext/oneapi/properties/property_utils.hpp // CHECK-NEXT: ext/oneapi/experimental/event_mode_property.hpp // CHECK-NEXT: ext/oneapi/experimental/graph.hpp // CHECK-NEXT: ext/oneapi/experimental/graph/command_graph.hpp @@ -138,10 +140,8 @@ // CHECK-NEXT: ext/oneapi/experimental/cluster_group_prop.hpp // CHECK-NEXT: ext/oneapi/experimental/use_root_sync_prop.hpp // CHECK-NEXT: ext/oneapi/experimental/virtual_functions.hpp -// CHECK-NEXT: ext/oneapi/kernel_properties/properties.hpp // CHECK-NEXT: ext/oneapi/work_group_scratch_memory.hpp // CHECK-NEXT: detail/sycl_local_mem_builtins.hpp -// CHECK-NEXT: detail/range_rounding.hpp // CHECK-NEXT: detail/reduction_forward.hpp // CHECK-NEXT: detail/ur.hpp // CHECK-NEXT: ur_api_funcs.def diff --git a/sycl/test/include_deps/sycl_khr_includes_queue.hpp.cpp b/sycl/test/include_deps/sycl_khr_includes_queue.hpp.cpp index 98992e753be8a..50a9deb5dd578 100644 --- a/sycl/test/include_deps/sycl_khr_includes_queue.hpp.cpp +++ b/sycl/test/include_deps/sycl_khr_includes_queue.hpp.cpp @@ -103,19 +103,21 @@ // CHECK-NEXT: detail/id_queries_fit_in_int.hpp // CHECK-NEXT: detail/nd_range_view.hpp // CHECK-NEXT: detail/optional.hpp +// CHECK-NEXT: detail/range_rounding.hpp // CHECK-NEXT: device.hpp // CHECK-NEXT: detail/string_view.hpp // CHECK-NEXT: detail/util.hpp // CHECK-NEXT: device_selector.hpp // CHECK-NEXT: kernel_bundle_enums.hpp +// CHECK-NEXT: ext/oneapi/kernel_properties/properties.hpp +// CHECK-NEXT: ext/oneapi/properties/properties.hpp +// CHECK-NEXT: ext/oneapi/properties/property_utils.hpp +// CHECK-NEXT: ext/oneapi/properties/property.hpp +// CHECK-NEXT: ext/oneapi/properties/property_value.hpp // CHECK-NEXT: event.hpp // CHECK-NEXT: exception_list.hpp // CHECK-NEXT: ext/oneapi/device_global/device_global.hpp // CHECK-NEXT: ext/oneapi/device_global/properties.hpp -// CHECK-NEXT: ext/oneapi/properties/property.hpp -// CHECK-NEXT: ext/oneapi/properties/property_value.hpp -// CHECK-NEXT: ext/oneapi/properties/properties.hpp -// CHECK-NEXT: ext/oneapi/properties/property_utils.hpp // CHECK-NEXT: ext/oneapi/experimental/event_mode_property.hpp // CHECK-NEXT: ext/oneapi/experimental/graph.hpp // CHECK-NEXT: ext/oneapi/experimental/graph/command_graph.hpp @@ -142,10 +144,8 @@ // CHECK-NEXT: ext/oneapi/experimental/cluster_group_prop.hpp // CHECK-NEXT: ext/oneapi/experimental/use_root_sync_prop.hpp // CHECK-NEXT: ext/oneapi/experimental/virtual_functions.hpp -// CHECK-NEXT: ext/oneapi/kernel_properties/properties.hpp // CHECK-NEXT: ext/oneapi/work_group_scratch_memory.hpp // CHECK-NEXT: detail/sycl_local_mem_builtins.hpp -// CHECK-NEXT: detail/range_rounding.hpp // CHECK-NEXT: detail/reduction_forward.hpp // CHECK-NEXT: detail/ur.hpp // CHECK-NEXT: ur_api_funcs.def diff --git a/sycl/test/include_deps/sycl_khr_includes_usm.hpp.cpp b/sycl/test/include_deps/sycl_khr_includes_usm.hpp.cpp index 88bbaa89ccbda..0311e59e7abe5 100644 --- a/sycl/test/include_deps/sycl_khr_includes_usm.hpp.cpp +++ b/sycl/test/include_deps/sycl_khr_includes_usm.hpp.cpp @@ -123,14 +123,16 @@ // CHECK-NEXT: detail/id_queries_fit_in_int.hpp // CHECK-NEXT: detail/nd_range_view.hpp // CHECK-NEXT: detail/optional.hpp +// CHECK-NEXT: detail/range_rounding.hpp +// CHECK-NEXT: ext/oneapi/kernel_properties/properties.hpp +// CHECK-NEXT: ext/oneapi/properties/properties.hpp +// CHECK-NEXT: ext/oneapi/properties/property_utils.hpp +// CHECK-NEXT: ext/oneapi/properties/property.hpp +// CHECK-NEXT: ext/oneapi/properties/property_value.hpp // CHECK-NEXT: event.hpp // CHECK-NEXT: exception_list.hpp // CHECK-NEXT: ext/oneapi/device_global/device_global.hpp // CHECK-NEXT: ext/oneapi/device_global/properties.hpp -// CHECK-NEXT: ext/oneapi/properties/property.hpp -// CHECK-NEXT: ext/oneapi/properties/property_value.hpp -// CHECK-NEXT: ext/oneapi/properties/properties.hpp -// CHECK-NEXT: ext/oneapi/properties/property_utils.hpp // CHECK-NEXT: ext/oneapi/experimental/event_mode_property.hpp // CHECK-NEXT: ext/oneapi/experimental/graph.hpp // CHECK-NEXT: ext/oneapi/experimental/graph/command_graph.hpp @@ -157,10 +159,8 @@ // CHECK-NEXT: ext/oneapi/experimental/cluster_group_prop.hpp // CHECK-NEXT: ext/oneapi/experimental/use_root_sync_prop.hpp // CHECK-NEXT: ext/oneapi/experimental/virtual_functions.hpp -// CHECK-NEXT: ext/oneapi/kernel_properties/properties.hpp // CHECK-NEXT: ext/oneapi/work_group_scratch_memory.hpp // CHECK-NEXT: detail/sycl_local_mem_builtins.hpp -// CHECK-NEXT: detail/range_rounding.hpp // CHECK-NEXT: detail/reduction_forward.hpp // CHECK-NEXT: detail/ur.hpp // CHECK-NEXT: ur_api_funcs.def