From c2fbe142295328bac766f4451b7a57f697ce6709 Mon Sep 17 00:00:00 2001 From: Civitasv Date: Thu, 27 Jul 2023 10:27:25 +0800 Subject: [PATCH 01/14] [tensorrt] [byoc] [plugin] add ability to support generating tensorrt plugin with tvm --- python/tvm/contrib/graph_executor.py | 52 ++++++++++++++ python/tvm/relay/backend/executor_factory.py | 19 ++++++ python/tvm/relay/build_module.py | 10 +++ src/driver/driver_api.cc | 5 ++ src/relay/backend/aot_executor_codegen.cc | 17 +++++ src/relay/backend/build_module.cc | 29 ++++++++ src/relay/backend/graph_executor_codegen.cc | 15 +++++ src/relay/backend/utils.h | 2 + src/runtime/cuda/cuda_module.cc | 24 +++++++ src/runtime/graph_executor/graph_executor.cc | 67 +++++++++++++++++++ src/runtime/graph_executor/graph_executor.h | 10 +++ .../transforms/lower_device_kernel_launch.cc | 64 ++++++++++++++++++ src/tir/transforms/make_packed_api.cc | 12 ++++ src/tir/transforms/split_host_device.cc | 30 +++++++-- 14 files changed, 351 insertions(+), 5 deletions(-) diff --git a/python/tvm/contrib/graph_executor.py b/python/tvm/contrib/graph_executor.py index ab94f203c231..25a5cc46aa8d 100644 --- a/python/tvm/contrib/graph_executor.py +++ b/python/tvm/contrib/graph_executor.py @@ -178,6 +178,12 @@ def __init__(self, module): self._load_params = module["load_params"] self._share_params = module["share_params"] + self._get_workspace_dtype = module["get_workspace_dtype"] + self._get_workspace_size = module["get_workspace_size"] + self._get_func_inorder = module["get_func_inorder"] + self._get_storageid = module["get_storageid"] + self._get_output_eid = module["get_output_eid"] + def set_input(self, key=None, value=None, **params): """Set inputs to the module via kwargs @@ -512,3 +518,49 @@ def benchmark( cooldown_interval_ms=cooldown_interval_ms, repeats_to_cooldown=repeats_to_cooldown, )() + + def get_workspace_dtype(self): + """Get the dtype of workspace to the graph + + Returns + ------- + dtype : str + The dtypes of workspace. + """ + return self._get_workspace_dtype() + + def get_workspace_size(self): + """Get the dtype of workspace to the graph + + Returns + ------- + dtype : int + The bytes size of workspace. 
+ """ + return self._get_workspace_size() + + def get_func_inorder(self): + """Get the Host Function execute order + + Returns + ------- + dtype : str + The Host function execute order + """ + return self._get_func_inorder() + + def get_storageid(self): + return self._get_storageid() + + def get_output_eid(self, index): + """Get index-th output to out + + Parameters + ---------- + index : int + The output index + + out : NDArray + The output array container + """ + return self._get_output_eid(index) \ No newline at end of file diff --git a/python/tvm/relay/backend/executor_factory.py b/python/tvm/relay/backend/executor_factory.py index eee3169400ff..9eafcc2cfb93 100644 --- a/python/tvm/relay/backend/executor_factory.py +++ b/python/tvm/relay/backend/executor_factory.py @@ -180,6 +180,7 @@ def __init__( libmod_name, params, function_metadata, + constant_params = None ): assert isinstance(graph_json_str, string_types) fcreate = get_global_func("tvm.graph_executor_factory.create") @@ -199,6 +200,12 @@ def __init__( self.iter_cnt = 0 self.function_metadata = function_metadata + self.constant_params = constant_params + self.device_funcs_list_func = get_global_func("tir.transform.retrieve_device_funcs_list") + self.device_memory_size_func = get_global_func("tir.transform.retrieve_device_memory_size") + self.grid_block_thread_config_func = get_global_func("runtime.module.retrieve_grid_block_thread_config") + + def export_library(self, file_name, fcompile=None, addons=None, **kwargs): return self.module.export_library(file_name, fcompile, addons, **kwargs) @@ -216,3 +223,15 @@ def get_executor_config(self): def get_lib(self): return self.lib + + def get_constant_params(self): + return self.constant_params + + def get_device_function_list(self): + return self.device_funcs_list_func() + + def get_grid_block_thread_config(self): + return self.grid_block_thread_config_func() + + def get_device_memory_size(self): + return self.device_memory_size_func() \ No newline at end of file diff --git a/python/tvm/relay/build_module.py b/python/tvm/relay/build_module.py index 40a91cc75a00..33783a74315a 100644 --- a/python/tvm/relay/build_module.py +++ b/python/tvm/relay/build_module.py @@ -70,6 +70,7 @@ def __init__(self): self._get_executor_codegen_metadata = self.mod["get_executor_codegen_metadata"] self._get_devices = self.mod["get_devices"] self._get_irmodule = self.mod["get_irmodule"] + self._get_constant_params_func = self.mod["get_constant_params"] def build( self, @@ -249,6 +250,13 @@ def get_params(self): ret[key] = value.data return ret + def get_constant_params(self): + params = self._get_constant_params_func() + ret = {} + for key, value in params.items(): + ret[key] = value.data.asnumpy() + return ret + def get_irmodule(self): """Returns the TargetIRModule's post-lowering""" return self._get_irmodule() @@ -372,6 +380,7 @@ def build( mod_name=mod_name, ) func_metadata = bld_mod.get_function_metadata() + constant_params = bld_mod.get_constant_params() devices = bld_mod.get_devices() lowered_ir_mods = bld_mod.get_irmodule() executor_codegen_metadata = bld_mod.get_executor_codegen_metadata() @@ -400,6 +409,7 @@ def build( mod_name, params, func_metadata, + constant_params=constant_params ) else: assert False, "Executor " + executor + " not supported" diff --git a/src/driver/driver_api.cc b/src/driver/driver_api.cc index b7ba0ffe4468..7a27bbddddfe 100644 --- a/src/driver/driver_api.cc +++ b/src/driver/driver_api.cc @@ -601,6 +601,8 @@ transform::Sequential MixedModulePassManager(IRModule mixed_mod, Target 
target) } mixed_pass_list.push_back(tir::transform::AnnotateDeviceRegions()); + + // std::cout << "@1. SplitHostDevice" << '\n'; mixed_pass_list.push_back(tir::transform::SplitHostDevice()); bool unpacked_api = mixed_mod->GetAttr(tvm::attr::kExecutor) @@ -608,13 +610,16 @@ transform::Sequential MixedModulePassManager(IRModule mixed_mod, Target target) ->GetAttr("unpacked-api") .value_or(Bool(false)); if (unpacked_api) { + // std::cout << "@2.1 UNMakePackedAPI" << '\n'; mixed_pass_list.push_back(tir::transform::MakeUnpackedAPI()); } else { + // std::cout << "@2.2 MakePackedAPI" << '\n'; mixed_pass_list.push_back(tir::transform::MakePackedAPI()); } mixed_pass_list.push_back(tir::transform::FP8StorageLegalize()); mixed_pass_list.push_back(tir::transform::BF16StorageLegalize()); + // std::cout << "@3. LowerDevice" << '\n'; mixed_pass_list.push_back(tir::transform::LowerDeviceKernelLaunch()); return transform::Sequential(mixed_pass_list); diff --git a/src/relay/backend/aot_executor_codegen.cc b/src/relay/backend/aot_executor_codegen.cc index f698c654d6d8..ade89e544a52 100644 --- a/src/relay/backend/aot_executor_codegen.cc +++ b/src/relay/backend/aot_executor_codegen.cc @@ -1228,16 +1228,22 @@ class AOTExecutorCodegen : public MixedModeVisitor { // Collect any constants extracted by external codegen. ret.params = std::unordered_map(); + ret.params_for_tpat = std::unordered_map>(); + Map const_name_to_constant = lowered_mod->GetAttr>(tvm::attr::kConstNameToConstant) .value_or({}); for (const auto& kv : const_name_to_constant) { ICHECK(ret.params.emplace(kv.first, kv.second).second); + ret.params_for_tpat.emplace(std::make_pair( + kv.first, std::make_pair(static_cast(param_storage_ids_[kv.first]), kv.second))); } // Collect any constants extracted during lowering. 
for (const auto& kv : params_) { ICHECK(ret.params.emplace(kv.first, kv.second).second); + ret.params_for_tpat.emplace(std::make_pair( + kv.first, std::make_pair(static_cast(param_storage_ids_[kv.first]), kv.second))); } // AoT Executor codegen works completely on TIR beyond this point, hence removing relay main @@ -1387,6 +1393,11 @@ class AOTExecutorCodegenModule : public runtime::ModuleNode { String key = args[0]; *rv = get_param_by_name(key); }); + } else if (name == "get_param_id") { + return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { + String key = args[0]; + *rv = get_param_id(key); + }); } else if (name == "get_irmodule") { return PackedFunc( [sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { *rv = get_irmodule(); }); @@ -1436,6 +1447,12 @@ class AOTExecutorCodegenModule : public runtime::ModuleNode { Array get_external_modules() { return output_.external_mods; } + int get_param_id(String key) { + auto it = this->output_.params_for_tpat.find(key); + CHECK(it != this->output_.params_for_tpat.end()) << "no such parameter " << key; + return (*it).second.first; + } + Map get_irmodule() { return this->output_.lowered_funcs; } std::shared_ptr codegen_; diff --git a/src/relay/backend/build_module.cc b/src/relay/backend/build_module.cc index 83c252d831c5..216a375b7b53 100644 --- a/src/relay/backend/build_module.cc +++ b/src/relay/backend/build_module.cc @@ -86,6 +86,17 @@ struct ExecutorCodegen { return ret; } + std::unordered_map GetParamIds() { + std::unordered_map ret; + auto names = CallFunc>("list_params_name", nullptr); + for (const auto& expr : names) { + // Implicit cast from runtime::String to std::string + std::string key = expr; + ret[key] = CallFunc("get_param_id", key); + } + return ret; + } + Array GetExternalModules() { return CallFunc>("get_external_modules", nullptr); } @@ -222,6 +233,9 @@ class RelayBuildModule : public runtime::ModuleNode { ICHECK_EQ(args.num_args, 2); *rv = this->Optimize(args[0], args[1]); }); + } else if (name == "get_constant_params") { + return PackedFunc( + [sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { *rv = this->GetConstantParams(); }); } else { LOG(FATAL) << "Unknown packed function: " << name; return PackedFunc([sptr_to_self, name](TVMArgs args, TVMRetValue* rv) {}); @@ -268,6 +282,21 @@ class RelayBuildModule : public runtime::ModuleNode { return ret; } + /*! + * \brief Get params dictionary, but key is ParamIdx + * + * \return Map params dictionary + */ + Map GetConstantParams() { + Map ret; + auto param_ids = this->executor_codegen_->GetParamIds(); + + for (const auto& kv : ret_.params) { + ret.Set(std::to_string(param_ids[kv.first]), Constant(kv.second)); + } + return ret; + } + /*! 
* \brief Set the parameters * diff --git a/src/relay/backend/graph_executor_codegen.cc b/src/relay/backend/graph_executor_codegen.cc index 868173d28c13..180a6273a803 100644 --- a/src/relay/backend/graph_executor_codegen.cc +++ b/src/relay/backend/graph_executor_codegen.cc @@ -266,18 +266,26 @@ class GraphExecutorCodegen : public backend::MemoizedExprTranslator(); + ret.params_for_tpat = std::unordered_map>(); + Map const_name_to_constant = lowered_mod->GetAttr>(tvm::attr::kConstNameToConstant) .value_or({}); for (const auto& kv : const_name_to_constant) { VLOG(1) << "constant '" << kv.first << "' contributed by external codegen"; ICHECK(ret.params.emplace(kv.first, kv.second).second); + ret.params_for_tpat.emplace(std::make_pair( + kv.first, + std::make_pair(static_cast(param_storage_ids_[kv.first]), kv.second))); } // Collect any constants extracted during lowering. for (const auto& kv : params_) { VLOG(1) << "constant '" << kv.first << "' contributed by TECompiler"; ICHECK(ret.params.emplace(kv.first, kv.second).second); + ret.params_for_tpat.emplace(std::make_pair( + kv.first, + std::make_pair(static_cast(param_storage_ids_[kv.first]), kv.second))); } ret.function_metadata = std::move(function_metadata_); @@ -663,6 +671,13 @@ class GraphExecutorCodegenModule : public runtime::ModuleNode { CHECK(it != this->output_.params.end()) << "no such parameter " << key; *rv = (*it).second; }); + } else if (name == "get_param_id") { + return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { + String key = args[0]; + auto it = this->output_.params_for_tpat.find(key); + CHECK(it != this->output_.params_for_tpat.end()) << "no such parameter " << key; + *rv = (*it).second.first; + }); } else if (name == "get_irmodule") { return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { *rv = this->output_.lowered_funcs; diff --git a/src/relay/backend/utils.h b/src/relay/backend/utils.h index acaea425d178..7b50e70f034b 100644 --- a/src/relay/backend/utils.h +++ b/src/relay/backend/utils.h @@ -304,6 +304,8 @@ struct LoweredOutput { * to the constant's value. */ std::unordered_map params; + + std::unordered_map> params_for_tpat; ExecutorCodegenMetadata metadata; }; diff --git a/src/runtime/cuda/cuda_module.cc b/src/runtime/cuda/cuda_module.cc index f54aefe8c4eb..5d0a98b4b54a 100644 --- a/src/runtime/cuda/cuda_module.cc +++ b/src/runtime/cuda/cuda_module.cc @@ -32,6 +32,8 @@ #include #include +#include + #include "../file_utils.h" #include "../meta_data.h" #include "../pack_args.h" @@ -41,6 +43,8 @@ namespace tvm { namespace runtime { +std::vector device_funcs_thread_config; + // Module to support thread-safe multi-GPU execution. 
// cuModule is a per-GPU module // The runtime will contain a per-device module table @@ -204,6 +208,14 @@ class CUDAWrappedFunc { << cuda; } LOG(FATAL) << os.str(); + } else { + std::stringstream ss; + ss << func_name_ + << " grid=(" << wl.grid_dim(0) << "," << wl.grid_dim(1) << "," + << wl.grid_dim(2) << ") " + << " block=(" << wl.block_dim(0) << "," << wl.block_dim(1) << "," << wl.block_dim(2) + << ")\n"; + device_funcs_thread_config.push_back(ss.str()); } } @@ -263,6 +275,7 @@ PackedFunc CUDAModuleNode::GetFunction(const String& name, const ObjectPtr fmap, std::string cuda_source) { + device_funcs_thread_config.clear(); auto n = make_object(data, fmt, fmap, cuda_source); return Module(n); } @@ -289,10 +302,21 @@ Module CUDAModuleLoadBinary(void* strm) { return CUDAModuleCreate(data, fmt, fmap, std::string()); } +String CUDAModuleGetGridBlockThreadConfig() { + String ret = ""; + for (auto func_config : device_funcs_thread_config) { + ret = ret + func_config; + } + return ret; +} + TVM_REGISTER_GLOBAL("runtime.module.loadfile_cubin").set_body_typed(CUDAModuleLoadFile); TVM_REGISTER_GLOBAL("runtime.module.loadfile_ptx").set_body_typed(CUDAModuleLoadFile); TVM_REGISTER_GLOBAL("runtime.module.loadbinary_cuda").set_body_typed(CUDAModuleLoadBinary); + +TVM_REGISTER_GLOBAL("runtime.module.retrieve_grid_block_thread_config") + .set_body([](TVMArgs args, TVMRetValue* rv) { *rv = CUDAModuleGetGridBlockThreadConfig(); }); } // namespace runtime } // namespace tvm diff --git a/src/runtime/graph_executor/graph_executor.cc b/src/runtime/graph_executor/graph_executor.cc index 777a5a442a98..867971ae875b 100644 --- a/src/runtime/graph_executor/graph_executor.cc +++ b/src/runtime/graph_executor/graph_executor.cc @@ -375,6 +375,49 @@ void GraphExecutor::DefaultLookupLinkedParam(TVMArgs args, TVMRetValue* rv) { *rv = NDArray(GetObjectPtr(container)); } +String GraphExecutor::GetWorkspaceDtype() { + std::ostringstream os; + for (const std::string& s_type : attrs_.dltype) { + os << s_type << " "; + } + return os.str(); +} + +String GraphExecutor::GetWorkspaceSize() { + std::ostringstream os; + for (size_t i = 0; i < data_entry_.size(); ++i) { + const DLTensor* tmp = data_entry_[i].operator->(); + os << GetDataSize(*tmp) << " "; + } + return os.str(); +} + +String GraphExecutor::GetFuncList() { + std::ostringstream os; + for (auto funcs : exec_func_) { + for (auto func : funcs) { + os << func << " "; + } + os << "\n"; + } + return os.str(); +} + +String GraphExecutor::GetStorageId() { + std::ostringstream os; + for (auto id : attrs_.storage_id) { + os << id << " "; + } + os << "\n"; + return os.str(); +} + +int GraphExecutor::GetOutputEid(int index) const { + ICHECK_LT(static_cast(index), outputs_.size()); + uint32_t eid = this->entry_id(outputs_[index]); + return eid; +} + void GraphExecutor::SetupStorage() { // Grab saved optimization plan from graph. 
std::vector vtype; @@ -510,14 +553,23 @@ void GraphExecutor::SetupOpExecs() { const auto& inode = nodes_[nid]; if (inode.op_type == "null") continue; std::vector args; + std::vector indexes; + std::vector funcs; for (const auto& e : inode.inputs) { uint32_t eid = this->entry_id(e); args.push_back(const_cast(data_entry_[eid].operator->())); + indexes.push_back(eid); } for (uint32_t index = 0; index < inode.param.num_outputs; ++index) { uint32_t eid = this->entry_id(nid, index); args.push_back(const_cast(data_entry_[eid].operator->())); + indexes.push_back(eid); } + funcs.push_back(inode.param.func_name); + for (auto eid : indexes) { + funcs.push_back(std::to_string(eid)); + } + exec_func_.push_back(funcs); ICHECK(inode.op_type == "tvm_op") << "Can only take tvm_op as op"; std::shared_ptr op_args = nullptr; @@ -738,6 +790,21 @@ PackedFunc GraphExecutor::GetFunction(const String& name, const ObjectPtrGetWorkspaceDtype(); }); + } else if (name == "get_workspace_size") { + return PackedFunc( + [sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { *rv = this->GetWorkspaceSize(); }); + } else if (name == "get_func_inorder") { + return PackedFunc( + [sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { *rv = this->GetFuncList(); }); + } else if (name == "get_storageid") { + return PackedFunc( + [sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { *rv = this->GetStorageId(); }); + } else if (name == "get_output_eid") { + return PackedFunc( + [sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { *rv = this->GetOutputEid(args[0]); }); } else { return PackedFunc(); } diff --git a/src/runtime/graph_executor/graph_executor.h b/src/runtime/graph_executor/graph_executor.h index 2f6b8b8147e5..8bbabd1c9c72 100644 --- a/src/runtime/graph_executor/graph_executor.h +++ b/src/runtime/graph_executor/graph_executor.h @@ -416,6 +416,14 @@ class TVM_DLL GraphExecutor : public ModuleNode { } ICHECK_EQ(bitmask, 1 | 2 | 4 | 8 | 16) << "invalid format"; } + /*! \brief get the storage dtype */ + String GetWorkspaceDtype(); + /*! \brief get the storage size */ + String GetWorkspaceSize(); + /*! \brief get the exec func in order*/ + String GetFuncList(); + String GetStorageId(); + int GetOutputEid(int index) const; /*! \brief PackedFunc to lookup a linked paramter from a local Module. */ void DefaultLookupLinkedParam(TVMArgs args, TVMRetValue* rv); /*! \brief Delete NDArray::Container with linked (i.e. static) data. */ @@ -430,6 +438,8 @@ class TVM_DLL GraphExecutor : public ModuleNode { * \param eid The data_enrty_ index. */ void CheckExternalDLTensor(const DLTensor* external, uint32_t eid) const; + /*! \brief Store execute function in order */ + std::vector> exec_func_; /*! * \brief Create an execution function given input. * \param attrs The node attributes. 
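The runtime hooks added above (`get_workspace_dtype`, `get_workspace_size`, `get_func_inorder`, `get_storageid`, `get_output_eid`) are what the TPAT code generator later consumes through the Python `GraphModule` wrappers added earlier in this patch. A minimal usage sketch, assuming a TVM build with this patch applied and a placeholder `model.onnx`, might look like:

```python
# Hypothetical sketch: exercise the introspection getters added by this patch.
# "model.onnx" is a placeholder; any small model buildable for CUDA works.
import onnx
import tvm
from tvm import relay
from tvm.contrib import graph_executor

mod, params = relay.frontend.from_onnx(onnx.load("model.onnx"))
with tvm.transform.PassContext(opt_level=3):
    lib = relay.build(mod, target="cuda", params=params)

dev = tvm.cuda(0)
module = graph_executor.GraphModule(lib["default"](dev))
module.run()

# Each getter returns a whitespace-separated string (or an int for the eid),
# mirroring the C++ implementations in graph_executor.cc.
print(module.get_workspace_dtype())  # dtype of every storage entry
print(module.get_workspace_size())   # byte size of every storage entry
print(module.get_func_inorder())     # per-node function name plus entry ids
print(module.get_storageid())        # storage id assigned to each entry
print(module.get_output_eid(0))      # entry id of output 0
```

In the second patch these strings are surfaced through the `Kernel` properties in `python/tvm/tpat/cuda/kernel.py` (`workspace_dtype`, `workspace_size`, `func_inorder`, `storageid`) when a TensorRT plugin is generated.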
diff --git a/src/tir/transforms/lower_device_kernel_launch.cc b/src/tir/transforms/lower_device_kernel_launch.cc index 932116485fa1..76e1e69f7444 100644 --- a/src/tir/transforms/lower_device_kernel_launch.cc +++ b/src/tir/transforms/lower_device_kernel_launch.cc @@ -28,12 +28,17 @@ #include #include #include +#include #include "../../runtime/thread_storage_scope.h" #include "ir_utils.h" namespace tvm { namespace tir { +extern std::unordered_map > host_name_to_param; +extern std::unordered_map curr2prev; +std::vector device_funcs; +std::vector device_memory_size; namespace { struct KernelInfo { @@ -120,6 +125,14 @@ class DeviceInfoCollector : public StmtVisitor { } void VisitStmt_(const AllocateNode* op) final { + std::ostringstream os; + os << op->buffer_var.get() << " " << op->dtype << " "; + for (auto extent : op->extents) { + os << extent << " "; + } + os << "\n"; + device_memory_size.push_back(os.str()); + auto storage_scope = runtime::StorageScope::Create(GetPtrStorageScope(op->buffer_var)); if (storage_scope.rank == runtime::StorageRank::kShared && storage_scope.tag == ".dyn") { ICHECK(!dyn_shmem_size.defined()) << "Only one dynamic shared memory allocation is allowed."; @@ -298,13 +311,39 @@ class DeviceKernelMutator : public StmtExprMutator { device_kernel_launch_.insert(gvar); Array call_args; + Array cuda_kernel_args; + call_args.push_back(StringImm(dev_info.global_symbol)); for (PrimExpr arg : node->args) { call_args.push_back(arg); + cuda_kernel_args.push_back(arg); } for (const auto& launch_arg : dev_info.launch_args) { call_args.push_back(Substitute(launch_arg, param_map)); } + std::stringstream ss; + ss << gvar->name_hint << " "; + for (auto arg : cuda_kernel_args) { + bool find_param_in_host = false; + for (int i = 0; i < host_name_to_param[curr2prev[gvar->name_hint]].size(); ++i) { + if (arg.same_as(host_name_to_param[curr2prev[gvar->name_hint]][i])) { + ss << i << " "; + find_param_in_host = true; + } + } + std::cout << std::endl; + if (!find_param_in_host) { + ss << arg.get() << " "; + } + } + ss << "\n"; + device_funcs.push_back(ss.str()); + + // std::cout << "3. Lower device kernel" << '\n'; + // for (auto& item: device_funcs) { + // std::cout << item << ", "; + // } + // std::cout << '\n'; auto dtype = node->dtype.is_void() ? 
DataType::Int(32) : node->dtype; @@ -318,9 +357,27 @@ class DeviceKernelMutator : public StmtExprMutator { }; namespace transform { +String GetDeviceFuncsList() { + String ret = ""; + for (auto func : device_funcs) { + ret = ret + func; + } + return ret; +} + +String GetDeviceMemorySize() { + String ret = ""; + for (auto m : device_memory_size) { + ret = ret + m; + } + return ret; +} Pass LowerDeviceKernelLaunch() { auto pass_func = [](IRModule mod, PassContext ctx) -> IRModule { + device_funcs.clear(); + device_memory_size.clear(); + auto mutator = [&mod]() { std::unordered_map device_info_map; for (const auto& [gvar, base_func] : mod->functions) { @@ -372,6 +429,13 @@ Pass LowerDeviceKernelLaunch() { TVM_REGISTER_GLOBAL("tir.transform.LowerDeviceKernelLaunch") .set_body_typed(LowerDeviceKernelLaunch); + +TVM_REGISTER_GLOBAL("tir.transform.retrieve_device_funcs_list") + .set_body([](TVMArgs args, TVMRetValue* rv) { *rv = GetDeviceFuncsList(); }); + +TVM_REGISTER_GLOBAL("tir.transform.retrieve_device_memory_size") + .set_body([](TVMArgs args, TVMRetValue* rv) { *rv = GetDeviceMemorySize(); }); + } // namespace transform } // namespace tir } // namespace tvm diff --git a/src/tir/transforms/make_packed_api.cc b/src/tir/transforms/make_packed_api.cc index 94e245b636a8..f51b079a2ff9 100644 --- a/src/tir/transforms/make_packed_api.cc +++ b/src/tir/transforms/make_packed_api.cc @@ -41,6 +41,7 @@ namespace tvm { namespace tir { static constexpr const char* kDeviceContextVar = "device_api_context"; +std::unordered_map > host_name_to_param; namespace { class ReturnRewriter : public StmtMutator { @@ -277,6 +278,7 @@ PrimFunc MakePackedAPI(PrimFunc func) { // appear in the buffer. std::vector> var_def; std::vector> buffer_def; + std::vector cur_func_param; for (int i = 0; i < static_cast(func_ptr->params.size()); ++i) { Var param = func_ptr->params[i]; @@ -290,6 +292,7 @@ PrimFunc MakePackedAPI(PrimFunc func) { var_def.emplace_back(f_arg_value(param.dtype(), i), param); if (func_ptr->buffer_map.count(param)) { + cur_func_param.push_back(func_ptr->buffer_map[param]->data); buffer_def.emplace_back(param, func_ptr->buffer_map[param]); } @@ -316,6 +319,14 @@ PrimFunc MakePackedAPI(PrimFunc func) { } } + host_name_to_param[name_hint] = cur_func_param; + + // std::cout << "2.2. 
IN MAKE_PACKED_API, NAME HINT: " << name_hint << " : " << '\n'; + // for (auto& item: cur_func_param) { + // std::cout << ">>> " << item << ", "; + // } + // std::cout << "=====================\n\n\n"; + Array args{v_packed_args, buf_packed_arg_type_ids->data, v_num_packed_args, v_out_ret_value, v_out_ret_tcode, v_resource_handle}; @@ -398,6 +409,7 @@ Pass MakePackedAPI() { IRModuleNode* mptr = mod.CopyOnWrite(); IRModule updates; + host_name_to_param.clear(); for (const auto& [gvar, base_func] : mptr->functions) { if (auto opt = base_func.as()) { diff --git a/src/tir/transforms/split_host_device.cc b/src/tir/transforms/split_host_device.cc index 9b1dbf1a6618..a1788758718c 100644 --- a/src/tir/transforms/split_host_device.cc +++ b/src/tir/transforms/split_host_device.cc @@ -32,6 +32,8 @@ #include #include +#include +#include #include #include "../../runtime/thread_storage_scope.h" @@ -41,10 +43,13 @@ namespace tvm { namespace tir { +extern std::unordered_map > host_name_to_param; +std::unordered_map curr2prev; + class HostDeviceSplitter : public StmtMutator { public: - explicit HostDeviceSplitter(IRModule* device_mod, std::function var_supply) - : device_mod_(device_mod), var_supply_(var_supply) {} + explicit HostDeviceSplitter(IRModule* device_mod, std::function var_supply, std::string name_prefix = "") + : device_mod_(device_mod), var_supply_(var_supply), name_prefix_(name_prefix) {} Stmt VisitStmt_(const AttrStmtNode* op) final { if (op->attr_key == tvm::attr::kTarget) { @@ -92,6 +97,9 @@ class HostDeviceSplitter : public StmtMutator { } GlobalVar kernel_symbol_global = var_supply_(); + + curr2prev[kernel_symbol_global->name_hint] = name_prefix_; + PrimFunc device_func(params, body, kernel_ret_type); device_func = WithAttrs(std::move(device_func), {{tvm::attr::kTarget, device_target}, {tir::attr::kNoAlias, Bool(true)}, @@ -100,6 +108,15 @@ class HostDeviceSplitter : public StmtMutator { (*device_mod_)->Add(kernel_symbol_global, device_func); Array args = params.Map([](const Var& var) -> PrimExpr { return var; }); + // std::cout << "1. 
IN SPLIT HOST DEVICE: " << '\n'; + // for (auto& entry : host_name_to_param) { + // std::cout << ">>> NAME HINT: " << entry.first << " : " << '\n'; + // for (auto& item : entry.second) { + // std::cout << ">>> " << item << ", "; + // } + // } + // std::cout << "=========================\n\n\n"; + if (can_propagate_errors) { Var kernel_error_code("kernel_error_code", success->dtype); Call kernel_call(success->dtype, kernel_symbol_global, args); @@ -117,11 +134,12 @@ class HostDeviceSplitter : public StmtMutator { IRModule* device_mod_; // Generate new GlobalVar for the kernel std::function var_supply_; + std::string name_prefix_; }; PrimFunc SplitHostDevice(PrimFunc func, IRModule* device_mod, - std::function var_supply) { - HostDeviceSplitter splitter(device_mod, var_supply); + std::function var_supply, std::string name_prefix = "") { + HostDeviceSplitter splitter(device_mod, var_supply, name_prefix); if (auto body = splitter(func->body); !body.same_as(func->body)) { func.CopyOnWrite()->body = body; @@ -139,6 +157,8 @@ Pass SplitHostDevice() { IRModule device_mod = IRModule(Map({})); IRModule updates = IRModule(Map({})); + curr2prev.clear(); + for (const auto& [gvar, base_func] : mod->functions) { if (auto opt = base_func.as()) { PrimFunc func = opt.value(); @@ -150,7 +170,7 @@ Pass SplitHostDevice() { return global_var_supply->FreshGlobal(kernel_name, false); }; - func = SplitHostDevice(std::move(func), &device_mod, var_supply); + func = SplitHostDevice(std::move(func), &device_mod, var_supply, name_prefix); if (!func.same_as(base_func)) { updates->Add(gvar, func); } From ac896a3429d618eb0fcb9ac5f146533854418138 Mon Sep 17 00:00:00 2001 From: Civitasv Date: Thu, 10 Aug 2023 21:06:34 +0800 Subject: [PATCH 02/14] [tensorrt] [byoc] [plugin] add TPAT python lib, make the api clearer --- python/tvm/tpat/__init__.py | 18 + python/tvm/tpat/cuda/__init__.py | 18 + python/tvm/tpat/cuda/kernel.py | 188 + python/tvm/tpat/cuda/pipeline.py | 124 + python/tvm/tpat/cuda/plugin/Makefile | 78 + .../cuda/plugin/trt8.0_plugin_cu.template | 54 + .../tpat/cuda/plugin/trt8.0_plugin_h.template | 135 + python/tvm/tpat/cuda/rewrite.py | 132 + python/tvm/tpat/cuda/template.py | 283 ++ python/tvm/tpat/cuda/template_params.py | 476 +++ python/tvm/tpat/cuda/type_mapping.py | 59 + tests/python/tpat/cuda/__init__.py | 16 + tests/python/tpat/cuda/common.py | 3455 +++++++++++++++++ tests/python/tpat/cuda/trt.py | 178 + 14 files changed, 5214 insertions(+) create mode 100644 python/tvm/tpat/__init__.py create mode 100644 python/tvm/tpat/cuda/__init__.py create mode 100644 python/tvm/tpat/cuda/kernel.py create mode 100644 python/tvm/tpat/cuda/pipeline.py create mode 100644 python/tvm/tpat/cuda/plugin/Makefile create mode 100644 python/tvm/tpat/cuda/plugin/trt8.0_plugin_cu.template create mode 100644 python/tvm/tpat/cuda/plugin/trt8.0_plugin_h.template create mode 100644 python/tvm/tpat/cuda/rewrite.py create mode 100644 python/tvm/tpat/cuda/template.py create mode 100644 python/tvm/tpat/cuda/template_params.py create mode 100644 python/tvm/tpat/cuda/type_mapping.py create mode 100644 tests/python/tpat/cuda/__init__.py create mode 100644 tests/python/tpat/cuda/common.py create mode 100644 tests/python/tpat/cuda/trt.py diff --git a/python/tvm/tpat/__init__.py b/python/tvm/tpat/__init__.py new file mode 100644 index 000000000000..44b1fdcc5697 --- /dev/null +++ b/python/tvm/tpat/__init__.py @@ -0,0 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from . import cuda \ No newline at end of file diff --git a/python/tvm/tpat/cuda/__init__.py b/python/tvm/tpat/cuda/__init__.py new file mode 100644 index 000000000000..ee0bce8a0d32 --- /dev/null +++ b/python/tvm/tpat/cuda/__init__.py @@ -0,0 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from .pipeline import pipeline \ No newline at end of file diff --git a/python/tvm/tpat/cuda/kernel.py b/python/tvm/tpat/cuda/kernel.py new file mode 100644 index 000000000000..b9a543acb33d --- /dev/null +++ b/python/tvm/tpat/cuda/kernel.py @@ -0,0 +1,188 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +import tvm +import tvm.contrib.graph_executor as runtime +import tvm.relay as relay +from tvm import dlight +from tvm import meta_schedule as ms + + +class Config(object): + def __init__(self, onnx_model, input_shapes, target, work_dir) -> None: + self.onnx_model = onnx_model + self.input_shapes = input_shapes + self.work_dir = work_dir + + if target == "gpu": + self.target = self._detect_cuda_target() + + def tune_option(self): + return { + "target": self.target, + "builder": ms.builder.LocalBuilder(), + "runner": ms.runner.LocalRunner(), + "max_trials_global": 1000, + "max_trials_per_task": 100, + "work_dir": self.work_dir, + } + + def _detect_cuda_target(self): + dev = tvm.cuda() + if not dev.exist: + return None + + return tvm.target.Target( + { + "kind": "cuda", + "max_shared_memory_per_block": dev.max_shared_memory_per_block, + "max_threads_per_block": dev.max_threads_per_block, + "thread_warp_size": dev.warp_size, + "registers_per_block": 65536, + "arch": "sm_" + tvm.cuda().compute_version.replace(".", ""), + } + ) + + +class Kernel(object): + def __init__(self, name, onnx_model, input_shapes, enable_tunning, work_dir): + self._name = name + self._enable_tunning = enable_tunning + self._config = Config(onnx_model, input_shapes, "gpu", work_dir) + + self._lib = None + self._module = None + + def run(self): + """ + Tvm Auto Scheduler + """ + + # 1. Model -> Relay + mod, params = relay.frontend.from_onnx(self._config.onnx_model) + + # 2. Tune it + if self._enable_tunning: + tunning_option = self._config.tune_option() + ms.relay_integration.tune_relay(mod=mod, params=params, **tunning_option) + + # 3. Compiling + try: + if self._enable_tunning: + db = ms.Database.create(kind="json", work_dir=self._config.work_dir) + with db, self._config.target as target, tvm.transform.PassContext(opt_level=3): + mod = dlight.ApplyDefaultSchedule(dlight.gpu.Fallback())(mod) # type: ignore + mod = tvm.tir.transform.ForceNarrowIndexToInt32()(mod) + lib = ms.relay_integration.compile_relay( + database=db, + mod=mod, + target=target, + params=params, + ) + else: + with self._config.target as target, tvm.transform.PassContext(opt_level=3): + mod = dlight.ApplyDefaultSchedule(dlight.gpu.Fallback())(mod) # type: ignore + mod = tvm.tir.transform.ForceNarrowIndexToInt32()(mod) + lib = relay.build(mod, target=target, params=params) + + # load parameters + dev = tvm.cuda(0) + module_exec = runtime.GraphModule(lib["default"](dev)) # type: ignore + + self._lib = lib + self._module = module_exec + + # 4. Running + self._module.run() + except Exception as e: + print("[ERROR]: ", e) + self._lib = None + self._module = None + + @property + def cuda_source_code(self): + """Return source code of this kernel. 
+ + Returns + ------- + str + source code of kernel + """ + if not self._lib: + return None + + try: + source_code = self._lib.get_lib().imported_modules[0].get_source() + source_code = source_code.replace("signed char*", "int*") + source_code = source_code.replace("uint64_t*", "int*") + source_code = source_code.replace("long long", "int") + source_code = source_code.replace("double", "float") + except IndexError: + return None + return source_code + + @property + def runtime_module(self): + return self._lib + + @property + def graph_module(self): + return self._module + + @property + def constant_param(self): + return self._lib.get_constant_params() if self._lib else None + + @property + def device_funcs_inorder(self): + return self._lib.get_device_function_list() if self._lib else None + + @property + def device_funcs_thread_config(self): + return self._lib.get_grid_block_thread_config() if self._lib else None + + @property + def device_allocate_global_memory(self): + return self._lib.get_device_memory_size() if self._lib else None + + @property + def num_inputs(self): + return self._module.get_num_inputs() if self._module else None + + @property + def num_outputs(self): + return self._module.get_num_outputs() if self._module else None + + @property + def workspace_dtype(self): + return self._module.get_workspace_dtype() if self._module else None + + @property + def workspace_size(self): + return self._module.get_workspace_size() if self._module else None + + @property + def func_inorder(self): + return self._module.get_func_inorder() if self._module else None + + @property + def storageid(self): + return self._module.get_storageid() if self._module else None + + @property + def plugin_name(self): + return self._name diff --git a/python/tvm/tpat/cuda/pipeline.py b/python/tvm/tpat/cuda/pipeline.py new file mode 100644 index 000000000000..5e1d112626df --- /dev/null +++ b/python/tvm/tpat/cuda/pipeline.py @@ -0,0 +1,124 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +import os +from typing import Tuple + +import numpy as np +import onnx +import onnx_graphsurgeon as gs +import onnxruntime as ort +from onnx import shape_inference + +from tvm.tpat.cuda.kernel import Kernel +from tvm.tpat.cuda.template import StaticBatchPluginTemplate +from tvm.tpat.cuda.template_params import PluginTemplateParams + +from .rewrite import rewrite +import copy + + +def _extract_target_onnx_node(model, tunning_node): + """ + Extract target node from onnx graph + """ + graph = gs.import_onnx(model) + tensors = graph.tensors() + + tuning_node_inputs = [ + tensors[inp.name].to_variable(dtype=inp.dtype, shape=inp.shape) + for inp in tunning_node.inputs + if (inp.__class__ == gs.Variable and not inp.is_empty()) + ] + tuning_node_outputs = [ + tensors[oup.name].to_variable(dtype=oup.dtype, shape=oup.shape) + for oup in tunning_node.outputs + ] + tuning_input_shapes = [(inp.name, inp.shape, inp.dtype.name) for inp in graph.inputs] + + graph.inputs = tuning_node_inputs + graph.outputs = tuning_node_outputs + graph.cleanup() + submodel = gs.export_onnx(graph) + + return graph, submodel, tuning_input_shapes + + +def pipeline( + onnx_file: str, node_names: list[str], enable_tunning: bool, work_dir: str, output_onnx: str +) -> Tuple[str, list[str]]: + """Generate plugins for specified nodes in an ONNX model. + + This function is the entry point for generating plugins for specific nodes as requested by users. + + Parameters + ---------- + onnx_file : str + Path to the input ONNX file. + node_names : list[str] + Names of the nodes to be generated as TensorRT plugins. + enable_tunning : bool + Flag indicating whether tunning is enabled. + work_dir : str + Path to the tunning log file where the records will be saved. + output_onnx : str + Path to the output ONNX file where the modified model will be saved. + + Returns + ------- + Tuple[str, List[str]] + A tuple containing the path to the output ONNX file and a list of generated plugin paths. + """ + + # 1. load onnx + onnx_model = onnx.load(onnx_file) + inferred_model = shape_inference.infer_shapes(onnx_model) + graph = gs.import_onnx(inferred_model) + + # 2. retrieve all node which need to transform to plugins + if node_names is None or len(node_names) == 0: + return + + node_to_be_tunned = [node for node in graph.nodes if node.name in node_names] + + assert len(node_to_be_tunned) > 0, "The number of nodes to be tunned should larger than zero" + + # 3. generate plugins for each of them + node_name_to_plugin_name = {} + plugin_path = [] + for node in node_to_be_tunned: + name = node.name + plugin_name = "tpat_{}".format(name.replace("/", "_").replace(".", "_")) + + subgraph, submodel, shapes = _extract_target_onnx_node(inferred_model, node) + + kernel = Kernel(plugin_name, submodel, shapes, enable_tunning, work_dir) + kernel.run() + + ## 3.1 fill in template + params = PluginTemplateParams(kernel, submodel, subgraph, node, name) + template = StaticBatchPluginTemplate(params) + lib = template.fill() + + plugin_path.append(lib) + + node_name_to_plugin_name[name] = plugin_name + + # 4. generate the modified onnx + rewrite(inferred_model, node_to_be_tunned, node_name_to_plugin_name, output_onnx) + + return output_onnx, plugin_path diff --git a/python/tvm/tpat/cuda/plugin/Makefile b/python/tvm/tpat/cuda/plugin/Makefile new file mode 100644 index 000000000000..f9b48ffcf27d --- /dev/null +++ b/python/tvm/tpat/cuda/plugin/Makefile @@ -0,0 +1,78 @@ +# +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +CUDA_PATH = /home/huangzhe1/anaconda3/envs/tvm_tunning +CUDNN_PATH = /home/huangzhe1/husen/cudnn-linux-x86_64-8.9.3.28_cuda11-archive +TRT_PATH = /home/huangzhe1/husen/TensorRT-8.6.1.6 + +CUDA_INC_PATH = $(CUDA_PATH)/include +CUDA_LIB_PATH = $(CUDA_PATH)/lib +CUDA_COM_PATH = $(CUDA_PATH)/samples/common/inc + +CUDNN_INC_PATH = $(CUDNN_PATH)/include +CUDNN_LIB_PATH = $(CUDNN_PATH)/lib + +TRT_INC_PATH = $(TRT_PATH)/include +TRT_LIB_PATH = $(TRT_PATH)/lib + + +ARCH = sm_86 +GCC = g++ +NVCC = $(CUDA_PATH)/bin/nvcc +# CCFLAGS = -g -std=c++11 -DNDEBUG +CCFLAGS = -w -std=c++11 +# CCFLAGS+= -DDEBUG_ME +INCLUDES := -I. -I$(CUDA_COM_PATH) -I$(CUDA_INC_PATH) -I$(CUDNN_INC_PATH) -I$(TRT_INC_PATH) -I/usr/include + +LDFLAGS := -L$(CUDA_LIB_PATH) -L$(CUDNN_LIB_PATH) -L$(TRT_LIB_PATH) +LDFLAGS += -lnvinfer -lcudart -lcuda + +LDFLAGS += -Wl,-rpath=$(CUDA_LIB_PATH) +LDFLAGS += -Wl,-rpath=$(CUDNN_LIB_PATH) +LDFLAGS += -Wl,-rpath=$(TRT_LIB_PATH) + +SO = $(plugin_name).so +OBJ = $(shell find . -name '*.o') +DEP = $(OBJ:.o=.d) + +SRCDIR := ./src +OBJDIR := ./obj +LIBDIR := ./lib + +all: $(SO) + +$(plugin_name).so: $(plugin_name).o + +-include $(DEP) + +clean: + rm -rf $(LIBDIR)/$(SO) $(OBJDIR)/* + +%.o: $(SRCDIR)/%.cpp + $(AT)if [ ! -d $(OBJDIR) ]; then mkdir -p $(OBJDIR); fi + $(GCC) $(CCFLAGS) -fPIC -MD -MP $(INCLUDES) -o $@ -c $< + +%.o: $(SRCDIR)/%.cu + $(AT)if [ ! -d $(OBJDIR) ]; then mkdir -p $(OBJDIR); fi + $(NVCC) $(CCFLAGS) -M -MT $@ $(INCLUDES) -o $(@:.o=.d) $< + $(NVCC) $(CCFLAGS) $(INCLUDES) -Xcompiler -fPIC -arch=$(ARCH) -o $@ -c $< + +$(SO): + $(GCC) $(CCFLAGS) -shared -o $@ $+ $(LDFLAGS) + $(AT)if [ ! 
-d $(LIBDIR) ]; then mkdir -p $(LIBDIR); fi + $(AT) mv *.o $(OBJDIR)/ + $(AT) mv *.d $(OBJDIR)/ + $(AT) mv *.so $(LIBDIR)/ diff --git a/python/tvm/tpat/cuda/plugin/trt8.0_plugin_cu.template b/python/tvm/tpat/cuda/plugin/trt8.0_plugin_cu.template new file mode 100644 index 000000000000..565a72b00e23 --- /dev/null +++ b/python/tvm/tpat/cuda/plugin/trt8.0_plugin_cu.template @@ -0,0 +1,54 @@ +#include "{{plugin_name}}.h" +#include +#include +#include +#include +#include + +#define BLOCKSIZE_X 16 +#define BLOCKSIZE_Y 16 + +using namespace nvinfer1; +using namespace plugin; + +// CUDA Runtime error messages +#ifdef __DRIVER_TYPES_H__ +static const char *_cudaGetErrorEnum(cudaError_t error) +{ + return cudaGetErrorName(error); +} +#endif + +template +void check(T result, char const *const func, const char *const file, + int const line) +{ + if (result) + { + fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n", file, line, + static_cast(result), _cudaGetErrorEnum(result), func); + exit(EXIT_FAILURE); + } +} +#define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__) + + +{{plugin_kernels_body}} + +PluginFieldCollection {{plugin_name}}Creator::mFC{}; +std::vector {{plugin_name}}Creator::mPluginAttributes; + +int {{plugin_name}}::enqueue(const nvinfer1::PluginTensorDesc* inputDesc, const nvinfer1::PluginTensorDesc* outputDesc, const void* const* inputs, void* const* outputs, void* workspace, cudaStream_t stream) noexcept { + {% for constant in plugin_constant_init %} + const {{constant.type}} constant_{{constant.index}}[{{constant.length}}] = { {{constant.value}} }; + checkCudaErrors(cudaMemcpyAsync({{constant.pos}}, &constant_{{constant.index}}, {{constant.length}} * sizeof({{constant.type}}), cudaMemcpyHostToDevice, stream)); + {% endfor %} + dim3 dimBlock, dimGrid; + {% for kernel in plugin_kernels_params %} + dimGrid = dim3{{kernel.grid_dim}}; + dimBlock = dim3{{kernel.block_dim}}; + {{kernel.name}}<<>>({{kernel.enqueue_params}}); + {% endfor %} +} + +REGISTER_TENSORRT_PLUGIN({{plugin_name}}Creator); diff --git a/python/tvm/tpat/cuda/plugin/trt8.0_plugin_h.template b/python/tvm/tpat/cuda/plugin/trt8.0_plugin_h.template new file mode 100644 index 000000000000..fdc9a0bcbe29 --- /dev/null +++ b/python/tvm/tpat/cuda/plugin/trt8.0_plugin_h.template @@ -0,0 +1,135 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "NvInfer.h" +#include +#include +#include +#include + +namespace nvinfer1 +{ +namespace plugin +{ + +class {{plugin_name}}: public IPluginV2DynamicExt { +public: + {{plugin_name}}() {} + + {{plugin_name}}(const void *buffer, size_t length) { + } + + virtual size_t getSerializationSize() const noexcept override { + return 0; + } + virtual void serialize(void *buffer) const noexcept override {} + + //! The combination of kLINEAR + kFLOAT is supported. 
+ bool supportsFormatCombination(int pos, const PluginTensorDesc* inOut, int nbInputs, int nbOutputs) noexcept override + { + bool condition = true; + {% for tensor_format in plugin_tensor_format %}if (pos == {{ loop.index0 }}){ + //std::cout << (inOut[pos].format == nvinfer1::TensorFormat::k{{tensor_format.format}}) << ", " << (inOut[pos].type == nvinfer1::DataType::k{{tensor_format.type}}) << std::endl; + condition &= inOut[pos].format == nvinfer1::TensorFormat::k{{tensor_format.format}}; + condition &= inOut[pos].type == nvinfer1::DataType::k{{tensor_format.type}}; + } + {% endfor %} + return condition; + } + + nvinfer1::IPluginV2DynamicExt* clone() const noexcept override { + return new {{plugin_name}}(); + } + int getNbOutputs() const noexcept override { + //std::cout << __FUNCTION__ << std::endl; + return {{plugin_output_number}}; + } + nvinfer1::DimsExprs getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs, nvinfer1::IExprBuilder& exprBuilder) noexcept override { + //std::cout << __FUNCTION__ << std::endl; + {% for tensor_dims in plugin_output_shape %}if (outputIndex == {{ loop.index0 }}){ + nvinfer1::DimsExprs output_shape; + output_shape.nbDims = {{tensor_dims.nbdims}}; + {% for s in tensor_dims.shape %}output_shape.d[{{loop.index0}}] = exprBuilder.constant({{s}}); + {% endfor %} + return output_shape; + } + {% endfor %} + } + nvinfer1::DataType getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const noexcept override{ + //std::cout << __FUNCTION__ << std::endl; + {% for type in plugin_output_type %}if (index == {{ loop.index0 }}){ + return nvinfer1::DataType::k{{type}}; + } + {% endfor %} + } + size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, int nbInputs, const nvinfer1::PluginTensorDesc* outputs, int nbOutputs) const noexcept override{ + return {{plugin_workspace_size}}; + } + int enqueue(const nvinfer1::PluginTensorDesc* inputDesc, const nvinfer1::PluginTensorDesc* outputDesc, const void* const* inputs, void* const* outputs, void* workspace, cudaStream_t stream) noexcept override; + + void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, int nbInputs, const nvinfer1::DynamicPluginTensorDesc* out, int nbOutputs) noexcept override {} + int initialize() noexcept override {return 0;} + void terminate() noexcept override {} + void destroy() noexcept override { delete this; } + void setPluginNamespace(const char* szNamespace) noexcept override {mNamespace = szNamespace;} + const char* getPluginNamespace() const noexcept override {return mNamespace.c_str();} + const char* getPluginType() const noexcept override {return "{{plugin_name}}";} + const char* getPluginVersion() const noexcept override {return "1";} + void attachToContext(cudnnContext * /*cudnn*/, cublasContext * /*cublas*/, nvinfer1::IGpuAllocator * /*allocator*/) noexcept {} + void detachFromContext() noexcept {} + +private: + const char* mPluginNamespace; + std::string mNamespace; +}; + +class {{plugin_name}}Creator: public nvinfer1::IPluginCreator { +public: + {{plugin_name}}Creator(){ + mFC.nbFields = mPluginAttributes.size(); + mFC.fields = mPluginAttributes.data(); + } + nvinfer1::IPluginV2DynamicExt* deserializePlugin(const char* name, const void* serialData, size_t serialLength) noexcept override { + {{plugin_name}}* obj = new {{plugin_name}}{serialData, serialLength}; + obj->setPluginNamespace(mNamespace.c_str()); + return obj; + } + + const char* getPluginName() const noexcept override {return "{{plugin_name}}";} + const 
char* getPluginVersion() const noexcept override {return "1";} + + void setPluginNamespace(const char* szNamespace) noexcept override {mNamespace = szNamespace;} + const char* getPluginNamespace() const noexcept override {return mNamespace.c_str();} + + const nvinfer1::PluginFieldCollection* getFieldNames() noexcept override { + //std::cout << __FUNCTION__ << std::endl; + return &mFC; + } + nvinfer1::IPluginV2DynamicExt* createPlugin(const char* name, const nvinfer1::PluginFieldCollection* fc) noexcept override { + //std::cout << __FUNCTION__ << std::endl; + {{plugin_name}}* obj = new {{plugin_name}}{}; + obj->setPluginNamespace(mNamespace.c_str()); + return obj; + } +private: + std::string mNamespace; + static PluginFieldCollection mFC; + static std::vector mPluginAttributes; +}; + +} // namespace plugin + +} // namespace nvinfer1 diff --git a/python/tvm/tpat/cuda/rewrite.py b/python/tvm/tpat/cuda/rewrite.py new file mode 100644 index 000000000000..61b63be09ff0 --- /dev/null +++ b/python/tvm/tpat/cuda/rewrite.py @@ -0,0 +1,132 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +import os + +import onnx +import onnx_graphsurgeon as gs +from loguru import logger +from onnx import shape_inference +from .type_mapping import onnx_type_mapping + + +def _handle_trt_not_support_type( + graph, + output_model_path, + node_name_to_plugin_name, + onnx_original_tensor_type, +): + count = 0 + insert_cast_nodes = False + + for node in graph.nodes: + if node.name in node_name_to_plugin_name: + node.op = node_name_to_plugin_name[node.name] + for i, inp in enumerate(node.inputs): + if inp.is_empty(): + node.inputs.remove(inp) + graph.cleanup() + continue + if onnx_original_tensor_type[inp.name] in onnx_type_mapping: + cast_node = gs.Node( + op="Cast", + name="cast_to_int32_for_" + inp.name.split(":")[0], + attrs={"to": 6}, + ) # 6: INT32 + + cast_node.inputs = [inp] + cast_node_out = gs.Variable(cast_node.name + ":0") + cast_node.outputs = [cast_node_out] + node.inputs[i] = cast_node_out + graph.nodes.append(cast_node) + graph.cleanup() + insert_cast_nodes = True + for i, oup in enumerate(node.outputs): + if onnx_original_tensor_type[oup.name] in onnx_type_mapping: + dtype = onnx_type_mapping[onnx_original_tensor_type[oup.name]] + cast_node = gs.Node( + op="Cast", + name="cast_back_for_" + oup.name.split(":")[0], + attrs={"to": dtype}, + ) + + cast_node.outputs = [oup] + cast_node_out = gs.Variable(cast_node.name + ":0") + cast_node.inputs = [cast_node_out] + node.outputs[i] = cast_node_out + graph.nodes.append(cast_node) + graph.cleanup() + insert_cast_nodes = True + count = count + 1 + assert count == len(node_name_to_plugin_name) + if insert_cast_nodes: + _remove_unnecessary_cast_nodes(graph) + onnx.save(gs.export_onnx(graph), output_model_path) + + +def _remove_unnecessary_cast_nodes(graph): + graph.toposort() + cast_nodes = [ + node + for node in graph.nodes + if (node.op == "Cast" and node.outputs[0] not in graph.outputs and node.o().op == "Cast") + ] + for node in cast_nodes: + if ( + node.attrs["to"] == 13 + and len(node.inputs[0].inputs) <= 1 + and len(node.outputs[0].outputs) <= 1 + ): + node.o().inputs = node.inputs + node.inputs.clear() + graph.cleanup() + + +def _compute_tensor_type(graph, tunning_nodes): + onnx_original_tensor_type = {} + + for tunning_node in tunning_nodes: + for inp in tunning_node.inputs: + if inp.__class__ == gs.Constant or not inp.is_empty(): + onnx_original_tensor_type[inp.name] = inp.dtype.name + [ + onnx_original_tensor_type.update({oup.name: oup.dtype.name}) + for oup in tunning_node.outputs + ] + return onnx_original_tensor_type + + +def rewrite( + inferred_model, + tunning_nodes, + node_name_to_plugin_name, + output_model_path, +): + """ + Insert cast operator for operators which inputs or outputs has bool type. + Modify operator type in onnx model for tensorRT can run plugin. + """ + + graph = gs.import_onnx(inferred_model) + _onnx_original_tensor_type = _compute_tensor_type(graph, tunning_nodes) + + _handle_trt_not_support_type( + graph, + output_model_path, + node_name_to_plugin_name, + _onnx_original_tensor_type, + ) diff --git a/python/tvm/tpat/cuda/template.py b/python/tvm/tpat/cuda/template.py new file mode 100644 index 000000000000..df02e9f0b7d9 --- /dev/null +++ b/python/tvm/tpat/cuda/template.py @@ -0,0 +1,283 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import contextlib +import os +import re + +import onnx +import onnx_graphsurgeon as gs +from jinja2 import Environment, FileSystemLoader +from loguru import logger +from onnx import shape_inference + + +@contextlib.contextmanager +def pushd(new_dir): + pre_dir = os.getcwd() + os.chdir(new_dir) + try: + yield + finally: + os.chdir(pre_dir) + + +def rm_part_define(source_code): + m = re.search('extern "C"', source_code.strip()) + return source_code[m.start() :] + + +class PluginTemplate(object): + def __init__(self, template_params): + self._template_params = template_params + self._plugin_name = template_params.plugin_name + self._plugin_config = template_params.plugin_config + + with pushd(os.path.normpath(os.path.dirname(__file__))): + template_loader = FileSystemLoader(searchpath='./') + self._template_env = Environment(loader=template_loader) + + self._plugin_output_number = template_params.output_num + self._plugin_output_type = template_params.output_type + self._plugin_workspace_size = template_params.workspace_size + self._plugin_total_workspace_size = template_params.total_workspace_size + onnx_output_shape = template_params.output_shape + onnx_input_shape = template_params.input_shape + self._plugin_output_shape = self.parse_plugin_output_shape(onnx_output_shape) + self._plugin_input_shape = self.parse_plugin_input_shape(onnx_input_shape) + self._plugin_tensor_input_index = template_params.onnx_tensor_input_index + onnx_tensor_type = template_params.tensor_type + self._plugin_tensor_format = self.parse_plugin_tensor_format(onnx_tensor_type) + kernel_order = template_params.kernel_order + workspace_init = template_params.workspace_init + self._plugin_kernels_params = self.parse_plugin_kernels_params(kernel_order) + self._plugin_constant_init = self.parse_plugin_workspace_init(workspace_init) + self._plugin_kernels_body = template_params.cuda_source_code + self._onnx_input_python_type = template_params.onnx_input_python_type + self._onnx_output_python_type = template_params.onnx_output_python_type + self._input_workspace_size = template_params.input_workspace_size + self._output_workspace_size = template_params.output_workspace_size + + @property + def plugin_name(self): + return self._plugin_name + + class TensorDims: + def __init__(self, nbdims, shape): + self.nbdims = nbdims + self.shape = tuple(shape) + + def __str__(self): + return f"TensorDims(nbdims={self.nbdims}, shape={self.shape})" + + def __repr__(self): + return str(self) + + class TensorFormat: + def __init__(self, format, type): + self.format = format + self.type = type + + def __str__(self): + return f"TensorFormat(format={self.format}, type={self.type})" + + def __repr__(self): + return str(self) + + class Kernel: + def __init__( + self, + name, + grid_dim, + block_dim, + enqueue_params, + kernel_params=None, + code=None, + ): + self.name = name + self.grid_dim = grid_dim + self.block_dim = block_dim + self.enqueue_params = enqueue_params + 
self.kernel_params = kernel_params + self.code = code + + def __str__(self): + return f"Kernel(name={self.name}, grid_dim={self.grid_dim}, block_dim={self.block_dim}, enqueue_params={self.enqueue_params})" + + def __repr__(self): + return str(self) + + class Constant: + def __init__(self, pos, value, type, index, length): + self.pos = pos + self.value = value + self.type = type + self.index = index + self.length = length + + def __str__(self): + return f"Constant(pos={self.pos}, length={self.length}, type={self.type}, index={self.index})" + + def __repr__(self): + return str(self) + + class Case: + def __init__( + self, + batch_size, + plugin_template, + dy_plugin_input_size_type_without_bs=None, + dy_plugin_output_size_type_without_bs=None, + ): + self.batch_size = batch_size + self.plugin_template = plugin_template + self.dy_plugin_input_size_type_without_bs = dy_plugin_input_size_type_without_bs + self.dy_plugin_output_size_type_without_bs = dy_plugin_output_size_type_without_bs + + class Shape: + def __init__(self, size, dtype): + self.size = size + self.dtype = dtype + + def parse_plugin_input_shape(self, onnx_input_shape): + plugin_input_shape = [] + for s in onnx_input_shape: + nbdims = len(s) + shape = s + plugin_input_shape.append(self.TensorDims(nbdims, shape)) + return plugin_input_shape + + def parse_plugin_output_shape(self, onnx_output_shape): + plugin_output_shape = [] + for s in onnx_output_shape: + nbdims = len(s) + shape = s + plugin_output_shape.append(self.TensorDims(nbdims, shape)) + return plugin_output_shape + + def parse_plugin_tensor_format(self, onnx_tensor_type): + plugin_tensor_format = [] + for dtype in onnx_tensor_type: + plugin_tensor_format.append(self.TensorFormat("LINEAR", dtype)) + return plugin_tensor_format + + def parse_plugin_kernels_params(self, kernel_order): + kernel_call = {} + plugin_kernels_params = [] + for func_name in kernel_order: + if func_name not in kernel_call.keys(): + kernel_call[func_name] = 0 + key_name = func_name + else: + kernel_call[func_name] += 1 + key_name = func_name + "_" + str(kernel_call[func_name]) + plugin_kernels_params.append( + self.Kernel( + func_name, + self._plugin_config[key_name]["grid_dim"], + self._plugin_config[key_name]["block_dim"], + self._plugin_config[key_name]["enqueue_params"], + ) + ) + return plugin_kernels_params + + def parse_plugin_workspace_init(self, workspace_init): + plugin_constant_init = [] + for init_constant in workspace_init.items(): + value_str = ", ".join(str(ele) for ele in init_constant[1][0]) + value_str = value_str.strip(",") + plugin_constant_init.append( + self.Constant( + init_constant[0], + value_str, + init_constant[1][1], + init_constant[1][2], + len(init_constant[1][0]), + ) + ) + return plugin_constant_init + + def generate_header_file(self): + raise Exception("not implement method") + + def generate_source_file(self): + raise Exception("not implement method") + + def fill(self): + plugin_header_path = f"./plugin/src/{self.plugin_name}.h" + plugin_source_path = f"./plugin/src/{self.plugin_name}.cu" + if os.path.isfile(plugin_header_path): + os.remove(plugin_header_path) + if os.path.isfile(plugin_source_path): + os.remove(plugin_source_path) + + with pushd(os.path.normpath(os.path.dirname(__file__))): + self.generate_header_file() + self.generate_source_file() + self.build_plugin() + + return f"{os.path.dirname(os.path.abspath(__file__))}/plugin/lib/{self.plugin_name}.so" + + def build_plugin(self): + os.chdir("./plugin") + + os.system(f"make clean 
plugin_name={self.plugin_name}") + os.system(f"make plugin_name={self.plugin_name}") + + os.chdir("../") + + +class StaticBatchPluginTemplate(PluginTemplate): + """ + Fill in the useable params which generated by PluginTemplateParams to plugin template. + The plugin template is compatible with TensorRT-8.0. + """ + + def __init__( + self, + template_params, + TEMPLATE_HEADER_FILE="./plugin/trt8.0_plugin_h.template", + TEMPLATE_SOURCE_FILE="./plugin/trt8.0_plugin_cu.template", + ): + super(StaticBatchPluginTemplate, self).__init__(template_params) + + self._template_header_file = TEMPLATE_HEADER_FILE + self._template_source_file = TEMPLATE_SOURCE_FILE + + def generate_header_file(self): + template = self._template_env.get_template(self._template_header_file) + output_text = template.render( + plugin_name=self._plugin_name, + plugin_output_number=self._plugin_output_number, + plugin_output_shape=self._plugin_output_shape, + plugin_output_type=self._plugin_output_type, + plugin_workspace_size=self._plugin_workspace_size, + plugin_tensor_format=self._plugin_tensor_format, + ) + with open("./plugin/src/{}.h".format(self._plugin_name), "w") as f: + f.write(output_text) + + def generate_source_file(self): + template = self._template_env.get_template(self._template_source_file) + output_text = template.render( + plugin_name=self._plugin_name, + plugin_kernels_params=self._plugin_kernels_params, + plugin_kernels_body=self._plugin_kernels_body, + plugin_constant_init=self._plugin_constant_init, + ) + with open("./plugin/src/{}.cu".format(self._plugin_name), "w") as f: + f.write(output_text) diff --git a/python/tvm/tpat/cuda/template_params.py b/python/tvm/tpat/cuda/template_params.py new file mode 100644 index 000000000000..8cec8e48e794 --- /dev/null +++ b/python/tvm/tpat/cuda/template_params.py @@ -0,0 +1,476 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import copy +import os +import re + +import numpy as np +import onnx +import onnx_graphsurgeon as gs +import onnxruntime as ort +from onnx import shape_inference +from .type_mapping import plugin_type_size, python_to_trt_type_mapping, tvm_to_c_type_mapping + + +class PluginTemplateParams(object): + """ + Generate useable params for TensorRT plugin. 
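+
+    The constructor parses the metadata exported for the tuned kernel (CUDA
+    kernel launch order, grid/block thread configuration, workspace dtypes and
+    sizes, storage ids and constants) and turns it into the fields consumed by
+    the plugin templates in template.py.
+
+    Illustrative usage (a sketch; the variable names below are the same ones
+    used elsewhere in this module, not a fixed API)::
+
+        params = PluginTemplateParams(kernel, model, graph, tunning_node, name)
+        template = StaticBatchPluginTemplate(params)
+        plugin_lib = template.fill()  # path of the generated plugin .so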
+ """ + + def __init__(self, kernel, model, graph, tunning_node, name): + self._kernel = kernel + self._model = model + self._graph = graph + self._tunning_name = name + self._tunning_node = tunning_node + + self._onnx_input_order = [] + self._input_dict = {} + self._tvm_executor_order = {} + self._allocate_size = [] + self._data_type = [] + self._cuda_kernel_order = {} + self._gpu_thread_config = {} + self._tvm_func_order = [] + self._nums_input = 0 + self._nums_output = 0 + self._workspace_size = 0 + self._output_type = [] + self._cuda_func_order = [] + self._tvm_constant = {} + self._tvm_workspace_constant = {} + self._onnx_input_shape = [] + self._onnx_output_shape = [] + self._onnx_weight_input_index = [] + self._onnx_tensor_input_index = [] + self._onnx_tensor_type = [] + self._onnx_input_python_type = [] + self._onnx_output_python_type = [] + self._storage_id = [] + self._allocate_global_memory = {} + self._plugin_config = None + + self.infer_for_output_shape() + self.input_weight_and_tensor_index() + self.parse() + self.align_onnx_and_tvm_input() + self.match_address_for_eid() + self.cuda_kernel_config() + + def describe(self): + print(f"Cuda Kernel Order >>> {self._cuda_kernel_order}") + print(f"Gpu Thread Config >>> {self._gpu_thread_config}") + print(f"Cuda Func Rrder >>> {self._cuda_func_order}") + print(f"Nums Input >>> {self._nums_input}") + print(f"Nums Output >>> {self._nums_output}") + print(f"Data Type >>> {self._data_type}") + print(f"Allocate Size >>> {self._allocate_size}") + print(f"Tvm Executor Order >>> {self._tvm_executor_order}") + print(f"Tvm Func Order >>> {self._tvm_func_order}") + print(f"Cuda Source Code >>> {self._cuda_source_code}") + print(f"Storage Id >>> {self._storage_id}") + print(f"Storage Slot >>> {self.storage_slot}") + print(f"Allocate Global Memory >>> {self._allocate_global_memory}") + print(f"Input Workspace Size >>> {self._input_workspace_size}") + print(f"Output Workspace Size >>> {self._output_workspace_size}") + + + # Parse Constant. + def parse_constant_params(self, constant_params): + tvm_constant = {} + for key, value in constant_params.items(): + tvm_constant[key] = value.flatten() + return tvm_constant + + # Parse device functions params order. + def parse_device_funcs_params(self, device_funcs_inorder): + cuda_kernel_order = {} + for device_func_inorder in device_funcs_inorder: + if len(device_func_inorder) == 0: + continue + tvm_device_func = device_func_inorder.split() + + cuda_kernel_order[tvm_device_func[0]] = tvm_device_func[1:] + return cuda_kernel_order + + # Parse device functions thread config. + def parse_device_funcs_thread_config(self, device_funcs_thread_config): + gpu_thread_config = {} + cuda_func_order = [] + for device_func_thread_config in device_funcs_thread_config: + if len(device_func_thread_config) == 0: + continue + config = device_func_thread_config.split() + cuda_func_name = config[0] + gpu_thread_config[cuda_func_name] = config[1:] + cuda_func_order.append(cuda_func_name) + return gpu_thread_config, cuda_func_order + + # Parse global memory allocated in device side. + def parse_device_allocate_global_memory(self, device_allocate_global_memory): + allocate_global_memory = {} + for allocate_memory in device_allocate_global_memory: + if len(allocate_memory) == 0: + continue + allocate = allocate_memory.split() + allocate_global_memory[allocate[0]] = allocate[1:] + return allocate_global_memory + + # Parse variables storage index. 
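+    # The storage id string assigns one slot index per storage entry, e.g. a
+    # value such as "0 1 1 2" (illustrative) marks entries 1 and 2 as sharing a
+    # slot; match_address_for_eid later reuses one address for a shared slot.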
+ def parse_storageid(self, storageid): + storage_id = [] + storage_slot = {} + for sid in storageid: + if len(sid) == 0: + continue + storage_id = sid.split() + storage_slot = {}.fromkeys(sid).keys() + return storage_id, storage_slot + + # Parse numbers of input. + def parse_nums_input(self, nums_input): + real_nums_input = int(nums_input) - int(len(self._tvm_constant)) + return real_nums_input + + # Parse numbers of output. + def parse_nums_output(self, nums_output): + real_nums_output = int(nums_output) + return real_nums_output + + # Parse datatype of variables in memory. + def parse_workspace_dtype(self, workspaces_dtype): + return workspaces_dtype.split() + + # Parse size of variables in memory. + def parse_workspace_size(self, workspace_size): + return workspace_size.split() + + def parse_func_inorder(self, funcs_inorder): + """ + Parse the order of host functions. + """ + func_call = {} + tvm_executor_order = {} + tvm_func_order = [] + for host_func_inorder in funcs_inorder: + if len(host_func_inorder) == 0: + continue + tvm_host_func = host_func_inorder.split() + if tvm_host_func[0] not in tvm_executor_order.keys(): + tvm_executor_order[tvm_host_func[0]] = tvm_host_func[1:] + tvm_func_order.append(tvm_host_func[0]) + func_call[tvm_host_func[0]] = 0 + else: + func_call[tvm_host_func[0]] += 1 + func_name = tvm_host_func[0] + "_" + str(func_call[tvm_host_func[0]]) + tvm_executor_order[func_name] = tvm_host_func[1:] + tvm_func_order.append(func_name) + return tvm_executor_order, tvm_func_order + + def parse(self): + constant_params = self._kernel.constant_param + device_funcs_inorder = self._kernel.device_funcs_inorder.split("\n") + device_funcs_thread_config = self._kernel.device_funcs_thread_config.split("\n") + device_allocate_global_memory = self._kernel.device_allocate_global_memory.split("\n") + num_inputs = self._kernel.num_inputs + num_outputs = self._kernel.num_outputs + workspace_dtype = self._kernel.workspace_dtype + workspace_size = self._kernel.workspace_size + funcs_inorder = self._kernel.func_inorder.split("\n") + storage_id = self._kernel.storageid.split("\n") + + self._tvm_constant = self.parse_constant_params(constant_params) + self._cuda_kernel_order = self.parse_device_funcs_params(device_funcs_inorder) + ( + self._gpu_thread_config, + self._cuda_func_order, + ) = self.parse_device_funcs_thread_config(device_funcs_thread_config) + self._nums_input = self.parse_nums_input(num_inputs) + self._nums_output = self.parse_nums_output(num_outputs) + self._data_type = self.parse_workspace_dtype(workspace_dtype) + self._allocate_size = self.parse_workspace_size(workspace_size) + self._tvm_executor_order, self._tvm_func_order = self.parse_func_inorder(funcs_inorder) + self._cuda_source_code = self._kernel.cuda_source_code + self._storage_id, self.storage_slot = self.parse_storageid(storage_id) + self._allocate_global_memory = self.parse_device_allocate_global_memory( + device_allocate_global_memory + ) + self._input_workspace_size = self._allocate_size[0 : self._nums_input] + self._output_workspace_size = self._allocate_size[-self._nums_output :] + + self.describe() + + def infer_for_output_shape(self): + """ + Infer for output shape. 
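+
+        Record the C and TensorRT dtypes of the tuned node's inputs and outputs
+        and cache its ONNX input/output shapes; constant inputs and empty
+        optional inputs are skipped when collecting the input shapes.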
+ """ + tunning_node = self._tunning_node + + for inp in tunning_node.inputs: + if inp.__class__==gs.Constant or not inp.is_empty(): + self._onnx_input_python_type.append(tvm_to_c_type_mapping[inp.dtype.name]) + self._onnx_tensor_type.append(python_to_trt_type_mapping[inp.dtype.name]) + + for oup in tunning_node.outputs: + self._onnx_output_python_type.append(tvm_to_c_type_mapping[oup.dtype.name]) + self._onnx_tensor_type.append(python_to_trt_type_mapping[oup.dtype.name]) + + self._onnx_output_shape = [oup.shape for oup in tunning_node.outputs] + self._onnx_input_shape = [ + inp.shape + for inp in tunning_node.inputs + if ( + inp.__class__ == gs.Variable + and not inp.is_empty() + ) + ] + + def input_weight_and_tensor_index(self): + """ + Calculate the index of weight input and tensor input. + """ + tunning_node = self._tunning_node + self._onnx_tensor_input_index = [ + k + for k, inp in enumerate(tunning_node.inputs) + if ( + inp.__class__ == gs.Variable + and not (len(inp.inputs) == 1 and tunning_node.i(k, 0).op == "Constant") + ) + ] + + self._onnx_weight_input_index = [ + k + for k, inp in enumerate(tunning_node.inputs) + if ( + inp.__class__ == gs.Constant + or (len(inp.inputs) == 1 and tunning_node.i(k, 0).op == "Constant") + ) + ] + + def align_onnx_and_tvm_input(self): + """ + Align onnx and tvm input. Because tvm let constants in the after of variables params. + """ + model = self._model + graph = model.graph + nodes = graph.node + onnx_inputs = graph.input + + init_order = {} + for node in nodes: + op_inputs = node.input + for i in range(len(op_inputs)): + init_order[op_inputs[i]] = i + + for i in onnx_inputs: + self._onnx_input_order.append(init_order[i.name]) + + def match_address_for_eid(self): + """ + The memory address used by functions params. 
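+
+        Map every storage entry to the expression the generated plugin uses at
+        enqueue time: graph outputs become "outputs[i]", graph inputs become
+        "inputs[j]" (reordered through self._onnx_input_order), and intermediate
+        buffers become workspace offsets such as "(workspace + 256)" (offset
+        value illustrative). Entries sharing a storage id map to one address.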
+ """ + workspace = 0 + input_slot_dict = {} + for i in range(self._nums_output): + eid = self._kernel.graph_module.get_output_eid(i) + idx = int(self._storage_id[eid]) + self._output_type.append(python_to_trt_type_mapping[self._data_type[eid]]) + self._input_dict[str(eid)] = "outputs[" + str(i) + "]" + input_slot_dict[idx] = self._input_dict[str(eid)] + + duplicate_allocate = {} + for i in range(len(self._allocate_size)): + idx = int(self._storage_id[i]) + if idx not in duplicate_allocate.keys(): + duplicate_allocate[idx] = 0 + duplicate_allocate[idx] = max(int(self._allocate_size[i]), int(duplicate_allocate[idx])) + for i in range(len(self._allocate_size)): + idx = int(self._storage_id[i]) + if idx in input_slot_dict.keys(): + self._input_dict[str(i)] = input_slot_dict[idx] + continue + if i < self._nums_input: + self._input_dict[str(i)] = "inputs[" + str(self._onnx_input_order[i]) + "]" + elif i < len(self._allocate_size) - self._nums_output: + if i == self._nums_input: + self._input_dict[str(i)] = "workspace" + else: + self._input_dict[str(i)] = "(workspace + " + str(workspace) + ")" + workspace += int(duplicate_allocate[idx]) + self._workspace_size = workspace + if ( + self._input_dict[str(i)] not in self._tvm_workspace_constant.keys() + and str(idx) in self._tvm_constant.keys() + ): + # self._tvm_workspace_constant[self._input_dict[str(i)]] = None + self._tvm_workspace_constant[self._input_dict[str(i)]] = ( + self._tvm_constant[str(idx)], + tvm_to_c_type_mapping[self._data_type[i]], + int(i), + ) + input_slot_dict[idx] = self._input_dict[str(i)] + + if len(self._allocate_global_memory) != 0: + for key, value in self._allocate_global_memory.items(): + self._input_dict[key] = ( + "(" + tvm_to_c_type_mapping[value[0]] + "*)(workspace + " + str(workspace) + ")" + ) + workspace += int(value[1]) * plugin_type_size[value[0]] + self._workspace_size = workspace + + def cuda_kernel_config(self): + """ + Grid. Block. Thread. size. 
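+
+        Build self._plugin_config with one entry per CUDA kernel launch: the
+        grid/block dimensions taken from the exported thread configuration plus
+        an "enqueue_params" string resolved through self._input_dict, roughly
+        (kernel name and values are hypothetical)::
+
+            {"fused_add_kernel0": {"grid_dim": "(1,1,1)",
+                                   "block_dim": "(256,1,1)",
+                                   "enqueue_params": "(float*)inputs[0], (float*)outputs[0]"}}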
+ """ + output = "" + output_json = {} + cuda_func_call = {} + for i in range(len(self._cuda_func_order)): + cuda_func_name = self._cuda_func_order[i] + + func_name = re.sub(r"_kernel_?\d*", "", cuda_func_name, count=1) + if cuda_func_name not in output_json.keys(): + output_json[cuda_func_name] = {} + cuda_func_call[cuda_func_name] = 0 + multi_cuda_func_name = cuda_func_name + else: + cuda_func_call[cuda_func_name] += 1 + func_name = func_name + "_" + str(cuda_func_call[cuda_func_name]) + multi_cuda_func_name = cuda_func_name + "_" + str(cuda_func_call[cuda_func_name]) + output_json[multi_cuda_func_name] = {} + + output_json[multi_cuda_func_name]["grid_dim"] = self._gpu_thread_config[cuda_func_name][ + 0 + ].strip("grid=") + output_json[multi_cuda_func_name]["block_dim"] = self._gpu_thread_config[ + cuda_func_name + ][1].strip("block=") + output += cuda_func_name + "\n" + str(self._gpu_thread_config[cuda_func_name]) + "\n" + kernel_param_order = self._cuda_kernel_order[cuda_func_name] + tvm_param_order = self._tvm_executor_order[func_name] + + enqueue_params = "" + for j in range(len(kernel_param_order)): + if kernel_param_order[j].isdigit(): + # enqueue_params += self._input_dict[str(tvm_param_order[int(kernel_param_order[j])])] + output += self._input_dict[str(tvm_param_order[int(kernel_param_order[j])])] + eid = tvm_param_order[int(kernel_param_order[j])] + enqueue_params += ( + "(" + + tvm_to_c_type_mapping[self._data_type[int(eid)]] + + "*)" + + self._input_dict[str(eid)] + ) + else: + if kernel_param_order[j] in self._input_dict.keys(): + enqueue_params += self._input_dict[kernel_param_order[j]] + if j == len(kernel_param_order) - 1: + output += "\n" + else: + output += ", " + enqueue_params += ", " + output_json[multi_cuda_func_name]["enqueue_params"] = enqueue_params + self._plugin_config = output_json + + @property + def host_func_order(self): + return self._tvm_func_order + + @property + def kernel_order(self): + return self._cuda_func_order + + @property + def plugin_config(self): + return self._plugin_config + + @property + def workspace_size(self): + return self._workspace_size + + @property + def output_num(self): + return self._nums_output + + @property + def output_type(self): + return self._output_type + + @property + def output_shape(self): + return self._onnx_output_shape + + @property + def input_shape(self): + return self._onnx_input_shape + + @property + def onnx_weight_input_index(self): + return self._onnx_weight_input_index + + @property + def onnx_tensor_input_index(self): + return self._onnx_tensor_input_index + + @property + def tensor_type(self): + return self._onnx_tensor_type + + @property + def workspace_init(self): + return self._tvm_workspace_constant + + @property + def cuda_source_code(self): + return self._cuda_source_code + + @property + def plugin_name(self): + return self._kernel.plugin_name + + @property + def onnx_op_type(self): + return self._kernel.onnx_op_type + + @property + def storage_id(self): + return self._storage_id + + @property + def onnx_input_python_type(self): + return self._onnx_input_python_type + + @property + def onnx_output_python_type(self): + return self._onnx_output_python_type + + @property + def input_workspace_size(self): + return self._input_workspace_size + + @property + def output_workspace_size(self): + return self._output_workspace_size + + @property + def total_workspace_size(self): + allocate_size = 0 + for size in self._allocate_size: + allocate_size += int(size) + return allocate_size diff --git 
a/python/tvm/tpat/cuda/type_mapping.py b/python/tvm/tpat/cuda/type_mapping.py new file mode 100644 index 000000000000..d47b46c12860 --- /dev/null +++ b/python/tvm/tpat/cuda/type_mapping.py @@ -0,0 +1,59 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# type mapping : tvm -> c +tvm_to_c_type_mapping = { + "int16": "int", + "int32": "int", + "int64": "int", + "float32": "float", + "uint64": "int", + "uint8": "int8", + "uint1": "int", + "uint32": "int", + "float64": "float", + "bool": "int", +} + +# type mapping : python -> trt +python_to_trt_type_mapping = { + "bool": "INT32", + "int32": "INT32", + "int64": "INT32", + "float32": "FLOAT", + "uint64": "INT32", + "uint8": "INT8", + "uint1": "INT32", + "float64": "FLOAT", +} + +# type size : trt workspace +plugin_type_size = { + "int16": 4, + "int32": 4, + "float32": 4, + "int64": 4, + "uint32": 4, + "uint64": 4, + "uint8": 1, + "uint1": 1, + "float64": 4, +} + +# onnx type +onnx_type_mapping = {"int64": 7, "bool": 9, "uint32": 12, "uint64": 13} +# "int32": 6 \ No newline at end of file diff --git a/tests/python/tpat/cuda/__init__.py b/tests/python/tpat/cuda/__init__.py new file mode 100644 index 000000000000..13a83393a912 --- /dev/null +++ b/tests/python/tpat/cuda/__init__.py @@ -0,0 +1,16 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. diff --git a/tests/python/tpat/cuda/common.py b/tests/python/tpat/cuda/common.py new file mode 100644 index 000000000000..250535015d1f --- /dev/null +++ b/tests/python/tpat/cuda/common.py @@ -0,0 +1,3455 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import ctypes +import os +import sys + +import numpy as np +import onnx +import pycuda.autoinit +import pycuda.driver as cuda +import pytest +import tensorflow as tf +import tensorflow.compat.v1 as tf +import tensorrt as trt +from onnx import TensorProto, helper, mapping, numpy_helper +from onnx.backend.test.case.node import _extract_value_info + +from tvm import tpat + +from .trt import allocate_buffers, build_engine, do_inference, load_plugin + +tf.disable_v2_behavior() + +I_GPU = 0 +os.environ["CUDA_VISIBLE_DEVICES"] = str(I_GPU) +np.random.seed(0) +ITERATIONS = 10 +INPUT_MODEL_FILE = "test_op_plugin.onnx" +OUTPUT_MODEL_FILE = "test_op_trt.onnx" + +TRT_LOGGER = trt.Logger(trt.Logger.VERBOSE) +BATCH_SIZE = 1 + + +# Simple helper data class that's a little nicer to use than a 2-tuple. + + +def convert_to_list(x): + if not isinstance(x, list): + x = [x] + return x + + +def run_tf_graph(sess, input_data, input_node, output_node): + """Generic function to execute tensorflow""" + input_data = convert_to_list(input_data) + input_node = convert_to_list(input_node) + output_node = convert_to_list(output_node) + + tensor = [sess.graph.get_tensor_by_name(output_name) for output_name in output_node] + + input_dict = {e: input_data[i] for i, e in enumerate(input_node)} + # if len(input_node) == 1 and input_node[0] == "": + # output_data = sess.run(tensor) + # else: + output_data = sess.run(tensor, input_dict) + return output_data + + +def verify_tf_with_trt_result(in_data, in_name, out_name, op_name): + def name_without_num(name): + return name.split(":")[0] if ":" in name else name + + out_name = convert_to_list(out_name) + out_node = [name_without_num(name) for name in out_name] + in_data = convert_to_list(in_data) + in_name = convert_to_list(in_name) + with tf.Session() as sess: + sess.run(tf.global_variables_initializer()) + tf_result = run_tf_graph(sess, in_data, in_name, out_name) + frozen_graph = tf.graph_util.convert_variables_to_constants(sess, sess.graph_def, out_node) + with open("./test_op_{}.pb".format(op_name), "wb") as ofile: + ofile.write(frozen_graph.SerializeToString()) + os.system( + "python3 -m tf2onnx.convert --input ./test_op_{}.pb --inputs {} --outputs {} --output {} --opset 11".format( + op_name, str(",").join(in_name), str(",").join(out_name), INPUT_MODEL_FILE + ) + ) + ops_name = [op_name] + + _, trt_plugin_names = tpat.cuda.pipeline( + INPUT_MODEL_FILE, ops_name, False, "./log_db", OUTPUT_MODEL_FILE + ) + + load_plugin(trt_plugin_names) + engine = build_engine(OUTPUT_MODEL_FILE, trt_engine_datatype=trt.DataType.HALF) + + inputs, outputs, bindings, stream = allocate_buffers(engine) + with engine.create_execution_context() as context: + for i in range(len(inputs)): + input_data = in_data[i].ravel() + np.copyto(inputs[i].host, input_data) + + trt_result = do_inference( + context, + bindings=bindings, + inputs=inputs, + outputs=outputs, + stream=stream, + ) + + ret = True + if len(trt_result) == 1: + ret = compare_tf_trt_result(tf_result, trt_result) + else: + for i in range(len(trt_result)): + ret &= compare_tf_trt_result(tf_result[i], trt_result[i]) + assert ret, 
"result check False" + return ret + + +def compare_tf_trt_result(tf_result, trt_result): + print(tf_result) + print("================") + print(trt_result) + tf_reshape = np.array(tf_result).reshape(-1) + trt_reshape = np.array(trt_result).reshape(-1) + + if ( + isinstance(tf_result, list) + and isinstance(trt_result, list) + and len(tf_result) > 0 + and len(trt_result) > 0 + and np.isnan(tf_result[0]).any() + and np.isnan(trt_result[0]).any() + ): + return True + elif ( + isinstance(tf_result, list) + and isinstance(trt_result, list) + and len(tf_result) > 0 + and len(trt_result) > 0 + and np.isinf(tf_result[0]).any() + and np.isinf(trt_result[0]).any() + ): + return True + elif np.isnan(tf_reshape).any() and np.isnan(trt_reshape).any(): + return True + print( + "trt cross_check output ", + str(np.allclose(tf_reshape.flatten(), trt_reshape.flatten(), atol=1e-5)), + flush=True, + ) + return bool(np.allclose(tf_reshape.flatten(), trt_reshape.flatten(), atol=1e-5)) + + +def get_onnxruntime_output(model, inputs): + import onnxruntime.backend + + rep = onnxruntime.backend.prepare(model, "GPU") + if isinstance(inputs, list) and len(inputs) == 1: + inp = inputs[0] + else: + inp = inputs + output = rep.run(inp) + # Unpack output if there's only a single value. + if len(output) == 1: + output = output[0] + return output + + +def verify_with_ort_with_trt( + model, + inputs, + op_name, + opset=None, + dtype="float32", + opt_level=1, + np_result=None, + use_vm=False, + layout=0, +): + if opset is not None: + model.opset_import[0].version = opset + onnx.save(model, INPUT_MODEL_FILE) + if np_result is None: + ort_result = get_onnxruntime_output(model, inputs) + else: + ort_result = np_result + + in_data = convert_to_list(inputs) + ops_name = [op_name] + + _, trt_plugin_names = tpat.cuda.pipeline( + INPUT_MODEL_FILE, ops_name, False, "./log_db", OUTPUT_MODEL_FILE + ) + + load_plugin(trt_plugin_names) + engine = build_engine(OUTPUT_MODEL_FILE, trt_engine_datatype=trt.DataType.HALF) + + inputs, outputs, bindings, stream = allocate_buffers(engine) + with engine.create_execution_context() as context: + for i in range(len(inputs)): + input_data = in_data[i].ravel() + np.copyto(inputs[i].host, input_data) + + trt_result = do_inference( + context, + bindings=bindings, + inputs=inputs, + outputs=outputs, + stream=stream, + ) + + ret = True + if len(trt_result) == 1: + ret = compare_tf_trt_result(ort_result, trt_result) + else: + # ret &= compare_tf_trt_result(ort_result[0], trt_result[0]) + for i in range(len(trt_result)): + ret &= compare_tf_trt_result(ort_result[i], trt_result[i]) + assert ret, "result check False" + return ret + + +def make_constant_node(name, data_type, dims, vals): + return helper.make_node( + "Constant", + inputs=[], + outputs=[name], + value=helper.make_tensor(name=name, data_type=data_type, dims=dims, vals=vals), + ) + + +def make_onnx_model(node, inputs, outputs, name, **kwargs): + present_inputs = [x for x in node.input if (x != "")] + present_outputs = [x for x in node.output if (x != "")] + input_type_protos = [None] * len(inputs) + if "input_type_protos" in kwargs: + input_type_protos = kwargs[str("input_type_protos")] + del kwargs[str("input_type_protos")] + output_type_protos = [None] * len(outputs) + if "output_type_protos" in kwargs: + output_type_protos = kwargs[str("output_type_protos")] + del kwargs[str("output_type_protos")] + inputs_vi = [ + _extract_value_info(arr, arr_name, input_type) + for arr, arr_name, input_type in zip(inputs, present_inputs, input_type_protos) + ] 
+ outputs_vi = [ + _extract_value_info(arr, arr_name, output_type) + for arr, arr_name, output_type in zip(outputs, present_outputs, output_type_protos) + ] + graph = helper.make_graph(nodes=[node], name=name, inputs=inputs_vi, outputs=outputs_vi) + kwargs[str("producer_name")] = "TRTPluginAutoGen-test" + model = onnx.helper.make_model(graph, **kwargs) + return model + + +def op_expect(node, inputs, outputs, op_type, op_name, np_result=None): + model = make_onnx_model(node, inputs=inputs, outputs=outputs, name="test_{}".format(op_type)) + verify_with_ort_with_trt(model, inputs, op_name, np_result=np_result) + + +# ==================================================================================== +# ---UnitTest +# ==================================================================================== + + +def test_abs(): + op_name = "abs_0" + op_type = "Abs" + x = np.random.randn(3, 4, 5).astype(np.float32) + y = abs(x) + node = helper.make_node(op_type, inputs=["x"], outputs=["y"], name=op_name) + op_expect(node, inputs=[x], outputs=[y], op_type=op_type, op_name=op_name) + + +def test_acos(): + op_name = "acos_0" + op_type = "Acos" + node = onnx.helper.make_node("Acos", inputs=["x"], outputs=["y"], name=op_name) + x = np.array([-0.5, 0, 0.5]).astype(np.float32) + y = np.arccos(x) + op_expect(node, inputs=[x], outputs=[y], op_type=op_type, op_name=op_name) + + op_name = "acos_1" + op_type = "Acos" + node = onnx.helper.make_node("Acos", inputs=["x"], outputs=["y"], name=op_name) + x = np.random.rand(3, 4, 5).astype(np.float32) + y = np.arccos(x) + op_expect(node, inputs=[x], outputs=[y], op_type=op_type, op_name=op_name) + + +@pytest.mark.skip(reason="TensorRT segmentfault") +def test_and(): + op_name = "and_0" + op_type = "And" + node = onnx.helper.make_node("And", inputs=["x", "y"], outputs=["and"], name=op_name) + # 2d + x = (np.random.randn(3, 4) > 0).astype(bool) + y = (np.random.randn(3, 4) > 0).astype(bool) + z = np.logical_and(x, y) + op_expect(node, inputs=[x, y], outputs=[z], op_type=op_type, op_name=op_name) + + op_name = "and_1" + op_type = "And" + node = onnx.helper.make_node("And", inputs=["x", "y"], outputs=["and"], name=op_name) + x = (np.random.randn(3, 4, 5) > 0).astype(bool) + y = (np.random.randn(3, 4, 5) > 0).astype(bool) + z = np.logical_and(x, y) + op_expect(node, inputs=[x, y], outputs=[z], op_type=op_type, op_name=op_name) + + op_name = "and_2" + op_type = "And" + node = onnx.helper.make_node("And", inputs=["x", "y"], outputs=["and"], name=op_name) + x = (np.random.randn(3, 4, 5, 6) > 0).astype(bool) + y = (np.random.randn(3, 4, 5, 6) > 0).astype(bool) + z = np.logical_and(x, y) + op_expect(node, inputs=[x, y], outputs=[z], op_type=op_type, op_name=op_name) + + +def test_add(): + op_name = "add_0" + op_type = "Add" + node = onnx.helper.make_node("Add", inputs=["x", "y"], outputs=["sum"], name=op_name) + + x = np.random.randn(3, 4, 5).astype(np.float32) + y = np.random.randn(3, 4, 5).astype(np.float32) + op_expect(node, inputs=[x, y], outputs=[x + y], op_type=op_type, op_name=op_name) + + op_name = "add_1" + op_type = "Add" + node = onnx.helper.make_node("Add", inputs=["x", "y"], outputs=["sum"], name=op_name) + + x = np.random.randn(3, 4, 5).astype(np.float32) + y = np.random.randn(5).astype(np.float32) + op_expect(node, inputs=[x, y], outputs=[x + y], op_type=op_type, op_name=op_name) + + +def test_argmax(): + op_type = "ArgMax" + op_name = "argmax_0" + data = np.array([[2, 1, 3, 10], [3, 4, 5, 6]], dtype=np.float32) + keepdims = 1 + axis = -1 + node = 
onnx.helper.make_node( + "ArgMax", + inputs=["data"], + outputs=["result"], + keepdims=keepdims, + axis=axis, + name=op_name, + ) + + # result: [[1], [1]] + from onnx.backend.test.case.node.argmax import argmax_use_numpy + + result = argmax_use_numpy(data, keepdims=keepdims, axis=axis) + op_expect(node, inputs=[data], outputs=[result], op_type=op_type, op_name=op_name) + + op_name = "argmax_1" + node = onnx.helper.make_node( + "ArgMax", + inputs=["data"], + outputs=["result"], + keepdims=keepdims, + axis=axis, + name=op_name, + ) + + data = np.random.uniform(-10, 10, [2, 3, 4]).astype(np.float32) + # result's shape: [1, 3, 4] + result = argmax_use_numpy(data, keepdims=keepdims, axis=axis) + op_expect(node, inputs=[data], outputs=[result], op_type=op_type, op_name=op_name) + + +def test_argmin(): + op_type = "ArgMin" + op_name = "argmin_0" + data = np.array([[2, 1], [3, 10]], dtype=np.float32) + keepdims = 1 + axis = 1 + node = onnx.helper.make_node( + "ArgMin", + inputs=["data"], + outputs=["result"], + keepdims=keepdims, + axis=axis, + name=op_name, + ) + + # result: [[1], [1]] + from onnx.backend.test.case.node.argmin import argmin_use_numpy + + result = argmin_use_numpy(data, keepdims=keepdims, axis=axis) + op_expect(node, inputs=[data], outputs=[result], op_type=op_type, op_name=op_name) + + +def test_asin(): + op_name = "asin_0" + op_type = "Asin" + node = onnx.helper.make_node("Asin", inputs=["x"], outputs=["y"], name=op_name) + + x = np.array([-0.5, 0, 0.5]).astype(np.float32) + y = np.arcsin(x) + op_expect(node, inputs=[x], outputs=[y], op_type=op_type, op_name=op_name) + + op_name = "asin_1" + op_type = "Asin" + node = onnx.helper.make_node("Asin", inputs=["x"], outputs=["y"], name=op_name) + + x = np.random.rand(3, 4, 5).astype(np.float32) + y = np.arcsin(x) + op_expect(node, inputs=[x], outputs=[y], op_type=op_type, op_name=op_name) + + +def test_asinh(): + op_name = "asinh_0" + op_type = "Asinh" + node = onnx.helper.make_node("Asinh", inputs=["x"], outputs=["y"], name=op_name) + + x = np.array([-1, 0, 1]).astype(np.float32) + y = np.arcsinh(x) # expected output [-0.88137358, 0., 0.88137358] + op_expect(node, inputs=[x], outputs=[y], op_type=op_type, op_name=op_name) + + op_name = "asinh_1" + op_type = "Asinh" + node = onnx.helper.make_node("Asinh", inputs=["x"], outputs=["y"], name=op_name) + x = np.random.randn(3, 4, 5).astype(np.float32) + y = np.arcsinh(x) + op_expect(node, inputs=[x], outputs=[y], op_type=op_type, op_name=op_name) + + +def test_atan(): + op_type = "Atan" + op_name = "atan_0" + node = onnx.helper.make_node("Atan", inputs=["x"], outputs=["y"], name=op_name) + + x = np.array([-1, 0, 1]).astype(np.float32) + y = np.arctan(x) + op_expect(node, inputs=[x], outputs=[y], op_type=op_type, op_name=op_name) + + op_type = "Atan" + op_name = "atan_1" + node = onnx.helper.make_node("Atan", inputs=["x"], outputs=["y"], name=op_name) + x = np.random.randn(3, 4, 5).astype(np.float32) + y = np.arctan(x) + op_expect(node, inputs=[x], outputs=[y], op_type=op_type, op_name=op_name) + + +def test_atanh(): + op_name = "atanh_0" + op_type = "Atanh" + node = onnx.helper.make_node("Atanh", inputs=["x"], outputs=["y"], name=op_name) + + x = np.array([-0.5, 0, 0.5]).astype(np.float32) + y = np.arctanh(x) # expected output [-0.54930615, 0., 0.54930615] + op_expect(node, inputs=[x], outputs=[y], op_type=op_type, op_name=op_name) + + op_name = "atanh_1" + op_type = "Atanh" + node = onnx.helper.make_node("Atanh", inputs=["x"], outputs=["y"], name=op_name) + x = np.random.uniform(0.0, 1.0, 
(3, 4, 5)).astype(np.float32) + y = np.arctanh(x) + op_expect(node, inputs=[x], outputs=[y], op_type=op_type, op_name=op_name) + + +def test_averagepool(): + op_name = "averagepool_1d_default" + op_type = "AveragePool" + """ + input_shape: [1, 3, 32] + output_shape: [1, 3, 31] + """ + node = onnx.helper.make_node( + "AveragePool", inputs=["x"], outputs=["y"], kernel_shape=[2], name=op_name + ) + x = np.random.randn(1, 3, 32).astype(np.float32) + x_shape = np.shape(x) + kernel_shape = [2] + strides = [1] + from onnx.backend.test.case.node.pool_op_common import get_output_shape, pool + + out_shape = get_output_shape("VALID", x_shape[2:], kernel_shape, strides) + padded = x + y = pool(padded, x_shape, kernel_shape, strides, out_shape, [0], "AVG") + op_expect(node, inputs=[x], outputs=[y], op_type=op_type, op_name=op_name) + + op_name = "averagepool_2d_ceil" + op_type = "AveragePool" + node = onnx.helper.make_node( + "AveragePool", + inputs=["x"], + outputs=["y"], + kernel_shape=[3, 3], + strides=[2, 2], + ceil_mode=True, + name=op_name, + ) + x = np.array( + [ + [ + [ + [1, 2, 3, 4], + [5, 6, 7, 8], + [9, 10, 11, 12], + [13, 14, 15, 16], + ] + ] + ] + ).astype(np.float32) + y = np.array([[[[6, 7.5], [12, 13.5]]]]).astype(np.float32) + + op_expect(node, inputs=[x], outputs=[y], op_type=op_type, op_name=op_name) + + +@pytest.mark.skip(reason="TensorRT segmentfault") +def test_batchnormalization(): + op_name = "batchnormalization_0" + op_type = "BatchNormalization" + # input size: (2, 3, 4, 5) + x = np.random.randn(2, 3, 4, 5).astype(np.float32) + s = np.random.randn(3).astype(np.float32) + bias = np.random.randn(3).astype(np.float32) + mean = np.random.randn(3).astype(np.float32) + var = np.random.rand(3).astype(np.float32) + from onnx.backend.test.case.node.batchnorm import _batchnorm_test_mode + + y = _batchnorm_test_mode(x, s, bias, mean, var).astype(np.float32) + + node = onnx.helper.make_node( + "BatchNormalization", + inputs=["x", "s", "bias", "mean", "var"], + outputs=["y"], + name=op_name, + ) + + # output size: (2, 3, 4, 5) + op_expect( + node, + inputs=[x, s, bias, mean, var], + outputs=[y], + op_type=op_type, + op_name=op_name, + ) + + +def test_ceil(): + op_name = "ceil_0" + op_type = "Ceil" + node = onnx.helper.make_node("Ceil", inputs=["x"], outputs=["y"], name=op_name) + + x = np.array([-1.5, 1.2]).astype(np.float32) + y = np.ceil(x) # expected output [-1., 2.] 
+ op_expect(node, inputs=[x], outputs=[y], op_type=op_type, op_name=op_name) + + op_name = "ceil_1" + op_type = "Ceil" + node = onnx.helper.make_node("Ceil", inputs=["x"], outputs=["y"], name=op_name) + + x = np.random.randn(3, 4, 5).astype(np.float32) + y = np.ceil(x) + op_expect(node, inputs=[x], outputs=[y], op_type=op_type, op_name=op_name) + + +def test_celu(): + op_name = "celu_0" + op_type = "Celu" + alpha = 2.0 + node = onnx.helper.make_node("Celu", inputs=["X"], outputs=["Y"], alpha=alpha, name=op_name) + + input_data = np.array( + [ + [ + [[0.8439683], [0.5665144], [0.05836735]], + [[0.02916367], [0.12964272], [0.5060197]], + [[0.79538304], [0.9411346], [0.9546573]], + ], + [ + [[0.17730942], [0.46192095], [0.26480448]], + [[0.6746842], [0.01665257], [0.62473077]], + [[0.9240844], [0.9722341], [0.11965699]], + ], + [ + [[0.41356155], [0.9129373], [0.59330076]], + [[0.81929934], [0.7862604], [0.11799799]], + [[0.69248444], [0.54119414], [0.07513223]], + ], + ], + dtype=np.float32, + ) + + # Calculate expected output data + positive_input = np.maximum(0, input_data) + negative_input = np.minimum(0, alpha * (np.exp(input_data / alpha) - 1)) + expected_output = positive_input + negative_input + + op_expect( + node, + inputs=[input_data], + outputs=[expected_output], + op_type=op_type, + op_name=op_name, + ) + + +def test_clip(): + op_name = "Clip_0" + op_type = "Clip" + node = onnx.helper.make_node("Clip", inputs=["x", "min", "max"], outputs=["y"], name=op_name) + x = np.array([-2, 0, 2]).astype(np.float32) + min_val = np.array([-1.0]).astype(np.float32) # .float32(-1.0) + max_val = np.array([1.0]).astype(np.float32) # .float32(1.0) + y = np.clip(x, min_val, max_val) # expected output [-1., 0., 1.] + op_expect( + node, + inputs=[x, min_val, max_val], + outputs=[y], + op_type=op_type, + op_name=op_name, + ) + + op_name = "Clip_1" + op_type = "Clip" + node = onnx.helper.make_node("Clip", inputs=["x", "min", "max"], outputs=["y"], name=op_name) + + x = np.random.randn(3, 4, 5).astype(np.float32) + y = np.clip(x, min_val, max_val) + op_expect( + node, + inputs=[x, min_val, max_val], + outputs=[y], + op_type=op_type, + op_name=op_name, + ) + + op_name = "Clip_2" + op_type = "Clip" + node = onnx.helper.make_node("Clip", inputs=["x", "min", "max"], outputs=["y"], name=op_name) + min_val = np.array([-5.0]).astype(np.float32) # .float32(-1.0) + max_val = np.array([5.0]).astype(np.float32) # .float32(1.0) + op_name = "Clip_3" + op_type = "Clip" + node = onnx.helper.make_node("Clip", inputs=["x", "min", "max"], outputs=["y"], name=op_name) + + x = np.array([-1, 0, 1]).astype(np.float32) + y = np.array([-1, 0, 1]).astype(np.float32) + op_expect( + node, + inputs=[x, min_val, max_val], + outputs=[y], + op_type=op_type, + op_name=op_name, + ) + + op_name = "Clip_4" + op_type = "Clip" + node = onnx.helper.make_node("Clip", inputs=["x", "min", "max"], outputs=["y"], name=op_name) + x = np.array([-6, 0, 6]).astype(np.float32) + y = np.array([-5, 0, 5]).astype(np.float32) + op_expect( + node, + inputs=[x, min_val, max_val], + outputs=[y], + op_type=op_type, + op_name=op_name, + ) + + op_name = "Clip_5" + op_type = "Clip" + node = onnx.helper.make_node("Clip", inputs=["x", "min", "max"], outputs=["y"], name=op_name) + x = np.array([-1, 0, 6]).astype(np.float32) + y = np.array([-1, 0, 5]).astype(np.float32) + op_expect( + node, + inputs=[x, min_val, max_val], + outputs=[y], + op_type=op_type, + op_name=op_name, + ) + + +def test_concat(): + test_cases = { + "1d": ([1, 2], [3, 4]), + "2d": ([[1, 2], [3, 
4]], [[5, 6], [7, 8]]), + "3d": ( + [[[1, 2], [3, 4]], [[5, 6], [7, 8]]], + [[[9, 10], [11, 12]], [[13, 14], [15, 16]]], + ), + } # type: Dict[Text, Sequence[Any]] + + for test_case, values_ in test_cases.items(): + values = [np.asarray(v, dtype=np.float32) for v in values_] + for i in range(len(values[0].shape)): + op_name = "concat_{}_{}".format(test_case, i) + op_type = "Concat" + in_args = ["value" + str(k) for k in range(len(values))] + node = onnx.helper.make_node( + "Concat", + inputs=[s for s in in_args], + outputs=["output"], + axis=i, + name=op_name, + ) + output = np.concatenate(values, i) + op_expect( + node, + inputs=[v for v in values], + outputs=[output], + op_type=op_type, + op_name=op_name, + ) + + for i in range(-len(values[0].shape), 0): + op_name = "concat_{}_1_{}".format(test_case, abs(i)) + op_type = "Concat" + in_args = ["value" + str(k) for k in range(len(values))] + node = onnx.helper.make_node( + "Concat", + inputs=[s for s in in_args], + outputs=["output"], + axis=i, + name=op_name, + ) + output = np.concatenate(values, i) + op_expect( + node, + inputs=[v for v in values], + outputs=[output], + op_type=op_type, + op_name=op_name, + ) + + +def test_conv(): + # ------Conv + op_name, op_type = "test_basic_conv_with_padding", "Conv" + x = np.array( + [ + [ + [ + [0.0, 1.0, 2.0, 3.0, 4.0], # (1, 1, 5, 5) input tensor + [5.0, 6.0, 7.0, 8.0, 9.0], + [10.0, 11.0, 12.0, 13.0, 14.0], + [15.0, 16.0, 17.0, 18.0, 19.0], + [20.0, 21.0, 22.0, 23.0, 24.0], + ] + ] + ] + ).astype(np.float32) + # NOCC:invalid-name(å…¶ä»–:onnx example) + W = np.array( + [ + [ + [ + [1.0, 1.0, 1.0], # (1, 1, 3, 3) tensor for convolution weights + [1.0, 1.0, 1.0], + [1.0, 1.0, 1.0], + ] + ] + ] + ).astype(np.float32) + + # Convolution with padding + node_with_padding = onnx.helper.make_node( + "Conv", + inputs=["x", "W"], + outputs=["y"], + kernel_shape=[3, 3], + # Default values for other attributes: strides=[1, 1], dilations=[1, 1], groups=1 + pads=[1, 1, 1, 1], + name=op_name, + ) + y_with_padding = np.array( + [ + [ + [ + [12.0, 21.0, 27.0, 33.0, 24.0], # (1, 1, 5, 5) output tensor + [33.0, 54.0, 63.0, 72.0, 51.0], + [63.0, 99.0, 108.0, 117.0, 81.0], + [93.0, 144.0, 153.0, 162.0, 111.0], + [72.0, 111.0, 117.0, 123.0, 84.0], + ] + ] + ] + ).astype(np.float32) + op_expect( + node_with_padding, + inputs=[x, W], + outputs=[y_with_padding], + op_type=op_type, + op_name=op_name, + ) + + op_name, op_type = "test_basic_conv_without_padding", "Conv" + # Convolution without padding + node_without_padding = onnx.helper.make_node( + "Conv", + inputs=["x", "W"], + outputs=["y"], + kernel_shape=[3, 3], + # Default values for other attributes: strides=[1, 1], dilations=[1, 1], groups=1 + pads=[0, 0, 0, 0], + name=op_name, + ) + y_without_padding = np.array( + [ + [ + [ + [54.0, 63.0, 72.0], # (1, 1, 3, 3) output tensor + [99.0, 108.0, 117.0], + [144.0, 153.0, 162.0], + ] + ] + ] + ).astype(np.float32) + op_expect( + node_without_padding, + inputs=[x, W], + outputs=[y_without_padding], + op_type=op_type, + op_name=op_name, + ) + + # conv_with_autopad_same + op_name, op_type = "test_conv_with_autopad_same", "Conv" + x = np.array( + [ + [ + [ + [0.0, 1.0, 2.0, 3.0, 4.0], # (1, 1, 5, 5) input tensor + [5.0, 6.0, 7.0, 8.0, 9.0], + [10.0, 11.0, 12.0, 13.0, 14.0], + [15.0, 16.0, 17.0, 18.0, 19.0], + [20.0, 21.0, 22.0, 23.0, 24.0], + ] + ] + ] + ).astype(np.float32) + # NOCC:invalid-name(å…¶ä»–:onnx example) + W = np.array( + [ + [ + [ + [1.0, 1.0, 1.0], # (1, 1, 3, 3) tensor for convolution weights + [1.0, 1.0, 1.0], 
+ [1.0, 1.0, 1.0], + ] + ] + ] + ).astype(np.float32) + + # Convolution with auto_pad='SAME_LOWER' and strides=2 + node = onnx.helper.make_node( + "Conv", + inputs=["x", "W"], + outputs=["y"], + auto_pad="SAME_LOWER", + kernel_shape=[3, 3], + strides=[2, 2], + name=op_name, + ) + y = np.array([[[[12.0, 27.0, 24.0], [63.0, 108.0, 81.0], [72.0, 117.0, 84.0]]]]).astype( + np.float32 + ) + op_expect(node, inputs=[x, W], outputs=[y], op_type=op_type, op_name=op_name) + + # conv_with_strides + op_name, op_type = "test_conv_with_strides_padding", "Conv" + x = np.array( + [ + [ + [ + [0.0, 1.0, 2.0, 3.0, 4.0], # (1, 1, 7, 5) input tensor + [5.0, 6.0, 7.0, 8.0, 9.0], + [10.0, 11.0, 12.0, 13.0, 14.0], + [15.0, 16.0, 17.0, 18.0, 19.0], + [20.0, 21.0, 22.0, 23.0, 24.0], + [25.0, 26.0, 27.0, 28.0, 29.0], + [30.0, 31.0, 32.0, 33.0, 34.0], + ] + ] + ] + ).astype(np.float32) + # NOCC:invalid-name(å…¶ä»–:onnx example) + W = np.array( + [ + [ + [ + [1.0, 1.0, 1.0], # (1, 1, 3, 3) tensor for convolution weights + [1.0, 1.0, 1.0], + [1.0, 1.0, 1.0], + ] + ] + ] + ).astype(np.float32) + + # Convolution with strides=2 and padding + node_with_padding = onnx.helper.make_node( + "Conv", + inputs=["x", "W"], + outputs=["y"], + kernel_shape=[3, 3], + pads=[1, 1, 1, 1], + strides=[ + 2, + 2, + ], # Default values for other attributes: dilations=[1, 1], groups=1 + name=op_name, + ) + y_with_padding = np.array( + [ + [ + [ + [12.0, 27.0, 24.0], # (1, 1, 4, 3) output tensor + [63.0, 108.0, 81.0], + [123.0, 198.0, 141.0], + [112.0, 177.0, 124.0], + ] + ] + ] + ).astype(np.float32) + op_expect( + node_with_padding, + inputs=[x, W], + outputs=[y_with_padding], + op_type=op_type, + op_name=op_name, + ) + + op_name = "test_conv_with_strides_no_padding" + # Convolution with strides=2 and no padding + node_without_padding = onnx.helper.make_node( + "Conv", + inputs=["x", "W"], + outputs=["y"], + kernel_shape=[3, 3], + pads=[0, 0, 0, 0], + strides=[ + 2, + 2, + ], # Default values for other attributes: dilations=[1, 1], groups=1 + name=op_name, + ) + y_without_padding = np.array( + [[[[54.0, 72.0], [144.0, 162.0], [234.0, 252.0]]]] # (1, 1, 3, 2) output tensor + ).astype(np.float32) + op_expect( + node_without_padding, + inputs=[x, W], + outputs=[y_without_padding], + op_type=op_type, + op_name=op_name, + ) + + op_name = "test_conv_with_strides_and_asymmetric_padding" + # Convolution with strides=2 and padding only along one dimension (the H dimension in NxCxHxW tensor) + node_with_asymmetric_padding = onnx.helper.make_node( + "Conv", + inputs=["x", "W"], + outputs=["y"], + kernel_shape=[3, 3], + pads=[1, 0, 1, 0], + strides=[ + 2, + 2, + ], # Default values for other attributes: dilations=[1, 1], groups=1 + name=op_name, + ) + y_with_asymmetric_padding = np.array( + [ + [ + [ + [21.0, 33.0], # (1, 1, 4, 2) output tensor + [99.0, 117.0], + [189.0, 207.0], + [171.0, 183.0], + ] + ] + ] + ).astype(np.float32) + op_expect( + node_with_asymmetric_padding, + inputs=[x, W], + outputs=[y_with_asymmetric_padding], + op_type=op_type, + op_name=op_name, + ) + + +def test_convtranspose(): + op_name, op_type = "test_convtranspose", "ConvTranspose" + x = np.array([[[[0.0, 1.0, 2.0], [3.0, 4.0, 5.0], [6.0, 7.0, 8.0]]]]).astype( # (1, 1, 3, 3) + np.float32 + ) + + # NOCC:invalid-name(å…¶ä»–:onnx example) + W = np.array( + [ + [ + [[1.0, 1.0, 1.0], [1.0, 1.0, 1.0], [1.0, 1.0, 1.0]], # (1, 2, 3, 3) + [[1.0, 1.0, 1.0], [1.0, 1.0, 1.0], [1.0, 1.0, 1.0]], + ] + ] + ).astype(np.float32) + + node = onnx.helper.make_node("ConvTranspose", ["X", "W"], 
["Y"], name=op_name) + + y = np.array( + [ + [ + [ + [0.0, 1.0, 3.0, 3.0, 2.0], # (1, 2, 5, 5) + [3.0, 8.0, 15.0, 12.0, 7.0], + [9.0, 21.0, 36.0, 27.0, 15.0], + [9.0, 20.0, 33.0, 24.0, 13.0], + [6.0, 13.0, 21.0, 15.0, 8.0], + ], + [ + [0.0, 1.0, 3.0, 3.0, 2.0], + [3.0, 8.0, 15.0, 12.0, 7.0], + [9.0, 21.0, 36.0, 27.0, 15.0], + [9.0, 20.0, 33.0, 24.0, 13.0], + [6.0, 13.0, 21.0, 15.0, 8.0], + ], + ] + ] + ).astype(np.float32) + + op_expect(node, inputs=[x, W], outputs=[y], op_type=op_type, op_name=op_name) + + op_name, op_type = "test_convtranspose_1d", "ConvTranspose" + + x = np.array([[[0.0, 1.0, 2.0]]]).astype(np.float32) # (1, 1, 3) + + # NOCC:invalid-name(å…¶ä»–:onnx example) + W = np.array([[[1.0, 1.0, 1.0], [1.0, 1.0, 1.0]]]).astype(np.float32) # (1, 2, 3) + + node = onnx.helper.make_node("ConvTranspose", ["X", "W"], ["Y"], name=op_name) + + y = np.array([[[0.0, 1.0, 3.0, 3.0, 2.0], [0.0, 1.0, 3.0, 3.0, 2.0]]]).astype( # (1, 2, 5) + np.float32 + ) + + op_expect(node, inputs=[x, W], outputs=[y], op_type=op_type, op_name=op_name) + + op_name, op_type = "test_convtranspose_3d", "ConvTranspose" + x = np.array( + [ + [ + [ + [ + [0.0, 1.0, 2.0, 3.0, 4.0], # (1, 1, 3, 4, 5) + [5.0, 6.0, 7.0, 8.0, 9.0], + [10.0, 11.0, 12.0, 13.0, 14.0], + [15.0, 16.0, 17.0, 18.0, 19.0], + ], + [ + [20.0, 21.0, 22.0, 23.0, 24.0], + [25.0, 26.0, 27.0, 28.0, 29.0], + [30.0, 31.0, 32.0, 33.0, 34.0], + [35.0, 36.0, 37.0, 38.0, 39.0], + ], + [ + [40.0, 41.0, 42.0, 43.0, 44.0], + [45.0, 46.0, 47.0, 48.0, 49.0], + [50.0, 51.0, 52.0, 53.0, 54.0], + [55.0, 56.0, 57.0, 58.0, 59.0], + ], + ] + ] + ] + ).astype(np.float32) + + # NOCC:invalid-name(å…¶ä»–:onnx example) + W = np.array( + [ + [ + [ + [ + [1.0, 1.0, 1.0], # (1, 2, 3, 3, 3) + [1.0, 1.0, 1.0], + [1.0, 1.0, 1.0], + ], + [[1.0, 1.0, 1.0], [1.0, 1.0, 1.0], [1.0, 1.0, 1.0]], + [[1.0, 1.0, 1.0], [1.0, 1.0, 1.0], [1.0, 1.0, 1.0]], + ], + [ + [[1.0, 1.0, 1.0], [1.0, 1.0, 1.0], [1.0, 1.0, 1.0]], + [[1.0, 1.0, 1.0], [1.0, 1.0, 1.0], [1.0, 1.0, 1.0]], + [[1.0, 1.0, 1.0], [1.0, 1.0, 1.0], [1.0, 1.0, 1.0]], + ], + ] + ] + ).astype(np.float32) + + node = onnx.helper.make_node("ConvTranspose", ["X", "W"], ["Y"], name=op_name) + + y = np.array( + [ + [ + [ + [ + [0.0, 1.0, 3.0, 6.0, 9.0, 7.0, 4.0], # (1, 2, 5, 6, 7) + [5.0, 12.0, 21.0, 27.0, 33.0, 24.0, 13.0], + [15.0, 33.0, 54.0, 63.0, 72.0, 51.0, 27.0], + [30.0, 63.0, 99.0, 108.0, 117.0, 81.0, 42.0], + [25.0, 52.0, 81.0, 87.0, 93.0, 64.0, 33.0], + [15.0, 31.0, 48.0, 51.0, 54.0, 37.0, 19.0], + ], + [ + [20.0, 42.0, 66.0, 72.0, 78.0, 54.0, 28.0], + [50.0, 104.0, 162.0, 174.0, 186.0, 128.0, 66.0], + [90.0, 186.0, 288.0, 306.0, 324.0, 222.0, 114.0], + [120.0, 246.0, 378.0, 396.0, 414.0, 282.0, 144.0], + [90.0, 184.0, 282.0, 294.0, 306.0, 208.0, 106.0], + [50.0, 102.0, 156.0, 162.0, 168.0, 114.0, 58.0], + ], + [ + [60.0, 123.0, 189.0, 198.0, 207.0, 141.0, 72.0], + [135.0, 276.0, 423.0, 441.0, 459.0, 312.0, 159.0], + [225.0, 459.0, 702.0, 729.0, 756.0, 513.0, 261.0], + [270.0, 549.0, 837.0, 864.0, 891.0, 603.0, 306.0], + [195.0, 396.0, 603.0, 621.0, 639.0, 432.0, 219.0], + [105.0, 213.0, 324.0, 333.0, 342.0, 231.0, 117.0], + ], + [ + [60.0, 122.0, 186.0, 192.0, 198.0, 134.0, 68.0], + [130.0, 264.0, 402.0, 414.0, 426.0, 288.0, 146.0], + [210.0, 426.0, 648.0, 666.0, 684.0, 462.0, 234.0], + [240.0, 486.0, 738.0, 756.0, 774.0, 522.0, 264.0], + [170.0, 344.0, 522.0, 534.0, 546.0, 368.0, 186.0], + [90.0, 182.0, 276.0, 282.0, 288.0, 194.0, 98.0], + ], + [ + [40.0, 81.0, 123.0, 126.0, 129.0, 87.0, 44.0], + [85.0, 172.0, 261.0, 267.0, 
273.0, 184.0, 93.0], + [135.0, 273.0, 414.0, 423.0, 432.0, 291.0, 147.0], + [150.0, 303.0, 459.0, 468.0, 477.0, 321.0, 162.0], + [105.0, 212.0, 321.0, 327.0, 333.0, 224.0, 113.0], + [55.0, 111.0, 168.0, 171.0, 174.0, 117.0, 59.0], + ], + ], + [ + [ + [0.0, 1.0, 3.0, 6.0, 9.0, 7.0, 4.0], + [5.0, 12.0, 21.0, 27.0, 33.0, 24.0, 13.0], + [15.0, 33.0, 54.0, 63.0, 72.0, 51.0, 27.0], + [30.0, 63.0, 99.0, 108.0, 117.0, 81.0, 42.0], + [25.0, 52.0, 81.0, 87.0, 93.0, 64.0, 33.0], + [15.0, 31.0, 48.0, 51.0, 54.0, 37.0, 19.0], + ], + [ + [20.0, 42.0, 66.0, 72.0, 78.0, 54.0, 28.0], + [50.0, 104.0, 162.0, 174.0, 186.0, 128.0, 66.0], + [90.0, 186.0, 288.0, 306.0, 324.0, 222.0, 114.0], + [120.0, 246.0, 378.0, 396.0, 414.0, 282.0, 144.0], + [90.0, 184.0, 282.0, 294.0, 306.0, 208.0, 106.0], + [50.0, 102.0, 156.0, 162.0, 168.0, 114.0, 58.0], + ], + [ + [60.0, 123.0, 189.0, 198.0, 207.0, 141.0, 72.0], + [135.0, 276.0, 423.0, 441.0, 459.0, 312.0, 159.0], + [225.0, 459.0, 702.0, 729.0, 756.0, 513.0, 261.0], + [270.0, 549.0, 837.0, 864.0, 891.0, 603.0, 306.0], + [195.0, 396.0, 603.0, 621.0, 639.0, 432.0, 219.0], + [105.0, 213.0, 324.0, 333.0, 342.0, 231.0, 117.0], + ], + [ + [60.0, 122.0, 186.0, 192.0, 198.0, 134.0, 68.0], + [130.0, 264.0, 402.0, 414.0, 426.0, 288.0, 146.0], + [210.0, 426.0, 648.0, 666.0, 684.0, 462.0, 234.0], + [240.0, 486.0, 738.0, 756.0, 774.0, 522.0, 264.0], + [170.0, 344.0, 522.0, 534.0, 546.0, 368.0, 186.0], + [90.0, 182.0, 276.0, 282.0, 288.0, 194.0, 98.0], + ], + [ + [40.0, 81.0, 123.0, 126.0, 129.0, 87.0, 44.0], + [85.0, 172.0, 261.0, 267.0, 273.0, 184.0, 93.0], + [135.0, 273.0, 414.0, 423.0, 432.0, 291.0, 147.0], + [150.0, 303.0, 459.0, 468.0, 477.0, 321.0, 162.0], + [105.0, 212.0, 321.0, 327.0, 333.0, 224.0, 113.0], + [55.0, 111.0, 168.0, 171.0, 174.0, 117.0, 59.0], + ], + ], + ] + ] + ).astype(np.float32) + + op_expect(node, inputs=[x, W], outputs=[y], op_type=op_type, op_name=op_name) + + op_name, op_type = "test_convtranspose_pads", "ConvTranspose" + + x = np.array([[[[0.0, 1.0, 2.0], [3.0, 4.0, 5.0], [6.0, 7.0, 8.0]]]]).astype( # (1, 1, 3, 3) + np.float32 + ) + + # NOCC:invalid-name(å…¶ä»–:onnx example) + W = np.array( + [ + [ + [[1.0, 1.0, 1.0], [1.0, 1.0, 1.0], [1.0, 1.0, 1.0]], # (1, 2, 3, 3) + [[1.0, 1.0, 1.0], [1.0, 1.0, 1.0], [1.0, 1.0, 1.0]], + ] + ] + ).astype(np.float32) + + node = onnx.helper.make_node( + "ConvTranspose", + ["X", "W"], + ["Y"], + strides=[3, 2], + pads=[1, 2, 1, 2], + name=op_name, + ) + + y = np.array( + [ + [ + [ + [1.0, 1.0, 3.0], # (1, 2, 7, 3) + [1.0, 1.0, 3.0], + [7.0, 4.0, 9.0], + [7.0, 4.0, 9.0], + [7.0, 4.0, 9.0], + [13.0, 7.0, 15.0], + [13.0, 7.0, 15.0], + ], + [ + [1.0, 1.0, 3.0], + [1.0, 1.0, 3.0], + [7.0, 4.0, 9.0], + [7.0, 4.0, 9.0], + [7.0, 4.0, 9.0], + [13.0, 7.0, 15.0], + [13.0, 7.0, 15.0], + ], + ] + ] + ).astype(np.float32) + + op_expect(node, inputs=[x, W], outputs=[y], op_type=op_type, op_name=op_name) + + +def test_cos(): + op_name, op_type = "test_cos_example", "Cos" + node = onnx.helper.make_node("Cos", inputs=["x"], outputs=["y"], name=op_name) + + x = np.array([-1, 0, 1]).astype(np.float32) + y = np.cos(x) + op_expect(node, inputs=[x], outputs=[y], op_type=op_type, op_name=op_name) + + op_name, op_type = "test_cos", "Cos" + node = onnx.helper.make_node("Cos", inputs=["x"], outputs=["y"], name=op_name) + x = np.random.randn(3, 4, 5).astype(np.float32) + y = np.cos(x) + op_expect(node, inputs=[x], outputs=[y], op_type=op_type, op_name=op_name) + + +def test_cosh(): + op_name, op_type = "test_cosh_example", "Cosh" + node = 
onnx.helper.make_node("Cosh", inputs=["x"], outputs=["y"], name=op_name) + + x = np.array([-1, 0, 1]).astype(np.float32) + y = np.cosh(x) # expected output [1.54308069, 1., 1.54308069] + op_expect(node, inputs=[x], outputs=[y], op_type=op_type, op_name=op_name) + + op_name, op_type = "test_cosh", "Cosh" + node = onnx.helper.make_node("Cosh", inputs=["x"], outputs=["y"], name=op_name) + x = np.random.randn(3, 4, 5).astype(np.float32) + y = np.cosh(x) + op_expect(node, inputs=[x], outputs=[y], op_type=op_type, op_name=op_name) + + +def test_depthtospace(): + op_name, op_type = "test_depthtospace_crd_mode_example", "DepthToSpace" + node = onnx.helper.make_node( + "DepthToSpace", + inputs=["x"], + outputs=["y"], + blocksize=2, + mode="CRD", + name=op_name, + ) + + # (1, 8, 2, 3) input tensor + x = np.array( + [ + [ + [[0.0, 1.0, 2.0], [3.0, 4.0, 5.0]], + [[9.0, 10.0, 11.0], [12.0, 13.0, 14.0]], + [[18.0, 19.0, 20.0], [21.0, 22.0, 23.0]], + [[27.0, 28.0, 29.0], [30.0, 31.0, 32.0]], + [[36.0, 37.0, 38.0], [39.0, 40.0, 41.0]], + [[45.0, 46.0, 47.0], [48.0, 49.0, 50.0]], + [[54.0, 55.0, 56.0], [57.0, 58.0, 59.0]], + [[63.0, 64.0, 65.0], [66.0, 67.0, 68.0]], + ] + ] + ).astype(np.float32) + + # (1, 2, 4, 6) output tensor + y = np.array( + [ + [ + [ + [0.0, 9.0, 1.0, 10.0, 2.0, 11.0], + [18.0, 27.0, 19.0, 28.0, 20.0, 29.0], + [3.0, 12.0, 4.0, 13.0, 5.0, 14.0], + [21.0, 30.0, 22.0, 31.0, 23.0, 32.0], + ], + [ + [36.0, 45.0, 37.0, 46.0, 38.0, 47.0], + [54.0, 63.0, 55.0, 64.0, 56.0, 65.0], + [39.0, 48.0, 40.0, 49.0, 41.0, 50.0], + [57.0, 66.0, 58.0, 67.0, 59.0, 68.0], + ], + ] + ] + ).astype(np.float32) + op_expect(node, inputs=[x], outputs=[y], op_type=op_type, op_name=op_name) + + op_name = "test_depthtospace_example" + node = onnx.helper.make_node( + "DepthToSpace", + inputs=["x"], + outputs=["y"], + blocksize=2, + mode="DCR", + name=op_name, + ) + + # (1, 8, 2, 3) input tensor + x = np.array( + [ + [ + [[0.0, 1.0, 2.0], [3.0, 4.0, 5.0]], + [[9.0, 10.0, 11.0], [12.0, 13.0, 14.0]], + [[18.0, 19.0, 20.0], [21.0, 22.0, 23.0]], + [[27.0, 28.0, 29.0], [30.0, 31.0, 32.0]], + [[36.0, 37.0, 38.0], [39.0, 40.0, 41.0]], + [[45.0, 46.0, 47.0], [48.0, 49.0, 50.0]], + [[54.0, 55.0, 56.0], [57.0, 58.0, 59.0]], + [[63.0, 64.0, 65.0], [66.0, 67.0, 68.0]], + ] + ] + ).astype(np.float32) + + # (1, 2, 4, 6) output tensor + y = np.array( + [ + [ + [ + [0.0, 18.0, 1.0, 19.0, 2.0, 20.0], + [36.0, 54.0, 37.0, 55.0, 38.0, 56.0], + [3.0, 21.0, 4.0, 22.0, 5.0, 23.0], + [39.0, 57.0, 40.0, 58.0, 41.0, 59.0], + ], + [ + [9.0, 27.0, 10.0, 28.0, 11.0, 29.0], + [45.0, 63.0, 46.0, 64.0, 47.0, 65.0], + [12.0, 30.0, 13.0, 31.0, 14.0, 32.0], + [48.0, 66.0, 49.0, 67.0, 50.0, 68.0], + ], + ] + ] + ).astype(np.float32) + op_expect(node, inputs=[x], outputs=[y], op_type=op_type, op_name=op_name) + + +def test_div(): + op_name, op_type = "test_div_example", "Div" + node = onnx.helper.make_node("Div", inputs=["x", "y"], outputs=["z"], name=op_name) + + x = np.array([3, 4]).astype(np.float32) + y = np.array([1, 2]).astype(np.float32) + z = x / y # expected output [3., 2.] 
+    op_expect(node, inputs=[x, y], outputs=[z], op_type=op_type, op_name=op_name)
+
+    op_name, op_type = "test_div", "Div"
+    node = onnx.helper.make_node("Div", inputs=["x", "y"], outputs=["z"], name=op_name)
+
+    x = np.random.randn(3, 4, 5).astype(np.float32)
+    y = np.random.rand(3, 4, 5).astype(np.float32) + 1.0
+    z = x / y
+    op_expect(node, inputs=[x, y], outputs=[z], op_type=op_type, op_name=op_name)
+
+    op_name, op_type = "test_div_bcast", "Div"
+    node = onnx.helper.make_node("Div", inputs=["x", "y"], outputs=["z"], name=op_name)
+
+    x = np.random.randn(3, 4, 5).astype(np.float32)
+    y = np.random.rand(5).astype(np.float32) + 1.0
+    z = x / y
+    op_expect(node, inputs=[x, y], outputs=[z], op_type=op_type, op_name=op_name)
+
+
+@pytest.mark.skip(reason="TensorRT segmentation fault")
+def test_einsum():
+    op_name, op_type = "test_einsum_batch_diagonal", "Einsum"
+    eqn = "...ii ->...i"
+    node = onnx.helper.make_node("Einsum", inputs=["x"], outputs=["y"], equation=eqn, name=op_name)
+
+    # NOCC:invalid-name(other:onnx example)
+    X = np.random.randn(3, 5, 5).astype(np.float32)
+    from onnx.backend.test.case.node.einsum import einsum_reference_implementation
+
+    # NOCC:invalid-name(other:onnx example)
+    Z = einsum_reference_implementation(eqn, (X,))
+    op_expect(node, inputs=[X], outputs=[Z], op_type=op_type, op_name=op_name)
+
+
+def test_elu():
+    op_name, op_type = "test_elu_example", "Elu"
+    node = onnx.helper.make_node("Elu", inputs=["x"], outputs=["y"], alpha=2.0, name=op_name)
+
+    x = np.array([-1, 0, 1]).astype(np.float32)
+    # expected output [-1.2642411, 0., 1.]
+    y = np.clip(x, 0, np.inf) + (np.exp(np.clip(x, -np.inf, 0)) - 1) * 2.0
+    op_expect(node, inputs=[x], outputs=[y], op_type=op_type, op_name=op_name)
+
+    op_name, op_type = "test_elu", "Elu"
+    node = onnx.helper.make_node("Elu", inputs=["x"], outputs=["y"], alpha=2.0, name=op_name)
+
+    x = np.random.randn(3, 4, 5).astype(np.float32)
+    y = np.clip(x, 0, np.inf) + (np.exp(np.clip(x, -np.inf, 0)) - 1) * 2.0
+    op_expect(node, inputs=[x], outputs=[y], op_type=op_type, op_name=op_name)
+
+    op_name, op_type = "test_elu_default", "Elu"
+    default_alpha = 1.0
+    node = onnx.helper.make_node("Elu", inputs=["x"], outputs=["y"], name=op_name)
+    x = np.random.randn(3, 4, 5).astype(np.float32)
+    y = np.clip(x, 0, np.inf) + (np.exp(np.clip(x, -np.inf, 0)) - 1) * default_alpha
+    op_expect(node, inputs=[x], outputs=[y], op_type=op_type, op_name=op_name)
+
+
+def test_erf():
+    op_name, op_type = "test_erf", "Erf"
+    node = onnx.helper.make_node("Erf", inputs=["x"], outputs=["y"], name=op_name)
+
+    x = np.random.randn(1, 3, 32, 32).astype(np.float32)
+    import math
+
+    y = np.vectorize(math.erf)(x).astype(np.float32)
+    op_expect(node, inputs=[x], outputs=[y], op_type=op_type, op_name=op_name)
+
+
+def test_exp():
+    op_name, op_type = "test_exp_example", "Exp"
+    node = onnx.helper.make_node("Exp", inputs=["x"], outputs=["y"], name=op_name)
+
+    x = np.array([-1, 0, 1]).astype(np.float32)
+    y = np.exp(x)  # expected output [0.36787945, 1., 2.71828175]
+    op_expect(node, inputs=[x], outputs=[y], op_type=op_type, op_name=op_name)
+
+    op_name, op_type = "test_exp", "Exp"
+    node = onnx.helper.make_node("Exp", inputs=["x"], outputs=["y"], name=op_name)
+    x = np.random.randn(3, 4, 5).astype(np.float32)
+    y = np.exp(x)
+    op_expect(node, inputs=[x], outputs=[y], op_type=op_type, op_name=op_name)
+
+
+def test_eyelike():
+    op_name, op_type = "test_eyelike_populate_off_main_diagonal", "EyeLike"
+    shape = (4, 5)
+    off_diagonal_offset = 1
+    node = onnx.helper.make_node( +
"EyeLike", + inputs=["x"], + outputs=["y"], + k=off_diagonal_offset, + dtype=onnx.TensorProto.FLOAT, + name=op_name, + ) + + x = np.random.randint(0, 100, size=shape, dtype=np.int32) + y = np.eye(shape[0], shape[1], k=off_diagonal_offset, dtype=np.float32) + op_expect(node, inputs=[x], outputs=[y], op_type=op_type, op_name=op_name) + + op_name = "test_eyelike_with_dtype" + shape = (3, 4) + node = onnx.helper.make_node( + "EyeLike", + inputs=["x"], + outputs=["y"], + dtype=onnx.TensorProto.FLOAT, + name=op_name, + ) + + x = np.random.randint(0, 100, size=shape, dtype=np.int32) + y = np.eye(shape[0], shape[1], dtype=np.float32) + op_expect(node, inputs=[x], outputs=[y], op_type=op_type, op_name=op_name) + + op_name = "test_eyelike_without_dtype" + shape = (4, 4) + node = onnx.helper.make_node("EyeLike", inputs=["x"], outputs=["y"], name=op_name) + + x = np.random.randint(0, 100, size=shape, dtype=np.int32) + y = np.eye(shape[0], shape[1], dtype=np.int32) + op_expect(node, inputs=[x], outputs=[y], op_type=op_type, op_name=op_name) + + +def test_floor(): + op_name, op_type = "test_floor_example", "Floor" + node = onnx.helper.make_node("Floor", inputs=["x"], outputs=["y"], name=op_name) + + x = np.array([-1.5, 1.2, 2]).astype(np.float32) + y = np.floor(x) # expected output [-2., 1., 2.] + op_expect(node, inputs=[x], outputs=[y], op_type=op_type, op_name=op_name) + + op_name, op_type = "test_floor", "Floor" + node = onnx.helper.make_node("Floor", inputs=["x"], outputs=["y"], name=op_name) + + x = np.random.randn(3, 4, 5).astype(np.float32) + y = np.floor(x) + op_expect(node, inputs=[x], outputs=[y], op_type=op_type, op_name=op_name) + + +def verify_rnn( + seq_length, + batch_size, + input_size, + hidden_size, + rnn_type="LSTM", + use_bias=False, + activations=None, + alphas=None, + betas=None, + use_initial_state=False, + use_peep=False, + linear_before_reset=False, + op_name=None, + layout=0, +): + if rnn_type == "LSTM": + multiplier = 4 + elif rnn_type == "GRU": + multiplier = 3 + else: + raise NotImplementedError("%s RNNs not yet supported." % rnn_type) + + x_np = np.random.uniform(size=(seq_length, batch_size, input_size)).astype("float32") + w_np = np.random.uniform(size=(1, multiplier * hidden_size, input_size)).astype("float32") + r_np = np.random.uniform(size=(1, multiplier * hidden_size, hidden_size)).astype("float32") + input_names = ["X", "W", "R"] + + input_tensors = [ + helper.make_tensor_value_info("X", TensorProto.FLOAT, list(x_np.shape)), + helper.make_tensor_value_info("W", TensorProto.FLOAT, list(w_np.shape)), + helper.make_tensor_value_info("R", TensorProto.FLOAT, list(r_np.shape)), + ] + + input_values = [x_np, w_np, r_np] + + if use_bias: + b_np = np.random.uniform(size=(1, multiplier * 2 * hidden_size)).astype("float32") + input_names.append("B") + input_tensors.append( + helper.make_tensor_value_info("B", TensorProto.FLOAT, [1, multiplier * 2 * hidden_size]) + ) + input_values.append(b_np) + + if use_initial_state: + assert use_bias is True, "Initial states must have bias specified." 
+ sequence_np = np.repeat(seq_length, batch_size).astype("int32") + input_names.append("sequence_lens") + input_tensors.append( + helper.make_tensor_value_info("sequence_lens", TensorProto.INT32, [batch_size]) + ) + input_values.append(sequence_np) + + initial_h_np = np.random.uniform(size=(1, batch_size, hidden_size)).astype("float32") + input_names.append("initial_h") + input_tensors.append( + helper.make_tensor_value_info( + "initial_h", TensorProto.FLOAT, [1, batch_size, hidden_size] + ) + ) + input_values.append(initial_h_np) + + if rnn_type == "LSTM": + initial_c_np = np.random.uniform(size=(1, batch_size, hidden_size)).astype("float32") + input_names.append("initial_c") + input_tensors.append( + helper.make_tensor_value_info( + "initial_c", TensorProto.FLOAT, [1, batch_size, hidden_size] + ) + ) + input_values.append(initial_c_np) + + if use_peep and rnn_type == "LSTM": + assert use_initial_state is True, "Peepholes require initial state to be specified." + p_np = np.random.uniform(size=(1, 3 * hidden_size)).astype("float32") + input_names.append("P") + input_tensors.append( + helper.make_tensor_value_info("P", TensorProto.FLOAT, [1, 3 * hidden_size]) + ) + input_values.append(p_np) + + Y_shape = [seq_length, 1, batch_size, hidden_size] + Y_h_shape = [1, batch_size, hidden_size] + outputs = ["Y", "Y_h"] + + graph_outputs = [ + helper.make_tensor_value_info("Y", TensorProto.FLOAT, list(Y_shape)), + helper.make_tensor_value_info("Y_h", TensorProto.FLOAT, list(Y_h_shape)), + ] + output_shapes = [Y_shape, Y_h_shape] + + if rnn_type == "LSTM": + Y_c_shape_0 = [1, batch_size, hidden_size] + outputs.append("Y_c") + graph_outputs.append( + helper.make_tensor_value_info("Y_c", TensorProto.FLOAT, list(Y_c_shape_0)) + ) + output_shapes.append(Y_c_shape_0) + + rnn_node = helper.make_node( + rnn_type, + inputs=input_names, + outputs=outputs, + hidden_size=hidden_size, + layout=0, + name=op_name, + ) + if activations is not None: + activations_attr = helper.make_attribute("activations", activations) + rnn_node.attribute.append(activations_attr) + if alphas is not None: + alphas_attr = helper.make_attribute("activation_alpha", alphas) + rnn_node.attribute.append(alphas_attr) + if betas is not None: + betas_attr = helper.make_attribute("activation_beta", betas) + rnn_node.attribute.append(betas_attr) + if linear_before_reset and rnn_type == "GRU": + lbr_attr = helper.make_attribute("linear_before_reset", 1) + rnn_node.attribute.append(lbr_attr) + + graph = helper.make_graph([rnn_node], "rnn_test", inputs=input_tensors, outputs=graph_outputs) + + model = helper.make_model(graph, producer_name="rnn_test") + + verify_with_ort_with_trt(model, input_values, op_name, layout=layout) + + +def test_gather(): + op_name, op_type = "test_gather_0", "Gather" + node = onnx.helper.make_node( + "Gather", inputs=["data", "indices"], outputs=["y"], axis=0, name=op_name + ) + data = np.random.randn(5, 4, 3, 2).astype(np.float32) + indices = np.array([0, 1, 3]) + y = np.take(data, indices, axis=0) + + op_expect( + node, + inputs=[data, indices.astype(np.int64)], + outputs=[y], + op_type=op_type, + op_name=op_name, + ) + + op_name = "test_gather_1" + node = onnx.helper.make_node( + "Gather", inputs=["data", "indices"], outputs=["y"], axis=1, name=op_name + ) + data = np.random.randn(5, 4, 3, 2).astype(np.float32) + indices = np.array([0, 1, 3]) + y = np.take(data, indices, axis=1) + + op_expect( + node, + inputs=[data, indices.astype(np.int64)], + outputs=[y], + op_type=op_type, + op_name=op_name, + ) + + op_name = 
"test_gather_2d_indices" + node = onnx.helper.make_node( + "Gather", inputs=["data", "indices"], outputs=["y"], axis=1, name=op_name + ) + data = np.random.randn(3, 3).astype(np.float32) + indices = np.array([[0, 2]]) + y = np.take(data, indices, axis=1) + + op_expect( + node, + inputs=[data, indices.astype(np.int64)], + outputs=[y], + op_type=op_type, + op_name=op_name, + ) + + op_name = "test_gather_negative_indices" + node = onnx.helper.make_node( + "Gather", inputs=["data", "indices"], outputs=["y"], axis=0, name=op_name + ) + data = np.arange(10).astype(np.float32) + indices = np.array([0, -9, -10]) + y = np.take(data, indices, axis=0) + + # print(y) + # [0. 1. 0.] + + op_expect( + node, + inputs=[data, indices.astype(np.int64)], + outputs=[y], + op_type=op_type, + op_name=op_name, + ) + + +def test_gatherelement(): + op_name, op_type = "test_gather_elements_0", "GatherElements" + axis = 1 + node = onnx.helper.make_node( + "GatherElements", + inputs=["data", "indices"], + outputs=["y"], + axis=axis, + name=op_name, + ) + data = np.array([[1, 2], [3, 4]], dtype=np.float32) + indices = np.array([[0, 0], [1, 0]], dtype=np.int32) + + from onnx.backend.test.case.node.gatherelements import gather_elements + + y = gather_elements(data, indices, axis) + # print(y) produces + # [[1, 1], + # [4, 3]] + + op_expect( + node, + inputs=[data, indices.astype(np.int64)], + outputs=[y], + op_type=op_type, + op_name=op_name, + ) + + op_name = "test_gather_elements_1" + axis = 0 + node = onnx.helper.make_node( + "GatherElements", + inputs=["data", "indices"], + outputs=["y"], + axis=axis, + name=op_name, + ) + data = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=np.float32) + indices = np.array([[1, 2, 0], [2, 0, 0]], dtype=np.int32) + + y = gather_elements(data, indices, axis) + # print(y) produces + # [[4, 8, 3], + # [7, 2, 3]] + op_expect( + node, + inputs=[data, indices.astype(np.int64)], + outputs=[y], + op_type=op_type, + op_name=op_name, + ) + + op_name = "test_gather_elements_negative_indices" + axis = 0 + node = onnx.helper.make_node( + "GatherElements", + inputs=["data", "indices"], + outputs=["y"], + axis=axis, + name=op_name, + ) + data = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=np.float32) + indices = np.array([[-1, -2, 0], [-2, 0, 0]], dtype=np.int32) + + y = gather_elements(data, indices, axis) + # print(y) produces + # [[7, 5, 3], + # [4, 2, 3]] + op_expect( + node, + inputs=[data, indices.astype(np.int64)], + outputs=[y], + op_type=op_type, + op_name=op_name, + ) + + +def test_gathernd(): + op_name, op_type = "test_gathernd_example_float32", "GatherND" + node = onnx.helper.make_node( + "GatherND", inputs=["data", "indices"], outputs=["output"], name=op_name + ) + + data = np.array([[[0, 1], [2, 3]], [[4, 5], [6, 7]]], dtype=np.float32) + indices = np.array([[[0, 1]], [[1, 0]]], dtype=np.int64) + from onnx.backend.test.case.node.gathernd import gather_nd_impl + + output = gather_nd_impl(data, indices, 0) + expected_output = np.array([[[2, 3]], [[4, 5]]], dtype=np.float32) + assert np.array_equal(output, expected_output) + op_expect(node, inputs=[data, indices], outputs=[output], op_type=op_type, op_name=op_name) + + op_name = "test_gathernd_example_int32" + node = onnx.helper.make_node( + "GatherND", inputs=["data", "indices"], outputs=["output"], name=op_name + ) + + data = np.array([[0, 1], [2, 3]], dtype=np.int32) + indices = np.array([[0, 0], [1, 1]], dtype=np.int64) + output = gather_nd_impl(data, indices, 0) + expected_output = np.array([0, 3], dtype=np.int32) + assert 
np.array_equal(output, expected_output) + op_expect(node, inputs=[data, indices], outputs=[output], op_type=op_type, op_name=op_name) + + op_name = "test_gathernd_example_int32_batch_dim1" + node = onnx.helper.make_node( + "GatherND", + inputs=["data", "indices"], + outputs=["output"], + batch_dims=1, + name=op_name, + ) + + data = np.array([[[0, 1], [2, 3]], [[4, 5], [6, 7]]], dtype=np.int32) + indices = np.array([[1], [0]], dtype=np.int64) + output = gather_nd_impl(data, indices, 1) + expected_output = np.array([[2, 3], [4, 5]], dtype=np.int32) + assert np.array_equal(output, expected_output) + op_expect(node, inputs=[data, indices], outputs=[output], op_type=op_type, op_name=op_name) + + +def test_gemm(): + op_name, op_type = "test_gemm_all_attributes", "Gemm" + node = onnx.helper.make_node( + "Gemm", + inputs=["a", "b", "c"], + outputs=["y"], + alpha=0.25, + beta=0.35, + transA=1, + transB=1, + name=op_name, + ) + a = np.random.ranf([4, 3]).astype(np.float32) + b = np.random.ranf([5, 4]).astype(np.float32) + c = np.random.ranf([1, 5]).astype(np.float32) + from onnx.backend.test.case.node.gemm import gemm_reference_implementation + + y = gemm_reference_implementation(a, b, c, transA=1, transB=1, alpha=0.25, beta=0.35) + op_expect(node, inputs=[a, b, c], outputs=[y], op_type=op_type, op_name=op_name) + + op_name = "test_gemm_alpha" + node = onnx.helper.make_node( + "Gemm", inputs=["a", "b", "c"], outputs=["y"], alpha=0.5, name=op_name + ) + a = np.random.ranf([3, 5]).astype(np.float32) + b = np.random.ranf([5, 4]).astype(np.float32) + c = np.zeros([1, 4]).astype(np.float32) + y = gemm_reference_implementation(a, b, c, alpha=0.5) + op_expect(node, inputs=[a, b, c], outputs=[y], op_type=op_type, op_name=op_name) + + op_name = "test_gemm_beta" + node = onnx.helper.make_node( + "Gemm", inputs=["a", "b", "c"], outputs=["y"], beta=0.5, name=op_name + ) + a = np.random.ranf([2, 7]).astype(np.float32) + b = np.random.ranf([7, 4]).astype(np.float32) + c = np.random.ranf([1, 4]).astype(np.float32) + y = gemm_reference_implementation(a, b, c, beta=0.5) + op_expect(node, inputs=[a, b, c], outputs=[y], op_type=op_type, op_name=op_name) + + +def test_globalaveragepool(): + op_name, op_type = "test_globalaveragepool", "GlobalAveragePool" + node = onnx.helper.make_node("GlobalAveragePool", inputs=["x"], outputs=["y"], name=op_name) + x = np.random.randn(1, 3, 5, 5).astype(np.float32) + y = np.mean(x, axis=tuple(range(2, np.ndim(x))), keepdims=True) + op_expect(node, inputs=[x], outputs=[y], op_type=op_type, op_name=op_name) + + op_name = "test_globalaveragepool_precomputed" + node = onnx.helper.make_node("GlobalAveragePool", inputs=["x"], outputs=["y"], name=op_name) + x = np.array( + [ + [ + [ + [1, 2, 3], + [4, 5, 6], + [7, 8, 9], + ] + ] + ] + ).astype(np.float32) + y = np.array([[[[5]]]]).astype(np.float32) + op_expect(node, inputs=[x], outputs=[y], op_type=op_type, op_name=op_name) + + +def test_globalmaxpool(): + op_name = "test_globalmaxpool" + op_type = "GlobalMaxPool" + node = onnx.helper.make_node("GlobalMaxPool", inputs=["x"], outputs=["y"], name=op_name) + x = np.random.randn(1, 3, 5, 5).astype(np.float32) + y = np.max(x, axis=tuple(range(2, np.ndim(x))), keepdims=True) + op_expect(node, inputs=[x], outputs=[y], op_type=op_type, op_name=op_name) + + op_name = "test_globalmaxpool_precomputed" + node = onnx.helper.make_node("GlobalMaxPool", inputs=["x"], outputs=["y"], name=op_name) + x = np.array( + [ + [ + [ + [1, 2, 3], + [4, 5, 6], + [7, 8, 9], + ] + ] + ] + ).astype(np.float32) + y = 
np.array([[[[9]]]]).astype(np.float32) + op_expect(node, inputs=[x], outputs=[y], op_type=op_type, op_name=op_name) + + +def test_hardsigmoid(): + op_name, op_type = "test_hardsigmoid_example", "HardSigmoid" + node = onnx.helper.make_node( + "HardSigmoid", inputs=["x"], outputs=["y"], alpha=0.5, beta=0.6, name=op_name + ) + + x = np.array([-1, 0, 1]).astype(np.float32) + y = np.clip(x * 0.5 + 0.6, 0, 1) # expected output [0.1, 0.6, 1.] + op_expect(node, inputs=[x], outputs=[y], op_type=op_type, op_name=op_name) + + op_name = "test_hardsigmoid" + node = onnx.helper.make_node( + "HardSigmoid", inputs=["x"], outputs=["y"], alpha=0.5, beta=0.6, name=op_name + ) + x = np.random.randn(3, 4, 5).astype(np.float32) + y = np.clip(x * 0.5 + 0.6, 0, 1) + op_expect(node, inputs=[x], outputs=[y], op_type=op_type, op_name=op_name) + + op_name = "test_hardsigmoid_default" + + default_alpha = 0.2 + default_beta = 0.5 + node = onnx.helper.make_node("HardSigmoid", inputs=["x"], outputs=["y"], name=op_name) + x = np.random.randn(3, 4, 5).astype(np.float32) + y = np.clip(x * default_alpha + default_beta, 0, 1) + op_expect(node, inputs=[x], outputs=[y], op_type=op_type, op_name=op_name) + + +def test_hardswish(): + op_name, op_type = "test_hardswish", "HardSwish" + node = onnx.helper.make_node("HardSwish", inputs=["x"], outputs=["y"], name=op_name) + x = np.random.randn(3, 4, 5).astype(np.float32) + from onnx.backend.test.case.node.hardswish import hardswish + + y = hardswish(x) + + op_expect(node, inputs=[x], outputs=[y], op_type=op_type, op_name=op_name) + + +def test_hardmax(): + op_name, op_type = "test_hardmax_example", "Hardmax" + node = onnx.helper.make_node("Hardmax", inputs=["x"], outputs=["y"], name=op_name) + + x = np.array([[3, 0, 1, 2], [2, 5, 1, 0], [0, 1, 3, 2], [0, 1, 2, 3]]).astype(np.float32) + # expect result: + # [[1. 0. 0. 0.] + # [0. 1. 0. 0.] + # [0. 0. 1. 0.] + # [0. 0. 0. 
1.]] + from onnx.backend.test.case.node.hardmax import hardmax + + y = hardmax(x) + op_expect(node, inputs=[x], outputs=[y], op_type=op_type, op_name=op_name) + + +def test_identity(): + op_name, op_type = "test_identity", "Identity" + node = onnx.helper.make_node("Identity", inputs=["x"], outputs=["y"], name=op_name) + + data = np.array( + [ + [ + [ + [1, 2], + [3, 4], + ] + ] + ], + dtype=np.float32, + ) + + op_expect(node, inputs=[data], outputs=[data], op_type=op_type, op_name=op_name) + + +def test_instancenormalization(): + op_name, op_type = "test_instancenorm_example", "InstanceNormalization" + + def _instancenorm_test_mode(x, s, bias, epsilon=1e-5): # type: ignore + dims_x = len(x.shape) + axis = tuple(range(2, dims_x)) + mean = np.mean(x, axis=axis, keepdims=True) + var = np.var(x, axis=axis, keepdims=True) + dim_ones = (1,) * (dims_x - 2) + s = s.reshape(-1, *dim_ones) + bias = bias.reshape(-1, *dim_ones) + return s * (x - mean) / np.sqrt(var + epsilon) + bias + + # input size: (1, 2, 1, 3) + x = np.array([[[[-1, 0, 1]], [[2, 3, 4]]]]).astype(np.float32) + s = np.array([1.0, 1.5]).astype(np.float32) + bias = np.array([0, 1]).astype(np.float32) + y = _instancenorm_test_mode(x, s, bias).astype(np.float32) + + node = onnx.helper.make_node( + "InstanceNormalization", inputs=["x", "s", "bias"], outputs=["y"], name=op_name + ) + + # output size: (1, 2, 1, 3) + op_expect(node, inputs=[x, s, bias], outputs=[y], op_type=op_type, op_name=op_name) + + op_name = "test_instancenorm_epsilon" + # input size: (2, 3, 4, 5) + x = np.random.randn(2, 3, 4, 5).astype(np.float32) + s = np.random.randn(3).astype(np.float32) + bias = np.random.randn(3).astype(np.float32) + epsilon = 1e-2 + y = _instancenorm_test_mode(x, s, bias, epsilon).astype(np.float32) + + node = onnx.helper.make_node( + "InstanceNormalization", + inputs=["x", "s", "bias"], + outputs=["y"], + epsilon=epsilon, + name=op_name, + ) + + # output size: (2, 3, 4, 5) + op_expect(node, inputs=[x, s, bias], outputs=[y], op_type=op_type, op_name=op_name) + + +def test_leakyrelu(): + op_name, op_type = "test_leakyrelu_example", "LeakyRelu" + node = onnx.helper.make_node("LeakyRelu", inputs=["x"], outputs=["y"], alpha=0.1, name=op_name) + + x = np.array([-1, 0, 1]).astype(np.float32) + # expected output [-0.1, 0., 1.] 
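+    # Reference LeakyReLU: max(x, 0) + alpha * min(x, 0); the clip-based expression below
+    # computes exactly that with alpha = 0.1.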
+ y = np.clip(x, 0, np.inf) + np.clip(x, -np.inf, 0) * 0.1 + op_expect(node, inputs=[x], outputs=[y], op_type=op_type, op_name=op_name) + + op_name = "test_leakyrelu" + node = onnx.helper.make_node("LeakyRelu", inputs=["x"], outputs=["y"], alpha=0.1, name=op_name) + + x = np.random.randn(3, 4, 5).astype(np.float32) + y = np.clip(x, 0, np.inf) + np.clip(x, -np.inf, 0) * 0.1 + op_expect(node, inputs=[x], outputs=[y], op_type=op_type, op_name=op_name) + + op_name = "test_leakyrelu_default" + default_alpha = 0.01 + node = onnx.helper.make_node("LeakyRelu", inputs=["x"], outputs=["y"], name=op_name) + x = np.random.randn(3, 4, 5).astype(np.float32) + y = np.clip(x, 0, np.inf) + np.clip(x, -np.inf, 0) * default_alpha + op_expect(node, inputs=[x], outputs=[y], op_type=op_type, op_name=op_name) + + +def test_log(): + op_name = "test_log_example" + op_type = "Log" + node = onnx.helper.make_node("Log", inputs=["x"], outputs=["y"], name=op_name) + + x = np.array([1, 10]).astype(np.float32) + y = np.log(x) # expected output [0., 2.30258512] + op_expect(node, inputs=[x], outputs=[y], op_type=op_type, op_name=op_name) + + op_name = "test_log" + node = onnx.helper.make_node("Log", inputs=["x"], outputs=["y"], name=op_name) + + x = np.exp(np.random.randn(3, 4, 5).astype(np.float32)) + y = np.log(x) + op_expect(node, inputs=[x], outputs=[y], op_type=op_type, op_name=op_name) + + +@pytest.mark.skip(reason="Wrong answer, at axis 1") +def test_logsoftmax(): + op_name, op_type = "test_logsoftmax_example_1", "LogSoftmax" + node = onnx.helper.make_node("LogSoftmax", inputs=["x"], outputs=["y"], name=op_name) + x = np.array([[-1, 0, 1]]).astype(np.float32) + # expected output + # [[-2.4076061 -1.407606 -0.407606 ]] + from onnx.backend.test.case.node.logsoftmax import logsoftmax + + y = logsoftmax(x) + op_expect(node, inputs=[x], outputs=[y], op_type=op_type, op_name=op_name) + + x = np.array([[0, 1, 2, 3], [10000, 10001, 10002, 10003]]).astype(np.float32) + axis_order = [0, 1, -1] + for axis in axis_order: + op_name = "test_logsoftmax_axis_{}".format(str(axis + 1)) + node = onnx.helper.make_node( + "LogSoftmax", inputs=["x"], outputs=["y"], axis=axis, name=op_name + ) + y = logsoftmax(x, axis=axis) + op_expect(node, inputs=[x], outputs=[y], op_type=op_type, op_name=op_name) + + +def test_matmul(): + op_name, op_type = "test_matmul_2d", "MatMul" + node = onnx.helper.make_node("MatMul", inputs=["a", "b"], outputs=["c"], name=op_name) + + # 2d + a = np.random.randn(3, 4).astype(np.float32) + b = np.random.randn(4, 3).astype(np.float32) + c = np.matmul(a, b) + op_expect(node, inputs=[a, b], outputs=[c], op_type=op_type, op_name=op_name) + + +def test_max(): + op_name = "test_max_example" + op_type = "Max" + data_0 = np.array([3, 2, 1]).astype(np.float32) + data_1 = np.array([1, 4, 4]).astype(np.float32) + data_2 = np.array([2, 5, 3]).astype(np.float32) + result = np.array([3, 5, 4]).astype(np.float32) + node = onnx.helper.make_node( + "Max", inputs=["data_0", "data_1", "data_2"], outputs=["result"], name=op_name + ) + op_expect( + node, + inputs=[data_0, data_1, data_2], + outputs=[result], + op_type=op_type, + op_name=op_name, + ) + + op_name = "test_max_two_inputs" + result = np.maximum(data_0, data_1) + node = onnx.helper.make_node( + "Max", inputs=["data_0", "data_1"], outputs=["result"], name=op_name + ) + op_expect( + node, + inputs=[data_0, data_1], + outputs=[result], + op_type=op_type, + op_name=op_name, + ) + + +def _test_maxpool_2d_ceil(): + op_name, op_type = "test_maxpool_2d_ceil", "MaxPool" + node = 
onnx.helper.make_node( + "MaxPool", + inputs=["x"], + outputs=["y"], + kernel_shape=[3, 3], + strides=[2, 2], + ceil_mode=True, + name=op_name, + ) + x = np.array( + [ + [ + [ + [1, 2, 3, 4], + [5, 6, 7, 8], + [9, 10, 11, 12], + [13, 14, 15, 16], + ] + ] + ] + ).astype(np.float32) + y = np.array([[[[11, 12], [15, 16]]]]).astype(np.float32) + + op_expect(node, inputs=[x], outputs=[y], op_type=op_type, op_name=op_name) + + +def _test_maxpool_1d_default(): + op_name, op_type = "test_maxpool_1d_default", "MaxPool" + node = onnx.helper.make_node( + "MaxPool", inputs=["x"], outputs=["y"], kernel_shape=[2], name=op_name + ) + x = np.random.randn(1, 3, 32).astype(np.float32) + x_shape = np.shape(x) + kernel_shape = [2] + strides = [1] + from onnx.backend.test.case.node.pool_op_common import get_output_shape, pool + + out_shape = get_output_shape("VALID", x_shape[2:], kernel_shape, strides) + padded = x + y = pool(padded, x_shape, kernel_shape, strides, out_shape, [0], "MAX") + op_expect(node, inputs=[x], outputs=[y], op_type=op_type, op_name=op_name) + + +def test_maxpool(): + _test_maxpool_2d_ceil() + _test_maxpool_1d_default() + + +def test_mean(): + op_name, op_type = "test_mean_example", "Mean" + data_0 = np.array([3, 0, 2]).astype(np.float32) + data_1 = np.array([1, 3, 4]).astype(np.float32) + data_2 = np.array([2, 6, 6]).astype(np.float32) + result = np.array([2, 3, 4]).astype(np.float32) + node = onnx.helper.make_node( + "Mean", inputs=["data_0", "data_1", "data_2"], outputs=["result"], name=op_name + ) + op_expect( + node, + inputs=[data_0, data_1, data_2], + outputs=[result], + op_type=op_type, + op_name=op_name, + ) + + op_name = "test_mean_two_inputs" + result = np.divide(np.add(data_0, data_1), 2.0) + node = onnx.helper.make_node( + "Mean", inputs=["data_0", "data_1"], outputs=["result"], name=op_name + ) + op_expect( + node, + inputs=[data_0, data_1], + outputs=[result], + op_type=op_type, + op_name=op_name, + ) + + +def test_min(): + op_name, op_type = "test_min_example", "Min" + data_0 = np.array([3, 2, 1]).astype(np.float32) + data_1 = np.array([1, 4, 4]).astype(np.float32) + data_2 = np.array([2, 5, 0]).astype(np.float32) + result = np.array([1, 2, 0]).astype(np.float32) + node = onnx.helper.make_node( + "Min", inputs=["data_0", "data_1", "data_2"], outputs=["result"], name=op_name + ) + op_expect( + node, + inputs=[data_0, data_1, data_2], + outputs=[result], + op_type=op_type, + op_name=op_name, + ) + + op_name = "test_min_two_inputs" + result = np.minimum(data_0, data_1) + node = onnx.helper.make_node( + "Min", inputs=["data_0", "data_1"], outputs=["result"], name=op_name + ) + op_expect( + node, + inputs=[data_0, data_1], + outputs=[result], + op_type=op_type, + op_name=op_name, + ) + + +def test_mul(): + op_name, op_type = "test_mul_example", "Mul" + node = onnx.helper.make_node("Mul", inputs=["x", "y"], outputs=["z"], name=op_name) + + x = np.array([1, 2, 3]).astype(np.float32) + y = np.array([4, 5, 6]).astype(np.float32) + z = x * y # expected output [4., 10., 18.] 
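+    # Like the other elementwise binary ops here, Mul uses the NumPy product as the reference;
+    # the test_mul_bcast case below broadcasts a (5,) operand against a (3, 4, 5) tensor.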
+ op_expect(node, inputs=[x, y], outputs=[z], op_type=op_type, op_name=op_name) + + op_name = "test_mul" + node = onnx.helper.make_node("Mul", inputs=["x", "y"], outputs=["z"], name=op_name) + x = np.random.randn(3, 4, 5).astype(np.float32) + y = np.random.randn(3, 4, 5).astype(np.float32) + z = x * y + op_expect(node, inputs=[x, y], outputs=[z], op_type=op_type, op_name=op_name) + + op_name = "test_mul_bcast" + node = onnx.helper.make_node("Mul", inputs=["x", "y"], outputs=["z"], name=op_name) + + x = np.random.randn(3, 4, 5).astype(np.float32) + y = np.random.randn(5).astype(np.float32) + z = x * y + op_expect(node, inputs=[x, y], outputs=[z], op_type=op_type, op_name=op_name) + + +def test_neg(): + op_name, op_type = "test_neg_example", "Neg" + node = onnx.helper.make_node("Neg", inputs=["x"], outputs=["y"], name=op_name) + + x = np.array([-4, 2]).astype(np.float32) + y = np.negative(x) # expected output [4., -2.], + op_expect(node, inputs=[x], outputs=[y], op_type=op_type, op_name=op_name) + + op_name = "test_neg" + node = onnx.helper.make_node("Neg", inputs=["x"], outputs=["y"], name=op_name) + x = np.random.randn(3, 4, 5).astype(np.float32) + y = np.negative(x) + op_expect(node, inputs=[x], outputs=[y], op_type=op_type, op_name=op_name) + + +def test_negativeloglikelihoodloss(): + op_name, op_type = "test_nllloss_NC", "NegativeLogLikelihoodLoss" + reduction = "none" + node = onnx.helper.make_node( + "NegativeLogLikelihoodLoss", + inputs=["input", "target"], + outputs=["loss"], + reduction=reduction, + name=op_name, + ) + + # NOCC:invalid-name(å…¶ä»–:onnx example) + N, C = 3, 5 + np.random.seed(0) + input = np.random.rand(N, C).astype(np.float32) + target = np.random.randint(0, high=C, size=(N,)).astype(np.int64) + from onnx.backend.test.case.node.negativeloglikelihoodloss import ( + compute_negative_log_likelihood_loss, + ) + + negative_log_likelihood_loss = compute_negative_log_likelihood_loss( + input, target, weight=None, reduction=reduction + ) + + op_expect( + node, + inputs=[input, target], + outputs=[negative_log_likelihood_loss], + op_type=op_type, + op_name=op_name, + ) + + +def test_prelu(): + op_name, op_type = "test_prelu_example", "PRelu" + node = onnx.helper.make_node("PRelu", inputs=["x", "slope"], outputs=["y"], name=op_name) + + x = np.random.randn(3, 4, 5).astype(np.float32) + slope = np.random.randn(3, 4, 5).astype(np.float32) + y = np.clip(x, 0, np.inf) + np.clip(x, -np.inf, 0) * slope + + op_expect(node, inputs=[x, slope], outputs=[y], op_type=op_type, op_name=op_name) + + op_name = "test_prelu_broadcast" + node = onnx.helper.make_node("PRelu", inputs=["x", "slope"], outputs=["y"], name=op_name) + + x = np.random.randn(3, 4, 5).astype(np.float32) + slope = np.random.randn(5).astype(np.float32) + y = np.clip(x, 0, np.inf) + np.clip(x, -np.inf, 0) * slope + + op_expect(node, inputs=[x, slope], outputs=[y], op_type=op_type, op_name=op_name) + + +def test_pow(): + op_name, op_type = "test_pow_example", "Pow" + node = onnx.helper.make_node("Pow", inputs=["x", "y"], outputs=["z"], name=op_name) + + x = np.array([1, 2, 3]).astype(np.float32) + y = np.array([4, 5, 6]).astype(np.float32) + z = pow(x, y) # expected output [1., 32., 729.] 
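+    # For ndarray operands Python's built-in pow defers to NumPy's elementwise power (x ** y),
+    # so z is the expected elementwise result.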
+ op_expect(node, inputs=[x, y], outputs=[z], op_type=op_type, op_name=op_name) + + op_name = "test_pow" + node = onnx.helper.make_node("Pow", inputs=["x", "y"], outputs=["z"], name=op_name) + x = np.arange(60).reshape(3, 4, 5).astype(np.float32) + y = np.random.randn(3, 4, 5).astype(np.float32) + z = pow(x, y) + op_expect(node, inputs=[x, y], outputs=[z], op_type=op_type, op_name=op_name) + + op_name = "test_pow_bcast_scalar" + node = onnx.helper.make_node("Pow", inputs=["x", "y"], outputs=["z"], name=op_name) + + x = np.array([1, 2, 3]).astype(np.float32) + y = np.array([2]).astype(np.float32) + z = pow(x, y) # expected output [1., 4., 9.] + op_expect(node, inputs=[x, y], outputs=[z], op_type=op_type, op_name=op_name) + + op_name = "test_pow_bcast_array" + node = onnx.helper.make_node("Pow", inputs=["x", "y"], outputs=["z"], name=op_name) + x = np.array([[1, 2, 3], [4, 5, 6]]).astype(np.float32) + y = np.array([[1, 2, 3]]).astype(np.float32) + # expected output [[1, 4, 27], [4, 25, 216]] + z = pow(x, y) + op_expect(node, inputs=[x, y], outputs=[z], op_type=op_type, op_name=op_name) + + +def test_reciprocal(): + op_name, op_type = "test_reciprocal_example", "Reciprocal" + node = onnx.helper.make_node("Reciprocal", inputs=["x"], outputs=["y"], name=op_name) + + x = np.array([-4, 2]).astype(np.float32) + y = np.reciprocal(x) # expected output [-0.25, 0.5], + op_expect(node, inputs=[x], outputs=[y], op_type=op_type, op_name=op_name) + + op_name = "test_reciprocal" + node = onnx.helper.make_node("Reciprocal", inputs=["x"], outputs=["y"], name=op_name) + x = np.random.rand(3, 4, 5).astype(np.float32) + 0.5 + y = np.reciprocal(x) + op_expect(node, inputs=[x], outputs=[y], op_type=op_type, op_name=op_name) + + +def test_reducel1(): + op_name, op_type = "test_reduce_l1_default_axes_keepdims_example", "ReduceL1" + shape = [3, 2, 2] + axes = None + keepdims = 1 + + node = onnx.helper.make_node( + "ReduceL1", + inputs=["data"], + outputs=["reduced"], + keepdims=keepdims, + name=op_name, + ) + + data = np.reshape(np.arange(1, np.prod(shape) + 1, dtype=np.float32), shape) + # print(data) + # [[[1., 2.], [3., 4.]], [[5., 6.], [7., 8.]], [[9., 10.], [11., 12.]]] + + reduced = np.sum(a=np.abs(data), axis=axes, keepdims=keepdims == 1) + # print(reduced) + # [[[78.]]] + + op_expect(node, inputs=[data], outputs=[reduced], op_type=op_type, op_name=op_name) + + np.random.seed(0) + data = np.random.uniform(-10, 10, shape).astype(np.float32) + reduced = np.sum(a=np.abs(data), axis=axes, keepdims=keepdims == 1) + + op_name = "test_reduce_l1_default_axes_keepdims_random" + node = onnx.helper.make_node( + "ReduceL1", + inputs=["data"], + outputs=["reduced"], + keepdims=keepdims, + name=op_name, + ) + op_expect(node, inputs=[data], outputs=[reduced], op_type=op_type, op_name=op_name) + + +def test_reducel2(): + op_name, op_type = "test_reduce_l2_default_axes_keepdims_example", "ReduceL2" + shape = [3, 2, 2] + axes = None + keepdims = 1 + + node = onnx.helper.make_node( + "ReduceL2", + inputs=["data"], + outputs=["reduced"], + keepdims=keepdims, + name=op_name, + ) + + data = np.reshape(np.arange(1, np.prod(shape) + 1, dtype=np.float32), shape) + # print(data) + # [[[1., 2.], [3., 4.]], [[5., 6.], [7., 8.]], [[9., 10.], [11., 12.]]] + + reduced = np.sqrt(np.sum(a=np.square(data), axis=axes, keepdims=keepdims == 1)) + # print(reduced) + # [[[25.49509757]]] + + op_expect(node, inputs=[data], outputs=[reduced], op_type=op_type, op_name=op_name) + + op_name = "test_reduce_l2_default_axes_keepdims_random" + 
np.random.seed(0) + data = np.random.uniform(-10, 10, shape).astype(np.float32) + reduced = np.sqrt(np.sum(a=np.square(data), axis=axes, keepdims=keepdims == 1)) + node = onnx.helper.make_node( + "ReduceL2", + inputs=["data"], + outputs=["reduced"], + keepdims=keepdims, + name=op_name, + ) + + op_expect(node, inputs=[data], outputs=[reduced], op_type=op_type, op_name=op_name) + + +@pytest.mark.skip(reason="ORT: Unrecognized attribute: axes for operator ReduceLogSu") +def test_reducelogsum(): + op_name, op_type = "test_reduce_log_sum_default", "ReduceLogSum" + node = onnx.helper.make_node("ReduceLogSum", inputs=["data"], outputs=["reduced"], name=op_name) + data = np.random.ranf([3, 4, 5]).astype(np.float32) + reduced = np.log(np.sum(data, keepdims=True)) + op_expect(node, inputs=[data], outputs=[reduced], op_type=op_type, op_name=op_name) + + op_name = "test_reduce_log_sum_negative_axes" + node = onnx.helper.make_node( + "ReduceLogSum", inputs=["data"], outputs=["reduced"], axes=[-2], name=op_name + ) + data = np.random.ranf([3, 4, 5]).astype(np.float32) + reduced = np.log(np.sum(data, axis=(-2), keepdims=True)) + # print(reduced) + op_expect(node, inputs=[data], outputs=[reduced], op_type=op_type, op_name=op_name) + + op_name = "test_reduce_log_sum_desc_axes" + node = onnx.helper.make_node( + "ReduceLogSum", + inputs=["data"], + outputs=["reduced"], + axes=[2, 1], + keepdims=0, + name=op_name, + ) + data = np.random.ranf([3, 4, 5]).astype(np.float32) + reduced = np.log(np.sum(data, axis=(2, 1), keepdims=False)) + op_expect(node, inputs=[data], outputs=[reduced], op_type=op_type, op_name=op_name) + + op_name = "test_reduce_log_sum_asc_axes" + node = onnx.helper.make_node( + "ReduceLogSum", + inputs=["data"], + outputs=["reduced"], + axes=[0, 1], + keepdims=0, + name=op_name, + ) + data = np.random.ranf([3, 4, 5]).astype(np.float32) + reduced = np.log(np.sum(data, axis=(0, 1), keepdims=False)) + op_expect(node, inputs=[data], outputs=[reduced], op_type=op_type, op_name=op_name) + + +def test_reducelogsumexp(): + op_name, op_type = ( + "test_reduce_log_sum_exp_default_axes_keepdims_example", + "ReduceLogSumExp", + ) + shape = [3, 2, 2] + axes = None + keepdims = 1 + + node = onnx.helper.make_node( + "ReduceLogSumExp", + inputs=["data"], + outputs=["reduced"], + keepdims=keepdims, + name=op_name, + ) + + data = np.array([[[5, 1], [20, 2]], [[30, 1], [40, 2]], [[55, 1], [60, 2]]], dtype=np.float32) + reduced = np.log(np.sum(np.exp(data), axis=axes, keepdims=keepdims == 1)) + # print(reduced) + # [[[60.00671387]]] + + op_expect(node, inputs=[data], outputs=[reduced], op_type=op_type, op_name=op_name) + + op_name = "test_reduce_log_sum_exp_default_axes_keepdims_random" + node = onnx.helper.make_node( + "ReduceLogSumExp", + inputs=["data"], + outputs=["reduced"], + keepdims=keepdims, + name=op_name, + ) + + np.random.seed(0) + data = np.random.uniform(-10, 10, shape).astype(np.float32) + reduced = np.log(np.sum(np.exp(data), axis=axes, keepdims=keepdims == 1)) + op_expect(node, inputs=[data], outputs=[reduced], op_type=op_type, op_name=op_name) + + +def test_reducemax(): + op_name, op_type = "test_reduce_max_default_axes_keepdim_example", "ReduceMax" + shape = [3, 2, 2] + axes = None + keepdims = 1 + node = onnx.helper.make_node( + "ReduceMax", + inputs=["data"], + outputs=["reduced"], + keepdims=keepdims, + name=op_name, + ) + + data = np.array([[[5, 1], [20, 2]], [[30, 1], [40, 2]], [[55, 1], [60, 2]]], dtype=np.float32) + reduced = np.maximum.reduce(data, axis=axes, keepdims=keepdims == 1) + # 
print(reduced) + # [[[60.]]] + + op_expect(node, inputs=[data], outputs=[reduced], op_type=op_type, op_name=op_name) + + op_name = "test_reduce_max_default_axes_keepdims_random" + node = onnx.helper.make_node( + "ReduceMax", + inputs=["data"], + outputs=["reduced"], + keepdims=keepdims, + name=op_name, + ) + np.random.seed(0) + data = np.random.uniform(-10, 10, shape).astype(np.float32) + reduced = np.maximum.reduce(data, axis=axes, keepdims=keepdims == 1) + + op_expect(node, inputs=[data], outputs=[reduced], op_type=op_type, op_name=op_name) + + +def test_reducemean(): + op_name, op_type = "test_reduce_mean_default_axes_keepdims_example", "ReduceMean" + shape = [3, 2, 2] + axes = None + keepdims = 1 + + node = onnx.helper.make_node( + "ReduceMean", + inputs=["data"], + outputs=["reduced"], + keepdims=keepdims, + name=op_name, + ) + + data = np.array([[[5, 1], [20, 2]], [[30, 1], [40, 2]], [[55, 1], [60, 2]]], dtype=np.float32) + reduced = np.mean(data, axis=axes, keepdims=keepdims == 1) + # print(reduced) + # [[[18.25]]] + + op_expect(node, inputs=[data], outputs=[reduced], op_type=op_type, op_name=op_name) + + op_name = "test_reduce_mean_default_axes_keepdims_random" + + node = onnx.helper.make_node( + "ReduceMean", + inputs=["data"], + outputs=["reduced"], + keepdims=keepdims, + name=op_name, + ) + np.random.seed(0) + data = np.random.uniform(-10, 10, shape).astype(np.float32) + reduced = np.mean(data, axis=axes, keepdims=keepdims == 1) + + op_expect(node, inputs=[data], outputs=[reduced], op_type=op_type, op_name=op_name) + + +def test_reducesum(): + batch_size = 32 + op_name = "reduce_sum_1" + with tf.Graph().as_default(): + input_ph = tf.placeholder( + dtype=tf.float32, shape=[batch_size, 256], name="input" + ) # [batchsize, 10] + input_data = np.random.rand(batch_size, 256).astype(np.float32) + x = tf.math.reduce_sum(input_ph, axis=1, name=op_name) + _ = tf.identity(x, name="output") + verify_tf_with_trt_result([input_data], ["input:0"], ["output:0"], op_name=op_name) + + +def test_maxunpool(): + def verify_maxunpool( + data, indices, kernel_shape, strides, output_shape=None, pads=None, op_name=None + ): + input_names = ["xT", "xI"] + input_info = [ + helper.make_tensor_value_info("xT", TensorProto.FLOAT, list(data.shape)), + helper.make_tensor_value_info("xI", TensorProto.INT64, list(indices.shape)), + ] + input_values = [data, indices] + # input_values = [data ] + if output_shape is not None: + input_names.append("output_shape") + input_info.append( + helper.make_tensor_value_info( + "output_shape", TensorProto.INT64, list(output_shape.shape) + ) + ) + input_values.append(output_shape) + else: + # Compute expected output shape + output_shape = np.asarray(([1, 1] + list(strides))) * np.asarray(list(data.shape)) + output_shape += np.asarray(([0, 0] + list(kernel_shape))) - np.asarray( + ([0, 0] + list(strides)) + ) + if pads is not None: + output_shape -= np.asarray( + [0, 0] + list(np.sum(np.reshape(list(pads), [-1, 2]), axis=-1)) + ) + output_shape = [int(i) for i in output_shape] + + node = helper.make_node( + "MaxUnpool", + inputs=input_names, + outputs=["y"], + kernel_shape=kernel_shape, + name=op_name, + ) + + if pads is not None: + pad_attr = helper.make_attribute("pads", pads) + node.attribute.append(pad_attr) + + if strides is not None: + strides_attr = helper.make_attribute("strides", strides) + node.attribute.append(strides_attr) + + graph = helper.make_graph( + [node], + "maxunpool_test", + inputs=input_info, + outputs=[helper.make_tensor_value_info("y", 
TensorProto.FLOAT, output_shape)], + ) + + model = helper.make_model(graph, producer_name="size_test") + verify_with_ort_with_trt(model, input_values, op_name=op_name, opset=11) + + # NOCC:invalid-name(å…¶ä»–:onnx example) + xT = np.array([[[[5, 6], [7, 8]]]], dtype=np.float32) + # NOCC:invalid-name(å…¶ä»–:onnx example) + xI = np.array([[[[0, 7], [13, 15]]]], dtype=np.int64) + verify_maxunpool(xT, xI, [2, 2], strides=[2, 2], op_name="max_unpool_1") + + +def _test_forward_one_hot(indices_shape, depth, on_value, off_value, axis, out_dtype, op_name): + inp_array1 = np.random.randint(0, 5, size=indices_shape) + with tf.Graph().as_default(): + in1 = tf.placeholder(shape=inp_array1.shape, dtype=inp_array1.dtype, name="input") + out = tf.one_hot(in1, depth, on_value, off_value, axis, dtype=out_dtype, name=op_name) + out = tf.identity(out, "output") + verify_tf_with_trt_result([inp_array1], ["input:0"], ["output:0"], op_name) + # compare_tf_with_tvm(inp_array1, in1.name, out.name) + + +def test_forward_one_hot(): + _test_forward_one_hot((3,), 3, 1.0, 0.0, -1, "float32", "onehot_2") + + +def test_where(): + op_name, op_type = "test_where", "Where" + node = onnx.helper.make_node( + "Where", inputs=["condition", "x", "y"], outputs=["z"], name=op_name + ) + condition = np.array([[1, 0], [1, 1]], dtype=bool) + x = np.array([[1, 2], [3, 4]], dtype=np.int64) + y = np.array([[9, 8], [7, 6]], dtype=np.int64) + z = np.where(condition, x, y) # expected output [[1, 8], [3, 4]] + op_expect(node, inputs=[condition, x, y], outputs=[z], op_type=op_type, op_name=op_name) + + +def _test_slice_iteration_v1(indata, outdata, starts, ends, axes=None): + op_name = "slice_0" + if axes: + y = helper.make_node( + "Slice", ["in"], ["out"], axes=axes, starts=starts, ends=ends, name=op_name + ) + else: + y = helper.make_node("Slice", ["in"], ["out"], starts=starts, ends=ends, name=op_name) + + graph = helper.make_graph( + [y], + "slice_test", + inputs=[helper.make_tensor_value_info("in", TensorProto.FLOAT, list(indata.shape))], + outputs=[helper.make_tensor_value_info("out", TensorProto.FLOAT, list(outdata.shape))], + ) + + model = helper.make_model(graph, producer_name="slice_test") + # verify_with_ort_with_trt(model, [indata], [outdata.shape], op_name=op_name, opset=1) + verify_with_ort_with_trt(model, [indata], op_name=op_name, opset=1) + + +def test_slice(): + x = np.random.randn(20, 10, 5).astype(np.float32) + _test_slice_iteration_v1(x, x[0:3, 0:10], starts=(0, 0), ends=(3, 10), axes=(0, 1)) + + +def verify_pad_v11(indata, pads, mode="constant", value=0.0): + op_name = "pad_001" + indata = np.array(indata).astype(np.float32) + # numpy expect result + len_dim = len(pads) // 2 + np_pads = [(pads[i], pads[i + len_dim]) for i in range(len_dim)] + pads = np.array(pads) + # onnx graph + if mode in ["edge", "reflect"]: + inputs = [indata] + outdata = np.pad(indata, pad_width=np_pads, mode=mode) + node = helper.make_node( + "Pad", inputs=["input", "pads"], outputs=["output"], mode=mode, name=op_name + ) + graph = helper.make_graph( + [node], + "pad_test", + inputs=[ + helper.make_tensor_value_info("input", TensorProto.FLOAT, list(indata.shape)), + helper.make_tensor_value_info("pads", TensorProto.INT64, (len(pads),)), + ], + initializer=[helper.make_tensor("pads", TensorProto.INT64, (len(pads),), pads)], + outputs=[ + helper.make_tensor_value_info("output", TensorProto.FLOAT, list(outdata.shape)) + ], + ) + else: + inputs = [indata] + outdata = np.pad(indata, pad_width=np_pads, mode="constant", constant_values=value) + node = 
helper.make_node( + "Pad", + inputs=["input", "pads", "constant_value"], + outputs=["output"], + mode="constant", + name=op_name, + ) + graph = helper.make_graph( + [node], + "pad_test", + inputs=[ + helper.make_tensor_value_info("input", TensorProto.FLOAT, list(indata.shape)), + helper.make_tensor_value_info("pads", TensorProto.INT64, (len(pads),)), + helper.make_tensor_value_info("constant_value", TensorProto.FLOAT, (1,)), + ], + initializer=[ + helper.make_tensor("pads", TensorProto.INT64, (len(pads),), pads), + helper.make_tensor("constant_value", TensorProto.FLOAT, (1,), [value]), + ], + outputs=[ + helper.make_tensor_value_info("output", TensorProto.FLOAT, list(outdata.shape)) + ], + ) + model = helper.make_model(graph, producer_name="pad_test") + verify_with_ort_with_trt(model, inputs, op_name, opset=11) + + +@pytest.mark.skip(reason="TensorRT segmentfault") +def test_pad(): + verify_pad_v11(np.random.randn(2, 2).astype(np.float32), [0, 1, 0, 0], "constant", 0.0) + + +@pytest.mark.skip(reason="TensorRT segmentfault") +def test_batch_norm(): + def verify_batch_norm(in_shape): + op_name = "batchNorm_{}".format(sum(in_shape)) + batchnorm = onnx.helper.make_node( + "BatchNormalization", + inputs=["x", "scale", "B", "mean", "var"], + outputs=["Y"], + name=op_name, + ) + + graph = helper.make_graph( + [batchnorm], + "batchnorm_test", + inputs=[ + helper.make_tensor_value_info("x", TensorProto.FLOAT, list(in_shape)), + helper.make_tensor_value_info("scale", TensorProto.FLOAT, [in_shape[1]]), + helper.make_tensor_value_info("B", TensorProto.FLOAT, [in_shape[1]]), + helper.make_tensor_value_info("mean", TensorProto.FLOAT, [in_shape[1]]), + helper.make_tensor_value_info("var", TensorProto.FLOAT, [in_shape[1]]), + ], + outputs=[helper.make_tensor_value_info("Y", TensorProto.FLOAT, list(in_shape))], + ) + + model = helper.make_model(graph, producer_name="batchnorm_test") + # X, scale, b, mean, var + inshapes = [in_shape, in_shape[1], in_shape[1], in_shape[1], in_shape[1]] + inputs = [np.random.uniform(size=ishape).astype("float32") for ishape in inshapes] + + verify_with_ort_with_trt(model, inputs, op_name=op_name) + + verify_batch_norm([1, 3, 224, 224]) + verify_batch_norm([1, 3, 24, 24]) + verify_batch_norm([16, 3, 24, 24]) + verify_batch_norm([16, 16, 24, 24]) + verify_batch_norm([16, 16, 10, 10]) + + +def verify_softmax(inshape, axis, op_name): + indata = np.random.uniform(size=inshape).astype(np.float32) + outshape = inshape + y = helper.make_node("Softmax", ["in"], ["out"], name=op_name) + if axis is not None: + axis_attr = helper.make_attribute("axis", axis) + y.attribute.append(axis_attr) + + graph = helper.make_graph( + [y], + "Softmax_test", + inputs=[helper.make_tensor_value_info("in", TensorProto.FLOAT, list(indata.shape))], + outputs=[helper.make_tensor_value_info("out", TensorProto.FLOAT, list(outshape))], + ) + + model = helper.make_model(graph, producer_name="Softmax_test") + verify_with_ort_with_trt(model, [indata], op_name=op_name) + + +def test_softmax(): + verify_softmax((1, 10), None, op_name="softmax_0") + # verify_softmax((1, 10), 1, op_name='softmax_1') + + +def verify_mod(x_shape, y_shape, fmod, out_shape, dtype="float32", op_name=""): + x_np = np.random.uniform(-100.0, 100.0, x_shape).astype(dtype) + y_np = np.random.uniform(-100.0, 100.0, y_shape).astype(dtype) + y_np = np.where(y_np == 0, 1, y_np) # remove 0's to avoid division by zero error + + mod_node = helper.make_node("Mod", inputs=["x", "y"], outputs=["z"], fmod=fmod, name=op_name) + + onnx_dtype = 
TensorProto.FLOAT if dtype == "float32" else TensorProto.INT32 + graph = helper.make_graph( + [mod_node], + "mod_test", + inputs=[ + helper.make_tensor_value_info("x", onnx_dtype, list(x_shape)), + helper.make_tensor_value_info("y", onnx_dtype, list(y_shape)), + ], + outputs=[helper.make_tensor_value_info("z", onnx_dtype, list(out_shape))], + ) + model = helper.make_model(graph, producer_name="mod_test") + # verify_with_ort_with_trt(model, [x_np, y_np], [out_shape], op_name=op_name) + verify_with_ort_with_trt(model, [x_np, y_np], op_name=op_name) + + +def test_mod(): + # Mod + verify_mod( + x_shape=[1, 32, 32], + y_shape=[1, 1, 32], + fmod=0, + out_shape=(1, 32, 32), + dtype="int32", + op_name="tvm_mod", + ) + + +def verify_mean(input_dim, op_name): + dtype = "float32" + a_np1 = np.random.uniform(size=input_dim).astype(dtype) + a_np2 = np.random.uniform(size=input_dim).astype(dtype) + a_np3 = np.random.uniform(size=input_dim).astype(dtype) + + mean_node = helper.make_node("Mean", ["a_np1", "a_np2", "a_np3"], ["out"], name=op_name) + + graph = helper.make_graph( + [mean_node], + "Mean_test", + inputs=[ + helper.make_tensor_value_info("a_np1", TensorProto.FLOAT, list(input_dim)), + helper.make_tensor_value_info("a_np2", TensorProto.FLOAT, list(input_dim)), + helper.make_tensor_value_info("a_np3", TensorProto.FLOAT, list(input_dim)), + ], + outputs=[helper.make_tensor_value_info("out", TensorProto.FLOAT, list(input_dim))], + ) + + model = helper.make_model(graph, producer_name="Mean_test") + verify_with_ort_with_trt(model, [a_np1, a_np2, a_np3], op_name=op_name) + + +def test_forward_mean(): + verify_mean((1, 3, 20, 20), op_name="mean_111") + verify_mean((20, 20), op_name="mean_222") + + +def verify_instance_norm(shape, axis=1, op_name="default"): + x = np.random.randn(*shape).astype(np.float32) + gamma = np.random.randn(shape[1]).astype(np.float32) + beta = np.random.randn(shape[1]).astype(np.float32) + epsilon = 1e-5 + + node = onnx.helper.make_node( + "InstanceNormalization", + inputs=["x", "gamma", "beta"], + outputs=["y"], + epsilon=epsilon, + name=op_name, + ) + graph = helper.make_graph( + [node], + "instance_norm_test", + inputs=[ + helper.make_tensor_value_info("x", TensorProto.FLOAT, list(shape)), + helper.make_tensor_value_info("gamma", TensorProto.FLOAT, (shape[1],)), + helper.make_tensor_value_info("beta", TensorProto.FLOAT, (shape[1],)), + ], + outputs=[helper.make_tensor_value_info("y", TensorProto.FLOAT, list(shape))], + ) + model = helper.make_model(graph, producer_name="instance_norm_test") + verify_with_ort_with_trt(model, [x, gamma, beta], op_name=op_name) + + +def test_instance_norm(): + verify_instance_norm((2, 3, 4, 5), op_name="instance_norm") + # verify_instance_norm((32, 64, 80, 64)) + # verify_instance_norm((8, 6, 5)) + # verify_instance_norm((8, 7, 6, 5, 4)) + + +def verify_lrn(shape, nsize, dtype, alpha=None, beta=None, bias=None, op_name=None): + in_array = np.random.uniform(size=shape).astype(dtype) + + if alpha is None and beta is None and bias is None: + alpha = 0.0001 + beta = 0.75 + bias = 1.0 + node = onnx.helper.make_node( + "LRN", inputs=["in"], outputs=["out"], size=nsize, name=op_name + ) + else: + node = onnx.helper.make_node( + "LRN", + inputs=["in"], + outputs=["out"], + alpha=alpha, + beta=beta, + bias=bias, + size=nsize, + name=op_name, + ) + + graph = helper.make_graph( + [node], + "lrn_test", + inputs=[helper.make_tensor_value_info("in", TensorProto.FLOAT, list(shape))], + outputs=[helper.make_tensor_value_info("out", TensorProto.FLOAT, 
list(shape))], + ) + model = helper.make_model(graph, producer_name="lrn_test") + verify_with_ort_with_trt(model, [in_array], op_name=op_name) + + +def test_lrn(): + verify_lrn((5, 5, 5, 5), 3, "float32", op_name="test_lrn_1") + verify_lrn( + (5, 5, 5, 5), + 3, + "float32", + alpha=0.0002, + beta=0.5, + bias=2.0, + op_name="test_lrn_2", + ) + + +def test_lstm(): + # # Different activation testing. + # # Default value hardsigmoid. + verify_rnn( + seq_length=2, + batch_size=1, + input_size=16, + hidden_size=32, + use_bias=False, + activations=["HardSigmoid", "Tanh", "Tanh"], + rnn_type="LSTM", + op_name="test_lstm_without_bias", + layout=1, + ) + + +def test_binary_ops(): + in_shape = (1, 2, 3, 3) + dtype = "float32" + out_shape = in_shape + + def verify_binary_ops(op, x, y, out_type="float32", op_name=None): + z = helper.make_node(op, ["in1", "in2"], ["out"], name=op_name) + graph = helper.make_graph( + [z], + "_test", + inputs=[ + helper.make_tensor_value_info("in1", TensorProto.FLOAT, x.shape), + helper.make_tensor_value_info("in2", TensorProto.FLOAT, y.shape), + ], + outputs=[ + helper.make_tensor_value_info( + "out", + mapping.NP_TYPE_TO_TENSOR_TYPE[np.dtype(out_type)], + list(out_shape), + ) + ], + ) + model = helper.make_model(graph, producer_name="_test") + verify_with_ort_with_trt(model, [x, y], op_name=op_name) + + x = np.random.uniform(size=in_shape).astype(dtype) + y = np.random.uniform(size=in_shape).astype(dtype) + z = np.random.uniform(size=(3,)).astype(dtype) + verify_binary_ops("Sub", x, y, op_name="sub_1") + verify_binary_ops("Sub", x, z, op_name="sub_2") + + +def verify_reduce_func(func, data, axis, keepdims, op_name=None): + inshape = data.shape + outshape = np.sum(data, axis=axis, keepdims=keepdims == 1).shape + + if axis: + node = onnx.helper.make_node( + func, + inputs=["x"], + outputs=["y"], + axes=axis, + keepdims=keepdims, + name=op_name, + ) + else: + node = onnx.helper.make_node( + func, inputs=["x"], outputs=["y"], keepdims=keepdims, name=op_name + ) + + graph = helper.make_graph( + [node], + "reduce_test", + inputs=[helper.make_tensor_value_info("x", TensorProto.FLOAT, list(inshape))], + outputs=[helper.make_tensor_value_info("y", TensorProto.FLOAT, list(outshape))], + ) + + model = helper.make_model(graph, producer_name="reduce_test") + + verify_with_ort_with_trt(model, [data], opset=11, op_name=op_name) + + +def test_all_reduce_funcs(): + funcs = [ + # "ReduceMax", + # "ReduceMean", + # "ReduceMin", + # "ReduceProd", + # "ReduceSum", + # "ReduceSumSquare", + "ReduceLogSum", + "ReduceLogSumExp", + "ReduceL1", + "ReduceL2", + ] + + for func in funcs: + for keepdims in [True, False]: + verify_reduce_func( + func, + np.random.randn(3, 2, 2).astype(np.float32), + axis=None, + keepdims=keepdims, + op_name=func + str(int(keepdims)) + "1", + ) + + verify_reduce_func( + func, + np.random.randn(3, 2, 3).astype(np.float32), + axis=None, + keepdims=keepdims, + op_name=func + str(int(keepdims)) + "2", + ) + + verify_reduce_func( + func, + np.random.randn(3, 3, 3).astype(np.float32), + axis=(1,), + keepdims=keepdims, + op_name=func + str(int(keepdims)) + "3", + ) + + verify_reduce_func( + func, + np.random.randn(3, 3, 3, 1).astype(np.float32), + axis=(1, 2), + keepdims=keepdims, + op_name=func + str(int(keepdims)) + "4", + ) + + verify_reduce_func( + func, + np.random.randn(3, 3, 3, 1).astype(np.float32), + axis=(1,), + keepdims=keepdims, + op_name=func + str(int(keepdims)) + "5", + ) + + verify_reduce_func( + func, + np.random.randn(1, 3, 4, 1).astype(np.float32), + 
axis=(1,), + keepdims=keepdims, + op_name=func + str(int(keepdims)) + "6", + ) + + +def verify_split(indata, outdatas, split, axis=0, pass_split=True, opset=11, op_name=None): + indata = np.array(indata).astype(np.float32) + outdatas = [np.array(o).astype(np.float32) for o in outdatas] + inputs = [helper.make_tensor_value_info("input", TensorProto.FLOAT, list(indata.shape))] + input_names = ["input"] + initializer = [] + + if split: + split_index = range(len(split)) + else: + split_index = range(len(outdatas)) + + if pass_split: + if opset >= 13: + input_names.append("split") + np_split = np.array(split).astype(np.int64) + inputs.append( + helper.make_tensor_value_info("split", TensorProto.INT64, list(np_split.shape)) + ) + indata = [indata, np_split] + initializer.append( + helper.make_tensor("split", TensorProto.INT64, list(np_split.shape), np_split) + ) + node = helper.make_node( + "Split", + inputs=input_names, + outputs=["output_{}".format(i) for i in range(len(split_index))], + axis=axis, + name=op_name, + ) + + if pass_split and opset < 13: + split_attr = helper.make_attribute("split", split) + node.attribute.append(split_attr) + + graph = helper.make_graph( + [node], + "split_test", + inputs=inputs, + initializer=initializer, + outputs=[ + helper.make_tensor_value_info( + "output_{}".format(i), TensorProto.FLOAT, list(outdatas[i].shape) + ) + for i in range(len(split_index)) + ], + ) + model = helper.make_model(graph, producer_name="split_test") + verify_with_ort_with_trt(model, indata, opset=opset, op_name=op_name) + + +def test_split(): + # 1D + verify_split( + [1.0, 2.0, 3.0, 4.0, 5.0, 6.0], + [[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], + [2, 2, 2], + 0, + op_name="split_1", + ) + verify_split( + [1.0, 2.0, 3.0, 4.0, 5.0, 6.0], + [[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], + [2, 2, 2], + 0, + False, + op_name="split_2", + ) + # 2D + verify_split( + [[1.0, 2.0, 3.0, 4.0], [7.0, 8.0, 9.0, 10.0]], + [[[1.0, 2.0], [7.0, 8.0]], [[3.0, 4.0], [9.0, 10.0]]], + [2, 2], + 1, + op_name="split_4", + ) + # Split evenly (unstack) + verify_split([1, 2, 3], [[1], [2], [3]], False, 0, False, op_name="split_5") + # Split a single value to a single value + verify_split([1], [[1]], [1], pass_split=True, op_name="split_6") + + +def verify_xor(x_shape, y_shape, op_name=None): + x_np = np.random.choice(a=[False, True], size=x_shape).astype("bool") + y_np = np.random.choice(a=[False, True], size=y_shape).astype("bool") + + np_out = np.logical_xor(x_np, y_np) + out_shape = np_out.shape + + xor_node = helper.make_node("Xor", inputs=["x", "y"], outputs=["z"], name=op_name) + + onnx_dtype = TensorProto.BOOL + graph = helper.make_graph( + [xor_node], + "xor_test", + inputs=[ + helper.make_tensor_value_info("x", onnx_dtype, list(x_shape)), + helper.make_tensor_value_info("y", onnx_dtype, list(y_shape)), + ], + outputs=[helper.make_tensor_value_info("z", onnx_dtype, list(out_shape))], + ) + model = helper.make_model(graph, producer_name="xor_test") + verify_with_ort_with_trt(model, [x_np, y_np], op_name=op_name) + + +@pytest.mark.skip(reason="TensorRT segmentfault") +def test_xor(): + # XOR + verify_xor(x_shape=[1, 32, 32], y_shape=[1, 32, 32], op_name="test_xor_1") + + # Xor broadcast + verify_xor(x_shape=[1, 32, 32], y_shape=[1, 1, 32], op_name="test_xor_2") + + +def verify_if(cond_array, op_name): + # Given a bool scalar input cond. + # return constant tensor x if cond is True, otherwise return constant tensor y. 
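+ # Both branches below are standalone subgraphs that emit a fixed constant, so the If output is always FLOAT with shape [5].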
+ then_out = onnx.helper.make_tensor_value_info("then_out", onnx.TensorProto.FLOAT, [5]) + else_out = onnx.helper.make_tensor_value_info("else_out", onnx.TensorProto.FLOAT, [5]) + + x = np.array([1, 2, 3, 4, 5]).astype(np.float32) + y = np.array([5, 4, 3, 2, 1]).astype(np.float32) + + then_const_node = onnx.helper.make_node( + "Constant", inputs=[], outputs=["then_out"], value=numpy_helper.from_array(x) + ) + + else_const_node = onnx.helper.make_node( + "Constant", inputs=[], outputs=["else_out"], value=numpy_helper.from_array(y) + ) + + then_body = onnx.helper.make_graph([then_const_node], "then_body", [], [then_out]) + + else_body = onnx.helper.make_graph([else_const_node], "else_body", [], [else_out]) + + if_node = onnx.helper.make_node( + "If", + inputs=["cond"], + outputs=["res"], + then_branch=then_body, + else_branch=else_body, + name=op_name, + ) + + if_graph = onnx.helper.make_graph( + [if_node], + "if_outer", + inputs=[ + onnx.helper.make_tensor_value_info("cond", onnx.TensorProto.BOOL, []), + ], + outputs=[ + onnx.helper.make_tensor_value_info("res", onnx.TensorProto.FLOAT, [5]), + ], + ) + + if_model = onnx.helper.make_model(if_graph) + if cond_array: + cond = np.array([1]).astype("bool") + else: + cond = np.array(1).astype("bool") + verify_with_ort_with_trt(if_model, [cond], op_name=op_name) + + +@pytest.mark.skip( + reason="ORT: NOT_IMPLEMENTED : Could not find an implementation for If(19) node with name 'if_test_1'" +) +def test_if(): + # Confirm that if works with cond as an array or scalar. + verify_if(cond_array=False, op_name="if_test_1") + verify_if(cond_array=True, op_name="if_test_2") + + +def test_softmax_cross_entropyloss(): + op_name = "test_SoftmaxCrossEntropyLoss" + reduction = "mean" + ignore_index = np.int64(-1) + + node = onnx.helper.make_node( + "SoftmaxCrossEntropyLoss", + inputs=["x", "y", "w"], + outputs=["z"], + reduction=reduction, + ignore_index=ignore_index, + name=op_name, + ) + # NOCC:invalid-name(å…¶ä»–:onnx example) + N, C, dim1 = 3, 5, 6 + np.random.seed(0) + x = np.random.rand(N, C, dim1).astype(np.float32) + labels = np.random.randint(0, high=C, size=(N, dim1)).astype(np.int64) + labels[0][0] = -1 + weight = np.random.rand(C).astype(np.float32) + from onnx.backend.test.case.node.softmaxcrossentropy import softmaxcrossentropy + + sce = softmaxcrossentropy( + x, labels, weight=weight, reduction=reduction, ignore_index=ignore_index + ) + + op_expect( + node, + inputs=[x, labels, weight], + outputs=[sce], + op_name=op_name, + op_type="float32", + ) + + +def _test_logical(method, op_name): + batch_size = 128 + input_data = (2 * np.random.rand(batch_size, 256) - 1).astype(np.float32) + with tf.Graph().as_default(): + input_ph = tf.placeholder(dtype=tf.float32, shape=[batch_size, 256], name="input") + x = tf.nn.relu(input_ph) + mask = tf.cast(x, tf.bool) + x = tf.nn.relu(tf.layers.dense(x, 256)) + y = x + x = tf.cast(x, tf.bool) + if method == "or": + x = tf.math.logical_or(x, mask, name=op_name) + elif method == "and": + x = tf.math.logical_and(x, mask, name=op_name) + elif method == "not": + x = tf.math.logical_not(x, name=op_name) + elif method == "equal": + x = tf.math.equal(x, mask, name=op_name) + elif method == "greater": + x = tf.math.greater(y, input_ph, name=op_name) + elif method == "xor": + x = tf.math.logical_xor(x, mask, name=op_name) + elif method == "is_inf": + x = tf.math.is_inf(input_ph, name=op_name) + elif method == "is_nan": + x = tf.math.is_nan(input_ph, name=op_name) + _ = tf.identity(x, name="output") + 
verify_tf_with_trt_result([input_data], ["input:0"], ["output:0"], op_name) + + +@pytest.mark.skip(reason="TensorRT segmentfault") +def test_logical(): + _test_logical("or", "test_logical_or") + _test_logical("and", "test_logical_and") + _test_logical("not", "test_logical_not") + _test_logical("equal", "test_logical_equal") + _test_logical("greater", "test_logical_greater") + _test_logical("xor", "test_logical_xor") + _test_logical("is_inf", "test_logical_inf") + _test_logical("is_nan", "test_logical_nan") + + +@pytest.mark.skip(reason="TensorRT segmentfault") +def test_scatternd(): + batch_size = 32 + op_name = "scatternd" + with tf.Graph().as_default(): + input_ph = tf.placeholder( + dtype=tf.float32, shape=[batch_size, 10], name="input" + ) # [batchsize, 10] + input_data = np.random.rand(batch_size, 10).astype(np.float32) + x = tf.layers.dense(input_ph, 1) + # duplicated indices case (undefined) + # test ScatterND (32, 128, 128, 256) (32, 600, 3) (32, 600, 256) + data = tf.tile(tf.reshape(tf.layers.dense(x, 128 * 128), [-1, 128, 128, 1]), [1, 1, 1, 256]) + x = tf.add(x, 1) + idx = tf.reshape(tf.layers.dense(x, 600 * 3), [-1, 600, 3]) + idx = tf.cast(tf.clip_by_value(idx, 0, 1), tf.int32) + indices = idx + # indices = tf.zeros([32, 600, 3], dtype=tf.dtypes.int32) + # indices = tf.stack([tf.range(tf.shape(x)[0]), idx], axis=1) + x = tf.add(x, 2) + updates = tf.reshape(tf.layers.dense(x, 600 * 256), [-1, 600, 256]) + # updates = tf.ones([32, 600, 256]) + x = tf.tensor_scatter_nd_update(data, indices, updates, name=op_name) + # x = tf.scatter_nd(indices, updates, data.shape) + _ = tf.identity(x, name="output") + verify_tf_with_trt_result([input_data], ["input:0"], ["output:0"], op_name) diff --git a/tests/python/tpat/cuda/trt.py b/tests/python/tpat/cuda/trt.py new file mode 100644 index 000000000000..4cf4151c2f43 --- /dev/null +++ b/tests/python/tpat/cuda/trt.py @@ -0,0 +1,178 @@ +# +# Copyright 1993-2019 NVIDIA Corporation. All rights reserved. +# +# NOTICE TO LICENSEE: +# +# This source code and/or documentation ("Licensed Deliverables") are +# subject to NVIDIA intellectual property rights under U.S. and +# international Copyright laws. +# +# These Licensed Deliverables contained herein is PROPRIETARY and +# CONFIDENTIAL to NVIDIA and is being provided under the terms and +# conditions of a form of NVIDIA software license agreement by and +# between NVIDIA and Licensee ("License Agreement") or electronically +# accepted by Licensee. Notwithstanding any terms or conditions to +# the contrary in the License Agreement, reproduction or disclosure +# of the Licensed Deliverables to any third party without the express +# written consent of NVIDIA is prohibited. +# +# NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE +# LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE +# SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS +# PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. +# NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED +# DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, +# NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. 
+# NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE +# LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY +# SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY +# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS +# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE +# OF THESE LICENSED DELIVERABLES. +# +# U.S. Government End Users. These Licensed Deliverables are a +# "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT +# 1995), consisting of "commercial computer software" and "commercial +# computer software documentation" as such terms are used in 48 +# C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government +# only as a commercial end item. Consistent with 48 C.F.R.12.212 and +# 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all +# U.S. Government End Users acquire the Licensed Deliverables with +# only those rights set forth herein. +# +# Any use of the Licensed Deliverables in individual and commercial +# software must include, in the user documentation and internal +# comments to the code, the above Disclaimer and U.S. Government End +# Users Notice. +# + +import ctypes +import os + +import numpy as np +import pycuda.autoinit +import pycuda.driver as cuda +import tensorrt as trt + + +def GiB(val): + return val * 1 << 30 + + +# Simple helper data class that's a little nicer to use than a 2-tuple. +class HostDeviceMem(object): + def __init__(self, host_mem, device_mem): + self.host = host_mem + self.device = device_mem + + def __str__(self): + return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device) + + def __repr__(self): + return self.__str__() + + +# Allocates all buffers required for an engine, i.e. host/device inputs/outputs. +def allocate_buffers(engine): + inputs = [] + outputs = [] + bindings = [] + stream = cuda.Stream() + for binding in engine: + size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size + dtype = trt.nptype(engine.get_binding_dtype(binding)) + # Allocate host and device buffers + host_mem = cuda.pagelocked_empty(size, dtype) + device_mem = cuda.mem_alloc(host_mem.nbytes) + # Append the device buffer to device bindings. + bindings.append(int(device_mem)) + # Append to the appropriate list. + if engine.binding_is_input(binding): + inputs.append(HostDeviceMem(host_mem, device_mem)) + else: + outputs.append(HostDeviceMem(host_mem, device_mem)) + return inputs, outputs, bindings, stream + + +# This function is generalized for multiple inputs/outputs. +# inputs and outputs are expected to be lists of HostDeviceMem objects. +def do_inference(context, bindings, inputs, outputs, stream, batch_size=1): + # Transfer input data to the GPU. + [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs] + # Run inference. + context.execute_async_v2(bindings=bindings, stream_handle=stream.handle) + # Transfer predictions back from the GPU. + [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs] + # Synchronize the stream + stream.synchronize() + # Return only the host outputs. 
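+ # Note: these are flat page-locked buffers; callers reshape them to the engine's output shapes as needed.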
+ return [out.host for out in outputs] + + +def build_engine( + onnx_model_path, + trt_logger=trt.Logger(trt.Logger.WARNING), + trt_engine_datatype=trt.DataType.FLOAT, + batch_size=1, + silent=False, +): + try: + with trt.Builder(trt_logger) as builder, builder.create_network( # type: ignore + 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH) # type: ignore + ) as network, trt.OnnxParser( # type: ignore + network, trt_logger + ) as parser: + # https://github.com/NVIDIA/TensorRT/blob/main/demo/BERT/builder.py#L405 + builder_config = builder.create_builder_config() + builder_config.max_workspace_size = 2 << 60 + builder.max_batch_size = batch_size + + if trt_engine_datatype == trt.DataType.HALF: + builder_config.set_flag(trt.BuilderFlag.FP16) + elif trt_engine_datatype == trt.DataType.INT8: + builder_config.set_flag(trt.BuilderFlag.INT8) + + with open(onnx_model_path, "rb") as model: + # parse onnx model + parser.parse(model.read()) + for i in range(parser.num_errors): + print(parser.get_error(i)) + engine = builder.build_engine(network, builder_config) + if engine is None: + print("[ERROR] engine is None") + exit(-1) + return engine + except Exception as e: + print(e.with_traceback()) + + +def save_engine(engine, engine_dest_path): + buf = engine.serialize() + with open(engine_dest_path, "wb") as f: + f.write(buf) + + +def load_engine(trt_runtime, engine_path): + with open(engine_path, "rb") as f: + engine_data = f.read() + engine = trt_runtime.deserialize_cuda_engine(engine_data) + return engine + + +def load_plugin(trt_plugins): + libs = [] + for trt_plugin in trt_plugins: + assert os.path.isfile(trt_plugin) + lib = ctypes.CDLL(trt_plugin, winmode=0) + libs.append(lib) + return libs + + +def remove_plugin(libs): + for lib in libs: + _unload_lib(lib) + + +def _unload_lib(lib): + del lib From 2fea9e28f5b8b4b1e20d29d81d02587b68ef1634 Mon Sep 17 00:00:00 2001 From: Civitasv Date: Fri, 11 Aug 2023 11:36:18 +0800 Subject: [PATCH 03/14] [tensorrt] [byoc] [plugin] make cpp side api clearer --- python/tvm/relay/backend/executor_factory.py | 2 + python/tvm/relay/build_module.py | 5 +- python/tvm/tpat/cuda/plugin/Makefile | 6 +-- src/driver/driver_api.cc | 5 -- src/relay/backend/aot_executor_codegen.cc | 16 ------ src/relay/backend/graph_executor_codegen.cc | 18 +++---- src/relay/backend/utils.h | 1 - src/runtime/cuda/cuda_module.cc | 15 +++--- src/runtime/graph_executor/graph_executor.h | 1 + .../transforms/lower_device_kernel_launch.cc | 53 ++++++++++--------- src/tir/transforms/make_packed_api.cc | 9 +--- src/tir/transforms/split_host_device.cc | 19 ++----- 12 files changed, 57 insertions(+), 93 deletions(-) diff --git a/python/tvm/relay/backend/executor_factory.py b/python/tvm/relay/backend/executor_factory.py index 9eafcc2cfb93..bc1abfe2ca31 100644 --- a/python/tvm/relay/backend/executor_factory.py +++ b/python/tvm/relay/backend/executor_factory.py @@ -200,6 +200,8 @@ def __init__( self.iter_cnt = 0 self.function_metadata = function_metadata + print("SELF MODULE :::", dir(self.module)) + self.constant_params = constant_params self.device_funcs_list_func = get_global_func("tir.transform.retrieve_device_funcs_list") self.device_memory_size_func = get_global_func("tir.transform.retrieve_device_memory_size") diff --git a/python/tvm/relay/build_module.py b/python/tvm/relay/build_module.py index 33783a74315a..1621255d3df2 100644 --- a/python/tvm/relay/build_module.py +++ b/python/tvm/relay/build_module.py @@ -70,7 +70,7 @@ def __init__(self): self._get_executor_codegen_metadata = 
self.mod["get_executor_codegen_metadata"] self._get_devices = self.mod["get_devices"] self._get_irmodule = self.mod["get_irmodule"] - self._get_constant_params_func = self.mod["get_constant_params"] + self._get_constant_params = self.mod["get_constant_params"] def build( self, @@ -251,7 +251,8 @@ def get_params(self): return ret def get_constant_params(self): - params = self._get_constant_params_func() + """Return the constant params.""" + params = self._get_constant_params() ret = {} for key, value in params.items(): ret[key] = value.data.asnumpy() diff --git a/python/tvm/tpat/cuda/plugin/Makefile b/python/tvm/tpat/cuda/plugin/Makefile index f9b48ffcf27d..d90f15f1bd77 100644 --- a/python/tvm/tpat/cuda/plugin/Makefile +++ b/python/tvm/tpat/cuda/plugin/Makefile @@ -14,9 +14,9 @@ # limitations under the License. # -CUDA_PATH = /home/huangzhe1/anaconda3/envs/tvm_tunning -CUDNN_PATH = /home/huangzhe1/husen/cudnn-linux-x86_64-8.9.3.28_cuda11-archive -TRT_PATH = /home/huangzhe1/husen/TensorRT-8.6.1.6 +CUDA_PATH = /path/to/cuda +CUDNN_PATH = /path/to/cudnn +TRT_PATH = /path/to/TensorRT CUDA_INC_PATH = $(CUDA_PATH)/include CUDA_LIB_PATH = $(CUDA_PATH)/lib diff --git a/src/driver/driver_api.cc b/src/driver/driver_api.cc index 7a27bbddddfe..b7ba0ffe4468 100644 --- a/src/driver/driver_api.cc +++ b/src/driver/driver_api.cc @@ -601,8 +601,6 @@ transform::Sequential MixedModulePassManager(IRModule mixed_mod, Target target) } mixed_pass_list.push_back(tir::transform::AnnotateDeviceRegions()); - - // std::cout << "@1. SplitHostDevice" << '\n'; mixed_pass_list.push_back(tir::transform::SplitHostDevice()); bool unpacked_api = mixed_mod->GetAttr(tvm::attr::kExecutor) @@ -610,16 +608,13 @@ transform::Sequential MixedModulePassManager(IRModule mixed_mod, Target target) ->GetAttr("unpacked-api") .value_or(Bool(false)); if (unpacked_api) { - // std::cout << "@2.1 UNMakePackedAPI" << '\n'; mixed_pass_list.push_back(tir::transform::MakeUnpackedAPI()); } else { - // std::cout << "@2.2 MakePackedAPI" << '\n'; mixed_pass_list.push_back(tir::transform::MakePackedAPI()); } mixed_pass_list.push_back(tir::transform::FP8StorageLegalize()); mixed_pass_list.push_back(tir::transform::BF16StorageLegalize()); - // std::cout << "@3. LowerDevice" << '\n'; mixed_pass_list.push_back(tir::transform::LowerDeviceKernelLaunch()); return transform::Sequential(mixed_pass_list); diff --git a/src/relay/backend/aot_executor_codegen.cc b/src/relay/backend/aot_executor_codegen.cc index ade89e544a52..ee4e98b4b22e 100644 --- a/src/relay/backend/aot_executor_codegen.cc +++ b/src/relay/backend/aot_executor_codegen.cc @@ -1228,22 +1228,17 @@ class AOTExecutorCodegen : public MixedModeVisitor { // Collect any constants extracted by external codegen. ret.params = std::unordered_map(); - ret.params_for_tpat = std::unordered_map>(); Map const_name_to_constant = lowered_mod->GetAttr>(tvm::attr::kConstNameToConstant) .value_or({}); for (const auto& kv : const_name_to_constant) { ICHECK(ret.params.emplace(kv.first, kv.second).second); - ret.params_for_tpat.emplace(std::make_pair( - kv.first, std::make_pair(static_cast(param_storage_ids_[kv.first]), kv.second))); } // Collect any constants extracted during lowering. 
for (const auto& kv : params_) { ICHECK(ret.params.emplace(kv.first, kv.second).second); - ret.params_for_tpat.emplace(std::make_pair( - kv.first, std::make_pair(static_cast(param_storage_ids_[kv.first]), kv.second))); } // AoT Executor codegen works completely on TIR beyond this point, hence removing relay main @@ -1393,11 +1388,6 @@ class AOTExecutorCodegenModule : public runtime::ModuleNode { String key = args[0]; *rv = get_param_by_name(key); }); - } else if (name == "get_param_id") { - return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { - String key = args[0]; - *rv = get_param_id(key); - }); } else if (name == "get_irmodule") { return PackedFunc( [sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { *rv = get_irmodule(); }); @@ -1447,12 +1437,6 @@ class AOTExecutorCodegenModule : public runtime::ModuleNode { Array get_external_modules() { return output_.external_mods; } - int get_param_id(String key) { - auto it = this->output_.params_for_tpat.find(key); - CHECK(it != this->output_.params_for_tpat.end()) << "no such parameter " << key; - return (*it).second.first; - } - Map get_irmodule() { return this->output_.lowered_funcs; } std::shared_ptr codegen_; diff --git a/src/relay/backend/graph_executor_codegen.cc b/src/relay/backend/graph_executor_codegen.cc index 180a6273a803..15c62d7f8fae 100644 --- a/src/relay/backend/graph_executor_codegen.cc +++ b/src/relay/backend/graph_executor_codegen.cc @@ -266,7 +266,6 @@ class GraphExecutorCodegen : public backend::MemoizedExprTranslator(); - ret.params_for_tpat = std::unordered_map>(); Map const_name_to_constant = lowered_mod->GetAttr>(tvm::attr::kConstNameToConstant) @@ -274,18 +273,12 @@ class GraphExecutorCodegen : public backend::MemoizedExprTranslator(param_storage_ids_[kv.first]), kv.second))); } // Collect any constants extracted during lowering. for (const auto& kv : params_) { VLOG(1) << "constant '" << kv.first << "' contributed by TECompiler"; ICHECK(ret.params.emplace(kv.first, kv.second).second); - ret.params_for_tpat.emplace(std::make_pair( - kv.first, - std::make_pair(static_cast(param_storage_ids_[kv.first]), kv.second))); } ret.function_metadata = std::move(function_metadata_); @@ -300,6 +293,10 @@ class GraphExecutorCodegen : public backend::MemoizedExprTranslator param_storage_ids() { + return param_storage_ids_; + } + protected: /*! 
* \brief Add node to graph @@ -674,9 +671,10 @@ class GraphExecutorCodegenModule : public runtime::ModuleNode { } else if (name == "get_param_id") { return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { String key = args[0]; - auto it = this->output_.params_for_tpat.find(key); - CHECK(it != this->output_.params_for_tpat.end()) << "no such parameter " << key; - *rv = (*it).second.first; + auto it = this->output_.params.find(key); + CHECK(it != this->output_.params.end()) << "no such parameter " << key; + auto storage_ids = this->codegen_->param_storage_ids(); + *rv = static_cast(storage_ids[(*it).first]); }); } else if (name == "get_irmodule") { return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { diff --git a/src/relay/backend/utils.h b/src/relay/backend/utils.h index 7b50e70f034b..97b28a021903 100644 --- a/src/relay/backend/utils.h +++ b/src/relay/backend/utils.h @@ -305,7 +305,6 @@ struct LoweredOutput { */ std::unordered_map params; - std::unordered_map> params_for_tpat; ExecutorCodegenMetadata metadata; }; diff --git a/src/runtime/cuda/cuda_module.cc b/src/runtime/cuda/cuda_module.cc index 5d0a98b4b54a..fdd070791544 100644 --- a/src/runtime/cuda/cuda_module.cc +++ b/src/runtime/cuda/cuda_module.cc @@ -28,12 +28,11 @@ #include #include +#include #include #include #include -#include - #include "../file_utils.h" #include "../meta_data.h" #include "../pack_args.h" @@ -43,7 +42,8 @@ namespace tvm { namespace runtime { -std::vector device_funcs_thread_config; +// funcs thread config +std::vector funcs_thread_config; // Module to support thread-safe multi-GPU execution. // cuModule is a per-GPU module @@ -210,12 +210,11 @@ class CUDAWrappedFunc { LOG(FATAL) << os.str(); } else { std::stringstream ss; - ss << func_name_ - << " grid=(" << wl.grid_dim(0) << "," << wl.grid_dim(1) << "," + ss << func_name_ << " grid=(" << wl.grid_dim(0) << "," << wl.grid_dim(1) << "," << wl.grid_dim(2) << ") " << " block=(" << wl.block_dim(0) << "," << wl.block_dim(1) << "," << wl.block_dim(2) << ")\n"; - device_funcs_thread_config.push_back(ss.str()); + funcs_thread_config.push_back(ss.str()); } } @@ -275,7 +274,7 @@ PackedFunc CUDAModuleNode::GetFunction(const String& name, const ObjectPtr fmap, std::string cuda_source) { - device_funcs_thread_config.clear(); + funcs_thread_config.clear(); auto n = make_object(data, fmt, fmap, cuda_source); return Module(n); } @@ -304,7 +303,7 @@ Module CUDAModuleLoadBinary(void* strm) { String CUDAModuleGetGridBlockThreadConfig() { String ret = ""; - for (auto func_config : device_funcs_thread_config) { + for (const String& func_config : funcs_thread_config) { ret = ret + func_config; } return ret; diff --git a/src/runtime/graph_executor/graph_executor.h b/src/runtime/graph_executor/graph_executor.h index 8bbabd1c9c72..40731c303816 100644 --- a/src/runtime/graph_executor/graph_executor.h +++ b/src/runtime/graph_executor/graph_executor.h @@ -422,6 +422,7 @@ class TVM_DLL GraphExecutor : public ModuleNode { String GetWorkspaceSize(); /*! \brief get the exec func in order*/ String GetFuncList(); + /*! \brief get storage ids*/ String GetStorageId(); int GetOutputEid(int index) const; /*! \brief PackedFunc to lookup a linked paramter from a local Module. 
*/ diff --git a/src/tir/transforms/lower_device_kernel_launch.cc b/src/tir/transforms/lower_device_kernel_launch.cc index 76e1e69f7444..b8a0ef240bc4 100644 --- a/src/tir/transforms/lower_device_kernel_launch.cc +++ b/src/tir/transforms/lower_device_kernel_launch.cc @@ -28,6 +28,7 @@ #include #include #include + #include #include "../../runtime/thread_storage_scope.h" @@ -36,7 +37,7 @@ namespace tvm { namespace tir { extern std::unordered_map > host_name_to_param; -extern std::unordered_map curr2prev; +extern std::unordered_map name_to_prefix; std::vector device_funcs; std::vector device_memory_size; @@ -125,13 +126,7 @@ class DeviceInfoCollector : public StmtVisitor { } void VisitStmt_(const AllocateNode* op) final { - std::ostringstream os; - os << op->buffer_var.get() << " " << op->dtype << " "; - for (auto extent : op->extents) { - os << extent << " "; - } - os << "\n"; - device_memory_size.push_back(os.str()); + ResolveDeviceMemorySize(op); auto storage_scope = runtime::StorageScope::Create(GetPtrStorageScope(op->buffer_var)); if (storage_scope.rank == runtime::StorageRank::kShared && storage_scope.tag == ".dyn") { @@ -149,6 +144,16 @@ class DeviceInfoCollector : public StmtVisitor { StmtVisitor::VisitStmt_(op); } + void ResolveDeviceMemorySize(const AllocateNode* op) { + std::stringstream ss; + ss << op->buffer_var.get() << " " << op->dtype << " "; + for (auto extent : op->extents) { + ss << extent << " "; + } + ss << "\n"; + device_memory_size.push_back(ss.str()); + } + // The collected results KernelInfo info_; // recording what thread axis have been visited. @@ -311,23 +316,30 @@ class DeviceKernelMutator : public StmtExprMutator { device_kernel_launch_.insert(gvar); Array call_args; - Array cuda_kernel_args; call_args.push_back(StringImm(dev_info.global_symbol)); for (PrimExpr arg : node->args) { call_args.push_back(arg); - cuda_kernel_args.push_back(arg); } for (const auto& launch_arg : dev_info.launch_args) { call_args.push_back(Substitute(launch_arg, param_map)); } + + ResolveDeviceFuncs(gvar->name_hint, node->args); + + auto dtype = node->dtype.is_void() ? DataType::Int(32) : node->dtype; + + return Call(dtype, builtin::tvm_call_packed(), call_args); + } + + void ResolveDeviceFuncs(const String& name_hint, const Array& args) { std::stringstream ss; - ss << gvar->name_hint << " "; - for (auto arg : cuda_kernel_args) { + ss << name_hint << " "; + for (auto arg : args) { bool find_param_in_host = false; - for (int i = 0; i < host_name_to_param[curr2prev[gvar->name_hint]].size(); ++i) { - if (arg.same_as(host_name_to_param[curr2prev[gvar->name_hint]][i])) { - ss << i << " "; + for (int i = 0; i < host_name_to_param[name_to_prefix[name_hint]].size(); ++i) { + if (arg.same_as(host_name_to_param[name_to_prefix[name_hint]][i])) { + ss << i << " "; find_param_in_host = true; } } @@ -338,16 +350,6 @@ class DeviceKernelMutator : public StmtExprMutator { } ss << "\n"; device_funcs.push_back(ss.str()); - - // std::cout << "3. Lower device kernel" << '\n'; - // for (auto& item: device_funcs) { - // std::cout << item << ", "; - // } - // std::cout << '\n'; - - auto dtype = node->dtype.is_void() ? 
DataType::Int(32) : node->dtype; - - return Call(dtype, builtin::tvm_call_packed(), call_args); } Optional current_target_; @@ -429,7 +431,6 @@ Pass LowerDeviceKernelLaunch() { TVM_REGISTER_GLOBAL("tir.transform.LowerDeviceKernelLaunch") .set_body_typed(LowerDeviceKernelLaunch); - TVM_REGISTER_GLOBAL("tir.transform.retrieve_device_funcs_list") .set_body([](TVMArgs args, TVMRetValue* rv) { *rv = GetDeviceFuncsList(); }); diff --git a/src/tir/transforms/make_packed_api.cc b/src/tir/transforms/make_packed_api.cc index f51b079a2ff9..6e7f597a9583 100644 --- a/src/tir/transforms/make_packed_api.cc +++ b/src/tir/transforms/make_packed_api.cc @@ -321,12 +321,6 @@ PrimFunc MakePackedAPI(PrimFunc func) { host_name_to_param[name_hint] = cur_func_param; - // std::cout << "2.2. IN MAKE_PACKED_API, NAME HINT: " << name_hint << " : " << '\n'; - // for (auto& item: cur_func_param) { - // std::cout << ">>> " << item << ", "; - // } - // std::cout << "=====================\n\n\n"; - Array args{v_packed_args, buf_packed_arg_type_ids->data, v_num_packed_args, v_out_ret_value, v_out_ret_tcode, v_resource_handle}; @@ -397,6 +391,8 @@ namespace transform { Pass MakePackedAPI() { auto pass_func = [](IRModule mod, PassContext ctx) { + host_name_to_param.clear(); + Map packed_func_methods; for (const auto& [gvar, base_func] : mod->functions) { if (auto opt = base_func.as()) { @@ -409,7 +405,6 @@ Pass MakePackedAPI() { IRModuleNode* mptr = mod.CopyOnWrite(); IRModule updates; - host_name_to_param.clear(); for (const auto& [gvar, base_func] : mptr->functions) { if (auto opt = base_func.as()) { diff --git a/src/tir/transforms/split_host_device.cc b/src/tir/transforms/split_host_device.cc index a1788758718c..d79e30520b94 100644 --- a/src/tir/transforms/split_host_device.cc +++ b/src/tir/transforms/split_host_device.cc @@ -32,8 +32,6 @@ #include #include -#include -#include #include #include "../../runtime/thread_storage_scope.h" @@ -43,8 +41,7 @@ namespace tvm { namespace tir { -extern std::unordered_map > host_name_to_param; -std::unordered_map curr2prev; +std::unordered_map name_to_prefix; class HostDeviceSplitter : public StmtMutator { public: @@ -98,7 +95,7 @@ class HostDeviceSplitter : public StmtMutator { GlobalVar kernel_symbol_global = var_supply_(); - curr2prev[kernel_symbol_global->name_hint] = name_prefix_; + name_to_prefix[kernel_symbol_global->name_hint] = name_prefix_; PrimFunc device_func(params, body, kernel_ret_type); device_func = WithAttrs(std::move(device_func), {{tvm::attr::kTarget, device_target}, @@ -108,15 +105,6 @@ class HostDeviceSplitter : public StmtMutator { (*device_mod_)->Add(kernel_symbol_global, device_func); Array args = params.Map([](const Var& var) -> PrimExpr { return var; }); - // std::cout << "1. 
IN SPLIT HOST DEVICE: " << '\n'; - // for (auto& entry : host_name_to_param) { - // std::cout << ">>> NAME HINT: " << entry.first << " : " << '\n'; - // for (auto& item : entry.second) { - // std::cout << ">>> " << item << ", "; - // } - // } - // std::cout << "=========================\n\n\n"; - if (can_propagate_errors) { Var kernel_error_code("kernel_error_code", success->dtype); Call kernel_call(success->dtype, kernel_symbol_global, args); @@ -134,6 +122,7 @@ class HostDeviceSplitter : public StmtMutator { IRModule* device_mod_; // Generate new GlobalVar for the kernel std::function var_supply_; + // name prefix of function std::string name_prefix_; }; @@ -157,7 +146,7 @@ Pass SplitHostDevice() { IRModule device_mod = IRModule(Map({})); IRModule updates = IRModule(Map({})); - curr2prev.clear(); + name_to_prefix.clear(); for (const auto& [gvar, base_func] : mod->functions) { if (auto opt = base_func.as()) { From cb1c86ccd9f65420a08d143ecacaf2349c763b05 Mon Sep 17 00:00:00 2001 From: Civitasv Date: Fri, 11 Aug 2023 14:50:09 +0800 Subject: [PATCH 04/14] [tensorrt] [byoc] [plugin] Allow users to specify tunning option --- python/tvm/contrib/graph_executor.py | 4 +- python/tvm/relay/backend/executor_factory.py | 14 +- python/tvm/tpat/cuda/kernel.py | 70 +++- python/tvm/tpat/cuda/pipeline.py | 8 +- python/tvm/tpat/cuda/plugin/Makefile | 16 +- python/tvm/tpat/cuda/template.py | 57 +-- python/tvm/tpat/cuda/template_params.py | 364 +++++++++--------- src/runtime/cuda/cuda_module.cc | 6 +- src/runtime/graph_executor/graph_executor.cc | 2 +- .../transforms/lower_device_kernel_launch.cc | 14 +- src/tir/transforms/make_packed_api.cc | 12 +- tests/python/tpat/cuda/common.py | 12 +- 12 files changed, 309 insertions(+), 270 deletions(-) diff --git a/python/tvm/contrib/graph_executor.py b/python/tvm/contrib/graph_executor.py index 25a5cc46aa8d..ea8a402900a2 100644 --- a/python/tvm/contrib/graph_executor.py +++ b/python/tvm/contrib/graph_executor.py @@ -180,7 +180,7 @@ def __init__(self, module): self._get_workspace_dtype = module["get_workspace_dtype"] self._get_workspace_size = module["get_workspace_size"] - self._get_func_inorder = module["get_func_inorder"] + self._get_func_list = module["get_func_list"] self._get_storageid = module["get_storageid"] self._get_output_eid = module["get_output_eid"] @@ -547,7 +547,7 @@ def get_func_inorder(self): dtype : str The Host function execute order """ - return self._get_func_inorder() + return self._get_func_list() def get_storageid(self): return self._get_storageid() diff --git a/python/tvm/relay/backend/executor_factory.py b/python/tvm/relay/backend/executor_factory.py index bc1abfe2ca31..9095ae8e59d5 100644 --- a/python/tvm/relay/backend/executor_factory.py +++ b/python/tvm/relay/backend/executor_factory.py @@ -200,12 +200,10 @@ def __init__( self.iter_cnt = 0 self.function_metadata = function_metadata - print("SELF MODULE :::", dir(self.module)) - self.constant_params = constant_params - self.device_funcs_list_func = get_global_func("tir.transform.retrieve_device_funcs_list") - self.device_memory_size_func = get_global_func("tir.transform.retrieve_device_memory_size") - self.grid_block_thread_config_func = get_global_func("runtime.module.retrieve_grid_block_thread_config") + self.device_function_list = get_global_func("tir.transform.retrieve_device_function_list") + self.device_function_thread_config = get_global_func("runtime.module.retrieve_device_function_thread_config") + self.device_memory_size = 
get_global_func("tir.transform.retrieve_device_memory_size") def export_library(self, file_name, fcompile=None, addons=None, **kwargs): @@ -230,10 +228,10 @@ def get_constant_params(self): return self.constant_params def get_device_function_list(self): - return self.device_funcs_list_func() + return self.device_function_list() def get_grid_block_thread_config(self): - return self.grid_block_thread_config_func() + return self.device_function_thread_config() def get_device_memory_size(self): - return self.device_memory_size_func() \ No newline at end of file + return self.device_memory_size() \ No newline at end of file diff --git a/python/tvm/tpat/cuda/kernel.py b/python/tvm/tpat/cuda/kernel.py index b9a543acb33d..c37dcd01a57d 100644 --- a/python/tvm/tpat/cuda/kernel.py +++ b/python/tvm/tpat/cuda/kernel.py @@ -23,16 +23,17 @@ class Config(object): - def __init__(self, onnx_model, input_shapes, target, work_dir) -> None: + def __init__(self, onnx_model, input_shapes, target, tunning_option) -> None: self.onnx_model = onnx_model self.input_shapes = input_shapes - self.work_dir = work_dir + self.tunning_option = tunning_option + self.work_dir = tunning_option["work_dir"] if tunning_option["work_dir"] else "./log_db" if target == "gpu": self.target = self._detect_cuda_target() def tune_option(self): - return { + default = { "target": self.target, "builder": ms.builder.LocalBuilder(), "runner": ms.runner.LocalRunner(), @@ -41,6 +42,9 @@ def tune_option(self): "work_dir": self.work_dir, } + default.update(self.tunning_option) + return default + def _detect_cuda_target(self): dev = tvm.cuda() if not dev.exist: @@ -59,10 +63,10 @@ def _detect_cuda_target(self): class Kernel(object): - def __init__(self, name, onnx_model, input_shapes, enable_tunning, work_dir): + def __init__(self, name, onnx_model, input_shapes, enable_tunning, tunning_option): self._name = name self._enable_tunning = enable_tunning - self._config = Config(onnx_model, input_shapes, "gpu", work_dir) + self._config = Config(onnx_model, input_shapes, "gpu", tunning_option) self._lib = None self._module = None @@ -113,6 +117,14 @@ def run(self): self._lib = None self._module = None + @property + def build_module(self): + return self._lib + + @property + def graph_module(self): + return self._module + @property def cuda_source_code(self): """Return source code of this kernel. @@ -136,51 +148,75 @@ def cuda_source_code(self): return source_code @property - def runtime_module(self): - return self._lib - - @property - def graph_module(self): - return self._module + def constant_params(self): + """Get constant params of the built module. - @property - def constant_param(self): + It's a map, whose key is the storage id of param, + value is the numpy data of param. + """ return self._lib.get_constant_params() if self._lib else None @property - def device_funcs_inorder(self): + def device_function_list(self): + """Get a list of functions which will executed by device. + + The format is: param1 param2 ... paramn. + + If param is in constant params list, it will be an address, + or it will be an index which indicates the order of it. + """ return self._lib.get_device_function_list() if self._lib else None @property - def device_funcs_thread_config(self): + def device_function_thread_config(self): + """Get block and grid dim config for kernel functions. + + The format is: grid=(x, y, z) block=(x, y, z). 
+ """ return self._lib.get_grid_block_thread_config() if self._lib else None @property - def device_allocate_global_memory(self): + def device_allocate_memory_size(self): + """Get allocate memory for kernel functions. + + The format is: + """ return self._lib.get_device_memory_size() if self._lib else None @property def num_inputs(self): + """Get input number of node.""" return self._module.get_num_inputs() if self._module else None @property def num_outputs(self): + """Get output number of node.""" return self._module.get_num_outputs() if self._module else None @property def workspace_dtype(self): + """Get dtype of inputs and outputs. + + You can use dtype.split()[eid] to get workspace type of specific entry id. + """ return self._module.get_workspace_dtype() if self._module else None @property def workspace_size(self): + """Get size of inputs and outputs. + + You can use size.split()[eid] to get workspace size of specific entry id. + """ return self._module.get_workspace_size() if self._module else None @property - def func_inorder(self): + def host_function_list(self): + """Get host function list.""" return self._module.get_func_inorder() if self._module else None @property def storageid(self): + """Get storage id.""" return self._module.get_storageid() if self._module else None @property diff --git a/python/tvm/tpat/cuda/pipeline.py b/python/tvm/tpat/cuda/pipeline.py index 5e1d112626df..8302fd0cb769 100644 --- a/python/tvm/tpat/cuda/pipeline.py +++ b/python/tvm/tpat/cuda/pipeline.py @@ -59,7 +59,7 @@ def _extract_target_onnx_node(model, tunning_node): def pipeline( - onnx_file: str, node_names: list[str], enable_tunning: bool, work_dir: str, output_onnx: str + onnx_file: str, node_names: list[str], enable_tunning: bool, tunning_option: object, output_onnx: str ) -> Tuple[str, list[str]]: """Generate plugins for specified nodes in an ONNX model. @@ -73,8 +73,8 @@ def pipeline( Names of the nodes to be generated as TensorRT plugins. enable_tunning : bool Flag indicating whether tunning is enabled. - work_dir : str - Path to the tunning log file where the records will be saved. + tunning_option : object + Tunning option provided for ms.relay_integration.tune_relay, you don't need to specify mod, params and target. output_onnx : str Path to the output ONNX file where the modified model will be saved. @@ -106,7 +106,7 @@ def pipeline( subgraph, submodel, shapes = _extract_target_onnx_node(inferred_model, node) - kernel = Kernel(plugin_name, submodel, shapes, enable_tunning, work_dir) + kernel = Kernel(plugin_name, submodel, shapes, enable_tunning, tunning_option) kernel.run() ## 3.1 fill in template diff --git a/python/tvm/tpat/cuda/plugin/Makefile b/python/tvm/tpat/cuda/plugin/Makefile index d90f15f1bd77..3406001e81dc 100644 --- a/python/tvm/tpat/cuda/plugin/Makefile +++ b/python/tvm/tpat/cuda/plugin/Makefile @@ -14,9 +14,15 @@ # limitations under the License. 
# -CUDA_PATH = /path/to/cuda -CUDNN_PATH = /path/to/cudnn -TRT_PATH = /path/to/TensorRT +# Variables need to be defined by Users +# CUDA_PATH = /path/to/cuda +# CUDNN_PATH = /path/to/cudnn +# TRT_PATH = /path/to/TensorRT +CUDA_PATH = /home/huangzhe1/anaconda3/envs/tvm_tunning +CUDNN_PATH = /home/huangzhe1/husen/cudnn-linux-x86_64-8.9.3.28_cuda11-archive +TRT_PATH = /home/huangzhe1/husen/TensorRT-8.6.1.6 +ARCH = sm_86 +######################################## CUDA_INC_PATH = $(CUDA_PATH)/include CUDA_LIB_PATH = $(CUDA_PATH)/lib @@ -28,13 +34,9 @@ CUDNN_LIB_PATH = $(CUDNN_PATH)/lib TRT_INC_PATH = $(TRT_PATH)/include TRT_LIB_PATH = $(TRT_PATH)/lib - -ARCH = sm_86 GCC = g++ NVCC = $(CUDA_PATH)/bin/nvcc -# CCFLAGS = -g -std=c++11 -DNDEBUG CCFLAGS = -w -std=c++11 -# CCFLAGS+= -DDEBUG_ME INCLUDES := -I. -I$(CUDA_COM_PATH) -I$(CUDA_INC_PATH) -I$(CUDNN_INC_PATH) -I$(TRT_INC_PATH) -I/usr/include LDFLAGS := -L$(CUDA_LIB_PATH) -L$(CUDNN_LIB_PATH) -L$(TRT_LIB_PATH) diff --git a/python/tvm/tpat/cuda/template.py b/python/tvm/tpat/cuda/template.py index df02e9f0b7d9..c31997475450 100644 --- a/python/tvm/tpat/cuda/template.py +++ b/python/tvm/tpat/cuda/template.py @@ -43,38 +43,39 @@ def rm_part_define(source_code): class PluginTemplate(object): def __init__(self, template_params): - self._template_params = template_params - self._plugin_name = template_params.plugin_name - self._plugin_config = template_params.plugin_config - with pushd(os.path.normpath(os.path.dirname(__file__))): template_loader = FileSystemLoader(searchpath='./') self._template_env = Environment(loader=template_loader) + self._plugin_name = template_params.plugin_name + self._plugin_device_function_configuration = template_params.device_function_configuration self._plugin_output_number = template_params.output_num self._plugin_output_type = template_params.output_type self._plugin_workspace_size = template_params.workspace_size self._plugin_total_workspace_size = template_params.total_workspace_size + self._plugin_variable_input_index = template_params.onnx_variable_input_index + self._plugin_kernels_body = template_params.cuda_source_code + self._onnx_input_python_type = template_params.onnx_input_python_type + self._onnx_output_python_type = template_params.onnx_output_python_type + self._input_workspace_size = template_params.input_workspace_size + self._output_workspace_size = template_params.output_workspace_size + onnx_output_shape = template_params.output_shape - onnx_input_shape = template_params.input_shape self._plugin_output_shape = self.parse_plugin_output_shape(onnx_output_shape) + + onnx_input_shape = template_params.input_shape self._plugin_input_shape = self.parse_plugin_input_shape(onnx_input_shape) - self._plugin_tensor_input_index = template_params.onnx_tensor_input_index + + onnx_tensor_type = template_params.tensor_type self._plugin_tensor_format = self.parse_plugin_tensor_format(onnx_tensor_type) - kernel_order = template_params.kernel_order - workspace_init = template_params.workspace_init + + kernel_order = template_params.device_function_order self._plugin_kernels_params = self.parse_plugin_kernels_params(kernel_order) - self._plugin_constant_init = self.parse_plugin_workspace_init(workspace_init) - self._plugin_kernels_body = template_params.cuda_source_code - self._onnx_input_python_type = template_params.onnx_input_python_type - self._onnx_output_python_type = template_params.onnx_output_python_type - self._input_workspace_size = template_params.input_workspace_size - self._output_workspace_size = 
template_params.output_workspace_size - @property - def plugin_name(self): - return self._plugin_name + workspace_constant = template_params.workspace_constant + self._plugin_constant_init = self.parse_plugin_workspace_constant(workspace_constant) + class TensorDims: def __init__(self, nbdims, shape): @@ -188,16 +189,16 @@ def parse_plugin_kernels_params(self, kernel_order): plugin_kernels_params.append( self.Kernel( func_name, - self._plugin_config[key_name]["grid_dim"], - self._plugin_config[key_name]["block_dim"], - self._plugin_config[key_name]["enqueue_params"], + self._plugin_device_function_configuration[key_name]["grid_dim"], + self._plugin_device_function_configuration[key_name]["block_dim"], + self._plugin_device_function_configuration[key_name]["enqueue_params"], ) ) return plugin_kernels_params - def parse_plugin_workspace_init(self, workspace_init): + def parse_plugin_workspace_constant(self, workspace_constant): plugin_constant_init = [] - for init_constant in workspace_init.items(): + for init_constant in workspace_constant.items(): value_str = ", ".join(str(ele) for ele in init_constant[1][0]) value_str = value_str.strip(",") plugin_constant_init.append( @@ -218,8 +219,8 @@ def generate_source_file(self): raise Exception("not implement method") def fill(self): - plugin_header_path = f"./plugin/src/{self.plugin_name}.h" - plugin_source_path = f"./plugin/src/{self.plugin_name}.cu" + plugin_header_path = f"./plugin/src/{self._plugin_name}.h" + plugin_source_path = f"./plugin/src/{self._plugin_name}.cu" if os.path.isfile(plugin_header_path): os.remove(plugin_header_path) if os.path.isfile(plugin_source_path): @@ -229,14 +230,14 @@ def fill(self): self.generate_header_file() self.generate_source_file() self.build_plugin() - - return f"{os.path.dirname(os.path.abspath(__file__))}/plugin/lib/{self.plugin_name}.so" + + return f"{os.path.dirname(os.path.abspath(__file__))}/plugin/lib/{self._plugin_name}.so" def build_plugin(self): os.chdir("./plugin") - os.system(f"make clean plugin_name={self.plugin_name}") - os.system(f"make plugin_name={self.plugin_name}") + os.system(f"make clean plugin_name={self._plugin_name}") + os.system(f"make plugin_name={self._plugin_name}") os.chdir("../") diff --git a/python/tvm/tpat/cuda/template_params.py b/python/tvm/tpat/cuda/template_params.py index 8cec8e48e794..2eda53dbd46d 100644 --- a/python/tvm/tpat/cuda/template_params.py +++ b/python/tvm/tpat/cuda/template_params.py @@ -39,90 +39,93 @@ def __init__(self, kernel, model, graph, tunning_node, name): self._tunning_name = name self._tunning_node = tunning_node - self._onnx_input_order = [] self._input_dict = {} - self._tvm_executor_order = {} self._allocate_size = [] self._data_type = [] - self._cuda_kernel_order = {} - self._gpu_thread_config = {} - self._tvm_func_order = [] + + self._device_function_list = {} + self._device_thread_config = {} + self._device_function_order = [] + self._device_allocate_memory_size = {} + + self._host_function_list = {} + self._host_function_order = [] + self._nums_input = 0 self._nums_output = 0 self._workspace_size = 0 self._output_type = [] - self._cuda_func_order = [] - self._tvm_constant = {} + self._constant_params = {} self._tvm_workspace_constant = {} + self._onnx_constant_input_index = [] + self._onnx_variable_input_index = [] + self._onnx_input_shape = [] self._onnx_output_shape = [] - self._onnx_weight_input_index = [] - self._onnx_tensor_input_index = [] self._onnx_tensor_type = [] self._onnx_input_python_type = [] self._onnx_output_python_type = [] + 
self._storage_id = [] - self._allocate_global_memory = {} - self._plugin_config = None + self._device_function_configuration = None - self.infer_for_output_shape() - self.input_weight_and_tensor_index() - self.parse() - self.align_onnx_and_tvm_input() - self.match_address_for_eid() - self.cuda_kernel_config() + self.parse_shape_and_type() + self.parse_input_index() + self.parse_kernel() + self.parse_device_function_inputs() + self.parse_device_function_config() def describe(self): - print(f"Cuda Kernel Order >>> {self._cuda_kernel_order}") - print(f"Gpu Thread Config >>> {self._gpu_thread_config}") - print(f"Cuda Func Rrder >>> {self._cuda_func_order}") + print(f"Constant params >>> {self._constant_params}") + print(f"Device Function List >>> {self._device_function_list}") + print(f"Device Thread Config >>> {self._device_thread_config}") + print(f"Device Function Order >>> {self._device_function_order}") print(f"Nums Input >>> {self._nums_input}") print(f"Nums Output >>> {self._nums_output}") print(f"Data Type >>> {self._data_type}") print(f"Allocate Size >>> {self._allocate_size}") - print(f"Tvm Executor Order >>> {self._tvm_executor_order}") - print(f"Tvm Func Order >>> {self._tvm_func_order}") + print(f"Host Function List >>> {self._host_function_list}") + print(f"Host Function Order >>> {self._host_function_order}") print(f"Cuda Source Code >>> {self._cuda_source_code}") print(f"Storage Id >>> {self._storage_id}") print(f"Storage Slot >>> {self.storage_slot}") - print(f"Allocate Global Memory >>> {self._allocate_global_memory}") + print(f"Device Memory Size >>> {self._device_allocate_memory_size}") print(f"Input Workspace Size >>> {self._input_workspace_size}") print(f"Output Workspace Size >>> {self._output_workspace_size}") - # Parse Constant. - def parse_constant_params(self, constant_params): + def _parse_constant_params(self, constant_params): tvm_constant = {} for key, value in constant_params.items(): tvm_constant[key] = value.flatten() return tvm_constant # Parse device functions params order. - def parse_device_funcs_params(self, device_funcs_inorder): - cuda_kernel_order = {} - for device_func_inorder in device_funcs_inorder: - if len(device_func_inorder) == 0: + def _parse_device_function_list(self, device_function_list): + _device_function_list = {} + for device_function in device_function_list: + if len(device_function) == 0: continue - tvm_device_func = device_func_inorder.split() + item = device_function.split() - cuda_kernel_order[tvm_device_func[0]] = tvm_device_func[1:] - return cuda_kernel_order + _device_function_list[item[0]] = item[1:] + return _device_function_list # Parse device functions thread config. - def parse_device_funcs_thread_config(self, device_funcs_thread_config): - gpu_thread_config = {} - cuda_func_order = [] - for device_func_thread_config in device_funcs_thread_config: - if len(device_func_thread_config) == 0: + def _parse_device_function_thread_config(self, device_function_thread_config): + kernel_thread_config = {} + kernel_order = [] + for item in device_function_thread_config: + if len(item) == 0: continue - config = device_func_thread_config.split() - cuda_func_name = config[0] - gpu_thread_config[cuda_func_name] = config[1:] - cuda_func_order.append(cuda_func_name) - return gpu_thread_config, cuda_func_order + config = item.split() + kernel_name = config[0] + kernel_thread_config[kernel_name] = config[1:] + kernel_order.append(kernel_name) + return kernel_thread_config, kernel_order # Parse global memory allocated in device side. 
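+ # Each entry has the form "<buffer_var> <dtype> <extent ...>", as emitted by ResolveDeviceMemorySize in lower_device_kernel_launch.cc.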
- def parse_device_allocate_global_memory(self, device_allocate_global_memory): + def _parse_device_allocate_memory_size(self, device_allocate_global_memory): allocate_global_memory = {} for allocate_memory in device_allocate_global_memory: if len(allocate_memory) == 0: @@ -132,7 +135,7 @@ def parse_device_allocate_global_memory(self, device_allocate_global_memory): return allocate_global_memory # Parse variables storage index. - def parse_storageid(self, storageid): + def _parse_storageid(self, storageid): storage_id = [] storage_slot = {} for sid in storageid: @@ -142,87 +145,87 @@ def parse_storageid(self, storageid): storage_slot = {}.fromkeys(sid).keys() return storage_id, storage_slot - # Parse numbers of input. - def parse_nums_input(self, nums_input): - real_nums_input = int(nums_input) - int(len(self._tvm_constant)) + # Parse numbers of input, only variable. + def _parse_nums_input(self, nums_input): + real_nums_input = int(nums_input) - int(len(self._constant_params)) return real_nums_input # Parse numbers of output. - def parse_nums_output(self, nums_output): + def _parse_nums_output(self, nums_output): real_nums_output = int(nums_output) return real_nums_output # Parse datatype of variables in memory. - def parse_workspace_dtype(self, workspaces_dtype): + def _parse_workspace_dtype(self, workspaces_dtype): return workspaces_dtype.split() # Parse size of variables in memory. - def parse_workspace_size(self, workspace_size): + def _parse_workspace_size(self, workspace_size): return workspace_size.split() - def parse_func_inorder(self, funcs_inorder): + def _parse_host_function_list(self, host_function_list): """ - Parse the order of host functions. + Parse the list of host functions. """ func_call = {} - tvm_executor_order = {} - tvm_func_order = [] - for host_func_inorder in funcs_inorder: + host_executor_order = {} + host_func_order = [] + for host_func_inorder in host_function_list: if len(host_func_inorder) == 0: continue tvm_host_func = host_func_inorder.split() - if tvm_host_func[0] not in tvm_executor_order.keys(): - tvm_executor_order[tvm_host_func[0]] = tvm_host_func[1:] - tvm_func_order.append(tvm_host_func[0]) + if tvm_host_func[0] not in host_executor_order.keys(): + host_executor_order[tvm_host_func[0]] = tvm_host_func[1:] + host_func_order.append(tvm_host_func[0]) func_call[tvm_host_func[0]] = 0 else: func_call[tvm_host_func[0]] += 1 func_name = tvm_host_func[0] + "_" + str(func_call[tvm_host_func[0]]) - tvm_executor_order[func_name] = tvm_host_func[1:] - tvm_func_order.append(func_name) - return tvm_executor_order, tvm_func_order - - def parse(self): - constant_params = self._kernel.constant_param - device_funcs_inorder = self._kernel.device_funcs_inorder.split("\n") - device_funcs_thread_config = self._kernel.device_funcs_thread_config.split("\n") - device_allocate_global_memory = self._kernel.device_allocate_global_memory.split("\n") + host_executor_order[func_name] = tvm_host_func[1:] + host_func_order.append(func_name) + return host_executor_order, host_func_order + + def parse_kernel(self): + constant_params = self._kernel.constant_params + device_function_list = self._kernel.device_function_list.split("\n") + device_function_thread_config = self._kernel.device_function_thread_config.split("\n") + device_allocate_memory_size = self._kernel.device_allocate_memory_size.split("\n") num_inputs = self._kernel.num_inputs num_outputs = self._kernel.num_outputs workspace_dtype = self._kernel.workspace_dtype workspace_size = self._kernel.workspace_size - funcs_inorder 
= self._kernel.func_inorder.split("\n") + host_function_list = self._kernel.host_function_list.split("\n") storage_id = self._kernel.storageid.split("\n") - self._tvm_constant = self.parse_constant_params(constant_params) - self._cuda_kernel_order = self.parse_device_funcs_params(device_funcs_inorder) + self._constant_params = self._parse_constant_params(constant_params) + self._device_function_list = self._parse_device_function_list(device_function_list) ( - self._gpu_thread_config, - self._cuda_func_order, - ) = self.parse_device_funcs_thread_config(device_funcs_thread_config) - self._nums_input = self.parse_nums_input(num_inputs) - self._nums_output = self.parse_nums_output(num_outputs) - self._data_type = self.parse_workspace_dtype(workspace_dtype) - self._allocate_size = self.parse_workspace_size(workspace_size) - self._tvm_executor_order, self._tvm_func_order = self.parse_func_inorder(funcs_inorder) + self._device_thread_config, + self._device_function_order, + ) = self._parse_device_function_thread_config(device_function_thread_config) + self._nums_input = self._parse_nums_input(num_inputs) + self._nums_output = self._parse_nums_output(num_outputs) + self._data_type = self._parse_workspace_dtype(workspace_dtype) + self._allocate_size = self._parse_workspace_size(workspace_size) + self._host_function_list, self._host_function_order = self._parse_host_function_list(host_function_list) self._cuda_source_code = self._kernel.cuda_source_code - self._storage_id, self.storage_slot = self.parse_storageid(storage_id) - self._allocate_global_memory = self.parse_device_allocate_global_memory( - device_allocate_global_memory + self._storage_id, self.storage_slot = self._parse_storageid(storage_id) + self._device_allocate_memory_size = self._parse_device_allocate_memory_size( + device_allocate_memory_size ) self._input_workspace_size = self._allocate_size[0 : self._nums_input] self._output_workspace_size = self._allocate_size[-self._nums_output :] self.describe() - def infer_for_output_shape(self): + def parse_shape_and_type(self): """ - Infer for output shape. + Infer for input and output shape. """ tunning_node = self._tunning_node for inp in tunning_node.inputs: - if inp.__class__==gs.Constant or not inp.is_empty(): + if inp.__class__ == gs.Constant or not inp.is_empty(): self._onnx_input_python_type.append(tvm_to_c_type_mapping[inp.dtype.name]) self._onnx_tensor_type.append(python_to_trt_type_mapping[inp.dtype.name]) @@ -234,143 +237,132 @@ def infer_for_output_shape(self): self._onnx_input_shape = [ inp.shape for inp in tunning_node.inputs - if ( - inp.__class__ == gs.Variable - and not inp.is_empty() - ) + if (inp.__class__ == gs.Variable and not inp.is_empty()) ] - def input_weight_and_tensor_index(self): + def parse_input_index(self): """ - Calculate the index of weight input and tensor input. + Calculate the index of variable and constant input. 
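+ Variable inputs are runtime tensors (gs.Variable); constant inputs are gs.Constant tensors or outputs of Constant nodes.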
""" tunning_node = self._tunning_node - self._onnx_tensor_input_index = [ + self._onnx_variable_input_index = [ k for k, inp in enumerate(tunning_node.inputs) if ( inp.__class__ == gs.Variable - and not (len(inp.inputs) == 1 and tunning_node.i(k, 0).op == "Constant") + and not (len(inp.inputs) == 1 and inp.inputs[0].op == "Constant") ) ] - self._onnx_weight_input_index = [ + self._onnx_constant_input_index = [ k for k, inp in enumerate(tunning_node.inputs) if ( inp.__class__ == gs.Constant - or (len(inp.inputs) == 1 and tunning_node.i(k, 0).op == "Constant") + or (len(inp.inputs) == 1 and inp.inputs[0].op == "Constant") ) ] - def align_onnx_and_tvm_input(self): - """ - Align onnx and tvm input. Because tvm let constants in the after of variables params. - """ - model = self._model - graph = model.graph - nodes = graph.node - onnx_inputs = graph.input - - init_order = {} - for node in nodes: - op_inputs = node.input - for i in range(len(op_inputs)): - init_order[op_inputs[i]] = i - - for i in onnx_inputs: - self._onnx_input_order.append(init_order[i.name]) - - def match_address_for_eid(self): + def parse_device_function_inputs(self): """ The memory address used by functions params. """ - workspace = 0 - input_slot_dict = {} + workspace_size = 0 + input_slot_dict = {} # storageid -> xx + + # 1. for outputs for i in range(self._nums_output): + # entry id of output eid = self._kernel.graph_module.get_output_eid(i) idx = int(self._storage_id[eid]) + # resolve output type given entry id self._output_type.append(python_to_trt_type_mapping[self._data_type[eid]]) self._input_dict[str(eid)] = "outputs[" + str(i) + "]" - input_slot_dict[idx] = self._input_dict[str(eid)] + input_slot_dict[idx] = "outputs[" + str(i) + "]" + + # 2. for inputs, including variable and constants + storage_id_to_allocate_size = {} + for eid in range(len(self._allocate_size)): + idx = int(self._storage_id[eid]) + if idx not in storage_id_to_allocate_size.keys(): + storage_id_to_allocate_size[idx] = 0 + storage_id_to_allocate_size[idx] = max(int(self._allocate_size[eid]), int(storage_id_to_allocate_size[idx])) - duplicate_allocate = {} - for i in range(len(self._allocate_size)): - idx = int(self._storage_id[i]) - if idx not in duplicate_allocate.keys(): - duplicate_allocate[idx] = 0 - duplicate_allocate[idx] = max(int(self._allocate_size[i]), int(duplicate_allocate[idx])) - for i in range(len(self._allocate_size)): - idx = int(self._storage_id[i]) + for eid in range(len(self._allocate_size)): + idx = int(self._storage_id[eid]) if idx in input_slot_dict.keys(): - self._input_dict[str(i)] = input_slot_dict[idx] + self._input_dict[str(eid)] = input_slot_dict[idx] continue - if i < self._nums_input: - self._input_dict[str(i)] = "inputs[" + str(self._onnx_input_order[i]) + "]" - elif i < len(self._allocate_size) - self._nums_output: - if i == self._nums_input: - self._input_dict[str(i)] = "workspace" + if eid < self._nums_input: + # it must be variable + self._input_dict[str(eid)] = "inputs[" + str(eid) + "]" + elif eid < len(self._allocate_size) - self._nums_output: + # it must be constant + if eid == self._nums_input: + # the first one + self._input_dict[str(eid)] = "workspace" else: - self._input_dict[str(i)] = "(workspace + " + str(workspace) + ")" - workspace += int(duplicate_allocate[idx]) - self._workspace_size = workspace + self._input_dict[str(eid)] = f"(workspace + {workspace_size})" + workspace_size += int(storage_id_to_allocate_size[idx]) + if ( - self._input_dict[str(i)] not in self._tvm_workspace_constant.keys() - 
and str(idx) in self._tvm_constant.keys() + self._input_dict[str(eid)] not in self._tvm_workspace_constant.keys() + and str(idx) in self._constant_params.keys() ): - # self._tvm_workspace_constant[self._input_dict[str(i)]] = None - self._tvm_workspace_constant[self._input_dict[str(i)]] = ( - self._tvm_constant[str(idx)], - tvm_to_c_type_mapping[self._data_type[i]], - int(i), + self._tvm_workspace_constant[self._input_dict[str(eid)]] = ( + self._constant_params[str(idx)], + tvm_to_c_type_mapping[self._data_type[eid]], + int(eid), ) - input_slot_dict[idx] = self._input_dict[str(i)] + input_slot_dict[idx] = self._input_dict[str(eid)] - if len(self._allocate_global_memory) != 0: - for key, value in self._allocate_global_memory.items(): + if len(self._device_allocate_memory_size) != 0: + for key, value in self._device_allocate_memory_size.items(): self._input_dict[key] = ( - "(" + tvm_to_c_type_mapping[value[0]] + "*)(workspace + " + str(workspace) + ")" + "(" + tvm_to_c_type_mapping[value[0]] + "*)(workspace + " + str(workspace_size) + ")" ) - workspace += int(value[1]) * plugin_type_size[value[0]] - self._workspace_size = workspace + workspace_size += int(value[1]) * plugin_type_size[value[0]] - def cuda_kernel_config(self): + self._workspace_size = workspace_size + + def parse_device_function_config(self): """ - Grid. Block. Thread. size. + Grid, Block Layout, etc. """ output = "" output_json = {} - cuda_func_call = {} - for i in range(len(self._cuda_func_order)): - cuda_func_name = self._cuda_func_order[i] - - func_name = re.sub(r"_kernel_?\d*", "", cuda_func_name, count=1) - if cuda_func_name not in output_json.keys(): - output_json[cuda_func_name] = {} - cuda_func_call[cuda_func_name] = 0 - multi_cuda_func_name = cuda_func_name + kernel_call_times = {} + for i in range(len(self._device_function_order)): + device_funtion_name = self._device_function_order[i] + host_function_name = re.sub(r"_kernel_?\d*", "", device_funtion_name, count=1) + + if device_funtion_name not in output_json.keys(): + output_json[device_funtion_name] = {} + kernel_call_times[device_funtion_name] = 0 + unique_device_function_name = device_funtion_name else: - cuda_func_call[cuda_func_name] += 1 - func_name = func_name + "_" + str(cuda_func_call[cuda_func_name]) - multi_cuda_func_name = cuda_func_name + "_" + str(cuda_func_call[cuda_func_name]) - output_json[multi_cuda_func_name] = {} + kernel_call_times[device_funtion_name] += 1 + host_function_name = host_function_name + "_" + str(kernel_call_times[device_funtion_name]) + unique_device_function_name = device_funtion_name + "_" + str(kernel_call_times[device_funtion_name]) + output_json[unique_device_function_name] = {} - output_json[multi_cuda_func_name]["grid_dim"] = self._gpu_thread_config[cuda_func_name][ + # grid and block dim + output_json[unique_device_function_name]["grid_dim"] = self._device_thread_config[device_funtion_name][ 0 ].strip("grid=") - output_json[multi_cuda_func_name]["block_dim"] = self._gpu_thread_config[ - cuda_func_name + output_json[unique_device_function_name]["block_dim"] = self._device_thread_config[ + device_funtion_name ][1].strip("block=") - output += cuda_func_name + "\n" + str(self._gpu_thread_config[cuda_func_name]) + "\n" - kernel_param_order = self._cuda_kernel_order[cuda_func_name] - tvm_param_order = self._tvm_executor_order[func_name] + output += device_funtion_name + "\n" + str(self._device_thread_config[device_funtion_name]) + "\n" + + device_param_order = self._device_function_list[device_funtion_name] + host_param_order = 
self._host_function_list[host_function_name] enqueue_params = "" - for j in range(len(kernel_param_order)): - if kernel_param_order[j].isdigit(): - # enqueue_params += self._input_dict[str(tvm_param_order[int(kernel_param_order[j])])] - output += self._input_dict[str(tvm_param_order[int(kernel_param_order[j])])] - eid = tvm_param_order[int(kernel_param_order[j])] + for j in range(len(device_param_order)): + if device_param_order[j].isdigit(): + output += self._input_dict[str(host_param_order[int(device_param_order[j])])] + eid = host_param_order[int(device_param_order[j])] enqueue_params += ( "(" + tvm_to_c_type_mapping[self._data_type[int(eid)]] @@ -378,27 +370,27 @@ def cuda_kernel_config(self): + self._input_dict[str(eid)] ) else: - if kernel_param_order[j] in self._input_dict.keys(): - enqueue_params += self._input_dict[kernel_param_order[j]] - if j == len(kernel_param_order) - 1: + if device_param_order[j] in self._input_dict.keys(): + enqueue_params += self._input_dict[device_param_order[j]] + if j == len(device_param_order) - 1: output += "\n" else: output += ", " enqueue_params += ", " - output_json[multi_cuda_func_name]["enqueue_params"] = enqueue_params - self._plugin_config = output_json + output_json[unique_device_function_name]["enqueue_params"] = enqueue_params + self._device_function_configuration = output_json @property def host_func_order(self): - return self._tvm_func_order + return self._host_function_order @property - def kernel_order(self): - return self._cuda_func_order + def device_function_order(self): + return self._device_function_order @property - def plugin_config(self): - return self._plugin_config + def device_function_configuration(self): + return self._device_function_configuration @property def workspace_size(self): @@ -422,18 +414,18 @@ def input_shape(self): @property def onnx_weight_input_index(self): - return self._onnx_weight_input_index + return self._onnx_constant_input_index @property - def onnx_tensor_input_index(self): - return self._onnx_tensor_input_index + def onnx_variable_input_index(self): + return self._onnx_variable_input_index @property def tensor_type(self): return self._onnx_tensor_type @property - def workspace_init(self): + def workspace_constant(self): return self._tvm_workspace_constant @property diff --git a/src/runtime/cuda/cuda_module.cc b/src/runtime/cuda/cuda_module.cc index fdd070791544..39c59f17f40b 100644 --- a/src/runtime/cuda/cuda_module.cc +++ b/src/runtime/cuda/cuda_module.cc @@ -301,7 +301,7 @@ Module CUDAModuleLoadBinary(void* strm) { return CUDAModuleCreate(data, fmt, fmap, std::string()); } -String CUDAModuleGetGridBlockThreadConfig() { +String CUDAModuleGetThreadConfig() { String ret = ""; for (const String& func_config : funcs_thread_config) { ret = ret + func_config; @@ -315,7 +315,7 @@ TVM_REGISTER_GLOBAL("runtime.module.loadfile_ptx").set_body_typed(CUDAModuleLoad TVM_REGISTER_GLOBAL("runtime.module.loadbinary_cuda").set_body_typed(CUDAModuleLoadBinary); -TVM_REGISTER_GLOBAL("runtime.module.retrieve_grid_block_thread_config") - .set_body([](TVMArgs args, TVMRetValue* rv) { *rv = CUDAModuleGetGridBlockThreadConfig(); }); +TVM_REGISTER_GLOBAL("runtime.module.retrieve_device_function_thread_config") + .set_body([](TVMArgs args, TVMRetValue* rv) { *rv = CUDAModuleGetThreadConfig(); }); } // namespace runtime } // namespace tvm diff --git a/src/runtime/graph_executor/graph_executor.cc b/src/runtime/graph_executor/graph_executor.cc index 867971ae875b..c055cb66af05 100644 --- 
a/src/runtime/graph_executor/graph_executor.cc +++ b/src/runtime/graph_executor/graph_executor.cc @@ -796,7 +796,7 @@ PackedFunc GraphExecutor::GetFunction(const String& name, const ObjectPtrGetWorkspaceSize(); }); - } else if (name == "get_func_inorder") { + } else if (name == "get_func_list") { return PackedFunc( [sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { *rv = this->GetFuncList(); }); } else if (name == "get_storageid") { diff --git a/src/tir/transforms/lower_device_kernel_launch.cc b/src/tir/transforms/lower_device_kernel_launch.cc index b8a0ef240bc4..9000f04e2626 100644 --- a/src/tir/transforms/lower_device_kernel_launch.cc +++ b/src/tir/transforms/lower_device_kernel_launch.cc @@ -36,7 +36,7 @@ namespace tvm { namespace tir { -extern std::unordered_map > host_name_to_param; +extern std::unordered_map> host_function_name_to_params; extern std::unordered_map name_to_prefix; std::vector device_funcs; std::vector device_memory_size; @@ -337,13 +337,13 @@ class DeviceKernelMutator : public StmtExprMutator { ss << name_hint << " "; for (auto arg : args) { bool find_param_in_host = false; - for (int i = 0; i < host_name_to_param[name_to_prefix[name_hint]].size(); ++i) { - if (arg.same_as(host_name_to_param[name_to_prefix[name_hint]][i])) { + auto params = host_function_name_to_params[name_to_prefix[name_hint]]; + for (int i = 0; i < params.size(); ++i) { + if (arg.same_as(params[i])) { ss << i << " "; find_param_in_host = true; } } - std::cout << std::endl; if (!find_param_in_host) { ss << arg.get() << " "; } @@ -359,7 +359,7 @@ class DeviceKernelMutator : public StmtExprMutator { }; namespace transform { -String GetDeviceFuncsList() { +String GetDeviceFunctionList() { String ret = ""; for (auto func : device_funcs) { ret = ret + func; @@ -431,8 +431,8 @@ Pass LowerDeviceKernelLaunch() { TVM_REGISTER_GLOBAL("tir.transform.LowerDeviceKernelLaunch") .set_body_typed(LowerDeviceKernelLaunch); -TVM_REGISTER_GLOBAL("tir.transform.retrieve_device_funcs_list") - .set_body([](TVMArgs args, TVMRetValue* rv) { *rv = GetDeviceFuncsList(); }); +TVM_REGISTER_GLOBAL("tir.transform.retrieve_device_function_list") + .set_body([](TVMArgs args, TVMRetValue* rv) { *rv = GetDeviceFunctionList(); }); TVM_REGISTER_GLOBAL("tir.transform.retrieve_device_memory_size") .set_body([](TVMArgs args, TVMRetValue* rv) { *rv = GetDeviceMemorySize(); }); diff --git a/src/tir/transforms/make_packed_api.cc b/src/tir/transforms/make_packed_api.cc index 6e7f597a9583..4db041d16fe5 100644 --- a/src/tir/transforms/make_packed_api.cc +++ b/src/tir/transforms/make_packed_api.cc @@ -41,7 +41,7 @@ namespace tvm { namespace tir { static constexpr const char* kDeviceContextVar = "device_api_context"; -std::unordered_map > host_name_to_param; +std::unordered_map> host_function_name_to_params; namespace { class ReturnRewriter : public StmtMutator { @@ -215,6 +215,7 @@ PrimFunc MakePackedAPI(PrimFunc func) { return func; } std::string name_hint = global_symbol.value(); + std::cout << "NAME HINT ===> " << name_hint << '\n'; Target target = [&]() { auto opt = func->GetAttr(tvm::attr::kTarget); @@ -278,7 +279,8 @@ PrimFunc MakePackedAPI(PrimFunc func) { // appear in the buffer. 
std::vector> var_def; std::vector> buffer_def; - std::vector cur_func_param; + + std::vector params_of_function; for (int i = 0; i < static_cast(func_ptr->params.size()); ++i) { Var param = func_ptr->params[i]; @@ -292,7 +294,7 @@ PrimFunc MakePackedAPI(PrimFunc func) { var_def.emplace_back(f_arg_value(param.dtype(), i), param); if (func_ptr->buffer_map.count(param)) { - cur_func_param.push_back(func_ptr->buffer_map[param]->data); + params_of_function.push_back(func_ptr->buffer_map[param]->data); buffer_def.emplace_back(param, func_ptr->buffer_map[param]); } @@ -319,7 +321,7 @@ PrimFunc MakePackedAPI(PrimFunc func) { } } - host_name_to_param[name_hint] = cur_func_param; + host_function_name_to_params[name_hint] = params_of_function; Array args{v_packed_args, buf_packed_arg_type_ids->data, v_num_packed_args, v_out_ret_value, @@ -391,7 +393,7 @@ namespace transform { Pass MakePackedAPI() { auto pass_func = [](IRModule mod, PassContext ctx) { - host_name_to_param.clear(); + host_function_name_to_params.clear(); Map packed_func_methods; for (const auto& [gvar, base_func] : mod->functions) { diff --git a/tests/python/tpat/cuda/common.py b/tests/python/tpat/cuda/common.py index 250535015d1f..019a0cf366b0 100644 --- a/tests/python/tpat/cuda/common.py +++ b/tests/python/tpat/cuda/common.py @@ -94,7 +94,11 @@ def name_without_num(name): ops_name = [op_name] _, trt_plugin_names = tpat.cuda.pipeline( - INPUT_MODEL_FILE, ops_name, False, "./log_db", OUTPUT_MODEL_FILE + INPUT_MODEL_FILE, + ops_name, + False, + {"work_dir": "./log_db", "max_trials_per_task": 500}, + OUTPUT_MODEL_FILE, ) load_plugin(trt_plugin_names) @@ -197,7 +201,11 @@ def verify_with_ort_with_trt( ops_name = [op_name] _, trt_plugin_names = tpat.cuda.pipeline( - INPUT_MODEL_FILE, ops_name, False, "./log_db", OUTPUT_MODEL_FILE + INPUT_MODEL_FILE, + ops_name, + False, + {"work_dir": "./log_db", "max_trials_per_task": 500}, + OUTPUT_MODEL_FILE, ) load_plugin(trt_plugin_names) From 83cee7ae0231e183e1542740c76893ffd2a4ec56 Mon Sep 17 00:00:00 2001 From: Civitasv Date: Sat, 12 Aug 2023 23:22:38 +0800 Subject: [PATCH 05/14] fix: make extract onnx and rewrite cleaer --- python/tvm/tpat/cuda/pipeline.py | 16 ++++++---------- python/tvm/tpat/cuda/rewrite.py | 17 ++++++++++------- 2 files changed, 16 insertions(+), 17 deletions(-) diff --git a/python/tvm/tpat/cuda/pipeline.py b/python/tvm/tpat/cuda/pipeline.py index 8302fd0cb769..7a56727ab84f 100644 --- a/python/tvm/tpat/cuda/pipeline.py +++ b/python/tvm/tpat/cuda/pipeline.py @@ -15,13 +15,10 @@ # specific language governing permissions and limitations # under the License. 
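As the updated tests above show, the fourth argument of the plugin pipeline is now a tuning-option dictionary rather than a bare log-directory string. A minimal usage sketch of that entry point, with parameter roles inferred from the test call and from Kernel's constructor; the model paths and the op name below are placeholders, not values taken from this patch:

    from tvm import tpat

    ops_name = ["my_custom_op"]      # ONNX node names to turn into TensorRT plugins
    tunning_option = {               # forwarded to MetaSchedule tuning when tuning is enabled
        "work_dir": "./log_db",
        "max_trials_per_task": 500,
    }

    _, trt_plugin_names = tpat.cuda.pipeline(
        "model.onnx",                # input ONNX model
        ops_name,
        False,                       # enable_tunning: False skips the tuning step
        tunning_option,
        "model_trt.onnx",            # rewritten model whose nodes use the generated plugins
    )
    # the returned plugin names are then loaded into TensorRT (the tests use their
    # load_plugin() helper) before building an engine.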
-import os from typing import Tuple -import numpy as np import onnx import onnx_graphsurgeon as gs -import onnxruntime as ort from onnx import shape_inference from tvm.tpat.cuda.kernel import Kernel @@ -29,7 +26,6 @@ from tvm.tpat.cuda.template_params import PluginTemplateParams from .rewrite import rewrite -import copy def _extract_target_onnx_node(model, tunning_node): @@ -39,23 +35,23 @@ def _extract_target_onnx_node(model, tunning_node): graph = gs.import_onnx(model) tensors = graph.tensors() - tuning_node_inputs = [ + subgraph_inputs = [ tensors[inp.name].to_variable(dtype=inp.dtype, shape=inp.shape) for inp in tunning_node.inputs if (inp.__class__ == gs.Variable and not inp.is_empty()) ] - tuning_node_outputs = [ + subgraph_outputs = [ tensors[oup.name].to_variable(dtype=oup.dtype, shape=oup.shape) for oup in tunning_node.outputs ] - tuning_input_shapes = [(inp.name, inp.shape, inp.dtype.name) for inp in graph.inputs] + input_shapes = [(inp.name, inp.shape, inp.dtype.name) for inp in subgraph_inputs] - graph.inputs = tuning_node_inputs - graph.outputs = tuning_node_outputs + graph.inputs = subgraph_inputs + graph.outputs = subgraph_outputs graph.cleanup() submodel = gs.export_onnx(graph) - return graph, submodel, tuning_input_shapes + return graph, submodel, input_shapes def pipeline( diff --git a/python/tvm/tpat/cuda/rewrite.py b/python/tvm/tpat/cuda/rewrite.py index 61b63be09ff0..ea726aad620b 100644 --- a/python/tvm/tpat/cuda/rewrite.py +++ b/python/tvm/tpat/cuda/rewrite.py @@ -87,7 +87,7 @@ def _remove_unnecessary_cast_nodes(graph): ] for node in cast_nodes: if ( - node.attrs["to"] == 13 + node.attrs["to"] == 13 # uint64 and len(node.inputs[0].inputs) <= 1 and len(node.outputs[0].outputs) <= 1 ): @@ -101,12 +101,15 @@ def _compute_tensor_type(graph, tunning_nodes): for tunning_node in tunning_nodes: for inp in tunning_node.inputs: - if inp.__class__ == gs.Constant or not inp.is_empty(): - onnx_original_tensor_type[inp.name] = inp.dtype.name - [ - onnx_original_tensor_type.update({oup.name: oup.dtype.name}) - for oup in tunning_node.outputs - ] + if inp.is_empty(): + continue + onnx_original_tensor_type[inp.name] = inp.dtype.name + + for oup in tunning_node.outputs: + if oup.is_empty(): + continue + onnx_original_tensor_type[oup.name] = oup.dtype.name + return onnx_original_tensor_type From b4726fc5fe9ac11080e90dd8541043a4aec802bb Mon Sep 17 00:00:00 2001 From: Civitasv Date: Sun, 13 Aug 2023 22:15:44 +0800 Subject: [PATCH 06/14] [tensorrt] [byoc] [plugin] Make API clearer and remove unneccessary fields --- python/tvm/contrib/graph_executor.py | 6 +- python/tvm/tpat/cuda/kernel.py | 6 +- python/tvm/tpat/cuda/template.py | 38 +-- python/tvm/tpat/cuda/template_params.py | 313 +++++++------------ src/runtime/graph_executor/graph_executor.cc | 14 +- src/runtime/graph_executor/graph_executor.h | 2 +- src/tir/transforms/make_packed_api.cc | 1 - tests/python/tpat/cuda/common.py | 27 +- 8 files changed, 159 insertions(+), 248 deletions(-) diff --git a/python/tvm/contrib/graph_executor.py b/python/tvm/contrib/graph_executor.py index ea8a402900a2..d3b1522e50c9 100644 --- a/python/tvm/contrib/graph_executor.py +++ b/python/tvm/contrib/graph_executor.py @@ -180,7 +180,7 @@ def __init__(self, module): self._get_workspace_dtype = module["get_workspace_dtype"] self._get_workspace_size = module["get_workspace_size"] - self._get_func_list = module["get_func_list"] + self._get_function_list = module["get_function_list"] self._get_storageid = module["get_storageid"] self._get_output_eid = 
module["get_output_eid"] @@ -539,7 +539,7 @@ def get_workspace_size(self): """ return self._get_workspace_size() - def get_func_inorder(self): + def get_function_list(self): """Get the Host Function execute order Returns @@ -547,7 +547,7 @@ def get_func_inorder(self): dtype : str The Host function execute order """ - return self._get_func_list() + return self._get_function_list() def get_storageid(self): return self._get_storageid() diff --git a/python/tvm/tpat/cuda/kernel.py b/python/tvm/tpat/cuda/kernel.py index c37dcd01a57d..c3eb31a934c0 100644 --- a/python/tvm/tpat/cuda/kernel.py +++ b/python/tvm/tpat/cuda/kernel.py @@ -32,7 +32,7 @@ def __init__(self, onnx_model, input_shapes, target, tunning_option) -> None: if target == "gpu": self.target = self._detect_cuda_target() - def tune_option(self): + def _tune_option(self): default = { "target": self.target, "builder": ms.builder.LocalBuilder(), @@ -81,7 +81,7 @@ def run(self): # 2. Tune it if self._enable_tunning: - tunning_option = self._config.tune_option() + tunning_option = self._config._tune_option() ms.relay_integration.tune_relay(mod=mod, params=params, **tunning_option) # 3. Compiling @@ -212,7 +212,7 @@ def workspace_size(self): @property def host_function_list(self): """Get host function list.""" - return self._module.get_func_inorder() if self._module else None + return self._module.get_function_list() if self._module else None @property def storageid(self): diff --git a/python/tvm/tpat/cuda/template.py b/python/tvm/tpat/cuda/template.py index c31997475450..53dd94100304 100644 --- a/python/tvm/tpat/cuda/template.py +++ b/python/tvm/tpat/cuda/template.py @@ -49,32 +49,22 @@ def __init__(self, template_params): self._plugin_name = template_params.plugin_name self._plugin_device_function_configuration = template_params.device_function_configuration - self._plugin_output_number = template_params.output_num - self._plugin_output_type = template_params.output_type - self._plugin_workspace_size = template_params.workspace_size - self._plugin_total_workspace_size = template_params.total_workspace_size - self._plugin_variable_input_index = template_params.onnx_variable_input_index + self._plugin_output_number = template_params.num_outputs + self._plugin_output_type = template_params.output_dtype + self._plugin_workspace_size = template_params.total_workspace_size self._plugin_kernels_body = template_params.cuda_source_code - self._onnx_input_python_type = template_params.onnx_input_python_type - self._onnx_output_python_type = template_params.onnx_output_python_type - self._input_workspace_size = template_params.input_workspace_size - self._output_workspace_size = template_params.output_workspace_size onnx_output_shape = template_params.output_shape - self._plugin_output_shape = self.parse_plugin_output_shape(onnx_output_shape) - - onnx_input_shape = template_params.input_shape - self._plugin_input_shape = self.parse_plugin_input_shape(onnx_input_shape) - + self._plugin_output_shape = self._parse_plugin_output_shape(onnx_output_shape) onnx_tensor_type = template_params.tensor_type - self._plugin_tensor_format = self.parse_plugin_tensor_format(onnx_tensor_type) + self._plugin_tensor_format = self._parse_plugin_tensor_format(onnx_tensor_type) kernel_order = template_params.device_function_order - self._plugin_kernels_params = self.parse_plugin_kernels_params(kernel_order) + self._plugin_kernels_params = self._parse_plugin_kernels_params(kernel_order) workspace_constant = template_params.workspace_constant - self._plugin_constant_init = 
self.parse_plugin_workspace_constant(workspace_constant) + self._plugin_constant_init = self._parse_plugin_workspace_constant(workspace_constant) class TensorDims: @@ -154,7 +144,7 @@ def __init__(self, size, dtype): self.size = size self.dtype = dtype - def parse_plugin_input_shape(self, onnx_input_shape): + def _parse_plugin_input_shape(self, onnx_input_shape): plugin_input_shape = [] for s in onnx_input_shape: nbdims = len(s) @@ -162,7 +152,7 @@ def parse_plugin_input_shape(self, onnx_input_shape): plugin_input_shape.append(self.TensorDims(nbdims, shape)) return plugin_input_shape - def parse_plugin_output_shape(self, onnx_output_shape): + def _parse_plugin_output_shape(self, onnx_output_shape): plugin_output_shape = [] for s in onnx_output_shape: nbdims = len(s) @@ -170,13 +160,13 @@ def parse_plugin_output_shape(self, onnx_output_shape): plugin_output_shape.append(self.TensorDims(nbdims, shape)) return plugin_output_shape - def parse_plugin_tensor_format(self, onnx_tensor_type): + def _parse_plugin_tensor_format(self, onnx_tensor_type): plugin_tensor_format = [] for dtype in onnx_tensor_type: plugin_tensor_format.append(self.TensorFormat("LINEAR", dtype)) return plugin_tensor_format - def parse_plugin_kernels_params(self, kernel_order): + def _parse_plugin_kernels_params(self, kernel_order): kernel_call = {} plugin_kernels_params = [] for func_name in kernel_order: @@ -196,7 +186,7 @@ def parse_plugin_kernels_params(self, kernel_order): ) return plugin_kernels_params - def parse_plugin_workspace_constant(self, workspace_constant): + def _parse_plugin_workspace_constant(self, workspace_constant): plugin_constant_init = [] for init_constant in workspace_constant.items(): value_str = ", ".join(str(ele) for ele in init_constant[1][0]) @@ -229,11 +219,11 @@ def fill(self): with pushd(os.path.normpath(os.path.dirname(__file__))): self.generate_header_file() self.generate_source_file() - self.build_plugin() + self._build_plugin() return f"{os.path.dirname(os.path.abspath(__file__))}/plugin/lib/{self._plugin_name}.so" - def build_plugin(self): + def _build_plugin(self): os.chdir("./plugin") os.system(f"make clean plugin_name={self._plugin_name}") diff --git a/python/tvm/tpat/cuda/template_params.py b/python/tvm/tpat/cuda/template_params.py index 2eda53dbd46d..be83887e3d6c 100644 --- a/python/tvm/tpat/cuda/template_params.py +++ b/python/tvm/tpat/cuda/template_params.py @@ -40,58 +40,53 @@ def __init__(self, kernel, model, graph, tunning_node, name): self._tunning_node = tunning_node self._input_dict = {} - self._allocate_size = [] - self._data_type = [] - - self._device_function_list = {} - self._device_thread_config = {} - self._device_function_order = [] - self._device_allocate_memory_size = {} - - self._host_function_list = {} - self._host_function_order = [] - - self._nums_input = 0 - self._nums_output = 0 - self._workspace_size = 0 - self._output_type = [] - self._constant_params = {} + + self._workspace_size = [] # eid -> workspace size + self._workspace_dtype = [] # eid -> workspace dtype + self._total_workspace_size = 0 # total workspace size need by plugin + + # Kernel related params + self._device_function_list = {} # kernel -> index for params of host function + self._device_thread_config = {} # kernel -> thread dim + self._device_function_order = [] # kernel invoke order + self._device_allocate_memory_size = {} # address -> (dtype, extent) + + # Host side function attrs + self._host_function_list = {} # function -> eid of params (firstly inputs, then outputs) + 
self._host_function_order = [] # host function order + + self._nums_inputs = 0 # number of inputs + self._nums_outputs = 0 # number of outputs + self._output_dtype = [] # dtype of outputs + self._output_shape = [] # shape of outputs + self._constant_params = {} # constant params, storage_id -> data self._tvm_workspace_constant = {} - self._onnx_constant_input_index = [] - self._onnx_variable_input_index = [] - self._onnx_input_shape = [] - self._onnx_output_shape = [] - self._onnx_tensor_type = [] - self._onnx_input_python_type = [] - self._onnx_output_python_type = [] + self._tensor_type = [] # tensor type of inputs and outputs - self._storage_id = [] + self._storage_id = [] # eid -> storage id self._device_function_configuration = None - self.parse_shape_and_type() - self.parse_input_index() - self.parse_kernel() - self.parse_device_function_inputs() - self.parse_device_function_config() + self._parse_shape_and_type() + self._parse_kernel_params() + self._parse_device_function_inputs() + self._parse_device_function_config() - def describe(self): + def _describe(self): + """Use for debug.""" print(f"Constant params >>> {self._constant_params}") print(f"Device Function List >>> {self._device_function_list}") print(f"Device Thread Config >>> {self._device_thread_config}") print(f"Device Function Order >>> {self._device_function_order}") - print(f"Nums Input >>> {self._nums_input}") - print(f"Nums Output >>> {self._nums_output}") - print(f"Data Type >>> {self._data_type}") - print(f"Allocate Size >>> {self._allocate_size}") + print(f"Nums Input >>> {self._nums_inputs}") + print(f"Nums Output >>> {self._nums_outputs}") + print(f"Workspace Data Type >>> {self._workspace_dtype}") + print(f"Workspace Size >>> {self._workspace_size}") print(f"Host Function List >>> {self._host_function_list}") print(f"Host Function Order >>> {self._host_function_order}") - print(f"Cuda Source Code >>> {self._cuda_source_code}") print(f"Storage Id >>> {self._storage_id}") - print(f"Storage Slot >>> {self.storage_slot}") print(f"Device Memory Size >>> {self._device_allocate_memory_size}") - print(f"Input Workspace Size >>> {self._input_workspace_size}") - print(f"Output Workspace Size >>> {self._output_workspace_size}") + # print(f"Cuda Source Code >>> {self._cuda_source_code}") # Parse Constant. def _parse_constant_params(self, constant_params): @@ -103,7 +98,7 @@ def _parse_constant_params(self, constant_params): # Parse device functions params order. def _parse_device_function_list(self, device_function_list): _device_function_list = {} - for device_function in device_function_list: + for device_function in device_function_list.split("\n"): if len(device_function) == 0: continue item = device_function.split() @@ -115,7 +110,7 @@ def _parse_device_function_list(self, device_function_list): def _parse_device_function_thread_config(self, device_function_thread_config): kernel_thread_config = {} kernel_order = [] - for item in device_function_thread_config: + for item in device_function_thread_config.split("\n"): if len(item) == 0: continue config = item.split() @@ -127,7 +122,7 @@ def _parse_device_function_thread_config(self, device_function_thread_config): # Parse global memory allocated in device side. 
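The strings consumed by these parsers are produced on the C++ side (DeviceKernelMutator for the kernel parameter lists, CUDAModuleGetThreadConfig for the launch dimensions). A small illustration of the expected line formats and the parsed results; the kernel name and values here are made up:

    # one line per device kernel: "<kernel> <host-param index | raw address> ..."
    device_function_list = "tpat_op_kernel_0 0 1 2\n"
    # one line per device kernel: "<kernel> grid=(x,y,z) block=(x,y,z)"
    device_function_thread_config = "tpat_op_kernel_0 grid=(1,1,1) block=(256,1,1)\n"

    # after parsing, roughly:
    #   _device_function_list == {"tpat_op_kernel_0": ["0", "1", "2"]}
    #   _device_thread_config == {"tpat_op_kernel_0": ["grid=(1,1,1)", "block=(256,1,1)"]}
    # _prepare_device_function_config() later strips the "grid="/"block=" prefixes
    # to fill the plugin's launch configuration.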
def _parse_device_allocate_memory_size(self, device_allocate_global_memory): allocate_global_memory = {} - for allocate_memory in device_allocate_global_memory: + for allocate_memory in device_allocate_global_memory.split("\n"): if len(allocate_memory) == 0: continue allocate = allocate_memory.split() @@ -137,13 +132,11 @@ def _parse_device_allocate_memory_size(self, device_allocate_global_memory): # Parse variables storage index. def _parse_storageid(self, storageid): storage_id = [] - storage_slot = {} - for sid in storageid: + for sid in storageid.split("\n"): if len(sid) == 0: continue storage_id = sid.split() - storage_slot = {}.fromkeys(sid).keys() - return storage_id, storage_slot + return storage_id # Parse numbers of input, only variable. def _parse_nums_input(self, nums_input): @@ -170,7 +163,7 @@ def _parse_host_function_list(self, host_function_list): func_call = {} host_executor_order = {} host_func_order = [] - for host_func_inorder in host_function_list: + for host_func_inorder in host_function_list.split("\n"): if len(host_func_inorder) == 0: continue tvm_host_func = host_func_inorder.split() @@ -185,151 +178,116 @@ def _parse_host_function_list(self, host_function_list): host_func_order.append(func_name) return host_executor_order, host_func_order - def parse_kernel(self): - constant_params = self._kernel.constant_params - device_function_list = self._kernel.device_function_list.split("\n") - device_function_thread_config = self._kernel.device_function_thread_config.split("\n") - device_allocate_memory_size = self._kernel.device_allocate_memory_size.split("\n") - num_inputs = self._kernel.num_inputs - num_outputs = self._kernel.num_outputs - workspace_dtype = self._kernel.workspace_dtype - workspace_size = self._kernel.workspace_size - host_function_list = self._kernel.host_function_list.split("\n") - storage_id = self._kernel.storageid.split("\n") - - self._constant_params = self._parse_constant_params(constant_params) - self._device_function_list = self._parse_device_function_list(device_function_list) + def _parse_kernel_params(self): + self._constant_params = self._parse_constant_params(self._kernel.constant_params) + self._device_function_list = self._parse_device_function_list( + self._kernel.device_function_list + ) ( self._device_thread_config, self._device_function_order, - ) = self._parse_device_function_thread_config(device_function_thread_config) - self._nums_input = self._parse_nums_input(num_inputs) - self._nums_output = self._parse_nums_output(num_outputs) - self._data_type = self._parse_workspace_dtype(workspace_dtype) - self._allocate_size = self._parse_workspace_size(workspace_size) - self._host_function_list, self._host_function_order = self._parse_host_function_list(host_function_list) - self._cuda_source_code = self._kernel.cuda_source_code - self._storage_id, self.storage_slot = self._parse_storageid(storage_id) + ) = self._parse_device_function_thread_config(self._kernel.device_function_thread_config) self._device_allocate_memory_size = self._parse_device_allocate_memory_size( - device_allocate_memory_size + self._kernel.device_allocate_memory_size ) - self._input_workspace_size = self._allocate_size[0 : self._nums_input] - self._output_workspace_size = self._allocate_size[-self._nums_output :] + self._nums_inputs = self._parse_nums_input(self._kernel.num_inputs) + self._nums_outputs = self._parse_nums_output(self._kernel.num_outputs) + self._workspace_dtype = self._parse_workspace_dtype(self._kernel.workspace_dtype) + self._workspace_size = 
self._parse_workspace_size(self._kernel.workspace_size) + self._host_function_list, self._host_function_order = self._parse_host_function_list( + self._kernel.host_function_list + ) + self._storage_id = self._parse_storageid(self._kernel.storageid) + self._cuda_source_code = self._kernel.cuda_source_code - self.describe() + self._describe() - def parse_shape_and_type(self): + def _parse_shape_and_type(self): """ Infer for input and output shape. """ tunning_node = self._tunning_node for inp in tunning_node.inputs: - if inp.__class__ == gs.Constant or not inp.is_empty(): - self._onnx_input_python_type.append(tvm_to_c_type_mapping[inp.dtype.name]) - self._onnx_tensor_type.append(python_to_trt_type_mapping[inp.dtype.name]) + self._tensor_type.append(python_to_trt_type_mapping[inp.dtype.name]) for oup in tunning_node.outputs: - self._onnx_output_python_type.append(tvm_to_c_type_mapping[oup.dtype.name]) - self._onnx_tensor_type.append(python_to_trt_type_mapping[oup.dtype.name]) + self._tensor_type.append(python_to_trt_type_mapping[oup.dtype.name]) - self._onnx_output_shape = [oup.shape for oup in tunning_node.outputs] - self._onnx_input_shape = [ - inp.shape - for inp in tunning_node.inputs - if (inp.__class__ == gs.Variable and not inp.is_empty()) - ] + self._output_shape = [oup.shape for oup in tunning_node.outputs] - def parse_input_index(self): - """ - Calculate the index of variable and constant input. - """ - tunning_node = self._tunning_node - self._onnx_variable_input_index = [ - k - for k, inp in enumerate(tunning_node.inputs) - if ( - inp.__class__ == gs.Variable - and not (len(inp.inputs) == 1 and inp.inputs[0].op == "Constant") - ) - ] - - self._onnx_constant_input_index = [ - k - for k, inp in enumerate(tunning_node.inputs) - if ( - inp.__class__ == gs.Constant - or (len(inp.inputs) == 1 and inp.inputs[0].op == "Constant") - ) - ] - - def parse_device_function_inputs(self): + def _parse_device_function_inputs(self): """ The memory address used by functions params. """ workspace_size = 0 - input_slot_dict = {} # storageid -> xx + input_slot_dict = {} # storageid -> xx # 1. for outputs - for i in range(self._nums_output): - # entry id of output + for i in range(self._nums_outputs): + # given index of outputs, return entry id eid = self._kernel.graph_module.get_output_eid(i) - idx = int(self._storage_id[eid]) + sid = int(self._storage_id[eid]) # resolve output type given entry id - self._output_type.append(python_to_trt_type_mapping[self._data_type[eid]]) - self._input_dict[str(eid)] = "outputs[" + str(i) + "]" - input_slot_dict[idx] = "outputs[" + str(i) + "]" + self._output_dtype.append(python_to_trt_type_mapping[self._workspace_dtype[eid]]) + self._input_dict[str(eid)] = f"outputs[{i}]" + input_slot_dict[sid] = f"outputs[{i}]" # 2. 
for inputs, including variable and constants storage_id_to_allocate_size = {} - for eid in range(len(self._allocate_size)): - idx = int(self._storage_id[eid]) - if idx not in storage_id_to_allocate_size.keys(): - storage_id_to_allocate_size[idx] = 0 - storage_id_to_allocate_size[idx] = max(int(self._allocate_size[eid]), int(storage_id_to_allocate_size[idx])) - - for eid in range(len(self._allocate_size)): - idx = int(self._storage_id[eid]) - if idx in input_slot_dict.keys(): - self._input_dict[str(eid)] = input_slot_dict[idx] + for eid in range(len(self._workspace_size)): + sid = int(self._storage_id[eid]) + if sid not in storage_id_to_allocate_size.keys(): + storage_id_to_allocate_size[sid] = 0 + storage_id_to_allocate_size[sid] = max( + int(self._workspace_size[eid]), int(storage_id_to_allocate_size[sid]) + ) + + for eid in range(len(self._workspace_size)): + sid = int(self._storage_id[eid]) + if sid in input_slot_dict.keys(): + self._input_dict[str(eid)] = input_slot_dict[sid] continue - if eid < self._nums_input: + if eid < self._nums_inputs: # it must be variable self._input_dict[str(eid)] = "inputs[" + str(eid) + "]" - elif eid < len(self._allocate_size) - self._nums_output: + elif eid < len(self._workspace_size) - self._nums_outputs: # it must be constant - if eid == self._nums_input: + if eid == self._nums_inputs: # the first one self._input_dict[str(eid)] = "workspace" else: self._input_dict[str(eid)] = f"(workspace + {workspace_size})" - workspace_size += int(storage_id_to_allocate_size[idx]) + workspace_size += int(storage_id_to_allocate_size[sid]) if ( self._input_dict[str(eid)] not in self._tvm_workspace_constant.keys() - and str(idx) in self._constant_params.keys() + and str(sid) in self._constant_params.keys() ): self._tvm_workspace_constant[self._input_dict[str(eid)]] = ( - self._constant_params[str(idx)], - tvm_to_c_type_mapping[self._data_type[eid]], + self._constant_params[str(sid)], + tvm_to_c_type_mapping[self._workspace_dtype[eid]], int(eid), ) - input_slot_dict[idx] = self._input_dict[str(eid)] + input_slot_dict[sid] = self._input_dict[str(eid)] if len(self._device_allocate_memory_size) != 0: for key, value in self._device_allocate_memory_size.items(): self._input_dict[key] = ( - "(" + tvm_to_c_type_mapping[value[0]] + "*)(workspace + " + str(workspace_size) + ")" + "(" + + tvm_to_c_type_mapping[value[0]] + + "*)(workspace + " + + str(workspace_size) + + ")" ) workspace_size += int(value[1]) * plugin_type_size[value[0]] - self._workspace_size = workspace_size + self._total_workspace_size = workspace_size - def parse_device_function_config(self): + def _parse_device_function_config(self): """ Grid, Block Layout, etc. 
""" - output = "" output_json = {} kernel_call_times = {} for i in range(len(self._device_function_order)): @@ -342,40 +300,40 @@ def parse_device_function_config(self): unique_device_function_name = device_funtion_name else: kernel_call_times[device_funtion_name] += 1 - host_function_name = host_function_name + "_" + str(kernel_call_times[device_funtion_name]) - unique_device_function_name = device_funtion_name + "_" + str(kernel_call_times[device_funtion_name]) + host_function_name = ( + host_function_name + "_" + str(kernel_call_times[device_funtion_name]) + ) + unique_device_function_name = ( + device_funtion_name + "_" + str(kernel_call_times[device_funtion_name]) + ) output_json[unique_device_function_name] = {} # grid and block dim - output_json[unique_device_function_name]["grid_dim"] = self._device_thread_config[device_funtion_name][ - 0 - ].strip("grid=") + output_json[unique_device_function_name]["grid_dim"] = self._device_thread_config[ + device_funtion_name + ][0].strip("grid=") output_json[unique_device_function_name]["block_dim"] = self._device_thread_config[ device_funtion_name ][1].strip("block=") - output += device_funtion_name + "\n" + str(self._device_thread_config[device_funtion_name]) + "\n" device_param_order = self._device_function_list[device_funtion_name] - host_param_order = self._host_function_list[host_function_name] + host_param_order = self._host_function_list[host_function_name] # eid enqueue_params = "" for j in range(len(device_param_order)): if device_param_order[j].isdigit(): - output += self._input_dict[str(host_param_order[int(device_param_order[j])])] eid = host_param_order[int(device_param_order[j])] enqueue_params += ( "(" - + tvm_to_c_type_mapping[self._data_type[int(eid)]] + + tvm_to_c_type_mapping[self._workspace_dtype[int(eid)]] + "*)" + self._input_dict[str(eid)] ) else: if device_param_order[j] in self._input_dict.keys(): enqueue_params += self._input_dict[device_param_order[j]] - if j == len(device_param_order) - 1: - output += "\n" - else: - output += ", " + + if j != len(device_param_order) - 1: enqueue_params += ", " output_json[unique_device_function_name]["enqueue_params"] = enqueue_params self._device_function_configuration = output_json @@ -393,36 +351,24 @@ def device_function_configuration(self): return self._device_function_configuration @property - def workspace_size(self): - return self._workspace_size + def total_workspace_size(self): + return self._total_workspace_size @property - def output_num(self): - return self._nums_output + def num_outputs(self): + return self._nums_outputs @property - def output_type(self): - return self._output_type + def output_dtype(self): + return self._output_dtype @property def output_shape(self): - return self._onnx_output_shape - - @property - def input_shape(self): - return self._onnx_input_shape - - @property - def onnx_weight_input_index(self): - return self._onnx_constant_input_index - - @property - def onnx_variable_input_index(self): - return self._onnx_variable_input_index + return self._output_shape @property def tensor_type(self): - return self._onnx_tensor_type + return self._tensor_type @property def workspace_constant(self): @@ -439,30 +385,3 @@ def plugin_name(self): @property def onnx_op_type(self): return self._kernel.onnx_op_type - - @property - def storage_id(self): - return self._storage_id - - @property - def onnx_input_python_type(self): - return self._onnx_input_python_type - - @property - def onnx_output_python_type(self): - return self._onnx_output_python_type - - @property 
- def input_workspace_size(self): - return self._input_workspace_size - - @property - def output_workspace_size(self): - return self._output_workspace_size - - @property - def total_workspace_size(self): - allocate_size = 0 - for size in self._allocate_size: - allocate_size += int(size) - return allocate_size diff --git a/src/runtime/graph_executor/graph_executor.cc b/src/runtime/graph_executor/graph_executor.cc index c055cb66af05..cbdae9a510ab 100644 --- a/src/runtime/graph_executor/graph_executor.cc +++ b/src/runtime/graph_executor/graph_executor.cc @@ -392,7 +392,7 @@ String GraphExecutor::GetWorkspaceSize() { return os.str(); } -String GraphExecutor::GetFuncList() { +String GraphExecutor::GetFunctionList() { std::ostringstream os; for (auto funcs : exec_func_) { for (auto func : funcs) { @@ -553,20 +553,20 @@ void GraphExecutor::SetupOpExecs() { const auto& inode = nodes_[nid]; if (inode.op_type == "null") continue; std::vector args; - std::vector indexes; + std::vector eids; std::vector funcs; for (const auto& e : inode.inputs) { uint32_t eid = this->entry_id(e); args.push_back(const_cast(data_entry_[eid].operator->())); - indexes.push_back(eid); + eids.push_back(eid); // entry id of inputs } for (uint32_t index = 0; index < inode.param.num_outputs; ++index) { uint32_t eid = this->entry_id(nid, index); args.push_back(const_cast(data_entry_[eid].operator->())); - indexes.push_back(eid); + eids.push_back(eid); // entry id of outputs } funcs.push_back(inode.param.func_name); - for (auto eid : indexes) { + for (auto eid : eids) { funcs.push_back(std::to_string(eid)); } exec_func_.push_back(funcs); @@ -796,9 +796,9 @@ PackedFunc GraphExecutor::GetFunction(const String& name, const ObjectPtrGetWorkspaceSize(); }); - } else if (name == "get_func_list") { + } else if (name == "get_function_list") { return PackedFunc( - [sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { *rv = this->GetFuncList(); }); + [sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { *rv = this->GetFunctionList(); }); } else if (name == "get_storageid") { return PackedFunc( [sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { *rv = this->GetStorageId(); }); diff --git a/src/runtime/graph_executor/graph_executor.h b/src/runtime/graph_executor/graph_executor.h index 40731c303816..9d044cdf8a2f 100644 --- a/src/runtime/graph_executor/graph_executor.h +++ b/src/runtime/graph_executor/graph_executor.h @@ -421,7 +421,7 @@ class TVM_DLL GraphExecutor : public ModuleNode { /*! \brief get the storage size */ String GetWorkspaceSize(); /*! \brief get the exec func in order*/ - String GetFuncList(); + String GetFunctionList(); /*! \brief get storage ids*/ String GetStorageId(); int GetOutputEid(int index) const; diff --git a/src/tir/transforms/make_packed_api.cc b/src/tir/transforms/make_packed_api.cc index 4db041d16fe5..18acbda1bee8 100644 --- a/src/tir/transforms/make_packed_api.cc +++ b/src/tir/transforms/make_packed_api.cc @@ -215,7 +215,6 @@ PrimFunc MakePackedAPI(PrimFunc func) { return func; } std::string name_hint = global_symbol.value(); - std::cout << "NAME HINT ===> " << name_hint << '\n'; Target target = [&]() { auto opt = func->GetAttr(tvm::attr::kTarget); diff --git a/tests/python/tpat/cuda/common.py b/tests/python/tpat/cuda/common.py index 019a0cf366b0..e8453f779e33 100644 --- a/tests/python/tpat/cuda/common.py +++ b/tests/python/tpat/cuda/common.py @@ -15,9 +15,7 @@ # specific language governing permissions and limitations # under the License. 
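For reference, GetFunctionList() above serializes exec_func_ as one record per fused host function: the function name followed by the entry ids of its arguments (inputs first, then outputs), newline-separated. A sketch of how the Python side (_parse_host_function_list) reads it; the function name is hypothetical:

    raw = (
        "tvmgen_default_fused_add 0 1 2\n"
        "tvmgen_default_fused_add 3 2 4\n"   # same function invoked a second time
    )
    # parsing yields, roughly:
    #   order -> ["tvmgen_default_fused_add", "tvmgen_default_fused_add_1"]
    #   table -> {"tvmgen_default_fused_add":   ["0", "1", "2"],
    #             "tvmgen_default_fused_add_1": ["3", "2", "4"]}
    # repeated invocations get an "_<n>" suffix so each call keeps its own
    # entry-id list, which _prepare_device_function_config() later indexes into.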
-import ctypes import os -import sys import numpy as np import onnx @@ -32,20 +30,25 @@ from tvm import tpat -from .trt import allocate_buffers, build_engine, do_inference, load_plugin +from .trt import ( + allocate_buffers, + build_engine, + do_inference, + load_plugin, + remove_plugin, +) tf.disable_v2_behavior() -I_GPU = 0 -os.environ["CUDA_VISIBLE_DEVICES"] = str(I_GPU) -np.random.seed(0) -ITERATIONS = 10 INPUT_MODEL_FILE = "test_op_plugin.onnx" OUTPUT_MODEL_FILE = "test_op_trt.onnx" TRT_LOGGER = trt.Logger(trt.Logger.VERBOSE) -BATCH_SIZE = 1 +# set gpu device for tensorflow +gpu_devices = tf.config.experimental.list_physical_devices("GPU") +for device in gpu_devices: + tf.config.experimental.set_memory_growth(device, True) # Simple helper data class that's a little nicer to use than a 2-tuple. @@ -208,7 +211,7 @@ def verify_with_ort_with_trt( OUTPUT_MODEL_FILE, ) - load_plugin(trt_plugin_names) + libs = load_plugin(trt_plugin_names) engine = build_engine(OUTPUT_MODEL_FILE, trt_engine_datatype=trt.DataType.HALF) inputs, outputs, bindings, stream = allocate_buffers(engine) @@ -225,11 +228,12 @@ def verify_with_ort_with_trt( stream=stream, ) + remove_plugin(libs) + ret = True if len(trt_result) == 1: ret = compare_tf_trt_result(ort_result, trt_result) else: - # ret &= compare_tf_trt_result(ort_result[0], trt_result[0]) for i in range(len(trt_result)): ret &= compare_tf_trt_result(ort_result[i], trt_result[i]) assert ret, "result check False" @@ -2734,7 +2738,6 @@ def _test_forward_one_hot(indices_shape, depth, on_value, off_value, axis, out_d out = tf.one_hot(in1, depth, on_value, off_value, axis, dtype=out_dtype, name=op_name) out = tf.identity(out, "output") verify_tf_with_trt_result([inp_array1], ["input:0"], ["output:0"], op_name) - # compare_tf_with_tvm(inp_array1, in1.name, out.name) def test_forward_one_hot(): @@ -3435,7 +3438,7 @@ def test_logical(): _test_logical("is_nan", "test_logical_nan") -@pytest.mark.skip(reason="TensorRT segmentfault") +@pytest.mark.skip(reason="TensorFlow segmentfault") def test_scatternd(): batch_size = 32 op_name = "scatternd" From 52dd98f28d869d45a893fcc9abd3afa7129143c1 Mon Sep 17 00:00:00 2001 From: Civitasv Date: Mon, 14 Aug 2023 11:15:21 +0800 Subject: [PATCH 07/14] [tensorrt] [byoc] [plugin] remove unused imports --- python/tvm/tpat/cuda/kernel.py | 1 + python/tvm/tpat/cuda/pipeline.py | 4 ++ python/tvm/tpat/cuda/rewrite.py | 6 +- python/tvm/tpat/cuda/template.py | 12 +--- python/tvm/tpat/cuda/template_params.py | 19 +++--- python/tvm/tpat/cuda/type_mapping.py | 30 +++++---- tests/python/tpat/cuda/common.py | 82 +++++++++++++++++++++++++ 7 files changed, 110 insertions(+), 44 deletions(-) diff --git a/python/tvm/tpat/cuda/kernel.py b/python/tvm/tpat/cuda/kernel.py index c3eb31a934c0..e0ffc35b7595 100644 --- a/python/tvm/tpat/cuda/kernel.py +++ b/python/tvm/tpat/cuda/kernel.py @@ -139,6 +139,7 @@ def cuda_source_code(self): try: source_code = self._lib.get_lib().imported_modules[0].get_source() + # consistent type source_code = source_code.replace("signed char*", "int*") source_code = source_code.replace("uint64_t*", "int*") source_code = source_code.replace("long long", "int") diff --git a/python/tvm/tpat/cuda/pipeline.py b/python/tvm/tpat/cuda/pipeline.py index 7a56727ab84f..bdd441dae8b1 100644 --- a/python/tvm/tpat/cuda/pipeline.py +++ b/python/tvm/tpat/cuda/pipeline.py @@ -98,12 +98,16 @@ def pipeline( plugin_path = [] for node in node_to_be_tunned: name = node.name + print(f"Processing ---- {name}") plugin_name = 
"tpat_{}".format(name.replace("/", "_").replace(".", "_")) subgraph, submodel, shapes = _extract_target_onnx_node(inferred_model, node) kernel = Kernel(plugin_name, submodel, shapes, enable_tunning, tunning_option) kernel.run() + if not kernel.cuda_source_code: + print(f"Skip {name}, because cuda source code is None") + continue ## 3.1 fill in template params = PluginTemplateParams(kernel, submodel, subgraph, node, name) diff --git a/python/tvm/tpat/cuda/rewrite.py b/python/tvm/tpat/cuda/rewrite.py index ea726aad620b..c071f7662620 100644 --- a/python/tvm/tpat/cuda/rewrite.py +++ b/python/tvm/tpat/cuda/rewrite.py @@ -15,12 +15,8 @@ # specific language governing permissions and limitations # under the License. -import os - import onnx import onnx_graphsurgeon as gs -from loguru import logger -from onnx import shape_inference from .type_mapping import onnx_type_mapping @@ -87,7 +83,7 @@ def _remove_unnecessary_cast_nodes(graph): ] for node in cast_nodes: if ( - node.attrs["to"] == 13 # uint64 + node.attrs["to"] == 13 # uint64 and len(node.inputs[0].inputs) <= 1 and len(node.outputs[0].outputs) <= 1 ): diff --git a/python/tvm/tpat/cuda/template.py b/python/tvm/tpat/cuda/template.py index 53dd94100304..5b35ced0cf2b 100644 --- a/python/tvm/tpat/cuda/template.py +++ b/python/tvm/tpat/cuda/template.py @@ -19,11 +19,7 @@ import os import re -import onnx -import onnx_graphsurgeon as gs from jinja2 import Environment, FileSystemLoader -from loguru import logger -from onnx import shape_inference @contextlib.contextmanager @@ -36,15 +32,10 @@ def pushd(new_dir): os.chdir(pre_dir) -def rm_part_define(source_code): - m = re.search('extern "C"', source_code.strip()) - return source_code[m.start() :] - - class PluginTemplate(object): def __init__(self, template_params): with pushd(os.path.normpath(os.path.dirname(__file__))): - template_loader = FileSystemLoader(searchpath='./') + template_loader = FileSystemLoader(searchpath="./") self._template_env = Environment(loader=template_loader) self._plugin_name = template_params.plugin_name @@ -66,7 +57,6 @@ def __init__(self, template_params): workspace_constant = template_params.workspace_constant self._plugin_constant_init = self._parse_plugin_workspace_constant(workspace_constant) - class TensorDims: def __init__(self, nbdims, shape): self.nbdims = nbdims diff --git a/python/tvm/tpat/cuda/template_params.py b/python/tvm/tpat/cuda/template_params.py index be83887e3d6c..96911ba6126b 100644 --- a/python/tvm/tpat/cuda/template_params.py +++ b/python/tvm/tpat/cuda/template_params.py @@ -15,15 +15,8 @@ # specific language governing permissions and limitations # under the License. 
-import copy -import os import re -import numpy as np -import onnx -import onnx_graphsurgeon as gs -import onnxruntime as ort -from onnx import shape_inference from .type_mapping import plugin_type_size, python_to_trt_type_mapping, tvm_to_c_type_mapping @@ -41,12 +34,16 @@ def __init__(self, kernel, model, graph, tunning_node, name): self._input_dict = {} + self._cuda_source_code = None + self._workspace_size = [] # eid -> workspace size self._workspace_dtype = [] # eid -> workspace dtype self._total_workspace_size = 0 # total workspace size need by plugin # Kernel related params - self._device_function_list = {} # kernel -> index for params of host function + self._device_function_list = ( + {} + ) # kernel -> index for params of host function or address based on address self._device_thread_config = {} # kernel -> thread dim self._device_function_order = [] # kernel invoke order self._device_allocate_memory_size = {} # address -> (dtype, extent) @@ -74,6 +71,7 @@ def __init__(self, kernel, model, graph, tunning_node, name): def _describe(self): """Use for debug.""" + print(f"Cuda source code >>> {self._cuda_source_code}") print(f"Constant params >>> {self._constant_params}") print(f"Device Function List >>> {self._device_function_list}") print(f"Device Thread Config >>> {self._device_thread_config}") @@ -86,7 +84,6 @@ def _describe(self): print(f"Host Function Order >>> {self._host_function_order}") print(f"Storage Id >>> {self._storage_id}") print(f"Device Memory Size >>> {self._device_allocate_memory_size}") - # print(f"Cuda Source Code >>> {self._cuda_source_code}") # Parse Constant. def _parse_constant_params(self, constant_params): @@ -179,6 +176,7 @@ def _parse_host_function_list(self, host_function_list): return host_executor_order, host_func_order def _parse_kernel_params(self): + self._cuda_source_code = self._kernel.cuda_source_code self._constant_params = self._parse_constant_params(self._kernel.constant_params) self._device_function_list = self._parse_device_function_list( self._kernel.device_function_list @@ -198,9 +196,6 @@ def _parse_kernel_params(self): self._kernel.host_function_list ) self._storage_id = self._parse_storageid(self._kernel.storageid) - self._cuda_source_code = self._kernel.cuda_source_code - - self._describe() def _parse_shape_and_type(self): """ diff --git a/python/tvm/tpat/cuda/type_mapping.py b/python/tvm/tpat/cuda/type_mapping.py index d47b46c12860..92ec2a1f7808 100644 --- a/python/tvm/tpat/cuda/type_mapping.py +++ b/python/tvm/tpat/cuda/type_mapping.py @@ -15,45 +15,43 @@ # specific language governing permissions and limitations # under the License. 
-# type mapping : tvm -> c +# type mapping : tvm -> c, used by c++ tvm_to_c_type_mapping = { + "bool": "int", "int16": "int", "int32": "int", "int64": "int", - "float32": "float", - "uint64": "int", - "uint8": "int8", - "uint1": "int", + "uint8": "uchar", "uint32": "int", + "uint64": "int", + "float32": "float", "float64": "float", - "bool": "int", } -# type mapping : python -> trt +# type mapping : python -> trt, used by TensorRT's getOutputDataType python_to_trt_type_mapping = { "bool": "INT32", "int32": "INT32", "int64": "INT32", - "float32": "FLOAT", "uint64": "INT32", "uint8": "INT8", - "uint1": "INT32", + "float32": "FLOAT", "float64": "FLOAT", } -# type size : trt workspace +# type size : trt workspace, sizeof c++ data type plugin_type_size = { + "bool": 4, "int16": 4, "int32": 4, - "float32": 4, "int64": 4, + "uint8": 1, "uint32": 4, "uint64": 4, - "uint8": 1, - "uint1": 1, + "float32": 4, "float64": 4, } -# onnx type -onnx_type_mapping = {"int64": 7, "bool": 9, "uint32": 12, "uint64": 13} -# "int32": 6 \ No newline at end of file +# onnx type, used by CAST operator +# "int32": 6 +onnx_type_mapping = {"int64": 7, "bool": 9, "uint32": 12, "uint64": 13} \ No newline at end of file diff --git a/tests/python/tpat/cuda/common.py b/tests/python/tpat/cuda/common.py index e8453f779e33..58ef60c7ce91 100644 --- a/tests/python/tpat/cuda/common.py +++ b/tests/python/tpat/cuda/common.py @@ -3464,3 +3464,85 @@ def test_scatternd(): # x = tf.scatter_nd(indices, updates, data.shape) _ = tf.identity(x, name="output") verify_tf_with_trt_result([input_data], ["input:0"], ["output:0"], op_name) + +if __name__ == "__main__": + test_abs() + test_acos() + test_and() + test_add() + test_argmax() + test_argmin() + test_asin() + test_asinh() + test_atan() + test_atanh() + test_averagepool() + test_batchnormalization() + test_ceil() + test_celu() + test_clip() + test_concat() + test_conv() + test_convtranspose() + test_cos() + test_cosh() + test_depthtospace() + test_div() + # ------100 limited library + test_einsum() + test_elu() + test_erf() + test_exp() + test_eyelike() + test_floor() + test_gather() + test_gatherelement() + test_gathernd() + test_gemm() + test_globalaveragepool() + test_globalmaxpool() + test_hardsigmoid() + test_hardswish() + test_hardmax() + test_identity() + test_instancenormalization() + test_leakyrelu() + test_log() + test_logsoftmax() + test_matmul() + test_max() + test_maxpool() + test_mean() + test_min() + test_mul() + test_neg() + test_negativeloglikelihoodloss() + # ---------100 limited library + test_prelu() + test_pow() + test_reciprocal() + test_reducel1() + test_reducel2() + test_reducelogsum() + test_reducelogsumexp() + test_reducemax() + test_reducemean() + test_reducesum() + test_maxunpool() + test_forward_one_hot() + test_where() + test_slice() + test_pad() + test_batch_norm() + test_softmax() + test_mod() + test_forward_mean() + test_instance_norm() + test_lrn() + test_binary_ops() + test_all_reduce_funcs() + test_split() + test_xor() + test_if() + test_logical() + test_scatternd() From a2c322badefb84efe842864a54c087aba4bc5c04 Mon Sep 17 00:00:00 2001 From: Civitasv Date: Mon, 14 Aug 2023 17:08:10 +0800 Subject: [PATCH 08/14] [tensorrt] [byoc] [plugin] Make API clearer --- python/tvm/tpat/cuda/template_params.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/python/tvm/tpat/cuda/template_params.py b/python/tvm/tpat/cuda/template_params.py index 96911ba6126b..1f349f04e2a6 100644 --- a/python/tvm/tpat/cuda/template_params.py +++ 
b/python/tvm/tpat/cuda/template_params.py @@ -43,7 +43,7 @@ def __init__(self, kernel, model, graph, tunning_node, name): # Kernel related params self._device_function_list = ( {} - ) # kernel -> index for params of host function or address based on address + ) # kernel -> index for params of host function or address based on workspace self._device_thread_config = {} # kernel -> thread dim self._device_function_order = [] # kernel invoke order self._device_allocate_memory_size = {} # address -> (dtype, extent) @@ -66,8 +66,8 @@ def __init__(self, kernel, model, graph, tunning_node, name): self._parse_shape_and_type() self._parse_kernel_params() - self._parse_device_function_inputs() - self._parse_device_function_config() + self._prepare_input_dict() + self._prepare_device_function_config() def _describe(self): """Use for debug.""" @@ -211,7 +211,7 @@ def _parse_shape_and_type(self): self._output_shape = [oup.shape for oup in tunning_node.outputs] - def _parse_device_function_inputs(self): + def _prepare_input_dict(self): """ The memory address used by functions params. """ @@ -229,7 +229,7 @@ def _parse_device_function_inputs(self): input_slot_dict[sid] = f"outputs[{i}]" # 2. for inputs, including variable and constants - storage_id_to_allocate_size = {} + storage_id_to_allocate_size = {} # different entry id may map to same storage id for eid in range(len(self._workspace_size)): sid = int(self._storage_id[eid]) if sid not in storage_id_to_allocate_size.keys(): @@ -279,7 +279,7 @@ def _parse_device_function_inputs(self): self._total_workspace_size = workspace_size - def _parse_device_function_config(self): + def _prepare_device_function_config(self): """ Grid, Block Layout, etc. """ From 78815800cea0cfafb1a3fce75a977ad4f2da9640 Mon Sep 17 00:00:00 2001 From: Civitasv Date: Mon, 14 Aug 2023 17:10:51 +0800 Subject: [PATCH 09/14] [tensorrt] [byoc] [plugin] change configuration of Makefile. 
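
The hard-coded developer paths are replaced with /path/to placeholders so the Makefile no longer leaks a local environment. Users now have to point the three variables at their own toolchain before building a generated plugin, either by editing the Makefile or by overriding them when invoking make. A hypothetical invocation (the plugin name and paths below are examples only, not defaults shipped with the Makefile):

    make plugin_name=tpat_my_op \
        CUDA_PATH=/usr/local/cuda \
        CUDNN_PATH=/usr/local/cudnn \
        TRT_PATH=/opt/TensorRT-8.6.1.6

Command-line assignments take precedence over the assignments in the Makefile. ARCH stays at sm_86 and should be adjusted to the compute capability of the target GPU.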
--- python/tvm/tpat/cuda/plugin/Makefile | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/python/tvm/tpat/cuda/plugin/Makefile b/python/tvm/tpat/cuda/plugin/Makefile index 3406001e81dc..1aa97fcb7b62 100644 --- a/python/tvm/tpat/cuda/plugin/Makefile +++ b/python/tvm/tpat/cuda/plugin/Makefile @@ -15,12 +15,9 @@ # # Variables need to be defined by Users -# CUDA_PATH = /path/to/cuda -# CUDNN_PATH = /path/to/cudnn -# TRT_PATH = /path/to/TensorRT -CUDA_PATH = /home/huangzhe1/anaconda3/envs/tvm_tunning -CUDNN_PATH = /home/huangzhe1/husen/cudnn-linux-x86_64-8.9.3.28_cuda11-archive -TRT_PATH = /home/huangzhe1/husen/TensorRT-8.6.1.6 +CUDA_PATH = /path/to/cuda +CUDNN_PATH = /path/to/cudnn +TRT_PATH = /path/to/TensorRT ARCH = sm_86 ######################################## From e13a474e6cfb0a856a0c1f452ce677dae7b3ca12 Mon Sep 17 00:00:00 2001 From: Civitasv Date: Tue, 15 Aug 2023 10:38:22 +0800 Subject: [PATCH 10/14] [tensorrt] [byoc] [plugin] anyway, better name is better --- .../cuda/plugin/trt8.0_plugin_cu.template | 6 +- .../tpat/cuda/plugin/trt8.0_plugin_h.template | 10 +- python/tvm/tpat/cuda/template.py | 81 +++---- python/tvm/tpat/cuda/template_params.py | 208 ++++++++++-------- 4 files changed, 159 insertions(+), 146 deletions(-) diff --git a/python/tvm/tpat/cuda/plugin/trt8.0_plugin_cu.template b/python/tvm/tpat/cuda/plugin/trt8.0_plugin_cu.template index 565a72b00e23..48f843f19741 100644 --- a/python/tvm/tpat/cuda/plugin/trt8.0_plugin_cu.template +++ b/python/tvm/tpat/cuda/plugin/trt8.0_plugin_cu.template @@ -33,18 +33,18 @@ void check(T result, char const *const func, const char *const file, #define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__) -{{plugin_kernels_body}} +{{plugin_source_code}} PluginFieldCollection {{plugin_name}}Creator::mFC{}; std::vector {{plugin_name}}Creator::mPluginAttributes; int {{plugin_name}}::enqueue(const nvinfer1::PluginTensorDesc* inputDesc, const nvinfer1::PluginTensorDesc* outputDesc, const void* const* inputs, void* const* outputs, void* workspace, cudaStream_t stream) noexcept { - {% for constant in plugin_constant_init %} + {% for constant in plugin_workspace_constant %} const {{constant.type}} constant_{{constant.index}}[{{constant.length}}] = { {{constant.value}} }; checkCudaErrors(cudaMemcpyAsync({{constant.pos}}, &constant_{{constant.index}}, {{constant.length}} * sizeof({{constant.type}}), cudaMemcpyHostToDevice, stream)); {% endfor %} dim3 dimBlock, dimGrid; - {% for kernel in plugin_kernels_params %} + {% for kernel in plugin_device_function_configuration %} dimGrid = dim3{{kernel.grid_dim}}; dimBlock = dim3{{kernel.block_dim}}; {{kernel.name}}<<>>({{kernel.enqueue_params}}); diff --git a/python/tvm/tpat/cuda/plugin/trt8.0_plugin_h.template b/python/tvm/tpat/cuda/plugin/trt8.0_plugin_h.template index fdc9a0bcbe29..22b3d0a8deb1 100644 --- a/python/tvm/tpat/cuda/plugin/trt8.0_plugin_h.template +++ b/python/tvm/tpat/cuda/plugin/trt8.0_plugin_h.template @@ -28,7 +28,7 @@ namespace plugin class {{plugin_name}}: public IPluginV2DynamicExt { public: {{plugin_name}}() {} - + {{plugin_name}}(const void *buffer, size_t length) { } @@ -36,7 +36,7 @@ public: return 0; } virtual void serialize(void *buffer) const noexcept override {} - + //! The combination of kLINEAR + kFLOAT is supported. 
bool supportsFormatCombination(int pos, const PluginTensorDesc* inOut, int nbInputs, int nbOutputs) noexcept override { @@ -70,7 +70,7 @@ public: } nvinfer1::DataType getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const noexcept override{ //std::cout << __FUNCTION__ << std::endl; - {% for type in plugin_output_type %}if (index == {{ loop.index0 }}){ + {% for type in plugin_output_dtype %}if (index == {{ loop.index0 }}){ return nvinfer1::DataType::k{{type}}; } {% endfor %} @@ -107,13 +107,13 @@ public: obj->setPluginNamespace(mNamespace.c_str()); return obj; } - + const char* getPluginName() const noexcept override {return "{{plugin_name}}";} const char* getPluginVersion() const noexcept override {return "1";} void setPluginNamespace(const char* szNamespace) noexcept override {mNamespace = szNamespace;} const char* getPluginNamespace() const noexcept override {return mNamespace.c_str();} - + const nvinfer1::PluginFieldCollection* getFieldNames() noexcept override { //std::cout << __FUNCTION__ << std::endl; return &mFC; diff --git a/python/tvm/tpat/cuda/template.py b/python/tvm/tpat/cuda/template.py index 5b35ced0cf2b..9621394661ae 100644 --- a/python/tvm/tpat/cuda/template.py +++ b/python/tvm/tpat/cuda/template.py @@ -17,7 +17,6 @@ import contextlib import os -import re from jinja2 import Environment, FileSystemLoader @@ -39,23 +38,20 @@ def __init__(self, template_params): self._template_env = Environment(loader=template_loader) self._plugin_name = template_params.plugin_name - self._plugin_device_function_configuration = template_params.device_function_configuration self._plugin_output_number = template_params.num_outputs - self._plugin_output_type = template_params.output_dtype + self._plugin_output_dtype = template_params.output_dtype self._plugin_workspace_size = template_params.total_workspace_size - self._plugin_kernels_body = template_params.cuda_source_code - - onnx_output_shape = template_params.output_shape - self._plugin_output_shape = self._parse_plugin_output_shape(onnx_output_shape) - - onnx_tensor_type = template_params.tensor_type - self._plugin_tensor_format = self._parse_plugin_tensor_format(onnx_tensor_type) - - kernel_order = template_params.device_function_order - self._plugin_kernels_params = self._parse_plugin_kernels_params(kernel_order) - - workspace_constant = template_params.workspace_constant - self._plugin_constant_init = self._parse_plugin_workspace_constant(workspace_constant) + self._plugin_source_code = template_params.cuda_source_code + self._plugin_output_shape = self._parse_plugin_output_shape(template_params.output_shape) + self._plugin_tensor_format = self._parse_plugin_tensor_format(template_params.tensor_type) + self._plugin_device_function_configuration = ( + self._parse_plugin_device_function_configuration( + template_params.device_function_configuration, template_params.device_function_list + ) + ) + self._plugin_workspace_constant = self._parse_plugin_workspace_constant( + template_params.workspace_constant + ) class TensorDims: def __init__(self, nbdims, shape): @@ -134,47 +130,42 @@ def __init__(self, size, dtype): self.size = size self.dtype = dtype - def _parse_plugin_input_shape(self, onnx_input_shape): - plugin_input_shape = [] - for s in onnx_input_shape: - nbdims = len(s) - shape = s - plugin_input_shape.append(self.TensorDims(nbdims, shape)) - return plugin_input_shape - - def _parse_plugin_output_shape(self, onnx_output_shape): + def _parse_plugin_output_shape(self, output_shape): plugin_output_shape = 
[] - for s in onnx_output_shape: + for s in output_shape: nbdims = len(s) shape = s plugin_output_shape.append(self.TensorDims(nbdims, shape)) return plugin_output_shape - def _parse_plugin_tensor_format(self, onnx_tensor_type): + def _parse_plugin_tensor_format(self, tensor_type): plugin_tensor_format = [] - for dtype in onnx_tensor_type: + for dtype in tensor_type: plugin_tensor_format.append(self.TensorFormat("LINEAR", dtype)) return plugin_tensor_format - def _parse_plugin_kernels_params(self, kernel_order): - kernel_call = {} - plugin_kernels_params = [] - for func_name in kernel_order: - if func_name not in kernel_call.keys(): - kernel_call[func_name] = 0 + def _parse_plugin_device_function_configuration( + self, device_function_configuration, device_function_list + ): + frequency = {} + kernel_configuration = [] + for func_name in device_function_list: + if func_name not in frequency.keys(): + frequency[func_name] = 0 key_name = func_name else: - kernel_call[func_name] += 1 - key_name = func_name + "_" + str(kernel_call[func_name]) - plugin_kernels_params.append( + frequency[func_name] += 1 + key_name = f"{func_name}_{frequency[func_name]}" + + kernel_configuration.append( self.Kernel( func_name, - self._plugin_device_function_configuration[key_name]["grid_dim"], - self._plugin_device_function_configuration[key_name]["block_dim"], - self._plugin_device_function_configuration[key_name]["enqueue_params"], + device_function_configuration[key_name]["grid_dim"], + device_function_configuration[key_name]["block_dim"], + device_function_configuration[key_name]["enqueue_params"], ) ) - return plugin_kernels_params + return kernel_configuration def _parse_plugin_workspace_constant(self, workspace_constant): plugin_constant_init = [] @@ -245,7 +236,7 @@ def generate_header_file(self): plugin_name=self._plugin_name, plugin_output_number=self._plugin_output_number, plugin_output_shape=self._plugin_output_shape, - plugin_output_type=self._plugin_output_type, + plugin_output_dtype=self._plugin_output_dtype, plugin_workspace_size=self._plugin_workspace_size, plugin_tensor_format=self._plugin_tensor_format, ) @@ -256,9 +247,9 @@ def generate_source_file(self): template = self._template_env.get_template(self._template_source_file) output_text = template.render( plugin_name=self._plugin_name, - plugin_kernels_params=self._plugin_kernels_params, - plugin_kernels_body=self._plugin_kernels_body, - plugin_constant_init=self._plugin_constant_init, + plugin_device_function_configuration=self._plugin_device_function_configuration, + plugin_source_code=self._plugin_source_code, + plugin_workspace_constant=self._plugin_workspace_constant, ) with open("./plugin/src/{}.cu".format(self._plugin_name), "w") as f: f.write(output_text) diff --git a/python/tvm/tpat/cuda/template_params.py b/python/tvm/tpat/cuda/template_params.py index 1f349f04e2a6..89771fd9a304 100644 --- a/python/tvm/tpat/cuda/template_params.py +++ b/python/tvm/tpat/cuda/template_params.py @@ -41,23 +41,22 @@ def __init__(self, kernel, model, graph, tunning_node, name): self._total_workspace_size = 0 # total workspace size need by plugin # Kernel related params - self._device_function_list = ( + self._device_function_params = ( {} ) # kernel -> index for params of host function or address based on workspace self._device_thread_config = {} # kernel -> thread dim - self._device_function_order = [] # kernel invoke order - self._device_allocate_memory_size = {} # address -> (dtype, extent) + self._device_function_list = [] # kernel invoke order + 
self._device_allocate_memory_size = {} # address -> (dtype, extent), intermediate variable # Host side function attrs - self._host_function_list = {} # function -> eid of params (firstly inputs, then outputs) - self._host_function_order = [] # host function order + self._host_function_params = {} # function -> eid of params (firstly inputs, then outputs) self._nums_inputs = 0 # number of inputs self._nums_outputs = 0 # number of outputs self._output_dtype = [] # dtype of outputs self._output_shape = [] # shape of outputs self._constant_params = {} # constant params, storage_id -> data - self._tvm_workspace_constant = {} + self._trt_workspace_constant = {} self._tensor_type = [] # tensor type of inputs and outputs @@ -73,15 +72,14 @@ def _describe(self): """Use for debug.""" print(f"Cuda source code >>> {self._cuda_source_code}") print(f"Constant params >>> {self._constant_params}") - print(f"Device Function List >>> {self._device_function_list}") + print(f"Device Function Param >>> {self._device_function_params}") print(f"Device Thread Config >>> {self._device_thread_config}") - print(f"Device Function Order >>> {self._device_function_order}") + print(f"Device Function List >>> {self._device_function_list}") print(f"Nums Input >>> {self._nums_inputs}") print(f"Nums Output >>> {self._nums_outputs}") print(f"Workspace Data Type >>> {self._workspace_dtype}") print(f"Workspace Size >>> {self._workspace_size}") - print(f"Host Function List >>> {self._host_function_list}") - print(f"Host Function Order >>> {self._host_function_order}") + print(f"Host Function Params >>> {self._host_function_params}") print(f"Storage Id >>> {self._storage_id}") print(f"Device Memory Size >>> {self._device_allocate_memory_size}") @@ -92,29 +90,56 @@ def _parse_constant_params(self, constant_params): tvm_constant[key] = value.flatten() return tvm_constant - # Parse device functions params order. def _parse_device_function_list(self, device_function_list): - _device_function_list = {} + function_list = [] + for item in device_function_list.split("\n"): + if len(item) == 0: + continue + item = item.split() + + function_list.append(item[0]) + + return function_list + + # Parse device functions params order. + def _parse_device_function_params(self, device_function_list): + frequency = {} + result = {} for device_function in device_function_list.split("\n"): if len(device_function) == 0: continue item = device_function.split() + name = item[0] + params = item[1:] - _device_function_list[item[0]] = item[1:] - return _device_function_list + if name not in result.keys(): + result[name] = params + frequency[name] = 0 + else: + frequency[name] += 1 + func_name = f"{name}_{frequency[name]}" + result[func_name] = params + return result # Parse device functions thread config. def _parse_device_function_thread_config(self, device_function_thread_config): + frequency = {} kernel_thread_config = {} - kernel_order = [] for item in device_function_thread_config.split("\n"): if len(item) == 0: continue config = item.split() kernel_name = config[0] - kernel_thread_config[kernel_name] = config[1:] - kernel_order.append(kernel_name) - return kernel_thread_config, kernel_order + params = config[1:] + + if kernel_name not in kernel_thread_config.keys(): + kernel_thread_config[kernel_name] = params + frequency[kernel_name] = 0 + else: + frequency[kernel_name] += 1 + func_name = f"{kernel_name}_{frequency[kernel_name]}" + kernel_thread_config[func_name] = params + return kernel_thread_config # Parse global memory allocated in device side. 
def _parse_device_allocate_memory_size(self, device_allocate_global_memory): @@ -153,38 +178,40 @@ def _parse_workspace_dtype(self, workspaces_dtype): def _parse_workspace_size(self, workspace_size): return workspace_size.split() - def _parse_host_function_list(self, host_function_list): + def _parse_host_function_params(self, host_function_list): """ Parse the list of host functions. """ - func_call = {} - host_executor_order = {} - host_func_order = [] - for host_func_inorder in host_function_list.split("\n"): - if len(host_func_inorder) == 0: + frequency = {} + result = {} + for function in host_function_list.split("\n"): + if len(function) == 0: continue - tvm_host_func = host_func_inorder.split() - if tvm_host_func[0] not in host_executor_order.keys(): - host_executor_order[tvm_host_func[0]] = tvm_host_func[1:] - host_func_order.append(tvm_host_func[0]) - func_call[tvm_host_func[0]] = 0 + data = function.split() + name = data[0] + params = data[1:] + + if name not in result.keys(): + result[name] = params + frequency[name] = 0 else: - func_call[tvm_host_func[0]] += 1 - func_name = tvm_host_func[0] + "_" + str(func_call[tvm_host_func[0]]) - host_executor_order[func_name] = tvm_host_func[1:] - host_func_order.append(func_name) - return host_executor_order, host_func_order + frequency[name] += 1 + func_name = f"{name}_{frequency[name]}" + result[func_name] = params + return result def _parse_kernel_params(self): self._cuda_source_code = self._kernel.cuda_source_code self._constant_params = self._parse_constant_params(self._kernel.constant_params) + self._device_function_params = self._parse_device_function_params( + self._kernel.device_function_list + ) self._device_function_list = self._parse_device_function_list( self._kernel.device_function_list ) - ( - self._device_thread_config, - self._device_function_order, - ) = self._parse_device_function_thread_config(self._kernel.device_function_thread_config) + self._device_thread_config = self._parse_device_function_thread_config( + self._kernel.device_function_thread_config + ) self._device_allocate_memory_size = self._parse_device_allocate_memory_size( self._kernel.device_allocate_memory_size ) @@ -192,11 +219,13 @@ def _parse_kernel_params(self): self._nums_outputs = self._parse_nums_output(self._kernel.num_outputs) self._workspace_dtype = self._parse_workspace_dtype(self._kernel.workspace_dtype) self._workspace_size = self._parse_workspace_size(self._kernel.workspace_size) - self._host_function_list, self._host_function_order = self._parse_host_function_list( + self._host_function_params = self._parse_host_function_params( self._kernel.host_function_list ) self._storage_id = self._parse_storageid(self._kernel.storageid) + self._describe() + def _parse_shape_and_type(self): """ Infer for input and output shape. @@ -229,13 +258,13 @@ def _prepare_input_dict(self): input_slot_dict[sid] = f"outputs[{i}]" # 2. 
for inputs, including variable and constants - storage_id_to_allocate_size = {} # different entry id may map to same storage id + storage_id_to_workspace_size = {} # different entry id may map to same storage id for eid in range(len(self._workspace_size)): sid = int(self._storage_id[eid]) - if sid not in storage_id_to_allocate_size.keys(): - storage_id_to_allocate_size[sid] = 0 - storage_id_to_allocate_size[sid] = max( - int(self._workspace_size[eid]), int(storage_id_to_allocate_size[sid]) + if sid not in storage_id_to_workspace_size.keys(): + storage_id_to_workspace_size[sid] = 0 + storage_id_to_workspace_size[sid] = max( + int(self._workspace_size[eid]), int(storage_id_to_workspace_size[sid]) ) for eid in range(len(self._workspace_size)): @@ -253,16 +282,17 @@ def _prepare_input_dict(self): self._input_dict[str(eid)] = "workspace" else: self._input_dict[str(eid)] = f"(workspace + {workspace_size})" - workspace_size += int(storage_id_to_allocate_size[sid]) + workspace_size += int(storage_id_to_workspace_size[sid]) + key = self._input_dict[str(eid)] if ( - self._input_dict[str(eid)] not in self._tvm_workspace_constant.keys() + not key in self._trt_workspace_constant.keys() and str(sid) in self._constant_params.keys() ): - self._tvm_workspace_constant[self._input_dict[str(eid)]] = ( - self._constant_params[str(sid)], - tvm_to_c_type_mapping[self._workspace_dtype[eid]], - int(eid), + self._trt_workspace_constant[key] = ( + self._constant_params[str(sid)], # value + tvm_to_c_type_mapping[self._workspace_dtype[eid]], # type + int(eid), # id ) input_slot_dict[sid] = self._input_dict[str(eid)] @@ -283,63 +313,55 @@ def _prepare_device_function_config(self): """ Grid, Block Layout, etc. """ - output_json = {} - kernel_call_times = {} - for i in range(len(self._device_function_order)): - device_funtion_name = self._device_function_order[i] - host_function_name = re.sub(r"_kernel_?\d*", "", device_funtion_name, count=1) - - if device_funtion_name not in output_json.keys(): - output_json[device_funtion_name] = {} - kernel_call_times[device_funtion_name] = 0 - unique_device_function_name = device_funtion_name + configuration = {} + frequency = {} + + for i in range(len(self._device_function_list)): + device_function_name = self._device_function_list[i] + host_function_name = re.sub(r"_kernel_?\d*", "", device_function_name, count=1) + + if device_function_name not in configuration.keys(): + configuration[device_function_name] = {} + frequency[device_function_name] = 0 else: - kernel_call_times[device_funtion_name] += 1 - host_function_name = ( - host_function_name + "_" + str(kernel_call_times[device_funtion_name]) - ) - unique_device_function_name = ( - device_funtion_name + "_" + str(kernel_call_times[device_funtion_name]) - ) - output_json[unique_device_function_name] = {} + frequency[device_function_name] += 1 + host_function_name = f"{host_function_name}_{frequency[device_function_name]}" + device_function_name = f"{device_function_name}_{frequency[device_function_name]}" + configuration[device_function_name] = {} # grid and block dim - output_json[unique_device_function_name]["grid_dim"] = self._device_thread_config[ - device_funtion_name + configuration[device_function_name]["grid_dim"] = self._device_thread_config[ + device_function_name ][0].strip("grid=") - output_json[unique_device_function_name]["block_dim"] = self._device_thread_config[ - device_funtion_name + configuration[device_function_name]["block_dim"] = self._device_thread_config[ + device_function_name ][1].strip("block=") - 
device_param_order = self._device_function_list[device_funtion_name] - host_param_order = self._host_function_list[host_function_name] # eid + device_params = self._device_function_params[device_function_name] + host_params = self._host_function_params[host_function_name] # eid of params enqueue_params = "" - for j in range(len(device_param_order)): - if device_param_order[j].isdigit(): - eid = host_param_order[int(device_param_order[j])] + for j in range(len(device_params)): + if device_params[j].isdigit(): # correspond to eid + eid = host_params[int(device_params[j])] + dtype = self._workspace_dtype[int(eid)] enqueue_params += ( - "(" - + tvm_to_c_type_mapping[self._workspace_dtype[int(eid)]] - + "*)" - + self._input_dict[str(eid)] + "(" + tvm_to_c_type_mapping[dtype] + "*)" + self._input_dict[str(eid)] ) else: - if device_param_order[j] in self._input_dict.keys(): - enqueue_params += self._input_dict[device_param_order[j]] + if ( + device_params[j] in self._input_dict.keys() + ): # correspond to device memory, intermediate variable + enqueue_params += self._input_dict[device_params[j]] - if j != len(device_param_order) - 1: + if j != len(device_params) - 1: enqueue_params += ", " - output_json[unique_device_function_name]["enqueue_params"] = enqueue_params - self._device_function_configuration = output_json - - @property - def host_func_order(self): - return self._host_function_order + configuration[device_function_name]["enqueue_params"] = enqueue_params + self._device_function_configuration = configuration @property - def device_function_order(self): - return self._device_function_order + def device_function_list(self): + return self._device_function_list @property def device_function_configuration(self): @@ -367,7 +389,7 @@ def tensor_type(self): @property def workspace_constant(self): - return self._tvm_workspace_constant + return self._trt_workspace_constant @property def cuda_source_code(self): From f0248b4fc1ced0c8caa2e118d5db3ab4d23eed8f Mon Sep 17 00:00:00 2001 From: Civitasv Date: Wed, 16 Aug 2023 18:31:08 +0800 Subject: [PATCH 11/14] [tensorrt] [plugin] [byoc] fix resolving device function order --- python/tvm/tpat/cuda/kernel.py | 15 ++++++--- python/tvm/tpat/cuda/pipeline.py | 45 ++++++++++++++++--------- python/tvm/tpat/cuda/rewrite.py | 9 +++-- python/tvm/tpat/cuda/template_params.py | 6 ++-- 4 files changed, 49 insertions(+), 26 deletions(-) diff --git a/python/tvm/tpat/cuda/kernel.py b/python/tvm/tpat/cuda/kernel.py index e0ffc35b7595..b0f3d4f6c6af 100644 --- a/python/tvm/tpat/cuda/kernel.py +++ b/python/tvm/tpat/cuda/kernel.py @@ -23,11 +23,17 @@ class Config(object): - def __init__(self, onnx_model, input_shapes, target, tunning_option) -> None: + def __init__(self, name, onnx_model, input_shapes, target, tunning_option) -> None: + self.name = name self.onnx_model = onnx_model self.input_shapes = input_shapes self.tunning_option = tunning_option - self.work_dir = tunning_option["work_dir"] if tunning_option["work_dir"] else "./log_db" + self.work_dir = ( + f"{tunning_option['work_dir']}/{name}" + if tunning_option["work_dir"] + else f"./log_db/{name}" + ) + print("WORK DIR:::", self.work_dir) if target == "gpu": self.target = self._detect_cuda_target() @@ -39,10 +45,11 @@ def _tune_option(self): "runner": ms.runner.LocalRunner(), "max_trials_global": 1000, "max_trials_per_task": 100, - "work_dir": self.work_dir, } default.update(self.tunning_option) + default["work_dir"] = self.work_dir + return default def _detect_cuda_target(self): @@ -66,7 +73,7 @@ class 
Kernel(object): def __init__(self, name, onnx_model, input_shapes, enable_tunning, tunning_option): self._name = name self._enable_tunning = enable_tunning - self._config = Config(onnx_model, input_shapes, "gpu", tunning_option) + self._config = Config(name, onnx_model, input_shapes, "gpu", tunning_option) self._lib = None self._module = None diff --git a/python/tvm/tpat/cuda/pipeline.py b/python/tvm/tpat/cuda/pipeline.py index bdd441dae8b1..a4281e9737b5 100644 --- a/python/tvm/tpat/cuda/pipeline.py +++ b/python/tvm/tpat/cuda/pipeline.py @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. +import os from typing import Tuple import onnx @@ -55,7 +56,11 @@ def _extract_target_onnx_node(model, tunning_node): def pipeline( - onnx_file: str, node_names: list[str], enable_tunning: bool, tunning_option: object, output_onnx: str + onnx_file: str, + node_names: list[str], + enable_tunning: bool, + tunning_option: object, + output_onnx: str, ) -> Tuple[str, list[str]]: """Generate plugins for specified nodes in an ONNX model. @@ -80,9 +85,16 @@ def pipeline( A tuple containing the path to the output ONNX file and a list of generated plugin paths. """ - # 1. load onnx - onnx_model = onnx.load(onnx_file) - inferred_model = shape_inference.infer_shapes(onnx_model) + # 1. load onnx and inference shapes + try: + onnx_model = onnx.load(onnx_file) + inferred_model = shape_inference.infer_shapes(onnx_model) + except: + dummy_file = "tensor_shape_inference.onnx" + shape_inference.infer_shapes_path(onnx_file, output_path=dummy_file) + inferred_model = onnx.load(dummy_file) + os.remove(dummy_file) + graph = gs.import_onnx(inferred_model) # 2. retrieve all node which need to transform to plugins @@ -103,22 +115,23 @@ def pipeline( subgraph, submodel, shapes = _extract_target_onnx_node(inferred_model, node) - kernel = Kernel(plugin_name, submodel, shapes, enable_tunning, tunning_option) - kernel.run() - if not kernel.cuda_source_code: - print(f"Skip {name}, because cuda source code is None") - continue + try: + kernel = Kernel(plugin_name, submodel, shapes, enable_tunning, tunning_option) + kernel.run() - ## 3.1 fill in template - params = PluginTemplateParams(kernel, submodel, subgraph, node, name) - template = StaticBatchPluginTemplate(params) - lib = template.fill() + ## 3.1 fill in template + params = PluginTemplateParams(kernel, submodel, subgraph, node, name) + template = StaticBatchPluginTemplate(params) + lib = template.fill() - plugin_path.append(lib) + plugin_path.append(lib) - node_name_to_plugin_name[name] = plugin_name + node_name_to_plugin_name[name] = plugin_name + except Exception as e: + print(f"Skip {name}, ERROR:: {e}") + continue # 4. 
generate the modified onnx - rewrite(inferred_model, node_to_be_tunned, node_name_to_plugin_name, output_onnx) + rewrite(graph, node_to_be_tunned, node_name_to_plugin_name, output_onnx) return output_onnx, plugin_path diff --git a/python/tvm/tpat/cuda/rewrite.py b/python/tvm/tpat/cuda/rewrite.py index c071f7662620..f505e769753d 100644 --- a/python/tvm/tpat/cuda/rewrite.py +++ b/python/tvm/tpat/cuda/rewrite.py @@ -71,7 +71,11 @@ def _handle_trt_not_support_type( assert count == len(node_name_to_plugin_name) if insert_cast_nodes: _remove_unnecessary_cast_nodes(graph) - onnx.save(gs.export_onnx(graph), output_model_path) + + try: + onnx.save(gs.export_onnx(graph), output_model_path) + except: + onnx.save(gs.export_onnx(graph), output_model_path, save_as_external_data=True) def _remove_unnecessary_cast_nodes(graph): @@ -110,7 +114,7 @@ def _compute_tensor_type(graph, tunning_nodes): def rewrite( - inferred_model, + graph, tunning_nodes, node_name_to_plugin_name, output_model_path, @@ -120,7 +124,6 @@ def rewrite( Modify operator type in onnx model for tensorRT can run plugin. """ - graph = gs.import_onnx(inferred_model) _onnx_original_tensor_type = _compute_tensor_type(graph, tunning_nodes) _handle_trt_not_support_type( diff --git a/python/tvm/tpat/cuda/template_params.py b/python/tvm/tpat/cuda/template_params.py index 89771fd9a304..efb2c2a0e6c7 100644 --- a/python/tvm/tpat/cuda/template_params.py +++ b/python/tvm/tpat/cuda/template_params.py @@ -90,9 +90,9 @@ def _parse_constant_params(self, constant_params): tvm_constant[key] = value.flatten() return tvm_constant - def _parse_device_function_list(self, device_function_list): + def _parse_device_function_list(self, device_function_thread_config): function_list = [] - for item in device_function_list.split("\n"): + for item in device_function_thread_config.split("\n"): if len(item) == 0: continue item = item.split() @@ -207,7 +207,7 @@ def _parse_kernel_params(self): self._kernel.device_function_list ) self._device_function_list = self._parse_device_function_list( - self._kernel.device_function_list + self._kernel.device_function_thread_config ) self._device_thread_config = self._parse_device_function_thread_config( self._kernel.device_function_thread_config From d6b1cdb21e31b00d8889570a54197b0352b08223 Mon Sep 17 00:00:00 2001 From: Civitasv Date: Thu, 17 Aug 2023 14:32:58 +0800 Subject: [PATCH 12/14] [tensorrt] [byoc] [plugin] enhance type inference using ort --- python/tvm/tpat/cuda/kernel.py | 1 - .../tpat/cuda/{rewrite.py => onnx_util.py} | 21 +++- python/tvm/tpat/cuda/pipeline.py | 95 ++++++++++++++----- python/tvm/tpat/cuda/template.py | 27 ++++-- python/tvm/tpat/cuda/template_params.py | 11 +-- python/tvm/tpat/cuda/type_mapping.py | 3 + 6 files changed, 118 insertions(+), 40 deletions(-) rename python/tvm/tpat/cuda/{rewrite.py => onnx_util.py} (90%) diff --git a/python/tvm/tpat/cuda/kernel.py b/python/tvm/tpat/cuda/kernel.py index b0f3d4f6c6af..a1a6c57f57ad 100644 --- a/python/tvm/tpat/cuda/kernel.py +++ b/python/tvm/tpat/cuda/kernel.py @@ -33,7 +33,6 @@ def __init__(self, name, onnx_model, input_shapes, target, tunning_option) -> No if tunning_option["work_dir"] else f"./log_db/{name}" ) - print("WORK DIR:::", self.work_dir) if target == "gpu": self.target = self._detect_cuda_target() diff --git a/python/tvm/tpat/cuda/rewrite.py b/python/tvm/tpat/cuda/onnx_util.py similarity index 90% rename from python/tvm/tpat/cuda/rewrite.py rename to python/tvm/tpat/cuda/onnx_util.py index f505e769753d..2c2fa5b702f2 100644 --- 
a/python/tvm/tpat/cuda/rewrite.py +++ b/python/tvm/tpat/cuda/onnx_util.py @@ -15,11 +15,28 @@ # specific language governing permissions and limitations # under the License. +import os + import onnx import onnx_graphsurgeon as gs +from onnx import shape_inference + from .type_mapping import onnx_type_mapping +def load_model(onnx_file): + try: + onnx_model = onnx.load(onnx_file) + inferred_model = shape_inference.infer_shapes(onnx_model) + except: + dummy_file = "tensor_shape_inference.onnx" + shape_inference.infer_shapes_path(onnx_file, output_path=dummy_file) + inferred_model = onnx.load(dummy_file) + os.remove(dummy_file) + + return inferred_model + + def _handle_trt_not_support_type( graph, output_model_path, @@ -114,7 +131,7 @@ def _compute_tensor_type(graph, tunning_nodes): def rewrite( - graph, + model, tunning_nodes, node_name_to_plugin_name, output_model_path, @@ -124,6 +141,8 @@ def rewrite( Modify operator type in onnx model for tensorRT can run plugin. """ + graph = gs.import_onnx(model) + _onnx_original_tensor_type = _compute_tensor_type(graph, tunning_nodes) _handle_trt_not_support_type( diff --git a/python/tvm/tpat/cuda/pipeline.py b/python/tvm/tpat/cuda/pipeline.py index a4281e9737b5..45ca7747d9e4 100644 --- a/python/tvm/tpat/cuda/pipeline.py +++ b/python/tvm/tpat/cuda/pipeline.py @@ -15,25 +15,59 @@ # specific language governing permissions and limitations # under the License. +import gc import os from typing import Tuple +import numpy as np import onnx import onnx_graphsurgeon as gs -from onnx import shape_inference +import onnxruntime as ort from tvm.tpat.cuda.kernel import Kernel from tvm.tpat.cuda.template import StaticBatchPluginTemplate from tvm.tpat.cuda.template_params import PluginTemplateParams -from .rewrite import rewrite +from tvm.tpat.cuda.onnx_util import rewrite, load_model + + +def _enhance_onnx_shape(graph, inputs, outputs): + graph.outputs = [] + graph.outputs.extend(inputs) + graph.outputs.extend(outputs) + + graph.cleanup() + + half_model = gs.export_onnx(graph) + half_model_path = "half_model.onnx" + onnx.save(half_model, half_model_path) + + EP_list = ["CPUExecutionProvider", "CUDAExecutionProvider"] + session = ort.InferenceSession(half_model_path, providers=EP_list) + outname = [output.name for output in session.get_outputs()] + dummy_input = {} + for gi in graph.inputs: + dummy_input[gi.name] = (1 + np.random.random([int(i) for i in gi.shape])).astype(gi.dtype) + dummy_output = session.run(outname, dummy_input) + + tensor_shapes = [] + for i in range(len(inputs)): + assert inputs[i].name == outname[i] + tensor_shapes.append(dummy_output[i].shape) + for i in range(len(outputs)): + assert outputs[i].name == outname[len(inputs) + i] + tensor_shapes.append(dummy_output[len(inputs) + i].shape) + os.remove(half_model_path) + return tensor_shapes def _extract_target_onnx_node(model, tunning_node): """ Extract target node from onnx graph """ + graph = gs.import_onnx(model) + tensors = graph.tensors() subgraph_inputs = [ @@ -45,14 +79,39 @@ def _extract_target_onnx_node(model, tunning_node): tensors[oup.name].to_variable(dtype=oup.dtype, shape=oup.shape) for oup in tunning_node.outputs ] + + computed_tensor_shapes = _enhance_onnx_shape(graph, subgraph_inputs, subgraph_outputs) + + for i in range(len(subgraph_inputs)): + subgraph_inputs[i].shape = computed_tensor_shapes[i] + for i in range(len(subgraph_outputs)): + subgraph_outputs[i].shape = computed_tensor_shapes[len(subgraph_inputs) + i] + input_shapes = [(inp.name, inp.shape, inp.dtype.name) for inp in 
subgraph_inputs] + output_shapes = [oup.shape for oup in subgraph_outputs] graph.inputs = subgraph_inputs graph.outputs = subgraph_outputs graph.cleanup() submodel = gs.export_onnx(graph) - return graph, submodel, input_shapes + return submodel, input_shapes, output_shapes + + +def _get_node_to_be_tunned(model, node_names): + graph = gs.import_onnx(model) + + # 2. retrieve all node which need to transform to plugins + if node_names is None or len(node_names) == 0: + return [] + + node_to_be_tunned = [node for node in graph.nodes if node.name in node_names] + + del graph + del model + gc.collect() + + return node_to_be_tunned def pipeline( @@ -86,22 +145,10 @@ def pipeline( """ # 1. load onnx and inference shapes - try: - onnx_model = onnx.load(onnx_file) - inferred_model = shape_inference.infer_shapes(onnx_model) - except: - dummy_file = "tensor_shape_inference.onnx" - shape_inference.infer_shapes_path(onnx_file, output_path=dummy_file) - inferred_model = onnx.load(dummy_file) - os.remove(dummy_file) - - graph = gs.import_onnx(inferred_model) + model = load_model(onnx_file) # 2. retrieve all node which need to transform to plugins - if node_names is None or len(node_names) == 0: - return - - node_to_be_tunned = [node for node in graph.nodes if node.name in node_names] + node_to_be_tunned = _get_node_to_be_tunned(model, node_names) assert len(node_to_be_tunned) > 0, "The number of nodes to be tunned should larger than zero" @@ -113,25 +160,25 @@ def pipeline( print(f"Processing ---- {name}") plugin_name = "tpat_{}".format(name.replace("/", "_").replace(".", "_")) - subgraph, submodel, shapes = _extract_target_onnx_node(inferred_model, node) + submodel, input_shapes, output_shapes = _extract_target_onnx_node(model, node) try: - kernel = Kernel(plugin_name, submodel, shapes, enable_tunning, tunning_option) + kernel = Kernel(plugin_name, submodel, input_shapes, enable_tunning, tunning_option) kernel.run() ## 3.1 fill in template - params = PluginTemplateParams(kernel, submodel, subgraph, node, name) + params = PluginTemplateParams(kernel, submodel, output_shapes, node, name) template = StaticBatchPluginTemplate(params) lib = template.fill() - plugin_path.append(lib) - - node_name_to_plugin_name[name] = plugin_name + if lib: + plugin_path.append(lib) + node_name_to_plugin_name[name] = plugin_name except Exception as e: print(f"Skip {name}, ERROR:: {e}") continue # 4. 
generate the modified onnx - rewrite(graph, node_to_be_tunned, node_name_to_plugin_name, output_onnx) + rewrite(model, node_to_be_tunned, node_name_to_plugin_name, output_onnx) return output_onnx, plugin_path diff --git a/python/tvm/tpat/cuda/template.py b/python/tvm/tpat/cuda/template.py index 9621394661ae..4e3fd66e8c14 100644 --- a/python/tvm/tpat/cuda/template.py +++ b/python/tvm/tpat/cuda/template.py @@ -42,11 +42,16 @@ def __init__(self, template_params): self._plugin_output_dtype = template_params.output_dtype self._plugin_workspace_size = template_params.total_workspace_size self._plugin_source_code = template_params.cuda_source_code - self._plugin_output_shape = self._parse_plugin_output_shape(template_params.output_shape) - self._plugin_tensor_format = self._parse_plugin_tensor_format(template_params.tensor_type) + self._plugin_output_shape = self._parse_plugin_output_shape( + template_params.output_shape + ) + self._plugin_tensor_format = self._parse_plugin_tensor_format( + template_params.tensor_type + ) self._plugin_device_function_configuration = ( self._parse_plugin_device_function_configuration( - template_params.device_function_configuration, template_params.device_function_list + template_params.device_function_configuration, + template_params.device_function_list, ) ) self._plugin_workspace_constant = self._parse_plugin_workspace_constant( @@ -122,8 +127,12 @@ def __init__( ): self.batch_size = batch_size self.plugin_template = plugin_template - self.dy_plugin_input_size_type_without_bs = dy_plugin_input_size_type_without_bs - self.dy_plugin_output_size_type_without_bs = dy_plugin_output_size_type_without_bs + self.dy_plugin_input_size_type_without_bs = ( + dy_plugin_input_size_type_without_bs + ) + self.dy_plugin_output_size_type_without_bs = ( + dy_plugin_output_size_type_without_bs + ) class Shape: def __init__(self, size, dtype): @@ -200,9 +209,12 @@ def fill(self): with pushd(os.path.normpath(os.path.dirname(__file__))): self.generate_header_file() self.generate_source_file() - self._build_plugin() + result = self._build_plugin() - return f"{os.path.dirname(os.path.abspath(__file__))}/plugin/lib/{self._plugin_name}.so" + if result: + return f"{os.path.dirname(os.path.abspath(__file__))}/plugin/lib/{self._plugin_name}.so" + else: + return False def _build_plugin(self): os.chdir("./plugin") @@ -211,6 +223,7 @@ def _build_plugin(self): os.system(f"make plugin_name={self._plugin_name}") os.chdir("../") + return True class StaticBatchPluginTemplate(PluginTemplate): diff --git a/python/tvm/tpat/cuda/template_params.py b/python/tvm/tpat/cuda/template_params.py index efb2c2a0e6c7..c03f9d83a9dd 100644 --- a/python/tvm/tpat/cuda/template_params.py +++ b/python/tvm/tpat/cuda/template_params.py @@ -25,10 +25,9 @@ class PluginTemplateParams(object): Generate useable params for TensorRT plugin. 
""" - def __init__(self, kernel, model, graph, tunning_node, name): + def __init__(self, kernel, model, output_shapes, tunning_node, name): self._kernel = kernel self._model = model - self._graph = graph self._tunning_name = name self._tunning_node = tunning_node @@ -54,7 +53,7 @@ def __init__(self, kernel, model, graph, tunning_node, name): self._nums_inputs = 0 # number of inputs self._nums_outputs = 0 # number of outputs self._output_dtype = [] # dtype of outputs - self._output_shape = [] # shape of outputs + self._output_shape = output_shapes # shape of outputs self._constant_params = {} # constant params, storage_id -> data self._trt_workspace_constant = {} @@ -63,7 +62,7 @@ def __init__(self, kernel, model, graph, tunning_node, name): self._storage_id = [] # eid -> storage id self._device_function_configuration = None - self._parse_shape_and_type() + self._parse_tensor_type() self._parse_kernel_params() self._prepare_input_dict() self._prepare_device_function_config() @@ -226,7 +225,7 @@ def _parse_kernel_params(self): self._describe() - def _parse_shape_and_type(self): + def _parse_tensor_type(self): """ Infer for input and output shape. """ @@ -238,8 +237,6 @@ def _parse_shape_and_type(self): for oup in tunning_node.outputs: self._tensor_type.append(python_to_trt_type_mapping[oup.dtype.name]) - self._output_shape = [oup.shape for oup in tunning_node.outputs] - def _prepare_input_dict(self): """ The memory address used by functions params. diff --git a/python/tvm/tpat/cuda/type_mapping.py b/python/tvm/tpat/cuda/type_mapping.py index 92ec2a1f7808..492d36930982 100644 --- a/python/tvm/tpat/cuda/type_mapping.py +++ b/python/tvm/tpat/cuda/type_mapping.py @@ -24,6 +24,7 @@ "uint8": "uchar", "uint32": "int", "uint64": "int", + "float16": "half", "float32": "float", "float64": "float", } @@ -35,6 +36,7 @@ "int64": "INT32", "uint64": "INT32", "uint8": "INT8", + "float16": "FLOAT", "float32": "FLOAT", "float64": "FLOAT", } @@ -48,6 +50,7 @@ "uint8": 1, "uint32": 4, "uint64": 4, + "float16": 4, "float32": 4, "float64": 4, } From 493142dbe74939d42c2604f519e27b5313574556 Mon Sep 17 00:00:00 2001 From: Civitasv Date: Mon, 21 Aug 2023 10:51:04 +0800 Subject: [PATCH 13/14] [tensorrt] [byoc] [plugin] allows save external data --- python/tvm/tpat/cuda/kernel.py | 4 +++- python/tvm/tpat/cuda/onnx_util.py | 9 +++++++-- python/tvm/tpat/cuda/pipeline.py | 7 +++++-- 3 files changed, 15 insertions(+), 5 deletions(-) diff --git a/python/tvm/tpat/cuda/kernel.py b/python/tvm/tpat/cuda/kernel.py index a1a6c57f57ad..80877d4892e9 100644 --- a/python/tvm/tpat/cuda/kernel.py +++ b/python/tvm/tpat/cuda/kernel.py @@ -15,6 +15,8 @@ # specific language governing permissions and limitations # under the License. +import os + import tvm import tvm.contrib.graph_executor as runtime import tvm.relay as relay @@ -86,7 +88,7 @@ def run(self): mod, params = relay.frontend.from_onnx(self._config.onnx_model) # 2. 
Tune it - if self._enable_tunning: + if self._enable_tunning and not os.path.exists(self._config.work_dir): tunning_option = self._config._tune_option() ms.relay_integration.tune_relay(mod=mod, params=params, **tunning_option) diff --git a/python/tvm/tpat/cuda/onnx_util.py b/python/tvm/tpat/cuda/onnx_util.py index 2c2fa5b702f2..dd2ef1ab0c33 100644 --- a/python/tvm/tpat/cuda/onnx_util.py +++ b/python/tvm/tpat/cuda/onnx_util.py @@ -90,9 +90,14 @@ def _handle_trt_not_support_type( _remove_unnecessary_cast_nodes(graph) try: - onnx.save(gs.export_onnx(graph), output_model_path) + onnx.save(gs.export_onnx(graph), output_model_path["name"]) except: - onnx.save(gs.export_onnx(graph), output_model_path, save_as_external_data=True) + onnx.save( + gs.export_onnx(graph), + output_model_path["name"], + save_as_external_data=True, + location=output_model_path["weights"], + ) def _remove_unnecessary_cast_nodes(graph): diff --git a/python/tvm/tpat/cuda/pipeline.py b/python/tvm/tpat/cuda/pipeline.py index 45ca7747d9e4..5bdcf31ed623 100644 --- a/python/tvm/tpat/cuda/pipeline.py +++ b/python/tvm/tpat/cuda/pipeline.py @@ -119,7 +119,7 @@ def pipeline( node_names: list[str], enable_tunning: bool, tunning_option: object, - output_onnx: str, + output_onnx: object, ) -> Tuple[str, list[str]]: """Generate plugins for specified nodes in an ONNX model. @@ -135,8 +135,11 @@ def pipeline( Flag indicating whether tunning is enabled. tunning_option : object Tunning option provided for ms.relay_integration.tune_relay, you don't need to specify mod, params and target. - output_onnx : str + output_onnx : object + { "name": xx, "weights": xx } Path to the output ONNX file where the modified model will be saved. + It will firstly try to save without weights, if it fails, it will then + save it with weights. Returns ------- From e9c5a58951909262d8316dbe8a97034ebdd7a8b9 Mon Sep 17 00:00:00 2001 From: Civitasv Date: Tue, 29 Aug 2023 19:11:40 +0800 Subject: [PATCH 14/14] No need to use gc --- python/tvm/tpat/cuda/pipeline.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/python/tvm/tpat/cuda/pipeline.py b/python/tvm/tpat/cuda/pipeline.py index 5bdcf31ed623..0b9143ce0db6 100644 --- a/python/tvm/tpat/cuda/pipeline.py +++ b/python/tvm/tpat/cuda/pipeline.py @@ -15,7 +15,6 @@ # specific language governing permissions and limitations # under the License. -import gc import os from typing import Tuple @@ -107,10 +106,6 @@ def _get_node_to_be_tunned(model, node_names): node_to_be_tunned = [node for node in graph.nodes if node.name in node_names] - del graph - del model - gc.collect() - return node_to_be_tunned
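
Taken together, the series exposes tvm.tpat.cuda.pipeline.pipeline as the entry point for plugin generation. Below is a minimal usage sketch against the final signature in this series; the model path, node name, and tuning budget are hypothetical placeholders, not values taken from the patches:

    from tvm.tpat.cuda.pipeline import pipeline

    # Lower one unsupported ONNX node through TVM, optionally tune it, and wrap
    # the generated CUDA kernels as a static-batch TensorRT plugin.
    rewritten_model, plugin_paths = pipeline(
        onnx_file="model.onnx",                       # hypothetical input model
        node_names=["my_unsupported_op"],             # hypothetical node name
        enable_tunning=True,
        tunning_option={"work_dir": "./log_db", "max_trials_global": 200},
        # Since PATCH 13 the output is described by a dict so oversized models
        # can fall back to saving weights as external data.
        output_onnx={"name": "model_tpat.onnx", "weights": "model_tpat.weights"},
    )
    # pipeline() returns the output_onnx argument back together with the paths
    # of the generated plugin .so files.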