From 78974a101d46d423e92f98bc057375ff9bac1324 Mon Sep 17 00:00:00 2001 From: Luka Macan Date: Fri, 1 Dec 2023 19:43:28 +0100 Subject: [PATCH 01/30] Harmonize onnx mangling --- dory/Frontend_frameworks/Quantlab/Parser.py | 6 +++++- dory/Parsers/Parser_ONNX_to_DORY.py | 8 +++++++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/dory/Frontend_frameworks/Quantlab/Parser.py b/dory/Frontend_frameworks/Quantlab/Parser.py index ffaaba3b..a2ae14be 100644 --- a/dory/Frontend_frameworks/Quantlab/Parser.py +++ b/dory/Frontend_frameworks/Quantlab/Parser.py @@ -93,7 +93,11 @@ def frontend_mapping_to_DORY_nodes(self): # input 1 parameters #### Look at the order of inputs in Onnx. If the lowest index #is not the first argument, revert the order inmul1 and inmul2 - if int(node.input_indexes[0]) > int(node.input_indexes[1]): + def pos_in_schedule(node: str) -> int: + for idx, node_iterating in enumerate(list(self.graph.graph.node)): + if node_iterating.output[0] == node: + return idx + if pos_in_schedule(node.input_indexes[0]) > pos_in_schedule(node.input_indexes[1]): temp_shift = node.in1_shift temp_mul = node.in1_mul temp_add = node.in1_add diff --git a/dory/Parsers/Parser_ONNX_to_DORY.py b/dory/Parsers/Parser_ONNX_to_DORY.py index 48179a63..cc17c35f 100644 --- a/dory/Parsers/Parser_ONNX_to_DORY.py +++ b/dory/Parsers/Parser_ONNX_to_DORY.py @@ -60,6 +60,10 @@ def create_node(self, node_iterating, graph): return new_node def ONNXtoDORY(self): + def pos_in_schedule(node: str) -> int: + for idx, node_iterating in enumerate(list(self.graph.graph.node)): + if node_iterating.output[0] == node: + return idx ######### CREATING NODES ########### print("\nParsing ONNX Graph to create DORY graph.") for node_iterating in (self.graph.graph.node): @@ -68,7 +72,9 @@ def ONNXtoDORY(self): ### Neglecting some nodes since they are not translated to any operation on any backend if node_iterating.op_type in self.layers_neglected: for node in self.DORY_Graph[::-1]: - if int(node_iterating.output[0]) > int(node.get_parameter('output_index')) and node.get_parameter("name") != "Constant": + node_iter_output = pos_in_schedule(node_iterating.output[0]) + node_output_index = pos_in_schedule(node.get_parameter('output_index')) + if node_iter_output > node_output_index and node.get_parameter("name") != "Constant": node.add_existing_parameter('output_index', node_iterating.output[0]) break # Adding a new layer From 646710fa26d314f46f907fdf45727eaf4af4b13b Mon Sep 17 00:00:00 2001 From: Luka Macan Date: Fri, 1 Dec 2023 19:43:58 +0100 Subject: [PATCH 02/30] Harmonize quantlib renaming last output node --- dory/Parsers/HW_node.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/dory/Parsers/HW_node.py b/dory/Parsers/HW_node.py index 9e5fd49e..e7abc008 100644 --- a/dory/Parsers/HW_node.py +++ b/dory/Parsers/HW_node.py @@ -213,6 +213,9 @@ def add_checksum_activations_integer(self, load_directory, node_number, n_inputs self.check_sum_in.append(int(sum(x))) outfile = f'out_layer{node_number}.txt' if n_inputs == 1 else f'out_{in_idx}_layer{node_number}.txt' + # quantlib hack + if not os.path.isfile(os.path.join(load_directory, outfile)): + outfile = "output.txt" try: y = np.loadtxt(os.path.join(load_directory, outfile), delimiter=',', dtype=np.int64, usecols=[0]) except ValueError: From 71d75bf25e5abccbe18a23b253760ef4dc9d27d6 Mon Sep 17 00:00:00 2001 From: Luka Macan Date: Tue, 5 Dec 2023 17:33:55 +0100 Subject: [PATCH 03/30] Add some string parsing --- dory/Hardware_targets/PULP/GAP9_NE16/HW_Parser.py | 14 +++++++------- 1 
file changed, 7 insertions(+), 7 deletions(-) diff --git a/dory/Hardware_targets/PULP/GAP9_NE16/HW_Parser.py b/dory/Hardware_targets/PULP/GAP9_NE16/HW_Parser.py index d5697441..6ae78200 100644 --- a/dory/Hardware_targets/PULP/GAP9_NE16/HW_Parser.py +++ b/dory/Hardware_targets/PULP/GAP9_NE16/HW_Parser.py @@ -42,7 +42,7 @@ def get_pattern_rewriter(self): def get_tiler(self): return Tiler_GAP9 - def valid_ne16_node(self, node): + def valid_ne16_node(self, node, idx): try: Ne16TestConf( in_height=node.input_dimensions[0], @@ -60,19 +60,19 @@ def valid_ne16_node(self, node): bias_type=IntegerType(name=f"{node.constant_type}{node.bias_bits}"), has_norm_quant=True, # TODO has_bias=True, - has_relu=True + has_relu="Relu" in node.name ) return True, "" except ValidationError as e: - msg = f"WARNING: Failed allocating node {node.name} to the NE16 accelerator. Errors:\n" + msg = f"WARNING: Failed allocating node {node.name}{idx} to the NE16 accelerator. Errors:\n" for error in e.errors(): msg += f" - {error['msg']}\n" msg += "\nNOTE: Falling back to cluster engine.\n" return False, msg - def engine_coloring(self, node): + def engine_coloring(self, node, idx): if "Conv" in node.op_type or "Convolution" in node.op_type: - is_valid, msg = self.valid_ne16_node(node) + is_valid, msg = self.valid_ne16_node(node, idx) if is_valid: node.engine = "ne16" return @@ -83,8 +83,8 @@ def engine_coloring(self, node): def mapping_to_HW_nodes(self): super().mapping_to_HW_nodes() print("\nPULP Backend: Assigning nodes to engines.") - for node in self.DORY_Graph: - self.engine_coloring(node) + for i, node in enumerate(self.DORY_Graph): + self.engine_coloring(node, i) assert all(hasattr(node, "engine") for node in self.DORY_Graph) def transform_nodes_to_hw_nodes(self): From 41fd9b65e825bfbc596239ff3264c5a4d3793035 Mon Sep 17 00:00:00 2001 From: Luka Macan Date: Tue, 19 Dec 2023 18:09:23 +0100 Subject: [PATCH 04/30] Parser fixes --- dory/Parsers/HW_node.py | 5 +++-- dory/Parsers/Parser_DORY_to_HW.py | 3 --- .../Utils/Templates_writer/Layer2D_template_writer.py | 11 ++++------- 3 files changed, 7 insertions(+), 12 deletions(-) diff --git a/dory/Parsers/HW_node.py b/dory/Parsers/HW_node.py index e7abc008..7a544743 100644 --- a/dory/Parsers/HW_node.py +++ b/dory/Parsers/HW_node.py @@ -72,6 +72,7 @@ def create_tiling_dimensions(self, previous_node, config_file): # ATTENTION MEMORY L3 --> TILE MEMORY DIMENSION --> Decide how to set. Re-init the whole memory? 
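# Context for the hunk below: create_tiling_dimensions() walks the memory
# hierarchy from the outermost level down to L2, asking the target's Tiler for
# tile shapes that fit one level lower and caching them in tiling_dimensions.
# The assert added by this patch fails fast when the solver returns a
# non-positive tile dimension, rather than letting a degenerate tiling
# propagate further down the flow.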
for level in np.arange(self.HW_description["memory"]["levels"],1, -1): (weights_dim, input_dims, output_dims) = self.Tiler(self, previous_node, config_file["code reserved space"]).get_tiling(level) + assert all(dim > 0 for dim in weights_dim + input_dims + output_dims) self.tiling_dimensions["L{}".format(level-1)]["input_dimensions"] = input_dims self.tiling_dimensions["L{}".format(level-1)]["output_dimensions"] = output_dims if "Convolution" in self.name or "FullyConnected" in self.name: @@ -98,8 +99,8 @@ def create_tiling_dimensions(self, previous_node, config_file): self.tiling_dimensions["L{}".format(level-1)]["bias_memory"] = int(bias_memory) self.tiling_dimensions["L{}".format(level-1)]["constants_memory"] = int(constants_memory) - self.tiling_dimensions["L{}".format(level-1)]["input_activation_memory"] = np.prod(self.tiling_dimensions["L{}".format(level-1)]["input_dimensions"])*self.input_activation_bits/8 - self.tiling_dimensions["L{}".format(level-1)]["output_activation_memory"] = np.prod(self.tiling_dimensions["L{}".format(level-1)]["output_dimensions"])*self.output_activation_bits/8 + self.tiling_dimensions["L{}".format(level-1)]["input_activation_memory"] = int(np.prod(self.tiling_dimensions["L{}".format(level-1)]["input_dimensions"])*self.input_activation_bits/8) + self.tiling_dimensions["L{}".format(level-1)]["output_activation_memory"] = int(np.prod(self.tiling_dimensions["L{}".format(level-1)]["output_dimensions"])*self.output_activation_bits/8) def rename_weights(self): weight_name = "" diff --git a/dory/Parsers/Parser_DORY_to_HW.py b/dory/Parsers/Parser_DORY_to_HW.py index dcd75d10..902e0a64 100644 --- a/dory/Parsers/Parser_DORY_to_HW.py +++ b/dory/Parsers/Parser_DORY_to_HW.py @@ -171,9 +171,6 @@ def tiling(self): print("\nInsert tiling parameters per layer inside graph nodes") prev = None for node in self.DORY_Graph: - ######################## NEED A FIX #################################################### - #### OTHERWISE ONLY WEIGHT < L2/2 GO in L2 --> much more L3 tiling not needed############ - ######################################################################################### node.create_tiling_dimensions(prev if prev else node, self.config_file) prev = node diff --git a/dory/Utils/Templates_writer/Layer2D_template_writer.py b/dory/Utils/Templates_writer/Layer2D_template_writer.py index 6964acbd..78c8dcfd 100644 --- a/dory/Utils/Templates_writer/Layer2D_template_writer.py +++ b/dory/Utils/Templates_writer/Layer2D_template_writer.py @@ -408,13 +408,10 @@ def print_template_layer(node, layer_type, double_buffering = 2): # x last tk['x_tile_size_nif_last'] = n_in % tile_n_in if (n_in % tile_n_in) > 0 else tile_n_in tk['x_tile_size_nif_byte_last'] = int(math.ceil(tk['x_tile_size_nif_last'] * ds_x / 8.0)) - tk['x_tile_size_h_last'] = tk['y_tile_size_h_last'] * s[0] + ks[0] - s[0] - (padding_bottom - ((h_in + padding_bottom + padding_top) - (h_out* s[0] + ks[0] - s[0]))) - tk['x_tile_size_w_last'] = tk['y_tile_size_w_last'] * s[1] + ks[1] - s[1] - (padding_right - ((w_in + padding_left + padding_right) - (w_out* s[1] + ks[1] - s[1]))) - ## single tile execution - if tk['x_tile_size_h_last'] > tk['x_tile_size_h']: - tk['x_tile_size_h_last'] = tk['x_tile_size_h'] - if tk['x_tile_size_w_last'] > tk['x_tile_size_w']: - tk['x_tile_size_w_last'] = tk['x_tile_size_w'] + tk['x_tile_size_h_last'] = tk['y_tile_size_h_last'] * s[0] + ks[0] - s[0] + assert tk['x_tile_size_h_last'] >= 0 and tk['x_tile_size_h_last'] <= tk['x_tile_size_h'] + tk['x_tile_size_w_last'] = 
tk['y_tile_size_w_last'] * s[1] + ks[1] - s[1] + assert tk['x_tile_size_w_last'] >= 0 and tk['x_tile_size_w_last'] <= tk['x_tile_size_w'] l = "" for k, v in tk.items(): From 115f8b37f58cac789ec80530e8b705baee7cc133 Mon Sep 17 00:00:00 2001 From: Luka Macan Date: Tue, 19 Dec 2023 18:46:36 +0100 Subject: [PATCH 05/30] Fix L3-L2 tiling --- .../PULP/GAP9_NE16/Tiler/tiler_conv2d_ne16.py | 132 +++++++++++------- 1 file changed, 82 insertions(+), 50 deletions(-) diff --git a/dory/Hardware_targets/PULP/GAP9_NE16/Tiler/tiler_conv2d_ne16.py b/dory/Hardware_targets/PULP/GAP9_NE16/Tiler/tiler_conv2d_ne16.py index 66648fb7..e4b76297 100644 --- a/dory/Hardware_targets/PULP/GAP9_NE16/Tiler/tiler_conv2d_ne16.py +++ b/dory/Hardware_targets/PULP/GAP9_NE16/Tiler/tiler_conv2d_ne16.py @@ -65,14 +65,14 @@ def get_tiling_conv2d_L3(self): # We assume that the first nodes input is always in L2 input_in_l2 = is_first_node or prev_tiling["L3"]["output_dimensions"] == prev_tiling["L2"]["output_dimensions"] - self.node.tiling_dimensions["L2"]["db_x"] = 1 - self.node.tiling_dimensions["L2"]["db_y"] = 1 - self.node.tiling_dimensions["L2"]["db_w"] = 1 - buffer_total = self.node.input_activation_memory + self.node.output_activation_memory + self.node.weight_memory + self.node.bias_memory + self.node.constants_memory # Don't tile if the whole thing fits into L2 - if (buffer_total <= L2_memory) and (input_in_l2 or is_first_node): + if (buffer_total <= L2_memory) and input_in_l2: + self.node.tiling_dimensions["L2"]["db_x"] = 1 + self.node.tiling_dimensions["L2"]["db_y"] = 1 + self.node.tiling_dimensions["L2"]["db_w"] = 1 + if "PointwiseDepthwisePointwise" in self.node.name: return ([self.node.output_channels, self.node.input_channels], [self.node.input_channels, self.node.input_dimensions[0], self.node.input_dimensions[1]], @@ -94,7 +94,6 @@ def get_tiling_conv2d_L3(self): in_ch = self.node.input_channels s = self.node.strides g = self.node.group - p = self.node.pads depthwise = g > 1 db_x = 1 if input_in_l2 else 2 @@ -113,16 +112,30 @@ def get_tiling_conv2d_L3(self): h_in = in_dim[0] w_in = in_dim[1] n_in = in_ch - tile_h_out = solver.IntVar(1, out_dim[0], 'tile_h_out') + + # optimization variables + opt_vars = [] + + def add_var_cond(min: int, max: int, name: str, cond: bool): + if cond: + var = solver.IntVar(min, max, name) + opt_vars.append(var) + else: + var = max + return var + + tile_h_out = add_var_cond(1, h_out, 'tile_h_out', db_y > 1) + tile_n_out = add_var_cond(1, out_ch, 'tile_n_out', db_w > 1) + tile_h_in = add_var_cond(ks[0], h_in, 'tile_h_in', db_x > 1) + + # We are not tiling width nor input channel tile_w_out = w_out - tile_n_out = solver.IntVar(1, out_ch, 'tile_n_out') - tile_h_in = solver.IntVar(in_dim[0] if input_in_l2 else ks[0], in_dim[0], 'tile_h_in') tile_w_in = w_in tile_n_in = tile_n_out if depthwise else n_in # size constraint - input_tile_dimension = db_x * n_in * tile_h_in * in_dim[1] * (self.node.input_activation_bits // 8) - output_tile_dimension = tile_h_out * db_y * n_out * out_dim[1] * (self.node.output_activation_bits // 8) + input_tile_dimension = db_x * n_in * tile_h_in * w_in * (self.node.input_activation_bits // 8) + output_tile_dimension = tile_h_out * db_y * n_out * w_out * (self.node.output_activation_bits // 8) if "DepthwisePointwise" in self.node.name: weight_tile_dimension = db_w * (self.node.calculate_weights_size(tile_n_in, tile_n_in, ks, self.node.weight_bits, dw=True) + @@ -148,23 +161,23 @@ def get_tiling_conv2d_L3(self): solver.Add(total_size <= L2_memory) - # geometrical 
constraint - if db_y == 1: - solver.Add(tile_h_out == h_out) - else: + # policy constraints + if db_y > 1: solver.Add(solver.IntConst(h_out) % tile_h_out == 0) - if db_w == 1: - solver.Add(tile_n_out == out_ch) - else: + if db_w > 1: solver.Add(solver.IntConst(n_out) % tile_n_out == 0) - if db_x == 2 and db_y == 2: - solver.Add(tile_h_out * s[0] == tile_h_in - (ks[0] - 1) + (s[0] - 1)) - - if db_x == 2: + if db_x > 1: solver.Add(solver.IntConst(h_out) % ((tile_h_in - ks[0] + s[0]) // s[0]) == 0) + # geometrical constraints + if db_x > 1: + solver.Add((tile_h_in - ks[0]) % s[0] == 0) + + if db_x > 1 and db_y > 1: + solver.Add(tile_h_in == tile_h_out * s[0] + (ks[0] - s[0])) + # objective obj_expr = solver.IntVar(0, 100000000000000, "obj_expr") @@ -177,7 +190,7 @@ def get_tiling_conv2d_L3(self): # maximize the objective objective = solver.Maximize(obj_expr, 1) - decision_builder = solver.Phase([tile_n_out, tile_h_in, tile_h_out], + decision_builder = solver.Phase(opt_vars, solver.CHOOSE_FIRST_UNBOUND, solver.ASSIGN_MIN_VALUE) @@ -185,18 +198,20 @@ def get_tiling_conv2d_L3(self): collector = solver.LastSolutionCollector() # Add the decision variables. - collector.Add(tile_n_out) - collector.Add(tile_h_in) - collector.Add(tile_h_out) + for var in opt_vars: + collector.Add(var) # Add the objective. collector.AddObjective(obj_expr) solver.Solve(decision_builder, [objective, collector]) if collector.SolutionCount() > 0: best_solution = collector.SolutionCount() - 1 - tile_n_out = collector.Value(best_solution, tile_n_out) - tile_h_in = collector.Value(best_solution, tile_h_in) - tile_h_out = collector.Value(best_solution, tile_h_out) + if db_y > 1: + tile_h_out = collector.Value(best_solution, tile_h_out) + if db_w > 1: + tile_n_out = collector.Value(best_solution, tile_n_out) + if db_x > 1: + tile_h_in = collector.Value(best_solution, tile_h_in) self.node.tiling_dimensions["L2"]["db_x"] = db_x self.node.tiling_dimensions["L2"]["db_y"] = db_y self.node.tiling_dimensions["L2"]["db_w"] = db_w @@ -232,38 +247,55 @@ def get_tiling_conv2d_L2(self): # We are recalculating these variables because the output could be tiled but the input isn't or vice versa. 
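# The recomputation relies on the usual convolution tile geometry:
#   h_in = h_out * stride + kernel_size - stride
# e.g. with a 3x3 kernel and stride 2, an output tile of 8 rows needs
# 8 * 2 + 3 - 2 = 17 input rows. The assert added below checks this invariant
# whenever the L3 tiling split either the input or the output height.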
in_mem = self.node.tiling_dimensions["L2"]["input_activation_memory"] out_mem = self.node.tiling_dimensions["L2"]["output_activation_memory"] - h_in = self.node.tiling_dimensions["L2"]["input_dimensions"][1] - h_out = self.node.tiling_dimensions["L2"]["output_dimensions"][1] - if self.node.tiling_dimensions["L3"]["output_dimensions"][1] > self.node.tiling_dimensions["L2"]["output_dimensions"][1]: - h_in = self.node.tiling_dimensions["L2"]["output_dimensions"][1] * s[0] + (ks[0] - 1) - (s[0] - 1) - in_mem = int(self.node.tiling_dimensions["L2"]["input_activation_memory"] / self.node.tiling_dimensions["L2"]["input_dimensions"][1] * h_in) - if self.node.tiling_dimensions["L3"]["input_dimensions"][1] > self.node.tiling_dimensions["L2"]["input_dimensions"][1] or \ - (self.node.tiling_dimensions["L2"]["db_x"] > 1 and self.node.tiling_dimensions["L2"]["db_y"] == 1): - h_out = int(np.floor((self.node.tiling_dimensions["L2"]["input_dimensions"][1] - (ks[0] - 1) + (s[0] - 1)) / s[0])) - out_mem = int(self.node.tiling_dimensions["L2"]["output_activation_memory"] / self.node.tiling_dimensions["L2"]["output_dimensions"][1] * h_out) + h_in_l2 = self.node.tiling_dimensions["L2"]["input_dimensions"][1] + h_out_l2 = self.node.tiling_dimensions["L2"]["output_dimensions"][1] + h_in_l3 = self.node.tiling_dimensions["L3"]["input_dimensions"][1] + h_out_l3 = self.node.tiling_dimensions["L3"]["output_dimensions"][1] + ch_out_output = self.node.tiling_dimensions["L2"]["output_dimensions"][0] + ch_out_weights = self.node.tiling_dimensions["L2"]["weights_dimensions"][0] + + # Fix output height if input height was not tiled + if h_out_l3 > h_out_l2 and h_in_l3 == h_in_l2: + h_in_l1 = h_out_l2 * s[0] + ks[0] - s[0] + in_mem = (in_mem // h_in_l2) * h_in_l1 + else: + h_in_l1 = h_in_l2 + + # Fix input height if output height was not tiled + if h_in_l3 > h_in_l2 and h_out_l3 == h_out_l2: + h_out_l1 = (h_in_l2 - ks[0] + s[0]) // s[0] + out_mem = (out_mem // h_out_l2) * h_out_l1 + else: + h_out_l1 = h_out_l2 + + # Fix if we tiled the weights output channel but not the output tiles channel if "Addition" not in self.node.name and "Pool" not in self.node.name: - out_mem = int(self.node.tiling_dimensions["L2"]["output_activation_memory"] / self.node.tiling_dimensions["L2"]["output_dimensions"][0] * self.node.tiling_dimensions["L2"]["weights_dimensions"][0]) + out_mem = int((out_mem / ch_out_output) * ch_out_weights) + buffer_total = self.node.tiling_dimensions["L2"]["weight_memory"] + self.node.tiling_dimensions["L2"]["constants_memory"] + self.node.tiling_dimensions["L2"]["bias_memory"] + in_mem + out_mem + if h_in_l3 > h_in_l2 or h_out_l3 > h_out_l2: + assert h_in_l1 == h_out_l1 * s[0] + ks[0] - s[0] + # Add intermediate buffer if "PointwiseDepthwisePointwise" in self.node.name: - buffer_total += h_in * self.node.tiling_dimensions["L2"]["input_dimensions"][2] * self.node.output_channels_list[0] - buffer_total += h_out * self.node.tiling_dimensions["L2"]["output_dimensions"][2] * self.node.output_channels_list[0] + buffer_total += h_in_l1 * self.node.tiling_dimensions["L2"]["input_dimensions"][2] * self.node.output_channels_list[0] + buffer_total += h_out_l1 * self.node.tiling_dimensions["L2"]["output_dimensions"][2] * self.node.output_channels_list[0] elif "DepthwisePointwise" in self.node.name: - buffer_total += h_out * self.node.tiling_dimensions["L2"]["output_dimensions"][2] * self.node.tiling_dimensions["L2"]["output_dimensions"][0] - - self.node.tiling_dimensions["L1"]["db_x"] = 1 - 
self.node.tiling_dimensions["L1"]["db_y"] = 1 - self.node.tiling_dimensions["L1"]["db_w"] = 1 + buffer_total += h_out_l1 * self.node.tiling_dimensions["L2"]["output_dimensions"][2] * self.node.tiling_dimensions["L2"]["output_dimensions"][0] # return immediately if the memory fits the L1 if buffer_total <= L1_memory: + self.node.tiling_dimensions["L1"]["db_x"] = 1 + self.node.tiling_dimensions["L1"]["db_y"] = 1 + self.node.tiling_dimensions["L1"]["db_w"] = 1 + return (self.node.tiling_dimensions["L2"]["weights_dimensions"], [self.node.tiling_dimensions["L2"]["input_dimensions"][0], - h_in, + h_in_l1, self.node.tiling_dimensions["L2"]["input_dimensions"][2]], [out_ch, - h_out, + h_out_l1, self.node.tiling_dimensions["L2"]["output_dimensions"][2]] ) @@ -281,10 +313,10 @@ def get_tiling_conv2d_L2(self): ############################################### parameters = pywrapcp.Solver.DefaultSolverParameters() solver = pywrapcp.Solver("simple_CP", parameters) - h_in = h_in + h_in = h_in_l1 w_in = in_dim[1] n_in = in_ch - h_out = h_out + h_out = h_out_l1 w_out = out_dim[1] n_out = out_ch tile_h_out = solver.IntVar(1, out_dim[0], 'tile_h_out') From 56553f86567139a0c47fc764e9162e8d7c47facd Mon Sep 17 00:00:00 2001 From: Luka Macan Date: Thu, 21 Dec 2023 10:34:41 +0000 Subject: [PATCH 06/30] Fix l2 mapping overwriting l3 layers --- dory/Parsers/Parser_HW_to_C.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dory/Parsers/Parser_HW_to_C.py b/dory/Parsers/Parser_HW_to_C.py index 63535abd..24064d9d 100644 --- a/dory/Parsers/Parser_HW_to_C.py +++ b/dory/Parsers/Parser_HW_to_C.py @@ -86,8 +86,8 @@ def l2_c_template(self, node, backend_library): def l2_template_mapping(self, node, backend_library): tmpl_c = self.l2_c_template(node, backend_library) return { - os.path.join(self.src_dir, node.prefixed_name + ".c"): os.path.join(self.tmpl_dir, tmpl_c), - os.path.join(self.inc_dir, node.prefixed_name + ".h"): os.path.join(self.tmpl_dir, "layer_L2_h_template.h"), + os.path.join(self.src_dir, node.name + ".c"): os.path.join(self.tmpl_dir, tmpl_c), + os.path.join(self.inc_dir, node.name + ".h"): os.path.join(self.tmpl_dir, "layer_L2_h_template.h"), } def mapping_layers_to_C_files(self): From ed308bb7995901f04804e2a7b4a0e46f5244d9c6 Mon Sep 17 00:00:00 2001 From: Luka Macan Date: Mon, 8 Jan 2024 12:47:26 +0000 Subject: [PATCH 07/30] Fix func_names doesn't have the padded func versions --- .../Templates/layer_templates/layer_L3_c_template.c | 8 +++----- .../Templates/layer_templates/layer_L3_c_template.c | 12 +++++++----- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/dory/Hardware_targets/PULP/Common/Templates/layer_templates/layer_L3_c_template.c b/dory/Hardware_targets/PULP/Common/Templates/layer_templates/layer_L3_c_template.c index 37e2aa56..c3ecc77b 100644 --- a/dory/Hardware_targets/PULP/Common/Templates/layer_templates/layer_L3_c_template.c +++ b/dory/Hardware_targets/PULP/Common/Templates/layer_templates/layer_L3_c_template.c @@ -17,11 +17,9 @@ * limitations under the License. 
*/ #include "${func_name_L3}.h" -#include "${func_name[0]}.h" -% if len(func_name)>1: -#include "${func_name[1]}.h" -#include "${func_name[2]}.h" -%endif +% for name in func_name: +#include "${name}.h" +% endfor #include "dory_get_tile.h" #include "pmsis.h" #include "bsp/fs.h" diff --git a/dory/Hardware_targets/PULP/GAP9/Templates/layer_templates/layer_L3_c_template.c b/dory/Hardware_targets/PULP/GAP9/Templates/layer_templates/layer_L3_c_template.c index c52c75f1..61d3523e 100644 --- a/dory/Hardware_targets/PULP/GAP9/Templates/layer_templates/layer_L3_c_template.c +++ b/dory/Hardware_targets/PULP/GAP9/Templates/layer_templates/layer_L3_c_template.c @@ -17,11 +17,9 @@ * limitations under the License. */ #include "${func_name_L3}.h" -#include "${func_name[0]}.h" -% if len(func_name)>1: -#include "${func_name[1]}.h" -#include "${func_name[2]}.h" -%endif +% for name in func_name: +#include "${name}.h" +% endfor #include "dory_get_tile.h" #include "pmsis.h" #include "bsp/fs.h" @@ -187,6 +185,7 @@ void __attribute__ ((noinline)) ${func_name_L3}(void *args) tile_args.L2_output = dory_get_tile_3d(db[i_db_y].y, ${0 if n_tile_y > 1 else 'j'}, 0, k, ${h_out}, ${w_out}, ${n_out}, ${w_out}, ${n_out * n_tile_W}, 0, 0, 0, 0, 0, 0, ${y_data_size_byte}); tile_args.L2_weights = db[i_db_w].w; +% if len(func_name) > 1: if (j==0) { ${func_name[1] if (n_tile_x > 1 or n_tile_y > 1) and padding > 0 else func_name[0]}((void*)&tile_args); } \ @@ -199,6 +198,9 @@ else if (j == (${n_tile_y-1})) { } else { ${func_name[0]}((void*)&tile_args); } +% else: + ${func_name[0]}(&tile_args); +% endif % if n_tile_W > 1: // waiting for weights, lambda, and k From d8e8e7d8079f1e914bc4a3a6d2839dde629e1093 Mon Sep 17 00:00:00 2001 From: Luka Macan Date: Mon, 8 Jan 2024 12:47:59 +0000 Subject: [PATCH 08/30] Fix missing node.L3_input for GAP9_NE16 --- dory/Hardware_targets/PULP/GAP9_NE16/Tiler/tiler_conv2d_ne16.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dory/Hardware_targets/PULP/GAP9_NE16/Tiler/tiler_conv2d_ne16.py b/dory/Hardware_targets/PULP/GAP9_NE16/Tiler/tiler_conv2d_ne16.py index e4b76297..f66b9697 100644 --- a/dory/Hardware_targets/PULP/GAP9_NE16/Tiler/tiler_conv2d_ne16.py +++ b/dory/Hardware_targets/PULP/GAP9_NE16/Tiler/tiler_conv2d_ne16.py @@ -65,6 +65,8 @@ def get_tiling_conv2d_L3(self): # We assume that the first nodes input is always in L2 input_in_l2 = is_first_node or prev_tiling["L3"]["output_dimensions"] == prev_tiling["L2"]["output_dimensions"] + self.node.L3_input = not input_in_l2 + buffer_total = self.node.input_activation_memory + self.node.output_activation_memory + self.node.weight_memory + self.node.bias_memory + self.node.constants_memory # Don't tile if the whole thing fits into L2 From 726401dd8a047ba7bb0848ee448a09c5008f68c2 Mon Sep 17 00:00:00 2001 From: Luka Macan Date: Mon, 8 Jan 2024 12:49:13 +0000 Subject: [PATCH 09/30] Revert remove mapping_layers_to_C_files --- .../PULP/GAP9_NE16/C_Parser.py | 34 +++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/dory/Hardware_targets/PULP/GAP9_NE16/C_Parser.py b/dory/Hardware_targets/PULP/GAP9_NE16/C_Parser.py index 263820ce..3a1da89a 100644 --- a/dory/Hardware_targets/PULP/GAP9_NE16/C_Parser.py +++ b/dory/Hardware_targets/PULP/GAP9_NE16/C_Parser.py @@ -17,6 +17,8 @@ from dory.Hardware_targets.PULP.GAP9.C_Parser import C_Parser as C_Parser_gap9 from dory.Hardware_targets.PULP.GAP9_NE16.Ne16_HW_node import Ne16_HW_node +from dory.Utils.Templates_writer.TemplateWriter import TemplateWriter +import 
dory.Utils.Templates_writer.Layer2D_template_writer as Layer2D_writer import os import sys sys.path.append(os.path.join(os.path.dirname(__file__), "..", "Backend_Kernels", "pulp-nnx", "test")) @@ -56,6 +58,38 @@ def l2_template_keywords(self, node, backend_library): tk = self.__nnx_vars(tk, node) return tk + def mapping_layers_to_C_files(self): + print("\nMapping the layers files to their templates and copying the kernels associated.") + n_memory_levels = self.HW_description['memory']['levels'] + + for i, node in enumerate(self.HWgraph): + backend_library = self.node_backend_library(node) + self.copy_backend_files(node, backend_library) + + #if (node.kernel_shape[0] == 3 and node.group == 96): + #breakpoint() + + if n_memory_levels > 2 and (node.L3_input != 0 or (node.tiling_dimensions["L3"]["output_dimensions"] != node.tiling_dimensions["L2"]["output_dimensions"]) or (node.tiling_dimensions["L3"]["weights_dimensions"] != node.tiling_dimensions["L2"]["weights_dimensions"])): + #breakpoint() + tk = Layer2D_writer.print_template_layer_L3(node) + TemplateWriter.write(tk, {os.path.join(self.src_dir, node.prefixed_name + ".c"): os.path.join(self.tmpl_dir, "layer_L3_c_template.c"), + os.path.join(self.inc_dir, node.prefixed_name + ".h"): os.path.join(self.tmpl_dir, "layer_L3_h_template.h")}) + if node.tiling_dimensions["L3"]["input_dimensions"][1] > node.tiling_dimensions["L2"]["input_dimensions"][1]: + node.tiling_dimensions["L2"]["output_dimensions"][1] = (node.tiling_dimensions["L2"]["input_dimensions"][1] - node.kernel_shape[0] + node.strides[0]) // node.strides[0] + if node.tiling_dimensions["L3"]["output_dimensions"][1] > node.tiling_dimensions["L2"]["output_dimensions"][1]: + node.tiling_dimensions["L2"]["input_dimensions"][1] = node.tiling_dimensions["L2"]["output_dimensions"][1] * node.strides[0] + node.kernel_shape[0] - node.strides[0] + node.name = node.name + "_L2" + tk = self.l2_template_keywords(node, backend_library) + TemplateWriter.write(tk, self.l2_template_mapping(node, backend_library)) + node.name = node.name[:-3] + else: + if node.tiling_dimensions["L2"]["input_dimensions"][2] == node.tiling_dimensions["L1"]["input_dimensions"][2]: + node.tiling_dimensions["L1"]["output_dimensions"][2] = int((node.tiling_dimensions["L1"]["input_dimensions"][2] + (node.pads[1] + node.pads[3]) - node.kernel_shape[1] + node.strides[1]) / node.strides[1]) + if node.tiling_dimensions["L2"]["input_dimensions"][1] == node.tiling_dimensions["L1"]["input_dimensions"][1]: + node.tiling_dimensions["L1"]["output_dimensions"][1] = int((node.tiling_dimensions["L1"]["input_dimensions"][1] + (node.pads[0] + node.pads[2]) - node.kernel_shape[0] + node.strides[0]) / node.strides[0]) + tk = self.l2_template_keywords(node, backend_library) + TemplateWriter.write(tk, self.l2_template_mapping(node, backend_library)) + def __mem_tmpl_vars(self, tk, node, mem_level): mem_name = f'L{mem_level}' upper_mem_name = f'L{mem_level + 1}' From e95828a9d9dc7c709e796a44f0232269caac4530 Mon Sep 17 00:00:00 2001 From: Luka Macan Date: Mon, 8 Jan 2024 12:50:08 +0000 Subject: [PATCH 10/30] Add ne16 check for padded functions --- dory/Utils/Templates_writer/Layer2D_template_writer.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/dory/Utils/Templates_writer/Layer2D_template_writer.py b/dory/Utils/Templates_writer/Layer2D_template_writer.py index 78c8dcfd..9e0ba4b9 100644 --- a/dory/Utils/Templates_writer/Layer2D_template_writer.py +++ b/dory/Utils/Templates_writer/Layer2D_template_writer.py @@ -25,6 
+25,7 @@ import sys import os import re +from dory.Hardware_targets.PULP.GAP9_NE16.Ne16_HW_node import Ne16_HW_node def print_template_layer_L3(node): ks = node.kernel_shape @@ -92,7 +93,7 @@ def print_template_layer_L3(node): tk['n_tile_x'] = factor_h_in tk['n_tile_y'] = factor_h_out tk['verbose'] = False - if tk['padding'] > 0: + if tk['padding'] > 0 and not isinstance(node, Ne16_HW_node): tk['func_name'] = [node.prefixed_name + "_L2", node.prefixed_name + "_L2_p_t", node.prefixed_name + "_L2_p_b"] else: tk['func_name'] = [node.prefixed_name + "_L2"] @@ -408,10 +409,8 @@ def print_template_layer(node, layer_type, double_buffering = 2): # x last tk['x_tile_size_nif_last'] = n_in % tile_n_in if (n_in % tile_n_in) > 0 else tile_n_in tk['x_tile_size_nif_byte_last'] = int(math.ceil(tk['x_tile_size_nif_last'] * ds_x / 8.0)) - tk['x_tile_size_h_last'] = tk['y_tile_size_h_last'] * s[0] + ks[0] - s[0] - assert tk['x_tile_size_h_last'] >= 0 and tk['x_tile_size_h_last'] <= tk['x_tile_size_h'] - tk['x_tile_size_w_last'] = tk['y_tile_size_w_last'] * s[1] + ks[1] - s[1] - assert tk['x_tile_size_w_last'] >= 0 and tk['x_tile_size_w_last'] <= tk['x_tile_size_w'] + tk['x_tile_size_h_last'] = min(tk['y_tile_size_h_last'] * s[0] + ks[0] - s[0], tile_h_in) + tk['x_tile_size_w_last'] = min(tk['y_tile_size_w_last'] * s[1] + ks[1] - s[1], tile_w_in) l = "" for k, v in tk.items(): From 38052e4d022ca13c3bc470445bd9ba34180808e2 Mon Sep 17 00:00:00 2001 From: Luka Macan Date: Mon, 8 Jan 2024 19:22:25 +0100 Subject: [PATCH 11/30] Bump pulp-nnx --- dory/Hardware_targets/PULP/Backend_Kernels/pulp-nnx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dory/Hardware_targets/PULP/Backend_Kernels/pulp-nnx b/dory/Hardware_targets/PULP/Backend_Kernels/pulp-nnx index 06e3e91c..ca56a9d4 160000 --- a/dory/Hardware_targets/PULP/Backend_Kernels/pulp-nnx +++ b/dory/Hardware_targets/PULP/Backend_Kernels/pulp-nnx @@ -1 +1 @@ -Subproject commit 06e3e91c6308948d0c060755e61cc38805b8c280 +Subproject commit ca56a9d4ae5fe8a300ea3738ea3ffadbba7353c3 From 7037b4636c8cff823ce004b6e02789871c0156ed Mon Sep 17 00:00:00 2001 From: Luka Macan Date: Tue, 9 Jan 2024 10:31:36 +0100 Subject: [PATCH 12/30] Fix padding flag passing --- .../Templates/layer_templates/layer_L3_c_template.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/dory/Hardware_targets/PULP/GAP9/Templates/layer_templates/layer_L3_c_template.c b/dory/Hardware_targets/PULP/GAP9/Templates/layer_templates/layer_L3_c_template.c index 61d3523e..92e1735c 100644 --- a/dory/Hardware_targets/PULP/GAP9/Templates/layer_templates/layer_L3_c_template.c +++ b/dory/Hardware_targets/PULP/GAP9/Templates/layer_templates/layer_L3_c_template.c @@ -129,6 +129,8 @@ void __attribute__ ((noinline)) ${func_name_L3}(void *args) int offset_x = ${dim_in-int(conv_overlap1*n_in*w_in*BitIn/8) - int(padding*n_in*w_in*BitIn/8)}; % if n_tile_x > 1: + uint32_t padding = tile_args.padding; + // loop over input/output tiles for(int j = 0; j < ${n_tile_x}; j++) { // Fetching next input tile @@ -199,6 +201,16 @@ else if (j == (${n_tile_y-1})) { ${func_name[0]}((void*)&tile_args); } % else: + % if n_tile_x > 1: + // LMACAN: Hacky way to add the padding flag passing. 
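// Only the border tiles of the L3 height loop should see padding: the first
// input tile (j == 0) keeps the NET_UTILS_PAD_TOP flag and the last tile
// (j == n_tile_x - 1) keeps the NET_UTILS_PAD_BOTTOM flag, while inner tiles
// read real neighbouring rows fetched from L3 instead.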
+ tile_args.padding = 0; + if (j == 0) { + tile_args.padding |= padding & NET_UTILS_PAD_TOP; + } + if (j == ${n_tile_x - 1}) { + tile_args.padding |= padding & NET_UTILS_PAD_BOTTOM; + } + % endif ${func_name[0]}(&tile_args); % endif From 997b24146a12522fe462284b0febfdec995318bb Mon Sep 17 00:00:00 2001 From: Luka Macan Date: Tue, 9 Jan 2024 10:32:42 +0100 Subject: [PATCH 13/30] Fix unused padding with strided convolution for ne16 --- dory/Hardware_targets/PULP/GAP9_NE16/HW_Parser.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/dory/Hardware_targets/PULP/GAP9_NE16/HW_Parser.py b/dory/Hardware_targets/PULP/GAP9_NE16/HW_Parser.py index 6ae78200..a63b3210 100644 --- a/dory/Hardware_targets/PULP/GAP9_NE16/HW_Parser.py +++ b/dory/Hardware_targets/PULP/GAP9_NE16/HW_Parser.py @@ -113,3 +113,13 @@ def adjust_node_data_layout(self, node, node_id): weights["value"] = Ne16.weight_unroll(weights["value"].astype(np.uint8), node.weight_bits, node.group > 1) weights["layout"] = "CoutCinMajKQwCinMin" # Ne16's special layout + + self.adjust_padding(node) + + def adjust_padding(self, node): + inp_dim = node.input_dimensions + ks = node.kernel_shape + s = node.strides + padding_top, padding_left, padding_bottom, padding_right = node.pads + node.pads[2] = padding_bottom if (inp_dim[0] - ks[0] + padding_top + padding_bottom) % s[0] == 0 else 0 + node.pads[3] = padding_right if (inp_dim[1] - ks[1] + padding_left + padding_right) % s[1] == 0 else 0 From 858a62b84af8dddd201486c2c61f7ec82e4df3b3 Mon Sep 17 00:00:00 2001 From: Luka Macan Date: Tue, 9 Jan 2024 10:33:03 +0100 Subject: [PATCH 14/30] Rewrite convolution overlap --- .../Templates_writer/Layer2D_template_writer.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/dory/Utils/Templates_writer/Layer2D_template_writer.py b/dory/Utils/Templates_writer/Layer2D_template_writer.py index 9e0ba4b9..6e88b3f0 100644 --- a/dory/Utils/Templates_writer/Layer2D_template_writer.py +++ b/dory/Utils/Templates_writer/Layer2D_template_writer.py @@ -33,8 +33,6 @@ def print_template_layer_L3(node): g = node.group p = node.pads padding_top = p[0]; padding_left = p[1]; padding_bottom = p[2]; padding_right = p[3]; - conv_overlap1 = 2 * (ks[0] // 2) + ks[0] % 2 - 1 - (s[0] - 1) - conv_overlap2 = 2 * (ks[1] // 2) + ks[1] % 2 - 1 - (s[1] - 1) tk = OrderedDict([]) tk['flag_DW'] = 1 if node.group > 1 else 0 tk['ULTRA_VERBOSE'] = False @@ -75,8 +73,8 @@ def print_template_layer_L3(node): ################################################################################ - tk['conv_overlap1'] = conv_overlap1 - tk['conv_overlap2'] = conv_overlap2 + tk['conv_overlap1'] = ks[0] - s[0] + tk['conv_overlap2'] = ks[1] - s[1] tk['padding'] = padding_top if (node.L3_input): tk['input_L3'] = 1 @@ -151,11 +149,8 @@ def print_template_layer(node, layer_type, double_buffering = 2): s = node.strides g = node.group p = node.pads - conv_overlap_h = 2 * (ks[0] // 2) + ks[0] % 2 - 1 - (s[0] - 1) padding_top = p[0]; padding_left = p[1]; padding_bottom = p[2]; padding_right = p[3]; name_layer = node.prefixed_name + '.h' - conv_overlap1 = 2 * (ks[0] // 2) + ks[0] % 2 - 1 - (s[0] - 1) - conv_overlap2 = 2 * (ks[1] // 2) + ks[1] % 2 - 1 - (s[1] - 1) tk = OrderedDict([]) if (re.search('.0',name_layer)): try: @@ -178,8 +173,8 @@ def print_template_layer(node, layer_type, double_buffering = 2): tk['has_bias'] = int(len([1 for name in node.constant_names if "bias" in name])>0) tk['FLAG_RELU'] = 1 if 'outshift' in node.constant_names else 0 tk['type'] = 
f"{node.input_activation_type}8_t" if node.input_activation_type in ["int", "uint"] else "float" - tk['conv_overlap1'] = conv_overlap1 - tk['conv_overlap2'] = conv_overlap2 + tk['conv_overlap1'] = ks[0] - s[0] + tk['conv_overlap2'] = ks[1] - s[1] tk['padding_top'] = padding_top tk['padding_bottom'] = padding_bottom tk['padding_left'] = padding_left From fe647a5f3683aa6f9c59704c42c7452fe9c38232 Mon Sep 17 00:00:00 2001 From: Luka Macan Date: Thu, 18 Jan 2024 17:48:33 +0100 Subject: [PATCH 15/30] Small rewrites for add --- .../PULP/Common/Tiler/tiler_add.py | 45 +++++++------------ .../layer_L2_c_addition_template.c | 2 +- 2 files changed, 16 insertions(+), 31 deletions(-) diff --git a/dory/Hardware_targets/PULP/Common/Tiler/tiler_add.py b/dory/Hardware_targets/PULP/Common/Tiler/tiler_add.py index 68021612..8b50ef05 100644 --- a/dory/Hardware_targets/PULP/Common/Tiler/tiler_add.py +++ b/dory/Hardware_targets/PULP/Common/Tiler/tiler_add.py @@ -91,62 +91,47 @@ def get_tiling_Add_L2(self): # return immediatly if the memory fits the L1 if buffer_total <= L1_memory: return ([], self.HW_node.tiling_dimensions["L2"]["input_dimensions"] , self.HW_node.tiling_dimensions["L2"]["output_dimensions"] ) - else: - db = self.double_buffering + + db = self.double_buffering parameters = pywrapcp.Solver.DefaultSolverParameters() solver = pywrapcp.Solver("simple_CP", parameters) - # integer positive variables. - tile_n = solver.IntVar(1, in_ch, 'tile_n') - tile_h_in = solver.IntVar(ks[0], inp_dim[0], 'tile_h_in') - tile_w_in = solver.IntVar(ks[1], inp_dim[1], 'tile_w_in') - tile_h_out = solver.IntVar(1, out_dim[0], 'tile_h_out') - tile_w_out = solver.IntVar(1, out_dim[1], 'tile_w_out') + assert in_ch == out_ch - # scaling is used to ensure datasize is integer - solver.Add(tile_h_out == tile_h_in) - solver.Add(tile_w_out == tile_w_in) - solver.Add(tile_n == in_ch) + # integer positive variables. + tile_n = in_ch + tile_h = solver.IntVar(1, inp_dim[0], 'tile_h_in') + tile_w = inp_dim[1] # CONSTRAINTS: managing of correct dimensions (no decimal h_out and any # type of rounding) - input_tile_dimension = (db * in_ch * tile_h_in * inp_dim[1] * self.HW_node.input_activation_bits + 7 ) // 8 # the 7 is to account for bit precision of 1, which still occupy an entire byte - output_tile_dimension = (db * out_ch * tile_h_out * out_dim[1] * self.HW_node.output_activation_bits + 7 ) // 8 # the 7 is to account for bit precision of 1, which still occupy an entire byte + input_tile_dimension = (db * tile_n * tile_h * tile_w * self.HW_node.input_activation_bits + 7 ) // 8 # the 7 is to account for bit precision of 1, which still occupy an entire byte + output_tile_dimension = (db * tile_n * tile_h * tile_w * self.HW_node.output_activation_bits + 7 ) // 8 # the 7 is to account for bit precision of 1, which still occupy an entire byte constraint_all = input_tile_dimension * int(np.ceil(1+self.HW_node.second_input_activation_bits/self.HW_node.input_activation_bits)) + output_tile_dimension solver.Add(constraint_all <= L1_memory) + # objective obj_expr = solver.IntVar(0, 1000000000000, "obj_expr") - solver.Add(obj_expr == 1000000 * constraint_all - + 10 * tile_w_in - + 1 * tile_h_in - + 1000 * tile_n) + solver.Add(obj_expr == constraint_all) objective = solver.Maximize(obj_expr, 1) - decision_builder = solver.Phase([tile_n, tile_h_in, tile_w_in, tile_h_out, tile_w_out], + decision_builder = solver.Phase([tile_h], solver.CHOOSE_FIRST_UNBOUND, solver.ASSIGN_MIN_VALUE) # Create a solution collector. 
collector = solver.LastSolutionCollector() # Add the decision variables. - collector.Add(tile_n) - collector.Add(tile_h_in) - collector.Add(tile_w_in) - collector.Add(tile_h_out) - collector.Add(tile_w_out) + collector.Add(tile_h) # Add the objective. collector.AddObjective(obj_expr) solver.Solve(decision_builder, [objective, collector]) if collector.SolutionCount() > 0: best_solution = collector.SolutionCount() - 1 - tile_n = collector.Value(best_solution, tile_n) - tile_h_in = collector.Value(best_solution, tile_h_in) - tile_w_in = collector.Value(best_solution, tile_w_in) - tile_h_out = collector.Value(best_solution, tile_h_out) - tile_w_out = collector.Value(best_solution, tile_w_out) - return ([], [tile_n, tile_h_in, tile_w_in], [tile_n, tile_h_out, tile_w_out]) + tile_h = collector.Value(best_solution, tile_h) + return ([], [tile_n, tile_h, tile_w], [tile_n, tile_h, tile_w]) print(" Add ERROR: no tiling found. Exiting...") os._exit(0) return None diff --git a/dory/Hardware_targets/PULP/GAP9/Templates/layer_templates/layer_L2_c_addition_template.c b/dory/Hardware_targets/PULP/GAP9/Templates/layer_templates/layer_L2_c_addition_template.c index 5508d7c3..a384bc59 100644 --- a/dory/Hardware_targets/PULP/GAP9/Templates/layer_templates/layer_L2_c_addition_template.c +++ b/dory/Hardware_targets/PULP/GAP9/Templates/layer_templates/layer_L2_c_addition_template.c @@ -119,8 +119,8 @@ static void kernel(Layer tile, Layer tile2) { tile.addr.input, tile2.addr.input, tile.addr.output, - ${inmul2}, ${inmul1}, + ${inmul2}, ${outshift}, tile.input.width, tile.input.height, From d195966d2dbcf0a768ca4c7b71839c3ad8ebca19 Mon Sep 17 00:00:00 2001 From: Luka Macan Date: Thu, 18 Jan 2024 17:49:07 +0100 Subject: [PATCH 16/30] Add last checksum --- .../PULP/GAP9/Templates/network_c_template.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/dory/Hardware_targets/PULP/GAP9/Templates/network_c_template.c b/dory/Hardware_targets/PULP/GAP9/Templates/network_c_template.c index 49ebbbcf..90678e47 100644 --- a/dory/Hardware_targets/PULP/GAP9/Templates/network_c_template.c +++ b/dory/Hardware_targets/PULP/GAP9/Templates/network_c_template.c @@ -479,6 +479,12 @@ void ${prefix}network_run_cluster(void *args) { ${prefix}cycle_network_execution += io_cyc; % endif + % if 'Last' in verbose_level: + checksum("final output", L2_output, + activations_out_size[${len(DORY_HW_graph)-1}], + activations_out_checksum[${len(DORY_HW_graph)-1}][exec]); + % endif + //memcpy(L2_output, l2_final_output, activations_out_size[${len(DORY_HW_graph)-1}]); // BUGGY! 
for (int i=0; i Date: Tue, 30 Jan 2024 12:09:40 +0100 Subject: [PATCH 17/30] WIP: Upgrade pulp-nnx --- .../Backend_Kernels/BackendKernelsAdapter.py | 28 ++++--- .../PULP/Backend_Kernels/pulp-nnx | 2 +- .../PULP/GAP9_NE16/HW_Parser.py | 6 +- .../layer_L2_c_conv_ne16_multicore_template.c | 59 ++++++++------ .../PULP/GAP9_NE16/Utils_files/execute.h | 81 +++++++++---------- .../Layer2D_template_writer.py | 2 + 6 files changed, 96 insertions(+), 82 deletions(-) diff --git a/dory/Hardware_targets/PULP/Backend_Kernels/BackendKernelsAdapter.py b/dory/Hardware_targets/PULP/Backend_Kernels/BackendKernelsAdapter.py index 3b0c3c17..a09ee025 100644 --- a/dory/Hardware_targets/PULP/Backend_Kernels/BackendKernelsAdapter.py +++ b/dory/Hardware_targets/PULP/Backend_Kernels/BackendKernelsAdapter.py @@ -177,24 +177,32 @@ def _check_valid_node(self, node: HW_node) -> None: if self.accelerator_name == "ne16": assert True # TODO - def _get_ne16_src_files(self) -> List[str]: - return ["src/pulp_nnx_ne16.c", "ne16/hal/ne16_hal.c"] + def _get_acc_src_files(self) -> List[str]: + return [ + f"{self.accelerator_name}/hal/{self.accelerator_name}.c", + f"{self.accelerator_name}/hal/{self.accelerator_name}_task.c", + f"{self.accelerator_name}/bsp/{self.accelerator_name}_pulp_bsp.c", + f"src/pulp_nnx_{self.accelerator_name}.c", + ] - def _get_ne16_inc_files(self) -> List[str]: + def _get_acc_inc_files(self) -> List[str]: return [ - "ne16/gvsoc/ne16_gvsoc_logging.h", - "ne16/hal/ne16_defs.h", - "ne16/hal/ne16_hal.h", + f"{self.accelerator_name}/gvsoc/{self.accelerator_name}_gvsoc.h", + f"{self.accelerator_name}/hal/{self.accelerator_name}_task_defs.h", + f"{self.accelerator_name}/hal/{self.accelerator_name}.h", + f"{self.accelerator_name}/hal/{self.accelerator_name}_task.h", + f"{self.accelerator_name}/bsp/{self.accelerator_name}_pulp_bsp.h", + f"inc/pulp_nnx_{self.accelerator_name}.h", ] def _get_src_files(self) -> List[str]: - src_files = ["util/pulp_nnx_util.c"] + src_files = ["util/pulp_nnx_util.c", "util/hwpe.c"] if self.accelerator_name == "ne16": - src_files += self._get_ne16_src_files() + src_files += self._get_acc_src_files() return [os.path.join(self._root_dir(), file) for file in src_files] def _get_inc_files(self) -> List[str]: - inc_files = ["inc/pulp_nnx.h", "util/pulp_nnx_util.h"] + inc_files = ["util/pulp_nnx_util.h", "util/hwpe.h"] if self.accelerator_name == "ne16": - inc_files += self._get_ne16_inc_files() + inc_files += self._get_acc_inc_files() return [os.path.join(self._root_dir(), file) for file in inc_files] diff --git a/dory/Hardware_targets/PULP/Backend_Kernels/pulp-nnx b/dory/Hardware_targets/PULP/Backend_Kernels/pulp-nnx index ca56a9d4..70b089ed 160000 --- a/dory/Hardware_targets/PULP/Backend_Kernels/pulp-nnx +++ b/dory/Hardware_targets/PULP/Backend_Kernels/pulp-nnx @@ -1 +1 @@ -Subproject commit ca56a9d4ae5fe8a300ea3738ea3ffadbba7353c3 +Subproject commit 70b089ed7a050fd99c775f9b7504dfcbd732c02f diff --git a/dory/Hardware_targets/PULP/GAP9_NE16/HW_Parser.py b/dory/Hardware_targets/PULP/GAP9_NE16/HW_Parser.py index a63b3210..a5588601 100644 --- a/dory/Hardware_targets/PULP/GAP9_NE16/HW_Parser.py +++ b/dory/Hardware_targets/PULP/GAP9_NE16/HW_Parser.py @@ -25,10 +25,10 @@ from dory.Parsers.HW_node import HW_node sys.path.append(os.path.join(os.path.dirname(__file__), "..", "Backend_Kernels", "pulp-nnx", "test")) -from Ne16TestClasses import Ne16TestConf +from Ne16TestConf import Ne16TestConf from TestClasses import IntegerType +from Ne16MemoryLayout import Ne16MemoryLayout from pydantic import 
ValidationError -from Ne16 import Ne16 class onnx_manager(onnx_manager_gap9): @@ -110,7 +110,7 @@ def adjust_node_data_layout(self, node, node_id): ## Unroll expects layout to be "CoutCinK" if weights["layout"] == "CoutKCin": weights["value"] = np.transpose(weights["value"], (0,3,1,2)) - weights["value"] = Ne16.weight_unroll(weights["value"].astype(np.uint8), + weights["value"] = Ne16MemoryLayout.weightEncode(weights["value"].astype(np.uint8), node.weight_bits, node.group > 1) weights["layout"] = "CoutCinMajKQwCinMin" # Ne16's special layout diff --git a/dory/Hardware_targets/PULP/GAP9_NE16/Templates/layer_templates/layer_L2_c_conv_ne16_multicore_template.c b/dory/Hardware_targets/PULP/GAP9_NE16/Templates/layer_templates/layer_L2_c_conv_ne16_multicore_template.c index 35b71150..b1b42d84 100644 --- a/dory/Hardware_targets/PULP/GAP9_NE16/Templates/layer_templates/layer_L2_c_conv_ne16_multicore_template.c +++ b/dory/Hardware_targets/PULP/GAP9_NE16/Templates/layer_templates/layer_L2_c_conv_ne16_multicore_template.c @@ -20,8 +20,15 @@ */ #include "${func_name}.h" -#include "pulp_nnx.h" + +// ne16 includes +#include "pulp_nnx_ne16.h" #include "pulp_nnx_util.h" +#include "ne16_pulp_bsp.h" +#include "ne16_task.h" +#include "ne16.h" +#include "hwpe.h" + #include "network.h" #include "net_utils.h" #include "dory_dma.h" @@ -105,7 +112,7 @@ static inline void load_weights_prepare(Layer tile, Kernel kernel, DmaTransferConf * const conf_bias ) { // Special because of accelerators special memory layout - const int size_weights = (kernel.groups > 1 ? divnceil(tile.output.channel, 16) : tile.output.channel) * ${l1_W_tile_ki_size}; + const int size_weights = (kernel.groups > 1 ? nnx_calculate_number_of_tiles(tile.output.channel, 16) : tile.output.channel) * ${l1_W_tile_ki_size}; const int size_scale = tile.output.channel * ${int(act_dim_bit/8)}; const int size_bias = tile.output.channel * ${int(b_data_size_byte/8)}; @@ -176,13 +183,14 @@ static int inc(int index, int end) { #define BUFFER_SIZE (2) static Layer tiles[BUFFER_SIZE]; -static nnx_task_t nnx_tasks[BUFFER_SIZE]; +static ne16_task_t ne16_tasks[BUFFER_SIZE]; static DmaTransferConf store_conf[BUFFER_SIZE]; static struct { Monitor input, output, store_conf; } monitor; + static void layer_task_fork(void *args) { const int total_tiles = end_index.height * end_index.width * end_index.output_channel; @@ -262,9 +270,9 @@ static void layer_task_fork(void *args) { load_async(tiles[i_buff], &tile_status, body, layer, kernel); dma_mutex_unlock(); % if stride == 1: - execute_prepare(tile, &nnx_tasks[i_buff]); + execute_prepare(tile, &ne16_tasks[i_buff]); % elif stride == 2: - execute_stride2x2_prepare(tile, kernel, &nnx_tasks[i_buff]); + execute_stride2x2_prepare(tile, kernel, &ne16_tasks[i_buff]); % endif dma_mutex_lock(); dma_transfer_wait(transfer); @@ -290,9 +298,9 @@ static void layer_task_fork(void *args) { monitor_produce_begin(monitor.output); % if stride == 1: - execute_async(&nnx_tasks[i_buff]); + execute_async(&ne16_tasks[i_buff]); % elif stride == 2: - execute_stride2x2_blocking(&nnx_tasks[i_buff], tiles[i_buff], kernel, tiles[i_buff].output.channel); + execute_stride2x2_blocking(&ne16_tasks[i_buff], tiles[i_buff], kernel); % endif monitor_produce_end(monitor.output); @@ -310,7 +318,7 @@ static void layer_task_fork(void *args) { monitor_consume_begin(monitor.store_conf); monitor_consume_begin(monitor.output); - execute_wait(&nnx_tasks[i_buff]); + execute_wait(&ne16_tasks[i_buff]); monitor_consume_end(monitor.input); dma_mutex_lock(); @@ -362,24 
+370,25 @@ void ${func_name}(void *args) { // Init nnx tasks for (int i = 0; i < BUFFER_SIZE; i++) { - nnx_task_init(&nnx_tasks[i], - ${fs1}, ${int(flag_DW)}, - ${x_data_size_byte}, ${y_data_size_byte}, ${W_data_size_byte}, - weightOffsetModeLayerWise, ${-(2**(W_data_size_byte-1))}, - (nnx_quant_t) { - .shift_amount = ${out_shift}, - .mode = quantMode8Bit, - .function = quantFunctionRelu, - .flag_rounding = NE16_FLAG_UNUSED - }, (nnx_norm_t) { - .mode = ${"normMode32Bit" if act_dim_bit == 32 else "normMode8Bit" if act_dim_bit == 8 else "NE16_FLAG_UNUSED"}, - .flag_bias = NE16_FLAG_USED, - .flag_shift = NE16_FLAG_UNUSED - }, ${stride}); + ne16_task_init(&ne16_tasks[i]); + ne16_task_set_op_to_conv(&ne16_tasks[i], ${fs1}, ${int(flag_DW)}, ${stride}); + ne16_task_set_bits(&ne16_tasks[i], ${x_data_size_byte}, ${y_data_size_byte}, ${W_data_size_byte}); + ne16_task_set_norm_quant( + &ne16_tasks[i], + (ne16_quant_t) { + .shift_amount = ${out_shift}, + .function = quantFunctionRelu, // todo: un-hardcode it + .flag_rounding = ne16TaskFlagFalse + }, (ne16_norm_t) { + .mode = ${"normMode32Bit" if act_dim_bit == 32 else "normMode8Bit" if act_dim_bit == 8 else ""}, + .flag_bias = ne16TaskFlagTrue, + .flag_shift = ne16TaskFlagFalse + }); + ne16_task_set_weight_offset(&ne16_tasks[i], weightOffsetModeLayerWise, ${weight_offset}); } - const int max_stall = 8; - nnx_init(max_stall); + const ne16_pulp_conf_t ne16_pulp_conf = {.max_stall = 8}; + ne16_nnx_init(ne16_pulp_get_dev(), &ne16_pulp_conf); dma_mutex_init(); @@ -393,5 +402,5 @@ void ${func_name}(void *args) { monitor_term(monitor.input); monitor_term(monitor.output); monitor_term(monitor.store_conf); - nnx_term(); + ne16_nnx_term(ne16_pulp_get_dev()); } diff --git a/dory/Hardware_targets/PULP/GAP9_NE16/Utils_files/execute.h b/dory/Hardware_targets/PULP/GAP9_NE16/Utils_files/execute.h index 5e078d64..aa07add1 100644 --- a/dory/Hardware_targets/PULP/GAP9_NE16/Utils_files/execute.h +++ b/dory/Hardware_targets/PULP/GAP9_NE16/Utils_files/execute.h @@ -2,64 +2,59 @@ #define __EXECUTE_H__ #include "dory_get_tile.h" -#include "ne16_hal.h" -#include "pulp_nnx.h" +#include "hwpe.h" +#include "ne16.h" +#include "ne16_pulp_bsp.h" +#include "ne16_task.h" +#include "pulp_nnx_ne16.h" #include "pulp_nnx_util.h" #include "tile_status.h" #include -static inline void execute_prepare(Layer tile, nnx_task_t *const task) { - nnx_task_set_dims(task, tile.input.width, tile.input.channel, - tile.input.width, tile.input.channel, tile.output.height, - tile.output.width, tile.output.channel, tile.output.width, - tile.output.channel, tile.padding.top, tile.padding.bottom, - tile.padding.right, tile.padding.left); - nnx_task_set_ptrs(task, tile.addr.input, tile.input.width, tile.input.channel, - 8 /*bits_in*/, tile.padding.top, tile.padding.left, - tile.addr.output, tile.addr.weights, tile.addr.scale, - 0 /*shift_ptr*/, tile.addr.bias); +static inline void execute_prepare(Layer tile, ne16_task_t *const task) { + ne16_task_set_dims(task, tile.input.width, tile.input.channel, + tile.input.width * tile.input.channel, tile.input.channel, + tile.output.height, tile.output.width, tile.output.channel, + tile.output.width * tile.output.channel, + tile.output.channel, tile.padding.top, tile.padding.bottom, + tile.padding.right, tile.padding.left); + ne16_task_set_ptrs(task, tile.addr.input, tile.input.width, + tile.input.channel, tile.padding.top, tile.padding.left, + tile.addr.output, tile.addr.weights, tile.addr.scale, + 0 /*shift_ptr*/, tile.addr.bias); } -static inline void 
execute_async(nnx_task_t *task) { - nnx_dispatch_check_blocking(); - nnx_dispatch_task(task); +static inline void execute_async(ne16_task_t *task) { + ne16_nnx_dispatch_wait(ne16_pulp_get_dev()); + ne16_nnx_dispatch(ne16_pulp_get_dev(), task); } static inline void execute_stride2x2_prepare(Layer tile, Kernel kernel, - nnx_task_t *const task) { - nnx_task_set_dims_stride2x2( + ne16_task_t *const task) { + ne16_task_set_dims_stride2x2( task, tile.input.height, tile.input.width, tile.input.channel, - tile.input.width, tile.input.channel, tile.output.height, - tile.output.width, tile.output.channel, tile.output.width, - tile.output.channel, kernel.shape.height, kernel.shape.width, - tile.padding.top, tile.padding.bottom, tile.padding.right, - tile.padding.left); - nnx_task_set_ptrs(task, tile.addr.input, tile.input.width, tile.input.channel, - 8 /*bits_in*/, tile.padding.top, tile.padding.left, - tile.addr.output, tile.addr.weights, tile.addr.scale, - 0 /*shift_ptr*/, tile.addr.bias); + tile.input.width * tile.input.channel, tile.input.channel, + tile.output.height, tile.output.width, tile.output.channel, + tile.output.width * tile.output.channel, tile.output.channel, + kernel.shape.height, kernel.shape.width, tile.padding.top, + tile.padding.bottom, tile.padding.right, tile.padding.left); + ne16_task_set_ptrs(task, tile.addr.input, tile.input.width, + tile.input.channel, tile.padding.top, tile.padding.left, + tile.addr.output, tile.addr.weights, tile.addr.scale, + 0 /*shift_ptr*/, tile.addr.bias); } -static inline void execute_stride2x2_blocking(nnx_task_t *task, Layer tile, - Kernel kernel, - uint32_t output_channel_stride) { - nnx_dispatch_task_stride2x2( - task, tile.input.width, tile.input.channel, tile.input.width, - tile.input.channel, tile.output.height, tile.output.width, - tile.output.channel, tile.output.width, output_channel_stride, - kernel.shape.height, kernel.shape.width); +static inline void execute_stride2x2_blocking(ne16_task_t *task, Layer tile, + Kernel kernel) { + ne16_nnx_dispatch_stride2x2(ne16_pulp_get_dev(), task, tile.input.width, tile.input.channel, + tile.output.height, tile.output.width, + tile.output.channel, kernel.shape.height, + kernel.shape.width); } -static inline void execute_wait(nnx_task_t *task) { -#if __PLATFORM__ == ARCHI_PLATFORM_GVSOC && defined GAP_SDK - // Temporary hack because the gvsoc model of ne16 in gap_sdk - // has a broken running_id. 
- while(!ne16_empty()) +static inline void execute_wait(ne16_task_t *task) { + while (!ne16_nnx_resolve_check(ne16_pulp_get_dev(), task)) ; -#else - while(!nnx_resolve_check(task)) - ; -#endif } #endif // __EXECUTE_H__ diff --git a/dory/Utils/Templates_writer/Layer2D_template_writer.py b/dory/Utils/Templates_writer/Layer2D_template_writer.py index 6e88b3f0..556bf200 100644 --- a/dory/Utils/Templates_writer/Layer2D_template_writer.py +++ b/dory/Utils/Templates_writer/Layer2D_template_writer.py @@ -290,6 +290,8 @@ def print_template_layer(node, layer_type, double_buffering = 2): tk['W_data_size_byte'] = ds_W tk['b_data_size_byte'] = ds_bias tk['W_tile_size_nof'] = tile_n_out + if ds_W is not None: + tk['weight_offset'] = -(2**(ds_W-1)) if tk['has_bias'] == 1: tk['b_size_byte'] = int(math.ceil(n_out * ds_bias / 8.0)) else: From 397e95cc41ef5a96876080a22386eccd08f1f3ff Mon Sep 17 00:00:00 2001 From: Luka Macan Date: Tue, 30 Jan 2024 14:22:27 +0100 Subject: [PATCH 18/30] Copy network template to GAP9_NE16 --- .../GAP9_NE16/Templates/network_c_template.c | 506 +++++++++++++++++- 1 file changed, 505 insertions(+), 1 deletion(-) mode change 120000 => 100644 dory/Hardware_targets/PULP/GAP9_NE16/Templates/network_c_template.c diff --git a/dory/Hardware_targets/PULP/GAP9_NE16/Templates/network_c_template.c b/dory/Hardware_targets/PULP/GAP9_NE16/Templates/network_c_template.c deleted file mode 120000 index 72359887..00000000 --- a/dory/Hardware_targets/PULP/GAP9_NE16/Templates/network_c_template.c +++ /dev/null @@ -1 +0,0 @@ -../../GAP9/Templates/network_c_template.c \ No newline at end of file diff --git a/dory/Hardware_targets/PULP/GAP9_NE16/Templates/network_c_template.c b/dory/Hardware_targets/PULP/GAP9_NE16/Templates/network_c_template.c new file mode 100644 index 00000000..de554311 --- /dev/null +++ b/dory/Hardware_targets/PULP/GAP9_NE16/Templates/network_c_template.c @@ -0,0 +1,505 @@ +/* + * network.c + * Alessio Burrello + * Thorir Mar Ingolfsson + * + * Copyright (C) 2019-2020 University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +<% +l3_supported = DORY_HW_graph[0].HW_description['memory']['levels'] > 2 +%>\ +#define DEFINE_CONSTANTS +%if not l3_supported: +#include "${prefix}weights.h" +%endif +#include "net_utils.h" +#include "pmsis.h" +#include "${prefix}network.h" +#include "directional_allocator.h" +#include "mem.h" +#include "dory_dma.h" +#include +% for layer in list_h: +#include "${layer}" +% endfor + +% if sdk == 'pulp-sdk': +#define ICACHE_CTRL_UNIT 0x10201400 +#define ICACHE_PREFETCH ICACHE_CTRL_UNIT + 0x1C +% endif + +% if verbose: +#define VERBOSE 1 +% endif + +% if l3_supported: +#define L3_WEIGHTS_SIZE 4000000 +#define L3_INPUT_SIZE 1500000 +#define L3_OUTPUT_SIZE 1500000 +% endif +static void *L3_weights = NULL; +static void *L3_input = NULL; +static void *L3_output = NULL; +% if 'Yes' in performance or 'Perf_final' in verbose_level: +int ${prefix}cycle_network_execution; +% endif +% if l3_supported: +/* Moves the weights and the biases from hyperflash to hyperram */ +void ${prefix}network_initialize() { + + L3_weights = ram_malloc(L3_WEIGHTS_SIZE); + L3_input = ram_malloc(L3_INPUT_SIZE); + L3_output = ram_malloc(L3_OUTPUT_SIZE); + +#ifdef VERBOSE + printf("\nL3 Buffer alloc initial\t@ %d:\t%s\n", (unsigned int)L3_weights, L3_weights?"Ok":"Failed"); + printf("\nL3 Buffer alloc initial\t@ %d:\t%s\n", (unsigned int)L3_input, L3_input?"Ok":"Failed"); + printf("\nL3 Buffer alloc initial\t@ %d:\t%s\n", (unsigned int)L3_output, L3_output?"Ok":"Failed"); +#endif + + void *w_ptr = L3_weights; + for (int i = 0; i < ${weights_number}; i++) { + size_t size = load_file_to_ram(w_ptr, L3_weights_files[i]); + L3_weights_size[i] = size; + w_ptr += size; + } +} +% endif + +% if l3_supported: +/* Remove RAM memory */ +void ${prefix}network_terminate() { + % if l3_supported: + ram_free(L3_weights, L3_WEIGHTS_SIZE); + ram_free(L3_input, L3_INPUT_SIZE); + ram_free(L3_output, L3_OUTPUT_SIZE); + % endif +} +% endif + +void ${prefix}execute_layer_fork(void *args) { + layer_args_t *layer_args = (layer_args_t *)args; +#ifdef TARGET_CHIP_FAMILY_GAP9 + layer_args->L1_buffer = pi_cl_l1_malloc(NULL, ${l1_buffer}); +#else + layer_args->L1_buffer = pmsis_l1_malloc(${l1_buffer}); +#endif + + if (NULL == layer_args->L1_buffer) { +#ifdef VERBOSE + printf("ERROR: Failed to allocate the L1 buffer.\n"); +#endif // VERBOSE + return; + } + + switch (layer_args->layer_id) + { +% for i in range(len(DORY_HW_graph)): + case ${i}: + ${func_name[i]}(args); + break; +% endfor + } + +#ifdef TARGET_CHIP_FAMILY_GAP9 + pi_cl_l1_free(NULL, layer_args->L1_buffer, ${l1_buffer}); +#else + pmsis_l1_malloc_free(layer_args->L1_buffer, ${l1_buffer}); +#endif +} + +struct ${prefix}network_run_token ${prefix}network_run_async(void *l2_buffer, size_t l2_buffer_size, void *l2_final_output, int exec, int initial_dir${", void *L2_input_h" if not l3_supported else ""}) +{ + struct pi_device cluster_dev = {0}; + struct pi_cluster_conf conf; + struct pi_cluster_task cluster_task = {0}; + // First open the cluster + pi_cluster_conf_init(&conf); + conf.id=0; +#ifdef TARGET_CHIP_FAMILY_GAP9 + conf.icache_conf = PI_CLUSTER_MASTER_CORE_ICACHE_ENABLE | PI_CLUSTER_ICACHE_PREFETCH_ENABLE | PI_CLUSTER_ICACHE_ENABLE; +#endif +<% + n_args = 4 if l3_supported else 5 +%>\ + unsigned int args[${n_args}]; + args[0] = (unsigned int) l2_buffer; + args[1] = (unsigned int) l2_buffer_size; + args[2] = (unsigned int) l2_final_output; + args[3] = (unsigned int) exec; + args[4] = (unsigned int) initial_dir; + % if not l3_supported: + args[5] = (unsigned int) L2_input_h; + % endif + 
// open cluster... + pi_cluster_task(&cluster_task, ${prefix}network_run_cluster, args); + pi_open_from_conf(&cluster_dev, &conf); + if (pi_cluster_open(&cluster_dev)) + return; + // Then offload an entry point, this will get executed on the cluster controller +#ifndef TARGET_CHIP_FAMILY_GAP9 + cluster_task.stack_size = ${master_stack}; +#endif + cluster_task.slave_stack_size = ${slave_stack}; + pi_cluster_send_task_to_cl(&cluster_dev, &cluster_task); + return (struct ${prefix}network_run_token) { + .cluster_dev = cluster_dev + }; +} + +void ${prefix}network_run_wait(struct ${prefix}network_run_token token) +{ + pi_cluster_close(&token.cluster_dev); + % if 'Perf_final' in verbose_level: + print_perf("Final", ${prefix}cycle_network_execution, ${MACs}); + % endif +} + +void ${prefix}network_run(void *l2_buffer, size_t l2_buffer_size, void *l2_final_output, int exec, int initial_dir${", void *L2_input_h" if not l3_supported else ""}) +{ + ${prefix}network_run_wait(network_run_async(l2_buffer, l2_buffer_size, l2_final_output, exec, initial_dir${", L2_input_h" if not l3_supported else ""})); +} + +void ${prefix}network_run_cluster(void *args) { + unsigned int * real_args = (unsigned int *) args; + void * l2_buffer = (void *) real_args[0]; + size_t l2_buffer_size = (size_t) real_args[1]; + void * l2_final_output = (void *) real_args[2]; + int exec = (int) real_args[3]; + int dir = (int) real_args[4]; + % if not l3_supported: + void * L2_input_h = (void *)real_args[5]; + % endif +/* + - initial buffer allocation L2 and L1 + - variable declaration +*/ +/* ---------------------------------- */ +/* -------- SECTION 0 BEGIN --------- */ +/* ---------------------------------- */ + void *L2_output = NULL; + void *L2_input = NULL; + void *L2_weights = NULL; + void *L3_weights_curr = L3_weights; + void *bypass_activations = NULL; + + int residual_number = 0; + int bypass_dimension = 0; + % if not l3_supported: + int left_branch_nodes = 0, right_branch_nodes = 0; + int z = 0; + int end_left = 0; + % endif + + pi_perf_conf(1< 0 && branch_output[i-1] == 0 && branch_change[i-1] == 0) + dfree(activations_size[i], dir); + % endif + // Residual connections + if (i < ${len(DORY_HW_graph) - 1}) { + % if l3_supported: + if (branch_input[i+1] == 1) { + bypass_activations = dmalloc(bypass_dimension, !dir); + residual_number--; + cl_ram_read(bypass_activations, layers_pointers[residual_number], bypass_dimension); + cl_ram_free(layers_pointers[residual_number], bypass_dimension); + } + + // TODO I feel like this should look ahead instead of back + if (i > 0 && branch_output[i-1]==1 && L3_input_layers[i]==1) { // TODO don't understand this condition + L3_input = cl_ram_malloc(1500000); + } + if (branch_output[i]==1 && L3_output_layers[i]==1) { + cl_ram_free(L3_input + activations_out_size[i], 1500000 - activations_out_size[i]); + layers_pointers[residual_number] = L3_input; + residual_number++; + bypass_dimension = activations_out_size[i]; + } else + if (branch_output[i]==1 || branch_change[i] == 1) { + layers_pointers[residual_number] = cl_ram_malloc(activations_out_size[i]); + cl_ram_write(layers_pointers[residual_number], L2_output, activations_out_size[i]); + residual_number++; + bypass_dimension = activations_out_size[i]; + } + + if (branch_change[i]==1) { + dfree(activations_out_size[i], !dir); + L2_input = dmalloc(activations_size[i + 1], !dir); + cl_ram_read(L2_input, layers_pointers[residual_number - 2], activations_size[i + 1]); + cl_ram_free(layers_pointers[residual_number - 2], activations_size[i + 1]); + } 
+ if (L3_output_layers[i] == 1) + dfree(activations_out_size[i], !dir); + % else: + + if (branch_output[i]==1) { + left_branch_nodes = 0; + right_branch_nodes = 0; + z = i+1; + end_left = 0; + while (branch_input[z] == 0) { + if (end_left == 0) + left_branch_nodes+=1; + else + right_branch_nodes+=1; + if (branch_change[z] == 1) + end_left = 1; + z+=1; + } + if ((left_branch_nodes % 2 == 1) && (right_branch_nodes == 0)) + dir = !dir; + if ((left_branch_nodes % 2 == 0) && (right_branch_nodes > 0)) + dir = !dir; + } + + if (branch_change[i]==1) { + L2_input = bypass_activations; + bypass_activations = L2_output; + bypass_dimension = activations_out_size[i]; + if (right_branch_nodes % 2 == 1) + dir = !dir; + } + % endif + } + % if l3_supported: + if (layer_with_weights[i]) + L3_weights_curr += L3_weights_size[weight_l_cnt++]; + % endif + dir = !dir; + } + + % if 'Yes' in performance or 'Perf_final' in verbose_level: + pi_perf_stop(); + io_cyc += pi_perf_read(PI_PERF_CYCLES); + % endif + % if 'Yes' in performance: + print_perf("IO wait", io_cyc, 0 /*ops*/); + % endif + % if 'Perf_final' in verbose_level: + ${prefix}cycle_network_execution += io_cyc; + % endif + + % if 'Last' in verbose_level: + checksum("final output", L2_output, + activations_out_size[${len(DORY_HW_graph)-1}], + activations_out_checksum[${len(DORY_HW_graph)-1}][exec]); + % endif + + //memcpy(L2_output, l2_final_output, activations_out_size[${len(DORY_HW_graph)-1}]); // BUGGY! + for (int i=0; i Date: Tue, 30 Jan 2024 14:24:02 +0100 Subject: [PATCH 19/30] Move dma mutex init to network --- .../layer_templates/layer_L2_c_conv_ne16_multicore_template.c | 1 - .../PULP/GAP9_NE16/Templates/network_c_template.c | 3 +++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/dory/Hardware_targets/PULP/GAP9_NE16/Templates/layer_templates/layer_L2_c_conv_ne16_multicore_template.c b/dory/Hardware_targets/PULP/GAP9_NE16/Templates/layer_templates/layer_L2_c_conv_ne16_multicore_template.c index b1b42d84..f965c317 100644 --- a/dory/Hardware_targets/PULP/GAP9_NE16/Templates/layer_templates/layer_L2_c_conv_ne16_multicore_template.c +++ b/dory/Hardware_targets/PULP/GAP9_NE16/Templates/layer_templates/layer_L2_c_conv_ne16_multicore_template.c @@ -389,7 +389,6 @@ void ${func_name}(void *args) { const ne16_pulp_conf_t ne16_pulp_conf = {.max_stall = 8}; ne16_nnx_init(ne16_pulp_get_dev(), &ne16_pulp_conf); - dma_mutex_init(); // Fork diff --git a/dory/Hardware_targets/PULP/GAP9_NE16/Templates/network_c_template.c b/dory/Hardware_targets/PULP/GAP9_NE16/Templates/network_c_template.c index de554311..93dc78fe 100644 --- a/dory/Hardware_targets/PULP/GAP9_NE16/Templates/network_c_template.c +++ b/dory/Hardware_targets/PULP/GAP9_NE16/Templates/network_c_template.c @@ -206,6 +206,9 @@ void ${prefix}network_run_cluster(void *args) { pi_perf_conf(1< Date: Tue, 30 Jan 2024 14:55:24 +0100 Subject: [PATCH 20/30] Move ne16 init to network --- .../layer_L2_c_conv_ne16_multicore_template.c | 8 +++----- .../PULP/GAP9_NE16/Templates/network_c_template.c | 9 +++++++++ 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/dory/Hardware_targets/PULP/GAP9_NE16/Templates/layer_templates/layer_L2_c_conv_ne16_multicore_template.c b/dory/Hardware_targets/PULP/GAP9_NE16/Templates/layer_templates/layer_L2_c_conv_ne16_multicore_template.c index f965c317..bfd85ad6 100644 --- a/dory/Hardware_targets/PULP/GAP9_NE16/Templates/layer_templates/layer_L2_c_conv_ne16_multicore_template.c +++ 
b/dory/Hardware_targets/PULP/GAP9_NE16/Templates/layer_templates/layer_L2_c_conv_ne16_multicore_template.c @@ -339,11 +339,13 @@ static void layer_task_fork(void *args) { } void ${func_name}(void *args) { - #ifdef DEBUG_GVSOC nnx_activate_gvsoc_logging(GVSOC_LOG_LEVEL_CONFIG, GVSOC_LOGGING_FORMAT_DECIMAL); #endif + ne16_dev_t *ne16_dev = ne16_pulp_get_dev(); + hwpe_soft_clear(&ne16_dev->hwpe_dev); + layer_args_t *layer_args = (layer_args_t *)args; // Initialization @@ -387,9 +389,6 @@ void ${func_name}(void *args) { ne16_task_set_weight_offset(&ne16_tasks[i], weightOffsetModeLayerWise, ${weight_offset}); } - const ne16_pulp_conf_t ne16_pulp_conf = {.max_stall = 8}; - ne16_nnx_init(ne16_pulp_get_dev(), &ne16_pulp_conf); - // Fork @@ -401,5 +400,4 @@ void ${func_name}(void *args) { monitor_term(monitor.input); monitor_term(monitor.output); monitor_term(monitor.store_conf); - ne16_nnx_term(ne16_pulp_get_dev()); } diff --git a/dory/Hardware_targets/PULP/GAP9_NE16/Templates/network_c_template.c b/dory/Hardware_targets/PULP/GAP9_NE16/Templates/network_c_template.c index 93dc78fe..cfbf6096 100644 --- a/dory/Hardware_targets/PULP/GAP9_NE16/Templates/network_c_template.c +++ b/dory/Hardware_targets/PULP/GAP9_NE16/Templates/network_c_template.c @@ -30,6 +30,8 @@ l3_supported = DORY_HW_graph[0].HW_description['memory']['levels'] > 2 #include "directional_allocator.h" #include "mem.h" #include "dory_dma.h" +#include "ne16_pulp_bsp.h" +#include "pulp_nnx_ne16.h" #include % for layer in list_h: #include "${layer}" @@ -209,6 +211,11 @@ void ${prefix}network_run_cluster(void *args) { // dma init dma_mutex_init(); + + // ne16 init + const ne16_pulp_conf_t ne16_pulp_conf = {.max_stall = 8}; + ne16_nnx_init(ne16_pulp_get_dev(), &ne16_pulp_conf); + /* ---------------------------------- */ /* --------- SECTION 0 END ---------- */ /* ---------------------------------- */ @@ -505,4 +512,6 @@ void ${prefix}network_run_cluster(void *args) { /* ---------------------------------- */ /* --------- SECTION 3 END ---------- */ /* ---------------------------------- */ + + ne16_nnx_term(ne16_pulp_get_dev()); } From 456816b7b0fbf9bd6bfd980c7797e977b150e1a5 Mon Sep 17 00:00:00 2001 From: Luka Macan Date: Tue, 30 Jan 2024 15:11:58 +0100 Subject: [PATCH 21/30] Move ne16_tasks from static global to the stack --- .../layer_L2_c_conv_ne16_multicore_template.c | 45 +++++++++++++------ 1 file changed, 32 insertions(+), 13 deletions(-) diff --git a/dory/Hardware_targets/PULP/GAP9_NE16/Templates/layer_templates/layer_L2_c_conv_ne16_multicore_template.c b/dory/Hardware_targets/PULP/GAP9_NE16/Templates/layer_templates/layer_L2_c_conv_ne16_multicore_template.c index bfd85ad6..6b125c85 100644 --- a/dory/Hardware_targets/PULP/GAP9_NE16/Templates/layer_templates/layer_L2_c_conv_ne16_multicore_template.c +++ b/dory/Hardware_targets/PULP/GAP9_NE16/Templates/layer_templates/layer_L2_c_conv_ne16_multicore_template.c @@ -183,7 +183,6 @@ static int inc(int index, int end) { #define BUFFER_SIZE (2) static Layer tiles[BUFFER_SIZE]; -static ne16_task_t ne16_tasks[BUFFER_SIZE]; static DmaTransferConf store_conf[BUFFER_SIZE]; static struct { @@ -191,33 +190,44 @@ static struct { } monitor; -static void layer_task_fork(void *args) { +struct layer_task_fork_args_t { + uint32_t L2_input; + uint32_t L2_weights; + uint32_t L2_output; + uint32_t L1_buffer; + uint32_t padding; + ne16_task_t *ne16_tasks; +}; + + +static void layer_task_fork(void *void_args) { const int total_tiles = end_index.height * end_index.width * end_index.output_channel; + struct 
layer_task_fork_args_t *args = (struct layer_task_fork_args_t *)void_args; + ne16_task_t *ne16_tasks = args->ne16_tasks; + // Loader if (pi_core_id() == LOADER_ID) { - layer_args_t *layer_args = (layer_args_t *)args; - Layer layer = { .addr = { - .input = layer_args->L2_input, - .weights = layer_args->L2_weights, - .scale = layer_args->L2_weights + ${l2_k_offset}, - .bias = layer_args->L2_weights + ${l2_lambda_offset}, - .output = layer_args->L2_output + .input = args->L2_input, + .weights = args->L2_weights, + .scale = args->L2_weights + ${l2_k_offset}, + .bias = args->L2_weights + ${l2_lambda_offset}, + .output = args->L2_output }, .padding = { - .top = layer_args->padding & NET_UTILS_PAD_TOP ? ${padding_top} : NE16_DONT_PAD, + .top = args->padding & NET_UTILS_PAD_TOP ? ${padding_top} : NE16_DONT_PAD, .right = ${padding_right}, - .bottom = layer_args->padding & NET_UTILS_PAD_BOTTOM ? ${padding_bottom} : NE16_DONT_PAD, + .bottom = args->padding & NET_UTILS_PAD_BOTTOM ? ${padding_bottom} : NE16_DONT_PAD, .left = ${padding_left} } }; // Double buffer address init - const unsigned int l1_buffer = layer_args->L1_buffer; + const unsigned int l1_buffer = args->L1_buffer; const int l1_buffer_input = l1_buffer + ${l1_x_offset}; const int l1_buffer_output = l1_buffer + ${l1_y_offset}; const int l1_buffer_weights = l1_buffer + ${l1_W_offset}; @@ -371,6 +381,7 @@ void ${func_name}(void *args) { } // Init nnx tasks + ne16_task_t ne16_tasks[BUFFER_SIZE]; for (int i = 0; i < BUFFER_SIZE; i++) { ne16_task_init(&ne16_tasks[i]); ne16_task_set_op_to_conv(&ne16_tasks[i], ${fs1}, ${int(flag_DW)}, ${stride}); @@ -392,7 +403,15 @@ void ${func_name}(void *args) { // Fork - pi_cl_team_fork(CORES, (void *)layer_task_fork, args); + struct layer_task_fork_args_t layer_task_fork_args = { + .L2_input = layer_args->L2_input, + .L2_weights = layer_args->L2_weights, + .L2_output = layer_args->L2_output, + .L1_buffer = layer_args->L1_buffer, + .padding = layer_args->padding, + .ne16_tasks = ne16_tasks, + }; + pi_cl_team_fork(CORES, layer_task_fork, (void *)&layer_task_fork_args); // Terminate From 08a58191690b3966a68be3b1d0e2c79b1a0a1034 Mon Sep 17 00:00:00 2001 From: Luka Macan Date: Tue, 30 Jan 2024 15:18:04 +0100 Subject: [PATCH 22/30] Move global static tiles to stack --- .../layer_L2_c_conv_ne16_multicore_template.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/dory/Hardware_targets/PULP/GAP9_NE16/Templates/layer_templates/layer_L2_c_conv_ne16_multicore_template.c b/dory/Hardware_targets/PULP/GAP9_NE16/Templates/layer_templates/layer_L2_c_conv_ne16_multicore_template.c index 6b125c85..34803e5e 100644 --- a/dory/Hardware_targets/PULP/GAP9_NE16/Templates/layer_templates/layer_L2_c_conv_ne16_multicore_template.c +++ b/dory/Hardware_targets/PULP/GAP9_NE16/Templates/layer_templates/layer_L2_c_conv_ne16_multicore_template.c @@ -182,7 +182,6 @@ static int inc(int index, int end) { #define BUFFER_SIZE (2) -static Layer tiles[BUFFER_SIZE]; static DmaTransferConf store_conf[BUFFER_SIZE]; static struct { @@ -197,6 +196,7 @@ struct layer_task_fork_args_t { uint32_t L1_buffer; uint32_t padding; ne16_task_t *ne16_tasks; + Layer *tiles; }; @@ -205,6 +205,7 @@ static void layer_task_fork(void *void_args) { struct layer_task_fork_args_t *args = (struct layer_task_fork_args_t *)void_args; ne16_task_t *ne16_tasks = args->ne16_tasks; + Layer *tiles = args->tiles; // Loader @@ -400,6 +401,7 @@ void ${func_name}(void *args) { ne16_task_set_weight_offset(&ne16_tasks[i], weightOffsetModeLayerWise, ${weight_offset}); 
} + Layer tiles[BUFFER_SIZE]; // Fork @@ -410,6 +412,7 @@ void ${func_name}(void *args) { .L1_buffer = layer_args->L1_buffer, .padding = layer_args->padding, .ne16_tasks = ne16_tasks, + .tiles = tiles, }; pi_cl_team_fork(CORES, layer_task_fork, (void *)&layer_task_fork_args); From 361eb0c436f9df998953f4776cf0afa0f0bfe075 Mon Sep 17 00:00:00 2001 From: Luka Macan Date: Tue, 30 Jan 2024 15:22:48 +0100 Subject: [PATCH 23/30] Move global static store_conf to stack --- .../layer_L2_c_conv_ne16_multicore_template.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/dory/Hardware_targets/PULP/GAP9_NE16/Templates/layer_templates/layer_L2_c_conv_ne16_multicore_template.c b/dory/Hardware_targets/PULP/GAP9_NE16/Templates/layer_templates/layer_L2_c_conv_ne16_multicore_template.c index 34803e5e..834fdcfd 100644 --- a/dory/Hardware_targets/PULP/GAP9_NE16/Templates/layer_templates/layer_L2_c_conv_ne16_multicore_template.c +++ b/dory/Hardware_targets/PULP/GAP9_NE16/Templates/layer_templates/layer_L2_c_conv_ne16_multicore_template.c @@ -182,8 +182,6 @@ static int inc(int index, int end) { #define BUFFER_SIZE (2) -static DmaTransferConf store_conf[BUFFER_SIZE]; - static struct { Monitor input, output, store_conf; } monitor; @@ -197,6 +195,7 @@ struct layer_task_fork_args_t { uint32_t padding; ne16_task_t *ne16_tasks; Layer *tiles; + DmaTransferConf *store_conf; }; @@ -206,6 +205,7 @@ static void layer_task_fork(void *void_args) { struct layer_task_fork_args_t *args = (struct layer_task_fork_args_t *)void_args; ne16_task_t *ne16_tasks = args->ne16_tasks; Layer *tiles = args->tiles; + DmaTransferConf *store_conf = args->store_conf; // Loader @@ -402,6 +402,7 @@ void ${func_name}(void *args) { } Layer tiles[BUFFER_SIZE]; + DmaTransferConf store_conf[BUFFER_SIZE]; // Fork @@ -413,6 +414,7 @@ void ${func_name}(void *args) { .padding = layer_args->padding, .ne16_tasks = ne16_tasks, .tiles = tiles, + .store_conf = store_conf, }; pi_cl_team_fork(CORES, layer_task_fork, (void *)&layer_task_fork_args); From ef7dcb71f1252a23603d544324046bde130a8e4b Mon Sep 17 00:00:00 2001 From: Luka Macan Date: Tue, 30 Jan 2024 15:31:09 +0100 Subject: [PATCH 24/30] Move global static monitor to stack --- .../layer_L2_c_conv_ne16_multicore_template.c | 33 ++++++++++--------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/dory/Hardware_targets/PULP/GAP9_NE16/Templates/layer_templates/layer_L2_c_conv_ne16_multicore_template.c b/dory/Hardware_targets/PULP/GAP9_NE16/Templates/layer_templates/layer_L2_c_conv_ne16_multicore_template.c index 834fdcfd..d9e584e8 100644 --- a/dory/Hardware_targets/PULP/GAP9_NE16/Templates/layer_templates/layer_L2_c_conv_ne16_multicore_template.c +++ b/dory/Hardware_targets/PULP/GAP9_NE16/Templates/layer_templates/layer_L2_c_conv_ne16_multicore_template.c @@ -182,10 +182,9 @@ static int inc(int index, int end) { #define BUFFER_SIZE (2) -static struct { +typedef struct { Monitor input, output, store_conf; -} monitor; - +} TaskMonitors; struct layer_task_fork_args_t { uint32_t L2_input; @@ -196,6 +195,7 @@ struct layer_task_fork_args_t { ne16_task_t *ne16_tasks; Layer *tiles; DmaTransferConf *store_conf; + TaskMonitors *monitor; }; @@ -206,6 +206,7 @@ static void layer_task_fork(void *void_args) { ne16_task_t *ne16_tasks = args->ne16_tasks; Layer *tiles = args->tiles; DmaTransferConf *store_conf = args->store_conf; + TaskMonitors *monitor = args->monitor; // Loader @@ -274,7 +275,7 @@ static void layer_task_fork(void *void_args) { const Address local_addr = 
tile_status_get_addr(tile_status, buffer_addresses); Layer tile = tile_create(tile_status.index, end_index, body, border, layer, local_addr); - monitor_produce_begin(monitor.input); + monitor_produce_begin(monitor->input); tiles[i_buff] = tile; dma_mutex_lock(); DmaTransfer transfer = dma_transfer_create(); @@ -288,11 +289,11 @@ static void layer_task_fork(void *void_args) { dma_mutex_lock(); dma_transfer_wait(transfer); dma_mutex_unlock(); - monitor_produce_end(monitor.input); + monitor_produce_end(monitor->input); - monitor_produce_begin(monitor.store_conf); + monitor_produce_begin(monitor->store_conf); store_prepare(tiles[i_buff], body, layer, tile_status.index, &store_conf[i_buff]); - monitor_produce_end(monitor.store_conf); + monitor_produce_end(monitor->store_conf); i_buff = inc(i_buff, BUFFER_SIZE); tile_status = tile_status_get_next(tile_status, end_index, layer, 0, kernel); @@ -305,8 +306,8 @@ static void layer_task_fork(void *void_args) { if (pi_core_id() == EXECUTER_ID) { int i_buff = 0; for (int i_tile = 0; i_tile < total_tiles; i_tile++) { - monitor_consume_begin(monitor.input); - monitor_produce_begin(monitor.output); + monitor_consume_begin(monitor->input); + monitor_produce_begin(monitor->output); % if stride == 1: execute_async(&ne16_tasks[i_buff]); @@ -314,7 +315,7 @@ static void layer_task_fork(void *void_args) { execute_stride2x2_blocking(&ne16_tasks[i_buff], tiles[i_buff], kernel); % endif - monitor_produce_end(monitor.output); + monitor_produce_end(monitor->output); i_buff = inc(i_buff, BUFFER_SIZE); } @@ -326,11 +327,11 @@ static void layer_task_fork(void *void_args) { if (pi_core_id() == STORER_ID) { int i_buff = 0; for (int i_tile = 0; i_tile < total_tiles; i_tile++) { - monitor_consume_begin(monitor.store_conf); - monitor_consume_begin(monitor.output); + monitor_consume_begin(monitor->store_conf); + monitor_consume_begin(monitor->output); execute_wait(&ne16_tasks[i_buff]); - monitor_consume_end(monitor.input); + monitor_consume_end(monitor->input); dma_mutex_lock(); DmaTransfer transfer = dma_transfer_create(); @@ -341,8 +342,8 @@ static void layer_task_fork(void *void_args) { dma_transfer_wait(transfer); dma_mutex_unlock(); - monitor_consume_end(monitor.store_conf); - monitor_consume_end(monitor.output); + monitor_consume_end(monitor->store_conf); + monitor_consume_end(monitor->output); i_buff = inc(i_buff, BUFFER_SIZE); } @@ -360,6 +361,7 @@ void ${func_name}(void *args) { layer_args_t *layer_args = (layer_args_t *)args; // Initialization + TaskMonitors monitor; int err = 0; @@ -415,6 +417,7 @@ void ${func_name}(void *args) { .ne16_tasks = ne16_tasks, .tiles = tiles, .store_conf = store_conf, + .monitor = &monitor, }; pi_cl_team_fork(CORES, layer_task_fork, (void *)&layer_task_fork_args); From f080507b4ca1e8d7d6d41fa139605c078d3b68f8 Mon Sep 17 00:00:00 2001 From: Luka Macan Date: Tue, 30 Jan 2024 16:02:10 +0100 Subject: [PATCH 25/30] Fix path to network template --- dory/Hardware_targets/PULP/Common/C_Parser.py | 4 ++-- dory/Hardware_targets/PULP/GAP9_NE16/C_Parser.py | 4 ++-- dory/Parsers/Parser_HW_to_C.py | 11 ++++++++--- .../Utils/Templates_writer/Network_template_writer.py | 7 ++++--- 4 files changed, 16 insertions(+), 10 deletions(-) diff --git a/dory/Hardware_targets/PULP/Common/C_Parser.py b/dory/Hardware_targets/PULP/Common/C_Parser.py index e7da3f60..514bbb3e 100644 --- a/dory/Hardware_targets/PULP/Common/C_Parser.py +++ b/dory/Hardware_targets/PULP/Common/C_Parser.py @@ -93,8 +93,8 @@ def mapping_layers_to_C_files(self): if n_memory_levels > 2 and 
(node.L3_input != 0 or (node.tiling_dimensions["L3"]["output_dimensions"] != node.tiling_dimensions["L2"]["output_dimensions"]) or (node.tiling_dimensions["L3"]["weights_dimensions"] != node.tiling_dimensions["L2"]["weights_dimensions"])): tk = Layer2D_writer.print_template_layer_L3(node) - TemplateWriter.write(tk, {os.path.join(self.src_dir, node.prefixed_name + ".c"): os.path.join(self.tmpl_dir, "layer_L3_c_template.c"), - os.path.join(self.inc_dir, node.prefixed_name + ".h"): os.path.join(self.tmpl_dir, "layer_L3_h_template.h")}) + TemplateWriter.write(tk, {os.path.join(self.src_dir, node.prefixed_name + ".c"): os.path.join(self.layer_tmpl_dir, "layer_L3_c_template.c"), + os.path.join(self.inc_dir, node.prefixed_name + ".h"): os.path.join(self.layer_tmpl_dir, "layer_L3_h_template.h")}) if node.tiling_dimensions["L3"]["input_dimensions"][1] > node.tiling_dimensions["L2"]["input_dimensions"][1]: node.tiling_dimensions["L2"]["output_dimensions"][1] = int(np.floor((node.tiling_dimensions["L2"]["input_dimensions"][1] - node.kernel_shape[0] + node.strides[0]) / node.strides[0])) if node.tiling_dimensions["L3"]["output_dimensions"][1] > node.tiling_dimensions["L2"]["output_dimensions"][1]: diff --git a/dory/Hardware_targets/PULP/GAP9_NE16/C_Parser.py b/dory/Hardware_targets/PULP/GAP9_NE16/C_Parser.py index 3a1da89a..29ad78b0 100644 --- a/dory/Hardware_targets/PULP/GAP9_NE16/C_Parser.py +++ b/dory/Hardware_targets/PULP/GAP9_NE16/C_Parser.py @@ -72,8 +72,8 @@ def mapping_layers_to_C_files(self): if n_memory_levels > 2 and (node.L3_input != 0 or (node.tiling_dimensions["L3"]["output_dimensions"] != node.tiling_dimensions["L2"]["output_dimensions"]) or (node.tiling_dimensions["L3"]["weights_dimensions"] != node.tiling_dimensions["L2"]["weights_dimensions"])): #breakpoint() tk = Layer2D_writer.print_template_layer_L3(node) - TemplateWriter.write(tk, {os.path.join(self.src_dir, node.prefixed_name + ".c"): os.path.join(self.tmpl_dir, "layer_L3_c_template.c"), - os.path.join(self.inc_dir, node.prefixed_name + ".h"): os.path.join(self.tmpl_dir, "layer_L3_h_template.h")}) + TemplateWriter.write(tk, {os.path.join(self.src_dir, node.prefixed_name + ".c"): os.path.join(self.layer_tmpl_dir, "layer_L3_c_template.c"), + os.path.join(self.inc_dir, node.prefixed_name + ".h"): os.path.join(self.layer_tmpl_dir, "layer_L3_h_template.h")}) if node.tiling_dimensions["L3"]["input_dimensions"][1] > node.tiling_dimensions["L2"]["input_dimensions"][1]: node.tiling_dimensions["L2"]["output_dimensions"][1] = (node.tiling_dimensions["L2"]["input_dimensions"][1] - node.kernel_shape[0] + node.strides[0]) // node.strides[0] if node.tiling_dimensions["L3"]["output_dimensions"][1] > node.tiling_dimensions["L2"]["output_dimensions"][1]: diff --git a/dory/Parsers/Parser_HW_to_C.py b/dory/Parsers/Parser_HW_to_C.py index 24064d9d..212e1671 100644 --- a/dory/Parsers/Parser_HW_to_C.py +++ b/dory/Parsers/Parser_HW_to_C.py @@ -57,6 +57,7 @@ def mapping_network_to_C_file(self): self.config_file, self.verbose_level, self.perf_layer, + self.tmpl_dir, self.app_directory, self.inc_dir_rel, self.src_dir_rel) @@ -86,8 +87,8 @@ def l2_c_template(self, node, backend_library): def l2_template_mapping(self, node, backend_library): tmpl_c = self.l2_c_template(node, backend_library) return { - os.path.join(self.src_dir, node.name + ".c"): os.path.join(self.tmpl_dir, tmpl_c), - os.path.join(self.inc_dir, node.name + ".h"): os.path.join(self.tmpl_dir, "layer_L2_h_template.h"), + os.path.join(self.src_dir, node.name + ".c"): 
os.path.join(self.layer_tmpl_dir, tmpl_c), + os.path.join(self.inc_dir, node.name + ".h"): os.path.join(self.layer_tmpl_dir, "layer_L2_h_template.h"), } def mapping_layers_to_C_files(self): @@ -170,7 +171,11 @@ def get_file_path(self): @property def tmpl_dir(self): - return os.path.realpath(os.path.join(self.get_file_path(), 'Templates/layer_templates')) + return os.path.realpath(os.path.join(self.get_file_path(), 'Templates')) + + @property + def layer_tmpl_dir(self): + return os.path.realpath(os.path.join(self.tmpl_dir, 'layer_templates')) @property def utils_files_dir(self): diff --git a/dory/Utils/Templates_writer/Network_template_writer.py b/dory/Utils/Templates_writer/Network_template_writer.py index b7c66be9..b8abdd88 100644 --- a/dory/Utils/Templates_writer/Network_template_writer.py +++ b/dory/Utils/Templates_writer/Network_template_writer.py @@ -29,6 +29,7 @@ def print_template_network( config_file, verbose_level, perf_layer, + tmpl_dir, app_directory, inc_dir_rel, src_dir_rel @@ -86,19 +87,19 @@ def print_template_network( l += "// %s %s\n" % (k.ljust(30), v) tk['DORY_HW_graph'] = graph root = os.path.realpath(os.path.dirname(__file__)) - tmpl = Template(filename=os.path.join(root, "../../Hardware_targets", HW_description["name"], "Templates/network_c_template.c")) + tmpl = Template(filename=os.path.join(tmpl_dir, "network_c_template.c")) s = tmpl.render(verbose_log=l, **tk) save_string = os.path.join(app_directory, src_dir_rel, prefix + 'network.c') with open(save_string, "w") as f: f.write(s) - tmpl = Template(filename=os.path.join(root, "../../Hardware_targets", HW_description["name"], "Templates/network_h_template.h")) + tmpl = Template(filename=os.path.join(tmpl_dir, "network_h_template.h")) s = tmpl.render(verbose_log=l, **tk) save_string = os.path.join(app_directory, inc_dir_rel, prefix + 'network.h') with open(save_string, "w") as f: f.write(s) - tmpl = Template(filename=os.path.join(root, "../../Hardware_targets", HW_description["name"], "Templates/main_template.c")) + tmpl = Template(filename=os.path.join(tmpl_dir, "main_template.c")) s = tmpl.render(verbose_log=l, **tk) save_string = os.path.join(app_directory, src_dir_rel, prefix + 'main.c') with open(save_string, "w") as f: From ce5a4f45f91d4d08c3e942802a5f69fe268e7f58 Mon Sep 17 00:00:00 2001 From: Luka Macan Date: Tue, 30 Jan 2024 16:02:27 +0100 Subject: [PATCH 26/30] Initialize monitors only once in network --- .../PULP/GAP9/Utils_files/net_utils.h | 6 +++ .../layer_L2_c_conv_ne16_multicore_template.c | 33 +----------- .../GAP9_NE16/Templates/network_c_template.c | 27 +++++++++- .../PULP/GAP9_NE16/Utils_files/monitor.c | 42 +++++++++++++++ .../PULP/GAP9_NE16/Utils_files/monitor.h | 51 +++++-------------- 5 files changed, 87 insertions(+), 72 deletions(-) create mode 100644 dory/Hardware_targets/PULP/GAP9_NE16/Utils_files/monitor.c diff --git a/dory/Hardware_targets/PULP/GAP9/Utils_files/net_utils.h b/dory/Hardware_targets/PULP/GAP9/Utils_files/net_utils.h index 1ed57c63..0464d0f4 100644 --- a/dory/Hardware_targets/PULP/GAP9/Utils_files/net_utils.h +++ b/dory/Hardware_targets/PULP/GAP9/Utils_files/net_utils.h @@ -2,6 +2,7 @@ #define __PERF_UTILS_H__ #include #include +#include "monitor.h" // Padding flags @@ -11,6 +12,10 @@ #define NET_UTILS_PAD_LEFT (1 << 0) #define NET_UTILS_NO_PAD (0) +typedef struct { + Monitor input, output, store_conf; +} TaskMonitors; + typedef struct { unsigned int L3_input; unsigned int L3_output; @@ -23,6 +28,7 @@ typedef struct { unsigned int ram; unsigned int padding; unsigned int 
layer_id; + TaskMonitors *monitor; } layer_args_t; void print_perf(const char *name, const int cycles, const int macs); diff --git a/dory/Hardware_targets/PULP/GAP9_NE16/Templates/layer_templates/layer_L2_c_conv_ne16_multicore_template.c b/dory/Hardware_targets/PULP/GAP9_NE16/Templates/layer_templates/layer_L2_c_conv_ne16_multicore_template.c index d9e584e8..e6fe39d4 100644 --- a/dory/Hardware_targets/PULP/GAP9_NE16/Templates/layer_templates/layer_L2_c_conv_ne16_multicore_template.c +++ b/dory/Hardware_targets/PULP/GAP9_NE16/Templates/layer_templates/layer_L2_c_conv_ne16_multicore_template.c @@ -182,10 +182,6 @@ static int inc(int index, int end) { #define BUFFER_SIZE (2) -typedef struct { - Monitor input, output, store_conf; -} TaskMonitors; - struct layer_task_fork_args_t { uint32_t L2_input; uint32_t L2_weights; @@ -360,29 +356,6 @@ void ${func_name}(void *args) { layer_args_t *layer_args = (layer_args_t *)args; - // Initialization - TaskMonitors monitor; - - int err = 0; - - if (err = monitor_init(&monitor.input, BUFFER_SIZE)) { - printf("Input monitor initialization failed with status %d.\n", err); - return; - } - - if (err = monitor_init(&monitor.output, BUFFER_SIZE)) { - printf("Output monitor initialization failed with status %d.\n", err); - monitor_term(monitor.input); - return; - } - - if (err = monitor_init(&monitor.store_conf, BUFFER_SIZE)) { - printf("Store conf monitor initialization failed with status %d.\n", err); - monitor_term(monitor.input); - monitor_term(monitor.output); - return; - } - // Init nnx tasks ne16_task_t ne16_tasks[BUFFER_SIZE]; for (int i = 0; i < BUFFER_SIZE; i++) { @@ -417,14 +390,10 @@ void ${func_name}(void *args) { .ne16_tasks = ne16_tasks, .tiles = tiles, .store_conf = store_conf, - .monitor = &monitor, + .monitor = layer_args->monitor, }; pi_cl_team_fork(CORES, layer_task_fork, (void *)&layer_task_fork_args); // Terminate - - monitor_term(monitor.input); - monitor_term(monitor.output); - monitor_term(monitor.store_conf); } diff --git a/dory/Hardware_targets/PULP/GAP9_NE16/Templates/network_c_template.c b/dory/Hardware_targets/PULP/GAP9_NE16/Templates/network_c_template.c index cfbf6096..ac443d03 100644 --- a/dory/Hardware_targets/PULP/GAP9_NE16/Templates/network_c_template.c +++ b/dory/Hardware_targets/PULP/GAP9_NE16/Templates/network_c_template.c @@ -216,6 +216,27 @@ void ${prefix}network_run_cluster(void *args) { const ne16_pulp_conf_t ne16_pulp_conf = {.max_stall = 8}; ne16_nnx_init(ne16_pulp_get_dev(), &ne16_pulp_conf); + TaskMonitors monitor; + + int err = 0; + + if (err = monitor_init(&monitor.input, 2)) { + printf("Input monitor initialization failed with status %d.\n", err); + return; + } + + if (err = monitor_init(&monitor.output, 2)) { + printf("Output monitor initialization failed with status %d.\n", err); + monitor_term(monitor.input); + return; + } + + if (err = monitor_init(&monitor.store_conf, 2)) { + printf("Store conf monitor initialization failed with status %d.\n", err); + monitor_term(monitor.input); + monitor_term(monitor.output); + return; + } /* ---------------------------------- */ /* --------- SECTION 0 END ---------- */ /* ---------------------------------- */ @@ -317,7 +338,8 @@ void ${prefix}network_run_cluster(void *args) { .L1_buffer = 0, .ram = (unsigned int) get_ram_ptr(), .padding = NET_UTILS_PAD_TOP | NET_UTILS_PAD_BOTTOM, - .layer_id = i + .layer_id = i, + .monitor = &monitor, }; % if 'Yes' in performance or 'Perf_final' in verbose_level: @@ -514,4 +536,7 @@ void ${prefix}network_run_cluster(void *args) { /* 
---------------------------------- */ ne16_nnx_term(ne16_pulp_get_dev()); + monitor_term(monitor.input); + monitor_term(monitor.output); + monitor_term(monitor.store_conf); } diff --git a/dory/Hardware_targets/PULP/GAP9_NE16/Utils_files/monitor.c b/dory/Hardware_targets/PULP/GAP9_NE16/Utils_files/monitor.c new file mode 100644 index 00000000..730a2c45 --- /dev/null +++ b/dory/Hardware_targets/PULP/GAP9_NE16/Utils_files/monitor.c @@ -0,0 +1,42 @@ +#include "monitor.h" +#include "pmsis.h" + + +int monitor_init(Monitor * const monitor, int buffer_size) { + monitor->empty = pi_cl_sem_alloc(); + if (monitor->empty == 0) { + return -1; + } + + monitor->full = pi_cl_sem_alloc(); + if (monitor->full == 0) { + pi_cl_sem_free(monitor->empty); + return -2; + } + + pi_cl_sem_set(monitor->full, 0); + pi_cl_sem_set(monitor->empty, buffer_size); + + return 0; +} + +void monitor_term(Monitor monitor) { + pi_cl_sem_free(monitor.empty); + pi_cl_sem_free(monitor.full); +} + +void monitor_produce_begin(Monitor monitor) { + pi_cl_sem_dec(monitor.empty); +} + +void monitor_produce_end(Monitor monitor) { + pi_cl_sem_inc(monitor.full, 1); +} + +void monitor_consume_begin(Monitor monitor) { + pi_cl_sem_dec(monitor.full); +} + +void monitor_consume_end(Monitor monitor) { + pi_cl_sem_inc(monitor.empty, 1); +} diff --git a/dory/Hardware_targets/PULP/GAP9_NE16/Utils_files/monitor.h b/dory/Hardware_targets/PULP/GAP9_NE16/Utils_files/monitor.h index f1179401..b471d994 100644 --- a/dory/Hardware_targets/PULP/GAP9_NE16/Utils_files/monitor.h +++ b/dory/Hardware_targets/PULP/GAP9_NE16/Utils_files/monitor.h @@ -1,4 +1,8 @@ -#include "pmsis.h" +#ifndef __MONITOR_H__ +#define __MONITOR_H__ + +#include + typedef struct Monitor { uint32_t empty; @@ -6,42 +10,11 @@ typedef struct Monitor { } Monitor; -static int monitor_init(Monitor * const monitor, int buffer_size) { - monitor->empty = pi_cl_sem_alloc(); - if (monitor->empty == 0) { - return -1; - } - - monitor->full = pi_cl_sem_alloc(); - if (monitor->full == 0) { - pi_cl_sem_free(monitor->empty); - return -2; - } - - pi_cl_sem_set(monitor->full, 0); - pi_cl_sem_set(monitor->empty, buffer_size); - - return 0; -} - -static void monitor_term(Monitor monitor) { - pi_cl_sem_free(monitor.empty); - pi_cl_sem_free(monitor.full); -} - -static void monitor_produce_begin(Monitor monitor) { - pi_cl_sem_dec(monitor.empty); -} - -static void monitor_produce_end(Monitor monitor) { - pi_cl_sem_inc(monitor.full, 1); -} - -static void monitor_consume_begin(Monitor monitor) { - pi_cl_sem_dec(monitor.full); -} - -static void monitor_consume_end(Monitor monitor) { - pi_cl_sem_inc(monitor.empty, 1); -} +int monitor_init(Monitor * const monitor, int buffer_size); +void monitor_term(Monitor monitor); +void monitor_produce_begin(Monitor monitor); +void monitor_produce_end(Monitor monitor); +void monitor_consume_begin(Monitor monitor); +void monitor_consume_end(Monitor monitor); +#endif // __MONITOR_H__ From fd5bdf91d621d319f327725fd3ee0b027d0b3ede Mon Sep 17 00:00:00 2001 From: Luka Macan Date: Thu, 1 Feb 2024 08:46:31 +0100 Subject: [PATCH 27/30] Fix relu check for ne16 --- dory/Hardware_targets/PULP/GAP9_NE16/HW_Parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dory/Hardware_targets/PULP/GAP9_NE16/HW_Parser.py b/dory/Hardware_targets/PULP/GAP9_NE16/HW_Parser.py index a5588601..45fd3520 100644 --- a/dory/Hardware_targets/PULP/GAP9_NE16/HW_Parser.py +++ b/dory/Hardware_targets/PULP/GAP9_NE16/HW_Parser.py @@ -60,7 +60,7 @@ def valid_ne16_node(self, node, idx): 
bias_type=IntegerType(name=f"{node.constant_type}{node.bias_bits}"), has_norm_quant=True, # TODO has_bias=True, - has_relu="Relu" in node.name + has_relu=node.min >= 0 ) return True, "" except ValidationError as e: From 55734135127a673e22a91f3e410b4ebdeceeae78 Mon Sep 17 00:00:00 2001 From: Luka Macan Date: Thu, 1 Feb 2024 09:43:49 +0100 Subject: [PATCH 28/30] Fix ne16 relu flag in the layer template --- .../layer_templates/layer_L2_c_conv_ne16_multicore_template.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dory/Hardware_targets/PULP/GAP9_NE16/Templates/layer_templates/layer_L2_c_conv_ne16_multicore_template.c b/dory/Hardware_targets/PULP/GAP9_NE16/Templates/layer_templates/layer_L2_c_conv_ne16_multicore_template.c index e6fe39d4..85642bfb 100644 --- a/dory/Hardware_targets/PULP/GAP9_NE16/Templates/layer_templates/layer_L2_c_conv_ne16_multicore_template.c +++ b/dory/Hardware_targets/PULP/GAP9_NE16/Templates/layer_templates/layer_L2_c_conv_ne16_multicore_template.c @@ -366,7 +366,7 @@ void ${func_name}(void *args) { &ne16_tasks[i], (ne16_quant_t) { .shift_amount = ${out_shift}, - .function = quantFunctionRelu, // todo: un-hardcode it + .function = ${"quantFunctionIdentity" if node.min < 0 else "quantFunctionRelu"}, .flag_rounding = ne16TaskFlagFalse }, (ne16_norm_t) { .mode = ${"normMode32Bit" if act_dim_bit == 32 else "normMode8Bit" if act_dim_bit == 8 else ""}, From 2f5d6ce4cc3679f13084b717742dc9536ccd143f Mon Sep 17 00:00:00 2001 From: Luka Macan Date: Wed, 14 Feb 2024 10:47:56 +0100 Subject: [PATCH 29/30] Reduce stack size to 2kB for all cluster cores --- dory/Hardware_targets/PULP/GAP9/HW_description.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dory/Hardware_targets/PULP/GAP9/HW_description.json b/dory/Hardware_targets/PULP/GAP9/HW_description.json index 46b28b5c..00a1e6ee 100644 --- a/dory/Hardware_targets/PULP/GAP9/HW_description.json +++ b/dory/Hardware_targets/PULP/GAP9/HW_description.json @@ -37,8 +37,8 @@ "accelerator frequency": 370000000, "peripheral frequency": 370000000, "HW specific parameters":{ - "accelerator core0 stack": 3500, - "accelerator core1-7 stack": 3400 + "accelerator core0 stack": 2048, + "accelerator core1-7 stack": 2048 }, "double_buffering": 2, "split_ints": true, From 4bc1357cd0a340faef69db7b48af16b90ca52c68 Mon Sep 17 00:00:00 2001 From: Luka Macan Date: Tue, 19 Mar 2024 12:16:41 +0100 Subject: [PATCH 30/30] Bump pulp-nnx --- .../PULP/Backend_Kernels/pulp-nnx | 2 +- .../PULP/GAP9_NE16/Utils_files/execute.h | 20 +++++++++---------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/dory/Hardware_targets/PULP/Backend_Kernels/pulp-nnx b/dory/Hardware_targets/PULP/Backend_Kernels/pulp-nnx index 70b089ed..234971fc 160000 --- a/dory/Hardware_targets/PULP/Backend_Kernels/pulp-nnx +++ b/dory/Hardware_targets/PULP/Backend_Kernels/pulp-nnx @@ -1 +1 @@ -Subproject commit 70b089ed7a050fd99c775f9b7504dfcbd732c02f +Subproject commit 234971fca4a0eba5e8b703e9ccb62b7764dac7fa diff --git a/dory/Hardware_targets/PULP/GAP9_NE16/Utils_files/execute.h b/dory/Hardware_targets/PULP/GAP9_NE16/Utils_files/execute.h index aa07add1..46a237a6 100644 --- a/dory/Hardware_targets/PULP/GAP9_NE16/Utils_files/execute.h +++ b/dory/Hardware_targets/PULP/GAP9_NE16/Utils_files/execute.h @@ -17,11 +17,11 @@ static inline void execute_prepare(Layer tile, ne16_task_t *const task) { tile.output.height, tile.output.width, tile.output.channel, tile.output.width * tile.output.channel, tile.output.channel, tile.padding.top, 
tile.padding.bottom, - tile.padding.right, tile.padding.left); - ne16_task_set_ptrs(task, tile.addr.input, tile.input.width, - tile.input.channel, tile.padding.top, tile.padding.left, - tile.addr.output, tile.addr.weights, tile.addr.scale, - 0 /*shift_ptr*/, tile.addr.bias); + tile.padding.left, tile.padding.right); + ne16_task_set_addr_conv(task, tile.addr.input, tile.input.width, + tile.input.channel, tile.padding.top, tile.padding.left, + tile.addr.output, tile.addr.weights); + ne16_task_set_addr_norm_quant(task, tile.addr.scale, 0 /*shift_addr*/, tile.addr.bias); } static inline void execute_async(ne16_task_t *task) { @@ -37,11 +37,11 @@ static inline void execute_stride2x2_prepare(Layer tile, Kernel kernel, tile.output.height, tile.output.width, tile.output.channel, tile.output.width * tile.output.channel, tile.output.channel, kernel.shape.height, kernel.shape.width, tile.padding.top, - tile.padding.bottom, tile.padding.right, tile.padding.left); - ne16_task_set_ptrs(task, tile.addr.input, tile.input.width, - tile.input.channel, tile.padding.top, tile.padding.left, - tile.addr.output, tile.addr.weights, tile.addr.scale, - 0 /*shift_ptr*/, tile.addr.bias); + tile.padding.bottom, tile.padding.left, tile.padding.right); + ne16_task_set_addr_conv(task, tile.addr.input, tile.input.width, + tile.input.channel, tile.padding.top, tile.padding.left, + tile.addr.output, tile.addr.weights); + ne16_task_set_addr_norm_quant(task, tile.addr.scale, 0 /*shift_addr*/, tile.addr.bias); } static inline void execute_stride2x2_blocking(ne16_task_t *task, Layer tile,