From 78974a101d46d423e92f98bc057375ff9bac1324 Mon Sep 17 00:00:00 2001 From: Luka Macan Date: Fri, 1 Dec 2023 19:43:28 +0100 Subject: [PATCH 01/30] Harmonize onnx mangling --- dory/Frontend_frameworks/Quantlab/Parser.py | 6 +++++- dory/Parsers/Parser_ONNX_to_DORY.py | 8 +++++++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/dory/Frontend_frameworks/Quantlab/Parser.py b/dory/Frontend_frameworks/Quantlab/Parser.py index ffaaba3b..a2ae14be 100644 --- a/dory/Frontend_frameworks/Quantlab/Parser.py +++ b/dory/Frontend_frameworks/Quantlab/Parser.py @@ -93,7 +93,11 @@ def frontend_mapping_to_DORY_nodes(self): # input 1 parameters #### Look at the order of inputs in Onnx. If the lowest index #is not the first argument, revert the order inmul1 and inmul2 - if int(node.input_indexes[0]) > int(node.input_indexes[1]): + def pos_in_schedule(node: str) -> int: + for idx, node_iterating in enumerate(list(self.graph.graph.node)): + if node_iterating.output[0] == node: + return idx + if pos_in_schedule(node.input_indexes[0]) > pos_in_schedule(node.input_indexes[1]): temp_shift = node.in1_shift temp_mul = node.in1_mul temp_add = node.in1_add diff --git a/dory/Parsers/Parser_ONNX_to_DORY.py b/dory/Parsers/Parser_ONNX_to_DORY.py index 48179a63..cc17c35f 100644 --- a/dory/Parsers/Parser_ONNX_to_DORY.py +++ b/dory/Parsers/Parser_ONNX_to_DORY.py @@ -60,6 +60,10 @@ def create_node(self, node_iterating, graph): return new_node def ONNXtoDORY(self): + def pos_in_schedule(node: str) -> int: + for idx, node_iterating in enumerate(list(self.graph.graph.node)): + if node_iterating.output[0] == node: + return idx ######### CREATING NODES ########### print("\nParsing ONNX Graph to create DORY graph.") for node_iterating in (self.graph.graph.node): @@ -68,7 +72,9 @@ def ONNXtoDORY(self): ### Neglecting some nodes since they are not translated to any operation on any backend if node_iterating.op_type in self.layers_neglected: for node in self.DORY_Graph[::-1]: - if int(node_iterating.output[0]) > int(node.get_parameter('output_index')) and node.get_parameter("name") != "Constant": + node_iter_output = pos_in_schedule(node_iterating.output[0]) + node_output_index = pos_in_schedule(node.get_parameter('output_index')) + if node_iter_output > node_output_index and node.get_parameter("name") != "Constant": node.add_existing_parameter('output_index', node_iterating.output[0]) break # Adding a new layer From 646710fa26d314f46f907fdf45727eaf4af4b13b Mon Sep 17 00:00:00 2001 From: Luka Macan Date: Fri, 1 Dec 2023 19:43:58 +0100 Subject: [PATCH 02/30] Harmonize quantlib renaming last output node --- dory/Parsers/HW_node.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/dory/Parsers/HW_node.py b/dory/Parsers/HW_node.py index 9e5fd49e..e7abc008 100644 --- a/dory/Parsers/HW_node.py +++ b/dory/Parsers/HW_node.py @@ -213,6 +213,9 @@ def add_checksum_activations_integer(self, load_directory, node_number, n_inputs self.check_sum_in.append(int(sum(x))) outfile = f'out_layer{node_number}.txt' if n_inputs == 1 else f'out_{in_idx}_layer{node_number}.txt' + # quantlib hack + if not os.path.isfile(os.path.join(load_directory, outfile)): + outfile = "output.txt" try: y = np.loadtxt(os.path.join(load_directory, outfile), delimiter=',', dtype=np.int64, usecols=[0]) except ValueError: From 71d75bf25e5abccbe18a23b253760ef4dc9d27d6 Mon Sep 17 00:00:00 2001 From: Luka Macan Date: Tue, 5 Dec 2023 17:33:55 +0100 Subject: [PATCH 03/30] Add some string parsing --- dory/Hardware_targets/PULP/GAP9_NE16/HW_Parser.py | 14 +++++++------- 1 
file changed, 7 insertions(+), 7 deletions(-) diff --git a/dory/Hardware_targets/PULP/GAP9_NE16/HW_Parser.py b/dory/Hardware_targets/PULP/GAP9_NE16/HW_Parser.py index d5697441..6ae78200 100644 --- a/dory/Hardware_targets/PULP/GAP9_NE16/HW_Parser.py +++ b/dory/Hardware_targets/PULP/GAP9_NE16/HW_Parser.py @@ -42,7 +42,7 @@ def get_pattern_rewriter(self): def get_tiler(self): return Tiler_GAP9 - def valid_ne16_node(self, node): + def valid_ne16_node(self, node, idx): try: Ne16TestConf( in_height=node.input_dimensions[0], @@ -60,19 +60,19 @@ def valid_ne16_node(self, node): bias_type=IntegerType(name=f"{node.constant_type}{node.bias_bits}"), has_norm_quant=True, # TODO has_bias=True, - has_relu=True + has_relu="Relu" in node.name ) return True, "" except ValidationError as e: - msg = f"WARNING: Failed allocating node {node.name} to the NE16 accelerator. Errors:\n" + msg = f"WARNING: Failed allocating node {node.name}{idx} to the NE16 accelerator. Errors:\n" for error in e.errors(): msg += f" - {error['msg']}\n" msg += "\nNOTE: Falling back to cluster engine.\n" return False, msg - def engine_coloring(self, node): + def engine_coloring(self, node, idx): if "Conv" in node.op_type or "Convolution" in node.op_type: - is_valid, msg = self.valid_ne16_node(node) + is_valid, msg = self.valid_ne16_node(node, idx) if is_valid: node.engine = "ne16" return @@ -83,8 +83,8 @@ def engine_coloring(self, node): def mapping_to_HW_nodes(self): super().mapping_to_HW_nodes() print("\nPULP Backend: Assigning nodes to engines.") - for node in self.DORY_Graph: - self.engine_coloring(node) + for i, node in enumerate(self.DORY_Graph): + self.engine_coloring(node, i) assert all(hasattr(node, "engine") for node in self.DORY_Graph) def transform_nodes_to_hw_nodes(self): From 41fd9b65e825bfbc596239ff3264c5a4d3793035 Mon Sep 17 00:00:00 2001 From: Luka Macan Date: Tue, 19 Dec 2023 18:09:23 +0100 Subject: [PATCH 04/30] Parser fixes --- dory/Parsers/HW_node.py | 5 +++-- dory/Parsers/Parser_DORY_to_HW.py | 3 --- .../Utils/Templates_writer/Layer2D_template_writer.py | 11 ++++------- 3 files changed, 7 insertions(+), 12 deletions(-) diff --git a/dory/Parsers/HW_node.py b/dory/Parsers/HW_node.py index e7abc008..7a544743 100644 --- a/dory/Parsers/HW_node.py +++ b/dory/Parsers/HW_node.py @@ -72,6 +72,7 @@ def create_tiling_dimensions(self, previous_node, config_file): # ATTENTION MEMORY L3 --> TILE MEMORY DIMENSION --> Decide how to set. Re-init the whole memory? 
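# Context for the hunk below: create_tiling_dimensions() walks the memory
# hierarchy from the outermost level down to L2, asking the target's Tiler for
# tile shapes that fit one level lower and caching them in tiling_dimensions.
# The assert added by this patch fails fast when the solver returns a
# non-positive tile dimension, rather than letting a degenerate tiling
# propagate further down the flow.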
for level in np.arange(self.HW_description["memory"]["levels"],1, -1): (weights_dim, input_dims, output_dims) = self.Tiler(self, previous_node, config_file["code reserved space"]).get_tiling(level) + assert all(dim > 0 for dim in weights_dim + input_dims + output_dims) self.tiling_dimensions["L{}".format(level-1)]["input_dimensions"] = input_dims self.tiling_dimensions["L{}".format(level-1)]["output_dimensions"] = output_dims if "Convolution" in self.name or "FullyConnected" in self.name: @@ -98,8 +99,8 @@ def create_tiling_dimensions(self, previous_node, config_file): self.tiling_dimensions["L{}".format(level-1)]["bias_memory"] = int(bias_memory) self.tiling_dimensions["L{}".format(level-1)]["constants_memory"] = int(constants_memory) - self.tiling_dimensions["L{}".format(level-1)]["input_activation_memory"] = np.prod(self.tiling_dimensions["L{}".format(level-1)]["input_dimensions"])*self.input_activation_bits/8 - self.tiling_dimensions["L{}".format(level-1)]["output_activation_memory"] = np.prod(self.tiling_dimensions["L{}".format(level-1)]["output_dimensions"])*self.output_activation_bits/8 + self.tiling_dimensions["L{}".format(level-1)]["input_activation_memory"] = int(np.prod(self.tiling_dimensions["L{}".format(level-1)]["input_dimensions"])*self.input_activation_bits/8) + self.tiling_dimensions["L{}".format(level-1)]["output_activation_memory"] = int(np.prod(self.tiling_dimensions["L{}".format(level-1)]["output_dimensions"])*self.output_activation_bits/8) def rename_weights(self): weight_name = "" diff --git a/dory/Parsers/Parser_DORY_to_HW.py b/dory/Parsers/Parser_DORY_to_HW.py index dcd75d10..902e0a64 100644 --- a/dory/Parsers/Parser_DORY_to_HW.py +++ b/dory/Parsers/Parser_DORY_to_HW.py @@ -171,9 +171,6 @@ def tiling(self): print("\nInsert tiling parameters per layer inside graph nodes") prev = None for node in self.DORY_Graph: - ######################## NEED A FIX #################################################### - #### OTHERWISE ONLY WEIGHT < L2/2 GO in L2 --> much more L3 tiling not needed############ - ######################################################################################### node.create_tiling_dimensions(prev if prev else node, self.config_file) prev = node diff --git a/dory/Utils/Templates_writer/Layer2D_template_writer.py b/dory/Utils/Templates_writer/Layer2D_template_writer.py index 6964acbd..78c8dcfd 100644 --- a/dory/Utils/Templates_writer/Layer2D_template_writer.py +++ b/dory/Utils/Templates_writer/Layer2D_template_writer.py @@ -408,13 +408,10 @@ def print_template_layer(node, layer_type, double_buffering = 2): # x last tk['x_tile_size_nif_last'] = n_in % tile_n_in if (n_in % tile_n_in) > 0 else tile_n_in tk['x_tile_size_nif_byte_last'] = int(math.ceil(tk['x_tile_size_nif_last'] * ds_x / 8.0)) - tk['x_tile_size_h_last'] = tk['y_tile_size_h_last'] * s[0] + ks[0] - s[0] - (padding_bottom - ((h_in + padding_bottom + padding_top) - (h_out* s[0] + ks[0] - s[0]))) - tk['x_tile_size_w_last'] = tk['y_tile_size_w_last'] * s[1] + ks[1] - s[1] - (padding_right - ((w_in + padding_left + padding_right) - (w_out* s[1] + ks[1] - s[1]))) - ## single tile execution - if tk['x_tile_size_h_last'] > tk['x_tile_size_h']: - tk['x_tile_size_h_last'] = tk['x_tile_size_h'] - if tk['x_tile_size_w_last'] > tk['x_tile_size_w']: - tk['x_tile_size_w_last'] = tk['x_tile_size_w'] + tk['x_tile_size_h_last'] = tk['y_tile_size_h_last'] * s[0] + ks[0] - s[0] + assert tk['x_tile_size_h_last'] >= 0 and tk['x_tile_size_h_last'] <= tk['x_tile_size_h'] + tk['x_tile_size_w_last'] = 
tk['y_tile_size_w_last'] * s[1] + ks[1] - s[1] + assert tk['x_tile_size_w_last'] >= 0 and tk['x_tile_size_w_last'] <= tk['x_tile_size_w'] l = "" for k, v in tk.items(): From 115f8b37f58cac789ec80530e8b705baee7cc133 Mon Sep 17 00:00:00 2001 From: Luka Macan Date: Tue, 19 Dec 2023 18:46:36 +0100 Subject: [PATCH 05/30] Fix L3-L2 tiling --- .../PULP/GAP9_NE16/Tiler/tiler_conv2d_ne16.py | 132 +++++++++++------- 1 file changed, 82 insertions(+), 50 deletions(-) diff --git a/dory/Hardware_targets/PULP/GAP9_NE16/Tiler/tiler_conv2d_ne16.py b/dory/Hardware_targets/PULP/GAP9_NE16/Tiler/tiler_conv2d_ne16.py index 66648fb7..e4b76297 100644 --- a/dory/Hardware_targets/PULP/GAP9_NE16/Tiler/tiler_conv2d_ne16.py +++ b/dory/Hardware_targets/PULP/GAP9_NE16/Tiler/tiler_conv2d_ne16.py @@ -65,14 +65,14 @@ def get_tiling_conv2d_L3(self): # We assume that the first nodes input is always in L2 input_in_l2 = is_first_node or prev_tiling["L3"]["output_dimensions"] == prev_tiling["L2"]["output_dimensions"] - self.node.tiling_dimensions["L2"]["db_x"] = 1 - self.node.tiling_dimensions["L2"]["db_y"] = 1 - self.node.tiling_dimensions["L2"]["db_w"] = 1 - buffer_total = self.node.input_activation_memory + self.node.output_activation_memory + self.node.weight_memory + self.node.bias_memory + self.node.constants_memory # Don't tile if the whole thing fits into L2 - if (buffer_total <= L2_memory) and (input_in_l2 or is_first_node): + if (buffer_total <= L2_memory) and input_in_l2: + self.node.tiling_dimensions["L2"]["db_x"] = 1 + self.node.tiling_dimensions["L2"]["db_y"] = 1 + self.node.tiling_dimensions["L2"]["db_w"] = 1 + if "PointwiseDepthwisePointwise" in self.node.name: return ([self.node.output_channels, self.node.input_channels], [self.node.input_channels, self.node.input_dimensions[0], self.node.input_dimensions[1]], @@ -94,7 +94,6 @@ def get_tiling_conv2d_L3(self): in_ch = self.node.input_channels s = self.node.strides g = self.node.group - p = self.node.pads depthwise = g > 1 db_x = 1 if input_in_l2 else 2 @@ -113,16 +112,30 @@ def get_tiling_conv2d_L3(self): h_in = in_dim[0] w_in = in_dim[1] n_in = in_ch - tile_h_out = solver.IntVar(1, out_dim[0], 'tile_h_out') + + # optimization variables + opt_vars = [] + + def add_var_cond(min: int, max: int, name: str, cond: bool): + if cond: + var = solver.IntVar(min, max, name) + opt_vars.append(var) + else: + var = max + return var + + tile_h_out = add_var_cond(1, h_out, 'tile_h_out', db_y > 1) + tile_n_out = add_var_cond(1, out_ch, 'tile_n_out', db_w > 1) + tile_h_in = add_var_cond(ks[0], h_in, 'tile_h_in', db_x > 1) + + # We are not tiling width nor input channel tile_w_out = w_out - tile_n_out = solver.IntVar(1, out_ch, 'tile_n_out') - tile_h_in = solver.IntVar(in_dim[0] if input_in_l2 else ks[0], in_dim[0], 'tile_h_in') tile_w_in = w_in tile_n_in = tile_n_out if depthwise else n_in # size constraint - input_tile_dimension = db_x * n_in * tile_h_in * in_dim[1] * (self.node.input_activation_bits // 8) - output_tile_dimension = tile_h_out * db_y * n_out * out_dim[1] * (self.node.output_activation_bits // 8) + input_tile_dimension = db_x * n_in * tile_h_in * w_in * (self.node.input_activation_bits // 8) + output_tile_dimension = tile_h_out * db_y * n_out * w_out * (self.node.output_activation_bits // 8) if "DepthwisePointwise" in self.node.name: weight_tile_dimension = db_w * (self.node.calculate_weights_size(tile_n_in, tile_n_in, ks, self.node.weight_bits, dw=True) + @@ -148,23 +161,23 @@ def get_tiling_conv2d_L3(self): solver.Add(total_size <= L2_memory) - # geometrical 
constraint - if db_y == 1: - solver.Add(tile_h_out == h_out) - else: + # policy constraints + if db_y > 1: solver.Add(solver.IntConst(h_out) % tile_h_out == 0) - if db_w == 1: - solver.Add(tile_n_out == out_ch) - else: + if db_w > 1: solver.Add(solver.IntConst(n_out) % tile_n_out == 0) - if db_x == 2 and db_y == 2: - solver.Add(tile_h_out * s[0] == tile_h_in - (ks[0] - 1) + (s[0] - 1)) - - if db_x == 2: + if db_x > 1: solver.Add(solver.IntConst(h_out) % ((tile_h_in - ks[0] + s[0]) // s[0]) == 0) + # geometrical constraints + if db_x > 1: + solver.Add((tile_h_in - ks[0]) % s[0] == 0) + + if db_x > 1 and db_y > 1: + solver.Add(tile_h_in == tile_h_out * s[0] + (ks[0] - s[0])) + # objective obj_expr = solver.IntVar(0, 100000000000000, "obj_expr") @@ -177,7 +190,7 @@ def get_tiling_conv2d_L3(self): # maximize the objective objective = solver.Maximize(obj_expr, 1) - decision_builder = solver.Phase([tile_n_out, tile_h_in, tile_h_out], + decision_builder = solver.Phase(opt_vars, solver.CHOOSE_FIRST_UNBOUND, solver.ASSIGN_MIN_VALUE) @@ -185,18 +198,20 @@ def get_tiling_conv2d_L3(self): collector = solver.LastSolutionCollector() # Add the decision variables. - collector.Add(tile_n_out) - collector.Add(tile_h_in) - collector.Add(tile_h_out) + for var in opt_vars: + collector.Add(var) # Add the objective. collector.AddObjective(obj_expr) solver.Solve(decision_builder, [objective, collector]) if collector.SolutionCount() > 0: best_solution = collector.SolutionCount() - 1 - tile_n_out = collector.Value(best_solution, tile_n_out) - tile_h_in = collector.Value(best_solution, tile_h_in) - tile_h_out = collector.Value(best_solution, tile_h_out) + if db_y > 1: + tile_h_out = collector.Value(best_solution, tile_h_out) + if db_w > 1: + tile_n_out = collector.Value(best_solution, tile_n_out) + if db_x > 1: + tile_h_in = collector.Value(best_solution, tile_h_in) self.node.tiling_dimensions["L2"]["db_x"] = db_x self.node.tiling_dimensions["L2"]["db_y"] = db_y self.node.tiling_dimensions["L2"]["db_w"] = db_w @@ -232,38 +247,55 @@ def get_tiling_conv2d_L2(self): # We are recalculating these variables because the output could be tiled but the input isn't or vice versa. 
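# The recomputation relies on the usual convolution tile geometry:
#   h_in = h_out * stride + kernel_size - stride
# e.g. with a 3x3 kernel and stride 2, an output tile of 8 rows needs
# 8 * 2 + 3 - 2 = 17 input rows. The assert added below checks this invariant
# whenever the L3 tiling split either the input or the output height.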
in_mem = self.node.tiling_dimensions["L2"]["input_activation_memory"] out_mem = self.node.tiling_dimensions["L2"]["output_activation_memory"] - h_in = self.node.tiling_dimensions["L2"]["input_dimensions"][1] - h_out = self.node.tiling_dimensions["L2"]["output_dimensions"][1] - if self.node.tiling_dimensions["L3"]["output_dimensions"][1] > self.node.tiling_dimensions["L2"]["output_dimensions"][1]: - h_in = self.node.tiling_dimensions["L2"]["output_dimensions"][1] * s[0] + (ks[0] - 1) - (s[0] - 1) - in_mem = int(self.node.tiling_dimensions["L2"]["input_activation_memory"] / self.node.tiling_dimensions["L2"]["input_dimensions"][1] * h_in) - if self.node.tiling_dimensions["L3"]["input_dimensions"][1] > self.node.tiling_dimensions["L2"]["input_dimensions"][1] or \ - (self.node.tiling_dimensions["L2"]["db_x"] > 1 and self.node.tiling_dimensions["L2"]["db_y"] == 1): - h_out = int(np.floor((self.node.tiling_dimensions["L2"]["input_dimensions"][1] - (ks[0] - 1) + (s[0] - 1)) / s[0])) - out_mem = int(self.node.tiling_dimensions["L2"]["output_activation_memory"] / self.node.tiling_dimensions["L2"]["output_dimensions"][1] * h_out) + h_in_l2 = self.node.tiling_dimensions["L2"]["input_dimensions"][1] + h_out_l2 = self.node.tiling_dimensions["L2"]["output_dimensions"][1] + h_in_l3 = self.node.tiling_dimensions["L3"]["input_dimensions"][1] + h_out_l3 = self.node.tiling_dimensions["L3"]["output_dimensions"][1] + ch_out_output = self.node.tiling_dimensions["L2"]["output_dimensions"][0] + ch_out_weights = self.node.tiling_dimensions["L2"]["weights_dimensions"][0] + + # Fix output height if input height was not tiled + if h_out_l3 > h_out_l2 and h_in_l3 == h_in_l2: + h_in_l1 = h_out_l2 * s[0] + ks[0] - s[0] + in_mem = (in_mem // h_in_l2) * h_in_l1 + else: + h_in_l1 = h_in_l2 + + # Fix input height if output height was not tiled + if h_in_l3 > h_in_l2 and h_out_l3 == h_out_l2: + h_out_l1 = (h_in_l2 - ks[0] + s[0]) // s[0] + out_mem = (out_mem // h_out_l2) * h_out_l1 + else: + h_out_l1 = h_out_l2 + + # Fix if we tiled the weights output channel but not the output tiles channel if "Addition" not in self.node.name and "Pool" not in self.node.name: - out_mem = int(self.node.tiling_dimensions["L2"]["output_activation_memory"] / self.node.tiling_dimensions["L2"]["output_dimensions"][0] * self.node.tiling_dimensions["L2"]["weights_dimensions"][0]) + out_mem = int((out_mem / ch_out_output) * ch_out_weights) + buffer_total = self.node.tiling_dimensions["L2"]["weight_memory"] + self.node.tiling_dimensions["L2"]["constants_memory"] + self.node.tiling_dimensions["L2"]["bias_memory"] + in_mem + out_mem + if h_in_l3 > h_in_l2 or h_out_l3 > h_out_l2: + assert h_in_l1 == h_out_l1 * s[0] + ks[0] - s[0] + # Add intermediate buffer if "PointwiseDepthwisePointwise" in self.node.name: - buffer_total += h_in * self.node.tiling_dimensions["L2"]["input_dimensions"][2] * self.node.output_channels_list[0] - buffer_total += h_out * self.node.tiling_dimensions["L2"]["output_dimensions"][2] * self.node.output_channels_list[0] + buffer_total += h_in_l1 * self.node.tiling_dimensions["L2"]["input_dimensions"][2] * self.node.output_channels_list[0] + buffer_total += h_out_l1 * self.node.tiling_dimensions["L2"]["output_dimensions"][2] * self.node.output_channels_list[0] elif "DepthwisePointwise" in self.node.name: - buffer_total += h_out * self.node.tiling_dimensions["L2"]["output_dimensions"][2] * self.node.tiling_dimensions["L2"]["output_dimensions"][0] - - self.node.tiling_dimensions["L1"]["db_x"] = 1 - 
self.node.tiling_dimensions["L1"]["db_y"] = 1 - self.node.tiling_dimensions["L1"]["db_w"] = 1 + buffer_total += h_out_l1 * self.node.tiling_dimensions["L2"]["output_dimensions"][2] * self.node.tiling_dimensions["L2"]["output_dimensions"][0] # return immediately if the memory fits the L1 if buffer_total <= L1_memory: + self.node.tiling_dimensions["L1"]["db_x"] = 1 + self.node.tiling_dimensions["L1"]["db_y"] = 1 + self.node.tiling_dimensions["L1"]["db_w"] = 1 + return (self.node.tiling_dimensions["L2"]["weights_dimensions"], [self.node.tiling_dimensions["L2"]["input_dimensions"][0], - h_in, + h_in_l1, self.node.tiling_dimensions["L2"]["input_dimensions"][2]], [out_ch, - h_out, + h_out_l1, self.node.tiling_dimensions["L2"]["output_dimensions"][2]] ) @@ -281,10 +313,10 @@ def get_tiling_conv2d_L2(self): ############################################### parameters = pywrapcp.Solver.DefaultSolverParameters() solver = pywrapcp.Solver("simple_CP", parameters) - h_in = h_in + h_in = h_in_l1 w_in = in_dim[1] n_in = in_ch - h_out = h_out + h_out = h_out_l1 w_out = out_dim[1] n_out = out_ch tile_h_out = solver.IntVar(1, out_dim[0], 'tile_h_out') From 56553f86567139a0c47fc764e9162e8d7c47facd Mon Sep 17 00:00:00 2001 From: Luka Macan Date: Thu, 21 Dec 2023 10:34:41 +0000 Subject: [PATCH 06/30] Fix l2 mapping overwriting l3 layers --- dory/Parsers/Parser_HW_to_C.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dory/Parsers/Parser_HW_to_C.py b/dory/Parsers/Parser_HW_to_C.py index 63535abd..24064d9d 100644 --- a/dory/Parsers/Parser_HW_to_C.py +++ b/dory/Parsers/Parser_HW_to_C.py @@ -86,8 +86,8 @@ def l2_c_template(self, node, backend_library): def l2_template_mapping(self, node, backend_library): tmpl_c = self.l2_c_template(node, backend_library) return { - os.path.join(self.src_dir, node.prefixed_name + ".c"): os.path.join(self.tmpl_dir, tmpl_c), - os.path.join(self.inc_dir, node.prefixed_name + ".h"): os.path.join(self.tmpl_dir, "layer_L2_h_template.h"), + os.path.join(self.src_dir, node.name + ".c"): os.path.join(self.tmpl_dir, tmpl_c), + os.path.join(self.inc_dir, node.name + ".h"): os.path.join(self.tmpl_dir, "layer_L2_h_template.h"), } def mapping_layers_to_C_files(self): From ed308bb7995901f04804e2a7b4a0e46f5244d9c6 Mon Sep 17 00:00:00 2001 From: Luka Macan Date: Mon, 8 Jan 2024 12:47:26 +0000 Subject: [PATCH 07/30] Fix func_names doesn't have the padded func versions --- .../Templates/layer_templates/layer_L3_c_template.c | 8 +++----- .../Templates/layer_templates/layer_L3_c_template.c | 12 +++++++----- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/dory/Hardware_targets/PULP/Common/Templates/layer_templates/layer_L3_c_template.c b/dory/Hardware_targets/PULP/Common/Templates/layer_templates/layer_L3_c_template.c index 37e2aa56..c3ecc77b 100644 --- a/dory/Hardware_targets/PULP/Common/Templates/layer_templates/layer_L3_c_template.c +++ b/dory/Hardware_targets/PULP/Common/Templates/layer_templates/layer_L3_c_template.c @@ -17,11 +17,9 @@ * limitations under the License. 
*/ #include "${func_name_L3}.h" -#include "${func_name[0]}.h" -% if len(func_name)>1: -#include "${func_name[1]}.h" -#include "${func_name[2]}.h" -%endif +% for name in func_name: +#include "${name}.h" +% endfor #include "dory_get_tile.h" #include "pmsis.h" #include "bsp/fs.h" diff --git a/dory/Hardware_targets/PULP/GAP9/Templates/layer_templates/layer_L3_c_template.c b/dory/Hardware_targets/PULP/GAP9/Templates/layer_templates/layer_L3_c_template.c index c52c75f1..61d3523e 100644 --- a/dory/Hardware_targets/PULP/GAP9/Templates/layer_templates/layer_L3_c_template.c +++ b/dory/Hardware_targets/PULP/GAP9/Templates/layer_templates/layer_L3_c_template.c @@ -17,11 +17,9 @@ * limitations under the License. */ #include "${func_name_L3}.h" -#include "${func_name[0]}.h" -% if len(func_name)>1: -#include "${func_name[1]}.h" -#include "${func_name[2]}.h" -%endif +% for name in func_name: +#include "${name}.h" +% endfor #include "dory_get_tile.h" #include "pmsis.h" #include "bsp/fs.h" @@ -187,6 +185,7 @@ void __attribute__ ((noinline)) ${func_name_L3}(void *args) tile_args.L2_output = dory_get_tile_3d(db[i_db_y].y, ${0 if n_tile_y > 1 else 'j'}, 0, k, ${h_out}, ${w_out}, ${n_out}, ${w_out}, ${n_out * n_tile_W}, 0, 0, 0, 0, 0, 0, ${y_data_size_byte}); tile_args.L2_weights = db[i_db_w].w; +% if len(func_name) > 1: if (j==0) { ${func_name[1] if (n_tile_x > 1 or n_tile_y > 1) and padding > 0 else func_name[0]}((void*)&tile_args); } \ @@ -199,6 +198,9 @@ else if (j == (${n_tile_y-1})) { } else { ${func_name[0]}((void*)&tile_args); } +% else: + ${func_name[0]}(&tile_args); +% endif % if n_tile_W > 1: // waiting for weights, lambda, and k From d8e8e7d8079f1e914bc4a3a6d2839dde629e1093 Mon Sep 17 00:00:00 2001 From: Luka Macan Date: Mon, 8 Jan 2024 12:47:59 +0000 Subject: [PATCH 08/30] Fix missing node.L3_input for GAP9_NE16 --- dory/Hardware_targets/PULP/GAP9_NE16/Tiler/tiler_conv2d_ne16.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dory/Hardware_targets/PULP/GAP9_NE16/Tiler/tiler_conv2d_ne16.py b/dory/Hardware_targets/PULP/GAP9_NE16/Tiler/tiler_conv2d_ne16.py index e4b76297..f66b9697 100644 --- a/dory/Hardware_targets/PULP/GAP9_NE16/Tiler/tiler_conv2d_ne16.py +++ b/dory/Hardware_targets/PULP/GAP9_NE16/Tiler/tiler_conv2d_ne16.py @@ -65,6 +65,8 @@ def get_tiling_conv2d_L3(self): # We assume that the first nodes input is always in L2 input_in_l2 = is_first_node or prev_tiling["L3"]["output_dimensions"] == prev_tiling["L2"]["output_dimensions"] + self.node.L3_input = not input_in_l2 + buffer_total = self.node.input_activation_memory + self.node.output_activation_memory + self.node.weight_memory + self.node.bias_memory + self.node.constants_memory # Don't tile if the whole thing fits into L2 From 726401dd8a047ba7bb0848ee448a09c5008f68c2 Mon Sep 17 00:00:00 2001 From: Luka Macan Date: Mon, 8 Jan 2024 12:49:13 +0000 Subject: [PATCH 09/30] Revert remove mapping_layers_to_C_files --- .../PULP/GAP9_NE16/C_Parser.py | 34 +++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/dory/Hardware_targets/PULP/GAP9_NE16/C_Parser.py b/dory/Hardware_targets/PULP/GAP9_NE16/C_Parser.py index 263820ce..3a1da89a 100644 --- a/dory/Hardware_targets/PULP/GAP9_NE16/C_Parser.py +++ b/dory/Hardware_targets/PULP/GAP9_NE16/C_Parser.py @@ -17,6 +17,8 @@ from dory.Hardware_targets.PULP.GAP9.C_Parser import C_Parser as C_Parser_gap9 from dory.Hardware_targets.PULP.GAP9_NE16.Ne16_HW_node import Ne16_HW_node +from dory.Utils.Templates_writer.TemplateWriter import TemplateWriter +import 
dory.Utils.Templates_writer.Layer2D_template_writer as Layer2D_writer import os import sys sys.path.append(os.path.join(os.path.dirname(__file__), "..", "Backend_Kernels", "pulp-nnx", "test")) @@ -56,6 +58,38 @@ def l2_template_keywords(self, node, backend_library): tk = self.__nnx_vars(tk, node) return tk + def mapping_layers_to_C_files(self): + print("\nMapping the layers files to their templates and copying the kernels associated.") + n_memory_levels = self.HW_description['memory']['levels'] + + for i, node in enumerate(self.HWgraph): + backend_library = self.node_backend_library(node) + self.copy_backend_files(node, backend_library) + + #if (node.kernel_shape[0] == 3 and node.group == 96): + #breakpoint() + + if n_memory_levels > 2 and (node.L3_input != 0 or (node.tiling_dimensions["L3"]["output_dimensions"] != node.tiling_dimensions["L2"]["output_dimensions"]) or (node.tiling_dimensions["L3"]["weights_dimensions"] != node.tiling_dimensions["L2"]["weights_dimensions"])): + #breakpoint() + tk = Layer2D_writer.print_template_layer_L3(node) + TemplateWriter.write(tk, {os.path.join(self.src_dir, node.prefixed_name + ".c"): os.path.join(self.tmpl_dir, "layer_L3_c_template.c"), + os.path.join(self.inc_dir, node.prefixed_name + ".h"): os.path.join(self.tmpl_dir, "layer_L3_h_template.h")}) + if node.tiling_dimensions["L3"]["input_dimensions"][1] > node.tiling_dimensions["L2"]["input_dimensions"][1]: + node.tiling_dimensions["L2"]["output_dimensions"][1] = (node.tiling_dimensions["L2"]["input_dimensions"][1] - node.kernel_shape[0] + node.strides[0]) // node.strides[0] + if node.tiling_dimensions["L3"]["output_dimensions"][1] > node.tiling_dimensions["L2"]["output_dimensions"][1]: + node.tiling_dimensions["L2"]["input_dimensions"][1] = node.tiling_dimensions["L2"]["output_dimensions"][1] * node.strides[0] + node.kernel_shape[0] - node.strides[0] + node.name = node.name + "_L2" + tk = self.l2_template_keywords(node, backend_library) + TemplateWriter.write(tk, self.l2_template_mapping(node, backend_library)) + node.name = node.name[:-3] + else: + if node.tiling_dimensions["L2"]["input_dimensions"][2] == node.tiling_dimensions["L1"]["input_dimensions"][2]: + node.tiling_dimensions["L1"]["output_dimensions"][2] = int((node.tiling_dimensions["L1"]["input_dimensions"][2] + (node.pads[1] + node.pads[3]) - node.kernel_shape[1] + node.strides[1]) / node.strides[1]) + if node.tiling_dimensions["L2"]["input_dimensions"][1] == node.tiling_dimensions["L1"]["input_dimensions"][1]: + node.tiling_dimensions["L1"]["output_dimensions"][1] = int((node.tiling_dimensions["L1"]["input_dimensions"][1] + (node.pads[0] + node.pads[2]) - node.kernel_shape[0] + node.strides[0]) / node.strides[0]) + tk = self.l2_template_keywords(node, backend_library) + TemplateWriter.write(tk, self.l2_template_mapping(node, backend_library)) + def __mem_tmpl_vars(self, tk, node, mem_level): mem_name = f'L{mem_level}' upper_mem_name = f'L{mem_level + 1}' From e95828a9d9dc7c709e796a44f0232269caac4530 Mon Sep 17 00:00:00 2001 From: Luka Macan Date: Mon, 8 Jan 2024 12:50:08 +0000 Subject: [PATCH 10/30] Add ne16 check for padded functions --- dory/Utils/Templates_writer/Layer2D_template_writer.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/dory/Utils/Templates_writer/Layer2D_template_writer.py b/dory/Utils/Templates_writer/Layer2D_template_writer.py index 78c8dcfd..9e0ba4b9 100644 --- a/dory/Utils/Templates_writer/Layer2D_template_writer.py +++ b/dory/Utils/Templates_writer/Layer2D_template_writer.py @@ -25,6 
+25,7 @@ import sys import os import re +from dory.Hardware_targets.PULP.GAP9_NE16.Ne16_HW_node import Ne16_HW_node def print_template_layer_L3(node): ks = node.kernel_shape @@ -92,7 +93,7 @@ def print_template_layer_L3(node): tk['n_tile_x'] = factor_h_in tk['n_tile_y'] = factor_h_out tk['verbose'] = False - if tk['padding'] > 0: + if tk['padding'] > 0 and not isinstance(node, Ne16_HW_node): tk['func_name'] = [node.prefixed_name + "_L2", node.prefixed_name + "_L2_p_t", node.prefixed_name + "_L2_p_b"] else: tk['func_name'] = [node.prefixed_name + "_L2"] @@ -408,10 +409,8 @@ def print_template_layer(node, layer_type, double_buffering = 2): # x last tk['x_tile_size_nif_last'] = n_in % tile_n_in if (n_in % tile_n_in) > 0 else tile_n_in tk['x_tile_size_nif_byte_last'] = int(math.ceil(tk['x_tile_size_nif_last'] * ds_x / 8.0)) - tk['x_tile_size_h_last'] = tk['y_tile_size_h_last'] * s[0] + ks[0] - s[0] - assert tk['x_tile_size_h_last'] >= 0 and tk['x_tile_size_h_last'] <= tk['x_tile_size_h'] - tk['x_tile_size_w_last'] = tk['y_tile_size_w_last'] * s[1] + ks[1] - s[1] - assert tk['x_tile_size_w_last'] >= 0 and tk['x_tile_size_w_last'] <= tk['x_tile_size_w'] + tk['x_tile_size_h_last'] = min(tk['y_tile_size_h_last'] * s[0] + ks[0] - s[0], tile_h_in) + tk['x_tile_size_w_last'] = min(tk['y_tile_size_w_last'] * s[1] + ks[1] - s[1], tile_w_in) l = "" for k, v in tk.items(): From 38052e4d022ca13c3bc470445bd9ba34180808e2 Mon Sep 17 00:00:00 2001 From: Luka Macan Date: Mon, 8 Jan 2024 19:22:25 +0100 Subject: [PATCH 11/30] Bump pulp-nnx --- dory/Hardware_targets/PULP/Backend_Kernels/pulp-nnx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dory/Hardware_targets/PULP/Backend_Kernels/pulp-nnx b/dory/Hardware_targets/PULP/Backend_Kernels/pulp-nnx index 06e3e91c..ca56a9d4 160000 --- a/dory/Hardware_targets/PULP/Backend_Kernels/pulp-nnx +++ b/dory/Hardware_targets/PULP/Backend_Kernels/pulp-nnx @@ -1 +1 @@ -Subproject commit 06e3e91c6308948d0c060755e61cc38805b8c280 +Subproject commit ca56a9d4ae5fe8a300ea3738ea3ffadbba7353c3 From 7037b4636c8cff823ce004b6e02789871c0156ed Mon Sep 17 00:00:00 2001 From: Luka Macan Date: Tue, 9 Jan 2024 10:31:36 +0100 Subject: [PATCH 12/30] Fix padding flag passing --- .../Templates/layer_templates/layer_L3_c_template.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/dory/Hardware_targets/PULP/GAP9/Templates/layer_templates/layer_L3_c_template.c b/dory/Hardware_targets/PULP/GAP9/Templates/layer_templates/layer_L3_c_template.c index 61d3523e..92e1735c 100644 --- a/dory/Hardware_targets/PULP/GAP9/Templates/layer_templates/layer_L3_c_template.c +++ b/dory/Hardware_targets/PULP/GAP9/Templates/layer_templates/layer_L3_c_template.c @@ -129,6 +129,8 @@ void __attribute__ ((noinline)) ${func_name_L3}(void *args) int offset_x = ${dim_in-int(conv_overlap1*n_in*w_in*BitIn/8) - int(padding*n_in*w_in*BitIn/8)}; % if n_tile_x > 1: + uint32_t padding = tile_args.padding; + // loop over input/output tiles for(int j = 0; j < ${n_tile_x}; j++) { // Fetching next input tile @@ -199,6 +201,16 @@ else if (j == (${n_tile_y-1})) { ${func_name[0]}((void*)&tile_args); } % else: + % if n_tile_x > 1: + // LMACAN: Hacky way to add the padding flag passing. 
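// Only the border tiles of the L3 height loop should see padding: the first
// input tile (j == 0) keeps the NET_UTILS_PAD_TOP flag and the last tile
// (j == n_tile_x - 1) keeps the NET_UTILS_PAD_BOTTOM flag, while inner tiles
// read real neighbouring rows fetched from L3 instead.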
+ tile_args.padding = 0; + if (j == 0) { + tile_args.padding |= padding & NET_UTILS_PAD_TOP; + } + if (j == ${n_tile_x - 1}) { + tile_args.padding |= padding & NET_UTILS_PAD_BOTTOM; + } + % endif ${func_name[0]}(&tile_args); % endif From 997b24146a12522fe462284b0febfdec995318bb Mon Sep 17 00:00:00 2001 From: Luka Macan Date: Tue, 9 Jan 2024 10:32:42 +0100 Subject: [PATCH 13/30] Fix unused padding with strided convolution for ne16 --- dory/Hardware_targets/PULP/GAP9_NE16/HW_Parser.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/dory/Hardware_targets/PULP/GAP9_NE16/HW_Parser.py b/dory/Hardware_targets/PULP/GAP9_NE16/HW_Parser.py index 6ae78200..a63b3210 100644 --- a/dory/Hardware_targets/PULP/GAP9_NE16/HW_Parser.py +++ b/dory/Hardware_targets/PULP/GAP9_NE16/HW_Parser.py @@ -113,3 +113,13 @@ def adjust_node_data_layout(self, node, node_id): weights["value"] = Ne16.weight_unroll(weights["value"].astype(np.uint8), node.weight_bits, node.group > 1) weights["layout"] = "CoutCinMajKQwCinMin" # Ne16's special layout + + self.adjust_padding(node) + + def adjust_padding(self, node): + inp_dim = node.input_dimensions + ks = node.kernel_shape + s = node.strides + padding_top, padding_left, padding_bottom, padding_right = node.pads + node.pads[2] = padding_bottom if (inp_dim[0] - ks[0] + padding_top + padding_bottom) % s[0] == 0 else 0 + node.pads[3] = padding_right if (inp_dim[1] - ks[1] + padding_left + padding_right) % s[1] == 0 else 0 From 858a62b84af8dddd201486c2c61f7ec82e4df3b3 Mon Sep 17 00:00:00 2001 From: Luka Macan Date: Tue, 9 Jan 2024 10:33:03 +0100 Subject: [PATCH 14/30] Rewrite convolution overlap --- .../Templates_writer/Layer2D_template_writer.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/dory/Utils/Templates_writer/Layer2D_template_writer.py b/dory/Utils/Templates_writer/Layer2D_template_writer.py index 9e0ba4b9..6e88b3f0 100644 --- a/dory/Utils/Templates_writer/Layer2D_template_writer.py +++ b/dory/Utils/Templates_writer/Layer2D_template_writer.py @@ -33,8 +33,6 @@ def print_template_layer_L3(node): g = node.group p = node.pads padding_top = p[0]; padding_left = p[1]; padding_bottom = p[2]; padding_right = p[3]; - conv_overlap1 = 2 * (ks[0] // 2) + ks[0] % 2 - 1 - (s[0] - 1) - conv_overlap2 = 2 * (ks[1] // 2) + ks[1] % 2 - 1 - (s[1] - 1) tk = OrderedDict([]) tk['flag_DW'] = 1 if node.group > 1 else 0 tk['ULTRA_VERBOSE'] = False @@ -75,8 +73,8 @@ def print_template_layer_L3(node): ################################################################################ - tk['conv_overlap1'] = conv_overlap1 - tk['conv_overlap2'] = conv_overlap2 + tk['conv_overlap1'] = ks[0] - s[0] + tk['conv_overlap2'] = ks[1] - s[1] tk['padding'] = padding_top if (node.L3_input): tk['input_L3'] = 1 @@ -151,11 +149,8 @@ def print_template_layer(node, layer_type, double_buffering = 2): s = node.strides g = node.group p = node.pads - conv_overlap_h = 2 * (ks[0] // 2) + ks[0] % 2 - 1 - (s[0] - 1) padding_top = p[0]; padding_left = p[1]; padding_bottom = p[2]; padding_right = p[3]; name_layer = node.prefixed_name + '.h' - conv_overlap1 = 2 * (ks[0] // 2) + ks[0] % 2 - 1 - (s[0] - 1) - conv_overlap2 = 2 * (ks[1] // 2) + ks[1] % 2 - 1 - (s[1] - 1) tk = OrderedDict([]) if (re.search('.0',name_layer)): try: @@ -178,8 +173,8 @@ def print_template_layer(node, layer_type, double_buffering = 2): tk['has_bias'] = int(len([1 for name in node.constant_names if "bias" in name])>0) tk['FLAG_RELU'] = 1 if 'outshift' in node.constant_names else 0 tk['type'] = 
f"{node.input_activation_type}8_t" if node.input_activation_type in ["int", "uint"] else "float" - tk['conv_overlap1'] = conv_overlap1 - tk['conv_overlap2'] = conv_overlap2 + tk['conv_overlap1'] = ks[0] - s[0] + tk['conv_overlap2'] = ks[1] - s[1] tk['padding_top'] = padding_top tk['padding_bottom'] = padding_bottom tk['padding_left'] = padding_left From fe647a5f3683aa6f9c59704c42c7452fe9c38232 Mon Sep 17 00:00:00 2001 From: Luka Macan Date: Thu, 18 Jan 2024 17:48:33 +0100 Subject: [PATCH 15/30] Small rewrites for add --- .../PULP/Common/Tiler/tiler_add.py | 45 +++++++------------ .../layer_L2_c_addition_template.c | 2 +- 2 files changed, 16 insertions(+), 31 deletions(-) diff --git a/dory/Hardware_targets/PULP/Common/Tiler/tiler_add.py b/dory/Hardware_targets/PULP/Common/Tiler/tiler_add.py index 68021612..8b50ef05 100644 --- a/dory/Hardware_targets/PULP/Common/Tiler/tiler_add.py +++ b/dory/Hardware_targets/PULP/Common/Tiler/tiler_add.py @@ -91,62 +91,47 @@ def get_tiling_Add_L2(self): # return immediatly if the memory fits the L1 if buffer_total <= L1_memory: return ([], self.HW_node.tiling_dimensions["L2"]["input_dimensions"] , self.HW_node.tiling_dimensions["L2"]["output_dimensions"] ) - else: - db = self.double_buffering + + db = self.double_buffering parameters = pywrapcp.Solver.DefaultSolverParameters() solver = pywrapcp.Solver("simple_CP", parameters) - # integer positive variables. - tile_n = solver.IntVar(1, in_ch, 'tile_n') - tile_h_in = solver.IntVar(ks[0], inp_dim[0], 'tile_h_in') - tile_w_in = solver.IntVar(ks[1], inp_dim[1], 'tile_w_in') - tile_h_out = solver.IntVar(1, out_dim[0], 'tile_h_out') - tile_w_out = solver.IntVar(1, out_dim[1], 'tile_w_out') + assert in_ch == out_ch - # scaling is used to ensure datasize is integer - solver.Add(tile_h_out == tile_h_in) - solver.Add(tile_w_out == tile_w_in) - solver.Add(tile_n == in_ch) + # integer positive variables. + tile_n = in_ch + tile_h = solver.IntVar(1, inp_dim[0], 'tile_h_in') + tile_w = inp_dim[1] # CONSTRAINTS: managing of correct dimensions (no decimal h_out and any # type of rounding) - input_tile_dimension = (db * in_ch * tile_h_in * inp_dim[1] * self.HW_node.input_activation_bits + 7 ) // 8 # the 7 is to account for bit precision of 1, which still occupy an entire byte - output_tile_dimension = (db * out_ch * tile_h_out * out_dim[1] * self.HW_node.output_activation_bits + 7 ) // 8 # the 7 is to account for bit precision of 1, which still occupy an entire byte + input_tile_dimension = (db * tile_n * tile_h * tile_w * self.HW_node.input_activation_bits + 7 ) // 8 # the 7 is to account for bit precision of 1, which still occupy an entire byte + output_tile_dimension = (db * tile_n * tile_h * tile_w * self.HW_node.output_activation_bits + 7 ) // 8 # the 7 is to account for bit precision of 1, which still occupy an entire byte constraint_all = input_tile_dimension * int(np.ceil(1+self.HW_node.second_input_activation_bits/self.HW_node.input_activation_bits)) + output_tile_dimension solver.Add(constraint_all <= L1_memory) + # objective obj_expr = solver.IntVar(0, 1000000000000, "obj_expr") - solver.Add(obj_expr == 1000000 * constraint_all - + 10 * tile_w_in - + 1 * tile_h_in - + 1000 * tile_n) + solver.Add(obj_expr == constraint_all) objective = solver.Maximize(obj_expr, 1) - decision_builder = solver.Phase([tile_n, tile_h_in, tile_w_in, tile_h_out, tile_w_out], + decision_builder = solver.Phase([tile_h], solver.CHOOSE_FIRST_UNBOUND, solver.ASSIGN_MIN_VALUE) # Create a solution collector. 
collector = solver.LastSolutionCollector() # Add the decision variables. - collector.Add(tile_n) - collector.Add(tile_h_in) - collector.Add(tile_w_in) - collector.Add(tile_h_out) - collector.Add(tile_w_out) + collector.Add(tile_h) # Add the objective. collector.AddObjective(obj_expr) solver.Solve(decision_builder, [objective, collector]) if collector.SolutionCount() > 0: best_solution = collector.SolutionCount() - 1 - tile_n = collector.Value(best_solution, tile_n) - tile_h_in = collector.Value(best_solution, tile_h_in) - tile_w_in = collector.Value(best_solution, tile_w_in) - tile_h_out = collector.Value(best_solution, tile_h_out) - tile_w_out = collector.Value(best_solution, tile_w_out) - return ([], [tile_n, tile_h_in, tile_w_in], [tile_n, tile_h_out, tile_w_out]) + tile_h = collector.Value(best_solution, tile_h) + return ([], [tile_n, tile_h, tile_w], [tile_n, tile_h, tile_w]) print(" Add ERROR: no tiling found. Exiting...") os._exit(0) return None diff --git a/dory/Hardware_targets/PULP/GAP9/Templates/layer_templates/layer_L2_c_addition_template.c b/dory/Hardware_targets/PULP/GAP9/Templates/layer_templates/layer_L2_c_addition_template.c index 5508d7c3..a384bc59 100644 --- a/dory/Hardware_targets/PULP/GAP9/Templates/layer_templates/layer_L2_c_addition_template.c +++ b/dory/Hardware_targets/PULP/GAP9/Templates/layer_templates/layer_L2_c_addition_template.c @@ -119,8 +119,8 @@ static void kernel(Layer tile, Layer tile2) { tile.addr.input, tile2.addr.input, tile.addr.output, - ${inmul2}, ${inmul1}, + ${inmul2}, ${outshift}, tile.input.width, tile.input.height, From d195966d2dbcf0a768ca4c7b71839c3ad8ebca19 Mon Sep 17 00:00:00 2001 From: Luka Macan Date: Thu, 18 Jan 2024 17:49:07 +0100 Subject: [PATCH 16/30] Add last checksum --- .../PULP/GAP9/Templates/network_c_template.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/dory/Hardware_targets/PULP/GAP9/Templates/network_c_template.c b/dory/Hardware_targets/PULP/GAP9/Templates/network_c_template.c index 49ebbbcf..90678e47 100644 --- a/dory/Hardware_targets/PULP/GAP9/Templates/network_c_template.c +++ b/dory/Hardware_targets/PULP/GAP9/Templates/network_c_template.c @@ -479,6 +479,12 @@ void ${prefix}network_run_cluster(void *args) { ${prefix}cycle_network_execution += io_cyc; % endif + % if 'Last' in verbose_level: + checksum("final output", L2_output, + activations_out_size[${len(DORY_HW_graph)-1}], + activations_out_checksum[${len(DORY_HW_graph)-1}][exec]); + % endif + //memcpy(L2_output, l2_final_output, activations_out_size[${len(DORY_HW_graph)-1}]); // BUGGY! 
for (int i=0; i Date: Tue, 30 Jan 2024 12:09:40 +0100 Subject: [PATCH 17/30] WIP: Upgrade pulp-nnx --- .../Backend_Kernels/BackendKernelsAdapter.py | 28 ++++--- .../PULP/Backend_Kernels/pulp-nnx | 2 +- .../PULP/GAP9_NE16/HW_Parser.py | 6 +- .../layer_L2_c_conv_ne16_multicore_template.c | 59 ++++++++------ .../PULP/GAP9_NE16/Utils_files/execute.h | 81 +++++++++---------- .../Layer2D_template_writer.py | 2 + 6 files changed, 96 insertions(+), 82 deletions(-) diff --git a/dory/Hardware_targets/PULP/Backend_Kernels/BackendKernelsAdapter.py b/dory/Hardware_targets/PULP/Backend_Kernels/BackendKernelsAdapter.py index 3b0c3c17..a09ee025 100644 --- a/dory/Hardware_targets/PULP/Backend_Kernels/BackendKernelsAdapter.py +++ b/dory/Hardware_targets/PULP/Backend_Kernels/BackendKernelsAdapter.py @@ -177,24 +177,32 @@ def _check_valid_node(self, node: HW_node) -> None: if self.accelerator_name == "ne16": assert True # TODO - def _get_ne16_src_files(self) -> List[str]: - return ["src/pulp_nnx_ne16.c", "ne16/hal/ne16_hal.c"] + def _get_acc_src_files(self) -> List[str]: + return [ + f"{self.accelerator_name}/hal/{self.accelerator_name}.c", + f"{self.accelerator_name}/hal/{self.accelerator_name}_task.c", + f"{self.accelerator_name}/bsp/{self.accelerator_name}_pulp_bsp.c", + f"src/pulp_nnx_{self.accelerator_name}.c", + ] - def _get_ne16_inc_files(self) -> List[str]: + def _get_acc_inc_files(self) -> List[str]: return [ - "ne16/gvsoc/ne16_gvsoc_logging.h", - "ne16/hal/ne16_defs.h", - "ne16/hal/ne16_hal.h", + f"{self.accelerator_name}/gvsoc/{self.accelerator_name}_gvsoc.h", + f"{self.accelerator_name}/hal/{self.accelerator_name}_task_defs.h", + f"{self.accelerator_name}/hal/{self.accelerator_name}.h", + f"{self.accelerator_name}/hal/{self.accelerator_name}_task.h", + f"{self.accelerator_name}/bsp/{self.accelerator_name}_pulp_bsp.h", + f"inc/pulp_nnx_{self.accelerator_name}.h", ] def _get_src_files(self) -> List[str]: - src_files = ["util/pulp_nnx_util.c"] + src_files = ["util/pulp_nnx_util.c", "util/hwpe.c"] if self.accelerator_name == "ne16": - src_files += self._get_ne16_src_files() + src_files += self._get_acc_src_files() return [os.path.join(self._root_dir(), file) for file in src_files] def _get_inc_files(self) -> List[str]: - inc_files = ["inc/pulp_nnx.h", "util/pulp_nnx_util.h"] + inc_files = ["util/pulp_nnx_util.h", "util/hwpe.h"] if self.accelerator_name == "ne16": - inc_files += self._get_ne16_inc_files() + inc_files += self._get_acc_inc_files() return [os.path.join(self._root_dir(), file) for file in inc_files] diff --git a/dory/Hardware_targets/PULP/Backend_Kernels/pulp-nnx b/dory/Hardware_targets/PULP/Backend_Kernels/pulp-nnx index ca56a9d4..70b089ed 160000 --- a/dory/Hardware_targets/PULP/Backend_Kernels/pulp-nnx +++ b/dory/Hardware_targets/PULP/Backend_Kernels/pulp-nnx @@ -1 +1 @@ -Subproject commit ca56a9d4ae5fe8a300ea3738ea3ffadbba7353c3 +Subproject commit 70b089ed7a050fd99c775f9b7504dfcbd732c02f diff --git a/dory/Hardware_targets/PULP/GAP9_NE16/HW_Parser.py b/dory/Hardware_targets/PULP/GAP9_NE16/HW_Parser.py index a63b3210..a5588601 100644 --- a/dory/Hardware_targets/PULP/GAP9_NE16/HW_Parser.py +++ b/dory/Hardware_targets/PULP/GAP9_NE16/HW_Parser.py @@ -25,10 +25,10 @@ from dory.Parsers.HW_node import HW_node sys.path.append(os.path.join(os.path.dirname(__file__), "..", "Backend_Kernels", "pulp-nnx", "test")) -from Ne16TestClasses import Ne16TestConf +from Ne16TestConf import Ne16TestConf from TestClasses import IntegerType +from Ne16MemoryLayout import Ne16MemoryLayout from pydantic import 
ValidationError -from Ne16 import Ne16 class onnx_manager(onnx_manager_gap9): @@ -110,7 +110,7 @@ def adjust_node_data_layout(self, node, node_id): ## Unroll expects layout to be "CoutCinK" if weights["layout"] == "CoutKCin": weights["value"] = np.transpose(weights["value"], (0,3,1,2)) - weights["value"] = Ne16.weight_unroll(weights["value"].astype(np.uint8), + weights["value"] = Ne16MemoryLayout.weightEncode(weights["value"].astype(np.uint8), node.weight_bits, node.group > 1) weights["layout"] = "CoutCinMajKQwCinMin" # Ne16's special layout diff --git a/dory/Hardware_targets/PULP/GAP9_NE16/Templates/layer_templates/layer_L2_c_conv_ne16_multicore_template.c b/dory/Hardware_targets/PULP/GAP9_NE16/Templates/layer_templates/layer_L2_c_conv_ne16_multicore_template.c index 35b71150..b1b42d84 100644 --- a/dory/Hardware_targets/PULP/GAP9_NE16/Templates/layer_templates/layer_L2_c_conv_ne16_multicore_template.c +++ b/dory/Hardware_targets/PULP/GAP9_NE16/Templates/layer_templates/layer_L2_c_conv_ne16_multicore_template.c @@ -20,8 +20,15 @@ */ #include "${func_name}.h" -#include "pulp_nnx.h" + +// ne16 includes +#include "pulp_nnx_ne16.h" #include "pulp_nnx_util.h" +#include "ne16_pulp_bsp.h" +#include "ne16_task.h" +#include "ne16.h" +#include "hwpe.h" + #include "network.h" #include "net_utils.h" #include "dory_dma.h" @@ -105,7 +112,7 @@ static inline void load_weights_prepare(Layer tile, Kernel kernel, DmaTransferConf * const conf_bias ) { // Special because of accelerators special memory layout - const int size_weights = (kernel.groups > 1 ? divnceil(tile.output.channel, 16) : tile.output.channel) * ${l1_W_tile_ki_size}; + const int size_weights = (kernel.groups > 1 ? nnx_calculate_number_of_tiles(tile.output.channel, 16) : tile.output.channel) * ${l1_W_tile_ki_size}; const int size_scale = tile.output.channel * ${int(act_dim_bit/8)}; const int size_bias = tile.output.channel * ${int(b_data_size_byte/8)}; @@ -176,13 +183,14 @@ static int inc(int index, int end) { #define BUFFER_SIZE (2) static Layer tiles[BUFFER_SIZE]; -static nnx_task_t nnx_tasks[BUFFER_SIZE]; +static ne16_task_t ne16_tasks[BUFFER_SIZE]; static DmaTransferConf store_conf[BUFFER_SIZE]; static struct { Monitor input, output, store_conf; } monitor; + static void layer_task_fork(void *args) { const int total_tiles = end_index.height * end_index.width * end_index.output_channel; @@ -262,9 +270,9 @@ static void layer_task_fork(void *args) { load_async(tiles[i_buff], &tile_status, body, layer, kernel); dma_mutex_unlock(); % if stride == 1: - execute_prepare(tile, &nnx_tasks[i_buff]); + execute_prepare(tile, &ne16_tasks[i_buff]); % elif stride == 2: - execute_stride2x2_prepare(tile, kernel, &nnx_tasks[i_buff]); + execute_stride2x2_prepare(tile, kernel, &ne16_tasks[i_buff]); % endif dma_mutex_lock(); dma_transfer_wait(transfer); @@ -290,9 +298,9 @@ static void layer_task_fork(void *args) { monitor_produce_begin(monitor.output); % if stride == 1: - execute_async(&nnx_tasks[i_buff]); + execute_async(&ne16_tasks[i_buff]); % elif stride == 2: - execute_stride2x2_blocking(&nnx_tasks[i_buff], tiles[i_buff], kernel, tiles[i_buff].output.channel); + execute_stride2x2_blocking(&ne16_tasks[i_buff], tiles[i_buff], kernel); % endif monitor_produce_end(monitor.output); @@ -310,7 +318,7 @@ static void layer_task_fork(void *args) { monitor_consume_begin(monitor.store_conf); monitor_consume_begin(monitor.output); - execute_wait(&nnx_tasks[i_buff]); + execute_wait(&ne16_tasks[i_buff]); monitor_consume_end(monitor.input); dma_mutex_lock(); @@ -362,24 
+370,25 @@ void ${func_name}(void *args) { // Init nnx tasks for (int i = 0; i < BUFFER_SIZE; i++) { - nnx_task_init(&nnx_tasks[i], - ${fs1}, ${int(flag_DW)}, - ${x_data_size_byte}, ${y_data_size_byte}, ${W_data_size_byte}, - weightOffsetModeLayerWise, ${-(2**(W_data_size_byte-1))}, - (nnx_quant_t) { - .shift_amount = ${out_shift}, - .mode = quantMode8Bit, - .function = quantFunctionRelu, - .flag_rounding = NE16_FLAG_UNUSED - }, (nnx_norm_t) { - .mode = ${"normMode32Bit" if act_dim_bit == 32 else "normMode8Bit" if act_dim_bit == 8 else "NE16_FLAG_UNUSED"}, - .flag_bias = NE16_FLAG_USED, - .flag_shift = NE16_FLAG_UNUSED - }, ${stride}); + ne16_task_init(&ne16_tasks[i]); + ne16_task_set_op_to_conv(&ne16_tasks[i], ${fs1}, ${int(flag_DW)}, ${stride}); + ne16_task_set_bits(&ne16_tasks[i], ${x_data_size_byte}, ${y_data_size_byte}, ${W_data_size_byte}); + ne16_task_set_norm_quant( + &ne16_tasks[i], + (ne16_quant_t) { + .shift_amount = ${out_shift}, + .function = quantFunctionRelu, // todo: un-hardcode it + .flag_rounding = ne16TaskFlagFalse + }, (ne16_norm_t) { + .mode = ${"normMode32Bit" if act_dim_bit == 32 else "normMode8Bit" if act_dim_bit == 8 else ""}, + .flag_bias = ne16TaskFlagTrue, + .flag_shift = ne16TaskFlagFalse + }); + ne16_task_set_weight_offset(&ne16_tasks[i], weightOffsetModeLayerWise, ${weight_offset}); } - const int max_stall = 8; - nnx_init(max_stall); + const ne16_pulp_conf_t ne16_pulp_conf = {.max_stall = 8}; + ne16_nnx_init(ne16_pulp_get_dev(), &ne16_pulp_conf); dma_mutex_init(); @@ -393,5 +402,5 @@ void ${func_name}(void *args) { monitor_term(monitor.input); monitor_term(monitor.output); monitor_term(monitor.store_conf); - nnx_term(); + ne16_nnx_term(ne16_pulp_get_dev()); } diff --git a/dory/Hardware_targets/PULP/GAP9_NE16/Utils_files/execute.h b/dory/Hardware_targets/PULP/GAP9_NE16/Utils_files/execute.h index 5e078d64..aa07add1 100644 --- a/dory/Hardware_targets/PULP/GAP9_NE16/Utils_files/execute.h +++ b/dory/Hardware_targets/PULP/GAP9_NE16/Utils_files/execute.h @@ -2,64 +2,59 @@ #define __EXECUTE_H__ #include "dory_get_tile.h" -#include "ne16_hal.h" -#include "pulp_nnx.h" +#include "hwpe.h" +#include "ne16.h" +#include "ne16_pulp_bsp.h" +#include "ne16_task.h" +#include "pulp_nnx_ne16.h" #include "pulp_nnx_util.h" #include "tile_status.h" #include -static inline void execute_prepare(Layer tile, nnx_task_t *const task) { - nnx_task_set_dims(task, tile.input.width, tile.input.channel, - tile.input.width, tile.input.channel, tile.output.height, - tile.output.width, tile.output.channel, tile.output.width, - tile.output.channel, tile.padding.top, tile.padding.bottom, - tile.padding.right, tile.padding.left); - nnx_task_set_ptrs(task, tile.addr.input, tile.input.width, tile.input.channel, - 8 /*bits_in*/, tile.padding.top, tile.padding.left, - tile.addr.output, tile.addr.weights, tile.addr.scale, - 0 /*shift_ptr*/, tile.addr.bias); +static inline void execute_prepare(Layer tile, ne16_task_t *const task) { + ne16_task_set_dims(task, tile.input.width, tile.input.channel, + tile.input.width * tile.input.channel, tile.input.channel, + tile.output.height, tile.output.width, tile.output.channel, + tile.output.width * tile.output.channel, + tile.output.channel, tile.padding.top, tile.padding.bottom, + tile.padding.right, tile.padding.left); + ne16_task_set_ptrs(task, tile.addr.input, tile.input.width, + tile.input.channel, tile.padding.top, tile.padding.left, + tile.addr.output, tile.addr.weights, tile.addr.scale, + 0 /*shift_ptr*/, tile.addr.bias); } -static inline void 
execute_async(nnx_task_t *task) { - nnx_dispatch_check_blocking(); - nnx_dispatch_task(task); +static inline void execute_async(ne16_task_t *task) { + ne16_nnx_dispatch_wait(ne16_pulp_get_dev()); + ne16_nnx_dispatch(ne16_pulp_get_dev(), task); } static inline void execute_stride2x2_prepare(Layer tile, Kernel kernel, - nnx_task_t *const task) { - nnx_task_set_dims_stride2x2( + ne16_task_t *const task) { + ne16_task_set_dims_stride2x2( task, tile.input.height, tile.input.width, tile.input.channel, - tile.input.width, tile.input.channel, tile.output.height, - tile.output.width, tile.output.channel, tile.output.width, - tile.output.channel, kernel.shape.height, kernel.shape.width, - tile.padding.top, tile.padding.bottom, tile.padding.right, - tile.padding.left); - nnx_task_set_ptrs(task, tile.addr.input, tile.input.width, tile.input.channel, - 8 /*bits_in*/, tile.padding.top, tile.padding.left, - tile.addr.output, tile.addr.weights, tile.addr.scale, - 0 /*shift_ptr*/, tile.addr.bias); + tile.input.width * tile.input.channel, tile.input.channel, + tile.output.height, tile.output.width, tile.output.channel, + tile.output.width * tile.output.channel, tile.output.channel, + kernel.shape.height, kernel.shape.width, tile.padding.top, + tile.padding.bottom, tile.padding.right, tile.padding.left); + ne16_task_set_ptrs(task, tile.addr.input, tile.input.width, + tile.input.channel, tile.padding.top, tile.padding.left, + tile.addr.output, tile.addr.weights, tile.addr.scale, + 0 /*shift_ptr*/, tile.addr.bias); } -static inline void execute_stride2x2_blocking(nnx_task_t *task, Layer tile, - Kernel kernel, - uint32_t output_channel_stride) { - nnx_dispatch_task_stride2x2( - task, tile.input.width, tile.input.channel, tile.input.width, - tile.input.channel, tile.output.height, tile.output.width, - tile.output.channel, tile.output.width, output_channel_stride, - kernel.shape.height, kernel.shape.width); +static inline void execute_stride2x2_blocking(ne16_task_t *task, Layer tile, + Kernel kernel) { + ne16_nnx_dispatch_stride2x2(ne16_pulp_get_dev(), task, tile.input.width, tile.input.channel, + tile.output.height, tile.output.width, + tile.output.channel, kernel.shape.height, + kernel.shape.width); } -static inline void execute_wait(nnx_task_t *task) { -#if __PLATFORM__ == ARCHI_PLATFORM_GVSOC && defined GAP_SDK - // Temporary hack because the gvsoc model of ne16 in gap_sdk - // has a broken running_id. 
- while(!ne16_empty()) +static inline void execute_wait(ne16_task_t *task) { + while (!ne16_nnx_resolve_check(ne16_pulp_get_dev(), task)) ; -#else - while(!nnx_resolve_check(task)) - ; -#endif } #endif // __EXECUTE_H__ diff --git a/dory/Utils/Templates_writer/Layer2D_template_writer.py b/dory/Utils/Templates_writer/Layer2D_template_writer.py index 6e88b3f0..556bf200 100644 --- a/dory/Utils/Templates_writer/Layer2D_template_writer.py +++ b/dory/Utils/Templates_writer/Layer2D_template_writer.py @@ -290,6 +290,8 @@ def print_template_layer(node, layer_type, double_buffering = 2): tk['W_data_size_byte'] = ds_W tk['b_data_size_byte'] = ds_bias tk['W_tile_size_nof'] = tile_n_out + if ds_W is not None: + tk['weight_offset'] = -(2**(ds_W-1)) if tk['has_bias'] == 1: tk['b_size_byte'] = int(math.ceil(n_out * ds_bias / 8.0)) else: From 397e95cc41ef5a96876080a22386eccd08f1f3ff Mon Sep 17 00:00:00 2001 From: Luka Macan Date: Tue, 30 Jan 2024 14:22:27 +0100 Subject: [PATCH 18/30] Copy network template to GAP9_NE16 --- .../GAP9_NE16/Templates/network_c_template.c | 506 +++++++++++++++++- 1 file changed, 505 insertions(+), 1 deletion(-) mode change 120000 => 100644 dory/Hardware_targets/PULP/GAP9_NE16/Templates/network_c_template.c diff --git a/dory/Hardware_targets/PULP/GAP9_NE16/Templates/network_c_template.c b/dory/Hardware_targets/PULP/GAP9_NE16/Templates/network_c_template.c deleted file mode 120000 index 72359887..00000000 --- a/dory/Hardware_targets/PULP/GAP9_NE16/Templates/network_c_template.c +++ /dev/null @@ -1 +0,0 @@ -../../GAP9/Templates/network_c_template.c \ No newline at end of file diff --git a/dory/Hardware_targets/PULP/GAP9_NE16/Templates/network_c_template.c b/dory/Hardware_targets/PULP/GAP9_NE16/Templates/network_c_template.c new file mode 100644 index 00000000..de554311 --- /dev/null +++ b/dory/Hardware_targets/PULP/GAP9_NE16/Templates/network_c_template.c @@ -0,0 +1,505 @@ +/* + * network.c + * Alessio Burrello + * Thorir Mar Ingolfsson + * + * Copyright (C) 2019-2020 University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +<% +l3_supported = DORY_HW_graph[0].HW_description['memory']['levels'] > 2 +%>\ +#define DEFINE_CONSTANTS +%if not l3_supported: +#include "${prefix}weights.h" +%endif +#include "net_utils.h" +#include "pmsis.h" +#include "${prefix}network.h" +#include "directional_allocator.h" +#include "mem.h" +#include "dory_dma.h" +#include +% for layer in list_h: +#include "${layer}" +% endfor + +% if sdk == 'pulp-sdk': +#define ICACHE_CTRL_UNIT 0x10201400 +#define ICACHE_PREFETCH ICACHE_CTRL_UNIT + 0x1C +% endif + +% if verbose: +#define VERBOSE 1 +% endif + +% if l3_supported: +#define L3_WEIGHTS_SIZE 4000000 +#define L3_INPUT_SIZE 1500000 +#define L3_OUTPUT_SIZE 1500000 +% endif +static void *L3_weights = NULL; +static void *L3_input = NULL; +static void *L3_output = NULL; +% if 'Yes' in performance or 'Perf_final' in verbose_level: +int ${prefix}cycle_network_execution; +% endif +% if l3_supported: +/* Moves the weights and the biases from hyperflash to hyperram */ +void ${prefix}network_initialize() { + + L3_weights = ram_malloc(L3_WEIGHTS_SIZE); + L3_input = ram_malloc(L3_INPUT_SIZE); + L3_output = ram_malloc(L3_OUTPUT_SIZE); + +#ifdef VERBOSE + printf("\nL3 Buffer alloc initial\t@ %d:\t%s\n", (unsigned int)L3_weights, L3_weights?"Ok":"Failed"); + printf("\nL3 Buffer alloc initial\t@ %d:\t%s\n", (unsigned int)L3_input, L3_input?"Ok":"Failed"); + printf("\nL3 Buffer alloc initial\t@ %d:\t%s\n", (unsigned int)L3_output, L3_output?"Ok":"Failed"); +#endif + + void *w_ptr = L3_weights; + for (int i = 0; i < ${weights_number}; i++) { + size_t size = load_file_to_ram(w_ptr, L3_weights_files[i]); + L3_weights_size[i] = size; + w_ptr += size; + } +} +% endif + +% if l3_supported: +/* Remove RAM memory */ +void ${prefix}network_terminate() { + % if l3_supported: + ram_free(L3_weights, L3_WEIGHTS_SIZE); + ram_free(L3_input, L3_INPUT_SIZE); + ram_free(L3_output, L3_OUTPUT_SIZE); + % endif +} +% endif + +void ${prefix}execute_layer_fork(void *args) { + layer_args_t *layer_args = (layer_args_t *)args; +#ifdef TARGET_CHIP_FAMILY_GAP9 + layer_args->L1_buffer = pi_cl_l1_malloc(NULL, ${l1_buffer}); +#else + layer_args->L1_buffer = pmsis_l1_malloc(${l1_buffer}); +#endif + + if (NULL == layer_args->L1_buffer) { +#ifdef VERBOSE + printf("ERROR: Failed to allocate the L1 buffer.\n"); +#endif // VERBOSE + return; + } + + switch (layer_args->layer_id) + { +% for i in range(len(DORY_HW_graph)): + case ${i}: + ${func_name[i]}(args); + break; +% endfor + } + +#ifdef TARGET_CHIP_FAMILY_GAP9 + pi_cl_l1_free(NULL, layer_args->L1_buffer, ${l1_buffer}); +#else + pmsis_l1_malloc_free(layer_args->L1_buffer, ${l1_buffer}); +#endif +} + +struct ${prefix}network_run_token ${prefix}network_run_async(void *l2_buffer, size_t l2_buffer_size, void *l2_final_output, int exec, int initial_dir${", void *L2_input_h" if not l3_supported else ""}) +{ + struct pi_device cluster_dev = {0}; + struct pi_cluster_conf conf; + struct pi_cluster_task cluster_task = {0}; + // First open the cluster + pi_cluster_conf_init(&conf); + conf.id=0; +#ifdef TARGET_CHIP_FAMILY_GAP9 + conf.icache_conf = PI_CLUSTER_MASTER_CORE_ICACHE_ENABLE | PI_CLUSTER_ICACHE_PREFETCH_ENABLE | PI_CLUSTER_ICACHE_ENABLE; +#endif +<% + n_args = 4 if l3_supported else 5 +%>\ + unsigned int args[${n_args}]; + args[0] = (unsigned int) l2_buffer; + args[1] = (unsigned int) l2_buffer_size; + args[2] = (unsigned int) l2_final_output; + args[3] = (unsigned int) exec; + args[4] = (unsigned int) initial_dir; + % if not l3_supported: + args[5] = (unsigned int) L2_input_h; + % endif + 
// open cluster... + pi_cluster_task(&cluster_task, ${prefix}network_run_cluster, args); + pi_open_from_conf(&cluster_dev, &conf); + if (pi_cluster_open(&cluster_dev)) + return; + // Then offload an entry point, this will get executed on the cluster controller +#ifndef TARGET_CHIP_FAMILY_GAP9 + cluster_task.stack_size = ${master_stack}; +#endif + cluster_task.slave_stack_size = ${slave_stack}; + pi_cluster_send_task_to_cl(&cluster_dev, &cluster_task); + return (struct ${prefix}network_run_token) { + .cluster_dev = cluster_dev + }; +} + +void ${prefix}network_run_wait(struct ${prefix}network_run_token token) +{ + pi_cluster_close(&token.cluster_dev); + % if 'Perf_final' in verbose_level: + print_perf("Final", ${prefix}cycle_network_execution, ${MACs}); + % endif +} + +void ${prefix}network_run(void *l2_buffer, size_t l2_buffer_size, void *l2_final_output, int exec, int initial_dir${", void *L2_input_h" if not l3_supported else ""}) +{ + ${prefix}network_run_wait(network_run_async(l2_buffer, l2_buffer_size, l2_final_output, exec, initial_dir${", L2_input_h" if not l3_supported else ""})); +} + +void ${prefix}network_run_cluster(void *args) { + unsigned int * real_args = (unsigned int *) args; + void * l2_buffer = (void *) real_args[0]; + size_t l2_buffer_size = (size_t) real_args[1]; + void * l2_final_output = (void *) real_args[2]; + int exec = (int) real_args[3]; + int dir = (int) real_args[4]; + % if not l3_supported: + void * L2_input_h = (void *)real_args[5]; + % endif +/* + - initial buffer allocation L2 and L1 + - variable declaration +*/ +/* ---------------------------------- */ +/* -------- SECTION 0 BEGIN --------- */ +/* ---------------------------------- */ + void *L2_output = NULL; + void *L2_input = NULL; + void *L2_weights = NULL; + void *L3_weights_curr = L3_weights; + void *bypass_activations = NULL; + + int residual_number = 0; + int bypass_dimension = 0; + % if not l3_supported: + int left_branch_nodes = 0, right_branch_nodes = 0; + int z = 0; + int end_left = 0; + % endif + + pi_perf_conf(1< 0 && branch_output[i-1] == 0 && branch_change[i-1] == 0) + dfree(activations_size[i], dir); + % endif + // Residual connections + if (i < ${len(DORY_HW_graph) - 1}) { + % if l3_supported: + if (branch_input[i+1] == 1) { + bypass_activations = dmalloc(bypass_dimension, !dir); + residual_number--; + cl_ram_read(bypass_activations, layers_pointers[residual_number], bypass_dimension); + cl_ram_free(layers_pointers[residual_number], bypass_dimension); + } + + // TODO I feel like this should look ahead instead of back + if (i > 0 && branch_output[i-1]==1 && L3_input_layers[i]==1) { // TODO don't understand this condition + L3_input = cl_ram_malloc(1500000); + } + if (branch_output[i]==1 && L3_output_layers[i]==1) { + cl_ram_free(L3_input + activations_out_size[i], 1500000 - activations_out_size[i]); + layers_pointers[residual_number] = L3_input; + residual_number++; + bypass_dimension = activations_out_size[i]; + } else + if (branch_output[i]==1 || branch_change[i] == 1) { + layers_pointers[residual_number] = cl_ram_malloc(activations_out_size[i]); + cl_ram_write(layers_pointers[residual_number], L2_output, activations_out_size[i]); + residual_number++; + bypass_dimension = activations_out_size[i]; + } + + if (branch_change[i]==1) { + dfree(activations_out_size[i], !dir); + L2_input = dmalloc(activations_size[i + 1], !dir); + cl_ram_read(L2_input, layers_pointers[residual_number - 2], activations_size[i + 1]); + cl_ram_free(layers_pointers[residual_number - 2], activations_size[i + 1]); + } 
+ if (L3_output_layers[i] == 1) + dfree(activations_out_size[i], !dir); + % else: + + if (branch_output[i]==1) { + left_branch_nodes = 0; + right_branch_nodes = 0; + z = i+1; + end_left = 0; + while (branch_input[z] == 0) { + if (end_left == 0) + left_branch_nodes+=1; + else + right_branch_nodes+=1; + if (branch_change[z] == 1) + end_left = 1; + z+=1; + } + if ((left_branch_nodes % 2 == 1) && (right_branch_nodes == 0)) + dir = !dir; + if ((left_branch_nodes % 2 == 0) && (right_branch_nodes > 0)) + dir = !dir; + } + + if (branch_change[i]==1) { + L2_input = bypass_activations; + bypass_activations = L2_output; + bypass_dimension = activations_out_size[i]; + if (right_branch_nodes % 2 == 1) + dir = !dir; + } + % endif + } + % if l3_supported: + if (layer_with_weights[i]) + L3_weights_curr += L3_weights_size[weight_l_cnt++]; + % endif + dir = !dir; + } + + % if 'Yes' in performance or 'Perf_final' in verbose_level: + pi_perf_stop(); + io_cyc += pi_perf_read(PI_PERF_CYCLES); + % endif + % if 'Yes' in performance: + print_perf("IO wait", io_cyc, 0 /*ops*/); + % endif + % if 'Perf_final' in verbose_level: + ${prefix}cycle_network_execution += io_cyc; + % endif + + % if 'Last' in verbose_level: + checksum("final output", L2_output, + activations_out_size[${len(DORY_HW_graph)-1}], + activations_out_checksum[${len(DORY_HW_graph)-1}][exec]); + % endif + + //memcpy(L2_output, l2_final_output, activations_out_size[${len(DORY_HW_graph)-1}]); // BUGGY! + for (int i=0; i Date: Tue, 30 Jan 2024 14:24:02 +0100 Subject: [PATCH 19/30] Move dma mutex init to network --- .../layer_templates/layer_L2_c_conv_ne16_multicore_template.c | 1 - .../PULP/GAP9_NE16/Templates/network_c_template.c | 3 +++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/dory/Hardware_targets/PULP/GAP9_NE16/Templates/layer_templates/layer_L2_c_conv_ne16_multicore_template.c b/dory/Hardware_targets/PULP/GAP9_NE16/Templates/layer_templates/layer_L2_c_conv_ne16_multicore_template.c index b1b42d84..f965c317 100644 --- a/dory/Hardware_targets/PULP/GAP9_NE16/Templates/layer_templates/layer_L2_c_conv_ne16_multicore_template.c +++ b/dory/Hardware_targets/PULP/GAP9_NE16/Templates/layer_templates/layer_L2_c_conv_ne16_multicore_template.c @@ -389,7 +389,6 @@ void ${func_name}(void *args) { const ne16_pulp_conf_t ne16_pulp_conf = {.max_stall = 8}; ne16_nnx_init(ne16_pulp_get_dev(), &ne16_pulp_conf); - dma_mutex_init(); // Fork diff --git a/dory/Hardware_targets/PULP/GAP9_NE16/Templates/network_c_template.c b/dory/Hardware_targets/PULP/GAP9_NE16/Templates/network_c_template.c index de554311..93dc78fe 100644 --- a/dory/Hardware_targets/PULP/GAP9_NE16/Templates/network_c_template.c +++ b/dory/Hardware_targets/PULP/GAP9_NE16/Templates/network_c_template.c @@ -206,6 +206,9 @@ void ${prefix}network_run_cluster(void *args) { pi_perf_conf(1< Date: Tue, 30 Jan 2024 14:55:24 +0100 Subject: [PATCH 20/30] Move ne16 init to network --- .../layer_L2_c_conv_ne16_multicore_template.c | 8 +++----- .../PULP/GAP9_NE16/Templates/network_c_template.c | 9 +++++++++ 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/dory/Hardware_targets/PULP/GAP9_NE16/Templates/layer_templates/layer_L2_c_conv_ne16_multicore_template.c b/dory/Hardware_targets/PULP/GAP9_NE16/Templates/layer_templates/layer_L2_c_conv_ne16_multicore_template.c index f965c317..bfd85ad6 100644 --- a/dory/Hardware_targets/PULP/GAP9_NE16/Templates/layer_templates/layer_L2_c_conv_ne16_multicore_template.c +++ 
b/dory/Hardware_targets/PULP/GAP9_NE16/Templates/layer_templates/layer_L2_c_conv_ne16_multicore_template.c @@ -339,11 +339,13 @@ static void layer_task_fork(void *args) { } void ${func_name}(void *args) { - #ifdef DEBUG_GVSOC nnx_activate_gvsoc_logging(GVSOC_LOG_LEVEL_CONFIG, GVSOC_LOGGING_FORMAT_DECIMAL); #endif + ne16_dev_t *ne16_dev = ne16_pulp_get_dev(); + hwpe_soft_clear(&ne16_dev->hwpe_dev); + layer_args_t *layer_args = (layer_args_t *)args; // Initialization @@ -387,9 +389,6 @@ void ${func_name}(void *args) { ne16_task_set_weight_offset(&ne16_tasks[i], weightOffsetModeLayerWise, ${weight_offset}); } - const ne16_pulp_conf_t ne16_pulp_conf = {.max_stall = 8}; - ne16_nnx_init(ne16_pulp_get_dev(), &ne16_pulp_conf); - // Fork @@ -401,5 +400,4 @@ void ${func_name}(void *args) { monitor_term(monitor.input); monitor_term(monitor.output); monitor_term(monitor.store_conf); - ne16_nnx_term(ne16_pulp_get_dev()); } diff --git a/dory/Hardware_targets/PULP/GAP9_NE16/Templates/network_c_template.c b/dory/Hardware_targets/PULP/GAP9_NE16/Templates/network_c_template.c index 93dc78fe..cfbf6096 100644 --- a/dory/Hardware_targets/PULP/GAP9_NE16/Templates/network_c_template.c +++ b/dory/Hardware_targets/PULP/GAP9_NE16/Templates/network_c_template.c @@ -30,6 +30,8 @@ l3_supported = DORY_HW_graph[0].HW_description['memory']['levels'] > 2 #include "directional_allocator.h" #include "mem.h" #include "dory_dma.h" +#include "ne16_pulp_bsp.h" +#include "pulp_nnx_ne16.h" #include % for layer in list_h: #include "${layer}" @@ -209,6 +211,11 @@ void ${prefix}network_run_cluster(void *args) { // dma init dma_mutex_init(); + + // ne16 init + const ne16_pulp_conf_t ne16_pulp_conf = {.max_stall = 8}; + ne16_nnx_init(ne16_pulp_get_dev(), &ne16_pulp_conf); + /* ---------------------------------- */ /* --------- SECTION 0 END ---------- */ /* ---------------------------------- */ @@ -505,4 +512,6 @@ void ${prefix}network_run_cluster(void *args) { /* ---------------------------------- */ /* --------- SECTION 3 END ---------- */ /* ---------------------------------- */ + + ne16_nnx_term(ne16_pulp_get_dev()); } From 456816b7b0fbf9bd6bfd980c7797e977b150e1a5 Mon Sep 17 00:00:00 2001 From: Luka Macan Date: Tue, 30 Jan 2024 15:11:58 +0100 Subject: [PATCH 21/30] Move ne16_tasks from static global to the stack --- .../layer_L2_c_conv_ne16_multicore_template.c | 45 +++++++++++++------ 1 file changed, 32 insertions(+), 13 deletions(-) diff --git a/dory/Hardware_targets/PULP/GAP9_NE16/Templates/layer_templates/layer_L2_c_conv_ne16_multicore_template.c b/dory/Hardware_targets/PULP/GAP9_NE16/Templates/layer_templates/layer_L2_c_conv_ne16_multicore_template.c index bfd85ad6..6b125c85 100644 --- a/dory/Hardware_targets/PULP/GAP9_NE16/Templates/layer_templates/layer_L2_c_conv_ne16_multicore_template.c +++ b/dory/Hardware_targets/PULP/GAP9_NE16/Templates/layer_templates/layer_L2_c_conv_ne16_multicore_template.c @@ -183,7 +183,6 @@ static int inc(int index, int end) { #define BUFFER_SIZE (2) static Layer tiles[BUFFER_SIZE]; -static ne16_task_t ne16_tasks[BUFFER_SIZE]; static DmaTransferConf store_conf[BUFFER_SIZE]; static struct { @@ -191,33 +190,44 @@ static struct { } monitor; -static void layer_task_fork(void *args) { +struct layer_task_fork_args_t { + uint32_t L2_input; + uint32_t L2_weights; + uint32_t L2_output; + uint32_t L1_buffer; + uint32_t padding; + ne16_task_t *ne16_tasks; +}; + + +static void layer_task_fork(void *void_args) { const int total_tiles = end_index.height * end_index.width * end_index.output_channel; + struct 
layer_task_fork_args_t *args = (struct layer_task_fork_args_t *)void_args; + ne16_task_t *ne16_tasks = args->ne16_tasks; + // Loader if (pi_core_id() == LOADER_ID) { - layer_args_t *layer_args = (layer_args_t *)args; - Layer layer = { .addr = { - .input = layer_args->L2_input, - .weights = layer_args->L2_weights, - .scale = layer_args->L2_weights + ${l2_k_offset}, - .bias = layer_args->L2_weights + ${l2_lambda_offset}, - .output = layer_args->L2_output + .input = args->L2_input, + .weights = args->L2_weights, + .scale = args->L2_weights + ${l2_k_offset}, + .bias = args->L2_weights + ${l2_lambda_offset}, + .output = args->L2_output }, .padding = { - .top = layer_args->padding & NET_UTILS_PAD_TOP ? ${padding_top} : NE16_DONT_PAD, + .top = args->padding & NET_UTILS_PAD_TOP ? ${padding_top} : NE16_DONT_PAD, .right = ${padding_right}, - .bottom = layer_args->padding & NET_UTILS_PAD_BOTTOM ? ${padding_bottom} : NE16_DONT_PAD, + .bottom = args->padding & NET_UTILS_PAD_BOTTOM ? ${padding_bottom} : NE16_DONT_PAD, .left = ${padding_left} } }; // Double buffer address init - const unsigned int l1_buffer = layer_args->L1_buffer; + const unsigned int l1_buffer = args->L1_buffer; const int l1_buffer_input = l1_buffer + ${l1_x_offset}; const int l1_buffer_output = l1_buffer + ${l1_y_offset}; const int l1_buffer_weights = l1_buffer + ${l1_W_offset}; @@ -371,6 +381,7 @@ void ${func_name}(void *args) { } // Init nnx tasks + ne16_task_t ne16_tasks[BUFFER_SIZE]; for (int i = 0; i < BUFFER_SIZE; i++) { ne16_task_init(&ne16_tasks[i]); ne16_task_set_op_to_conv(&ne16_tasks[i], ${fs1}, ${int(flag_DW)}, ${stride}); @@ -392,7 +403,15 @@ void ${func_name}(void *args) { // Fork - pi_cl_team_fork(CORES, (void *)layer_task_fork, args); + struct layer_task_fork_args_t layer_task_fork_args = { + .L2_input = layer_args->L2_input, + .L2_weights = layer_args->L2_weights, + .L2_output = layer_args->L2_output, + .L1_buffer = layer_args->L1_buffer, + .padding = layer_args->padding, + .ne16_tasks = ne16_tasks, + }; + pi_cl_team_fork(CORES, layer_task_fork, (void *)&layer_task_fork_args); // Terminate From 08a58191690b3966a68be3b1d0e2c79b1a0a1034 Mon Sep 17 00:00:00 2001 From: Luka Macan Date: Tue, 30 Jan 2024 15:18:04 +0100 Subject: [PATCH 22/30] Move global static tiles to stack --- .../layer_L2_c_conv_ne16_multicore_template.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/dory/Hardware_targets/PULP/GAP9_NE16/Templates/layer_templates/layer_L2_c_conv_ne16_multicore_template.c b/dory/Hardware_targets/PULP/GAP9_NE16/Templates/layer_templates/layer_L2_c_conv_ne16_multicore_template.c index 6b125c85..34803e5e 100644 --- a/dory/Hardware_targets/PULP/GAP9_NE16/Templates/layer_templates/layer_L2_c_conv_ne16_multicore_template.c +++ b/dory/Hardware_targets/PULP/GAP9_NE16/Templates/layer_templates/layer_L2_c_conv_ne16_multicore_template.c @@ -182,7 +182,6 @@ static int inc(int index, int end) { #define BUFFER_SIZE (2) -static Layer tiles[BUFFER_SIZE]; static DmaTransferConf store_conf[BUFFER_SIZE]; static struct { @@ -197,6 +196,7 @@ struct layer_task_fork_args_t { uint32_t L1_buffer; uint32_t padding; ne16_task_t *ne16_tasks; + Layer *tiles; }; @@ -205,6 +205,7 @@ static void layer_task_fork(void *void_args) { struct layer_task_fork_args_t *args = (struct layer_task_fork_args_t *)void_args; ne16_task_t *ne16_tasks = args->ne16_tasks; + Layer *tiles = args->tiles; // Loader @@ -400,6 +401,7 @@ void ${func_name}(void *args) { ne16_task_set_weight_offset(&ne16_tasks[i], weightOffsetModeLayerWise, ${weight_offset}); 
} + Layer tiles[BUFFER_SIZE]; // Fork @@ -410,6 +412,7 @@ void ${func_name}(void *args) { .L1_buffer = layer_args->L1_buffer, .padding = layer_args->padding, .ne16_tasks = ne16_tasks, + .tiles = tiles, }; pi_cl_team_fork(CORES, layer_task_fork, (void *)&layer_task_fork_args); From 361eb0c436f9df998953f4776cf0afa0f0bfe075 Mon Sep 17 00:00:00 2001 From: Luka Macan Date: Tue, 30 Jan 2024 15:22:48 +0100 Subject: [PATCH 23/30] Move global static store_conf to stack --- .../layer_L2_c_conv_ne16_multicore_template.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/dory/Hardware_targets/PULP/GAP9_NE16/Templates/layer_templates/layer_L2_c_conv_ne16_multicore_template.c b/dory/Hardware_targets/PULP/GAP9_NE16/Templates/layer_templates/layer_L2_c_conv_ne16_multicore_template.c index 34803e5e..834fdcfd 100644 --- a/dory/Hardware_targets/PULP/GAP9_NE16/Templates/layer_templates/layer_L2_c_conv_ne16_multicore_template.c +++ b/dory/Hardware_targets/PULP/GAP9_NE16/Templates/layer_templates/layer_L2_c_conv_ne16_multicore_template.c @@ -182,8 +182,6 @@ static int inc(int index, int end) { #define BUFFER_SIZE (2) -static DmaTransferConf store_conf[BUFFER_SIZE]; - static struct { Monitor input, output, store_conf; } monitor; @@ -197,6 +195,7 @@ struct layer_task_fork_args_t { uint32_t padding; ne16_task_t *ne16_tasks; Layer *tiles; + DmaTransferConf *store_conf; }; @@ -206,6 +205,7 @@ static void layer_task_fork(void *void_args) { struct layer_task_fork_args_t *args = (struct layer_task_fork_args_t *)void_args; ne16_task_t *ne16_tasks = args->ne16_tasks; Layer *tiles = args->tiles; + DmaTransferConf *store_conf = args->store_conf; // Loader @@ -402,6 +402,7 @@ void ${func_name}(void *args) { } Layer tiles[BUFFER_SIZE]; + DmaTransferConf store_conf[BUFFER_SIZE]; // Fork @@ -413,6 +414,7 @@ void ${func_name}(void *args) { .padding = layer_args->padding, .ne16_tasks = ne16_tasks, .tiles = tiles, + .store_conf = store_conf, }; pi_cl_team_fork(CORES, layer_task_fork, (void *)&layer_task_fork_args); From ef7dcb71f1252a23603d544324046bde130a8e4b Mon Sep 17 00:00:00 2001 From: Luka Macan Date: Tue, 30 Jan 2024 15:31:09 +0100 Subject: [PATCH 24/30] Move global static monitor to stack --- .../layer_L2_c_conv_ne16_multicore_template.c | 33 ++++++++++--------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/dory/Hardware_targets/PULP/GAP9_NE16/Templates/layer_templates/layer_L2_c_conv_ne16_multicore_template.c b/dory/Hardware_targets/PULP/GAP9_NE16/Templates/layer_templates/layer_L2_c_conv_ne16_multicore_template.c index 834fdcfd..d9e584e8 100644 --- a/dory/Hardware_targets/PULP/GAP9_NE16/Templates/layer_templates/layer_L2_c_conv_ne16_multicore_template.c +++ b/dory/Hardware_targets/PULP/GAP9_NE16/Templates/layer_templates/layer_L2_c_conv_ne16_multicore_template.c @@ -182,10 +182,9 @@ static int inc(int index, int end) { #define BUFFER_SIZE (2) -static struct { +typedef struct { Monitor input, output, store_conf; -} monitor; - +} TaskMonitors; struct layer_task_fork_args_t { uint32_t L2_input; @@ -196,6 +195,7 @@ struct layer_task_fork_args_t { ne16_task_t *ne16_tasks; Layer *tiles; DmaTransferConf *store_conf; + TaskMonitors *monitor; }; @@ -206,6 +206,7 @@ static void layer_task_fork(void *void_args) { ne16_task_t *ne16_tasks = args->ne16_tasks; Layer *tiles = args->tiles; DmaTransferConf *store_conf = args->store_conf; + TaskMonitors *monitor = args->monitor; // Loader @@ -274,7 +275,7 @@ static void layer_task_fork(void *void_args) { const Address local_addr = 
tile_status_get_addr(tile_status, buffer_addresses); Layer tile = tile_create(tile_status.index, end_index, body, border, layer, local_addr); - monitor_produce_begin(monitor.input); + monitor_produce_begin(monitor->input); tiles[i_buff] = tile; dma_mutex_lock(); DmaTransfer transfer = dma_transfer_create(); @@ -288,11 +289,11 @@ static void layer_task_fork(void *void_args) { dma_mutex_lock(); dma_transfer_wait(transfer); dma_mutex_unlock(); - monitor_produce_end(monitor.input); + monitor_produce_end(monitor->input); - monitor_produce_begin(monitor.store_conf); + monitor_produce_begin(monitor->store_conf); store_prepare(tiles[i_buff], body, layer, tile_status.index, &store_conf[i_buff]); - monitor_produce_end(monitor.store_conf); + monitor_produce_end(monitor->store_conf); i_buff = inc(i_buff, BUFFER_SIZE); tile_status = tile_status_get_next(tile_status, end_index, layer, 0, kernel); @@ -305,8 +306,8 @@ static void layer_task_fork(void *void_args) { if (pi_core_id() == EXECUTER_ID) { int i_buff = 0; for (int i_tile = 0; i_tile < total_tiles; i_tile++) { - monitor_consume_begin(monitor.input); - monitor_produce_begin(monitor.output); + monitor_consume_begin(monitor->input); + monitor_produce_begin(monitor->output); % if stride == 1: execute_async(&ne16_tasks[i_buff]); @@ -314,7 +315,7 @@ static void layer_task_fork(void *void_args) { execute_stride2x2_blocking(&ne16_tasks[i_buff], tiles[i_buff], kernel); % endif - monitor_produce_end(monitor.output); + monitor_produce_end(monitor->output); i_buff = inc(i_buff, BUFFER_SIZE); } @@ -326,11 +327,11 @@ static void layer_task_fork(void *void_args) { if (pi_core_id() == STORER_ID) { int i_buff = 0; for (int i_tile = 0; i_tile < total_tiles; i_tile++) { - monitor_consume_begin(monitor.store_conf); - monitor_consume_begin(monitor.output); + monitor_consume_begin(monitor->store_conf); + monitor_consume_begin(monitor->output); execute_wait(&ne16_tasks[i_buff]); - monitor_consume_end(monitor.input); + monitor_consume_end(monitor->input); dma_mutex_lock(); DmaTransfer transfer = dma_transfer_create(); @@ -341,8 +342,8 @@ static void layer_task_fork(void *void_args) { dma_transfer_wait(transfer); dma_mutex_unlock(); - monitor_consume_end(monitor.store_conf); - monitor_consume_end(monitor.output); + monitor_consume_end(monitor->store_conf); + monitor_consume_end(monitor->output); i_buff = inc(i_buff, BUFFER_SIZE); } @@ -360,6 +361,7 @@ void ${func_name}(void *args) { layer_args_t *layer_args = (layer_args_t *)args; // Initialization + TaskMonitors monitor; int err = 0; @@ -415,6 +417,7 @@ void ${func_name}(void *args) { .ne16_tasks = ne16_tasks, .tiles = tiles, .store_conf = store_conf, + .monitor = &monitor, }; pi_cl_team_fork(CORES, layer_task_fork, (void *)&layer_task_fork_args); From f080507b4ca1e8d7d6d41fa139605c078d3b68f8 Mon Sep 17 00:00:00 2001 From: Luka Macan Date: Tue, 30 Jan 2024 16:02:10 +0100 Subject: [PATCH 25/30] Fix path to network template --- dory/Hardware_targets/PULP/Common/C_Parser.py | 4 ++-- dory/Hardware_targets/PULP/GAP9_NE16/C_Parser.py | 4 ++-- dory/Parsers/Parser_HW_to_C.py | 11 ++++++++--- .../Utils/Templates_writer/Network_template_writer.py | 7 ++++--- 4 files changed, 16 insertions(+), 10 deletions(-) diff --git a/dory/Hardware_targets/PULP/Common/C_Parser.py b/dory/Hardware_targets/PULP/Common/C_Parser.py index e7da3f60..514bbb3e 100644 --- a/dory/Hardware_targets/PULP/Common/C_Parser.py +++ b/dory/Hardware_targets/PULP/Common/C_Parser.py @@ -93,8 +93,8 @@ def mapping_layers_to_C_files(self): if n_memory_levels > 2 and 
(node.L3_input != 0 or (node.tiling_dimensions["L3"]["output_dimensions"] != node.tiling_dimensions["L2"]["output_dimensions"]) or (node.tiling_dimensions["L3"]["weights_dimensions"] != node.tiling_dimensions["L2"]["weights_dimensions"])): tk = Layer2D_writer.print_template_layer_L3(node) - TemplateWriter.write(tk, {os.path.join(self.src_dir, node.prefixed_name + ".c"): os.path.join(self.tmpl_dir, "layer_L3_c_template.c"), - os.path.join(self.inc_dir, node.prefixed_name + ".h"): os.path.join(self.tmpl_dir, "layer_L3_h_template.h")}) + TemplateWriter.write(tk, {os.path.join(self.src_dir, node.prefixed_name + ".c"): os.path.join(self.layer_tmpl_dir, "layer_L3_c_template.c"), + os.path.join(self.inc_dir, node.prefixed_name + ".h"): os.path.join(self.layer_tmpl_dir, "layer_L3_h_template.h")}) if node.tiling_dimensions["L3"]["input_dimensions"][1] > node.tiling_dimensions["L2"]["input_dimensions"][1]: node.tiling_dimensions["L2"]["output_dimensions"][1] = int(np.floor((node.tiling_dimensions["L2"]["input_dimensions"][1] - node.kernel_shape[0] + node.strides[0]) / node.strides[0])) if node.tiling_dimensions["L3"]["output_dimensions"][1] > node.tiling_dimensions["L2"]["output_dimensions"][1]: diff --git a/dory/Hardware_targets/PULP/GAP9_NE16/C_Parser.py b/dory/Hardware_targets/PULP/GAP9_NE16/C_Parser.py index 3a1da89a..29ad78b0 100644 --- a/dory/Hardware_targets/PULP/GAP9_NE16/C_Parser.py +++ b/dory/Hardware_targets/PULP/GAP9_NE16/C_Parser.py @@ -72,8 +72,8 @@ def mapping_layers_to_C_files(self): if n_memory_levels > 2 and (node.L3_input != 0 or (node.tiling_dimensions["L3"]["output_dimensions"] != node.tiling_dimensions["L2"]["output_dimensions"]) or (node.tiling_dimensions["L3"]["weights_dimensions"] != node.tiling_dimensions["L2"]["weights_dimensions"])): #breakpoint() tk = Layer2D_writer.print_template_layer_L3(node) - TemplateWriter.write(tk, {os.path.join(self.src_dir, node.prefixed_name + ".c"): os.path.join(self.tmpl_dir, "layer_L3_c_template.c"), - os.path.join(self.inc_dir, node.prefixed_name + ".h"): os.path.join(self.tmpl_dir, "layer_L3_h_template.h")}) + TemplateWriter.write(tk, {os.path.join(self.src_dir, node.prefixed_name + ".c"): os.path.join(self.layer_tmpl_dir, "layer_L3_c_template.c"), + os.path.join(self.inc_dir, node.prefixed_name + ".h"): os.path.join(self.layer_tmpl_dir, "layer_L3_h_template.h")}) if node.tiling_dimensions["L3"]["input_dimensions"][1] > node.tiling_dimensions["L2"]["input_dimensions"][1]: node.tiling_dimensions["L2"]["output_dimensions"][1] = (node.tiling_dimensions["L2"]["input_dimensions"][1] - node.kernel_shape[0] + node.strides[0]) // node.strides[0] if node.tiling_dimensions["L3"]["output_dimensions"][1] > node.tiling_dimensions["L2"]["output_dimensions"][1]: diff --git a/dory/Parsers/Parser_HW_to_C.py b/dory/Parsers/Parser_HW_to_C.py index 24064d9d..212e1671 100644 --- a/dory/Parsers/Parser_HW_to_C.py +++ b/dory/Parsers/Parser_HW_to_C.py @@ -57,6 +57,7 @@ def mapping_network_to_C_file(self): self.config_file, self.verbose_level, self.perf_layer, + self.tmpl_dir, self.app_directory, self.inc_dir_rel, self.src_dir_rel) @@ -86,8 +87,8 @@ def l2_c_template(self, node, backend_library): def l2_template_mapping(self, node, backend_library): tmpl_c = self.l2_c_template(node, backend_library) return { - os.path.join(self.src_dir, node.name + ".c"): os.path.join(self.tmpl_dir, tmpl_c), - os.path.join(self.inc_dir, node.name + ".h"): os.path.join(self.tmpl_dir, "layer_L2_h_template.h"), + os.path.join(self.src_dir, node.name + ".c"): 
os.path.join(self.layer_tmpl_dir, tmpl_c), + os.path.join(self.inc_dir, node.name + ".h"): os.path.join(self.layer_tmpl_dir, "layer_L2_h_template.h"), } def mapping_layers_to_C_files(self): @@ -170,7 +171,11 @@ def get_file_path(self): @property def tmpl_dir(self): - return os.path.realpath(os.path.join(self.get_file_path(), 'Templates/layer_templates')) + return os.path.realpath(os.path.join(self.get_file_path(), 'Templates')) + + @property + def layer_tmpl_dir(self): + return os.path.realpath(os.path.join(self.tmpl_dir, 'layer_templates')) @property def utils_files_dir(self): diff --git a/dory/Utils/Templates_writer/Network_template_writer.py b/dory/Utils/Templates_writer/Network_template_writer.py index b7c66be9..b8abdd88 100644 --- a/dory/Utils/Templates_writer/Network_template_writer.py +++ b/dory/Utils/Templates_writer/Network_template_writer.py @@ -29,6 +29,7 @@ def print_template_network( config_file, verbose_level, perf_layer, + tmpl_dir, app_directory, inc_dir_rel, src_dir_rel @@ -86,19 +87,19 @@ def print_template_network( l += "// %s %s\n" % (k.ljust(30), v) tk['DORY_HW_graph'] = graph root = os.path.realpath(os.path.dirname(__file__)) - tmpl = Template(filename=os.path.join(root, "../../Hardware_targets", HW_description["name"], "Templates/network_c_template.c")) + tmpl = Template(filename=os.path.join(tmpl_dir, "network_c_template.c")) s = tmpl.render(verbose_log=l, **tk) save_string = os.path.join(app_directory, src_dir_rel, prefix + 'network.c') with open(save_string, "w") as f: f.write(s) - tmpl = Template(filename=os.path.join(root, "../../Hardware_targets", HW_description["name"], "Templates/network_h_template.h")) + tmpl = Template(filename=os.path.join(tmpl_dir, "network_h_template.h")) s = tmpl.render(verbose_log=l, **tk) save_string = os.path.join(app_directory, inc_dir_rel, prefix + 'network.h') with open(save_string, "w") as f: f.write(s) - tmpl = Template(filename=os.path.join(root, "../../Hardware_targets", HW_description["name"], "Templates/main_template.c")) + tmpl = Template(filename=os.path.join(tmpl_dir, "main_template.c")) s = tmpl.render(verbose_log=l, **tk) save_string = os.path.join(app_directory, src_dir_rel, prefix + 'main.c') with open(save_string, "w") as f: From ce5a4f45f91d4d08c3e942802a5f69fe268e7f58 Mon Sep 17 00:00:00 2001 From: Luka Macan Date: Tue, 30 Jan 2024 16:02:27 +0100 Subject: [PATCH 26/30] Initialize monitors only once in network --- .../PULP/GAP9/Utils_files/net_utils.h | 6 +++ .../layer_L2_c_conv_ne16_multicore_template.c | 33 +----------- .../GAP9_NE16/Templates/network_c_template.c | 27 +++++++++- .../PULP/GAP9_NE16/Utils_files/monitor.c | 42 +++++++++++++++ .../PULP/GAP9_NE16/Utils_files/monitor.h | 51 +++++-------------- 5 files changed, 87 insertions(+), 72 deletions(-) create mode 100644 dory/Hardware_targets/PULP/GAP9_NE16/Utils_files/monitor.c diff --git a/dory/Hardware_targets/PULP/GAP9/Utils_files/net_utils.h b/dory/Hardware_targets/PULP/GAP9/Utils_files/net_utils.h index 1ed57c63..0464d0f4 100644 --- a/dory/Hardware_targets/PULP/GAP9/Utils_files/net_utils.h +++ b/dory/Hardware_targets/PULP/GAP9/Utils_files/net_utils.h @@ -2,6 +2,7 @@ #define __PERF_UTILS_H__ #include #include +#include "monitor.h" // Padding flags @@ -11,6 +12,10 @@ #define NET_UTILS_PAD_LEFT (1 << 0) #define NET_UTILS_NO_PAD (0) +typedef struct { + Monitor input, output, store_conf; +} TaskMonitors; + typedef struct { unsigned int L3_input; unsigned int L3_output; @@ -23,6 +28,7 @@ typedef struct { unsigned int ram; unsigned int padding; unsigned int 
layer_id; + TaskMonitors *monitor; } layer_args_t; void print_perf(const char *name, const int cycles, const int macs); diff --git a/dory/Hardware_targets/PULP/GAP9_NE16/Templates/layer_templates/layer_L2_c_conv_ne16_multicore_template.c b/dory/Hardware_targets/PULP/GAP9_NE16/Templates/layer_templates/layer_L2_c_conv_ne16_multicore_template.c index d9e584e8..e6fe39d4 100644 --- a/dory/Hardware_targets/PULP/GAP9_NE16/Templates/layer_templates/layer_L2_c_conv_ne16_multicore_template.c +++ b/dory/Hardware_targets/PULP/GAP9_NE16/Templates/layer_templates/layer_L2_c_conv_ne16_multicore_template.c @@ -182,10 +182,6 @@ static int inc(int index, int end) { #define BUFFER_SIZE (2) -typedef struct { - Monitor input, output, store_conf; -} TaskMonitors; - struct layer_task_fork_args_t { uint32_t L2_input; uint32_t L2_weights; @@ -360,29 +356,6 @@ void ${func_name}(void *args) { layer_args_t *layer_args = (layer_args_t *)args; - // Initialization - TaskMonitors monitor; - - int err = 0; - - if (err = monitor_init(&monitor.input, BUFFER_SIZE)) { - printf("Input monitor initialization failed with status %d.\n", err); - return; - } - - if (err = monitor_init(&monitor.output, BUFFER_SIZE)) { - printf("Output monitor initialization failed with status %d.\n", err); - monitor_term(monitor.input); - return; - } - - if (err = monitor_init(&monitor.store_conf, BUFFER_SIZE)) { - printf("Store conf monitor initialization failed with status %d.\n", err); - monitor_term(monitor.input); - monitor_term(monitor.output); - return; - } - // Init nnx tasks ne16_task_t ne16_tasks[BUFFER_SIZE]; for (int i = 0; i < BUFFER_SIZE; i++) { @@ -417,14 +390,10 @@ void ${func_name}(void *args) { .ne16_tasks = ne16_tasks, .tiles = tiles, .store_conf = store_conf, - .monitor = &monitor, + .monitor = layer_args->monitor, }; pi_cl_team_fork(CORES, layer_task_fork, (void *)&layer_task_fork_args); // Terminate - - monitor_term(monitor.input); - monitor_term(monitor.output); - monitor_term(monitor.store_conf); } diff --git a/dory/Hardware_targets/PULP/GAP9_NE16/Templates/network_c_template.c b/dory/Hardware_targets/PULP/GAP9_NE16/Templates/network_c_template.c index cfbf6096..ac443d03 100644 --- a/dory/Hardware_targets/PULP/GAP9_NE16/Templates/network_c_template.c +++ b/dory/Hardware_targets/PULP/GAP9_NE16/Templates/network_c_template.c @@ -216,6 +216,27 @@ void ${prefix}network_run_cluster(void *args) { const ne16_pulp_conf_t ne16_pulp_conf = {.max_stall = 8}; ne16_nnx_init(ne16_pulp_get_dev(), &ne16_pulp_conf); + TaskMonitors monitor; + + int err = 0; + + if (err = monitor_init(&monitor.input, 2)) { + printf("Input monitor initialization failed with status %d.\n", err); + return; + } + + if (err = monitor_init(&monitor.output, 2)) { + printf("Output monitor initialization failed with status %d.\n", err); + monitor_term(monitor.input); + return; + } + + if (err = monitor_init(&monitor.store_conf, 2)) { + printf("Store conf monitor initialization failed with status %d.\n", err); + monitor_term(monitor.input); + monitor_term(monitor.output); + return; + } /* ---------------------------------- */ /* --------- SECTION 0 END ---------- */ /* ---------------------------------- */ @@ -317,7 +338,8 @@ void ${prefix}network_run_cluster(void *args) { .L1_buffer = 0, .ram = (unsigned int) get_ram_ptr(), .padding = NET_UTILS_PAD_TOP | NET_UTILS_PAD_BOTTOM, - .layer_id = i + .layer_id = i, + .monitor = &monitor, }; % if 'Yes' in performance or 'Perf_final' in verbose_level: @@ -514,4 +536,7 @@ void ${prefix}network_run_cluster(void *args) { /* 
---------------------------------- */ ne16_nnx_term(ne16_pulp_get_dev()); + monitor_term(monitor.input); + monitor_term(monitor.output); + monitor_term(monitor.store_conf); } diff --git a/dory/Hardware_targets/PULP/GAP9_NE16/Utils_files/monitor.c b/dory/Hardware_targets/PULP/GAP9_NE16/Utils_files/monitor.c new file mode 100644 index 00000000..730a2c45 --- /dev/null +++ b/dory/Hardware_targets/PULP/GAP9_NE16/Utils_files/monitor.c @@ -0,0 +1,42 @@ +#include "monitor.h" +#include "pmsis.h" + + +int monitor_init(Monitor * const monitor, int buffer_size) { + monitor->empty = pi_cl_sem_alloc(); + if (monitor->empty == 0) { + return -1; + } + + monitor->full = pi_cl_sem_alloc(); + if (monitor->full == 0) { + pi_cl_sem_free(monitor->empty); + return -2; + } + + pi_cl_sem_set(monitor->full, 0); + pi_cl_sem_set(monitor->empty, buffer_size); + + return 0; +} + +void monitor_term(Monitor monitor) { + pi_cl_sem_free(monitor.empty); + pi_cl_sem_free(monitor.full); +} + +void monitor_produce_begin(Monitor monitor) { + pi_cl_sem_dec(monitor.empty); +} + +void monitor_produce_end(Monitor monitor) { + pi_cl_sem_inc(monitor.full, 1); +} + +void monitor_consume_begin(Monitor monitor) { + pi_cl_sem_dec(monitor.full); +} + +void monitor_consume_end(Monitor monitor) { + pi_cl_sem_inc(monitor.empty, 1); +} diff --git a/dory/Hardware_targets/PULP/GAP9_NE16/Utils_files/monitor.h b/dory/Hardware_targets/PULP/GAP9_NE16/Utils_files/monitor.h index f1179401..b471d994 100644 --- a/dory/Hardware_targets/PULP/GAP9_NE16/Utils_files/monitor.h +++ b/dory/Hardware_targets/PULP/GAP9_NE16/Utils_files/monitor.h @@ -1,4 +1,8 @@ -#include "pmsis.h" +#ifndef __MONITOR_H__ +#define __MONITOR_H__ + +#include + typedef struct Monitor { uint32_t empty; @@ -6,42 +10,11 @@ typedef struct Monitor { } Monitor; -static int monitor_init(Monitor * const monitor, int buffer_size) { - monitor->empty = pi_cl_sem_alloc(); - if (monitor->empty == 0) { - return -1; - } - - monitor->full = pi_cl_sem_alloc(); - if (monitor->full == 0) { - pi_cl_sem_free(monitor->empty); - return -2; - } - - pi_cl_sem_set(monitor->full, 0); - pi_cl_sem_set(monitor->empty, buffer_size); - - return 0; -} - -static void monitor_term(Monitor monitor) { - pi_cl_sem_free(monitor.empty); - pi_cl_sem_free(monitor.full); -} - -static void monitor_produce_begin(Monitor monitor) { - pi_cl_sem_dec(monitor.empty); -} - -static void monitor_produce_end(Monitor monitor) { - pi_cl_sem_inc(monitor.full, 1); -} - -static void monitor_consume_begin(Monitor monitor) { - pi_cl_sem_dec(monitor.full); -} - -static void monitor_consume_end(Monitor monitor) { - pi_cl_sem_inc(monitor.empty, 1); -} +int monitor_init(Monitor * const monitor, int buffer_size); +void monitor_term(Monitor monitor); +void monitor_produce_begin(Monitor monitor); +void monitor_produce_end(Monitor monitor); +void monitor_consume_begin(Monitor monitor); +void monitor_consume_end(Monitor monitor); +#endif // __MONITOR_H__ From fd5bdf91d621d319f327725fd3ee0b027d0b3ede Mon Sep 17 00:00:00 2001 From: Luka Macan Date: Thu, 1 Feb 2024 08:46:31 +0100 Subject: [PATCH 27/30] Fix relu check for ne16 --- dory/Hardware_targets/PULP/GAP9_NE16/HW_Parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dory/Hardware_targets/PULP/GAP9_NE16/HW_Parser.py b/dory/Hardware_targets/PULP/GAP9_NE16/HW_Parser.py index a5588601..45fd3520 100644 --- a/dory/Hardware_targets/PULP/GAP9_NE16/HW_Parser.py +++ b/dory/Hardware_targets/PULP/GAP9_NE16/HW_Parser.py @@ -60,7 +60,7 @@ def valid_ne16_node(self, node, idx): 
bias_type=IntegerType(name=f"{node.constant_type}{node.bias_bits}"), has_norm_quant=True, # TODO has_bias=True, - has_relu="Relu" in node.name + has_relu=node.min >= 0 ) return True, "" except ValidationError as e: From 55734135127a673e22a91f3e410b4ebdeceeae78 Mon Sep 17 00:00:00 2001 From: Luka Macan Date: Thu, 1 Feb 2024 09:43:49 +0100 Subject: [PATCH 28/30] Fix ne16 relu flag in the layer template --- .../layer_templates/layer_L2_c_conv_ne16_multicore_template.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dory/Hardware_targets/PULP/GAP9_NE16/Templates/layer_templates/layer_L2_c_conv_ne16_multicore_template.c b/dory/Hardware_targets/PULP/GAP9_NE16/Templates/layer_templates/layer_L2_c_conv_ne16_multicore_template.c index e6fe39d4..85642bfb 100644 --- a/dory/Hardware_targets/PULP/GAP9_NE16/Templates/layer_templates/layer_L2_c_conv_ne16_multicore_template.c +++ b/dory/Hardware_targets/PULP/GAP9_NE16/Templates/layer_templates/layer_L2_c_conv_ne16_multicore_template.c @@ -366,7 +366,7 @@ void ${func_name}(void *args) { &ne16_tasks[i], (ne16_quant_t) { .shift_amount = ${out_shift}, - .function = quantFunctionRelu, // todo: un-hardcode it + .function = ${"quantFunctionIdentity" if node.min < 0 else "quantFunctionRelu"}, .flag_rounding = ne16TaskFlagFalse }, (ne16_norm_t) { .mode = ${"normMode32Bit" if act_dim_bit == 32 else "normMode8Bit" if act_dim_bit == 8 else ""}, From 2f5d6ce4cc3679f13084b717742dc9536ccd143f Mon Sep 17 00:00:00 2001 From: Luka Macan Date: Wed, 14 Feb 2024 10:47:56 +0100 Subject: [PATCH 29/30] Reduce stack size to 2kB for all cluster cores --- dory/Hardware_targets/PULP/GAP9/HW_description.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dory/Hardware_targets/PULP/GAP9/HW_description.json b/dory/Hardware_targets/PULP/GAP9/HW_description.json index 46b28b5c..00a1e6ee 100644 --- a/dory/Hardware_targets/PULP/GAP9/HW_description.json +++ b/dory/Hardware_targets/PULP/GAP9/HW_description.json @@ -37,8 +37,8 @@ "accelerator frequency": 370000000, "peripheral frequency": 370000000, "HW specific parameters":{ - "accelerator core0 stack": 3500, - "accelerator core1-7 stack": 3400 + "accelerator core0 stack": 2048, + "accelerator core1-7 stack": 2048 }, "double_buffering": 2, "split_ints": true, From 4bc1357cd0a340faef69db7b48af16b90ca52c68 Mon Sep 17 00:00:00 2001 From: Luka Macan Date: Tue, 19 Mar 2024 12:16:41 +0100 Subject: [PATCH 30/30] Bump pulp-nnx --- .../PULP/Backend_Kernels/pulp-nnx | 2 +- .../PULP/GAP9_NE16/Utils_files/execute.h | 20 +++++++++---------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/dory/Hardware_targets/PULP/Backend_Kernels/pulp-nnx b/dory/Hardware_targets/PULP/Backend_Kernels/pulp-nnx index 70b089ed..234971fc 160000 --- a/dory/Hardware_targets/PULP/Backend_Kernels/pulp-nnx +++ b/dory/Hardware_targets/PULP/Backend_Kernels/pulp-nnx @@ -1 +1 @@ -Subproject commit 70b089ed7a050fd99c775f9b7504dfcbd732c02f +Subproject commit 234971fca4a0eba5e8b703e9ccb62b7764dac7fa diff --git a/dory/Hardware_targets/PULP/GAP9_NE16/Utils_files/execute.h b/dory/Hardware_targets/PULP/GAP9_NE16/Utils_files/execute.h index aa07add1..46a237a6 100644 --- a/dory/Hardware_targets/PULP/GAP9_NE16/Utils_files/execute.h +++ b/dory/Hardware_targets/PULP/GAP9_NE16/Utils_files/execute.h @@ -17,11 +17,11 @@ static inline void execute_prepare(Layer tile, ne16_task_t *const task) { tile.output.height, tile.output.width, tile.output.channel, tile.output.width * tile.output.channel, tile.output.channel, tile.padding.top, 
tile.padding.bottom, - tile.padding.right, tile.padding.left); - ne16_task_set_ptrs(task, tile.addr.input, tile.input.width, - tile.input.channel, tile.padding.top, tile.padding.left, - tile.addr.output, tile.addr.weights, tile.addr.scale, - 0 /*shift_ptr*/, tile.addr.bias); + tile.padding.left, tile.padding.right); + ne16_task_set_addr_conv(task, tile.addr.input, tile.input.width, + tile.input.channel, tile.padding.top, tile.padding.left, + tile.addr.output, tile.addr.weights); + ne16_task_set_addr_norm_quant(task, tile.addr.scale, 0 /*shift_addr*/, tile.addr.bias); } static inline void execute_async(ne16_task_t *task) { @@ -37,11 +37,11 @@ static inline void execute_stride2x2_prepare(Layer tile, Kernel kernel, tile.output.height, tile.output.width, tile.output.channel, tile.output.width * tile.output.channel, tile.output.channel, kernel.shape.height, kernel.shape.width, tile.padding.top, - tile.padding.bottom, tile.padding.right, tile.padding.left); - ne16_task_set_ptrs(task, tile.addr.input, tile.input.width, - tile.input.channel, tile.padding.top, tile.padding.left, - tile.addr.output, tile.addr.weights, tile.addr.scale, - 0 /*shift_ptr*/, tile.addr.bias); + tile.padding.bottom, tile.padding.left, tile.padding.right); + ne16_task_set_addr_conv(task, tile.addr.input, tile.input.width, + tile.input.channel, tile.padding.top, tile.padding.left, + tile.addr.output, tile.addr.weights); + ne16_task_set_addr_norm_quant(task, tile.addr.scale, 0 /*shift_addr*/, tile.addr.bias); } static inline void execute_stride2x2_blocking(ne16_task_t *task, Layer tile,