Changes from all commits (30 commits)
78974a1  Harmonize onnx mangling (Dec 1, 2023)
646710f  Harmonize quantlib renaming last output node (Dec 1, 2023)
71d75bf  Add some string parsing (Dec 5, 2023)
41fd9b6  Parser fixes (lukamac, Dec 19, 2023)
115f8b3  Fix L3-L2 tiling (lukamac, Dec 19, 2023)
56553f8  Fix l2 mapping overwriting l3 layers (lukamac, Dec 21, 2023)
ed308bb  Fix func_names doesn't have the padded func versions (lukamac, Jan 8, 2024)
d8e8e7d  Fix missing node.L3_input for GAP9_NE16 (lukamac, Jan 8, 2024)
726401d  Revert remove mapping_layers_to_C_files (lukamac, Jan 8, 2024)
e95828a  Add ne16 check for padded functions (lukamac, Jan 8, 2024)
38052e4  Bump pulp-nnx (lukamac, Jan 8, 2024)
7037b46  Fix padding flag passing (lukamac, Jan 9, 2024)
997b241  Fix unused padding with strided convolution for ne16 (lukamac, Jan 9, 2024)
858a62b  Rewrite convolution overlap (lukamac, Jan 9, 2024)
fe647a5  Small rewrites for add (lukamac, Jan 18, 2024)
d195966  Add last checksum (lukamac, Jan 18, 2024)
9706289  WIP: Upgrade pulp-nnx (lukamac, Jan 30, 2024)
397e95c  Copy network template to GAP9_NE16 (lukamac, Jan 30, 2024)
a9c0320  Move dma mutex init to network (lukamac, Jan 30, 2024)
bd19a21  Move ne16 init to network (lukamac, Jan 30, 2024)
456816b  Move ne16_tasks from static global to the stack (lukamac, Jan 30, 2024)
08a5819  Move global static tiles to stack (lukamac, Jan 30, 2024)
361eb0c  Move global static store_conf to stack (lukamac, Jan 30, 2024)
ef7dcb7  Move global static monitor to stack (lukamac, Jan 30, 2024)
f080507  Fix path to network template (lukamac, Jan 30, 2024)
ce5a4f4  Initialize monitors only once in network (lukamac, Jan 30, 2024)
fd5bdf9  Fix relu check for ne16 (lukamac, Feb 1, 2024)
5573413  Fix ne16 relu flag in the layer template (lukamac, Feb 1, 2024)
2f5d6ce  Reduce stack size to 2kB for all cluster cores (lukamac, Feb 14, 2024)
4bc1357  Bump pulp-nnx (lukamac, Mar 19, 2024)
6 changes: 5 additions & 1 deletion dory/Frontend_frameworks/Quantlab/Parser.py
@@ -93,7 +93,11 @@ def frontend_mapping_to_DORY_nodes(self):
# input 1 parameters
#### Look at the order of inputs in Onnx. If the lowest index
# is not the first argument, reverse the order of inmul1 and inmul2
if int(node.input_indexes[0]) > int(node.input_indexes[1]):
def pos_in_schedule(node: str) -> int:
for idx, node_iterating in enumerate(list(self.graph.graph.node)):
if node_iterating.output[0] == node:
return idx
if pos_in_schedule(node.input_indexes[0]) > pos_in_schedule(node.input_indexes[1]):
temp_shift = node.in1_shift
temp_mul = node.in1_mul
temp_add = node.in1_add
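The old check compared the raw input_indexes strings as integers, which only works when the ONNX tensor names happen to be numeric; the new helper compares the producing nodes' positions in the graph schedule instead. A minimal standalone sketch of the same lookup (the graph and tensor names here are hypothetical, not DORY code):

# Standalone sketch of the schedule-position lookup added above; assumes an
# onnx.GraphProto whose `node` list is in topological (schedule) order.
import onnx

def pos_in_schedule(graph: onnx.GraphProto, tensor_name: str) -> int:
    """Index of the node that produces `tensor_name`."""
    for idx, node in enumerate(graph.node):
        if node.output[0] == tensor_name:
            return idx
    raise ValueError(f"no node produces '{tensor_name}'")  # sketch-only guard

Note that the helper in the diff returns None implicitly when nothing matches; the guard above only makes the sketch fail loudly.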
28 changes: 18 additions & 10 deletions dory/Hardware_targets/PULP/Backend_Kernels/BackendKernelsAdapter.py
@@ -177,24 +177,32 @@ def _check_valid_node(self, node: HW_node) -> None:
if self.accelerator_name == "ne16":
assert True # TODO

def _get_ne16_src_files(self) -> List[str]:
return ["src/pulp_nnx_ne16.c", "ne16/hal/ne16_hal.c"]
def _get_acc_src_files(self) -> List[str]:
return [
f"{self.accelerator_name}/hal/{self.accelerator_name}.c",
f"{self.accelerator_name}/hal/{self.accelerator_name}_task.c",
f"{self.accelerator_name}/bsp/{self.accelerator_name}_pulp_bsp.c",
f"src/pulp_nnx_{self.accelerator_name}.c",
]

def _get_ne16_inc_files(self) -> List[str]:
def _get_acc_inc_files(self) -> List[str]:
return [
"ne16/gvsoc/ne16_gvsoc_logging.h",
"ne16/hal/ne16_defs.h",
"ne16/hal/ne16_hal.h",
f"{self.accelerator_name}/gvsoc/{self.accelerator_name}_gvsoc.h",
f"{self.accelerator_name}/hal/{self.accelerator_name}_task_defs.h",
f"{self.accelerator_name}/hal/{self.accelerator_name}.h",
f"{self.accelerator_name}/hal/{self.accelerator_name}_task.h",
f"{self.accelerator_name}/bsp/{self.accelerator_name}_pulp_bsp.h",
f"inc/pulp_nnx_{self.accelerator_name}.h",
]

def _get_src_files(self) -> List[str]:
src_files = ["util/pulp_nnx_util.c"]
src_files = ["util/pulp_nnx_util.c", "util/hwpe.c"]
if self.accelerator_name == "ne16":
src_files += self._get_ne16_src_files()
src_files += self._get_acc_src_files()
return [os.path.join(self._root_dir(), file) for file in src_files]

def _get_inc_files(self) -> List[str]:
inc_files = ["inc/pulp_nnx.h", "util/pulp_nnx_util.h"]
inc_files = ["util/pulp_nnx_util.h", "util/hwpe.h"]
if self.accelerator_name == "ne16":
inc_files += self._get_ne16_inc_files()
inc_files += self._get_acc_inc_files()
return [os.path.join(self._root_dir(), file) for file in inc_files]
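Templating the paths on accelerator_name prepares the adapter for accelerators beyond ne16 (the updated submodule below also ships neureka sources). An illustrative rendering of _get_acc_src_files() for accelerator_name == "ne16"; the resolved paths match the submodule listing that follows:

# Illustrative rendering of the templated source list for "ne16".
accelerator_name = "ne16"
src_files = [
    f"{accelerator_name}/hal/{accelerator_name}.c",           # ne16/hal/ne16.c
    f"{accelerator_name}/hal/{accelerator_name}_task.c",      # ne16/hal/ne16_task.c
    f"{accelerator_name}/bsp/{accelerator_name}_pulp_bsp.c",  # ne16/bsp/ne16_pulp_bsp.c
    f"src/pulp_nnx_{accelerator_name}.c",                     # src/pulp_nnx_ne16.c
]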
2 changes: 1 addition & 1 deletion dory/Hardware_targets/PULP/Backend_Kernels/pulp-nnx
Submodule pulp-nnx updated 79 files
+20 −4 .gitlab-ci.yml
+59 −0 CHANGELOG.md
+52 −0 CMakeLists.txt
+61 −46 README.md
+0 −75 inc/pulp_nnx.h
+76 −0 inc/pulp_nnx_ne16.h
+61 −0 inc/pulp_nnx_neureka.h
+36 −0 ne16/README.md
+85 −0 ne16/bsp/ne16_pulp_bsp.c
+81 −0 ne16/bsp/ne16_pulp_bsp.h
+20 −15 ne16/gvsoc/ne16_gvsoc.h
+13 −6 ne16/hal/ne16.c
+15 −10 ne16/hal/ne16.h
+0 −157 ne16/hal/ne16_defs.h
+0 −224 ne16/hal/ne16_hal.c
+0 −197 ne16/hal/ne16_hal.h
+260 −0 ne16/hal/ne16_task.c
+193 −0 ne16/hal/ne16_task.h
+122 −0 ne16/hal/ne16_task_defs.h
+34 −0 neureka/README.md
+78 −0 neureka/bsp/neureka_siracusa_bsp.c
+67 −0 neureka/bsp/neureka_siracusa_bsp.h
+54 −0 neureka/gvsoc/neureka_gvsoc.h
+37 −0 neureka/hal/neureka.c
+15 −5 neureka/hal/neureka.h
+245 −0 neureka/hal/neureka_task.c
+188 −0 neureka/hal/neureka_task.h
+124 −0 neureka/hal/neureka_task_defs.h
+0 −167 neureka/inc/pulp_nnx_defs.h
+0 −217 neureka/inc/pulp_nnx_hal.h
+0 −412 neureka/src/pulp_nnx_hal.c
+68 −170 src/pulp_nnx_ne16.c
+76 −0 src/pulp_nnx_neureka.c
+1 −1 test/.gitignore
+4 −0 test/.isort.cfg
+52 −12 test/HeaderWriter.py
+0 −94 test/Ne16.py
+99 −0 test/Ne16MemoryLayout.py
+117 −0 test/Ne16TestConf.py
+134 −0 test/NeuralEngineFunctionalModel.py
+156 −0 test/NeurekaMemoryLayout.py
+110 −0 test/NeurekaTestConf.py
+160 −225 test/NnxTestClasses.py
+6 −0 test/README.md
+7 −7 test/TestClasses.py
+25 −15 test/app/Makefile
+11 −8 test/app/src/main.c
+190 −62 test/app/src/nnx_layer.c
+8 −5 test/conf.toml
+54 −9 test/conftest.py
+1 −0 test/requirements-dev.txt
+2 −2 test/requirements.txt
+40 −16 test/test.py
+118 −56 test/testgen.py
+29 −0 test/tests/test_100/conf.json
+29 −0 test/tests/test_101/conf.json
+29 −0 test/tests/test_102/conf.json
+29 −0 test/tests/test_103/conf.json
+29 −0 test/tests/test_104/conf.json
+29 −0 test/tests/test_105/conf.json
+29 −0 test/tests/test_106/conf.json
+29 −0 test/tests/test_107/conf.json
+29 −0 test/tests/test_108/conf.json
+29 −0 test/tests/test_109/conf.json
+29 −0 test/tests/test_110/conf.json
+29 −0 test/tests/test_111/conf.json
+29 −0 test/tests/test_112/conf.json
+29 −0 test/tests/test_113/conf.json
+29 −0 test/tests/test_114/conf.json
+29 −0 test/tests/test_115/conf.json
+29 −0 test/tests/test_116/conf.json
+29 −0 test/tests/test_117/conf.json
+29 −0 test/tests/test_118/conf.json
+30 −0 test/tests/test_119/conf.json
+30 −0 test/tests/test_120/conf.json
+85 −0 util/hwpe.c
+43 −0 util/hwpe.h
+7 −5 util/pulp_nnx_util.c
+11 −9 util/pulp_nnx_util.h
4 changes: 2 additions & 2 deletions dory/Hardware_targets/PULP/Common/C_Parser.py
@@ -93,8 +93,8 @@ def mapping_layers_to_C_files(self):

if n_memory_levels > 2 and (node.L3_input != 0 or (node.tiling_dimensions["L3"]["output_dimensions"] != node.tiling_dimensions["L2"]["output_dimensions"]) or (node.tiling_dimensions["L3"]["weights_dimensions"] != node.tiling_dimensions["L2"]["weights_dimensions"])):
tk = Layer2D_writer.print_template_layer_L3(node)
TemplateWriter.write(tk, {os.path.join(self.src_dir, node.prefixed_name + ".c"): os.path.join(self.tmpl_dir, "layer_L3_c_template.c"),
os.path.join(self.inc_dir, node.prefixed_name + ".h"): os.path.join(self.tmpl_dir, "layer_L3_h_template.h")})
TemplateWriter.write(tk, {os.path.join(self.src_dir, node.prefixed_name + ".c"): os.path.join(self.layer_tmpl_dir, "layer_L3_c_template.c"),
os.path.join(self.inc_dir, node.prefixed_name + ".h"): os.path.join(self.layer_tmpl_dir, "layer_L3_h_template.h")})
if node.tiling_dimensions["L3"]["input_dimensions"][1] > node.tiling_dimensions["L2"]["input_dimensions"][1]:
node.tiling_dimensions["L2"]["output_dimensions"][1] = int(np.floor((node.tiling_dimensions["L2"]["input_dimensions"][1] - node.kernel_shape[0] + node.strides[0]) / node.strides[0]))
if node.tiling_dimensions["L3"]["output_dimensions"][1] > node.tiling_dimensions["L2"]["output_dimensions"][1]:
@@ -17,11 +17,9 @@
* limitations under the License.
*/
#include "${func_name_L3}.h"
#include "${func_name[0]}.h"
% if len(func_name)>1:
#include "${func_name[1]}.h"
#include "${func_name[2]}.h"
%endif
% for name in func_name:
#include "${name}.h"
% endfor
#include "dory_get_tile.h"
#include "pmsis.h"
#include "bsp/fs.h"
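Replacing the three hard-coded includes with a loop makes this L3 template agnostic to how many kernel variants a layer has (one, or one plus padded top/bottom versions). A quick Mako rendering of the new loop, with hypothetical function names:

# Quick Mako check of the rewritten include loop (names are made up).
from mako.template import Template

tmpl = Template(
    "% for name in func_name:\n"
    '#include "${name}.h"\n'
    "% endfor\n"
)
print(tmpl.render(func_name=["layer0", "layer0_pad_top", "layer0_pad_bottom"]))
# Emits one #include line per entry, for any length of func_name.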
45 changes: 15 additions & 30 deletions dory/Hardware_targets/PULP/Common/Tiler/tiler_add.py
@@ -91,62 +91,47 @@ def get_tiling_Add_L2(self):
# return immediately if the buffer fits in L1
if buffer_total <= L1_memory:
return ([], self.HW_node.tiling_dimensions["L2"]["input_dimensions"] , self.HW_node.tiling_dimensions["L2"]["output_dimensions"] )
else:
db = self.double_buffering

db = self.double_buffering

parameters = pywrapcp.Solver.DefaultSolverParameters()
solver = pywrapcp.Solver("simple_CP", parameters)

# integer positive variables.
tile_n = solver.IntVar(1, in_ch, 'tile_n')
tile_h_in = solver.IntVar(ks[0], inp_dim[0], 'tile_h_in')
tile_w_in = solver.IntVar(ks[1], inp_dim[1], 'tile_w_in')
tile_h_out = solver.IntVar(1, out_dim[0], 'tile_h_out')
tile_w_out = solver.IntVar(1, out_dim[1], 'tile_w_out')
assert in_ch == out_ch

# scaling is used to ensure datasize is integer
solver.Add(tile_h_out == tile_h_in)
solver.Add(tile_w_out == tile_w_in)
solver.Add(tile_n == in_ch)
# integer positive variables.
tile_n = in_ch
tile_h = solver.IntVar(1, inp_dim[0], 'tile_h_in')
tile_w = inp_dim[1]

# CONSTRAINTS: managing of correct dimensions (no decimal h_out and any
# type of rounding)
input_tile_dimension = (db * in_ch * tile_h_in * inp_dim[1] * self.HW_node.input_activation_bits + 7 ) // 8 # the 7 is to account for bit precision of 1, which still occupy an entire byte
output_tile_dimension = (db * out_ch * tile_h_out * out_dim[1] * self.HW_node.output_activation_bits + 7 ) // 8 # the 7 is to account for bit precision of 1, which still occupy an entire byte
input_tile_dimension = (db * tile_n * tile_h * tile_w * self.HW_node.input_activation_bits + 7 ) // 8 # the 7 is to account for bit precision of 1, which still occupy an entire byte
output_tile_dimension = (db * tile_n * tile_h * tile_w * self.HW_node.output_activation_bits + 7 ) // 8 # the 7 is to account for bit precision of 1, which still occupy an entire byte
constraint_all = input_tile_dimension * int(np.ceil(1+self.HW_node.second_input_activation_bits/self.HW_node.input_activation_bits)) + output_tile_dimension
solver.Add(constraint_all <= L1_memory)

# objective
obj_expr = solver.IntVar(0, 1000000000000, "obj_expr")
solver.Add(obj_expr == 1000000 * constraint_all
+ 10 * tile_w_in
+ 1 * tile_h_in
+ 1000 * tile_n)
solver.Add(obj_expr == constraint_all)
objective = solver.Maximize(obj_expr, 1)

decision_builder = solver.Phase([tile_n, tile_h_in, tile_w_in, tile_h_out, tile_w_out],
decision_builder = solver.Phase([tile_h],
solver.CHOOSE_FIRST_UNBOUND,
solver.ASSIGN_MIN_VALUE)

# Create a solution collector.
collector = solver.LastSolutionCollector()
# Add the decision variables.
collector.Add(tile_n)
collector.Add(tile_h_in)
collector.Add(tile_w_in)
collector.Add(tile_h_out)
collector.Add(tile_w_out)
collector.Add(tile_h)
# Add the objective.
collector.AddObjective(obj_expr)

solver.Solve(decision_builder, [objective, collector])
if collector.SolutionCount() > 0:
best_solution = collector.SolutionCount() - 1
tile_n = collector.Value(best_solution, tile_n)
tile_h_in = collector.Value(best_solution, tile_h_in)
tile_w_in = collector.Value(best_solution, tile_w_in)
tile_h_out = collector.Value(best_solution, tile_h_out)
tile_w_out = collector.Value(best_solution, tile_w_out)
return ([], [tile_n, tile_h_in, tile_w_in], [tile_n, tile_h_out, tile_w_out])
tile_h = collector.Value(best_solution, tile_h)
return ([], [tile_n, tile_h, tile_w], [tile_n, tile_h, tile_w])
print(" Add ERROR: no tiling found. Exiting...")
os._exit(0)
return None
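An elementwise Add never changes the channel count or spatial width, so the rewrite pins tile_n = in_ch and tile_w = inp_dim[1] and leaves tile_h as the only decision variable, with the objective reduced to plain L1 occupancy. A self-contained sketch of the reduced model using the same ortools pywrapcp API (all dimensions are made up):

# Reduced tiling model for Add: only tile_h is free; channels and width stay
# whole. Self-contained sketch with made-up dimensions (not DORY code).
import numpy as np
from ortools.constraint_solver import pywrapcp

in_ch, inp_h, inp_w = 64, 32, 32      # hypothetical layer shape
in_bits = out_bits = in2_bits = 8     # activation precisions
db = 2                                # double buffering
L1_memory = 64 * 1024

solver = pywrapcp.Solver("simple_CP", pywrapcp.Solver.DefaultSolverParameters())
tile_h = solver.IntVar(1, inp_h, "tile_h")

# Tile footprints in bytes; the +7 rounds sub-byte precisions up to whole bytes.
input_tile = (db * in_ch * tile_h * inp_w * in_bits + 7) // 8
output_tile = (db * in_ch * tile_h * inp_w * out_bits + 7) // 8
total = input_tile * int(np.ceil(1 + in2_bits / in_bits)) + output_tile
solver.Add(total <= L1_memory)

# Maximize L1 occupancy: the largest feasible tile wins.
obj = solver.IntVar(0, 10**12, "obj")
solver.Add(obj == total)
objective = solver.Maximize(obj, 1)

collector = solver.LastSolutionCollector()
collector.Add(tile_h)
collector.AddObjective(obj)

solver.Solve(
    solver.Phase([tile_h], solver.CHOOSE_FIRST_UNBOUND, solver.ASSIGN_MIN_VALUE),
    [objective, collector],
)
if collector.SolutionCount() > 0:
    best = collector.SolutionCount() - 1
    print("tile_h =", collector.Value(best, tile_h))  # 5 for these numbers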
4 changes: 2 additions & 2 deletions dory/Hardware_targets/PULP/GAP9/HW_description.json
@@ -37,8 +37,8 @@
"accelerator frequency": 370000000,
"peripheral frequency": 370000000,
"HW specific parameters":{
"accelerator core0 stack": 3500,
"accelerator core1-7 stack": 3400
"accelerator core0 stack": 2048,
"accelerator core1-7 stack": 2048
},
"double_buffering": 2,
"split_ints": true,
@@ -119,8 +119,8 @@ static void kernel(Layer tile, Layer tile2) {
tile.addr.input,
tile2.addr.input,
tile.addr.output,
${inmul2},
${inmul1},
${inmul2},
${outshift},
tile.input.width,
tile.input.height,
@@ -17,11 +17,9 @@
* limitations under the License.
*/
#include "${func_name_L3}.h"
#include "${func_name[0]}.h"
% if len(func_name)>1:
#include "${func_name[1]}.h"
#include "${func_name[2]}.h"
%endif
% for name in func_name:
#include "${name}.h"
% endfor
#include "dory_get_tile.h"
#include "pmsis.h"
#include "bsp/fs.h"
@@ -131,6 +129,8 @@ void __attribute__ ((noinline)) ${func_name_L3}(void *args)
int offset_x = ${dim_in-int(conv_overlap1*n_in*w_in*BitIn/8) - int(padding*n_in*w_in*BitIn/8)};

% if n_tile_x > 1:
uint32_t padding = tile_args.padding;

// loop over input/output tiles
for(int j = 0; j < ${n_tile_x}; j++) {
// Fetching next input tile
@@ -187,6 +187,7 @@ void __attribute__ ((noinline)) ${func_name_L3}(void *args)
tile_args.L2_output = dory_get_tile_3d(db[i_db_y].y, ${0 if n_tile_y > 1 else 'j'}, 0, k, ${h_out}, ${w_out}, ${n_out}, ${w_out}, ${n_out * n_tile_W}, 0, 0, 0, 0, 0, 0, ${y_data_size_byte});
tile_args.L2_weights = db[i_db_w].w;

% if len(func_name) > 1:
if (j==0) {
${func_name[1] if (n_tile_x > 1 or n_tile_y > 1) and padding > 0 else func_name[0]}((void*)&tile_args);
} \
@@ -199,6 +200,19 @@
else if (j == (${n_tile_y-1})) {
} else {
${func_name[0]}((void*)&tile_args);
}
% else:
% if n_tile_x > 1:
// LMACAN: Hacky way to add the padding flag passing.
tile_args.padding = 0;
if (j == 0) {
tile_args.padding |= padding & NET_UTILS_PAD_TOP;
}
if (j == ${n_tile_x - 1}) {
tile_args.padding |= padding & NET_UTILS_PAD_BOTTOM;
}
% endif
${func_name[0]}(&tile_args);
% endif

% if n_tile_W > 1:
// waiting for weights, lambda, and k
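With a single unpadded kernel variant (len(func_name) == 1), padding can no longer be baked into separate function versions, so it travels as a runtime flag: the first vertical tile keeps only the top padding and the last tile only the bottom. A Python rendering of the mask logic; the NET_UTILS_PAD_TOP/BOTTOM bit values below are assumed, since the net_utils.h hunk in this PR only shows PAD_LEFT:

# Python rendering of the per-tile padding mask above. TOP/BOTTOM bit
# positions are assumed, not taken from this PR's net_utils.h hunk.
NET_UTILS_PAD_TOP = 1 << 3     # assumed
NET_UTILS_PAD_BOTTOM = 1 << 1  # assumed

def tile_padding(layer_padding: int, j: int, n_tile_x: int) -> int:
    pad = 0
    if j == 0:                 # first tile: keep the layer's top padding
        pad |= layer_padding & NET_UTILS_PAD_TOP
    if j == n_tile_x - 1:      # last tile: keep the bottom padding
        pad |= layer_padding & NET_UTILS_PAD_BOTTOM
    return pad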
@@ -479,6 +479,12 @@ void ${prefix}network_run_cluster(void *args) {
${prefix}cycle_network_execution += io_cyc;
% endif

% if 'Last' in verbose_level:
checksum("final output", L2_output,
activations_out_size[${len(DORY_HW_graph)-1}],
activations_out_checksum[${len(DORY_HW_graph)-1}][exec]);
% endif

//memcpy(L2_output, l2_final_output, activations_out_size[${len(DORY_HW_graph)-1}]); // BUGGY!
for (int i=0; i<activations_out_size[${len(DORY_HW_graph)-1}]; i++)
*((uint8_t*)(l2_final_output+i)) = *((uint8_t*)(L2_output+i));
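With 'Last' in verbose_level, the network now also verifies the final output buffer before copying it to l2_final_output. A reference sketch of what such a check computes, assuming a simple sum-of-bytes checksum (the actual checksum() body is not part of this hunk):

# Reference sketch of the final-output check, assuming a sum-of-bytes
# checksum; the real checksum() implementation is not shown in this PR.
def checksum(name: str, buf: bytes, size: int, expected: int) -> None:
    actual = sum(buf[:size])
    status = "OK" if actual == expected else f"FAIL ({actual} != {expected})"
    print(f"Checking {name}: {status}")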
6 changes: 6 additions & 0 deletions dory/Hardware_targets/PULP/GAP9/Utils_files/net_utils.h
@@ -2,6 +2,7 @@
#define __PERF_UTILS_H__
#include <stddef.h>
#include <stdint.h>
#include "monitor.h"

// Padding flags

@@ -11,6 +12,10 @@
#define NET_UTILS_PAD_LEFT (1 << 0)
#define NET_UTILS_NO_PAD (0)

typedef struct {
Monitor input, output, store_conf;
} TaskMonitors;

typedef struct {
unsigned int L3_input;
unsigned int L3_output;
@@ -23,6 +28,7 @@ typedef struct {
unsigned int ram;
unsigned int padding;
unsigned int layer_id;
TaskMonitors *monitor;
} layer_args_t;

void print_perf(const char *name, const int cycles, const int macs);
34 changes: 34 additions & 0 deletions dory/Hardware_targets/PULP/GAP9_NE16/C_Parser.py
@@ -17,6 +17,8 @@

from dory.Hardware_targets.PULP.GAP9.C_Parser import C_Parser as C_Parser_gap9
from dory.Hardware_targets.PULP.GAP9_NE16.Ne16_HW_node import Ne16_HW_node
from dory.Utils.Templates_writer.TemplateWriter import TemplateWriter
import dory.Utils.Templates_writer.Layer2D_template_writer as Layer2D_writer
import os
import sys
sys.path.append(os.path.join(os.path.dirname(__file__), "..", "Backend_Kernels", "pulp-nnx", "test"))
@@ -56,6 +58,38 @@ def l2_template_keywords(self, node, backend_library):
tk = self.__nnx_vars(tk, node)
return tk

def mapping_layers_to_C_files(self):
print("\nMapping the layers files to their templates and copying the kernels associated.")
n_memory_levels = self.HW_description['memory']['levels']

for i, node in enumerate(self.HWgraph):
backend_library = self.node_backend_library(node)
self.copy_backend_files(node, backend_library)

#if (node.kernel_shape[0] == 3 and node.group == 96):
#breakpoint()

if n_memory_levels > 2 and (node.L3_input != 0 or (node.tiling_dimensions["L3"]["output_dimensions"] != node.tiling_dimensions["L2"]["output_dimensions"]) or (node.tiling_dimensions["L3"]["weights_dimensions"] != node.tiling_dimensions["L2"]["weights_dimensions"])):
#breakpoint()
tk = Layer2D_writer.print_template_layer_L3(node)
TemplateWriter.write(tk, {os.path.join(self.src_dir, node.prefixed_name + ".c"): os.path.join(self.layer_tmpl_dir, "layer_L3_c_template.c"),
os.path.join(self.inc_dir, node.prefixed_name + ".h"): os.path.join(self.layer_tmpl_dir, "layer_L3_h_template.h")})
if node.tiling_dimensions["L3"]["input_dimensions"][1] > node.tiling_dimensions["L2"]["input_dimensions"][1]:
node.tiling_dimensions["L2"]["output_dimensions"][1] = (node.tiling_dimensions["L2"]["input_dimensions"][1] - node.kernel_shape[0] + node.strides[0]) // node.strides[0]
if node.tiling_dimensions["L3"]["output_dimensions"][1] > node.tiling_dimensions["L2"]["output_dimensions"][1]:
node.tiling_dimensions["L2"]["input_dimensions"][1] = node.tiling_dimensions["L2"]["output_dimensions"][1] * node.strides[0] + node.kernel_shape[0] - node.strides[0]
node.name = node.name + "_L2"
tk = self.l2_template_keywords(node, backend_library)
TemplateWriter.write(tk, self.l2_template_mapping(node, backend_library))
node.name = node.name[:-3]
else:
if node.tiling_dimensions["L2"]["input_dimensions"][2] == node.tiling_dimensions["L1"]["input_dimensions"][2]:
node.tiling_dimensions["L1"]["output_dimensions"][2] = int((node.tiling_dimensions["L1"]["input_dimensions"][2] + (node.pads[1] + node.pads[3]) - node.kernel_shape[1] + node.strides[1]) / node.strides[1])
if node.tiling_dimensions["L2"]["input_dimensions"][1] == node.tiling_dimensions["L1"]["input_dimensions"][1]:
node.tiling_dimensions["L1"]["output_dimensions"][1] = int((node.tiling_dimensions["L1"]["input_dimensions"][1] + (node.pads[0] + node.pads[2]) - node.kernel_shape[0] + node.strides[0]) / node.strides[0])
tk = self.l2_template_keywords(node, backend_library)
TemplateWriter.write(tk, self.l2_template_mapping(node, backend_library))

def __mem_tmpl_vars(self, tk, node, mem_level):
mem_name = f'L{mem_level}'
upper_mem_name = f'L{mem_level + 1}'
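This override duplicates mapping_layers_to_C_files from the common C_Parser almost verbatim; the visible numeric difference is that the L2 output height uses plain integer division instead of int(np.floor(...)). For the positive operands involved, the two forms agree, as a quick check shows:

# Quick equivalence check: floor of a positive quotient equals floor division.
import numpy as np

inp_h, ks, s = 31, 3, 2  # made-up dimensions
assert int(np.floor((inp_h - ks + s) / s)) == (inp_h - ks + s) // s == 15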
30 changes: 20 additions & 10 deletions dory/Hardware_targets/PULP/GAP9_NE16/HW_Parser.py
@@ -25,10 +25,10 @@

from dory.Parsers.HW_node import HW_node
sys.path.append(os.path.join(os.path.dirname(__file__), "..", "Backend_Kernels", "pulp-nnx", "test"))
from Ne16TestClasses import Ne16TestConf
from Ne16TestConf import Ne16TestConf
from TestClasses import IntegerType
from Ne16MemoryLayout import Ne16MemoryLayout
from pydantic import ValidationError
from Ne16 import Ne16


class onnx_manager(onnx_manager_gap9):
@@ -42,7 +42,7 @@ def get_pattern_rewriter(self):
def get_tiler(self):
return Tiler_GAP9

def valid_ne16_node(self, node):
def valid_ne16_node(self, node, idx):
try:
Ne16TestConf(
in_height=node.input_dimensions[0],
@@ -60,19 +60,19 @@ def valid_ne16_node(self, node):
bias_type=IntegerType(name=f"{node.constant_type}{node.bias_bits}"),
has_norm_quant=True, # TODO
has_bias=True,
has_relu=True
has_relu=node.min >= 0
)
return True, ""
except ValidationError as e:
msg = f"WARNING: Failed allocating node {node.name} to the NE16 accelerator. Errors:\n"
msg = f"WARNING: Failed allocating node {node.name}{idx} to the NE16 accelerator. Errors:\n"
for error in e.errors():
msg += f" - {error['msg']}\n"
msg += "\nNOTE: Falling back to cluster engine.\n"
return False, msg

def engine_coloring(self, node):
def engine_coloring(self, node, idx):
if "Conv" in node.op_type or "Convolution" in node.op_type:
is_valid, msg = self.valid_ne16_node(node)
is_valid, msg = self.valid_ne16_node(node, idx)
if is_valid:
node.engine = "ne16"
return
@@ -83,8 +83,8 @@ def engine_coloring(self, node):
def mapping_to_HW_nodes(self):
super().mapping_to_HW_nodes()
print("\nPULP Backend: Assigning nodes to engines.")
for node in self.DORY_Graph:
self.engine_coloring(node)
for i, node in enumerate(self.DORY_Graph):
self.engine_coloring(node, i)
assert all(hasattr(node, "engine") for node in self.DORY_Graph)

def transform_nodes_to_hw_nodes(self):
@@ -110,6 +110,16 @@ def adjust_node_data_layout(self, node, node_id):
## Unroll expects layout to be "CoutCinK"
if weights["layout"] == "CoutKCin":
weights["value"] = np.transpose(weights["value"], (0,3,1,2))
weights["value"] = Ne16.weight_unroll(weights["value"].astype(np.uint8),
weights["value"] = Ne16MemoryLayout.weightEncode(weights["value"].astype(np.uint8),
node.weight_bits, node.group > 1)
weights["layout"] = "CoutCinMajKQwCinMin" # Ne16's special layout

self.adjust_padding(node)

def adjust_padding(self, node):
inp_dim = node.input_dimensions
ks = node.kernel_shape
s = node.strides
padding_top, padding_left, padding_bottom, padding_right = node.pads
node.pads[2] = padding_bottom if (inp_dim[0] - ks[0] + padding_top + padding_bottom) % s[0] == 0 else 0
node.pads[3] = padding_right if (inp_dim[1] - ks[1] + padding_left + padding_right) % s[1] == 0 else 0
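adjust_padding drops trailing padding that a strided kernel would never consume: bottom (resp. right) padding survives only when the padded sliding range divides evenly by the stride, which is what the "Fix unused padding with strided convolution for ne16" commit targets. A worked example with made-up dimensions:

# Worked example of the trailing-padding rule (made-up dimensions).
inp_h, ks_h, s_h = 16, 3, 2
pad_top, pad_bottom = 1, 1
keep_bottom = (inp_h - ks_h + pad_top + pad_bottom) % s_h == 0
print(keep_bottom)  # False: (16 - 3 + 2) % 2 == 1, so pads[2] is zeroed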