diff --git a/PyTorchSimFrontend/extension_codecache.py b/PyTorchSimFrontend/extension_codecache.py index cf50a26b..c217cb9c 100644 --- a/PyTorchSimFrontend/extension_codecache.py +++ b/PyTorchSimFrontend/extension_codecache.py @@ -135,7 +135,7 @@ def __init__(self, message="SPAD overflow occurred."): super().__init__(message) class TileSizeError(Exception): - def __init__(self, message="SPAD overflow occurred."): + def __init__(self, message="Tile size constraint violated."): super().__init__(message) class MLIRCodeCache: @@ -243,12 +243,10 @@ def load(cls, source_code, cycle_list = cyclesim.compile_and_simulate(os.path.join(write_path, cycle_binary_name), vectorlane_size, silent_mode=silent_mode) # Create TOG - w_offset, x_offset = vectorlane_size, vectorlane_size + x_offset = vectorlane_size if kwargs['loop_size'] is not None and kwargs['loop_size'][-3] < vectorlane_size: x_offset = kwargs['loop_size'][-3] - if kwargs['loop_size'] is not None and kwargs['loop_size'][-1] < vectorlane_size: - w_offset = kwargs['loop_size'][-1] - w_offset = 0 # max(w_offset - x_offset, 0) + w_offset = 0 tile_graph_generator = tog_generator(origins) tile_graph_generator.load_file(raw_tog_path) tile_graph_generator.generate_tile_graph( diff --git a/PyTorchSimFrontend/extension_op.py b/PyTorchSimFrontend/extension_op.py index e6351101..1fc439c6 100644 --- a/PyTorchSimFrontend/extension_op.py +++ b/PyTorchSimFrontend/extension_op.py @@ -113,71 +113,6 @@ def generate_outer_product_matrix(a, b, M, K, N, prefix, dir_path): address_matrix_c=address_matrix_b+(n_nonzeros*data_width) return 0, address_matrix_b, address_matrix_c -def generate_inner_product_matrix(a, b, M, K, N, file_name, in_file_bitmap_a, in_file_bitmap_b): - data_width = 4 - a_cpu = a.cpu() - b_cpu = b.cpu() - matrixA_size=int(M*K) - matrixB_size=int(N*K) - matrixC_size=int(M*N) - - random.seed(a=0, version=2) - - address_matrix_a = 0 - with open(file_name, "w") as fd, open(in_file_bitmap_a, "w") as fbA, open(in_file_bitmap_b, "w") as fbB: - #generating matrixA - n_nonzeros=0 - for m in range(M): # Row major - for k in range(K): - is_sparse = a_cpu[m,k] - if(torch.isclose(is_sparse, torch.zeros(1), atol=1e-1)): - if((m==(M-1)) and (k==(K-1))): - fbA.write(str(1)) - else: - fbA.write(str(1)+","); #writing a 1 in bitmap - ba = bytearray(struct.pack(">f", is_sparse)) # generating list of bytes - my_int = int.from_bytes(ba, "big") - fd.write(str(my_int)) - fd.write(",") - n_nonzeros+=1 - else: - if((m==(M-1)) and (k==(K-1))): # this is to insert a comma - fbA.write(str(0)) - # note no data element is inserted in this case - else: - # note no data element is inserted in this case - fbA.write(str(0)+",") - - address_matrix_b=n_nonzeros*data_width - #Generating matrix B - n_nonzeros=0 - bitmapB=list(range(0,matrixB_size)) - for n in range(0,N): # Row major - for k in range(0,K): - is_sparse = b_cpu[k,n] - if(torch.isclose(is_sparse, torch.zeros(1), atol=1e-1)): # value is generated - bitmapB[k*N+n]=1 - ba = bytearray(struct.pack(">f", float(is_sparse))) # generating list of bytes - my_int = int.from_bytes(ba, "big") - fd.write(str(my_int)) - fd.write(",") - n_nonzeros+=1 - else: - # no data element is inserted in this case - bitmapB[k*N+n]=0; #writing a 0 - # writing the bitmapB in the appropiate order - for i in range(0, matrixB_size): - fbB.write(str(bitmapB[i])) - if(i < (matrixB_size-1)): - fbB.write(",") - - fd.write(str(0)) # Adding a final 0 to the memory which will never be used. This is just to avoid having a last comma. - address_matrix_c=address_matrix_b+(n_nonzeros*data_width) - print("Offset matrix A: "+str(address_matrix_a)) - print("Offset matrix B: "+str(address_matrix_b)) - print("Offset matrix C: "+str(address_matrix_c)) - return address_matrix_a, matrixA_size, matrixA_size+matrixB_size - def prepare_outer_product_matrix(a, b, out): M, K, N = a.shape[0], b.shape[0], b.shape[1] diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py index 734ca967..38875dd0 100644 --- a/PyTorchSimFrontend/mlir/mlir_common.py +++ b/PyTorchSimFrontend/mlir/mlir_common.py @@ -22,8 +22,6 @@ from typing import Callable -import sympy - from torch.utils._sympy.value_ranges import ValueRanges from torch._inductor.utils import ( get_sympy_Expr_dtype, @@ -553,7 +551,7 @@ def get_name(self) -> str: return self.name def get_tile_size(self): return list(self._tile_size) def get_tile_stride(self): return list(self._tile_stride) def get_numel(self) -> int :return math.prod(self._tile_size) - def get_nr_dim(self) -> str: return len(self._tile_size) + def get_nr_dim(self) -> int: return len(self._tile_size) def get_reduction_numel(self): return reduce(mul, self.get_tile_size()[-1*self.nr_rdim:], 1) def set_tile_size(self, tile_size, tile_axis_order=None, constraints=None): diff --git a/PyTorchSimFrontend/mlir/mlir_scheduling.py b/PyTorchSimFrontend/mlir/mlir_scheduling.py index 22d1011b..86323e7d 100644 --- a/PyTorchSimFrontend/mlir/mlir_scheduling.py +++ b/PyTorchSimFrontend/mlir/mlir_scheduling.py @@ -60,7 +60,7 @@ def can_fuse_with_exceptions(self, node1: BaseSchedulerNode, node2: BaseSchedule return False if list(node1.read_writes.writes)[0].name in [dep.name for dep in node2.read_writes.reads]: - node1 = self.revert_group(node1) + self.revert_group(node1) return True return self.scheduler.can_fuse_origin(node1, node2) @@ -182,7 +182,7 @@ def can_fuse_horizontal(self, node1, node2): try: stride = [i.strip()[:-1].split(",")[-1].strip() for i in str(node2.get_nodes()[0].node).split("\n") if "r0" in i][1] stride = int(sympify(stride).coeff(target_symbol)) - except: + except Exception: return False # We can't fuse dim=-1 & N == 1