diff --git a/AstToEcoreConverter.py b/AstToEcoreConverter.py index 0678388..e1731ab 100644 --- a/AstToEcoreConverter.py +++ b/AstToEcoreConverter.py @@ -29,12 +29,13 @@ def __init__(self, resource_set: ResourceSet, repository, write_in_file, output_ Raises: ValueError: If the repository is None or empty. """ + self.current_module_name = None if repository is None or repository == '': raise ValueError('Directory is required') self.root_directory = repository.replace('\\', '/') - self.epackage = resource_set.get_resource(URI('Basic.ecore')).contents[0] - self.graph = self.epackage.getEClassifier('TypeGraph')( + self.e_package = resource_set.get_resource(URI('Basic.ecore')).contents[0] + self.graph = self.e_package.getEClassifier('TypeGraph')( tName=self.root_directory.split('/')[-1]) # initialize internal structures @@ -54,6 +55,7 @@ def __init__(self, resource_set: ResourceSet, repository, write_in_file, output_ # entries: [module_node, module_name, package_node, package_name] self.imported_libraries = [] self.imported_package = None + self.current_parent = None python_files = [os.path.join(root, file) for root, _, files in os.walk( self.root_directory) for file in files if file.endswith('.py')] @@ -91,7 +93,7 @@ def __init__(self, resource_set: ResourceSet, repository, write_in_file, output_ # create and process modules with contained program entities for file_path in python_files: try: - self.process_file(file_path) + self.process_file(str(file_path)) except Exception as e: if 'invalid syntax' in str(e): logger.warning(f'skipped: {file_path}') @@ -316,7 +318,7 @@ def create_package_hierarchy(self, parent_package, subpackage_names, lib_flag=Tr if e == 0: package_node.parent = parent_package else: - package_node.parent = current_parent + package_node.parent = self.current_parent if lib_flag is True: self.imported_libraries.append( [None, None, package_node, element_lib]) @@ -326,8 +328,8 @@ def create_package_hierarchy(self, parent_package, subpackage_names, lib_flag=Tr [package_node, element_lib, parent_package]) else: self.package_list.append( - [package_node, element_lib, current_parent]) - current_parent = package_node + [package_node, element_lib, self.current_parent]) + self.current_parent = package_node self.imported_package = package_node def create_imported_method_call(self, module_node, method_name, caller_node): @@ -430,12 +432,13 @@ def set_external_module_calls(self): if obj.eClass.name == NodeTypes.METHOD_DEFINITION.value: self.create_method_call(obj, method_name, caller_node) if module_node is None: - # if len==1 simple import .. statement, included only if import is used (in that case len>1) + # if len==1 simple import … statement, included only if import is used (in that case len>1) if len(split_import) > 1: self.call_imported_library.append( [caller_node, imported_instance]) - def set_import_names(self, split_import): + @staticmethod + def set_import_names(split_import): """ Sets the names for imported modules, classes, and methods. @@ -539,9 +542,9 @@ def create_method_call(self, method_node, method_name, caller_node): self.create_calls(caller_node, method_node) def check_for_missing_nodes(self): - """check_list contains all classes with method defs that are created during conversion. - They are compared to the classes with meth defs found in modules in the type graph at the end, - those not found need to be appended to a module, otherwise the meth defs are missing. + """check_list contains all classes with method def that are created during conversion. + They are compared to the classes with meth def found in modules in the type graph at the end, + those not found need to be appended to a module, otherwise the meth def are missing. Entire modules are missing! Perhaps because only .py files are processed. They are created and appended to their packages, which are also created when they are not in the type graph.""" # check if every created TClass node is in type graph @@ -567,7 +570,7 @@ def check_for_missing_nodes(self): ref, ty = self.get_reference_by_name(obj.tName) if ref is not None: imported = ref.split('.') - # if len==1 simple import .. statement, included only if import is used (in that case len>1) + # if len==1 simple import … statement, included only if import is used (in that case len>1) if len(imported) > 1: package_name, subpackage_names, module_name, class_name, method_name = self.set_import_names( imported) @@ -672,14 +675,14 @@ def create_missing_module(self, module_name, class_node, package_node): module_node.namespace = package_node self.graph.modules.append(module_node) - def get_epackage(self): + def get_e_package(self): """ Retrieves the EPackage associated with the graph. Returns: The EPackage instance. """ - return self.epackage + return self.e_package def get_graph(self): """ @@ -690,17 +693,17 @@ def get_graph(self): """ return self.graph - def create_ecore_instance(self, type): + def create_ecore_instance(self, ecore_type): """ Creates an Ecore instance of the specified type. Args: - type: The type of the Ecore instance to create. + ecore_type: The type of the Ecore instance to create. Returns: The created Ecore instance. """ - return self.epackage.getEClassifier(type.value)() + return self.e_package.getEClassifier(ecore_type.value)() def get_current_module(self): """ @@ -780,7 +783,7 @@ def process_file(self, path): self.classes_without_module.remove(class_object) class_object.delete() - # added errors='ignore' to fix encoding issues in some repositories ('charmap cannot decode byte..') + # added errors='ignore' to fix encoding issues in some repositories ('char-map cannot decode byte…') with open(path, 'r', errors='ignore') as file: code = file.read() # added following to fix some invalid character and syntax errors @@ -1015,8 +1018,8 @@ def add_instance(self, instance_name, class_name): instance_name (str?): The name of the instance. class_name (str?): The name of the class to which the instance belongs. """ - reference, type = self.get_reference_by_name(class_name) - if reference is not None and type == 0: + reference, reference_type = self.get_reference_by_name(class_name) + if reference is not None and reference_type == 0: classes = class_name.split('.')[1:] classes.insert(0, reference) class_name = ".".join(classes) @@ -1033,7 +1036,8 @@ def remove_instance(self, class_name): if instance[1] == class_name: self.instances.remove(instance) - def get_method_def_in_class(self, name, class_node): + @staticmethod + def get_method_def_in_class(name, class_node): """ Checks if a method definition exists in a class. @@ -1049,7 +1053,8 @@ def get_method_def_in_class(self, name, class_node): return method_def return None - def get_method_def_in_module(self, method_name, module): + @staticmethod + def get_method_def_in_module(method_name, module): """ Checks if a method definition exists in a module. @@ -1060,12 +1065,12 @@ def get_method_def_in_module(self, method_name, module): Returns: The method definition node or None if not found. """ - for object in module.contains: - if object.eClass.name == NodeTypes.METHOD_DEFINITION.value: - if object.signature.method.tName == method_name: - return object - if object.eClass.name == NodeTypes.CLASS.value: - for meth in object.defines: + for module_object in module.contains: + if module_object.eClass.name == NodeTypes.METHOD_DEFINITION.value: + if module_object.signature.method.tName == method_name: + return module_object + if module_object.eClass.name == NodeTypes.CLASS.value: + for meth in module_object.defines: if meth.signature.method.tName == method_name: return meth return None @@ -1116,11 +1121,12 @@ def create_method_signature(self, method_node, name, arguments): method_node.signature = method_signature - # for interal structure + # for internal structure module_node = self.get_current_module() self.method_list.append([method_node, name, module_node]) - def get_calls(self, caller_node, called_node): + @staticmethod + def get_calls(caller_node, called_node): """ Checks if a call already exists between two nodes. @@ -1222,12 +1228,12 @@ def create_inheritance_structure(self, node, child): """ base_node = None if isinstance(node, ast.Name): - base_node, type = self.graph_class.get_reference_by_name(node.id) + base_node, base_type = self.graph_class.get_reference_by_name(node.id) if base_node is None: base_node = self.graph_class.get_class_by_name( node.id, module=self.graph_class.get_current_module()) base_node.childClasses.append(child) - elif isinstance(base_node, str) and type == 0: + elif isinstance(base_node, str) and base_type == 0: import_parent = None for import_class in base_node.split('.'): import_node = self.graph_class.get_class_by_name( @@ -1271,7 +1277,7 @@ def visit_ClassDef(self, node): self.graph_class.create_method_signature( method_node, method_name, item.args.args) class_node.defines.append(method_node) - # to search for missing meth defs later + # to search for missing meth def later self.graph_class.check_list.append(class_node) self.generic_visit(node) @@ -1378,7 +1384,7 @@ def visit_Call(self, node): [self.current_module, caller_node, instance]) # for calls of imported instances, both within repo and external libraries - instance_from_graph, type = self.graph_class.get_reference_by_name( + instance_from_graph, instance_type = self.graph_class.get_reference_by_name( instance.replace(f".{instance.split('.')[-1]}", '')) # this is necessary to get all the called methods' names correctly diff --git a/CustomDataset.py b/CustomDataset.py index d4f12e1..130a63f 100644 --- a/CustomDataset.py +++ b/CustomDataset.py @@ -6,7 +6,7 @@ from torch.utils.data import Dataset from torch_geometric.data import Data -from DataformatUtils import convert_edge_dim, convert_list_to_floattensor, convert_list_to_longtensor, \ +from DataformatUtils import convert_edge_dim, convert_list_to_float_tensor, convert_list_to_long_tensor, \ convert_hashed_names_to_float from Encoder import multi_hot_encoding from GraphClasses import defined_labels @@ -22,6 +22,7 @@ def __init__(self, directory, label_list=None): label_list (str, optional): The path to the Excel file containing labeled graphs. If provided, the labels will be processed and encoded. """ + self.class_elements = [] if label_list is not None: try: self.encoded_labels = self.convert_labeled_graphs(label_list) @@ -33,6 +34,8 @@ def __init__(self, directory, label_list=None): self.directory = directory self.graph_names = [] self.graph_dir = os.listdir(directory) + self.graph = None + for g, graph in enumerate(self.graph_dir): if '_nodefeatures.csv' in graph: graph_name = graph.removesuffix('_nodefeatures.csv') @@ -69,32 +72,36 @@ def __getitem__(self, index): and optionally the label. """ graph_name = self.graph_names[index] - for g, graph in enumerate(self.graph_dir): + for g, self.graph in enumerate(self.graph_dir): try: - if f'{graph_name}_nodefeatures.csv' == graph: + if f'{graph_name}_nodefeatures.csv' == self.graph: node_features = pd.read_csv( - f'{self.directory}/{graph}', header=None) # load csv file - self.x = convert_hashed_names_to_float(node_features) - if f'{graph_name}_A.csv' == graph: + f'{self.directory}/{self.graph}', header=None) # load csv file + self.x = convert_hashed_names_to_float(node_features.to_numpy()) + if f'{graph_name}_A.csv' == self.graph: adjacency = pd.read_csv( - f'{self.directory}/{graph}', header=None) - edge_tensor = convert_list_to_longtensor(adjacency) + f'{self.directory}/{self.graph}', header=None) + edge_tensor = convert_list_to_long_tensor(adjacency.values.tolist()) self.edge_index = convert_edge_dim(edge_tensor) - if f'{graph_name}_edge_attributes.csv' == graph: + if f'{graph_name}_edge_attributes.csv' == self.graph: edge_attributes = pd.read_csv( - f'{self.directory}/{graph}', header=None) - self.edge_attr = convert_list_to_floattensor( - edge_attributes) + f'{self.directory}/{self.graph}', header=None) + self.edge_attr = convert_list_to_float_tensor( + edge_attributes.values.tolist()) except Exception as e: - print(graph, e) + print(self.graph, e) if hasattr(self, 'x') and hasattr(self, 'edge_index'): - graph = Data(x=self.x, edge_index=self.edge_index) + self.graph = Data(x=self.x, edge_index=self.edge_index) if hasattr(self, 'y'): label = self.y[index] - graph.y = label + self.graph.y = label if hasattr(self, 'edge_attr'): - graph.edge_attr = self.edge_attr - return graph + self.graph.edge_attr = self.edge_attr + return self.graph + + def __iter__(self): + for index in range(len(self)): + yield self[index] def sort_labels(self): """ @@ -105,16 +112,16 @@ def sort_labels(self): torch.FloatTensor: A tensor containing the sorted labels for the graphs. """ label_list = list(self.encoded_labels) - sorted = None + sorted_labels = None for n, item in enumerate(self.graph_names): for i, name in enumerate(label_list): if item == name[0]: label = name[1] - if sorted is None: - sorted = np.array(label, dtype=np.float16) + if sorted_labels is None: + sorted_labels = np.array(label, dtype=np.float16) else: - sorted = np.vstack((sorted, label)).astype(np.float16) - y = torch.FloatTensor(sorted) + sorted_labels = np.vstack((sorted_labels, label)).astype(np.float16) + y = torch.FloatTensor(sorted_labels) return y '''takes directory path of excel file with labeled repositories as input and converts the @@ -142,13 +149,13 @@ def convert_labeled_graphs(self, labels): # iterate over loaded file and retrieve labels for row in resource.iterrows(): - object = row[1] + row_data = row[1] # column header containing repository url - url = object.get('html_url') + url = row_data.get('html_url') repo_name = url.split('/')[-1] # last element is repository name graph_names.append(repo_name) # column header containing label - type_label = object.get('final type') + type_label = row_data.get('final type') graph_labels.append(type_label) self.class_elements = self.count_class_elements( @@ -159,7 +166,8 @@ def convert_labeled_graphs(self, labels): file = zip(graph_names, encoded_nodes) return file - def count_class_elements(self, labels): + @staticmethod + def count_class_elements(labels): """ Counts the number of occurrences of each class type in the provided labels. @@ -209,18 +217,18 @@ def check_dataset(self): """ for i, item in enumerate(self.graph_names): graph_name = self.graph_names[i] - for g, graph in enumerate(self.graph_dir): + for g in self.graph_dir: try: - if f'{graph_name}_nodefeatures.csv' == graph: - node_features = pd.read_csv( - f'{self.directory}/{graph}', header=None) - if f'{graph_name}_A.csv' == graph: - adjacency = pd.read_csv( - f'{self.directory}/{graph}', header=None) - if f'{graph_name}_edge_attributes.csv' == graph: - edge_attributes = pd.read_csv( - f'{self.directory}/{graph}', header=None) + if f'{graph_name}_nodefeatures.csv' == g: + pd.read_csv( + f'{self.directory}/{g}', header=None) + if f'{graph_name}_A.csv' == g: + pd.read_csv( + f'{self.directory}/{g}', header=None) + if f'{graph_name}_edge_attributes.csv' == g: + pd.read_csv( + f'{self.directory}/{g}', header=None) except Exception as e: if graph_name in self.graph_names: self.graph_names.remove(graph_name) - print(f'{graph}, {e}, removing {graph_name} from dataset') + print(f'{g}, {e}, removing {graph_name} from dataset') diff --git a/DataformatUtils.py b/DataformatUtils.py index 26ccd00..bb2fb3f 100644 --- a/DataformatUtils.py +++ b/DataformatUtils.py @@ -20,7 +20,7 @@ def convert_edge_dim(edge_tensor): return edge_tensor -def convert_list_to_floattensor(list): +def convert_list_to_float_tensor(input_list): """ Converts a list to a FloatTensor. @@ -28,16 +28,16 @@ def convert_list_to_floattensor(list): FloatTensor, which is suitable for use in deep learning models. Args: - list (list): The input list containing numerical values. + input_list (list): The input list containing numerical values. Returns: torch.FloatTensor: A FloatTensor representation of the input list. """ - tensor = torch.FloatTensor(np.array(list, dtype=float)) + tensor = torch.FloatTensor(np.array(input_list, dtype=float)) return tensor -def convert_list_to_longtensor(list): +def convert_list_to_long_tensor(input_list): """ Converts a list to a LongTensor. @@ -45,12 +45,12 @@ def convert_list_to_longtensor(list): LongTensor, which is useful for representing indices or counts in models. Args: - list (list): The input list containing integer values. + input_list (list): The input list containing integer values. Returns: torch.LongTensor: A LongTensor representation of the input list. """ - tensor = torch.LongTensor(np.array(list, dtype=int)) + tensor = torch.LongTensor(np.array(input_list, dtype=int)) return tensor @@ -70,10 +70,10 @@ def convert_hashed_names_to_float(features): torch.FloatTensor: A FloatTensor representation of the modified features. """ features = np.array(features) - for h, hash in enumerate(features): - helper = hash[8] + for hash_value in features: + helper = hash_value[8] dec_hash = int(str(helper), 16) dec_hash = dec_hash % 16 # fixed NaN issue with GCN - hash[8] = dec_hash + hash_value[8] = dec_hash tensor = torch.FloatTensor(np.array(features, dtype=float)) return tensor diff --git a/EcoreToMatrixConverter.py b/EcoreToMatrixConverter.py index b9aa2fe..eb8527c 100644 --- a/EcoreToMatrixConverter.py +++ b/EcoreToMatrixConverter.py @@ -122,7 +122,7 @@ def get_encoded_library_flags(self): return self.encoded_lib_flags def get_node_features(self): - """returns all of the node features: (ohe enc) node types, hashed names, and (ohe enc) library flags""" + """returns all the node features: (ohe enc) node types, hashed names, and (ohe enc) library flags""" return self.node_features def get_adjacency_list(self): @@ -141,7 +141,8 @@ def get_graph_name(self): """Returns the name of the graph.""" return self.typegraph_root.tName - def combine_node_features(self, features): + @staticmethod + def combine_node_features(features): """ Combines encoded node types, hashed names, and library flags into one feature array per node. @@ -153,8 +154,8 @@ def combine_node_features(self, features): """ feature_list = list(features) combined_list = [] - for arr, hash, flag in feature_list: - arr = np.append(arr, hash) + for arr, hash_name, flag in feature_list: + arr = np.append(arr, hash_name) arr = np.append(arr, flag) combined_list.append(arr) return combined_list @@ -169,93 +170,93 @@ def convert_nodes(self, typegraph): typegraph (Resource): The type graph to convert nodes from. """ # convert packages and subpackages - for tpackage in typegraph.packages: + for t_package in typegraph.packages: current_package = self.get_node( - tpackage.tName, NodeTypes.PACKAGE.value) + t_package.tName, NodeTypes.PACKAGE.value) # if package exists but has length 4 it was a subpackage --> different package, same name if current_package is None or len(current_package) == 4: self.node_matrix.append(NodeTypes.PACKAGE.value) self.node_dict[self.node_count] = [ - NodeTypes.PACKAGE.value, tpackage.tName] - if '_ExternalLibrary' in tpackage.tName: + NodeTypes.PACKAGE.value, t_package.tName] + if '_ExternalLibrary' in t_package.tName: self.library_flag.append('true') else: self.library_flag.append('false') - hashed_name = hashlib.md5(tpackage.tName.encode('utf-8')) + hashed_name = hashlib.md5(t_package.tName.encode('utf-8')) self.hashed_names.append(hashed_name.hexdigest()) self.node_count += 1 - if hasattr(tpackage, 'subpackages'): - self.convert_subpackages_recursive(tpackage) + if hasattr(t_package, 'subpackages'): + self.convert_subpackages_recursive(t_package) # convert modules and contained objects - for tmodule in typegraph.modules: + for t_module in typegraph.modules: current_module = self.get_node( - tmodule.location, NodeTypes.MODULE.value) + t_module.location, NodeTypes.MODULE.value) if current_module is None: self.node_matrix.append(NodeTypes.MODULE.value) - if '_ExternalLibrary' in tmodule.location: + if '_ExternalLibrary' in t_module.location: self.library_flag.append('true') else: self.library_flag.append('false') - hashed_name = hashlib.md5(tmodule.location.encode('utf-8')) + hashed_name = hashlib.md5(t_module.location.encode('utf-8')) self.hashed_names.append(hashed_name.hexdigest()) - if tmodule.namespace is not None: - self.node_dict[self.node_count] = [NodeTypes.MODULE.value, tmodule.location, + if t_module.namespace is not None: + self.node_dict[self.node_count] = [NodeTypes.MODULE.value, t_module.location, NodeTypes.PACKAGE.value, - tmodule.namespace.tName] # name of TPackage object + t_module.namespace.tName] # name of TPackage object else: self.node_dict[self.node_count] = [ - NodeTypes.MODULE.value, tmodule.location] + NodeTypes.MODULE.value, t_module.location] self.node_count += 1 - if hasattr(tmodule, 'contains'): + if hasattr(t_module, 'contains'): # can contain TContainableElements (TAbstractType and TMember) - for tobject in tmodule.contains: - if tobject.eClass.name == NodeTypes.CLASS.value: + for t_object in t_module.contains: + if t_object.eClass.name == NodeTypes.CLASS.value: current_class = self.get_node( - tobject.tName, NodeTypes.CLASS.value) + t_object.tName, NodeTypes.CLASS.value) if current_class is None: self.node_matrix.append(NodeTypes.CLASS.value) - self.node_dict[self.node_count] = [NodeTypes.CLASS.value, tobject.tName, - NodeTypes.MODULE.value, tmodule.location] - if hasattr(tobject, 'tLib'): - if tobject.tLib is True: + self.node_dict[self.node_count] = [NodeTypes.CLASS.value, t_object.tName, + NodeTypes.MODULE.value, t_module.location] + if hasattr(t_object, 'tLib'): + if t_object.tLib is True: self.library_flag.append('true') else: self.library_flag.append('false') else: self.library_flag.append('false') hashed_name = hashlib.md5( - tobject.tName.encode('utf-8')) + t_object.tName.encode('utf-8')) self.hashed_names.append( hashed_name.hexdigest()) self.node_count += 1 - if hasattr(tobject, 'childClasses'): - self.convert_childClasses(tobject) - if hasattr(tobject, 'defines'): - self.convert_defined_methods(tobject) - if tobject.eClass.name == NodeTypes.METHOD_DEFINITION.value: + if hasattr(t_object, 'childClasses'): + self.convert_child_classes(t_object) + if hasattr(t_object, 'defines'): + self.convert_defined_methods(t_object) + if t_object.eClass.name == NodeTypes.METHOD_DEFINITION.value: self.convert_method_definitions( - tobject, NodeTypes.MODULE.value, tmodule.location) + t_object, NodeTypes.MODULE.value, t_module.location) # convert methods and contained objects - for tmethod in typegraph.methods: + for t_method in typegraph.methods: self.node_matrix.append(NodeTypes.METHOD.value) self.node_dict[self.node_count] = [ - NodeTypes.METHOD.value, tmethod.tName] - if '_ExternalLibrary' in tmethod.tName: + NodeTypes.METHOD.value, t_method.tName] + if '_ExternalLibrary' in t_method.tName: self.library_flag.append('true') else: self.library_flag.append('false') - hashed_name = hashlib.md5(tmethod.tName.encode('utf-8')) + hashed_name = hashlib.md5(t_method.tName.encode('utf-8')) self.hashed_names.append(hashed_name.hexdigest()) self.node_count += 1 - node_name = tmethod.tName - for tobject in tmethod.signatures: + node_name = t_method.tName + for t_object in t_method.signatures: node_name += '_signature' signature_name = node_name self.node_matrix.append(NodeTypes.METHOD_SIGNATURE.value) self.node_dict[self.node_count] = [NodeTypes.METHOD_SIGNATURE.value, node_name, NodeTypes.METHOD.value, - tmethod.tName] + t_method.tName] if '_ExternalLibrary' in node_name: self.library_flag.append('true') else: @@ -263,15 +264,15 @@ def convert_nodes(self, typegraph): hashed_name = hashlib.md5(node_name.encode('utf-8')) self.hashed_names.append(hashed_name.hexdigest()) self.node_count += 1 - if hasattr(tobject, 'parameters'): + if hasattr(t_object, 'parameters'): node_name += '_param' - for p, tparam in enumerate(tobject.parameters): + for p, t_param in enumerate(t_object.parameters): param_counter = p + 1 current_param = str(param_counter) param_name = node_name + current_param # check for next parameter to save info for edges later - if tparam.next is None: + if t_param.next is None: self.node_matrix.append(NodeTypes.PARAMETER.value) self.node_dict[self.node_count] = [NodeTypes.PARAMETER.value, param_name, NodeTypes.METHOD_SIGNATURE.value, signature_name] @@ -285,7 +286,7 @@ def convert_nodes(self, typegraph): self.hashed_names.append(hashed_name.hexdigest()) self.node_count += 1 - if tparam.next is not None: + if t_param.next is not None: # create name of the next parameter next_param_counter = param_counter + 1 next_param = str(next_param_counter) @@ -304,65 +305,65 @@ def convert_nodes(self, typegraph): self.node_count += 1 # convert classes and contained objects - for tclass in typegraph.classes: - current_class = self.get_node(tclass.tName, NodeTypes.CLASS.value) + for t_class in typegraph.classes: + current_class = self.get_node(t_class.tName, NodeTypes.CLASS.value) if current_class is None: self.node_matrix.append(NodeTypes.CLASS.value) self.node_dict[self.node_count] = [ - NodeTypes.CLASS.value, tclass.tName] - # TClass objects have extra flag, instead of checking via name - if hasattr(tclass, 'tLib'): - if tclass.tLib is True: + NodeTypes.CLASS.value, t_class.tName] + # TClass objects have an extra Flag, instead of checking via name + if hasattr(t_class, 'tLib'): + if t_class.tLib is True: self.library_flag.append('true') else: self.library_flag.append('false') else: self.library_flag.append('false') - hashed_name = hashlib.md5(tclass.tName.encode('utf-8')) + hashed_name = hashlib.md5(t_class.tName.encode('utf-8')) self.hashed_names.append(hashed_name.hexdigest()) self.node_count += 1 - if hasattr(tclass, 'childClasses'): - self.convert_childClasses(tclass) - if hasattr(tclass, 'defines'): - self.convert_defined_methods(tclass) + if hasattr(t_class, 'childClasses'): + self.convert_child_classes(t_class) + if hasattr(t_class, 'defines'): + self.convert_defined_methods(t_class) - def convert_subpackages_recursive(self, tpackage): + def convert_subpackages_recursive(self, t_package): """ Recursively converts subpackages of a given package into the node matrix. Args: - tpackage: The package to convert subpackages from. + t_package: The package to convert subpackages from. """ - for tsubpackage in tpackage.subpackages: - current_subpackage = self.get_node_in_container(tsubpackage.tName, NodeTypes.PACKAGE.value, tpackage.tName, + for t_subpackage in t_package.subpackages: + current_subpackage = self.get_node_in_container(t_subpackage.tName, NodeTypes.PACKAGE.value, t_package.tName, NodeTypes.PACKAGE.value) if current_subpackage is None: self.node_matrix.append(NodeTypes.PACKAGE.value) - self.node_dict[self.node_count] = [NodeTypes.PACKAGE.value, tsubpackage.tName, NodeTypes.PACKAGE.value, - tpackage.tName] # save type and name for edge info - if '_ExternalLibrary' in tsubpackage.tName: + self.node_dict[self.node_count] = [NodeTypes.PACKAGE.value, t_subpackage.tName, NodeTypes.PACKAGE.value, + t_package.tName] # save type and name for edge info + if '_ExternalLibrary' in t_subpackage.tName: self.library_flag.append('true') else: self.library_flag.append('false') - hashed_name = hashlib.md5(tsubpackage.tName.encode('utf-8')) + hashed_name = hashlib.md5(t_subpackage.tName.encode('utf-8')) self.hashed_names.append(hashed_name.hexdigest()) self.node_count += 1 - if hasattr(tsubpackage, 'subpackages'): - self.convert_subpackages_recursive(tsubpackage) + if hasattr(t_subpackage, 'subpackages'): + self.convert_subpackages_recursive(t_subpackage) # classes have only one attribute childClasses # checking recursively will result in potential endless loop, without these child classes existing in the xmi file - def convert_childClasses(self, tclass): + def convert_child_classes(self, t_class): """ Converts child classes of a given class into the node matrix. Args: - tclass: The class to convert child classes from. + t_class: The class to convert child classes from. """ - for child in tclass.childClasses: + for child in t_class.childClasses: self.node_matrix.append(NodeTypes.CLASS.value) self.node_dict[self.node_count] = [ - NodeTypes.CLASS.value, child.tName, NodeTypes.CLASS.value, tclass.tName] + NodeTypes.CLASS.value, child.tName, NodeTypes.CLASS.value, t_class.tName] if hasattr(child, 'tLib'): if child.tLib is True: self.library_flag.append('true') @@ -376,61 +377,61 @@ def convert_childClasses(self, tclass): if hasattr(child, 'defines'): self.convert_defined_methods(child) - def convert_defined_methods(self, tclass): + def convert_defined_methods(self, t_class): """ Converts TMethod objects that are defined within a class. Args: - tclass: The class containing defined methods to convert. + t_class: The class containing defined methods to convert. """ - for tobject in tclass.defines: - if tobject.eClass.name == NodeTypes.METHOD_DEFINITION.value: + for t_object in t_class.defines: + if t_object.eClass.name == NodeTypes.METHOD_DEFINITION.value: self.convert_method_definitions( - tobject, NodeTypes.CLASS.value, tclass.tName) + t_object, NodeTypes.CLASS.value, t_class.tName) - def convert_method_definitions(self, t_meth_def, container_type, tcontainer_name): + def convert_method_definitions(self, t_meth_def, container_type, t_container_name): """ Converts TMethodDefinition objects and contained call objects. Args: t_meth_def: The method definition to convert. container_type: The type of the container (e.g., class or module). - tcontainer_name: The name of the container. + t_container_name: The name of the container. """ - tobject_name = t_meth_def.signature.method.tName - tobject_name += '_definition' + t_object_name = t_meth_def.signature.method.tName + t_object_name += '_definition' self.node_matrix.append(NodeTypes.METHOD_DEFINITION.value) - self.node_dict[self.node_count] = [NodeTypes.METHOD_DEFINITION.value, tobject_name, container_type, - tcontainer_name] - if '_ExternalLibrary' in tobject_name: + self.node_dict[self.node_count] = [NodeTypes.METHOD_DEFINITION.value, t_object_name, container_type, + t_container_name] + if '_ExternalLibrary' in t_object_name: self.library_flag.append('true') else: self.library_flag.append('false') - hashed_name = hashlib.md5(tobject_name.encode('utf-8')) + hashed_name = hashlib.md5(t_object_name.encode('utf-8')) self.hashed_names.append(hashed_name.hexdigest()) self.node_count += 1 if hasattr(t_meth_def, 'accessing'): - self.convert_call(t_meth_def, tobject_name) + self.convert_call(t_meth_def, t_object_name) - def convert_call(self, tmethod_def, tmethod_def_name): + def convert_call(self, t_method_def, t_method_def_name): """ Converts call objects contained in TMethodDefinition objects. Args: - tmethod_def: The method definition containing call objects. - tmethod_def_name: The name of the method definition. + t_method_def: The method definition containing call objects. + t_method_def_name: The name of the method definition. """ - call_source = tmethod_def_name - tmethod_def_name += '_call' - for c, call in enumerate(tmethod_def.accessing): - methoddef_target = call.target - if methoddef_target is not None: + call_source = t_method_def_name + t_method_def_name += '_call' + for c, call in enumerate(t_method_def.accessing): + method_def_target = call.target + if method_def_target is not None: # name of the TMethod object that's being called - target_name = methoddef_target.signature.method.tName + target_name = method_def_target.signature.method.tName # create a name for the call object call_counter = c + 1 calls = str(call_counter) - current_call = tmethod_def_name + calls + current_call = t_method_def_name + calls self.node_matrix.append(NodeTypes.CALL.value) self.node_dict[self.node_count] = [NodeTypes.CALL.value, current_call, 'Source', call_source, 'Target', target_name] @@ -443,31 +444,31 @@ def convert_call(self, tmethod_def, tmethod_def_name): self.hashed_names.append(hashed_name.hexdigest()) self.node_count += 1 - def get_node(self, node_name, type): + def get_node(self, node_name, node_type): """ Checks if a node already exists by comparing node type and name. Args: node_name (str): The name of the node to check. - type (str): The type of the node to check. + node_type (str): The type of the node to check. Returns: The node if it exists, otherwise None. """ for current_node in self.node_dict: node = self.node_dict[current_node] - if node[0] == type: + if node[0] == node_type: if node[1] == node_name: return node return None - def get_node_in_container(self, node_name, type, parent_name, parent_type): + def get_node_in_container(self, node_name, node_type, parent_name, parent_type): """ Checks for nodes with the same name but different parents/container objects. Args: node_name (str): The name of the node to check. - type (str): The type of the node to check. + node_type (str): The type of the node to check. parent_name (str): The name of the parent/container. parent_type (str): The type of the parent/container. @@ -477,7 +478,7 @@ def get_node_in_container(self, node_name, type, parent_name, parent_type): for current_node in self.node_dict: node = self.node_dict[current_node] if len(node) >= 4: - if node[0] == type: + if node[0] == node_type: if node[1] == node_name: if node[2] == parent_type: if node[3] == parent_name: diff --git a/GCN.py b/GCN.py index 126bded..d0c87ea 100644 --- a/GCN.py +++ b/GCN.py @@ -1,18 +1,16 @@ import torch -import torch.nn.functional as F -from torch_geometric.nn import GATConv -from torch_geometric.nn import global_mean_pool +import torch.nn.functional as f +import torch_geometric.nn '''defines the architecture of the graph convolutional network''' - class GCN(torch.nn.Module): def __init__(self, num_node_features, num_classes, hidden_channels): super(GCN, self).__init__() torch.manual_seed(12345) - self.conv1 = GATConv(num_node_features, hidden_channels) - self.conv2 = GATConv(hidden_channels, hidden_channels) - self.conv3 = GATConv(hidden_channels, num_classes) + self.conv1 = torch_geometric.nn.GATConv(num_node_features, hidden_channels) + self.conv2 = torch_geometric.nn.GATConv(hidden_channels, hidden_channels) + self.conv3 = torch_geometric.nn.GATConv(hidden_channels, num_classes) '''x is node feature matrix with shape x=[N, 11], N=number of nodes edge_index is sparse edge matrix with shape edge_index=[2, E], E=number of edges @@ -28,11 +26,11 @@ def forward(self, x, edge_index, edge_attr, batch=None): x = x.relu() # readout layer - x = global_mean_pool(x, batch) + x = torch_geometric.nn.global_mean_pool(x, batch) # dropout for regularization - x = F.dropout(x, p=0.5, training=self.training) + x = f.dropout(x, p=0.5, training=self.training) # sigmoid activation function for multi-label - x = F.sigmoid(x) + x = f.sigmoid(x) - return x + return x \ No newline at end of file diff --git a/Pipeline.py b/Pipeline.py index f773480..b8409fd 100644 --- a/Pipeline.py +++ b/Pipeline.py @@ -5,12 +5,19 @@ from pyecore.resources import ResourceSet, URI from AstToEcoreConverter import ProjectEcoreGraph -from DataformatUtils import convert_edge_dim, convert_list_to_floattensor, convert_list_to_longtensor, \ +from DataformatUtils import convert_edge_dim, convert_list_to_float_tensor, convert_list_to_long_tensor, \ convert_hashed_names_to_float from EcoreToMatrixConverter import EcoreToMatrixConverter '''in this file are the pipeline components put into reusable functions''' +ecore_graph = None +node_features = None +adj_list = None +edge_attr = None +repo_multiprocess = None +edge_attribute = None + def create_output_folders(directory): """ @@ -49,7 +56,7 @@ def download_repositories(repository_directory, repository_list): """ working_directory = os.getcwd() - # load labeled repository from excel/ods file + # load labeled repository from Excel/ods file # requirements for format: no empty rows in between and header name html_url resource = pd.read_excel(repository_list) @@ -60,8 +67,8 @@ def download_repositories(repository_directory, repository_list): # retrieve urls and clone repositories for row in resource.iterrows(): - object = row[1] - url = object.get('html_url') + row_data = row[1] + url = row_data.get('html_url') os.system(f'git clone {url}') # change working directory back to github-classifier, otherwise cannot load resources from there and run tool @@ -88,6 +95,7 @@ def create_ecore_graphs(repository, write_in_file, output_directory=None): Exception: Any exceptions raised during the graph creation process will be printed, and the function will skip the problematic repository. """ + global ecore_graph skip_counter = 0 resource_set = ResourceSet() if os.path.isdir(repository): @@ -121,7 +129,7 @@ def create_ecore_graphs(repository, write_in_file, output_directory=None): return None -def create_matrix_structure(write_in_file, xmi_file=None, ecore_graph=None, output_directory=None): +def create_matrix_structure(write_in_file, xmi_file=None, local_ecore_graph=None, output_directory=None): """ Convert an Ecore graph or XMI file into three matrices: node features, adjacency list, and edge attributes. @@ -133,7 +141,7 @@ def create_matrix_structure(write_in_file, xmi_file=None, ecore_graph=None, outp Args: write_in_file: A boolean indicating whether to write the matrices to files. xmi_file: Optional; the name of the XMI file to be processed. - ecore_graph: Optional; the Ecore graph to be converted into matrices. + local_ecore_graph: Optional; the Ecore graph to be converted into matrices. output_directory: The directory where output files will be written. Returns: @@ -147,15 +155,16 @@ def create_matrix_structure(write_in_file, xmi_file=None, ecore_graph=None, outp Exception: Any exceptions raised during the conversion process will be printed, and the function will skip the problematic XMI file if applicable. """ + global node_features, adj_list, edge_attr skip_xmi = 0 if write_in_file is True: - rset = ResourceSet() - resource = rset.get_resource(URI('Basic.ecore')) + resource_set = ResourceSet() + resource = resource_set.get_resource(URI('Basic.ecore')) mm_root = resource.contents[0] - rset.metamodel_registry[mm_root.nsURI] = mm_root + resource_set.metamodel_registry[mm_root.nsURI] = mm_root - resource = rset.get_resource( + resource = resource_set.get_resource( URI(f'{output_directory}/xmi_files/{xmi_file}')) try: EcoreToMatrixConverter( @@ -166,7 +175,7 @@ def create_matrix_structure(write_in_file, xmi_file=None, ecore_graph=None, outp skip_xmi += 1 if write_in_file is False: try: - matrix = EcoreToMatrixConverter(ecore_graph, write_in_file) + matrix = EcoreToMatrixConverter(local_ecore_graph, write_in_file) node_features = matrix.get_node_features() adj_list = matrix.get_adjacency_list() edge_attr = matrix.get_encoded_edge_attributes() @@ -224,9 +233,9 @@ def prepare_dataset(repository_directory, output_directory=None, repository_list Exception: If no repositories are found or if the output directory is missing when processing multiple repositories. """ - node_features = None - adj_list = None - edge_attr = None + global repo_multiprocess, ecore_graph + global node_features, adj_list, edge_attribute + # clone repositories for the dataset if repository_list is not None: @@ -248,7 +257,7 @@ def prepare_dataset(repository_directory, output_directory=None, repository_list print(e) # exit program because of missing output directory exit('output directory is required!') - # create pool for multiprocessing/parallelisation + # create pool for multiprocessing/parallelization repo_multiprocess = [] for repository in repositories: current_directory = os.path.join(repository_directory, repository) @@ -273,14 +282,14 @@ def prepare_dataset(repository_directory, output_directory=None, repository_list (write_in_file, xmi_file, None, output_directory)) parallel_processing(create_matrix_structure, xmi_multiprocess) else: - node_features, adj_list, edge_attr = create_matrix_structure( + node_features, adj_list, edge_attribute = create_matrix_structure( write_in_file, None, ecore_graph) # if only one repository is converted for classification, adjust data format needed by the gcn - if node_features is not None and adj_list is not None and edge_attr is not None: + if node_features is not None and adj_list is not None and edge_attribute is not None: node_features = convert_hashed_names_to_float(node_features) - adj_list = convert_list_to_longtensor(adj_list) + adj_list = convert_list_to_long_tensor(adj_list) adj_list = convert_edge_dim(adj_list) - edge_attr = convert_list_to_floattensor(edge_attr) + edge_attribute = convert_list_to_float_tensor(edge_attribute) - return node_features, adj_list, edge_attr + return node_features, adj_list, edge_attribute diff --git a/pep8autoformat.py b/pep8autoformat.py index 39d58f4..374f77a 100644 --- a/pep8autoformat.py +++ b/pep8autoformat.py @@ -1,21 +1,21 @@ import autopep8 -def format_python_file(file_path): +def format_python_file(path_to_file): try: # Read the current content of the file - with open(file_path, 'r') as file: + with open(path_to_file, 'r') as file: code = file.read() # Format the code using autopep8 formatted_code = autopep8.fix_code(code) # Write the formatted code back to the file - with open(file_path, 'w') as file: + with open(path_to_file, 'w') as file: file.write(formatted_code) - print(f"Formatted '{file_path}' successfully.") + print(f"Formatted '{path_to_file}' successfully.") except Exception as e: - print(f"Error formatting '{file_path}': {e}") + print(f"Error formatting '{path_to_file}': {e}") if __name__ == "__main__": diff --git a/tests/test_ETM_Converter.py b/tests/test_ETM_Converter.py index 86fd800..33945c0 100644 --- a/tests/test_ETM_Converter.py +++ b/tests/test_ETM_Converter.py @@ -359,7 +359,7 @@ def test_multiple_parameter(self): ecore_graph = graph.get_graph() matrix = EcoreToMatrixConverter(ecore_graph, False) node_features = matrix.get_node_matrix() - enc_node_features = matrix.get_encoded_node_matrix() + #enc_node_features = matrix.get_encoded_node_matrix() edges = matrix.get_adjacency_list() edge_attributes = matrix.get_edge_attributes() lib_flags = matrix.get_external_library_flags() @@ -387,7 +387,7 @@ def test_module_internal_method_call(self): ecore_graph = graph.get_graph() matrix = EcoreToMatrixConverter(ecore_graph, False) node_features = matrix.get_node_matrix() - enc_node_features = matrix.get_encoded_node_matrix() + #enc_node_features = matrix.get_encoded_node_matrix() edges = matrix.get_adjacency_list() edge_attributes = matrix.get_edge_attributes() lib_flags = matrix.get_external_library_flags() @@ -483,9 +483,9 @@ def test_internal_method_imports_package(self): ecore_graph = graph.get_graph() matrix = EcoreToMatrixConverter(ecore_graph, False) node_features = matrix.get_node_matrix() - enc_node_features = matrix.get_encoded_node_matrix() + #enc_node_features = matrix.get_encoded_node_matrix() edges = matrix.get_adjacency_list() - edge_attributes = matrix.get_edge_attributes() + #edge_attributes = matrix.get_edge_attributes() lib_flags = matrix.get_external_library_flags() self.assertEqual(lib_flags[2], 'false', 'library flag for internal object is wrong') @@ -509,9 +509,9 @@ def test_internal_method_class_imports_package(self): ecore_graph = graph.get_graph() matrix = EcoreToMatrixConverter(ecore_graph, False) node_features = matrix.get_node_matrix() - enc_node_features = matrix.get_encoded_node_matrix() + #enc_node_features = matrix.get_encoded_node_matrix() edges = matrix.get_adjacency_list() - edge_attributes = matrix.get_edge_attributes() + #edge_attributes = matrix.get_edge_attributes() lib_flags = matrix.get_external_library_flags() self.assertEqual(lib_flags[2], 'false', 'library flag for internal object is wrong') @@ -537,9 +537,9 @@ def test_module_internal_class_call(self): ecore_graph = graph.get_graph() matrix = EcoreToMatrixConverter(ecore_graph, False) node_features = matrix.get_node_matrix() - enc_node_features = matrix.get_encoded_node_matrix() + #enc_node_features = matrix.get_encoded_node_matrix() edges = matrix.get_adjacency_list() - edge_attributes = matrix.get_edge_attributes() + #edge_attributes = matrix.get_edge_attributes() lib_flags = matrix.get_external_library_flags() self.assertEqual(lib_flags[1], 'false', 'library flag for internal object is wrong') @@ -565,9 +565,9 @@ def test_class_internal_method_call(self): ecore_graph = graph.get_graph() matrix = EcoreToMatrixConverter(ecore_graph, False) node_features = matrix.get_node_matrix() - enc_node_features = matrix.get_encoded_node_matrix() + #enc_node_features = matrix.get_encoded_node_matrix() edges = matrix.get_adjacency_list() - edge_attributes = matrix.get_edge_attributes() + #edge_attributes = matrix.get_edge_attributes() lib_flags = matrix.get_external_library_flags() self.assertEqual(lib_flags[1], 'false', 'library flag for internal object is wrong') @@ -593,9 +593,9 @@ def test_internal_method_imports_multiple_packages(self): ecore_graph = graph.get_graph() matrix = EcoreToMatrixConverter(ecore_graph, False) node_features = matrix.get_node_matrix() - enc_node_features = matrix.get_encoded_node_matrix() + #enc_node_features = matrix.get_encoded_node_matrix() edges = matrix.get_adjacency_list() - edge_attributes = matrix.get_edge_attributes() + #edge_attributes = matrix.get_edge_attributes() lib_flags = matrix.get_external_library_flags() self.assertEqual(lib_flags[0], 'false', 'library flag for internal object is wrong') @@ -625,9 +625,9 @@ def test_internal_method_class_imports_multiple_packages(self): ecore_graph = graph.get_graph() matrix = EcoreToMatrixConverter(ecore_graph, False) node_features = matrix.get_node_matrix() - enc_node_features = matrix.get_encoded_node_matrix() + #enc_node_features = matrix.get_encoded_node_matrix() edges = matrix.get_adjacency_list() - edge_attributes = matrix.get_edge_attributes() + #edge_attributes = matrix.get_edge_attributes() lib_flags = matrix.get_external_library_flags() self.assertEqual(lib_flags[0], 'false', 'library flag for internal object is wrong') @@ -659,9 +659,9 @@ def test_call_external_library(self): ecore_graph = graph.get_graph() matrix = EcoreToMatrixConverter(ecore_graph, False) node_features = matrix.get_node_matrix() - enc_node_features = matrix.get_encoded_node_matrix() + #enc_node_features = matrix.get_encoded_node_matrix() edges = matrix.get_adjacency_list() - edge_attributes = matrix.get_edge_attributes() + #edge_attributes = matrix.get_edge_attributes() lib_flags = matrix.get_external_library_flags() self.assertEqual(len(lib_flags), len(node_features), 'wrong number of ext. library flags') @@ -691,9 +691,9 @@ def test_call_external_library_submodule(self): ecore_graph = graph.get_graph() matrix = EcoreToMatrixConverter(ecore_graph, False) node_features = matrix.get_node_matrix() - enc_node_features = matrix.get_encoded_node_matrix() + #enc_node_features = matrix.get_encoded_node_matrix() edges = matrix.get_adjacency_list() - edge_attributes = matrix.get_edge_attributes() + #edge_attributes = matrix.get_edge_attributes() lib_flags = matrix.get_external_library_flags() self.assertEqual(len(lib_flags), len(node_features), 'wrong number of ext. library flags') @@ -738,9 +738,9 @@ def test_call_external_library_class(self): ecore_graph = graph.get_graph() matrix = EcoreToMatrixConverter(ecore_graph, False) node_features = matrix.get_node_matrix() - enc_node_features = matrix.get_encoded_node_matrix() + #enc_node_features = matrix.get_encoded_node_matrix() edges = matrix.get_adjacency_list() - edge_attributes = matrix.get_edge_attributes() + #edge_attributes = matrix.get_edge_attributes() lib_flags = matrix.get_external_library_flags() self.assertEqual(len(lib_flags), len(node_features), 'wrong number of ext. library flags') @@ -772,9 +772,9 @@ def test_call_external_library_class_multiple_methods(self): ecore_graph = graph.get_graph() matrix = EcoreToMatrixConverter(ecore_graph, False) node_features = matrix.get_node_matrix() - enc_node_features = matrix.get_encoded_node_matrix() + #enc_node_features = matrix.get_encoded_node_matrix() edges = matrix.get_adjacency_list() - edge_attributes = matrix.get_edge_attributes() + #edge_attributes = matrix.get_edge_attributes() lib_flags = matrix.get_external_library_flags() self.assertEqual(lib_flags[9], 'true', 'flag for import is wrong') @@ -810,9 +810,9 @@ def test_call_external_library_multiple_methods(self): ecore_graph = graph.get_graph() matrix = EcoreToMatrixConverter(ecore_graph, False) node_features = matrix.get_node_matrix() - enc_node_features = matrix.get_encoded_node_matrix() + #enc_node_features = matrix.get_encoded_node_matrix() edges = matrix.get_adjacency_list() - edge_attributes = matrix.get_edge_attributes() + #edge_attributes = matrix.get_edge_attributes() lib_flags = matrix.get_external_library_flags() self.assertEqual(lib_flags[7], 'true', 'flag for import is wrong') @@ -848,9 +848,9 @@ def test_call_external_library_multiple_modules_same_package(self): ecore_graph = graph.get_graph() matrix = EcoreToMatrixConverter(ecore_graph, False) node_features = matrix.get_node_matrix() - enc_node_features = matrix.get_encoded_node_matrix() + #enc_node_features = matrix.get_encoded_node_matrix() edges = matrix.get_adjacency_list() - edge_attributes = matrix.get_edge_attributes() + #edge_attributes = matrix.get_edge_attributes() lib_flags = matrix.get_external_library_flags() self.assertEqual(lib_flags[1], 'true', 'library flag for imported object is wrong') @@ -895,9 +895,9 @@ def test_call_external_library_multiple_subpackages(self): ecore_graph = graph.get_graph() matrix = EcoreToMatrixConverter(ecore_graph, False) node_features = matrix.get_node_matrix() - enc_node_features = matrix.get_encoded_node_matrix() + #enc_node_features = matrix.get_encoded_node_matrix() edges = matrix.get_adjacency_list() - edge_attributes = matrix.get_edge_attributes() + #edge_attributes = matrix.get_edge_attributes() lib_flags = matrix.get_external_library_flags() self.assertEqual(lib_flags[0], 'true', 'library flag for imported object is wrong') @@ -952,11 +952,11 @@ def test_hashed_names(self): ecore_graph = graph.get_graph() matrix = EcoreToMatrixConverter(ecore_graph, False) node_features = matrix.get_node_matrix() - enc_node_features = matrix.get_encoded_node_matrix() + #enc_node_features = matrix.get_encoded_node_matrix() full_features = matrix.get_node_features() - edges = matrix.get_adjacency_list() - edge_attributes = matrix.get_edge_attributes() - lib_flags = matrix.get_external_library_flags() + #edges = matrix.get_adjacency_list() + #edge_attributes = matrix.get_edge_attributes() + #lib_flags = matrix.get_external_library_flags() self.assertEqual(len(node_features), len(full_features), 'number of nodes in type and type+hash is not equal') for item in full_features: diff --git a/train.py b/train.py index 95dc88b..8c2f3e9 100644 --- a/train.py +++ b/train.py @@ -16,14 +16,13 @@ '''please prepare the dataset you want to train the tool with by using prepareDataset.py, this file is for training the tool''' -# path to the folder containing the converted repositories +# Load settings from Config output_directory = CONFIG['training']['output_directory'] -labels = CONFIG['main']['repository_list_file'] +labels = CONFIG['training']['labels_file'] n_epoch = CONFIG['training']['n_epoch'] -k_folds = CONFIG['training']['k_folds'] # has to be at least 2 +k_folds = CONFIG['training']['k_folds'] learning_rate = CONFIG['training']['learning_rate'] figure_output = CONFIG['training']['figure_output'] -# value above which label is considered to be predicted by model threshold = CONFIG['training']['threshold'] save_classification_reports = CONFIG['training']['save_classification_reports'] experiment_name = CONFIG['training']['experiment_name'] @@ -61,6 +60,7 @@ def train(): def test(loader): + global class_report, report_dict model.eval() loss_test = 0 total = 0 @@ -89,23 +89,17 @@ def test(loader): graph.y = graph.y.cpu().detach().numpy() # transform output, if value above threshold label is considered to be predicted - trafo_output = [] - for slice in output: - new_item = [] - for item in slice: - if item >= threshold: - new_item.append(1.0) - else: - new_item.append(0.0) - trafo_output.append(new_item) - trafo_output = np.reshape(trafo_output, (size, num_classes)) + output = np.array(output) + # Transform output based on the threshold T -> 1 , F -> 0 + output_after_threshold = (output >= threshold).astype(float) + # Reshape the output to the desired shape + output_after_threshold = output_after_threshold.reshape((size, num_classes)) # better evaluation metrics for multilabel: precision, recall, f1_score # report is string, dict to extract results - report_dict = classification_report( - graph.y, trafo_output, target_names=defined_labels, output_dict=True) - class_report = classification_report( - graph.y, trafo_output, target_names=defined_labels) + report_dict = classification_report(graph.y, output_after_threshold, target_names=defined_labels, + output_dict=True) + class_report = classification_report(graph.y, output_after_threshold, target_names=defined_labels) return loss_test / total, class_report, report_dict @@ -148,6 +142,7 @@ def test(loader): trainset, testset = random_split(dataset, [0.9, 0.1]) print( f'size of train dataset: {len(trainset)}, test dataset: {len(testset)}') + trainloader = DataLoader(trainset, batch_size=32, shuffle=True) testloader = DataLoader(testset, batch_size=32, shuffle=False) print( @@ -174,15 +169,16 @@ def test(loader): mlflow.log_params(params) for epoch in range(n_epoch): - print(f'Fold {f}, Epoch {epoch}') + print( + f'Fold {f}, Epoch {epoch}') train() train_loss, train_report, train_rep_dict = test(trainloader) test_loss, test_report, test_rep_dict = test(testloader) # log loss metrics = {"training loss": train_loss, "test loss": test_loss} - # one folder per fold, because metrics needs to be key value pairs not dicts - mlflow.log_metrics(metrics, step=epoch) + mlflow.log_metrics(metrics, + step=epoch) # one folder per fold, because metrics needs to be key value pairs not dicts reports[f'Fold_{f}_Epoch_{epoch}_train'] = train_report reports[f'Fold_{f}_Epoch_{epoch}_test'] = test_report @@ -227,8 +223,7 @@ def test(loader): print(f'f1-score of plugin during testing: {plugin_f1_test}') av_test = test_rep_dict['weighted avg'] f1_test = av_test['f1-score'] - print( - f'weighted average of labels (f1-score) during testing: {f1_test}') + print(f'weighted average of labels (f1-score) during testing: {f1_test}') print('==============================================') # save trained model with best performance @@ -248,7 +243,7 @@ def test(loader): # plot visualization for training fig_n = f + k_folds + 1 # so figures are separate for training and testing fig = plt.figure(fig_n) - fig, (ax1, ax2) = plt.subplots(2) + _, (ax1, ax2) = plt.subplots(2) fig.suptitle(f'Fold {f}') ax1.plot(plt_epoch, plt_train_loss, 'k', label='test loss') ax1.set(ylabel='train loss') @@ -262,7 +257,7 @@ def test(loader): # plot visualization for testing fig = plt.figure(f) - fig, (ax1, ax2) = plt.subplots(2) + _, (ax1, ax2) = plt.subplots(2) fig.suptitle(f'Fold {f}') ax1.plot(plt_epoch, plt_test_loss, 'k', label='test loss') ax1.set(ylabel='test loss')