diff --git a/AstToEcoreConverter.py b/AstToEcoreConverter.py index 56bcd4c..a827618 100644 --- a/AstToEcoreConverter.py +++ b/AstToEcoreConverter.py @@ -1099,7 +1099,7 @@ def get_method_def_from_internal_structure(self, method_name, module): return current_method[0] return None - def create_method_signature(self, method_node, name, arguments, return_type = None): + def create_method_signature(self, method_node, name, arguments, return_type=None): """ Creates a method signature for a method definition. @@ -1134,7 +1134,7 @@ def create_method_signature(self, method_node, name, arguments, return_type = No # Add type for TParameter.type parameter_type = self.create_ecore_instance(NodeTypes.CLASS) - #parameter_type.tName = arg.annotation if arg.annotation else 'None' + # parameter_type.tName = arg.annotation if arg.annotation else 'None' parameter.type = parameter_type method_node.signature = method_signature diff --git a/CustomDataset.py b/CustomDataset.py index 130a63f..ebef310 100644 --- a/CustomDataset.py +++ b/CustomDataset.py @@ -9,7 +9,7 @@ from DataformatUtils import convert_edge_dim, convert_list_to_float_tensor, convert_list_to_long_tensor, \ convert_hashed_names_to_float from Encoder import multi_hot_encoding -from GraphClasses import defined_labels +from settings import CONFIG class RepositoryDataset(Dataset): @@ -30,7 +30,8 @@ def __init__(self, directory, label_list=None): print(e) # nodes have 11 features, their one hot encoded node type, hashed name, and one hot encoded library flag self.num_node_features = 11 - self.num_classes = len(defined_labels) + self.defined_labels = CONFIG['graph']['defined_labels'] + self.num_classes = len(self.defined_labels) self.directory = directory self.graph_names = [] self.graph_dir = os.listdir(directory) @@ -162,7 +163,7 @@ def convert_labeled_graphs(self, labels): graph_labels) # count how many repos are in each class # encode labels - encoded_nodes = multi_hot_encoding(defined_labels, graph_labels) + encoded_nodes = 
multi_hot_encoding(self.defined_labels, graph_labels) file = zip(graph_names, encoded_nodes) return file diff --git a/EcoreToMatrixConverter.py b/EcoreToMatrixConverter.py index eb8527c..badd41b 100644 --- a/EcoreToMatrixConverter.py +++ b/EcoreToMatrixConverter.py @@ -335,7 +335,8 @@ def convert_subpackages_recursive(self, t_package): t_package: The package to convert subpackages from. """ for t_subpackage in t_package.subpackages: - current_subpackage = self.get_node_in_container(t_subpackage.tName, NodeTypes.PACKAGE.value, t_package.tName, + current_subpackage = self.get_node_in_container(t_subpackage.tName, NodeTypes.PACKAGE.value, + t_package.tName, NodeTypes.PACKAGE.value) if current_subpackage is None: self.node_matrix.append(NodeTypes.PACKAGE.value) diff --git a/GCN.py b/GCN.py index d0c87ea..0609c9e 100644 --- a/GCN.py +++ b/GCN.py @@ -4,6 +4,7 @@ '''defines the architecture of the graph convolutional network''' + class GCN(torch.nn.Module): def __init__(self, num_node_features, num_classes, hidden_channels): super(GCN, self).__init__() @@ -33,4 +34,4 @@ def forward(self, x, edge_index, edge_attr, batch=None): # sigmoid activation function for multi-label x = f.sigmoid(x) - return x \ No newline at end of file + return x diff --git a/GraphClasses.py b/GraphClasses.py deleted file mode 100644 index 43ea481..0000000 --- a/GraphClasses.py +++ /dev/null @@ -1,3 +0,0 @@ -"""in this file are the defined labels for our dataset, they are not mutually exclusive""" -from settings import CONFIG -defined_labels = CONFIG['graph']['defined_labels'] diff --git a/NodeFeatures.py b/NodeFeatures.py index c61cf6d..7182f06 100644 --- a/NodeFeatures.py +++ b/NodeFeatures.py @@ -17,20 +17,20 @@ class NodeTypes(Enum): CLASS = "TClass" # TMethod METHOD = "TMethod" - METHOD_SIGNATURE = "TMethodSignature" # missing firstParameter does not need to be implemented. - METHOD_DEFINITION = "TMethodDefinition"# missing "".overloading and "".overloadedBY does not need to be implemented. 
+ METHOD_SIGNATURE = "TMethodSignature" # missing firstParameter does not need to be implemented. + METHOD_DEFINITION = "TMethodDefinition" # missing "".overloading and "".overloadedBY does not need to be implemented. PARAMETER = "TParameter" # TField FIELD = "TField" - FIELD_SIGNATURE = "TFieldSignature" # Todo implement this in AstToEcoreConverter (only missing TFieldSignature.type) - FIELD_DEFINITION = "TFieldDefinition" # missing TFieldDefinition.hidden and "".hiddenBy does not to be implemented + FIELD_SIGNATURE = "TFieldSignature" # Todo implement this in AstToEcoreConverter (only missing TFieldSignature.type) + FIELD_DEFINITION = "TFieldDefinition" # missing TFieldDefinition.hidden and "".hiddenBy does not to be implemented # TAccess CALL = "TCall" READ = "TRead" # Todo implement this in AstToEcoreConverter WRITE = "TWrite" # Todo implement this in AstToEcoreConverter READ_WRITE = "TReadWrite" # Todo implement this in AstToEcoreConverter - #TInterface + # TInterface INTERFACE = "TInterface" # In Python, there is no formal concept of interfaces as found in some other programming languages like Java or C#. # However, Python supports a similar concept through the use of abstract base classes (ABCs) and duck typing. - # The return on investment probably is not sufficient to justify the implementation. \ No newline at end of file + # The return on investment probably is not sufficient to justify the implementation. 
diff --git a/Pipeline.py b/Pipeline.py index 699707b..80f5442 100644 --- a/Pipeline.py +++ b/Pipeline.py @@ -239,7 +239,6 @@ def prepare_dataset(repository_directory, output_directory=None, repository_list global repo_multiprocess, ecore_graph global node_features, adj_list, edge_attribute - # clone repositories for the dataset if repository_list is not None: download_repositories(repository_directory, repository_list) diff --git a/README.md b/README.md index 8d51ede..91752d4 100644 --- a/README.md +++ b/README.md @@ -1,27 +1,70 @@ -# github-classifier +# Classifier for GitHub Repos -**short description** +## Table of Contents +- [Intro](#intro) +- [Installation for Users](#installation-instruction-for-users) +- [Installation for Devs](#installation-instruction-for-devs) +- [Expectation for Devs](#expectation-for-devs) +- [Known Problems / Limitations](#known-problems--limitations) +- [Help](#help) -This repository contains a deep-learning based classification tool for software repositories. The tool utilizes the ecore metamodel 'type graph' and a graph convolutional network. To use the tool, run 'main.py' after adding the directory containing the repositories you want to classify. +## Intro: -If you want to train the tool with different labels, replace the current labels with your own (or add them to the labels) in GraphClasses.py, and in function 'multi_hot_encoding' in Encoder.py. Optionally also in function 'count_class_elements' in CustomDataset.py if you want to know the number of samples in each class in your dataset. -The labels in the tool are not mutually exclusive and are multi-hot encoded. +This repository features a deep learning classifier designed for the analysis of software repositories. +The tool employs the ecore metamodel's 'type graph' in conjunction with a graph convolutional network. +Presently, the classifier categorizes repositories into four distinct classes: Application, Framework, Library, and Plugin. 
+It is important to note that the labels utilized by the tool are **not mutually exclusive** and are represented in a multi-hot encoded format. -Currently, the tool only processes Python files. +## Installation Instruction for Users: +1. Clone the repository by executing the following command: +`git clone https://github.com/isselab/github-classifier.git` +2. Open the cloned repository using your preferred Integrated Development Environment (IDE). +For the purposes of this instruction, we will assume the use of PyCharm from JetBrains. +3. Change the directory to data/input by running the following command: +`cd data/input` +4. Clone the repositories you wish to analyze by executing: +`git clone LINK_TO_REPO_YOU_WANT` +5. Run main.py -**labels** +The default threshold for identification is set at 50%. +If you wish to modify this threshold, please locate the relevant settings in the settings.py file. +After making the necessary adjustments, be sure to rerun main.py to apply the changes. -Application, Framework, Library, Plugin +## Installation Instruction for Devs: -**data** +### Basic Installation: +1. Clone the repository by executing the following command: +`git clone https://github.com/isselab/github-classifier.git` +2. Open the cloned repository using your preferred Integrated Development Environment (IDE). -Dataset with Python software repositories from GitHub, all with a dependency on at least one ML library. -The labeled repositories the tool is trained with are in data/labeled_dataset_repos.xlsx. +### Retraining: +1. Check data/labeled_dataset_repos.xlsx. +This xlsx file contains the labeled repositories the tool is trained with. +You may want to change it according to your needs. +2. We strongly recommend utilizing a GPU for training purposes. +To verify GPU availability, please run the TorchGPUCheck.py script. +If you get the result "CUDA is available!" you may proceed to step 3. 
+If the output indicates that "CUDA is not available," please follow the instructions provided in the terminal. +Additionally, refer to the guide in the [Help](#help) section for further assistance in resolving any issues. +3. Run prepareDataset.py +4. Change the experiment_name in settings.py in the training section. +5. Run train.py -**requirements** -pyecore~=0.14.0 or higher versions +## Expectation for Devs: +### Recommended Workflow: +1. Create an issue on the GitHub issues page. +2. Open a branch named after the issue +3. Write code that fixes the issue +4. Write test code to be sure it works. +5. Comment your code well to be sure it can be understood. +6. Create a merge request -autopep8 +## Known Problems / Limitations: +- The tool only processes Python files. +- Dataset contains Python software repositories from GitHub, all with a dependency on at least one ML library. +- Labels cannot be changed easily (work in progress) -GRaViTY tool for visualizing the metamodels, see "https://github.com/GRaViTY-Tool/gravity-tool?tab=readme-ov-file" for instructions on how to install the tool +## Help +- Torch CUDA Guide, see "https://www.geeksforgeeks.org/how-to-set-up-and-run-cuda-operations-in-pytorch/" +- GRaViTY tool for visualizing the metamodels, see "https://github.com/GRaViTY-Tool/gravity-tool?tab=readme-ov-file" diff --git a/TorchGPUCheck.py b/TorchGPUCheck.py new file mode 100644 index 0000000..dcbf77f --- /dev/null +++ b/TorchGPUCheck.py @@ -0,0 +1,24 @@ +import torch + +""" +This code is a simple Python script that checks if CUDA is available on the system and provides instructions on how to enable it if it's not available. 
+""" + +if __name__ == "__main__": + print(torch.torch_version) + # Check if CUDA is available + if torch.cuda.is_available(): + print("CUDA is available!") + print(f"Number of GPUs: {torch.cuda.device_count()}") + print(f"Current GPU: {torch.cuda.get_device_name(torch.cuda.current_device())}") + else: + print("CUDA is not available.") + print("To enable CUDA, follow these steps:") + print("1. **Install NVIDIA Drivers**: Ensure you have the latest NVIDIA drivers installed on your system.") + print( + "2. **Install CUDA Toolkit**: Download and install the CUDA Toolkit from the official NVIDIA website: https://developer.nvidia.com/cuda-downloads") + print( + "3. **Verify CUDA Installation**: After installation, verify that CUDA is working correctly by running the `nvidia-smi` command in your terminal/command prompt.") + print( + "4. **Update PyTorch**: Make sure you're using the latest version of PyTorch. You can update PyTorch using pip: `pip install --upgrade torch`") + print("5. **Restart Your System**: Restart your system to ensure that the changes take effect.") diff --git a/pep8autoformat.py b/pep8autoformat.py index 374f77a..0f5071a 100644 --- a/pep8autoformat.py +++ b/pep8autoformat.py @@ -1,5 +1,6 @@ import autopep8 + def format_python_file(path_to_file): try: # Read the current content of the file @@ -21,4 +22,4 @@ def format_python_file(path_to_file): if __name__ == "__main__": # Specify the file path you want to format file_path = 'AstToEcoreConverter.py' - format_python_file(file_path) \ No newline at end of file + format_python_file(file_path) diff --git a/torch_gpu_check.py b/torch_gpu_check.py deleted file mode 100644 index 85c57bd..0000000 --- a/torch_gpu_check.py +++ /dev/null @@ -1,10 +0,0 @@ -import torch - -print(torch.torch_version) -# Check if CUDA is available -if torch.cuda.is_available(): - print("CUDA is available!") - print(f"Number of GPUs: {torch.cuda.device_count()}") - print(f"Current GPU: 
{torch.cuda.get_device_name(torch.cuda.current_device())}") -else: - print("CUDA is not available.") \ No newline at end of file diff --git a/train.py b/train.py index 0cdc5f1..73e3d3d 100644 --- a/train.py +++ b/train.py @@ -11,7 +11,6 @@ from CustomDataset import RepositoryDataset from GCN import GCN -from GraphClasses import defined_labels from settings import CONFIG '''please prepare the dataset you want to train the tool with by using prepareDataset.py, @@ -27,6 +26,7 @@ threshold = CONFIG['training']['threshold'] save_classification_reports = CONFIG['training']['save_classification_reports'] experiment_name = CONFIG['training']['experiment_name'] +defined_labels = CONFIG['graph']['defined_labels'] def train(): @@ -34,7 +34,7 @@ def train(): num_classes = int(len(defined_labels)) - for graph in tqdm(trainloader,desc = "Training"): + for graph in tqdm(trainloader, desc="Training"): if device == 'cuda': graph.x = graph.x.to(device) @@ -67,7 +67,7 @@ def test(loader): total = 0 num_classes = int(len(defined_labels)) - for graph in tqdm(loader,desc = "Testing"): + for graph in tqdm(loader, desc="Testing"): if device == 'cuda': graph.x = graph.x.to(device)