4 changes: 2 additions & 2 deletions AstToEcoreConverter.py
@@ -1099,7 +1099,7 @@ def get_method_def_from_internal_structure(self, method_name, module):
return current_method[0]
return None

def create_method_signature(self, method_node, name, arguments, return_type = None):
def create_method_signature(self, method_node, name, arguments, return_type=None):
"""
Creates a method signature for a method definition.

@@ -1134,7 +1134,7 @@ def create_method_signature(self, method_node, name, arguments, return_type = No

# Add type for TParameter.type
parameter_type = self.create_ecore_instance(NodeTypes.CLASS)
#parameter_type.tName = arg.annotation if arg.annotation else 'None'
# parameter_type.tName = arg.annotation if arg.annotation else 'None'
parameter.type = parameter_type

method_node.signature = method_signature
7 changes: 4 additions & 3 deletions CustomDataset.py
@@ -9,7 +9,7 @@
from DataformatUtils import convert_edge_dim, convert_list_to_float_tensor, convert_list_to_long_tensor, \
convert_hashed_names_to_float
from Encoder import multi_hot_encoding
from GraphClasses import defined_labels
from settings import CONFIG


class RepositoryDataset(Dataset):
@@ -30,7 +30,8 @@ def __init__(self, directory, label_list=None):
print(e)
# nodes have 11 features, their one hot encoded node type, hashed name, and one hot encoded library flag
self.num_node_features = 11
self.num_classes = len(defined_labels)
self.defined_labels = CONFIG['graph']['defined_labels']
self.num_classes = len(self.defined_labels)
self.directory = directory
self.graph_names = []
self.graph_dir = os.listdir(directory)
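
For context, this change reads the label list from the central config instead of GraphClasses.py. A minimal sketch of what the relevant part of settings.py might look like — the layout and values here are assumptions inferred from the keys used in this diff and the labels named in the README, not the actual file:

```python
# Hypothetical excerpt of settings.py (assumption, for illustration only).
CONFIG = {
    'graph': {
        # the four non-exclusive classes named in the README
        'defined_labels': ['Application', 'Framework', 'Library', 'Plugin'],
    },
    'training': {
        'threshold': 0.5,                 # default 50% decision threshold
        'experiment_name': 'baseline',    # placeholder name
        'save_classification_reports': True,
    },
}
```
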
@@ -162,7 +163,7 @@ def convert_labeled_graphs(self, labels):
graph_labels) # count how many repos are in each class

# encode labels
encoded_nodes = multi_hot_encoding(defined_labels, graph_labels)
encoded_nodes = multi_hot_encoding(self.defined_labels, graph_labels)
file = zip(graph_names, encoded_nodes)
return file
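
The `multi_hot_encoding` call above now takes the configured label list. A rough sketch of the kind of encoding this implies — the signature is taken from the call site, but the body is an assumption rather than the actual Encoder.py implementation:

```python
def multi_hot_encoding(defined_labels, graph_labels):
    """Illustrative only: encode each graph's label set as a binary vector."""
    encoded = []
    for labels in graph_labels:
        # one slot per defined label, set to 1.0 if the repository carries it
        encoded.append([1.0 if label in labels else 0.0 for label in defined_labels])
    return encoded

# e.g. multi_hot_encoding(['Application', 'Framework', 'Library', 'Plugin'],
#                         [['Library', 'Framework']])  ->  [[0.0, 1.0, 1.0, 0.0]]
```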

3 changes: 2 additions & 1 deletion EcoreToMatrixConverter.py
@@ -335,7 +335,8 @@ def convert_subpackages_recursive(self, t_package):
t_package: The package to convert subpackages from.
"""
for t_subpackage in t_package.subpackages:
current_subpackage = self.get_node_in_container(t_subpackage.tName, NodeTypes.PACKAGE.value, t_package.tName,
current_subpackage = self.get_node_in_container(t_subpackage.tName, NodeTypes.PACKAGE.value,
t_package.tName,
NodeTypes.PACKAGE.value)
if current_subpackage is None:
self.node_matrix.append(NodeTypes.PACKAGE.value)
3 changes: 2 additions & 1 deletion GCN.py
@@ -4,6 +4,7 @@

'''defines the architecture of the graph convolutional network'''


class GCN(torch.nn.Module):
def __init__(self, num_node_features, num_classes, hidden_channels):
super(GCN, self).__init__()
@@ -33,4 +34,4 @@ def forward(self, x, edge_index, edge_attr, batch=None):
# sigmoid activation function for multi-label
x = f.sigmoid(x)

return x
return x
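
Because the labels are not mutually exclusive, the last activation is a sigmoid rather than a softmax, so every class gets an independent probability. A small plain-PyTorch illustration (not the repository's code):

```python
import torch

logits = torch.tensor([[1.2, -0.3, 2.1, -1.5]])  # raw scores for one graph, 4 classes
probs = torch.sigmoid(logits)                     # ~tensor([[0.77, 0.43, 0.89, 0.18]])
# unlike softmax, the probabilities do not sum to 1,
# so several labels can be active for the same repository
```
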
3 changes: 0 additions & 3 deletions GraphClasses.py

This file was deleted.

12 changes: 6 additions & 6 deletions NodeFeatures.py
@@ -17,20 +17,20 @@ class NodeTypes(Enum):
CLASS = "TClass"
# TMethod
METHOD = "TMethod"
METHOD_SIGNATURE = "TMethodSignature" # missing firstParameter does not need to be implemented.
METHOD_DEFINITION = "TMethodDefinition"# missing "".overloading and "".overloadedBY does not need to be implemented.
METHOD_SIGNATURE = "TMethodSignature" # missing firstParameter does not need to be implemented.
METHOD_DEFINITION = "TMethodDefinition" # missing "".overloading and "".overloadedBY does not need to be implemented.
PARAMETER = "TParameter"
# TField
FIELD = "TField"
FIELD_SIGNATURE = "TFieldSignature" # Todo implement this in AstToEcoreConverter (only missing TFieldSignature.type)
FIELD_DEFINITION = "TFieldDefinition" # missing TFieldDefinition.hidden and "".hiddenBy does not to be implemented
FIELD_SIGNATURE = "TFieldSignature" # Todo implement this in AstToEcoreConverter (only missing TFieldSignature.type)
FIELD_DEFINITION = "TFieldDefinition" # missing TFieldDefinition.hidden and "".hiddenBy does not to be implemented
# TAccess
CALL = "TCall"
READ = "TRead" # Todo implement this in AstToEcoreConverter
WRITE = "TWrite" # Todo implement this in AstToEcoreConverter
READ_WRITE = "TReadWrite" # Todo implement this in AstToEcoreConverter
#TInterface
# TInterface
INTERFACE = "TInterface"
# In Python, there is no formal concept of interfaces as found in some other programming languages like Java or C#.
# However, Python supports a similar concept through the use of abstract base classes (ABCs) and duck typing.
# The return on investment probably is not sufficient to justify the implementation.
# The return on investment probably is not sufficient to justify the implementation.
1 change: 0 additions & 1 deletion Pipeline.py
@@ -239,7 +239,6 @@ def prepare_dataset(repository_directory, output_directory=None, repository_list
global repo_multiprocess, ecore_graph
global node_features, adj_list, edge_attribute


# clone repositories for the dataset
if repository_list is not None:
download_repositories(repository_directory, repository_list)
73 changes: 58 additions & 15 deletions README.md
@@ -1,27 +1,70 @@
# github-classifier
# Classifier for GitHub Repos

**short description**
## Table of Contents
- [Intro](#intro)
- [Installation for Users](#installation-instruction-for-users)
- [Installation for Devs](#installation-instruction-for-devs)
- [Expectation for Devs](#expectation-for-devs)
- [Known Problems / Limitations](#known-problems--limitations)
- [Help](#help)

This repository contains a deep-learning based classification tool for software repositories. The tool utilizes the ecore metamodel 'type graph' and a graph convolutional network. To use the tool, run 'main.py' after adding the directory containing the repositories you want to classify.
## Intro:

If you want to train the tool with different labels, replace the current labels with your own (or add them to the labels) in GraphClasses.py, and in function 'multi_hot_encoding' in Encoder.py. Optionally also in function 'count_class_elements' in CustomDataset.py if you want to know the number of samples in each class in your dataset.
The labels in the tool are not mutually exclusive and are multi-hot encoded.
This repository features a deep learning classifier designed for the analysis of software repositories.
The tool employs the ecore metamodel's 'type graph' in conjunction with a graph convolutional network.
Presently, the classifier categorizes repositories into four distinct classes: Application, Framework, Library, and Plugin.
It is important to note that the labels utilized by the tool are **not mutually exclusive** and are represented in a multi-hot encoded format.

Currently, the tool only processes Python files.
## Installation Instruction for Users:
1. Clone the repository by executing the following command:
`git clone https://github.com/isselab/github-classifier.git`
2. Open the cloned repository using your preferred Integrated Development Environment (IDE).
For the purposes of this instruction, we will assume the use of PyCharm from JetBrains.
3. Change to the data/input directory inside the cloned repository by running the following command:
`cd github-classifier/data/input`
4. Clone the repositories you wish to analyze by executing:
`git clone LINK_TO_REPO_YOU_WANT`
5. Run main.py.

**labels**
The default classification threshold is set at 50%.
If you wish to modify this threshold, you can find the relevant setting in the settings.py file.
After making the adjustment, rerun main.py to apply the change.
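
A sketch of how such a threshold is typically applied to the sigmoid outputs to decide which labels are reported — illustrative only, the actual logic lives in the repository's prediction code:

```python
import torch

defined_labels = ['Application', 'Framework', 'Library', 'Plugin']
threshold = 0.5                                   # e.g. CONFIG['training']['threshold']

probs = torch.tensor([0.81, 0.12, 0.55, 0.07])    # per-class sigmoid outputs for one repo
predicted = [label for label, p in zip(defined_labels, probs) if p >= threshold]
print(predicted)                                  # ['Application', 'Library']
```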

Application, Framework, Library, Plugin
## Installation Instruction for Devs:

**data**
### Basic Installation:
1. Clone the repository by executing the following command:
`git clone https://github.com/isselab/github-classifier.git`
2. Open the cloned repository using your preferred Integrated Development Environment (IDE).

Dataset with Python software repositories from GitHub, all with a dependency on at least one ML library.
The labeled repositories the tool is trained with are in data/labeled_dataset_repos.xlsx.
### Retraining:
1. Check data/labeled_dataset_repos.xlsx.
This xlsx file contains the labeled repositories the tool is trained with.
You may want to adjust it to your needs.
2. We strongly recommend utilizing a GPU for training purposes.
To verify GPU availability, please run the TorchGPUCheck.py script.
If you get the result "CUDA is available!", you may proceed to step 3.
If the output indicates "CUDA is not available.", please follow the instructions provided in the terminal.
Additionally, refer to the guide in the [Help](#help) section for further assistance in resolving any issues.
3. Run prepareDataset.py
4. Change the experiment_name in settings.py in the training section.
5. Run training.py

**requirements**

pyecore~=0.14.0 or higher versions
## Expectation for Devs:
### Recommended Workflow:
1. Create an issue on the GitHub issues page.
2. Open a branch named after the issue.
3. Write code that fixes the issue.
4. Write test code to be sure it works.
5. Comment your code well so it can be understood.
6. Create a pull request.

autopep8
## Known Problems / Limitations:
- The tool only processes Python files.
- The dataset contains Python software repositories from GitHub, all with a dependency on at least one ML library.
- Labels cannot be changed easily (work in progress).

GRaViTY tool for visualizing the metamodels, see "https://github.com/GRaViTY-Tool/gravity-tool?tab=readme-ov-file" for instructions on how to install the tool
## Help
- Torch CUDA Guide, see "https://www.geeksforgeeks.org/how-to-set-up-and-run-cuda-operations-in-pytorch/"
- GRaViTY tool for visualizing the metamodels, see "https://github.com/GRaViTY-Tool/gravity-tool?tab=readme-ov-file"
24 changes: 24 additions & 0 deletions TorchGPUCheck.py
@@ -0,0 +1,24 @@
import torch

"""
This code is a simple Python script that checks if CUDA is available on the system and provides instructions on how to enable it if it's not available.
"""

if __name__ == "__main__":
print(torch.__version__)
# Check if CUDA is available
if torch.cuda.is_available():
print("CUDA is available!")
print(f"Number of GPUs: {torch.cuda.device_count()}")
print(f"Current GPU: {torch.cuda.get_device_name(torch.cuda.current_device())}")
else:
print("CUDA is not available.")
print("To enable CUDA, follow these steps:")
print("1. **Install NVIDIA Drivers**: Ensure you have the latest NVIDIA drivers installed on your system.")
print(
"2. **Install CUDA Toolkit**: Download and install the CUDA Toolkit from the official NVIDIA website: https://developer.nvidia.com/cuda-downloads")
print(
"3. **Verify CUDA Installation**: After installation, verify that CUDA is working correctly by running the `nvidia-smi` command in your terminal/command prompt.")
print(
"4. **Update PyTorch**: Make sure you're using the latest version of PyTorch. You can update PyTorch using pip: `pip install --upgrade torch`")
print("5. **Restart Your System**: Restart your system to ensure that the changes take effect.")
3 changes: 2 additions & 1 deletion pep8autoformat.py
@@ -1,5 +1,6 @@
import autopep8


def format_python_file(path_to_file):
try:
# Read the current content of the file
@@ -21,4 +22,4 @@ def format_python_file(path_to_file):
if __name__ == "__main__":
# Specify the file path you want to format
file_path = 'AstToEcoreConverter.py'
format_python_file(file_path)
format_python_file(file_path)
10 changes: 0 additions & 10 deletions torch_gpu_check.py

This file was deleted.

6 changes: 3 additions & 3 deletions train.py
@@ -11,7 +11,6 @@

from CustomDataset import RepositoryDataset
from GCN import GCN
from GraphClasses import defined_labels
from settings import CONFIG

'''please prepare the dataset you want to train the tool with by using prepareDataset.py,
@@ -27,14 +26,15 @@
threshold = CONFIG['training']['threshold']
save_classification_reports = CONFIG['training']['save_classification_reports']
experiment_name = CONFIG['training']['experiment_name']
defined_labels = CONFIG['graph']['defined_labels']


def train():
model.train()

num_classes = int(len(defined_labels))

for graph in tqdm(trainloader,desc = "Training"):
for graph in tqdm(trainloader, desc="Training"):

if device == 'cuda':
graph.x = graph.x.to(device)
@@ -67,7 +67,7 @@ def test(loader):
total = 0
num_classes = int(len(defined_labels))

for graph in tqdm(loader,desc = "Testing"):
for graph in tqdm(loader, desc="Testing"):

if device == 'cuda':
graph.x = graph.x.to(device)