-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathprincipal_component_analysis.py
More file actions
111 lines (86 loc) · 4.66 KB
/
principal_component_analysis.py
File metadata and controls
111 lines (86 loc) · 4.66 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
import numpy as np
from matplotlib import pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import shuffle
from datetime import datetime
# Record start time; the elapsed run time is reported at the end of the script
start_time = datetime.now()

# Index of the machine-failed label column (described as the last column of each CSV)
label_column = 6

# Load training data, shuffle it, then split the label column off from the features.
# Forward slashes are used in the paths because they are accepted on Windows as well
# as on POSIX systems, whereas a backslash separator only resolves on Windows.
train_data = np.loadtxt("dataset/ai4i2020_train_data.csv", delimiter=",")
train_data = shuffle(train_data, random_state=42)  # Fixed seed keeps the row order repeatable
train_data_labels = train_data[:, label_column]
train_data = np.delete(train_data, label_column, axis=1)

# Load testing data and split the label column off the same way (test rows are not shuffled)
test_data = np.loadtxt("dataset/ai4i2020_test_data.csv", delimiter=",")
test_data_labels = test_data[:, label_column]
test_data = np.delete(test_data, label_column, axis=1)
# Number of feature columns remaining in the training data
attributes = np.shape(train_data)[1]

# Covariance matrix of the training features (np.cov treats rows as variables, hence .T)
data_train_cov = np.cov(train_data.T)

# A covariance matrix is symmetric, so use eigh rather than the general eig solver:
# it is designed for symmetric input and always returns real eigenvalues.
eigenvalues, eigenvectors = np.linalg.eigh(data_train_cov)

# Indices that order the eigenvalues from largest to smallest
sorted_eigenvalue_indices = np.flip(np.argsort(eigenvalues))

# Reorder the eigenvector columns to match, in a single indexing step
# (column k of the result is the eigenvector of the k-th largest eigenvalue)
sorted_eigenvectors = eigenvectors[:, sorted_eigenvalue_indices]

# Project the training and testing data onto the sorted eigenvectors.
# The data is projected as loaded; no mean-centering is applied before the dot product.
transformed_train_data = np.dot(train_data, sorted_eigenvectors)
transformed_test_data = np.dot(test_data, sorted_eigenvectors)
# Misclassification counts, appended in order of decreasing number of retained dimensions
dtc_classification_error = []
rfc_classification_error = []

# Create instances of classifiers (the same instances are re-fitted on every iteration).
# NOTE(review): no random_state is passed to either classifier, so the error counts may
# differ from run to run -- confirm whether a fixed seed is wanted.
clf_dtc = DecisionTreeClassifier()
clf_rfc = RandomForestClassifier()

# --- Decision Tree Classification and Random Forest Classification using PCA begins ---
# Start with all principal components and drop the trailing one on each iteration
for dimension in range(attributes, 0, -1):
    # Keep only the first `dimension` principal components
    transformed_data_reduced = transformed_train_data[:, :dimension]
    transformed_data_test_reduced = transformed_test_data[:, :dimension]

    # Decision tree: train, predict, and count the misclassified test samples.
    # np.count_nonzero counts the mismatches in one vectorized call instead of
    # iterating over the boolean array element by element with the builtin sum.
    clf_dtc.fit(transformed_data_reduced, train_data_labels)
    dtc_prediction = clf_dtc.predict(transformed_data_test_reduced)
    dtc_classification_error.append(np.count_nonzero(dtc_prediction != test_data_labels))

    # Random forest: same procedure on the same reduced data
    clf_rfc.fit(transformed_data_reduced, train_data_labels)
    rfc_prediction = clf_rfc.predict(transformed_data_test_reduced)
    rfc_classification_error.append(np.count_nonzero(rfc_prediction != test_data_labels))

# Report the error counts; the lists are already in the same order as this range,
# so pair them up directly rather than computing list indices by hand.
print("\nDecision Tree Classification errors after PCA dimensionality reduction:\n")
for dimension, error_count in zip(range(attributes, 0, -1), dtc_classification_error):
    print(f"{dimension} retained dimensions: {error_count}")

print("\nRandom Forest Classification errors after PCA dimensionality reduction:\n")
for dimension, error_count in zip(range(attributes, 0, -1), rfc_classification_error):
    print(f"{dimension} retained dimensions: {error_count}")
# Plot the errors. The x positions and tick labels are derived from `attributes`
# instead of a hard-coded 6, so the plot stays valid if the feature count changes.
x_axis = list(range(attributes))
pca_error_plot = plt.figure(1)
plt.plot(x_axis, dtc_classification_error)
plt.plot(x_axis, rfc_classification_error)
plt.xlabel(f"Retained dimensions out of {attributes} total dimensions in the dataset")
plt.ylabel("Number of errors")
plt.legend(["Decision Tree Classification Errors", "Random Forest Classification Errors"])
# Errors were recorded from `attributes` retained dimensions down to 1, so position 0
# is labelled with the full dimension count and the last position with 1.
plt.xticks(ticks=x_axis, labels=list(range(attributes, 0, -1)))
plt.grid()
plt.title("Plot of Classification errors after dimensionality reduction using Principal Component Analysis")
pca_error_plot.show()
# Bar chart of the PCA eigenvalues, largest first, one bar per principal component
eigenvalues_plot = plt.figure(2)
x_axis_labels = list(range(1, attributes + 1))  # Components are numbered from 1
descending_eigenvalues = np.sort(eigenvalues)[::-1]
plt.bar(x_axis_labels, descending_eigenvalues)
plt.title("Plot of Principal Component Scores")
plt.xlabel("Principal Components")
plt.ylabel("Eigenvalues / PC Scores")
plt.grid()
eigenvalues_plot.show()
# --- Decision Tree Classification and Random Forest Classification using PCA ends ---
# Record end time and show total run time.
# The elapsed time is taken once and formatted via total_seconds(): printing
# `.seconds` and `.microseconds` side by side drops the leading zeros of the
# microsecond part (1 s + 50000 us would read "1.50000") and ignores whole days.
elapsed = datetime.now() - start_time
print(f"\nProgram completed in {elapsed.total_seconds():.6f} seconds.")
# Called after the run time has been reported
plt.show()