-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathNested_CV_AUC_CARET.R
More file actions
229 lines (180 loc) · 11.2 KB
/
Nested_CV_AUC_CARET.R
File metadata and controls
229 lines (180 loc) · 11.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
# Function for doing a k-fold cross-validation for each C in CC
nested_cv <- function(data, k,formula.model, seed = NULL,sampling = sampling) {
# Set the seed, if given
if (!is.null(seed)) {
set.seed(seed)
}
folds <- createFolds(data$insulinRequire, k = k)
auc <- lapply(1:k, function(k) {
train_and_validate( data, folds[[k]], sampling = sampling,formula.model = formula.model)
})
auc
}
train_and_validate <- function( data, fold, sampling = sampling,formula.model = formula.model) {
data_train <- data[-fold,]
data_val <- data[fold,]
#for use in default and grid search optimised models
control <- trainControl(method = "repeatedcv", number = 5, repeats = 1,classProbs = TRUE,summaryFunction = twoClassSummary, sampling = sampling, savePredictions = TRUE)
#for use in random search optimised models
control_Rand_Search <- trainControl(method = "repeatedcv", number = 5, repeats = 1,classProbs = TRUE,summaryFunction = twoClassSummary, sampling = sampling, savePredictions = TRUE, search = "random")
##########################################################################
##############################
# 3 - Train the models
##############################
# The train function sets up a grid of tuning parameters for a number of classification and regression
# ROC used to select the optimal model using the largest value.
# train the Gradient bootstrap Machine model (Stochastic Gradient Boosting)
# utils::browseVignettes("gbm")
# verbose is an argument of the gmb package, indicating whether or not to print out progress and
# performance indicators
#build all the Gbm (Stochastic gradient boosting model) models
#tuning parameters: n.trees (number of iterations), interaction depth (complexity), shrinkage (learning rate), n.minobsinnode (min number of training det damples in a node to commence splitting)
#learning rate shrinks the contribution of each tree by learning_rate
getModelInfo()$gbm$parameters
#Shrinkage: the smaller the number, the better the predictive value, the more trees required, and the more computational cost.
#the smaller the shrinkage, the more trees you should have
# Fetch max Value for interaction.depth
floor(sqrt(NCOL(data)))
#set up the grid
gbmGrid <- expand.grid(interaction.depth = c(1, 3, 7, 10),
n.trees = c(10, 50,100,200),
shrinkage = seq(from = 0.01, to = 0.1, by = 0.02),
n.minobsinnode = c(5,10,20))
#tune the hyper-parameters using Grid Search
modelGbm_CC_GADA_Lr <- train(formula.model, data = data_train, method = "gbm", trControl = control, verbose = FALSE,metric = 'ROC',tuneGrid=gbmGrid)
#random search Independently draws from a uniform density from the same configuration space as would be spanned by a regular grid,
#we do not use random hyperparameter search for gbm models as it may be inefficients
# train the SVM model
# Support Vector Machines with Radial Basis Function Kernel (SVM classifier using a non-linear kernel)
#RBF is a reasonable first choice, it can handle nonlinear relationships
#C is the penalty parameter of the error term. It controls the trade off between smooth decision boundary (small c) and classifying the training points correctly.
#larger values of C focus attention more on (correctly classified) points near the decision boundary (wiggly boundary), while smaller values involve data further away (wider margins).
# sigma the radius/spread/decision boundary of the kernel
#When gamma is low, the 'curve' of the decision boundary is very low and thus the decision region is very broad.
#When gamma is high, the 'curve' of the decision boundary is high, which creates islands of decision-boundaries around data points.
#using training dataset and default parameters
getModelInfo()$svmRadial$parameters
svmGrid <- expand.grid(sigma = c(0.01, 0.1, 1, 10, 100),
C = seq(from = 0.1, to = 1, by = 0.05))
#using training dataset and tune the hyper-parameters using Caret Grid Search
modelSvm_CC_GADA_Lr <- train(formula.model, data = data_train, method = "svmRadial", trControl = control, verbose = TRUE,metric = 'ROC',tuneGrid=svmGrid)
# train the Random forest model
#parameter mtry is the number of variables available for splitting at each tree node
#The default is the square root of the number of predictor variables (rounded down)
#as we are only using three variables we do not optimise the parameters
#For mtry refer to http://code.env.duke.edu/projects/mget/export/HEAD/MGET/Trunk/PythonPackage/dist/TracOnlineDocumentation/Documentation/ArcGISReference/RandomForestModel.FitToArcGISTable.html
#using training dataset and default parameters
modelRf_CC_GADA <- train(formula.model, data = data_train, method = 'rf', trControl = control,metric = 'ROC')
# train a logistic regression model
#using training dataset
#there are no tuning parameters for glm method within caret
modelLG_CC_GADA <- train(formula.model, data = data_train, method = "glm", family = "binomial", trControl = control,metric = 'ROC')
# train neural network
getModelInfo()$nnet$parameters
#size parameter is the number of units in hidden layer (nnet fit a single hidden layer neural network)
#decay parameter is the regularization parameter to avoid over-fitting
nnetGrid <- expand.grid(size = seq(from = 1, to = 10, by = 1),
decay = c(0.5, 0.1, 0.01, 0.001, 0.0001, 0.00001, 0.000001, 0.0000001))
#tune the hyper-parameters using Caret Grid Search
modelnnet_CC_GADA_Lr <- train(formula.model, data = data_train, method = "nnet", trControl = control,metric = 'ROC', tuneGrid = nnetGrid)
# train a k-nearest-neighbours
#based on euclidean distance
getModelInfo()$knn$parameters
#k parameter is the number of neighbours.
knnGrid <- expand.grid(k = seq(from = 1, to = 100, by = 1))
#tune the hyper-parameters using Caret Grid Search
modelknn_CC_GADA_Lr <- train(formula.model, data = data_train, method = "knn", trControl = control,metric = 'ROC', tuneGrid = knnGrid)
marsgrid <- expand.grid(
degree = 1:3,
nprune = seq(1, 20, length.out = 20) %>% floor()
)
#tune the hyper-parameters using Grid Search
modelMARS_CC_GADA_Lr <- train(formula.model, data = data_train, method = "earth", glm=list(family='binomial'),trControl = control,,metric = 'ROC',tuneGrid=marsgrid)
#################################
# 4 - Perform validation
#################################
#for the Gbm grid search model (validation)
probsTestGbmGridVal = predict(modelGbm_CC_GADA_Lr, newdata=data_val, type = "prob")
data_val =data.frame(data_val,probsTestGbmGridVal$X2)
predTestGbmGridVal = log(as.numeric(probsTestGbmGridVal$X2)/(1-as.numeric(probsTestGbmGridVal$X2)))
#then create a roc object and calculate the ROC on the validation dataset
roc_objTestGbmGridval <- roc(data_val[,Youroutcomevariable], predTestGbmGridVal)
AUC_objTestGbmGridval = auc(roc_objTestGbmGridval)
AUC_objTestGbmGridval
ci.auc(roc_objTestGbmGridval)
#for the Svm grid search model (validation)
probsTestSvmGridVal = predict(modelSvm_CC_GADA_Lr, newdata=data_val, type = "prob")
data_val =data.frame(data_val,probsTestSvmGridVal$X2)
predTestSvmGridVal = log(as.numeric(probsTestSvmGridVal$X2)/(1-as.numeric(probsTestSvmGridVal$X2)))
#then create a roc object and calculate the ROC on the validation dataset
roc_objTestSvmGridval <- roc(data_val[,Youroutcomevariable], predTestSvmGridVal)
AUC_objTestSvmGridval = auc(roc_objTestSvmGridval)
AUC_objTestSvmGridval
ci.auc(roc_objTestSvmGridval)
#for the knn grid search model (validation)
probsTestknnGridVal = predict(modelknn_CC_GADA_Lr, newdata=data_val, type = "prob")
data_val =data.frame(data_val,probsTestknnGridVal$X2)
probsTestknnGridVal$X2[probsTestknnGridVal$X2 == 1] <- 0.999999
probsTestknnGridVal$X2[probsTestknnGridVal$X2 == 0] <- 0.000001
predTestknnGridVal = log(as.numeric(probsTestknnGridVal$X2)/(1-as.numeric(probsTestknnGridVal$X2)))
#then create a roc object and calculate the ROC on the validation dataset
roc_objTestknnGridval <- roc(data_val[,Youroutcomevariable], predTestknnGridVal)
AUC_objTestknnGridval = auc(roc_objTestknnGridval)
AUC_objTestknnGridval
ci.auc(roc_objTestknnGridval)
#for the nnet grid search model (validation)
probsTestnnetGridVal = predict(modelnnet_CC_GADA_Lr, newdata=data_val, type = "prob")
data_val =data.frame(data_val,probsTestnnetGridVal$X2)
predTestnnetGridVal = log(as.numeric(probsTestnnetGridVal$X2)/(1-as.numeric(probsTestnnetGridVal$X2)))
#then create a roc object and calculate the ROC on the validation dataset
roc_objTestnnetGridval <- roc(data_val[,Youroutcomevariable], predTestnnetGridVal)
AUC_objTestnnetGridval = auc(roc_objTestnnetGridval)
AUC_objTestnnetGridval
ci.auc(roc_objTestnnetGridval)
#for the rf model (validation)
probsTestRfVal = predict(modelRf_CC_GADA, newdata=data_val, type = "prob")
data_val =data.frame(data_val,probsTestRfVal$X2)
probsTestRfVal$X2[probsTestRfVal$X2 == 1] <- 0.999999
probsTestRfVal$X2[probsTestRfVal$X2 == 0] <- 0.000001
predTestRfVal = log(as.numeric(probsTestRfVal$X2)/(1-as.numeric(probsTestRfVal$X2)))
#then create a roc object and calculate the ROC on the validation dataset
roc_objTestRfval <- roc(data_val[,Youroutcomevariable], predTestRfVal)
AUC_objTestRfval = auc(roc_objTestRfval)
AUC_objTestRfval
ci.auc(roc_objTestRfval)
#for the logistic regression model (validation)
probsTestlgVal = predict(modelLG_CC_GADA, newdata=data_val, type = "prob")
data_val =data.frame(data_val,probsTestlgVal$X2)
predTestLGVal = log(as.numeric(probsTestlgVal$X2)/(1-as.numeric(probsTestlgVal$X2)))
#then create a roc object and calculate the ROC on the validation dataset
roc_objTestLGval <- roc(data_val[,Youroutcomevariable], predTestLGVal)
AUC_objTestLGval = auc(roc_objTestLGval)
AUC_objTestLGval
ci.auc(roc_objTestLGval)
#for the logistic regression model (validation)
probsTestMARSVal = predict( modelMARS_CC_GADA_Lr, newdata=data_val, type = "prob")
data_val =data.frame(data_val,probsTestMARSVal$X2)
predTestMARSVal = log(as.numeric(probsTestMARSVal$X2)/(1-as.numeric(probsTestMARSVal$X2)))
#then create a roc object and calculate the ROC on the validation dataset
roc_objTestMARSval <- roc(data_val[,Youroutcomevariable], predTestMARSVal)
AUC_objTestMARSval = auc(roc_objTestMARSval)
AUC_objTestMARSval
ci.auc(roc_objTestMARSval)
res <- c( AUC_objTestGbmGridval, AUC_objTestSvmGridval, AUC_objTestknnGridval, AUC_objTestnnetGridval,AUC_objTestRfval,AUC_objTestLGval,AUC_objTestMARSval)
}
sampling <- NULL
res_none_l <- nested_cv(data_train, 5,formula.model, sampling = sampling)
res_none <- do.call(rbind, res_none_l)
res <- apply(res_none,2,mean)
resvar <- apply(res_none,2,sd)
res -1.96*(resvar/sqrt(5))
res + 1.96*(resvar/sqrt(5))
models <- c("Gbm", "Svm", "knn", "nnet","Rf","LG","MARS")
res <- data.frame(models,res)
names(res) <- c("Models", "AUC")
myft <- regulartable(res)
myft <- theme_vanilla(myft)
myft <- autofit(myft)
doc <- read_docx()
doc <- body_add_flextable(doc, value = myft)
print(doc, target = paste0("nestedCV_sampling", sampling, "formula_",as.character(formula.model)[3],".docx"))