datasciencecoursera/HTML at master · cattailsswing/datasciencecoursera · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
<html>

<head>

<p> Coursera Machine Learning Class_Prediction Assignment </p>
<title>Title</title>
</head>

<body>

# Install only the packages that are not already present, so that re-running <br />
# the script does not download and reinstall everything each time. <br />
required.packages <- c("Hmisc", "caret", "randomForest", <br />
                       "AppliedPredictiveModeling", "rattle", <br />
                       "rpart.plot", "RColorBrewer") <br />
missing.packages <- setdiff(required.packages, rownames(installed.packages())) <br />
if (length(missing.packages) > 0) { <br />
  install.packages(missing.packages) <br />
} <br />
<br />
<br />
### Load packages <br />
library(Hmisc) <br />
library(caret) <br />
library(randomForest) <br />
library(AppliedPredictiveModeling) <br />
library(rattle) <br />
# rpart() is called directly further down, so attach it explicitly instead of <br />
# relying on another package to bring it in. <br />
library(rpart) <br />
library(rpart.plot) <br />
library(RColorBrewer) <br />
# Fix the random seed so the data partition and the forest are reproducible. <br />
set.seed(888) <br />
<br />
<br />
### Download data <br />

# Local file names and remote locations of the two data sets. <br />
training.file <- 'pml-training.csv' <br />
testing.file <- 'pml-test.csv' <br />
training.url <- 'http://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv' <br />
testing.url <- 'http://d396qusza40orc.cloudfront.net/predmachlearn/pml-testing.csv' <br />
# Fetch each file only when it is not already on disk. <br />
if (!file.exists(training.file)) { <br />
  download.file(training.url, training.file) <br />
} <br />
if (!file.exists(testing.file)) { <br />
  download.file(testing.url, testing.file) <br />
} <br />
<br />
<br />
### Clean data <br />
# Read both files, treating "NA", empty strings and "#DIV/0!" as missing values. <br />
df.training <- read.csv(training.file, na.strings=c("NA","","#DIV/0!"), header=TRUE)  <br />
colnames.train <- colnames(df.training) <br />
df.testing <- read.csv(testing.file, na.strings=c("NA","","#DIV/0!"), header=TRUE) <br />
colnames.test <- colnames(df.testing) <br />
# The outcome has to be a factor for the classification models fitted below; <br />
# since R 4.0 read.csv() no longer converts strings to factors by default. <br />
df.training$classe <- as.factor(df.training$classe) <br />
# Keep only the columns that contain no missing values <br />
df.training <- df.training[, colSums(is.na(df.training)) == 0] <br />
df.testing <- df.testing[, colSums(is.na(df.testing)) == 0] <br />

# Drop the first 7 extra columns <br />
df.training <- df.training[, -c(1:7)] <br />
df.testing <- df.testing[, -c(1:7)] <br />

# Verify that the column names read from the two files are identical, <br />
# ignoring the last column of each <br />
all.equal(head(colnames.train, -1), head(colnames.test, -1)) <br />
<br />
<br />
### Splitting data into training and testing sets <br />
# 75% of the labelled rows go to the training subset; the rest are held out. <br />
inTraining.matrix <- createDataPartition(y = df.training[["classe"]], p = 0.75, list = FALSE) <br />
training.data.df <- df.training[inTraining.matrix, ] <br />
testing.data.df <- df.training[-inTraining.matrix, ] <br />
# Show the dimensions and the first rows of each subset. <br />
dim(training.data.df) <br />
dim(testing.data.df) <br />
head(training.data.df) <br />
head(testing.data.df) <br />
<br />
<br />
#Decision tree <br />
# Fit a classification tree on the training subset, then predict the held-out subset. <br />
mod.decisiontree <- rpart(formula = classe ~ ., data = training.data.df, method = "class") <br />
pred.decisiontree <- predict(object = mod.decisiontree, newdata = testing.data.df, type = "class") <br />

# Plot of the decision tree <br />
rpart.plot(x = mod.decisiontree, main = "Classification Tree", extra = 102, under = TRUE, faclen = 0) <br />
# Compare the predictions with the true classes of the held-out subset <br />
confusionMatrix(data = pred.decisiontree, reference = testing.data.df$classe) <br />
<br />
<br />
#Random forest <br />
# Fit the forest on the training subset. <br />
mod.randomforest <- randomForest(classe ~ ., data = training.data.df) <br />
# Predict the held-out subset, so that the confusion matrix measures accuracy <br />
# on rows the model was not fitted on. <br />
pred.randomforest <- predict(mod.randomforest, testing.data.df, type = "response") <br />

confusionMatrix(pred.randomforest, testing.data.df$classe) <br />

<br />
<br />
# Using random forest for final prediction on the downloaded test cases <br />
pred.final <- predict(mod.randomforest, df.testing, type = "response") <br />
<br />
pred.final <br />

<p>
  <img src="C:/Users/Brooke/Dropbox/Rplot.png" alt="Rplot" style="float:left;width:420px;height:420px">

</p>
</body>


</html>