
Data Mining and Classification with R Examples
Ira Sharenow
Wednesday, August 19, 2015
Points are drawn uniformly from the cube [-1, 1]^6. A point is labeled "blue" if x1^2 + x2^2 + x3^2 + x4^2 <= 0.8, that is, if its first four coordinates fall inside a sphere of radius sqrt(0.8); otherwise it is labeled "red". So although the points live in six dimensions, the criterion uses only the first four coordinates, and the decision region is clearly non-linear.
The points are then slightly perturbed so that the labeling is no longer perfect; of the six coordinates, only the first five are jittered. Half of the points are then sampled as a training set, and each method is evaluated on the held-out test set.
Since the boundary is very non-linear, I expected the first few methods to perform poorly, and they did. Bagging, random forests, and boosting all produced excellent results; however, boosting is extremely slow.
0. Actual percent red
1. Logistic regression
2. LDA
3. QDA
4. KNN
5. Tree
6. Bagging
7. Random Forest
8. Boosting
9. Support Vector Machines
10. Table of results
set.seed(20150819)
sampleSize = 10000
df = data.frame(x1 = runif(n = sampleSize, min = -1, max = 1),
                x2 = runif(n = sampleSize, min = -1, max = 1),
                x3 = runif(n = sampleSize, min = -1, max = 1),
                x4 = runif(n = sampleSize, min = -1, max = 1),
                x5 = runif(n = sampleSize, min = -1, max = 1),
                x6 = runif(n = sampleSize, min = -1, max = 1))
# Function for labeling
color = function(x1, x2, x3, x4, x5, x6) {
  if (x1^2 + x2^2 + x3^2 + x4^2 <= 0.8) return("blue") else return("red")
}
head(df)
##            x1         x2           x3          x4         x5          x6
## 1 -0.06276411 -0.8483214  0.347916695  0.02709725 -0.3690519  0.02848432
## 2  0.79737745  0.1749736  0.506957737  0.39603261  0.1119455 -0.71287355
## 3 -0.28757810 -0.4891981  0.923341176 -0.71710376  0.3040652 -0.82590296
## 4  0.82660601  0.2245092  0.002798933  0.69601031  0.5024880  0.96397908
## 5  0.43177409  0.5581343 -0.611626756 -0.11168633  0.2727495  0.95727541
## 6 -0.36671109  0.9372195 -0.341857872 -0.66116307  0.5929235  0.98582181
df$color1 = mapply(color, df$x1, df$x2,df$x3, df$x4,df$x5, df$x6)
df$color1 = factor(df$color1)
contrasts(df$color1) # blue is 0, red is 1
##      red
## blue   0
## red    1
head(df)
##            x1         x2           x3          x4         x5          x6
## 1 -0.06276411 -0.8483214  0.347916695  0.02709725 -0.3690519  0.02848432
## 2  0.79737745  0.1749736  0.506957737  0.39603261  0.1119455 -0.71287355
## 3 -0.28757810 -0.4891981  0.923341176 -0.71710376  0.3040652 -0.82590296
## 4  0.82660601  0.2245092  0.002798933  0.69601031  0.5024880  0.96397908
## 5  0.43177409  0.5581343 -0.611626756 -0.11168633  0.2727495  0.95727541
## 6 -0.36671109  0.9372195 -0.341857872 -0.66116307  0.5929235  0.98582181
##   color1
## 1    red
## 2    red
## 3    red
## 4    red
## 5    red
## 6    red
mean(df$color1 == "blue")
## [1] 0.1919
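As a quick sanity check (my addition, not in the original write-up), the expected blue fraction is the volume of the 4-ball of radius sqrt(0.8) divided by the volume of the cube [-1, 1]^4:

# Vol(4-ball of radius r) = pi^2 * r^4 / 2; here r^4 = (sqrt(0.8))^4 = 0.8^2
pi^2 * 0.8^2 / 2 / 2^4
## [1] 0.1973921

which is close to the observed 0.1919.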
# Perturb coordinates 1-5
df$x1 = df$x1 + rnorm(sampleSize, 0, 0.1)
df$x2 = df$x2 + rnorm(sampleSize, 0, 0.1)
df$x3 = df$x3 + rnorm(sampleSize, 0, 0.1)
df$x4 = df$x4 + rnorm(sampleSize, 0, 0.1)
df$x5 = df$x5 + rnorm(sampleSize, 0, 0.1)
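How imperfect is the labeling after the jitter? A quick check (my addition, not in the original script) recomputes the criterion on the perturbed coordinates and measures the disagreement with the stored labels:

# Fraction of points whose stored label no longer matches the criterion
# evaluated on the jittered coordinates, i.e. the induced label noise
mean(df$color1 != mapply(color, df$x1, df$x2, df$x3, df$x4, df$x5, df$x6))

A small but nonzero fraction, concentrated near the sphere boundary, is expected.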
# Divide points into training and test sets
train = sample(1:sampleSize, sampleSize/2)
dfTrain = df[train, ]
dim(dfTrain)
## [1] 5000    7
dfTest = df[-c(train), ]
dim(dfTest)
## [1] 5000    7
# Setup is now complete
# Now fit and evaluate each method
# 1. Logistic Regression
glm.fit = glm(color1 ~ ., data = dfTrain, family = binomial)
summary(glm.fit)
## 
## Call:
## glm(formula = color1 ~ ., family = binomial, data = dfTrain)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.9033   0.6213   0.6528   0.6756   0.7553  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  1.404084   0.035614  39.426   <2e-16 ***
## x1          -0.080827   0.060590  -1.334    0.182    
## x2          -0.043699   0.060668  -0.720    0.471    
## x3           0.008068   0.060706   0.133    0.894    
## x4          -0.058953   0.060893  -0.968    0.333    
## x5           0.082850   0.060944   1.359    0.174    
## x6           0.064199   0.061538   1.043    0.297    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 4965.0  on 4999  degrees of freedom
## Residual deviance: 4958.8  on 4993  degrees of freedom
## AIC: 4972.8
## 
## Number of Fisher Scoring iterations: 4
# help(predict)
glm.probs = predict(glm.fit, newdata = dfTest, type = "response")
glm.probs[1:10]
##         1         4         6         9        11        12        13 
## 0.8053388 0.8050761 0.8259650 0.8059137 0.8093152 0.8082293 0.8039616 
##        16        17        18 
## 0.8077144 0.8256474 0.8096335
glm.pred = rep("red", sampleSize/2)
glm.pred[glm.probs < .5] = "blue"
table(glm.pred, dfTest$color1)
##         
## glm.pred blue  red
##      red  933 4067
mean(glm.pred == dfTest$color1) # 0.8134
## [1] 0.8134
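A follow-up worth sketching (my addition, with hypothetical object names): logistic regression fails here only because its features enter linearly. Since the true boundary is a sphere in x1-x4, giving the same model squared features should recover most of the structure:

# Logistic regression on squared features; the spherical boundary is
# linear in x1^2, ..., x4^2, so this should beat the ~0.81 base rate
glm.fit2 = glm(color1 ~ I(x1^2) + I(x2^2) + I(x3^2) + I(x4^2) + I(x5^2) + I(x6^2),
               data = dfTrain, family = binomial)
glm.probs2 = predict(glm.fit2, newdata = dfTest, type = "response")
glm.pred2 = ifelse(glm.probs2 < 0.5, "blue", "red")
mean(glm.pred2 == dfTest$color1)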
# 2. Linear Discriminant Analysis
library(MASS)
lda.fit = lda(color1 ~ ., data = dfTrain)
lda.fit
## Call:
## lda(color1 ~ ., data = dfTrain)
## 
## Prior probabilities of groups:
##   blue    red 
## 0.1972 0.8028 
## 
## Group means:
##               x1          x2          x3           x4          x5
## blue 0.030187117  0.00373233 -0.01429667  0.008993027 -0.01223305
## red  0.002206837 -0.01218605 -0.01279927 -0.010448713  0.01591678
##               x6
## blue 0.002238118
## red  0.023971066
## 
## Coefficients of linear discriminants:
##            LD1
## x1 -0.91223082
## x2 -0.49229573
## x3  0.09083985
## x4 -0.66651260
## x5  0.93584612
## x6  0.72461144
lda.pred = predict(lda.fit, dfTest)
names(lda.pred )
## [1] "class"
"posterior" "x"
lda.class = lda.pred$class
table(lda.class, dfTest$color1)
##          
## lda.class blue  red
##      blue    0    0
##      red   933 4067
mean(lda.class == dfTest$color1) # 0.8134
## [1] 0.8134
# 3. Quadratic Discriminant Analysis
qda.fit = qda(color1 ~ ., data = dfTrain)
qda.fit
## Call:
## qda(color1 ~ ., data = dfTrain)
## 
## Prior probabilities of groups:
##   blue    red 
## 0.1972 0.8028 
## 
## Group means:
##               x1          x2          x3           x4          x5
## blue 0.030187117  0.00373233 -0.01429667  0.008993027 -0.01223305
## red  0.002206837 -0.01218605 -0.01279927 -0.010448713  0.01591678
##               x6
## blue 0.002238118
## red  0.023971066
qda.pred = predict(qda.fit, dfTest)
qda.class = qda.pred$class
table(qda.class, dfTest$color1)
##          
## qda.class blue  red
##      blue  131    0
##      red   802 4067
mean(qda.class == dfTest$color1) # 0.8396
## [1] 0.8396
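QDA's edge over LDA makes sense (added note): the group means above are nearly identical for the two classes, but blue points are confined near the origin in x1-x4, so the classes differ in spread. Only QDA, which fits a covariance matrix per class, can exploit that. A one-line check (illustrative):

# Per-class variances of x1-x4; the blue variances should be visibly smaller
by(dfTrain[, 1:4], dfTrain$color1, function(d) round(diag(var(d)), 3))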
# 4. KNN
library(class)
knn.pred3 = knn(train = dfTrain[, 1:6], test = dfTest[, 1:6],
                cl = dfTrain$color1, k = 3)
table(knn.pred3, dfTest$color1)
##          
## knn.pred3 blue  red
##      blue  596  267
##      red   337 3800
mean(knn.pred3 == dfTest$color1) # 0.8792
## [1] 0.8792
knn.pred5 = knn(train = dfTrain[, 1:6], test = dfTest[, 1:6],
                cl = dfTrain$color1, k = 5)
table(knn.pred5, dfTest$color1)
##          
## knn.pred5 blue  red
##      blue  597  206
##      red   336 3861
mean(knn.pred5 == dfTest$color1) # 0.8916
## [1] 0.8916
knn.pred7 = knn(train = dfTrain[, 1:6], test = dfTest[, 1:6],
                cl = dfTrain$color1, k = 7)
table(knn.pred7, dfTest$color1)
##          
## knn.pred7 blue  red
##      blue  593  176
##      red   340 3891
mean(knn.pred7 == dfTest$color1) # 0.8968
## [1] 0.8968
knn.pred15 = knn(train = dfTrain[, 1:6], test = dfTest[, 1:6],
                 cl = dfTrain$color1, k = 15)
table(knn.pred15, dfTest$color1)
##           
## knn.pred15 blue  red
##       blue  574  117
##       red   359 3950
mean(knn.pred15 == dfTest$color1) # 0.9048
## [1] 0.9048
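The four KNN fits above are copy-paste variants of one another; a compact alternative (an illustrative sketch, with hypothetical names) loops over candidate values of k and tabulates test accuracy:

ks = c(3, 5, 7, 15, 25, 51)
knnAcc = sapply(ks, function(k) {
  pred = knn(train = dfTrain[, 1:6], test = dfTest[, 1:6],
             cl = dfTrain$color1, k = k)
  mean(pred == dfTest$color1)
})
data.frame(k = ks, accuracy = knnAcc)  # accuracy by neighborhood size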
# 5. Tree
library(tree)
## Warning: package 'tree' was built under R version 3.1.3
tree.colors = tree(color1 ~ ., dfTrain)
summary(tree.colors)
## 
## Classification tree:
## tree(formula = color1 ~ ., data = dfTrain)
## Variables actually used in tree construction:
## [1] "x1" "x4" "x3" "x2"
## Number of terminal nodes:  11 
## Residual mean deviance:  0.5755 = 2871 / 4989 
## Misclassification error rate: 0.0974 = 487 / 5000
tree.colors
## node), split, n, deviance, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 5000 4965.00 red ( 0.19720 0.80280 )
##    2) x1 < -0.650728 873 254.50 red ( 0.03322 0.96678 ) *
##    3) x1 > -0.650728 4127 4470.00 red ( 0.23189 0.76811 )
##      6) x1 < 0.641545 3206 3829.00 red ( 0.28447 0.71553 )
##       12) x4 < -0.642267 588 254.20 red ( 0.05612 0.94388 ) *
##       13) x4 > -0.642267 2618 3341.00 red ( 0.33575 0.66425 )
##         26) x4 < 0.546652 1920 2618.00 red ( 0.42448 0.57552 )
##           52) x3 < 0.678641 1612 2234.00 red ( 0.48883 0.51117 )
##            104) x3 < -0.549429 421 358.90 red ( 0.15202 0.84798 ) *
##            105) x3 > -0.549429 1191 1595.00 blue ( 0.60789 0.39211 )
##              210) x2 < 0.774734 1063 1343.00 blue ( 0.67357 0.32643 )
##                420) x2 < -0.675639 186 167.60 red ( 0.16667 0.83333 ) *
##                421) x2 > -0.675639 877 921.80 blue ( 0.78107 0.21893 )
##                  842) x2 < 0.551288 757 691.20 blue ( 0.82959 0.17041 )
##                   1684) x2 < -0.435063 152 202.10 blue ( 0.61842 0.38158 ) *
##                   1685) x2 > -0.435063 605 437.60 blue ( 0.88264 0.11736 ) *
##                  843) x2 > 0.551288 120 166.10 red ( 0.47500 0.52500 ) *
##              211) x2 > 0.774734 128 59.85 red ( 0.06250 0.93750 ) *
##           53) x3 > 0.678641 308 183.00 red ( 0.08766 0.91234 ) *
##         27) x4 > 0.546652 698 427.80 red ( 0.09169 0.90831 ) *
##    7) x1 > 0.641545 921 359.50 red ( 0.04886 0.95114 ) *
tree.pred = predict(tree.colors, dfTest, type = "class")
table(tree.pred, dfTest$color1)
##          
## tree.pred blue  red
##      blue  574  145
##      red   359 3922
mean(tree.pred == dfTest$color1)
## [1] 0.8992
# Try pruning via cross-validation
cv.colors = cv.tree(tree.colors, FUN = prune.misclass)
names(cv.colors)
## [1] "size"
"dev"
"k"
"method"
cv.colors
## $size
## [1] 11 10  9  1
## 
## $dev
## [1] 548 548 550 986
## 
## $k
## [1]   -Inf  0.000  6.000 61.625
## 
## $method
## [1] "misclass"
## 
## attr(,"class")
## [1] "prune"         "tree.sequence"
# Now prune the tree
prune.colors = prune.misclass(tree.colors, best = 15 )
## Warning in prune.tree(tree = tree.colors, best = 15, method = "misclass"):
## best is bigger than tree size
plot(prune.colors)
text(prune.colors, pretty = 0)
# Performance on test set
tree.pred = predict (prune.colors, dfTest, type = "class")
table(tree.pred, dfTest$color1)
##          
## tree.pred blue  red
##      blue  574  145
##      red   359 3922
mean(tree.pred == dfTest$color1) # 0.8992
## [1] 0.8992
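Rather than the hard-coded best = 15, which exceeded the tree's 11 terminal nodes and triggered the warning above, the size can be read off the cross-validation results; a sketch (hypothetical names):

# Smallest size attaining the minimum CV misclassification count;
# from the cv.colors output above this is 10 ($dev is 548 at sizes 11 and 10)
bestSize = min(cv.colors$size[cv.colors$dev == min(cv.colors$dev)])
prune.colors2 = prune.misclass(tree.colors, best = bestSize)
mean(predict(prune.colors2, dfTest, type = "class") == dfTest$color1)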
# 6. Bagging
library(randomForest)
## randomForest 4.6-10
## Type rfNews() to see new features/changes/bug fixes.
bag.colors = randomForest(color1 ~ ., data = dfTrain,
                          mtry = 6, importance = TRUE) # mtry = 6 uses all predictors, i.e. bagging
yhat.bag = predict(bag.colors, newdata = dfTest)
table(yhat.bag, dfTest$color1)
##         
## yhat.bag blue  red
##     blue  676  119
##     red   257 3948
mean(yhat.bag == dfTest$color1) # 0.9248
## [1] 0.9248
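A side note (my addition): printing the fitted randomForest object reports its out-of-bag (OOB) error estimate and confusion matrix, which approximate test-set performance without touching the held-out data:

bag.colors  # the OOB error should land near the 1 - 0.9248 test error above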
# 7. Random Forest
# Use 3 of 6 predictors
bag.colorsF3 = randomForest(color1 ~ ., data = dfTrain,
mtry = 3, importance = TRUE)
yhat.bagF3 = predict(bag.colorsF3, newdata = dfTest)
table(yhat.bagF3, dfTest$color1)
##           
## yhat.bagF3 blue  red
##       blue  675  120
##       red   258 3947
mean(yhat.bagF3 == dfTest$color1)
## [1] 0.9244
sum(yhat.bagF3 == dfTest$color1)
## [1] 4622
# Use 2 of 6 predictors
bag.colorsF2 = randomForest(color1 ~ ., data = dfTrain,
mtry = 2, importance = TRUE)
yhat.bagF2 = predict(bag.colorsF2, newdata = dfTest)
table(yhat.bagF2, dfTest$color1)
##           
## yhat.bagF2 blue  red
##       blue  675  119
##       red   258 3948
mean(yhat.bagF2 == dfTest$color1) # 0.9246
## [1] 0.9246
table(yhat.bag == yhat.bagF2)
## 
## FALSE  TRUE 
##     1  4999
mean(yhat.bag == yhat.bagF2) # 0.9998
## [1] 0.9998
importance(bag.colors)
##           blue         red MeanDecreaseAccuracy MeanDecreaseGini
## x1 154.2442790 123.3634200           176.245850        336.74727
## x2 142.5286583 107.2910048           160.290965        376.21805
## x3 142.0714991 110.0119122           163.214468        354.68932
## x4 151.4785433 117.4098121           180.011101        344.86527
## x5   0.9033374   3.5341044             3.557608         89.98382
## x6  -2.3339885   0.6331717            -0.676042         79.20312
importance(bag.colorsF2)
##          blue       red MeanDecreaseAccuracy MeanDecreaseGini
## x1 119.986645 98.646841          138.8348004         350.4371
## x2 108.844194 84.839652          121.1783311         331.8474
## x3 115.647941 90.477256          127.1313051         340.0016
## x4 122.994365 95.045096          134.3153792         350.9749
## x5   3.263936  1.152052            2.7831518         107.2180
## x6  -1.215705  1.427644            0.4685702         101.0973
importance(bag.colorsF3)
##          blue         red MeanDecreaseAccuracy MeanDecreaseGini
## x1 141.119517 106.3559363          164.9998082         348.7953
## x2 130.473486 100.7578932          143.6887800         351.8899
## x3 127.689675 105.8876836          152.4531690         348.3248
## x4 140.001527 109.0224862          163.5321583         352.3392
## x5   1.601669  -1.4202147           -0.2838376          94.0987
## x6  -2.164920  -0.4052774           -1.6139238          86.1505
varImpPlot(bag.colorsF3)
# x5 and x6 show near-zero importance, consistent with the labels depending
# only on x1-x4
# 8. Boosting
library(caret) # use this library to help with set up
## Loading required package: lattice
## Loading required package: ggplot2
library(gbm)
## Loading required package: survival
## 
## Attaching package: 'survival'
## 
## The following object is masked from 'package:caret':
## 
##     cluster
## 
## Loading required package: splines
## Loading required package: parallel
## Loaded gbm 2.1
system.time(boost.colors <- train(color1 ~ ., method = "gbm", data = dfTrain,
verbose = FALSE))
## Loading required package: plyr
##    user  system elapsed 
##   55.51    0.08   56.52
ctrl <- trainControl(method = "repeatedcv", repeats = 5,
classProbs = TRUE,
summaryFunction = twoClassSummary)
grid <- expand.grid(interaction.depth = seq(3, 7, by = 2),
n.trees = seq(100, 1000, by = 50),
shrinkage = c(0.01, 0.1))
system.time(gbmTune <- train(color1 ~ ., data = dfTrain,
method = "gbm",
metric = "ROC",
tuneGrid = grid,
verbose = FALSE,
trControl = ctrl))
##    user  system elapsed 
## 1923.45    0.51 1935.69
library(ggplot2)
ggplot(gbmTune) + theme(legend.position = "top")
gbmPred = predict(gbmTune, dfTest)
gbmProbs = predict(gbmTune, dfTest, type = "prob")
confusionMatrix(gbmPred, dfTest$color1)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction blue  red
##       blue  689   92
##       red   244 3975
## 
##                Accuracy : 0.9328
##                  95% CI : (0.9255, 0.9396)
##     No Information Rate : 0.8134
##     P-Value [Acc > NIR] : < 2.2e-16
## 
##                   Kappa : 0.7638
##  Mcnemar's Test P-Value : < 2.2e-16
## 
##             Sensitivity : 0.7385
##             Specificity : 0.9774
##          Pos Pred Value : 0.8822
##          Neg Pred Value : 0.9422
##              Prevalence : 0.1866
##          Detection Rate : 0.1378
##    Detection Prevalence : 0.1562
##       Balanced Accuracy : 0.8579
## 
##        'Positive' Class : blue
mean(gbmPred == dfTest$color1)
## [1] 0.9328
sum(gbmPred == dfTest$color1)
## [1] 4664
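For comparison (an illustrative sketch, not from the original post): calling gbm directly with a single 5-fold cross-validated fit is far cheaper than caret's repeated-CV grid search, at the cost of picking the tuning values by hand (the n.trees = 500, depth = 5, shrinkage = 0.1 below are assumptions). gbm's bernoulli distribution expects a 0/1 response, so a helper column is added and removed:

dfTrain$y = as.numeric(dfTrain$color1 == "red")  # 0/1 response for gbm
gbm.direct = gbm(y ~ x1 + x2 + x3 + x4 + x5 + x6, data = dfTrain,
                 distribution = "bernoulli", n.trees = 500,
                 interaction.depth = 5, shrinkage = 0.1, cv.folds = 5)
best.iter = gbm.perf(gbm.direct, method = "cv")  # CV-chosen iteration count
gbm.probs = predict(gbm.direct, dfTest, n.trees = best.iter, type = "response")
mean(ifelse(gbm.probs > 0.5, "red", "blue") == dfTest$color1)
dfTrain$y = NULL  # remove the helper column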
Results: Percent Correct

0. Actual percent red: 80.81 (full dataset; 81.34 in the test set)
1. Logistic regression: 81.34
2. LDA: 81.34
3. QDA: 83.96
4. KNN (k = 15): 90.48 (best result for KNN)
5. Tree: 89.92
6. Bagging: 92.48
7. Random Forest: 92.46 (best result for RF)
8. Boosting: 93.28