Data Mining and Classification with R Examples
Ira Sharenow
Wednesday, August 19, 2015

Points whose first four coordinates fall inside the sphere x1^2 + x2^2 + x3^2 + x4^2 <= 0.8 are labeled "blue"; all other points are labeled "red". Although each point has six coordinates, the labeling criterion uses only the first four, and the resulting decision region is clearly non-linear. The points are then slightly perturbed so that the labeling is no longer perfect; of the six coordinates, only the first five are jittered. A sample of half of the points is taken as a training set, and each classifier is then evaluated on the hold-out test set. Since the boundary is highly non-linear, I expected the first few methods to perform poorly, and they did. Bagging, random forests, and boosting all produced excellent results; however, boosting is extremely slow.

0. Actual percent red
1. Logistic regression
2. LDA
3. QDA
4. KNN
5. Tree
6. Bagging
7. Random Forest
8. Boosting
9. Support Vector Machines
10. Table of results

set.seed(20150819)
sampleSize = 10000

df = data.frame(x1 = runif(n = sampleSize, min = -1, max = 1),
                x2 = runif(n = sampleSize, min = -1, max = 1),
                x3 = runif(n = sampleSize, min = -1, max = 1),
                x4 = runif(n = sampleSize, min = -1, max = 1),
                x5 = runif(n = sampleSize, min = -1, max = 1),
                x6 = runif(n = sampleSize, min = -1, max = 1))

# Function for labeling: only the first four coordinates determine the color
color = function(x1, x2, x3, x4, x5, x6)
  if (x1^2 + x2^2 + x3^2 + x4^2 <= 0.8) return("blue") else return("red")

head(df)

##            x1         x2           x3          x4         x5          x6
## 1 -0.06276411 -0.8483214  0.347916695  0.02709725 -0.3690519  0.02848432
## 2  0.79737745  0.1749736  0.506957737  0.39603261  0.1119455 -0.71287355
## 3 -0.28757810 -0.4891981  0.923341176 -0.71710376  0.3040652 -0.82590296
## 4  0.82660601  0.2245092  0.002798933  0.69601031  0.5024880  0.96397908
## 5  0.43177409  0.5581343 -0.611626756 -0.11168633  0.2727495  0.95727541
## 6 -0.36671109  0.9372195 -0.341857872 -0.66116307  0.5929235  0.98582181

df$color1 = mapply(color, df$x1, df$x2, df$x3, df$x4, df$x5, df$x6)
df$color1 = factor(df$color1)
contrasts(df$color1)   # blue is 0, red is 1

##      red
## blue   0
## red    1

head(df)

##            x1         x2           x3          x4         x5          x6
## 1 -0.06276411 -0.8483214  0.347916695  0.02709725 -0.3690519  0.02848432
## 2  0.79737745  0.1749736  0.506957737  0.39603261  0.1119455 -0.71287355
## 3 -0.28757810 -0.4891981  0.923341176 -0.71710376  0.3040652 -0.82590296
## 4  0.82660601  0.2245092  0.002798933  0.69601031  0.5024880  0.96397908
## 5  0.43177409  0.5581343 -0.611626756 -0.11168633  0.2727495  0.95727541
## 6 -0.36671109  0.9372195 -0.341857872 -0.66116307  0.5929235  0.98582181
##   color1
## 1    red
## 2    red
## 3    red
## 4    red
## 5    red
## 6    red

mean(df$color1 == "blue")

## [1] 0.1919

# Perturb co-ordinates 1-5 with Gaussian noise (sd = 0.1)
df$x1 = df$x1 + rnorm(sampleSize, 0, 0.1)
df$x2 = df$x2 + rnorm(sampleSize, 0, 0.1)
df$x3 = df$x3 + rnorm(sampleSize, 0, 0.1)
df$x4 = df$x4 + rnorm(sampleSize, 0, 0.1)
df$x5 = df$x5 + rnorm(sampleSize, 0, 0.1)

# Divide points into training and test sets
train = sample(1:sampleSize, sampleSize/2)
dfTrain = df[train, ]
dim(dfTrain)

## [1] 5000    7

dfTest = df[-c(train), ]
dim(dfTest)

## [1] 5000    7

# Set up is now completed
# Now start to perform the tests
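Every method below is scored the same way: a confusion table of predicted versus actual colors on the hold-out set, followed by the overall proportion correct. A small helper along the following lines could remove that repetition; assess is a hypothetical name and is not part of the original analysis.

# Hypothetical helper (not in the original analysis): confusion table and
# overall accuracy of a vector of predicted classes on the hold-out test set.
assess = function(pred, actual = dfTest$color1) {
  print(table(pred, actual))
  mean(pred == actual)
}

For example, assess(glm.pred) would reproduce the table and accuracy reported for logistic regression below.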
# 1. Logistic Regression
glm.fit = glm(color1 ~ ., data = dfTrain, family = binomial)
summary(glm.fit)

## Call:
## glm(formula = color1 ~ ., family = binomial, data = dfTrain)
##
## Deviance Residuals:
##     Min      1Q  Median      3Q     Max
## -1.9033  0.6213  0.6528  0.6756  0.7553
##
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)
## (Intercept)  1.404084   0.035614  39.426   <2e-16 ***
## x1          -0.080827   0.060590  -1.334    0.182
## x2          -0.043699   0.060668  -0.720    0.471
## x3           0.008068   0.060706   0.133    0.894
## x4          -0.058953   0.060893  -0.968    0.333
## x5           0.082850   0.060944   1.359    0.174
## x6           0.064199   0.061538   1.043    0.297
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
##     Null deviance: 4965.0  on 4999  degrees of freedom
## Residual deviance: 4958.8  on 4993  degrees of freedom
## AIC: 4972.8
##
## Number of Fisher Scoring iterations: 4

# help(predict)
glm.probs = predict(glm.fit, newdata = dfTest, type = "response")
glm.probs[1:10]

##         1         4         6         9        11        12        13
## 0.8053388 0.8050761 0.8259650 0.8059137 0.8093152 0.8082293 0.8039616
##        16        17        18
## 0.8077144 0.8256474 0.8096335

glm.pred = rep("red", sampleSize/2)
glm.pred[glm.probs < .5] = "blue"
table(glm.pred, dfTest$color1)

##
## glm.pred blue  red
##      red  933 4067

# Note: this compares the 5000 test-set predictions against the full data
# frame (df$color1, 10000 values); dfTest$color1 is the intended reference,
# which would give 4067/5000 = 0.8134.
mean(glm.pred == df$color1) # 0.8081

## [1] 0.8081

# 2. Linear Discriminant Analysis
library(MASS)
lda.fit = lda(color1 ~ ., data = dfTrain)
lda.fit

## Call:
## lda(color1 ~ ., data = dfTrain)
##
## Prior probabilities of groups:
##   blue    red
## 0.1972 0.8028
##
## Group means:
##                x1          x2          x3           x4          x5
## blue  0.030187117  0.00373233 -0.01429667  0.008993027 -0.01223305
## red   0.002206837 -0.01218605 -0.01279927 -0.010448713  0.01591678
##               x6
## blue 0.002238118
## red  0.023971066
##
## Coefficients of linear discriminants:
##            LD1
## x1 -0.91223082
## x2 -0.49229573
## x3  0.09083985
## x4 -0.66651260
## x5  0.93584612
## x6  0.72461144

lda.pred = predict(lda.fit, dfTest)
names(lda.pred)

## [1] "class"     "posterior" "x"

lda.class = lda.pred$class
table(lda.class, dfTest$color1)

##
## lda.class blue  red
##      blue    0    0
##      red   933 4067

mean(lda.class == dfTest$color1) # 0.8134

## [1] 0.8134

# 3. Quadratic Discriminant Analysis
qda.fit = qda(color1 ~ ., data = dfTrain)
qda.fit

## Call:
## qda(color1 ~ ., data = dfTrain)
##
## Prior probabilities of groups:
##   blue    red
## 0.1972 0.8028
##
## Group means:
##                x1          x2          x3           x4          x5
## blue  0.030187117  0.00373233 -0.01429667  0.008993027 -0.01223305
## red   0.002206837 -0.01218605 -0.01279927 -0.010448713  0.01591678
##               x6
## blue 0.002238118
## red  0.023971066

qda.pred = predict(qda.fit, dfTest)
qda.class = qda.pred$class
table(qda.class, dfTest$color1)

##
## qda.class blue  red
##      blue  131    0
##      red   802 4067

mean(qda.class == dfTest$color1) # 0.8396

## [1] 0.8396

# 4. KNN
library(class)
knn.pred3 = knn(train = dfTrain[, 1:6], test = dfTest[, 1:6], cl = dfTrain$color1, k = 3)
table(knn.pred3, dfTest$color1)

##
## knn.pred3 blue  red
##      blue  596  267
##      red   337 3800

mean(knn.pred3 == dfTest$color1) # 0.8792

## [1] 0.8792

knn.pred5 = knn(train = dfTrain[, 1:6], test = dfTest[, 1:6], cl = dfTrain$color1, k = 5)
table(knn.pred5, dfTest$color1)

##
## knn.pred5 blue  red
##      blue  597  206
##      red   336 3861

mean(knn.pred5 == dfTest$color1) # 0.8916

## [1] 0.8916

knn.pred7 = knn(train = dfTrain[, 1:6], test = dfTest[, 1:6], cl = dfTrain$color1, k = 7)
table(knn.pred7, dfTest$color1)

##
## knn.pred7 blue  red
##      blue  593  176
##      red   340 3891

mean(knn.pred7 == dfTest$color1) # 0.8968

## [1] 0.8968

knn.pred15 = knn(train = dfTrain[, 1:6], test = dfTest[, 1:6], cl = dfTrain$color1, k = 15)
table(knn.pred15, dfTest$color1)

##
## knn.pred15 blue  red
##       blue  574  117
##       red   359 3950

mean(knn.pred15 == dfTest$color1) # 0.9048

## [1] 0.9048
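The four values of k above were fit one at a time; a short loop over a grid of k values (a sketch, not part of the original analysis) performs the same comparison more compactly. Note that, as in the original, k is being chosen on the test set itself; a separate validation split would be cleaner.

# Sketch (assumption, not in the original analysis): evaluate a grid of k values.
kGrid = c(3, 5, 7, 15, 25, 51)
knnAcc = sapply(kGrid, function(k) {
  pred = class::knn(train = dfTrain[, 1:6], test = dfTest[, 1:6],
                    cl = dfTrain$color1, k = k)
  mean(pred == dfTest$color1)     # hold-out accuracy for this k
})
names(knnAcc) = paste0("k = ", kGrid)
knnAcc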
# 5. Tree
library(tree)

## Warning: package 'tree' was built under R version 3.1.3

tree.colors = tree(color1 ~ ., dfTrain)
summary(tree.colors)

## Classification tree:
## tree(formula = color1 ~ ., data = dfTrain)
## Variables actually used in tree construction:
## [1] "x1" "x4" "x3" "x2"
## Number of terminal nodes:  11
## Residual mean deviance:  0.5755 = 2871 / 4989
## Misclassification error rate: 0.0974 = 487 / 5000

tree.colors

## node), split, n, deviance, yval, (yprob)
##       * denotes terminal node
##
##  1) root 5000 4965.00 red ( 0.19720 0.80280 )
##    2) x1 < -0.650728 873 254.50 red ( 0.03322 0.96678 ) *
##    3) x1 > -0.650728 4127 4470.00 red ( 0.23189 0.76811 )
##      6) x1 < 0.641545 3206 3829.00 red ( 0.28447 0.71553 )
##       12) x4 < -0.642267 588 254.20 red ( 0.05612 0.94388 ) *
##       13) x4 > -0.642267 2618 3341.00 red ( 0.33575 0.66425 )
##         26) x4 < 0.546652 1920 2618.00 red ( 0.42448 0.57552 )
##           52) x3 < 0.678641 1612 2234.00 red ( 0.48883 0.51117 )
##            104) x3 < -0.549429 421 358.90 red ( 0.15202 0.84798 ) *
##            105) x3 > -0.549429 1191 1595.00 blue ( 0.60789 0.39211 )
##              210) x2 < 0.774734 1063 1343.00 blue ( 0.67357 0.32643 )
##                420) x2 < -0.675639 186 167.60 red ( 0.16667 0.83333 ) *
##                421) x2 > -0.675639 877 921.80 blue ( 0.78107 0.21893 )
##                  842) x2 < 0.551288 757 691.20 blue ( 0.82959 0.17041 )
##                   1684) x2 < -0.435063 152 202.10 blue ( 0.61842 0.38158 ) *
##                   1685) x2 > -0.435063 605 437.60 blue ( 0.88264 0.11736 ) *
##                  843) x2 > 0.551288 120 166.10 red ( 0.47500 0.52500 ) *
##              211) x2 > 0.774734 128 59.85 red ( 0.06250 0.93750 ) *
##           53) x3 > 0.678641 308 183.00 red ( 0.08766 0.91234 ) *
##         27) x4 > 0.546652 698 427.80 red ( 0.09169 0.90831 ) *
##      7) x1 > 0.641545 921 359.50 red ( 0.04886 0.95114 ) *

tree.pred = predict(tree.colors, dfTest, type = "class")
table(tree.pred, dfTest$color1)

##
## tree.pred blue  red
##      blue  574  145
##      red   359 3922

mean(tree.pred == dfTest$color1)

## [1] 0.8992

# Try pruning via cross-validation
cv.colors = cv.tree(tree.colors, FUN = prune.misclass)
names(cv.colors)

## [1] "size"   "dev"    "k"      "method"

cv.colors

## $size
## [1] 11 10  9  1
##
## $dev
## [1] 548 548 550 986
##
## $k
## [1]   -Inf  0.000  6.000 61.625
##
## $method
## [1] "misclass"
##
## attr(,"class")
## [1] "prune"         "tree.sequence"

# Now prune the tree
# Note: the cross-validation results above favor a tree of size 10 or 11;
# since best = 15 exceeds the size of the fitted tree, prune.misclass warns
# and returns the unpruned tree, so the test results below are unchanged.
prune.colors = prune.misclass(tree.colors, best = 15)

## Warning in prune.tree(tree = tree.colors, best = 15, method = "misclass"):
## best is bigger than tree size

plot(prune.colors)
text(prune.colors, pretty = 0)

# Performance on test set
tree.pred = predict(prune.colors, dfTest, type = "class")
table(tree.pred, dfTest$color1)

##
## tree.pred blue  red
##      blue  574  145
##      red   359 3922

mean(tree.pred == dfTest$color1) # 0.8992

## [1] 0.8992

# 6. Bagging
library(randomForest)

## randomForest 4.6-10
## Type rfNews() to see new features/changes/bug fixes.

# Bagging is a random forest that considers all six predictors at each split (mtry = 6)
bag.colors = randomForest(color1 ~ ., data = dfTrain, mtry = 6, importance = TRUE)
yhat.bag = predict(bag.colors, newdata = dfTest)
table(yhat.bag, dfTest$color1)

##
## yhat.bag blue  red
##     blue  676  119
##     red   257 3948

mean(yhat.bag == dfTest$color1) # 0.9248

## [1] 0.9248
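Bagging and the random forests in the next section differ only in mtry, the number of predictors considered at each split. A sketch of how the candidate values could be compared using the out-of-bag error stored in each fitted model, without touching the hold-out test set (an assumption on my part, not part of the original analysis):

# Sketch (assumption, not in the original analysis): compare mtry values by
# their out-of-bag error rate, leaving the hold-out test set untouched.
library(randomForest)
oobErr = sapply(c(2, 3, 6), function(m) {
  fit = randomForest(color1 ~ ., data = dfTrain, mtry = m)
  tail(fit$err.rate[, "OOB"], 1)   # OOB error rate after the final tree
})
names(oobErr) = paste0("mtry = ", c(2, 3, 6))
oobErr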
# 7. Random Forest
# Use 3 of 6 predictors
bag.colorsF3 = randomForest(color1 ~ ., data = dfTrain, mtry = 3, importance = TRUE)
# Note: the next line predicts from bag.colors (the mtry = 6 bagging model);
# bag.colorsF3 appears to be the intended model, so the results below largely
# repeat the bagging predictions rather than those of the mtry = 3 forest.
yhat.bagF3 = predict(bag.colors, newdata = dfTest)
table(yhat.bagF3, dfTest$color1)

##
## yhat.bagF3 blue  red
##       blue  675  120
##       red   258 3947

mean(yhat.bagF3 == dfTest$color1)

## [1] 0.9244

sum(yhat.bagF3 == dfTest$color1)

## [1] 4622

# Use 2 of 6 predictors
bag.colorsF2 = randomForest(color1 ~ ., data = dfTrain, mtry = 2, importance = TRUE)
# Note: as above, this predicts from bag.colors rather than bag.colorsF2.
yhat.bagF2 = predict(bag.colors, newdata = dfTest)
table(yhat.bagF2, dfTest$color1)

##
## yhat.bagF2 blue  red
##       blue  675  119
##       red   258 3948

mean(yhat.bagF2 == dfTest$color1) # 0.9246

## [1] 0.9246

table(yhat.bag == yhat.bagF2)

##
## FALSE  TRUE
##     1  4999

mean(yhat.bag == yhat.bagF2) # 0.9998

## [1] 0.9998

importance(bag.colors)

##           blue         red MeanDecreaseAccuracy MeanDecreaseGini
## x1 154.2442790 123.3634200           176.245850        336.74727
## x2 142.5286583 107.2910048           160.290965        376.21805
## x3 142.0714991 110.0119122           163.214468        354.68932
## x4 151.4785433 117.4098121           180.011101        344.86527
## x5   0.9033374   3.5341044             3.557608         89.98382
## x6  -2.3339885   0.6331717            -0.676042         79.20312

importance(bag.colorsF2)

##          blue       red MeanDecreaseAccuracy MeanDecreaseGini
## x1 119.986645 98.646841          138.8348004         350.4371
## x2 108.844194 84.839652          121.1783311         331.8474
## x3 115.647941 90.477256          127.1313051         340.0016
## x4 122.994365 95.045096          134.3153792         350.9749
## x5   3.263936  1.152052            2.7831518         107.2180
## x6  -1.215705  1.427644            0.4685702         101.0973

importance(bag.colorsF3)

##          blue          red MeanDecreaseAccuracy MeanDecreaseGini
## x1 141.119517  106.3559363          164.9998082         348.7953
## x2 130.473486  100.7578932          143.6887800         351.8899
## x3 127.689675  105.8876836          152.4531690         348.3248
## x4 140.001527  109.0224862          163.5321583         352.3392
## x5   1.601669   -1.4202147           -0.2838376          94.0987
## x6  -2.164920   -0.4052774           -1.6139238          86.1505

varImpPlot(bag.colorsF3)

# 8. Boosting
library(caret) # use this library to help with set up

## Loading required package: lattice
## Loading required package: ggplot2

library(gbm)

## Loading required package: survival
##
## Attaching package: 'survival'
##
## The following object is masked from 'package:caret':
##
##     cluster
##
## Loading required package: splines
## Loading required package: parallel
## Loaded gbm 2.1

system.time(boost.colors <- train(color1 ~ ., method = "gbm", data = dfTrain, verbose = FALSE))

## Loading required package: plyr

##    user  system elapsed
##   55.51    0.08   56.52

ctrl <- trainControl(method = "repeatedcv", repeats = 5, classProbs = TRUE,
                     summaryFunction = twoClassSummary)
grid <- expand.grid(interaction.depth = seq(3, 7, by = 2),
                    n.trees = seq(100, 1000, by = 50),
                    shrinkage = c(0.01, 0.1))
system.time(gbmTune <- train(color1 ~ ., data = dfTrain, method = "gbm",
                             metric = "ROC", tuneGrid = grid, verbose = FALSE,
                             trControl = ctrl))

##     user  system elapsed
##  1923.45    0.51 1935.69

library(ggplot2)
ggplot(gbmTune) + theme(legend.position = "top")

gbmPred = predict(gbmTune, dfTest)
gbmProbs = predict(gbmTune, dfTest, type = "prob")
confusionMatrix(gbmPred, dfTest$color1)

## Confusion Matrix and Statistics
##
##           Reference
## Prediction blue  red
##       blue  689   92
##       red   244 3975
##
##                Accuracy : 0.9328
##                  95% CI : (0.9255, 0.9396)
##     No Information Rate : 0.8134
##     P-Value [Acc > NIR] : < 2.2e-16
##
##                   Kappa : 0.7638
##  Mcnemar's Test P-Value : < 2.2e-16
##
##             Sensitivity : 0.7385
##             Specificity : 0.9774
##          Pos Pred Value : 0.8822
##          Neg Pred Value : 0.9422
##              Prevalence : 0.1866
##          Detection Rate : 0.1378
##    Detection Prevalence : 0.1562
##       Balanced Accuracy : 0.8579
##
##        'Positive' Class : blue

mean(gbmPred == dfTest$color1)

## [1] 0.9328

sum(gbmPred == dfTest$color1)

## [1] 4664
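The table of contents lists Support Vector Machines as a ninth method, but no SVM code appears above. A minimal sketch of how one could be fit, using the e1071 package with a radial kernel (my assumption, not the author's code; a radial kernel suits a spherical boundary), is as follows. Cost and gamma would normally be tuned, for example with e1071's tune.svm, before reporting a final result.

# Sketch (assumption, not in the original analysis): radial-kernel SVM via e1071.
library(e1071)
svm.fit = svm(color1 ~ ., data = dfTrain, kernel = "radial", cost = 1)
svm.pred = predict(svm.fit, dfTest)
table(svm.pred, dfTest$color1)    # confusion table on the hold-out set
mean(svm.pred == dfTest$color1)   # overall accuracy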
Results: Percent Correct

0. Actual color is red: 80.81 (proportion of red points in the full data set)
1. Logistic Regression: 80.81
2. LDA: 81.34
3. QDA: 83.96
4. KNN (K = 15): 90.48 (best result for KNN)
5. Tree: 89.92
6. Bagging: 92.48
7. Random Forest: 92.46 (best result for RF)
8. Boosting: 93.28

(The logistic regression figure reflects the comparison against df$color1 noted above; measured against dfTest$color1 it would be 81.34, the same as LDA, since both models predict every test point as red.)
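For completeness, the accuracies above could also be assembled programmatically from the objects created earlier (a sketch, not part of the original analysis; the logistic regression entry here uses dfTest$color1):

# Sketch (assumption, not in the original analysis): collect the hold-out
# accuracies of the fitted models into one data frame, sorted best to worst.
results = data.frame(
  method = c("Logistic regression", "LDA", "QDA", "KNN (k = 15)", "Tree",
             "Bagging", "Random forest", "Boosting"),
  accuracy = c(mean(glm.pred == dfTest$color1),
               mean(lda.class == dfTest$color1),
               mean(qda.class == dfTest$color1),
               mean(knn.pred15 == dfTest$color1),
               mean(tree.pred == dfTest$color1),
               mean(yhat.bag == dfTest$color1),
               mean(yhat.bagF2 == dfTest$color1),
               mean(gbmPred == dfTest$color1)))
results[order(-results$accuracy), ]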