# Ch-10-Lab-2.R  rstudio-user  2021-12-13
# ISLR Chapter 10, Lab 2: K-means and hierarchical clustering on
# simulated data. The kmeans() function performs K-means clustering.

## K-means clustering ----

# Simulate 50 two-dimensional observations in which the first 25 have a
# shifted mean, so the data contain two true clusters.
set.seed(2)
x <- matrix(rnorm(50 * 2), ncol = 2)
# NOTE(fix): the original script evaluated `x[1:25, 1] + 3` without
# assigning the result, so the mean shift was never applied to x.
# The shifted values must be stored back into the matrix.
x[1:25, 1] <- x[1:25, 1] + 3
x[1:25, 2] <- x[1:25, 2] + 4

# K = 2 with 20 random starts; the cluster assignments are in $cluster.
km.out <- kmeans(x, 2, nstart = 20)
km.out$cluster
# K-means has divided the observations into two well-defined groups even
# though no group information was supplied to kmeans().

plot(x, col = (km.out$cluster + 1),
     main = "K-Means Clustering Results with K = 2",
     xlab = "", ylab = "", pch = 20, cex = 2)
# The data can be plotted relatively easily because they are
# two-dimensional; with more variables we could instead plot the first
# two principal components from a PCA.

# The data do not come with predetermined clusters, so also try K = 3.
set.seed(4)
km.out <- kmeans(x, 3, nstart = 20)
km.out

plot(x, col = (km.out$cluster + 1),
     main = "K-Means Clustering Results with K = 3",
     xlab = "", ylab = "", pch = 20, cex = 2)

# Compare nstart = 1 with nstart = 20: multiple random starts give a
# lower (better) total within-cluster sum of squares, so nstart >= 20
# is recommended in practice.
set.seed(3)
km.out <- kmeans(x, 3, nstart = 1)
km.out$tot.withinss
km.out <- kmeans(x, 3, nstart = 20)
km.out$tot.withinss

## Hierarchical clustering ----

# Cluster the observations using complete linkage on Euclidean distance.
hc.complete <- hclust(dist(x), method = "complete")
# Alternatively, perform clustering with average or single linkage.
hc.average <- hclust(dist(x), method = "average")
hc.single <- hclust(dist(x), method = "single")

# Plot the dendrograms side by side, restoring par() settings afterwards.
old_par <- par(mfrow = c(2, 3))
plot(hc.complete, main = "Complete Linkage", xlab = "", sub = "", cex = .9)
plot(hc.average, main = "Average", xlab = "", sub = "", cex = .9)
plot(hc.single, main = "Single", xlab = "", sub = "", cex = .9)
par(old_par)

# Determine the cluster label for each observation associated with a
# given cut of the dendrogram, using cutree().
cutree(hc.complete, 2)
cutree(hc.average, 2)
cutree(hc.single, 2)
# Complete and average linkage separate the observations into sensible
# groups; single linkage tends to isolate single points, and more
# sensible answers come from increasing the number of clusters.
cutree(hc.single, 4)

# Scale the variables before performing hierarchical clustering.
xsc <- scale(x)
plot(hclust(dist(xsc), method = "complete"),
     main = "Hierarchical Clustering with Scaled Features")

# Cluster a three-dimensional data set using correlation-based distance
# (correlation-based distance only makes sense with >= 3 features).
x <- matrix(rnorm(30 * 3), ncol = 3)
dd <- as.dist(1 - cor(t(x)))
plot(hclust(dd, method = "complete"),
     main = "Complete Linkage with Correlation-Based Distance",
     xlab = "", sub = "")