Uploaded by Justin Bradley

R Script: K-Means & Hierarchical Clustering

advertisement
Ch-10-Lab-2.R
rstudio-user
2021-12-13
# kmeans() performs K-means clustering
# Simulate 50 observations on two features, then shift the first feature of
# the first 25 observations so that the data contain two groups
set.seed(2)
x <- matrix(rnorm(50 * 2), ncol = 2)
# Store the shift back into x: the bare expression `x[1:25, 1] + 3` only
# printed the shifted values and left x itself unchanged
x[1:25, 1] <- x[1:25, 1] + 3
# Print the shifted column so the same values are still displayed
# NOTE(review): clustering output recorded further down this file may predate
# this assignment, so re-running can give different results - confirm
x[1:25, 1]
##  [1] 2.1030855 3.1848492 4.5878453 1.8696243 2.9197482 3.1324203 3.7079547
##  [8] 2.7603020 4.9844739 2.8612130 3.4176508 3.9817528 2.6073046 1.9603310
## [15] 4.7822290 0.6889309 3.8786046 3.0358067 4.0128287 3.4322652 5.0908192
## [22] 1.8000742 4.5896382 4.9546516 3.0049378
# Shift the second feature of the first 25 observations by 4 and store the
# result: the bare expression `x[1:25, 2] + 4` only printed the shifted values
# and left x itself unchanged
x[1:25, 2] <- x[1:25, 2] + 4
# Print the shifted column so the same values are still displayed
x[1:25, 2]
##  [1] 3.161713 6.066301 3.437753 5.275716 2.952427 2.034122 3.677029 4.935863
##  [9] 5.139230 5.671619 2.211758 6.031243 3.296856 4.158165 4.506235 3.180005
## [17] 2.001153 3.520707 4.084180 3.104513 3.078724 4.330450 3.858339 4.434848
## [25] 3.946277
# Run K-means with K = 2, using 20 random starts (nstart = 20)
km.out <- kmeans(x, centers = 2, nstart = 20)
# The cluster label of every observation is stored in the "cluster" component
km.out[["cluster"]]
##  [1] 1 2 1 2 1 1 1 2 2 2 1 2 1 2 2 1 1 1 1 1 1 2 1 2 1 1 2 2 2 1 2 1 1 1 1 2 1 1
## [39] 1 2 2 2 1 1 1 1 2 1 1 1
# K-means clustering has divided the observations into two defined groups
# even though no group information was provided
# Scatter plot of the observations, coloured by their assigned cluster
# (adding 1 to the label picks the colour)
plot(
  x,
  col = km.out$cluster + 1,
  main = "K-Means Clustering Results with K =2",
  xlab = "",
  ylab = "",
  pch = 20,
  cex = 2
)
# The data can be plotted relatively easily because it is two-dimensional
# If there were more variables to consider, a PCA could be performed instead
# The data do not predetermine the number of clusters, so we could instead
# perform K-means clustering with K = 3
# Re-seed the random number generator, then fit K-means with K = 3 using
# 20 random starts
set.seed(4)
km.out <- kmeans(x, centers = 3, nstart = 20)
# Auto-print the fitted object to show its summary
km.out
## K-means clustering with 3 clusters of sizes 24, 11, 15
##
## Cluster means:
##         [,1]       [,2]
## 1 -0.4418111 -0.9784516
## 2  1.6073703 -0.2662361
## 3 -0.2413807  1.3256482
##
## Clustering vector:
##  [1] 1 3 2 3 1 1 2 3 2 3 1 3 1 1 2 1 1 1 2 1 2 3 2 2 1 1 3 3 3 1 3 1 2 1 1 3 1 1
## [39] 1 3 3 3 1 2 1 2 3 1 1 1
##
## Within cluster sum of squares by cluster:
## [1] 27.386724 8.167257 13.702775
## (between_SS / total_SS = 62.8 %)
##
## Available components:
##
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"
# Scatter plot of the observations, coloured by the K = 3 cluster labels
plot(
  x,
  col = km.out$cluster + 1,
  main = "K-Means Clustering Results With K=3",
  xlab = "",
  ylab = "",
  pch = 20,
  cex = 2
)
# Compare the total within-cluster sum of squares obtained with a single
# random start (nstart = 1) against 20 random starts (nstart = 20)
set.seed(3)
km.out <- kmeans(x, centers = 3, nstart = 1)
km.out[["tot.withinss"]]
## [1] 53.2604
km.out <- kmeans(x, centers = 3, nstart = 20)
km.out[["tot.withinss"]]
## [1] 49.25676
# Hierarchical clustering of the observations in x, using the distance matrix
# from dist(x) with complete, average and single linkage.
# The distance matrix is computed once and shared by all three fits.
x_dist <- dist(x)
hc.complete <- hclust(x_dist, method = "complete")
hc.average <- hclust(x_dist, method = "average")
hc.single <- hclust(x_dist, method = "single")
# Plot the three dendrograms side by side. par() returns the previous
# settings, which are restored afterwards so later plots are not squeezed
# into leftover cells of this grid.
old_par <- par(mfrow = c(1, 3))
plot(hc.complete, main = "Complete Linkage", xlab = "", sub = "", cex = 0.9)
plot(hc.average, main = "Average", xlab = "", sub = "", cex = 0.9)
plot(hc.single, main = "Single", xlab = "", sub = "", cex = 0.9)
par(old_par)
# Determine the cluster labels for each observation associated with a given
# cut of the dendrogram
# Use the cutree function
# Labels from cutting the complete-linkage dendrogram into two clusters
cutree(hc.complete, k = 2)
##  [1] 1 1 1 2 1 1 1 1 1 1 1 1 1 2 1 2 1 1 1 1 1 2 1 1 1 2 1 1 1 1 1 1 1 1 1 1 2 2
## [39] 1 1 1 2 1 1 1 1 1 1 1 2
# Labels from cutting the average-linkage dendrogram into two clusters
cutree(hc.average, k = 2)
##  [1] 1 2 1 2 1 1 1 1 1 2 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 2 1 2 1 1 1 1 2 1 1
## [39] 1 1 2 2 1 1 1 1 2 1 1 1
# Labels from cutting the single-linkage dendrogram into two clusters
cutree(hc.single, k = 2)
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2
1 1 1
## [39] 1 1 1 1 1 1 1 1 1 1 1 1
# Complete and average linkage separate the observations into the correct
# groups, whereas single linkage places one observation in its own cluster
# A more sensible answer may come from increasing the number of clusters
# Labels from cutting the single-linkage dendrogram into four clusters
cutree(hc.single, k = 4)
##  [1] 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 1 1 1
## [39] 1 1 1 4 1 1 1 1 1 1 1 1
# Scale the variables before performing hierarchical clustering
# (scale() centres each column and divides it by its standard deviation)
xsc <- scale(x)
# Complete-linkage dendrogram of the scaled data; the title is written as a
# single-line string with the spelling corrected
plot(
  hclust(dist(xsc), method = "complete"),
  main = "Hierarchical Clustering with Scaled Features"
)
# Cluster a three-dimensional data set using correlation-based distance
# Simulate 30 observations on 3 features (this overwrites the earlier x)
x <- matrix(rnorm(30 * 3), ncol = 3)
# cor() correlates columns, so x is transposed to correlate observations;
# 1 - correlation is then converted into a "dist" object for hclust()
dd <- as.dist(1 - cor(t(x)))
# Complete-linkage dendrogram; the title is written as a single-line string
# so that it is no longer split in the middle of a word
plot(
  hclust(dd, method = "complete"),
  main = "Complete Linkage with Correlation-Based Distance",
  xlab = "",
  sub = ""
)
Download