# Clustering Examples in R
# Part 1: a "clumpy" data set where k-means and hierarchical clustering both
# succeed, followed by a non-clumpy data set (spiral + line + clump) where
# centroid-based clustering struggles.

# -- Clumpy data: six Gaussian clumps of 50 points each ----------------------
set.seed(0)
z1 <- .2 * rnorm(50)
w1 <- .2 * rnorm(50)
z2 <- 1 + .1 * rnorm(50)
w2 <- 1 + .1 * rnorm(50)
z3 <- 2 + .2 * rnorm(50)
w3 <- .2 * rnorm(50)
z4 <- 2 + .4 * rnorm(50)
w4 <- 2 + .4 * rnorm(50)
z5 <- 1 + .2 * rnorm(50)
w5 <- -1 + .2 * rnorm(50)
z6 <- .5 + .05 * rnorm(50)
w6 <- .5 + .05 * rnorm(50)
z <- c(z1, z2, z3, z4, z5, z6)
w <- c(w1, w2, w3, w4, w5, w6)
ID <- rep(1:6, each = 50)  # true clump labels, 1..6
plot(z, w, pch = ID, col = ID, cex = 2, lwd = 3)

# -- k-means on the clumpy data ----------------------------------------------
# Try it with different numbers of centroids; nstart restarts guard against
# bad local optima.
kmeans.out <- kmeans(cbind(z, w), 6, nstart = 2000)
ID2 <- kmeans.out$cluster
plot(z, w, pch = ID, col = ID2, cex = 2, lwd = 3)
table(ID, ID2)  # recovers the clumps up to a permutation of labels

# -- Hierarchical clustering on the clumpy data ------------------------------
# Try cutting the dendrogram at different heights.
d <- dist(cbind(z, w))
hclust.out <- hclust(d)
summary(hclust.out)
plot(hclust.out)
ID3 <- cutree(hclust.out, 6)
plot(z, w, pch = ID, col = ID3, cex = 2, lwd = 3)
table(ID3, ID)

# -- A non-clumpy data set: a spiral, a line segment, and a Gaussian clump ---
set.seed(0)
x1 <- seq_len(100)
y1 <- seq_len(100)
x1 <- (x1 / 100) * cos(x1 * pi / 25)  # expanding spiral
y1 <- (y1 / 100) * sin(y1 * pi / 25)
x2 <- seq_len(100) / 50               # line segment from (0, 2) to (2, 0)
y2 <- 2 - (seq_len(100) / 50)
x3 <- 2 + .2 * rnorm(100)             # one Gaussian clump at (2, 2)
y3 <- 2 + .2 * rnorm(100)
x <- c(x1, x2, x3)
y <- c(y1, y2, y3)
ID <- rep(1:3, each = 100)  # true group labels, 1..3
plot(x, y, pch = ID, col = ID, lwd = 3)

# -- k-means on the non-clumpy data ------------------------------------------
# BUG FIX: the original transcript clustered cbind(z, w) -- the FIRST data
# set -- here while plotting (x, y), so the colors were meaningless. Cluster
# the second data set's own coordinates.
kmeans.out <- kmeans(cbind(x, y), 3, nstart = 5000)
ID2 <- kmeans.out$cluster
plot(x, y, pch = ID, col = ID2, cex = 2, lwd = 3)
table(ID, ID2)  # k-means still cannot separate the spiral from the line
# (The transcript's k-means comparison table continued here; the page-number
# artifacts and printed output are omitted from this script form.)

# -- Hierarchical clustering on the non-clumpy data --------------------------
d <- dist(cbind(x, y))
hclust.out <- hclust(d)
summary(hclust.out)
plot(hclust.out)
ID3 <- cutree(hclust.out, 3)
plot(x, y, pch = ID, col = ID3, cex = 2, lwd = 3)
table(ID3, ID)  # hclust also splits/merges the spiral and the line

# -- Spectral clustering for the second example, from scratch ----------------
# (This could be handled with specc() in the kernlab package, but for
# illustration purposes we build it here.)
#
# Gaussian (RBF) similarity: S[i, j] = exp(-((x_i - x_j)^2 + (y_i - y_j)^2) / c)
# with bandwidth c = .6.
# NOTE: the original named the bandwidth `c`, shadowing base::c() -- renamed.
bandwidth <- .6

# Vectorized replacement for the original 300 x 300 double loop; outer()
# reproduces the loop's arithmetic exactly.
S <- exp(-(outer(x, x, "-")^2 + outer(y, y, "-")^2) / bandwidth)
S[1:5, 1:5]

# Degree vector g and diagonal degree matrix G (replaces the fill loop).
g <- rowSums(S)
G <- diag(g)
G[1:5, 1:5]

# Unnormalized graph Laplacian.
L <- G - S
L[1:5, 1:5]

# Eigendecomposition of L; the smallest eigenvalues/eigenvectors carry the
# cluster structure.
spectral.out <- eigen(L)
# -- Cluster in the spectral embedding ---------------------------------------
# Scree plot of the Laplacian eigenvalues; eigen() returns them in decreasing
# order, so the informative small ones sit at the right-hand end (the very
# last is ~0, belonging to the trivial constant eigenvector).
plot(seq_len(300), spectral.out$values)
spectral.out$values[296:300]

# 3-vector representation of the 300 cases used for clustering: the
# eigenvectors for the three smallest non-trivial eigenvalues.
spec <- spectral.out$vectors[, 297:299]

# k-means version: run k-means in the embedded space.
kmeans.out <- kmeans(spec, 3, nstart = 5000)
ID4 <- kmeans.out$cluster
plot(x, y, pch = ID, col = ID4, cex = 2, lwd = 3)
table(ID, ID4)

# Hierarchical version: single linkage in the embedded space.
d <- dist(spec)
hclust.out <- hclust(d, method = "single")
summary(hclust.out)
plot(hclust.out)
ID5 <- cutree(hclust.out, 3)
plot(x, y, pch = ID, col = ID5, cex = 2, lwd = 3)
table(ID5, ID)  # recovers all three groups exactly in the transcript's run