Clustering Examples in R
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
# Clustering Examples
# First a data set with "clumps": six Gaussian clusters of 50 points each.
# The rnorm() calls are made in the same order as before so that, for a
# fixed seed, the simulated data are identical.
set.seed(0)
z1 <- 0.2 * rnorm(50)
w1 <- 0.2 * rnorm(50)
z2 <- 1 + 0.1 * rnorm(50)
w2 <- 1 + 0.1 * rnorm(50)
z3 <- 2 + 0.2 * rnorm(50)
w3 <- 0.2 * rnorm(50)
z4 <- 2 + 0.4 * rnorm(50)
w4 <- 2 + 0.4 * rnorm(50)
z5 <- 1 + 0.2 * rnorm(50)
w5 <- -1 + 0.2 * rnorm(50)
z6 <- 0.5 + 0.05 * rnorm(50)
w6 <- 0.5 + 0.05 * rnorm(50)
z <- c(z1, z2, z3, z4, z5, z6)
w <- c(w1, w2, w3, w4, w5, w6)
# True cluster label for each point (1..6, 50 apiece)
ID <- rep(c(1, 2, 3, 4, 5, 6), each = 50)
plot(z, w, pch = ID, col = ID, cex = 2, lwd = 3)
>
>
>
>
>
>
>
# Here is what k-means will do
# Try it with different starting numbers of centroids
kmeans.out <- kmeans(cbind(z, w), centers = 6, nstart = 2000)
ID2 <- kmeans.out$cluster
plot(z, w, pch = ID, col = ID2, cex = 2, lwd = 3)
table(ID, ID2)
   ID2
ID   1  2  3  4  5  6
  1  4  0  0  0  0 46
  2  0  0 50  0  0  0
  3  0 50  0  0  0  0
  4  0  0  0 50  0  0
  5  0  0  0  0 50  0
  6 50  0  0  0  0  0
# Here is use of hierarchical clustering
# Try it cutting at different heights
d <- dist(cbind(z, w))      # Euclidean distances between all 300 points
hclust.out <- hclust(d)     # default linkage for hclust() is "complete"
summary(hclust.out)
            Length Class  Mode
merge       598    -none- numeric
height      299    -none- numeric
order       300    -none- numeric
labels        0    -none- NULL
method        1    -none- character
call          2    -none- call
dist.method   1    -none- character
plot(hclust.out)                 # dendrogram; pick a cut height by eye
ID3 <- cutree(hclust.out, 6)     # cut the tree into 6 clusters
plot(z, w, pch = ID, col = ID3, cex = 2, lwd = 3)
table(ID3, ID)                   # compare to the true labels
   ID
ID3  1  2  3  4  5  6
  1 50  0  0  0  0 50
  2  0 50  0 15  0  0
  3  0  0 50  0  0  0
  4  0  0  0 33  0  0
  5  0  0  0  2  0  0
  6  0  0  0  0 50  0
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
# Now here is a not-clumpy data set:
# a spiral, a line segment, and one Gaussian clump (100 points each).
set.seed(0)
t1 <- seq_len(100)                     # was seq(1:100): same values, cleaner idiom
x1 <- (t1 / 100) * cos(t1 * pi / 25)   # spiral with slowly growing radius
y1 <- (t1 / 100) * sin(t1 * pi / 25)
x2 <- seq_len(100) / 50                # line segment from (0.02, 1.98) to (2, 0)
y2 <- 2 - seq_len(100) / 50
x3 <- 2 + 0.2 * rnorm(100)             # Gaussian clump centered at (2, 2)
y3 <- 2 + 0.2 * rnorm(100)
x <- c(x1, x2, x3)
y <- c(y1, y2, y3)
# True group label for each point
ID <- rep(c(1, 2, 3), each = 100)
plot(x, y, pch = ID, col = ID, lwd = 3)
3 >
>
>
>
>
>
# Here is what k-means does on it
# BUG FIX: the original call clustered cbind(z, w) -- the FIRST data set --
# while the plot and table below use the (x, y) data; cluster (x, y) instead.
kmeans.out <- kmeans(cbind(x, y), centers = 3, nstart = 5000)
ID2 <- kmeans.out$cluster
plot(x, y, pch = ID, col = ID2, cex = 2, lwd = 3)
table(ID, ID2)
   ID2
ID    1   2   3
  1 100   0   0
  2   0  50  50
  3  50  50   0
4 >
>
>
>
>
# Here is what hierarchical clustering will do on it
# (complete linkage, the hclust default)
d <- dist(cbind(x, y))
hclust.out <- hclust(d)
summary(hclust.out)
            Length Class  Mode
merge       598    -none- numeric
height      299    -none- numeric
order       300    -none- numeric
labels        0    -none- NULL
method        1    -none- character
call          2    -none- call
dist.method   1    -none- character
plot(hclust.out)                 # dendrogram for the second data set
ID3 <- cutree(hclust.out, 3)     # ask for 3 clusters
plot(x, y, pch = ID, col = ID3, cex = 2, lwd = 3)
table(ID3, ID)                   # compare to the true labels
   ID
ID3   1   2   3
  1  84   0   0
  2  16  43   0
  3   0  57 100
# Here is a bit of development of Spectral Clustering for this second example
# (This probably could be handled using specc() in the kernlab package, but
# for illustration purposes, we'll do it here from scratch)
#
# Gaussian similarity: S[i, j] = exp(-((x_i - x_j)^2 + (y_i - y_j)^2) / sigma)
# Notes on the original version:
#  * the bandwidth was named "c", which shadows base::c() -- renamed to sigma
#  * S was filled with a 300 x 300 scalar double loop; outer() computes the
#    identical matrix with vectorized arithmetic
sigma <- 0.6
S <- exp(-(outer(x, x, "-")^2 + outer(y, y, "-")^2) / sigma)

S[1:5, 1:5]
          [,1]      [,2]      [,3]      [,4]      [,5]
[1,] 1.0000000 0.9998281 0.9993022 0.9984076 0.9971313
[2,] 0.9998281 1.0000000 0.9998176 0.9992498 0.9982674
[3,] 0.9993022 0.9998176 1.0000000 0.9998018 0.9991766
[4,] 0.9984076 0.9992498 0.9998018 1.0000000 0.9997808
[5,] 0.9971313 0.9982674 0.9991766 0.9997808 1.0000000
>
# Degree vector g and diagonal degree matrix G: G[i, i] = sum of row i of S.
# (Replaces the original element-by-element loop -- and a dead preallocation
# of L, which is created later as G - S anyway -- with rowSums() and diag().)
g <- rowSums(S)
G <- diag(g)

G[1:5, 1:5]
         [,1]     [,2]     [,3]     [,4]     [,5]
[1,] 65.31836 0.00000 0.00000 0.00000 0.00000
[2,] 0.00000 65.33074 0.00000 0.00000 0.00000
[3,] 0.00000 0.00000 65.33212 0.00000 0.00000
[4,] 0.00000 0.00000 0.00000 65.32164 0.00000
[5,] 0.00000 0.00000 0.00000 0.00000 65.29803
>
# Unnormalized graph Laplacian (use <- for assignment, as elsewhere in the file)
L <- G - S

L[1:5, 1:5]
           [,1]       [,2]       [,3]       [,4]       [,5]
[1,] 64.3183634 -0.9998281 -0.9993022 -0.9984076 -0.9971313
[2,] -0.9998281 64.3307396 -0.9998176 -0.9992498 -0.9982674
[3,] -0.9993022 -0.9998176 64.3321154 -0.9998018 -0.9991766
[4,] -0.9984076 -0.9992498 -0.9998018 64.3216388 -0.9997808
[5,] -0.9971313 -0.9982674 -0.9991766 -0.9997808 64.2980322
>
# Eigendecompose the Laplacian; the smallest eigenvalues carry the cluster
# structure (the very smallest is ~0 for a connected similarity graph).
spectral.out <- eigen(L)

plot(seq_len(300), spectral.out$values)   # scree-style plot of all eigenvalues
spectral.out$values[296:300]              # the five smallest
[1] 2.182779e+01 1.019474e+01 6.796641e+00 2.490698e+00 7.105427e-14
>
# Eigenvectors for the three smallest nonzero eigenvalues. eigen() returns
# values in decreasing order, so these are columns 297-299 (column 300 is ~0).
spec <- cbind(spectral.out$vectors[, 297],
              spectral.out$vectors[, 298],
              spectral.out$vectors[, 299])

# These are the 3-vector representations of the 300 cases that we use for
# clustering. The k-means version of this is as follows:
kmeans.out <- kmeans(spec, centers = 3, nstart = 5000)
ID4 <- kmeans.out$cluster
plot(x, y, pch = ID, col = ID4, cex = 2, lwd = 3)
table(ID, ID4)
   ID4
ID    1   2   3
  1   1  99   0
  2 100   0   0
  3   0   0 100
>
# Here is an hierarchical version (single linkage on the spectral coordinates)
d <- dist(spec)
hclust.out <- hclust(d, method = "single")
summary(hclust.out)
            Length Class  Mode
merge       598    -none- numeric
height      299    -none- numeric
order       300    -none- numeric
labels        0    -none- NULL
method        1    -none- character
call          3    -none- call
dist.method   1    -none- character
plot(hclust.out)                 # dendrogram in the spectral representation
ID5 <- cutree(hclust.out, 3)     # 3 clusters
plot(x, y, pch = ID, col = ID5, cex = 2, lwd = 3)
table(ID5, ID)                   # perfect recovery of the three groups
   ID
ID5   1   2   3
  1 100   0   0
  2   0 100   0
  3   0   0 100
9 
Download