Multidimensional Scaling

advertisement
Multidimensional Scaling
From: http://en.wikipedia.org/wiki/Multidimensional_scaling
The data to be analyzed is a collection of $I$ objects (colors, faces, stocks, . . .) on which a distance
function is defined,
$\delta_{i,j}$ := distance between the $i$-th and $j$-th objects.
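These distances are the entries of the dissimilarity matrix (with $I$ the number of objects)
$$\Delta := \begin{pmatrix} \delta_{1,1} & \delta_{1,2} & \cdots & \delta_{1,I} \\ \delta_{2,1} & \delta_{2,2} & \cdots & \delta_{2,I} \\ \vdots & \vdots & & \vdots \\ \delta_{I,1} & \delta_{I,2} & \cdots & \delta_{I,I} \end{pmatrix}.$$
The goal of MDS is, given $\Delta$, to find $I$ vectors $x_1, \ldots, x_I \in \mathbb{R}^N$ such that
$$\|x_i - x_j\| \approx \delta_{i,j} \quad \text{for all } i, j \in \{1, \ldots, I\},$$
where $\|\cdot\|$ is a vector norm. In classical MDS, this norm is the Euclidean distance, but, in a
broader sense, it may be a metric or arbitrary distance function.[3]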
In other words, MDS attempts to find an embedding from the objects into $\mathbb{R}^N$ such that
distances are preserved. If the dimension $N$ is chosen to be 2 or 3, we may plot the
vectors $x_i$ to obtain a visualization of the similarities between the objects. Note that the
vectors $x_i$ are not unique: With the Euclidean distance, they may be arbitrarily translated,
rotated, and reflected, since these transformations do not change the pairwise
distances $\|x_i - x_j\|$.
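There are various approaches to determining the vectors $x_i$. Usually, MDS is formulated as
an optimization problem, where $(x_1, \ldots, x_I)$ is found as a minimizer of some cost
function, for example,
$$\min_{x_1, \ldots, x_I} \sum_{i<j} \left( \|x_i - x_j\| - \delta_{i,j} \right)^2.$$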
1
> # Goal: Use multidimensional scaling (mds) to explore protein data
> # Take data and determine percentage of protein obtained from each category
> # Use Eqn 5.5 to calculate “distances” between each country
> # Try different dimensions of MDS
>
> Protein<-read.csv(file="http://users.humboldt.edu/rizzardi/Data.dir/EuroProtein.csv",header=T,skip=5)
> head(Protein)
         Country red.meat white.meat eggs milk fish cereals starch nuts.oilseeds vegetables Total
1        Albania       10          1    1    9    0      42      1             6          2    72
2        Austria        9         14    4   20    2      28      4             1          4    86
3        Belgium       14          9    4   18    5      27      6             2          4    89
4       Bulgaria        8          6    2    8    1      57      1             4          4    91
5 Czechoslovakia       10         11    3   13    2      34      5             1          4    83
6        Denmark       11         11    4   25   10      22      5             1          2    91
> dim(Protein)
[1] 25 11
> X <- data.matrix(Protein[,2:10]/Protein$Total)
> dim(X)
[1] 25 9
> head(X)
       red.meat white.meat       eggs       milk       fish   cereals     starch nuts.oilseeds vegetables
[1,] 0.13888889 0.01388889 0.01388889 0.12500000 0.00000000 0.5833333 0.01388889    0.08333333 0.02777778
[2,] 0.10465116 0.16279070 0.04651163 0.23255814 0.02325581 0.3255814 0.04651163    0.01162791 0.04651163
[3,] 0.15730337 0.10112360 0.04494382 0.20224719 0.05617978 0.3033708 0.06741573    0.02247191 0.04494382
[4,] 0.08791209 0.06593407 0.02197802 0.08791209 0.01098901 0.6263736 0.01098901    0.04395604 0.04395604
[5,] 0.12048193 0.13253012 0.03614458 0.15662651 0.02409639 0.4096386 0.06024096    0.01204819 0.04819277
[6,] 0.12087912 0.12087912 0.04395604 0.27472527 0.10989011 0.2417582 0.05494505    0.01098901 0.02197802
> apply(X,1,sum)
[1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
>
> dist5.5 <- function(p,q)
+ {
+   # Equation 5.5 from Manly to determine distances
+   # between observations where each observation consists
+   # of a string of proportions which add to 1.
+   # p=proportions of popn1, q=proportions of popn2
+   k <- length(p)
+   di <- 0
+   for( i in 1:k )
+   {
+     di <- di + abs(p[i]-q[i])/2
+   }
+   return(di)
+ }
>
> # Demonstrate function
> dist5.5(c(.2,.3,.5),c(.2,.3,.5))
[1] 0
> dist5.5(c(.5,.5,0), c(0,0,1))
[1] 1
> dist5.5(c(.2,.3,.5),c(.5,.3,.2))
[1] 0.3
>
> # distance between first 2 countries
> dist5.5(X[1,],X[2,])
red.meat
0.3636951
>
> # matrix to store distances
> dmatrix <- matrix(NA,ncol=25,nrow=25)
>
2
> # The two for-loops below are for calculating the distance
> # between each possible country pairing.
> # It could have been done more efficiently
> # using symmetry and calculating only a triangle of the
> # matrix and reflecting it. The below loop, however,
> # is easier to understand - although (25*26/2) more calculations
> for( i in 1:25 ) # row
+ {
+   for( j in 1:25 ) # column
+   {
+     dmatrix[i,j] <- dist5.5(X[i,],X[j,])
+   }
+ }
>
> dim(dmatrix)
[1] 25 25
> dmatrix[1:5,1:5] # upper 5 rows and left 5 columns of matrix
          [,1]      [,2]      [,3]      [,4]      [,5]
[1,] 0.0000000 0.3636951 0.3408240 0.1303419 0.2633869
[2,] 0.3636951 0.0000000 0.1173243 0.3331204 0.1165593
[3,] 0.3408240 0.1173243 0.0000000 0.3444870 0.1409232
[4,] 0.1303419 0.3331204 0.3444870 0.0000000 0.2486429
[5,] 0.2633869 0.1165593 0.1409232 0.2486429 0.0000000
> diag(dmatrix) # distance of each observation to itself=0
[1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
>
> #library(MASS)
> fit1 <- isoMDS( dmatrix, k=1 ) # k=1 dimension
> fit2 <- isoMDS( dmatrix, k=2 ) # k=2 dimensions
> fit3 <- isoMDS( dmatrix, k=3 )
initial value 6.521811
iter   5 value 5.066706
iter 10 value 4.979685
iter 15 value 4.879746
iter 20 value 4.798527
iter 20 value 4.797068
final value 4.766647
converged
> fit4 <- isoMDS( dmatrix, k=4 )
> fit5 <- isoMDS( dmatrix, k=5 , maxit=100)
>
> # Create scree plot to determine reasonable dimension
> dev.new()
> stressvct <- c(fit1$stress,fit2$stress,fit3$stress,fit4$stress,fit5$stress)
> plot(c(1:5), stressvct, type="o", xlab="k",ylab="stress" )
> title(main="Scree plot of stress and dimension")
stress
5
10
15
Scree plot of stress and dimension
1
>
2
3
4
5
k
3
> country <- as.character( Protein$Country ) # used for labeling graphs
>
> # Graph in one dimension
> # ifelse function and mod (%%) is to alternate text to the left and right
> dev.new()
> plot( rep(0,25),fit1$points)
> text( ifelse(rank(fit1$points)%%2==0,-.2,.2), fit1$points , country, cex=.7 )
> seq(6) %% 2 # 1:6 mod 2, which is "remainder" in division
[1] 1 0 1 0 1 0
> # 0 is even, 1 is odd
> seq(6) %% 3 # 1:6 mod 3
[1] 1 2 0 1 2 0
>
Yugoslavia
Albania
0.2
Bulgaria
Portugal
Romania
Italy
USSR
Greece
Spain
0.0
Czechoslovakia
Poland
E.Germany
Austria
Belgium
-0.1
France
Sw itzerland
UK
Netherlands
Ireland
Norw ay
W.Germany
Denmark
Sw eden
-0.2
fit1$points
0.1
Hungary
Finland
-1.0
-0.5
0.0
0.5
1.0
rep(0, 25)
4
# Two dimensions
dev.new()
par(pty="s") # square plotting frame
plot(fit2$points, asp=1 ) # asp=1 is aspect ratio to have x and y on same scale
text(fit2$points, country, cex=.7 )
0.3
0.4
>
>
>
>
>
0.1
Spain
Finland
Greece
E.Germany
Norw ay
0.0
fit2$points[,2]
0.2
Portugal
Italy
Sw eden
Denmark
France
Belgium
USSR
UK
Romania
Czechoslovakia
Ireland
Sw itzerland
Austria
Netherlands
W.Germany
Hungary
Albania
-0.2
-0.1
Yugoslavia
Bulgaria
Poland
-0.2
-0.1
0.0
0.1
0.2
0.3
fit2$points[,1]
>
5
# Three dimensions (static)
# package scatterplot3d
#library(scatterplot3d)
dev.new()
scatterplot3d(fit3$points)
0.05
0.3
-0.15
0.2
0.1
fit3$points[,2]
0.00
-0.05
-0.10
fit3$points[,3]
0.10
0.15
0.20
>
>
>
>
>
-0.20
0.0
-0.1
-0.4
-0.3
-0.2
-0.1
0.0
0.1
0.2
0.3
0.4
fit3$points[,1]
>
>
>
>
>
>
>
# Three dimensions (dynamic)
# package rgl (rg "el")
# library(rgl)
dev.new()
plot3d(fit3$points,xlab="x",ylab="y",zlab="z")
text3d(fit3$points,text=as.character(1:25))
# Graph not provided – perform yourself
6
#################################### R Code #############################
# Read the European protein table from the course web site. The first 5
# lines of the CSV are skipped before the header row is read.
Protein <- read.csv(
  file = "http://users.humboldt.edu/rizzardi/Data.dir/EuroProtein.csv",
  header = TRUE, # spelled out: T is an ordinary variable that can be reassigned
  skip = 5
)
head(Protein)
dim(Protein)

# Divide the nine food-category columns (columns 2-10) by each country's
# Total, so that every row of X is a vector of proportions.
X <- data.matrix(Protein[, 2:10] / Protein$Total)
dim(X)
head(X)
apply(X, 1, sum) # row sums; every entry should be 1
dist5.5 <- function(p, q) {
  # Equation 5.5 from Manly to determine distances
  # between observations where each observation consists
  # of a string of proportions which add to 1.
  #
  # Args:
  #   p: proportions of popn1 (numeric vector)
  #   q: proportions of popn2 (numeric vector, same length as p)
  #
  # Returns:
  #   A single unnamed number, sum(|p[i] - q[i]|) / 2.
  #   Identical vectors give 0; two empty vectors also give 0.
  #
  # Stops with an error if p and q differ in length, because a silent
  # mismatch would otherwise yield NA or ignore trailing categories.
  stopifnot(length(p) == length(q))

  # Vectorized form of the former 1:k loop. sum() also drops element names,
  # so the result no longer carries the name of the first category.
  sum(abs(p - q)) / 2
}
# Try dist5.5() on three small hand-made pairs of proportion vectors
dist5.5(c(0.2, 0.3, 0.5), c(0.2, 0.3, 0.5))
dist5.5(c(0.5, 0.5, 0), c(0, 0, 1))
dist5.5(c(0.2, 0.3, 0.5), c(0.5, 0.3, 0.2))

# Distance between the first 2 countries (rows 1 and 2 of X)
dist5.5(X[1, ], X[2, ])
# Matrix to store distances: one row and one column per observation
# (row of X), so the size follows the data instead of a hard-coded 25.
dmatrix <- matrix(NA_real_, ncol = nrow(X), nrow = nrow(X))

# The two for-loops below calculate the distance between each possible
# country pairing. It could have been done more efficiently by using
# symmetry: calculate only a triangle of the matrix and reflect it.
# The loop below, however, is easier to understand, although it performs
# n*(n+1)/2 more calculations (25*26/2 for the 25 countries).
for (i in seq_len(nrow(X))) { # row
  for (j in seq_len(nrow(X))) { # column
    dmatrix[i, j] <- dist5.5(X[i, ], X[j, ])
  }
}

dim(dmatrix)
dmatrix[1:5, 1:5] # upper 5 rows and left 5 columns of matrix
diag(dmatrix) # distance of each observation to itself=0
# isoMDS() is provided by the MASS package, so load it before fitting.
library(MASS)

# Fit an MDS configuration to the distance matrix for k = 1, ..., 5
# dimensions. Each fit is kept so its $stress and $points can be used later.
fit1 <- isoMDS(dmatrix, k = 1) # k=1 dimension
fit2 <- isoMDS(dmatrix, k = 2) # k=2 dimensions
fit3 <- isoMDS(dmatrix, k = 3)
fit4 <- isoMDS(dmatrix, k = 4)
fit5 <- isoMDS(dmatrix, k = 5, maxit = 100) # iteration cap set to 100 for k=5
# Scree plot: stress of each fit against its number of dimensions k,
# used to judge which dimension is reasonable
dev.new()
stressvct <- vapply(
  list(fit1, fit2, fit3, fit4, fit5),
  function(fit) fit$stress,
  numeric(1)
)
plot(1:5, stressvct, type = "o", xlab = "k", ylab = "stress")
title(main = "Scree plot of stress and dimension")
# Country names as plain character strings, used for labeling graphs
country <- as.character(Protein$Country)

# One-dimensional configuration: every point is drawn at x = 0 and the
# labels alternate sides. ifelse() with mod (%%) puts points of even rank
# at x = -0.2 and points of odd rank at x = 0.2.
dev.new()
plot(x = rep(0, 25), y = fit1$points)
text(
  x = ifelse(rank(fit1$points) %% 2 == 0, -0.2, 0.2),
  y = fit1$points,
  labels = country,
  cex = 0.7
)

# Illustration of mod (%%), the "remainder" in division
seq(6) %% 2 # 1:6 mod 2; 0 is even, 1 is odd
seq(6) %% 3 # 1:6 mod 3
# Two-dimensional configuration, each point labeled with its country
dev.new()
par(pty = "s") # square plotting frame
plot(fit2$points, asp = 1) # aspect ratio 1: x and y on the same scale
text(fit2$points, labels = country, cex = 0.7)
# Three dimensions (static)
# Draws the point configuration of the k=3 fit as a static 3-D scatterplot
# in a new graphics device.
# scatterplot3d() comes from the scatterplot3d package, which must already
# be loaded: the library() call below is left commented out.
#library(scatterplot3d)
dev.new()
scatterplot3d(fit3$points)
# Three dimensions (dynamic)
# Interactive 3-D view of the k=3 fit using package rgl (rg "el").
# plot3d() and text3d() come from rgl, which must already be loaded:
# the library() call below is left commented out.
# library(rgl)
# NOTE(review): rgl appears to draw in its own window, so this dev.new()
# may only open an empty graphics device - confirm before removing.
dev.new()
plot3d(fit3$points, xlab = "x", ylab = "y", zlab = "z")
# Label each point with its row number. The argument is spelled out as
# `texts` (the original `text=` relied on partial matching), and the row
# count is taken from the fit rather than hard-coded as 25.
text3d(fit3$points, texts = as.character(seq_len(nrow(fit3$points))))
7
Download