Multidimensional Scaling From: http://en.wikipedia.org/wiki/Multidimensional_scaling The data to be analyzed is a collection of function is defined, objects (colors, faces, stocks, . . .) on which a distance δi,j := distance between i th and j th objects. These distances are the entries of the dissimilarity matrix The goal of MDS is, given Δ, to find for all vectors such that , where is a vector norm. In classical MDS, this norm is the Euclidean distance, but, in a broader sense, it may be a metric or arbitrary distance function.[3] In other words, MDS attempts to find an embedding from the objects into RN such that distances are preserved. If the dimension N is chosen to be 2 or 3, we may plot the vectors xi to obtain a visualization of the similarities between the objects. Note that the vectors xi are not unique: With the Euclidean distance, they may be arbitrarily translated, rotated, and reflected, since these transformations do not change the pairwise distances . There are various approaches to determining the vectors xi. Usually, MDS is formulated as an optimization problem, where function, for example, is found as a minimizer of some cost 1 > # Goal: Use multidimensional scaling (mds) to explore protein data > # Take data and determine percentage of proteing obtained from each category > # Use Eqn 5.5 to calculate “distances” between each country > # Try different dimensions of MDS > > Protein<read.csv(file="http://users.humboldt.edu/rizzardi/Data.dir/EuroProtein.csv",header=T,s kip=5) > head(Protein) Country red.meat white.meat eggs milk fish cereals starch nuts.oilseeds vegetables Total 1 Albania 10 1 1 9 0 42 1 6 2 72 2 Austria 9 14 4 20 2 28 4 1 4 86 3 Belgium 14 9 4 18 5 27 6 2 4 89 4 Bulgaria 8 6 2 8 1 57 1 4 4 91 5 Czechoslovakia 10 11 3 13 2 34 5 1 4 83 6 Denmark 11 11 4 25 10 22 5 1 2 91 > dim(Protein) [1] 25 11 > X <- data.matrix(Protein[,2:10]/Protein$Total) > dim(X) [1] 25 9 > head(X) red.meat [1,] 0.13888889 [2,] 0.10465116 [3,] 0.15730337 [4,] 0.08791209 [5,] 0.12048193 [6,] 0.12087912 white.meat 0.01388889 0.16279070 0.10112360 0.06593407 0.13253012 0.12087912 eggs 0.01388889 0.04651163 0.04494382 0.02197802 0.03614458 0.04395604 milk 0.12500000 0.23255814 0.20224719 0.08791209 0.15662651 0.27472527 fish 0.00000000 0.02325581 0.05617978 0.01098901 0.02409639 0.10989011 cereals 0.5833333 0.3255814 0.3033708 0.6263736 0.4096386 0.2417582 starch nuts.oilseeds vegetables 0.01388889 0.08333333 0.02777778 0.04651163 0.01162791 0.04651163 0.06741573 0.02247191 0.04494382 0.01098901 0.04395604 0.04395604 0.06024096 0.01204819 0.04819277 0.05494505 0.01098901 0.02197802 > apply(X,1,sum) [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 > > dist5.5 <- function(p,q) + { + # Equation 5.5 from Manly to determine distances + # between observations where each observation consists + # of a string of proportions which add to 1. + # p=proportions of popn1, q=proportions of popn2 + k <- length(p) + di <- 0 + for( i in 1:k ) + { + di <- di + abs(p[i]-q[i])/2 + } + return(di) + } > > # Demonstrate function > dist5.5(c(.2,.3,.5),c(.2,.3,.5)) [1] 0 > dist5.5(c(.5,.5,0), c(0,0,1)) [1] 1 > dist5.5(c(.2,.3,.5),c(.5,.3,.2)) [1] 0.3 > > # distance between first 2 countries > dist5.5(X[1,],X[2,]) red.meat 0.3636951 > > # matrix to store distances > dmatrix <- matrix(NA,ncol=25,nrow=25) > 2 > # The two for-loops below are for calculating the distance > # between each possible country pairing. > # It could have been done more efficiently > # using symmetry and calculating only a triangle of the > # matrix and reflecting it. The below loop, however, > # is easier to understand - although (25*26/2) more calculations > for( i in 1:25 ) # row + { + for( j in 1:25 ) # column + { + dmatrix[i,j] <- dist5.5(X[i,],X[j,]) + } + } > > dim(dmatrix) [1] 25 25 > dmatrix[1:5,1:5] # upper 5 rows and left 5 columns of matrix [,1] [,2] [,3] [,4] [,5] [1,] 0.0000000 0.3636951 0.3408240 0.1303419 0.2633869 [2,] 0.3636951 0.0000000 0.1173243 0.3331204 0.1165593 [3,] 0.3408240 0.1173243 0.0000000 0.3444870 0.1409232 [4,] 0.1303419 0.3331204 0.3444870 0.0000000 0.2486429 [5,] 0.2633869 0.1165593 0.1409232 0.2486429 0.0000000 > diag(dmatrix) # distance of each observation to itself=0 [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 > > #library(MASS) > fit1 <- isoMDS( dmatrix, k=1 ) # k=1 dimension > fit2 <- isoMDS( dmatrix, k=2 ) # k=2 dimensions > fit3 <- isoMDS( dmatrix, k=3 ) initial value 6.521811 iter 5 value 5.066706 iter 10 value 4.979685 iter 15 value 4.879746 iter 20 value 4.798527 iter 20 value 4.797068 final value 4.766647 converged > fit4 <- isoMDS( dmatrix, k=4 ) > fit5 <- isoMDS( dmatrix, k=5 , maxit=100) > > # Create scree plot to determine reasonable dimension > dev.new() > stressvct <- c(fit1$stress,fit2$stress,fit3$stress,fit4$stress,fit5$stress) > plot(c(1:5), stressvct, type="o", xlab="k",ylab="stress" ) > title(main="Scree plot of stress and dimension") stress 5 10 15 Scree plot of stress and dimension 1 > 2 3 4 5 k 3 > country <- as.character( Protein$Country ) # used for labeling graphs > > # Graph in one dimension > # ifelse function and mod (%%) is to alternate text to the left and right > dev.new() > plot( rep(0,25),fit1$points) > text( ifelse(rank(fit1$points)%%2==0,-.2,.2), fit1$points , country, cex=.7 ) > seq(6) %% 2 # 1:6 mod 2, which is "remainder" in division [1] 1 0 1 0 1 0 > # 0 is even, 1 is odd > seq(6) %% 3 # 1:6 mod 3 [1] 1 2 0 1 2 0 > Yugoslavia Albania 0.2 Bulgaria Portugal Romania Italy USSR Greece Spain 0.0 Czechoslovakia Poland E.Germany Austria Belgium -0.1 France Sw itzerland UK Netherlands Ireland Norw ay W.Germany Denmark Sw eden -0.2 fit1$points 0.1 Hungary Finland -1.0 -0.5 0.0 0.5 1.0 rep(0, 25) 4 # Two dimensions dev.new() par(pty="s") # square plotting frame plot(fit2$points, asp=1 ) # asp=1 is aspect ratio to have x and y on same scale text(fit2$points, country, cex=.7 ) 0.3 0.4 > > > > > 0.1 Spain Finland Greece E.Germany Norw ay 0.0 fit2$points[,2] 0.2 Portugal Italy Sw eden Denmark France Belgium USSR UK Romania Czechoslovakia Ireland Sw itzerland Austria Netherlands W.Germany Hungary Albania -0.2 -0.1 Yugoslavia Bulgaria Poland -0.2 -0.1 0.0 0.1 0.2 0.3 fit2$points[,1] > 5 # Three dimensions (static) # package scatterplot3d #library(scatterplot3d) dev.new() scatterplot3d(fit3$points) 0.05 0.3 -0.15 0.2 0.1 fit3$points[,2] 0.00 -0.05 -0.10 fit3$points[,3] 0.10 0.15 0.20 > > > > > -0.20 0.0 -0.1 -0.4 -0.3 -0.2 -0.1 0.0 0.1 0.2 0.3 0.4 fit3$points[,1] > > > > > > > # Three dimensions (dynamic) # package rgl (rg "el") # library(rgl) dev.new() plot3d(fit3$points,xlab="x",ylab="y",zlab="z") text3d(fit3$points,text=as.character(1:25)) # Graph not provided – perform yourself 6 #################################### R Code ############################# Protein<-read.csv(file="http://users.humboldt.edu/rizzardi/Data.dir/EuroProtein.csv",header=T,skip=5) head(Protein) dim(Protein) X <- data.matrix(Protein[,2:10]/Protein$Total) dim(X) head(X) apply(X,1,sum) dist5.5 <- function(p,q) { # Equation 5.5 from Manly to determine distances # between observations where each observation consists # of a string of proportions which add to 1. # p=proportions of popn1, q=proportions of popn2 k <- length(p) di <- 0 for( i in 1:k ) { di <- di + abs(p[i]-q[i])/2 } return(di) } # Demonstrate function dist5.5(c(.2,.3,.5),c(.2,.3,.5)) dist5.5(c(.5,.5,0), c(0,0,1)) dist5.5(c(.2,.3,.5),c(.5,.3,.2)) # distance between first 2 countries dist5.5(X[1,],X[2,]) # matrix to store distances dmatrix <- matrix(NA,ncol=25,nrow=25) # The two for-loops below are for calculating the distance # between each possible country pairing. # It could have been done more efficiently # using symmetry and calculating only a triangle of the # matrix and reflecting it. The below loop, however, # is easier to understand - although (25*26/2) more calculations for( i in 1:25 ) # row { for( j in 1:25 ) # column { dmatrix[i,j] <- dist5.5(X[i,],X[j,]) } } dim(dmatrix) dmatrix[1:5,1:5] # upper 5 rows and left 5 columns of matrix diag(dmatrix) # distance of each observation to itself=0 #library(MASS) fit1 <- isoMDS( fit2 <- isoMDS( fit3 <- isoMDS( fit4 <- isoMDS( fit5 <- isoMDS( dmatrix, dmatrix, dmatrix, dmatrix, dmatrix, k=1 k=2 k=3 k=4 k=5 ) # k=1 dimension ) # k=2 dimensions ) ) , maxit=100) # Create scree plot to determine reasonable dimension dev.new() stressvct <- c(fit1$stress,fit2$stress,fit3$stress,fit4$stress,fit5$stress) plot(c(1:5), stressvct, type="o", xlab="k",ylab="stress" ) title(main="Scree plot of stress and dimension") country <- as.character( Protein$Country ) # used for labeling graphs # Graph in one dimension # ifelse function and mod (%%) is to alternate text to the left and right dev.new() plot( rep(0,25),fit1$points) text( ifelse(rank(fit1$points)%%2==0,-.2,.2), fit1$points , country, cex=.7 ) seq(6) %% 2 # 1:6 mod 2, which is "remainder" in division # 0 is even, 1 is odd seq(6) %% 3 # 1:6 mod 3 # Two dimensions dev.new() par(pty="s") # square plotting frame plot(fit2$points, asp=1 ) # asp=1 is aspect ratio to have x and y on same scale text(fit2$points, country, cex=.7 ) # Three dimensions (static) # package scatterplot3d #library(scatterplot3d) dev.new() scatterplot3d(fit3$points) # Three dimensions (dynamic) # package rgl (rg "el") # library(rgl) dev.new() plot3d(fit3$points,xlab="x",ylab="y",zlab="z") text3d(fit3$points,text=as.character(1:25)) 7