## SUPPLEMENTARY MATERIAL S2. R FUNCTIONS
##
## S2.1. Simplified tutorial for the use of the MI techniques selected in
## this work.
##
## The following R code requires the "missMDA", "mice", "Amelia", "norm" and
## "Hmisc" packages (see Material & Methods).

## Required packages
library(mice)
library(Amelia)
library(Hmisc)
library(missMDA)
library(norm)

## Load the dataset
data <- read.table("mydata.txt", sep = "\t", dec = ".", header = TRUE)

## Number of multiple imputations
m <- 20

## Imputation of the dataset
## NOTE: the blocks below are alternatives. Each of them assigns `imp`, so
## run only the block of the method you want to use (running them all keeps
## only the result of the last one).

## Mice - method = "pmm" (predictive mean matching) or "norm"
imp <- mice(data, m = m, method = "pmm")

## MI-PCA - ncp = number of dimensions to use for the imputation process
imp <- MIPCA(data, ncp = 2, scale = TRUE, method = "Regularized", nboot = m)

## Hmisc - (type = "pmm" or "regression"); A, B, C, D, E are the data colnames
imp <- aregImpute(~ A + B + C + D + E, data, n.impute = m, type = "pmm",
                  match = "weighted")

## Amelia II
imp <- amelia(data, m = m)

## Norm
data <- as.matrix(data)
preA <- prelim.norm(data)
datA <- em.norm(preA) # find the MLE for a starting value
rd <- trunc(1000000 * runif(1) + 10)
rngseed(rd)
imp <- vector("list", m)
for (i in seq_len(m)) {
  impA <- da.norm(preA, datA, steps = 50, showits = FALSE) # take 50 steps
  imp[[i]] <- data.frame(imp.norm(preA, impA, data))
}

## S2.2. agglomerate.data & plot.MI functions
##
## Based on m (> 1) datasets imputed using a multiple imputation technique,
## the two following R functions average the m imputed datasets and display
## the 95% confidence ellipses associated with each specimen. The function
## that draws the confidence ellipses is based on the R function "ELLI"
## proposed by Claude (2008). The following R functions require the
## "missMDA", "mice", "Amelia", "norm", "Hmisc" and "shapes" packages (see
## Material & Methods).
##
## The results obtained with one of the MI methods (Mice, Amelia, Norm,
## MI-PCA, or Hmisc; see Supplementary Material S2.1) are combined using the
## "agglomerate.data" function. This function generates an averaged dataset
## (agglomerate.data$ImpM), and an array holding the m imputed datasets
## (agglomerate.data$Mi).
##
## Example:
##
##   IM <- agglomerate.data(data = data, imp = imp, Mimp = 20, Method = "mice")
##
## where "data" is the dataset with the missing values, "imp" is the MI
## object obtained with one of the MI methods (see above), "Mimp" is the
## number of MI, and "Method" is one of the methods described above ("mice",
## "norm", "hmisc", "missmda" or "amelia").
##
##   plot.MI(IM, symmetric = TRUE, DIM = c(1, 2), web = FALSE, ellipses = TRUE)
##
## The "plot.MI" function performs the Procrustes superimposition of the m
## imputed datasets onto the principal components calculated from the average
## MI dataset. symmetric = whether or not the matrices must be scaled to have
## unit sum of squares; DIM = the dimensions to display on the biplot;
## web = whether or not the m imputed points of each specimen are linked to
## their related average MI-dataset point; ellipses = whether or not the 95%
## confidence ellipses around each specimen are drawn.
##
## Supplementary reference:
## Claude J. 2008. Morphometrics with R. New York: Springer

## Combination of the results

# Dependencies
library(shapes)
library(mice)
library(Amelia)
library(missMDA)
library(Hmisc)
library(norm)

# agglomerate.data(data, imp, Mimp, Method = "mice")
#
# plot.MI(IM, symmetric = TRUE, DIM = c(1, 2), web = FALSE, ellipses = TRUE)
#
# data = dataset with missing values
#
# imp = Imputed datasets
#
# Mimp = Number of MI
#
# Method = MI method used, must be "mice", "amelia", "hmisc", "missmda" or
#          "norm"
#
# Julien Clavel - 2013
#################################################
#################################################
## Agglomerate.data.
## Function to store the m imputed datasets and to compute the averaged dataset
#
# Arguments:
#   data    dataset with missing values (numeric data frame or matrix).
#   imp     object holding the imputed datasets, as produced by the chosen
#           method: a mice object ("mice"), an object with an `imputations`
#           list ("amelia"), an object with a `res.MI` component
#           ("missmda"), an object with an `imputed` list ("hmisc"), or a
#           plain list of imputed datasets ("norm").
#   Mimp    number of imputed datasets to combine (>= 1).
#   Method  one of "mice", "amelia", "hmisc", "missmda" or "norm"
#           (matched case-insensitively).
#
# Value: a list with
#   ImpM     the averaged dataset (matrix, column names taken from `data`),
#   Mi       rows x columns x Mimp array of the imputed datasets,
#   nbMI     Mimp,
#   missing  the original dataset as a data frame.
#
# When `data` has no missing value a message is printed and the original
# data are returned unchanged in ImpM and replicated in Mi.
agglomerate.data <- function(data, imp, Mimp, Method = "mice") {
  if (!is.numeric(Mimp) || length(Mimp) != 1L || is.na(Mimp) || Mimp < 1) {
    stop("'Mimp' must be a single number >= 1", call. = FALSE)
  }

  redata <- as.matrix(data)
  n_row <- nrow(redata)
  n_col <- ncol(redata)
  Moy <- Mimp + 1 # index of the extra slice that receives the average
  ximp <- array(redata, dim = c(n_row, n_col, Moy))

  if (anyNA(redata)) {
    method <- tolower(Method)
    if (!method %in% c("mice", "amelia", "missmda", "hmisc", "norm")) {
      # The original only printed this text and carried on, returning the
      # un-imputed data; fail loudly instead.
      stop(
        "You must indicate if you are using Mice, Amelia, missMDA, NORM, or ",
        "Hmisc package",
        call. = FALSE
      )
    }

    if (method == "hmisc") {
      # `imp$imputed` holds, per variable, a matrix (missing rows x
      # imputations). Look variables up by name when both sides are named so
      # the formula order need not match the column order; otherwise fall
      # back to the positional lookup.
      imputed <- imp$imputed
      var_names <- colnames(redata)
      by_name <- !is.null(names(imputed)) && !is.null(var_names)
      for (g in which(colSums(is.na(redata)) > 0)) {
        values <- if (by_name) imputed[[var_names[g]]] else imputed[[g]]
        if (is.null(values)) {
          stop("No imputed values found for column ", g, call. = FALSE)
        }
        holes <- is.na(redata[, g])
        for (i in seq_len(Mimp)) {
          ximp[holes, g, i] <- values[, i]
        }
      }
    } else {
      for (i in seq_len(Mimp)) {
        completed <- switch(method,
          # Namespace-qualified: `complete` is easily masked by other packages.
          mice = mice::complete(imp, i),
          amelia = imp$imputations[[i]],
          norm = imp[[i]],
          # `res.MI` may be a 3-D array or a list of data frames.
          missmda = if (is.list(imp$res.MI)) {
            imp$res.MI[[i]]
          } else {
            imp$res.MI[, , i]
          }
        )
        ximp[, , i] <- as.matrix(completed)
      }
    }

    ## Averaged dataset (cell-wise mean over the Mimp imputed datasets)
    ximp[, , Moy] <- rowMeans(ximp[, , seq_len(Mimp), drop = FALSE], dims = 2)
  } else {
    ## Warning messages
    cat("There is no missing value in your dataset", "\n")
  }

  # Rebuild with explicit dimensions so that a single row/column or a single
  # imputation does not collapse the results to a vector/matrix.
  tabM <- array(ximp[, , Moy], dim = c(n_row, n_col))
  colnames(tabM) <- colnames(redata)
  list(
    "ImpM" = tabM,
    "Mi" = ximp[, , seq_len(Mimp), drop = FALSE],
    "nbMI" = Mimp,
    "missing" = as.data.frame(redata)
  )
} # End

######################################################
## Function to draw confidence ellipses (after Claude 2008)
#
# Arguments:
#   x, y  numeric coordinates of the points.
#   conf  confidence level of the ellipse.
#   np    number of points used to trace the ellipse outline.
#
# Value: an np x 2 matrix of (x, y) coordinates along the ellipse; the first
# and last rows coincide (angles 0 and 2 * pi).
ELLI <- function(x, y, conf = 0.95, np = 100) {
  centroid <- c(mean(x), mean(y))
  ang <- seq(0, 2 * pi, length.out = np)
  z <- cbind(cos(ang), sin(ang))
  radiuscoef <- qnorm((1 - conf) / 2, lower.tail = FALSE)
  var_x <- var(x)
  var_y <- var(y)
  # cor() is NA (with a warning) when one coordinate is constant; the ellipse
  # is then degenerate along that axis and the correlation is irrelevant.
  degenerate <- isTRUE(var_x == 0) || isTRUE(var_y == 0)
  r <- if (degenerate) 0 else cor(x, y)
  M1 <- matrix(c(1, 1, -1, 1), 2, 2)
  M2 <- matrix(c(var_x, var_y), 2, 2)
  M3 <- matrix(c(1 + r, 1 - r), 2, 2, byrow = TRUE)
  ellpar <- M1 * sqrt(M2 * M3 / 2)
  # `centroid` is recycled down the columns of the 2 x np matrix, i.e. added
  # to every point.
  t(centroid + radiuscoef * ellpar %*% t(z))
}

###############################
## Function to plot MI confidence ellipses using Procrustes superimposition
#
# Arguments:
#   IM         result of agglomerate.data().
#   symmetric  if TRUE, every configuration (the m imputed ones and the
#              averaged one) is scaled to unit sum of squares before the
#              superimposition.
#   DIM        the two principal components to display.
#   scale      if TRUE, both axes are drawn with the same limits (the range
#              of the first displayed dimension of the averaged dataset).
#   web        if TRUE, each imputed point is linked to the point of the
#              averaged dataset for the same specimen.
#   ellipses   if TRUE, a 95% confidence ellipse is drawn per specimen;
#              otherwise the imputed points are drawn.
#   ...        currently unused.
#
# Specimens without missing values are drawn in black, specimens with
# missing values in red. Called for its side effect (a plot); returns NULL
# invisibly. Requires procOPA() from the "shapes" package.
plot.MI <- function(IM, symmetric = FALSE, DIM = c(1, 2), scale = FALSE,
                    web = FALSE, ellipses = TRUE, ...) {
  if (anyNA(IM$ImpM)) {
    # The original used `break` here, which is not valid outside a loop.
    stop(
      "There is still missing values in the imputed dataset, please check your imputation",
      call. = FALSE
    )
  }

  n_imp <- IM$nbMI
  Mo <- n_imp + 1 # slice holding the averaged configuration

  ## PCA scores of the averaged dataset and of each imputed dataset
  pcaM <- princomp(IM$ImpM)
  cpdimM <- as.matrix(pcaM$scores[, DIM])
  opa <- array(cpdimM, dim = c(nrow(cpdimM), ncol(cpdimM), Mo))
  for (i in seq_len(n_imp)) {
    opa[, , i] <- as.matrix(princomp(IM$Mi[, , i])$scores[, DIM])
  }

  if (isTRUE(symmetric)) {
    # Scale ALL Mo configurations. The original `1:IM$nbMI+1` parsed as
    # `(1:IM$nbMI) + 1` and therefore skipped the first imputed dataset.
    for (i in seq_len(Mo)) {
      opa[, , i] <- opa[, , i] / sqrt(sum(opa[, , i]^2))
    }
  }

  ## Ordinary Procrustes Analysis (library(shapes)): fit each imputed
  ## configuration onto the averaged one
  for (k in seq_len(n_imp)) {
    analyse <- procOPA(opa[, , Mo], opa[, , k], reflect = TRUE)
    opa[, , k] <- analyse$Bhat
  }
  opa[, , Mo] <- analyse$Ahat

  ## Principal component explained variance
  pvar <- pcaM$sdev^2
  tot <- sum(pvar)
  valX <- round(pvar[DIM[1]] * 100 / tot, digits = 2)
  valY <- round(pvar[DIM[2]] * 100 / tot, digits = 2)
  x_lab <- paste("DIM", DIM[1], valX, "%", sep = " ")
  y_lab <- paste("DIM", DIM[2], valY, "%", sep = " ")

  ## Complete / incomplete specimens
  is_complete <- complete.cases(IM$missing)
  complete_rows <- which(is_complete)
  missing_rows <- which(!is_complete)
  # Explicit colours: factor codes would turn an all-incomplete dataset black.
  point_col <- ifelse(is_complete, "black", "red")

  ## Plot function
  op <- par(no.readonly = TRUE)
  on.exit(par(op), add = TRUE) # restore graphics state even on error
  if (isTRUE(scale)) {
    lim <- range(opa[, 1, Mo])
    plot(opa[, 1, Mo], opa[, 2, Mo],
      type = "p", pch = 3, col = point_col, lwd = 1,
      xlim = lim, ylim = lim, xlab = x_lab, ylab = y_lab
    )
  } else {
    plot(opa[, 1, Mo], opa[, 2, Mo],
      type = "p", pch = 3, col = point_col, lwd = 1,
      xlab = x_lab, ylab = y_lab
    )
  }
  title("MI effect on Multivariate Analysis", font.main = 3, adj = 1)

  if (isTRUE(ellipses)) {
    # Fixed outline resolution; the original tied it to the number of
    # imputations, giving coarse polygons for small m.
    ellipse_points <- 100
    for (j in complete_rows) {
      lines(ELLI(opa[j, 1, ], opa[j, 2, ], np = ellipse_points),
        col = "black", lwd = 1
      )
    }
    for (j in missing_rows) {
      lines(ELLI(opa[j, 1, ], opa[j, 2, ], np = ellipse_points),
        col = "red", lwd = 1
      )
    }
  } else {
    points(opa[, 1, ], opa[, 2, ], cex = 0.5)
  }

  if (isTRUE(web)) {
    imputed <- seq_len(n_imp)
    for (j in complete_rows) {
      segments(opa[j, 1, Mo], opa[j, 2, Mo],
        opa[j, 1, imputed], opa[j, 2, imputed],
        col = "black", lwd = 1
      )
    }
    for (j in missing_rows) {
      segments(opa[j, 1, Mo], opa[j, 2, Mo],
        opa[j, 1, imputed], opa[j, 2, imputed],
        col = "red", lwd = 1
      )
    }
    points(opa[, 1, ], opa[, 2, ], cex = 0.5)
  }

  text(opa[, 1, Mo], opa[, 2, Mo], rownames(IM$missing), pos = 1)
  abline(h = 0, v = 0, lty = 3)

  par(xpd = TRUE) # Do not clip to the drawing area
  lambda <- 0.025
  usr <- par("usr")
  # Legend sits just above the top-left corner of the plot region.
  legend(usr[1], (1 + lambda) * usr[4] - lambda * usr[3],
    c("Complete", "Missing"),
    xjust = 0, yjust = 0, lwd = 3, lty = 1, col = c(par("fg"), "red")
  )
  invisible(NULL)
}