Supplementary Material S2.

advertisement
1
SUPPLEMENTARY MATERIAL S2. R FUNCTIONS
2
3
4
S2.1. Simplified tutorial for the use of MI techniques selected in this work.
5
6
The following R functions require the “missMDA”, “mice”, “Amelia”, “NORM”, and “Hmisc” packages (see Material & Methods).
8
9
10
## Required packages
library(mice)
library(Amelia)
library(Hmisc)
library(missMDA)
library(norm)

## Load the dataset (tab-separated text file with a header row)
data <- read.table("mydata.txt", sep = "\t", dec = ".", header = TRUE)

## Number of multiple imputations
m <- 20

## Imputation of the dataset
## NOTE: the sections below are alternatives. Each one stores its result in
## `imp` (overwriting the previous one), so run only the section matching the
## MI method you want to use.

## Mice - method = "pmm" (predictive mean matching) or "norm"
imp <- mice(data, m = m, method = "pmm")

## MI-PCA - ncp = number of dimensions to use for the imputation process
imp <- MIPCA(data, ncp = 2, scale = TRUE, method = "Regularized", nboot = m)

## Hmisc - (type = "pmm" or "regression"); A, B, C, D, E are the data colnames
imp <- aregImpute(~ A + B + C + D + E, data,
                  n.impute = m, type = "pmm", match = "weighted")

## Amelia II
imp <- amelia(data, m = m)

## Norm
## NOTE: this section replaces `data` by its matrix form.
data <- as.matrix(data)
preA <- prelim.norm(data)
datA <- em.norm(preA) # find the MLE for a starting value
rd <- trunc(1000000 * runif(1) + 10) # random seed for the norm generator
rngseed(rd)
imp <- vector("list", m) # one imputed data frame per imputation
for (i in seq_len(m)) {
  impA <- da.norm(preA, datA, steps = 50, showits = FALSE) # take 50 steps
  imp[[i]] <- data.frame(imp.norm(preA, impA, data))
}
44
S2.2. agglomerate.data & plot.MI functions
45
46
Based on m (>1) datasets imputed using a multiple imputation technique, the two following R functions average the m imputed datasets and display the 95% confidence ellipses associated with each specimen. The function to draw confidence ellipses is based on the R function “ELLI” proposed by Claude (2008). The following R functions require the “missMDA”, “mice”, “Amelia”, “NORM”, “Hmisc” and “shapes” packages (see Material & Methods).
51
52
Combination of the results obtained with one of the MI methods (Mice, Amelia, Norm, MI-PCA, or Hmisc; see Supplementary Material S2.1) is done using the “agglomerate.data” function. This function generates an averaged dataset (agglomerate.data$ImpM), and an array containing the m imputed datasets (agglomerate.data$Mi).
56
57
Example:
58
59
IM<-agglomerate.data(data=data, imp=imp, Mimp=20, Method="mice")
60
61
Where “data” is the dataset with the missing values, “imp” is the MI dataset object obtained with one of the MI methods (see above), “Mimp” is the number of MI, and “Method” is one of the methods described above (“mice”, “norm”, “hmisc”, “missmda” or “amelia”).
64
65
plot.MI(IM, symmetric=TRUE, DIM=c(1,2), web=FALSE, ellipses=TRUE)
66
67
The “plot.MI” function performs the Procrustes superimposition of the m imputed datasets onto the principal components calculated from the average MI-dataset. symmetric = whether or not the matrices must be scaled to have unit sum of squares; DIM = the dimensions to display on the biplot; web = whether or not the m imputed points for each specimen are linked to their related average MI-dataset point; ellipses = whether or not the 95% confidence ellipse around each specimen is drawn.
73
74
Supplementary reference:
75
Claude J. 2008. Morphometrics with R. New York: Springer
76
77
78
## Combination of the results
79
80
#Dependencies
81
library(shapes)
82
library(mice)
83
library(Amelia)
84
library(missMDA)
85
library(Hmisc)
86
library(norm)
87
88
# agglomerate.data (data,imp,Mimp,Method="mice")
89
#
90
# plot.MI(IM, symmetric=TRUE, DIM=c(1,2),web=FALSE,ellipses=TRUE)
91
#
92
#data= dataset with missing values
93
#
94
#imp=Imputed datasets
95
#
96
#Mimp=Number of MI
97
#
98
#Method= MI method used, must be "mice","amelia","hmisc","missmda" or "norm"
99
#
100
#Julien Clavel - 2013
101
#################################################
102
#################################################
103
## agglomerate.data: function to store the m imputed datasets and to compute the averaged dataset
105
106
# agglomerate.data: store the m imputed datasets produced by a multiple
# imputation (MI) method and compute their element-wise average.
#
# Arguments:
#   data   - dataset with missing values (coerced with as.matrix()).
#   imp    - object holding the imputations; how it is read depends on Method:
#              "mice"    -> mice::complete(imp, i)
#              "amelia"  -> imp$imputations[[i]]
#              "norm"    -> imp[[i]] (list of imputed datasets)
#              "missmda" -> imp$res.MI[, , i] (array) or imp$res.MI[[i]] (list)
#              "hmisc"   -> imp$imputed[[g]][, i] fills the NA cells of column
#                           g; this assumes imp$imputed is ordered like the
#                           columns of `data` (TODO confirm against the
#                           formula passed to aregImpute).
#   Mimp   - number of imputed datasets to read from `imp`.
#   Method - one of "mice", "amelia", "hmisc", "missmda" or "norm".
#
# Value: a list with
#   ImpM    - averaged dataset (matrix, column names taken from `data`),
#   Mi      - the Mimp imputed datasets (rows x columns x Mimp),
#   nbMI    - Mimp,
#   missing - the original data (with its missing values) as a data frame.
#
# If `data` has no missing value, or if Method is not recognised, a message is
# printed and the returned datasets are plain copies of `data`.
agglomerate.data <- function(data, imp, Mimp, Method = "mice") {
  Moy <- Mimp + 1
  redata <- as.matrix(data)
  # Slices 1..Mimp receive the imputed datasets, slice Moy their average.
  # Every slice starts as a copy of the incomplete data.
  ximp <- array(redata, dim = c(nrow(redata), ncol(redata), Moy))

  known_methods <- c("mice", "amelia", "missmda", "hmisc", "norm")

  # Return the i-th imputed dataset as a matrix for the selected method.
  imputed_matrix <- function(i) {
    switch(Method,
      mice = as.matrix(mice::complete(imp, i)),
      amelia = as.matrix(imp$imputations[[i]]),
      norm = as.matrix(imp[[i]]),
      missmda = {
        res <- imp$res.MI
        # res.MI is indexed as a 3-d array when it is one; otherwise it is
        # read as a list with one imputed dataset per element.
        if (is.array(res)) as.matrix(res[, , i]) else as.matrix(res[[i]])
      },
      hmisc = {
        # Hmisc stores only the imputed cells, variable by variable: plug
        # them into the NA cells of each incomplete column.
        filled <- redata
        for (g in which(colSums(is.na(redata)) > 0)) {
          filled[is.na(redata[, g]), g] <- imp$imputed[[g]][, i]
        }
        filled
      }
    )
  }

  if (any(is.na(redata))) {
    if (Method %in% known_methods) {
      for (i in seq_len(Mimp)) {
        ximp[, , i] <- imputed_matrix(i)
      }
      ## Averaged dataset (drop = FALSE keeps a 3-d array even if Mimp is 1)
      ximp[, , Moy] <- apply(ximp[, , seq_len(Mimp), drop = FALSE],
                             c(1, 2), mean)
    } else {
      ## Warning messages
      cat("Error! You must indicate if you are using Mice, Amelia, missMDA, NORM, or Hmisc package", "\n")
    }
  } else {
    ## Warning messages
    cat("There is no missing value in your dataset", "\n")
  }

  tabM <- ximp[, , Moy]
  colnames(tabM) <- colnames(redata)
  list("ImpM" = tabM, "Mi" = ximp[, , 1:Mimp], "nbMI" = Mimp,
       "missing" = as.data.frame(redata))
} # End
194
###################################
195
########################################
196
###############################################
197
######################################################
198
##Function to draw confidence ellipses
199
# ELLI: coordinates of a confidence ellipse for a bivariate sample
# (based on the "ELLI" function of Claude, 2008).
#
# Arguments:
#   x, y - numeric vectors of the same length (the two coordinates).
#   conf - confidence level; the radius coefficient is the two-sided normal
#          quantile qnorm((1 - conf) / 2, lower.tail = FALSE).
#   np   - number of points computed along the ellipse (angles from 0 to
#          2 * pi, so the first and last points coincide).
#
# Value: a matrix with np rows and 2 columns (x and y coordinates of the
# ellipse outline), centred on the sample means of x and y.
ELLI <- function(x, y, conf = 0.95, np) {
  centroid <- apply(cbind(x, y), 2, mean)
  ang <- seq(0, 2 * pi, length.out = np)
  z <- cbind(cos(ang), sin(ang)) # unit circle, one row per angle
  radiuscoef <- qnorm((1 - conf) / 2, lower.tail = FALSE)
  r <- cor(x, y)
  # Element-wise products below build the 2 x 2 matrix mapping the unit
  # circle to the ellipse from the variances of x and y and their correlation.
  M1 <- matrix(c(1, 1, -1, 1), 2, 2)
  M2 <- matrix(c(var(x), var(y)), 2, 2)
  M3 <- matrix(c(1 + r, 1 - r), 2, 2, byrow = TRUE)
  ellpar <- M1 * sqrt(M2 * M3 / 2)
  # centroid (length 2) is recycled down the rows of the 2 x np matrix.
  t(centroid + radiuscoef * ellpar %*% t(z))
}
211
############################### Function to plot MI confidence ellipses using Procrustes superimposition
213
##################################
214
####################################
215
######################################
216
217
# plot.MI: plot the averaged MI dataset on two principal components together
# with the m imputed datasets superimposed onto it (ordinary Procrustes
# analysis, procOPA from the "shapes" package).
#
# Arguments:
#   IM        - list returned by agglomerate.data() (uses $ImpM, $Mi, $nbMI
#               and $missing).
#   symmetric - if TRUE, every configuration (the m imputed ones and the
#               averaged one) is divided by the square root of its sum of
#               squares before the superimposition.
#   DIM       - the two principal components to display.
#   scale     - if TRUE, both axes use the range of the first displayed
#               dimension as limits (NOTE(review): presumably to give both
#               axes the same scale - confirm).
#   web       - if TRUE, segments link each averaged point to its m imputed
#               points.
#   ellipses  - if TRUE, an ellipse computed by ELLI() is drawn for each
#               specimen; otherwise the individual points are drawn.
#   ...       - accepted but currently unused.
#
# Specimens without missing values are drawn in black, specimens with missing
# values in red. Stops with an error if IM$ImpM still contains missing values.
# Graphical parameters are restored on exit.
plot.MI <- function(IM, symmetric = FALSE, DIM = c(1, 2), scale = FALSE,
                    web = FALSE, ellipses = TRUE, ...) {
  if (any(is.na(IM$ImpM))) {
    stop("There is still missing values in the imputed dataset, please check your imputation",
         call. = FALSE)
  }

  nb_mi <- IM$nbMI
  Mo <- nb_mi + 1 # index of the slice holding the averaged dataset

  ## PCA scores: slice Mo = averaged dataset, slices 1..nb_mi = imputed ones
  pcaM <- princomp(IM$ImpM)
  cpdimM <- as.matrix(pcaM$scores[, DIM])
  opa <- array(cpdimM, dim = c(nrow(cpdimM), ncol(cpdimM), Mo))
  for (i in seq_len(nb_mi)) {
    pca <- princomp(IM$Mi[, , i])
    opa[, , i] <- as.matrix(pca$scores[, DIM])
  }

  if (symmetric) {
    # Scale ALL Mo configurations to unit sum of squares
    # (1:IM$nbMI+1 parses as (1:nbMI)+1 and skipped the first dataset).
    for (i in seq_len(Mo)) {
      opa[, , i] <- opa[, , i] / sqrt(sum(opa[, , i]^2))
    }
  }

  ############################ Ordinary Procrustes Analysis (library(shapes))
  for (k in seq_len(nb_mi)) {
    analyse <- procOPA(opa[, , Mo], opa[, , k], reflect = TRUE)
    opa[, , k] <- analyse$Bhat
  }
  # Ahat taken from the last superimposition replaces the averaged scores.
  opa[, , Mo] <- analyse$Ahat

  ######################## Principal component explained variance
  pvar <- pcaM$sdev^2
  tot <- sum(pvar)
  valX <- round(pvar[DIM[1]] * 100 / tot, digits = 2)
  valY <- round(pvar[DIM[2]] * 100 / tot, digits = 2)
  x_label <- paste("DIM", DIM[1], valX, "%", sep = " ")
  y_label <- paste("DIM", DIM[2], valY, "%", sep = " ")

  ######################## Plot function
  op <- par(no.readonly = TRUE)
  on.exit(par(op), add = TRUE) # restore graphical parameters even on error

  is_complete <- complete.cases(IM$missing)
  complete_rows <- which(is_complete)
  missing_rows <- which(!is_complete)
  point_col <- ifelse(is_complete, "black", "red")

  if (scale) {
    plot(opa[, 1, Mo], opa[, 2, Mo], type = "p", pch = 3, col = point_col,
         lwd = 1, xlim = range(opa[, 1, Mo]), ylim = range(opa[, 1, Mo]),
         xlab = x_label, ylab = y_label)
  } else {
    plot(opa[, 1, Mo], opa[, 2, Mo], type = "p", pch = 3, col = point_col,
         lwd = 1, xlab = x_label, ylab = y_label)
  }
  title("MI effect on Multivariate Analysis", font.main = 3, adj = 1)

  if (ellipses) {
    # Each ellipse is computed from the Mo points of a specimen and is
    # outlined with np = Mo points.
    for (j in complete_rows) {
      lines(ELLI(opa[j, 1, ], opa[j, 2, ], np = Mo), col = "black", lwd = 1)
    }
    for (j in missing_rows) {
      lines(ELLI(opa[j, 1, ], opa[j, 2, ], np = Mo), col = "red", lwd = 1)
    }
  } else {
    points(opa[, 1, ], opa[, 2, ], cex = 0.5)
  }

  if (web) {
    imputed <- seq_len(nb_mi)
    for (j in complete_rows) {
      segments(opa[j, 1, Mo], opa[j, 2, Mo],
               opa[j, 1, imputed], opa[j, 2, imputed], col = "black", lwd = 1)
    }
    for (j in missing_rows) {
      segments(opa[j, 1, Mo], opa[j, 2, Mo],
               opa[j, 1, imputed], opa[j, 2, imputed], col = "red", lwd = 1)
    }
    points(opa[, 1, ], opa[, 2, ], cex = 0.5)
  }

  text(opa[, 1, Mo], opa[, 2, Mo], rownames(IM$missing), pos = 1)
  abline(h = 0, v = 0, lty = 3)

  par(xpd = TRUE) # Do not clip to the drawing area
  lambda <- .025
  # Legend anchored by its bottom-left corner just above the plot region.
  legend(par("usr")[1],
         (1 + lambda) * par("usr")[4] - lambda * par("usr")[3],
         c("Complete", "Missing"), xjust = 0, yjust = 0,
         lwd = 3, lty = 1, col = c(par("fg"), "red"))
  invisible(NULL)
}
Download