jbi12006-sup-0002-AppendixS2

SUPPORTING INFORMATION
Using virtual species to study species distributions and model performance
Christine N. Meynard and David M. Kaplan
Journal of Biogeography
Appendix S2: R scripts associated with the simulations and theoretical calculations.
######################################################################
#
# This code is based on the logitreg function found in Venables and
# Ripley (2002), p. 445. It is meant to fit binomial GLMs by direct
# maximization of the likelihood (see pages 198-199 of the same book).
# DMK has modified it slightly to correct problems encountered when
# running the book code in R (version 2.14.1 was used) and so that it
# returns an object of class logitreg that has a predict method, etc.
#
# x, the set of predictor variables passed to logitreg.dmk, should be
# supplied as a data frame.
#
# Venables, W.N. & Ripley, B.D. (2002) Modern applied statistics with
# S. Springer, New York.
#
######################################################################
#
#
# $Id: Appendix_logitreg.dmk.R 4177 2012-01-06 15:49:16Z dk1f0c9 $
#
# Copyright (C) 2012 David M. Kaplan
# Licence: GPL (GNU General Public License)
#
######################################################################
model.matrix.logitreg <- function(x, intercept=TRUE) {
  if(is.null(dim(x))) dim(x) <- c(length(x), 1)
  X <- model.matrix(~., data=as.data.frame(x))
  if (!intercept) {
    X = X[,-1,drop=F]
  }
  return(X)
}
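# A brief illustrative call (not part of the original appendix; the toy data
# frame is hypothetical): model.matrix.logitreg expands factors and prepends
# an intercept column unless intercept=FALSE.
model.matrix.logitreg(data.frame(temp=c(10, 15, 20), soil=factor(c("a","b","a"))))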
logitreg.dmk <- function(x, y, wt=rep(1,length(y)), start=rep(0,p),
                         intercept=TRUE, ...){
  fmin <- function(beta, X, y, w){
    p <- plogis(X %*% beta)
    -sum(2*w*ifelse(y, log(p), log(1-p)))
  }
  gmin <- function(beta, X, y, w){
    eta <- X %*% beta; p <- plogis(eta)
    -2*matrix(w*dlogis(eta)*ifelse(y, 1/p, -1/(1-p)), 1) %*% X
  }
  X = model.matrix.logitreg(x, intercept=intercept)
  p <- dim(X)[2]
  dn <- dimnames(X)[[2]]
  if(is.factor(y)) y <- (unclass(y) != 1)
  fit <- optim(start, fmin, gmin, X=X, y=y, w=wt, method="BFGS", ...)
  names(fit$par) <- dn
  cat("\nCoefficients:\n"); print(fit$par)
  cat("\nResidual Deviance:", format(fit$value), "\n")
  cat("\nConvergence message:", fit$convergence, "\n")
  r <- list(converged=fit$convergence, deviance=fit$value, intercept=intercept,
            coefficients=fit$par, fitted.values=plogis(X %*% fit$par))
  class(r) <- "logitreg"
  invisible(r)
}
predict.logitreg <- function(obj, x, type=c("link","response")) {
  type = match.arg(type)
  X = model.matrix.logitreg(x, intercept=obj$intercept)
  Y = X %*% obj$coefficients
  switch(type, link=Y, response=plogis(Y))
}
# Example (from Venables & Ripley 2002; requires the bwt data frame):
#options(contrasts=c("contr.treatment","contr.poly"))
#X <- model.matrix(low~., data=bwt)
#logitreg.dmk(X, bwt$low)
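# A minimal self-contained sketch (not from the original appendix; x.sim and
# y.sim are illustrative names): simulate one predictor and a binary response,
# fit with logitreg.dmk, and compare against glm() with a binomial family,
# which should give nearly identical coefficients.
set.seed(1)
x.sim <- data.frame(temp = rnorm(500))
y.sim <- rbinom(500, 1, plogis(-1 + 2*x.sim$temp))
fit.dmk <- logitreg.dmk(x.sim, y.sim)
fit.glm <- glm(y.sim ~ temp, data=x.sim, family=binomial)
cbind(direct=coef(fit.dmk), glm=coef(fit.glm))
head(predict(fit.dmk, x.sim, type="response"))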
######################################################################
#
# This script defines some functions for theoretical calculations of
# confusion matrices and of optimal models when a species' probability of
# occupancy is given and defined at a finite number of locations.
#
######################################################################
#
#
# $Id: logitreg.dmk.R 4174 2012-01-06 15:37:43Z dk1f0c9 $
#
# Copyright (C) 2012 David M. Kaplan
# Licence: GPL (GNU General Public License)
#
######################################################################
##################################################
# Some basic useful functions and libraries
##################################################
library(PresenceAbsence)
# Index of the first minimum / maximum of a vector
imin <- function(x) { which( min(x) == x )[1] }
imax <- function(x) { which( max(x) == x )[1] }
# MATLAB-style meshgrid: coordinate matrices for a regular grid
meshgrid <- function(a,b) {
  list(
    x=outer(b,a,FUN=function(x,y) y),
    y=outer(b,a,FUN=function(x,y) x)
  )
}
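# Quick illustration (hypothetical values, not part of the analysis): imax
# returns the index of the first maximum, and meshgrid returns the x and y
# coordinate matrices of a regular grid.
imax(c(2, 7, 7, 1))        # 2
str(meshgrid(1:3, 1:2))    # two 2 x 3 matrices: $x varies across columns, $y across rows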
##################################################
# Calculate theoretical confusion matrix and
# optimal thresholds, including sample bias.
##################################################
theoretical.cmx = function(threshold, presence.prob, model.prob=presence.prob,
                           sample.prev=mean(presence.prob)) {
  presence.prob = as.vector(as.matrix(presence.prob))
  model.prob = as.vector(as.matrix(model.prob))
  I = model.prob > threshold           # sites predicted present at this threshold
  species.prev = mean(presence.prob)   # true prevalence of the species
  PI = sum(presence.prob[I])           # expected presences among sites predicted present
  PI2 = sum(presence.prob) - PI        # expected presences among sites predicted absent
  N = sum(I)                           # number of sites predicted present
  N2 = length(I) - N                   # number of sites predicted absent
  # Actual elements of table, reweighted to the desired sample prevalence
  v1 = PI * sample.prev/species.prev
  v2 = PI2 * sample.prev/species.prev
  v3 = (N - PI) * (1-sample.prev) / (1-species.prev)
  v4 = (N2 - PI2) * (1-sample.prev) / (1-species.prev)
  # Return table
  C = as.table( matrix( 1.0*c(v1,v2,v3,v4), nrow=2 ) )
  dimnames(C) = list(predicted=c(1,0), observed=c(1,0))
  return(C)
}
theoretical.stats = function(...) {
  cc = theoretical.cmx(...)
  c(Kappa=Kappa(cc,st.dev=FALSE),
    sensitivity=sensitivity(cc,st.dev=FALSE),
    specificity=specificity(cc,st.dev=FALSE))
}
# A function that returns the optimal thresholds and the corresponding statistics
theoretical.maxes = function(presence.prob, threshold.tol=0.01, ...) {
  tt = seq(threshold.tol, 1-threshold.tol, threshold.tol)
  cc = mapply( theoretical.cmx, threshold=tt,
               MoreArgs=list(presence.prob=presence.prob,...), SIMPLIFY=FALSE )
  K  = sapply(cc, Kappa, st.dev=FALSE)
  se = sapply(cc, sensitivity, st.dev=FALSE)
  sp = sapply(cc, specificity, st.dev=FALSE)
  # Thresholds: 0.5, maximum Kappa, sensitivity = specificity, maximum sensitivity + specificity
  I = c("0.5T"=imin(abs(tt-0.5)), KMT=imax(K), MDT=imin(abs(se-sp)), MST=imax(se+sp))
  M = cbind( tt[I], K[I], se[I], sp[I] )
  dimnames(M)[[1]] = names(I)
  dimnames(M)[[2]] = c("threshold","Kappa","sensitivity","specificity")
  return(M)
}
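# An illustrative call (assumed, not from the original appendix): for a
# hypothetical vector of true occupancy probabilities, theoretical.cmx gives
# the expected confusion matrix at a single threshold, and theoretical.maxes
# tabulates the four threshold criteria defined above.
set.seed(2)
p.true <- runif(1000)                           # hypothetical occupancy probabilities
theoretical.cmx(threshold=0.5, presence.prob=p.true)
theoretical.maxes(p.true, sample.prev=0.5)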
##################################################
# Functions to calculate the theoretically best model
# for given alpha, beta and desired sample prevalence
##################################################
# Expected weight of a presence / an absence at each site, rescaled so that
# the sample has the desired prevalence
sample.presence.prob = function(presence.prob, sample.prev)
  presence.prob * sample.prev / mean(presence.prob)
sample.absence.prob = function(presence.prob, sample.prev)
  (1-presence.prob) * (1-sample.prev) / (1-mean(presence.prob))
# Returns -2 x the expected (prevalence-weighted) log-likelihood; this is the
# quantity minimized by opt.model below.
model.log.likelihood = function(params, X, presence.prob, sample.prev) {
  P = plogis(X %*% params)
  P = ifelse( P < 1e-10, 1e-10, P )      # Avoid infinite logs
  P = ifelse( 1-P < 1e-10, 1-1e-10, P )
  -2*sum(log(P)*sample.presence.prob(presence.prob,sample.prev) +
         log(1-P)*sample.absence.prob(presence.prob,sample.prev))
}
# X here should be a model matrix like that produced by
# model.matrix.logitreg. Note that ... can be used to pass additional
# options to optim, such as method (method="BFGS" seems to work well)
# and control (control=list(maxit=500) is often a useful choice).
opt.model = function(X, presence.prob, sample.prev,
                     return.fit=TRUE, start=rep(0,dim(X)[2]), ...) {
  fit <- optim(start, model.log.likelihood,
               X=X, presence.prob=presence.prob,
               sample.prev=sample.prev, ...)
  fit$sample.prev = sample.prev
  if (return.fit)
    return(fit)
  else
    return(fit$par)
}
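# A hedged end-to-end sketch (all object names below are illustrative, not from
# the original appendix): define a "true" probability of occupancy along a
# single environmental gradient, fit the theoretically best logistic model for
# a chosen sample prevalence with opt.model, and evaluate it with
# theoretical.maxes.
env    <- seq(-3, 3, length.out=500)                    # hypothetical gradient
p.true <- plogis(1 - env^2)                             # assumed true occupancy probability
X.env  <- model.matrix.logitreg(data.frame(g=env, g2=env^2))
fit.best <- opt.model(X.env, presence.prob=p.true, sample.prev=0.5,
                      method="BFGS", control=list(maxit=500))
p.model <- plogis(X.env %*% fit.best$par)               # probabilities from the fitted model
theoretical.maxes(p.true, model.prob=p.model, sample.prev=0.5)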