# SUPPORTING INFORMATION
#
# Using virtual species to study species distributions and model performance
# Christine N. Meynard and David M. Kaplan
# Journal of Biogeography
#
# Appendix S2 R scripts associated with the simulations and theoretical
# calculations.

######################################################################
#
# This code is based on the logitreg function to be found in Venables
# and Ripley (2002) p 445. It is meant to fit binomial GLMs by direct
# maximization (see pages 198-199 of the same book). DMK has modified
# it a bit to correct certain problems when running the book code in R
# (version 2.14.1 was used) and so that it returns an object of class
# logitreg that has a predict method, etc.
#
# x, the matrix of variables passed to logitreg.dmk, should be a data
# frame.
#
# Venables WN, Ripley BD (2002) Modern applied statistics with
# S. Springer, New York
#
######################################################################
#
# $Id: Appendix_logitreg.dmk.R 4177 2012-01-06 15:49:16Z dk1f0c9 $
#
# Copyright (C) 2012 David M. Kaplan
# Licence: GPL (Gnu Public License)
#
######################################################################

# Build the design matrix used by logitreg.dmk and predict.logitreg.
#
# Arguments:
#   x         - predictors; a data frame, a matrix, or a plain vector (a
#               vector is treated as a single-column matrix).
#   intercept - if TRUE (default) the first column produced by
#               model.matrix (the intercept) is kept; if FALSE it is
#               dropped.
#
# Returns the matrix produced by model.matrix(~ ., ...) on x.
#
# NOTE(review): the name follows the S3 method pattern
# model.matrix.<class>, but in this file the function is only ever
# called directly with raw predictors, never via dispatch.
model.matrix.logitreg <- function(x, intercept = TRUE) {
  # A bare vector has no dim; give it one column so as.data.frame works
  # on it like on a matrix.
  if (is.null(dim(x))) dim(x) <- c(length(x), 1)

  X <- model.matrix(~ ., data = as.data.frame(x))

  if (!intercept) {
    # drop = FALSE keeps a one-predictor design as a matrix.
    X <- X[, -1, drop = FALSE]
  }

  X
}

# Fit a binomial (logistic) regression by direct minimisation of the
# weighted deviance with optim(method = "BFGS").
#
# Arguments:
#   x         - predictors, passed to model.matrix.logitreg.
#   y         - response; logical / 0-1 values, or a factor (in which case
#               every level other than the first counts as a success).
#   wt        - per-observation weights (default: all 1).
#   start     - starting coefficients (default: all 0; the default refers
#               to the local p and is only evaluated once p is known).
#   intercept - whether to include an intercept column.
#   ...       - further arguments forwarded to optim.
#
# Prints the coefficients, the residual deviance and optim's convergence
# code, and invisibly returns an object of class "logitreg" with elements
# converged (optim's convergence code; 0 means success), deviance,
# intercept, coefficients and fitted.values.
logitreg.dmk <- function(x, y, wt = rep(1, length(y)), start = rep(0, p),
                         intercept = TRUE, ...) {
  # Objective: weighted deviance, i.e. -2 * weighted log-likelihood.
  fmin <- function(beta, X, y, w) {
    p <- plogis(X %*% beta)
    -sum(2 * w * ifelse(y, log(p), log(1 - p)))
  }

  # Gradient of fmin with respect to beta (returned as a 1 x p matrix).
  gmin <- function(beta, X, y, w) {
    eta <- X %*% beta
    p <- plogis(eta)
    -2 * matrix(w * dlogis(eta) * ifelse(y, 1 / p, -1 / (1 - p)), 1) %*% X
  }

  X <- model.matrix.logitreg(x, intercept = intercept)
  p <- dim(X)[2]
  dn <- dimnames(X)[[2]]

  # Factor response: first level is "failure", anything else "success".
  if (is.factor(y)) y <- (unclass(y) != 1)

  fit <- optim(start, fmin, gmin, X = X, y = y, w = wt, method = "BFGS", ...)
  names(fit$par) <- dn

  cat("\nCoefficients:\n")
  print(fit$par)
  cat("\nResidual Deviance:", format(fit$value), "\n")
  cat("\nConvergence message:", fit$convergence, "\n")

  r <- list(converged = fit$convergence,
            deviance = fit$value,
            intercept = intercept,
            coefficients = fit$par,
            fitted.values = plogis(X %*% fit$par))
  class(r) <- "logitreg"
  invisible(r)
}

# Predict from a fitted "logitreg" object.
#
# Arguments:
#   obj  - object returned by logitreg.dmk.
#   x    - new predictors, in the same form as those used for fitting.
#   type - "link" (default) for the linear predictor, "response" for
#          probabilities (plogis of the linear predictor).
#
# Returns a one-column matrix of predictions.
predict.logitreg <- function(obj, x, type = c("link", "response")) {
  type <- match.arg(type)
  X <- model.matrix.logitreg(x, intercept = obj$intercept)
  Y <- X %*% obj$coefficients
  switch(type, link = Y, response = plogis(Y))
}

#Example
#options(contrasts=c("contr.treatment","contr.poly"))
#X <- model.matrix(low~., data=bwt)
#logitreg.dmk(X, bwt$low)

######################################################################
#
# This script creates some functions to do theoretical calculations of
# confusion matrices and optimal models for a species probability of
# occupancy that is given and defined at a finite number of locations.
#
######################################################################
#
# $Id: logitreg.dmk.R 4174 2012-01-06 15:37:43Z dk1f0c9 $
#
# Copyright (C) 2012 David M. Kaplan
# Licence: GPL (Gnu Public License)
#
######################################################################

##################################################
# Some basic useful functions and libraries
##################################################
library(PresenceAbsence)

# Index of the first minimum of x.
imin <- function(x) {
  which(min(x) == x)[1]
}

# Index of the first maximum of x.
imax <- function(x) {
  which(max(x) == x)[1]
}

# Build coordinate grids from two vectors. Returns a list with matrices
# x and y, each of dimension length(b) x length(a): x repeats a along
# its rows, y repeats b down its columns.
meshgrid <- function(a, b) {
  list(
    x = outer(b, a, FUN = function(x, y) y),
    y = outer(b, a, FUN = function(x, y) x)
  )
}

##################################################
# Calculate theoretical confusion matrix and
# optimal thresholds, including sample bias.
##################################################

# Expected ("theoretical") confusion matrix for a model thresholded at
# `threshold`, given the true probability of presence at each site.
#
# Arguments:
#   threshold     - sites with model.prob > threshold are predicted present.
#   presence.prob - true probability of presence at each site (vector,
#                   matrix or data frame; flattened to a vector).
#   model.prob    - model-predicted probability at each site (default:
#                   the true probabilities).
#   sample.prev   - prevalence of presences in the sample (default: the
#                   species prevalence, mean(presence.prob)).
#
# Returns a 2 x 2 table with dimnames predicted = (1, 0) and
# observed = (1, 0). Presence cells are rescaled by
# sample.prev / species.prev and absence cells by
# (1 - sample.prev) / (1 - species.prev).
theoretical.cmx <- function(threshold, presence.prob, model.prob = presence.prob,
                            sample.prev = mean(presence.prob)) {
  # Flatten first: the defaults of model.prob and sample.prev are
  # evaluated lazily and therefore see the flattened presence.prob.
  presence.prob <- as.vector(as.matrix(presence.prob))
  model.prob <- as.vector(as.matrix(model.prob))

  predicted.present <- model.prob > threshold
  species.prev <- mean(presence.prob)

  # Expected number of presences among predicted-present sites (PI) and
  # among predicted-absent sites (PI2).
  PI <- sum(presence.prob[predicted.present])
  PI2 <- sum(presence.prob) - PI

  # Number of sites predicted present (N) and predicted absent (N2).
  N <- sum(predicted.present)
  N2 <- length(predicted.present) - N

  # Actual elements of table
  v1 <- PI * sample.prev / species.prev
  v2 <- PI2 * sample.prev / species.prev
  v3 <- (N - PI) * (1 - sample.prev) / (1 - species.prev)
  v4 <- (N2 - PI2) * (1 - sample.prev) / (1 - species.prev)

  # Return table (filled column-wise: observed = 1 first, then observed = 0)
  tab <- as.table(matrix(1.0 * c(v1, v2, v3, v4), nrow = 2))
  dimnames(tab) <- list(predicted = c(1, 0), observed = c(1, 0))
  tab
}

# Kappa, sensitivity and specificity of the theoretical confusion
# matrix. All arguments are forwarded to theoretical.cmx. The three
# statistics come from the PresenceAbsence package.
theoretical.stats <- function(...) {
  cc <- theoretical.cmx(...)
  c(Kappa = Kappa(cc, st.dev = FALSE),
    sensitivity = sensitivity(cc, st.dev = FALSE),
    specificity = specificity(cc, st.dev = FALSE))
}

# A function that returns thresholds and maxes
#
# Scans thresholds threshold.tol, 2 * threshold.tol, ..., 1 - threshold.tol
# and returns a 4 x 4 matrix (columns threshold, Kappa, sensitivity,
# specificity) with one row per threshold criterion:
#   0.5T - threshold closest to 0.5
#   KMT  - threshold maximising Kappa
#   MDT  - threshold minimising |sensitivity - specificity|
#   MST  - threshold maximising sensitivity + specificity
# Additional arguments in ... are forwarded to theoretical.cmx.
theoretical.maxes <- function(presence.prob, threshold.tol = 0.01, ...) {
  tt <- seq(threshold.tol, 1 - threshold.tol, threshold.tol)

  cc <- mapply(theoretical.cmx, threshold = tt,
               MoreArgs = list(presence.prob = presence.prob, ...),
               SIMPLIFY = FALSE)

  K <- sapply(cc, Kappa, st.dev = FALSE)
  se <- sapply(cc, sensitivity, st.dev = FALSE)
  sp <- sapply(cc, specificity, st.dev = FALSE)

  idx <- c("0.5T" = imin(abs(tt - 0.5)),
           KMT = imax(K),
           MDT = imin(abs(se - sp)),
           MST = imax(se + sp))

  M <- cbind(tt[idx], K[idx], se[idx], sp[idx])
  dimnames(M) <- list(names(idx),
                      c("threshold", "Kappa", "sensitivity", "specificity"))
  M
}

##################################################
# Stuff to calculate theoretically best model for
# given alpha, beta and desired sample prevalence
##################################################

# Per-site presence weight after rescaling so that the mean equals
# sample.prev.
sample.presence.prob <- function(presence.prob, sample.prev) {
  presence.prob * sample.prev / mean(presence.prob)
}

# Per-site absence weight after rescaling so that the mean equals
# 1 - sample.prev.
sample.absence.prob <- function(presence.prob, sample.prev) {
  (1 - presence.prob) * (1 - sample.prev) / (1 - mean(presence.prob))
}

# Objective minimised by opt.model: -2 * expected log-likelihood of a
# logistic model with coefficients `params` and design matrix X, with
# presences and absences weighted by sample.presence.prob and
# sample.absence.prob.
model.log.likelihood <- function(params, X, presence.prob, sample.prev) {
  P <- plogis(X %*% params)
  P <- ifelse(P < 1e-10, 1e-10, P) # Avoid infinite logs
  P <- ifelse(1 - P < 1e-10, 1 - 1e-10, P)

  -2 * sum(log(P) * sample.presence.prob(presence.prob, sample.prev) +
             log(1 - P) * sample.absence.prob(presence.prob, sample.prev))
}

# X here should be a model matrix like that produced by
# model.matrix.logitreg. Note ... can be used to input additional
# options to optim, such as method (method="BFGS" seems to work well)
# and control (control=list(maxit=500) would seem to be a useful
# choice).
#
# Returns the optim fit with an added sample.prev element when
# return.fit is TRUE (default), otherwise only the fitted parameters.
opt.model <- function(X, presence.prob, sample.prev,
                      return.fit = TRUE,
                      start = rep(0, dim(X)[2]), ...) {
  fit <- optim(start, model.log.likelihood, X = X,
               presence.prob = presence.prob, sample.prev = sample.prev, ...)
  fit$sample.prev <- sample.prev

  if (return.fit) {
    return(fit)
  }
  fit$par
}