1 2 Figure S1: Fits of seven models to rarefied phylogenetic gain values from Gibbons et al. 2013. 1 3 4 5 6 7 Figure S2: Sensitivity of neutral model predictions to the probability of an immigrant replacing open sites in the local community (m). Local-global overlap in phylogenetic diversity is plotted as a function of the ratio of community size and global richness (N/S). 8 9 10 11 2 12 13 14 15 Table S1: Data sources used for phylogenetic overlap and global richness estimation. Local samples are extremely deeply sequenced samples all derived from a single study (Caporaso et al. 2011). The global datasets are broadly spatially distributed surveys at the continental to global scale, and were obtained from the Qiime and Earth Microbiome Project databases. Local samples Human gut Human tongue Human skin Ocean Lake Sample ID Amz30adlt.418837 Amz1teenF.418404 USygt15.F.418688 M31Tong F11Tong M31Plmr M11Plmr F11Plmr seqs_I1.631407 NP3 NP5 LMEpi24M # sequences 3.8M 3.8M 3.5M 2.1M 1.8M 0.75M 0.47M 0.20M 10.7M 1.2M 1.4M 2.3M Source QIIME DB study 805 QIIME DB study 805 QIIME DB study 805 EMP study 722 EMP study 722 EMP study 722 EMP study 722 EMP study 722 ERP001778 EMP study 722 EMP study 722 EMP study 722 1.5M EMP study 722 CC1 SV1 CL3 Description Amazon adult Amazon teen U.S. teen Male #31 Female #11 Male #31 Male #11 Female#11 English Channel Newport Pier sample 3 Newport Pier sample 5 Lake Mendota, WI, US Sparkling Lake, WI, US Cedar Creek Sevilleta LTER Calhoun LTSE 1.5M 1M 1.1M EMP study 722 EMP study 722 EMP study 722 Source QIIME DB study 850 QIIME DB study 968 QIIME DB study 969 QIIME DB study 968 QIIME DB study 969 ICOMM EMP study 945 EMP study 1041 QIIME DB study 1552 EMP study 1242 EMP study 1288 EMP study 1627 QIIME DB study 619 EMP study 808 QIIME DB study 1792 QIIME DB study 103 QIIME DB study 213 EMP study 1702 QIIME DB study 397 # sequences 1,094M 1.8M 2.9M 1.9M 1.6M 9.1M 59M 4.5M 0.26M 45M 67M 1.4M 0.37M 2.5M 3.8M 0.15M 0.12M 1.9M 0.038M % of sequences used ~1% 100% 100% 100% 100% 100% 1.7% 22% 100% 1.5% 100% 100% 100% 44% 26% 100% 100% 52% 100% Source # sequences % of sequences used QIIME DB study 850 QIIME DB study 968, 969 QIIME DB study 968, 969 ICOMM EMP study 945 EMP study 632 1,094M 4.7M ~1% 100% 3.5M 100% 9.1M 59M 2.8M 100% 9% 100% SLEpi20M Soil Global datasets Human Gut Human Tongue Human Skin Ocean Lake Soil Sets used to estimate global OTU richness (S) Human gut Human Tongue Human Skin Ocean Lake Soil 16 17 3 18 Table S2: Justifications of estimated environmental sample sizes used in Fig. 2. Environment Human gut Human tongue Human skin Ocean Lake Soil Estimated typical sample size 1 x 1011 cells 1 x 107 cells 1 x 104 cells 2 x 108 cells 2 x 108 cells 1 x 108 cells Justification Franks et al. 1998 Appl Environ Microbiol Swab covers approximately 5% of mouth ~4 cm2 swab 2000 ml sample x 1e5 cells/ml 200 ml sample x 1e6 cells/ml one gram of soil 19 20 4 21 Code for neutral model of local-global overlap of phylogenetic diversity 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 rm(list=ls()) 69 70 71 72 73 74 75 ####### iterate across variety of global diversities # set global diversities to consider Ss=c(10,50,100,500,1000,5000,10000,50000,100000) #Note: 5e4 and 1e5 can take a long time to run library(ape) library(picante) # function for running neutral community simulations untbJones<function(start,prob=0,D=1,gens=150,keep=FALSE,meta=rep(1/length(start),length (start))){ # start: vector of starting abundances (length of start should be richness of metacommunity); # sum of start is local population size # prob: probability of immigrant vs. internal reproduction # D: number of deaths per generation # gens: number of generations to simulate # keep: if true, keep all time points # meta: vector representing probabilities of immigration from metacommunity; related to metacommunity SAD out=matrix(0,gens,length(start)) out[1,]=start N=sum(start) #local population size S=length(start) #metacommunity diversity for(i in 2:gens){ trans=rep(0,S) deaths=table(sample(rep(1:S,times=out[(i-1),]),D,replace=TRUE)) trans[as.numeric(names(deaths))]=-deaths Nbirths=sum(runif(D)>prob) births=table(sample(1:S,Nbirths,replace=TRUE,prob=(out[(i1),]+trans)/(N-D))) trans[as.numeric(names(births))]=trans[as.numeric(names(births))]+births imms=table(sample(1:S,D-Nbirths,replace=TRUE,prob=meta)) trans[as.numeric(names(imms))]=trans[as.numeric(names(imms))]+imms out[i,]=out[(i-1),]+trans } colnames(out)=1:S if(keep){ return(out) }else{ return(out[gens,]) } } # set replicates per global diversity and matrix to store results reps=10 5 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 collectSims=matrix(NA,reps,length(Ss)) collectPhyloSor=collectSims # local population size N=1000 # number of generations gens=1000 # iterate across global diversities for(i in 1:length(Ss)){ # generate global SAD (log normal) meanSAD=-13 # average mean of SAD estimated from observations sdSAD=1.43 # average sd of SAD estimated from observations lnDraw=rlnorm(Ss[i],meanSAD,sdSAD) SAD=lnDraw/sum(lnDraw) #global species abundance distribution # repeat process reps times for(j in 1:reps){ # create starting local community sam=table(sample(1:Ss[i],N,replace=TRUE)) start=rep(0,Ss[i]) start[as.numeric(names(sam))]=sam names(start)=1:Ss[i] tree=rtree(Ss[i],tip.label=1:Ss[i],rooted=TRUE) cur=untbJones(start=start,prob=0.5,D=1,gens=gens,keep=FALSE,meta=SAD) collectSims[j,i]=sum(cur>0)/Ss[i] collectPhyloSor[j,i]=phylosor(rbind(1,(cur>0)*1),tree) } } #Summarize replicate model runs UNiterMeans=colMeans(collectSims) UNiterSDs=apply(collectSims,2,sd) PSiterMeans=colMeans(collectPhyloSor) PSiterSDs=apply(collectPhyloSor,2,sd) PSiter=rbind(PSiterMeans,PSiterSDs) colnames(PSiter)=Ss colnames(storePhyloSor)=Ss # Fit model to predictions NSs=log10(N/Ss) # Likelihood function for fitting statistical model to simulation results fitSig<-function(p,x,y){ yhat=(x-p[1])^p[3]/(p[2]+(x-p[1])^p[3]) -sum(dnorm(y,yhat,exp(p[4]),log=TRUE)) } # Estimate statistical model parameters CompOverlapFit=optim(c(2,0,2,1),fitSig,x=NSs,y=UNiterMeans,control=list(maxit=1000000)) dev.new() 6 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 # Plot statistical model predictions plot(log10(N/Ss),UNiterMeans,pch=16,cex=1,xlab="log10(N/S)",ylab="Composition al overlap",ylim=c(0,1),xlim=c(-2,8)) arrows(log10(N/Ss),UNiterMeans+2*UNiterSDs,log10(N/Ss),UNiterMeans2*UNiterSDs,code=0) fit=CompOverlapFit lines(xp,(xp-fit$par[1])^fit$par[3]/(fit$par[2]+(xpfit$par[1])^fit$par[3]),lwd=2) # Repeat statistical model fits and plotting for phylogenetic overlap PhyloSorFit=optim(c(2,0,2,1),fitSig,x=NSs,y=PSiterMeans,control=list(maxit=1000000)) dev.new() plot(log10(N/Ss),PSiterMeans,pch=16,cex=1,xlab="log10(N/S)",ylab="Phylogeneti c overlap",ylim=c(0,1),xlim=c(-2,8)) arrows(log10(N/Ss),PSiterMeans+2*PSiterSDs,log10(N/Ss),PSiterMeans2*PSiterSDs,code=0) fit=PhyloSorFit lines(xp,(xp-fit$par[1])^fit$par[3]/(fit$par[2]+(xpfit$par[1])^fit$par[3]),lwd=2) 7