Geographic State Speciation and Extinction - Víctor Soria

advertisement
Running an R application - GeoSSE
(Geographic State Speciation and
Extinction) analyses run using R
and parallel functions
Download this file
http://goo.gl/rQL8Hd
Victor Soria-Carrasco
v.soria-carrasco@shef.ac.uk
GeoSSE
• Belongs to a family of likelihood-based methods for
studying diversification and character evolution using
phylogenetic trees (BiSSE - Binary State Speciation and
Extinction)
• GeoSSE models the reciprocal effects between
geographic range evolution and diversification
(speciation and extinction)
• It estimates region-dependent rates of speciation,
extinction and dispersal.
• R package diversitree
A shell script is used to call the R script
R_job.sh
#!/bin/bash
#$ -l h_rt=08:00:00
#$ -l mem=2G
#$ -m bea
#$ -M user@sheffield.ac.uk
R CMD BATCH --vanilla myscript.R
myscript.R
# Load diversitree package
library (diversitree)
# Load phylogenetic trees
trees<-read.tree(“mytrees.new”)
# Load states
states<-read.table(“mytable.txt”,
header=T)
...
$ qsub R_job.sh -j y -o rjob.log
Example 1
Running R using
multiple cores of 1 node
(shared memory)
Example 1. Run GeoSSE using multiple cores of 1 node (shared memory)
# Launch an interactive session
$ qrsh
# Copy the input files (trees and table with geographic states) to
your home folder
$ cp
/usr/local/extras/Genomics/HPC_course/R_GeoSSE/example_trees_10.new
$HOME/
$ cp
/usr/local/extras/Genomics/HPC_course/R_GeoSSE/example_states.txt
$HOME/
# Copy the R script to your home folder
$ cp /usr/local/extras/Genomics/submit_scripts/geosse_openmp.R
$HOME/
# Copy the shell script to call the R script to your home folder
$ cp /usr/local/extras/Genomics/submit_scripts/R_job_openmp.sh
$HOME/
Example 1. Run GeoSSE using multiple cores of 1 node (shared memory)
# If you experience problems, you can download all the files from
the web:
$ wget http://goo.gl/9GVykX
$ tar -xvf R_GeoSSE.tar.gz
$ rm R_GeoSSE.tar.gz
Example 1. Run GeoSSE using multiple cores of 1 node (shared memory)
# Check you have the following 4 files in your home directory:
example_states.txt
example_trees_10.new
geosse_openmp.R
R_job_openmp.sh
# Edit the shell script to change number of processors, e-mail
address, and path to R script
$ nano $HOME/R_job_openmp.sh
R_job_openmp.sh
#!/bin/bash
#$ -l h_rt=08:00:00
#$ -l mem=2G
#$ -pe openmp 2
#$ -m bea
#$ -M user@sheffield.ac.uk
# Number of cores to be used
export OMP_NUM_THREADS=2
# Add path to software repository libraries
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/extras/Genomics/lib/lib
R='/usr/local/extras/Genomics/apps/R/current/bin/R'
# Change R script path
RSCRIPT="$HOME/geosse_openmp.R"
# Some options
#
# --slave
#
Don't print input commands
#
# --vanilla (highly recommended)
#
Don't load anything from previous sessions,
#
and don't save data at the end of the current session
$R CMD BATCH --vanilla $RSCRIPT
Example 1. Run GeoSSE using multiple cores of 1 node (shared memory)
# Edit the R script to change input and output files
$ nano $HOME/geosse_openmp.R
geosse_openmp.R (1)
# Load parallel package
library(parallel)
# Number of cores
ncores<-2
# Load diversitree package
library(diversitree)
# Load phylogenetic trees
trees<-read.tree(“/home/user/example_trees_10.new”)
# Load states
states<-read.table(”/home/user/example_states.txt”, header=T)
tip.states<-states$state
names(tip.states)<-states$taxon
# Fit birth-death model to get an extinction/speciation ratio (eps)
# and use it as starting value for GeoSSE
bd.rates<-mclapply(trees, birthdeath)
geosse_openmp.R (2)
# Fit GeoSSE model by maximum likelihood to the trees
ML.est<-mclapply(1:length(trees),
function(x){
model<-make.geosse(
tree=trees[[x]],
states=tip.states)
start.values<-starting.point.geosse(trees[[x]], eps=bd.rates[[x]]$par[1])
res<-find.mle(model, x.init=start.values, method="subplex")
cat("Job # ", x, " finished\n", sep="")
return(res)
},
mc.cores=ncores)
# Format results into a table
ML.est.pars.table<-as.data.frame(do.call("rbind",
lapply(ML.est, function(x) c(x$par, npar=length(x$par),
lnl=x$lnLik))))
# Write the table to a file
output<-paste(Sys.getenv("HOME"), "/example_openmp_output.txt", sep='')
write.table(ML.est.pars.table, file=output,
row.names=F, quote=F, sep="\t")
# Save image of this session
image<-paste(Sys.getenv("HOME"), "/example_openmp_image.RData", sep='')
save.image(file=image, compress="bzip2")
Example 1. Run GeoSSE using multiple cores of 1 node (shared memory)
# Submit the job to the cluster
$ qsub -j y -o geosse_openmp.log R_job_openmp.sh
# Check if the job is running
$ Qstat
# When the job finishes, look at the logs and the output
$ less -S geosse_openmp.log
$ less -S geosse_openmp.Rout
$ less -S example_openmp_output.txt
Example 2
Running R using
multiple nodes
(distributed memory)
Example 2. Run GeoSSE using multiple nodes (distributed memory)
# Copy the R script to your home folder
$ cp /usr/local/extras/Genomics/submit_scripts/geosse_mpi.R $HOME/
# Copy the shell script to call the R script to your home folder
$ cp /usr/local/extras/Genomics/submit_scripts/R_job_mpi.sh $HOME/
# Check you have the following 4 files in your home directory:
example_states.txt
example_trees_10.new
geosse_mpi.R
R_job_mpi.sh
# Edit the shell script to change number of nodes, e-mail address,
and path to R script
$ nano $HOME/R_job_mpi.sh
#!/bin/bash
#$ -l h_rt=08:00:00
#$ -l mem=2G
#$ -pe ompigige 4
#$ -m bea
#$ -M user@sheffield.ac.uk
R_job_mpi.sh
# Required for running Rmpi
module add mpi/gcc/openmpi/1.4.4
export MPI_ROOT='/usr/local/mpi/gcc/openmpi/1.4.4/'
export OMPI_MCA_mtl=^psm
# Add path to software repository libraries
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/extras/Genomics/lib/lib
R='/usr/local/extras/Genomics/apps/R/current/bin/R'
# Change R script path
RSCRIPT=”$HOME/geosse_mpi.R"
# Some options
#
# --slave
#
Don't print input commands
#
# --vanilla (highly recommended)
#
Don't load anything from previous sessions,
#
and don't save data at the end of the current session
$R CMD BATCH --vanilla $RSCRIPT
Example 2. Run GeoSSE using multiple nodes (distributed memory)
# Edit the R script to change input and output files
$ nano $HOME/geosse_mpi.R
geosse_mpi.R (1)
# This R script uses multiple nodes to fit a GeoSSE model
# to multiple phylogenetic trees
# Ignore Infiniband interface
Sys.setenv(OMPI_MCA_mtl='^psm')
# Echo commands
options(echo=T)
# Load Rmpi package
library (Rmpi)
# Load diversitree package
library (diversitree)
# Load phylogenetic trees
trees<-read.tree(“/home/user/example_trees_10.new”)
# Load states
states<-read.table(”/home/user/example_states.txt”, header=T)
tip.states<-states$state
names(tip.states)<-states$taxon
geosse_mpi.R (2)
# Number of nodes
nnodes<-4
# spawn mpi slaves
mpi.spawn.Rslaves(nslaves=nnodes, needlog=T)
# load diversitree on slaves
mpi.bcast.cmd(library(diversitree))
# Broadcast trees to slaves
mpi.bcast.Robj2slave(trees)
# Broadcast tip.states to slaves
mpi.bcast.Robj2slave(tip.states)
# Fit birth-death model to get an extinction/speciation ratio (eps)
# and use it as starting value for GeoSSE
bd.rates<-mpi.parLapply(trees, birthdeath)
# Broadcast bd.rates to slaves
mpi.bcast.Robj2slave(bd.rates)
geosse_mpi.R (3)
# Fit GeoSSE model by maximum likelihood
ML.est<-mpi.parLapply(1:length(trees),
function(x){
model<-make.geosse(
tree=trees[[x]],
states=tip.states)
start.values<-starting.point.geosse(trees[[x]], eps=bd.rates[[x]]$par[1])
res<-find.mle(model, x.init=start.values, method="subplex")
cat("Job # ", x, " finished\n", sep="")
return(res)
},
job.num=nnodes)
# Format results into a table
ML.est.pars.table<-as.data.frame(do.call("rbind",
lapply(ML.est, function(x) c(x$par, npar=length(x$par),
lnl=x$lnLik))))
# Write the table to a file
output<-paste(Sys.getenv("HOME"), "/example_mpi_output.txt", sep='')
write.table(ML.est.pars.table, file=output,
row.names=F, quote=F, sep="\t")
# Save an image of this session
image<-paste(Sys.getenv("HOME"), "/example_mpi_image.RData", sep='')
save.image(file=image, compress="bzip2")
Example 2. Run GeoSSE using multiple nodes (distributed memory)
# Submit the job to the cluster
$ qsub -j y -o geosse_mpi.log R_job_mpi.sh
# Check if the job is running
$ Qstat
# When the job finishes, look at the logs
$ less -S geosse_mpi.log
$ less -S geosse_mpi.Rout
# look at the log for each node
$ find . -name "*node*.log" -exec sh -c "echo {}; cat {} | sed
's/^/\t/g'" \;
# Output (results)
$ less -S example_mpi_output.txt
https://myapps.shef.ac.uk
bo4cm14 / gen0mics
Download