Supplement 5

advertisement
# Supplemental Table S3. Description of R functions and packages used in the analyses.
# Start from an empty workspace; the ls() call prints what is left so the
# result of the clean-up is visible in the console.
rm(list = ls())
ls()
# Output files are written relative to the working directory. The project path
# is specific to the original author's machine, so only switch to it when it
# exists instead of aborting the whole script with a setwd() error.
project_dir <- "/Users/vibelamk/2012_projects/Field_BothYears_Analysis/14_PooledAnalysis_No_GE"
if (dir.exists(project_dir)) {
  setwd(project_dir)
} else {
  message(
    "Project directory not found; output files will be written to ",
    getwd()
  )
}
# Read the data from a file picked interactively. The file is parsed as
# tab-delimited text with a header row; the first column supplies row names.
data <- read.csv(file.choose(), header = TRUE, sep = "\t", row.names = 1)
# Basic summary statistics: dimensions, structure, per-column summaries,
# per-column standard deviations and the variance-covariance matrix.
dim(data)
str(data)
summary(data)
sapply(data, sd)
var(data)
# Detailed summary statistics come from the "pastecs" package.
# More information and options for summary statistics using "pastecs" can be
# obtained from the following weblink
# "http://www.ats.ucla.edu/stat/r/faq/basic_desc.htm"
# Install the package only when it is missing, so that re-running the script
# does not re-download it or require network access every time.
if (!requireNamespace("pastecs", quietly = TRUE)) {
  install.packages("pastecs")
}
library(pastecs)
options(scipen = 100) # to avoid scientific notation of numbers displayed
SummaryStatistics <- stat.desc(data)
SummaryStatistics
# Save the table as tab-delimited text in the working directory.
write.table(
  SummaryStatistics,
  file = "SummaryStatistics_PooledAnalysis_NoGXE",
  quote = FALSE,
  sep = "\t"
)
# Exploratory analysis
# Scatter plot matrix of the data.
# Each package is installed only when it is missing, so that re-running the
# script does not re-download it or require network access every time.
if (!requireNamespace("YaleToolkit", quietly = TRUE)) {
  install.packages("YaleToolkit")
}
library(YaleToolkit)
# NOTE(review): the effect of scatter = "" on the upper panels is not evident
# from this script; see the help page opened below.
gpairs(data, upper.pars = list(scatter = ""))
?gpairs
# Parallel coordinate plot of the first 20 columns of the data.
if (!requireNamespace("GGally", quietly = TRUE)) {
  install.packages("GGally")
}
library(GGally)
ggparcoord(data, columns = 1:20)
# Estimation of correlation coefficients and P-values (Pearson).
# Install Hmisc only when it is missing, so that re-running the script does
# not re-download it or require network access every time.
if (!requireNamespace("Hmisc", quietly = TRUE)) {
  install.packages("Hmisc")
}
library(Hmisc)
# rcorr() is given a matrix, so convert the data frame first.
data_matrix <- as.matrix(data)
Correlations <- rcorr(data_matrix, type = "pearson")
?rcorr
Correlations
str(Correlations)
# Correlation coefficients
write.table(
  Correlations$r,
  file = "Correlations_rvalues_PooledAnalysis_NoGXE",
  quote = FALSE,
  sep = "\t"
)
# P-values
write.table(
  Correlations$P,
  file = "Correlations_Pvalues_PooledAnalysis_NoGXE",
  quote = FALSE,
  sep = "\t"
)
# Combined table: correlation coefficients in the lower triangle and P-values
# in the upper triangle. Each matrix is masked to its strict lower triangle;
# the masked P-value matrix is then transposed so it lands in the upper half.
corr_r_P_values <- lower.tri(Correlations$r) * Correlations$r +
  t(lower.tri(Correlations$P) * Correlations$P)
write.table(
  corr_r_P_values,
  file = "Correlations_r_P_values_Combined_NoGXE",
  quote = FALSE,
  sep = "\t"
)
# Correlation coefficients plotted as colored pies.
# Red to blue (-1 to +1): negative values are red, whereas positive values are
# blue. Values close to zero are colorless.
# Install corrgram only when it is missing, so that re-running the script does
# not re-download it or require network access every time.
if (!requireNamespace("corrgram", quietly = TRUE)) {
  install.packages("corrgram")
}
library(corrgram)
?corrgram
# With variable labels
corrgram(
  data,
  order = NULL,
  lower.panel = panel.pie,
  upper.panel = NULL,
  text.panel = panel.txt
)
# Without labels, which is easier to edit using other tools
corrgram(
  data,
  order = NULL,
  lower.panel = panel.pie,
  upper.panel = NULL,
  text.panel = NULL
)
# Note: order = NULL keeps the variables in the same order as the input file.
# Hierarchical clustering analysis
# Transform the variables. The default transformation is type = "sd", i.e.
# subtract the mean and divide by the sd for each trait.
# Install reshape only when it is missing, consistent with how the other
# packages in this script are handled.
if (!requireNamespace("reshape", quietly = TRUE)) {
  install.packages("reshape")
}
library(reshape)
data.std <- rescaler(data)
?rescaler
# Calculate the pairwise distances between rows (dist() defaults to Euclidean).
?dist
data.std.dist <- dist(data.std)
length(data.std.dist)
# The length above is n * (n - 1) / 2 for n rows; with 53 genotypes it should
# be 1378 (53 * 52 / 2).
?hclust
# Single linkage method
data.std.dist.hc.single <- hclust(data.std.dist, method = "single")
plot(data.std.dist.hc.single, hang = -1)
# Ward's linkage method. The method formerly called "ward" was renamed
# "ward.D" in R 3.1.0; "ward.D" is the same algorithm the original call
# selected (requires R >= 3.1.0).
data.std.dist.hc.ward <- hclust(data.std.dist, method = "ward.D")
plot(data.std.dist.hc.ward, hang = -1)
# Cut the Ward dendrogram into four groups so that trait responses can be
# compared among the groups identified by the cluster analysis.
cl <- cutree(data.std.dist.hc.ward, k = 4)
# Work on a copy of the data so the cluster label can be added as a column
# without touching the original data frame.
data_4factor_inclustering <- data
# Show the first 20 columns for the members of each cluster in turn.
data_4factor_inclustering[cl == 1, 1:20]
data_4factor_inclustering[cl == 2, 1:20]
data_4factor_inclustering[cl == 3, 1:20]
data_4factor_inclustering[cl == 4, 1:20]
# Store the cluster membership as a factor column named "cl".
data_4factor_inclustering$cl <- as.factor(cl)
# Per-cluster means of the first 20 columns (one named vector per cluster).
cluster1_means <- colMeans(
  data_4factor_inclustering[data_4factor_inclustering$cl == 1, 1:20]
)
cluster2_means <- colMeans(
  data_4factor_inclustering[data_4factor_inclustering$cl == 2, 1:20]
)
cluster3_means <- colMeans(
  data_4factor_inclustering[data_4factor_inclustering$cl == 3, 1:20]
)
cluster4_means <- colMeans(
  data_4factor_inclustering[data_4factor_inclustering$cl == 4, 1:20]
)
# Per-cluster descriptive statistics via pastecs::stat.desc().
library(pastecs)
options(scipen = 100) # to avoid scientific notation of numbers displayed
cluster1_summary <- stat.desc(
  data_4factor_inclustering[data_4factor_inclustering$cl == 1, 1:20]
)
cluster2_summary <- stat.desc(
  data_4factor_inclustering[data_4factor_inclustering$cl == 2, 1:20]
)
cluster3_summary <- stat.desc(
  data_4factor_inclustering[data_4factor_inclustering$cl == 3, 1:20]
)
cluster4_summary <- stat.desc(
  data_4factor_inclustering[data_4factor_inclustering$cl == 4, 1:20]
)
# Bind the four summary tables side by side and save them as tab-delimited
# text in the working directory.
write.table(
  data.frame(
    cluster1_summary,
    cluster2_summary,
    cluster3_summary,
    cluster4_summary
  ),
  file = "Clusters_SummaryStatistics_PooledAnalysis_NoGXE",
  quote = FALSE,
  sep = "\t"
)
# Collect the cluster means in a data frame: one column per cluster, one row
# per trait.
dataframe_clusterMeans <- data.frame(
  cluster1_means,
  cluster2_means,
  cluster3_means,
  cluster4_means
)
dataframe_clusterMeans
# Flip the cluster-means table for the parallel coordinate plot of cluster
# means: afterwards each row is a cluster and each column a trait.
# The data frame is converted to a numeric matrix before t() is applied, then
# turned back into a data frame.
t_dataframe_clusterMeans <- data.frame(
  t(data.matrix(dataframe_clusterMeans))
)
t_dataframe_clusterMeans
# Add the row names as a regular column (named "row.names" after the variable
# holding them) so the plot can group lines by cluster.
row.names <- row.names(t_dataframe_clusterMeans)
t_dataframe_clusterMeans_clusternames <- data.frame(
  t_dataframe_clusterMeans,
  row.names
)
t_dataframe_clusterMeans_clusternames
# Parallel coordinate plot of all cases, one color per cluster.
library(GGally)
?ggparcoord
ggparcoord(
  data = data_4factor_inclustering,
  columns = 1:20,
  groupColumn = "cl"
)
# The same plot split into four stacked panels, one per cluster.
# ggparcoord uses the same scaling system as rescaler (subtract the mean and
# divide by the s.d.).
ggparcoord(
  data = data_4factor_inclustering,
  columns = 1:20,
  groupColumn = "cl"
) +
  facet_wrap(~ cl, ncol = 1)
# Parallel coordinate plot of the cluster means, one line per cluster.
ggparcoord(
  data = t_dataframe_clusterMeans_clusternames,
  columns = 1:20,
  groupColumn = "row.names"
)
# Principal component analysis (PCA)
?prcomp
?biplot
# Variables are scaled to unit variance before the decomposition. The argument
# is spelled out as `scale.` (its real name in prcomp) instead of relying on
# partial matching, and TRUE is used instead of the reassignable T.
data.pca <- prcomp(data, scale. = TRUE, retx = TRUE)
PCA_summary <- summary(data.pca)
PCA_summary
screeplot(data.pca, type = "l")
print(data.pca)
# Save the component standard deviations, the rotation matrix and the scores
# as tab-delimited text in the working directory.
write.table(PCA_summary$sdev, file = "PCA_sdev_NoGXE", quote = FALSE, sep = "\t")
# Note: despite its name, this file holds the rotation matrix. The file name
# is kept unchanged so existing downstream files and scripts still match.
write.table(PCA_summary$rotation, file = "PCA_EigenValues_NoGXE", quote = FALSE, sep = "\t")
write.table(PCA_summary$x, file = "PCA_GenoScores_NoGXE", quote = FALSE, sep = "\t")
# PCA plots
# Case ids (row names of the data) are used as point labels.
caseid <- row.names(data)
caseid
# Keep the scores in a separate plotting data frame rather than appending PC1
# and PC2 to `data`: extra columns in `data` would be picked up as traits if
# prcomp(data) is re-run and by later steps that call as.matrix(data).
pca_plot_data <- data.frame(
  PC1 = data.pca$x[, 1],
  PC2 = data.pca$x[, 2],
  caseid = caseid,
  cluster = data_4factor_inclustering$cl
)
# Make PC1 vs PC2 plot
biplot(data.pca)
# PC1 vs PC2 colored by the 4 clusters obtained from the cluster analysis.
# ggplot() replaces qplot(), which is deprecated in ggplot2.
library(ggplot2)
ggplot(
  pca_plot_data,
  aes(x = PC1, y = PC2, colour = cluster, label = caseid)
) +
  geom_point() +
  geom_text()
# Rank correlations (rank trait values within each environment and then
# perform correlation analysis).
# Install Hmisc only when it is missing, so that re-running the script does
# not re-download it or require network access every time.
if (!requireNamespace("Hmisc", quietly = TRUE)) {
  install.packages("Hmisc")
}
library(Hmisc)
# NOTE(review): `data` is used as-is, so any columns appended to it earlier in
# the session are included in this matrix.
data_matrix <- as.matrix(data)
# NOTE(review): this recomputes Pearson correlations on the raw (unranked)
# values and overwrites `Correlations`; confirm whether a rank-based matrix
# was intended here.
Correlations <- rcorr(data_matrix, type = "pearson")
?rcorr
# An example for two traits (columns 1 and 5): ranks are computed with ties
# averaged, then correlated. Pearson correlation on ranks is Spearman's rank
# correlation.
rcorr(
  rank(data_matrix[, 1], ties.method = "average"),
  rank(data_matrix[, 5], ties.method = "average")
)
Download