# Supplemental Table S3. Description of R functions and packages used in the
# analyses.
#
# This part of the script reads the trait table, prints basic and detailed
# summary statistics, draws exploratory plots and computes Pearson
# correlations with their P-values.
#
# NOTE: the statements were originally collapsed onto one physical line, so
# the title was parsed as code and everything after the first "#" was
# swallowed as a comment. One statement per line restores the script.

# Run this script in a fresh R session. The workspace is deliberately not
# wiped with rm(list = ls()), which would delete unrelated objects of whoever
# sources the file.

# The project directory is specific to the original author's machine. Switch
# to it only when it exists; otherwise the output files are written to the
# current working directory.
project_dir <- "/Users/vibelamk/2012_projects/Field_BothYears_Analysis/14_PooledAnalysis_No_GE"
if (dir.exists(project_dir)) {
  setwd(project_dir)
}

# Read data ----
# The input is a tab-delimited text file chosen interactively; its first
# column supplies the row names.
data <- read.csv(file.choose(), header = TRUE, sep = "\t", row.names = 1)

# Basic summary statistics ----
dim(data)
str(data)
summary(data)
vapply(data, sd, numeric(1))  # one standard deviation per column
var(data)

# Detailed summary statistics with the "pastecs" package ----
# More information and options for summary statistics using "pastecs" can be
# obtained from the following weblink:
# "http://www.ats.ucla.edu/stat/r/faq/basic_desc.htm"
# Packages are installed only when missing, so re-running the script does not
# reinstall them every time.
if (!requireNamespace("pastecs", quietly = TRUE)) {
  install.packages("pastecs")
}
library(pastecs)
options(scipen = 100)  # avoid scientific notation in displayed numbers
SummaryStatistics <- stat.desc(data)
SummaryStatistics
write.table(
  SummaryStatistics,
  file = "SummaryStatistics_PooledAnalysis_NoGXE",
  quote = FALSE,
  sep = "\t"
)

# Exploratory analysis ----
# Scatter plot
if (!requireNamespace("YaleToolkit", quietly = TRUE)) {
  install.packages("YaleToolkit")
}
library(YaleToolkit)
gpairs(data, upper.pars = list(scatter = ""))
?gpairs

# Parallel coordinate plot
if (!requireNamespace("GGally", quietly = TRUE)) {
  install.packages("GGally")
}
library(GGally)
ggparcoord(data, columns = 1:20)

# Estimation of correlation coefficients and P-values ----
if (!requireNamespace("Hmisc", quietly = TRUE)) {
  install.packages("Hmisc")
}
library(Hmisc)
data_matrix <- as.matrix(data)
Correlations <- rcorr(data_matrix, type = "pearson")
?rcorr
Correlations
str(Correlations)

# Correlation coefficients
write.table(
  Correlations$r,
  file = "Correlations_rvalues_PooledAnalysis_NoGXE",
  quote = FALSE,
  sep = "\t"
)
# P-values
write.table(
  Correlations$P,
  file = "Correlations_Pvalues_PooledAnalysis_NoGXE",
  quote = FALSE,
  sep = "\t"
)

# Combined matrix: lower half holds the correlation coefficients and the
# upper half holds the P-values. Multiplying by lower.tri() zeroes everything
# on and above the diagonal, and t() moves the P-values into the upper half.
# The diagonal inherits whatever rcorr stores on the diagonal of P
# (presumably NA - check the written file).
corr_r_P_values <- lower.tri(Correlations$r) * Correlations$r +
  t(lower.tri(Correlations$P) * Correlations$P)
write.table(
  corr_r_P_values,
  file = "Correlations_r_P_values_Combined_NoGXE",
  quote = FALSE,
  sep = "\t"
)

# Correlation coefficients plotted as colored pies
# Red to blue (-1 to +1): negative values are red, whereas positive values
# are blue.
# Values close to zero are colorless.
#
# This part of the script draws the correlogram of `data`, runs hierarchical
# clustering on the standardized traits, cuts the Ward dendrogram into four
# groups and computes the trait means of each group.
#
# NOTE: the statements were originally collapsed onto one physical line; one
# statement per line restores the script.

# Packages are installed only when missing, so re-running the script does not
# reinstall them every time.
if (!requireNamespace("corrgram", quietly = TRUE)) {
  install.packages("corrgram")
}
library(corrgram)
?corrgram
corrgram(
  data,
  order = NULL,
  lower.panel = panel.pie,
  upper.panel = NULL,
  text.panel = panel.txt
)
# Without labels: easier to edit using other tools
corrgram(
  data,
  order = NULL,
  lower.panel = panel.pie,
  upper.panel = NULL,
  text.panel = NULL
)
# Note: the order in the above figure will be the same as in the input file

# Hierarchical clustering analysis ----
# Transform the variables. The default transformation is "type=sd", i.e.
# subtract the mean and divide by the sd for each trait.
if (!requireNamespace("reshape", quietly = TRUE)) {
  install.packages("reshape")
}
library(reshape)
data.std <- rescaler(data)
?rescaler

# Calculate distances between rows (genotypes)
?dist
data.std.dist <- dist(data.std)
length(data.std.dist)
# The length above should be 1378 (53 * 52 / 2) because there are 53
# genotypes.

?hclust
# Single linkage method
data.std.dist.hc.single <- hclust(data.std.dist, method = "single")
plot(data.std.dist.hc.single, hang = -1)

# Ward's linkage method. The method formerly called "ward" was renamed to
# "ward.D" (R >= 3.1.0); the new name is used here so the same algorithm is
# selected without relying on the legacy alias. See ?hclust for "ward.D2".
data.std.dist.hc.ward <- hclust(data.std.dist, method = "ward.D")
plot(data.std.dist.hc.ward, hang = -1)

# Cut the dendrogram into four groups and look at the variable responses
# among the groups identified by the cluster analysis.
cl <- cutree(data.std.dist.hc.ward, 4)

# Clone the data before adding the cluster membership as another variable
data_4factor_inclustering <- data
data_4factor_inclustering[cl == 1, 1:20]
data_4factor_inclustering[cl == 2, 1:20]
data_4factor_inclustering[cl == 3, 1:20]
data_4factor_inclustering[cl == 4, 1:20]
data_4factor_inclustering$cl <- as.factor(cl)

# Calculate the mean of each trait (columns 1 to 20) within each cluster
cluster1_means <- colMeans(
  data_4factor_inclustering[data_4factor_inclustering$cl == 1, 1:20]
)
cluster2_means <- colMeans(
  data_4factor_inclustering[data_4factor_inclustering$cl == 2, 1:20]
)
cluster3_means <- colMeans(
  data_4factor_inclustering[data_4factor_inclustering$cl == 3, 1:20]
)
cluster4_means <- colMeans(
  data_4factor_inclustering[data_4factor_inclustering$cl == 4, 1:20]
)

# Generate summary statistics of the clusters
library(pastecs)
options(scipen = 100)  # avoid scientific notation in displayed numbers
# Per-cluster summary statistics (traits in columns 1 to 20), the table of
# cluster means, and the parallel coordinate plot colored by cluster.
#
# NOTE: the statements were originally collapsed onto one physical line; one
# statement per line restores the script.
cluster1_summary <- stat.desc(
  data_4factor_inclustering[data_4factor_inclustering$cl == 1, 1:20]
)
cluster2_summary <- stat.desc(
  data_4factor_inclustering[data_4factor_inclustering$cl == 2, 1:20]
)
cluster3_summary <- stat.desc(
  data_4factor_inclustering[data_4factor_inclustering$cl == 3, 1:20]
)
cluster4_summary <- stat.desc(
  data_4factor_inclustering[data_4factor_inclustering$cl == 4, 1:20]
)

# Generate a data frame with the summary statistics of each cluster. It gets
# its own name so it is not confused with the table of cluster means below.
dataframe_clusterSummaries <- data.frame(
  cluster1_summary,
  cluster2_summary,
  cluster3_summary,
  cluster4_summary
)
write.table(
  dataframe_clusterSummaries,
  file = "Clusters_SummaryStatistics_PooledAnalysis_NoGXE",
  quote = FALSE,
  sep = "\t"
)

# Convert the means of each cluster into a data frame (one column per cluster)
dataframe_clusterMeans <- data.frame(
  cluster1_means,
  cluster2_means,
  cluster3_means,
  cluster4_means
)
dataframe_clusterMeans

# Transpose the data frame for a parallel coordinate plot of cluster means.
# Note that a data frame first needs to be converted into a data matrix
# before performing the transpose.
t_dataframe_clusterMeans <- data.frame(t(data.matrix(dataframe_clusterMeans)))
t_dataframe_clusterMeans

# Generate another column with the row names, to be used for grouping in the
# parallel coordinate plot. The vector is intentionally called `row.names`
# and passed positionally: data.frame() then names the new column
# "row.names", which is the name the cluster-means plot groups by.
row.names <- row.names(t_dataframe_clusterMeans)
t_dataframe_clusterMeans_clusternames <- data.frame(
  t_dataframe_clusterMeans,
  row.names
)
t_dataframe_clusterMeans_clusternames

# Parallel coordinate plot with each cluster represented in a different color
library(GGally)
?ggparcoord
ggparcoord(data_4factor_inclustering, columns = 1:20, groupColumn = "cl")

# Split the above parallel coordinate plot into four separate plots, one for
# each cluster. ggparcoord uses the same scaling system as rescaler
# (subtract the mean and divide by the s.d.).
# Faceted parallel coordinate plot (one panel per cluster), parallel
# coordinate plot of the cluster means, principal component analysis with
# its plots, and the rank-correlation example.
#
# NOTE: the statements were originally collapsed onto one physical line, so
# everything after the first "#" was swallowed as a comment. One statement
# per line restores the script.
ggparcoord(data_4factor_inclustering, columns = 1:20, groupColumn = "cl") +
  facet_wrap(~cl, ncol = 1)

# Parallel coordinate plot using cluster means
ggparcoord(
  t_dataframe_clusterMeans_clusternames,
  columns = 1:20,
  groupColumn = "row.names"
)

# Principal component analysis (PCA) ----
?prcomp
?biplot
# Argument names are spelled out in full (`scale.`, TRUE) rather than relying
# on partial matching and the reassignable `T` shortcut.
data.pca <- prcomp(data, scale. = TRUE, retx = TRUE)
PCA_summary <- summary(data.pca)
PCA_summary
screeplot(data.pca, type = "l")
print(data.pca)

# Standard deviations of the principal components
write.table(
  PCA_summary$sdev,
  file = "PCA_sdev_NoGXE",
  quote = FALSE,
  sep = "\t"
)
# Variable loadings (the rotation matrix). The file name is kept as in the
# original script although it says "EigenValues".
write.table(
  PCA_summary$rotation,
  file = "PCA_EigenValues_NoGXE",
  quote = FALSE,
  sep = "\t"
)
# Scores of each row (genotype) on the principal components
write.table(
  PCA_summary$x,
  file = "PCA_GenoScores_NoGXE",
  quote = FALSE,
  sep = "\t"
)

# PCA plots ----
# Generate a column of case ids
caseid <- row.names(data)
caseid
data$PC1 <- data.pca$x[, 1]
data$PC2 <- data.pca$x[, 2]

# Make PC1 vs PC2 plot
biplot(data.pca)

# PC1 vs PC2 colored by the 4 clusters obtained from the cluster analysis.
# Built with ggplot() because qplot() is deprecated in current ggplot2; the
# plotting data are gathered in one data frame so every aesthetic is a
# plain column.
library(ggplot2)
pca_plot_data <- data.frame(
  PC1 = data$PC1,
  PC2 = data$PC2,
  cluster = data_4factor_inclustering$cl,
  caseid = caseid
)
ggplot(pca_plot_data, aes(x = PC1, y = PC2, colour = cluster)) +
  geom_point() +
  geom_text(aes(label = caseid))

# Rank correlations ----
# (rank trait values within each environment and then perform correlation
# analysis)
# Hmisc is installed only when missing.
if (!requireNamespace("Hmisc", quietly = TRUE)) {
  install.packages("Hmisc")
}
library(Hmisc)

# PC1 and PC2 were appended to `data` for plotting; leave them out so the
# correlation matrix contains the traits only.
trait_columns <- setdiff(names(data), c("PC1", "PC2"))
data_matrix <- as.matrix(data[, trait_columns, drop = FALSE])

# NOTE(review): this is a Pearson correlation of the values as they are in
# `data`; it is a rank correlation only if the input table already holds
# ranks - confirm against the input file.
Correlations <- rcorr(data_matrix, type = "pearson")
?rcorr

# An example for two traits (columns 1 and 5): correlation of their ranks,
# with ties given the average rank
rcorr(
  rank(data_matrix[, 1], ties.method = "average"),
  rank(data_matrix[, 5], ties.method = "average")
)