nonparametric

advertisement
Correlation and non-parametric tests
# Download set of gene expression data
ge <- read.table("http://www.ebi.ac.uk/goldmansrv/PredocCourse2009/gene_expression.txt", header=TRUE)
ge[1:5,]
                                                         ProbeName   Exp1
1    Affymetrix:CompositeSequence:ATH1-121501:AFFX-Athal-5SrRNA_at   12.4
2 Affymetrix:CompositeSequence:ATH1-121501:AFFX-Athal-Actin_3_f_at 1297.0
3 Affymetrix:CompositeSequence:ATH1-121501:AFFX-Athal-Actin_5_r_at   73.9
4   Affymetrix:CompositeSequence:ATH1-121501:AFFX-Athal-Actin_M_at  943.6
5 Affymetrix:CompositeSequence:ATH1-121501:AFFX-Athal-GAPDH_3_s_at 1301.4
    Exp2   Exp3   Exp4   Exp5   Exp6
1    3.6    0.9    2.1    3.4   12.0
2 1354.8 1401.5 1198.6 1017.4 1322.2
3   83.4   87.4  156.4  150.3   69.0
4  938.9  904.8 1133.4  958.2  940.1
5 1089.4 1153.5 1173.5 1157.8 1337.5
summary(ge$Exp1)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
    0.0     9.0    38.6   135.2   119.5  4759.0
# Prepare log-scale data (must add a small constant because
# of zero values)
lge1 <- log(ge$Exp1 + 0.001)
lge2 <- log(ge$Exp2 + 0.001)
# Basic plot. See oddities due to rounding at the left-hand
# side
plot( lge1 , lge2 , pch='.')
# Test correlation. Pearson is default method
cor.test(lge1,lge2)
Pearson's product-moment correlation
data: lge1 and lge2
t = 573.0363, df = 22808, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.9661278 0.9678138
sample estimates:
cor
0.9669814
# Different scales
cor.test(lge1,ge$Exp2)
Pearson's product-moment correlation
data: lge1 and ge$Exp2
t = 98.9823, df = 22807, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.5390315 0.5571884
sample estimates:
cor
0.5481746
# Contrast to Spearman
cor.test(lge1,ge$Exp2,method="spearman")
Spearman's rank correlation rho
data: lge1 and ge$Exp2
S = 44539905431, p-value < 2.2e-16
alternative hypothesis: true rho is not equal to 0
sample estimates:
rho
0.9774793
Warning message:
In cor.test.default(lge1, ge$Exp2, method = "spearman") :
Cannot compute exact p-values with ties
# We get warning because calculations for exact
# distribution of Spearman cannot cope with tied data.
# For this many samples it uses an approximation in
# any case.
# Remember that Spearman is just Pearson with ranks
cor.test(rank(lge1),rank(ge$Exp2))
Pearson's product-moment correlation
data: rank(lge1) and rank(ge$Exp2)
t = 699.5114, df = 22807, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.976894 0.978050
sample estimates:
cor
0.9774793
# Could also investigate Kendall. Takes longer to
# calculate.
Look at the overall distribution of gene expression and see if there are any
discernible differences.
# Plot an "empirical cumulative distribution function" (ECDF)
# i.e. rank against value
plot( ecdf(lge1) , pch='.')
# Add second set of expression data
lines( ecdf(lge2) , pch='.')
# Can see that the two graphs diverge for low expression
ks.test(lge1,lge2)
Two-sample Kolmogorov-Smirnov test
data: lge1 and lge2
D = 0.0207, p-value = 0.0001146
alternative hypothesis: two-sided
Warning message:
In ks.test(lge1, lge2) : cannot compute correct p-values with ties
# Check different alternatives. Difference is one-sided
ks.test(lge1,lge2,alternative="less")$p.value
[1] 0.8757951
ks.test(lge1,lge2,alternative="greater")$p.value
[1] 5.729068e-05
# One experiment has consistently higher expression
# Look at differences between two experiments
# Genes form natural pairing, so use Wilcoxon Signed Rank
# conf.int=TRUE means report confidence intervals
# Test reports the pseudomedian of the paired differences
# between the two sets of expression data (this equals the
# median of the differences when they are symmetrically
# distributed)
wilcox.test( lge1 , lge2 , paired=TRUE, conf.int=TRUE)
Wilcoxon signed rank test with continuity correction
data: lge1 and lge2
V = 108321954, p-value < 2.2e-16
alternative hypothesis: true location shift is not equal to 0
95 percent confidence interval:
-0.05212866 -0.04213704
sample estimates:
(pseudo)median
-0.04713760
# If we ran the same test using non-logged data we'd get a
# slightly different result, since the Signed Rank test ranks
# the differences, not the actual values.
# Ignore genes and just look for changes in expression
# between experiments (Wilcoxon Rank Sum).
# The p-value is still significant, but much less so. Knowing
# the gene-pairs provides much more info.
wilcox.test(lge1,lge2,paired=FALSE)
Wilcoxon rank sum test with continuity correction
data: lge1 and lge2
W = 256220403, p-value = 0.005493
alternative hypothesis: true location shift is not equal to 0
# Actually have 6 experiments. Instead of Wilcoxon Rank Sum
# could just use Kruskal-Wallis to see if any of the
# experiments has different expression.
# Remember that first column is the probe names
kruskal.test(ge[,2:7])
Kruskal-Wallis rank sum test
data: ge[,2:7]
Kruskal-Wallis chi-squared = 58.4336, df = 5, p-value = 2.56e-11
# Highly significant, but it doesn't tell us which experiments differ
# Have a look
apply(ge[,2:7],2,median)
Exp1 Exp2 Exp3 Exp4 Exp5 Exp6
38.6 40.4 40.1 36.7 36.5 37.7
# MAD is the robust equivalent of standard deviation
# Median Absolute Deviation, that is the median distance
# from the median.
apply(ge[,2:7],2,mad)
    Exp1     Exp2     Exp3     Exp4     Exp5     Exp6
52.33578 53.96664 53.81838 50.11188 50.11188 51.00144
# The largest difference appears to be between Exp2 and
# Exp5 but our measure of variance is large (although
# we haven’t scaled it for number of samples yet).
apply(ge[,2:7],2,mad)/sqrt(22809)
    Exp1     Exp2     Exp3     Exp4     Exp5     Exp6
0.346534 0.357332 0.356351 0.331809 0.331809 0.337699
# Max difference is 3.9, so plausible that we could
# detect an effect.
# Have many genes and experiments, so most appropriate
# to use Friedman test.
# For arcane reasons, we have to tell R to interpret data
# as a matrix.
friedman.test(as.matrix(ge[,2:7]))
Friedman rank sum test
data: as.matrix(ge[,2:7])
Friedman chi-squared = 2462.051, df = 5, p-value < 2.2e-16
# Much more significant result than Kruskal-Wallis test
friedman.test(as.matrix(ge[2:7]))$p.value
[1] 0
# Compare power by looking at subset of data
kruskal.test(ge[1:1000,2:7])
Kruskal-Wallis rank sum test
data: ge[1:1000, 2:7]
Kruskal-Wallis chi-squared = 7.4089, df = 5, p-value = 0.1920
friedman.test(as.matrix(ge[1:1000,2:7]))
Friedman rank sum test
data: as.matrix(ge[1:1000, 2:7])
Friedman chi-squared = 155.5744, df = 5, p-value < 2.2e-16
Download