# R Scripts for Chapter 14

# advertisement
# R Scripts for Chapter 14
# Clean out workspace
rm(list = ls())
# Suppress levels of significance
options(show.signif.stars=FALSE)
library(car)
library(pwr)
source("C:/R/functions.txt")
# Independent-samples (pooled-variance) t test computed step by step from
# summary statistics, then checked against t.test().
# Define the groups
low <- c(13, 13, 21, 16, 22, 15, 18, 20, 22, 14)
high <- c(40, 28, 18, 32, 17, 27, 19, 21, 16, 30)
# Develop the summary statistics
n1 <- length(low)
n2 <- length(high)
sum1 <- sum(low)
sum2 <- sum(high)
sumsq1 <- sum(low * low)
sumsq2 <- sum(high * high)
m1 <- mean(low)
m2 <- mean(high)
# Sums of squared deviations via the computational formula
ss1 <- sumsq1 - (sum1 * sum1) / n1
ss2 <- sumsq2 - (sum2 * sum2) / n2
sd1 <- sd(low)
sd2 <- sd(high)
n1; n2
sum1; sum2
sumsq1; sumsq2
m1; m2
ss1; ss2
sd1; sd2
# Calculate t
diff <- m1 - m2
df <- n1 + n2 - 2
# Pooled variance: combined sums of squares over the combined df
s2.pooled <- (ss1 + ss2) / df
t.obs <- diff / sqrt(s2.pooled * ((1 / n1) + (1 / n2)))
t.obs
# Two-tailed p-value
# Double the lower-tail area at -|t| so the result is correct whatever the
# sign of t.obs; 2 * pt(t.obs, df) exceeds 1 whenever t.obs is positive.
p.value <- 2 * pt(-abs(t.obs), df)
p.value
t.crit <- qt(c(.025, .975), df)
t.crit
# Finding t the easy way
t.test(low, high, var.equal = TRUE)
# Playing with power
# power.t.test() takes the raw mean difference (delta) together with the SD.
# The argument name is spelled out instead of relying on partial matching
# of "d" to "delta".
power.t.test(n = 10, delta = 7.4, sd = 6.14, sig.level = 0.05, power = NULL,
  type = "two.sample", alternative = "two.sided")
# pwr.t.test() takes the effect size d directly.
pwr.t.test(n = 10, d = 1.2, sig.level = .05, type = "two.sample",
  alternative = "two.sided")
# Solve for the per-group n that yields power = .8.
# NOTE(review): delta = 1.22 looks like a standardized effect (about
# 7.4 / 6.14) paired with the raw sd = 6.14 -- confirm which scale was meant.
power.t.test(n = NULL, delta = 1.22, sd = 6.14, sig.level = .05, power = .8,
  type = "two.sample", alternative = "two.sided")
# Another Way
# Read in text file with data
# Group defined as 1 = Low, 2 = High
data <- read.table("c:/rbook/twoindmeans.txt", header = TRUE)
# Build the grouping factor through the data frame rather than attach()ing
# it, which would leave a copy of the columns on the search path.
data$f.group <- factor(data$group, levels = 1:2,
  labels = c("Low.SES", "High.SES"), ordered = TRUE)
# The argument is var.equal. The misspelled equal.var was swallowed by "..."
# and ignored, so the Welch test ran instead of the pooled-variance test.
t.test(read1 ~ f.group, data = data, var.equal = TRUE)
# Script 14.2: The Wilcoxon/Mann-Whitney test.
# Clean out workspace
rm(list = ls())
# Suppress levels of significance
options(show.signif.stars = FALSE)
library(car)
library(pwr)
source("C:/R/functions.txt")
# Scores for the two groups and their sizes
low <- c(13, 13, 21, 16, 22, 15, 18, 20, 22, 14)
high <- c(40, 28, 18, 32, 17, 27, 19, 21, 16, 30)
n1 <- length(low)
n2 <- length(high)
# Rank the pooled scores; tied scores share the average of their ranks
both <- c(low, high)
rank.both <- rank(both, ties.method = "average")
rank.both
# The low group occupies positions 1..n1 of the pooled vector and the high
# group occupies positions n1 + 1 .. n1 + n2
first.2 <- n1 + 1
last.2 <- n1 + n2
ranks.low <- rank.both[seq_len(n1)]
ranks.high <- rank.both[seq(first.2, last.2)]
ranks.high
# Rank sum for each group
r1 <- sum(ranks.low)
r2 <- sum(ranks.high)
r1; r2
# Wilcoxon/Mann-Whitney test on the two vectors: first with wilcox.test()
# (no continuity correction, exact p-value requested), then with
# wilcoxE.test(), which is called after loading PASWR.
library(PASWR)
wilcox.test(x = low, y = high, alternative = "two.sided",
  correct = FALSE, exact = TRUE)
wilcoxE.test(low, high, conf.level = .95)
# Another Way
# Read in text file with data
# Group defined as 1 = Low, 2 = High
data <- read.table("c:/rbook/twoindmeans.txt", header = TRUE)
# Reach the columns through the data frame instead of attach()ing it twice;
# attach() leaves copies on the search path that outlive rm(list = ls()).
data$f.group <- factor(data$group, levels = 1:2,
  labels = c("Low.SES", "High.SES"), ordered = TRUE)
wilcox.test(read1 ~ f.group, data = data, alternative = "two.sided",
  exact = TRUE, correct = FALSE)
# Note: wilcoxE.test can only compare two vectors/groups
# Script 14.3: Two-sample permutation test for means.
# Permutation test for the difference between two means.
# Squaring the difference gives a two-tailed test; otherwise we would
# compare the "random" differences to the actual difference in the
# direction predicted by the alternate hypothesis.
# Read in the data
low <- c(13, 13, 21, 16, 22, 15, 18, 20, 22, 14)
high <- c(40, 28, 18, 32, 17, 27, 19, 21, 16, 30)
n1 <- length(low)
n2 <- length(high)
# Observed group means, their difference, and the squared difference
mean.l <- mean(low)
mean.h <- mean(high)
diff <- mean.l - mean.h
diff
diff2 <- diff^2
# Build the permutation distribution of the difference between group means
# and compare the observed squared difference (diff2) against it.
# Finding first and last index for each group
f1 <- 1
l1 <- n1
f2 <- n1 + 1
l2 <- n1 + n2
pop <- c(low, high)
N <- 100000
stat <- numeric(N)
for (i in seq_len(N)) {
  # Shuffle the pooled scores, then split them into two pseudo-groups
  perm <- sample(pop, l2)
  stat[i] <- mean(perm[f1:l1]) - mean(perm[f2:l2])
}
# For two tailed test: count permutations whose squared difference is at
# least as large as the observed one
stat2 <- stat^2
counter <- sum(stat2 >= diff2)
p.value <- counter / N
cat("The p-value is", p.value, "\n")
hist(stat, prob = TRUE)
lines(density(stat))
# Tail cut points of the permutation distribution; these are quantiles
# (0.5%, 2.5%, 5%, 95%, 97.5%, 99.5%), not quartiles, so label them so
q <- round(quantile(stat, probs = c(.005, .025, .05, .95, .975, .995)), 3)
cat("The quantiles for using CI approach are:\n"); q
# Script 14.4 - Two-sample test for proportions
# Clean out workspace
rm(list = ls())
# Suppress levels of significance
options(show.signif.stars = FALSE)
library(car)
library(MASS)
library(pwr)
source("C:/R/functions.txt")
# Count "in favor" in each of the two groups
favor <- c(31, 19)
# Number surveyed in each group, in the same order as favor
survey <- c(50, 50)
# Two-sample test of proportions, alternative "greater", with the
# continuity correction switched off
prop.test(x = favor, n = survey, alternative = "greater", correct = FALSE)
# Script 14.5 - Two sample test permutation test for proportions
# Clean out workspace
rm(list = ls())
# Suppress levels of significance
options(show.signif.stars=FALSE)
source("C:/R/functions.txt")
# Provide data: group sizes (n1, n2) and success counts (s1, s2)
n1 <- 50
n2 <- 50
s1 <- 31
s2 <- 19
# Observed proportion in each group
p1 <- s1 / n1
p1
p2 <- s2 / n2
p2
# Observed difference and its square (used for the two-tailed comparison)
diff1 <- p1 - p2
diff2 <- diff1^2
# Pooled totals of observations, successes and failures
n.total <- n1 + n2
n.success <- s1 + s2
n.fail <- n.total - n.success
# Permutation test for the difference between two proportions: successes
# and failures are pooled as ones and zeros, reshuffled N times, and the
# observed difference is compared with the resulting distribution.
# Establish ones and zeros
success <- rep(1, n.success)
failure <- rep(0, n.fail)
perm.data <- c(success, failure)
# Set first and last index for each group
f1 <- 1
l1 <- n1
f2 <- n1 + 1
l2 <- n1 + n2
N <- 100000
# stat1 for one-tailed test
stat1 <- numeric(N)
for (i in seq_len(N)) {
  # The mean of a 0/1 vector is the proportion of successes
  perm <- sample(perm.data, l2)
  stat1[i] <- mean(perm[f1:l1]) - mean(perm[f2:l2])
}
# stat2 for two-tailed test
stat2 <- stat1^2
# counter1 counts differences at least as large as observed (one tail);
# counter2 counts squared differences at least as large (both tails)
counter1 <- sum(stat1 >= diff1)
counter2 <- sum(stat2 >= diff2)
p.value1 <- counter1 / N
p.value2 <- counter2 / N
cat("The observed difference is",diff1,"and the p-value is",p.value1,"\n")
# The two-tailed p-value was computed but never shown; report it as well
cat("The two-tailed p-value is", p.value2, "\n")
# Tail cut points of the permutation distribution (quantiles, not quartiles)
q <- round(quantile(stat1, probs = c(.005, .025, .05, .95, .975, .995)), 3)
cat("The quantiles for using CIs for hypothesis testing are: \n"); q
# Script 14.6: Testing the difference between two medians
# The median test
# Clean out workspace
rm(list = ls())
source("C:/R/functions.txt")
# Suppress levels of significance
options(show.signif.stars=FALSE)
# House prices taken from Allen Tate ad in N&O 1 May 2010
tate.price <- c(
  1210000, 1085000, 999950, 850000, 700000, 600000, 575000, 550000,
  550000, 524900, 515000, 449900, 399900, 365000, 339900, 295000, 274900,
  229900, 182500, 179900, 169900, 134900, 108000, 895000, 650000, 650000,
  599900, 527500, 475000, 322000, 286000, 260000, 215900, 197000, 179900,
  167000
)
# House prices taken from Chapel Hill News, 15 August 2010 - The Home Team
team.price <- c(
  659000, 399000, 222500, 599000, 350000, 295000, 495000, 384900,
  219000, 599000, 349000, 269500, 489900, 335000, 189000, 439750, 335000,
  268500, 475000, 264500, 189500, 375000, 300000, 199000, 449000, 252500,
  178900, 369000, 299000
)
# Pool the two samples
house.price <- c(tate.price, team.price)
# Medians of the pooled sample and of each sample
med.all <- median(house.price)
med.tate <- median(tate.price)
med.team <- median(team.price)
med.all
med.tate
med.team
# Cell counts: prices strictly above / strictly below the pooled median
# (a price equal to the pooled median falls in neither cell)
c11 <- sum(tate.price > med.all)
c12 <- sum(tate.price < med.all)
c21 <- sum(team.price > med.all)
c22 <- sum(team.price < med.all)
median.table <- matrix(
  c(c11, c12, c21, c22),
  nrow = 2, byrow = TRUE,
  dimnames = list(c("Tate.Raleigh", "Team.CH"), c("Above", "Below"))
)
median.table
# Complete the median test
fisher.test(median.table)
# Permutation test of the difference between the two medians. The pooled
# prices are reshuffled without replacement, so this is a permutation test
# rather than a bootstrap.
diff1 <- med.tate - med.team
diff2 <- diff1^2
n1 <- length(tate.price)
n2 <- length(team.price)
N <- 100000
# Permutation Test from pooled data
stat1 <- numeric(N)
f1 <- 1
l1 <- n1
f2 <- n1 + 1
l2 <- n1 + n2
for (i in seq_len(N)) {
  # Shuffle the pooled prices and split them into two pseudo-samples
  perm <- sample(house.price, l2)
  stat1[i] <- median(perm[f1:l1]) - median(perm[f2:l2])
}
# For a two-tailed test, we track diff2
stat2 <- stat1^2
counter <- sum(stat2 >= diff2)
p.value <- counter / N
# p.value was computed but never displayed; report it with the observed
# difference
cat("The observed difference is", diff1, "and the p-value is", p.value, "\n")
# Script 14.7 - Test of location for dependent samples (Parametric,
# nonparametric, & resampling).
# Testing the difference between two dependent samples on location
# Clean out workspace
rm(list = ls())
source("C:/R/functions.txt")
# Suppress levels of significance
options(show.signif.stars = FALSE)
# Comparing the reading and math skills of 10 Native American students
read <- c(58, 47, 38, 35, 43, 47, 34, 56, 53, 44)
math <- c(43, 49, 40, 45, 38, 49, 38, 64, 52, 49)
# Pairwise differences and the sums used in the hand computation
diff <- read - math
diff
mean(diff)
sum(diff)
sum(diff * diff)
t.test(read, math, paired = TRUE)
# Lower critical t for alpha = .05 two-tailed; df is the number of pairs
# minus one, derived from the data rather than hard-coded
qt(.025, length(diff) - 1)
# As though we had done an independent samples t-test
t.test(read, math, var.equal = TRUE)
# Doing binomial (Sign) test
# Zero differences carry no sign, so they are left out of the trial count
pos <- sum(diff > 0)
n <- sum(diff != 0)
binom.test(pos, n, alternative = "two.sided")
# Paired Wilcoxon test on the reading and math scores: first wilcox.test()
# without the continuity correction, then wilcoxE.test(), which is called
# after loading PASWR.
library(PASWR)
wilcox.test(x = read, y = math, correct = FALSE, paired = TRUE)
wilcoxE.test(read, math, paired = TRUE)
# A permutation test of the differences in the pairs: each difference has
# its sign flipped at random, and the resulting mean is compared with the
# observed mean difference.
N <- 100000
diff <- read - math
n <- length(diff)
mdiff <- mean(diff)
mdiff
stat <- numeric(n)
counter1 <- 0
for (i in seq_len(N)) {
  # One uniform draw per pair; a draw below .5 flips that pair's sign.
  # (Named "flip" so the local does not shadow base::which.)
  flip <- runif(n) < .5
  stat <- ifelse(flip, -diff, diff)
  if (abs(mean(stat)) >= abs(mdiff)) counter1 <- counter1 + 1
}
# Absolute values comparison makes it a two-tailed test
pvalue2 <- counter1 / N
cat("The mean difference is",mdiff,"and the p-value is",pvalue2,"\n")
# A different way to obtain the data
data <- read.table("c:/rbook/twodepmeans.txt", header = TRUE)
# Evaluate the test inside the data frame instead of attach()ing it, so no
# columns are left behind on the search path.
with(data, t.test(read1, math1, paired = TRUE))
# Script 14.8: Testing the difference between two dependent proportions
# Clean out workspace
rm(list = ls())
source("C:/R/functions.txt")
# Suppress levels of significance
options(show.signif.stars = FALSE)
# Observed counts (these match the off-diagonal cells of the table below)
obs <- c(18, 8)
sum(obs)
# Equal expected counts give equal expected probabilities for the two cells
ratios <- rep(13, 2)
prob <- ratios / sum(ratios)
result <- chisq.test(x = obs, p = prob)
result
qchisq(.95, 1)
# Using the mcnemar.test function
# The observed counts are laid out as a 2 x 2 table, filled row by row
item.diff <- matrix(
  c(12, 18, 8, 12),
  nrow = 2, byrow = TRUE,
  dimnames = list(
    c("1.Correct", "1.Incorrect"),
    c("2.Correct", "2.Incorrect")
  )
)
item.diff
mcnemar.test(item.diff, correct = FALSE)
# Script 14.9 - Exercise14.6: Comparing two medians.
# Testing the difference between two medians
# Clean out workspace
rm(list = ls())
source("C:/R/functions.txt")
# Suppress levels of significance
options(show.signif.stars = FALSE)
# Read salaries for Boston Red Sox
brs <- c(
  420000, 12100000, 650000, 7750000, 14000000, 8525000, 18700000, 12500000,
  7700000, 2750000, 9350000, 1155000, 5500000, 3000000, 9375000
)
# Read salaries for New York Yankees
nyy <- c(
  435650, 9000000, 487975, 5500000, 22600000, 4000000, 1200000, 11750000,
  15000000, 33000000, 6850000, 900000, 1100000
)
# Sample sizes and per-team medians
n1 <- length(brs)
n2 <- length(nyy)
med1 <- median(brs)
med2 <- median(nyy)
n1; n2; med1; med2
# Median of the pooled salaries
salaries <- c(brs, nyy)
med <- median(salaries)
med
# Median test
# Cell counts: salaries strictly above / strictly below the pooled median
c11 <- sum(brs > med)
c12 <- sum(brs < med)
c21 <- sum(nyy > med)
c22 <- sum(nyy < med)
median.table <- matrix(
  c(c11, c12, c21, c22),
  nrow = 2, byrow = TRUE,
  dimnames = list(c("Red Sox", "Yankees"), c("Above", "Below"))
)
median.table
# Now do the chisq.test on the table
chisq.test(median.table, correct = FALSE)
# Two-tailed permutation test of the difference between the team medians,
# drawing from the pooled salaries
diff1 <- med1 - med2
diff2 <- diff1^2
N <- 100000
stat1 <- numeric(N)
# First and last pooled index for each pseudo-group
f1 <- 1
l1 <- n1
f2 <- n1 + 1
l2 <- n1 + n2
for (i in seq_len(N)) {
  # Shuffle the pooled salaries and split them into two pseudo-teams
  perm <- sample(salaries, l2)
  stat1[i] <- median(perm[f1:l1]) - median(perm[f2:l2])
}
# Squared differences give the two-tailed comparison
stat2 <- stat1^2
counter <- sum(stat2 >= diff2)
p.value <- counter / N
cat("The observed difference is",diff1,"and the p-value is",p.value,"\n")
# Script14.10 - Exercise14.10: Dependent proportions.
# Testing the difference between two dependent proportions
# Clean out workspace
rm(list = ls())
source("C:/R/functions.txt")
# Suppress levels of significance
options(show.signif.stars = FALSE)
# Observed counts (these match the off-diagonal cells of the table below)
obs <- c(4, 8)
# Each cell is expected to hold half of the total
expect <- sum(obs) / 2
# Expected probabilities for each cell
ratios <- rep(expect, 2)
prob <- ratios / sum(ratios)
result <- chisq.test(x = obs, p = prob)
result
qchisq(.95, 1)
# Using the mcnemar.test function
# The observed counts are laid out as a 2 x 2 table, filled row by row
approve <- matrix(
  c(40, 4, 8, 28),
  nrow = 2, byrow = TRUE,
  dimnames = list(c("1.For", "2.Against"), c("1.For", "2.Against"))
)
approve
mcnemar.test(approve, correct = FALSE)
# Download