hence compute

advertisement
####################################################################
## More exploration on bootstrap methods
## Some data
# Observed sample (25 values) used for the coefficient-of-variation example
x <- c(
  8.26, 6.33, 10.4, 5.27, 5.35,
  5.61, 6.12, 6.19, 5.2, 7.01,
  8.74, 7.78, 7.02, 6, 6.5,
  5.8, 5.12, 7.41, 6.52, 6.21,
  12.28, 5.6, 5.38, 6.6, 8.74
)
## Define a function in R, called CV, that computes the coefficient of
## variation of a numeric vector: the square root of the sample variance
## divided by the sample mean.
#   x - numeric vector of observations
# Returns a single number, sqrt(var(x)) / mean(x).
CV <- function(x) {
  spread <- sqrt(var(x))
  centre <- mean(x)
  spread / centre
}
## So, let's compute the CV
CV(x)
## To generate a single bootstrap sample from this data vector, we use the command
sample(x, replace = TRUE)
## which generates a bootstrap sample of the data vector x by sampling with
## replacement. Hence, to compute the CV using a single bootstrap sample,
CV(sample(x, replace = TRUE))
## So, let's now generate 1000 bootstrap samples. We first need to specify a vector
## of real values of length 1000, which we will call boot
boot <- numeric(1000)
## We now generate 1000 samples, and assign the CV for bootstrap sample i as the
## ith element in the vector boot, using a for loop
for (i in seq_along(boot)) {
  boot[i] <- CV(sample(x, replace = TRUE))
}
## Now, compute the mean and variance of these 1000 bootstrapped CV values
mean(boot)
var(boot)
## A plot of the histogram of these values follows using
hist(boot)
## Likewise, the value corresponding to the (say) upper 97.5% quantile is
quantile(boot, 0.975)
## while the value corresponding to the lower 2.5% follows from
quantile(boot, 0.025)
## Bias is computed as:
bias <- mean(boot) - CV(x)
## and a bootstrap-corrected estimate of the CV is just the original estimate minus
## the bias (note: this must be an ASCII minus sign, not a typographic dash)
CV(x) - bias
## Assuming normality, the approximate 95% confidence interval is given by
CV(x) - bias - 1.96 * sqrt(var(boot))
CV(x) - bias + 1.96 * sqrt(var(boot))
## R function for bootstrap estimate of SE(Median)
# Arguments:
#   X        - vector holding the observed sample
#   bootreps - number of bootstrap resamples to draw
# Returns a list with two components:
#   medianX - median of the original sample X
#   seboot  - standard deviation of the bootstrapped medians
#             (NA when fewer than two resamples are drawn)
bootstrapmedianfunc <- function(X, bootreps) {
  medianX <- median(X)
  n <- length(X)
  # One bootstrapped median per replicate: draw a sample of size n from X with
  # replacement and take its median. seq_len() gives zero iterations when
  # bootreps is 0, whereas 1:bootreps would iterate over c(1, 0).
  bootmedians <- vapply(
    seq_len(bootreps),
    function(i) median(sample(X, size = n, replace = TRUE)),
    numeric(1)
  )
  seboot <- sd(bootmedians)
  list(medianX = medianX, seboot = seboot)
}
2
## Example
# Rainfall sample (48 values) used by the median and skewness examples.
# Two literals that had been split across line breaks (.001 and .002) are
# rejoined here so the vector parses.
rainfall <- c(
  0.02, 0.01, 0.05, 0.21, 0.003, 0.45, 0.001, 0.01,
  2.13, 0.07, 0.01, 0.01, 0.001, 0.003, 0.04, 0.32,
  0.19, 0.18, 0.12, 0.001, 1.1, 0.24, 0.002, 0.67,
  0.08, 0.003, 0.02, 0.29, 0.01, 0.003, 0.42, 0.27,
  0.001, 0.001, 0.04, 0.01, 1.72, 0.001, 0.14, 0.29,
  0.002, 0.04, 0.05, 0.06, 0.08, 1.13, 0.07, 0.002
)
# Sample median of the rainfall data
median(rainfall)
## Use it: bootstrap the median of the rainfall data with 1000 resamples
bootstrapmedianfunc(X = rainfall, bootreps = 1000)
## Bootstrap for skewness: point estimate 'thetahat' for the rainfall data,
## computed as the mean cubed deviation from the sample mean divided by the
## square root of the sample variance.
# NOTE(review): the usual skewness coefficient divides by var^1.5 (i.e. sd^3),
# not var^0.5 -- confirm which definition is intended.
thetahat <- local({
  centred <- rainfall - mean(rainfall)
  third_moment <- sum(centred^3) / length(rainfall)
  third_moment / var(rainfall)^.5
})
## Bootstrap estimate of standard error of 'thetahat'
# Arguments:
#   X        - numeric vector holding the observed sample
#   bootreps - number of bootstrap resamples to draw
# Returns a list with two components:
#   skewnessX - the skewness statistic computed on the original sample X
#   seboot    - standard deviation of the bootstrapped skewness statistics
#               (NA when fewer than two resamples are drawn)
bootstrapskewnessfunc <- function(X, bootreps) {
  # Mean cubed deviation from the sample mean, divided by the square root of
  # the sample variance.
  # NOTE(review): the usual skewness coefficient divides by var^1.5 (sd^3);
  # the exponent is kept as written here -- confirm which is intended.
  skew_stat <- function(v) {
    (sum((v - mean(v))^3) / length(v)) / var(v)^.5
  }
  skewnessX <- skew_stat(X)
  n <- length(X)
  # Draw a sample of size n from X with replacement and compute the statistic
  # on that resample itself (deviations of Xstar about mean(Xstar)), not on
  # the original X.
  bootskewness <- vapply(
    seq_len(bootreps),
    function(i) skew_stat(sample(X, size = n, replace = TRUE)),
    numeric(1)
  )
  seboot <- sd(bootskewness)
  list(skewnessX = skewnessX, seboot = seboot)
}
## Use it: bootstrap the skewness statistic of the rainfall data, 100 resamples
bootstrapskewnessfunc(X = rainfall, bootreps = 100)
## Bootstrapping Sample Mean
## Install and load the library 'bootstrap' in R. The 'mouse' data (page 11 in Efron and
## Tibshirani) describes mice assigned randomly to treatment and control groups.
3
library(bootstrap)
# Show the data (mouse.c) and use it as the sample x
print(mouse.c)
x <- mouse.c
# Point estimate: the sample mean
theta.hat <- mean(x)
print(theta.hat)
# Bootstrap replicates of the mean: each replicate resamples x with
# replacement and records the mean of the resample
nboot <- 1000
theta.star <- vapply(
  seq_len(nboot),
  function(b) mean(sample(x, replace = TRUE)),
  double(1)
)
# Histogram of the replicates with the point estimate marked by a dashed line
hist(theta.star)
abline(v = theta.hat, lty = 2)
# Bootstrap standard error of the mean
sd(theta.star)
# theoretical value for comparison
sqrt(mean((x - mean(x))^2) / length(x))
## Nonparametric Bootstrap
library(bootstrap)
print(law)
# Pull the two columns out explicitly instead of attach()-ing the data frame:
# attach() leaves 'law' on the search path for the rest of the session.
LSAT <- law$LSAT
GPA <- law$GPA
n <- nrow(law)
# Point estimate: sample correlation between LSAT and GPA
rho.hat <- cor(LSAT, GPA)
print(rho.hat)
nboot <- 1000
rho.star <- double(nboot)
for (i in seq_len(nboot)) {
  # ONE vector of resampled row indices, applied to both variables, so that
  # each (LSAT, GPA) pair stays together in the bootstrap sample
  k.star <- sample(n, replace = TRUE)
  LSAT.star <- LSAT[k.star]
  GPA.star <- GPA[k.star]
  rho.star[i] <- cor(LSAT.star, GPA.star)
}
# Histogram of the replicates with the point estimate marked by a dashed line
hist(rho.star)
abline(v = rho.hat, lty = 2)
# Bootstrap standard error of the correlation coefficient
sd(rho.star)
## Caution: do NOT use the following two statements
LSAT.star <- sample(LSAT, replace = TRUE)
GPA.star <- sample(GPA, replace = TRUE)
## because each call to sample() resamples on its own, so you would be drawing
## independent bootstrap samples from LSAT and GPA.
## The right way to proceed is to use ONE 'sample' statement
4
## Parametric bootstrap
library(bootstrap)
library(MASS)
n <- nrow(law)
# Point estimate: sample correlation between the first two columns of law
rho.hat <- cor(law[, 1], law[, 2])
print(rho.hat)
cor.mat <- cor(law)
print(cor.mat)
nboot <- 10000
## nboot can be varied; playing with it gives a simple sensitivity analysis.
## In some cases, a larger number of bootstrap samples leads to better results.
rho.star <- double(nboot)
for (i in seq_len(nboot)) {
  # Simulate n bivariate normal observations with zero mean vector and
  # cor.mat as the variance matrix, then recompute the correlation
  law.star <- mvrnorm(n, c(0, 0), cor.mat)
  rho.star[i] <- cor(law.star[, 1], law.star[, 2])
}
hist(rho.star)
abline(v = rho.hat, lty = 2)
sd(rho.star)
## Calculate bootstrapped mean
mean(rho.star)
## An approximate (large-sample) standard error of the correlation coefficient
## is given by the following; n is the sample size nrow(law) set above
se.corr <- (1 - rho.hat^2) / sqrt(n - 3)
print(se.corr)
## Now, we do the Fisher's z transformation,
## z = 0.5 * log((1 + rho) / (1 - rho)), which is atanh(rho)
print(z.hat <- atanh(rho.hat))
z.star <- atanh(rho.star)
hist(z.star)
abline(v = z.hat, lty = 2)
mean(z.star) # large-sample theoretical value is z
sd(z.star) # large-sample theoretical value is below
1 / sqrt(n - 3)
# good confidence interval for z
z.hat + c(-1, 1) * qnorm(0.975) * sd(z.star)
# good confidence interval for rho
tanh(z.hat + c(-1, 1) * qnorm(0.975) * sd(z.star))
5
## Notes
The parametric bootstrap is very different from the nonparametric bootstrap.
We do not resample with replacement from the original data. Instead we simulate from
the parametric model that is assumed for the data. This is still a bootstrap rather than pure
simulation because the estimated parameter value is not the true unknown parameter
value. So we still have the "sample is not the population" issue.
In this case the parametric model is bivariate normal. The mvrnorm function in the MASS
library simulates multivariate normal random vectors. A multivariate normal distribution
is determined by two structured parameters, the mean vector and the variance-covariance
matrix. But the correlation coefficient is invariant under changes of location and scale.
Thus the mean vector does not affect the distribution of the sample correlation coefficient,
nor do scale changes. Hence we can use the zero vector for the mean vector, and we can
use the correlation matrix for the variance matrix without affecting the distribution of the
sample correlation coefficient.
The statement
law.star <- mvrnorm(n, c(0, 0), cor.mat)
produces a random sample from the parametric model with parameter rho.hat. The
following statement calculates rho.star[i] from the (parametric) bootstrap data law.star in
exactly the same way rho.hat is calculated from the original data law.
In calculating Fisher's z we don't need to do another bootstrap. We have rho.star stored.
Just transform it to find the sampling distribution of z.
The transformation
z <- 0.5 * log((1 + rho) / (1 - rho))
is called the inverse hyperbolic tangent, and R has a standard function to calculate it
z <- atanh(rho)
whose inverse function is the hyperbolic tangent
rho <- tanh(z)
Because of the skewness of the distribution of rho.hat (as shown by the skewness of the
histogram of rho.star),
rho.hat + c(-1, 1) * qnorm(0.975) * sd(rho.star)
is not a very good 95% confidence interval for the true unknown parameter rho. (Point
estimate plus or minus 1.96 standard errors of the point estimate assumes normality or at
least approximate normality, and here we aren't anywhere close to normality.)
Because of the approximate normality of the distribution of z.hat (as shown by the
approximate normality of the histogram of z.star)
z.hat + c(-1, 1) * qnorm(0.975) * sd(z.star)
is a pretty good 95% confidence interval for the true unknown parameter
zeta = atanh(rho).
Hence
tanh(z.hat + c(-1, 1) * qnorm(0.975) * sd(z.star))
is a pretty good 95% confidence interval for the true unknown parameter rho.
7
Download