####################################################################
## More exploration on bootstrap methods

## Some data
x <- c(8.26, 6.33, 10.4, 5.27, 5.35, 5.61, 6.12, 6.19, 5.2, 7.01, 8.74,
       7.78, 7.02, 6, 6.5, 5.8, 5.12, 7.41, 6.52, 6.21, 12.28, 5.6, 5.38,
       6.6, 8.74)

## Now let's define a function in R, which we will call CV, to compute the
## coefficient of variation: the sample standard deviation divided by the
## sample mean.
CV <- function(x) sqrt(var(x)) / mean(x)

## So, let's compute the CV
CV(x)

## To generate a single bootstrap sample from this data vector, we use the
## command
sample(x, replace = TRUE)
## which generates a bootstrap sample of the data vector x by sampling with
## replacement. Hence, to compute the CV using a single bootstrap sample,
CV(sample(x, replace = TRUE))

## So, let's now generate 1000 bootstrap samples. We first need to specify a
## vector of real values of length 1000, which we will call boot
boot <- numeric(1000)

## We now generate 1000 samples, and assign the CV for bootstrap sample i as
## the ith element in the vector boot, using a for loop
for (i in seq_along(boot)) {
  boot[i] <- CV(sample(x, replace = TRUE))
}

## Now, compute mean and variance of these 1000 bootstrapped CV values
mean(boot)
var(boot)

## A plot of the histogram of these values follows using
hist(boot)

## Likewise, the value corresponding to the (say) upper 97.5% quantile is
quantile(boot, 0.975)
## while the value corresponding to the lower 2.5% follows from
quantile(boot, 0.025)

## Bias is computed as:
bias <- mean(boot) - CV(x)
## and a bootstrap-corrected estimate of the CV is just the original estimate
## minus the bias
CV(x) - bias

## Assuming normality, the approximate 95% confidence interval is given by
CV(x) - bias - 1.96 * sqrt(var(boot))
CV(x) - bias + 1.96 * sqrt(var(boot))

## R function for bootstrap estimate of SE(Median)
##
## Arguments:
##   X        numeric vector of observations.
##   bootreps number of bootstrap replicates to draw.
## Value:
##   a list with components
##     medianX  the median of X,
##     seboot   the standard deviation of the bootstrapped medians.
bootstrapmedianfunc <- function(X, bootreps) {
  medianX <- median(X)
  # vector that will store the bootstrapped medians
  bootmedians <- numeric(bootreps)
  for (i in seq_len(bootreps)) {
    # Draw a sample of size n from X with replacement and calculate the
    # median of the sample. Resampling by index avoids sample()'s special
    # case, where a single number k is treated as the population 1:k.
    Xstar <- X[sample.int(length(X), replace = TRUE)]
    bootmedians[i] <- median(Xstar)
  }
  seboot <- sd(bootmedians)
  list(medianX = medianX, seboot = seboot)
}

## Example
rainfall <- c(0.02, 0.01, 0.05, 0.21, 0.003, 0.45, 0.001, 0.01, 2.13, 0.07,
              0.01, 0.01, 0.001, 0.003, 0.04, 0.32, 0.19, 0.18, 0.12, 0.001,
              1.1, 0.24, 0.002, 0.67, 0.08, 0.003, 0.02, 0.29, 0.01, 0.003,
              0.42, 0.27, 0.001, 0.001, 0.04, 0.01, 1.72, 0.001, 0.14, 0.29,
              0.002, 0.04, 0.05, 0.06, 0.08, 1.13, 0.07, 0.002)
median(rainfall)

## Use it
bootstrapmedianfunc(rainfall, 1000)

## R function bootstrap for skewness
## NOTE(review): the conventional skewness coefficient divides the third
## central moment by sd^3 (i.e. var^1.5); this handout divides by var^.5.
## The formula is kept exactly as given -- confirm the intended definition.
thetahat <- (sum((rainfall - mean(rainfall))^3) / length(rainfall)) /
  var(rainfall)^.5

## Bootstrap estimate of standard error of 'thetahat'
##
## Arguments:
##   X        numeric vector of observations.
##   bootreps number of bootstrap replicates to draw.
## Value:
##   a list with components
##     skewnessX  the skewness estimate computed from X,
##     seboot     the standard deviation of the bootstrapped estimates.
bootstrapskewnessfunc <- function(X, bootreps) {
  # One definition of the statistic, applied identically to the original
  # data and to every resample, so each replicate is computed purely from
  # its own resample.
  skewness_stat <- function(v) {
    (sum((v - mean(v))^3) / length(v)) / var(v)^.5
  }
  skewnessX <- skewness_stat(X)
  # vector that will store the bootstrapped skewness estimates
  bootskewness <- numeric(bootreps)
  for (i in seq_len(bootreps)) {
    # Draw a sample of size n from X with replacement and calculate the
    # skewness estimate of that sample.
    Xstar <- X[sample.int(length(X), replace = TRUE)]
    bootskewness[i] <- skewness_stat(Xstar)
  }
  seboot <- sd(bootskewness)
  list(skewnessX = skewnessX, seboot = seboot)
}

## Use it
bootstrapskewnessfunc(rainfall, 100)

## Bootstrapping Sample Mean
## Download the library 'bootstrap' into R. The 'mouse' data (Page 11 in
## Efron and Tibshirani) describes mice assigned randomly to treatment and
## control group.
library(bootstrap)

## Bootstrap the sample mean of the mouse control-group data.
print(mouse.c)
x <- mouse.c
theta.hat <- mean(x)
print(theta.hat)

nboot <- 1000
theta.star <- double(nboot)
for (i in seq_len(nboot)) {
  x.star <- sample(x, replace = TRUE)
  theta.star[i] <- mean(x.star)
}
hist(theta.star)
abline(v = theta.hat, lty = 2)
sd(theta.star)
# theoretical value for comparison
sqrt(mean((x - mean(x))^2) / length(x))

## Nonparametric Bootstrap
library(bootstrap)
print(law)
# Extract the two columns explicitly rather than attach(law): the same names
# LSAT and GPA are available below, but nothing is pushed onto the search
# path where it could mask (or be masked by) other objects.
LSAT <- law$LSAT
GPA <- law$GPA
n <- nrow(law)
rho.hat <- cor(LSAT, GPA)
print(rho.hat)

nboot <- 1000
rho.star <- double(nboot)
for (i in seq_len(nboot)) {
  # ONE draw of row indices, applied to both variables, keeps each
  # (LSAT, GPA) pair together in the resample.
  k.star <- sample(n, replace = TRUE)
  LSAT.star <- LSAT[k.star]
  GPA.star <- GPA[k.star]
  rho.star[i] <- cor(LSAT.star, GPA.star)
}
hist(rho.star)
abline(v = rho.hat, lty = 2)
sd(rho.star)

## Caution: Do NOT use the following statements
##     LSAT.star <- sample(LSAT, replace = TRUE)
##     GPA.star <- sample(GPA, replace = TRUE)
## Because, then you are drawing independent bootstrap samples from LSAT and
## GPA. The right way to proceed is to use ONE 'sample' statement.

## Parametric bootstrap
library(bootstrap)
library(MASS)
n <- nrow(law)
rho.hat <- cor(law[, 1], law[, 2])
print(rho.hat)
cor.mat <- cor(law)
print(cor.mat)

nboot <- 10000
## nboot can be variable, playing with this leads to some sensitivity analysis
## In some cases, a high value of bootstrap samples leads to better results
rho.star <- double(nboot)
for (i in seq_len(nboot)) {
  law.star <- mvrnorm(n, c(0, 0), cor.mat)
  rho.star[i] <- cor(law.star[, 1], law.star[, 2])
}
hist(rho.star)
abline(v = rho.hat, lty = 2)
sd(rho.star)

## Calculate bootstrapped mean
mean(rho.star)

## True standard error of the correlation coefficient is given by
## (evaluated here at the estimate rho.hat)
# The handout hard-coded n <- 15 at this point; the sample size is taken
# from the data instead (presumably 15 rows in law -- confirm).
n <- nrow(law)
se.corr <- (1 - rho.hat^2) / sqrt(n - 3)
print(se.corr)

## Now, we do the Fisher's z transformation
z.hat <- 0.5 * log((1 + rho.hat) / (1 - rho.hat))
print(z.hat)
z.star <- 0.5 * log((1 + rho.star) / (1 - rho.star))
hist(z.star)
abline(v = z.hat, lty = 2)
mean(z.star) # large-sample theoretical value is z
sd(z.star) # large-sample theoretical value is below
1 / sqrt(n - 3)
# good confidence interval for z
z.hat + c(-1, 1) * qnorm(0.975) * sd(z.star)
# good confidence interval for rho
tanh(z.hat + c(-1, 1) * qnorm(0.975) * sd(z.star))

## Notes
## The nonparametric bootstrap is very different from the parametric
## bootstrap. In the parametric bootstrap there is no resampling with
## replacement from the original data. Instead we simulate from the
## parametric model that is assumed for the data. This is still a bootstrap
## rather than pure simulation because the estimated parameter value is not
## the true unknown parameter value. So we still have the "sample is not the
## population" issue.
##
## In this case the parametric model is bivariate normal. The mvrnorm
## function in the MASS library simulates multivariate normal random
## vectors. A multivariate normal distribution is determined by two
## structured parameters, the mean vector and the variance-covariance
## matrix. But the correlation coefficient is invariant under changes of
## location and scale. Thus the mean vector does not affect the distribution
## of the sample correlation coefficient, nor do scale changes. Hence we can
## use the zero vector for the mean vector, and we can use the correlation
## matrix for the variance matrix without affecting the distribution of the
## sample correlation coefficient.
##
## The statement
##     law.star <- mvrnorm(n, c(0, 0), cor.mat)
## produces a random sample from the parametric model with parameter
## rho.hat. The statement that follows it calculates rho.star[i] from the
## (parametric) bootstrap data law.star in exactly the same way rho.hat is
## calculated from the original data law.
##
## In calculating Fisher's z we don't need to do another bootstrap. We have
## rho.star stored. Just transform it to find the sampling distribution of z.
## The transformation
##     z <- 0.5 * log((1 + rho) / (1 - rho))
## has a name, the inverse hyperbolic tangent, and a standard R function to
## calculate it,
##     z <- atanh(rho)
## which has an inverse function, the hyperbolic tangent,
##     rho <- tanh(z)
##
## Because of the skewness of the distribution of rho.hat (as shown by the
## skewness of the histogram of rho.star),
##     rho.hat + c(-1, 1) * qnorm(0.975) * sd(rho.star)
## is not a very good 95% confidence interval for the true unknown parameter
## rho. (Point estimate plus or minus 1.96 standard errors of the point
## estimate assumes normality, or at least approximate normality, and here we
## aren't anywhere close to normality.)
##
## Because of the approximate normality of the distribution of z.hat (as
## shown by the approximate normality of the histogram of z.star),
##     z.hat + c(-1, 1) * qnorm(0.975) * sd(z.star)
## is a pretty good 95% confidence interval for the true unknown parameter
## zeta = atanh(rho). Hence
##     tanh(z.hat + c(-1, 1) * qnorm(0.975) * sd(z.star))
## is a pretty good 95% confidence interval for the true unknown parameter
## rho.