Stat 410/510, January 24, 2014: Intro to R & some models

> # Stat 410/510: January 24, 2014
> # Demonstrate simple simulation using a hypothetical model of data
>
> HealthCntr <- read.table( file="http://users.humboldt.edu/rizzardi/Data.dir/jollyrancher.TXT", header=T )
> names( HealthCntr )
[1] "Treatment" "Before"    "After"
> # HealthCntr    # Typing the object's name would show the entire dataset
> head( HealthCntr, 5 )    # just show the first 5 rows
  Treatment Before After
1       hot   98.7  98.9
2       hot   96.9  99.7
3       hot   95.9  98.2
4       hot   98.2  99.1
5       hot   97.0  98.5
> attach( HealthCntr )    # Make variables within the data frame more easily accessible
The following objects are masked from HealthCntr (position 3):

    After, Before, Treatment

> table( Treatment )    # add up the counts for each category of the Treatment variable
Treatment
   cool     hot nothing
     33      35      36
> boxplot( After ~ Treatment )    # Let's work with those who had candy

[Plot: side-by-side boxplots of After temperature for the cool, hot, and nothing groups]

> x <- After[ Treatment != "nothing" ]    # Remove those who got the "nothing" treatment
> x
 [1]  98.9  99.7  98.2  99.1  98.5  98.7  97.4  98.1  98.5  97.8  99.4  99.8
[13]  99.9  98.7 100.4  98.6  98.3  98.7  98.5  98.3  99.3  99.5  99.4  99.5
[25]  99.3  97.8  98.3  99.5  99.3 100.1  98.4  98.8 102.3  97.4  98.7  98.9
[37]  97.5  98.5  99.5  98.5 100.5  99.1  98.2  99.1  98.5  99.3  99.7  98.2
[49]  98.6  98.6  99.2  98.4  98.5  98.6  99.1  98.0 100.4  97.5  99.6  99.4
[61]  98.5  99.0  99.0  98.0  99.1  99.3  98.7  98.9
> length( x )    # Confirm we have the smaller dataset: cool+hot = 33+35 = 68
[1] 68
> detach( HealthCntr )
>
> summary( x )
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
  97.40   98.48   98.75   98.90   99.32  102.30
> sd( x ); var( x )    # a semicolon allows multiple commands on the same line
[1] 0.8189856
[1] 0.6707375
>
> dev.new()    # create a new graph page while preserving the previous graph(s);
>              # not using dev.new() would cause the new graph to erase the old one
> hist( x )            # histogram of the After temperatures
> rug( jitter(x) )     # jitter adds random variation to reduce overlapping of points

[Plot: Histogram of x, temperatures from about 97 to 102, with a jittered rug along the axis]

> x != 102.3    # Let's remove the obviously feverish student
 [1]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
[13]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
[25]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE
[37]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
[49]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
[61]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
> x <- x[ x != 102.3 ]    # x no longer contains the 102.3 data point
> length( x )
[1] 67
> mean( x ); sd( x ); var( x )
[1] 98.84627
[1] 0.7091028
[1] 0.5028268
>
> dev.new()
> qqnorm( x )
> qqline( x )    # normal quantile-quantile (QQ) plot to inspect normality;
>                # perfectly normal data would follow the line

[Plot: Normal Q-Q plot of x with reference line]
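An aside that is not part of the original handout: a formal test can complement the visual
Q-Q check. shapiro.test() in base R's stats package tests the null hypothesis that a sample
came from a normal distribution; here it would be applied to the 67 After temperatures
stored in x above.

# Sketch only (not run in class): a numeric companion to the Q-Q plot.
# x is the vector of 67 After temperatures created above.
shapiro.test( x )    # a small p-value would suggest a departure from normality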
> # A thick tail causes an S shape; for example, compare to t-distribution data.
> y <- rt( 67, df=3 )
> dev.new()
> qqnorm( y ); qqline( y )

[Plot: Normal Q-Q plot of the simulated t(3) data, showing an S shape in the tails]

> t.test( x, mu=98.6 )    # do a t-test of the hypothesis mean = 98.6

        One Sample t-test

data:  x
t = 2.8427, df = 66, p-value = 0.005946
alternative hypothesis: true mean is not equal to 98.6
95 percent confidence interval:
 98.67330 99.01923
sample estimates:
mean of x
 98.84627

> # Use a model to simulate data: y = 98.6 + error, error ~ N( 0, 0.71^2 )
> set.seed(1234)    # Set the random-generator seed so I can duplicate the "random" data
> error <- rnorm( 67, mean=0, sd=0.71 )    # 67 error values from a Normal distribution
> y <- 98.6 + error    # could have just done y <- rnorm( 67, mean=98.6, sd=0.71 )
> # demonstrate what random normal data can look like in a q-q plot
> dev.new()
> qqnorm( y ); qqline( y )

[Plot: Normal Q-Q plot of the simulated y values]

> # Normal random data doesn't look exactly like theoretical normal data because of randomness.
> # Below we show the natural variability of normal data in 9 normal quantile plots.
> dev.new()
> par( mfrow=c(3,3) )    # Three rows and three columns of graphs per page
> for( i in 1:9 )        # A for-loop to repeat the procedure 9 times
+ {
+   Y <- rnorm( 67, mean=98.6, sd=0.71 )
+   qqnorm( Y ); qqline( Y )
+ }

[Plot: a 3-by-3 grid of normal Q-Q plots, one for each of the 9 simulated samples]

> par( mfrow=c(1,1) )    # set graphing back to one graph per page
>
> t.test( y, mu=98.6 )   # test the randomly generated data (lower-case y) against 98.6

        One Sample t-test

data:  y
t = -2.7155, df = 66, p-value = 0.008437
alternative hypothesis: true mean is not equal to 98.6
95 percent confidence interval:
 98.18977 98.53741
sample estimates:
mean of x
 98.36359

> tresults <- t.test( y, mu=98.6 )    # we can save the results to an object
> names( tresults )                   # the object contains a list of many components
[1] "statistic"   "parameter"   "p.value"     "conf.int"    "estimate"
[6] "null.value"  "alternative" "method"      "data.name"
> tresults$p.value                    # we can extract the components from the list
[1] 0.008437235
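A small aside that is not in the original handout: the other pieces of the saved htest
object can be pulled out the same way as the p-value. The component names below come
straight from the names(tresults) output above.

# Sketch only: extracting more components from the saved t-test object.
tresults$conf.int     # the 95 percent confidence interval
tresults$estimate     # the sample mean (labeled "mean of x")
tresults$statistic    # the t statistic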
>
> ### Next we will show the t-statistics and p-values from t-tests done on 1000 randomly
> ### generated datasets where the null hypothesis is true.  Theoretically, the t-statistics
> ### should follow a t-distribution with df = 67-1 = 66, and the p-values should follow a
> ### uniform distribution with lower bound 0 and upper bound 1.
>
> tstats <- rep( NA, 1000 )    # Create a place to store the 1000 t-statistics; NA = not available, i.e., missing
> pvals  <- rep( NA, 1000 )    # Create a place to store the 1000 p-values
> for( i in 1:1000 )
+ {
+   Y <- rnorm( 67, mean=98.6, sd=0.71 )
+   testResult <- t.test( Y, mu=98.6 )
+   tstats[i] <- testResult$statistic
+   pvals[i]  <- testResult$p.value
+ }
>
> dev.new()
> par( mfrow=c(2,2) )
> hist( tstats )
> qqplot( qt(ppoints(1000), df=66), tstats, main="QQ plot for t (df=66) and \n simulated t-stats" )
> qqline( tstats, distribution=function(p) qt(p, df=66) )    # compares the data to a t distribution with df=66
> hist( pvals )
> qqplot( qunif(ppoints(1000)), pvals, main="QQ plot for Uniform(0,1) and \n simulated p-values" )
> # Note: in the above, we could have just used ppoints(1000), since those are Uniform(0,1) quantiles
> qqline( pvals, distribution=function(p){ qunif(p) } )

[Plot: 2-by-2 panel -- histogram of tstats, QQ plot of the simulated t-statistics against t (df=66),
 histogram of pvals, and QQ plot of the simulated p-values against Uniform(0,1)]

> par( mfrow=c(1,1) )    # back to one graph per page
>
> # Show that about 5% of the simulations resulted in Type I errors
> sum( pvals < 0.025 )
[1] 29
> sum( pvals < 0.025 )/1000
[1] 0.029
> mean( pvals < 0.025 )
[1] 0.029
> mean( pvals > 0.975 )
[1] 0.026
> mean( (pvals < 0.025 | pvals > 0.975) )    # using the logical "|", which is the syntax for "or"
[1] 0.055
> sum( (pvals < 0.025 | pvals > 0.975) ) / 1000
[1] 0.055
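An aside that is not part of the original handout: because t.test() reports a two-sided
p-value, the Type I error rate at the 0.05 level can also be estimated directly from the
stored p-values.

# Sketch only: direct estimate of the Type I error rate at alpha = 0.05.
mean( pvals < 0.05 )    # proportion of the 1000 tests on true-null data that would reject;
                        # this should also come out near 0.05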
#################################################################################
# Stat 410/510: January 24, 2014
# Demonstrate simple simulation using a hypothetical model of data

HealthCntr <- read.table( file="http://users.humboldt.edu/rizzardi/Data.dir/jollyrancher.TXT", header=T )
names( HealthCntr )
# HealthCntr            # Typing the object's name would show the entire dataset
head( HealthCntr, 5 )   # just show the first 5 rows
attach( HealthCntr )    # Make variables within the data frame more easily accessible
table( Treatment )      # add up the counts for each category of the Treatment variable
boxplot( After ~ Treatment )    # Let's work with those who had candy
x <- After[ Treatment != "nothing" ]    # Remove those who got the "nothing" treatment
x
length( x )             # Confirm we have the smaller dataset: cool+hot = 33+35 = 68
detach( HealthCntr )

summary( x )
sd( x ); var( x )       # a semicolon allows multiple commands on the same line

dev.new()               # create a new graph page while preserving the previous graph(s);
                        # not using dev.new() would cause the new graph to erase the old one
hist( x )               # histogram of the After temperatures
rug( jitter(x) )        # jitter adds random variation to reduce overlapping of points

x != 102.3              # Let's remove the obviously feverish student
x <- x[ x != 102.3 ]    # x no longer contains the 102.3 data point
length( x )
mean( x ); sd( x ); var( x )

dev.new()
qqnorm( x )
qqline( x )             # normal quantile-quantile (QQ) plot to inspect normality;
                        # perfectly normal data would follow the line

# A thick tail causes an S shape; for example, compare to t-distribution data.
y <- rt( 67, df=3 )
dev.new()
qqnorm( y ); qqline( y )

t.test( x, mu=98.6 )    # do a t-test of the hypothesis mean = 98.6

# Use a model to simulate data: y = 98.6 + error, error ~ N( 0, 0.71^2 )
set.seed(1234)          # Set the random-generator seed so I can duplicate the "random" data
error <- rnorm( 67, mean=0, sd=0.71 )    # 67 error values from a Normal distribution
y <- 98.6 + error       # could have just done y <- rnorm( 67, mean=98.6, sd=0.71 )

# demonstrate what random normal data can look like in a q-q plot
dev.new()
qqnorm( y ); qqline( y )

# Normal random data doesn't look exactly like theoretical normal data because of randomness.
# Below we show the natural variability of normal data in 9 normal quantile plots.
dev.new()
par( mfrow=c(3,3) )     # Three rows and three columns of graphs per page
for( i in 1:9 )         # A for-loop to repeat the procedure 9 times
{
  Y <- rnorm( 67, mean=98.6, sd=0.71 )
  qqnorm( Y ); qqline( Y )
}
par( mfrow=c(1,1) )     # set graphing back to one graph per page

t.test( y, mu=98.6 )    # test the randomly generated data (lower-case y) against 98.6
tresults <- t.test( y, mu=98.6 )    # we can save the results to an object
names( tresults )       # the object contains a list of many components
tresults$p.value        # we can extract the components from the list

### Next we will show the t-statistics and p-values from t-tests done on 1000 randomly
### generated datasets where the null hypothesis is true.  Theoretically, the t-statistics
### should follow a t-distribution with df = 67-1 = 66, and the p-values should follow a
### uniform distribution with lower bound 0 and upper bound 1.
tstats <- rep( NA, 1000 )    # Create a place to store the 1000 t-statistics; NA = not available, i.e., missing
pvals  <- rep( NA, 1000 )    # Create a place to store the 1000 p-values
for( i in 1:1000 )
{
  Y <- rnorm( 67, mean=98.6, sd=0.71 )
  testResult <- t.test( Y, mu=98.6 )
  tstats[i] <- testResult$statistic
  pvals[i]  <- testResult$p.value
}

dev.new()
par( mfrow=c(2,2) )
hist( tstats )
qqplot( qt(ppoints(1000), df=66), tstats, main="QQ plot for t (df=66) and \n simulated t-stats" )
qqline( tstats, distribution=function(p) qt(p, df=66) )    # compares the data to a t distribution with df=66
hist( pvals )
qqplot( qunif(ppoints(1000)), pvals, main="QQ plot for Uniform(0,1) and \n simulated p-values" )
# Note: in the above, we could have just used ppoints(1000), since those are Uniform(0,1) quantiles
qqline( pvals, distribution=function(p){ qunif(p) } )
par( mfrow=c(1,1) )     # back to one graph per page

# Show that about 5% of the simulations resulted in Type I errors
sum( pvals < 0.025 )
sum( pvals < 0.025 )/1000
mean( pvals < 0.025 )
mean( pvals > 0.975 )
mean( (pvals < 0.025 | pvals > 0.975) )    # using the logical "|", which is the syntax for "or"
sum( (pvals < 0.025 | pvals > 0.975) ) / 1000
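A final aside, not part of the original handout: the 1000-test simulation above can also be
written without a for-loop by using base R's replicate(). This is a minimal, self-contained
sketch under the same model (n = 67, mean 98.6, sd 0.71); the object name pvals2 is made up
for illustration.

# Sketch only: a replicate()-based version of the simulation loop.
set.seed(1234)
pvals2 <- replicate( 1000, t.test( rnorm(67, mean=98.6, sd=0.71), mu=98.6 )$p.value )
hist( pvals2 )    # should again look roughly Uniform(0,1), as in the panel above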