#R - IntroGraphics
#R script for Ann Arbor ASA R graphics class, 16 Jan. 2012, Kyle Enger
#A few basics:
?FunctionName #Open the help page for an R function (it should appear in your web browser).
z <- seq(2, 100, by = 2) #Build & store the even numbers 2, 4, ..., 100. '<-' (a left arrow) assigns; some people type '=' instead.
print(z) #Display the sequence we just stored.
z[3] #Pull out a single value by its index.
z[2:4] #Pull out a run of consecutive values.
#Index numbers (including row & column indexes) always go in square brackets.
#Function arguments always go inside parentheses.
#Start by trying out a few handy functions.
#Each comment below describes the statement on its own line. Run the statements in order: the later calls draw on top of whatever graph is currently open.
plot(x=runif(10), y=runif(10)) #Scatterplot of 10 random sets of xy coordinates. plot() starts a new graph (opening a graphics device if none is open).
plot(seq(from=0, to=10, by=0.1), sin(seq(from=0, to=10, by=0.1))) #Points on a sine wave
curve(sin(x), xlim=c(0,10)) #Graphing the sine wave function directly over a certain range
points(seq(from=0,to=10,by=0.1), sin(seq(from=0,to=10,by=0.1))) #Overlaying some points on the sine wave. points() does not start a new graph.
curve(1/x*sin(5*x), xlim=c(0,10)) #Another arbitrary function; curve() also starts a new graph. Note the automatic y axis label.
x = seq(from=1, to=9, by=0.01) #Can add another arbitrary function using lines(), but first we need to generate & store x values.
lines(x, cos(x), col='brown') #The x and y axis settings set (implicitly) by curve() above are retained.
abline(h=0, lwd=4) #Adding a fat horizontal line at 0 (hence 'h=0'). 'lwd'= line width.
abline(a=0, b=1, col='red') #Adding a red line of intercept 0 and slope 1. 'col'= color.
arrows(x0=2.3, y0=-1/2, x1=1, y1=-1.1, angle=15) #Adding an arrow from (x0, y0) to (x1, y1).
text(x=3.2, y=-1/2, 'Minimum') #Adding a label at specified coordinates.
text(x=1.3, y=4, expression(y == over(1,x)~'sin(5x)')) #Formatting mathematical expressions is tricky; see ?plotmath or demo(plotmath).
legend('topright', title='The Legend', legend=c('y=0','y=x'), lty=c('solid','solid'), lwd=c(4,1), col=c('black','red')) #Note that the legend is completely independent of anything else. You have to make certain to specify it correctly (here 'lwd' and 'col' are typed to match the two abline() calls above).
points(locator(1),pch=4,cex=3) #Uses locator() to put a big X wherever you click on the graph (this waits for a mouse click, so it needs an interactive graphics window). 'pch'= plotting character, 'cex'= character expansion.
#Now let's make a two-part graph.
par(mfrow=c(1,2),lwd=2,col='cyan') #'mfrow' divides up the plot area (here: 1 row, 2 columns). You can access many options with par() including 'lty', 'lwd', 'pch', 'cex', etc. See ?par. These settings stay in effect on this graphics device until you change them.
curve(sin(x), main='Sine') #Left pane.
curve(cos(x), main='Cosine') #Right pane, completing the double paned graph.
#Any questions?
#Play around with these a little bit - see what you can do. Discuss with your neighbors, & raise your hand if you're stuck.
#Basic dataset exploration: R comes with many built-in datasets.
?ToothGrowth #See what the heck this dataset is about. Guinea pigs, naturally.
tg = ToothGrowth #Make a copy with a shorter name, for convenience
str(tg) #View the structure of the dataset
summary(tg) #Summarize the dataset (quantiles & mean for numeric columns, counts for factor columns)
head(tg) #Look at the first few rows of the dataset
tail(tg) #Look at the last few rows of the dataset
tg[13:19,] #Look at an arbitrary section of the dataset (rows 13 to 19, all columns)
tg[sample(nrow(tg),5),] #Look at 5 random rows of the dataset. nrow(tg) is used instead of typing the row count (60) by hand, so the same line works for a dataset of any size.
#Now we start graphing.
dev.new() #Start a brand-new graphics window (AKA, 'device').
hist(tg$len, main='Histogram of tooth length') #Simple histogram. Good place to start.
par(mfrow=c(1,2)) #Split graphics device into 1 row, 2 columns.
plot(density(ToothGrowth$len), main='Density plot of tooth length') #Density plot. Depends on bandwidth - see next line.
plot(density(ToothGrowth$len, bw=1), main='Density plot of tooth length,\nbandwidth=1') #Same data with the bandwidth set by hand ('bw=1'). Note '\n' for a new line in the title.
dev.off() #Turn off (i.e., close) the graphics device.
dev.new() #Start a new one. Note that the 'mfrow' option in par() a few lines above has been cleared: par() settings belong to the device they were set on.
boxplot(len ~ supp, data=tg, main='Box plot of tooth length by treatment') #Box plot, with the 2 treatments side-by-side. Standard model syntax ('len ~ supp' = length by supplement).
par(mfrow=c(1,2)) #Split graphics device into 1 row, 2 columns (again).
hist(tg$len[which(tg$supp=='OJ')], main='Histogram of tooth length,\norange juice group') #which() outputs index values that match the condition.
points(x=median(tg$len[which(tg$supp=='OJ')]), y=0) #Mark median.
hist(tg$len[which(tg$supp=='VC')], main='Histogram of tooth length,\nvit. C group') #Same histogram for the other treatment group.
points(x=median(tg$len[which(tg$supp=='VC')]), y=0, pch=4, cex=2) #Mark median with a large X.
#Axes don't match on these 2 histograms, which makes it hard to compare them.
par(mfrow=c(2,1)) #Now we'll have 1 histogram atop the other.
hist(tg$len[which(tg$supp=='OJ')], main='Histogram of tooth length,\norange juice group', breaks=seq(from=0, to=35, by=5), ylim=c(0,10), xlab='Tooth length') #Identical 'breaks' and 'ylim' in both calls give the 2 histograms matching axes.
hist(tg$len[which(tg$supp=='VC')], main='Histogram of tooth length,\nvit. C group', breaks=seq(from=0, to=35, by=5), ylim=c(0,10), xlab='Tooth length')
#Now let's try a couple scatterplots.
#The 2-row, 1-column layout set with par(mfrow=c(2,1)) is still in effect, so these 2 plots share one page.
plot(x=tg$dose, y=tg$len, main='Scatterplot of dose by length') #Plain scatterplot: many points land on top of each other.
plot(x=jitter(tg$dose), y=tg$len, main='Scatterplot of dose by length with jitter') #What if points are on top of each other? jitter() addresses this by adding 'noise'.
mtext('Same graph, with and without jitter()', outer=TRUE, padj=1, cex=1.5) #Can use mtext() for multichart titles. 'outer=TRUE' puts the text in the outer margin of the whole page rather than in one pane.
#Bar plots
table(tg$dose,tg$supp) #How many observations do we have for each dose? (Rows are doses, columns are treatments.)
par(mfrow=c(1,2)) #Switch back to 2 graphs next to each other
barplot(table(tg$dose), main='Number of observations\nfor each dose') #Bar heights are the counts from table().
barplot(table(tg$supp), main='Number of observations\nfor each treatment') #Not terribly interesting, but it illustrates barplot().
#Any questions?
#Now load your own data, like this:
YourData = read.csv('X:/Path/To/Your/File/YourFile.csv',header=TRUE) #Read in your own .CSV file and examine/graph as above. The path shown is a placeholder: replace it with the location of a real file before running this line.
#or pick a dataset that looks interesting from the many datasets already in R.
library(help='datasets') #Browse the list of datasets in R (use arrow keys to scroll). Hit 'q' when done.
?YourChoiceOfDataset #Learn more about some dataset or another (replace the placeholder with a real dataset name), and examine/graph it as above.
#Explore your data with graphs. Don't worry about making a brilliant chart;
#just try out a few things and get a feel for the graphing functions.
#Take some time, try out some functions, discuss with your neighbors, raise your hand if you have any questions.
#Exporting graphs, to include them in manuscripts, presentations, etc.
getwd() #Find out what your working directory is. That's where R looks for files and saves files.
setwd('X:/Path/To/Your/Working/Directory') #Set your working directory to whatever you like - maybe a flash drive. The path shown is a placeholder: replace it with a real directory before running this line.
dev.new() #Start a fresh graphics window for the test plot.
plot(seq(from=1,to=16,by=1),rep(0,16),pch=seq(from=1,to=16,by=1), cex=2) #Making a plot to show possible plot characters ('pch' values 1 through 16).
dev.copy(pdf, 'TestPlot.pdf') #Creates a new graphics device (a PDF file in the working directory) and copies the current plot window to it.
dev.off() #Turns off that new graphics device, saving the file.
#Now open your working directory and doubleclick the file, and see what it looks like. It often differs from the plot window.
#A more reliable way to export graphs: write them directly to the device/filetype you want: png(), tiff(), pdf(), etc.
png('TestPlot2.png') #Opens a .PNG file as the graphics device; plots drawn from here on go into the file. In addition to .PNG and .PDF, R supports many other graphics formats. .TIF is also useful.
hist(rnorm(1000, mean=0, sd=1), main='1000 standard normal variates') #Drawn into the file, not on screen.
dev.off() #Closes the file device, which finishes writing the file. Now you can open the file in your working directory and view it.
#It is convenient to tinker with these graphs by continually modifying & re-running code,
#then repeatedly viewing the graph until it looks right.
#===========================================================================================================#
#Apply what you have learned ASAP (tomorrow!) when you return to work. Learning R requires a LOT of practice.
#===========================================================================================================#
#R graphics are very powerful, but great power often comes with great confusion.
#There are options you can set to do just about anything you want - but they are not always well documented.
#To search for general R info, try http://rseek.org or just Google it, prefixing your query with 'GNU R'.
?par #Open the help page listing the graphical options. Remember that many of these options can also be used within graphing functions.
#Get any color you want: http://research.stowers-institute.org/efg/R/Color/Chart/index.htm
colors()[155] #colors() is a function, but consider it a big vector of color names; this picks out the 155th name. Use the names that it outputs with the 'col' option.
#How to add special symbols: http://statisticsr.blogspot.com/2008/01/special-symbols-on-r-plot.html
#Very useful site for learning R in general (or Google 'Quick-R'): http://www.statmethods.net/
#R graph gallery (includes source code): http://addictedtor.free.fr/graphiques/
#There are other packages that further extend graphing capabilities.
#The 'lattice' package is powerful and widely used. 'ggplot2' is another popular package.
#Install packages like this (very easy): install.packages('YourPackageName')
#The standard 'R Editor' in Windows is pretty crummy. This free text editor is better: http://notepad-plus-plus.org/
#Better yet: Switch to Linux (try Ubuntu or Linux Mint) and use 'gedit' with the 'rgedit' plugin. R runs faster on Linux.