Creating ScatterPlots and BoxPlots in R
Courtney Saari (cnosac1@tigers.lsu.edu)
11/13/09

This is a step-by-step self-tutorial on how to create various plots in R. It examines the many uses of the plot() function.

References used:
Crawley, Michael J. 2007. The R Book. John Wiley & Sons, U.K.
Venables, W. N., D. M. Smith, and the R Development Core Team. 1999-2009. An Introduction to R: Notes on R: A Programming Environment for Data Analysis and Graphics. Version 2.10.0 (2009-10-26).

plot() is a generic function that can be used to produce a variety of plots, depending on the type or class of its first argument. For the most part, we use plot() to create scatterplots, but it can also be used to create time-series plots, box-and-whisker plots, and distributional plots.

plot(x, y) - scatterplot of y against x
plot(xy) - this also produces a scatterplot of y against x
plot(x) - when x is a time series, this produces a time-series plot
plot(x) - when x is a numeric vector, this plots the values in the vector against their index in the vector
plot(factor, y) - box-and-whisker plot of y for each level of factor
plot(f) or plot(factor) - bar plot of f
plot(df) - distributional plot of the variables in a data frame
plot(~ expr) - distributional plot of the variables of a number of named objects
plot(y ~ expr) - plots y against every object named in expr
abline(lm(y ~ x)) - adds a regression line to your scatterplot, using the lm() function to fit a linear model

The following pages have some step-by-step coding instructions with a sample dataset (body.csv) and graphics.
# ---- Session setup and basic scatterplots ----
# Loads the body-measurement data set, draws scatterplots of chest and leg
# measurements with fitted regression lines, and saves two of them as PNGs.

# REMOVE ALL PREVIOUS VARIABLES
# NOTE: this wipes the global environment, so only run it in a session whose
# contents you do not need.
rm(list = ls(all.names = TRUE))

# SET YOUR WORKING DIRECTORY TO READ IN FILES
setwd("/Users/Courtney/R_Project/")

# CLOSE ALL PREVIOUS GRAPHICS WINDOWS
graphics.off()

# CREATE AN OUTPUT DIRECTORY TO SAVE ALL PLOTS AND TABLES
# (this only stores the path; the dev.copy() calls below write to the working
# directory, which is the same folder)
od <- "/Users/Courtney/R_Project/"

# IMPORT THE DATA SET
body <- read.csv("/Users/Courtney/R_Project/body_data.csv.dat", header = TRUE)

# CHECK TO MAKE SURE IT LOADED CORRECTLY
head(body, 5)

# MAKING A SIMPLE SCATTERPLOT OF Chest Depth vs Chest Diameter
plot(body$chest_diam, body$chest_depth)

# ADDING A REGRESSION LINE TO THE SCATTERPLOT
abline(lm(chest_depth ~ chest_diam, data = body))

# ADDING & EDITING THE AXIS LABELS
# plot() starts a fresh figure, so the regression line drawn above is not
# part of this one; call abline() again if you want it in the saved image.
plot(
  body$chest_diam, body$chest_depth,
  main = "Chest Diameter vs Depth",
  xlab = "Chest Diameter",
  ylab = "Chest Depth"
)

# SAVING THE SCATTERPLOT
# dev.copy() copies the current figure to a new png device; dev.off() closes
# that device, which writes the file.
dev.copy(png, "ChestDepthvDiam.png")
dev.off()

# MAKE A NEW SCATTERPLOT OF Knee Girth vs Knee Diameter
plot(
  body$knee_diam, body$knee_girth,
  main = "Scatterplot of Leg Measurements",
  xlab = "Diameter (cm)",
  ylab = "Girth (cm)"
)

# ADDING A REGRESSION LINE TO THE SCATTERPLOT
abline(lm(knee_girth ~ knee_diam, data = body))

# ADDING NEW POINTS TO THE SCATTERPLOT
# Points outside the axis limits chosen for the knee data are not shown,
# which is why the axes are readjusted below.
points(body$ankle_diam, body$ankle_girth, col = "blue")

# FINDING THE X & Y AXIS VALUES OF BOTH SETS OF POINTS
range(c(body$knee_diam, body$ankle_diam))
# [1]  9.9 24.3
range(c(body$knee_girth, body$ankle_girth))
# [1] 16.4 49.0

# READJUSTING THE AXES & COLOR-CODING THE 2 SETS OF POINTS
# type = "n" sets up the axes without drawing anything; the limits are chosen
# to cover the ranges printed above. Diameters go on x and girths on y.
plot(
  c(body$knee_diam, body$ankle_diam),
  c(body$knee_girth, body$ankle_girth),
  xlim = c(5, 30),
  ylim = c(10, 60),
  main = "Scatterplot of Leg Measurements",
  xlab = "Diameter (cm)",
  ylab = "Girth (cm)",
  type = "n"
)
points(body$knee_diam, body$knee_girth, col = "turquoise2")
points(body$ankle_diam, body$ankle_girth, col = "hotpink2")

# ADDING THE 2 REGRESSION LINES, COLOR-CODING, & ADJUSTING LINE WIDTH
abline(lm(knee_girth ~ knee_diam, data = body), col = "turquoise2", lwd = 2)
abline(lm(ankle_girth ~ ankle_diam, data = body), col = "hotpink2", lwd = 2)

# CHANGING THE SHAPES OF THE 2 SETS OF POINTS
# ---- Point symbols, legend, labels and annotations ----
# Continues the leg-measurement scatterplot: overlays the points with
# different plotting symbols, adds a legend, saves the figure, and then shows
# several ways to label, color and annotate points. Uses the `body` data
# frame loaded earlier in the script.

# pch selects the plotting symbol
points(body$knee_diam, body$knee_girth, col = "turquoise2", pch = 24) # triangle, point up
points(body$ankle_diam, body$ankle_girth, col = "hotpink2", pch = 22) # square

# CHANGING THE SHAPES OF THE 2 SETS OF POINTS
points(body$knee_diam, body$knee_girth, col = "turquoise2", pch = 5) # open diamond
points(body$ankle_diam, body$ankle_girth, col = "hotpink2", pch = 8) # asterisk

# ADDING A LEGEND
# the locator function allows you to click where on the graph you want the
# legend; it needs an interactive screen device and waits for the click
legend(
  locator(1),
  c("knee", "ankle"),
  pch = c(5, 8),
  col = c("turquoise2", "hotpink2")
)

# SAVING THE SCATTERPLOT
dev.copy(png, "LegGirthsvDiams.png")
dev.off()

# IDENTIFYING INDIVIDUALS IN SCATTERPLOTS USING as.numeric() TO CONVERT THE
# GROUPING FACTOR / SUBJECT IDENTIFIER INTO A COLOR AND/OR SYMBOL
subject <- factor(body$height)
plot(
  body$knee_diam, body$knee_girth,
  col = as.numeric(subject),
  pch = as.numeric(subject),
  xlab = " diameter",
  ylab = "girth"
)
# note this does not work well with this dataset, because there isn't a
# subject or ID variable

# LABELING EACH POINT ON THE SCATTERPLOT WITH A 3RD VARIABLE
plot(
  body$knee_diam, body$knee_girth,
  xlab = "Diameter (cm)",
  ylab = "Girth (cm)",
  main = "Knee Dimensions by Age"
)
# pos = 1 places each label below its point; cex shrinks the label text
text(
  body$knee_diam, body$knee_girth,
  labels = round(body$age, 2),
  pos = 1,
  offset = 0.5,
  cex = 0.7
)

# COLOR-CODING EACH POINT ON THE SCATTERPLOT WITH A 3RD VARIABLE
# points above the median age are drawn in red, the others in black
plot(
  body$knee_diam, body$knee_girth,
  xlab = "Diameter (cm)",
  ylab = "Girth (cm)",
  main = "Knee Dimensions by Age",
  pch = 16,
  col = ifelse(body$age > median(body$age), "red", "black")
)

# ADDING TEXT TO A PLOT
text(locator(1), "part a.")
# you can also type in the exact location of where you would like the text to
# go, if you do not want to use the locator function
text(x = 16, y = 45, "test")

# ADDING SHAPES TO A PLOT
plot(
  body$knee_diam, body$knee_girth,
  xlab = "Diameter (cm)",
  ylab = "Girth (cm)",
  main = "Scatterplot With Shapes"
)

# ADDING A RECTANGLE: rect(xleft, ybottom, xright, ytop)
rect(21, 45, 24.5, 49.5, border = "dodgerblue") # drawing a rectangle around 3 outliers
# ---- Shapes, multi-panel figures and boxplots ----
# Adds shaded rectangles, arrows and a polygon to the current plot, then
# draws a 2 x 2 grid of scatterplots and a series of boxplots. Uses the
# `body` data frame loaded earlier in the script.

# FILLING IN A RECTANGLE WITH SHADING LINES density= and angle=
rect(
  22, 30, 24, 34,
  border = "tomato2",
  density = 12,
  angle = 125,
  col = "tomato2"
)

# ADDING ARROWS WITH THE ARROWHEAD BY DEFAULT AT THE SECOND END
arrows(16.1, 28, 16.1, 29.9, col = "darkorchid2", lwd = 2)

# ADDING A DOUBLE HEADED ARROW
arrows(21.9, 42.9, 22.9, 42.2, col = "goldenrod3", code = 3)

# ADDING AN ERROR BAR (vertical bar with 2 straight ends)
# code = 3 draws a head at both ends; angle = 90 flattens the heads into bars
arrows(22.4, 38, 22.4, 40, code = 3, angle = 90)

# MAKING A POLYGON USING THE LOCATOR FUNCTION
# locator() needs an interactive screen device and waits for the clicks
locations <- locator(6) # click on 6 points
class(locations)
locations$x
locations$y
polygon(
  locations,
  density = 15,
  angle = 45,
  border = "seagreen2",
  col = "seagreen2"
)

# PLOT MULTIPLE GRAPHS AT ONE TIME
# par() returns the previous settings, so keep them and restore the layout
# afterwards; otherwise every later plot is squeezed into the 2 x 2 grid.
old_par <- par(mfrow = c(2, 2))
plot(
  body$height, body$chest_depth,
  xlab = "Height (cm)", ylab = "Chest Depth (cm)"
)
plot(
  body$weight, body$chest_depth,
  xlab = "Weight (kg)", ylab = "Chest Depth (cm)"
)
plot(
  body$height, body$chest_diam,
  xlab = "Height (cm)", ylab = "Chest Diameter (cm)"
)
plot(
  body$weight, body$chest_diam,
  xlab = "Weight (kg)", ylab = "Chest Diameter (cm)"
)
par(old_par)

# BOXPLOTS
# first we need to declare the factors
gender <- factor(body$gender)

# now we can make a simple boxplot with the plot(f, y) function of y for each
# level of f
# you can use plot() or boxplot() they both do the same thing, as long as you
# have already declared the factors
plot(gender, body$height, xlab = "Gender (0=Female, 1=Male)", ylab = "Height")

# adding notches to indicate significant difference on the boxplot
plot(
  body$height ~ gender,
  xlab = "Gender (0=Female, 1=Male)",
  ylab = "Height",
  notch = TRUE
)

# here is a boxplot with more factors
AgeGroup <- factor(body$age_group)
plot(AgeGroup, body$chest_diam, xlab = "Age Group", ylab = "Chest Diameter")

# using boxplots for multiple comparisons with the function:
# plot(response ~ factor)
plot(body$chest_diam ~ factor(AgeGroup))
# note this gives you the same plot as using the code
# plot(AgeGroup, body$chest_diam)