SUMMARY OF R COMMANDS GENERAL getwd() # shows the current working directory setwd(choose.dir()) # sets the directory setwd("C:/Users/rui.pinto/Documents/Rdocs/experiments") setwd("../") # goes up one directory in the path (to the parent directory) setwd("./data") #adds one directory to the path install.packages("calibrate") # to download new packages from the internet package ? calibrate # to get info on the package library(help = calibrate) class(x) # shows the Class of objects: character, numeric, integer, complex, logical. sapply(X[1,],class) #shows the class of every variable str(x) # shows info on the object as.numeric(x): # to change (coerce) the class of the object. Inf, NA and NaN exist: is.na() # used to test whether objects are NA. is.nan() the same for NaN. NOTE: is.na() is also TRUE for NaN, but is.nan() is FALSE for NA. table(c(0,1,2,3,NA,3,3,2,2,3),useNA="ifany") #shows how many times each type of value appears To remove missing values: miss_vals <- is.na(x); x[!miss_vals]. To remove missing values in more than one vector (or matrix): good_vals <- complete.cases(x, y); x[good_vals]. na.rm=TRUE # removes the NA values before the calculation of a function, e.g. mean(x=mydata, na.rm=TRUE). With the default na.rm=FALSE the NAs are kept and the result is NA. na.omit #is used in some functions to not use NAs report[is.na(report)] <- 0 # to use zeros instead of NA If one needs an integer, we can specify it with "L", e.g. 1 is numeric, 1L is integer help(zemanel) # lists help for the function or dataset library () # lists a short description of the libraries available library (help="libraryname") # shows functions in that library library(calibrate) # loads (attaches) the package search() # lists all the packages attached at the moment (the search path). Functions are used according to a certain order of their package. Order of packages : 1 - workspace (.Global environment), 2 - loaded libraries, 3 - packages according to "search()" "environment" is a collection of pairs "symbol,value" (e.g. x as symbol and 4 as its value). 
"closure" (or function closure) is the pair "function, environment") ls() # lists all objects created data () # lists all available datasets included in R data(trees) # shows the data "trees" rm(x,y) #removes (delete) x and y rm(list=ls()) # deletes all the variables in the workspace getwd() # gets the work directory list.files() # lists all the files in the work directory. It is the same as dir() b=list.files("C:/My Documents") # gets all the files in this directory. history(max.show=100, pattern="dimnames") #shows the last 100 commands that include the word "dimnames" AND, OR, AND/OR - &&, options(warn=-1) # to stop showing warnings stop("need to input two values") #error messages regexpr(string1,string2) # comparison of strings ageGroups<-cut2(X$age,g=5) # turns a numeric variable into a factor variable VECTORS vector(mode = "logical", length = 10) # creates a vector with 10 values of FALSE (in case of "numeric", it's 0's. z<-c(5,9,1,0) # creates vector names(z) <- c("foo", "bar", "norf","ze") # gives a name to each of the objects of z # create sequences of numbers x<-1:10 seq(1,9,by=2) # Result [1] 1 3 5 7 9 seq(8,20,length=6) # Result [1] 8.0 10.4 12.8 15.2 17.6 20.0 rep(0,100) # creates 100 zeros rep(1:3,2) # Result [1] 1 2 3 1 2 3 rep(1:3,rep(6,3)) # Result [1] 1 1 1 1 1 1 2 2 2 2 2 2 3 3 3 3 3 3 INDEXING [ # always returns object of the same class as the original (can be used to more than one element) [[ # used to extract elements of a list or data frame (single element, class is not necessarily the same) $ # used to extract elements of a list of data frame by name (similar to [[ ) z[1,1] # first value of matrix z[c(2,3),2] # indexing for two values in column 2 z[,2] # column 2 z[1:2,] # lines 1 and 2 x[1:6] # first six values of vector x x[c(2,4,9)] # selects only the values with index 2, 4 and 9 x[-(1:6)] # excludes some values. 
It is the same as x[7:12] (for a vector x with 12 values) y=subset(x,x<7) #select those cases that meet the logical condition (x<7) MATRICES (every element is the same class) head(x) # displays the first 6 lines of the matrix z<-cbind(x,y) # creates a matrix of column vectors x y. Can be applied to matrices (rbind also works, for lines) dim(x)<- c(2,5) # creates a matrix 2 x 5 from a vector with 10 values z<-matrix(c(5,7,9,6,3,4),nrow=3) # creates matrix with 3 rows (along columns) z<-matrix(c(5,7,9,6,3,4),nrow=3,byrow=T) # creates matrix with 3 rows (along rows) dim(z) # to see the dimension of a matrix nrow(x) and ncol(x) # to see nr of rows or columns of matrix y%*%x # matrix multiplication t(x) # transposed matrix new=old[,-5] # deletes column 5 attach(trees) # the labels of the variables become variables (then use e.g. mean(Height)) detach(trees) # "un"-attaches the labels trees$Height # accesses the column directly, without needing attach apply(trees,2,mean) # calculates the mean of columns of matrix "trees" (for lines use 1) m <- matrix(1:4, nrow = 2, ncol = 2); dimnames(m) <- list(c("a", "b"), c("c", "d")) # names the elements of m dimnames(m) <- list(NULL,c("e", "f")) # deletes name of lines and puts a name in columns. Note: "names" does not give the row/column names of a matrix (it does give the column names of a data frame); for matrices use rownames/colnames or dimnames. rownames(x, do.NULL = TRUE, prefix = "row") # NOTE: for the columns use colnames(x, do.NULL = TRUE, prefix = "col"). rownames(x) <- value # value is a vector with the names of the rows. Used together with line below. order(X$var) #gives the indexes of the var by order of magnitude Y <- X[order(X$var1),] or we can use Y <- X[order(X$var1, X$var2),] to order a matrix according to variables y=x[1, 2, drop = FALSE] # using the drop=FALSE, y is a matrix, not a vector LISTS (vectors that can contain elements of different classes) x<-list(1,"a", TRUE) # creates a list with these 3 elements x <- list(a = 1, b = 2, c = 3) # to create names in lists ze<-x[[1]] # creates vector ze with the first element of the list. b<-x[1] # creates list b. 
And in this case it is the same as b=x["a"] b<-x[c(1,3)] # to extract multiple elements of a list x$a # to access the value of a (which is an element of x). The same as x[["a"]]. But the $ cannot use computed names. x[[c(1,3)]] # is a vector with the 3rd value of the 1st element of list x. The same as x[[1]][[3]] FACTORS (used to represent categorical data. May be ordered or unordered) x <- factor(c("yes", "yes", "no", "yes", "no")) unclass(x) # returns the numerical values of each level x <- factor(c("yes", "yes", "no", "yes", "no"), levels = c("yes", "no")) # defines the order of the values factorsx<-gl(2,5) #makes a vector of 2 levels with 5 points each interaction(factor1,factor2); split( table(X) # counts the number of times each level appears in X DATA FRAMES Data frames are a special type of list, where every element of the list has the same length (matrix). Elements are columns and length of each element is nr rows. Can store different classes of objects in each position. Looking at data: dim(X), names(X), nrow(X), ncol(X), quantile(X$var1), summary(X), head(X), class(X),sapply(X[1,],class) unique(X$var1), length(unique(X$var1)), table(X$var1) (for qual vars), table(X$var1,X$var2) (relation between vars, for qual vars), any(X$var>40), all(X$var>40); X[X$var1 > 0 & X$var2 > 0,c("Lat","Lon")] (we can also use "or" |); colSums(X) (has a problem with NA's); colMeans(X, na.rm=TRUE); rowMeans(X, na.rm=TRUE); mergedXY=merge(X,Y,all=TRUE) #other possibilities are x, y, by.x, by.y (eg. to merge according to one of the variables) x <- data.frame(foo = 1:4, bar = c(T, T, F, F)) (the next 2 functions can also be applied in vectors) ze<-x[[1]] # creates vector ze from data frame x, with the first element of the data frame (column 1). Same as ze<-x$a b<-x[1] # creates list b from data frame x, with the 1st element of the data frame (column 1). 
And in this case it is the same as b=x["a"] row.names # to use with data frames row.names(x)<-c("ze","manel","pedro", "filipe") # gives these names to the lines data.matrix() # transforms a data frame into a matrix. nrow(x) and ncol(x) can be used with data frames dataX_frame$time<-dataX$time # attribution of one column of one data frame to another data frame dataX_frame[,2]<-dataX[,2] # the same as above dataX_frame[,1]<-dataX_matrix[,2] # attribution of one column of matrix to a data frame dataX_frame[,2]<-dataX_matrix$time # ERROR can't pass atomic vector to frame (as above) using $ melt(misShaped,id.vars="people",variable.name="treatment",value.name="value") #reshapes matrix BASIC PLOTS par (col="red")# definitions to plots: col= "red" (color), pch=3 (symbol type), lty (line type), lwd =3 (line width), las=2 (orientation of the axis labels), bg (background color), mar =c(3.4, 4.2, 3.1, 5) (margins size), oma=c (0,0,1,0)(outer margin size), mfrow = c(2,3) (nr plots per row and column, filled row wise), mfcol (nr plots per row and column, filled column wise), cex=0.5 (point size) par(no.readonly = TRUE) # shows the actual values of all the par settings par("col") # shows the actual color par(mfrow=c(1,2)) # creates a window of graphics of 1 x 2. 
Can also use mfcol example(points) #opens a demo that shows different types of plots dev.new() # opens a new window for figure dev.set(1) # changes the window to nr 1, so we can produce a plot there dev.off(2) # closes graphical window nr 2 dev.list() # lists the graphics devices (plot windows) that are open graphics.off() # closes all graphs par(mfrow=c(1,1)) # to return graphics window to standard size x=c(2,3) y=c(90,80) lines(x,y) # draws a line from point (2,90) to point (3,80) abline(50,2) # draws a line with intercept y=50 (at x=0) and slope = 2 abline(v=3) # vertical line at x = 3 abline(h=60) # horizontal line at y = 60 fit <- lm(y ~ x); abline(fit) #linear regression on points x,y lines (x,y) # creates lines text(x,y,labels) #adds text labels to points title # adds annotations to x, y axis labels, titles, subtitles, outer margin plot(x,y,xlab="travel time", ylab="whatever unit", cex.lab=2,cex.axis=1.5) #puts x and y labels and defines size mtext #adds arbitrary text to the margins (inner or outer) legend("topleft",legend="Data",pch=20) axis #adds axis ticks/labels export formats: pdf, postscript, xfig (to edit a plot by hand in Unix), png (nice bitmap format), jpeg (photos), bitmap (bnp), bmp (windows bitmap format) # to define ranges in a plot, histogram, etc range11<-range(as.numeric(outcome[, 11]), na.rm=TRUE) plot(jitter(x)) # to be able to see points that are on top of each other TYPES OF PLOTS boxplot(Height,col="blue") # boxplot with median, 25% and 75%, plus whiskers that extend to the most extreme point within 1.5 times the box height (interquartile range) from the box boxplot(X$var1 ~ as.factor(X$var2),col=c("blue","orange"), names=c("yes","no"), varwidth=TRUE) barplot(table(X$var1),col="blue") hist(X, main="Heart Attack",xlab="30-day Death Rate",xlim= range11,breaks=100) dens<-density(X$var1); plot(dens,lwd=3,col="blue") # it's like a smoothed histogram, but with percentages. Allows showing more than one distribution. 
dens <- density(pData$AGEP); densMales <- density(pData$AGEP[which(pData$SEX==1)]); plot(dens,lwd=3,col="blue"); lines(densMales,lwd=3,col="orange") plot(Height,Volume, xlab="sample nr",ylab="temperature") # scatter plot (it's identical to pairs) with axis labels # parameters for scatterplots x,y,type,xlab,ylab,xlim,ylim,cex,col,bg pairs(trees) # scatter plot of all the variables in matrix "trees" points(3,4) # draws a dot in coordinates (3,4) qqplot(x,y); abline(c(0,1)) #plots of percentiles of x against percentiles of y To plot groups of points in different colors: plot(x,y,type="n") #creates the axes but without the points points(x[g=="Male"], y[g=="Male"], col="green") # first plot one of the groups points(x[g=="Female"], y[g=="Female"], col="blue", pch=19) plot(X$var1,X$var2,col=X$var3) #it can also be done like this image(1:10, 161:236, as.matrix(X[1:10, 161:236])) # heatmap transposed (maybe transpose the matrix first) pdf export commands: pdf(file = "testRplot.pdf") # open the pdf x <- rnorm(100) hist(x)# plot whatever we want dev.off()# it has to be called to close the pdf device dev.copy2pdf(file="graph1.pdf") #this will copy exactly the graph that we see LATTICE FUNCTIONS PLOTS library(lattice) library(nlme) xyplot(distance ~ age | Subject, data = Orthodont) # this will plot all individuals in different plots xyplot(distance ~ age | Subject, data = Orthodont, type = "b") # same, but with a line xyplot, bwplot,histogram,stripplot, dotplot, splom, levelplot, contourplot CONTROL STRUCTURES if, else; for; while; repeat; break; next; return FOR for(i in 1:ncol(dat)) {asdklfjaldfjk} for (i in 1:length(id)) {jadfjakjsdflkjalsdj} if (X==TRUE) {ALKDJFADJKF }else {alkdjflajkdfkl} # NOTE: when using "else" or "else if", the preceding } must be on the same line as "else" # one can also use else if (with space between them) WHILE while (count<10) { asdfasdf } REPEAT/BREAK repeat { asdfalsdkfjalsdjk if (asdf<0) {break} } NEXT for (i in 1:100){ if 
(i<=20) { next # skips the first 20 iterations } } RETURN FUNCTIONS source("functions_meg.R") # will load all the functions in this script to the workspace fix (sd) # opens the editor to write a function called sd (although in general one opens it in FILE/OPEN) help(functionX) # shows the functionX help invisible(x) # when x is the output, it does not print it (for some functions) structure of a function: sd2<- function (x) { 2*x } formals(sd2) # lists all the input arguments of the function (formal arguments). In this case "x" ... # indicates a variable number of arguments that are usually passed to other functions, eg: myplot<- function (x,y,type="l", ...) { plot(x,y,type = type, ...) } #this passes all the other arguments given to "myplot" on to "plot" #in case "..." is the first argument of a function, it means the function does not know how many arguments are going to be input into it (e.g. functions "paste" and "cat") #arguments that come after the "..." must be named explicitly (no positional or partial matching). echo=T # use it in the source function, to see the functions in the console as they are processed #if an object is used in the body of a function but is not defined there (a free variable), the function will look for its value in the environment where the function was defined (usually the workspace); #but functions can be defined inside another function. In that case, the inner function will look for the value first in the function where it was defined. example: make.power<-function(n) { pow <- function(x) { x^n } pow} Results: cube<-make.power(3) square<-make.power(2) cube(3) result is 27 square(3) result is 9 SPECIAL FUNCTIONS lapply(mtcars,mean) # calculates the mean of columns of "mtcars" (always returns a "list") y1<-lapply(mat1.df,function(x,y) sum(x)+y,y = 5) # example of using an anonymous (inline) function with an extra argument sapply(split(mtcars$mpg, mtcars$cyl), mean, na.rm=TRUE) # similar to lapply. 
But if simplify=TRUE (the default), it simplifies the result (to a vector or matrix) apply(trees,2,mean) # calculates the mean of columns of matrix "trees" (for lines use 1) apply(X,1,quantile,probs=c(0.25,0.75)) # calculates the desired percentiles of each row split(mtcars$mpg, mtcars$cyl) #splits variable mpg into list with groups defined in variable cyl. Used with lapply, sapply tapply(iris$Sepal.Length,levels_x,mean) # used in vectors, to calculate something (mean) for each group defined by the factor "levels_x" rowSums, rowMeans, colSums, colMeans # used on matrices or numeric data frames (only interesting in large ones) mapply(rep,1:4, 4:1) # it is like sapply, but multivariate: it calls rep(1,4), rep(2,3), rep(3,2), rep(4,1). newdata <- subset(mydata, age >=20 | age <10) newdata<-subset(mydata, sex=="m" | age>25) newdata<- mydata[which(mydata$gender=='F' & mydata$age >65),] newdata<-subset(mydata, sex=="m" & age>25) order(vector) # gives the indexes of vector by (alphabetical, numerical) order ordered_vec<-matA[order (matA [,1], matA [,2]),] # orders matrix matA according to its column 1 and then column 2 sort(vector) #sorts the vector or matrix READ Some file types: tab-delimited text; comma-separated text; Excel file; JSON File; HTML/XML file; Database b<-scan("ap1.txt") # reads a text file with only numbers to a vector b X<-read.csv(file.choose()) # opens the window to choose the file read.table # arguments: file, header, sep, colClasses, nrows, comment.char, skip, stringsAsFactors d<-read.table("ap1.txt", sep=",",header = FALSE) # reads a text file with a table (only numbers) into a data frame d d<-read.table("ap.txt",header=TRUE,row.names=1) # read tab or space delimited with labels not including pos. (1,1) if row.names=3 it means that the row names are in column 3 read.table("ap2.txt",header=TRUE) # read a tab or space delimited file (including position (1,1)) NOTE: one can use also read.delim (tab delimited) or read.csv (comma delimited) files. 
A <- matrix(scan("matrix.dat", n = 200*2000), 200, 2000, byrow = TRUE) # read big file (only numbers) #read files with labels in first row read.table(filename,header=TRUE,sep=',') #read csv files read.xls("doe.xls", sep="",stringsAsFactors=FALSE) #NOTE: needs library(xlsReadWrite) and only reads .xls (not .xlsx) read.xlsx(); read.xlsx2() # needs the xlsx package readLines # to read lines of a text file source("ap_data.R") # to read in R code files (inverse of dump). Doesn't need to be assigned to a new variable. k<-dget("ap_data.R") # for reading in R code files (inverse of dput). Needs to be assigned to a new variable. load("workspace_meg.RData") # for reading in saved workspaces. In binary load("ap.02") # loads the variables that are inside the file "ap.02". In binary (saved with "save") unserialize, for reading single R objects in binary form ## read files directly con <- file("./data/cameras.csv","r") cameraData <- read.csv(con) close(con) ##download files from the internet (may use "library(XML)") con <- url("http://www.jhsph.edu", "r"); x <- readLines(con,10); close(con) # to read 10 lines of an internet page fileUrl <- "https://data.baltimorecity.gov/api/views/dz54-2aru/rows.csv?accessType=DOWNLOAD" download.file(fileUrl,destfile="./data/cameras.csv") list.files("./data"); dateDownloaded <- date() # to read and parse an internet page html3 <- htmlTreeParse("http://scholar.google.com/citations?user=HI-I6C0AAAAJ&hl=en", useInternalNodes=TRUE) xpathSApply(html3, "//title", xmlValue) xpathSApply(html3, "//td[@id='col-citedby']", xmlValue) # this one will get "col-citedby" of the table ("td") WRITE write.table writeLines dump("ap1",file="ap_data.R") # creates file with metadata ready to load using "source". Creates variable ap1 dump(c("a","k"),file="multi_dump.R") # to create files with more than one variable dput(ap1,file="ap_data.R") # creates file with metadata ready to load using "dget". Doesn't create variable ap1 save.image(file="workspace_meg.RData") # saves the workspace. 
In binary savehistory("filename.Rhistory") # saves the history of commands. To reload, do loadhistory("filename.Rhistory") save(a,file="ap.02") # saves the object "a" in a file "ap.02". In binary serialize filename_X<-paste("Pelagia_ctr_",depth_type[[1]],"_",season_type[[1]],".txt",sep="") write.table(final_data[[1]], file = filename_X,append = FALSE, sep = "\t") write.xls(response,paste(workpath,currentDOE,"/response.xls",sep="")) write.table(temp,file=paste(workpath,currentDOE,"/Run",i,".txt",sep=""),sep="\t",dec=",") save(X,Y,file="./data/cameras.rda") # allows one to save multiple datatables (also can use list instead of X,Y) DEBUGGING # message (function continues, maybe o problem), warning (something wrong, but not fatal) , error (fatal error), condition (indicates that something unexpected occurred; programmers can create their own conditions) traceback() # prints out the function call stack after an error occurs (does nothing if there's no error) debug # more important one. flags a function for debug (then it goes one line at a time). Use ENTER to continue. browser # suspends the execution of a function wherever it is called and puts the function in debug mode trace# allows to insert debugging code into a function in specific places (in general "browser" command) recover # allows to modify the error behavior so you can browse the function call stack. Use "options(error=recover)". 
print/cat # useful STATISTICS mean(x) var(x) summary(x) svd(x) # to do pca COMPLETE PCA example # How to open a txt (tab delimited) with a matrix (nothing on position 1,1) with labels on objects and variables and with commas as decimal point, and then run PCA C:\Users\rui.pinto\Documents\WorkMEg\data\Carragheenan Paris dataX<-read.table("carragR.txt",header=TRUE,sep="\t",dec=",",row.names=1) X<-data.matrix(dataX[6:1795]) Xclass<-dataX[1:5] plot(1:1790,X[1,], type="l") plot(1:1790,t(X), type="l") num_comp<-6 # if one wants to find six components pca_res<-svd(X, num_comp, num_comp) #Has to be applied on a matrix or dataframe eigenval<-pca_res$d[1: num_comp] diag_eig<-diag(eigenval) scoX<-pca_res$u%*%diag_eig loadX<-pca_res$v plot(scoX[,1],scoX[,2],type="p") # plots with dots. For lines do "type="l" (L) text(scoX[,1],scoX[,2],Xclass[[2]], pos=2,offset=0.5,cex=0.5) # plots labels 0.5 (offset) on the left (pos), size 0.5 (cex) PACKAGES httr - to work with http connections foreign - to get data into R from SAS, SPSS, Octave, etc library(maps) DATA ANALYSIS DATA MUNGING ## CLEANING names(X) tolower(names(X)) # to change everything to lowercase # to split names in the dots: splitNames = strsplit(names(X),"\\.") firstElement <- function(x){x[1]} sapply(splitNames,firstElement) # to split names names(X)=sapply(splitNames,firstElement) # to replace symbols or letters names(X)=sub("_","",names(X),) #use gsub to substitute all of them # TO CREATE RANGES IN QUANTITATIVE VARIABLES (KINDA HISTOGRAM) timeRanges <- cut(reviews$time_left,seq(0,3600,by=600)) table(timeRanges,useNA="ifany") # (ALTERNATIVELY, we can define the nr of ranges, using cut2) library(Hmisc) timeRanges<- cut2(reviews$time_left,g=6) table(timeRanges,useNA="ifany") # create new variable X$newvar<-varX # merge lines of Y that have the same value as X (in the columns defined) #NOTE: we can use by, by.x, by.y, all XY<-merge(X,Y,by.x="sporder",by.y="DDRS") # SORT AND ORDER Xvector_sorted<-sort(X) 
Xmatrix_sorted<-Xmatrix[order(Xmatrix$ind_nr),] # RESHAPE (create more lines, using only the defined variables) melt(X, id.vars="people",variable.name="treatment",value.name="value") INITIAL PLOTS plot(jitter(pData$sporder)) # boxplots boxplot(pData$AGEP ~ as.factor(pData$DDRS),col="blue") boxplot(pData$AGEP ~ as.factor(pData$DDRS),col=c("blue","orange"),names=c("yes","no"),varwidth=TRUE) # density plots (line histograms) dens <- density(pData$AGEP); densMales <- density(pData$AGEP[which(pData$SEX==1)]) density(na.omit(X$v1)) #does not use the NAs plot(dens,lwd=3,col="blue"); lines(densMales,lwd=3,col="orange") #for multiple distributions # histograms hist(pData$AGEP,col="blue",breaks=100,main="Age") # scatterplots (with colors according to a column) plot(pData$JWMNP,pData$WAGP,pch=19,col=pData$PUMA,cex=0.5) # scatterplots (with size according to a column) size_dots<-pData$JWMNP/max(pData$JWMNP) plot(pData$JWMNP,pData$WAGP,pch=19,cex=size_dots*10) # to plot the points of a variable that are NA in another variable y[x < 0] <- NA boxplot(x ~ is.na(y)) #NOTE: some plots may need library(Hmisc) HIERARCHICAL CLUSTERING set.seed(1234); par(mar=c(0,0,0,0)) x <- rnorm(12,mean=rep(1:3,each=4),sd=0.2) y <- rnorm(12,mean=rep(c(1,2,1),each=4),sd=0.2) plot(x,y,col="blue",pch=19,cex=2) text(x+0.05,y+0.05,labels=as.character(1:12)) dataFrame <- data.frame(x=x,y=y) distxy <- dist(dataFrame) hClustering <- hclust(distxy) # col=color_vector will add colour plot(hClustering) (heatmap) dataMatrix <- as.matrix(dataFrame)[sample(1:12),] heatmap(dataMatrix) K-MEANS dataFrame <- data.frame(x,y) kmeansObj <- kmeans(dataFrame,centers=3) names(kmeansObj) (heatmap) set.seed(1234) dataMatrix <- as.matrix(dataFrame)[sample(1:12),] kmeansObj2 <- kmeans(dataMatrix,centers=3) #nstart=100 will fit 100 models with random start points and keep the best one (lowest total within-cluster sum of squares) par(mfrow=c(1,2),mar=rep(0.2,4)) image(t(dataMatrix)[,nrow(dataMatrix):1],yaxt="n") image(t(dataMatrix)[,order(kmeansObj2$cluster)],yaxt="n") (to 
plot the variable centers, do the command below to the different cluster centers) plot(kClust$center[1,1:10],pch=19,ylab="Cluster Center",xlab="") PCA svd1 <- svd(scale(dataMatrixOrdered)) #UV-scaled par(mfrow=c(1,3)) image(t(dataMatrixOrdered)[,nrow(dataMatrixOrdered):1]) plot(svd1$u[,1],40:1,,xlab="Row",ylab="First left singular vector",pch=19) plot(svd1$v[,1],xlab="Column",ylab="First right singular vector",pch=19) (eigenvalues) par(mfrow=c(1,2)) plot(svd1$d,xlab="Column",ylab="Singluar value",pch=19) plot(svd1$d^2/sum(svd1$d^2),xlab="Column",ylab="Percent of variance explained",pch=19) (for faster pca calculation) fast.svd(scale(bigMatrix),tol=0)# tol=0 is the tolerance in variance that it calculates more pc's NOTE: in the case of missing values, we can impute (calculates close values to the ones missing, using knn) library(impute) dataMatrix2 <- impute.knn(dataMatrix2)$data (to regenerate the data) svd1 <- svd(scale(faceData)) # %*% is matrix multiplication # Here svd1$d[1] is a constant approx1 <- svd1$u[,1] %*% t(svd1$v[,1]) * svd1$d[1] # In these examples we need to make the diagonal matrix out of d approx5 <- svd1$u[,1:5] %*% diag(svd1$d[1:5])%*% t(svd1$v[,1:5]) approx10 <- svd1$u[,1:10] %*% diag(svd1$d[1:10])%*% t(svd1$v[,1:10]) REGRESSION plot(galton$parent,galton$child,pch=19,col="blue") lm1 <- lm(galton$child ~ galton$parent) # y~x lines(galton$parent,lm1$fitted,col="red",lwd=3) summary(lm1) (residuals) plot(galton$parent,lm1$residuals,col="blue",pch=19) abline(c(0,0),col="red",lwd=3) NOTE: plot(lm1) # gives some plots to evaluate the model (residuals, outliers) MULTIPLE REGRESSION lmBoth <- lm(hunger$Numeric ~ hunger$Year + hunger$Sex + hunger$Sex*hunger$Year) plot(hunger$Year,hunger$Numeric,pch=19) points(hunger$Year,hunger$Numeric,pch=19,col=((hunger$Sex=="Male")*1+1)) abline(c(lmBoth$coeff[1],lmBoth$coeff[2]),col="red",lwd=3) abline(c(lmBoth$coeff[1] + lmBoth$coeff[3],lmBoth$coeff[2] +lmBoth$coeff[4]),col="black",lwd=3) REGRESSION WITH FACTOR 
VARIABLES lm1 <- lm(movies$score ~ as.factor(movies$rating)) summary(lm1) REGRESSION WITH BINARY OUTCOME (logistic regression) logRegRavens <- glm(ravensData$ravenWinNum ~ ravensData$ravenScore,family="binomial") summary(logRegRavens) (confidence intervals) exp(logRegRavens$coeff) (ANOVA) anova(logRegRavens,test="Chisq") CONFIDENCE INTERVALS confint(sampleLm4,level=0.95) REGRESSION FOR COUNT OUTCOMES plot(gaData$julian,gaData$visits,pch=19,col="darkgrey",xlab="Julian",ylab="Visits") lm1 <- lm(gaData$visits ~ gaData$julian) abline(lm1,col="red",lwd=3) (CONFIDENCE INTERVALS) library(sandwich) confint.agnostic <- function (object, parm, level = 0.95, ...) { cf <- coef(object); pnames <- names(cf) if (missing(parm)) parm <- pnames else if (is.numeric(parm)) parm <- pnames[parm] a <- (1 - level)/2; a <- c(a, 1 - a) pct <- stats:::format.perc(a, 3) fac <- qnorm(a) ci <- array(NA, dim = c(length(parm), 2L), dimnames = list(parm, pct)) ses <- sqrt(diag(sandwich::vcovHC(object)))[parm] ci[] <- cf[parm] + ses %o% fac ci } confint(glm1) ############################################ glm2 <- glm(gaData$simplystats ~ julian(gaData$date),offset=log(visits+1), family="poisson",data=gaData) plot(julian(gaData$date),gaData$simplystats/(gaData$visits+1),col="grey",xlab="Date", ylab="Fitted Rates",pch=19) lines(julian(gaData$date),glm2$fitted/(gaData$visits+1),col="blue",lwd=3) REGRESSION FOR RATES glm2 <- glm(gaData$simplystats ~ julian(gaData$date),offset=log(visits+1), family="poisson",data=gaData) plot(julian(gaData$date),glm2$fitted,col="blue",pch=19,xlab="Date",ylab="Fitted Counts") points(julian(gaData$date),glm1$fitted,col="red",pch=19) ROBUST LINEAR REGRESSION lm1 <- lm(y ~ x); rlm1 <- rlm(y ~ x) ANOVA anova(lm1) #another way of doing anova: lm1 <- aov(movies$score ~ as.factor(movies$rating)) TukeyHSD(lm1)#honestly significant difference test ANOVA WITH MULTIPLE FACTORS aovObject4 <- aov(movies$score ~ movies$genre + movies$rating + movies$box.office) Model selection lm1 <- 
lm(score ~ .,data=movies) aicFormula <- step(lm1) library(leaps); regSub <- regsubsets(score ~ .,data=movies) plot(regSub) library(BMA) bicglm1 <- bic.glm(score ~.,data=movies,glm.family="gaussian") print(bicglm1) DATA http://skardhamar.github.com/rga/ #google analytics