130301 R commands

advertisement
SUMMARY OF R COMMANDS
GENERAL
getwd() # shows the actual directory
setwd(choose.dir()) # sets the directory
setwd("C:/Users/rui.pinto/Documents/Rdocs/experiments")
setwd("../") # goes down one directory in the path
setwd("./data") #adds one directory to the path
install.packages("calibrate") # to download new packages from the internet
package ? calibrate # to get info on the package
library(help = calibrate)
class(x) # shows the Class of objects: character, numeric, integer, complex, logical.
sapply(X[1,],class) #shows the class of every variable
str(x) # shows info on the object
as.numeric(x): # to exchange the class of the object.
Inf, NA and NaN exist:
is.na() # used to logical test if objects are NA. is.nan() the same for NaN. NOTE: NA includes NaN but inverse not true.
table(c(0,1,2,3,NA,3,3,2,2,3),useNA="ifany") #shows how many times each type of value appears
To remove missing values: miss_vals <- is.na(x), x[!miss_vals].
To remove missing values in more than one vector (or matrix): good_vals <- complete.cases(x, y), x[good_vals].
na.rm= FALSE # does not include NA values in the calculation of a function e.g. mean(x=mydata , na.rm=FALSE).
na.omit #is used in some functions to not use NAs
report[is.na(report)] <- 0 # to use zeros instead of NA
If one needs an integer, we can specify it with "L", e.g. 1 is numeric, 1L is integer
help(zemanel) # lists help for the function or dataset
library () # lists a short description of the libraries available
library (help="libraryname") # shows functions in that library
library(calibrate) #
search() # lists all the packages installed at the moment. Functions are used according to a certain order of their
package.
Order of packages : 1 - workspace (.Global environment), 2 - loaded libraries, 3 - packages according to "search()"
"environment" is a collection of pairs "symbol,value" (e.g. x as symbol and 4 as its value).
"closure" (or function closure) is the pair "function, environment")
ls() # lists all objects created
data () # lists all available datasets included in R
data(trees) # shows the data "trees"
rm(x,y) #removes (delete) x and y
rm(list=ls()) # deletes all the variables in the workspace
getwd() # gets the work directory
list.files() # lists all the files in the work directory. It is the same as dir()
b=list.files("C:/My Documents") # gets all the files in this directory.
history(max.show=100, pattern="dimnames") #shows the last 100 commands that include the word "dimnames"
AND, OR, AND/OR - &&,
options(warn=-1) # to stop showing warnings
stop("need to input two values") #error messages
regexpr(string1,string2) # comparison of strings
ageGroups<-cut2(X$age,g=5) # turns a numeric variable into a factor variable
VECTORS
vector(mode = "logical", length = 10) # creates a vector with 10 values of FALSE (in case of "numeric", it's 0's.
z<-c(5,9,1,0) # creates vector
names(z) <- c("foo", "bar", "norf","ze") # gives a name to each of the objects of z
# create sequences of numbers
x<-1:10
seq(1,9,by=2) # Result [1] 1 3 5 7 9
seq(8,20,length=6) # Result [1] 8.0 10.4 12.8 15.2 17.6 20.0
rep(0,100) # creates 100 zeros
rep(1:3,2) # Result [1] 1 2 3 1 2 3
rep(1:3,rep(6,3)) # Result [1] 1 1 1 1 1 1 2 2 2 2 2 2 3 3 3 3 3 3
INDEXING
[ # always returns object of the same class as the original (can be used to more than one element)
[[ # used to extract elements of a list or data frame (single element, class is not necessarily the same)
$ # used to extract elements of a list of data frame by name (similar to [[ )
z[1,1] # first value of matrix
z[c(2,3),2] # indexing for two values in column 2
z[,2] # column 2
z[1:2,] # lines 1 and 2
x[1:6] # first six values of vector x
x[c(2,4,9)] # selects only the values with index 2, 4 and 9
x[-(1:6)] # excludes some values. It is the same as x[7:12]
y=subset(x,x<7) #select those cases that meet the logical condition (x<7)
MATRICES (every element is the same class)
head(x) # displays the first 6 lines of the matrix
z<-cbind(x,y) # creates a matrix of column vectors x y. Can be applied to matrices (rbind also works, for lines)
dim(x)<- c(2,5) # creates a matrix 2 x 5 from a vector with 10 values
z<-matrix(c(5,7,9,6,3,4),nrow=3) # creates matrix with 3 rows (along columns)
z<-matrix(c(5,7,9,6,3,4),nrow=3,byrow=T) # creates matrix with 3 rows (along rows)
dim(z) # to see the dimension of a matrix
nrow(x) and ncol(x) # to see nr of rows or columns of matrix
y%*%x # matrix multiplication
t(x) # transposed matrix
new=old[,-5] # deletes column 5
attach(trees) # the labels of the variables become variables (then use e.g. mean(height))
detach(trees) # "un"-attaches the labels
trees$height # works as attach
apply(trees,2,mean) # calculates the mean of columns of matrix "trees" (for lines use 1)
m <- matrix(1:4, nrow = 2, ncol = 2), dimnames(m) <- list(c("a", "b"), c("c", "d")) # names the elements of m
dimnames(m) <- list(NULL,c("e", "f")) # deletes name of lines and puts a name in columns. Note: "names" and
"rownames" do not work on matrices, only in data frames.
rownames(x, do.NULL = TRUE, prefix = "row") # NOTE: prefix="col" can be used for the columns.
rownames(x) <- value # value is a vector with the names of the rows. Used together with line below.
order(X$var) #gives the indexes of the var by order of magnitude
Y <- X[order(X$var1),] or we can use Y <- X[order(X$var1, X$var2),] to order a matrix according to variables
y=x[1, 2, drop = FALSE] # using the drop=FALSE, y is a matrix, not a vector
LISTS (vectors that can contain elements of different classes)
x<-list(1,"a", TRUE) # creates a list with these 3 elements
x <- list(a = 1, b = 2, c = 3) # to create names in lists
ze<-x[[1]] # creates vector ze with the first element of the list.
b<-x[1] # creates list b. And in this case it is the same as b=x["a"]
b<-x[c(1,3)] # to extract multiple elements of a list
x$a # to access the value of a (which is an element of x). The same as x[["a"]]. But the $ cannot use computed names.
x[[c(1,3)]] # is a vector with the 3rd value of the 1st element of list x. The same as x[[1]][[3]]
FACTORS (used to represent categorical data. May be ordered or unordered)
x <- factor(c("yes", "yes", "no", "yes", "no"))
unclass(x) # returns the numerical values of each level
x <- factor(c("yes", "yes", "no", "yes", "no"), levels = c("yes", "no")) # defines the order of the values
factorsx<-gl(2,5) #makes a vector of 2 levels with 5 points each
interaction(factor1,factor2);
split(
table(X) # counts the number of times each level appears in X
DATA FRAMES
Data frames are a special type of list, where every element of the list has the same length (matrix). Elements are
columns and length of each element is nr rows. Can store different classes of objects in each position.
Looking at data:
dim(X), names(X), nrow(X), ncol(X), quantile(X$var1), summary(X), head(X), class(X),sapply(X[1,],class)
unique(X$var1), length(unique(X$var1)), table(X$var1) (for qual vars), table(X$var1,X$var2) (relation between vars, for
qual vars), any(X$var>40), all(X$var>40); X[X$var1 > 0 & X$var2 > 0,c("Lat","Lon")] (we can also use "or" |);
colSums(X) (has a problem with NA's); colMeans(X, na.rm=TRUE); rowMeans(X, na.rm=TRUE);
mergedXY=merge(X,Y,all=TRUE) #other possibilities are x, y, by.x, by.y (eg. to merge according to one of the variables)
x <- data.frame(foo = 1:4, bar = c(T, T, F, F))
(the next 2 functions can also be applied in vectors)
ze<-x[[1]] # creates vector ze from data frame x, with the first element of the data frame (column 1). Same as ze<-x$a
b<-x[1] # creates list b from data frame x, with the 1st element of the data frame (column 1). And in this case it is the
same as b=x["a"]
row.names # to use with data frames
row.names(x)<-c("ze","manel","pedro", "filipe") # gives these names to the lines
data.matrix() # transforms a data frame into a matrix.
nrow(x) and ncol(x) can be used with data frames
dataX_frame$time<-dataX$time # attribution of one column of one data frame to another data frame
dataX_frame[,2]<-dataX[,2] # the same as above
dataX_frame[,1]<-dataX_matrix[,2] # attribution of one column of matrix to a data frame
dataX_frame[,2]<-dataX_matrix$time # ERROR can't pass atomic vector to frame (as above) using $
melt(misShaped,id.vars="people",variable.name="treatment",value.name="value") #reshapes matrix
BASIC PLOTS
par (col="red")# definitions to plots: col= "red" (color), pch=3 (symbol type), lty (line type), lwd =3 (line width), las=2
(orientation of the axis labels), bg (background color), mar =c(3.4, 4.2, 3.1, 5) (margins size), oma=c (0,0,1,0)(outer
margin size), mfrow = c(2,3) (nr plots per row and column, filled row wise), mfcol (nr plots per row and column, filled
column wise), cex=0.5 (point size)
par(no.readonly = TRUE) # shows the actual values of all the par settings
par("col") # shows the actual color
par(mfrow=c(1,2)) # creates a window of graphics of 1 x 2. Can also use mfcol
example(points) #opens a demo that shows different types of plots
dev.new() # opens a new window for figure
dev.set(1) # changes the window to nr 1, so we can produce a plot there
dev.off(2) # closes graphical window nr 2
dev.list() # to know the number of plots open
graphics.off() # closes all graphs
par(mfrow=c(1,1)) # to return graphics window to standard size
x=c(2,3) y=c(90,80) lines(x,y) # draws a line from point (2,90) to point (3,80)
abline(50,2) # draws a line with origin in y=50 and slope = 2
abline(v=3) # vertical line at x = 3
abline(h=60) # horizontal line at y = 60
fit <- lm(y ~ x); abline(fit) #linear regression on points x,y
lines (x,y) # creates lines
text(x,y,labels) #adds text labels to points
title # adds annotations to x, y axis labels, titles, subtitles, outer margin
plot(x,y,xlab="travel time", ylab="whatever unit", cex.lab=2,cex.axis=1.5) #puts x and y labels and defines size
mtext #adds arbitrary text to the margins (inner or outer)
legend("topleft",legend="Data",pch=20)
axis #adds axis ticks/labels
export formats: pdf, postscript, xfig (to edit a plot by hand in Unix), png (nice bitmap format), jpeg (photos), bitmap
(bnp), bmp (windows bitmap format)
# to define ranges in a plot, histogram, etc
range11<-range(as.numeric(outcome[, 11]), na.rm=TRUE)
plot(jitter(x)) # to be able to see points that are on top of each other
TYPES OF PLOTS
boxplot(Height,col="blue") # boxplot with median, 25% and 75%, plus bars that are 1.5 times the limits of the boxes
boxplot(X$var1 ~ as.factor(X$var2),col="blue","orange", names=c("yes","no"), varwidth=TRUE))
barplot(table(X$var1),col="blue")
hist(X, main="Heart Attack",xlab="30-day Death Rate",xlim= range11,breaks=100)
dens<-density(X$var1), plot(dens,lwd=3,col="blue") # its like a smoothed histogram, but with percentages. Allows to
show more than one distribution.
dens <- density(pData$AGEP), densMales <- density(pData$AGEP[which(pData$SEX==1)]),
plot(dens,lwd=3,col="blue"),lines(densMales,lwd=3,col="orange")
plot(Height,Volume, xlab="sample nr",ylab="temperature") # scatter plot (its identical to pairs) with axis labels
# parameters for scatterplots x,y,type,xlab,ylab,xlim,ylim,cex,col,bg
pairs(trees) # scatter plot of all the variables in matrix "trees"
points(3,4) # draws a dot in coordinates (3,4)
qqplot(x,y), abline(c(0,1)) #plots of percentiles of x against percentiles of y
to plot groups points in different colors:
plot(x,y,type="n") #creates axis but without the points
points(x[g=="Male"], y[g==["Male"], col="green") # first plot one of the groups
points(x[g=="Female"], y[g==["Female"], col="blue", pch=19)
plot(X$var1,Xvar2,col=X$var3) #it can also be done like this
image(1:10, 161:236, as.matrix(X[1:10, 161:236])) # heatmap transposed (maybe transpose the matrix first)
pdf export commands:
pdf(file = "testRplot.pdf") # open the pdf
x <- rnorm(100)
hist(x)# plot whatever we want
dev.off()# it has to be called to close the pdf device
dev.copy2pdf(file="graph1.pdf") #this will copy exactly the graph that we see
LATTICE FUNCTIONS PLOTS
library(lattice)
library(nlme)
library(lattice)
library(nlme)
xyplot(distance ~ age | Subject, data = Orthodont) # this will plot all individuals in different plots
xyplot(distance ~ age | Subject, data = Orthodont, type = "b") # same, but with a line
xyplot, bwplot,histogram,stripplot, dotplot, splot, levelplot, contourplot
CONTROL STRUCTURES
if, else; for; while; repeat; break; next; return
FOR
for(i in 1:ncol(dat))
{asdklfjaldfjk}
for (i in 1:length(id)
{jadfjakjsdflkjalsdj}
if (X==TRUE)
{ALKDJFADJKF
}else
{alkdjflajkdfkl}
# NOTE: when using "else" or "else if", the preceding } must be on the same line as "else"
# one can also use else if (with space between them)
WHILE
while (count<10) {
asdfasdf
}
REPEAT/BREAK
repeat {
asdfalsdkfjalsdjk
if (asdf<0) {break}
}
NEXT
for (i in 1:100){
if (i<=20{
next # skips the first 20 iterations
}
}
RETURN
FUNCTIONS
source("functions_meg.R") # will load all the functions in this script to the workspace
fix (sd) # opens the editor to write a function called sd (although in general one opens it in FILE/OPEN)
help(functionX) # shows the functionX help
invisible(x) # when x is the output, it does not print it (for some functions)
structure of a function:
sd2<- function (x)
{
2*x
}
formals(sd) # lists all the input arguments of the function (formal arguments). In this case "x"
... # indicates a variable number of arguments that are usually passed to other functions, eg:
myplot<- function (x,y,type="l", ...) {
plot(x,y,type = type, ...) } #this passes all the other arguments of "plot" to "myplot"
#in case "..." is the first argument of a function, it means the function does not know how many arguments are going
to be input into it (e.g. functions "paste" and "args")
#arguments that come after the "..." must be named explicitly (no positional or partial matching).
echo=T # use it in the source function, to see the functions in the console as they are processed
#if one object is defined in the body of a function and has no value, the function will look for its value in memory;
#but functions can be defined inside another function. In that case, the function will look for its value in the function.
example:
make.power<-function(n) {
pow <- function(x) {
x^n
}
pow}
Results:
cube<make.power(3)
cube(3) result is 27
square(3) result is 9
SPECIAL FUNCTIONS
lapply(mtcars,mean) # calculates the mean of columns of "mtcars" (always returns a "list")
y1<-lapply(mat1.df,function(x,y) sum(x)+y,y = 5) # example of using built-in function
sapply(split(mtcars$mpg, mtcars$cyl), mean, na.rm=TRUE) # similar to lapply. But if simplify=T, it simplifies (to a vector
or matrix)
apply(trees,2,mean) # calculates the mean of columns of matrix "trees" (for lines use 1)
apply(X,1,quantile,probs=c(0.25,0.75)) # calculates the desired percentiles of each row
split(mtcars$mpg, mtcars$cyl) #splits variable mpg into list with groups defined in variable cyl. Used with lapply, sapply
tapply(iris$Sepal.Length,levels_x,mean) # used in vectors, to calculate something (mean) based on coder "levels_x"
rowSums, rowMeans, colSums, colMeans # used only on matrices (only interesting in large ones)
mapply(rep,1:4, 4:1) # it is like apply, but multivariate.
newdata <- subset(mydata, age >=20 | age <10
newdata<-subset(mydata, sex=="m" | age>25)
newdata<- mydata [which(mydata$gender=='F' & mydata$age >65)
newdata<-subset(mydata, sex=="m" & age>25)
order(vector) # gives the indexes of vector by (alphabetical, numerical) order
ordered_vec<-matA[order (matA [,1], matA [,2]),] # orders matrix matA according to its column 1 and then column 2
sort(vector) #sorts the vector or matrix
READ
Some file types: tab-delimited text; comma-separated text; Excel file; JSON File; HTML/XML file; Database
b<-scan("ap1.txt") # reads a text file with only numbers to a vector b
X<-read.csv(file.choose()) # opens the window to choose the file
read.table # arguments: file, header, sep, colClasses, nrows, comment.char, skip, stringsAsFactors
d<-read.table("ap1.txt", sep=",",header = FALSE) # reads a text file with a table (only numbers) into a data frame d
d<read.table("ap.txt",header=TRUE,row.names=1) # read tab or space delimited with labels not including pos. (1,1)
if row.names=3 it means that the row names are in column 3
read.table("ap2.txt",header=TRUE) # read a tab or space delimited file (including position (1,1)
NOTE: one can use also read.delimited (tab delimited) or read.csv (comma delimited) files.
A <- matrix(scan("matrix.dat", n = 200*2000), 200, 2000, byrow = TRUE) # read big file (only numbers)
#read files with labels in first row
read.table(filename,header=TRUE,sep=',') #read csv files
read.xls("doe.xls", sep="",stringsAsFactors=FALSE) #NOTE: needs library(xlsReadWrite) and only reads .xls (not .xlsx)
read.xlsx(); read.xlsx2() # needs the xlsx package
readLines # to read lines of a text file
source("ap_data.R") # to read in R code files (inverse of dump). Doesn't need to attribute it to new variable.
k<-dget("ap_data.R") # for reading in R code files (inverse of dput). Needs to be attributed to new variable.
load("workspace_meg.RData") # for reading in saved workspaces. In binary
load("ap.02") # loads the variables that are inside the file "ap.02". In binary (saved with "save")
unserialize, for reading single R objects in binary form
## read files directly
con <- file("./data/cameras.csv","r")
cameraData <- read.csv(con)
close(con)
##download files from the internet (may use "library(XML)")
con <- url("http://www.jhsph.edu", "r") , x <- readLines(con,10) # to read 10 lines of an internet page
fileUrl <- "https://data.baltimorecity.gov/api/views/dz54-2aru/rows.csv?accessType=DOWNLOAD"
download.file(fileUrl,destfile="./data/cameras.csv")
list.files("./data"), dateDownloaded <- date()
# to read and parse an internet page
html3 <- htmlTreeParse("http://scholar.google.com/citations?user=HI-I6C0AAAAJ&hl=en", useInternalNodes=
xpathSApply(html3, "//title", xmlValue)
xpathSApply(html3, "//td[@id='col-citedby']", xmlValue) # this one will get "col-citedby" of the table ("td")
WRITE
write.table
writeLines
dump("ap1",file="ap_data.R") # creates file with metadata ready to load using "source". Creates variable ap1
dump(c("a","k"),file="multi_dump.R") # to create files with more than one variable
dput(ap1,file="ap_data.R") # creates file with metadata ready to load using "dget". Doesn't create variable ap1
save.image(file="workspace_meg.RData") # saves the workspace. In binary
savehistory("filename.Rhistory") # saves the history of commands. To reload, do loadhistory("filename.Rhistory")
save(a,file="ap.02") # saves the object "a" in a file "ap.02". In binary
serialize
filename_X<-paste("Pelagia_ctr_",depth_type[[1]],"_",season_type[[1]],".txt",sep="")
write.table(final_data[[1]], file = filename_X,append = FALSE, sep = "\t")
write.xls(response,paste(workpath,currentDOE,"/response.xls",sep=""))
write.table(temp,file=paste(workpath,currentDOE,"/Run",i,".txt",sep=""),sep="\t",dec=",")
save(X,Y,file="./data/cameras.rda") # allows one to save multiple datatables (also can use list instead of X,Y)
DEBUGGING
# message (function continues, maybe o problem), warning (something wrong, but not fatal) , error (fatal error),
condition (indicates that something unexpected occurred; programmers can create their own conditions)
traceback() # prints out the function call stack after an error occurs (does nothing if there's no error)
debug # more important one. flags a function for debug (then it goes one line at a time). Use ENTER to continue.
browser # suspends the execution of a function wherever it is called and puts the function in debug mode
trace# allows to insert debugging code into a function in specific places (in general "browser" command)
recover # allows to modify the error behavior so you can browse the function call stack. Use "options(error=recover)".
print/cat # useful
STATISTICS
mean(x)
var(x)
summary(x)
svd(x) # to do pca
COMPLETE PCA example
# How to open a txt (tab delimited) with a matrix (nothing on position 1,1) with labels on objects and variables and
with commas as decimal point, and then run PCA
C:\Users\rui.pinto\Documents\WorkMEg\data\Carragheenan Paris
dataX<-read.table("carragR.txt",header=TRUE,sep="\t",dec=",",row.names=1)
X<-data.matrix(dataX[6:1795])
Xclass<-dataX[1:5]
plot(1:1790,X[1,], type="l")
plot(1:1790,t(X), type="l")
num_comp<-6 # if one wants to find six components
pca_res<-svd(X, num_comp, num_comp) #Has to be applied on a matrix or dataframe
eigenval<-pca_res$d[1: num_comp]
diag_eig<-diag(eigenval)
scoX<-pca_res$u%*%diag_eig
loadX<-pca_res$v
plot(scoX[,1],scoX[,2],type="p") # plots with dots. For lines do "type="l" (L)
text(scoX[,1],scoX[,2],Xclass[[2]], pos=2,offset=0.5,cex=0.5) # plots labels 0.5 (offset) on the left (pos), size 0.5 (cex)
PACKAGES
httr - to work with http connections
foreign - to get data into R from SAS, SPSS, Octave, etc
library(maps)
DATA ANALYSIS
DATA MUNGING
## CLEANING
names(X)
tolower(names(X)) # to change everything to lowercase
# to split names in the dots:
splitNames = strsplit(names(X),"\\.")
firstElement <- function(x){x[1]}
sapply(splitNames,firstElement) # to split names
names(X)=sapply(splitNames,firstElement)
# to replace symbols or letters
names(X)=sub("_","",names(X),) #use gsub to substitute all of them
# TO CREATE RANGES IN QUANTITATIVE VARIABLES (KINDA HISTOGRAM)
timeRanges <- cut(reviews$time_left,seq(0,3600,by=600))
table(timeRanges,useNA="ifany")
# (ALTERNATIVELY, we can define the nr of ranges, using cut2)
library(Hmisc)
timeRanges<- cut2(reviews$time_left,g=6)
table(timeRanges,useNA="ifany")
# create new variable
X$newvar<-varX
# merge lines of Y that have the same value as X (in the columns defined)
#NOTE: we can use by, by.x, by.y, all
XY<-merge(X,Y,by.x="sporder",by.y="DDRS")
# SORT AND ORDER
Xvector_sorted<-sort(X)
Xmatrix_sorted<-Xmatrix(order(Xmatrix$ind_nr))
# RESHAPE (create more lines, using only the defined variables)
melt(X, id.vars="people",variable.name="treatment",value.name="value")
INITIAL PLOTS
plot(jitter(pData$sporder))
# boxplots
boxplot(pData$AGEP ~ as.factor(pData$DDRS),col="blue")
boxplot(pData$AGEP ~ as.factor(pData$DDRS),col=c("blue","orange"),names=c("yes","no"),varwidth=TRUE)
# density plots (line histograms)
dens <- density(pData$AGEP), densMales <- density(pData$AGEP[which(pData$SEX==1)])
density(na.omit(X$v1)) #does not uses the NAs
plot(dens,lwd=3,col="blue"), lines(densMales,lwd=3,col="orange") #for multiple distributions
# histograms
hist(pData$AGEP,col="blue",breaks=100,main="Age")
# scatterplots (with colors according to a column)
plot(pData$JWMNP,pData$WAGP,pch=19,col=pData$PUMA,cex=0.5)
# scatterplots (with size according to a column)
size_dots<-pData$JWMNP/max(pData$JWMNP)
plot(pData$JWMNP,Pdata$wagp,pch=19,cex=size_dots*10)
# to plot the points of a variable that are NA in another variable
y[x < 0] <- NA
boxplot(x ~ is.na(y))
#NOTE: some plots may need library(Hmisc)
HIERARCHICAL CLUSTERING
set.seed(1234); par(mar=c(0,0,0,0))
x <- rnorm(12,mean=rep(1:3,each=4),sd=0.2)
y <- rnorm(12,mean=rep(c(1,2,1),each=4),sd=0.2)
plot(x,y,col="blue",pch=19,cex=2)
text(x+0.05,y+0.05,labels=as.character(1:12))
dataFrame <- data.frame(x=x,y=y)
distxy <- dist(dataFrame)
hClustering <- hclust(distxy) # col=color_vector will add colour
plot(hClustering)
(heatmap)
dataMatrix <- as.matrix(dataFrame)[sample(1:12),]
heatmap(dataMatrix)
K-MEANS
dataFrame <- data.frame(x,y)
kmeansObj <- kmeans(dataFrame,centers=3)
names(kmeansObj)
(heatmap)
set.seed(1234)
dataMatrix <- as.matrix(dataFrame)[sample(1:12),]
kmeansObj2 <- kmeans(dataMatrix,centers=3) #nstart=100 will produce 100 models with random start points and
average them
par(mfrow=c(1,2),mar=rep(0.2,4))
image(t(dataMatrix)[,nrow(dataMatrix):1],yaxt="n")
image(t(dataMatrix)[,order(kmeansObj$cluster)],yaxt="n")
(to plot the variable centers, do the command below to the different cluster centers)
plot(kClust$center[1,1:10],pch=19,ylab="Cluster Center",xlab="")
PCA
svd1 <- svd(scale(dataMatrixOrdered)) #UV-scaled
par(mfrow=c(1,3))
image(t(dataMatrixOrdered)[,nrow(dataMatrixOrdered):1])
plot(svd1$u[,1],40:1,,xlab="Row",ylab="First left singular vector",pch=19)
plot(svd1$v[,1],xlab="Column",ylab="First right singular vector",pch=19)
(eigenvalues)
par(mfrow=c(1,2))
plot(svd1$d,xlab="Column",ylab="Singluar value",pch=19)
plot(svd1$d^2/sum(svd1$d^2),xlab="Column",ylab="Percent of variance explained",pch=19)
(for faster pca calculation)
fast.svd(scale(bigMatrix),tol=0)# tol=0 is the tolerance in variance that it calculates more pc's
NOTE: in the case of missing values, we can impute (calculates close values to the ones missing, using knn)
library(impute)
dataMatrix2 <- impute.knn(dataMatrix2)$data
(to regenerate the data)
svd1 <- svd(scale(faceData))
# %*% is matrix multiplication
# Here svd1$d[1] is a constant
approx1 <- svd1$u[,1] %*% t(svd1$v[,1]) * svd1$d[1]
# In these examples we need to make the diagonal matrix out of d
approx5 <- svd1$u[,1:5] %*% diag(svd1$d[1:5])%*% t(svd1$v[,1:5])
approx10 <- svd1$u[,1:10] %*% diag(svd1$d[1:10])%*% t(svd1$v[,1:10])
REGRESSION
plot(galton$parent,galton$child,pch=19,col="blue")
lm1 <- lm(galton$child ~ galton$parent) # y~x
lines(galton$parent,lm1$fitted,col="red",lwd=3)
summary(lm1)
(residuals)
plot(galton$parent,lm1$residuals,col="blue",pch=19)
abline(c(0,0),col="red",lwd=3)
NOTE: plot(lm1) # gives some plots to evaluate the model (residuals, outliers)
MULTIPLE REGRESSION
lmBoth <- lm(hunger$Numeric ~ hunger$Year + hunger$Sex + hunger$Sex*hunger$Year)
plot(hunger$Year,hunger$Numeric,pch=19)
points(hunger$Year,hunger$Numeric,pch=19,col=((hunger$Sex=="Male")*1+1))
abline(c(lmBoth$coeff[1],lmBoth$coeff[2]),col="red",lwd=3)
abline(c(lmBoth$coeff[1] + lmBoth$coeff[3],lmBoth$coeff[2] +lmBoth$coeff[4]),col="black",lwd=3)
REGRESSION WITH FACTOR VARIABLES
lm1 <- lm(movies$score ~ as.factor(movies$rating))
summary(lm1)
REGRESSION WITH BINARY OUTCOME (logistic regression)
logRegRavens <- glm(ravensData$ravenWinNum ~ ravensData$ravenScore,family="binomial")
summary(logRegRavens)
(confidence intervals)
exp(logRegRavens$coeff)
(ANOVA)
anova(logRegRavens,test="Chisq")
CONFIDENCE INTERVALS
confint(sampleLm4,level=0.95)
REGRESSION FOR COUNT OUTCOMES
plot(gaData$julian,gaData$visits,pch=19,col="darkgrey",xlab="Julian",ylab="Visits")
lm1 <- lm(gaData$visits ~ gaData$julian)
abline(lm1,col="red",lwd=3)
(CONFIDENCE INTERVALS)
library(sandwich)
confint.agnostic <- function (object, parm, level = 0.95, ...)
{
cf <- coef(object); pnames <- names(cf)
if (missing(parm))
parm <- pnames
else if (is.numeric(parm))
parm <- pnames[parm]
a <- (1 - level)/2; a <- c(a, 1 - a)
pct <- stats:::format.perc(a, 3)
fac <- qnorm(a)
ci <- array(NA, dim = c(length(parm), 2L), dimnames = list(parm,
pct))
ses <- sqrt(diag(sandwich::vcovHC(object)))[parm]
ci[] <- cf[parm] + ses %o% fac
ci
}
confint(glm1)
############################################
glm2 <- glm(gaData$simplystats ~ julian(gaData$date),offset=log(visits+1),
family="poisson",data=gaData)
plot(julian(gaData$date),gaData$simplystats/(gaData$visits+1),col="grey",xlab="Date",
ylab="Fitted Rates",pch=19)
lines(julian(gaData$date),glm2$fitted/(gaData$visits+1),col="blue",lwd=3)
REGRESSION FOR RATES
glm2 <- glm(gaData$simplystats ~ julian(gaData$date),offset=log(visits+1),
family="poisson",data=gaData)
plot(julian(gaData$date),glm2$fitted,col="blue",pch=19,xlab="Date",ylab="Fitted Counts")
points(julian(gaData$date),glm1$fitted,col="red",pch=19)
ROBUST LINEAR REGRESSION
lm1 <- lm(y ~ x); rlm1 <- rlm(y ~ x)
ANOVA
anova(lm1)
#another way of doing anova:
lm1 <- aov(movies$score ~ as.factor(movies$rating))
TukeyHSD(lm1)#honestly significant difference test
ANOVA WITH MULTIPLE FACTORS
aovObject4 <- aov(movies$score ~ movies$genre + movies$rating + movies$box.office)
Model selection
lm1 <- lm(score ~ .,data=movies)
aicFormula <- step(lm1)
library(leaps);
regSub <- regsubsets(score ~ .,data=movies)
plot(regSub)
library(BMA)
bicglm1 <- bic.glm(score ~.,data=movies,glm.family="gaussian")
print(bicglm1)
DATA
http://skardhamar.github.com/rga/ #google analytics
Download