Uploaded by tzuching.h1229

R Programming Cheat Sheet: Data Manipulation & Analysis

advertisement
#1. file.choose(), df <- read.csv("")
#read file (may need to skip some rows; check the file in Excel first,
#then use skip = num)
#2. data cleaning
#-Date
#df$Date <- as.Date(df$Date, format = "%d-%m-%y") - e.g. 01-09-22; use %Y if the year has 4 digits (01-09-2022)
#Sep 1 2023 -%b %d %Y , December 1 2022- %B %d %Y
#df <- df[order(df$Date, decreasing = FALSE),] -order
#attend_graduate <-df[df$grad == "1",] --create new data
#df$Vol <- gsub("K", "", df$Vol) --delete the K; for $ use gsub("\\$", "", df$Vol)
#converting character to number
#df$number.of.shares[df$number.of.shares == "None"] <- NA -change None into NA
#df$number.of.shares <- as.numeric(df$number.of.shares)
#delete columns
#df$print.table <- NULL (when the whole column is "None" or NA)
#to see data missing - df$Date[is.na(df$open)]
#df <- na.omit(df) - drop row with missing data
#rename variables
#names(df)[names(df) == "Number.of.Trade"] <- "num_trade"
#names(df)[2:5] <- c("open", "high", "low", "close")
#names(df) <- tolower(names(df)) lowercase
#Calculate
#(2 +4 )/(2-5 ) need bracket
#abs(-2) absolute value
#sqrt(9) square root = 9^(1/2)
#64^(1/4)
#exp(1) exponentials
#log(1) logarithms.-- log(100, base= 10)
#Vectors
#numeric vectors: a <- c(1, 2, 3, 4)
#Logical vectors : a <- c(TRUE, FALSE, TRUE, TRUE)
#character vectors: a <- c("programming", "and", " quantitative" )
#data frame
#df <- data.frame(travel_mode, time) -combine two vectors in a data frame
#list
#my_list <- list(x = 1:3, y = TRUE, z = c("a", "b"))
#Indexing
#a <- c(1, 2, 3, 4, 5)
#a[-2] - returns all elements except the second one
#a[c(1, 3, 4)] returns only the first, third and fourth elements
#sequence
#seq(from = 10, to = 100, by = 10) -10, 20, 30, 40 , 50, 60....100
#seq(from = 0, to = 1, length.out = 5) 0, 0.25, 0.5, 0.75, 1
#repeating
#rep(1, times = 100)
#rep(1:5, times = 4) 12345123451234512345
#rep(1:3, each = 3) 111222333
#rep(seq(from=10, to=500, by=10), each=2) 10 10 20 20....500 500
#comparing numerical vector
#a > b, a< b, a == b, a<= b, a>= b, a!=b, a> 3
#a <- c(TRUE, TRUE, FALSE, FALSE)
#b <- c(TRUE, FALSE, TRUE, FALSE)
#a & b know where both a and b are true
# a | b know where either a or b is true
#!a to flip true to false in a
# !a & !b, Both false
#install.packages("")
#library()
#Get data
#df[2, 9] get the value in the second row, ninth column
#df[c(2, 9, 13), c(1, 2)]
#df[2, ] get the second row (all columns)
#df$goals
#df[, c("team", "goals")] get the columns team and goals
#df[df$wins >= 20, ] get the rows where wins is greater than or equal to 20
#df$date[df$vol == max(df$vol)]--max vol in what date
#differences
#df$goaldiff <- df$goalsfor - df$goalagainst -adds one more column, goaldiff
#df$totalpoint <- 3 * df$wins + df$draws
#ranking
#df <- df[order(df$totalpoint, decreasing = TRUE),]
#df <- df[order(df$totalpoint, df$goaldiff, decreasing = TRUE),]
#df$ranking <- 1:nrow(df) --after the order
#%in%
#a %in% b checks, for each element of a, whether it appears in b
#weekend_sales <- df[df$days %in% c("Saturday", "Sunday"), ]
#july_sales <- df[df$month %in% (7),]
#summary dataframe
#head(df) head(df, n = 4)
#tail(df) tail(df, n=2)
#nrow(df)
#ncol(df) number of columns
#dim(df) number of rows and number of columns
#plotting
#hist(penguins$body_mass_g)
#table(penguins$species)
#-> barplot(table(penguins$species))
#plot(penguins$bill_length_mm, penguins$flipper_length_mm)
#ggplot
#ggplot(penguins, aes(body_mass_g)) + geom_histogram(bins = 15, fill =
#"navy") + xlab("Penguin weight (grams)") + ylab("Count") + theme_minimal()
#ggplot(penguins, aes(species, fill = island)) + geom_bar(color =
#"black") + xlab("Penguin species") + ylab("Count")
#+ scale_fill_discrete(name = "Island", type = c("#0B0405", "#357BA2",
#"#DEF5E5")) + theme_minimal()
#ggplot(penguins, aes(bill_length_mm, flipper_length_mm, color =
#species)) + geom_point() + scale_color_discrete(name = "Species")
#+ xlab("Bill length (in mm)") + ylab("Flipper length (in mm)") +
#theme_minimal()
#ggsave("my-plot.pdf") save the plot
#Function F(x)=-8-2x+x^2
#f <- function(x){
#y <- -8 -2 *x + x^2
#return(y)
# }
#graph of function
#library(ggplot2)
#x <- seq(from = -4, to = 6, length.out = 200)
# y <- f(x)
# df <- data.frame(x, y)
#ggplot(df, aes(x, y)) + geom_line()
#Optimization
#fmin <- optimize(f, interval= c(-100,100), maximum = FALSE)
#if-else
#my_abs <- function(x) {
#if (x < 0) {
# return(-x)
# } else {
# return(x)
# }
# }
#if-else-if-else
#sgn <- function(x) {
# if (x < 0) {
# return(-1)
# } else if (x == 0) {
# return(0)
# } else {
# return(+1)
# }
# }
#Merging
#df <- merge(df1, df2, by = "date")
#df <- merge(df1, df2, by = c("date", "market_area"))
#df <- merge(df1, df2, by = "date", all.x = TRUE)
#all.x = TRUE --keep all rows of the first data frame (df1)
#all.y = TRUE --keep all rows of the second data frame (df2)
#all = TRUE --keep all rows of both data frames
#Reshaping
#library(reshape2)
#Long to wide :wide <- dcast(long, id ~ variable)
#Wide to long: long <- melt(wide, id.vars = "id")
#Aggregating
#library(lubridate)
#df$year <- year(df$date)
#aggregate(e10 ~ year, FUN = mean, data = df)
#aggregate(cbind(e5, e10, diesel) ~ year, FUN = mean, data = df) -appear all three
#aggregate(. ~ year, FUN = mean, data = df) --when too many variables
Download