#1. file.choose(), df <- read.csv("") #read file (probably need to skip some rows, check Excel first, skip = num) #2. data cleaning #-Date #df$Date <- as.Date(df$Date, format = "%d-%m-%y") - for 01-09-22; use %Y instead of %y if the year has 4 digits (2023) #Sep 1 2023 -%b %d %Y , December 1 2022- %B %d %Y #df <- df[order(df$Date, decreasing = FALSE),] -order #attend_graduate <-df[df$grad == "1",] --create new data #df$Vol <- gsub("K", "", df$Vol ) --delete the K (use "\\$" as the pattern to delete $) #converting character to number #df$number.of.shares[df$number.of.shares == "None"] <- NA -change None into NA #df$number.of.shares <- as.numeric(df$number.of.shares) #delete columns #df$print.table <- NULL (when there is None or NA for the whole column) #to see data missing - df$Date[is.na(df$open)] #df <- na.omit(df) - drop rows with missing data #rename variables #names(df)[names(df) == "Number.of.Trade"] <- "num_trade" #names(df)[2:5] <- c("open", "high", "low", "close") #names(df) <- tolower(names(df)) lowercase #Calculate #(2 +4 )/(2-5 ) need brackets #abs(-2) absolute value #sqrt(9) square root = 9^(1/2) #64^(1/4) #exp(1) exponentials #log(1) logarithms.-- log(100, base= 10) #Vectors #numeric vectors: a <- c(1, 2, 3, 4) #Logical vectors : a <- c(TRUE, FALSE, TRUE, TRUE) #character vectors: a <- c("programming", "and", " quantitative" ) #data frame #df <- data.frame(travel_mode, time) -combine two vectors in a data frame #list #my_list <- list(x = 1:3, y = TRUE, z = c("a", "b")) #Indexing #a <- c(1, 2, 3, 4, 5) #a[-2] - returns everything except the second element #a[c(1, 3, 4)] returns only the first, third and fourth elements #sequence #seq(from = 10, to = 100, by = 10) -10, 20, 30, 40 , 50, 60....100 #seq(from = 0, to = 1, length.out = 5) 0, 0.25, 0.5, 0.75, 1 #repeating #rep(1, times = 100) #rep(1:5, times = 4) 12345123451234512345 #rep(1:3, each = 3) 111222333 #rep(seq(from=10, to=500, by=10), each=2) 10 10 20 20....500 500 #comparing numerical vectors #a > b, a< b, a == b, a<= b, a>= b, a!=b, a> 3 #a <- c(TRUE, TRUE, FALSE, FALSE) #b <- c(TRUE, FALSE, TRUE, 
FALSE) #a & b shows where both a and b are true # a | b shows where either a or b is true #!a flips TRUE to FALSE (and FALSE to TRUE) in a # !a & !b, Both false #install.packages("") #library() #Get data #df[2,9] get the value in the second row, ninth column #df[c(2, 9, 13), c(1, 2)] #df[2, ] get the second row #df$goals #df[, c("team", "goals")] get the columns team and goals #df[df$wins >= 20, ] get the rows where wins is greater than or equal to 20 #df$date[df$vol == max(df$vol)]--the date with the maximum vol #differences #df$goaldiff <- df$goalsfor - df$goalagainst adds one more column, goaldiff #df$totalpoints <- 3 * df$wins + df$draws #ranking #df <- df[order(df$totalpoints, decreasing = TRUE),] #df <- df[order(df$totalpoints, df$goaldiff, decreasing = TRUE),] #df$ranking <- 1:nrow(df) --after the order #%in% #a %in% b checks whether each element of a appears in b #weekend_sales <- df[df$days %in% c("Saturday", "Sunday"), ] #july_sales <- df[df$month %in% (7),] #summary dataframe #head(df) head(df, n = 4) #tail(df) tail(df, n=2) #nrow(df) #ncol(df) number of columns #dim(df) find both the number of rows and columns #plotting #hist(penguins$body_mass_g) #table(penguins$species) #-> barplot(table(penguins$species)) #plot(penguins$bill_length_mm, penguins$flipper_length_mm) #ggplot #ggplot(penguins, aes(body_mass_g)) + geom_histogram(bins = 15, fill = "navy") + xlab("Penguin weight (grams)") + ylab("Count") +theme_minimal() #ggplot(penguins, aes(species, fill = island)) + geom_bar(color = "black") + xlab("Penguin species") +ylab("Count") +scale_fill_discrete(name = "Island", type = c("#0B0405", "#357BA2", "#DEF5E5")) +theme_minimal() #ggplot(penguins, aes(bill_length_mm, flipper_length_mm, color = species)) + geom_point() + scale_color_discrete(name = "Species") +xlab("Bill length (in mm)") +ylab("Flipper length (in mm)") + theme_minimal() #ggsave("my-plot.pdf") save the plot #Function F(x)=-8-2x+x^2 #f <- function(x){ #y <- -8 -2 *x + x^2 #return(y) # } #graph of function #library(ggplot2) #x <- seq(from = -4, to = 6, length.out = 200) # y <- 
f(x) # df <- data.frame(x, y) #ggplot(df, aes(x, y)) + geom_line() #Optimization #fmin <- optimize(f, interval= c(-100,100), maximum = FALSE) #if-else #my_abs <- function(x) { #if (x < 0) { # return(-x) # } else { # return(x) # } # } #if-else-if-else #sgn <- function(x) { # if (x < 0) { # return(-1) # } else if (x == 0) { # return(0) # } else { # return(+1) # } # } #Merging #df <- merge(df1, df2, by = "date") #df <- merge(df1, df2, by = c("date", "market_area")) #df <- merge(df1, df2, by = "date", all.x = TRUE) #all.x =TRUE--keep all rows of the first data frame #all.y = TRUE --keep all rows of the second data frame #all=TRUE --keep all rows of both data frames #Reshaping #library(reshape2) #Long to wide :wide <- dcast(long, id ~ variable) #Wide to long: long <- melt(wide, id.vars= "id") #Aggregating #library(lubridate) #df$year <- year(df$date) #aggregate(e10 ~ year, FUN = mean, data = df) #aggregate(cbind(e5, e10, diesel) ~ year, FUN = mean, data = df) -shows all three #aggregate(. ~ year, FUN = mean, data = df) --when there are too many variables