Uploaded by Kevin Kyuson Lim

400415239-Kyuson(final project code)

advertisement
STATS 790: A comparison of elastic net and lasso regression on a
sparse dataset
Student number: 400415239
Kyuson Lim
April 22nd, 2022
1
# Global chunk options: echo the source of every chunk and hide warnings.
# `FALSE` is spelled out because `F` is an ordinary, reassignable variable.
knitr::opts_chunk$set(echo = TRUE, warning = FALSE)
library(ggplot2);library(MASS);library(tidyverse);library(gam)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 -##
##
##
##
v
v
v
v
tibble
tidyr
readr
purrr
3.1.6
1.2.0
2.1.1
0.3.4
v dplyr
1.0.8
v stringr 1.4.0
v forcats 0.5.1
##
##
##
##
-- Conflicts ------------------------------------------ tidyverse_conflicts() -x dplyr::filter() masks stats::filter()
x dplyr::lag()
masks stats::lag()
x dplyr::select() masks MASS::select()
## Loading required package: splines
## Loading required package: foreach
##
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
##
##
accumulate, when
## Loaded gam 1.20
library(gridExtra);library(glmnet);library(foreach);library(elasticnet)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
##
combine
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
##
expand, pack, unpack
## Loaded glmnet 4.1-3
## Loading required package: lars
## Loaded lars 1.2
library(paletteer); library(knitr); library(caret); require(broom); library(glmnet)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
##
lift
## Loading required package: broom
2
library(caret); library(grid); library(kableExtra)
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
##
group_rows
3
Codes
Regression functions and tuning parameters
Lasso: training and validation set approach
# enet - LASSO
#' Tune the lasso fraction `s` with a training/validation split.
#'
#' Fits one lasso path with `enet(lambda = 0)` on the training data and scores
#' each candidate fraction s = 0.1, 0.2, ..., 0.9 by the RMSE of
#' `x_val %*% coefficients` against `y_val`. No intercept term is added to the
#' predictions.
#'
#' @param x Training predictor matrix.
#' @param y Training response vector.
#' @param x_val Validation predictor matrix with the same columns as `x`.
#' @param y_val Validation response vector.
#' @return The candidate `s` with the smallest validation RMSE (the first one
#'   if several are tied).
find_best_lasso_enet <- function(x, y, x_val, y_val) {
  # candidate lasso fractions: 0.1, ..., 0.9 (0 and 1 are dropped)
  s_list <- seq(0, 1, by = 0.1)[-1][-10]

  # the fit does not depend on s, so it is computed once for all candidates
  mod <- enet(x, y, lambda = 0)

  # validation RMSE for every candidate fraction
  rmse <- vapply(s_list, function(s) {
    coefs <- predict.enet(mod, s = s, mode = "fraction",
                          type = "coef")$coefficients
    yhat <- x_val %*% coefs
    sqrt(mean((y_val - yhat)^2))
  }, numeric(1))

  s_list[which.min(rmse)]
}
Elastic net: training and validation set approach
# --------- helper function -------------------------- #
# enet - Nonnaive
#' Tune the elastic net (s, lambda) pair with a training/validation split.
#'
#' Runs a grid search over the lasso fraction s = 0.1, ..., 0.9 and the ridge
#' penalty lambda (29 values in (0, 3]). For every lambda one `enet` model is
#' fitted on the training data; for every s the non-naive coefficients
#' (`naive = FALSE`) are extracted and scored by the RMSE of
#' `x_val %*% coefficients` against `y_val`. No intercept term is added to the
#' predictions.
#'
#' @param x Training predictor matrix.
#' @param y Training response vector.
#' @param x_val Validation predictor matrix with the same columns as `x`.
#' @param y_val Validation response vector.
#' @return Numeric vector of length 2: element 1 is the best `s`, element 2
#'   the best `lambda`. On ties the first grid cell in column-major order wins.
find_best_enet_elasticnet <- function(x, y, x_val, y_val) {
  ## based on prior evidence of computation for the input
  s_list <- seq(0, 1, by = 0.1)[-1][-10]          # lasso penalty (fraction)
  lambda_list <- seq(0, 3, length.out = 30)[-1]   # ridge penalty

  ## validation RMSE grid -- row: s, column: lambda
  MSE_grid <- matrix(NA_real_,
                     nrow = length(s_list),
                     ncol = length(lambda_list))
  rownames(MSE_grid) <- seq_along(s_list)

  ## grid search
  for (j in seq_along(lambda_list)) {
    ## original paper is established using 'enet'; the fit depends only on
    ## lambda, so it is computed once per column of the grid
    mod <- enet(x, y, lambda = lambda_list[j])
    for (i in seq_along(s_list)) {
      ## naive = FALSE requests the (rescaled) elastic net solution
      coefs <- predict.enet(mod, s = s_list[i], mode = "fraction",
                            naive = FALSE, type = "coef")$coefficients
      yhat <- x_val %*% coefs
      ## RMSE computation
      MSE_grid[i, j] <- sqrt(mean((y_val - yhat)^2))
    }
  }

  ## a single (row, col) pair even when the minimum is tied; indexing the
  ## result of which(..., arr.ind = TRUE) with [1] and [2] would mix two row
  ## indices in that case
  best <- arrayInd(which.min(MSE_grid), dim(MSE_grid))
  c(s_list[best[1, 1]], lambda_list[best[1, 2]]) ## 1: s, 2: lambda
}
R code: application and Monte-Carlo simulation
## number of MC simulation
nn <- 100

# All per-replicate results are preallocated and filled by index.

# RMSE on the test set, one value per replicate
mse_enet_lasso4 <- numeric(nn)
mse_enet_elasticnet4 <- numeric(nn)

# number of non-zero coefficients, one value per replicate
lasso_coeff <- numeric(nn)
elastic_coeff <- numeric(nn)

# coefficient estimates: one row per replicate, one column per predictor
lasso_est_coeff <- matrix(nrow = nn, ncol = 40)
elastic_est_coeff <- matrix(nrow = nn, ncol = 40)

# selected tuning parameters, one value per replicate
lasso_par <- numeric(nn)      # lasso fraction s
elastic_par1 <- numeric(nn)   # elastic net fraction s
elastic_par2 <- numeric(nn)   # elastic net ridge penalty lambda

# -------------- enet function ---------------------- #
for (i in seq_len(nn)) {
  ## a distinct, reproducible seed for every replicate
  set.seed(2022 + 100 * i)

  # ------------- Ex.4 training & validation set --------------- #
  # Three latent factors; each drives a group of 5 predictors (factor plus
  # N(0, 1) noise). The remaining 25 predictors are pure noise. Only the
  # first 15 predictors have a non-zero coefficient (3).
  n4 <- 500
  Z <- replicate(3, rnorm(n4))
  X1_5 <- replicate(5, expr = Z[, 1] + rnorm(n4, sd = 1))
  X6_10 <- replicate(5, expr = Z[, 2] + rnorm(n4, sd = 1))
  X11_15 <- replicate(5, expr = Z[, 3] + rnorm(n4, sd = 1))
  X16_40 <- replicate(25, expr = rnorm(n4))
  x4 <- cbind(X1_5, X6_10, X11_15, X16_40)
  b4 <- c(rep(3, 15), rep(0, 25))
  y4 <- x4 %*% b4 + rnorm(n4, mean = 0, sd = 15)

  # ------------ data set ---------------------- #
  # Ex.4 split data: validation set approach (50 train / 50 validation /
  # 400 test rows, taken from a random permutation of the 500 rows)
  numSam <- sample(500)
  x4.train <- x4[numSam[1:50], ]
  x4.val <- x4[numSam[51:100], ]
  x4.test <- x4[numSam[101:500], ]
  y4.train <- y4[numSam[1:50]]
  y4.val <- y4[numSam[51:100]]
  y4.test <- y4[numSam[101:500]]

  # -------------------- enet best values -------------------- #
  # optimal values, EX 4.
  enet_best_lam_lasso4 <- find_best_lasso_enet(
    x4.train, y4.train, x4.val, y4.val
  ) # LASSO
  enet_best_s_lambda_elastic4 <- find_best_enet_elasticnet(
    x4.train, y4.train, x4.val, y4.val
  ) # Elastic net: element 1 = s, element 2 = lambda

  # ----------------- enet prediction model -------------- #
  lasso4 <- enet(x4.train, y4.train, lambda = 0) # LASSO
  lasso4_co <- predict.enet(lasso4, s = enet_best_lam_lasso4,
                            type = "coef", mode = "fraction")$coefficients
  # prediction without intercept: plain matrix product with the coefficients
  yhat_enet_lasso4 <- x4.test %*% lasso4_co

  enet4 <- enet(x4.train, y4.train,
                lambda = enet_best_s_lambda_elastic4[2]) # nonnaive elastic net
  elastic4_co <- predict.enet(enet4, s = enet_best_s_lambda_elastic4[1],
                              type = "coef", mode = "fraction",
                              naive = FALSE)$coefficients
  # prediction without intercept: plain matrix product with the coefficients
  yhat_enet_elasticnet4 <- x4.test %*% elastic4_co

  # ----------------- tuning parameter ---------------------- #
  lasso_par[i] <- enet_best_lam_lasso4
  elastic_par1[i] <- enet_best_s_lambda_elastic4[1]
  elastic_par2[i] <- enet_best_s_lambda_elastic4[2]

  # ----------------------- coefficient estimated ---------------------- #
  # Lasso non-zero coefficient record
  lasso_coeff[i] <- sum(lasso4_co != 0)
  # Elastic net non-zero coefficient record
  elastic_coeff[i] <- sum(elastic4_co != 0)
  # coef estimated
  lasso_est_coeff[i, ] <- lasso4_co
  elastic_est_coeff[i, ] <- elastic4_co

  # ----------------------- MSE enet ------------------------------------- #
  # Both RMSE values are recorded inside the loop so that every replicate
  # contributes one value per method.
  mse_enet_lasso4[i] <-
    sqrt(mean((yhat_enet_lasso4 - y4.test)^2)) # LASSO
  mse_enet_elasticnet4[i] <-
    sqrt(mean((yhat_enet_elasticnet4 - y4.test)^2)) # non-naive elastic net
}
6
Result: coefficient estimates
# coefficients
## Monte-Carlo simulated coefficients: per-predictor median over the
## replicates, rounded to two decimals
l1 <- round(apply(lasso_est_coeff, 2, median), 2)
l2 <- round(apply(elastic_est_coeff, 2, median), 2)

# long table of the median coefficients: 40 rows per method,
# type 1 = Lasso, type 2 = elastic net
d1 <- data.frame(
  coefficients = c(l1, l2),
  type = rep(c(1, 2), each = 40)
)
# replace the numeric codes by method labels and store them as a factor
d1$type <- as.factor(recode(d1$type, "1" = "Lasso", "2" = "elastic net"))

# one semi-transparent histogram per method, with dashed reference lines at
# 0 and 3 (the two coefficient values used to simulate the data)
p <- ggplot(d1, aes(x = coefficients, fill = type, color = type)) +
  geom_vline(
    aes(xintercept = 0),
    color = "grey70", linetype = "dashed", size = 1
  ) +
  facet_grid(~ type) +
  geom_vline(
    aes(xintercept = 3),
    color = "grey70", linetype = "dashed", size = 1
  ) +
  geom_histogram(position = "identity", alpha = 0.5, bins = 20) +
  scale_fill_manual(
    values = c("blue", "red"),
    name = "Distribution of coefficients",
    labels = c('Elastic net', "Lasso"),
    guide = guide_legend(direction = "horizontal", ncol = 1)
  ) +
  scale_color_manual(values = c("blue", "red"), guide = 'none') +
  theme(
    legend.position = "bottom",
    axis.ticks.length.x = unit(0, "cm"),
    axis.ticks.length.y = unit(0, "cm"),
    axis.ticks.y = element_blank(),
    panel.grid = element_blank(),
    axis.title.x = element_text(size = 11),
    axis.text.x = element_text(size = 11),
    axis.text.y = element_text(size = 11),
    title = element_text(size = 11),
    axis.title = element_text(size = 11),
    legend.text = element_text(size = 11),
    legend.key = element_rect(fill = "white"),
    legend.box.background = element_blank(),
    legend.title = element_text(size = 11),
    legend.background = element_rect(
      fill = "transparent", size = 0.15, linetype = "solid"
    ),
    panel.background = element_blank(),
    panel.grid.major.x = element_line(color = "grey90"),
    panel.grid.major.y = element_line(color = "grey90"),
    panel.grid.minor = element_blank()
  ) +
  ggtitle('The coefficient estimates of two regularization models,
namely elastic net and Lasso, \n where 100 Monte-Carlo simulation
provides a values in \n the histrogram for the comparison.')
p
7
The coefficient estimates of two regularization models,
namely elastic net and Lasso,
where 100 Monte−Carlo simulation
provides a values in
the histrogram for the comparison.
elastic net
Lasso
25
count
20
15
10
5
0
0
1
2
3
0
coefficients
Distribution of coefficients
1
2
3
Elastic net
Lasso
Figure 1: The histograms compare the two methods in terms of the coefficient estimates (per-coefficient medians)
obtained from the 100 Monte-Carlo simulations.
Median number of non-zero coefficient estimates
# -------------- Non-zero coefficient table------------------------ #
# one row per replicate: number of non-zero coefficients of each method
ex4_nonzero <- data.frame(
  lasso_coeff = lasso_coeff,
  elastic_coeff = elastic_coeff
)
# 1 x 2 matrix holding the median count of each method
median_ex4 <- matrix(
  c(median(ex4_nonzero[, 1]), median(ex4_nonzero[, 2])),
  nrow = 1,
  dimnames = list(NULL, c('lasso median', 'elastic median'))
)
# ----------------- final output --------------------------- #
# render the medians as a LaTeX table
kable(
  median_ex4,
  format = "latex",
  position = 'H',
  caption = 'The summary table for median non-zero number of coefficients
in Elastic Net regression compared to the Lasso regression'
)
Table 1: The summary table for median non-zero number of coefficients in Elastic Net regression compared
to the Lasso regression
lasso median
21
elastic median
22
Comparison of median RMSE values
# Example: enet comparison table
# 2 x 2 summary of the test RMSE over the replicates
# (rows: median and sd, columns: method)
err_ex_enet1 <- rbind(
  median = c(Elastic = median(mse_enet_elasticnet4),
             Lasso = median(mse_enet_lasso4)),
  sd = c(Elastic = sd(mse_enet_elasticnet4),
         Lasso = sd(mse_enet_lasso4))
)
# export into outcome table: per-replicate RMSE of both methods
ex1 <- as.data.frame(cbind(mse_enet_elasticnet4, mse_enet_lasso4))
# ---------------- Non-zero coefficient table ------------------------ #
# export into outcome table: per-replicate number of non-zero coefficients
ex1_nonzero <- as.data.frame(cbind(lasso_coeff, elastic_coeff))
median_ex1 <- cbind(median(ex1_nonzero[, 1]), median(ex1_nonzero[, 2]))
colnames(median_ex1) <- c('lasso median', 'elastic median')
# -------------------- final output ---------------------------- #
# currently only the RMSE columns are exported
ex1_final <- cbind(ex1)
# export for box-plot/error plot comparison
kable(round(err_ex_enet1, 2), format = "latex", position = 'H',
      caption = 'The summary table for median RMSE value of Elastic Net compared to
the median RMSE value of Lasso regression')
Table 2: The summary table for median RMSE value of Elastic Net compared to the median RMSE value of
Lasso regression
median
sd
Elastic
17.51
1.47
Lasso
19.02
1.63
Tuning parameter distribution
# ----------------- tuning parameter ---------------------- #
# long table of the selected tuning parameters, nn rows per parameter:
# type 1 = lasso fraction, type 2 = elastic net fraction (L1),
# type 3 = elastic net ridge penalty (L2).
# The value column keeps the name 'coefficients', which is also the x-axis
# label of the figure.
d1 <- data.frame(cbind(c(lasso_par, elastic_par1, elastic_par2),
                       c(rep(1, nn), rep(2, nn), rep(3, nn))))
colnames(d1) <- c('coefficients', 'type')
# replace the numeric codes by readable labels
d1 <- d1 %>%
  mutate(type = recode(type, '1' = "Lasso", '2' = "elastic net: L1 penalty",
                       '3' = "elastic net: L2 penalty"))
d1$type <- as.factor(d1$type)
# one semi-transparent histogram per tuning parameter, bins of width 0.25
p <- ggplot(d1, aes(x = coefficients, fill = type, color = type)) +
  geom_vline(aes(xintercept = 0.3),
             color = "grey70", linetype = "dashed", size = 1) +
  facet_grid(~ type) +
  geom_histogram(position = "identity", alpha = 0.5,
                 breaks = seq(0, 3, by = 0.25)) +
  scale_fill_manual(values = c("blue", "red", 'orange'),
                    name = "Distribution of coefficients",
                    labels = c('Elastic Net: L1 penalty',
                               'Elastic Net: L2 penalty', "Lasso"),
                    guide = guide_legend(direction = "horizontal", ncol = 1)) +
  scale_color_manual(values = c("blue", "red", 'orange'),
                     guide = 'none') +
  theme(legend.position = "bottom",
        axis.ticks.length.x = unit(0, "cm"),
        axis.ticks.length.y = unit(0, "cm"),
        axis.ticks.y = element_blank(),
        panel.grid = element_blank(),
        axis.title.x = element_text(size = 11),
        axis.text.x = element_text(size = 11, angle = 45,
                                   vjust = 0.25, hjust = 0.25),
        axis.text.y = element_text(size = 11),
        title = element_text(size = 11),
        axis.title = element_text(size = 11),
        legend.text = element_text(size = 11),
        legend.key = element_rect(fill = "white"),
        legend.box.background = element_blank(),
        legend.title = element_text(size = 11),
        legend.background = element_rect(fill = "transparent",
                                         size = 0.15, linetype = "solid"),
        panel.background = element_blank(),
        panel.grid.major.x = element_line(color = "grey90"),
        panel.grid.major.y = element_line(color = "grey90"),
        panel.grid.minor = element_blank()) +
  scale_x_continuous(breaks = seq(0, 3, by = 0.25)) +
  ggtitle('100 Monte-Carlo simulation result in tuning parameters of
\n Lasso and Elastic Net regression for the L1 norm and L2 norm penalization.')
p
100 Monte−Carlo simulation result in tuning parameters of
Lasso and Elastic Net regression for the L1 norm and L2 norm penalization.
elastic net: L1 penalty
elastic net: L2 penalty
Lasso
80
count
60
40
20
0.
0
0. 0
2
0. 5
5
0. 0
7
1. 5
0
1. 0
2
1. 5
5
1. 0
7
2. 5
0
2. 0
2
2. 5
5
2. 0
7
3. 5
00
0.
0
0. 0
2
0. 5
5
0. 0
7
1. 5
0
1. 0
2
1. 5
5
1. 0
7
2. 5
0
2. 0
2
2. 5
5
2. 0
7
3. 5
00
0.
0
0. 0
2
0. 5
5
0. 0
7
1. 5
0
1. 0
2
1. 5
5
1. 0
7
2. 5
0
2. 0
2
2. 5
5
2. 0
7
3. 5
00
0
coefficients
Elastic Net: L1 penalty
Distribution of coefficients
Elastic Net: L2 penalty
Lasso
Figure 2: The histograms illustrate the tuning parameters selected for the two methods (the L1 penalty of the Lasso,
and the L1 and L2 penalties of the elastic net) over the 100 Monte-Carlo simulations.
# 4th simulation
# Stack the per-replicate test RMSE of both methods. The two vectors are read
# directly from the workspace, so attaching the `ex1` data frame is not
# needed (attached columns would be masked by these workspace objects anyway).
dat4 <- c(mse_enet_elasticnet4, mse_enet_lasso4)
datt <- data.frame(
  # group labels sized from the data instead of a hard-coded replicate count
  Example4 = factor(rep(c("Elastic net", "LASSO"),
                        times = c(length(mse_enet_elasticnet4),
                                  length(mse_enet_lasso4)))),
  RMSE = dat4
)
# plot: one box per method
f4 <- ggplot(data = datt, aes(x = Example4, y = RMSE, color = Example4)) +
  geom_boxplot(color = c("#00AFBB", "#FC4E07")) +
  theme(legend.position = "bottom",
        axis.ticks.length.x = unit(0, "cm"),
        axis.ticks.length.y = unit(0, "cm"),
        axis.ticks.y = element_blank(),
        panel.grid = element_blank(),
        axis.title.x = element_text(size = 11),
        axis.text.x = element_text(size = 11),
        axis.text.y = element_text(size = 11),
        title = element_text(size = 11),
        axis.title = element_text(size = 11),
        legend.text = element_text(size = 11),
        legend.key = element_rect(fill = "white"),
        legend.box.background = element_blank(),
        legend.title = element_text(size = 11),
        legend.background = element_rect(fill = "transparent",
                                         size = 0.15, linetype = "solid"),
        panel.background = element_blank(),
        panel.grid.major.x = element_line(color = "grey90"),
        panel.grid.major.y = element_line(color = "grey90"),
        panel.grid.minor = element_blank())
Summary: RMSE and distribution for non-zero coefficients comparison
# --------------- non-zero coefficients -------------------------- #
# 4th simulation
# Median number of non-zero coefficients per method, with a band of one
# standard deviation below and above the median.
nonzero_median <- c(median(ex4_nonzero$elastic_coeff),
                    median(ex4_nonzero$lasso_coeff))
nonzero_sd <- c(sd(ex4_nonzero$elastic_coeff),
                sd(ex4_nonzero$lasso_coeff))
dat4 <- data.frame(Example4 = c("elastic net", "lasso"),
                   median = nonzero_median,
                   lower = nonzero_median - nonzero_sd,
                   upper = nonzero_median + nonzero_sd)
# plot: point at the median, error bar from `lower` to `upper`
h4 <- ggplot() +
  # ymin takes the lower bound and ymax the upper bound
  geom_errorbar(data = dat4,
                mapping = aes(x = Example4, ymin = lower, ymax = upper),
                width = 0.1, size = 1, color = c("#00AFBB", "#FC4E07")) +
  geom_point(data = dat4, mapping = aes(x = Example4, y = median),
             size = 4.5, color = c("#00AFBB", "#FC4E07")) +
  # NOTE(review): no layer maps the colour aesthetic, so this scale looks
  # like it has no visible effect -- confirm before relying on its labels
  scale_color_manual(values = c("#00AFBB", "#FC4E07"),
                     name = "Distribution of coefficients",
                     labels = c("Lasso", 'Elastic Net'),
                     guide = guide_legend(direction = "horizontal", ncol = 1)) +
  theme(legend.position = "bottom",
        axis.ticks.length.x = unit(0, "cm"),
        axis.ticks.length.y = unit(0, "cm"),
        axis.ticks.y = element_blank(),
        panel.grid = element_blank(),
        axis.title.x = element_text(size = 11),
        axis.text.x = element_text(size = 11),
        axis.text.y = element_text(size = 11),
        title = element_text(size = 11),
        axis.title = element_text(size = 11),
        legend.text = element_text(size = 11),
        legend.key = element_rect(fill = "white"),
        legend.box.background = element_blank(),
        legend.title = element_text(size = 11),
        legend.background = element_rect(fill = "transparent",
                                         size = 0.15, linetype = "solid"),
        panel.background = element_blank(),
        panel.grid.major.x = element_line(color = "grey90"),
        panel.grid.major.y = element_line(color = "grey90"),
        panel.grid.minor = element_blank())
# plots combined
# plots combined
# draw the RMSE box plot (f4) and the non-zero coefficient plot (h4) side by
# side in a single row, under a shared title
gridExtra::grid.arrange(
  grobs = list(f4, h4),
  nrow = 1,
  top = textGrob('100 Monte-Carlo simulation result in RMSE and median number of
non-zero coefficients \n with its standard errors from
Lasso and Elastic Net regression.')
)
100 Monte−Carlo simulation result in RMSE and median number of
non−zero coefficients
with its standard errors from
Lasso and Elastic Net regression.
25.0
25
median
RMSE
22.5
20.0
17.5
20
15.0
Elastic net
LASSO
elastic net
Example4
lasso
Example4
Figure 3: (a) The box plot shows the RMSE of the Lasso and elastic net regressions over the
100 Monte-Carlo simulations. (b) The error-bar chart shows the median number of non-zero coefficients,
with a band of one standard deviation, for the Lasso and elastic net regressions over the 100 Monte-Carlo simulations.
12
Download