Best Subset Selection: run a linear regression for each possible combination of the X predictors. One simple approach would be to take the subset with the smallest RSS. Unfortunately, one can show that the model that includes all the variables will always have the smallest RSS on the training data. Instead, several measures are commonly used to compare models with different numbers of predictors: adjusted R2, AIC (Akaike information criterion), BIC (Bayesian information criterion), and Cp (equivalent to AIC for linear regression).
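A compact way to compute these criteria is summary() on a regsubsets() fit from the {leaps} package; the data frame dat and response y below are placeholders, not the carseats data used later:
library(leaps)
# dat: a data frame whose response column is named y
fit <- regsubsets(y ~ ., data = dat, nvmax = ncol(dat) - 1)  # best subset of each size
sm  <- summary(fit)
sm$adjr2   # adjusted R2 of the best model of each size
sm$bic     # BIC of the best model of each size
sm$cp      # Mallows' Cp, equivalent to AIC for linear regression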
What we would really like to do is to find the set of variables that gives the lowest test (not training) error rate. If we have a large data set we can achieve this goal by splitting the data into training and validation parts: use the training part to build each possible model (i.e. the different combinations of variables) and choose the model that gives the lowest error rate when applied to the validation data. We can go further and split the data into training, validation and test parts: build each possible model on the training part, choose among them with the validation part, and finally use the error rate on the test data as an estimate of how well the chosen method would work on new observations.
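A sketch of such a three-way split (the data frame dat and the 50/25/25 proportions are placeholders, not part of the carseats analysis below):
n  <- nrow(dat)
id <- sample(n)                                            # random permutation of the row indices
train <- dat[id[1:floor(0.50 * n)], ]                      # fit every candidate model
valid <- dat[id[(floor(0.50 * n) + 1):floor(0.75 * n)], ]  # pick the best candidate
test  <- dat[id[(floor(0.75 * n) + 1):n], ]                # estimate its error on new data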
# the LASSO uses the {lars} package; ridge regression uses lm.ridge from {MASS}; optim.lm uses the {leaps} package
#######################################################################
# use the leaps package to find the best linear model based on BIC, a validation set, and cross-validation
# select 300 of the 400 observations as the training set and the remaining 100 as the test set.
###########################################################
# names(carseats)
# [1] "Sales" "CompPrice" "Income" "Advertising" "Population" "Price" "ShelveLoc" # "Age" "Education" "Urban" "US"
td=sample(400,300,replace=F);
carseat.train <- carseats[td, ]
carseat.test <- carseats[-td, ]
# use the optim.lm function to try to select the “best” linear model from my training data:
###########################################################
"optim.lm" <-
function(data,y,val.size=NULL,cv=10,bic=T,test=T,really.big=F){
library(MASS)
library(leaps)
n.plots <- sum(c(bic,test,cv!=0))
par(mfrow=c(n.plots,1))
n <- nrow(data)
names(data)[(names(data)==y)] <- "y"
X <- lm(y~.,data,x=T)$x
data <- as.data.frame(cbind(data$y,X[,-1]))
names(data)[1] <- "y"
if (is.null(val.size))
val.size <- round(n/4)
s <- sample(n,val.size,replace=F)
data.train <- data[-s,]
data.test <- data[s,]
p <- ncol(data)-1
data.names <- names(data)[names(data)!="y"]
regfit.full <-
regsubsets(y~.,data=data,nvmax=p,nbest=1,really.big=really.big)
bic.lm <-
lm(as.formula(paste("y~",paste(data.names[summary(regfit.full)$which[order(summary(regfit.full)$bic)[1],][-1]],collapse="+"))),data=data)
if (bic){
plot(summary(regfit.full)$bic,type='l',xlab="Number of Predictors",
ylab="BIC",main="BIC Method")
points(summary(regfit.full)$bic,pch=20)
points(order(summary(regfit.full)$bic)[1],min(summary(regfit.full)$bic),col=2,pch=20,cex=1.5)}
regfit <- regsubsets(y~.,data=data.train,nvmax=p,nbest=1,really.big=really.big)
cv.rss <- rootmse <- rep(0,p)
for (i in 1:p){
data.lmfit <-lm(as.formula(paste("y~",paste(data.names[summary(regfit)$which[i,][-1]],collapse="+"))),data=data.train)
rootmse[i]<-sqrt(mean((predict(data.lmfit,data.test)-data.test$y)^2))}
if (test){
plot(rootmse,type='l',xlab="Number of Predictors",
ylab="Root Mean RSS on Validation Data",main="Validation Method")
points(rootmse,pch=20)
points(order(rootmse)[1],min(rootmse),col=2,pch=20,cex=1.5)}
validation.lm <-
lm(as.formula(paste("y~",paste(data.names[summary(regfit)$which[order(rootmse)[1],][-1]],collapse="+"))),data=data)
if (cv!=0){
s <- sample(cv,n,replace=T)
if (cv==n)
s <- 1:n
for (i in 1:p)
for (j in 1:cv){
data.train <- data[s!=j,]
data.test <- data[s==j,]
data.lmfit<-lm(as.formula(paste("y~",paste(data.names[summary(regfit.full)$which[i,][-1]],collapse="+"))),data=data.train)
cv.rss[i]<-cv.rss[i]+sum((predict(data.lmfit,data.test)-data.test$y)^2)
}
cv.rss <- sqrt(cv.rss/n)
plot(cv.rss,type='l',xlab="Number of Predictors",
ylab="Cross-Validated Root Mean RSS",main="Cross-Validation Method")
points(cv.rss,pch=20)
points(order(cv.rss)[1],min(cv.rss),col=2,pch=20,cex=1.5)
cv.lm <-
lm(as.formula(paste("y~",paste(data.names[summary(regfit.full)$which[order(cv.rss)[1],][-1]],collapse="+"))),data=data)}
else
cv.lm <- NULL
list(bic.lm = bic.lm,validation.lm=validation.lm,cv.lm=cv.lm)
}
library(leaps);
optim.lm.train <- optim.lm(carseat.train, "Sales");
names(optim.lm.train);
# Report the models selected by BIC, validation, and CV (cross-validation)
summary(optim.lm.train$bic.lm)
summary(optim.lm.train$validation.lm)
summary(optim.lm.train$cv.lm)
# Estimate the mean squared prediction error on my test data set.
# If we only predict with the mean of Y from the training data, the test MSE is:
mean((carseats$Sales[-td] - mean(carseats$Sales[td]))^2)
# if we use the model selected by BIC:
lm.bic=lm(Sales~CompPrice+Income+Advertising+Price+ShelveLoc+Age,data = carseats,subset=td);
# the test MSE is (stored as rss.bic for the comparison plot at the end):
rss.bic=mean((carseats$Sales[-td]-predict(lm.bic,carseats)[-td])^2);
rss.bic
#### ridge regression #####################################
Ridge regression adds a “penalty” on the sum of squared betas. This has the effect of “shrinking” large values of beta towards zero, and as a result the ridge regression estimates are often more accurate. Notice that when lambda = 0 we get OLS, but as lambda gets larger the betas move closer to zero: more shrinkage. The reason this helps is that the OLS estimates generally have low bias but can be highly variable; in particular, when n and p are of similar size the OLS estimates will be extremely variable. The penalty term makes the ridge regression estimates biased but can also substantially reduce their variance.
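In symbols (the standard form of the criterion, stated here for reference): ridge regression picks the betas that minimize
    sum_i ( y_i - beta_0 - sum_j beta_j * x_ij )^2 + lambda * sum_j beta_j^2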
# lambda is the penalty coefficient on the beta squares
# lambda 0, beta same as OLS
###########################################################
library(MASS);   # lm.ridge is in {MASS} (already loaded inside optim.lm, but made explicit here)
lambda.set <- 10^(seq(-2, 8, length = 100));
ridge.train <- lm.ridge(Sales~., carseats, subset = td, lambda = lambda.set)
select(ridge.train);
# modified HKB estimator is 0.6875941
# modified L-W estimator is 1.311013
# smallest value of GCV at 0.5214008
# The best lambda from GCV is 0.5214.
# Using this lambda, we can get the best model:
ridge.train.cv <- lm.ridge(Sales~., carseats, subset = td, lambda = 0.5214);
ridge.train.cv$coef;   # note: $coef is on the scaled predictors; coef(ridge.train.cv) gives the original scale
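pred.ridge is not part of {MASS}; it is assumed here to be a course helper that forms predictions from an lm.ridge fit. A minimal sketch, assuming the fit uses a single value of lambda, would be:
"pred.ridge" <-
  function(object, formula, data){
    # coef() on an lm.ridge fit returns the intercept and coefficients on the
    # original scale (unlike object$coef, which is scaled), so predictions are
    # just the model matrix times these coefficients
    X <- model.matrix(as.formula(formula), data)
    drop(X %*% coef(object))
  }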
ridge.pred.cv <- pred.ridge(ridge.train.cv, Sales~., carseats)
mean((carseats$Sales[-td] - ridge.pred.cv[-td])^2)
#### iterations ###############
rss.ridge <- rep(0, 100);
for(i in 1:100){
ridge.train <- lm.ridge(Sales~., carseats, subset = td, lambda = lambda.set[i]);
ridge.pred <- pred.ridge(ridge.train, Sales~., carseats);
rss.ridge[i] <- mean((carseats$Sales[-td] - ridge.pred[-td])^2);
}
min(rss.ridge);
plot(rss.ridge, type = "l")
best.lambda <- lambda.set[order(rss.ridge)[1]]
best.lambda;
ridge.best <- lm.ridge(Sales~., carseats, subset = td, lambda = best.lambda);
ridge.best$coef
#### LASSO #######################################################
Ridge regression isn’t perfect. One significant problem is that the squared penalty will never force any of the coefficients to be exactly zero, so the final model includes all the variables, which makes it harder to interpret. A very modern alternative is the LASSO. The LASSO works in a similar way to ridge regression except that it uses an L1 penalty. The LASSO is not quite as computationally efficient as ridge regression, but efficient algorithms exist and it is still much faster than best subset selection.
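For reference, the LASSO criterion (standard form) can be written either as minimizing
    sum_i ( y_i - beta_0 - sum_j beta_j * x_ij )^2 + lambda * sum_j |beta_j|
or, equivalently, as minimizing the RSS subject to sum_j |beta_j| <= s; the second form is the one the tuning parameter s below refers to.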
# s is the constraint sum |beta| <= s; as s -> infinity, beta approaches the OLS estimates
###################################################################
"cv.lasso" <-
function(formula,data,subset=NULL,K=10){
if (!is.null(subset))
data <- data[subset,]
y <- data[,names(data)==as.character(formula)[2]]
x <- model.matrix(as.formula(formula),data)[,-1]
larsfit <- cv.lars(x,y,K=K)
larsfit
}
"lasso" <-
function(formula,data,subset=NULL){
if (!is.null(subset))
data <- data[subset,]
y <- data[,names(data)==as.character(formula)[2]]
x <- model.matrix(as.formula(formula),data)[,-1]
larsfit <- lars(x,y,type="lasso")
larsfit
}
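pred.lasso (used below) is likewise assumed to be a course helper; a minimal sketch, wrapping predict.lars() with the tuning value s interpreted as a fraction of the full L1 norm:
"pred.lasso" <-
  function(object, formula, data, s, type = "fit"){
    # rebuild the predictor matrix exactly as in lasso()/cv.lasso(), dropping the intercept
    x <- model.matrix(as.formula(formula), data)[, -1]
    out <- predict(object, newx = x, s = s, type = type, mode = "fraction")
    # predict.lars returns a list; keep only the part asked for
    if (type == "fit") out$fit else out$coefficients
  }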
library(lars);
lasso.fit <- lasso(Sales~., carseats, subset = td);
plot(lasso.fit);
lasso.fit
## use cv.lasso to get best s:
lasso.cv <- cv.lasso(Sales~., carseats, subset = td);
s <- lasso.cv$fraction[order(lasso.cv$cv)[1]]
s
lasso.pred <- pred.lasso(lasso.fit, Sales~., carseats, s)
mean((carseats$Sales[-td] - lasso.pred[-td])^2);
pred.lasso(lasso.fit, Sales~., carseats, s, "coefficients")
#### iterations #########
s.set <- seq(0, 1, length = 100);
rss.lasso <- rep(0, 100);
for(i in 1:100){
lasso.pred <- pred.lasso(lasso.fit, Sales~., carseats, s = s.set[i]);
rss.lasso[i] <- mean((carseats$Sales[-td] - lasso.pred[-td])^2);
}
min(rss.lasso)
plot(rss.lasso,type="l")
s <- s.set[order(rss.lasso)[1]];
s
pred.lasso(lasso.fit, Sales~., carseats, s, "coefficients")
###########################################################
# If we plot the test MSE for the mean-only prediction, OLS, the BIC model,
# ridge regression and the LASSO in one figure, the comparison is clearer.
rss.raw=mean((carseats$Sales[-td]-mean(carseats$Sales[td]))^2)
rss.raw
lmfit=lm(Sales~.,carseats,subset=td)
rss.ols=mean((carseats$Sales[-td]-predict(lmfit,carseats)[-td])^2)
rss.ols
plot(1:100,1:100,ylim=c(1,10),ylab="Test Mean RSS",xlab="Tuning Parameter Index", type="n")
abline(rss.raw,0,lwd=1,lty=2, col = "green")
abline(rss.ols,0,lwd=1,lty=3, col = "blue")
abline(rss.bic,0,lwd=1,lty=4, col = "grey")
lines(rss.lasso,lwd=1,lty=5, col = "red")
lines(rss.ridge,lwd=1,lty=6, col = "orange")
legend(70,7,c("Raw","OLS","BIC","LASSO","Ridge"),col = c("green", "blue", "grey", "red", "orange"), lty=c(2,3,4,5,6),lwd=1)
# OLS with all the variables gives the smallest test mean RSS,
# while the simple mean gives the largest