Tuesday, October 21, 2008

Notes on linear and logistic regression

# general way of linear and logistic regression
One common measure of accuracy is the residual sum of squares. A U shape in the residual plot indicates a non-linear relationship. Use a normal quantile plot to check normality of the error terms, and also verify constant variance and independence of the error terms.

# simple regression
# Peek at the first 10 rows of the car data set
# (assumes `car` is already loaded in the workspace -- TODO confirm source)
head(car, 10)
library(MASS)  # provides stdres(), used in the diagnostics below

# Simple linear regression: Mpg as a function of Weight
lm.simple1 <- lm(Mpg ~ Weight, data = car)
summary(lm.simple1)
###########################################################
windows()  # scatter plot of the data with the fitted line overlaid
with(car, plot(Weight, Mpg, xlab = "Weight", ylab = "Mpg"))
abline(lm.simple1, col = "red")
###########################################################
windows()  # residuals against the predictor: look for remaining dependency
res1 <- residuals(lm.simple1)
plot(car$Weight, res1, xlab = "Weight", ylab = "resids")
abline(h = 0, col = "red")  # zero reference line
lines(loess.smooth(car$Weight, res1), col = "green", lty = 5)
###########################################################
windows()  # standardized residuals against the fitted values
fit1 <- fitted(lm.simple1)
std1 <- stdres(lm.simple1)  # stdres() comes from MASS
plot(fit1, std1, xlab = "Fitted Mpg", ylab = "standardized resids")
abline(h = 0, col = "red")
lines(loess.smooth(fit1, std1), col = "green", lty = 5)
###########################################################
windows(20, 20)
par(mfrow = c(2, 2))  # 2 x 2 panel layout (only two panels are filled below)
# Density of the standardized residuals -- roughly bell-shaped if normal
plot(density(stdres(lm.simple1)), main = "density of standardized residuals")
# Normal QQ plot of the standardized residuals, with reference line
qqnorm(stdres(lm.simple1))
qqline(stdres(lm.simple1))
###########################################################
# Multiple Linear Regression
###########################################################
pairs(car)  # pairwise scatter plots of all variables

# Spell out every predictor explicitly
lm.mlti <- lm(Mpg ~ Cylind. + Disp. + Horse. + Weight + Accel. + Year + Origin,
              data = car)

# "." on the right-hand side means: every variable in the data except the
# response (Mpg)
lm.mlti <- lm(Mpg ~ ., data = car)
summary(lm.mlti)

# ". - Weight" means: every variable except the response and Weight
lm.mlti <- lm(Mpg ~ . - Weight, data = car)

# "Disp. * Weight" expands to Disp. + Weight + Disp.:Weight
# (both main effects plus their interaction)
lm.mlti.e <- lm(Mpg ~ Disp. * Weight, data = car)
summary(lm.mlti.e)
###########################################################
## Logistic Regression, use glm function
###########################################################
# Logistic regression of the binary response Loc_CYT on eight predictors
glm.CYT <- glm(Loc_CYT ~ mcg + gvh + alm + mit + erl + pox + vac + nuc,
               family = binomial, data = dataset_new)
summary(glm.CYT)
# Show how the factor levels of Loc_CYT are coded as 0/1
contrasts(dataset_new$Loc_CYT)

# Confusion table: predicted class (fitted probability > 0.5) vs observed.
# FIX: the original referenced `glm.CYT1`, which is never defined anywhere
# in the script; the model fitted above is `glm.CYT`.
table1 <- table(predict(glm.CYT, type = "response") > 0.5, dataset_new$Loc_CYT)
table1
size <- dim(dataset_new)
## Overall prediction accuracy: (both diagonal cells) / number of rows
(table1[1, 1] + table1[2, 2]) / size[1]

## Train on the first 900 rows, evaluate on the remaining rows:
glm.CYT5 <- glm(Loc_CYT ~ gvh + alm + mit + nuc, family = binomial,
                data = dataset_new, subset = 1:900)
table2 <- table(predict(glm.CYT5, dataset_new, type = "response")[-(1:900)] > 0.5,
                dataset_new$Loc_CYT[-(1:900)])
table2
## Prediction accuracy on the held-out test rows
(table2[1, 1] + table2[2, 2]) / (size[1] - 900)

####################################################
A good reference website can be found here:
http://www.statmethods.net/stats/regression.html

No comments: