##################################################################################
# Section 7.1: kaveret data
#
# The file kaveret.txt contains data about "working activities" of bees in the
# bee-hive (Hebrew: "kaveret") as a function of time of the day.  One important
# characteristic of working activity is the number of bees leaving the hive for
# outside activities.
#   X1 = ID number
#   X2 = number of bees that left the bee-hive
#   X3 = the time of day (in hours)
bee.data <- read.table("D:/Courses/regression/data sets/kaveret.txt", header = FALSE)
num.bees <- bee.data[, 2]
hour     <- bee.data[, 3]
hour.sq  <- hour^2

# Scatter plot with linear (dashed) and quadratic (solid) fits overlaid.
par(mfcol = c(1, 1))
plot(hour, num.bees)
lm.1 <- lm(num.bees ~ hour)            # simple linear model
lm.2 <- lm(num.bees ~ hour + hour.sq)  # 2nd order polynomial model for num of bees
summary(lm.1)
summary(lm.2)
ord.plt <- order(hour)                 # plotting order along the hour axis
lines(hour[ord.plt], fitted(lm.1)[ord.plt], lty = 2)
lines(hour[ord.plt], fitted(lm.2)[ord.plt], lty = 1)

# Residual diagnostics for the quadratic fit.
par(mfcol = c(1, 2))
plot(hour, resid(lm.2));         abline(0, 0)
plot(fitted(lm.2), resid(lm.2)); abline(0, 0)

# Boxplots by hour suggest the square-root transform stabilises the spread.
boxplot(split(num.bees, hour))
boxplot(split(sqrt(num.bees), hour))

num.sqrt <- sqrt(num.bees)
lm.3 <- lm(num.sqrt ~ hour + hour.sq)  # 2nd order polynomial model for square root of num of bees

# Explicit power terms (hour.p2 duplicates hour.sq; kept so the sequential
# anova below lists every power by name).
hour.p2 <- hour^2
hour.p3 <- hour^3
hour.p4 <- hour^4
hour.p5 <- hour^5
hour.p6 <- hour^6
hour.p7 <- hour^7
lm.4 <- lm(num.sqrt ~ hour + hour.p2 + hour.p3 + hour.p4 + hour.p5 +
             hour.p6 + hour.p7)
anova(lm.4)  # sequential F-tests: is the 7th power needed?
lm.5 <- lm(num.sqrt ~ hour + hour.p2 + hour.p3 + hour.p4 + hour.p5 +
             hour.p6)                  # 6th order polynomial model for square root of num of bees
summary(lm.3)
summary(lm.5)

# Point-wise confidence intervals for both polynomial fits.
# predict() dispatches to predict.lm; calling the method directly is
# discouraged.  TRUE/FALSE spelled out (T/F are reassignable).
a3 <- predict(lm.3, se.fit = TRUE, interval = "confidence"); a3$fit[1, ]
a5 <- predict(lm.5, se.fit = TRUE, interval = "confidence"); a5$fit[1, ]

par(mfcol = c(1, 1))
plot(hour, num.sqrt)
lines(hour[ord.plt], a3$fit[ord.plt, 1], lty = 2)  # quadratic fit
lines(hour[ord.plt], a3$fit[ord.plt, 2], lty = 2)  # lower CI
lines(hour[ord.plt], a3$fit[ord.plt, 3], lty = 2)  # upper CI
lines(hour[ord.plt], a5$fit[ord.plt, 1], lty = 1)  # 6th-order fit
lines(hour[ord.plt], a5$fit[ord.plt, 2], lty = 1)  # lower CI
lines(hour[ord.plt], a5$fit[ord.plt, 3], lty = 1)  # upper CI

# Residual diagnostics for the quadratic sqrt model.
par(mfcol = c(1, 2))
plot(hour, resid(lm.3)); abline(0, 0)
# BUG FIX: predict(..., interval = "confidence") returns a matrix with columns
# "fit", "lwr", "upr"; the original read a non-existent $fitted component
# (an error: `$` is invalid on a matrix).  Take the "fit" column instead.
lm.3.fit <- predict(lm.3, interval = "confidence")[, "fit"]
plot(fitted(lm.3), resid(lm.3)); abline(0, 0)
summary(lm.3)

##################################################################################
# Section 7.2: cancer data
#
# Data from a study testing a possible cancer marker, based on a band that
# sometimes appears in chemical spectrum analysis of Alkaline Phosphatase.
#   X1 = group code (1 = cancer, no band; 2 = healthy, no band;
#                    3 = cancer, band;    4 = healthy, band)
#   X2 = level of AKP in blood
cancer.data <- read.table("D:/Courses/regression/data sets/cancer.txt", header = FALSE)
cancer.type <- cancer.data[, 1]
akp.level   <- cancer.data[, 2]

par(mfcol = c(1, 3))
plot(cancer.type, akp.level)
boxplot(split(akp.level, cancer.type))
boxplot(split(log(akp.level), cancer.type))  # log stabilises the spread

# Group indicator variables.  as.numeric(cond) is equivalent to the original
# ifelse(cond, rep(1, 366), rep(0, 366)) but does not hard-code the sample
# size (366), so the code also works should the data set change.
n.obs <- length(cancer.type)
i.0 <- rep(1, n.obs)
i.1 <- as.numeric(cancer.type == 1)
i.2 <- as.numeric(cancer.type == 2)
i.3 <- as.numeric(cancer.type == 3)
i.4 <- as.numeric(cancer.type == 4)

lm.0 <- lm(akp.level ~ i.0 + 0)                    # grand-mean-only model
lm.1 <- lm(akp.level ~ i.1 + i.2 + i.3 + i.4 + 0)  # cell-means coding
lm.2 <- lm(akp.level ~ i.2 + i.3 + i.4)            # treatment coding, group 1 baseline
summary(lm.1)
sapply(split(akp.level, cancer.type), mean)        # cell-means coefs = group means
summary(lm.2)

# Reproduce the coefficients via the normal equations (X'X)^{-1} X'y.
x.1 <- cbind(i.1, i.2, i.3, i.4); solve(t(x.1) %*% x.1) %*% t(x.1) %*% akp.level
x.2 <- cbind(i.0, i.2, i.3, i.4); solve(t(x.2) %*% x.2) %*% t(x.2) %*% akp.level

# Treating the group code as numeric is wrong; factor() produces the proper
# dummy coding, and aov() gives the one-way ANOVA table for the same model.
summary(lm(akp.level ~ cancer.type))
summary(lm(akp.level ~ factor(cancer.type)))
summary(aov(akp.level ~ factor(cancer.type)))
anova(lm.0, lm.2)  # F-test for any group effect

##################################################################################
# Section 7.3 -- Company data (Goldenshluger and Faraggi p. 122):
# Company data (Goldenshluger & Faraggi, p. 122): survival time in months of
# 20 companies, with company size, for two company types ("a" and "b").
num.months   <- c(17, 26, 21, 30, 22, 0, 12, 19, 4, 16,
                  28, 15, 11, 38, 31, 21, 20, 13, 30, 14)
company.size <- c(151, 92, 175, 31, 104, 277, 210, 120, 290, 238,
                  164, 272, 295, 68, 85, 224, 166, 305, 124, 246)
# BUG FIX: the original used rep(a, 10) and rep(b, 10), referring to undefined
# objects a and b; the type labels must be the character strings "a" and "b".
company.type <- c(rep("a", 10), rep("b", 10))
i.b <- c(rep(0, 10), rep(1, 10))  # indicator of type "b"
# Size split by type (zero where the other type applies) for a separate-slopes fit.
company.size.a <- c(company.size[1:10], rep(0, 10))
company.size.b <- c(rep(0, 10), company.size[11:20])

lm.0 <- lm(num.months ~ company.size)                           # one common line
lm.1 <- lm(num.months ~ i.b + company.size)                     # parallel lines
lm.2 <- lm(num.months ~ i.b + company.size.a + company.size.b)  # separate slopes
lm.3 <- lm(num.months ~ i.b * company.size)                     # interaction (same fit as lm.2)

# Scatter plot labelled by type, with the interaction-model line per type.
par(mfcol = c(1, 1))
plot(company.size[1:10], num.months[1:10], pch = "a",
     xlab = "Company size", ylab = "Num. of months",
     xlim = range(company.size), ylim = range(num.months))
points(company.size[11:20], num.months[11:20], pch = "b")
ord.1 <- order(company.size[1:10])
ord.2 <- 10 + order(company.size[11:20])
lines(company.size[ord.1], fitted(lm.3)[ord.1])
lines(company.size[ord.2], fitted(lm.3)[ord.2])

summary(lm.1)
summary(lm.2)
summary(lm.3)
anova(lm.0, lm.1)  # does company type matter at all?
anova(lm.1, lm.2)  # do the slopes differ between types?
anova(lm.1, lm.3)  # same slope comparison via the interaction term
##################################################################################
# Section 7.4: Multicollinearity -- new housing data:
new.housing <- read.table("D:/Courses/regression/data sets/new housing.txt", header = TRUE)
# NOTE(review): attach() is generally discouraged (name-masking surprises), but
# the rest of this section refers to the bare column names, so it is kept.
attach(new.housing)
pairs(new.housing)

lm.1 <- lm(housing ~ intrate + gnp + pop)  # Collinearity???
summary(lm.1)
anova(lm.1)

# Coefficients shift noticeably as regressors enter and leave --
# a classic multicollinearity symptom.
summary(lm(housing ~ intrate))
summary(lm(housing ~ intrate + gnp))
summary(lm(housing ~ intrate + pop))
summary(lm(housing ~ intrate + gnp + pop))
summary(lm(housing ~ gnp + pop))
cor(as.matrix(new.housing[, 3:5]))

# A simple expression for the marginal coefficient variance:
#   Var(b1) = MSE / ( S11 * (1 - R^2 of x1 regressed on the other x's) )
vcov(lm.1)
s.11 <- sum((intrate - mean(intrate))^2)
# Full component name: the original "$r.squ" relied on fragile partial matching.
R.sq.1 <- summary(lm(intrate ~ gnp + pop))$r.squared
MSE <- summary(lm.1)$sigma^2
MSE / (s.11 * (1 - R.sq.1))

# install faraway & MASS packages - needed for computation of VIF and ridge regression
library(faraway)
library(MASS)
1 / (1 - R.sq.1)  # variance inflation factor for intrate, by hand
vif(lm.1)

# Condition number of X'X (largest / smallest eigenvalue).
x <- as.matrix(new.housing[, 3:5])
eigen(t(x) %*% x)
e <- eigen(t(x) %*% x)$values
sqrt(e[1] / e[3])

# Ridge regression: trace plot and choice of lambda.
par(mfcol = c(1, 1))
plot(lm.ridge(lm.1, lambda = seq(0, 4, 0.001)), lty = 1:3)
abline(0, 0)
select(lm.ridge(lm.1, lambda = seq(0, 10, 0.001)))
# Ridge coefficients back on the original (unscaled) predictor scale.
lm.ridge(lm.1, lambda = 0.6)$coef / lm.ridge(lm.1, lambda = 0.6)$scales
coef(lm.1)

# Least squares and ridge estimates via explicit matrix algebra.
# NOTE(review): this manual ridge penalises the intercept as well (diag(4)),
# unlike lm.ridge, which centres the data -- kept as in the original.
x.mat <- model.matrix(lm.1)
solve(t(x.mat) %*% x.mat) %*% t(x.mat) %*% housing
sqrt(diag(vcov(lm.1)))
solve(t(x.mat) %*% x.mat + 0.6 * diag(4)) %*% t(x.mat) %*% housing
sqrt(MSE * diag(solve(t(x.mat) %*% x.mat + 0.6 * diag(4)) %*%
                  t(x.mat) %*% x.mat %*%
                  solve(t(x.mat) %*% x.mat + 0.6 * diag(4))))

# Ridge regression simulation: resample responses from the fitted model and
# compare the least-squares and ridge estimators of the coefficients.
sig.sim  <- summary(lm.1)$sigma
beta.sim <- coef(lm.1)
mu.sim   <- fitted(lm.1)
n.sim <- 1000
beta.lse <- array(dim = c(4, n.sim))
beta.rdg <- array(dim = c(4, n.sim))
for (i in seq_len(n.sim)) {
  # length(mu.sim) replaces the hard-coded sample size 23, so the simulation
  # stays correct if the data set changes.
  housing.sim <- mu.sim + rnorm(length(mu.sim), mean = 0, sd = sig.sim)
  beta.lse[, i] <- coef(lm(housing.sim ~ intrate + gnp + pop))
  beta.rdg[, i] <- coef(lm.ridge(housing.sim ~ intrate + gnp + pop, lambda = 0.6))
}

# Plot regression coefficients for 1st 15 samples, one panel per coefficient
# (coefficient j in rows 2..4 of beta.lse/beta.rdg; row 1 is the intercept).
coef.titles <- c("Int. Rate coeff.", "GNP coeff.", "Population coeff.")
par(mfcol = c(3, 1))
for (j in 2:4) {
  plot(beta.lse[j, 1:15], ylab = "coef", xlab = "1st 15 samples",
       main = coef.titles[j - 1], type = "b",
       ylim = range(c(0, beta.lse[j, 1:15])))
  lines(beta.rdg[j, 1:15], type = "b", lty = 2)  # ridge (dashed)
  abline(h = beta.sim[j])                        # true simulated value
}

# Boxplots of the sampling distributions of the coefficients.
par(mfcol = c(1, 3))
for (j in 2:4) {
  boxplot(list(beta.lse[j, ], beta.rdg[j, ]), names = c("LSE", "Ridge"),
          main = coef.titles[j - 1])
  abline(h = beta.sim[j])
}

# Ridge coeffs are biased but have smaller variance.
t.test(beta.lse[2, ] - beta.sim[2]); t.test(beta.rdg[2, ] - beta.sim[2])
t.test(beta.lse[3, ] - beta.sim[3]); t.test(beta.rdg[3, ] - beta.sim[3])
t.test(beta.lse[4, ] - beta.sim[4]); t.test(beta.rdg[4, ] - beta.sim[4])
# The scaled MSE for least-squares estimators is sig^2 * p ...
t.test(apply((x.mat %*% beta.lse - mu.sim)^2, 2, sum) / sig.sim^2)
# ... while ridge regression produces biased estimates with smaller MSE.
t.test(apply((x.mat %*% beta.rdg - mu.sim)^2, 2, sum) / sig.sim^2)