######################################## # 5.1 ######################################## #new.housing <- read.table("D:/Courses/regression/data sets/new housing.txt",header=T) attach(new.housing) lm.1 <- lm(housing ~ intrate + gnp + pop) summary(lm.1) # Residual and plots and the qqplot par(mfcol=c(1,3)) plot(lm.1$res,ylab="",main="residuals") plot(rstudent(lm.1),ylab="",main="studentized res.") aa <- qqnorm(rstudent(lm.1),main="studentized res.") qqline(rstudent(lm.1),main="studentized res.") # What is the qqnorm plot? aa rstudent(lm.1) qnorm(ppoints(23))[rank(rstudent(lm.1))] ######################################## # 5.2 ######################################## # leverage and Cook's distance plot x <- c(0,seq(2,8,length=18),10) y <- rnorm(20,sd = 1) y[1] <- 0.5 y[20] <- 4 y[9] <- 4 par(mfcol=c(1,1)) plot(x,y) text(x[1]-.1,y[1],"1") text(x[9]-.1,y[9],"9") text(x[20]-.1,y[20],"20") lm.all <- lm(y~x) lm.n.1 <- lm(y~x,subset=(x != x[1])) lm.n.9 <- lm(y~x,subset=(x != x[9])) lm.n.20 <- lm(y~x,subset=(x != x[20])) abline(lm.all) abline(lm.n.1,lty = 2) abline(lm.n.9,lty = 3) abline(lm.n.20,lty = 4) par(mfcol=c(1,2)) plot(hat(model.matrix(lm.all)),ylab="",main="leverages") abline(h=2*2/20) plot(cooks.distance(lm.all),ylab="",main="Cook's distance") abline(h=0.5) cbind(res=lm.all$res,stand=rstandard(lm.all),student=rstudent(lm.all), leverage=hat(model.matrix(lm.all)),cook.dist=cooks.distance(lm.all)) # computation for the 20th observation y[20] - lm.all$fit[20] (h.20 <- hat(model.matrix(lm.all))[20]) (sigma <- summary(lm.all)$sigma) (sigma.n.20 <- summary(lm.n.20)$sigma) (r.20 <- (y[20] - lm.all$fit[20]) / (sigma * sqrt(1-h.20)) ) (y[20] - lm.all$fit[20]) / (sigma.n.20 * sqrt(1-h.20)) pred.n.20 <- predict.lm(lm.n.20,newdata=data.frame(x=x)) sum( (lm.all$fit-pred.n.20)^2) / (2 * sigma^2) 1/2 * r.20^2 * ( h.20 / (1-h.20)) ######################################## # 5.3 ######################################## # Bacteria example time <- 1:15 num.bact <- c(355,211,197,166,142,106,104,60,56,38,36,32,21,19,15) lm.1 <- lm(num.bact~time) par(mfcol=c(1,2)) plot(time,num.bact,xlab="Time",ylab="Num of bacteria") abline(lm.1) plot(time,lm.1$res,xlab="Time",ylab="Residuals") abline(0,0) summary(lm(num.bact~time)) log.num <- log(num.bact) lm.2 <- lm(log.num~time) pred.2 <- predict(lm.2,data.frame(time=seq(0,15,length=100))) par(mfrow=c(2,2)) plot(time,log.num,xlab="Time",ylab="log-Num of bacteria",xlim=c(0,15)) abline(lm.2) plot(time,lm.2$res,xlab="Time",ylab="log-Num of bacteria resid.",xlim=c(0,15)) abline(0,0) plot(time,num.bact,xlab="Time",ylab="Num of bacteria",xlim=c(0,15),ylim=c(0,395)) lines(seq(0,15,length=100),exp(pred.2)) plot(time,num.bact - exp(lm.2$fit),xlab="Time",ylab="Num of bacteria resid.",xlim=c(0,15)) abline(0,0) summary(lm.2) b.0 <- 5.973160 b.1 <- -0.218425 b.1 b.1 + 0.006575 * qt(0.975,13) b.1 - 0.006575 * qt(0.975,13) exp(b.0) exp(b.0 + 0.059778 * qt(0.975,13)) exp(b.0 - 0.059778 * qt(0.975,13)) ######################################## # 5.4 ######################################## # To run boxcox we need to load MASS package library(MASS) aa <- boxcox(num.bact ~ time, ,lambda=seq(-0.2,0.6,length=101)) abline(h = max(aa$y), col = 3) abline(h = max(aa$y) - qchisq(0.95,1)/2, col = 3,lty = 3) # Profile likelihood for the box-cox transform lambda.vec <- seq(-0.2,0.6,length=101) # In the following set of commands i explicitly compute profile.lik <- rep(NA,101) # explicit computation of the likelihood profile for each lambda for(i in 1:101) { n <- length(time) if(lambda.vec[i] == 0) trans.y <- log(num.bact) if(lambda.vec[i] != 0) trans.y <- (num.bact^lambda.vec[i] - 1) / lambda.vec[i] lm.1 <- lm(trans.y ~ time) SSE <- sum(lm.1$res^2) profile.lik[i] <- (2 * pi * SSE / n)^(-n/2) * exp(-n/2) * (prod(num.bact))^(lambda.vec[i]-1) } aa$y - log(profile.lik) ######################################## # 5.5 ######################################## library(lmtest) library(tseries) # dwtest -- lmtest package # runs.test -- tseries package set.seed(0) z.vec <- rnorm(20) pos.ar <- z.vec neg.ar <- z.vec for(i in 2:20) { neg.ar[i] <- sqrt(0.5) * z.vec[i] - sqrt(0.5) * neg.ar[i-1] pos.ar[i] <- sqrt(0.5) * z.vec[i] + sqrt(0.5) * pos.ar[i-1] } par(mfcol=c(1,2)) plot(pos.ar,type="b",ylab="Residuals",xlab="Time",main="Positive AR") ; abline(0,0) plot(neg.ar,type="b",ylab="Residuals",xlab="Time",main="Negative AR") ; abline(0,0) runs.test(factor(pos.ar>0)) runs.test(factor(neg.ar>0)) dwtest(lm(pos.ar ~ rep(1,20)+0),alternative = "two.sided") dwtest(lm(neg.ar ~ rep(1,20)+0),alternative = "two.sided") ######################################## # 5.5 ######################################## new.housing <- read.table("E:/Courses/regression/data sets/new housing.txt",header=T) attach(new.housing) pairs(new.housing) lm.1 <- lm(housing ~ intrate + gnp + pop) summary(lm.1) anova(lm.1) cor(as.matrix(new.housing[,3:5])) # A simple expression for marginal coeff variance ..... vcov(lm.1) s.11 <- sum( (intrate - mean(intrate))^2) R.sq.1 <- summary(lm(intrate ~ + gnp + pop))$r.squ MSE <- summary( lm.1)$sigma^2 MSE / (s.11 * ( 1 - R.sq.1)) # install faraway & MASS packages - needed for computation of VIF and ridge regression library(faraway) library(MASS) 1 / ( 1 - R.sq.1) vif(lm.1) x <- as.matrix(new.housing[,3:5]) eigen(t(x)%*%x) e <- eigen(t(x)%*%x)$values sqrt(e[1]/e[3])