Data on 1000 households include demographic information about the household and information specific to the household's telephone service (Watson, 1986). Complete information is available for only 757 households.
pick – factor indicating whether the household picked AT&T as their long distance phone company. The level ATT indicates they did, while the level OCC indicates they picked another company.
income – ordered factor indicating the income level of the household. Levels are: <7.5 < 7.5-15 < 15-25 < 25-35 < 35-45 < 45-75 < >75.
moves – ordered factor indicating the number of times the household moved in the preceding 10 years. Levels are: 0 < 1 < 2 < 3 < 4 < 5 < 6 < 7 < >10.
age – ordered factor indicating the age level of the respondent. Levels are: 18-24 < 25-34 < 35-44 < 45-54 < 55-64 < 65+.
education – factor indicating the highest education level achieved by respondent. Levels are: <HS, HS, Voc, Coll, BA, and >BA.
employment – factor indicating the type of employment of the respondent. Levels are: F, P, R, S, H, U, and D.
usage – numeric vector giving the average monthly telephone usage of the household.
nonpub – factor indicating whether the household had an unlisted telephone number. Levels are: Y, N, and NA.
reach.out – factor indicating whether the household had participated in a special AT&T plan (before the forced choice of long-distance carrier).
card – factor indicating whether the household had an AT&T calling card service (before the forced choice of long-distance carrier). Levels are: Y, N, and NA.
set.seed(77)
setwd("D:/R Scripts/Advanced Topics/Data Files")
market.survey <- read.csv("market.survey.csv", header = TRUE,
                          colClasses = c(rep("factor", 6), "numeric", rep("factor", 3)))
market.full <- na.omit(market.survey)                      # 757 complete cases
used <- match(row.names(market.survey), row.names(market.full))
omitted <- seq(nrow(market.survey))[is.na(used)]           # row indices dropped by na.omit()
market.NA <- market.survey[omitted, ]                      # households with missing values
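The data dictionary above describes income, moves, and age as ordered factors, while read.csv() imports every factor as unordered. The following sketch shows how the ordering could be declared explicitly. It assumes the CSV stores the level labels exactly as listed above and is illustrative only: the analysis below keeps the factors as imported, and declaring them as ordered would change how tree() and gbm() split on these variables.
# Illustrative only -- not applied in the analysis below.
# Assumes the CSV uses exactly the level labels given in the data dictionary.
income.levels <- c("<7.5", "7.5-15", "15-25", "25-35", "35-45", "45-75", ">75")
moves.levels  <- c("0", "1", "2", "3", "4", "5", "6", "7", ">10")
age.levels    <- c("18-24", "25-34", "35-44", "45-54", "55-64", "65+")
market.ordered <- market.survey
market.ordered$income <- factor(market.ordered$income, levels = income.levels, ordered = TRUE)
market.ordered$moves  <- factor(market.ordered$moves,  levels = moves.levels,  ordered = TRUE)
market.ordered$age    <- factor(market.ordered$age,    levels = age.levels,    ordered = TRUE)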
library(MASS)
library(tree)
library(rpart)
full.survey <- tree(pick~.,market.full, na.action = na.pass,model=T,mindev=0.0001)
# full.survey <- rpart(pick~.,market.full, na.action = na.pass,model=T)
plot(full.survey)
title("full tree",col="red")
text(full.survey, all=T,cex = .4)
print(summary(full.survey))
##
## Classification tree:
## tree(formula = pick ~ ., data = market.full, na.action = na.pass,
## model = T, mindev = 1e-04)
## Number of terminal nodes: 103
## Residual mean deviance: 0.8879 = 580.7 / 654
## Misclassification error rate: 0.1929 = 146 / 757
print(table(market.full$pick,predict(full.survey,type="class")))
##
## ATT OCC
## ATT 335 68
## OCC 77 277
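The misclassification rate of 0.1929 = 146/757 reported by summary() can also be checked directly from the fitted classes:
# training error of the full tree, recomputed from the fitted classes
mean(predict(full.survey, type = "class") != market.full$pick)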
plot(prune.tree(full.survey))
prune.survey <- prune.tree(full.survey, best = 6)
plot(prune.survey)
title("best tree of size 6",col="red")
text(prune.survey, cex = 1)
prune.snip <- snip.tree(prune.survey, nodes = c(4, 5, 3))
plot(prune.snip)
title("best tree of size 6 (after snipping)",col="red")
text(prune.snip, cex = 1)
print(summary(prune.snip))
##
## Classification tree:
## snip.tree(tree = prune.survey, nodes = c(4L, 5L, 3L))
## Variables actually used in tree construction:
## [1] "usage" "nonpub"
## Number of terminal nodes: 3
## Residual mean deviance: 1.306 = 985 / 754
## Misclassification error rate: 0.358 = 271 / 757
print(table(market.full$pick,predict(prune.snip,type="class")))
##
## ATT OCC
## ATT 267 136
## OCC 135 219
predict.NA <- predict(prune.snip, market.survey[omitted, ],type="class")
tbl<-table(market.survey$pick[omitted],predict.NA)
print(tbl)
## predict.NA
## ATT OCC
## ATT 56 45
## OCC 51 91
cat("Misclassification error rate:", round((tbl[1,2]+tbl[2,1])/length(omitted),4),"=",
tbl[1,2]+tbl[2,1], "/",length(omitted), "\n")
## Misclassification error rate: 0.3951 = 96 / 243
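The confusion-table error report above is produced the same way several more times below. A small helper of the following form could replace the repeated cat() calls; it is a hypothetical convenience and is not used in this script.
# Hypothetical helper: off-diagonal error rate of a 2 x 2 confusion table,
# printed in the same format as the cat() calls in this script
report.misclass <- function(tbl) {
  wrong <- tbl[1, 2] + tbl[2, 1]
  cat("Misclassification error rate:", round(wrong / sum(tbl), 4),
      "=", wrong, "/", sum(tbl), "\n")
  invisible(wrong / sum(tbl))
}
# report.misclass(tbl)   # would reproduce the line printed above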
plot(cv.tree(full.survey, FUN = prune.tree))
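The pruning size (best = 4 below) is typically chosen from this cross-validation plot. A minimal sketch of choosing it programmatically from the cv.tree object; the folds are random, so the selected size can vary between runs unless a seed is set first.
cv.out <- cv.tree(full.survey, FUN = prune.tree)
# smallest size attaining the minimum cross-validated deviance
best.size <- min(cv.out$size[cv.out$dev == min(cv.out$dev)])
best.size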
cv.survey <- prune.tree(full.survey, best = 4)
plot(cv.survey)
title("pruned tree by cross-validation")
text(cv.survey)
cv.survey <- snip.tree(cv.survey, nodes = c(5))
plot(cv.survey)
title("pruned tree by cross-validation (after snipping)")
text(cv.survey)
print(table(market.full$pick,predict(cv.survey,type="class")))
##
## ATT OCC
## ATT 267 136
## OCC 135 219
cat("\n\n Prediction on the Cross-Validated Tree for incomplete observations\n")
##
##
## Prediction on the Cross-Validated Tree for incomplete observations
cv.predict <- predict(cv.survey, newdata= market.NA, type = "class")
tbl<-table(market.survey$pick[omitted],cv.predict)
print(tbl)
## cv.predict
## ATT OCC
## ATT 56 45
## OCC 51 91
cat("Misclassification error rate:", round((tbl[1,2]+tbl[2,1])/length(omitted),4),"=",
tbl[1,2]+tbl[2,1], "/",length(omitted), "\n")
## Misclassification error rate: 0.3951 = 96 / 243
library(gbm)
y<-as.numeric(market.survey$pick)-1
n.trees=1000
# AdaBoost-style boosting on the 0/1 response (ATT = 0, OCC = 1), with 10-fold CV
boost.survey <- gbm(as.numeric(market.full$pick) - 1 ~ ., data = market.full,
                    n.trees = n.trees, interaction.depth = 3,
                    distribution = "adaboost", cv.folds = 10)
boost.survey
## gbm(formula = as.numeric(market.full$pick) - 1 ~ ., distribution = "adaboost",
## data = market.full, n.trees = n.trees, interaction.depth = 3,
## cv.folds = 10)
## A gradient boosted model with adaboost loss function.
## 1000 iterations were performed.
## The best cross-validation iteration was 17.
## There were 9 predictors of which 9 had non-zero influence.
summary(boost.survey)
## var rel.inf
## income income 21.6248379
## usage usage 21.0517810
## age age 16.9289177
## education education 14.7276678
## moves moves 12.2595962
## employment employment 8.7436098
## nonpub nonpub 2.3979374
## reach.out reach.out 1.8487456
## card card 0.4169067
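The printout above reports the best cross-validation iteration (17 here). gbm.perf() returns that estimate directly, and predictions can then be made with that many trees; a minimal sketch, thresholding the response at 0.5 as elsewhere in this script:
best.iter <- gbm.perf(boost.survey, method = "cv", plot.it = FALSE)
cv.preds  <- predict(boost.survey, newdata = market.full,
                     n.trees = best.iter, type = "response")
table(market.full$pick, ifelse(cv.preds < 0.5, "ATT", "OCC"))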
preds <- predict(boost.survey, n.trees = 1:n.trees, type = "response")
class_preds <- apply(preds, 2, function(x) ifelse(x > 0.5, 1, 0))
overall_error <- colMeans(class_preds != as.numeric(market.full$pick)-1 )
class_OCC_error <- colMeans(class_preds[as.numeric(market.full$pick)-1 == 0, ] != 0)
class_ATT_error <- colMeans(class_preds[as.numeric(market.full$pick)-1 == 1, ] != 1)
matplot(1:n.trees, cbind(overall_error, class_OCC_error, class_ATT_error),
        type = "l", lty = 1:3, col = 1:3, xlab = "trees", ylab = "Error",
        ylim = c(0.10, 0.50))
text(450,0.23,"OCC")
text(450,0.20,"total")
text(450,0.15,"ATT")
title("adaboost.survey (training set)")
cat("Misclassification for the observations with comlete data\n")
## Misclassification for the observations with comlete data
tbl.1<-table(market.full$pick,ifelse(boost.survey$fit<.5,c("ATT"),c("OCC")))
print(tbl.1)
##
## ATT OCC
## ATT 396 7
## OCC 143 211
cat("Misclassification error rate:", round((tbl.1[1,2]+tbl.1[2,1])/length(market.full$pick),4),"=",
tbl.1[1,2]+tbl.1[2,1], "/",length(market.full$pick), "\n")
## Misclassification error rate: 0.1982 = 150 / 757
cat("\n\nMisclassification for the observations with incomlete data\n")
##
##
## Misclassification for the observations with incomlete data
predict.NA <- ifelse(predict(boost.survey, market.NA,type="response")<0.5,c("ATT"),c("OCC"))
tbl.2 <-table(market.survey$pick[omitted],predict.NA)
print(tbl.2)
## predict.NA
## ATT OCC
## ATT 63 38
## OCC 50 92
cat("Misclassification error rate:", round((tbl.2[1,2]+tbl.2[2,1])/length(omitted),4),"=",
tbl.2[1,2]+tbl.2[2,1], "/",length(omitted), "\n")
## Misclassification error rate: 0.3621 = 88 / 243
library(gbm)
y<-as.numeric(market.survey$pick)-1
# Bernoulli (logistic) boosting on the same 0/1 response, with 10-fold CV
boost.survey <- gbm(as.numeric(market.full$pick) - 1 ~ ., data = market.full,
                    n.trees = n.trees, interaction.depth = 3,
                    distribution = "bernoulli", cv.folds = 10)
summary(boost.survey)
## var rel.inf
## income income 21.0206493
## usage usage 20.8556415
## age age 16.3759184
## education education 15.4477016
## moves moves 11.7885902
## employment employment 9.5239456
## nonpub nonpub 2.4765736
## reach.out reach.out 1.9239263
## card card 0.5870536
preds <- predict(boost.survey, n.trees = 1:n.trees, type = "response")
class_preds <- apply(preds, 2, function(x) ifelse(x > 0.5, 1, 0))
overall_error <- colMeans(class_preds != as.numeric(market.full$pick)-1 )
class_OCC_error <- colMeans(class_preds[as.numeric(market.full$pick)-1 == 0, ] != 0)
class_ATT_error <- colMeans(class_preds[as.numeric(market.full$pick)-1 == 1, ] != 1)
matplot(1:n.trees, cbind(overall_error, class_OCC_error, class_ATT_error),
        type = "l", lty = 1:3, col = 1:3, xlab = "trees", ylab = "Error",
        ylim = c(0.10, 0.50))
text(450,0.20,"OCC")
text(450,0.18,"total")
text(450,0.15,"ATT")
title("logitboost.survey (training set)")
cat("Misclassification for the observations with comlete data\n")
## Misclassification for the observations with comlete data
tbl.1<-table(market.full$pick,ifelse(boost.survey$fit<.5,c("ATT"),c("OCC")))
print(tbl.1)
##
## ATT OCC
## ATT 387 16
## OCC 84 270
cat("Misclassification error rate:", round((tbl.1[1,2]+tbl.1[2,1])/length(market.full$pick),4),"=",
tbl.1[1,2]+tbl.1[2,1], "/",length(market.full$pick), "\n")
## Misclassification error rate: 0.1321 = 100 / 757
cat("\n\nMisclassification for the observations with incomlete data\n")
##
##
## Misclassification for the observations with incomlete data
predict.NA <- ifelse(predict(boost.survey, market.NA,type="response")<0.5,c("ATT"),c("OCC"))
tbl.2 <-table(market.survey$pick[omitted],predict.NA)
print(tbl.2)
## predict.NA
## ATT OCC
## ATT 62 39
## OCC 55 87
cat("Misclassification error rate:", round((tbl.2[1,2]+tbl.2[2,1])/length(omitted),4),"=",
tbl.2[1,2]+tbl.2[2,1], "/",length(omitted), "\n")
## Misclassification error rate: 0.3868 = 94 / 243
library(randomForest)
set.seed(77)
ntree=1000
bagging.survey<-randomForest(pick~.,market.full,ntree=ntree,mtry=9, maxnodes=30, importance=T)
print(bagging.survey)
##
## Call:
## randomForest(formula = pick ~ ., data = market.full, ntree = ntree, mtry = 9, maxnodes = 30, importance = T)
## Type of random forest: classification
## Number of trees: 1000
## No. of variables tried at each split: 9
##
## OOB estimate of error rate: 38.84%
## Confusion matrix:
## ATT OCC class.error
## ATT 237 166 0.4119107
## OCC 128 226 0.3615819
print(bagging.survey$importance)
## ATT OCC MeanDecreaseAccuracy MeanDecreaseGini
## income -0.000893117 0.0071590393 0.0028624694 20.629637
## moves -0.003894477 0.0079883352 0.0016809134 14.913661
## age 0.001338238 0.0112230130 0.0060294691 16.859622
## education -0.005245364 0.0074293800 0.0006594853 15.862406
## employment 0.003616753 0.0041706232 0.0039106598 13.033296
## usage 0.038412825 0.0478423354 0.0427341200 37.071603
## nonpub 0.020698891 0.0061163082 0.0138608317 10.173101
## reach.out 0.007332402 -0.0011096710 0.0033704363 4.095812
## card 0.001304193 -0.0007224123 0.0003536788 1.558880
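usage dominates both importance measures above. A minimal sketch of its partial dependence for the ATT class, using partialPlot() from randomForest:
# partial dependence of the ATT class on average monthly usage (bagged model)
partialPlot(bagging.survey, pred.data = market.full, x.var = "usage",
            which.class = "ATT", main = "partial dependence on usage")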
plot(bagging.survey,main="bagging, OOB",col=1:3)
text(900,0.41,"OCC")
text(900,0.39,"total")
text(900,0.36,"ATT")
library(randomForest)
set.seed(77)
ntree=1000
forest.survey<-randomForest(pick~.,market.full,ntree=ntree, mtry=4,maxnodes=30,importance=T)
plot(forest.survey,main="random forest, OOB",col=1:3)
text(900,0.39,"OCC")
text(900,0.38,"total")
text(900,0.37,"ATT")
print(forest.survey)
##
## Call:
## randomForest(formula = pick ~ ., data = market.full, ntree = ntree, mtry = 4, maxnodes = 30, importance = T)
## Type of random forest: classification
## Number of trees: 1000
## No. of variables tried at each split: 4
##
## OOB estimate of error rate: 38.18%
## Confusion matrix:
## ATT OCC class.error
## ATT 246 157 0.3895782
## OCC 132 222 0.3728814
print(forest.survey$importance)
## ATT OCC MeanDecreaseAccuracy MeanDecreaseGini
## income 0.001021621 0.0058106700 0.0032287561 17.141160
## moves -0.001356287 0.0076378899 0.0027866887 13.180899
## age 0.003784406 0.0146468134 0.0088692644 14.120922
## education -0.002947729 0.0066076206 0.0015665698 12.855820
## employment 0.002198937 0.0047642505 0.0033813515 10.891949
## usage 0.038020568 0.0432228865 0.0404293916 30.908406
## nonpub 0.019456309 0.0030502870 0.0118031324 7.991694
## reach.out 0.008248406 0.0009055933 0.0048476197 4.287364
## card 0.001603838 -0.0005579848 0.0006075804 2.059811
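The importance matrix can also be displayed graphically with varImpPlot():
# dot charts of mean decrease in accuracy and mean decrease in Gini
varImpPlot(forest.survey, main = "random forest, variable importance")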
library(nnet)
# Single-hidden-layer network; nnet starts from random weights, so results
# vary from run to run unless a seed is set beforehand
survey.net <- nnet(pick ~ ., data = market.full, size = 5, decay = 0.01,
                   skip = FALSE, trace = FALSE, maxit = 1000)
tbl <- table(market.full$pick,predict(survey.net,market.full, type="class"))
print(tbl)
##
## ATT OCC
## ATT 307 96
## OCC 66 288
cat("Misclassification error rate:", round((tbl[1,2]+tbl[2,1])/length(market.full$pick),4),"=",
tbl[1,2]+tbl[2,1], "/",length(market.full$pick), "\n")
## Misclassification error rate: 0.214 = 162 / 757
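For a side-by-side look, the error rates printed above can be collected in one place. This is only bookkeeping of numbers already reported; note that the out-of-bag (OOB) estimates are not directly comparable to the resubstitution (training-set) errors of the other fits.
# error rates as reported earlier in this section
resub.errors <- c(full.tree = 0.1929, snipped.tree = 0.3580,
                  adaboost = 0.1982, logitboost = 0.1321, nnet = 0.2140)
oob.errors   <- c(bagging = 0.3884, random.forest = 0.3818)
# error rates on the 243 households with incomplete data, where computed above
incomplete.errors <- c(snipped.tree = 0.3951, cv.tree = 0.3951,
                       adaboost = 0.3621, logitboost = 0.3868)
sort(resub.errors)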