Tree.knit

Classification trees: AT&T Telemarketing Data

Data on 1000 households include demographic information about the household and information specific to the households telephone service (Watson, 1986). Only for 757 households the complete information is available.

pick – factor indicating whether the household picked AT&T as their long distance phone company. The level ATT indicates they did, while the level OCC indicates they picked another company.

income – ordered factor indicating the income level of the household. Levels are: `<7.5 < 7.5-15 < 15-25 < 25-35 < 35-45 < 45-75 < >75’.

moves – ordered factor indicating the number of times the household moved in the preceding 10 years. Levels are: `0 < 1 < 2 < 3 < 4 < 5 < 6 < 7 < >10’.

age – ordered factor indicating the age level of the respondent. Levels are: `18-24 < 25-34 < 35-44 < 45-54 < 55-64 < 65+’.

education – factor indicating the highest education level achieved by respondent. Levels are: <HS, HS, Voc, Coll, BA, and >BA.

employment – factor indicating the type of employment of the respondent. Levels are: F, P, R, S, H, U, and D.

usage – numeric vector giving the average monthly telephone usage of the household.

nonpub – factor indicating whether the household had an unlisted telephone number. Levels are: Y, N, and NA.

reach.out – factor indicating whether the household had participated in a special AT&T plan (before the forced choice of long-distance carrier).

card – factor indicating whether the household had an AT&T calling card service (before the forced choice of long-distance carrier). Levels are: Y, N, and NA.

set.seed(77)
setwd("D:/R Scripts/Advanced Topics/Data Files")
market.survey<-read.csv("market.survey.csv",head=T,colClasses=c(rep("factor",6),"numeric",rep("factor",3)))
market.full <- na.omit(market.survey)
used <- match(row.names(market.survey), row.names(market.full))
omitted <- seq(nrow(market.survey))[is.na(used)]
market.NA <- market.survey[omitted,  ]

Full Tree

library(MASS)
library(tree)
library(rpart)


full.survey <- tree(pick~.,market.full, na.action = na.pass,model=T,mindev=0.0001)
# full.survey <- rpart(pick~.,market.full, na.action = na.pass,model=T)
plot(full.survey)
title("full tree",col="red")
text(full.survey, all=T,cex = .4)

print(summary(full.survey))

## 
## Classification tree:
## tree(formula = pick ~ ., data = market.full, na.action = na.pass, 
##     model = T, mindev = 1e-04)
## Number of terminal nodes:  103 
## Residual mean deviance:  0.8879 = 580.7 / 654 
## Misclassification error rate: 0.1929 = 146 / 757

print(table(market.full$pick,predict(full.survey,type="class")))

##      
##       ATT OCC
##   ATT 335  68
##   OCC  77 277

Pruning

  plot(prune.tree(full.survey))

    prune.survey <- prune.tree(full.survey, best = 6)
    plot(prune.survey)
    title("best tree of size 6",col="red")
    text(prune.survey, cex = 1)

    prune.snip <- snip.tree(prune.survey, nodes = c(4, 5, 3))
    plot(prune.snip)    
  title("best tree of size 6 (after snipping)",col="red")
    text(prune.snip, cex = 1)

    print(summary(prune.snip))

## 
## Classification tree:
## snip.tree(tree = prune.survey, nodes = c(4L, 5L, 3L))
## Variables actually used in tree construction:
## [1] "usage"  "nonpub"
## Number of terminal nodes:  3 
## Residual mean deviance:  1.306 = 985 / 754 
## Misclassification error rate: 0.358 = 271 / 757

  print(table(market.full$pick,predict(prune.snip,type="class")))

##      
##       ATT OCC
##   ATT 267 136
##   OCC 135 219

Prediction on the pruned tree using 243 observations with incompelete data

predict.NA <- predict(prune.snip, market.survey[omitted,  ],type="class")
tbl<-table(market.survey$pick[omitted],predict.NA)
print(tbl)

##      predict.NA
##       ATT OCC
##   ATT  56  45
##   OCC  51  91

cat("Misclassification error rate:", round((tbl[1,2]+tbl[2,1])/length(omitted),4),"=",
tbl[1,2]+tbl[2,1], "/",length(omitted), "\n")

## Misclassification error rate: 0.3951 = 96 / 243

Choice of the tree’s size for pruning by 10-fold cross validation

plot(cv.tree(full.survey,  , prune.tree))

cv.survey <- prune.tree(full.survey, best = 4)
plot(cv.survey)
title("pruned tree by cross-validation")
text(cv.survey)

cv.survey <- snip.tree(cv.survey, nodes = c(5))
plot(cv.survey)
title("pruned tree by cross-validation (after snipping)")
text(cv.survey)

  print(table(market.full$pick,predict(cv.survey,type="class")))

##      
##       ATT OCC
##   ATT 267 136
##   OCC 135 219

cat("\n\n Prediction on the Cross-Validated Tree for incomplete observations\n")

## 
## 
##  Prediction on the Cross-Validated Tree for incomplete observations

cv.predict <- predict(cv.survey, newdata= market.NA, type = "class")
tbl<-table(market.survey$pick[omitted],cv.predict)
    print(tbl)

##      cv.predict
##       ATT OCC
##   ATT  56  45
##   OCC  51  91

cat("Misclassification error rate:", round((tbl[1,2]+tbl[2,1])/length(omitted),4),"=",
tbl[1,2]+tbl[2,1], "/",length(omitted), "\n")

## Misclassification error rate: 0.3951 = 96 / 243

AdaBoost

library(gbm)
y<-as.numeric(market.survey$pick)-1
n.trees=1000
boost.survey <-gbm(as.numeric(market.full$pick)-1~.,data=market.full,n.trees=n.trees,interaction.depth=3,distribution="adaboost",cv.folds=10)
boost.survey

## gbm(formula = as.numeric(market.full$pick) - 1 ~ ., distribution = "adaboost", 
##     data = market.full, n.trees = n.trees, interaction.depth = 3, 
##     cv.folds = 10)
## A gradient boosted model with adaboost loss function.
## 1000 iterations were performed.
## The best cross-validation iteration was 17.
## There were 9 predictors of which 9 had non-zero influence.

summary(boost.survey)

##                   var    rel.inf
## income         income 21.6248379
## usage           usage 21.0517810
## age               age 16.9289177
## education   education 14.7276678
## moves           moves 12.2595962
## employment employment  8.7436098
## nonpub         nonpub  2.3979374
## reach.out   reach.out  1.8487456
## card             card  0.4169067

preds <- predict(boost.survey, n.trees = 1:n.trees, type = "response")
class_preds <- apply(preds, 2, function(x) ifelse(x > 0.5, 1, 0))

overall_error <- colMeans(class_preds != as.numeric(market.full$pick)-1 )
class_OCC_error <- colMeans(class_preds[as.numeric(market.full$pick)-1 == 0, ] != 0)
class_ATT_error <- colMeans(class_preds[as.numeric(market.full$pick)-1 == 1, ] != 1)

matplot(1:n.trees,cbind(overall_error, class_OCC_error,class_ATT_error),type="lll",lty=1:3,xlab="trees",ylab="Error",ylim=c(0.10,0.50),col=1:3)
text(450,0.23,"OCC")
text(450,0.20,"total")
text(450,0.15,"ATT")
title("adaboost.survey (training set)")

cat("Misclassification for the observations with comlete data\n")

## Misclassification for the observations with comlete data

tbl.1<-table(market.full$pick,ifelse(boost.survey$fit<.5,c("ATT"),c("OCC")))
print(tbl.1)

##      
##       ATT OCC
##   ATT 396   7
##   OCC 143 211

cat("Misclassification error rate:", round((tbl.1[1,2]+tbl.1[2,1])/length(market.full$pick),4),"=",
tbl.1[1,2]+tbl.1[2,1], "/",length(market.full$pick), "\n")

## Misclassification error rate: 0.1982 = 150 / 757

cat("\n\nMisclassification for the observations with incomlete data\n")

## 
## 
## Misclassification for the observations with incomlete data

predict.NA <- ifelse(predict(boost.survey, market.NA,type="response")<0.5,c("ATT"),c("OCC"))
tbl.2 <-table(market.survey$pick[omitted],predict.NA)
print(tbl.2)

##      predict.NA
##       ATT OCC
##   ATT  63  38
##   OCC  50  92

cat("Misclassification error rate:", round((tbl.2[1,2]+tbl.2[2,1])/length(omitted),4),"=",
tbl.2[1,2]+tbl.2[2,1], "/",length(omitted), "\n")

## Misclassification error rate: 0.3621 = 88 / 243

LogitBoost

library(gbm)
y<-as.numeric(market.survey$pick)-1
boost.survey <-gbm(as.numeric(market.full$pick)-1~.,data=market.full,n.trees=n.trees,interaction.depth=3,distribution="bernoulli",cv.folds=10)
summary(boost.survey)

##                   var    rel.inf
## income         income 21.0206493
## usage           usage 20.8556415
## age               age 16.3759184
## education   education 15.4477016
## moves           moves 11.7885902
## employment employment  9.5239456
## nonpub         nonpub  2.4765736
## reach.out   reach.out  1.9239263
## card             card  0.5870536

preds <- predict(boost.survey, n.trees = 1:n.trees, type = "response")
class_preds <- apply(preds, 2, function(x) ifelse(x > 0.5, 1, 0))

overall_error <- colMeans(class_preds != as.numeric(market.full$pick)-1 )
class_OCC_error <- colMeans(class_preds[as.numeric(market.full$pick)-1 == 0, ] != 0)
class_ATT_error <- colMeans(class_preds[as.numeric(market.full$pick)-1 == 1, ] != 1)

matplot(1:n.trees,cbind(overall_error, class_OCC_error,class_ATT_error),type="lll",lty=1:3,xlab="trees",ylab="Error",ylim=c(0.10,0.50),col=1:3)
text(450,0.20,"OCC")
text(450,0.18,"total")
text(450,0.15,"ATT")
title("logitboost.survey (training set)")

cat("Misclassification for the observations with comlete data\n")

## Misclassification for the observations with comlete data

tbl.1<-table(market.full$pick,ifelse(boost.survey$fit<.5,c("ATT"),c("OCC")))
print(tbl.1)

##      
##       ATT OCC
##   ATT 387  16
##   OCC  84 270

cat("Misclassification error rate:", round((tbl.1[1,2]+tbl.1[2,1])/length(market.full$pick),4),"=",
tbl.1[1,2]+tbl.1[2,1], "/",length(market.full$pick), "\n")

## Misclassification error rate: 0.1321 = 100 / 757

cat("\n\nMisclassification for the observations with incomlete data\n")

## 
## 
## Misclassification for the observations with incomlete data

predict.NA <- ifelse(predict(boost.survey, market.NA,type="response")<0.5,c("ATT"),c("OCC"))
tbl.2 <-table(market.survey$pick[omitted],predict.NA)
print(tbl.2)

##      predict.NA
##       ATT OCC
##   ATT  62  39
##   OCC  55  87

cat("Misclassification error rate:", round((tbl.2[1,2]+tbl.2[2,1])/length(omitted),4),"=",
tbl.2[1,2]+tbl.2[2,1], "/",length(omitted), "\n")

## Misclassification error rate: 0.3868 = 94 / 243

Bagging

library(randomForest)
set.seed(77)
ntree=1000
bagging.survey<-randomForest(pick~.,market.full,ntree=ntree,mtry=9, maxnodes=30, importance=T)
print(bagging.survey)

## 
## Call:
##  randomForest(formula = pick ~ ., data = market.full, ntree = ntree,      mtry = 9, maxnodes = 30, importance = T) 
##                Type of random forest: classification
##                      Number of trees: 1000
## No. of variables tried at each split: 9
## 
##         OOB estimate of  error rate: 38.84%
## Confusion matrix:
##     ATT OCC class.error
## ATT 237 166   0.4119107
## OCC 128 226   0.3615819

print(bagging.survey$importance)

##                     ATT           OCC MeanDecreaseAccuracy MeanDecreaseGini
## income     -0.000893117  0.0071590393         0.0028624694        20.629637
## moves      -0.003894477  0.0079883352         0.0016809134        14.913661
## age         0.001338238  0.0112230130         0.0060294691        16.859622
## education  -0.005245364  0.0074293800         0.0006594853        15.862406
## employment  0.003616753  0.0041706232         0.0039106598        13.033296
## usage       0.038412825  0.0478423354         0.0427341200        37.071603
## nonpub      0.020698891  0.0061163082         0.0138608317        10.173101
## reach.out   0.007332402 -0.0011096710         0.0033704363         4.095812
## card        0.001304193 -0.0007224123         0.0003536788         1.558880

plot(bagging.survey,main="bagging, OOB",col=1:3)
text(900,0.41,"OCC")
text(900,0.39,"total")
text(900,0.36,"ATT")

Random Forest

library(randomForest)
set.seed(77)
ntree=1000
forest.survey<-randomForest(pick~.,market.full,ntree=ntree, mtry=4,maxnodes=30,importance=T)
plot(forest.survey,main="random forest, OOB",col=1:3)
text(900,0.39,"OCC")
text(900,0.38,"total")
text(900,0.37,"ATT")

print(forest.survey)

## 
## Call:
##  randomForest(formula = pick ~ ., data = market.full, ntree = ntree,      mtry = 4, maxnodes = 30, importance = T) 
##                Type of random forest: classification
##                      Number of trees: 1000
## No. of variables tried at each split: 4
## 
##         OOB estimate of  error rate: 38.18%
## Confusion matrix:
##     ATT OCC class.error
## ATT 246 157   0.3895782
## OCC 132 222   0.3728814

print(forest.survey$importance)

##                     ATT           OCC MeanDecreaseAccuracy MeanDecreaseGini
## income      0.001021621  0.0058106700         0.0032287561        17.141160
## moves      -0.001356287  0.0076378899         0.0027866887        13.180899
## age         0.003784406  0.0146468134         0.0088692644        14.120922
## education  -0.002947729  0.0066076206         0.0015665698        12.855820
## employment  0.002198937  0.0047642505         0.0033813515        10.891949
## usage       0.038020568  0.0432228865         0.0404293916        30.908406
## nonpub      0.019456309  0.0030502870         0.0118031324         7.991694
## reach.out   0.008248406  0.0009055933         0.0048476197         4.287364
## card        0.001603838 -0.0005579848         0.0006075804         2.059811

Neural Network (1 hidden layer with 5 hidden units)

library(nnet)
survey.net <- nnet(pick~., data
             = market.full, size = 5, decay = 0.01, skip = F,
            trace = F, maxit = 1000)
tbl <- table(market.full$pick,predict(survey.net,market.full, type="class"))
print(tbl)

##      
##       ATT OCC
##   ATT 307  96
##   OCC  66 288

cat("Misclassification error rate:", round((tbl[1,2]+tbl[2,1])/length(market.full$pick),4),"=",
tbl[1,2]+tbl[2,1], "/",length(market.full$pick), "\n")

## Misclassification error rate: 0.214 = 162 / 757