Classification trees: AT&T Telemarketing Data

Data on 1000 households include demographic information about the household and information specific to the households telephone service (Watson, 1986). Only for 757 households the complete information is available.

pick – factor indicating whether the household picked AT&T as their long distance phone company. The level ATT indicates they did, while the level OCC indicates they picked another company.

income – ordered factor indicating the income level of the household. Levels are: `<7.5 < 7.5-15 < 15-25 < 25-35 < 35-45 < 45-75 < >75’.

moves – ordered factor indicating the number of times the household moved in the preceding 10 years. Levels are: `0 < 1 < 2 < 3 < 4 < 5 < 6 < 7 < >10’.

age – ordered factor indicating the age level of the respondent. Levels are: `18-24 < 25-34 < 35-44 < 45-54 < 55-64 < 65+’.

education – factor indicating the highest education level achieved by respondent. Levels are: <HS, HS, Voc, Coll, BA, and >BA.

employment – factor indicating the type of employment of the respondent. Levels are: F, P, R, S, H, U, and D.

usage – numeric vector giving the average monthly telephone usage of the household.

nonpub – factor indicating whether the household had an unlisted telephone number. Levels are: Y, N, and NA.

reach.out – factor indicating whether the household had participated in a special AT&T plan (before the forced choice of long-distance carrier).

card – factor indicating whether the household had an AT&T calling card service (before the forced choice of long-distance carrier). Levels are: Y, N, and NA.

set.seed(77)
setwd("D:/R Scripts/Advanced Topics/Data Files")
market.survey<-read.csv("market.survey.csv",head=T,colClasses=c(rep("factor",6),"numeric",rep("factor",3)))
market.full <- na.omit(market.survey)
used <- match(row.names(market.survey), row.names(market.full))
omitted <- seq(nrow(market.survey))[is.na(used)]
market.NA <- market.survey[omitted,  ]

Full Tree

library(MASS)
library(tree)
library(rpart)


full.survey <- tree(pick~.,market.full, na.action = na.pass,model=T,mindev=0.0001)
# full.survey <- rpart(pick~.,market.full, na.action = na.pass,model=T)
plot(full.survey)
title("full tree",col="red")
text(full.survey, all=T,cex = .4)

print(summary(full.survey))
## 
## Classification tree:
## tree(formula = pick ~ ., data = market.full, na.action = na.pass, 
##     model = T, mindev = 1e-04)
## Number of terminal nodes:  103 
## Residual mean deviance:  0.8879 = 580.7 / 654 
## Misclassification error rate: 0.1929 = 146 / 757
print(table(market.full$pick,predict(full.survey,type="class")))
##      
##       ATT OCC
##   ATT 335  68
##   OCC  77 277

Pruning

  plot(prune.tree(full.survey))

    prune.survey <- prune.tree(full.survey, best = 6)
    plot(prune.survey)
    title("best tree of size 6",col="red")
    text(prune.survey, cex = 1)

    prune.snip <- snip.tree(prune.survey, nodes = c(4, 5, 3))
    plot(prune.snip)    
  title("best tree of size 6 (after snipping)",col="red")
    text(prune.snip, cex = 1)

    print(summary(prune.snip))  
## 
## Classification tree:
## snip.tree(tree = prune.survey, nodes = c(4L, 5L, 3L))
## Variables actually used in tree construction:
## [1] "usage"  "nonpub"
## Number of terminal nodes:  3 
## Residual mean deviance:  1.306 = 985 / 754 
## Misclassification error rate: 0.358 = 271 / 757
  print(table(market.full$pick,predict(prune.snip,type="class")))
##      
##       ATT OCC
##   ATT 267 136
##   OCC 135 219

Prediction on the pruned tree using 243 observations with incompelete data

predict.NA <- predict(prune.snip, market.survey[omitted,  ],type="class")
tbl<-table(market.survey$pick[omitted],predict.NA)
print(tbl)
##      predict.NA
##       ATT OCC
##   ATT  56  45
##   OCC  51  91
cat("Misclassification error rate:", round((tbl[1,2]+tbl[2,1])/length(omitted),4),"=",
tbl[1,2]+tbl[2,1], "/",length(omitted), "\n")
## Misclassification error rate: 0.3951 = 96 / 243

Choice of the tree’s size for pruning by 10-fold cross validation

plot(cv.tree(full.survey,  , prune.tree))

cv.survey <- prune.tree(full.survey, best = 4)
plot(cv.survey)
title("pruned tree by cross-validation")
text(cv.survey)

cv.survey <- snip.tree(cv.survey, nodes = c(5))
plot(cv.survey)
title("pruned tree by cross-validation (after snipping)")
text(cv.survey)

  print(table(market.full$pick,predict(cv.survey,type="class")))
##      
##       ATT OCC
##   ATT 267 136
##   OCC 135 219
cat("\n\n Prediction on the Cross-Validated Tree for incomplete observations\n")
## 
## 
##  Prediction on the Cross-Validated Tree for incomplete observations
cv.predict <- predict(cv.survey, newdata= market.NA, type = "class")
tbl<-table(market.survey$pick[omitted],cv.predict)
    print(tbl)
##      cv.predict
##       ATT OCC
##   ATT  56  45
##   OCC  51  91
cat("Misclassification error rate:", round((tbl[1,2]+tbl[2,1])/length(omitted),4),"=",
tbl[1,2]+tbl[2,1], "/",length(omitted), "\n")
## Misclassification error rate: 0.3951 = 96 / 243

AdaBoost

library(gbm)
y<-as.numeric(market.survey$pick)-1
boost.survey <-gbm(as.numeric(market.full$pick)-1~.,data=market.full,n.trees=500,interaction.depth=4,distribution="adaboost",cv.folds=10)
summary(boost.survey)

##                   var    rel.inf
## usage           usage 22.0197141
## income         income 21.6160205
## age               age 15.5106751
## education   education 14.2976115
## moves           moves 11.8032738
## employment employment  9.4736969
## nonpub         nonpub  2.8988741
## reach.out   reach.out  1.8227422
## card             card  0.5573917
cat("Misclassification for the observations with comlete data\n")
## Misclassification for the observations with comlete data
tbl.1<-table(market.full$pick,ifelse(boost.survey$fit<.5,c("ATT"),c("OCC")))
print(tbl.1)
##      
##       ATT OCC
##   ATT 396   7
##   OCC 156 198
cat("Misclassification error rate:", round((tbl.1[1,2]+tbl.1[2,1])/length(market.full$pick),4),"=",
tbl.1[1,2]+tbl.1[2,1], "/",length(market.full$pick), "\n")
## Misclassification error rate: 0.2153 = 163 / 757
cat("\n\nMisclassification for the observations with incomlete data\n")
## 
## 
## Misclassification for the observations with incomlete data
predict.NA <- ifelse(predict(boost.survey, market.NA,type="response")<0.5,c("ATT"),c("OCC"))
tbl.2 <-table(market.survey$pick[omitted],predict.NA)
print(tbl.2)
##      predict.NA
##       ATT OCC
##   ATT  60  41
##   OCC  50  92
cat("Misclassification error rate:", round((tbl.2[1,2]+tbl.2[2,1])/length(omitted),4),"=",
tbl.2[1,2]+tbl.2[2,1], "/",length(omitted), "\n")
## Misclassification error rate: 0.3745 = 91 / 243

Bagging

library(randomForest)
set.seed(77)
bagging.survey<-randomForest(pick~.,market.full,ntree=500,mtry=9,importance=T)
print(bagging.survey)
## 
## Call:
##  randomForest(formula = pick ~ ., data = market.full, ntree = 500,      mtry = 9, importance = T) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 9
## 
##         OOB estimate of  error rate: 41.08%
## Confusion matrix:
##     ATT OCC class.error
## ATT 256 147   0.3647643
## OCC 164 190   0.4632768
print(bagging.survey$importance)
##                      ATT           OCC MeanDecreaseAccuracy MeanDecreaseGini
## income     -0.0050843369  0.0276799029         0.0103217619        58.964637
## moves      -0.0025007389  0.0087007477         0.0027044989        39.067815
## age        -0.0038500019  0.0238514559         0.0091603363        49.530618
## education  -0.0027419193  0.0165356096         0.0062584343        48.072102
## employment  0.0026611373  0.0202819341         0.0109834947        34.210781
## usage       0.0222345626  0.0451212663         0.0329951407       110.245429
## nonpub      0.0171722748  0.0084856010         0.0131275140        15.172041
## reach.out  -0.0005879737  0.0001602518        -0.0001759585        12.764486
## card        0.0047615418 -0.0012090490         0.0019730362         4.009055
plot(bagging.survey,col=1:3)
text(450,0.47,"OCC")
text(450,0.42,"total")
text(450,0.38,"ATT")

Random Forest

library(randomForest)
set.seed(77)
forest.survey<-randomForest(pick~.,market.full,ntree=1000, mtry=4,importance=T)
plot(forest.survey)
text(900,0.48,"OCC")
text(900,0.42,"total")
text(900,0.37,"ATT")

print(forest.survey)
## 
## Call:
##  randomForest(formula = pick ~ ., data = market.full, ntree = 1000,      mtry = 4, importance = T) 
##                Type of random forest: classification
##                      Number of trees: 1000
## No. of variables tried at each split: 4
## 
##         OOB estimate of  error rate: 40.29%
## Confusion matrix:
##     ATT OCC class.error
## ATT 259 144   0.3573201
## OCC 161 193   0.4548023
print(forest.survey$importance)
##                      ATT          OCC MeanDecreaseAccuracy MeanDecreaseGini
## income     -0.0013888918  0.021383468         0.0092222945        58.382101
## moves      -0.0011566413  0.007216793         0.0027641494        36.635717
## age        -0.0002442869  0.025241120         0.0116563749        46.621518
## education  -0.0014146661  0.010572170         0.0041978790        47.558822
## employment  0.0021263864  0.018673988         0.0098330961        33.310308
## usage       0.0233792608  0.036715866         0.0295419646        99.691640
## nonpub      0.0167216897  0.005292194         0.0113649000        14.584206
## reach.out   0.0006329127 -0.001885198        -0.0005775774        13.610915
## card        0.0033500867 -0.001032381         0.0013173458         4.917778

Neural Network (1 hidden layer with 5 hidden units)

library(nnet)
survey.net <- nnet(pick~., data
             = market.full, size = 5, decay = 0.01, skip = F,
            trace = F, maxit = 1000)
tbl <- table(market.full$pick,predict(survey.net,market.full, type="class"))
print(tbl)
##      
##       ATT OCC
##   ATT 328  75
##   OCC  65 289
cat("Misclassification error rate:", round((tbl[1,2]+tbl[2,1])/length(market.full$pick),4),"=",
tbl[1,2]+tbl[2,1], "/",length(market.full$pick), "\n")
## Misclassification error rate: 0.1849 = 140 / 757