Data on 1000 households include demographic information about the household and information specific to the households telephone service (Watson, 1986). Only for 757 households the complete information is available.
pick – factor indicating whether the household picked AT&T as their long distance phone company. The level ATT indicates they did, while the level OCC indicates they picked another company.
income – ordered factor indicating the income level of the household. Levels are: `<7.5 < 7.5-15 < 15-25 < 25-35 < 35-45 < 45-75 < >75’.
moves – ordered factor indicating the number of times the household moved in the preceding 10 years. Levels are: `0 < 1 < 2 < 3 < 4 < 5 < 6 < 7 < >10’.
age – ordered factor indicating the age level of the respondent. Levels are: `18-24 < 25-34 < 35-44 < 45-54 < 55-64 < 65+’.
education – factor indicating the highest education level achieved by respondent. Levels are: <HS, HS, Voc, Coll, BA, and >BA.
employment – factor indicating the type of employment of the respondent. Levels are: F, P, R, S, H, U, and D.
usage – numeric vector giving the average monthly telephone usage of the household.
nonpub – factor indicating whether the household had an unlisted telephone number. Levels are: Y, N, and NA.
reach.out – factor indicating whether the household had participated in a special AT&T plan (before the forced choice of long-distance carrier).
card – factor indicating whether the household had an AT&T calling card service (before the forced choice of long-distance carrier). Levels are: Y, N, and NA.
set.seed(77)
setwd("D:/R Scripts/Advanced Topics/Data Files")
market.survey<-read.csv("market.survey.csv",head=T,colClasses=c(rep("factor",6),"numeric",rep("factor",3)))
market.full <- na.omit(market.survey)
used <- match(row.names(market.survey), row.names(market.full))
omitted <- seq(nrow(market.survey))[is.na(used)]
market.NA <- market.survey[omitted, ]
library(MASS)
library(tree)
library(rpart)
full.survey <- tree(pick~.,market.full, na.action = na.pass,model=T,mindev=0.0001)
# full.survey <- rpart(pick~.,market.full, na.action = na.pass,model=T)
plot(full.survey)
title("full tree",col="red")
text(full.survey, all=T,cex = .4)
print(summary(full.survey))
##
## Classification tree:
## tree(formula = pick ~ ., data = market.full, na.action = na.pass,
## model = T, mindev = 1e-04)
## Number of terminal nodes: 103
## Residual mean deviance: 0.8879 = 580.7 / 654
## Misclassification error rate: 0.1929 = 146 / 757
print(table(market.full$pick,predict(full.survey,type="class")))
##
## ATT OCC
## ATT 335 68
## OCC 77 277
plot(prune.tree(full.survey))
prune.survey <- prune.tree(full.survey, best = 6)
plot(prune.survey)
title("best tree of size 6",col="red")
text(prune.survey, cex = 1)
prune.snip <- snip.tree(prune.survey, nodes = c(4, 5, 3))
plot(prune.snip)
title("best tree of size 6 (after snipping)",col="red")
text(prune.snip, cex = 1)
print(summary(prune.snip))
##
## Classification tree:
## snip.tree(tree = prune.survey, nodes = c(4L, 5L, 3L))
## Variables actually used in tree construction:
## [1] "usage" "nonpub"
## Number of terminal nodes: 3
## Residual mean deviance: 1.306 = 985 / 754
## Misclassification error rate: 0.358 = 271 / 757
print(table(market.full$pick,predict(prune.snip,type="class")))
##
## ATT OCC
## ATT 267 136
## OCC 135 219
predict.NA <- predict(prune.snip, market.survey[omitted, ],type="class")
tbl<-table(market.survey$pick[omitted],predict.NA)
print(tbl)
## predict.NA
## ATT OCC
## ATT 56 45
## OCC 51 91
cat("Misclassification error rate:", round((tbl[1,2]+tbl[2,1])/length(omitted),4),"=",
tbl[1,2]+tbl[2,1], "/",length(omitted), "\n")
## Misclassification error rate: 0.3951 = 96 / 243
plot(cv.tree(full.survey, , prune.tree))
cv.survey <- prune.tree(full.survey, best = 4)
plot(cv.survey)
title("pruned tree by cross-validation")
text(cv.survey)
cv.survey <- snip.tree(cv.survey, nodes = c(5))
plot(cv.survey)
title("pruned tree by cross-validation (after snipping)")
text(cv.survey)
print(table(market.full$pick,predict(cv.survey,type="class")))
##
## ATT OCC
## ATT 267 136
## OCC 135 219
cat("\n\n Prediction on the Cross-Validated Tree for incomplete observations\n")
##
##
## Prediction on the Cross-Validated Tree for incomplete observations
cv.predict <- predict(cv.survey, newdata= market.NA, type = "class")
tbl<-table(market.survey$pick[omitted],cv.predict)
print(tbl)
## cv.predict
## ATT OCC
## ATT 56 45
## OCC 51 91
cat("Misclassification error rate:", round((tbl[1,2]+tbl[2,1])/length(omitted),4),"=",
tbl[1,2]+tbl[2,1], "/",length(omitted), "\n")
## Misclassification error rate: 0.3951 = 96 / 243
library(gbm)
y<-as.numeric(market.survey$pick)-1
boost.survey <-gbm(as.numeric(market.full$pick)-1~.,data=market.full,n.trees=500,interaction.depth=4,distribution="adaboost",cv.folds=10)
summary(boost.survey)
## var rel.inf
## usage usage 22.0197141
## income income 21.6160205
## age age 15.5106751
## education education 14.2976115
## moves moves 11.8032738
## employment employment 9.4736969
## nonpub nonpub 2.8988741
## reach.out reach.out 1.8227422
## card card 0.5573917
cat("Misclassification for the observations with comlete data\n")
## Misclassification for the observations with comlete data
tbl.1<-table(market.full$pick,ifelse(boost.survey$fit<.5,c("ATT"),c("OCC")))
print(tbl.1)
##
## ATT OCC
## ATT 396 7
## OCC 156 198
cat("Misclassification error rate:", round((tbl.1[1,2]+tbl.1[2,1])/length(market.full$pick),4),"=",
tbl.1[1,2]+tbl.1[2,1], "/",length(market.full$pick), "\n")
## Misclassification error rate: 0.2153 = 163 / 757
cat("\n\nMisclassification for the observations with incomlete data\n")
##
##
## Misclassification for the observations with incomlete data
predict.NA <- ifelse(predict(boost.survey, market.NA,type="response")<0.5,c("ATT"),c("OCC"))
tbl.2 <-table(market.survey$pick[omitted],predict.NA)
print(tbl.2)
## predict.NA
## ATT OCC
## ATT 60 41
## OCC 50 92
cat("Misclassification error rate:", round((tbl.2[1,2]+tbl.2[2,1])/length(omitted),4),"=",
tbl.2[1,2]+tbl.2[2,1], "/",length(omitted), "\n")
## Misclassification error rate: 0.3745 = 91 / 243
library(randomForest)
set.seed(77)
bagging.survey<-randomForest(pick~.,market.full,ntree=500,mtry=9,importance=T)
print(bagging.survey)
##
## Call:
## randomForest(formula = pick ~ ., data = market.full, ntree = 500, mtry = 9, importance = T)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 9
##
## OOB estimate of error rate: 41.08%
## Confusion matrix:
## ATT OCC class.error
## ATT 256 147 0.3647643
## OCC 164 190 0.4632768
print(bagging.survey$importance)
## ATT OCC MeanDecreaseAccuracy MeanDecreaseGini
## income -0.0050843369 0.0276799029 0.0103217619 58.964637
## moves -0.0025007389 0.0087007477 0.0027044989 39.067815
## age -0.0038500019 0.0238514559 0.0091603363 49.530618
## education -0.0027419193 0.0165356096 0.0062584343 48.072102
## employment 0.0026611373 0.0202819341 0.0109834947 34.210781
## usage 0.0222345626 0.0451212663 0.0329951407 110.245429
## nonpub 0.0171722748 0.0084856010 0.0131275140 15.172041
## reach.out -0.0005879737 0.0001602518 -0.0001759585 12.764486
## card 0.0047615418 -0.0012090490 0.0019730362 4.009055
plot(bagging.survey,col=1:3)
text(450,0.47,"OCC")
text(450,0.42,"total")
text(450,0.38,"ATT")
library(randomForest)
set.seed(77)
forest.survey<-randomForest(pick~.,market.full,ntree=1000, mtry=4,importance=T)
plot(forest.survey)
text(900,0.48,"OCC")
text(900,0.42,"total")
text(900,0.37,"ATT")
print(forest.survey)
##
## Call:
## randomForest(formula = pick ~ ., data = market.full, ntree = 1000, mtry = 4, importance = T)
## Type of random forest: classification
## Number of trees: 1000
## No. of variables tried at each split: 4
##
## OOB estimate of error rate: 40.29%
## Confusion matrix:
## ATT OCC class.error
## ATT 259 144 0.3573201
## OCC 161 193 0.4548023
print(forest.survey$importance)
## ATT OCC MeanDecreaseAccuracy MeanDecreaseGini
## income -0.0013888918 0.021383468 0.0092222945 58.382101
## moves -0.0011566413 0.007216793 0.0027641494 36.635717
## age -0.0002442869 0.025241120 0.0116563749 46.621518
## education -0.0014146661 0.010572170 0.0041978790 47.558822
## employment 0.0021263864 0.018673988 0.0098330961 33.310308
## usage 0.0233792608 0.036715866 0.0295419646 99.691640
## nonpub 0.0167216897 0.005292194 0.0113649000 14.584206
## reach.out 0.0006329127 -0.001885198 -0.0005775774 13.610915
## card 0.0033500867 -0.001032381 0.0013173458 4.917778
library(nnet)
survey.net <- nnet(pick~., data
= market.full, size = 5, decay = 0.01, skip = F,
trace = F, maxit = 1000)
tbl <- table(market.full$pick,predict(survey.net,market.full, type="class"))
print(tbl)
##
## ATT OCC
## ATT 328 75
## OCC 65 289
cat("Misclassification error rate:", round((tbl[1,2]+tbl[2,1])/length(market.full$pick),4),"=",
tbl[1,2]+tbl[2,1], "/",length(market.full$pick), "\n")
## Misclassification error rate: 0.1849 = 140 / 757