Clustering: K-means

Swiss Provinces Data

5 measures of socio–economic indicators on 47 French-speaking Swiss provinces (1888):

Agriculture – % of males involved in agriculture as occupation

Examination – % “draftees” receiving highest mark on army examination

Education – % education beyond primary school for “draftees”

Catholic – % catholic (as opposed to “protestant”)

Infant.Mortality – % live births who live less than 1 year

K-means

library(cluster)
attach(swiss)
par(mfrow=c(1,2))
swiss.x=swiss[,-1]
W=rep(0,10)
W[1]=kmeans(swiss.x,1,nstart=25)$tot.withinss
S=rep(0,9)
for (k in 2:10){
swiss.kmeans=kmeans(swiss.x,k,nstart=25)
W[k]=swiss.kmeans$tot.withinss
S[k]=mean(silhouette(swiss.kmeans$cluster,dist(swiss.x,method="euclidean"))[,3])
}
plot(1:10,W[1:10],xlab="number of clusters",ylab="Within-cluster sum of squares",col="blue")
plot(2:10,S[2:10],xlab="number of clusters",ylab="Silhouette coefficient",col="blue")

swiss.kmeans=kmeans(swiss.x,2,nstart=25)
print(swiss.kmeans)
## K-means clustering with 2 clusters of sizes 16, 31
## 
## Cluster means:
##   Agriculture Examination Education Catholic Infant.Mortality
## 1    65.51875     9.43750   6.62500 96.15000          20.7750
## 2    42.99032    20.12903  13.22581 12.75355          19.5129
## 
## Clustering vector:
##   Courtelary     Delemont Franches-Mnt      Moutier   Neuveville   Porrentruy 
##            2            1            1            2            2            1 
##        Broye        Glane      Gruyere       Sarine      Veveyse        Aigle 
##            1            1            1            1            1            2 
##      Aubonne     Avenches     Cossonay    Echallens     Grandson     Lausanne 
##            2            2            2            2            2            2 
##    La Vallee       Lavaux       Morges       Moudon        Nyone         Orbe 
##            2            2            2            2            2            2 
##         Oron      Payerne Paysd'enhaut        Rolle        Vevey      Yverdon 
##            2            2            2            2            2            2 
##      Conthey    Entremont       Herens     Martigwy      Monthey   St Maurice 
##            1            1            1            1            1            1 
##       Sierre         Sion       Boudry La Chauxdfnd     Le Locle    Neuchatel 
##            1            1            2            2            2            2 
##   Val de Ruz ValdeTravers V. De Geneve  Rive Droite  Rive Gauche 
##            2            2            2            2            2 
## 
## Within cluster sum of squares by cluster:
## [1]  5516.926 25352.259
##  (between_SS / total_SS =  72.3 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"
par(mfrow=c(1,1),pty="s")
swiss.pca=prcomp(swiss.x,scale=T)
swiss.centers=predict(swiss.pca,swiss.kmeans$centers)
plot(swiss.pca$x[,1],swiss.pca$x[,2],xlab="First pricniple component",ylab="Second principle component",type="n")
points(swiss.centers[,1:2],pch=19,cex=2,col=c("orange","blue"))
text(swiss.pca$x[,1],swiss.pca$x[,2],dimnames(swiss.x)[[1]],col=c("orange","blue")[swiss.kmeans$cluster],cex=.5)
title("2-means",col.main="red")

K-medoids

library(cluster)
par(mfrow=c(1,2))
S=rep(0,9)
for (k in 2:10){
swiss.kmedoids=pam(swiss.x,k,metric="euclidean")
S[k]=swiss.kmedoids$silinfo$avg.width
}
plot(2:10,S[2:10],xlab="number of clusters",ylab="Silhouette coefficient",col="blue")
title("Euclidean distance",col.main="red")

swiss.kmedoids=pam(swiss.x,2,metric="euclidean")

plot(swiss.pca$x[,1],swiss.pca$x[,2],xlab="First pricniple component",ylab="Second principle component",type="n")
swiss.medoids=predict(swiss.pca, swiss.kmedoids$medoids)
points(swiss.medoids,pch=19,cex=2,col=c("orange","blue"))
text(swiss.pca$x[,1],swiss.pca$x[,2],dimnames(swiss.x)[[1]],col=c("orange","blue")[swiss.kmeans$cluster],cex=.5)
title("2-medoids (Euclidean distance)",col.main="red")

print(swiss.kmedoids$medoids)
##         Agriculture Examination Education Catholic Infant.Mortality
## Yverdon        49.5          15         8     6.10             22.5
## Glane          67.8          14         8    97.16             24.9
par(mfrow=c(1,1))
plot(swiss.kmedoids,which=2,main=NA)
title("Silhouette plot (Euclidean distance)",col.main="red")

par(mfrow=c(1,2))
S=rep(0,9)
for (k in 2:10){
swiss.kmedoids=pam(swiss.x,k,metric="manhattan")
S[k]=swiss.kmedoids$silinfo$avg.width
}
plot(2:10,S[2:10],xlab="number of clusters",ylab="Silhouette coefficient",col="blue")
title("Manhattan distance",col.main="red")




swiss.kmedoids=pam(swiss.x,2,metric="manhattan")

plot(swiss.pca$x[,1],swiss.pca$x[,2],xlab="First pricniple component",ylab="Second principle component",type="n")
swiss.medoids=predict(swiss.pca, swiss.kmedoids$medoids)
points(swiss.medoids,pch=19,cex=2,col=c("orange","blue"))
text(swiss.pca$x[,1],swiss.pca$x[,2],dimnames(swiss.x)[[1]],col=c("orange","blue")[swiss.kmeans$cluster],cex=.5)
title("2-medoids (Manhattan distance)",col.main="red")

print(swiss.kmedoids$medoids)
##            Agriculture Examination Education Catholic Infant.Mortality
## Neuveville        43.5          17        15     5.16             20.6
## Monthey           64.9           7         3    98.22             20.2
par(mfrow=c(1,1))
plot(swiss.kmedoids,which=2,main=NA)
title("Silhouette plot (Manhattan distance)",col.main="red")

Agglomerative Clustering (Euclidean distance)

library(cluster)
swiss.agnes=agnes(swiss.x,method="single",metric="euclidean")
pltree(swiss.agnes,col.main="red")

cat("\nAgglomerative coefficient=",swiss.agnes$ac,"\n")
## 
## Agglomerative coefficient= 0.7620692
swiss.agnes=agnes(swiss.x,method="complete",metric="euclidean")
pltree(swiss.agnes,col.main="red")

cat("\nAgglomerative coefficient=",swiss.agnes$ac,"\n")
## 
## Agglomerative coefficient= 0.9085089
swiss.agnes=agnes(swiss.x,method="average",metric="euclidean")
pltree(swiss.agnes,col.main="red")

cat("\nAgglomerative coefficient=",swiss.agnes$ac,"\n")
## 
## Agglomerative coefficient= 0.8774795
swiss.agnes=agnes(swiss.x,method="average",metric="manhattan")
pltree(swiss.agnes,col.main="red")

cat("\nAgglomerative coefficient=",swiss.agnes$ac,"\n")
## 
## Agglomerative coefficient= 0.8539078

Divisive Clustering (Euclidean distance)

library(cluster)
swiss.diana=diana(swiss.x)
pltree(swiss.diana,col.main="red")

cat("\nDivisive coefficient=",swiss.agnes$ac,"\n")
## 
## Divisive coefficient= 0.8539078