1 Exercise 1 : Partition and matrix
Consider the Iris data set. Write a R code wich produces the partition matrix. Compute the gravity centers of the quantitative variables in the three classes using a matrix formula.
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## âś” ggplot2 3.3.6 âś” purrr 0.3.4
## âś” tibble 3.1.7 âś” dplyr 1.0.10
## âś” tidyr 1.2.0 âś” stringr 1.4.0
## âś” readr 2.1.2 âś” forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## âś– dplyr::filter() masks stats::filter()
## âś– dplyr::lag() masks stats::lag()
data(iris)
data(iris)
<-iris[,1:4]
Xlibrary(nnet)
<-class.ind(iris$Species) # Matrice partition
Ct((t(X)%*%C))/diag(t(C)%*%C)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## setosa 5.006 3.428 1.462 0.246
## versicolor 5.936 2.770 4.260 1.326
## virginica 6.588 2.974 5.552 2.026
image(C)
<- iris %>% select(c(-Species,-Sepal.Length,-Sepal.Width)) %>% kmeans(3,nstart = 10)
kmeans.res <-as.factor(kmeans.res$cluster)
cluster<- as.data.frame(kmeans.res$centers) centers
library(ggplot2)
ggplot(iris, aes(x=Petal.Length, y=Petal.Width, color=cluster)) + geom_point() +
geom_point(data=centers, color='coral',size=4,pch=21)+ geom_point(data=centers, color='coral',size=50,alpha=0.2)
# Exercise 2 : The bell number
- Show that the number of partition of n objects verifies
\[ B_{n+1} = \sum_{k=0}^n C^n_kB_k \, , \\ B_0 = 1 \] 2. Compute manually the bell number for 1,2,3,4,5,6 objects.
1, 2, 5, 15, 52, 203
- Write a R program which computes the Bell number for n objects.
<- function(n){
bell_number if(n<0){stop("n must be an non-negative integer")}
if(n<=1){return(1)}
<- 0
res for(k in 0:(n-1)){
<- res + choose(n =n-1 ,k=k) * bell_number(k)
res
}return(res)
}
bell_number(17)
## [1] 82864869804
2 Exercise 3 : Between-Within Variance relation
Consider \(n\) points from \(\mathbb{R}^p\) with a partition into \(K\) classes of size \(n_1,...,n_k\). Let us note \(\hat\mu_k\) the gravity center of class \(k\) and \(\hat\mu\) the gravity center of the entire cloud of points. Show that
\[ \sum_k\sum_{i\in k} \| x_i- \hat\mu_k \|^2 + \sum_k n_k\| \hat\mu_k - \hat\mu \|^2 = \sum_i \|x_i - \hat\mu \|^2 \]
3 Exercise 4 Clustering of the crabs (library MASS)
- Load the crabs dataset form library MASS.
library(MASS)
##
## Attachement du package : 'MASS'
## L'objet suivant est masqué depuis 'package:dplyr':
##
## select
data(crabs)
- Plot the dateset using pairs() with a color for each specy and a different symbol per sex.
pairs(crabs[,4:8], col=as.numeric(crabs$sp),pch=as.numeric(crabs$sex)+16)
pairs(crabs[,4:8], col=c("Yellow","Red")[crabs$sp],pch=as.numeric(crabs$sex)+16)
- Cluster the dataset reduced to its quantitative variables into four cluster using the kmeans.
<- crabs[,4:8] %>% kmeans(4,nstart = 1)
Kmeans.res Kmeans.res
## K-means clustering with 4 clusters of sizes 58, 67, 31, 44
##
## Cluster means:
## FL RW CL CW BD
## 1 13.60690 11.484483 28.10172 31.92241 12.160345
## 2 16.68060 13.607463 34.45224 39.20746 15.032836
## 3 10.35161 8.829032 21.00323 24.04839 8.851613
## 4 20.20227 15.822727 41.63182 46.79545 18.618182
##
## Clustering vector:
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
## 3 3 3 3 3 3 3 3 3 1 1 1 1 1 1 1 1 1 1 1
## 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
## 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
## 4 4 4 4 4 4 4 4 4 4 3 3 3 3 3 3 3 3 3 3
## 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80
## 3 3 1 1 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100
## 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 4 4 4
## 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120
## 3 3 3 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140
## 1 1 1 2 1 2 2 2 2 2 2 2 4 2 2 2 2 4 4 4
## 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160
## 4 4 4 4 4 4 4 4 4 4 3 3 3 1 1 1 1 1 1 1
## 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180
## 1 2 2 1 1 2 2 2 2 2 2 2 2 2 2 4 2 2 2 2
## 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200
## 2 2 2 2 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
##
## Within cluster sum of squares by cluster:
## [1] 602.7828 894.7322 561.9458 996.2270
## (between_SS / total_SS = 89.3 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
<-matrix(1:4,2,2)
TrueClassescolnames(TrueClasses)<-levels(crabs$sex)
rownames(TrueClasses)<-levels(crabs$sp)
=diag(TrueClasses[crabs$sex,crabs$sp]) TrueClasses
table(Kmeans.res$cluster,TrueClasses)
## TrueClasses
## 1 2 3 4
## 1 18 12 10 18
## 2 16 19 20 12
## 3 13 9 3 6
## 4 3 10 17 14
- Run the algorithm with 1000 different initialization and keep track of the within sum of squares.
= rep(0,1000)
WSS for(i in 1:1000){
= (crabs[,4:8] %>% kmeans(4,nstart = 1))$tot.withinss
WSS[i]
}plot(WSS)
summary(WSS)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3041 3041 3041 3052 3072 3072
hist(WSS,breaks=50,freq=T)
- Comment the result.
- Divide all quantitative variable by the most correlated variable to produce a new dataset.
<- crabs[,-c(1,2,3)]
crabs2 <-(crabs2/crabs[,6])[,-3]
crabs2colnames(crabs2) <- c("FL / CL", "RW / CL", "CW / CL", "BD / CL")
head(crabs2)
## FL / CL RW / CL CW / CL BD / CL
## 1 0.5031056 0.4161491 1.180124 0.4347826
## 2 0.4861878 0.4254144 1.149171 0.4088398
## 3 0.4842105 0.4105263 1.178947 0.4052632
## 4 0.4776119 0.3930348 1.149254 0.4079602
## 5 0.4827586 0.3940887 1.133005 0.4039409
## 6 0.4695652 0.3913043 1.152174 0.4260870
pairs(crabs2)
- Compare the partitions obtained using the kmeans with the ’natural’ partition. Comment.
<- crabs2 %>% kmeans(4,nstart = 1)
Kmeans.res2 Kmeans.res2
## K-means clustering with 4 clusters of sizes 40, 44, 56, 60
##
## Cluster means:
## FL / CL RW / CL CW / CL BD / CL
## 1 0.4621970 0.3611123 1.148132 0.4169413
## 2 0.4928634 0.3629645 1.103291 0.4552271
## 3 0.5090559 0.4272405 1.125901 0.4509544
## 4 0.4743481 0.4291743 1.161222 0.4177354
##
## Clustering vector:
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
## 4 4 4 1 1 1 4 1 4 4 1 4 1 1 4 4 1 1 4 1
## 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
## 1 1 1 1 1 1 1 1 1 1 4 4 4 4 4 4 4 4 4 4
## 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80
## 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
## 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100
## 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
## 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120
## 3 3 3 2 3 2 2 3 2 3 2 2 2 2 2 2 2 2 2 2
## 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140
## 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160
## 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3
## 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180
## 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200
## 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
##
## Within cluster sum of squares by cluster:
## [1] 0.02767843 0.02670083 0.04285487 0.04930253
## (between_SS / total_SS = 74.8 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
table("Prediction" = Kmeans.res2$cluster,"True label"=TrueClasses)
## True label
## Prediction 1 2 3 4
## 1 0 40 0 0
## 2 0 0 0 44
## 3 0 0 50 6
## 4 50 10 0 0
pairs(crabs2,col=TrueClasses)
pairs(crabs2,col=Kmeans.res2$cluster)
<-kmeans(crabs2,4)
res# repr?sentons les donn?es dans le plan 2, 4 avec densit? et centres
<-kde2d(crabs2[,2],crabs2[,4])
zcontour(z)
# placons les points
points(crabs2[,c(2,4)],col=c("blue","orange")[crabs$sp],pch=c(20,21)[crabs$sex])
# placons les centres des classes
points(res$center[,c(2,4)],cex=3,pch=21,bg="red")
- Try to cluster the data in 1 to 20 groups. Plot the within sum of squares in function of the number of clusters. Comment the figure.
<-rep(0,10)
WSSkclusterfor (k in 1:10) {
<- Inf
WSSmax #On fait 10 initialisation différentes
for (i in 1:10) {
<-kmeans(crabs2,k)
resif (res$tot.withinss<WSSmax)
{<-res
partition<-res$tot.withinss
WSSmax
}
}<- partition$tot.withinss
WSSkcluster[k] }
plot(WSSkcluster,xlab="Number of Clusters",ylab="Total Within Sum Squared")
lines(WSSkcluster,col="red")