1 Exercise 1 : Partition and matrix

Consider the Iris data set. Write a R code wich produces the partition matrix. Compute the gravity centers of the quantitative variables in the three classes using a matrix formula.

library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──

## ✔ ggplot2 3.3.6      ✔ purrr   0.3.4 
## ✔ tibble  3.1.7      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.0      ✔ stringr 1.4.0 
## ✔ readr   2.1.2      ✔ forcats 0.5.1

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

data(iris)

data(iris)
X<-iris[,1:4]
library(nnet) 
C<-class.ind(iris$Species) # Matrice partition
t((t(X)%*%C))/diag(t(C)%*%C)

##            Sepal.Length Sepal.Width Petal.Length Petal.Width
## setosa            5.006       3.428        1.462       0.246
## versicolor        5.936       2.770        4.260       1.326
## virginica         6.588       2.974        5.552       2.026

image(C)

kmeans.res <- iris %>% select(c(-Species,-Sepal.Length,-Sepal.Width)) %>% kmeans(3,nstart = 10)
cluster<-as.factor(kmeans.res$cluster) 
centers <- as.data.frame(kmeans.res$centers)

library(ggplot2)
ggplot(iris, aes(x=Petal.Length, y=Petal.Width, color=cluster)) + geom_point() +
geom_point(data=centers, color='coral',size=4,pch=21)+ geom_point(data=centers, color='coral',size=50,alpha=0.2)

# Exercise 2 : The bell number

Show that the number of partition of n objects verifies

\[ B_{n+1} = \sum_{k=0}^n C^n_kB_k \, , \\ B_0 = 1 \] 2. Compute manually the bell number for 1,2,3,4,5,6 objects.

1, 2, 5, 15, 52, 203

Write a R program which computes the Bell number for n objects.

bell_number <- function(n){
  if(n<0){stop("n must be an non-negative integer")}
  if(n<=1){return(1)}
  res <- 0
  for(k in 0:(n-1)){
    res <- res +  choose(n =n-1 ,k=k) * bell_number(k)
  }
  return(res)
}

bell_number(17)

## [1] 82864869804

2 Exercise 3 : Between-Within Variance relation

Consider \(n\) points from \(\mathbb{R}^p\) with a partition into \(K\) classes of size \(n_1,...,n_k\). Let us note \(\hat\mu_k\) the gravity center of class \(k\) and \(\hat\mu\) the gravity center of the entire cloud of points. Show that

\[ \sum_k\sum_{i\in k} \| x_i- \hat\mu_k \|^2 + \sum_k n_k\| \hat\mu_k - \hat\mu \|^2 = \sum_i \|x_i - \hat\mu \|^2 \]

3 Exercise 4 Clustering of the crabs (library MASS)

Load the crabs dataset form library MASS.

library(MASS)

## 
## Attachement du package : 'MASS'

## L'objet suivant est masqué depuis 'package:dplyr':
## 
##     select

data(crabs)

Plot the dateset using pairs() with a color for each specy and a different symbol per sex.

pairs(crabs[,4:8], col=as.numeric(crabs$sp),pch=as.numeric(crabs$sex)+16)

pairs(crabs[,4:8], col=c("Yellow","Red")[crabs$sp],pch=as.numeric(crabs$sex)+16)

Cluster the dataset reduced to its quantitative variables into four cluster using the kmeans.

Kmeans.res <- crabs[,4:8] %>% kmeans(4,nstart = 1)
Kmeans.res

## K-means clustering with 4 clusters of sizes 58, 67, 31, 44
## 
## Cluster means:
##         FL        RW       CL       CW        BD
## 1 13.60690 11.484483 28.10172 31.92241 12.160345
## 2 16.68060 13.607463 34.45224 39.20746 15.032836
## 3 10.35161  8.829032 21.00323 24.04839  8.851613
## 4 20.20227 15.822727 41.63182 46.79545 18.618182
## 
## Clustering vector:
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20 
##   3   3   3   3   3   3   3   3   3   1   1   1   1   1   1   1   1   1   1   1 
##  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37  38  39  40 
##   1   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2 
##  41  42  43  44  45  46  47  48  49  50  51  52  53  54  55  56  57  58  59  60 
##   4   4   4   4   4   4   4   4   4   4   3   3   3   3   3   3   3   3   3   3 
##  61  62  63  64  65  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80 
##   3   3   1   1   3   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
##  81  82  83  84  85  86  87  88  89  90  91  92  93  94  95  96  97  98  99 100 
##   1   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   4   4   4 
## 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 
##   3   3   3   3   3   3   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 
##   1   1   1   2   1   2   2   2   2   2   2   2   4   2   2   2   2   4   4   4 
## 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 
##   4   4   4   4   4   4   4   4   4   4   3   3   3   1   1   1   1   1   1   1 
## 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 
##   1   2   2   1   1   2   2   2   2   2   2   2   2   2   2   4   2   2   2   2 
## 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 
##   2   2   2   2   4   4   4   4   4   4   4   4   4   4   4   4   4   4   4   4 
## 
## Within cluster sum of squares by cluster:
## [1] 602.7828 894.7322 561.9458 996.2270
##  (between_SS / total_SS =  89.3 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

TrueClasses<-matrix(1:4,2,2)
colnames(TrueClasses)<-levels(crabs$sex)
rownames(TrueClasses)<-levels(crabs$sp)
TrueClasses=diag(TrueClasses[crabs$sex,crabs$sp])

table(Kmeans.res$cluster,TrueClasses)

##    TrueClasses
##      1  2  3  4
##   1 18 12 10 18
##   2 16 19 20 12
##   3 13  9  3  6
##   4  3 10 17 14

Run the algorithm with 1000 different initialization and keep track of the within sum of squares.

WSS = rep(0,1000)
for(i in 1:1000){
  WSS[i] = (crabs[,4:8] %>% kmeans(4,nstart = 1))$tot.withinss
}
plot(WSS)

summary(WSS)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    3041    3041    3041    3052    3072    3072

hist(WSS,breaks=50,freq=T)

Comment the result.
Divide all quantitative variable by the most correlated variable to produce a new dataset.

crabs2 <- crabs[,-c(1,2,3)]
crabs2<-(crabs2/crabs[,6])[,-3]
colnames(crabs2) <- c("FL / CL", "RW / CL", "CW / CL", "BD / CL")
head(crabs2)

##     FL / CL   RW / CL  CW / CL   BD / CL
## 1 0.5031056 0.4161491 1.180124 0.4347826
## 2 0.4861878 0.4254144 1.149171 0.4088398
## 3 0.4842105 0.4105263 1.178947 0.4052632
## 4 0.4776119 0.3930348 1.149254 0.4079602
## 5 0.4827586 0.3940887 1.133005 0.4039409
## 6 0.4695652 0.3913043 1.152174 0.4260870

pairs(crabs2)

Compare the partitions obtained using the kmeans with the ’natural’ partition. Comment.

Kmeans.res2 <- crabs2 %>% kmeans(4,nstart = 1)
Kmeans.res2

## K-means clustering with 4 clusters of sizes 40, 44, 56, 60
## 
## Cluster means:
##     FL / CL   RW / CL  CW / CL   BD / CL
## 1 0.4621970 0.3611123 1.148132 0.4169413
## 2 0.4928634 0.3629645 1.103291 0.4552271
## 3 0.5090559 0.4272405 1.125901 0.4509544
## 4 0.4743481 0.4291743 1.161222 0.4177354
## 
## Clustering vector:
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20 
##   4   4   4   1   1   1   4   1   4   4   1   4   1   1   4   4   1   1   4   1 
##  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37  38  39  40 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
##  41  42  43  44  45  46  47  48  49  50  51  52  53  54  55  56  57  58  59  60 
##   1   1   1   1   1   1   1   1   1   1   4   4   4   4   4   4   4   4   4   4 
##  61  62  63  64  65  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80 
##   4   4   4   4   4   4   4   4   4   4   4   4   4   4   4   4   4   4   4   4 
##  81  82  83  84  85  86  87  88  89  90  91  92  93  94  95  96  97  98  99 100 
##   4   4   4   4   4   4   4   4   4   4   4   4   4   4   4   4   4   4   4   4 
## 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 
##   3   3   3   2   3   2   2   3   2   3   2   2   2   2   2   2   2   2   2   2 
## 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 
##   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2 
## 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 
##   2   2   2   2   2   2   2   2   2   2   3   3   3   3   3   3   3   3   3   3 
## 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 
##   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3 
## 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 
##   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3 
## 
## Within cluster sum of squares by cluster:
## [1] 0.02767843 0.02670083 0.04285487 0.04930253
##  (between_SS / total_SS =  74.8 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

table("Prediction" = Kmeans.res2$cluster,"True label"=TrueClasses)

##           True label
## Prediction  1  2  3  4
##          1  0 40  0  0
##          2  0  0  0 44
##          3  0  0 50  6
##          4 50 10  0  0

pairs(crabs2,col=TrueClasses)

pairs(crabs2,col=Kmeans.res2$cluster)

res<-kmeans(crabs2,4)
# repr?sentons les donn?es dans le plan 2, 4 avec densit? et centres
z<-kde2d(crabs2[,2],crabs2[,4])
contour(z)
# placons les points
points(crabs2[,c(2,4)],col=c("blue","orange")[crabs$sp],pch=c(20,21)[crabs$sex])
# placons les centres des classes
points(res$center[,c(2,4)],cex=3,pch=21,bg="red")

Try to cluster the data in 1 to 20 groups. Plot the within sum of squares in function of the number of clusters. Comment the figure.

WSSkcluster<-rep(0,10) 
for (k in 1:10) {
    WSSmax <- Inf
    #On fait 10 initialisation différentes
    for (i in 1:10) {
      res<-kmeans(crabs2,k) 
      if (res$tot.withinss<WSSmax)
      {
        partition<-res 
        WSSmax<-res$tot.withinss
      }
    }
WSSkcluster[k]<- partition$tot.withinss
}

plot(WSSkcluster,xlab="Number of Clusters",ylab="Total Within Sum Squared")
lines(WSSkcluster,col="red")

Data analysis
TD 2 – Clustering

Kylliann De Santiago

Lundi 19 Septembre 2022

1 Exercise 1 : Partition and matrix

2 Exercise 3 : Between-Within Variance relation

3 Exercise 4 Clustering of the crabs (library MASS)

Data analysis TD 2 – Clustering

Kylliann De Santiago

Lundi 19 Septembre 2022

1 Exercise 1 : Partition and matrix

2 Exercise 3 : Between-Within Variance relation

3 Exercise 4 Clustering of the crabs (library MASS)

Data analysis
TD 2 – Clustering