Exercice 1

1

# kernlab: kernel methods (kpca, specc, rbfdot); FactoMineR: classical PCA;
# tidyverse: data manipulation (note the masking conflicts reported below).
library(kernlab)
library(FactoMineR)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ ggplot2 3.3.6      ✔ purrr   0.3.4 
## ✔ tibble  3.1.7      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.0      ✔ stringr 1.4.0 
## ✔ readr   2.1.2      ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ ggplot2::alpha() masks kernlab::alpha()
## ✖ purrr::cross()   masks kernlab::cross()
## ✖ dplyr::filter()  masks stats::filter()
## ✖ dplyr::lag()     masks stats::lag()
# spam dataset shipped with kernlab: quantitative e-mail features plus a
# "type" factor (spam / nonspam) used below as column 58.
data(spam)

2

# Drop the class label and keep only the quantitative predictors.
spam_quant <- spam %>% select(-"type")
# Standardize the variables (note: PCA() also standardizes by default via
# scale.unit = TRUE, so this explicit scale() is belt-and-braces).
spam_quant_norm <- scale(spam_quant)
# graph = FALSE, not F: T/F are ordinary variables and can be reassigned.
res.pca <- PCA(spam_quant_norm, graph = FALSE)
eigvalues <- data.frame(res.pca$eig)
# Scree plot: percentage of variance carried by each principal component.
barplot(eigvalues$percentage.of.variance, names.arg = row.names(eigvalues))

# Coordinates of the individuals on the principal components.
X <- res.pca$ind$coord

3

# First factorial plane; the "type" factor (levels nonspam/spam) indexes the
# colour vector, so nonspam points are blue and spam points are red.
plot(X[,1],X[,2],col = c("Blue","Red")[spam$type])

4

# Kernel PCA is O(N^2) in the kernel matrix, so work on a random subsample.
N <- 100
# NOTE(review): no set.seed() before sample(), so the subset (and the class
# counts printed below) are not reproducible across runs — consider seeding.
spam_subset_idx <- sample(nrow(spam), N)  # sample(n, N) is sample(1:n, N)
# Class balance of the subsample; column 58 is "type", use the name instead
# of the magic index.
table(spam$type[spam_subset_idx])
## 
## nonspam    spam 
##      58      42
# Kernel PCA with a Gaussian (RBF) kernel, small bandwidth sigma = 0.01.
kpc <- kpca(~ ., data = as.data.frame(spam_quant_norm[spam_subset_idx, ]),
            kernel = "rbfdot", kpar = list(sigma = 0.01))
#kpcv <- pcv(kpc)
plot(rotated(kpc)[, 1:2], col = c("Blue", "Red")[spam$type[spam_subset_idx]],
     xlab = "1st Principal Comp.", ylab = "2nd Principal Comp.")

# Share of variance captured by each kernel principal component.
barplot(eig(kpc) / sum(eig(kpc)))

# Same subsample, now with a degree-2 polynomial kernel and only the first
# two kernel principal components extracted (features = 2).
sub_df <- as.data.frame(spam_quant_norm[spam_subset_idx, ])
kpc <- kpca(~ ., data = sub_df, kernel = "polydot",
            kpar = list(degree = 2), features = 2)
kpcv <- pcv(kpc)
class_colour <- c("Blue", "Red")[spam$type[spam_subset_idx]]
plot(rotated(kpc), col = class_colour,
     xlab = "1st Principal Comp.", ylab = "2nd Principal Comp.")

# Proportion of variance per kernel principal component.
barplot(eig(kpc) / sum(eig(kpc)))

# Same experiment with a degree-5 polynomial kernel, all components kept.
kpc <- kpca(~ .,
            data = as.data.frame(spam_quant_norm[spam_subset_idx, ]),
            kernel = "polydot",
            kpar = list(degree = 5))
kpcv <- pcv(kpc)
projections <- rotated(kpc)
plot(projections[, 1:2],
     col = c("Blue", "Red")[spam$type[spam_subset_idx]],
     xlab = "1st Principal Comp.",
     ylab = "2nd Principal Comp.")

# Proportion of variance per kernel principal component.
barplot(eig(kpc) / sum(eig(kpc)))

Votre propre kpca

# Hand-made kernel PCA.
#
# Args:
#   X: numeric matrix (n observations x p variables), coercible via as.matrix.
#   k: number of principal components to keep.
#   kernel: "Gaussian" for the RBF kernel exp(-||xi - xj||^2 / sigma);
#           any other value falls back to the linear kernel X %*% t(X).
#   sigma: bandwidth of the Gaussian kernel.
# Returns:
#   list(Y = n x k matrix of projected points,
#        lambda = first k eigenvalues of the centered kernel matrix).
myKPCA <- function(X, k = 2, kernel = "Gaussian", sigma = 1) {
  X <- as.matrix(X)
  if (kernel == "Gaussian") {
    K <- exp(-(as.matrix(dist(X))^2) / sigma)
  } else {
    K <- X %*% t(X)
  }

  # Double centering in feature space:
  #   Ktilde = K - 1n K - K 1n + 1n K 1n   (1n = n x n matrix of 1/n).
  # BUG FIX: the original used K - 2*1n K + 1n K 1n, which is wrong because
  # 1n %*% K != K %*% 1n in general, and it produces a non-symmetric Ktilde.
  n <- nrow(X)
  II <- matrix(1 / n, n, n)
  Ktilde <- K - II %*% K - K %*% II + II %*% K %*% II

  # Ktilde is symmetric PSD, so eigen() gives the spectrum directly.
  # BUG FIX: the original ran svd() and squared the singular values; for a
  # symmetric PSD matrix the singular values already ARE the eigenvalues,
  # so squaring them reported wrong lambdas.
  res <- eigen(Ktilde, symmetric = TRUE)
  alpha <- res$vectors
  lambda <- res$values

  # Project on the first k components using the CENTERED kernel matrix.
  # BUG FIX: the original projected with the raw K, shifting the scores.
  Y <- Ktilde %*% alpha[, 1:k, drop = FALSE]
  list(Y = Y, lambda = lambda[1:k])
}
k=2
# Run the hand-made Gaussian kernel PCA on the same subsample and colour the
# projected points by class (column 58 of spam is the "type" factor).
my_kpca <- myKPCA(spam_quant_norm[spam_subset_idx,],k=k,sigma=1)
plot(my_kpca$Y,col=c("Blue","Red")[spam[spam_subset_idx,58]],xlab="1st Principal Comp.",ylab="2nd Principal Comp.")

Exercice 2

1

library(mlbench)
# Fix the RNG so the simulated spirals are reproducible.
set.seed(111)
# Two intertwined spirals (100 points, 1 cycle, noise 0.025), scaled by 4.
obj <- mlbench.spirals(100,1,0.025)
my.data <- data.frame(4 * obj$x) 
names(my.data)<-c("X1","X2") 
plot(my.data)

# kernelMatrix() below expects a matrix, not a data.frame.
my.data<-as.matrix(my.data)

2

Compute \(K\), the similarity matrix of this dataset, using the Gaussian kernel.

# Similarity matrix from the Gaussian (RBF) kernel with sigma = 1
# (per the kernlab docs, rbfdot computes exp(-sigma * ||xi - xj||^2)).
rbfkernel <- rbfdot(sigma = 1)
K <- as.matrix(kernelMatrix(rbfkernel,my.data))
# Heatmap of K: nearby points along each spiral are highly similar.
image(K)

3

The next step consists in computing an affinity matrix \(A\) based on \(K\). \(A\) must be made of positive values and be symmetric.

# Build a k-nearest-neighbour affinity matrix from the similarities.
diag(K) <- 0  # ignore self-similarity when ranking neighbours
A <- matrix(0, N, N)
n_neighbours <- 3
for (i in seq_len(N)) {
  # indices of the 3 points most similar to observation i
  idx <- order(K[i, ], decreasing = TRUE)[1:n_neighbours]
  A[i, idx] <- 1
}
# Note: i being among j's neighbours does not imply the converse,
# so this A is generally NOT symmetric (as observed below).
image(A)

Attention : la matrice \(A\) construite par plus proches voisins n’est pas symétrique (être voisin n’est pas une relation symétrique) ; il faudrait la symétriser, par exemple \(A \leftarrow \max(A, A^\top)\).

# Alternative affinity: threshold the similarities at 0.5. Unlike the kNN
# construction above, K > 0.5 is symmetric by construction.
A <- K>0.5
# Remove self-loops (this numeric assignment coerces the logical matrix).
diag(A) = 0
image(A)

# igraph: graph construction and plotting (note the masking messages below).
library(igraph)
## 
## Attachement du package : 'igraph'
## Les objets suivants sont masqués depuis 'package:dplyr':
## 
##     as_data_frame, groups, union
## Les objets suivants sont masqués depuis 'package:purrr':
## 
##     compose, simplify
## L'objet suivant est masqué depuis 'package:tidyr':
## 
##     crossing
## L'objet suivant est masqué depuis 'package:tibble':
## 
##     as_data_frame
## Les objets suivants sont masqués depuis 'package:stats':
## 
##     decompose, spectrum
## L'objet suivant est masqué depuis 'package:base':
## 
##     union
# View the affinity matrix as an undirected graph; the two spirals should
# show up as (mostly) separate connected components.
plot(graph_from_adjacency_matrix(A,mode="undirected"))

4

# Degree matrix D (diagonal of the column sums of A, valid since A is
# symmetric here) and graph Laplacian L = D - A.
D<-diag(colSums(A))
L<-D-A

5

unnormalized graph Laplacian \(U=D-A\)

# Unnormalized graph Laplacian; identical to the L computed above (U = D - A).
U <- D - A

6.

# Spectral embedding: keep the eigenvectors of U attached to the k SMALLEST
# eigenvalues. eigen() returns eigenvalues in decreasing order, so those are
# the last k columns of the eigenvector matrix.
k <- 2
eig_U <- eigen(U, symmetric = TRUE)
n_col <- ncol(eig_U$vectors)
Z <- eig_U$vectors[, (n_col - k + 1):n_col]

Z : matrice des k vecteurs propres associés aux plus petites valeurs propres.

# In the embedded space the points should collapse into two tight groups,
# one per spiral, making them trivially separable by k-means.
plot(Z)

7.

# stats is attached by default in R sessions; this call is a harmless no-op.
library(stats)
# k-means on the spectral embedding; nstart = 5 restarts to avoid a bad
# local minimum of the within-cluster sum of squares.
km <- kmeans(Z, centers=k, nstart=5)
# Colour the ORIGINAL points by the cluster found in the embedding.
plot(my.data, col=km$cluster)

8.

# kernlab's built-in spectral clustering, for comparison with the manual
# pipeline above; estimated labels plotted as crosses over true-class diamonds.
spec_clust = specc(my.data,centers=k)
plot(my.data, col=spec_clust@.Data, pch=4)            # estimated classes (x)
points(my.data, col=obj$classes, pch=5) # true classes (<>)