1 Exercice 1 : IQ

Knowing that IQ is a normal measure of mean 100 and standard deviation 15, what is the probability of having an IQ more than 120? less than 100?

pnorm(120, mean = 100, sd = 15, lower.tail = F, log.p = FALSE)
## [1] 0.09121122

ou après quelques calculs :

1 - pnorm(4/3)
## [1] 0.09121122

Visualisation :

library(ggplot2)
QI.sup.120<-function(x){ ifelse(x>120,dnorm(x,mean=100,sd=15),NA) }

ggplot(data.frame(x=c(20, 180)),aes(x)) +  stat_function(fun = dnorm,args = list(mean=100,sd=15)) + stat_function(fun =QI.sup.120 , geom = "area", fill = "coral", alpha = 0.3) + geom_text(x = 127, y = 0.003, size = 4, fontface = "bold", label = paste0(round(pnorm(120,mean=100,sd=15,lower.tail = F),2))) + scale_x_continuous(breaks = c(80,100,120,130)) + geom_vline(xintercept=120,colour="coral")

2 Exercice 2 : Bias of the maximum likelihood estimator of the variance

Show that the maximum likelihood estimator of the variance is biased and propose an unbiased estimator.

Demo :

\[ \mathbb{E}[ \, \hat{\sigma}^2 \, ] = \mathbb{E}[\, \dfrac{1}{n} \sum_{i=1}^n (X_i - \bar X)^2] \\ = \mathbb{E}[\, \dfrac{1}{n} \sum_{i=1}^n X_i^2 - \bar{X}^2]\\ = \sigma^2 + \mu^2 - \dfrac{\sigma^2}{n} - \mu^2 \]

3 Exercice 3 : Extreme values

Consider the Fisher irises. Find flowers whose measured widths and lengths are exceptionally large or small.

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ tibble  3.1.7      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.0      ✔ stringr 1.4.0 
## ✔ readr   2.1.2      ✔ forcats 0.5.1 
## ✔ purrr   0.3.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
data(iris)
head(iris)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa
parameters <-
  iris %>%
  select(-"Species") %>%
  gather(factor_key = TRUE)  %>%
  group_by(key) %>%
  summarise(mean= mean(value), sd= sd(value)) %>%
  mutate(min=mean - 2*sd,max=mean + 2*sd)
parameters
## # A tibble: 4 × 5
##   key           mean    sd    min   max
##   <fct>        <dbl> <dbl>  <dbl> <dbl>
## 1 Sepal.Length  5.84 0.828  4.19   7.50
## 2 Sepal.Width   3.06 0.436  2.19   3.93
## 3 Petal.Length  3.76 1.77   0.227  7.29
## 4 Petal.Width   1.20 0.762 -0.325  2.72
#flower.outliers  <-(apply( X=t((t(iris[,1:4]) < parameters$min) + (t(iris[,1:4]) > parameters$max)),MARGIN = 1,FUN = function(x) if(x) return(1) else(0)))

flower.outliers  <- t((t(iris[,1:4]) < parameters$min) + (t(iris[,1:4]) > parameters$max))
flower.outliers <- rowSums(flower.outliers)

ggplot(iris,aes(x=Sepal.Length,y=Sepal.Width))+
  geom_point(colour=as.numeric(iris$Species),size= flower.outliers*2 + 1 )

4 Exercice 4 : Equiprobability Ellipses

Generate 1000 observations of a two-dimensional normal distribution \(\mathcal{N}(\mu, \Sigma)\)

\[ \mu = \left(\begin{array}{c} 0 \\ 0 \end{array}\right) \\ \Sigma = \left(\begin{array}{cc} 2 & 1\\ 1 & 0.75 \end{array}\right) \]

After, draw the ellipses of equiprobability of the multiples of 5%.

#par(mfrow=c(1,3)) # partage l'affichage en 2 
sigma<-matrix(c(2,1,1,0.75),2,2)
mu <- c(0,0)

cholesky_sigma =chol(sigma)
t(chol(sigma)) %*% chol(sigma) 
##      [,1] [,2]
## [1,]    2 1.00
## [2,]    1 0.75
#Y<- t(t(chol(sigma)) %*% t(matrix(rnorm(2000),1000,2)) + mu)

Y<- matrix(rnorm(2000),1000,2) %*% chol(sigma) + mu
plot(Y,xlab="x",ylab="y",pch='.')

Q<-qchisq(p=seq(0.05,0.95,by=0.1),df=2) 
x<-seq(-4,4,length=100) 
y<-seq(-4,4,length=100) 
sigmainv<-solve(sigma)
a<-sigmainv[1,1]
b<-sigmainv[2,2]
c<-sigmainv[1,2]
z<-outer(x,y,function(x,y) (a*x**2+b*y**2+2*c*x*y)) ## Fonction is t(y) %*% y
image(x,y,z)
contour(x,y,z,col="blue4",levels=Q,labels=seq(from=0.05,to=0.95,by=0.1),add=T)

persp(x,y,1/(2*pi)*det(sigma)**(-1/2)*exp(-0.5*z),col="cornflowerblue",zlab="f(x)")