This homework is to be sent by mail to aude.sportisse@inria.fr before ??? January 2023.

The first part consists in comparing imputation methods on a data set.
The second part is the implementation of the EM algorithm for a logistic regression with missing data in the outcome variable $y$.

Comparison of imputation methods

The goal is to compare imputation methods on the SBS5242 dataset containing missing values, which is a synthetic subset (but realistic) of the Austrian structural business statistics data.

library(VIM)
data(SBS5242)
XNA <- SBS5242
head(XNA)

##            PW       BWS_F      Umsatz         PERSA       BEZWD     BEZWDVK
## 1 43888.85242 35081.07414 3737.974701 11815.3610831 37577.19455 379.0521485
## 2  1449.40170  1239.13170  120.099200     5.9457950  1251.43080   8.6653528
## 3  1009.61541   944.93722   79.832300   163.2859277   857.75827   4.2969994
## 4  1218.17072   942.29390  104.851729     1.9011026  1063.73052  20.4941952
## 5   517.36342   450.46195   41.326691    18.0060038   448.09227   2.0590045
## 6    67.19103    44.91809    5.003297    -0.6017785    62.71034  -0.2673511
##       BESCH         USB      ISACH
## 1 310.09482 30.48368699 6575.33408
## 2  18.47358 -0.41414050   58.02441
## 3  24.73831  0.35008017   12.33191
## 4  30.44422 -0.37563076  513.91041
## 5  20.18175 -0.09648481   21.44578
## 6  13.85199 -0.68410116   25.90378

Visualize the missing values with the function aggr of the VIM package and compute the global percentage of missing values in the dataset.

summary(XNA)

##        PW              BWS_F              Umsatz              PERSA          
##  Min.   :   21.5   Min.   :   10.22   Min.   :    5.003   Min.   :    -0.60  
##  1st Qu.:  365.1   1st Qu.:  171.30   1st Qu.:  306.267   1st Qu.:     3.21  
##  Median :  985.1   Median :  503.18   Median :  801.245   Median :   105.09  
##  Mean   : 2515.4   Mean   : 1406.27   Mean   : 2100.978   Mean   :  3147.40  
##  3rd Qu.: 2691.6   3rd Qu.: 1309.43   3rd Qu.: 2548.480   3rd Qu.:  1002.91  
##  Max.   :43888.8   Max.   :35081.07   Max.   :23558.504   Max.   :127175.91  
##  NA's   :5         NA's   :5          NA's   :5           NA's   :5          
##      BEZWD             BEZWDVK             BESCH               USB          
##  Min.   :   10.43   Min.   : -0.6602   Min.   : -0.1676   Min.   : -0.6841  
##  1st Qu.:  192.40   1st Qu.:  0.0000   1st Qu.:  3.9790   1st Qu.:  0.5444  
##  Median :  517.46   Median :  5.5174   Median :  9.4356   Median :  4.7794  
##  Mean   : 1453.76   Mean   : 18.1511   Mean   : 17.6972   Mean   : 12.0593  
##  3rd Qu.: 1417.21   3rd Qu.: 20.4039   3rd Qu.: 20.1053   3rd Qu.: 18.0577  
##  Max.   :37577.19   Max.   :379.0521   Max.   :310.0948   Max.   :105.3674  
##  NA's   :5          NA's   :5          NA's   :5          NA's   :5         
##      ISACH         
##  Min.   :  -0.925  
##  1st Qu.:   3.753  
##  Median :  31.026  
##  Mean   : 191.003  
##  3rd Qu.: 127.189  
##  Max.   :6575.334  
##  NA's   :5

aggr(XNA)

countNA(XNA)/sum(dim(XNA))

## [1] 0.1660517

Introduce 30% synthetic missing values in the dataset. You can use the function synthetic_MCAR or create your own function. Explain why the synthetic missing values introduced as such are MCAR.

###This function takes as input 
# XNA: a dataset containing missing values
# percNA: the percentage of missing values we want to introduce in addition.
###This functions returns a list containing 
# XNA_new: a dataset containing the old and the new added missing values,
# mask: a vector with the indexes of the new added missing values

synthetic_MCAR <- function(XNA,percNA){
  trueNA <- which(is.na(XNA))
  nbNA <- round(sum(dim(XNA))*percNA)
  syntheticNA <- sample(setdiff(1:sum(dim(XNA)),trueNA),nbNA,replace=FALSE)
  XNA_new <- XNA
  XNA_new[syntheticNA] <- NA
  return(list(XNA_new=XNA_new,mask=syntheticNA))
}

res_addNA <- synthetic_MCAR(XNA,0.3)
XNA_addNA <- res_addNA$XNA_new
mask_addNA <- res_addNA$mask
head(XNA_addNA)

##            PW       BWS_F      Umsatz         PERSA       BEZWD     BEZWDVK
## 1 43888.85242 35081.07414 3737.974701 11815.3610831 37577.19455 379.0521485
## 2          NA  1239.13170  120.099200     5.9457950  1251.43080   8.6653528
## 3  1009.61541   944.93722   79.832300   163.2859277   857.75827   4.2969994
## 4  1218.17072   942.29390  104.851729     1.9011026  1063.73052  20.4941952
## 5   517.36342   450.46195   41.326691    18.0060038   448.09227   2.0590045
## 6    67.19103    44.91809    5.003297    -0.6017785    62.71034  -0.2673511
##       BESCH         USB      ISACH
## 1 310.09482 30.48368699 6575.33408
## 2  18.47358 -0.41414050   58.02441
## 3  24.73831  0.35008017   12.33191
## 4  30.44422 -0.37563076  513.91041
## 5  20.18175 -0.09648481   21.44578
## 6  13.85199 -0.68410116   25.90378

Compare three imputation methods (mean imputation, another single imputation, multiple imputation) by computing the MSE for the synthetic missing values.

ImputeMean <- function(tab){
  m <- apply(tab, 2, mean, na.rm = TRUE)
  tab <- sapply(1:ncol(tab), function(x) ifelse(is.na(tab[,x]), m[x], tab[,x]))
  tab <- as.data.frame(tab)
  return(tab)
}

Ximp.mean <- ImputeMean(XNA_addNA)

library(softImpute)
sft <- softImpute(x = XNA_addNA)
Ximp.sft <- sft$u %*% diag(sft$d) %*% t(sft$v) # compute the factorization
Ximp.sft[which(!is.na(XNA_addNA))] <- Ximp.sft[which(!is.na(XNA_addNA))] # replace missing values by computed values

library(mice)
Ximp.mice <- mice(XNA_addNA)

nb_imputeddataset <- 5
mice_mice <- mice(data = matrix(XNA_addNA,nrow=dim(XNA_addNA)[1],ncol=dim(XNA_addNA)[2]), m = nb_imputeddataset)
IMP <- 0
for (i in 1:nb_imputeddataset) { IMP <- IMP + mice::complete(mice_mice, i)}
Ximp.mice  <-  IMP/nb_imputeddataset  #5 is the default number of multiple imputations

MSE <- function(X1, X2){ return(norm(as.matrix(X1 - X2),type="F")/norm(as.matrix(X2),type="F"))}

MSE(as.matrix(Ximp.mean)[mask_addNA],XNA[mask_addNA])

## [1] 0.808731

MSE(as.matrix(Ximp.mice)[mask_addNA],XNA[mask_addNA])

## [1] 0.1011718

MSE(as.matrix(Ximp.sft)[mask_addNA],XNA[mask_addNA])

## [1] 0.03062457

Repeat the procedure several times (introduction of synthetic missing values, imputation, computation of MSE) and show the results, e.g. with boxplot.

EM algorithm for logistic regression

Let us consider an i.i.d. sample $(y_i,X_i)_{i=1,\dots,n}$.

$y_i\in \{0,1\}$ is the outcome variable, $X_i \in \mathbb{R}^d$ are the covariates, $\beta \in \mathbb{R}^d$ is the regression parameter.

We assume the logistic regression: \[\mathbb{P}(y_i=1|X_i;\beta)=\frac{1}{1+\exp(-X_i^T\beta)}\] In the sequel, we consider that $y$ contains some MCAR values and we derive an EM algorithm to estimate $\beta$.

Write the observed log-likelihood. Note that in this case, we can maximize it with numerical methods (for example using the R function glm and argument family=binomial).

Denoting $r=1$ if $y_i$ is observed, and $r=0$ otherwise, we have:

\[\ell_\mathrm{obs}(\beta;y,X)=\sum_{i=1}^n r_i \log p(y_i|x_i) + \sum_{i=1}^n (1-r_i) \log \sum_y p(y|x_i)=\sum_{i=1}^n r_i \log p(y_i|x_i) \]

Although maximizing the observed likelihood can be done easily, we will also derive an EM algorithm to maximize it. Write the full log-likelihood.

\[\ell_\mathrm{full}(\beta;y,X)=\sum_{i=1}^n \log p(y_i|x_i)\]

Of course, with missing values in $y$, we do not have access to this quantity!

Show that the E-step can be write as follows

\[Q(\beta;\beta^{(r)})=\left\{ \begin{array}{ll} \sum_{y_i\in \{0,1\}} p(y_i|x_i;\beta^{(r)}) \log p(y_i|x_i;\beta) & \mbox{if $y_i$ is missing} \\ \log p(y_i|x_i;\beta) & \mbox{otherwise.} \end{array} \right.\]

Code the EM algorithm.

Hint 1: remark that $Q(\beta;\beta^{(r)})$ can be seen as a weighted complete data log-likelihood based on $N=\sum_{i=1}^n n_i$, where $n_i=2$ if $y_i$ is missing and $n_i=1$ otherwise. The weights are $\omega_{y_i}=p(y_i|x_i;\beta^{(r)})$ if $y_i$ is missing and $\omega_{y_i}=1$ if $y_i$ is observed.

Hint 2: For the M-step, you can simply use the function glm with the argument weights.

Apply the EM algorithm for the synthetic data $(y_\mathrm{NA},X)$ defined below. Compare the following estimators for $\beta$ by computing the MSE: (i) without missing values, (ii) with missing values by using only the rows which do not contain missing values, (iii) with missing values by using the EM algorithm. Note that in (i) and (ii), you just have to use the function glm. Here, you can consider that the intercept is null. What do we notice for estimators (ii) and (iii) ?

library(mvtnorm)

d <- 3
beta_true <- c(0.1,0.5,0.9)

n <- 1000
mu <- rep(0,d)
Sigma <- diag(1,d)+matrix(0.5,d,d)

X <- rmvnorm(n, mean=mu, sigma=Sigma) #multivariate Gaussian variable
logit_link <- 1/(1+exp(-X%*%beta_true))
y <- (runif(n)<logit_link)*1


#### Introduction of MCAR values

nb_missingvalues <- 0.2*n*d
missing_idx <- sample(1:n,nb_missingvalues)
yna <- y
yna[missing_idx] <- NA

e_step <- function(X,y,beta){
  
  weights <- c()
  y_imp <- c()
  X_imp <- matrix(0, nrow = 0, ncol = length(beta))
  
  log_likelihood <- 0
  
  for (i in 1:n){
    
    weight_y0 <- 1/(1+exp(sum(X[i,]*beta)))
    weight_y1 <- 1 - weight_y0 
    
    if (is.na(y[i])){
      weights <- c(weights,weight_y0,weight_y1)
      y_imp <- c(y_imp,0,1)
      X_imp <- rbind(X_imp,X[i,],X[i,])
      
      log_likelihood <- log_likelihood #+ log(weight_y1+weight_y0)
      
    }else{
      
      weights <- c(weights,1)
      y_imp <- c(y_imp,y[i])
      X_imp <- rbind(X_imp,X[i,])
      log_likelihood <- log_likelihood + log(weight_y1)*y[i] + log(weight_y0)*(1-y[i])
    }
    
  }

  return(list(weights=weights,y_imp=y_imp,X_imp=X_imp,log_obs=log_likelihood))
}

res_full <- glm(y~X-1,family = binomial)
sqrt(sum((res_full$coefficients-beta_true)**2))

## [1] 0.1107048

res_cc <- glm(yna[-missing_idx]~X[-missing_idx,]-1,family = binomial)
beta_cc <- res_cc$coefficients 
sqrt(sum((beta_cc-beta_true)**2))

## [1] 0.117672

beta <- rep(-10,d)
error <- c()
log_likelihood <- c()
for (i in 1:50){
  error <- c(error,sqrt(sum((beta-beta_true)**2)))
  res = e_step(X,yna,beta)
  m_step <- glm(res$y_imp~res$X_imp-1,family = binomial(link="logit"),weights=res$weights) 
  log_likelihood <- c(log_likelihood,res$log_obs)
  beta <- m_step$coefficients
}
plot(error)

plot(log_likelihood)

Apply the EM algorithm for the cancer prostate dataset and compare the different estimators for $\beta$. Here, consider that the intercept is not null.

set.seed(1)

canpros <- read.table(file = 'cancerprostate.csv',header = T, sep = ";")
head(canpros)

##   age acide rayonx taille grade Y   log.acid
## 1  66  0.48      0      0     0 0 -0.7339692
## 2  68  0.56      0      0     0 0 -0.5798185
## 3  66  0.50      0      0     0 0 -0.6931472
## 4  56  0.52      0      0     0 0 -0.6539265
## 5  58  0.50      0      0     0 0 -0.6931472
## 6  60  0.49      0      0     0 0 -0.7133499

quanti_var <- c(1,2,6,7)
canpros <- canpros[,quanti_var] # we use only the quantitative variables.

n <- dim(canpros)[1]
y <- canpros$Y
X <- cbind(rep(1,n),canpros$age,canpros$acide,canpros$log.acid)
d <- dim(X)[2]

#### Introduction of MCAR values

nb_missingvalues <- 0.2*n*d
missing_idx <- sample(1:n,nb_missingvalues)

yna <- y
yna[missing_idx] <- NA

model_full <- glm(formula = Y ~ age + acide + log.acid, data = canpros, family = binomial)
summary(model_full)

## 
## Call:
## glm(formula = Y ~ age + acide + log.acid, family = binomial, 
##     data = canpros)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.5293  -0.8574  -0.5544   1.0427   2.0111  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)  
## (Intercept) 12.34700    6.00871   2.055   0.0399 *
## age         -0.02805    0.05188  -0.541   0.5887  
## acide       -9.96499    5.75020  -1.733   0.0831 .
## log.acid    10.54332    4.85855   2.170   0.0300 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 70.252  on 52  degrees of freedom
## Residual deviance: 59.948  on 49  degrees of freedom
## AIC: 67.948
## 
## Number of Fisher Scoring iterations: 4

model_cc <- glm(formula = yna ~ age + acide + log.acid, data = canpros, family = binomial)
summary(model_cc)

## 
## Call:
## glm(formula = yna ~ age + acide + log.acid, family = binomial, 
##     data = canpros)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.2070  -0.6990  -0.6613   0.3420   1.8610  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)
## (Intercept) -45.33700   63.16758  -0.718    0.473
## age           0.03567    0.12279   0.291    0.771
## acide        46.36343   67.87195   0.683    0.495
## log.acid    -27.20887   44.65008  -0.609    0.542
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 14.421  on 10  degrees of freedom
## Residual deviance: 10.560  on  7  degrees of freedom
##   (42 observations deleted due to missingness)
## AIC: 18.56
## 
## Number of Fisher Scoring iterations: 6

model_cc$coefficients

## (Intercept)         age       acide    log.acid 
## -45.3370031   0.0356729  46.3634266 -27.2088674

beta <- rep(0,d)
log_likelihood <- c()
for (i in 1:50){
  res = e_step(X,yna,beta)
  m_step <- glm(res$y_imp~res$X_imp-1,family = binomial(link="logit"),weights=res$weights) 
  log_likelihood <- c(log_likelihood,res$log_obs)
  beta <- m_step$coefficients
}
summary(m_step)

## 
## Call:
## glm(formula = res$y_imp ~ res$X_imp - 1, family = binomial(link = "logit"), 
##     weights = res$weights)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.20778  -0.66933  -0.01716   0.81753   1.85206  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)  
## res$X_imp1 -44.05095   19.82127  -2.222   0.0263 *
## res$X_imp2   0.03585    0.05962   0.601   0.5477  
## res$X_imp3  45.01095   21.64264   2.080   0.0375 *
## res$X_imp4 -26.29170   13.71639  -1.917   0.0553 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 73.474  on 95  degrees of freedom
## Residual deviance: 54.010  on 91  degrees of freedom
## AIC: 44.188
## 
## Number of Fisher Scoring iterations: 7

Briefly discuss the link with the semi-supervised learning.

Homework 2: Handling missing values

Model Based Statistical Learning

12/7/2022

Comparison of imputation methods

EM algorithm for logistic regression