This homework is to be sent by mail to before ??? January 2023.

Comparison of imputation methods

The goal is to compare imputation methods on the SBS5242 dataset containing missing values, which is a synthetic subset (but realistic) of the Austrian structural business statistics data.

library(VIM)
data(SBS5242)
XNA <- SBS5242
head(XNA)
##            PW       BWS_F      Umsatz         PERSA       BEZWD     BEZWDVK
## 1 43888.85242 35081.07414 3737.974701 11815.3610831 37577.19455 379.0521485
## 2  1449.40170  1239.13170  120.099200     5.9457950  1251.43080   8.6653528
## 3  1009.61541   944.93722   79.832300   163.2859277   857.75827   4.2969994
## 4  1218.17072   942.29390  104.851729     1.9011026  1063.73052  20.4941952
## 5   517.36342   450.46195   41.326691    18.0060038   448.09227   2.0590045
## 6    67.19103    44.91809    5.003297    -0.6017785    62.71034  -0.2673511
##       BESCH         USB      ISACH
## 1 310.09482 30.48368699 6575.33408
## 2  18.47358 -0.41414050   58.02441
## 3  24.73831  0.35008017   12.33191
## 4  30.44422 -0.37563076  513.91041
## 5  20.18175 -0.09648481   21.44578
## 6  13.85199 -0.68410116   25.90378
summary(XNA)
##        PW              BWS_F              Umsatz              PERSA          
##  Min.   :   21.5   Min.   :   10.22   Min.   :    5.003   Min.   :    -0.60  
##  1st Qu.:  365.1   1st Qu.:  171.30   1st Qu.:  306.267   1st Qu.:     3.21  
##  Median :  985.1   Median :  503.18   Median :  801.245   Median :   105.09  
##  Mean   : 2515.4   Mean   : 1406.27   Mean   : 2100.978   Mean   :  3147.40  
##  3rd Qu.: 2691.6   3rd Qu.: 1309.43   3rd Qu.: 2548.480   3rd Qu.:  1002.91  
##  Max.   :43888.8   Max.   :35081.07   Max.   :23558.504   Max.   :127175.91  
##  NA's   :5         NA's   :5          NA's   :5           NA's   :5          
##      BEZWD             BEZWDVK             BESCH               USB          
##  Min.   :   10.43   Min.   : -0.6602   Min.   : -0.1676   Min.   : -0.6841  
##  1st Qu.:  192.40   1st Qu.:  0.0000   1st Qu.:  3.9790   1st Qu.:  0.5444  
##  Median :  517.46   Median :  5.5174   Median :  9.4356   Median :  4.7794  
##  Mean   : 1453.76   Mean   : 18.1511   Mean   : 17.6972   Mean   : 12.0593  
##  3rd Qu.: 1417.21   3rd Qu.: 20.4039   3rd Qu.: 20.1053   3rd Qu.: 18.0577  
##  Max.   :37577.19   Max.   :379.0521   Max.   :310.0948   Max.   :105.3674  
##  NA's   :5          NA's   :5          NA's   :5          NA's   :5         
##      ISACH         
##  Min.   :  -0.925  
##  1st Qu.:   3.753  
##  Median :  31.026  
##  Mean   : 191.003  
##  3rd Qu.: 127.189  
##  Max.   :6575.334  
##  NA's   :5
aggr(XNA)

countNA(XNA)/sum(dim(XNA))
## [1] 0.1660517
###This function takes as input 
# XNA: a dataset containing missing values
# percNA: the percentage of missing values we want to introduce in addition.
###This functions returns a list containing 
# XNA_new: a dataset containing the old and the new added missing values,
# mask: a vector with the indexes of the new added missing values

synthetic_MCAR <- function(XNA,percNA){
  trueNA <- which(is.na(XNA))
  nbNA <- round(sum(dim(XNA))*percNA)
  syntheticNA <- sample(setdiff(1:sum(dim(XNA)),trueNA),nbNA,replace=FALSE)
  XNA_new <- XNA
  XNA_new[syntheticNA] <- NA
  return(list(XNA_new=XNA_new,mask=syntheticNA))
}
res_addNA <- synthetic_MCAR(XNA,0.3)
XNA_addNA <- res_addNA$XNA_new
mask_addNA <- res_addNA$mask
head(XNA_addNA)
##            PW       BWS_F      Umsatz         PERSA       BEZWD     BEZWDVK
## 1 43888.85242 35081.07414 3737.974701 11815.3610831 37577.19455 379.0521485
## 2          NA  1239.13170  120.099200     5.9457950  1251.43080   8.6653528
## 3  1009.61541   944.93722   79.832300   163.2859277   857.75827   4.2969994
## 4  1218.17072   942.29390  104.851729     1.9011026  1063.73052  20.4941952
## 5   517.36342   450.46195   41.326691    18.0060038   448.09227   2.0590045
## 6    67.19103    44.91809    5.003297    -0.6017785    62.71034  -0.2673511
##       BESCH         USB      ISACH
## 1 310.09482 30.48368699 6575.33408
## 2  18.47358 -0.41414050   58.02441
## 3  24.73831  0.35008017   12.33191
## 4  30.44422 -0.37563076  513.91041
## 5  20.18175 -0.09648481   21.44578
## 6  13.85199 -0.68410116   25.90378
ImputeMean <- function(tab){
  m <- apply(tab, 2, mean, na.rm = TRUE)
  tab <- sapply(1:ncol(tab), function(x) ifelse(is.na(tab[,x]), m[x], tab[,x]))
  tab <- as.data.frame(tab)
  return(tab)
}

Ximp.mean <- ImputeMean(XNA_addNA)
library(softImpute)
sft <- softImpute(x = XNA_addNA)
Ximp.sft <- sft$u %*% diag(sft$d) %*% t(sft$v) # compute the factorization
Ximp.sft[which(!is.na(XNA_addNA))] <- Ximp.sft[which(!is.na(XNA_addNA))] # replace missing values by computed values
library(mice)
Ximp.mice <- mice(XNA_addNA)

nb_imputeddataset <- 5
mice_mice <- mice(data = matrix(XNA_addNA,nrow=dim(XNA_addNA)[1],ncol=dim(XNA_addNA)[2]), m = nb_imputeddataset)
IMP <- 0
for (i in 1:nb_imputeddataset) { IMP <- IMP + mice::complete(mice_mice, i)}
Ximp.mice  <-  IMP/nb_imputeddataset  #5 is the default number of multiple imputations
MSE <- function(X1, X2){ return(norm(as.matrix(X1 - X2),type="F")/norm(as.matrix(X2),type="F"))}
MSE(as.matrix(Ximp.mean)[mask_addNA],XNA[mask_addNA])
## [1] 0.808731
MSE(as.matrix(Ximp.mice)[mask_addNA],XNA[mask_addNA])
## [1] 0.1011718
MSE(as.matrix(Ximp.sft)[mask_addNA],XNA[mask_addNA])
## [1] 0.03062457

EM algorithm for logistic regression

Let us consider an i.i.d. sample \((y_i,X_i)_{i=1,\dots,n}\).

\(y_i\in \{0,1\}\) is the outcome variable, \(X_i \in \mathbb{R}^d\) are the covariates, \(\beta \in \mathbb{R}^d\) is the regression parameter.

We assume the logistic regression: \[\mathbb{P}(y_i=1|X_i;\beta)=\frac{1}{1+\exp(-X_i^T\beta)}\] In the sequel, we consider that \(y\) contains some MCAR values and we derive an EM algorithm to estimate \(\beta\).

Denoting \(r=1\) if \(y_i\) is observed, and \(r=0\) otherwise, we have:

\[\ell_\mathrm{obs}(\beta;y,X)=\sum_{i=1}^n r_i \log p(y_i|x_i) + \sum_{i=1}^n (1-r_i) \log \sum_y p(y|x_i)=\sum_{i=1}^n r_i \log p(y_i|x_i) \]

\[\ell_\mathrm{full}(\beta;y,X)=\sum_{i=1}^n \log p(y_i|x_i)\]

Of course, with missing values in \(y\), we do not have access to this quantity!

\[Q(\beta;\beta^{(r)})=\left\{ \begin{array}{ll} \sum_{y_i\in \{0,1\}} p(y_i|x_i;\beta^{(r)}) \log p(y_i|x_i;\beta) & \mbox{if $y_i$ is missing} \\ \log p(y_i|x_i;\beta) & \mbox{otherwise.} \end{array} \right.\]

Hint 1: remark that \(Q(\beta;\beta^{(r)})\) can be seen as a weighted complete data log-likelihood based on \(N=\sum_{i=1}^n n_i\), where \(n_i=2\) if \(y_i\) is missing and \(n_i=1\) otherwise. The weights are \(\omega_{y_i}=p(y_i|x_i;\beta^{(r)})\) if \(y_i\) is missing and \(\omega_{y_i}=1\) if \(y_i\) is observed.

Hint 2: For the M-step, you can simply use the function glm with the argument weights.

library(mvtnorm)

d <- 3
beta_true <- c(0.1,0.5,0.9)

n <- 1000
mu <- rep(0,d)
Sigma <- diag(1,d)+matrix(0.5,d,d)

X <- rmvnorm(n, mean=mu, sigma=Sigma) #multivariate Gaussian variable
logit_link <- 1/(1+exp(-X%*%beta_true))
y <- (runif(n)<logit_link)*1


#### Introduction of MCAR values

nb_missingvalues <- 0.2*n*d
missing_idx <- sample(1:n,nb_missingvalues)
yna <- y
yna[missing_idx] <- NA
e_step <- function(X,y,beta){
  
  weights <- c()
  y_imp <- c()
  X_imp <- matrix(0, nrow = 0, ncol = length(beta))
  
  log_likelihood <- 0
  
  for (i in 1:n){
    
    weight_y0 <- 1/(1+exp(sum(X[i,]*beta)))
    weight_y1 <- 1 - weight_y0 
    
    if (is.na(y[i])){
      weights <- c(weights,weight_y0,weight_y1)
      y_imp <- c(y_imp,0,1)
      X_imp <- rbind(X_imp,X[i,],X[i,])
      
      log_likelihood <- log_likelihood #+ log(weight_y1+weight_y0)
      
    }else{
      
      weights <- c(weights,1)
      y_imp <- c(y_imp,y[i])
      X_imp <- rbind(X_imp,X[i,])
      log_likelihood <- log_likelihood + log(weight_y1)*y[i] + log(weight_y0)*(1-y[i])
    }
    
  }

  return(list(weights=weights,y_imp=y_imp,X_imp=X_imp,log_obs=log_likelihood))
}
res_full <- glm(y~X-1,family = binomial)
sqrt(sum((res_full$coefficients-beta_true)**2))
## [1] 0.1107048
res_cc <- glm(yna[-missing_idx]~X[-missing_idx,]-1,family = binomial)
beta_cc <- res_cc$coefficients 
sqrt(sum((beta_cc-beta_true)**2))
## [1] 0.117672
beta <- rep(-10,d)
error <- c()
log_likelihood <- c()
for (i in 1:50){
  error <- c(error,sqrt(sum((beta-beta_true)**2)))
  res = e_step(X,yna,beta)
  m_step <- glm(res$y_imp~res$X_imp-1,family = binomial(link="logit"),weights=res$weights) 
  log_likelihood <- c(log_likelihood,res$log_obs)
  beta <- m_step$coefficients
}
plot(error)

plot(log_likelihood)

set.seed(1)

canpros <- read.table(file = 'cancerprostate.csv',header = T, sep = ";")
head(canpros)
##   age acide rayonx taille grade Y   log.acid
## 1  66  0.48      0      0     0 0 -0.7339692
## 2  68  0.56      0      0     0 0 -0.5798185
## 3  66  0.50      0      0     0 0 -0.6931472
## 4  56  0.52      0      0     0 0 -0.6539265
## 5  58  0.50      0      0     0 0 -0.6931472
## 6  60  0.49      0      0     0 0 -0.7133499
quanti_var <- c(1,2,6,7)
canpros <- canpros[,quanti_var] # we use only the quantitative variables.

n <- dim(canpros)[1]
y <- canpros$Y
X <- cbind(rep(1,n),canpros$age,canpros$acide,canpros$log.acid)
d <- dim(X)[2]

#### Introduction of MCAR values

nb_missingvalues <- 0.2*n*d
missing_idx <- sample(1:n,nb_missingvalues)

yna <- y
yna[missing_idx] <- NA
model_full <- glm(formula = Y ~ age + acide + log.acid, data = canpros, family = binomial)
summary(model_full)
## 
## Call:
## glm(formula = Y ~ age + acide + log.acid, family = binomial, 
##     data = canpros)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.5293  -0.8574  -0.5544   1.0427   2.0111  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)  
## (Intercept) 12.34700    6.00871   2.055   0.0399 *
## age         -0.02805    0.05188  -0.541   0.5887  
## acide       -9.96499    5.75020  -1.733   0.0831 .
## log.acid    10.54332    4.85855   2.170   0.0300 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 70.252  on 52  degrees of freedom
## Residual deviance: 59.948  on 49  degrees of freedom
## AIC: 67.948
## 
## Number of Fisher Scoring iterations: 4
model_cc <- glm(formula = yna ~ age + acide + log.acid, data = canpros, family = binomial)
summary(model_cc)
## 
## Call:
## glm(formula = yna ~ age + acide + log.acid, family = binomial, 
##     data = canpros)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.2070  -0.6990  -0.6613   0.3420   1.8610  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)
## (Intercept) -45.33700   63.16758  -0.718    0.473
## age           0.03567    0.12279   0.291    0.771
## acide        46.36343   67.87195   0.683    0.495
## log.acid    -27.20887   44.65008  -0.609    0.542
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 14.421  on 10  degrees of freedom
## Residual deviance: 10.560  on  7  degrees of freedom
##   (42 observations deleted due to missingness)
## AIC: 18.56
## 
## Number of Fisher Scoring iterations: 6
model_cc$coefficients
## (Intercept)         age       acide    log.acid 
## -45.3370031   0.0356729  46.3634266 -27.2088674
beta <- rep(0,d)
log_likelihood <- c()
for (i in 1:50){
  res = e_step(X,yna,beta)
  m_step <- glm(res$y_imp~res$X_imp-1,family = binomial(link="logit"),weights=res$weights) 
  log_likelihood <- c(log_likelihood,res$log_obs)
  beta <- m_step$coefficients
}
summary(m_step)
## 
## Call:
## glm(formula = res$y_imp ~ res$X_imp - 1, family = binomial(link = "logit"), 
##     weights = res$weights)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.20778  -0.66933  -0.01716   0.81753   1.85206  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)  
## res$X_imp1 -44.05095   19.82127  -2.222   0.0263 *
## res$X_imp2   0.03585    0.05962   0.601   0.5477  
## res$X_imp3  45.01095   21.64264   2.080   0.0375 *
## res$X_imp4 -26.29170   13.71639  -1.917   0.0553 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 73.474  on 95  degrees of freedom
## Residual deviance: 54.010  on 91  degrees of freedom
## AIC: 44.188
## 
## Number of Fisher Scoring iterations: 7