Examples for the Matrix Minimum Covariance Determinant estimators

library(robustmatrix)
library(ggplot2)
library(dplyr)
library(ggnewscale)
library(ggrepel)
library(tidyr)
library(tibble)
library(forcats)

This file documents how to use the methods introduced by Mayrhofer, Radojičić, and Filzmoser (2024) and provides an overview of the methods of the robustmatrix package. All examples discussed in Mayrhofer, Radojičić, and Filzmoser (2024) can be reproduced using this file. This document aims to illustrate how to use the functions of the robustmatrix package.

The mmcd function implements the FastMMCD algorithm and computes the robust matrix-variate Matrix Minimum Covariance Determinant (MMCD) estimators. For outlier explanation, the matrixShapley function decomposes the squared Mahalanobis distance into outlyingness contributions form a matrix’s rows, columns, or cells.

The real-world data examples discussed here are the mountain weather, darwin, and video data sets. The former two data sets are included in the package and can be loaded using data(weather) and data(darwin), respectively. Related publications and data sources are listed in the help files. The video data set is from the Perception Test Images Sequences from Li et al. (2004) and the preprocessed video is available at https://wis.kuleuven.be/statdatascience/robust/software.

Glacier weather data – Sonnblick observatory

Weather data from Austria’s highest weather station, situated in the Austrian Central Alps on the glaciated mountain “Hoher Sonnblick”, standing at an elevation of 3106 meters above sea level. An array of dimension (p, q, n), comprising n = 136 observations, each represented by a p = 5 times q = 12 dimensional matrix. Observed parameters are monthly averages of

air pressure (AP)
precipitation (P)
sunshine hours (SH)
temperature (T)
proportion of solid precipitation (SP)

from 1891 to 2022.

data(weather)
X <- weather[,,!apply(weather,3, function(x) any(is.na(x)))]
n<- dim(X)[3]
p<- dim(X)[1]
q<- dim(X)[2]

Parameter estimation of mean, rowwise and columnwise covariance matrix using maximum likelihood estimation and robust parameter estimation based on MMCD.

par_MMLE <- mmle(X, lambda = 0)
set.seed(1)
par_MMCD <- mmcd(X, alpha = 0.5 ,lambda = 0, nsamp = 5000, nthreads = 1)

Computation of (robust) Mahalanbosi distances.

MD <- as.numeric(mmd(X = X, 
                     mu = par_MMLE$mu, 
                     cov_row = par_MMLE$cov_row_inv, 
                     cov_col = par_MMLE$cov_col_inv, 
                     inverted = TRUE))
MD_rob <- as.numeric(mmd(X = X, 
                         mu = par_MMCD$mu, 
                         cov_row = par_MMCD$cov_row_inv, 
                         cov_col = par_MMCD$cov_col_inv, 
                         inverted = TRUE))
out_quant <- qchisq(0.95, p*q)
outliers <- which(MD_rob > out_quant)

Distance-distance of yearly outlyingness scores, using the robust procedure, we can unmask several outliers.

names(MD_rob) <- dimnames(X)[[3]]
subs <- MD_rob < out_quant
plt_dd <- ggplot(data = NULL, aes(x = sqrt(MD), y = sqrt(MD_rob))) +
  geom_point() +
  geom_hline(yintercept = sqrt(out_quant)) +
  geom_vline(xintercept = sqrt(out_quant)) +
  geom_label_repel(aes(x = sqrt(MD[!subs]), 
                       y = sqrt(MD_rob[!subs]), 
                       label = names(MD_rob[!subs])), 
                   size = 2.5, max.overlaps = 200, nudge_y = 0.5) +
  theme_classic() +
  xlim(c(min(sqrt(MD)),max(sqrt(MD))+1)) +
  labs(x = "MMD", y = "Robust MMD")
plt_dd

Computation of cellwise Shapley values.

shv_cell <- array(dim = dim(X))
shv_cell[] <- matrixShapley(X, 
                            mu = par_MMCD$mu, 
                            cov_row = par_MMCD$cov_row_inv, 
                            cov_col = par_MMCD$cov_col_inv, 
                            inverted = TRUE, type = "cell")
dimnames(shv_cell) <- dimnames(X)

Preparing data for plots.

long_weather <- function(x, val_to, var_levels = NULL) {
  y <- x %>%
    rownames_to_column(var = "var") %>%
    pivot_longer(cols = -var, names_to = "parameter", values_to = val_to)
  if(!is.null(var_levels)){
    y <- y %>%
      mutate(var = factor(var, levels = var_levels))
  }
  y
}

var_names <- c("T [°C]" = "T",
               "SH [h]" = "SH",
               "P [mm/m²]" = "P",
               "AP [hPa]" = "AP",
               "SP [%]" = "SP")

set <- cbind("var" = 1891:2022,
             "Temperature [°C]" = 0,
             "Preciptiation [mm/m²]" = 0,
             "Solid Preciptiation [%]" = 0,
             "Sunshine hours [h]" = 0,
             "Air pressure [hPa]" = 0)
set[!(set[,1] %in% dimnames(X)[[3]]),-1] <- 1
set <- data.frame(set)
colnames(set) <- c("var",
                   "T [°C]",
                   "P [mm/m²]",
                   "SP [%]",
                   "SH [h]",
                   "AP [hPa]")
set_long <- set %>% pivot_longer(-c(var))

shv_year <- apply(shv_cell, c(1,3),sum)
X_center <- array(dim = dim(X), dimnames = dimnames(X))
X_center[] <- (apply(X,3,function(x) x - par_MMCD$mu))
X_center_sign <- array(dim = dim(X), dimnames = dimnames(X))
X_center_sign[] <- sign(X_center)
sign_mat <- apply(X_center,c(1,3),function(x) sign(mean(x)))
sign_mat[,-outliers] <- 0
shv_rel <- apply(shv_year, 2, function(x) x/sum(x))
shv_rel_outliers <- shv_rel
shv_rel_outliers[,-outliers] <- 0
shv_rel_outliers[shv_rel_outliers < 0] <- 0 # ONLY PLOT POSITIVE SHAPLEY VALUES


shv_years_long <- long_weather(data.frame(t(shv_rel_outliers)), val_to = "shv_rel")
sign_years_long <- long_weather(data.frame(t(sign_mat)), val_to = "sign")
plt_data_years <- inner_join(shv_years_long,sign_years_long) %>% 
  mutate(val = sign*abs(shv_rel), 
         sign = as.character(sign), 
         var = as.numeric(var),
         parameter = fct_recode(factor(parameter),!!! var_names))

shv_long_1895 <- long_weather(data.frame(t(shv_cell[,,outliers[1]])), val_to = "shv", var_levels = dimnames(X)[[2]])
X_long_1895 <- long_weather(data.frame(t(X[,,outliers[1]])) %>% mutate(SP = SP*100), val_to = "x", var_levels = dimnames(X)[[2]])
sign_long_1895 <- long_weather(data.frame(t(sign(X[,,outliers[1]] - par_MMCD$mu))), val_to = "sign", var_levels = dimnames(X)[[2]])
plt_data_1895 <- inner_join(shv_long_1895,sign_long_1895) %>% 
  mutate(val = sign*abs(shv), 
         sign = as.character(sign)) %>%
  inner_join(X_long_1895) %>%
  mutate(parameter = fct_recode(factor(parameter),!!! var_names))

shv_long_2022 <- long_weather(data.frame(t(shv_cell[,,n])), val_to = "shv", var_levels = dimnames(X)[[2]])
X_long_2022 <- long_weather(data.frame(t(X[,,n])) %>% mutate(SP = SP*100), val_to = "x", var_levels = dimnames(X)[[2]])
sign_long_2022 <- long_weather(data.frame(t(sign(X[,,n] - par_MMCD$mu))), val_to = "sign", var_levels = dimnames(X)[[2]])
plt_data_2022 <- inner_join(shv_long_2022,sign_long_2022) %>% 
  mutate(val = sign*abs(shv), 
         sign = as.character(sign)) %>%
  inner_join(X_long_2022) %>%
  mutate(parameter = fct_recode(factor(parameter),!!! var_names))

Yearly outlyingess contributions based on rowwise Shapley values. In the plot, non-outlying years are colored white, while years with missing data are gray. For the outliers, blue indicates “above average”, red indicates “below average”, and color intensity is proportional to the rowwise Shapley value.

ggplot(plt_data_years, aes(x = var, y = parameter, fill = val)) +
  geom_tile(color = "lightgray") +
  theme_minimal() +
  scale_fill_gradient2(low = "#1F78B4", mid = "white", high = "#E31A1C", 
                       midpoint = 0, guide = "none")  +
  scale_x_continuous(breaks = seq(from = 1880, to = 2020, by = 5), expand = c(0,0)) +
  theme(axis.text.x = element_text(angle = 45, vjust = 1.2, hjust=1),
        axis.title = element_blank(), title = element_blank()) +
  geom_tile(data = set_long, aes(x = var, y = name, alpha = factor(value)), fill = "gray") +
  scale_alpha_manual(values = c(0,1)) +
  guides(alpha = FALSE, fill = FALSE)

Outlyingess contributions based on cellwise Shapley values are computed for 1895 and 2022 using the same color scheme as in the previous plot.

gridExtra::grid.arrange(ggplot(plt_data_1895, aes(x = var, y = parameter, fill = val, label = round(x))) +
                          geom_tile(color = "lightgray") +
                          theme_minimal() +
                          geom_text(size = 3) +
                          theme(axis.text.x = element_text(angle = 45, vjust = 1.4, hjust=1.4),
                                axis.title = element_blank())+
                          labs(title = "1895") + 
                          scale_fill_gradient2(low = "#1F78B4", mid = "white", high = "#E31A1C", 
                                               midpoint = 0, guide = "none"),
                        ggplot(plt_data_2022, aes(x = var, y = parameter, fill = val, label = round(x))) +
                          geom_tile(color = "lightgray") +
                          theme_minimal() +
                          geom_text(size = 3) +
                          theme(axis.text.x = element_text(angle = 45, vjust = 1.4, hjust=1.4),
                                axis.title = element_blank())+
                          labs(title = "2022") + 
                          scale_fill_gradient2(low = "#1F78B4", mid = "white", high = "#E31A1C", 
                                               midpoint = 0, guide = "none"),
                        nrow = 1)

Darwin

The DARWIN (Diagnosis AlzheimeR WIth haNdwriting) data set comprises handwriting samples from 174 individuals. Among them, 89 have been diagnosed with Alzheimer’s disease (AD), while the remaining 85 are considered healthy subjects (H). Each participant completed 25 handwriting tasks on paper, and their pen movements were recorded using a graphic tablet. A set of 18 features was extracted from the raw handwriting data.

An array of dimension (p, q, n), comprising n = 174 observations, each represented by a p = 18 times q = 25 dimensional matrix. The observed parameters are:

Total Time
Air Time
Paper Time
Mean Speed on paper
Mean Acceleration on paper
Mean Acceleration in air
Mean Jerk on paper
Pressure Mean
Pressure Variance
Generalization of the Mean Relative Tremor (GMRT) on paper
GMTR in air
Mean GMRT
Pendowns Number
Max X Extension
Max Y Extension
Dispersion Index

data(darwin)
darwin_class <- dimnames(darwin)[[3]]
sub_H <- which(darwin_class == "H")
sub_P <- which(darwin_class == "P")

The classes are stored in dimnames(darwin)[[3]]; the subjects with class H are the healthy, and the subjects with class P have AD.

Since the parameters Total Time (= Air Time + Paper Time) and Mean GMRT (= (GMRT on paper + GMRT in air)/2) are linearly dependent on other variables, they are excluded from the following analysis. Moreover, the variable Air Time is skewed due to several large univariate outliers in both the H and AD subjects. Across various tasks, some of the observed values of this variable are more than 25 median absolute deviations away from the median. Since the Air Time was measured using the graphic tablet, it could be that some individuals placed the pen too close to the device before the task was performed. While Shapley values based on the MMCD estimators allow us to unveil those large outlyingness contributions, they dominate and mask outlyingess contributions from other variables. Therefore, we also excluded the Air Time variable, and used the p = 15 remaining variables for our analysis.

darwin_array_clean <- darwin[-c(1,9,18),,]
X_H <- darwin_array_clean[,,sub_H] 
X_P <- darwin_array_clean[,,sub_P] 
dimnames(X_H)[[3]] <- sub_H
dimnames(X_P)[[3]] <- sub_P

p <- dim(X_H)[1]
q <- dim(X_H)[2]
n <- dim(X_H)[3]

Parameter estimation of mean, rowwise and columnwise covariance matrix using maximum likelihood estimation and robust parameter estimation based on MMCD.

set.seed(1)
par_mmle <- robustmatrix::mmle(X_H)
par_mmcd <- robustmatrix::mmcd(X_H, nsamp = 500, alpha = 0.5, 
                               nthreads = 1, scale_consistency = "quant")

Plotting the MMDs based on the parameters estimated from the H group.

plt_data <-   data.frame(patient_id = 1:dim(darwin_array_clean)[[3]], 
                         class = c(rep("AD", length(sub_P)),
                                   rep("H", length(sub_H))),
                         mmd = sqrt(mmd(darwin_array_clean, 
                                        par_mmcd$mu, par_mmcd$cov_row, 
                                        par_mmcd$cov_col)))

p_mmd <- ggplot(data = plt_data, 
                mapping = aes(x = patient_id, y = mmd, color = class)) + 
  geom_point(show.legend = FALSE) + 
  geom_hline(yintercept = sqrt(qchisq(0.975, p*q))) + 
  labs(x = "Subject ID", y = "MMD", color = element_blank()) + 
  theme_classic() +
  facet_grid(~ class, scales = "free")
p_mmd

Computing Shapley values and average proportional contributions of the variables to the MMDs for the H and P groups.

shv_cell_P <- array(dim = dim(X_P), dimnames = dimnames(X_P))
shv_cell_P[] <- matrixShapley(X_P, 
                              mu = par_mmcd$mu, 
                              cov_row = par_mmcd$cov_row_inv, 
                              cov_col = par_mmcd$cov_col_inv, 
                              inverted = TRUE, type = "cell")

shv_cell_H <- array(dim = dim(X_H), dimnames = dimnames(X_H))
shv_cell_H[] <- matrixShapley(X_H, 
                              mu = par_mmcd$mu, 
                              cov_row = par_mmcd$cov_row_inv, 
                              cov_col = par_mmcd$cov_col_inv, 
                              inverted = TRUE, type = "cell")


shv_row_mean_P <- data.frame("val" = apply(shv_cell_P,c(1),
                                           function(x)mean(x))) %>% 
  rownames_to_column()
shv_row_mean_H <- data.frame("val" = apply(shv_cell_H,c(1),
                                           function(x)mean(x))) %>% 
  rownames_to_column()

shv_row_mean <- rbind(cbind("class" = "AD", shv_row_mean_P),
                      cbind("class" = "H", shv_row_mean_H))


shv_row_mean_P_prop <- data.frame("val" = apply(shv_cell_P,c(1),
                                                function(x)sum(x)/sum(shv_cell_P))) %>% 
  rownames_to_column()
shv_row_mean_H_prop <- data.frame("val" = apply(shv_cell_H,c(1),
                                                function(x)sum(x)/sum(shv_cell_H))) %>% 
  rownames_to_column()

shv_row_mean_prop<- rbind(cbind("class" = "AD", shv_row_mean_P_prop),
                          cbind("class" = "H", shv_row_mean_H_prop))
fct_order <- order(shv_row_mean_P_prop$val)
shv_row_mean_prop <- shv_row_mean_prop %>%
  mutate(rowname = factor(rowname, levels = shv_row_mean_P_prop$rowname[fct_order]))

Average proportional contributions for the H and P groups.

p_shv_features <- ggplot(data = shv_row_mean_prop, 
                         aes(x = val, y = rowname, fill = class)) + 
  geom_bar(stat = "identity", position=position_dodge(), color = "black") + 
  labs(x = "Proportional contribution to MMD²", 
       y = element_blank(), fill = element_blank()) + 
  scale_x_continuous(expand = c(0, 0)) + 
  theme_classic() + 
  theme(legend.position = c(0.95,0.5))
#> Warning: A numeric `legend.position` argument in `theme()` was deprecated in ggplot2
#> 3.5.0.
#> ℹ Please use the `legend.position.inside` argument of `theme()` instead.
#> This warning is displayed once every 8 hours.
#> Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
#> generated.
p_shv_features

Video data

The following code can load the video data and reproduce the results outlined in Mayrhofer, Radojičić, and Filzmoser (2024). Due to the dimensionality of the data set, the following code is not executed.

# Loading the data
load(url("https://wis.kuleuven.be/stat/robust/Programs/DO/do-video-data-rdata"))

# Creating the overview plot
ind_overview <- c(1,487,491,495,500)
plts_overview <- list()
for(i in seq_along(ind_overview)){
  dim_Video <- dim(Video)
  video_grid <- expand.grid("x" = 1:dim_Video[2], "y" = 1:dim_Video[3])
  video_long <- data.frame(cbind(video_grid, 
                                 "r" = as.vector(Video[ind_overview[i],,,1]/255), 
                                 "g" = as.vector(Video[ind_overview[i],,,2]/255), 
                                 "b" = as.vector(Video[ind_overview[i],,,3]/255))) %>% mutate(rgb.val=rgb(r,g,b))
  plts_overview[[i]] <- ggplot(video_long, aes(y,x)) +
    geom_raster(aes(fill=rgb.val)) +
    scale_fill_identity() +
    theme_void()+
    coord_fixed() + 
    theme(plot.title = element_text(hjust = 0.5),
          panel.grid.major = element_blank(), 
          panel.grid.minor = element_blank(),
          panel.background = element_rect(fill = "transparent" ,colour = "black"),
          panel.ontop = TRUE,
          plot.margin = margin(1, 1, 1, 1, "pt")) + 
    scale_x_continuous(expand = c(0,0)) + 
    scale_y_continuous(expand = c(0,0), trans = "reverse") + 
    labs(title = paste("Frame", ind_overview[i]))
}
video_overview <- do.call(gridExtra::grid.arrange, args = list("grobs" = plts_overview, "nrow" = 1))

# Transform to grayscale image
video_grayscale <- apply(Video, 1:3, mean)
X <- aperm(video_grayscale,c(2,3,1))
n <- dim(X)[3]
p <- dim(X)[1]
q <- dim(X)[2]

# MMLE and MMCD parameter estimation
par_MMLE <- mmle(X, lambda = 0)
set.seed(1)
par_MMCD <- mmcd(X, alpha = 0.5,lambda = 0, nsamp = 500, nthreads = 1)

# Compute squared Mahlanobis distances
MD <- mmd(X, mu = par_MMLE$mu, cov_row = par_MMLE$cov_row_inv, 
          cov_col = par_MMLE$cov_col_inv, inverted = TRUE)
MD_rob <- mmd(X, mu = par_MMCD$mu, cov_row = par_MMCD$cov_row_inv, 
              cov_col = par_MMCD$cov_col_inv, inverted = TRUE)
out_quant <- qchisq(0.99, p*q)

# Plot MMD of all observations
data_plt_md <- data.frame(cbind("Frame" = 1:length(MD_rob), "MD" = sqrt(MD_rob)))
ggplot(data_plt_md, aes(x = Frame, y = MD)) + 
  geom_point() + 
  labs(x = "Frame", y = "MMD") + 
  scale_x_continuous(limits = range(1,n), breaks = seq(from = 50, to = n, by = 50), expand = c(0,3)) + 
  theme_classic()

# Plot MMD of last observations
sub_ind <- (1:length(MD_rob))[-(1:475)]
frame_labels <- rep(NA,length(sub_ind))
frame_labels[sub_ind %in% c(487,491,495,500)] <- c("Frame 487: man left of tree", "Frame 491: man behind tree", "Frame 495: man right of tree", "Frame 500: man fully visible")
data_plt_md1 <- data.frame(cbind("Frame" = sub_ind, "MD" = sqrt(MD_rob[sub_ind]), "labels" = frame_labels))
ggplot(data_plt_md1, aes(x = as.numeric(Frame), y = as.numeric(MD), label = labels)) + 
  geom_point() + 
  geom_path() + 
  geom_label_repel(nudge_x = 75, nudge_y = -80, color = "darkblue") + 
  labs(x = "Frame", y = "MMD") + 
  scale_x_continuous(limits = range(sub_ind), breaks = seq(from = 480, to = 630, by = 10), expand = c(0,1)) + 
  theme_classic()

# Compute Shapley values
shv <- array(dim = dim(X))
shv[,,] <- matrixShapley(X[,,], mu = par_MMCD$mu, 
                         cov_row = par_MMCD$cov_row_inv, cov_col = par_MMCD$cov_col_inv, 
                         inverted = TRUE, type = "cell")

# Function to plot Shapley values for image data
plot_shapley_image <- function(image,shapley_values, lower = -100, upper = 100, 
                               title = NULL, positive_only = FALSE, border = TRUE){
  shapley_values[shapley_values < lower] <- lower
  shapley_values[shapley_values > upper] <- upper
  if(positive_only){
    shapley_values[shapley_values < 0] <- 0
  }
  
  image_grid <- expand.grid("x" = 1:nrow(shapley_values), "y" = 1:ncol(shapley_values))
  image_data <- cbind(image_grid, "z" = as.vector(image), "shv" = as.vector(shapley_values))
  plt <- ggplot() 
  if(any(image != 0)){
    plt <- plt + 
      geom_raster(data = image_data, aes(x, y, fill = z)) + 
      scale_fill_gradient(low = "black", high = "white",guide=FALSE)
  }
  plt <- plt +
    new_scale("fill") +
    geom_raster(data = image_data, aes(x, y, fill = shv), na.rm = TRUE) +
    scale_fill_gradientn(colours= c("blue", "transparent", "red"),limits=c(lower, upper),
                         guide=FALSE, na.value = "transparent") + 
    labs(title = title) + 
    theme_void()+
    coord_fixed() + 
    theme(plot.title = element_text(hjust = 0.5))
  if(border){
    plt <- plt  +
      theme(panel.grid.major = element_blank(), 
            panel.grid.minor = element_blank(),
            panel.background = element_rect(fill = "transparent" ,colour = "black"),
            panel.ontop = TRUE,
            plot.margin = margin(1, 1, 1, 1, "pt")) + 
      scale_y_continuous(expand = c(0,0)) + 
      scale_x_continuous(expand = c(0,0))
  }
  plt 
}

# Plot Shapley values for selected frames
lims <- 100
gridExtra::grid.arrange(
  plot_shapley_image(image = t(apply(X[,,487],2,rev)), shapley_values = (t(apply(shv[,,487],2,rev))),
                     lower = -lims, upper = lims, title = "Frame 487", positive_only = TRUE), 
  plot_shapley_image(image = t(apply(X[,,491],2,rev)), shapley_values = (t(apply(shv[,,491],2,rev))),
                     lower = -lims, upper = lims, title = "Frame 491", positive_only = TRUE),
  plot_shapley_image(image = t(apply(X[,,495],2,rev)), shapley_values = (t(apply(shv[,,495],2,rev))),
                     lower = -lims, upper = lims, title = "Frame 495", positive_only = TRUE),
  plot_shapley_image(image = t(apply(X[,,500],2,rev)), shapley_values = (t(apply(shv[,,500],2,rev))),
                     lower = -lims, upper = lims, title = "Frame 500", positive_only = TRUE),
  nrow = 1
)

Li, Liyuan, Weimin Huang, Irene Yu-Hua Gu, and Qi Tian. 2004. “Statistical Modeling of Complex Backgrounds for Foreground Object Detection.” IEEE Transactions on Image Processing 13 (11): 1459–72.

Mayrhofer, Marcus, Una Radojičić, and Peter Filzmoser. 2024. “Robust Covariance Estimation and Explainable Outlier Detection for Matrix-Valued Data.” arXiv Preprint arXiv:2403.03975.