Machine Learning Nomogram Exemplar

Programming environment

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
library(iml)
library(rmlnomogram)

Binary outcome (or class-wise multinomial outcome)

1 - Categorical predictors and binary outcome without probability

# Load dataset
data("mtcars")

# Preprocess for training a classifier
mtcars$am <- factor(mtcars$am, levels = c(0, 1), labels = c("Auto", "Manual"))

# Convert some numerical features to categorical for demonstration
mtcars$cyl <- factor(mtcars$cyl)
mtcars$vs <- factor(mtcars$vs)
mtcars$qsec <- factor(as.integer(mtcars$qsec >= 18))

# Using tidyverse to filter only factor columns
mtcars <- select(mtcars, where(is.factor))

# Create dummy variables for all factor variables, excluding the target variable
dummy_vars <- dummyVars(am ~ ., data = mtcars) |> 
  predict(newdata = mtcars)

# Combine binarized predictors with the target variable
mtcars_binarized <- as.data.frame(dummy_vars) |>
  mutate_all(as.factor) |>
  mutate(am = mtcars$am) |>
  select(-vs.0, -cyl.4, -qsec.0)

# Train a random forest model using the full dataset
set.seed(1)
model <- train(
  am ~ ., 
  data = mtcars_binarized, 
  method = "rf", 
  trControl = trainControl(method = "none", classProbs = TRUE)
)
# Extract feature names used in the trained model from caret model object
caret_features <- str_remove_all(model$finalModel$xNames, "1$")

# Modify the training dataset to include only the model features
mtcars_selected <- mtcars_binarized[, caret_features]

# Generate all possible feature combinations for nomogram
nomogram_features <- expand.grid(lapply(mtcars_selected, unique))
##   cyl.6 cyl.8 qsec.1 vs.1
## 1     1     0      0    0
## 2     0     0      0    0
## 3     1     1      0    0
## 4     0     1      0    0
## 5     1     0      1    0
## 6     0     0      1    0
nomogram_outputs <- predict(model, nomogram_features, type = "prob") |>
  select(output = levels(mtcars_binarized$am)[2])
##   output
## 1  0.884
## 2  0.818
## 3  0.246
## 4  0.002
## 5  0.498
## 6  0.700
nomogram <- create_nomogram(nomogram_features, nomogram_outputs)

# Prepare data and model for SHAP value calculation using iml
X <- mtcars_binarized[, -which(names(mtcars_binarized) == "am")]

# Create a predictor object
predictor <- Predictor$new(model = model, data = X)

# Calculate SHAP values
nomogram_shaps <- list()

for(i in seq(nrow(nomogram_features))){
  shapley <-  Shapley$new(predictor, x.interest = nomogram_features[i, ])
  nomogram_shaps[[i]] <- shapley$results
}

names(nomogram_shaps) <- seq(nrow(nomogram_features))

nomogram_shaps <- reduce(imap(nomogram_shaps, ~ mutate(.x, i = .y)), rbind) |>
  filter(class == levels(mtcars_binarized$am)[2]) |>
  select(i, feature, phi) |>
  spread(feature, phi) |>
  arrange(as.numeric(i)) |>
  column_to_rownames(var = "i")
##   cyl.6 cyl.8 qsec.1 vs.1
## 1 -0.09  0.53   0.19    0
## 2  0.03  0.50   0.04    0
## 3 -0.05 -0.42   0.06    0
## 4  0.06 -0.50   0.09    0
## 5 -0.43  0.33  -0.25    0
## 6  0.13  0.38  -0.05    0

2 - Categorical predictors and binary outcome with probability

nomogram2 <- create_nomogram(nomogram_features, nomogram_outputs, prob = TRUE)

nomogram2_with_shap <-
  create_nomogram(
    nomogram_features, nomogram_outputs, nomogram_shaps
    , prob = TRUE
  )

3 - Categorical with single numerical predictors and binary outcome with probability

# Reload original dataset
data("mtcars")

# Round to 0 decimal to reduce possible combinations later
mtcars <- mutate(mtcars, qsec = round(qsec, 0))

# Add single numerical predictor to binarized predictors with the target variable
mtcars_mixed <- cbind(mtcars["qsec"], select(mtcars_binarized, -qsec.1))

# Train a random forest model using the full dataset
set.seed(1)
model2 <- train(
  am ~ ., 
  data = mtcars_mixed, 
  method = "rf", 
  trControl = trainControl(method = "none", classProbs = TRUE)
)
# Extract feature names used in the trained model from caret model object
caret_features2 <- str_remove_all(model2$finalModel$xNames, "1$")

# Modify the training dataset to include only the model features
mtcars_selected2 <- mtcars_mixed[, caret_features2]

# Generate all possible feature combinations for nomogram
nomogram_features2 <-
  select_if(mtcars_selected2, is.numeric) |>
  lapply(\(x) seq(min(x), max(x))) |>
  c(lapply(select_if(mtcars_selected2, is.factor), unique)) |>
  expand.grid()
##   qsec cyl.6 cyl.8 vs.1
## 1   14     1     0    0
## 2   15     1     0    0
## 3   16     1     0    0
## 4   17     1     0    0
## 5   18     1     0    0
## 6   19     1     0    0
nomogram_outputs2 <- predict(model2, nomogram_features2, type = "prob") |>
  select(output = levels(mtcars_mixed$am)[2])
nomogram3 <- create_nomogram(nomogram_features2, nomogram_outputs2, prob = TRUE)

# Prepare data and model for SHAP value calculation using iml
X2 <- mtcars_mixed[, -which(names(mtcars_mixed) == "am")]

# Create a predictor object
predictor2 <- Predictor$new(model = model2, data = X2)

# Calculate SHAP values
nomogram_shaps2 <- list()

for(i in seq(nrow(nomogram_features2))){
  shapley2 <-  Shapley$new(predictor2, x.interest = nomogram_features2[i, ])
  nomogram_shaps2[[i]] <- shapley2$results
}

names(nomogram_shaps2) <- seq(nrow(nomogram_features2))

nomogram_shaps2 <- reduce(imap(nomogram_shaps2, ~ mutate(.x, i = .y)), rbind) |>
  filter(class == levels(mtcars_mixed$am)[2]) |>
  select(i, feature, phi) |>
  spread(feature, phi) |>
  arrange(as.numeric(i)) |>
  column_to_rownames(var = "i")
nomogram3_with_shap <-
  create_nomogram(
    nomogram_features2, nomogram_outputs2, nomogram_shaps2
    , prob = TRUE
  )

Continuous outcome

4 - Categorical predictors and continuous outcome

# Load dataset
data("mtcars")

# Preprocess for training a regressor
outcome <- mtcars$wt

# Convert some numerical features to categorical for demonstration
mtcars$cyl <- factor(mtcars$cyl)
mtcars$vs <- factor(mtcars$vs)
mtcars$qsec <- factor(as.integer(mtcars$qsec >= 18))

# Using tidyverse to filter only factor columns
mtcars <- cbind(select(mtcars, where(is.factor)), select(mtcars, wt))

# Create dummy variables for all factor variables, excluding the target variable
dummy_vars2 <- dummyVars(wt ~ ., data = mtcars) |> 
  predict(newdata = mtcars)

# Combine binarized predictors with the target variable
mtcars_binarized2 <- as.data.frame(dummy_vars2) |>
  mutate_all(as.factor) |>
  mutate(wt = outcome) |>
  select(-vs.0, -cyl.4, -qsec.0)

# Train a random forest model using the full dataset
set.seed(1)
model3 <- train(
  wt ~ ., 
  data = mtcars_binarized2, 
  method = "rf", 
  trControl = trainControl(method = "none")
)
# Extract feature names used in the trained model from caret model object
caret_features3 <- str_remove_all(model3$finalModel$xNames, "1$")

# Modify the training dataset to include only the model features
mtcars_selected3 <- mtcars_binarized2[, caret_features3]

# Generate all possible feature combinations for nomogram
nomogram_features3 <- expand.grid(lapply(mtcars_selected3, unique))
nomogram_outputs3 <- data.frame(output = predict(model3, nomogram_features3))
##     output
## 1 2.958282
## 2 3.010966
## 3 3.537469
## 4 3.878820
## 5 3.037775
## 6 2.971140
nomogram4 <- create_nomogram(nomogram_features3, nomogram_outputs3, est = TRUE)

# Prepare data and model for SHAP value calculation using iml
X3 <- mtcars_binarized2[, -which(names(mtcars_binarized2) == "wt")]

# Create a predictor object
predictor3 <- Predictor$new(model = model3, data = X3)

# Calculate SHAP values
nomogram_shaps3 <- list()

for(i in seq(nrow(nomogram_features3))){
  shapley3 <-  Shapley$new(predictor3, x.interest = nomogram_features3[i, ])
  nomogram_shaps3[[i]] <- shapley3$results
}

names(nomogram_shaps3) <- seq(nrow(nomogram_features3))

nomogram_shaps3 <- reduce(imap(nomogram_shaps3, ~ mutate(.x, i = .y)), rbind) |>
  select(i, feature, phi) |>
  spread(feature, phi) |>
  arrange(as.numeric(i)) |>
  column_to_rownames(var = "i")
nomogram4_with_shap <-
  create_nomogram(
    nomogram_features3, nomogram_outputs3, nomogram_shaps3
    , est = TRUE
  )

5 - Categorical with single numerical predictors and continuous outcome

# Reload original dataset
data("mtcars")

# Round to 0 decimal to reduce possible combinations later
mtcars <- mutate(mtcars, qsec = round(qsec, 0))

# Add single numerical predictor to binarized predictors with the target variable
mtcars_mixed2 <- cbind(mtcars["qsec"], select(mtcars_binarized2, -qsec.1))

# Train a random forest model using the full dataset
set.seed(1)
model4 <- train(
  wt ~ ., 
  data = mtcars_mixed2, 
  method = "rf", 
  trControl = trainControl(method = "none")
)
# Extract feature names used in the trained model from caret model object
caret_features4 <- str_remove_all(model4$finalModel$xNames, "1$")

# Modify the training dataset to include only the model features
mtcars_selected4 <- mtcars_mixed2[, caret_features4]

# Generate all possible feature combinations for nomogram
nomogram_features4 <-
  select_if(mtcars_selected4, is.numeric) |>
  lapply(\(x) seq(min(x), max(x))) |>
  c(lapply(select_if(mtcars_selected4, is.factor), unique)) |>
  expand.grid()
nomogram_outputs4 <- data.frame(output = predict(model4, nomogram_features4))
nomogram5 <- create_nomogram(nomogram_features4, nomogram_outputs4, est = TRUE)

# Prepare data and model for SHAP value calculation using iml
X4 <- mtcars_mixed2[, -which(names(mtcars_mixed2) == "wt")]

# Create a predictor object
predictor4 <- Predictor$new(model = model4, data = X4)

# Calculate SHAP values
nomogram_shaps4 <- list()

for(i in seq(nrow(nomogram_features4))){
  shapley4 <-  Shapley$new(predictor4, x.interest = nomogram_features4[i, ])
  nomogram_shaps4[[i]] <- shapley4$results
}

names(nomogram_shaps4) <- seq(nrow(nomogram_features4))

nomogram_shaps4 <- reduce(imap(nomogram_shaps4, ~ mutate(.x, i = .y)), rbind) |>
  select(i, feature, phi) |>
  spread(feature, phi) |>
  arrange(as.numeric(i)) |>
  column_to_rownames(var = "i")
nomogram4_with_shap <-
  create_nomogram(
    nomogram_features4, nomogram_outputs4, nomogram_shaps4
    , est = TRUE
  )

sessionInfo()
## R version 4.4.2 (2024-10-31)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 24.04.1 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3 
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.26.so;  LAPACK version 3.12.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=C              
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: Etc/UTC
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] rmlnomogram_0.1.2 iml_0.11.3        caret_7.0-1       lattice_0.22-6   
##  [5] lubridate_1.9.4   forcats_1.0.0     stringr_1.5.1     dplyr_1.1.4      
##  [9] purrr_1.0.2       readr_2.1.5       tidyr_1.3.1       tibble_3.2.1     
## [13] ggplot2_3.5.1     tidyverse_2.0.0  
## 
## loaded via a namespace (and not attached):
##  [1] tidyselect_1.2.1     timeDate_4041.110    farver_2.1.2        
##  [4] fastmap_1.2.0        pROC_1.18.5          digest_0.6.37       
##  [7] rpart_4.1.24         timechange_0.3.0     lifecycle_1.0.4     
## [10] survival_3.8-3       magrittr_2.0.3       compiler_4.4.2      
## [13] rlang_1.1.4          sass_0.4.9           tools_4.4.2         
## [16] yaml_2.3.10          data.table_1.16.4    ggsignif_0.6.4      
## [19] knitr_1.49           labeling_0.4.3       plyr_1.8.9          
## [22] abind_1.4-8          withr_3.0.2          Metrics_0.1.4       
## [25] sys_3.4.3            nnet_7.3-20          grid_4.4.2          
## [28] stats4_4.4.2         ggpubr_0.6.0         colorspace_2.1-1    
## [31] future_1.34.0        globals_0.16.3       scales_1.3.0        
## [34] iterators_1.0.14     MASS_7.3-64          cli_3.6.3           
## [37] rmarkdown_2.29       generics_0.1.3       future.apply_1.11.3 
## [40] reshape2_1.4.4       tzdb_0.4.0           cachem_1.1.0        
## [43] splines_4.4.2        parallel_4.4.2       vctrs_0.6.5         
## [46] hardhat_1.4.0        Matrix_1.7-1         carData_3.0-5       
## [49] jsonlite_1.8.9       car_3.1-3            hms_1.1.3           
## [52] rstatix_0.7.2        Formula_1.2-5        listenv_0.9.1       
## [55] maketools_1.3.1      foreach_1.5.2        gower_1.0.2         
## [58] jquerylib_0.1.4      recipes_1.1.0        glue_1.8.0          
## [61] parallelly_1.41.0    codetools_0.2-20     cowplot_1.1.3       
## [64] stringi_1.8.4        gtable_0.3.6         munsell_0.5.1       
## [67] pillar_1.10.1        htmltools_0.5.8.1    ipred_0.9-15        
## [70] lava_1.8.0           R6_2.5.1             evaluate_1.0.1      
## [73] backports_1.5.0      broom_1.0.7          bslib_0.8.0         
## [76] class_7.3-23         Rcpp_1.0.13-1        nlme_3.1-166        
## [79] prodlim_2024.06.25   checkmate_2.3.2      xfun_0.50           
## [82] buildtools_1.0.0     pkgconfig_2.0.3      ModelMetrics_1.2.2.2