library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
library(iml)
library(rmlnomogram)
# Load dataset
data("mtcars")
# Preprocess for training a classifier
mtcars$am <- factor(mtcars$am, levels = c(0, 1), labels = c("Auto", "Manual"))
# Convert some numerical features to categorical for demonstration
mtcars$cyl <- factor(mtcars$cyl)
mtcars$vs <- factor(mtcars$vs)
mtcars$qsec <- factor(as.integer(mtcars$qsec >= 18))
# Use tidyverse to keep only the factor columns
mtcars <- select(mtcars, where(is.factor))
# Create dummy variables for all factor variables, excluding the target variable
dummy_vars <- dummyVars(am ~ ., data = mtcars) |>
  predict(newdata = mtcars)
# Combine binarized predictors with the target variable
mtcars_binarized <- as.data.frame(dummy_vars) |>
  mutate_all(as.factor) |>
  mutate(am = mtcars$am) |>
  select(-vs.0, -cyl.4, -qsec.0)
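`dummyVars()` emits one indicator column per factor level, so the `select()` above drops one reference level per factor (`cyl.4`, `vs.0`, `qsec.0`) to avoid perfectly redundant indicators. A quick illustrative check, not part of the original pipeline:

# Illustrative check: dummyVars() produced one column per factor level; after
# the select(), only the non-reference indicators plus the target remain
colnames(dummy_vars)
names(mtcars_binarized)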
# Train a random forest model using the full dataset
set.seed(1)
model <- train(
  am ~ .,
  data = mtcars_binarized,
  method = "rf",
  trControl = trainControl(method = "none", classProbs = TRUE)
)
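With `trainControl(method = "none")`, caret skips resampling and hyperparameter tuning and fits a single random forest with the default `mtry`. As an illustrative sanity check (optimistic, since it reuses the training data), not part of the original pipeline:

# Illustrative: in-sample confusion matrix of the single fitted forest
table(predicted = predict(model, mtcars_binarized), actual = mtcars_binarized$am)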
# Extract the feature names used in the trained model from the caret model
# object, stripping the trailing "1" that caret appends for the second factor level
caret_features <- str_remove_all(model$finalModel$xNames, "1$")
# Modify the training dataset to include only the model features
mtcars_selected <- mtcars_binarized[, caret_features]
# Generate all possible feature combinations for the nomogram
nomogram_features <- expand.grid(lapply(mtcars_selected, unique))
head(nomogram_features)
##   cyl.6 cyl.8 qsec.1 vs.1
## 1     1     0      0    0
## 2     0     0      0    0
## 3     1     1      0    0
## 4     0     1      0    0
## 5     1     0      1    0
## 6     0     0      1    0
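The grid enumerates every combination of the four binary predictors, i.e. 2^4 = 16 rows (only the first six are shown above), including combinations that never occur in the raw data, such as `cyl.6` and `cyl.8` both being 1:

# Illustrative: four binary features yield 2^4 = 16 candidate profiles
nrow(nomogram_features)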
nomogram_outputs <- predict(model, nomogram_features, type = "prob") |>
  select(output = levels(mtcars_binarized$am)[2])
head(nomogram_outputs)
##   output
## 1  0.884
## 2  0.818
## 3  0.246
## 4  0.002
## 5  0.498
## 6  0.700
# Prepare data and model for SHAP value calculation using iml
X <- mtcars_binarized[, -which(names(mtcars_binarized) == "am")]
# Create a predictor object
predictor <- Predictor$new(model = model, data = X)
# Calculate SHAP values
nomogram_shaps <- list()
for (i in seq(nrow(nomogram_features))) {
  shapley <- Shapley$new(predictor, x.interest = nomogram_features[i, ])
  nomogram_shaps[[i]] <- shapley$results
}
names(nomogram_shaps) <- seq(nrow(nomogram_features))
nomogram_shaps <- reduce(imap(nomogram_shaps, ~ mutate(.x, i = .y)), rbind) |>
  filter(class == levels(mtcars_binarized$am)[2]) |>
  select(i, feature, phi) |>
  spread(feature, phi) |>
  arrange(as.numeric(i)) |>
  column_to_rownames(var = "i")
head(nomogram_shaps)
##   cyl.6 cyl.8 qsec.1 vs.1
## 1 -0.09  0.53   0.19    0
## 2  0.03  0.50   0.04    0
## 3 -0.05 -0.42   0.06    0
## 4  0.06 -0.50   0.09    0
## 5 -0.43  0.33  -0.25    0
## 6  0.13  0.38  -0.05    0
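Each `phi` is a feature's Shapley contribution for one grid row. By the Shapley efficiency property, the contributions should sum (approximately, since `iml` estimates them by sampling) to that row's predicted probability minus the average prediction over the training data. A minimal illustrative check:

# Illustrative: Shapley efficiency check; contributions plus the average
# prediction should roughly recover each row's predicted probability
avg_pred <- mean(predict(model, X, type = "prob")[[levels(mtcars_binarized$am)[2]]])
head(rowSums(nomogram_shaps) + avg_pred)  # compare with nomogram_outputs$output

The classifier nomogram itself could be drawn at this point in the same way as the final example; a sketch by analogy with the `est = TRUE` call at the end, assuming `create_nomogram()` accepts the probability table directly (the package may require a probability flag such as `prob = TRUE`; see `?create_nomogram`):

# Sketch by analogy with the final create_nomogram() call; a probability flag
# (e.g. prob = TRUE) is an assumption, not confirmed here
nomogram_with_shap <- create_nomogram(
  nomogram_features, nomogram_outputs, nomogram_shaps
)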
# Reload original dataset
data("mtcars")
# Round qsec to whole numbers to reduce the number of feature combinations later
mtcars <- mutate(mtcars, qsec = round(qsec, 0))
# Add the single numerical predictor to the binarized predictors and the target variable
mtcars_mixed <- cbind(mtcars["qsec"], select(mtcars_binarized, -qsec.1))
# Train a random forest model using the full dataset
set.seed(1)
model2 <- train(
  am ~ .,
  data = mtcars_mixed,
  method = "rf",
  trControl = trainControl(method = "none", classProbs = TRUE)
)
# Extract the feature names used in the trained model from the caret model object
caret_features2 <- str_remove_all(model2$finalModel$xNames, "1$")
# Modify the training dataset to include only the model features
mtcars_selected2 <- mtcars_mixed[, caret_features2]
# Generate all possible feature combinations for the nomogram
nomogram_features2 <-
  select_if(mtcars_selected2, is.numeric) |>
  lapply(\(x) seq(min(x), max(x))) |>
  c(lapply(select_if(mtcars_selected2, is.factor), unique)) |>
  expand.grid()
head(nomogram_features2)
##   qsec cyl.6 cyl.8 vs.1
## 1   14     1     0    0
## 2   15     1     0    0
## 3   16     1     0    0
## 4   17     1     0    0
## 5   18     1     0    0
## 6   19     1     0    0
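With a numeric feature in play, the grid is built from the full integer sequence between its observed minimum and maximum, crossed with every level of the remaining binary features, so its size is the product of each feature's number of distinct values. An illustrative check, not part of the original pipeline:

# Illustrative: grid size equals the product of per-feature value counts
nrow(nomogram_features2)
prod(sapply(nomogram_features2, \(x) length(unique(x))))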
nomogram_outputs2 <- predict(model2, nomogram_features2, type = "prob") |>
  select(output = levels(mtcars_mixed$am)[2])
# Prepare data and model for SHAP value calculation using iml
X2 <- mtcars_mixed[, -which(names(mtcars_mixed) == "am")]
# Create a predictor object
predictor2 <- Predictor$new(model = model2, data = X2)
# Calculate SHAP values
nomogram_shaps2 <- list()
for (i in seq(nrow(nomogram_features2))) {
  shapley2 <- Shapley$new(predictor2, x.interest = nomogram_features2[i, ])
  nomogram_shaps2[[i]] <- shapley2$results
}
names(nomogram_shaps2) <- seq(nrow(nomogram_features2))
nomogram_shaps2 <- reduce(imap(nomogram_shaps2, ~ mutate(.x, i = .y)), rbind) |>
  filter(class == levels(mtcars_mixed$am)[2]) |>
  select(i, feature, phi) |>
  spread(feature, phi) |>
  arrange(as.numeric(i)) |>
  column_to_rownames(var = "i")
# Load dataset
data("mtcars")
# Preprocess for training a regressor
outcome <- mtcars$wt
# Convert some numerical features to categorical for demonstration
mtcars$cyl <- factor(mtcars$cyl)
mtcars$vs <- factor(mtcars$vs)
mtcars$qsec <- factor(as.integer(mtcars$qsec >= 18))
# Use tidyverse to keep only the factor columns, plus the numeric outcome wt
mtcars <- cbind(select(mtcars, where(is.factor)), select(mtcars, wt))
# Create dummy variables for all factor variables, excluding the target variable
dummy_vars2 <- dummyVars(wt ~ ., data = mtcars) |>
  predict(newdata = mtcars)
# Combine binarized predictors with the target variable
mtcars_binarized2 <- as.data.frame(dummy_vars2) |>
  mutate_all(as.factor) |>
  mutate(wt = outcome) |>
  select(-vs.0, -cyl.4, -qsec.0)
# Train a random forest model using the full dataset
set.seed(1)
model3 <- train(
  wt ~ .,
  data = mtcars_binarized2,
  method = "rf",
  trControl = trainControl(method = "none")
)
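For the regressor, `predict()` returns numeric estimates of `wt` directly, so a quick in-sample error summary serves as an illustrative (again optimistic) sanity check, not part of the original pipeline:

# Illustrative: in-sample RMSE of the single fitted forest
sqrt(mean((predict(model3, mtcars_binarized2) - mtcars_binarized2$wt)^2))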
# Extract the feature names used in the trained model from the caret model object
caret_features3 <- str_remove_all(model3$finalModel$xNames, "1$")
# Modify the training dataset to include only the model features
mtcars_selected3 <- mtcars_binarized2[, caret_features3]
# Generate all possible feature combinations for the nomogram
nomogram_features3 <- expand.grid(lapply(mtcars_selected3, unique))
# Predict outputs for all feature combinations
nomogram_outputs3 <- data.frame(output = predict(model3, nomogram_features3))
head(nomogram_outputs3)
##     output
## 1 2.958282
## 2 3.010966
## 3 3.537469
## 4 3.878820
## 5 3.037775
## 6 2.971140
# Prepare data and model for SHAP value calculation using iml
X3 <- mtcars_binarized2[, -which(names(mtcars_binarized2) == "wt")]
# Create a predictor object
predictor3 <- Predictor$new(model = model3, data = X3)
# Calculate SHAP values
nomogram_shaps3 <- list()
for (i in seq(nrow(nomogram_features3))) {
  shapley3 <- Shapley$new(predictor3, x.interest = nomogram_features3[i, ])
  nomogram_shaps3[[i]] <- shapley3$results
}
names(nomogram_shaps3) <- seq(nrow(nomogram_features3))
nomogram_shaps3 <- reduce(imap(nomogram_shaps3, ~ mutate(.x, i = .y)), rbind) |>
  select(i, feature, phi) |>
  spread(feature, phi) |>
  arrange(as.numeric(i)) |>
  column_to_rownames(var = "i")
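At this point the regression nomogram for the binary-only model could be assembled exactly like the final mixed-feature example; a sketch reusing the demonstrated `est = TRUE` call, with `nomogram_outputs3` as built above:

# Sketch mirroring the final create_nomogram() call below
nomogram3_with_shap <- create_nomogram(
  nomogram_features3, nomogram_outputs3, nomogram_shaps3,
  est = TRUE
)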
# Reload original dataset
data("mtcars")
# Round qsec to whole numbers to reduce the number of feature combinations later
mtcars <- mutate(mtcars, qsec = round(qsec, 0))
# Add the single numerical predictor to the binarized predictors and the target variable
mtcars_mixed2 <- cbind(mtcars["qsec"], select(mtcars_binarized2, -qsec.1))
# Train a random forest model using the full dataset
set.seed(1)
model4 <- train(
  wt ~ .,
  data = mtcars_mixed2,
  method = "rf",
  trControl = trainControl(method = "none")
)
# Extract the feature names used in the trained model from the caret model object
caret_features4 <- str_remove_all(model4$finalModel$xNames, "1$")
# Modify the training dataset to include only the model features
mtcars_selected4 <- mtcars_mixed2[, caret_features4]
# Generate all possible feature combinations for the nomogram
nomogram_features4 <-
  select_if(mtcars_selected4, is.numeric) |>
  lapply(\(x) seq(min(x), max(x))) |>
  c(lapply(select_if(mtcars_selected4, is.factor), unique)) |>
  expand.grid()
# Prepare data and model for SHAP value calculation using iml
X4 <- mtcars_mixed2[, -which(names(mtcars_mixed2) == "wt")]
# Create a predictor object
predictor4 <- Predictor$new(model = model4, data = X4)
# Calculate SHAP values
nomogram_shaps4 <- list()
for (i in seq(nrow(nomogram_features4))) {
  shapley4 <- Shapley$new(predictor4, x.interest = nomogram_features4[i, ])
  nomogram_shaps4[[i]] <- shapley4$results
}
names(nomogram_shaps4) <- seq(nrow(nomogram_features4))
nomogram_shaps4 <- reduce(imap(nomogram_shaps4, ~ mutate(.x, i = .y)), rbind) |>
  select(i, feature, phi) |>
  spread(feature, phi) |>
  arrange(as.numeric(i)) |>
  column_to_rownames(var = "i")
# Predict outputs for all feature combinations
nomogram_outputs4 <- data.frame(output = predict(model4, nomogram_features4))
# Build the nomogram, with SHAP values as feature explanations
nomogram4_with_shap <-
  create_nomogram(
    nomogram_features4, nomogram_outputs4, nomogram_shaps4,
    est = TRUE
  )
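`rmlnomogram` builds on `ggplot2` (see the attached packages below), so the returned object can presumably be handled like any other plot; a minimal sketch, assuming `create_nomogram()` returns a ggplot object:

# Sketch, assuming the nomogram is a ggplot object
nomogram4_with_shap  # print to the active graphics device
ggsave("nomogram4_with_shap.png", nomogram4_with_shap, width = 8, height = 6)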
sessionInfo()
## R version 4.4.2 (2024-10-31)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 24.04.1 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.26.so; LAPACK version 3.12.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=C
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## time zone: Etc/UTC
## tzcode source: system (glibc)
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] rmlnomogram_0.1.2 iml_0.11.3 caret_7.0-1 lattice_0.22-6
## [5] lubridate_1.9.4 forcats_1.0.0 stringr_1.5.1 dplyr_1.1.4
## [9] purrr_1.0.2 readr_2.1.5 tidyr_1.3.1 tibble_3.2.1
## [13] ggplot2_3.5.1 tidyverse_2.0.0
##
## loaded via a namespace (and not attached):
## [1] tidyselect_1.2.1 timeDate_4041.110 farver_2.1.2
## [4] fastmap_1.2.0 pROC_1.18.5 digest_0.6.37
## [7] rpart_4.1.24 timechange_0.3.0 lifecycle_1.0.4
## [10] survival_3.8-3 magrittr_2.0.3 compiler_4.4.2
## [13] rlang_1.1.4 sass_0.4.9 tools_4.4.2
## [16] yaml_2.3.10 data.table_1.16.4 ggsignif_0.6.4
## [19] knitr_1.49 labeling_0.4.3 plyr_1.8.9
## [22] abind_1.4-8 withr_3.0.2 Metrics_0.1.4
## [25] sys_3.4.3 nnet_7.3-20 grid_4.4.2
## [28] stats4_4.4.2 ggpubr_0.6.0 colorspace_2.1-1
## [31] future_1.34.0 globals_0.16.3 scales_1.3.0
## [34] iterators_1.0.14 MASS_7.3-64 cli_3.6.3
## [37] rmarkdown_2.29 generics_0.1.3 future.apply_1.11.3
## [40] reshape2_1.4.4 tzdb_0.4.0 cachem_1.1.0
## [43] splines_4.4.2 parallel_4.4.2 vctrs_0.6.5
## [46] hardhat_1.4.0 Matrix_1.7-1 carData_3.0-5
## [49] jsonlite_1.8.9 car_3.1-3 hms_1.1.3
## [52] rstatix_0.7.2 Formula_1.2-5 listenv_0.9.1
## [55] maketools_1.3.1 foreach_1.5.2 gower_1.0.2
## [58] jquerylib_0.1.4 recipes_1.1.0 glue_1.8.0
## [61] parallelly_1.41.0 codetools_0.2-20 cowplot_1.1.3
## [64] stringi_1.8.4 gtable_0.3.6 munsell_0.5.1
## [67] pillar_1.10.1 htmltools_0.5.8.1 ipred_0.9-15
## [70] lava_1.8.0 R6_2.5.1 evaluate_1.0.1
## [73] backports_1.5.0 broom_1.0.7 bslib_0.8.0
## [76] class_7.3-23 Rcpp_1.0.13-1 nlme_3.1-166
## [79] prodlim_2024.06.25 checkmate_2.3.2 xfun_0.50
## [82] buildtools_1.0.0 pkgconfig_2.0.3 ModelMetrics_1.2.2.2