--- title: "Machine Learning Nomogram Exemplar" author: - name: Herdiantri Sufriyana affiliation: - &ibi Institute of Biomedical Informatics, College of Medicine, National Yang Ming Chiao Tung University, Taipei, Taiwan. email: herdi@nycu.edu.tw - name: Emily Chia-Yu Su affiliation: - *ibi date: "2024-12-24" output: html_document: toc: true toc_depth: 3 toc_float: collapsed: false smooth_scroll: true vignette: > %\VignetteIndexEntry{Machine Learning Nomogram Exemplar} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- # Programming environment ```{r Load necessary packages} library(tidyverse) library(caret) library(iml) library(rmlnomogram) ``` # Binary outcome (or class-wise multinomial outcome) ## 1 - Categorical predictors and binary outcome without probability ```{r Example 1 - Train a prediction model, eval=FALSE} # Load dataset data("mtcars") # Preprocess for training a classifier mtcars$am <- factor(mtcars$am, levels = c(0, 1), labels = c("Auto", "Manual")) # Convert some numerical features to categorical for demonstration mtcars$cyl <- factor(mtcars$cyl) mtcars$vs <- factor(mtcars$vs) mtcars$qsec <- factor(as.integer(mtcars$qsec >= 18)) # Using tidyverse to filter only factor columns mtcars <- select(mtcars, where(is.factor)) # Create dummy variables for all factor variables, excluding the target variable dummy_vars <- dummyVars(am ~ ., data = mtcars) |> predict(newdata = mtcars) # Combine binarized predictors with the target variable mtcars_binarized <- as.data.frame(dummy_vars) |> mutate_all(as.factor) |> mutate(am = mtcars$am) |> select(-vs.0, -cyl.4, -qsec.0) # Train a random forest model using the full dataset set.seed(1) model <- train( am ~ ., data = mtcars_binarized, method = "rf", trControl = trainControl(method = "none", classProbs = TRUE) ) ``` ```{r Example 1 - Entire dataset of combinations of the selected features, eval=FALSE} # Extract feature names used in the trained model from caret model object caret_features <- str_remove_all(model$finalModel$xNames, "1$") # Modify the training dataset to include only the model features mtcars_selected <- mtcars_binarized[, caret_features] # Generate all possible feature combinations for nomogram nomogram_features <- expand.grid(lapply(mtcars_selected, unique)) ``` ```{r Example 1 - Load subs 1 to avoid multi-threading, include=FALSE} data(nomogram_features) ``` ```{r table-1, echo=FALSE} head(nomogram_features) ``` ```{r Example 1 - Prediction outputs for the entire dataset, eval=FALSE} nomogram_outputs <- predict(model, nomogram_features, type = "prob") |> select(output = levels(mtcars_binarized$am)[2]) ``` ```{r Example 1 - Load subs 2 to avoid multi-threading, include=FALSE} data(nomogram_outputs) ``` ```{r table-2, echo=FALSE} head(nomogram_outputs) ``` ```{r Example 1 - Create nomogram} nomogram <- create_nomogram(nomogram_features, nomogram_outputs) ``` ```{r figure-1, echo=FALSE, fig.height=3, fig.width=12} nomogram ``` ```{r Example 1 - Compute SHAP values for the entire dataset, eval=FALSE} # Prepare data and model for SHAP value calculation using iml X <- mtcars_binarized[, -which(names(mtcars_binarized) == "am")] # Create a predictor object predictor <- Predictor$new(model = model, data = X) # Calculate SHAP values nomogram_shaps <- list() for(i in seq(nrow(nomogram_features))){ shapley <- Shapley$new(predictor, x.interest = nomogram_features[i, ]) nomogram_shaps[[i]] <- shapley$results } names(nomogram_shaps) <- seq(nrow(nomogram_features)) nomogram_shaps <- reduce(imap(nomogram_shaps, ~ mutate(.x, i = .y)), rbind) |> filter(class == levels(mtcars_binarized$am)[2]) |> select(i, feature, phi) |> spread(feature, phi) |> arrange(as.numeric(i)) |> column_to_rownames(var = "i") ``` ```{r Example 1 - Load subs 3 to avoid multi-threading, include=FALSE} data(nomogram_shaps) ``` ```{r table-3, echo=FALSE} head(nomogram_shaps) ``` ## 2 - Categorical predictors and binary outcome with probability ```{r Example 2 - Create nomogram} nomogram2 <- create_nomogram(nomogram_features, nomogram_outputs, prob = TRUE) ``` ```{r figure-2, echo=FALSE, fig.height=4.5, fig.width=10} nomogram2 ``` ```{r Example 2 - Create nomogram with SHAP} nomogram2_with_shap <- create_nomogram( nomogram_features, nomogram_outputs, nomogram_shaps , prob = TRUE ) ``` ```{r figure-3, echo=FALSE, fig.height=6, fig.width=12} nomogram2_with_shap ``` ## 3 - Categorical with single numerical predictors and binary outcome with probability ```{r Example 3 - Train a prediction model, eval=FALSE} # Reload original dataset data("mtcars") # Round to 0 decimal to reduce possible combinations later mtcars <- mutate(mtcars, qsec = round(qsec, 0)) # Add single numerical predictor to binarized predictors with the target variable mtcars_mixed <- cbind(mtcars["qsec"], select(mtcars_binarized, -qsec.1)) # Train a random forest model using the full dataset set.seed(1) model2 <- train( am ~ ., data = mtcars_mixed, method = "rf", trControl = trainControl(method = "none", classProbs = TRUE) ) ``` ```{r Example 3 - Entire dataset of combinations of the selected features, eval=FALSE} # Extract feature names used in the trained model from caret model object caret_features2 <- str_remove_all(model2$finalModel$xNames, "1$") # Modify the training dataset to include only the model features mtcars_selected2 <- mtcars_mixed[, caret_features2] # Generate all possible feature combinations for nomogram nomogram_features2 <- select_if(mtcars_selected2, is.numeric) |> lapply(\(x) seq(min(x), max(x))) |> c(lapply(select_if(mtcars_selected2, is.factor), unique)) |> expand.grid() ``` ```{r Example 3 - Load subs 1 to avoid multi-threading, include=FALSE} data(nomogram_features2) ``` ```{r table-4, echo=FALSE} head(nomogram_features2) ``` ```{r Example 3 - Prediction outputs for the entire dataset, eval=FALSE} nomogram_outputs2 <- predict(model2, nomogram_features2, type = "prob") |> select(output = levels(mtcars_mixed$am)[2]) ``` ```{r Example 3 - Load subs 2 to avoid multi-threading, include=FALSE} data(nomogram_outputs2) ``` ```{r Example 3 - Create nomogram} nomogram3 <- create_nomogram(nomogram_features2, nomogram_outputs2, prob = TRUE) ``` ```{r figure-4, echo=FALSE, fig.height=7.5, fig.width=12} nomogram3 ``` ```{r Example 3 - Compute SHAP values for the entire dataset, eval=FALSE} # Prepare data and model for SHAP value calculation using iml X2 <- mtcars_mixed[, -which(names(mtcars_mixed) == "am")] # Create a predictor object predictor2 <- Predictor$new(model = model2, data = X2) # Calculate SHAP values nomogram_shaps2 <- list() for(i in seq(nrow(nomogram_features2))){ shapley2 <- Shapley$new(predictor2, x.interest = nomogram_features2[i, ]) nomogram_shaps2[[i]] <- shapley2$results } names(nomogram_shaps2) <- seq(nrow(nomogram_features2)) nomogram_shaps2 <- reduce(imap(nomogram_shaps2, ~ mutate(.x, i = .y)), rbind) |> filter(class == levels(mtcars_mixed$am)[2]) |> select(i, feature, phi) |> spread(feature, phi) |> arrange(as.numeric(i)) |> column_to_rownames(var = "i") ``` ```{r Example 3 - Load subs 3 to avoid multi-threading, include=FALSE} data(nomogram_shaps2) ``` ```{r Example 3 - Create nomogram with SHAP} nomogram3_with_shap <- create_nomogram( nomogram_features2, nomogram_outputs2, nomogram_shaps2 , prob = TRUE ) ``` ```{r figure-5, echo=FALSE, fig.height=7.5, fig.width=12} nomogram3_with_shap ``` # Continuous outcome ## 4 - Categorical predictors and continuous outcome ```{r Example 4 - Train a prediction model, eval=FALSE} # Load dataset data("mtcars") # Preprocess for training a regressor outcome <- mtcars$wt # Convert some numerical features to categorical for demonstration mtcars$cyl <- factor(mtcars$cyl) mtcars$vs <- factor(mtcars$vs) mtcars$qsec <- factor(as.integer(mtcars$qsec >= 18)) # Using tidyverse to filter only factor columns mtcars <- cbind(select(mtcars, where(is.factor)), select(mtcars, wt)) # Create dummy variables for all factor variables, excluding the target variable dummy_vars2 <- dummyVars(wt ~ ., data = mtcars) |> predict(newdata = mtcars) # Combine binarized predictors with the target variable mtcars_binarized2 <- as.data.frame(dummy_vars2) |> mutate_all(as.factor) |> mutate(wt = outcome) |> select(-vs.0, -cyl.4, -qsec.0) # Train a random forest model using the full dataset set.seed(1) model3 <- train( wt ~ ., data = mtcars_binarized2, method = "rf", trControl = trainControl(method = "none") ) ``` ```{r Example 4 - Entire dataset of combinations of the selected features, eval=FALSE} # Extract feature names used in the trained model from caret model object caret_features3 <- str_remove_all(model3$finalModel$xNames, "1$") # Modify the training dataset to include only the model features mtcars_selected3 <- mtcars_binarized2[, caret_features3] # Generate all possible feature combinations for nomogram nomogram_features3 <- expand.grid(lapply(mtcars_selected3, unique)) ``` ```{r Example 4 - Load subs 1 to avoid multi-threading, include=FALSE} data(nomogram_features3) ``` ```{r Example 4 - Prediction outputs for the entire dataset, eval=FALSE} nomogram_outputs3 <- data.frame(output = predict(model3, nomogram_features3)) ``` ```{r Example 4 - Load subs 2 to avoid multi-threading, include=FALSE} data(nomogram_outputs3) ``` ```{r table-5, echo=FALSE} head(nomogram_outputs3) ``` ```{r Example 4 - Create nomogram} nomogram4 <- create_nomogram(nomogram_features3, nomogram_outputs3, est = TRUE) ``` ```{r figure-6, echo=FALSE, fig.height=4.5, fig.width=10} nomogram4 ``` ```{r Example 4 - Compute SHAP values for the entire dataset, eval=FALSE} # Prepare data and model for SHAP value calculation using iml X3 <- mtcars_binarized2[, -which(names(mtcars_binarized2) == "wt")] # Create a predictor object predictor3 <- Predictor$new(model = model3, data = X3) # Calculate SHAP values nomogram_shaps3 <- list() for(i in seq(nrow(nomogram_features3))){ shapley3 <- Shapley$new(predictor3, x.interest = nomogram_features3[i, ]) nomogram_shaps3[[i]] <- shapley3$results } names(nomogram_shaps3) <- seq(nrow(nomogram_features3)) nomogram_shaps3 <- reduce(imap(nomogram_shaps3, ~ mutate(.x, i = .y)), rbind) |> select(i, feature, phi) |> spread(feature, phi) |> arrange(as.numeric(i)) |> column_to_rownames(var = "i") ``` ```{r Example 4 - Load subs 3 to avoid multi-threading, include=FALSE} data(nomogram_shaps3) ``` ```{r Example 4 - Create nomogram with SHAP} nomogram4_with_shap <- create_nomogram( nomogram_features3, nomogram_outputs3, nomogram_shaps3 , est = TRUE ) ``` ```{r figure-7, echo=FALSE, fig.height=7.5, fig.width=12} nomogram4_with_shap ``` ## 5 - Categorical with single numerical predictors and continuous outcome ```{r Example 5 - Train a prediction model, eval=FALSE} # Reload original dataset data("mtcars") # Round to 0 decimal to reduce possible combinations later mtcars <- mutate(mtcars, qsec = round(qsec, 0)) # Add single numerical predictor to binarized predictors with the target variable mtcars_mixed2 <- cbind(mtcars["qsec"], select(mtcars_binarized2, -qsec.1)) # Train a random forest model using the full dataset set.seed(1) model4 <- train( wt ~ ., data = mtcars_mixed2, method = "rf", trControl = trainControl(method = "none") ) ``` ```{r Example 5 - Entire dataset of combinations of the selected features, eval=FALSE} # Extract feature names used in the trained model from caret model object caret_features4 <- str_remove_all(model4$finalModel$xNames, "1$") # Modify the training dataset to include only the model features mtcars_selected4 <- mtcars_mixed2[, caret_features4] # Generate all possible feature combinations for nomogram nomogram_features4 <- select_if(mtcars_selected4, is.numeric) |> lapply(\(x) seq(min(x), max(x))) |> c(lapply(select_if(mtcars_selected4, is.factor), unique)) |> expand.grid() ``` ```{r Example 5 - Load subs 1 to avoid multi-threading, include=FALSE} data(nomogram_features4) ``` ```{r Example 5 - Prediction outputs for the entire dataset, eval=FALSE} nomogram_outputs4 <- data.frame(output = predict(model4, nomogram_features4)) ``` ```{r Example 5 - Load subs 2 to avoid multi-threading, include=FALSE} data(nomogram_outputs4) ``` ```{r Example 5 - Create nomogram} nomogram5 <- create_nomogram(nomogram_features4, nomogram_outputs4, est = TRUE) ``` ```{r figure-8, echo=FALSE, fig.height=7.5, fig.width=12} nomogram5 ``` ```{r Example 5 - Compute SHAP values for the entire dataset, eval=FALSE} # Prepare data and model for SHAP value calculation using iml X4 <- mtcars_mixed2[, -which(names(mtcars_mixed2) == "wt")] # Create a predictor object predictor4 <- Predictor$new(model = model4, data = X4) # Calculate SHAP values nomogram_shaps4 <- list() for(i in seq(nrow(nomogram_features4))){ shapley4 <- Shapley$new(predictor4, x.interest = nomogram_features4[i, ]) nomogram_shaps4[[i]] <- shapley4$results } names(nomogram_shaps4) <- seq(nrow(nomogram_features4)) nomogram_shaps4 <- reduce(imap(nomogram_shaps4, ~ mutate(.x, i = .y)), rbind) |> select(i, feature, phi) |> spread(feature, phi) |> arrange(as.numeric(i)) |> column_to_rownames(var = "i") ``` ```{r Example 5 - Load subs 3 to avoid multi-threading, include=FALSE} data(nomogram_shaps4) ``` ```{r Example 5 - Create nomogram with SHAP} nomogram4_with_shap <- create_nomogram( nomogram_features4, nomogram_outputs4, nomogram_shaps4 , est = TRUE ) ``` ```{r figure-9, echo=FALSE, fig.height=7.5, fig.width=12} nomogram4_with_shap ``` ```{r session-info, echo=TRUE} sessionInfo() ```