Programming environment

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
library(iml)
library(rmlnomogram)
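
If rmlnomogram is not yet installed, it can be added beforehand; a minimal sketch, assuming the released package is available on CRAN under the same name:

# Run once before the analysis; commented out so knitting does not reinstall
# install.packages("rmlnomogram")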

Binary outcome (or class-wise multinomial outcome)

1 - Categorical predictors and binary outcome without probability

# Load dataset
data("mtcars")

# Preprocess for training a classifier
mtcars$am <- factor(mtcars$am, levels = c(0, 1), labels = c("Auto", "Manual"))

# Convert some numerical features to categorical for demonstration
mtcars$cyl <- factor(mtcars$cyl)
mtcars$vs <- factor(mtcars$vs)
mtcars$qsec <- factor(as.integer(mtcars$qsec >= 18))

# Keep only the factor columns (the target am is already a factor)
mtcars <- select(mtcars, where(is.factor))

# Create dummy variables for all factor variables, excluding the target variable
dummy_vars <- dummyVars(am ~ ., data = mtcars) |> 
  predict(newdata = mtcars)

# Combine binarized predictors with the target variable, dropping one reference
# dummy per factor (vs.0, cyl.4, qsec.0) since each is redundant with its complement
mtcars_binarized <- as.data.frame(dummy_vars) |>
  mutate(across(everything(), as.factor)) |>
  mutate(am = mtcars$am) |>
  select(-vs.0, -cyl.4, -qsec.0)

# Train a random forest model on the full dataset; trainControl(method = "none")
# skips resampling, so the model is fit once on all rows
set.seed(1)
model <- train(
  am ~ ., 
  data = mtcars_binarized, 
  method = "rf", 
  trControl = trainControl(method = "none", classProbs = TRUE)
)
# Extract the feature names used in the trained model; caret appends the factor
# level ("1") to each dummy name, so strip it to recover the original columns
caret_features <- str_remove_all(model$finalModel$xNames, "1$")

# Modify the training dataset to include only the model features
mtcars_selected <- mtcars_binarized[, caret_features]

# Generate all possible feature combinations for nomogram
nomogram_features <- expand.grid(lapply(mtcars_selected, unique))
##   cyl.6 cyl.8 qsec.1 vs.1
## 1     1     0      0    0
## 2     0     0      0    0
## 3     1     1      0    0
## 4     0     1      0    0
## 5     1     0      1    0
## 6     0     0      1    0
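
Only the first six combinations are shown; with four binary features the full grid has 2^4 = 16 rows, including combinations that never occur in the data (e.g., cyl.6 = 1 and cyl.8 = 1).
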
nomogram_outputs <- predict(model, nomogram_features, type = "prob") |>
  select(output = all_of(levels(mtcars_binarized$am)[2]))
##   output
## 1  0.884
## 2  0.818
## 3  0.246
## 4  0.002
## 5  0.498
## 6  0.700
nomogram <- create_nomogram(nomogram_features, nomogram_outputs)
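
The returned nomogram can be displayed and saved like any other plot; a minimal sketch, assuming create_nomogram() returns a ggplot object (the package builds its plots with ggplot2):

# Display the nomogram in the active graphics device
print(nomogram)

# Save it to disk; the file name and dimensions here are arbitrary
ggsave("nomogram.png", nomogram, width = 8, height = 6, dpi = 300)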

# Prepare data and model for SHAP value calculation using iml
X <- mtcars_binarized[, -which(names(mtcars_binarized) == "am")]

# Create a predictor object
predictor <- Predictor$new(model = model, data = X)

# Calculate SHAP values
nomogram_shaps <- list()

for(i in seq(nrow(nomogram_features))){
  shapley <- Shapley$new(predictor, x.interest = nomogram_features[i, ])
  nomogram_shaps[[i]] <- shapley$results
}
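
Note that iml estimates Shapley values by Monte Carlo sampling, so the phi values vary slightly between runs; the sample.size argument (default 100) trades accuracy for speed. A minimal sketch of a more stable, reproducible run for a single row:

# A larger sample.size reduces sampling noise in the SHAP estimates;
# the seed makes the Monte Carlo draws reproducible
set.seed(1)
shapley_stable <- Shapley$new(predictor, x.interest = nomogram_features[1, ],
                              sample.size = 1000)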

names(nomogram_shaps) <- seq(nrow(nomogram_features))

nomogram_shaps <- bind_rows(nomogram_shaps, .id = "i") |>
  filter(class == levels(mtcars_binarized$am)[2]) |>
  select(i, feature, phi) |>
  pivot_wider(names_from = feature, values_from = phi) |>
  arrange(as.numeric(i)) |>
  column_to_rownames(var = "i")
##   cyl.6 cyl.8 qsec.1 vs.1
## 1 -0.09  0.53   0.19    0
## 2  0.03  0.50   0.04    0
## 3 -0.05 -0.42   0.06    0
## 4  0.06 -0.50   0.09    0
## 5 -0.43  0.33  -0.25    0
## 6  0.13  0.38  -0.05    0
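
The nomogram workflow relies on nomogram_shaps being row-aligned with nomogram_features (the arrange(as.numeric(i)) step above restores the original order); a quick consistency check before plotting:

# Both objects must have one row per feature combination, in the same order
stopifnot(nrow(nomogram_shaps) == nrow(nomogram_features))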

2 - Categorical predictors and binary outcome with probability

nomogram2 <- create_nomogram(nomogram_features, nomogram_outputs, prob = TRUE)

nomogram2_with_shap <-
  create_nomogram(
    nomogram_features, nomogram_outputs, nomogram_shaps
    , prob = TRUE
  )

3 - Categorical predictors with a single numerical predictor and binary outcome with probability

# Reload original dataset
data("mtcars")

# Round qsec to 0 decimals to reduce the number of possible combinations later
mtcars <- mutate(mtcars, qsec = round(qsec, 0))

# Combine the single numerical predictor with the binarized predictors and the target variable
mtcars_mixed <- cbind(mtcars["qsec"], select(mtcars_binarized, -qsec.1))

# Train a random forest model using the full dataset
set.seed(1)
model2 <- train(
  am ~ ., 
  data = mtcars_mixed, 
  method = "rf", 
  trControl = trainControl(method = "none", classProbs = TRUE)
)
# Extract feature names used in the trained model from caret model object
caret_features2 <- str_remove_all(model2$finalModel$xNames, "1$")

# Modify the training dataset to include only the model features
mtcars_selected2 <- mtcars_mixed[, caret_features2]

# Generate all possible feature combinations for nomogram
nomogram_features2 <-
  select(mtcars_selected2, where(is.numeric)) |>
  lapply(\(x) seq(min(x), max(x))) |>
  c(lapply(select(mtcars_selected2, where(is.factor)), unique)) |>
  expand.grid()
##   qsec cyl.6 cyl.8 vs.1
## 1   14     1     0    0
## 2   15     1     0    0
## 3   16     1     0    0
## 4   17     1     0    0
## 5   18     1     0    0
## 6   19     1     0    0
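
Because expand.grid() crosses every value of the qsec sequence with every combination of the factor levels, the grid grows multiplicatively; a quick check of its size before predicting:

# One row per qsec step times each combination of the three binary features
nrow(nomogram_features2)
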
nomogram_outputs2 <- predict(model2, nomogram_features2, type = "prob") |>
  select(output = all_of(levels(mtcars_mixed$am)[2]))
nomogram3 <- create_nomogram(nomogram_features2, nomogram_outputs2, prob = TRUE)

# Prepare data and model for SHAP value calculation using iml
X2 <- mtcars_mixed[, -which(names(mtcars_mixed) == "am")]

# Create a predictor object
predictor2 <- Predictor$new(model = model2, data = X2)

# Calculate SHAP values
nomogram_shaps2 <- list()

for(i in seq(nrow(nomogram_features2))){
  shapley2 <- Shapley$new(predictor2, x.interest = nomogram_features2[i, ])
  nomogram_shaps2[[i]] <- shapley2$results
}

names(nomogram_shaps2) <- seq(nrow(nomogram_features2))

nomogram_shaps2 <- bind_rows(nomogram_shaps2, .id = "i") |>
  filter(class == levels(mtcars_mixed$am)[2]) |>
  select(i, feature, phi) |>
  pivot_wider(names_from = feature, values_from = phi) |>
  arrange(as.numeric(i)) |>
  column_to_rownames(var = "i")
nomogram3_with_shap <-
  create_nomogram(
    nomogram_features2, nomogram_outputs2, nomogram_shaps2
    , prob = TRUE
  )

Continuous outcome

4 - Categorical predictors and continuous outcome

# Load dataset
data("mtcars")

# Preprocess for training a regressor
outcome <- mtcars$wt

# Convert some numerical features to categorical for demonstration
mtcars$cyl <- factor(mtcars$cyl)
mtcars$vs <- factor(mtcars$vs)
mtcars$qsec <- factor(as.integer(mtcars$qsec >= 18))

# Keep only the factor columns plus the continuous outcome wt
mtcars <- cbind(select(mtcars, where(is.factor)), select(mtcars, wt))

# Create dummy variables for all factor variables, excluding the target variable
dummy_vars2 <- dummyVars(wt ~ ., data = mtcars) |> 
  predict(newdata = mtcars)

# Combine binarized predictors with the target variable, dropping the reference dummies as before
mtcars_binarized2 <- as.data.frame(dummy_vars2) |>
  mutate(across(everything(), as.factor)) |>
  mutate(wt = outcome) |>
  select(-vs.0, -cyl.4, -qsec.0)

# Train a random forest model using the full dataset
set.seed(1)
model3 <- train(
  wt ~ ., 
  data = mtcars_binarized2, 
  method = "rf", 
  trControl = trainControl(method = "none")
)
# Extract feature names used in the trained model from caret model object
caret_features3 <- str_remove_all(model3$finalModel$xNames, "1$")

# Modify the training dataset to include only the model features
mtcars_selected3 <- mtcars_binarized2[, caret_features3]

# Generate all possible feature combinations for nomogram
nomogram_features3 <- expand.grid(lapply(mtcars_selected3, unique))
nomogram_outputs3 <- data.frame(output = predict(model3, nomogram_features3))
##     output
## 1 2.958282
## 2 3.010966
## 3 3.537469
## 4 3.878820
## 5 3.037775
## 6 2.971140
nomogram4 <- create_nomogram(nomogram_features3, nomogram_outputs3, est = TRUE)

# Prepare data and model for SHAP value calculation using iml
X3 <- mtcars_binarized2[, -which(names(mtcars_binarized2) == "wt")]

# Create a predictor object
predictor3 <- Predictor$new(model = model3, data = X3)

# Calculate SHAP values
nomogram_shaps3 <- list()

for(i in seq(nrow(nomogram_features3))){
  shapley3 <- Shapley$new(predictor3, x.interest = nomogram_features3[i, ])
  nomogram_shaps3[[i]] <- shapley3$results
}

names(nomogram_shaps3) <- seq(nrow(nomogram_features3))

nomogram_shaps3 <- bind_rows(nomogram_shaps3, .id = "i") |>
  select(i, feature, phi) |>
  pivot_wider(names_from = feature, values_from = phi) |>
  arrange(as.numeric(i)) |>
  column_to_rownames(var = "i")
nomogram4_with_shap <-
  create_nomogram(
    nomogram_features3, nomogram_outputs3, nomogram_shaps3
    , est = TRUE
  )

5 - Categorical predictors with a single numerical predictor and continuous outcome

# Reload original dataset
data("mtcars")

# Round qsec to 0 decimals to reduce the number of possible combinations later
mtcars <- mutate(mtcars, qsec = round(qsec, 0))

# Combine the single numerical predictor with the binarized predictors and the target variable
mtcars_mixed2 <- cbind(mtcars["qsec"], select(mtcars_binarized2, -qsec.1))

# Train a random forest model using the full dataset
set.seed(1)
model4 <- train(
  wt ~ ., 
  data = mtcars_mixed2, 
  method = "rf", 
  trControl = trainControl(method = "none")
)
# Extract feature names used in the trained model from caret model object
caret_features4 <- str_remove_all(model4$finalModel$xNames, "1$")

# Modify the training dataset to include only the model features
mtcars_selected4 <- mtcars_mixed2[, caret_features4]

# Generate all possible feature combinations for nomogram
nomogram_features4 <-
  select(mtcars_selected4, where(is.numeric)) |>
  lapply(\(x) seq(min(x), max(x))) |>
  c(lapply(select(mtcars_selected4, where(is.factor)), unique)) |>
  expand.grid()
nomogram_outputs4 <- data.frame(output = predict(model4, nomogram_features4))
nomogram5 <- create_nomogram(nomogram_features4, nomogram_outputs4, est = TRUE)

# Prepare data and model for SHAP value calculation using iml
X4 <- mtcars_mixed2[, -which(names(mtcars_mixed2) == "wt")]

# Create a predictor object
predictor4 <- Predictor$new(model = model4, data = X4)

# Calculate SHAP values
nomogram_shaps4 <- list()

for(i in seq(nrow(nomogram_features4))){
  shapley4 <- Shapley$new(predictor4, x.interest = nomogram_features4[i, ])
  nomogram_shaps4[[i]] <- shapley4$results
}

names(nomogram_shaps4) <- seq(nrow(nomogram_features4))

nomogram_shaps4 <- bind_rows(nomogram_shaps4, .id = "i") |>
  select(i, feature, phi) |>
  pivot_wider(names_from = feature, values_from = phi) |>
  arrange(as.numeric(i)) |>
  column_to_rownames(var = "i")
nomogram5_with_shap <-
  create_nomogram(
    nomogram_features4, nomogram_outputs4, nomogram_shaps4
    , est = TRUE
  )

sessionInfo()
## R version 4.4.1 (2024-06-14)
## Platform: aarch64-unknown-linux-gnu
## Running under: Ubuntu 22.04.5 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/aarch64-linux-gnu/openblas-pthread/libblas.so.3 
## LAPACK: /usr/lib/aarch64-linux-gnu/openblas-pthread/libopenblasp-r0.3.20.so;  LAPACK version 3.10.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=C              
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: Etc/UTC
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] rmlnomogram_0.1.2 iml_0.11.3        caret_6.0-94      lattice_0.22-6   
##  [5] lubridate_1.9.3   forcats_1.0.0     stringr_1.5.1     dplyr_1.1.4      
##  [9] purrr_1.0.2       readr_2.1.5       tidyr_1.3.1       tibble_3.2.1     
## [13] ggplot2_3.5.1     tidyverse_2.0.0  
## 
## loaded via a namespace (and not attached):
##  [1] tidyselect_1.2.1     timeDate_4041.110    farver_2.1.2        
##  [4] fastmap_1.2.0        pROC_1.18.5          digest_0.6.37       
##  [7] rpart_4.1.23         timechange_0.3.0     lifecycle_1.0.4     
## [10] survival_3.6-4       magrittr_2.0.3       compiler_4.4.1      
## [13] rlang_1.1.4          sass_0.4.9           tools_4.4.1         
## [16] utf8_1.2.4           yaml_2.3.10          data.table_1.16.2   
## [19] ggsignif_0.6.4       knitr_1.48           labeling_0.4.3      
## [22] plyr_1.8.9           abind_1.4-8          withr_3.0.2         
## [25] Metrics_0.1.4        nnet_7.3-19          grid_4.4.1          
## [28] stats4_4.4.1         fansi_1.0.6          ggpubr_0.6.0        
## [31] colorspace_2.1-1     future_1.34.0        globals_0.16.3      
## [34] scales_1.3.0         iterators_1.0.14     MASS_7.3-60.2       
## [37] cli_3.6.3            rmarkdown_2.28       generics_0.1.3      
## [40] rstudioapi_0.17.1    future.apply_1.11.3  reshape2_1.4.4      
## [43] tzdb_0.4.0           cachem_1.1.0         splines_4.4.1       
## [46] parallel_4.4.1       vctrs_0.6.5          hardhat_1.4.0       
## [49] Matrix_1.7-0         carData_3.0-5        jsonlite_1.8.9      
## [52] car_3.1-3            hms_1.1.3            rstatix_0.7.2       
## [55] Formula_1.2-5        listenv_0.9.1        foreach_1.5.2       
## [58] gower_1.0.1          jquerylib_0.1.4      recipes_1.1.0       
## [61] glue_1.8.0           parallelly_1.38.0    codetools_0.2-20    
## [64] cowplot_1.1.3        stringi_1.8.4        gtable_0.3.6        
## [67] munsell_0.5.1        pillar_1.9.0         htmltools_0.5.8.1   
## [70] ipred_0.9-15         lava_1.8.0           R6_2.5.1            
## [73] evaluate_1.0.1       highr_0.11           backports_1.5.0     
## [76] broom_1.0.7          bslib_0.8.0          class_7.3-22        
## [79] Rcpp_1.0.13          nlme_3.1-164         prodlim_2024.06.25  
## [82] checkmate_2.3.2      xfun_0.48            pkgconfig_2.0.3     
## [85] ModelMetrics_1.2.2.2