ensembleML provides a single, consistent
API for ensemble machine learning in R. Regardless of which
algorithm you choose, the core workflow is always:
em_fit() -> em_predict() -> em_evaluate()
Advanced usage adds:
em_cv() # k-fold cross-validation (stability estimates)
em_tune() # grid-search hyperparameter optimisation
em_compare() # side-by-side algorithm comparison
em_importance() # feature importance
em_partial() # partial dependence plots
em_confusion() # confusion matrix heatmap
em_calibration() # calibration / reliability diagram
em_residuals() # regression diagnostics
data(iris)
set.seed(42)
idx <- sample(nrow(iris), 120)
train <- iris[idx, ]
test <- iris[-idx, ]
rf <- em_fit(Species ~ ., data = train, method = "random_forest",
verbose = TRUE)
#> [ensembleML] task auto-detected as 'classification'
#>
#> ╭────────────────────────────────────────────────────╮
#> │ Algorithm: random_forest │
#> │ Task: classification │
#> │ Response: Species │
#> │ Classes: setosa, versicolor, virginica│
#> │ Predictors: 4 (Sepal.Length, Sepal.Width, Petal.Length, …)│
#> │ Training n: 120 │
#> │ Fit time: 0.025 sec │
#> │ Train metrics: accuracy=1.0000 kappa=1.0000 precision=1.0000 recall=1.0000 f1=1.0000 auc=NA│
#> │ ⚠ Use em_evaluate() on held-out data │
#> ╰────────────────────────────────────────────────────╯

Switching algorithms requires changing a single argument:
xgb <- em_fit(Species ~ ., data = train, method = "xgboost")
ada <- em_fit(Species ~ ., data = train, method = "adaboost")
bag <- em_fit(Species ~ ., data = train, method = "bagging")

preds <- em_predict(rf, newdata = test)
head(preds)
#> 7 11 12 19 23 28
#> setosa setosa setosa setosa setosa setosa
#> Levels: setosa versicolor virginica

Class probabilities:
probs <- em_predict(rf, newdata = test, type = "prob")
head(probs, 3)
#> setosa versicolor virginica
#> 7 1.000 0.000 0
#> 11 0.998 0.002 0
#> 12 1.000 0.000 0

em_evaluate(rf, newdata = test)
#> accuracy kappa precision recall f1 auc
#> 0.9333 0.8997 0.9364 0.9364 0.9364 NA

Select specific metrics:
em_evaluate(rf, newdata = test, metrics = c("accuracy", "f1", "kappa"))
#> accuracy f1 kappa
#> 0.9333 0.9364 0.8997

Use em_cv() to get the mean +/- SD of each metric across folds before
committing to a method:
cv_res <- em_cv(Species ~ ., data = iris, method = "random_forest",
cv_folds = 5, repeats = 3)
cv_res$summary
em_plot_cv(cv_res, metric = "accuracy")

grid <- list(ntree = c(100, 300, 500), mtry = c(1, 2, 3))
tuned <- em_tune(
Species ~ ., data = train, method = "random_forest",
param_grid = grid, cv_folds = 5
)
tuned$best_params
tuned$best_score
tuned$results

Everything works identically for numeric responses:
set.seed(7)
reg_data <- data.frame(
x1 = rnorm(200), x2 = rnorm(200),
y = 3 + 2 * rnorm(200) + rnorm(200))
reg_train <- reg_data[1:160, ]
reg_test <- reg_data[161:200, ]
reg_model <- em_fit(y ~ ., data = reg_train, method = "random_forest")
#> [ensembleML] task auto-detected as 'regression'
em_evaluate(reg_model, reg_test)
#> rmse mae mape rsq adj_rsq
#> 2.4320 1.8556 88.1007 -0.2193 -0.2852
em_residuals(reg_model, reg_test)
#> `geom_smooth()` using formula = 'y ~ x'

If you use ensembleML in published work, please cite it; run
citation("ensembleML") to obtain the reference.
The individual algorithms should also be cited — see
citation("ensembleML") for the full list of references.
sessionInfo()
#> R version 4.6.0 (2026-04-24)
#> Platform: x86_64-pc-linux-gnu
#> Running under: Ubuntu 24.04.4 LTS
#>
#> Matrix products: default
#> BLAS: /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
#> LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.26.so; LAPACK version 3.12.0
#>
#> locale:
#> [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
#> [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
#> [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
#> [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
#> [9] LC_ADDRESS=C LC_TELEPHONE=C
#> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
#>
#> time zone: Etc/UTC
#> tzcode source: system (glibc)
#>
#> attached base packages:
#> [1] stats graphics grDevices utils datasets methods base
#>
#> other attached packages:
#> [1] ensembleML_0.2.5 rmarkdown_2.31
#>
#> loaded via a namespace (and not attached):
#> [1] Matrix_1.7-5 randomForest_4.7-1.2 gtable_0.3.6
#> [4] jsonlite_2.0.0 dplyr_1.2.1 compiler_4.6.0
#> [7] tidyselect_1.2.1 gridExtra_2.3 jquerylib_0.1.4
#> [10] splines_4.6.0 scales_1.4.0 yaml_2.3.12
#> [13] fastmap_1.2.0 lattice_0.22-9 ggplot2_4.0.3
#> [16] R6_2.6.1 labeling_0.4.3 generics_0.1.4
#> [19] knitr_1.51 tibble_3.3.1 maketools_1.3.2
#> [22] bslib_0.11.0 pillar_1.11.1 RColorBrewer_1.1-3
#> [25] rlang_1.2.0 cachem_1.1.0 xfun_0.58
#> [28] sass_0.4.10 sys_3.4.3 S7_0.2.2
#> [31] otel_0.2.0 cli_3.6.6 mgcv_1.9-4
#> [34] withr_3.0.2 magrittr_2.0.5 digest_0.6.39
#> [37] grid_4.6.0 nlme_3.1-169 lifecycle_1.0.5
#> [40] vctrs_0.7.3 evaluate_1.0.5 glue_1.8.1
#> [43] farver_2.1.2 buildtools_1.0.0 tools_4.6.0
#> [46] pkgconfig_2.0.3 htmltools_0.5.9