Advanced Modelling Workflows with NRMstatsML

Overview

This vignette covers advanced workflows built on top of the core NRMstatsML modules, including:

  1. Combining trend and uncertainty analysis
  2. Multi-variable response surface exploration
  3. Integrating bootstrap uncertainty into forecasting
  4. Reading data from the bundled CSV (reproducible pipelines)
  5. Extending NRMstatsML with custom stat_fn closures

1. Loading Data from CSV

The package ships a plain-text version of nrm_example under inst/extdata for users who want to demonstrate a full import-to-analysis pipeline:

library(NRMstatsML)

csv_path <- system.file("extdata", "nrm_example.csv",
                        package = "NRMstatsML")
d <- read.csv(csv_path, stringsAsFactors = FALSE)

nrm_data_check(d, time_var = "year", verbose = TRUE)
#> === NRMstatsML: Data Check ===
#>   Rows       : 20
#>   Columns    : 10
#>   Numeric    : year, crop_yield, soil_OC, N, P, K, rainfall, runoff, biomass
#>   Status     : OK

2. Combining Trend Detection and Uncertainty

A common question in long-term NRM studies is: is the observed trend in yield real, or could it arise by chance? Bootstrap sampling answers this by resampling the dataset many times and computing Sen’s slope on each resample.

# Define a statistic function: Sen's slope on crop_yield
sens_stat <- function(df) {
  nrm_sens_slope(df, time_var = "year", value_var = "crop_yield")$slope
}

# Bootstrap the slope — 500 replicates for speed
bs_slope <- nrm_bootstrap(d, stat_fn = sens_stat, n_iter = 500)
print(bs_slope)
#> -- Bootstrap --
#>   Mean : -0.0009  SD : 0.0249  CI : [-0.0548, 0.0521]

If the 95% CI for the slope excludes zero, the trend is significant. This approach is more robust than a single-pass Mann-Kendall test because it directly quantifies slope uncertainty.


3. Multi-Variable Response Surface

When two inputs interact (e.g. N × rainfall), a full response surface reveals the joint optimum. We build this by iterating nrm_response_curve() across a grid and collecting predicted yields.

# Fit a quadratic surface: yield ~ N + rainfall (manual grid approach)
mv_surface <- nrm_multivariate(d,
  formula = crop_yield ~ N + I(N^2) + rainfall + I(rainfall^2) + N:rainfall,
  scale   = FALSE)

# Prediction grid
N_seq   <- seq(60, 120, length.out = 30)
R_seq   <- seq(610, 720, length.out = 30)
grid    <- expand.grid(N = N_seq, rainfall = R_seq)

grid$predicted_yield <- stats::predict(mv_surface$model, newdata = grid)

# Simple contour summary (base graphics — no extra dependencies)
with(
  reshape(grid, idvar = "N", timevar = "rainfall",
          direction = "wide")[, 1:10],   # abbreviated for display
  message(sprintf("Predicted yield range: %.2f to %.2f t/ha",
                  min(grid$predicted_yield),
                  max(grid$predicted_yield)))
)
# ggplot2 visualisation of the response surface
library(ggplot2)

ggplot(grid, aes(x = N, y = rainfall, fill = predicted_yield)) +
  geom_tile() +
  scale_fill_viridis_c(name = "Yield\n(t/ha)", option = "C") +
  geom_contour(aes(z = predicted_yield), colour = "white",
               alpha = 0.5, linewidth = 0.4) +
  labs(
    title    = "Predicted Yield Response Surface",
    subtitle = "Quadratic model: crop_yield ~ N × rainfall",
    x        = "Nitrogen applied (kg/ha)",
    y        = "Rainfall (mm)"
  ) +
  theme_minimal(base_size = 13)


4. Uncertainty-Adjusted Forecasting

Instead of a single ARIMA forecast, we propagate parametric uncertainty by running nrm_forecast() on bootstrap resamples of the historical record.

# Collect horizon-1 point forecasts from 200 bootstrap resamples
set.seed(42)
boot_forecasts <- replicate(200, {
  idx      <- sample(nrow(d), replace = TRUE)
  boot_d   <- d[sort(idx), ]          # keep time order approximately
  boot_d$year <- d$year               # restore exact year sequence
  fc <- nrm_forecast(boot_d, value_var = "crop_yield",
                     horizon = 1, method = "auto_arima")
  as.numeric(fc$forecast$mean)
}, simplify = TRUE)

cat(sprintf(
  "Bootstrap forecast (h=1):\n  Mean = %.3f t/ha\n  95%% CI: [%.3f, %.3f]\n",
  mean(boot_forecasts),
  quantile(boot_forecasts, 0.025),
  quantile(boot_forecasts, 0.975)
))
#> Bootstrap forecast (h=1):
#>   Mean = 5.448 t/ha
#>   95% CI: [5.157, 5.548]

5. Custom stat_fn Closures

nrm_uncertainty() and nrm_bootstrap() accept any function of the form f(data) → scalar. This makes them highly extensible.

Example A — R-squared of an OLS model

rsq_fn <- function(df) {
  m  <- lm(crop_yield ~ N + P + K, data = df)
  summary(m)$r.squared
}

bs_rsq <- nrm_bootstrap(d, stat_fn = rsq_fn, n_iter = 500)
print(bs_rsq)
#> -- Bootstrap --
#>   Mean : 0.9995  SD : 0.0002  CI : [0.9990, 0.9998]

Example B — Optimum N from a quadratic response curve

opt_N_fn <- function(df) {
  rc <- nrm_response_curve(df, input_var = "N",
                            response_var = "crop_yield",
                            type = "quadratic")
  opt <- nrm_optimize_input(rc, price_ratio = 0.6)
  opt$economic_optimum
}

bs_optN <- nrm_bootstrap(d, stat_fn = opt_N_fn, n_iter = 300)
cat(sprintf(
  "Bootstrapped economic optimum N:\n  Mean = %.1f kg/ha\n  95%% CI: [%.1f, %.1f]\n",
  bs_optN$mean, bs_optN$ci[1], bs_optN$ci[2]
))
#> Bootstrapped economic optimum N:
#>   Mean = -98660.8 kg/ha
#>   95% CI: [-279854.7, -11328.2]

Example C — Mann-Kendall tau for soil OC

tau_fn <- function(df) {
  nrm_mann_kendall(df, time_var = "year", value_var = "soil_OC")$tau
}

mc_tau <- nrm_monte_carlo(d, stat_fn = tau_fn, n_iter = 300, noise_sd = 0.05)
print(mc_tau)
#> -- Monte Carlo --
#>   Mean : 0.9972  SD : 0.0054  CI : [0.9789, 1.0000]

6. Multi-Metric Model Comparison

Compare OLS, quadratic OLS, and PLS side-by-side using nrm_benchmark() on a simple hold-out split.

set.seed(123)
n_d   <- nrow(d)
idx   <- sample(n_d, floor(0.75 * n_d))
train <- d[ idx, ]
test  <- d[-idx, ]

# Fit three candidate models
m_ols  <- lm(crop_yield ~ N + P + K + rainfall,          data = train)
m_quad <- lm(crop_yield ~ N + I(N^2) + rainfall,         data = train)
m_full <- lm(crop_yield ~ N + P + K + rainfall + soil_OC, data = train)

bm <- nrm_benchmark(
  models       = list(ols = m_ols, quadratic = m_quad, full_ols = m_full),
  test_data    = test,
  response_var = "crop_yield"
)
print(bm)
#> === NRMstatsML: Model Benchmark ===
#> 
#>      model   RMSE    MAE    Rsq
#>   full_ols 0.0279 0.0200 0.9974
#>        ols 0.0285 0.0191 0.9973
#>  quadratic 0.0435 0.0380 0.9937

7. Complete Reproducible Pipeline

Putting it all together — a single self-contained script suitable for a supplementary materials section:

library(NRMstatsML)

# 1. Import
d <- read.csv(system.file("extdata", "nrm_example.csv",
                           package = "NRMstatsML"))

# 2. Validate
nrm_data_check(d)

# 3. Trend
tr  <- nrm_trend(d, time_var = "year", value_var = "crop_yield")
nrm_plot(tr)

# 4. Multivariate
mv  <- nrm_multivariate(d, crop_yield ~ N + P + K + rainfall)
nrm_summary(mv)

# 5. Response curve and optimum
rc  <- nrm_response_curve(d, "N", "crop_yield", type = "quadratic")
opt <- nrm_optimize_input(rc, price_ratio = 0.6)
nrm_plot(rc)

# 6. Forecast
fc  <- nrm_forecast(d, "crop_yield", horizon = 5)
nrm_plot(fc)

# 7. Uncertainty
unc <- nrm_uncertainty(d,
        stat_fn = function(x) mean(x$crop_yield),
        method  = "bootstrap", n_iter = 1000)
print(unc)

Session Info

sessionInfo()
#> R version 4.6.0 (2026-04-24)
#> Platform: x86_64-pc-linux-gnu
#> Running under: Ubuntu 24.04.4 LTS
#> 
#> Matrix products: default
#> BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3 
#> LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.26.so;  LAPACK version 3.12.0
#> 
#> locale:
#>  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
#>  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
#>  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
#>  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
#>  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
#> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
#> 
#> time zone: Etc/UTC
#> tzcode source: system (glibc)
#> 
#> attached base packages:
#> [1] stats     graphics  grDevices utils     datasets  methods   base     
#> 
#> other attached packages:
#> [1] ggplot2_4.0.3    NRMstatsML_0.1.4 rmarkdown_2.31  
#> 
#> loaded via a namespace (and not attached):
#>  [1] sass_0.4.10         generics_0.1.4      lattice_0.22-9     
#>  [4] digest_0.6.39       magrittr_2.0.5      evaluate_1.0.5     
#>  [7] grid_4.6.0          RColorBrewer_1.1-3  fastmap_1.2.0      
#> [10] jsonlite_2.0.0      trend_1.1.6         forecast_9.0.2     
#> [13] extraDistr_1.10.0.4 viridisLite_0.4.3   scales_1.4.0       
#> [16] isoband_0.3.0       jquerylib_0.1.4     cli_3.6.6          
#> [19] rlang_1.2.0         Kendall_2.2.2       withr_3.0.2        
#> [22] cachem_1.1.0        yaml_2.3.12         otel_0.2.0         
#> [25] tools_4.6.0         parallel_4.6.0      dplyr_1.2.1        
#> [28] colorspace_2.1-2    boot_1.3-32         buildtools_1.0.0   
#> [31] vctrs_0.7.3         R6_2.6.1            zoo_1.8-15         
#> [34] lifecycle_1.0.5     pkgconfig_2.0.3     urca_1.3-4         
#> [37] pillar_1.11.1       bslib_0.11.0        gtable_0.3.6       
#> [40] glue_1.8.1          Rcpp_1.1.1-1.1      xfun_0.58          
#> [43] tibble_3.3.1        tidyselect_1.2.1    sys_3.4.3          
#> [46] knitr_1.51          farver_2.1.2        htmltools_0.5.9    
#> [49] nlme_3.1-169        maketools_1.3.2     labeling_0.4.3     
#> [52] timeDate_4052.112   fracdiff_1.5-4      compiler_4.6.0     
#> [55] S7_0.2.2