This vignette reproduces the main results of
Agustini, M., Fithriasari, K., & Prastyo, D.D. (2026). An accuracy-level method for robust evaluation in predictive analytics. Decision Analytics Journal, 18, 100661. doi:10.1016/j.dajour.2025.100661
It covers the simple case (Table 4–6), the
regression-with-outliers simulation, and the
time-series case, plus the integration helpers for
caret, tidymodels, and forecast.
The imputation case study is intentionally omitted: it
relies on confidential firm-level microdata from BPS-Statistics
Indonesia that cannot be redistributed.
The package ships no datasets. The article’s data come from public sources that are referenced by link only:
IPG3113N, public
domain);To keep this vignette fully reproducible and free of downloads, the regression and time-series sections below use small simulated datasets generated inline with a fixed seed. Code for downloading the real public series is shown but not executed.
The ten observations of Table 4 are reproduced exactly. Model 1 is the baseline and the threshold uses the second quartile (Q2) of the error, where the APE is closest to 0.10.
actual <- c(7.00, 6.03, 2.02, 5.10, 9.00, 1.00, 3.00, 4.38, 1.00, 8.07)
model1 <- c(6.05, 5.02, 1.32, 5.15, 8.00, 2.20, 2.70, 3.48, 1.00, 7.56)
model2 <- c(8.10, 7.04, 2.12, 5.20, 9.10, 1.00, 3.08, 4.40, 1.00, 6.15)
model3 <- c(7.01, 6.04, 2.09, 5.11, 9.01, 5.10, 3.01, 4.39, 1.00, 8.10)conv <- rbind(
Model1 = conventional_metrics(actual, model1),
Model2 = conventional_metrics(actual, model2),
Model3 = conventional_metrics(actual, model3)
)
round(conv, 4)
#> R_squared RMSE NRMSE MAE MAPE SMAPE
#> Model1 0.9194 0.7756 0.1664 0.662 23.3934 20.2449
#> Model2 0.9202 0.7716 0.1656 0.443 6.7401 6.7994
#> Model3 0.7746 1.2968 0.2783 0.426 41.5015 13.9380
rob <- rbind(
Model1 = robust_metrics(actual, model1),
Model2 = robust_metrics(actual, model2),
Model3 = robust_metrics(actual, model3)
)
round(rob, 4)
#> MedAE TMSE Huber_Loss Quantile_Loss
#> Model1 0.80 0.5719 0.2988 0.3310
#> Model2 0.10 0.2834 0.2548 0.2215
#> Model3 0.01 0.0008 0.3603 0.2130thresh <- calculate_threshold(actual, model1, error_type = "ape", quartile = 2)
thresh
#> Accuracy-Level Threshold
#> ========================
#> Error type: ape
#> Quartile: Q2
#> Base threshold (T): 0.1111
#> Multipliers: 2, 5
#>
#> Level Boundaries (ape):
#> L1: error < 0.1111
#> L2: 0.1111 <= error < 0.2222
#> L3: 0.2222 <= error < 0.5556
#> L4: error >= 0.5556
#>
#> Baseline Q2 for all error types:
#> SE: 0.49
#> AE: 0.7
#> APE: 0.1111
#> sAPE: 0.1176
al1 <- accuracy_level(actual, model1, threshold = thresh)
al2 <- accuracy_level(actual, model2, threshold = thresh)
al3 <- accuracy_level(actual, model3, threshold = thresh)
al1$metrics
#> Level CSE CAE CAPE SCAPE
#> 1 L1 40 40 40 40
#> 2 L2 30 60 40 40
#> 3 L3 30 0 10 10
#> 4 L4 0 0 10 10
al2$metrics
#> Level CSE CAE CAPE SCAPE
#> 1 L1 70 70 70 70
#> 2 L2 0 20 20 20
#> 3 L3 20 10 10 10
#> 4 L4 10 0 0 0
al3$metrics
#> Level CSE CAE CAPE SCAPE
#> 1 L1 90 90 90 90
#> 2 L2 0 0 0 0
#> 3 L3 0 0 0 0
#> 4 L4 10 10 10 10These match Table 6 of the paper exactly: Model 3 reaches 90% at Level 1 for every metric, while conventional metrics favour Model 2.
res <- compare_models(
Model1 = list(actual = actual, predicted = model1),
Model2 = list(actual = actual, predicted = model2),
Model3 = list(actual = actual, predicted = model3),
metric = "cape", threshold = thresh
)
res$optimal_model
#> [1] "Model3"
res$comparison[, c("Model", "L1", "L2", "L3", "L4")]
#> Model L1 L2 L3 L4
#> 1 Model1 40 40 10 10
#> 2 Model2 70 20 10 0
#> 3 Model3 90 0 0 10The article injects outliers into a clean regression and shows that the proposed metrics degrade gradually, unlike RMSE/MAPE. Here we mimic that with a small simulated dataset.
set.seed(2026)
n <- 200
x <- runif(n, 0, 100)
y <- 1.5 * x + rnorm(n, 0, 3) # clean response
pred_clean <- 1.5 * x # model prediction
# 5% outliers injected into the response
y_out <- y
idx <- sample(n, size = 0.05 * n)
y_out[idx] <- y_out[idx] + 80
baseline <- calculate_threshold(y, pred_clean, error_type = "ape", quartile = 3)
clean <- conventional_metrics(y, pred_clean)
dirty <- conventional_metrics(y_out, pred_clean)
rbind(clean = round(clean, 3), with_outliers = round(dirty, 3))
#> R_squared RMSE NRMSE MAE MAPE SMAPE
#> clean 0.996 2.77 0.040 2.220 11.495 10.756
#> with_outliers 0.853 17.99 0.248 6.078 14.044 14.654Conventional metrics (especially MAPE/RMSE) jump sharply. The proposed Level-1 accuracy, evaluated against a fixed baseline threshold, moves far less:
al_clean <- accuracy_level(y, pred_clean, threshold = baseline)
al_dirty <- accuracy_level(y_out, pred_clean, threshold = baseline)
data.frame(
scenario = c("clean", "with_outliers"),
CSE_L1 = c(al_clean$metrics$CSE[1], al_dirty$metrics$CSE[1]),
CAPE_L1 = c(al_clean$metrics$CAPE[1], al_dirty$metrics$CAPE[1])
)
#> scenario CSE_L1 CAPE_L1
#> 1 clean 74.5 74.5
#> 2 with_outliers 71.5 71.0The paper forecasts U.S. candy production (FRED
IPG3113N). That series is public; you could load it
directly:
# Public source (not run during vignette build):
# https://www.kaggle.com/code/goldens/candy-production-time-series-analysis
candy <- read.csv("candy_production.csv")For a self-contained demonstration we simulate a seasonal series and compare two naive forecasts.
set.seed(1)
m <- 48
trend <- seq(80, 120, length.out = m)
season <- 10 * sin(2 * pi * (1:m) / 12)
candy <- trend + season + rnorm(m, 0, 3)
fc_seasonal <- c(candy[1:12], candy[1:(m - 12)]) # seasonal-naive-like
fc_mean <- rep(mean(candy), m) # mean forecast
base_ts <- calculate_threshold(candy, fc_seasonal, error_type = "ape", quartile = 2)
data.frame(
model = c("seasonal", "mean"),
CSE_L1 = c(accuracy_level(candy, fc_seasonal, threshold = base_ts)$metrics$CSE[1],
accuracy_level(candy, fc_mean, threshold = base_ts)$metrics$CSE[1])
)
#> model CSE_L1
#> 1 seasonal 47.91667
#> 2 mean 43.75000The integration helpers require optional packages; the chunks below run only when those packages are installed.
library(caret)
#> Loading required package: ggplot2
#> Loading required package: lattice
#>
#> Attaching package: 'caret'
#> The following object is masked from 'package:accuracylevel':
#>
#> compare_models
dat <- data.frame(y = y, x = x)
ctrl <- trainControl(method = "cv", number = 3,
summaryFunction = caret_summary())
fit <- train(y ~ x, data = dat, method = "lm",
trControl = ctrl, metric = "CSE_L1", maximize = TRUE)
fit$results[, c("RMSE", "CSE_L1", "CAE_L1")]
#> RMSE CSE_L1 CAE_L1
#> 1 2.747379 74.49872 74.49872library(yardstick)
#>
#> Attaching package: 'yardstick'
#> The following objects are masked from 'package:caret':
#>
#> precision, recall, sensitivity, specificity
df <- data.frame(truth = actual, estimate = model3)
accuracy_level_metrics(df, truth, estimate)
#> # A tibble: 16 × 3
#> .metric .estimator .estimate
#> <chr> <chr> <dbl>
#> 1 cse_l1 standard 70
#> 2 cse_l2 standard 10
#> 3 cse_l3 standard 0
#> 4 cse_l4 standard 20
#> 5 cae_l1 standard 70
#> 6 cae_l2 standard 10
#> 7 cae_l3 standard 10
#> 8 cae_l4 standard 10
#> 9 cape_l1 standard 70
#> 10 cape_l2 standard 10
#> 11 cape_l3 standard 0
#> 12 cape_l4 standard 20
#> 13 scape_l1 standard 70
#> 14 scape_l2 standard 10
#> 15 scape_l3 standard 0
#> 16 scape_l4 standard 20
al_set <- al_metric_set(include_traditional = TRUE)
al_set(df, truth = truth, estimate = estimate)
#> # A tibble: 7 × 3
#> .metric .estimator .estimate
#> <chr> <chr> <dbl>
#> 1 cse_l1 standard 70
#> 2 cae_l1 standard 70
#> 3 cape_l1 standard 70
#> 4 scape_l1 standard 70
#> 5 rmse standard 1.30
#> 6 mae standard 0.426
#> 7 rsq standard 0.799library(forecast)
#>
#> Attaching package: 'forecast'
#> The following object is masked from 'package:yardstick':
#>
#> accuracy
train_ts <- ts(candy[1:36], frequency = 12)
fc <- forecast(ets(train_ts), h = 12)
al_forecast_accuracy(fc, candy[37:48])$metrics
#> Level CSE CAE CAPE SCAPE
#> 1 L1 66.666667 66.66667 66.66667 66.66667
#> 2 L2 25.000000 33.33333 33.33333 33.33333
#> 3 L3 8.333333 0.00000 0.00000 0.00000
#> 4 L4 0.000000 0.00000 0.00000 0.00000sessionInfo()
#> R version 4.6.0 (2026-04-24)
#> Platform: x86_64-pc-linux-gnu
#> Running under: Ubuntu 24.04.3 LTS
#>
#> Matrix products: default
#> BLAS: /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
#> LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.26.so; LAPACK version 3.12.0
#>
#> locale:
#> [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
#> [3] LC_TIME=en_US.UTF-8 LC_COLLATE=C
#> [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
#> [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
#> [9] LC_ADDRESS=C LC_TELEPHONE=C
#> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
#>
#> time zone: Asia/Jakarta
#> tzcode source: system (glibc)
#>
#> attached base packages:
#> [1] stats graphics grDevices utils datasets methods base
#>
#> other attached packages:
#> [1] forecast_8.24.0 yardstick_1.3.2 caret_7.0-1
#> [4] lattice_0.22-9 ggplot2_4.0.3 accuracylevel_0.1.0
#>
#> loaded via a namespace (and not attached):
#> [1] tidyselect_1.2.1 timeDate_4041.110 dplyr_1.2.1
#> [4] farver_2.1.2 S7_0.2.2 fastmap_1.2.0
#> [7] pROC_1.19.0.1 digest_0.6.39 rpart_4.1.27
#> [10] timechange_0.4.0 lifecycle_1.0.5 survival_3.8-6
#> [13] magrittr_2.0.5 compiler_4.6.0 rlang_1.2.0
#> [16] sass_0.4.10 tools_4.6.0 utf8_1.2.6
#> [19] yaml_2.3.12 data.table_1.18.4 knitr_1.51
#> [22] curl_7.1.0 TTR_0.24.4 plyr_1.8.9
#> [25] DiceDesign_1.10 RColorBrewer_1.1-3 parsnip_1.3.2
#> [28] withr_3.0.2 purrr_1.2.2 workflows_1.2.0
#> [31] nnet_7.3-20 grid_4.6.0 stats4_4.6.0
#> [34] tune_1.3.0 xts_0.14.1 colorspace_2.1-1
#> [37] future_1.70.0 globals_0.19.1 scales_1.4.0
#> [40] iterators_1.0.14 MASS_7.3-65 dichromat_2.0-0.1
#> [43] cli_3.6.6 rmarkdown_2.31 generics_0.1.4
#> [46] otel_0.2.0 rstudioapi_0.17.1 future.apply_1.20.2
#> [49] reshape2_1.4.5 cachem_1.1.0 stringr_1.6.0
#> [52] splines_4.6.0 dials_1.4.1 parallel_4.6.0
#> [55] urca_1.3-4 vctrs_0.7.3 hardhat_1.4.1
#> [58] Matrix_1.7-5 jsonlite_2.0.0 tseries_0.10-58
#> [61] listenv_0.10.1 foreach_1.5.2 gower_1.0.2
#> [64] jquerylib_0.1.4 tidyr_1.3.2 recipes_1.3.1
#> [67] quantmod_0.4.28 glue_1.8.1 parallelly_1.47.0
#> [70] codetools_0.2-20 rsample_1.3.1 lubridate_1.9.4
#> [73] stringi_1.8.7 gtable_0.3.6 quadprog_1.5-8
#> [76] lmtest_0.9-40 GPfit_1.0-9 tibble_3.3.1
#> [79] furrr_0.3.1 pillar_1.11.1 htmltools_0.5.9
#> [82] ipred_0.9-15 lava_1.8.1 R6_2.6.1
#> [85] lhs_1.2.0 evaluate_1.0.5 fracdiff_1.5-3
#> [88] bslib_0.10.0 class_7.3-23 Rcpp_1.1.1-1.1
#> [91] nlme_3.1-169 prodlim_2025.04.28 xfun_0.57
#> [94] zoo_1.8-14 ModelMetrics_1.2.2.2 pkgconfig_2.0.3