5  Extreme Gradient Boosting

Note

This chapter investigates how the distribution of the scores estimated by an extreme gradient boosting model evolves with the number of boosting iterations. Across models, we vary the maximum depth of the trees and consider boosting iterations up to 400. For each configuration, we compute the predicted scores at each boosting iteration; for each iteration, we use the predicted scores (on the train, validation, calibration, and test sets) to compute various metrics (performance, calibration, and divergence between the distribution of scores and that of the true underlying probabilities) on both the initial and the recalibrated scores.

library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggh4x)
library(ggrepel)
library(rpart)
library(locfit)
locfit 1.5-9.10      2024-06-24

Attaching package: 'locfit'

The following object is masked from 'package:purrr':

    none
library(philentropy)
library(betacal)

# Colours for train/validation/calibration/test samples
colour_samples <- c(
  "Train" = "#0072B2",
  "Validation" = "#009E73",
  "Calibration" = "#CC79A7",
  "Test" = "#D55E00"
)

colour_recalib <- c(
  "None" = "#56B4E9",
  "Platt" = "#009E73",
  "Isotonic" = "#CC79A7",
  "Beta" = "#D55E00",
  "Locfit" = "firebrick3"
)
Definition of the theme_paper() function (for ggplot2 graphs)
#' Theme for ggplot2
#'
#' @param ... arguments passed to the theme function
#' @export
#' @importFrom ggplot2 element_rect element_text element_blank element_line unit
#'   rel
theme_paper <- function (...) {
  ggthemes::theme_base() +
    theme(
      plot.background = element_blank(),
      legend.background = element_rect(
        fill = "transparent", linetype="solid", colour ="black"),
      legend.position = "bottom",
      legend.direction = "horizontal",
      legend.box = "horizontal",
      legend.key = element_blank()
    )
}

5.1 Data

We generate data using the first 12 scenarios from Ojeda et al. (2023) and an additional set of 4 scenarios in which the true probability does not depend on the predictors in a linear way (see Chapter 4).

source("../scripts/functions/simul-data.R")
library(ks)
source("../scripts/functions/subsample_target_distribution.R")

When we simulate a dataset, we draw the following number of observations:

nb_obs <- 10000
Definition of the 16 scenarios
# Coefficients beta
coefficients <- list(
  # First category (baseline, 2 covariates)
  c(0.5, 1),  # scenario 1, 0 noise variable
  c(0.5, 1),  # scenario 2, 10 noise variables
  c(0.5, 1),  # scenario 3, 50 noise variables
  c(0.5, 1),  # scenario 4, 100 noise variables
  # Second category (same as baseline, with lower number of 1s)
  c(0.5, 1),  # scenario 5, 0 noise variable
  c(0.5, 1),  # scenario 6, 10 noise variables
  c(0.5, 1),  # scenario 7, 50 noise variables
  c(0.5, 1),  # scenario 8, 100 noise variables
  # Third category (same as baseline but with 5 num. and 5 categ. covariates)
  c(0.1, 0.2, 0.3, 0.4, 0.5, 0.01, 0.02, 0.03, 0.04, 0.05),
  c(0.1, 0.2, 0.3, 0.4, 0.5, 0.01, 0.02, 0.03, 0.04, 0.05),
  c(0.1, 0.2, 0.3, 0.4, 0.5, 0.01, 0.02, 0.03, 0.04, 0.05),
  c(0.1, 0.2, 0.3, 0.4, 0.5, 0.01, 0.02, 0.03, 0.04, 0.05),
  # Fourth category (nonlinear predictor, 3 covariates)
  c(0.5, 1, .3),  # scenario 13, 0 noise variable
  c(0.5, 1, .3),  # scenario 14, 10 noise variables
  c(0.5, 1, .3),  # scenario 15, 50 noise variables
  c(0.5, 1, .3)   # scenario 16, 100 noise variables
)

# Mean parameter of the normal distribution used to draw the numeric covariates
mean_num <- list(
  # First category (baseline, 2 covariates)
  rep(0, 2),  # scenario 1, 0 noise variable
  rep(0, 2),  # scenario 2, 10 noise variables
  rep(0, 2),  # scenario 3, 50 noise variables
  rep(0, 2),  # scenario 4, 100 noise variables
  # Second category (same as baseline, with lower number of 1s)
  rep(0, 2),  # scenario 5, 0 noise variable
  rep(0, 2),  # scenario 6, 10 noise variables
  rep(0, 2),  # scenario 7, 50 noise variables
  rep(0, 2),  # scenario 8, 100 noise variables
  # Third category (same as baseline but with 5 num. and 5 categ. covariates)
  rep(0, 5),
  rep(0, 5),
  rep(0, 5),
  rep(0, 5),
  # Fourth category (nonlinear predictor, 3 covariates)
  rep(0, 3),
  rep(0, 3),
  rep(0, 3),
  rep(0, 3)
)
# Standard deviation of the normal distribution used to draw the numeric covariates
sd_num <- list(
  # First category (baseline, 2 covariates)
  rep(1, 2),  # scenario 1, 0 noise variable
  rep(1, 2),  # scenario 2, 10 noise variables
  rep(1, 2),  # scenario 3, 50 noise variables
  rep(1, 2),  # scenario 4, 100 noise variables
  # Second category (same as baseline, with lower number of 1s)
  rep(1, 2),  # scenario 5, 0 noise variable
  rep(1, 2),  # scenario 6, 10 noise variables
  rep(1, 2),  # scenario 7, 50 noise variables
  rep(1, 2),  # scenario 8, 100 noise variables
  # Third category (same as baseline but with 5 num. and 5 categ. covariates)
  rep(1, 5),
  rep(1, 5),
  rep(1, 5),
  rep(1, 5),
  # Fourth category (nonlinear predictor, 3 covariates)
  rep(1, 3),
  rep(1, 3),
  rep(1, 3),
  rep(1, 3)
)

params_df <- tibble(
  scenario = 1:16,
  coefficients = coefficients,
  n_num = c(rep(2, 8), rep(5, 4), rep(3, 4)),
  add_categ = c(rep(FALSE, 8), rep(TRUE, 4), rep(FALSE, 4)),
  n_noise = rep(c(0, 10, 50, 100), 4),
  mean_num = mean_num,
  sd_num = sd_num,
  size_train = rep(nb_obs, 16),
  size_valid = rep(nb_obs, 16),
  size_calib = rep(nb_obs, 16),
  size_test = rep(nb_obs, 16),
  transform_probs = c(rep(FALSE, 4), rep(TRUE, 4), rep(FALSE, 4), rep(FALSE, 4)),
  linear_predictor = c(rep(TRUE, 12), rep(FALSE, 4)),
  seed = 202105
)
rm(coefficients, mean_num, sd_num)

5.2 Metrics

We load the functions from Chapter 3 to compute performance, calibration and divergence metrics.

source("../scripts/functions/metrics.R")

5.3 Simulations Setup

To train the models, we rely on the {xgboost} R package.

library(xgboost)

Attaching package: 'xgboost'
The following object is masked from 'package:dplyr':

    slice

Here, we define a function to recalibrate predicted scores using Platt scaling, isotonic regression, beta calibration, or a local regression ({locfit}). The recalibration algorithm is first trained on the calibration set and then applied to both the calibration and test sets.

#' Recalibrates scores using a calibration set
#' 
#' @param obs_calib vector of observed events in the calibration set
#' @param obs_test vector of observed events in the test set
#' @param pred_calib vector of predicted probabilities in the calibration set
#' @param pred_test vector of predicted probabilities in the test set
#' @param method recalibration method (`"platt"` for Platt scaling,
#'   `"isotonic"` for isotonic regression, `"beta"` for beta calibration,
#'   `"locfit"` for local regression)
#' @returns list of two elements: recalibrated scores on the calibration set,
#'   recalibrated scores on the test set
recalibrate <- function(obs_calib,
                        obs_test,
                        pred_calib,
                        pred_test,
                        method = c("platt", "isotonic", "beta", "locfit")) {
  method <- match.arg(method)
  data_calib <- tibble(d = obs_calib, scores = pred_calib)
  data_test <- tibble(d = obs_test, scores = pred_test)
  
  if (method == "platt") {
    lr <- glm(d ~ scores, family = binomial(link = 'logit'), data = data_calib)
    score_c_calib <- predict(lr, newdata = data_calib, type = "response")
    score_c_test <- predict(lr, newdata = data_test, type = "response")
  } else if (method == "isotonic") {
    iso <- isoreg(x = data_calib$scores, y = data_calib$d)
    fit_iso <- as.stepfun(iso)
    score_c_calib <- fit_iso(data_calib$scores)
    score_c_test <- fit_iso(data_test$scores)
  } else if (method == "beta") {
    fit_beta <- beta_calibration(data_calib$scores, data_calib$d, parameters = "abm")
    score_c_calib <- beta_predict(data_calib$scores, fit_beta)
    score_c_test <- beta_predict(data_test$scores, fit_beta)
  } else if (method == "locfit") {
    noise_scores <- data_calib$scores + rnorm(nrow(data_calib), 0, 0.01)
    noise_data_calib <- data_calib %>% mutate(scores = noise_scores)
    locfit_reg <- locfit(
      formula = d ~ lp(scores, nn = 0.15, deg = 0),
      kern = "rect", maxk = 200, data = noise_data_calib
    )
    score_c_calib <- predict(locfit_reg, newdata = data_calib)
    score_c_test <- predict(locfit_reg, newdata = data_test)
  } else {
    stop("Unrecognized method: platt, isotonic, beta or locfit only")
  }
  # Format results in tibbles:
  # For calibration set
  tb_score_c_calib <- tibble(
    d = obs_calib,
    p_u = pred_calib,
    p_c = score_c_calib
  )
  # For test set
  tb_score_c_test <- tibble(
    d = obs_test,
    p_u = pred_test,
    p_c = score_c_test
  )
  
  list(
    tb_score_c_calib = tb_score_c_calib,
    tb_score_c_test = tb_score_c_test
  )
}
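
To illustrate, here is a small self-contained example of recalibrate() on toy data (not part of the simulations): we deliberately make the scores overconfident, then recalibrate them with Platt scaling.

# Toy illustration of recalibrate() (hypothetical data)
set.seed(123)
p_true <- runif(2000)                       # true probabilities
d <- rbinom(2000, size = 1, prob = p_true)  # observed events
scores <- plogis(3 * qlogis(p_true))        # distorted (overconfident) scores
res <- recalibrate(
  obs_calib = d[1:1000], obs_test = d[1001:2000],
  pred_calib = scores[1:1000], pred_test = scores[1001:2000],
  method = "platt"
)
head(res$tb_score_c_test)  # d, initial scores (p_u), recalibrated scores (p_c)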

As explained in the note at the beginning of this chapter, we compute metrics based on the scores obtained at various boosting iterations. To do so, we define a function, get_metrics_nb_iter(), that is applied to a fitted model. This function is called for each boosting iteration (controlled by the nb_iter argument). The function returns a list with the following elements:

  • scenario: the ID of the scenario
  • ind: the index of the grid search (so that we can join with the hyperparameters values, if needed)
  • repn: the ID of the replication
  • nb_iter: the boosting iteration at which the metrics are computed
  • tb_metrics: the tibble with the performance, calibration, and divergence metrics (one row for each of the train, validation, calibration, and test samples for the initial scores, plus one row per recalibration method for the calibration and test samples)
  • tb_decomposition: the tibble with the decomposition of the expected losses
  • tb_prop_scores: additional metrics (\(\mathbb{P}(q_1 < \hat{s}(\mathbf{x}) < q_2)\) for multiple values of \(q_1\), with \(q_2 = 1-q_1\))
  • scores_hist: the elements needed to plot a histogram of the scores on each sample (using 20 equally-sized bins over \([0,1]\)).
Function get_metrics_nb_iter()
#' Computes the performance and calibration metrics for an xgb model,
#' depending on the number of iterations kept.
#'
#' @param nb_iter number of boosting iterations to keep
#' @param params hyperparameters of the current model
#' @param fitted_xgb xgb estimated model
#' @param tb_train_xgb train data (in xgb.DMatrix format)
#' @param tb_valid_xgb validation data (in xgb.DMatrix format)
#' @param tb_calib_xgb calibration data (in xgb.DMatrix format)
#' @param tb_test_xgb test data (in xgb.DMatrix format)
#' @param simu_data simulated dataset
#' @param true_prob list with true probabilities on train, calibration,
#'  validation and test sets
get_metrics_nb_iter <- function(nb_iter,
                                params,
                                fitted_xgb,
                                tb_train_xgb,
                                tb_valid_xgb,
                                tb_calib_xgb,
                                tb_test_xgb,
                                simu_data,
                                true_prob) {

  ind <- params$ind
  max_depth <- params$max_depth
  tb_train <- simu_data$data$train |> rename(d = y)
  tb_valid <- simu_data$data$valid |> rename(d = y)
  tb_calib <- simu_data$data$calib |> rename(d = y)
  tb_test <- simu_data$data$test |> rename(d = y)

  # Predicted scores
  scores_train <- predict(fitted_xgb, tb_train_xgb, iterationrange = c(1, nb_iter))
  scores_valid <- predict(fitted_xgb, tb_valid_xgb, iterationrange = c(1, nb_iter))
  scores_calib <- predict(fitted_xgb, tb_calib_xgb, iterationrange = c(1, nb_iter))
  scores_test <- predict(fitted_xgb, tb_test_xgb, iterationrange = c(1, nb_iter))
  
  # Recalibration
  # Platt scaling
  res_recalibration_platt <- recalibrate(
    obs_calib = tb_calib$d, 
    obs_test = tb_test$d, 
    pred_calib = scores_calib, 
    pred_test = scores_test, 
    method = "platt"
  )
  scores_c_platt_calib <- res_recalibration_platt$tb_score_c_calib$p_c
  scores_c_platt_test <- res_recalibration_platt$tb_score_c_test$p_c
  
  # Isotonic regression
  res_recalibration_iso <- recalibrate(
    obs_calib = tb_calib$d, 
    obs_test = tb_test$d, 
    pred_calib = scores_calib, 
    pred_test = scores_test, 
    method = "isotonic"
  )
  scores_c_iso_calib <- res_recalibration_iso$tb_score_c_calib$p_c
  scores_c_iso_test <- res_recalibration_iso$tb_score_c_test$p_c
  
  # Beta calibration
  res_recalibration_beta <- recalibrate(
    obs_calib = tb_calib$d, 
    obs_test = tb_test$d, 
    pred_calib = scores_calib, 
    pred_test = scores_test, 
    method = "beta"
  )
  scores_c_beta_calib <- res_recalibration_beta$tb_score_c_calib$p_c
  scores_c_beta_test <- res_recalibration_beta$tb_score_c_test$p_c
  
  # Locfit regression
  res_recalibration_locfit <- recalibrate(
    obs_calib = tb_calib$d, 
    obs_test = tb_test$d, 
    pred_calib = scores_calib, 
    pred_test = scores_test, 
    method = "locfit"
  )
  scores_c_locfit_calib <- res_recalibration_locfit$tb_score_c_calib$p_c
  scores_c_locfit_test <- res_recalibration_locfit$tb_score_c_test$p_c

  ## Histogram of scores----
  breaks <- seq(0, 1, by = .05)
  scores_train_hist <- hist(scores_train, breaks = breaks, plot = FALSE)
  scores_calib_hist <- hist(scores_calib, breaks = breaks, plot = FALSE)
  scores_valid_hist <- hist(scores_valid, breaks = breaks, plot = FALSE)
  scores_test_hist <- hist(scores_test, breaks = breaks, plot = FALSE)
  scores_c_platt_calib_hist <- hist(scores_c_platt_calib, breaks = breaks, plot = FALSE)
  scores_c_platt_test_hist <- hist(scores_c_platt_test, breaks = breaks, plot = FALSE)
  scores_c_iso_calib_hist <- hist(scores_c_iso_calib, breaks = breaks, plot = FALSE)
  scores_c_iso_test_hist <- hist(scores_c_iso_test, breaks = breaks, plot = FALSE)
  scores_c_beta_calib_hist <- hist(scores_c_beta_calib, breaks = breaks, plot = FALSE)
  scores_c_beta_test_hist <- hist(scores_c_beta_test, breaks = breaks, plot = FALSE)
  scores_c_locfit_calib_hist <- hist(scores_c_locfit_calib, breaks = breaks, plot = FALSE)
  scores_c_locfit_test_hist <- hist(scores_c_locfit_test, breaks = breaks, plot = FALSE)
  
  scores_hist <- list(
    train = scores_train_hist,
    valid = scores_valid_hist,
    calib = scores_calib_hist,
    test = scores_test_hist,
    calib_c_platt = scores_c_platt_calib_hist,
    test_c_platt = scores_c_platt_test_hist,
    calib_c_iso = scores_c_iso_calib_hist,
    test_c_iso = scores_c_iso_test_hist,
    calib_c_beta = scores_c_beta_calib_hist,
    test_c_beta = scores_c_beta_test_hist,
    calib_c_locfit = scores_c_locfit_calib_hist,
    test_c_locfit = scores_c_locfit_test_hist,
    scenario = simu_data$scenario,
    ind = ind,
    repn = simu_data$repn,
    max_depth = params$max_depth,
    nb_iter = nb_iter
  )

  ## Estimation of P(q1 < score < q2)----
  prop_btw_q_h <- function(s, sample_name, recalib_name) {
    map(
      c(.1, .2, .3, .4),
      ~prop_btw_quantiles(s = s, q1 = .x)
    ) |>
      list_rbind() |>
      mutate(sample = sample_name, recalib = recalib_name)
  }
  
  proq_scores_train <- prop_btw_q_h(
    scores_train, sample_name = "train", recalib_name = "none"
  )
  proq_scores_valid <- prop_btw_q_h(
    scores_valid, sample_name = "valid", recalib_name = "none"
  )
  proq_scores_calib <- prop_btw_q_h(
    scores_calib, sample_name = "calib", recalib_name = "none"
  )
  proq_scores_test <- prop_btw_q_h(
    scores_test, sample_name = "test", recalib_name = "none"
  )
  proq_scores_c_platt_calib <- prop_btw_q_h(
    scores_c_platt_calib, sample_name = "calib", recalib_name = "platt"
  )
  proq_scores_c_platt_test <- prop_btw_q_h(
    scores_c_platt_test, sample_name = "test", recalib_name = "platt"
  )
  proq_scores_c_iso_calib <- prop_btw_q_h(
    scores_c_iso_calib, sample_name = "calib", recalib_name = "isotonic"
  )
  proq_scores_c_iso_test <- prop_btw_q_h(
    scores_c_iso_test, sample_name = "test", recalib_name = "isotonic"
  )
  proq_scores_c_beta_calib <- prop_btw_q_h(
    scores_c_beta_calib, sample_name = "calib", recalib_name = "beta"
  )
  proq_scores_c_beta_test <- prop_btw_q_h(
    scores_c_beta_test, sample_name = "test", recalib_name = "beta"
  )
  proq_scores_c_locfit_calib <- prop_btw_q_h(
    scores_c_locfit_calib, sample_name = "calib", recalib_name = "locfit"
  )
  proq_scores_c_locfit_test <- prop_btw_q_h(
    scores_c_locfit_test, sample_name = "test", recalib_name = "locfit"
  )

  ## Dispersion Metrics----
  disp_train <- dispersion_metrics(
    true_probas = true_prob$train, scores = scores_train
  ) |> 
    mutate(sample = "train", recalib = "none")
  disp_valid <- dispersion_metrics(
    true_probas = true_prob$valid, scores = scores_valid
  ) |>
    mutate(sample = "valid", recalib = "none")
  
  disp_calib <- dispersion_metrics(
    true_probas = true_prob$calib, scores = scores_calib
  ) |>
    mutate(sample = "calib", recalib = "none")
  
  disp_test <- dispersion_metrics(
    true_probas = true_prob$test, scores = scores_test
  ) |> 
    mutate(sample = "test", recalib = "none")
  
  
  disp_c_platt_calib <- dispersion_metrics(
    true_probas = true_prob$calib, scores = scores_c_platt_calib
  ) |>
    mutate(sample = "calib", recalib = "platt")
  
  disp_c_platt_test <- dispersion_metrics(
    true_probas = true_prob$test, scores = scores_c_platt_test
  ) |> 
    mutate(sample = "test", recalib = "platt")
  
  disp_c_iso_calib <- dispersion_metrics(
    true_probas = true_prob$calib, scores = scores_c_iso_calib
  ) |>
    mutate(sample = "calib", recalib = "isotonic")
  
  disp_c_iso_test <- dispersion_metrics(
    true_probas = true_prob$test, scores = scores_c_iso_test
  ) |> 
    mutate(sample = "test", recalib = "isotonic")
  
  disp_c_beta_calib <- dispersion_metrics(
    true_probas = true_prob$calib, scores = scores_c_beta_calib
  ) |>
    mutate(sample = "calib", recalib = "beta")
  
  disp_c_beta_test <- dispersion_metrics(
    true_probas = true_prob$test, scores = scores_c_beta_test
  ) |> 
    mutate(sample = "test", recalib = "beta")
  
  disp_c_locfit_calib <- dispersion_metrics(
    true_probas = true_prob$calib, scores = scores_c_locfit_calib
  ) |>
    mutate(sample = "calib", recalib = "locfit")
  
  disp_c_locfit_test <- dispersion_metrics(
    true_probas = true_prob$test, scores = scores_c_locfit_test
  ) |> 
    mutate(sample = "test", recalib = "locfit")
  
  # Performance and Calibration Metrics
  # We add very small noise to predicted scores
  # otherwise the local regression may crash
  scores_train_noise <- scores_train +
    runif(n = length(scores_train), min = 0, max = 0.01)
  scores_train_noise[scores_train_noise > 1] <- 1
  metrics_train <- compute_metrics(
    obs = tb_train$d, scores = scores_train_noise, true_probas = true_prob$train
  ) |> mutate(sample = "train", recalib = "none")
  
  scores_valid_noise <- scores_valid +
    runif(n = length(scores_valid), min = 0, max = 0.01)
  scores_valid_noise[scores_valid_noise > 1] <- 1
  metrics_valid <- compute_metrics(
    obs = tb_valid$d, scores = scores_valid_noise, true_probas = true_prob$valid
  ) |> mutate(sample = "valid", recalib = "none")
  
  scores_calib_noise <- scores_calib +
    runif(n = length(scores_calib), min = 0, max = 0.01)
  scores_calib_noise[scores_calib_noise > 1] <- 1
  metrics_calib <- compute_metrics(
    obs = tb_calib$d, scores = scores_calib_noise, true_probas = true_prob$calib
  ) |> mutate(sample = "calib", recalib = "none")
  
  scores_test_noise <- scores_test +
    runif(n = length(scores_test), min = 0, max = 0.01)
  scores_test_noise[scores_test_noise > 1] <- 1
  metrics_test <- compute_metrics(
    obs = tb_test$d, scores = scores_test_noise, true_probas = true_prob$test
  ) |> mutate(sample = "test", recalib = "none")
  
  # With recalibrated scores (platt)
  scores_c_platt_calib_noise <- scores_c_platt_calib +
    runif(n = length(scores_c_platt_calib), min = 0, max = 0.01)
  scores_c_platt_calib_noise[scores_c_platt_calib_noise > 1] <- 1
  metrics_c_platt_calib <- compute_metrics(
    obs = tb_calib$d, scores = scores_c_platt_calib_noise, 
    true_probas = true_prob$calib
  ) |> mutate(sample = "calib", recalib = "platt")
  
  scores_c_platt_test_noise <- scores_c_platt_test +
    runif(n = length(scores_c_platt_test), min = 0, max = 0.01)
  scores_c_platt_test_noise[scores_c_platt_test_noise > 1] <- 1
  metrics_c_platt_test <- compute_metrics(
    obs = tb_test$d, scores = scores_c_platt_test_noise, 
    true_probas = true_prob$test
  ) |> mutate(sample = "test", recalib = "platt")
  
  # With recalibrated scores (isotonic)
  scores_c_iso_calib_noise <- scores_c_iso_calib +
    runif(n = length(scores_c_iso_calib), min = 0, max = 0.01)
  scores_c_iso_calib_noise[scores_c_iso_calib_noise > 1] <- 1
  metrics_c_iso_calib <- compute_metrics(
    obs = tb_calib$d, scores = scores_c_iso_calib_noise, 
    true_probas = true_prob$calib
  ) |> mutate(sample = "calib", recalib = "isotonic")
  
  scores_c_iso_test_noise <- scores_c_iso_test +
    runif(n = length(scores_c_iso_test), min = 0, max = 0.01)
  scores_c_iso_test_noise[scores_c_iso_test_noise > 1] <- 1
  metrics_c_iso_test <- compute_metrics(
    obs = tb_test$d, scores = scores_c_iso_test_noise, 
    true_probas = true_prob$test
  ) |> mutate(sample = "test", recalib = "isotonic")
  
  # With recalibrated scores (beta)
  scores_c_beta_calib_noise <- scores_c_beta_calib +
    runif(n = length(scores_c_beta_calib), min = 0, max = 0.01)
  scores_c_beta_calib_noise[scores_c_beta_calib_noise > 1] <- 1
  metrics_c_beta_calib <- compute_metrics(
    obs = tb_calib$d, scores = scores_c_beta_calib_noise, 
    true_probas = true_prob$calib
  ) |> mutate(sample = "calib", recalib = "beta")
  
  scores_c_beta_test_noise <- scores_c_beta_test +
    runif(n = length(scores_c_beta_test), min = 0, max = 0.01)
  scores_c_beta_test_noise[scores_c_beta_test_noise > 1] <- 1
  metrics_c_beta_test <- compute_metrics(
    obs = tb_test$d, scores = scores_c_beta_test_noise, 
    true_probas = true_prob$test
  ) |> mutate(sample = "test", recalib = "beta")
  
  # With recalibrated scores (locfit)
  scores_c_locfit_calib_noise <- scores_c_locfit_calib +
    runif(n = length(scores_c_locfit_calib), min = 0, max = 0.01)
  scores_c_locfit_calib_noise[scores_c_locfit_calib_noise > 1] <- 1
  metrics_c_locfit_calib <- compute_metrics(
    obs = tb_calib$d, scores = scores_c_locfit_calib_noise, 
    true_probas = true_prob$calib
  ) |> mutate(sample = "calib", recalib = "locfit")
  
  scores_c_locfit_test_noise <- scores_c_locfit_test +
    runif(n = length(scores_c_locfit_test), min = 0, max = 0.01)
  scores_c_locfit_test_noise[scores_c_locfit_test_noise > 1] <- 1
  metrics_c_locfit_test <- compute_metrics(
    obs = tb_test$d, scores = scores_c_locfit_test_noise, 
    true_probas = true_prob$test
  ) |> mutate(sample = "test", recalib = "locfit")
  
  # Decomposition of expected losses
  # Platt
  decomposition_platt_calib <- decomposition_metrics(
    obs = tb_calib$d, scores = scores_calib, 
    calibrated_scores = scores_c_platt_calib, true_probas = true_prob$calib
  ) |> mutate(sample = "calib", recalib = "platt")
  
  decomposition_platt_test <- decomposition_metrics(
    obs = tb_test$d, scores = scores_test, 
    calibrated_scores = scores_c_platt_test, true_probas = true_prob$test
  ) |> mutate(sample = "test", recalib = "platt")
  
  # Isotonic
  decomposition_iso_calib <- decomposition_metrics(
    obs = tb_calib$d, scores = scores_calib, 
    calibrated_scores = scores_c_iso_calib, true_probas = true_prob$calib
  ) |> mutate(sample = "calib", recalib = "iso")
  
  decomposition_iso_test <- decomposition_metrics(
    obs = tb_test$d, scores = scores_test, 
    calibrated_scores = scores_c_iso_test, true_probas = true_prob$test
  ) |> mutate(sample = "test", recalib = "iso")
  
  # Beta
  decomposition_beta_calib <- decomposition_metrics(
    obs = tb_calib$d, scores = scores_calib, 
    calibrated_scores = scores_c_beta_calib, true_probas = true_prob$calib
  ) |> mutate(sample = "calib", recalib = "beta")
  
  decomposition_beta_test <- decomposition_metrics(
    obs = tb_test$d, scores = scores_test, 
    calibrated_scores = scores_c_beta_test, true_probas = true_prob$test
  ) |> mutate(sample = "test", recalib = "beta")
  
  # Locfit
  decomposition_locfit_calib <- decomposition_metrics(
    obs = tb_calib$d, scores = scores_calib, 
    calibrated_scores = scores_c_locfit_calib, true_probas = true_prob$calib
  ) |> mutate(sample = "calib", recalib = "locfit")
  
  decomposition_locfit_test <- decomposition_metrics(
    obs = tb_test$d, scores = scores_test, 
    calibrated_scores = scores_c_locfit_test, true_probas = true_prob$test
  ) |> mutate(sample = "test", recalib = "locfit")
  
  tb_metrics <- metrics_train |>
    bind_rows(metrics_valid) |>
    bind_rows(metrics_calib) |>
    bind_rows(metrics_test) |>
    bind_rows(metrics_c_platt_calib) |>
    bind_rows(metrics_c_platt_test) |>
    bind_rows(metrics_c_iso_calib) |>
    bind_rows(metrics_c_iso_test) |>
    bind_rows(metrics_c_beta_calib) |>
    bind_rows(metrics_c_beta_test) |>
    bind_rows(metrics_c_locfit_calib) |>
    bind_rows(metrics_c_locfit_test) |>
    left_join(
      disp_train |>
        bind_rows(disp_valid) |> 
        bind_rows(disp_calib) |> 
        bind_rows(disp_test) |> 
        bind_rows(disp_c_platt_calib) |> 
        bind_rows(disp_c_platt_test) |> 
        bind_rows(disp_c_iso_calib) |> 
        bind_rows(disp_c_iso_test) |> 
        bind_rows(disp_c_beta_calib) |> 
        bind_rows(disp_c_beta_test) |> 
        bind_rows(disp_c_locfit_calib) |> 
        bind_rows(disp_c_locfit_test),
      by = c("sample", "recalib")
    ) |>
    mutate(
      scenario = simu_data$scenario,
      ind = ind,
      repn = simu_data$repn,
      max_depth = params$max_depth,
      nb_iter = nb_iter
    )
  
  tb_prop_scores <- proq_scores_train |>
    bind_rows(proq_scores_valid) |>
    bind_rows(proq_scores_calib) |>
    bind_rows(proq_scores_test) |>
    bind_rows(proq_scores_c_platt_calib) |>
    bind_rows(proq_scores_c_platt_test) |>
    bind_rows(proq_scores_c_iso_calib) |>
    bind_rows(proq_scores_c_iso_test) |>
    bind_rows(proq_scores_c_beta_calib) |>
    bind_rows(proq_scores_c_beta_test) |>
    bind_rows(proq_scores_c_locfit_calib) |>
    bind_rows(proq_scores_c_locfit_test) |>
    mutate(
      scenario = simu_data$scenario,
      ind = ind,
      repn = simu_data$repn,
      max_depth = params$max_depth,
      nb_iter = nb_iter
    )
  
  tb_decomposition_loss <- decomposition_platt_calib |>
    bind_rows(decomposition_platt_test) |>
    bind_rows(decomposition_iso_calib) |>
    bind_rows(decomposition_iso_test) |>
    bind_rows(decomposition_beta_calib) |>
    bind_rows(decomposition_beta_test) |>
    bind_rows(decomposition_locfit_calib) |>
    bind_rows(decomposition_locfit_test) |>
    mutate(
      scenario = simu_data$scenario,
      ind = ind,
      repn = simu_data$repn,
      max_depth = params$max_depth,
      nb_iter = nb_iter
    )

  list(
    scenario = simu_data$scenario,     # data scenario
    ind = ind,                         # index for grid
    repn = simu_data$repn,             # data replication ID
    nb_iter = nb_iter,                 # number of boosting iterations
    tb_metrics = tb_metrics,           # performance/calibration/divergence metrics
    tb_decomposition = tb_decomposition_loss, # decomposition of expected losses
    tb_prop_scores = tb_prop_scores,   # table with P(q1 < score < q2)
    scores_hist = scores_hist          # histogram of scores
  )
}

We define another function, simul_xgb(), which trains an extreme gradient boosting model for a single replication. It calls get_metrics_nb_iter() at each boosting iteration of the model, from the second to the last (400th), and returns a list of length 399, where each element is the list returned by get_metrics_nb_iter().

Function simul_xgb()
#' Train an xgboost model and compute performance, calibration, and dispersion
#' metrics
#'
#' @param params tibble with hyperparameters for the simulation
#' @param ind index of the grid (numerical ID)
#' @param simu_data simulated data obtained with `simulate_data_wrapper()`
simul_xgb <- function(params,
                      ind,
                      simu_data) {
  tb_train <- simu_data$data$train |> rename(d = y)
  tb_valid <- simu_data$data$valid |> rename(d = y)
  tb_calib <- simu_data$data$calib |> rename(d = y)
  tb_test <- simu_data$data$test |> rename(d = y)
  true_prob <-
    list(
      train = simu_data$data$probs_train,
      valid = simu_data$data$probs_valid,
      calib = simu_data$data$probs_calib,
      test = simu_data$data$probs_test
    )

  ## Format data for xgboost----
  tb_train_xgb <- xgb.DMatrix(
    data = model.matrix(d ~ -1 + ., tb_train), label = tb_train$d
  )
  tb_valid_xgb <- xgb.DMatrix(
    data = model.matrix(d ~ -1 + ., tb_valid), label = tb_valid$d
  )
  tb_calib_xgb <- xgb.DMatrix(
    data = model.matrix(d ~ -1 + ., tb_calib), label = tb_calib$d
  )
  tb_test_xgb <- xgb.DMatrix(
    data = model.matrix(d ~ -1 + ., tb_test), label = tb_test$d
  )
  # Parameters for the algorithm
  param <- list(
    max_depth = params$max_depth, #Note: root node is indexed 0
    eta = params$eta,
    nthread = 1,
    objective = "binary:logistic",
    eval_metric = "auc"
  )
  watchlist <- list(train = tb_train_xgb, eval = tb_valid_xgb)

  ## Estimation----
  xgb_fit <- xgb.train(
    param, tb_train_xgb,
    nrounds = params$nb_iter_total,
    watchlist,
    verbose = 0
  )

  # Then, for each boosting iteration number up to params$nb_iter_total
  # compute the predicted scores and evaluate the metrics
  resul <- map(
    seq(2, params$nb_iter_total),
    ~get_metrics_nb_iter(
      nb_iter = .x,
      params = params,
      fitted_xgb = xgb_fit,
      tb_train_xgb = tb_train_xgb,
      tb_valid_xgb = tb_valid_xgb,
      tb_calib_xgb = tb_calib_xgb,
      tb_test_xgb = tb_test_xgb,
      simu_data = simu_data,
      true_prob = true_prob
    )
  )
  resul
}

5.3.1 Grid

We consider the following grid:

grid <- expand_grid(
  max_depth = c(2, 4, 6),
  nb_iter_total = 400,
  eta = 0.3
) |>
  mutate(ind = row_number())
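
The grid thus contains only three configurations, one per tree depth. Printing it gives, for reference:

grid
# A tibble: 3 × 4
  max_depth nb_iter_total   eta   ind
      <dbl>         <dbl> <dbl> <int>
1         2           400   0.3     1
2         4           400   0.3     2
3         6           400   0.3     3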

The desired number of replications for each scenario needs to be set:

repns_vector <- 1:100

The different configurations are reported in Table 5.1.

DT::datatable(grid)
Table 5.1: Grid Search Values

We define a function, simulate_xgb_scenario(), to train the model on a dataset for all the hyperparameter values of the grid. This function performs a single replication of the simulations for a single scenario.

Function simulate_xgb_scenario()
simulate_xgb_scenario <- function(scenario, params_df, repn) {
  # Generate Data
  simu_data <- simulate_data_wrapper(
    scenario = scenario,
    params_df = params_df,
    repn = repn
  )

  # Looping over the grid hyperparameters for the scenario
  res_simul <- vector(mode = "list", length = nrow(grid))
  cli::cli_progress_bar("Iteration grid", total = nrow(grid), type = "tasks")
  for (j in 1:nrow(grid)) {
    current_params <- grid |> dplyr::slice(!!j)
    res_simul[[j]] <- simul_xgb(
      params = current_params,
      ind = current_params$ind,
      simu_data = simu_data
    )
    cli::cli_progress_update()
  }


  # The metrics computed for all sets of hyperparameters (identified with `ind`)
  # and for each number of boosting iterations (`nb_iter`), for the current
  # scenario (`scenario`) and current replication number (`repn`)
  metrics_simul <- map(
    res_simul,
    function(simul_grid_j) map(simul_grid_j, "tb_metrics") |> list_rbind()
  ) |>
    list_rbind()

  # Sanity check
  # metrics_simul |> count(scenario, repn, ind, sample, nb_iter) |>
  #   filter(n > 1)

  # P(q_1<s(x)<q_2)
  prop_scores_simul <- map(
    res_simul,
    function(simul_grid_j) map(simul_grid_j, "tb_prop_scores") |> list_rbind()
  ) |>
    list_rbind()

  # Sanity check
  # prop_scores_simul |> count(scenario, repn, ind, sample, nb_iter)

  # Histogram of estimated scores
  scores_hist <- map(
    res_simul,
    function(simul_grid_j) map(simul_grid_j, "scores_hist")
  )
  
  # Decomposition of expected losses
  decomposition_scores_simul <- map(
    res_simul,
    function(simul_grid_j) map(simul_grid_j, "tb_decomposition") |> list_rbind()
  ) |>
    list_rbind()

  list(
    metrics_simul = metrics_simul,
    scores_hist = scores_hist,
    prop_scores_simul = prop_scores_simul,
    decomposition_scores_simul = decomposition_scores_simul
  )
}
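
For a single scenario and a single replication, this function can be run sequentially (a quick sketch; the estimations below run all replications in parallel):

res_sc1_rep1 <- simulate_xgb_scenario(scenario = 1, params_df = params_df, repn = 1)
names(res_sc1_rep1)
# "metrics_simul" "scores_hist" "prop_scores_simul" "decomposition_scores_simul"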

5.4 Estimations

We loop over the 16 scenarios and run the 100 replications in parallel.

Estimation codes
library(pbapply)
library(parallel)
ncl <- detectCores()-1
(cl <- makeCluster(ncl))

clusterEvalQ(cl, {
  library(tidyverse)
  library(locfit)
  library(philentropy)
  library(xgboost)
  library(ks)
  library(betacal)
}) |>
  invisible()

clusterExport(
  cl, c(
    # Functions
    "brier_score",
    "compute_metrics",
    "dispersion_metrics",
    "kullback_leibler",
    "prop_btw_quantiles",
    "subset_target",
    "simulate_data",
    "simulate_data_wrapper",
    "simul_xgb",
    "simulate_xgb_scenario",
    "get_metrics_nb_iter",
    "recalibrate",
    "calculate_log_loss",
    "decomposition_metrics",
    "calculate_kl",
    # Objects
    "grid",
    "params_df",
    "repns_vector"
  )
)

# make directory if not existing
if (!dir.exists("../output/simul/")) {
  dir.create("../output/simul/", recursive = TRUE)
}

for (i_scenario in 1:16) {
  scenario <- i_scenario
  print(str_c("Scenario ", scenario, "/", nrow(params_df)))
  clusterExport(cl, c("scenario"))
  resul_xgb_scenario <-
    pblapply(
      1:length(repns_vector), function(i) simulate_xgb_scenario(
        scenario = scenario, params_df = params_df, repn = repns_vector[i]
      ),
      cl = cl
    )
  save(
    resul_xgb_scenario,
    file = str_c("../output/simul/resul_xgb_scenario_", scenario, ".rda")
  )
}
stopCluster(cl)

The results can be loaded as follows:

scenarios <- 1:16
files <- str_c(
  "../output/simul/resul_xgb_scenario_", scenarios, ".rda"
)
resul_xgb <- map(files[file.exists(files)], ~{load(.x) ; resul_xgb_scenario})

The resul_xgb object is of length 16: each element contains the simulations for one scenario. For each scenario, the element is a list of length max(repns_vector), i.e., the number of replications. Each replication gives, in a list, the following elements:

  • metrics_simul: the metrics (AUC, calibration, KL divergence, etc.) for each model from the grid search, for all boosting iterations
  • scores_hist: the counts on bins defined on the estimated scores (on the train, validation, calibration, and test sets; for the calibration and test sets, the counts are given with and without recalibration)
  • prop_scores_simul: the estimations of \(\mathbb{P}(q_1 < \hat{s}(\mathbf{x}) < q_2)\) for various values of \(q_1\) and \(q_2 = 1 - q_1\)
  • decomposition_scores_simul: the decomposition of the expected losses.
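
For instance, the metrics computed for the first replication of the first loaded scenario can be inspected as follows (assuming the corresponding result file was found on disk):

resul_xgb[[1]][[1]]$metrics_simul |>
  select(sample, recalib, max_depth, nb_iter, AUC, brier, ici, KL_20_true_probas) |>
  head()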

5.5 Results

We can now extract some information from the results.

We first aggregate all the computed metrics (performance, calibration, and divergence) in a single tibble, metrics_xgb_all.

Codes to create the metrics table
metrics_xgb_all <- map(
  resul_xgb,
  function(resul_xgb_sc) map(resul_xgb_sc, "metrics_simul") |> list_rbind()
) |>
  list_rbind() |>
  mutate(
    sample = factor(
      sample,
      levels = c("train", "valid", "calib", "test"),
      labels = c("Train","Validation", "Calibration" ,"Test")
    ),
    recalib = factor(
      recalib,
      levels = c("none", "platt", "beta", "isotonic", "locfit"),
      labels = c("None", "Platt", "Beta", "Isotonic", "Locfit")
    )
  )

# Sanity check
# metrics_xgb_all |> count(scenario, ind, sample, nb_iter) |>
#   filter(n != max(repns_vector))

For each replication, we varied some hyperparameters. Let us identify some models of interest:

  • smallest: model with the lowest number of boosting iterations
  • largest: model with the highest number of boosting iterations
  • largest_auc: model with the highest AUC on the validation set
  • lowest_brier: model with the lowest Brier score on the validation set
  • lowest_ici: model with the lowest ICI on the validation set
  • lowest_kl: model with the lowest KL divergence on the validation set
  • mediocre_ici: model with a mediocre calibration on the validation set (ICI more than one standard deviation above the average ICI, then lowest ICI among those), used to illustrate recalibration.
Code
# Identify the model with the fewest boosting iterations, on the validation
# set, when the scores are not recalibrated
smallest_xgb <-
  metrics_xgb_all |>
  filter(sample == "Validation", recalib == "None") |>
  group_by(scenario, repn) |>
  arrange(nb_iter) |>
  slice_head(n = 1) |>
  select(scenario, repn, ind, nb_iter, recalib) |>
  mutate(result_type = "smallest") |>
  ungroup()

# Identify the model with the most boosting iterations
largest_xgb <-
  metrics_xgb_all |>
  filter(sample == "Validation", recalib == "None") |>
  group_by(scenario, repn) |>
  arrange(desc(nb_iter)) |>
  slice_head(n = 1) |>
  select(scenario, repn, ind, nb_iter, recalib) |>
  mutate(result_type = "largest") |>
  ungroup()

# Identify the model with the highest AUC on the validation set
highest_auc_xgb <-
  metrics_xgb_all |>
  filter(sample == "Validation", recalib == "None") |>
  group_by(scenario, repn) |>
  arrange(desc(AUC)) |>
  slice_head(n = 1) |>
  select(scenario, repn, ind, nb_iter, recalib) |>
  mutate(result_type = "largest_auc") |>
  ungroup()

# # Identify tree with lowest MSE
# lowest_mse_xgb <-
#   metrics_xgb_all |>
#   filter(sample == "Validation", recalib == "None") |>
#   group_by(scenario, repn) |>
#   arrange(mse) |>
#   slice_head(n = 1) |>
#   select(scenario, repn, ind, nb_iter, recalib) |>
#   mutate(result_type = "lowest_mse") |>
#   ungroup()

# Identify the model with the lowest Brier score
lowest_brier_xgb <-
  metrics_xgb_all |>
  filter(sample == "Validation", recalib == "None") |>
  group_by(scenario, repn) |>
  arrange(brier) |>
  slice_head(n = 1) |>
  select(scenario, repn, ind, nb_iter, recalib) |>
  mutate(result_type = "lowest_brier") |>
  ungroup()

# Identify the model with the lowest ICI
lowest_ici_xgb <-
  metrics_xgb_all |>
  filter(sample == "Validation", recalib == "None") |>
  group_by(scenario, repn) |>
  arrange(ici) |>
  slice_head(n = 1) |>
  select(scenario, repn, ind, nb_iter, recalib) |>
  mutate(result_type = "lowest_ici") |>
  ungroup()

# Identify the model with the lowest KL divergence
lowest_kl_xgb <-
  metrics_xgb_all |>
  filter(sample == "Validation", recalib == "None") |>
  group_by(scenario, repn) |>
  arrange(KL_20_true_probas) |>
  slice_head(n = 1) |>
  select(scenario, repn, ind, nb_iter, recalib) |>
  mutate(result_type = "lowest_kl") |>
  ungroup()

mediocre_ici_xgb <- 
  metrics_xgb_all |>
  filter(sample == "Validation", recalib == "None") |>
  group_by(scenario, repn) |>
  # For each replication for a scenario, we select a model with a mediocre 
  # calibration
  mutate(
    mean_ici = mean(ici),
    sd_ici = sd(ici),
    upb_ici = mean_ici + sd_ici,
  ) |> 
  filter(ici >  upb_ici) |> 
  # Among the configurations for which the calibration is not within 1-sd of the
  # average calibration, we select the model with the lowest ICI
  arrange(ici) |> 
  slice_head(n = 1) |> 
  select(scenario, repn, ind, nb_iter, recalib) |>
  mutate(result_type = "high_ici") |>
  ungroup()

# Merge these
models_of_interest_xgb <-
  smallest_xgb |>
  bind_rows(largest_xgb) |>
  bind_rows(highest_auc_xgb) |>
  # bind_rows(lowest_mse_xgb) |>
  bind_rows(lowest_brier_xgb) |>
  bind_rows(lowest_ici_xgb) |>
  bind_rows(lowest_kl_xgb) |> 
  bind_rows(mediocre_ici_xgb)

models_of_interest_metrics <- NULL
for (recalibration_method in c("None", "Platt", "Beta", "Isotonic")) {
  # Add metrics now
  models_of_interest_metrics <-
    models_of_interest_metrics |>
    bind_rows(
      models_of_interest_xgb |> select(-recalib) |>
        left_join(
          metrics_xgb_all |>
            filter(
              recalib == recalibration_method,
              sample %in% c("Validation", "Test")
            ),
          by = c("scenario", "repn", "ind", "nb_iter"),
          relationship = "many-to-many" # (calib, test)
        )
    )
}


models_of_interest_metrics <-
  models_of_interest_metrics |>
  mutate(
    result_type = factor(
      result_type,
      levels = c(
        "smallest", "largest", #"lowest_mse", 
        "largest_auc",
        "lowest_brier", "lowest_ici", "lowest_kl", "high_ici"),
      labels = c(
        "Smallest", "Largest", 
        #"MSE*", 
        "AUC*",
        "Brier*", "ICI*", "KL*", "High ICI"
      )
    )
  )

# Sanity check
# models_of_interest_metrics |> count(scenario, sample, result_type)

5.5.1 Metrics vs Number of Iterations

We define a function, plot_metrics(), to plot selected metrics (AUC, ICI, and KL divergence) as a function of the number of boosting iterations. Each curve corresponds to a value of the max_depth hyperparameter.

TBD
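
In the meantime, here is a minimal sketch of what such a function could look like (hypothetical, not the chapter's final implementation): it averages a chosen metric over the replications of a scenario and draws one curve per max_depth value.

plot_metrics <- function(scenario, metric = "AUC") {
  metrics_xgb_all |>
    filter(scenario == !!scenario, recalib == "None") |>
    group_by(sample, max_depth, nb_iter) |>
    summarise(value = mean(.data[[metric]]), .groups = "drop") |>
    ggplot(
      mapping = aes(
        x = nb_iter, y = value,
        colour = sample, linetype = factor(max_depth)
      )
    ) +
    geom_line() +
    labs(x = "Boosting iterations", y = metric) +
    scale_colour_manual("Sample", values = colour_samples) +
    scale_linetype_discrete("Max depth") +
    theme_paper()
}
# e.g.: plot_metrics(scenario = 1, metric = "ici")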

5.5.2 Distribution of Scores

Let us extract all the histogram information computed over the simulations and put that in a single object, scores_hist_all.

scores_hist_all <-
  map(
    resul_xgb,
    function(resul_xgb_sc) map(resul_xgb_sc, "scores_hist")
  )

We then define a function, plot_bp_xgb(), which plots the distribution of scores on the test set for a single replication (repn) of a given scenario (scenario). We also define a helper function, plot_bp_interest(), which plots the histogram of the scores at a specific iteration number. We are then able to plot the distributions at the beginning and at the end of the boosting iterations, at the iterations where, on the validation set, the AUC was the highest and where the Brier score, the ICI, and the KL divergence (between the distribution of scores on the validation set and that of the true probabilities) were the lowest, as well as for a configuration with a mediocre ICI. We plot the distributions of the scores returned by the classifier, as well as those obtained with the recalibrators.

Functions plot_bp_interest() and plot_bp_xgb()
plot_bp_interest <- function(metrics_interest,
                             scores_hist_interest,
                             label,
                             recalib_method) {
  subtitle <- str_c(
    "Depth = ", metrics_interest$max_depth, ", ",
    # "MSE = ", round(metrics_interest$mse, 2), ", ",
    "AUC = ", round(metrics_interest$AUC, 2), ", \n",
    "Brier = ", round(metrics_interest$brier, 2), ",",
    "ICI = ", round(metrics_interest$ici, 2), ", ",
    "KL = ", round(metrics_interest$KL_20_true_probas, 2)
  )
  
  if (recalib_method == "none") {
    plot(
      main = str_c(label, " (iter = ", metrics_interest$nb_iter,")"),
      scores_hist_interest$test,
      xlab = latex2exp::TeX("$\\hat{s}(x)$"),
      ylab = ""
    )
  } else if (recalib_method == "platt") {
    plot(
      main = str_c(label, " (iter = ", metrics_interest$nb_iter,")"),
      scores_hist_interest$test_c_platt,
      xlab = latex2exp::TeX("$\\hat{s}(x)$"),
      ylab = "",
      col = colour_recalib[["Platt"]]
    )
  } else if (recalib_method == "beta") {
    plot(
      main = str_c(label, " (iter = ", metrics_interest$nb_iter,")"),
      scores_hist_interest$test_c_beta,
      xlab = latex2exp::TeX("$\\hat{s}(x)$"),
      ylab = "",
      col = colour_recalib[["Beta"]]
    )
  } else if (recalib_method == "iso") {
    plot(
      main = str_c(label, " (iter = ", metrics_interest$nb_iter,")"),
      scores_hist_interest$test_c_iso,
      xlab = latex2exp::TeX("$\\hat{s}(x)$"),
      ylab = "",
      col = colour_recalib[["Isotonic"]]
    )
  }
  mtext(side = 3, line = -0.25, adj = .5, subtitle, cex = .5)
}

plot_bp_xgb <- function(scenario,
                        repn,
                        paper_version = FALSE) {
  # Focus on current scenario
  scores_hist_scenario <- scores_hist_all[[scenario]]
  # Focus on a particular replication
  scores_hist_repn <- scores_hist_scenario[[repn]]
  # # Focus on a value for max_depth
  max_depth_val <- map_dbl(scores_hist_repn, ~.x[[1]]$max_depth)
  # i_max_depth <- which(max_depth_val == max_depth)
  # scores_hist <- scores_hist_repn[[i_max_depth]]
  
  # True Probabilities
  simu_data <- simulate_data_wrapper(
    scenario = scenario,
    params_df = params_df,
    repn = repn # only one replication here
  )
  true_prob <- simu_data$data$probs_train
  
  for (recalib_method in c("none", "platt", "beta", "iso")) {
    
    i_method <- match(recalib_method, c("none", "platt", "beta", "iso"))
    recalib_method_lab <- c("None", "Platt", "Beta", "Isotonic")[i_method]
    
    # The metrics for the corresponding simulations, on the validation set
    metrics_xgb_current_valid <-
      metrics_xgb_all |>
      filter(
        scenario == !!scenario,
        repn == !!repn,
        sample == "Validation",
        recalib == "None"
      )
    # and on the test set
    metrics_xgb_current_test <-
      metrics_xgb_all |>
      filter(
        scenario == !!scenario,
        repn == !!repn,
        sample == "Test",
        recalib == recalib_method_lab
      )
    
    if (paper_version == FALSE) {
      hist(
        true_prob,
        breaks = seq(0, 1, by = .05),
        xlab = "p", ylab = "",
        main = "True Probabilities",
        xlim = c(0, 1)
      )
      mtext(
        side = 2, recalib_method_lab, line = 2.5, cex = 1,
        font = 2
      )
      # Iterations of interest----
      ## Start of iterations
      scores_hist_start <- scores_hist_repn[[1]][[1]]
      metrics_start <- metrics_xgb_current_test |>
        filter(
          nb_iter == scores_hist_start$nb_iter,
          max_depth == scores_hist_start$max_depth
        )
      
      plot_bp_interest(
        metrics_interest = metrics_start,
        scores_hist_interest = scores_hist_start,
        label = "Start",
        recalib_method = recalib_method
      )
      
      ## End of iterations
      scores_hist_end <- scores_hist_repn[[1]][[length(scores_hist_repn[[1]])]]
      metrics_end <- metrics_xgb_current_test |>
        filter(
          nb_iter == scores_hist_end$nb_iter,
          max_depth == scores_hist_end$max_depth
        )
      plot_bp_interest(
        metrics_interest = metrics_end,
        scores_hist_interest = scores_hist_end,
        label = "End",
        recalib_method = recalib_method
      )
      
      #   ## Iteration with min MSE on validation set
      #   metrics_valid_mse_star <- metrics_xgb_current_valid |> arrange(mse) |>
      #     dplyr::slice(1)
      #   nb_iter_mse <- metrics_valid_mse_star$nb_iter
      #   max_depth_mse_star <- metrics_valid_mse_star$max_depth
      #   i_max_depth_mse_star <- which(max_depth_val == max_depth_mse_star)
      #   # Metrics at the same iteration on the test set
      #   metrics_min_mse <-
      #     metrics_xgb_current_test |>
      #     filter(
      #       nb_iter == !!nb_iter_mse,
      #       max_depth == max_depth_mse_star
      #     )
      #   # Note: indexing at 0 here...
      #   ind_mse <- which(map_dbl(scores_hist_repn[[i_max_depth_mse_star]], "nb_iter") == nb_iter_mse)
      #   scores_hist_min_mse <- scores_hist_repn[[i_max_depth_mse_star]][[ind_mse]]
      #   plot_bp_interest(
      #     metrics_interest = metrics_min_mse,
      #     scores_hist_interest = scores_hist_min_mse,
      #     label = "MSE*",
      #     recalib_method = recalib_method
      #   )
    }
    ## Iteration with max AUC on validation set
    metrics_valid_auc_star <-
      metrics_xgb_current_valid |> arrange(desc(AUC)) |> dplyr::slice(1)
    nb_iter_auc <- metrics_valid_auc_star$nb_iter
    max_depth_auc_star <- metrics_valid_auc_star$max_depth
    i_max_depth_auc_star <- which(max_depth_val == max_depth_auc_star)
    
    metrics_max_auc <-
      metrics_xgb_current_test |>
      filter(nb_iter == !!nb_iter_auc, max_depth == max_depth_auc_star)
    # Note: indexing at 0 here...
    ind_auc <- which(map_dbl(scores_hist_repn[[i_max_depth_auc_star]], "nb_iter") == nb_iter_auc)
    scores_hist_max_auc <- scores_hist_repn[[i_max_depth_auc_star]][[ind_auc]]
    plot_bp_interest(
      metrics_interest = metrics_max_auc,
      scores_hist_interest = scores_hist_max_auc,
      label = "AUC*",
      recalib_method = recalib_method
    )
    if (paper_version == TRUE) {
      mtext(
        side = 2, recalib_method_lab, line = 2.5, cex = 1,
        font = 2
      )
    }
    
    ## Min Brier on validation set
    metrics_valid_brier_star <-
      metrics_xgb_current_valid |> arrange(brier) |> dplyr::slice(1)
    nb_iter_brier <- metrics_valid_brier_star$nb_iter
    max_depth_brier_star <- metrics_valid_brier_star$max_depth
    i_max_depth_brier_star <- which(max_depth_val == max_depth_brier_star)
    
    metrics_min_brier <-
      metrics_xgb_current_test |>
      filter(nb_iter == !!nb_iter_brier, max_depth == max_depth_brier_star)
    ind_brier <- which(map_dbl(scores_hist_repn[[i_max_depth_brier_star]], "nb_iter") == nb_iter_brier)
    scores_hist_min_brier <- scores_hist_repn[[i_max_depth_brier_star]][[ind_brier]]
    plot_bp_interest(
      metrics_interest = metrics_min_brier,
      scores_hist_interest = scores_hist_min_brier,
      label = "Brier*",
      recalib_method = recalib_method
    )
    
    ## Min ICI on validation set
    metrics_valid_ici_star <-
      metrics_xgb_current_valid |> arrange(ici) |> dplyr::slice(1)
    nb_iter_ici <-   metrics_valid_ici_star$nb_iter
    max_depth_ici_star <- metrics_valid_ici_star$max_depth
    i_max_depth_ici_star <- which(max_depth_val == max_depth_ici_star)
    
    metrics_min_ici <-
      metrics_xgb_current_test |>
      filter(nb_iter == !!nb_iter_ici, max_depth == max_depth_ici_star)
    ind_ici <- which(map_dbl(scores_hist_repn[[i_max_depth_ici_star]], "nb_iter") == nb_iter_ici)
    scores_hist_min_ici <- scores_hist_repn[[i_max_depth_ici_star]][[ind_ici]]
    plot_bp_interest(
      metrics_interest = metrics_min_ici,
      scores_hist_interest = scores_hist_min_ici,
      label = "ICI*",
      recalib_method = recalib_method
    )
    
    ## Min KL on validation set
    metrics_valid_kl_star <-
      metrics_xgb_current_valid |> arrange(KL_20_true_probas) |> dplyr::slice(1)
    nb_iter_kl <- metrics_valid_kl_star$nb_iter
    max_depth_kl_star <- metrics_valid_kl_star$max_depth
    i_max_depth_kl_star <- which(max_depth_val == max_depth_kl_star)
    
    metrics_min_kl <-
      metrics_xgb_current_test |>
      filter(nb_iter == !!nb_iter_kl, max_depth == max_depth_kl_star)
    ind_kl <- which(map_dbl(scores_hist_repn[[i_max_depth_kl_star]], "nb_iter") == nb_iter_kl)
    scores_hist_min_kl <- scores_hist_repn[[i_max_depth_kl_star]][[ind_kl]]
    plot_bp_interest(
      metrics_interest = metrics_min_kl,
      scores_hist_interest = scores_hist_min_kl,
      label = "KL*",
      recalib_method = recalib_method
    )
    
    ## Mediocre ICI on validation set
    identified_mici <- 
      mediocre_ici_xgb |> filter(scenario == !!scenario, repn == !!repn)
    
    metrics_valid_mici_star <- metrics_xgb_current_valid |> 
      filter(ind == identified_mici$ind, nb_iter == identified_mici$nb_iter)
    nb_iter_mici <- metrics_valid_mici_star$nb_iter
    max_depth_mici_star <- metrics_valid_mici_star$max_depth
    i_max_depth_mici_star <- which(max_depth_val == max_depth_mici_star)
    
    metrics_mici <-
      metrics_xgb_current_test |>
      filter(nb_iter == !!nb_iter_mici, max_depth == max_depth_mici_star)
    ind_mici <- which(map_dbl(scores_hist_repn[[i_max_depth_mici_star]], "nb_iter") == nb_iter_mici)
    scores_hist_mici <- scores_hist_repn[[i_max_depth_mici_star]][[ind_mici]]
    plot_bp_interest(
      metrics_interest = metrics_mici,
      scores_hist_interest = scores_hist_mici,
      label = "High ICI",
      recalib_method = recalib_method
    )
  }
}
Code
par(mfrow = c(4,8), mar = c(4.1, 4, 3.5, 1.5))
plot_bp_xgb(scenario = 1, repn = 1)
Figure 5.1: Distribution of scores on the test set (DGP 1, 0 noise variable)
Code
par(mfrow = c(4,8), mar = c(4.1, 4, 3.5, 1.5))
plot_bp_xgb(scenario = 2, repn = 1)
Figure 5.2: Distribution of scores on the test set (DGP 1, 10 noise variables)
Code
par(mfrow = c(4,8), mar = c(4.1, 4, 3.5, 1.5))
plot_bp_xgb(scenario = 3, repn = 1)
Figure 5.3: Distribution of scores on the test set (DGP 1, 50 noise variables)
Code
par(mfrow = c(4,8), mar = c(4.1, 4, 3.5, 1.5))
plot_bp_xgb(scenario = 4, repn = 1)
Figure 5.4: Distribution of scores on the test set (DGP 1, 100 noise variables)
Code
par(mfrow = c(4,8), mar = c(4.1, 4, 3.5, 1.5))
plot_bp_xgb(scenario = 5, repn = 1)
Figure 5.5: Distribution of scores on the test set (DGP 2, 0 noise variable)
Code
par(mfrow = c(4,8), mar = c(4.1, 4, 3.5, 1.5))
plot_bp_xgb(scenario = 6, repn = 1)
Figure 5.6: Distribution of scores on the test set (DGP 2, 10 noise variables)
Code
par(mfrow = c(4,8), mar = c(4.1, 4, 3.5, 1.5))
plot_bp_xgb(scenario = 7, repn = 1)
Figure 5.7: Distribution of scores on the test set (DGP 2, 50 noise variables)
Code
par(mfrow = c(4,8), mar = c(4.1, 4, 3.5, 1.5))
plot_bp_xgb(scenario = 8, repn = 1)
Figure 5.8: Distribution of scores on the test set (DGP 2, 100 noise variables)
Code
par(mfrow = c(4,8), mar = c(4.1, 4, 3.5, 1.5))
plot_bp_xgb(scenario = 9, repn = 1)
Figure 5.9: Distribution of scores on the test set (DGP 3, 0 noise variable)
Code
par(mfrow = c(4,8), mar = c(4.1, 4, 3.5, 1.5))
plot_bp_xgb(scenario = 10, repn = 1)
Figure 5.10: Distribution of scores on the test set (DGP 3, 10 noise variables)
Code
par(mfrow = c(4,8), mar = c(4.1, 4, 3.5, 1.5))
plot_bp_xgb(scenario = 11, repn = 1)
Figure 5.11: Distribution of scores on the test set (DGP 3, 50 noise variables)
Code
par(mfrow = c(4,8), mar = c(4.1, 4, 3.5, 1.5))
plot_bp_xgb(scenario = 12, repn = 1)
Figure 5.12: Distribution of scores on the test set (DGP 3, 100 noise variables)
Code
par(mfrow = c(4,8), mar = c(4.1, 4, 3.5, 1.5))
plot_bp_xgb(scenario = 13, repn = 1)
Figure 5.13: Distribution of scores on the test set (DGP 4, 0 noise variable)
Code
par(mfrow = c(4,8), mar = c(4.1, 4, 3.5, 1.5))
plot_bp_xgb(scenario = 14, repn = 1)
Figure 5.14: Distribution of scores on the test set (DGP 4, 10 noise variables)
Code
par(mfrow = c(4,8), mar = c(4.1, 4, 3.5, 1.5))
plot_bp_xgb(scenario = 15, repn = 1)
Figure 5.15: Distribution of scores on the test set (DGP 4, 50 noise variables)
Code
par(mfrow = c(4,8), mar = c(4.1, 4, 3.5, 1.5))
plot_bp_xgb(scenario = 16, repn = 1)
Figure 5.16: Distribution of scores on the test set (DGP 4, 100 noise variables)
Code to create PDF figures
if(!dir.exists("../figs/")) dir.create("../figs/")
for (scenario in 1:16) {
  pdf(
    file = str_c("../figs/bp_synthetic_xbg_", scenario, ".pdf"),
    height = 6.5, width = 10
  )
  par(mfrow = c(4,5), mar = c(4.1, 4, 3.5, 1.5))
  plot_bp_xgb(scenario = scenario, repn = 1, paper_version = TRUE)
  dev.off()
}

5.5.3 KL Divergence and Calibration along Boosting Iterations

We can now examine how the relationship between the divergence of the score distribution from the true probabilities and the calibration of the model evolves as the number of boosting iterations increases.

Codes to create the figure
df_plot <-
  metrics_xgb_all |>
  mutate(
    dgp = case_when(
      scenario %in% 1:4 ~ 1,
      scenario %in% 5:8 ~ 2,
      scenario %in% 9:12 ~ 3,
      scenario %in% 13:16 ~ 4
    ),
    dgp = factor(dgp, levels = 1:4, labels = str_c("DGP ", 1:4)),
    no_noise = c(0, 10, 50, 100)[(scenario-1)%%4 + 1],
    no_noise = factor(
      no_noise, levels = c(0, 10, 50, 100),
      labels = str_c(c(0, 10, 50, 100), " noise variables")
    )
  ) |>
  select(
    dgp, no_noise, scenario, recalib, ind, sample, nb_iter, max_depth,
    brier, ici, KL_20_true_probas
  ) |>
  group_by(dgp, no_noise, scenario, recalib, ind, sample, nb_iter, max_depth) |>
  summarise(
    brier = mean(brier),
    ici = mean(ici),
    KL_20_true_probas = mean(KL_20_true_probas),
    .groups = "drop"
  ) |>
  mutate(
    max_depth = factor(
      max_depth,
      levels = c(2, 4, 6)
    )
  )

# Rescale axis labels: values are displayed multiplied by 1,000
formatter1000 <- function(x) x*1000
Code to create the figure
p_brier <- ggplot(
  data = df_plot |> arrange(nb_iter) |> filter(max_depth == 2),
  mapping = aes(x = brier, y = KL_20_true_probas)
) +
  geom_path(
    mapping = aes(colour = sample, linetype = recalib),
    arrow = arrow(type = "closed", ends = "last",
                  length = unit(0.08, "inches"))
  ) +
  # facet_wrap(~scenario) +
  ggh4x::facet_grid2(dgp~no_noise, scales = "free_y", independent = "y") +
  labs(
    x = latex2exp::TeX("Calibration (Brier), $\\times 10^{3}$, log scale"),
    y = "KL Divergence"
  ) +
  scale_x_log10(labels = formatter1000) + scale_y_log10() +
  scale_colour_manual("Sample", values = colour_samples) +
  scale_linetype_discrete("Recalibration") +
  theme_paper() +
  theme(legend.key.width = unit(1.5, "cm"))


ggsave(
  p_brier, file = "../figs/xgb-kl-calib-brier-leaves-all.pdf",
       width = 13, height = 8
)

p_brier
Figure 5.17: KL Divergence and Calibration (Brier) across increasing boosting iterations (log scales)
Code to create the figure
p_ici <- ggplot(
  data = df_plot |> arrange(nb_iter) |> filter(max_depth == 2),
  mapping = aes(x = ici, y = KL_20_true_probas)
) +
  geom_path(
    mapping = aes(colour = sample, linetype = recalib),
    arrow = arrow(type = "closed", ends = "last",
                  length = unit(0.08, "inches"))
  ) +
  # facet_wrap(~scenario) +
  ggh4x::facet_grid2(dgp~no_noise, scales = "free_y", independent = "y") +
  labs(
    x = latex2exp::TeX("Calibration (ICI), $\\times 10^{3}$, log scale"),
    y = "KL Divergence"
  ) +
  scale_x_log10(labels = formatter1000) + scale_y_log10() +
  scale_colour_manual("Sample", values = colour_samples) +
  scale_linetype_discrete("Recalibration") +
  theme_paper() +
  theme(legend.key.width = unit(1.5, "cm"))


ggsave(
  p_ici, file = "../figs/xgb-kl-calib-ici-leaves-all.pdf",
  width = 13, height = 8
)
p_ici
Figure 5.18: KL Divergence and Calibration (ICI) across increasing boosting iterations (log scales)

5.5.4 Tables

We examine the average values of various metrics over the 100 replications, for the “best” model selected according to each of the following criteria:

- AUC*: hyperparameters chosen to maximize the AUC on the validation set;
- ICI*: hyperparameters chosen to minimize the ICI on the validation set;
- Brier*: hyperparameters chosen to minimize the Brier score on the validation set;
- KL*: hyperparameters chosen to minimize the KL divergence between the distribution of the scores on the validation set and the distribution of the true probabilities;
- Smallest: the model with only 2 boosting iterations;
- Largest: the model with 400 boosting iterations;
- High ICI: a model chosen to illustrate the effect of score recalibration when the initial calibration is poor (labelled “High ICI” in the tables).
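
The selection itself happens upstream (models_of_interest_metrics already contains one row per criterion). As an illustration only, here is a minimal sketch of how such configurations could be picked from the validation metrics; it assumes a metrics_xgb_all table with one row per (scenario, max_depth, nb_iter, sample, recalib) combination, which is our reading of the objects used below.

best_by <- function(metrics, metric, maximize = FALSE) {
  metrics |>
    filter(sample == "Validation", recalib == "None") |>
    group_by(scenario) |>
    # keep, within each scenario, the (max_depth, nb_iter) configuration
    # that optimizes the requested validation metric
    slice(if (maximize) which.max(.data[[metric]]) else which.min(.data[[metric]])) |>
    ungroup() |>
    select(scenario, max_depth, nb_iter)
}

best_auc   <- best_by(metrics_xgb_all, "AUC", maximize = TRUE)  # AUC*
best_ici   <- best_by(metrics_xgb_all, "ici")                   # ICI*
best_brier <- best_by(metrics_xgb_all, "brier")                 # Brier*
best_kl    <- best_by(metrics_xgb_all, "KL_20_true_probas")     # KL*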

Display the R code used to produce the table
models_interest_xgb <- models_of_interest_metrics |> 
  group_by(scenario, recalib, sample, result_type) |> 
  summarise(
    AUC_lower = quantile(AUC, probs = 2.5/100),
    AUC_upper = quantile(AUC, probs = 97.5/100),
    AUC_sd = sd(AUC),
    AUC = mean(AUC),
    brier_lower = quantile(brier, probs = 2.5/100),
    brier_upper = quantile(brier, probs = 97.5/100),
    brier_sd = sd(brier),
    brier = mean(brier),
    ici_lower = quantile(ici, probs = 2.5/100),
    ici_upper = quantile(ici, probs = 97.5/100),
    ici_sd = sd(ici),
    ici = mean(ici),
    KL_20_true_probas_lower = quantile(KL_20_true_probas, probs = 2.5/100),
    KL_20_true_probas_upper = quantile(KL_20_true_probas, probs = 97.5/100),
    KL_20_true_probas_sd = sd(KL_20_true_probas),
    KL_20_true_probas = mean(KL_20_true_probas),
    quant_ratio_sd = sd(inter_quantile_10_90),
    quant_ratio = mean(inter_quantile_10_90),
    .groups = "drop"
  ) |> 
  mutate(
    model = "xgb",
    sample = str_to_lower(as.character(sample)),
    dgp = case_when(
      scenario %in% 1:4 ~ 1,
      scenario %in% 5:8 ~ 2,
      scenario %in% 9:12 ~ 3,
      scenario %in% 13:16 ~ 4
    ),
    no_noise = c(0, 10, 50, 100)[(scenario-1)%%4 + 1],
    no_noise = factor(
      no_noise,
      levels = c(0, 10, 50, 100),
      labels = str_c(c(0, 10, 50, 100), " noise variables")
    )
  )

# Sanity check
# metrics_xgb_all |> count(scenario, recalib, ind, sample, nb_iter) |>
#   filter(n != max(repns_vector))

table_models_interest_mean <- 
  models_interest_xgb |> 
  filter(sample == "test") |> 
  select(
    dgp, no_noise, recalib, sample, result_type, 
    brier, ici, kl = KL_20_true_probas
  ) |> 
  filter(
    result_type %in% c("AUC*", "KL*", "High ICI", "Smallest", "Largest")
  ) |> 
  mutate(value_type = "mean")


table_models_interest_sd <- 
  models_interest_xgb |> 
  filter(sample == "test") |> 
  select(
    dgp, no_noise, recalib, sample, result_type, 
    brier = brier_sd, ici = ici_sd, 
    kl = KL_20_true_probas_sd
  ) |> 
  filter(
    result_type %in% c("AUC*", "KL*", "High ICI", "Smallest", "Largest")
  ) |> 
  mutate(value_type = "sd")

digits <- 3

table_models_interest_mean |> 
  bind_rows(table_models_interest_sd) |> 
  pivot_longer(cols = c(brier, ici, kl)) |> 
  pivot_wider(names_from = "value_type", values_from = "value") |> 
  mutate(value = str_c(round(`mean`, digits), " (", round(`sd`, digits), ")")) |> 
  select(-mean, -sd, -sample) |> 
  pivot_wider(names_from = c(recalib, name), values_from = value) |> 
  knitr::kable(
    align = str_c("cl", str_c(rep("c", 3*4 + 1), collapse = "")),
    escape = FALSE, booktabs = T, digits = 3, format = "markdown",
    col.names = c(
      "DGP", "Noise", "Optim.",
      rep(c("BS", "ICI", "KL"), 4)
    )
  ) |> 
  kableExtra::collapse_rows(columns = 1:2, valign = "top") |> 
  kableExtra::add_header_above(
    c(" " = 3,
      "None" = 3,
      "Platt" = 3,
      "Beta" = 3,
      "Isotonic" = 3
    )
  ) |> 
  kableExtra::scroll_box(fixed_thead = TRUE, height = "500px")
Table 5.2: Performance and calibration metrics (Brier score, Integrated Calibration Index, Kullback-Leibler divergence) computed on the test set, on the scores returned by the model (column ‘None’) and on scores recalibrated using Platt scaling (column ‘Platt’), Beta calibration (column ‘Beta’), or isotonic regression (column ‘Isotonic’)
| DGP | Noise | Optim. | BS (None) | ICI (None) | KL (None) | BS (Platt) | ICI (Platt) | KL (Platt) | BS (Beta) | ICI (Beta) | KL (Beta) | BS (Isotonic) | ICI (Isotonic) | KL (Isotonic) |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 0 noise variables | Smallest | 0.231 (0.001) | 0.115 (0.005) | 1.878 (0.063) | 0.214 (0.002) | 0.012 (0.004) | 2.037 (0.05) | 0.214 (0.002) | 0.012 (0.004) | 2.042 (0.061) | 0.214 (0.002) | 0.011 (0.004) | 2.04 (0.059) |
| | | Largest | 0.206 (0.002) | 0.026 (0.005) | 0.046 (0.012) | 0.206 (0.002) | 0.013 (0.004) | 0.095 (0.011) | 0.206 (0.002) | 0.013 (0.004) | 0.037 (0.012) | 0.206 (0.002) | 0.011 (0.004) | 0.306 (0.105) |
| | | AUC* | 0.201 (0.002) | 0.011 (0.005) | 0.051 (0.03) | 0.201 (0.002) | 0.017 (0.005) | 0.131 (0.031) | 0.201 (0.002) | 0.011 (0.004) | 0.051 (0.024) | 0.201 (0.002) | 0.012 (0.004) | 0.304 (0.095) |
| | | KL* | 0.202 (0.002) | 0.013 (0.005) | 0.021 (0.006) | 0.202 (0.002) | 0.015 (0.005) | 0.107 (0.016) | 0.202 (0.002) | 0.011 (0.004) | 0.027 (0.011) | 0.202 (0.002) | 0.012 (0.004) | 0.307 (0.101) |
| | | High ICI | 0.217 (0.002) | 0.062 (0.006) | 0.158 (0.066) | 0.213 (0.002) | 0.018 (0.005) | 0.199 (0.05) | 0.212 (0.002) | 0.013 (0.004) | 0.08 (0.053) | 0.212 (0.002) | 0.011 (0.004) | 0.348 (0.104) |
| | 10 noise variables | Smallest | 0.231 (0.001) | 0.115 (0.005) | 1.878 (0.063) | 0.214 (0.002) | 0.012 (0.004) | 2.037 (0.05) | 0.214 (0.002) | 0.012 (0.004) | 2.042 (0.061) | 0.214 (0.002) | 0.011 (0.004) | 2.04 (0.059) |
| | | Largest | 0.21 (0.002) | 0.04 (0.005) | 0.042 (0.011) | 0.209 (0.002) | 0.016 (0.004) | 0.151 (0.027) | 0.208 (0.002) | 0.011 (0.005) | 0.033 (0.011) | 0.209 (0.002) | 0.011 (0.005) | 0.31 (0.117) |
| | | AUC* | 0.201 (0.002) | 0.014 (0.005) | 0.063 (0.032) | 0.201 (0.002) | 0.018 (0.005) | 0.135 (0.025) | 0.201 (0.002) | 0.011 (0.004) | 0.053 (0.024) | 0.201 (0.002) | 0.011 (0.004) | 0.296 (0.101) |
| | | KL* | 0.204 (0.002) | 0.015 (0.005) | 0.01 (0.004) | 0.204 (0.002) | 0.016 (0.005) | 0.109 (0.015) | 0.203 (0.002) | 0.01 (0.004) | 0.017 (0.008) | 0.204 (0.002) | 0.012 (0.004) | 0.302 (0.11) |
| | | High ICI | 0.229 (0.003) | 0.106 (0.006) | 0.442 (0.194) | 0.216 (0.002) | 0.025 (0.005) | 0.386 (0.102) | 0.215 (0.002) | 0.011 (0.004) | 0.106 (0.144) | 0.215 (0.002) | 0.012 (0.004) | 0.364 (0.137) |
| | 50 noise variables | Smallest | 0.231 (0.001) | 0.115 (0.005) | 1.875 (0.05) | 0.214 (0.002) | 0.012 (0.004) | 2.037 (0.052) | 0.214 (0.002) | 0.011 (0.004) | 2.041 (0.056) | 0.214 (0.002) | 0.011 (0.004) | 2.041 (0.06) |
| | | Largest | 0.213 (0.002) | 0.048 (0.005) | 0.047 (0.011) | 0.211 (0.002) | 0.017 (0.005) | 0.192 (0.02) | 0.21 (0.002) | 0.011 (0.004) | 0.043 (0.013) | 0.211 (0.002) | 0.011 (0.005) | 0.333 (0.115) |
| | | AUC* | 0.201 (0.002) | 0.016 (0.005) | 0.08 (0.032) | 0.201 (0.002) | 0.018 (0.005) | 0.142 (0.029) | 0.201 (0.002) | 0.012 (0.004) | 0.06 (0.027) | 0.201 (0.002) | 0.012 (0.004) | 0.304 (0.098) |
| | | KL* | 0.205 (0.002) | 0.019 (0.005) | 0.009 (0.003) | 0.205 (0.002) | 0.016 (0.004) | 0.12 (0.02) | 0.205 (0.002) | 0.01 (0.004) | 0.022 (0.01) | 0.205 (0.002) | 0.012 (0.004) | 0.313 (0.095) |
| | | High ICI | 0.235 (0.003) | 0.129 (0.006) | 0.717 (0.169) | 0.216 (0.002) | 0.029 (0.006) | 0.453 (0.166) | 0.215 (0.002) | 0.01 (0.004) | 0.117 (0.223) | 0.215 (0.002) | 0.011 (0.004) | 0.359 (0.211) |
| | 100 noise variables | Smallest | 0.231 (0.001) | 0.115 (0.005) | 1.875 (0.05) | 0.214 (0.002) | 0.012 (0.004) | 2.037 (0.052) | 0.214 (0.002) | 0.011 (0.004) | 2.041 (0.056) | 0.214 (0.002) | 0.011 (0.004) | 2.041 (0.06) |
| | | Largest | 0.214 (0.002) | 0.051 (0.005) | 0.051 (0.012) | 0.212 (0.002) | 0.017 (0.004) | 0.205 (0.014) | 0.211 (0.002) | 0.01 (0.004) | 0.052 (0.013) | 0.212 (0.002) | 0.011 (0.005) | 0.321 (0.103) |
| | | AUC* | 0.201 (0.002) | 0.016 (0.005) | 0.087 (0.029) | 0.201 (0.002) | 0.018 (0.005) | 0.144 (0.024) | 0.201 (0.002) | 0.012 (0.004) | 0.061 (0.024) | 0.201 (0.002) | 0.011 (0.004) | 0.324 (0.114) |
| | | KL* | 0.206 (0.002) | 0.019 (0.005) | 0.009 (0.004) | 0.206 (0.002) | 0.015 (0.004) | 0.125 (0.023) | 0.206 (0.002) | 0.01 (0.004) | 0.025 (0.012) | 0.206 (0.002) | 0.011 (0.004) | 0.302 (0.101) |
| | | High ICI | 0.236 (0.003) | 0.136 (0.006) | 0.807 (0.093) | 0.216 (0.002) | 0.031 (0.005) | 0.444 (0.032) | 0.215 (0.002) | 0.01 (0.004) | 0.086 (0.055) | 0.215 (0.002) | 0.011 (0.004) | 0.343 (0.115) |
| 2 | 0 noise variables | Smallest | 0.192 (0.001) | 0.226 (0.005) | 3.212 (0.199) | 0.131 (0.002) | 0.035 (0.005) | 1.889 (0.089) | 0.131 (0.002) | 0.028 (0.005) | 1.894 (0.084) | 0.13 (0.002) | 0.009 (0.004) | 1.718 (0.283) |
| | | Largest | 0.123 (0.002) | 0.016 (0.004) | 0.024 (0.01) | 0.124 (0.002) | 0.042 (0.003) | 0.881 (0.034) | 0.122 (0.002) | 0.01 (0.004) | 0.019 (0.006) | 0.122 (0.002) | 0.009 (0.004) | 0.209 (0.066) |
| | | AUC* | 0.118 (0.002) | 0.01 (0.004) | 0.029 (0.014) | 0.12 (0.002) | 0.038 (0.004) | 0.783 (0.217) | 0.118 (0.002) | 0.009 (0.004) | 0.027 (0.012) | 0.118 (0.002) | 0.009 (0.004) | 0.214 (0.069) |
| | | KL* | 0.12 (0.002) | 0.01 (0.004) | 0.013 (0.005) | 0.121 (0.002) | 0.038 (0.004) | 0.863 (0.141) | 0.119 (0.002) | 0.009 (0.004) | 0.016 (0.007) | 0.12 (0.002) | 0.009 (0.004) | 0.215 (0.073) |
| | | High ICI | 0.131 (0.003) | 0.048 (0.004) | 0.128 (0.125) | 0.13 (0.003) | 0.05 (0.005) | 0.845 (0.042) | 0.127 (0.003) | 0.009 (0.004) | 0.046 (0.014) | 0.127 (0.003) | 0.009 (0.003) | 0.237 (0.073) |
| | 10 noise variables | Smallest | 0.192 (0.001) | 0.226 (0.005) | 3.212 (0.199) | 0.131 (0.002) | 0.035 (0.005) | 1.889 (0.089) | 0.131 (0.002) | 0.028 (0.005) | 1.894 (0.084) | 0.13 (0.002) | 0.009 (0.004) | 1.718 (0.283) |
| | | Largest | 0.126 (0.002) | 0.028 (0.004) | 0.032 (0.01) | 0.127 (0.002) | 0.045 (0.003) | 0.869 (0.035) | 0.124 (0.002) | 0.009 (0.004) | 0.022 (0.008) | 0.124 (0.002) | 0.01 (0.004) | 0.222 (0.075) |
| | | AUC* | 0.119 (0.002) | 0.012 (0.004) | 0.03 (0.016) | 0.12 (0.002) | 0.038 (0.004) | 0.759 (0.221) | 0.118 (0.002) | 0.01 (0.003) | 0.026 (0.011) | 0.119 (0.002) | 0.009 (0.004) | 0.213 (0.074) |
| | | KL* | 0.12 (0.002) | 0.011 (0.003) | 0.007 (0.003) | 0.122 (0.002) | 0.04 (0.004) | 0.887 (0.076) | 0.12 (0.002) | 0.009 (0.004) | 0.012 (0.006) | 0.121 (0.002) | 0.01 (0.004) | 0.205 (0.074) |
| | | High ICI | 0.137 (0.003) | 0.075 (0.004) | 0.288 (0.127) | 0.132 (0.002) | 0.058 (0.004) | 1.24 (0.354) | 0.128 (0.002) | 0.009 (0.004) | 0.045 (0.029) | 0.128 (0.002) | 0.009 (0.003) | 0.263 (0.1) |
| | 50 noise variables | Smallest | 0.192 (0.001) | 0.226 (0.005) | 3.212 (0.199) | 0.131 (0.002) | 0.035 (0.005) | 1.889 (0.089) | 0.131 (0.002) | 0.028 (0.005) | 1.894 (0.084) | 0.13 (0.002) | 0.009 (0.003) | 1.718 (0.283) |
| | | Largest | 0.127 (0.002) | 0.033 (0.004) | 0.039 (0.011) | 0.128 (0.002) | 0.047 (0.003) | 0.863 (0.036) | 0.125 (0.002) | 0.009 (0.003) | 0.028 (0.009) | 0.125 (0.002) | 0.009 (0.003) | 0.223 (0.072) |
| | | AUC* | 0.119 (0.002) | 0.013 (0.004) | 0.038 (0.018) | 0.12 (0.002) | 0.038 (0.004) | 0.728 (0.221) | 0.119 (0.002) | 0.01 (0.004) | 0.029 (0.013) | 0.119 (0.002) | 0.009 (0.003) | 0.207 (0.069) |
| | | KL* | 0.121 (0.002) | 0.011 (0.003) | 0.006 (0.003) | 0.123 (0.002) | 0.041 (0.004) | 0.894 (0.045) | 0.121 (0.002) | 0.009 (0.003) | 0.014 (0.008) | 0.121 (0.002) | 0.009 (0.004) | 0.215 (0.068) |
| | | High ICI | 0.139 (0.003) | 0.089 (0.004) | 0.429 (0.105) | 0.133 (0.003) | 0.064 (0.005) | 1.799 (0.155) | 0.127 (0.002) | 0.01 (0.004) | 0.041 (0.02) | 0.127 (0.002) | 0.009 (0.003) | 0.235 (0.073) |
| | 100 noise variables | Smallest | 0.192 (0.001) | 0.226 (0.005) | 3.21 (0.199) | 0.132 (0.002) | 0.035 (0.005) | 1.887 (0.091) | 0.131 (0.002) | 0.028 (0.005) | 1.894 (0.087) | 0.13 (0.002) | 0.009 (0.003) | 1.721 (0.286) |
| | | Largest | 0.129 (0.002) | 0.037 (0.004) | 0.044 (0.011) | 0.129 (0.002) | 0.048 (0.003) | 0.856 (0.035) | 0.126 (0.002) | 0.009 (0.003) | 0.034 (0.01) | 0.126 (0.002) | 0.009 (0.003) | 0.22 (0.073) |
| | | AUC* | 0.119 (0.002) | 0.014 (0.004) | 0.044 (0.023) | 0.121 (0.002) | 0.038 (0.004) | 0.729 (0.224) | 0.119 (0.002) | 0.01 (0.004) | 0.03 (0.013) | 0.119 (0.002) | 0.009 (0.003) | 0.214 (0.062) |
| | | KL* | 0.122 (0.002) | 0.012 (0.004) | 0.006 (0.003) | 0.124 (0.003) | 0.041 (0.004) | 0.89 (0.033) | 0.122 (0.002) | 0.009 (0.004) | 0.016 (0.008) | 0.122 (0.002) | 0.009 (0.003) | 0.217 (0.076) |
| | | High ICI | 0.14 (0.003) | 0.093 (0.004) | 0.482 (0.099) | 0.133 (0.003) | 0.065 (0.005) | 1.842 (0.155) | 0.127 (0.002) | 0.01 (0.004) | 0.042 (0.014) | 0.127 (0.002) | 0.009 (0.004) | 0.23 (0.07) |
| 3 | 0 noise variables | Smallest | 0.24 (0.001) | 0.075 (0.006) | 1.827 (0.099) | 0.233 (0.001) | 0.011 (0.005) | 1.65 (0.21) | 0.233 (0.001) | 0.011 (0.004) | 1.663 (0.205) | 0.233 (0.001) | 0.011 (0.004) | 1.674 (0.208) |
| | | Largest | 0.229 (0.002) | 0.044 (0.005) | 0.108 (0.026) | 0.226 (0.001) | 0.012 (0.005) | 0.092 (0.022) | 0.226 (0.001) | 0.012 (0.004) | 0.068 (0.022) | 0.226 (0.001) | 0.012 (0.005) | 0.291 (0.101) |
| | | AUC* | 0.22 (0.002) | 0.01 (0.004) | 0.012 (0.009) | 0.221 (0.002) | 0.012 (0.004) | 0.041 (0.013) | 0.22 (0.002) | 0.01 (0.004) | 0.013 (0.008) | 0.221 (0.002) | 0.011 (0.004) | 0.268 (0.106) |
| | | KL* | 0.221 (0.002) | 0.012 (0.004) | 0.005 (0.002) | 0.221 (0.002) | 0.011 (0.004) | 0.047 (0.014) | 0.221 (0.002) | 0.01 (0.004) | 0.017 (0.011) | 0.222 (0.002) | 0.011 (0.004) | 0.286 (0.115) |
| | | High ICI | 0.246 (0.002) | 0.105 (0.005) | 0.631 (0.086) | 0.231 (0.001) | 0.014 (0.004) | 0.268 (0.047) | 0.231 (0.001) | 0.011 (0.004) | 0.174 (0.035) | 0.231 (0.001) | 0.011 (0.004) | 0.376 (0.108) |
| | 10 noise variables | Smallest | 0.24 (0.001) | 0.075 (0.006) | 1.827 (0.099) | 0.233 (0.001) | 0.011 (0.005) | 1.65 (0.21) | 0.233 (0.001) | 0.011 (0.004) | 1.663 (0.205) | 0.233 (0.001) | 0.011 (0.004) | 1.674 (0.208) |
| | | Largest | 0.231 (0.002) | 0.052 (0.005) | 0.11 (0.024) | 0.227 (0.001) | 0.012 (0.004) | 0.123 (0.025) | 0.227 (0.001) | 0.011 (0.004) | 0.074 (0.024) | 0.227 (0.001) | 0.012 (0.004) | 0.307 (0.103) |
| | | AUC* | 0.221 (0.001) | 0.011 (0.004) | 0.027 (0.018) | 0.221 (0.002) | 0.012 (0.005) | 0.046 (0.014) | 0.221 (0.001) | 0.01 (0.004) | 0.017 (0.009) | 0.221 (0.002) | 0.011 (0.004) | 0.28 (0.099) |
| | | KL* | 0.222 (0.002) | 0.014 (0.005) | 0.004 (0.002) | 0.222 (0.002) | 0.012 (0.004) | 0.056 (0.019) | 0.222 (0.002) | 0.01 (0.004) | 0.023 (0.016) | 0.222 (0.002) | 0.011 (0.004) | 0.284 (0.103) |
| | | High ICI | 0.253 (0.003) | 0.127 (0.005) | 0.932 (0.118) | 0.232 (0.001) | 0.015 (0.004) | 0.366 (0.035) | 0.232 (0.001) | 0.01 (0.004) | 0.191 (0.04) | 0.232 (0.001) | 0.011 (0.004) | 0.392 (0.108) |
| | 50 noise variables | Smallest | 0.24 (0.001) | 0.075 (0.006) | 1.827 (0.099) | 0.233 (0.001) | 0.011 (0.005) | 1.65 (0.21) | 0.233 (0.001) | 0.011 (0.004) | 1.663 (0.205) | 0.233 (0.001) | 0.011 (0.004) | 1.674 (0.208) |
| | | Largest | 0.233 (0.002) | 0.06 (0.005) | 0.12 (0.025) | 0.228 (0.002) | 0.012 (0.005) | 0.163 (0.027) | 0.228 (0.002) | 0.01 (0.004) | 0.097 (0.029) | 0.229 (0.002) | 0.011 (0.004) | 0.342 (0.105) |
| | | AUC* | 0.221 (0.001) | 0.013 (0.005) | 0.053 (0.03) | 0.221 (0.002) | 0.012 (0.004) | 0.049 (0.015) | 0.221 (0.002) | 0.01 (0.004) | 0.021 (0.011) | 0.222 (0.002) | 0.011 (0.004) | 0.29 (0.124) |
| | | KL* | 0.224 (0.002) | 0.018 (0.006) | 0.004 (0.002) | 0.224 (0.002) | 0.011 (0.004) | 0.075 (0.023) | 0.224 (0.002) | 0.01 (0.004) | 0.037 (0.021) | 0.224 (0.002) | 0.011 (0.004) | 0.284 (0.104) |
| | | High ICI | 0.259 (0.003) | 0.145 (0.006) | 1.285 (0.169) | 0.233 (0.001) | 0.017 (0.005) | 0.402 (0.027) | 0.232 (0.001) | 0.01 (0.004) | 0.204 (0.044) | 0.232 (0.001) | 0.011 (0.004) | 0.424 (0.127) |
| | 100 noise variables | Smallest | 0.24 (0.001) | 0.075 (0.007) | 1.827 (0.099) | 0.233 (0.001) | 0.011 (0.005) | 1.65 (0.21) | 0.233 (0.001) | 0.011 (0.004) | 1.663 (0.205) | 0.233 (0.001) | 0.011 (0.004) | 1.674 (0.208) |
| | | Largest | 0.235 (0.002) | 0.065 (0.005) | 0.129 (0.026) | 0.229 (0.001) | 0.012 (0.004) | 0.185 (0.028) | 0.229 (0.001) | 0.01 (0.004) | 0.115 (0.029) | 0.229 (0.001) | 0.012 (0.005) | 0.348 (0.119) |
| | | AUC* | 0.222 (0.001) | 0.015 (0.006) | 0.067 (0.031) | 0.222 (0.002) | 0.012 (0.004) | 0.052 (0.016) | 0.221 (0.002) | 0.01 (0.004) | 0.024 (0.012) | 0.222 (0.002) | 0.011 (0.004) | 0.286 (0.107) |
| | | KL* | 0.225 (0.002) | 0.019 (0.005) | 0.004 (0.002) | 0.224 (0.002) | 0.011 (0.004) | 0.08 (0.021) | 0.224 (0.002) | 0.01 (0.004) | 0.042 (0.02) | 0.225 (0.002) | 0.011 (0.004) | 0.301 (0.122) |
| | | High ICI | 0.261 (0.003) | 0.152 (0.005) | 1.454 (0.18) | 0.233 (0.001) | 0.017 (0.004) | 0.418 (0.036) | 0.232 (0.001) | 0.01 (0.004) | 0.206 (0.038) | 0.233 (0.001) | 0.011 (0.004) | 0.416 (0.11) |
| 4 | 0 noise variables | Smallest | 0.239 (0.001) | 0.081 (0.01) | 2.366 (0.299) | 0.229 (0.002) | 0.011 (0.005) | 2.059 (0.108) | 0.229 (0.002) | 0.011 (0.005) | 2.055 (0.097) | 0.229 (0.002) | 0.011 (0.005) | 2.061 (0.113) |
| | | Largest | 0.209 (0.002) | 0.028 (0.005) | 0.019 (0.006) | 0.208 (0.002) | 0.015 (0.004) | 0.117 (0.015) | 0.208 (0.002) | 0.011 (0.004) | 0.024 (0.009) | 0.208 (0.002) | 0.011 (0.004) | 0.315 (0.115) |
| | | AUC* | 0.204 (0.002) | 0.011 (0.004) | 0.039 (0.021) | 0.205 (0.002) | 0.016 (0.004) | 0.13 (0.02) | 0.204 (0.002) | 0.011 (0.004) | 0.035 (0.014) | 0.205 (0.002) | 0.011 (0.005) | 0.294 (0.1) |
| | | KL* | 0.206 (0.002) | 0.018 (0.005) | 0.011 (0.004) | 0.206 (0.002) | 0.015 (0.004) | 0.115 (0.012) | 0.206 (0.002) | 0.01 (0.004) | 0.019 (0.007) | 0.206 (0.002) | 0.011 (0.004) | 0.289 (0.105) |
| | | High ICI | 0.222 (0.003) | 0.073 (0.006) | 0.199 (0.286) | 0.215 (0.003) | 0.019 (0.006) | 0.249 (0.191) | 0.215 (0.003) | 0.011 (0.004) | 0.113 (0.215) | 0.215 (0.003) | 0.012 (0.005) | 0.36 (0.222) |
| | 10 noise variables | Smallest | 0.239 (0.001) | 0.081 (0.011) | 2.366 (0.299) | 0.229 (0.002) | 0.011 (0.005) | 2.059 (0.108) | 0.229 (0.002) | 0.011 (0.005) | 2.055 (0.097) | 0.229 (0.002) | 0.011 (0.005) | 2.061 (0.113) |
| | | Largest | 0.213 (0.002) | 0.036 (0.005) | 0.018 (0.006) | 0.211 (0.002) | 0.015 (0.004) | 0.173 (0.02) | 0.211 (0.002) | 0.011 (0.005) | 0.048 (0.013) | 0.211 (0.002) | 0.011 (0.005) | 0.307 (0.106) |
| | | AUC* | 0.206 (0.002) | 0.014 (0.005) | 0.089 (0.026) | 0.206 (0.002) | 0.016 (0.005) | 0.142 (0.022) | 0.206 (0.002) | 0.012 (0.004) | 0.06 (0.017) | 0.206 (0.002) | 0.012 (0.005) | 0.294 (0.104) |
| | | KL* | 0.211 (0.002) | 0.028 (0.005) | 0.014 (0.005) | 0.21 (0.002) | 0.015 (0.004) | 0.156 (0.02) | 0.21 (0.002) | 0.011 (0.005) | 0.043 (0.012) | 0.21 (0.002) | 0.011 (0.005) | 0.307 (0.091) |
| | | High ICI | 0.232 (0.002) | 0.105 (0.005) | 0.307 (0.236) | 0.219 (0.002) | 0.021 (0.005) | 0.391 (0.109) | 0.219 (0.002) | 0.01 (0.004) | 0.14 (0.144) | 0.219 (0.002) | 0.011 (0.005) | 0.422 (0.181) |
| | 50 noise variables | Smallest | 0.239 (0.001) | 0.081 (0.01) | 2.366 (0.299) | 0.229 (0.002) | 0.011 (0.005) | 2.059 (0.108) | 0.229 (0.002) | 0.011 (0.005) | 2.055 (0.097) | 0.229 (0.002) | 0.011 (0.005) | 2.061 (0.113) |
| | | Largest | 0.216 (0.002) | 0.042 (0.006) | 0.019 (0.005) | 0.214 (0.002) | 0.014 (0.004) | 0.206 (0.019) | 0.214 (0.002) | 0.011 (0.005) | 0.079 (0.019) | 0.215 (0.002) | 0.012 (0.005) | 0.327 (0.1) |
| | | AUC* | 0.207 (0.002) | 0.019 (0.005) | 0.126 (0.031) | 0.207 (0.002) | 0.016 (0.005) | 0.145 (0.025) | 0.207 (0.002) | 0.012 (0.004) | 0.072 (0.02) | 0.207 (0.002) | 0.012 (0.005) | 0.3 (0.104) |
| | | KL* | 0.215 (0.002) | 0.034 (0.005) | 0.017 (0.004) | 0.213 (0.002) | 0.014 (0.004) | 0.19 (0.021) | 0.213 (0.002) | 0.011 (0.004) | 0.072 (0.018) | 0.214 (0.002) | 0.012 (0.004) | 0.345 (0.101) |
| | | High ICI | 0.238 (0.003) | 0.125 (0.006) | 0.422 (0.047) | 0.221 (0.002) | 0.024 (0.005) | 0.424 (0.039) | 0.22 (0.002) | 0.011 (0.005) | 0.134 (0.023) | 0.22 (0.002) | 0.012 (0.005) | 0.401 (0.119) |
| | 100 noise variables | Smallest | 0.239 (0.001) | 0.081 (0.01) | 2.366 (0.299) | 0.229 (0.002) | 0.011 (0.005) | 2.059 (0.108) | 0.229 (0.002) | 0.011 (0.005) | 2.055 (0.097) | 0.229 (0.002) | 0.011 (0.005) | 2.061 (0.113) |
| | | Largest | 0.218 (0.002) | 0.045 (0.006) | 0.02 (0.005) | 0.216 (0.002) | 0.014 (0.005) | 0.218 (0.019) | 0.216 (0.002) | 0.011 (0.004) | 0.092 (0.02) | 0.216 (0.002) | 0.012 (0.005) | 0.35 (0.105) |
| | | AUC* | 0.208 (0.002) | 0.021 (0.007) | 0.145 (0.035) | 0.207 (0.002) | 0.015 (0.005) | 0.147 (0.025) | 0.207 (0.002) | 0.012 (0.004) | 0.078 (0.021) | 0.207 (0.002) | 0.012 (0.005) | 0.334 (0.109) |
| | | KL* | 0.216 (0.002) | 0.037 (0.006) | 0.017 (0.004) | 0.215 (0.002) | 0.013 (0.005) | 0.202 (0.022) | 0.215 (0.002) | 0.011 (0.004) | 0.084 (0.021) | 0.215 (0.002) | 0.012 (0.005) | 0.367 (0.107) |
| | | High ICI | 0.241 (0.003) | 0.133 (0.006) | 0.486 (0.046) | 0.221 (0.002) | 0.025 (0.004) | 0.468 (0.057) | 0.22 (0.002) | 0.011 (0.005) | 0.141 (0.023) | 0.22 (0.002) | 0.011 (0.005) | 0.398 (0.101) |

5.5.5 Before vs. After Recalibration

Let us visualize how the KL divergence and the ICI of a selected model change after the scores are recalibrated, using Platt scaling, Beta calibration, or isotonic regression.
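
For reference, the three recalibration mappings compared here can be sketched as follows, each fitted on the calibration set and then applied to the test scores. The recalibrate() helper and its arguments are illustrative rather than the chapter's own code; the Beta branch follows the betacal package's documented beta_calibration()/beta_predict() interface.

recalibrate <- function(scores_calib, y_calib, scores_test,
                        method = c("platt", "isotonic", "beta")) {
  method <- match.arg(method)
  if (method == "platt") {
    # Platt scaling: logistic regression of the outcome on the scores
    fit <- glm(
      y ~ s, family = binomial(),
      data = data.frame(y = y_calib, s = scores_calib)
    )
    predict(fit, newdata = data.frame(s = scores_test), type = "response")
  } else if (method == "isotonic") {
    # Isotonic regression: non-decreasing step function from scores to outcomes
    step_fn <- as.stepfun(isoreg(scores_calib, y_calib))
    pmin(pmax(step_fn(scores_test), 0), 1)
  } else {
    # Beta calibration (the betacal package is loaded at the top of the chapter)
    fit <- beta_calibration(scores_calib, y_calib, parameters = "abm")
    beta_predict(scores_test, fit)
  }
}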

Code
table_models_interest_mean <- 
  table_models_interest_mean |> 
  mutate(
    no_noise = fct_recode(no_noise, `0 noise variable` = "0 noise variables"),
    dgp = str_c("DGP ", dgp)
  )

initial_points <- table_models_interest_mean |> 
  filter(recalib == "None")

points_after_c <- initial_points |> 
  select(dgp, no_noise, result_type, ici, kl) |> 
  left_join(
    table_models_interest_mean |> 
      filter(recalib %in% c("Platt", "Beta", "Isotonic")) |> 
      select(dgp, no_noise, recalib, result_type, ici, kl) |> 
      rename(ici_end = ici, kl_end = kl),
    relationship = "many-to-many" # Platt, Beta and Isotonic all join the same initial point
  )
Joining with `by = join_by(dgp, no_noise, result_type)`
Code
p_kl_vs_ici_recalib <- 
  ggplot() +
  geom_point(
    data = initial_points,
    mapping = aes(x = ici, y = kl, shape = result_type),
    size = 2
  ) +
  geom_segment(
    data = points_after_c,
    mapping = aes(
      x = ici, y = kl, xend = ici_end, yend = kl_end,
      colour = recalib, linetype = recalib
    ),
    arrow = arrow(length=unit(.2, 'cm'))
  ) +
  # facet_grid(dgp~no_noise, scales = "free") +
  ggh4x::facet_grid2(
    dgp~no_noise, independent = "none", scales = "free_y", axes = "all"
  ) +
  scale_shape_manual(
    NULL,
    values = c(
      # "Smallest" = 1,
      "AUC*" = 19,
      "KL*" = 15,
      "High ICI" = 17
    )
  ) +
  labs(x = "ICI", y = "KL Divergence") +
  scale_colour_manual("Recalibration", values = colour_recalib) +
  scale_linetype_discrete("Recalibration") +
  theme_paper()

ggsave(p_kl_vs_ici_recalib, file = "../figs/kl_vs_ici_recalib.pdf", width = 12, height = 8)
p_kl_vs_ici_recalib
Figure 5.19: Average KL divergence and ICI before and after recalibration of the estimated scores, for selected models, by DGP and number of noise variables.