8  Recalibration: Example

Note

This chapter illustrates the application of the method on a real-world dataset. A GAMSEL model was trained on a binary outcome to estimate the underlying event probabilities from the available covariates, and we treat the Beta distribution fitted to these estimates as the "true probability distribution". Here, we train an XGBoost model (the hyperparameters are chosen arbitrarily rather than tuned on the validation sample) and compute its predictions on the train, validation, calibration, and test sets. Recalibration models are fitted on the calibration set and then applied to the test set to compute performance, dispersion, and calibration metrics.

Code Availability

The functions for data preprocessing, model estimation, and Beta distribution fitting are stored in ../scripts/functions/real-data.R and will be used in subsequent chapters.

library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(philentropy)
library(ranger)
library(xgboost)

Attaching package: 'xgboost'

The following object is masked from 'package:dplyr':

    slice
library(pbapply)
library(parallel)
library(gam)
Loading required package: splines
Loading required package: foreach

Attaching package: 'foreach'

The following objects are masked from 'package:purrr':

    accumulate, when

Loaded gam 1.22-5
library(betacal)
library(gamsel)
Loaded gamsel 1.8-5
library(locfit)
locfit 1.5-9.10      2024-06-24

Attaching package: 'locfit'

The following object is masked from 'package:purrr':

    none
# Colours for train/validation/calibration/test
colour_samples <- c(
  "Train" = "#0072B2",
  "Validation" = "#009E73",
  "Calibration" = "#CC79A7",
  "Test" = "#D55E00"
)

colour_recalib <- c(
  "None" = "#88CCEE",
  "Platt" = "#44AA99",
  "Isotonic" = "#882255",
  "Beta" = "darkgoldenrod",
  "Locfit" = "firebrick3"
)

# Functions
source("../scripts/functions/real-data.R")
source("../scripts/functions/metrics.R")
definition of the theme_paper() function (for ggplot2 graphs)
#' Theme for ggplot2
#'
#' @param ... arguments passed to the theme function
#' @export
#' @importFrom ggplot2 element_rect element_text element_blank element_line unit
#'   rel
theme_paper <- function (...) {
  ggthemes::theme_base() +
    theme(
      plot.background = element_blank(),
      legend.background = element_rect(
        fill = "transparent", linetype="solid", colour ="black"),
      legend.position = "bottom",
      legend.direction = "horizontal",
      legend.box = "horizontal",
      legend.key = element_blank()
    )
}

8.1 Setup

8.1.1 XGBoost

To train the models, we rely on the {xgboost} R package.

library(xgboost)

8.1.2 Recalibration

Here, we define a function to recalibrate predicted scores using Platt scaling, isotonic regression, Beta calibration, or local regression (locfit). The recalibration model is first fitted on the calibration set and then applied to both the calibration and test sets.

#' Recalibrates scores using a calibration
#' 
#' @param obs_calib vector of observed events in the calibration set
#' @param obs_test vector of observed events in the test set
#' @param pred_calib vector of predicted probabilities in the calibration set
#' @param pred_test vector of predicted probabilities in the test set
#' @param method recalibration method (`"platt"` for Platt scaling, 
#'   `"isotonic"` for isotonic regression, `"beta"` for Beta calibration, 
#'   `"locfit"` for local regression techniques)
#' @returns list of two elements: recalibrated scores on the calibration set,
#'   recalibrated scores on the test set
recalibrate <- function(obs_calib,
                        obs_test,
                        pred_calib,
                        pred_test,
                        method = c("platt", "isotonic", "beta", "locfit")) {
  data_calib <- tibble(d = obs_calib, scores = pred_calib)
  data_test <- tibble(d = obs_test, scores = pred_test)
  
  if (method == "platt") {
    lr <- glm(d ~ scores, family = binomial(link = 'logit'), data = data_calib)
    score_c_calib <- predict(lr, newdata = data_calib, type = "response")
    score_c_test <- predict(lr, newdata = data_test, type = "response")
  } else if (method == "isotonic") {
    iso <- isoreg(x = data_calib$scores, y = data_calib$d)
    fit_iso <- as.stepfun(iso)
    score_c_calib <- fit_iso(data_calib$scores)
    score_c_test <- fit_iso(data_test$scores)
  } else if (method == "beta") {
    fit_beta <- beta_calibration(data_calib$scores, data_calib$d, parameters = "abm")
    score_c_calib <- beta_predict(data_calib$scores, fit_beta)
    score_c_test <- beta_predict(data_test$scores, fit_beta)
  } else if (method == "locfit") {
    noise_scores <- data_calib$scores + rnorm(nrow(data_calib), 0, 0.01)
    noise_data_calib <- data_calib %>% mutate(scores = noise_scores)
    locfit_reg <- locfit(
      formula = d ~ lp(scores, nn = 0.15, deg = 0),
      kern = "rect", maxk = 200, data = noise_data_calib
    )
    score_c_calib <- predict(locfit_reg, newdata = data_calib)
    score_c_test <- predict(locfit_reg, newdata = data_test)  
  } else {
    stop("Unrecognized method: platt or isotonic only")
  }
  # Format results in tibbles:
  # For calibration set
  tb_score_c_calib <- tibble(
    d = obs_calib,
    p_u = pred_calib,
    p_c = score_c_calib
  )
  # For test set
  tb_score_c_test <- tibble(
    d = obs_test,
    p_u = pred_test,
    p_c = score_c_test
  )
  
  list(
    tb_score_c_calib = tb_score_c_calib,
    tb_score_c_test = tb_score_c_test
  )
  
}
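As a quick illustration, here is a minimal sketch of how recalibrate() can be used on simulated data (the data below are purely illustrative and not part of the study): the raw scores are drawn uniformly, while the true event probability is the square of the score, so the raw scores are deliberately miscalibrated.

# Illustrative example on simulated, miscalibrated scores
set.seed(123)
pred_calib_sim <- runif(500)
obs_calib_sim <- rbinom(500, size = 1, prob = pred_calib_sim^2)
pred_test_sim <- runif(500)
obs_test_sim <- rbinom(500, size = 1, prob = pred_test_sim^2)

recalib_platt_sim <- recalibrate(
  obs_calib = obs_calib_sim,
  obs_test = obs_test_sim,
  pred_calib = pred_calib_sim,
  pred_test = pred_test_sim,
  method = "platt"
)
# Recalibrated scores on the test set (d: observed, p_u: raw, p_c: recalibrated)
head(recalib_platt_sim$tb_score_c_test)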

8.1.3 Dispersion Metrics

We modify our dispersion_metrics() function to replace the vector of simulated true probabilities with the parameters of a Beta distribution. This adjustment allows us to compute the divergence between the model-estimated scores and the specified Beta distribution.

Function dispersion_metrics_beta()
#' Computes the dispersion and divergence metrics for a vector of scores and
#' a Beta distribution
#'
#' @param shape_1 first parameter of the beta distribution
#' @param shape_2 second parameter of the beta distribution
#' @param scores predicted scores
#'
#' @returns
#' \itemize{
#'   \item \code{inter_quantile_25_75}: ratio of the inter-quantile range (25%-75%) of the scores to that of the Beta distribution
#'   \item \code{inter_quantile_10_90}: ratio of the inter-quantile range (10%-90%) of the scores to that of the Beta distribution
#'   \item \code{KL_10_true_probas}: KL divergence of predicted probabilities w.r.t. true probabilities, with 10 bins
#'   \item \code{KL_10_scores}: KL divergence of true probabilities w.r.t. predicted probabilities, with 10 bins
#'   \item \code{KL_20_true_probas}: KL divergence of predicted probabilities w.r.t. true probabilities, with 20 bins
#'   \item \code{KL_20_scores}: KL divergence of true probabilities w.r.t. predicted probabilities, with 20 bins
#'   \item \code{ind_cov}: absolute difference between the variance of the true probabilities and the covariance between true probabilities and predicted scores
#' }
dispersion_metrics_beta <- function(shape_1 = 1, shape_2 = 1, scores){

  # Inter-quantiles
  inter_q_80 <- diff(quantile(scores, c(.9, .1))) /
    diff(qbeta(c(.9, .1), shape_1, shape_2))
  inter_q_50 <- diff(quantile(scores, c(.75,.25))) /
    diff(qbeta(c(.75,.25), shape_1, shape_2))

  # KL divergences
  m <- 10 # Number of bins
  h_phat <- hist(scores, breaks = seq(0, 1, length = m + 1), plot = FALSE)
  h_p <- list(breaks = h_phat$breaks, mids = h_phat$mids)
  h_p$density = diff(pbeta(h_p$breaks, shape_1, shape_2))
  h_p$counts =  h_p$density*length(scores)

  # Convert to bin probabilities (hist densities divided by the number of bins)
  h1 <- rbind(h_phat$density / m, h_p$density) # Reference : true probabilities
  h2 <- rbind(h_p$density, h_phat$density / m) # Reference : predicted scores
  KL_10_true_probas <- distance(
    h1, method = "kullback-leibler", unit = "log2", mute.message = TRUE)
  KL_10_scores <- distance(
    h2, method = "kullback-leibler", unit = "log2", mute.message = TRUE)


  m <- 20 # Number of bins
  h_phat <- hist(scores, breaks = seq(0, 1, length = m + 1), plot = FALSE)
  h_p <- list(breaks = h_phat$breaks, mids = h_phat$mids)
  h_p$density = diff(pbeta(h_p$breaks, shape_1, shape_2))
  h_p$counts =  h_p$density * length(scores)
  # Densities
  h1 <- rbind(h_phat$density / m, h_p$density) # Reference : true probabilities
  h2 <- rbind(h_p$density, h_phat$density / m) # Reference : predicted scores
  KL_20_true_probas <- distance(
    h1, method = "kullback-leibler", unit = "log2", mute.message = TRUE)
  KL_20_scores <- distance(
    h2, method = "kullback-leibler", unit = "log2", mute.message = TRUE)

  # Indicator of the difference between variance and covariance
  var_p <- shape_1 * shape_2 / ((shape_1 + shape_2)^2 * (shape_1 + shape_2 + 1))
  cov_p_phat <- cov(
    qbeta(
      rank(scores, ties.method = "average") / (1 + length(scores)),
      shape_1,
      shape_2),
    scores
  )
  ind_cov <- abs(cov_p_phat - var_p)

  # Collection
  dispersion_metrics <- tibble(
    "inter_quantile_25_75" = as.numeric(inter_q_50),
    "inter_quantile_10_90" = as.numeric(inter_q_80),
    "KL_10_true_probas" = as.numeric(KL_10_true_probas),
    "KL_10_scores" = as.numeric(KL_10_scores),
    "KL_20_true_probas" = as.numeric(KL_20_true_probas),
    "KL_20_scores" = as.numeric(KL_20_scores),
    "ind_cov" = ind_cov
  )

  dispersion_metrics
}

disp_metrics_dataset <- function(prior, scores) {
  # GAMSEL prior
  shape_1 <- prior$mle_gamsel$estimate["shape1"]
  shape_2 <- prior$mle_gamsel$estimate["shape2"]

  # Divergence metrics
  dist_prior_gamsel <- dispersion_metrics_beta(
    shape_1 = shape_1, shape_2 = shape_2, scores = scores
  )

  dist_prior_gamsel |>
    mutate(
      prior = "gamsel", shape_1 = shape_1, shape_2 = shape_2
      )
}
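As a sanity check of these metrics, dispersion_metrics_beta() can be applied to scores drawn from the reference Beta distribution itself (a purely illustrative simulation): the inter-quantile ratios should then be close to 1 and the KL divergences close to 0.

# Illustrative check: scores drawn from the reference Beta(2, 2) distribution
set.seed(123)
scores_sim <- rbeta(10000, shape1 = 2, shape2 = 2)
dispersion_metrics_beta(shape_1 = 2, shape_2 = 2, scores = scores_sim)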

8.1.4 Estimation Functions: Extreme Gradient Boosting

We train XGBoost models on the "spambase" dataset, varying the maximum tree depth (2, 4, or 6) and evaluating every boosting iteration (up to 500). At each iteration, we compute metrics based on the scores obtained on the train, validation, calibration, and test samples, as well as on the recalibrated scores.

This function allows us to calculate the performance, dispersion and calibration metrics on all sets of the dataset at a given iteration for a single XGBoost algorithm:

Function get_metrics_xgb_iter()
#' Get the metrics based on scores estimated at a given boosting iteration
#'
#' @param scores scores estimated at boosting iteration `nb_iter` (list with
#'   train, validation, calibration and test scores, returned by
#'   `predict_score_iter()`)
#' @param prior prior obtained with `get_beta_fit()`
#' @param data_train train set
#' @param data_valid validation set
#' @param data_calib calibration set
#' @param data_test test set
#' @param prior_train train prior scores
#' @param prior_valid validation prior scores
#' @param prior_calib calibration prior scores
#' @param prior_test test prior scores
#' @param target_name name of the target variable
#' @param ind index of the grid search
#' @param nb_iter boosting iteration to consider
#' @param params hyperparameters to consider
#'
#' @returns A list with the following elements:
#'  - `ind`, `nb_iter`, `max_depth`: identifiers of the model and iteration
#'  - `tb_metrics`: performance / calibration / dispersion metrics
#'  - `tb_prop_scores`: table with P(q1 < score < q2)
#'  - `tb_decomposition`: decomposition of expected losses
#'  - `scores_hist`: histograms of scores
get_metrics_xgb_iter <- function(scores,
                                 prior,
                                 data_train,
                                 data_valid,
                                 data_calib,
                                 data_test,
                                 prior_train,
                                 prior_valid,
                                 prior_calib,
                                 prior_test,
                                 target_name,
                                 ind,
                                 nb_iter,
                                 params) {

  scores_train <- scores$scores_train
  scores_valid <- scores$scores_valid
  scores_calib <- scores$scores_calib
  scores_test <- scores$scores_test
  
  # Recalibration
  # Platt scaling
  res_recalibration_platt <- recalibrate(
    obs_calib = data_calib |> dplyr::pull(!!target_name), 
    obs_test = data_test |> dplyr::pull(!!target_name), 
    pred_calib = scores_calib, 
    pred_test = scores_test, 
    method = "platt"
  )
  scores_c_platt_calib <- res_recalibration_platt$tb_score_c_calib$p_c
  scores_c_platt_test <- res_recalibration_platt$tb_score_c_test$p_c
  
  # Isotonic regression
  res_recalibration_iso <- recalibrate(
    obs_calib = data_calib |> dplyr::pull(!!target_name), 
    obs_test = data_test |> dplyr::pull(!!target_name), 
    pred_calib = scores_calib, 
    pred_test = scores_test, 
    method = "isotonic"
  )
  scores_c_iso_calib <- res_recalibration_iso$tb_score_c_calib$p_c
  scores_c_iso_test <- res_recalibration_iso$tb_score_c_test$p_c
  
  # Beta calibration
  res_recalibration_beta <- recalibrate(
    obs_calib = data_calib |> dplyr::pull(!!target_name), 
    obs_test = data_test |> dplyr::pull(!!target_name), 
    pred_calib = scores_calib, 
    pred_test = scores_test,
    method = "beta"
  )
  scores_c_beta_calib <- res_recalibration_beta$tb_score_c_calib$p_c
  scores_c_beta_test <- res_recalibration_beta$tb_score_c_test$p_c
  
  # Locfit regression
  res_recalibration_locfit <- recalibrate(
    obs_calib = data_calib |> dplyr::pull(!!target_name), 
    obs_test = data_test |> dplyr::pull(!!target_name), 
    pred_calib = scores_calib, 
    pred_test = scores_test,
    method = "locfit"
  )
  scores_c_locfit_calib <- res_recalibration_locfit$tb_score_c_calib$p_c
  scores_c_locfit_test <- res_recalibration_locfit$tb_score_c_test$p_c

  ## Metrics----
  ## Histogram of scores----
  breaks <- seq(0, 1, by = .05)
  scores_train_hist <- hist(scores_train, breaks = breaks, plot = FALSE)
  scores_calib_hist <- hist(scores_calib, breaks = breaks, plot = FALSE)
  scores_valid_hist <- hist(scores_valid, breaks = breaks, plot = FALSE)
  scores_test_hist <- hist(scores_test, breaks = breaks, plot = FALSE)
  scores_c_platt_calib_hist <- hist(scores_c_platt_calib, breaks = breaks, plot = FALSE)
  scores_c_platt_test_hist <- hist(scores_c_platt_test, breaks = breaks, plot = FALSE)
  scores_c_iso_calib_hist <- hist(scores_c_iso_calib, breaks = breaks, plot = FALSE)
  scores_c_iso_test_hist <- hist(scores_c_iso_test, breaks = breaks, plot = FALSE)
  scores_c_beta_calib_hist <- hist(scores_c_beta_calib, breaks = breaks, plot = FALSE)
  scores_c_beta_test_hist <- hist(scores_c_beta_test, breaks = breaks, plot = FALSE)
  scores_c_locfit_calib_hist <- hist(scores_c_locfit_calib, breaks = breaks, plot = FALSE)
  scores_c_locfit_test_hist <- hist(scores_c_locfit_test, breaks = breaks, plot = FALSE)
  
  scores_hist <- list(
    train = scores_train_hist,
    valid = scores_valid_hist,
    calib = scores_calib_hist,
    test = scores_test_hist,
    calib_c_platt = scores_c_platt_calib_hist,
    test_c_platt = scores_c_platt_test_hist,
    calib_c_iso = scores_c_iso_calib_hist,
    test_c_iso = scores_c_iso_test_hist,
    calib_c_beta = scores_c_beta_calib_hist,
    test_c_beta = scores_c_beta_test_hist,
    calib_c_locfit = scores_c_locfit_calib_hist,
    test_c_locfit = scores_c_locfit_test_hist,
    ind = ind,
    nb_iter = nb_iter,
    max_depth = params$max_depth
    
  )
  
  ## Estimation of P(q1 < score < q2)----
  prop_btw_q_h <- function(s, sample_name, recalib_name) {
    map(
      c(.1, .2, .3, .4),
      ~prop_btw_quantiles(s = s, q1 = .x)
    ) |>
      list_rbind() |>
      mutate(sample = sample_name, recalib = recalib_name)
  }
  
  proq_scores_train <- prop_btw_q_h(
    scores_train, sample_name = "train", recalib_name = "none"
  )
  proq_scores_valid <- prop_btw_q_h(
    scores_valid, sample_name = "valid", recalib_name = "none"
  )
  proq_scores_calib <- prop_btw_q_h(
    scores_calib, sample_name = "calib", recalib_name = "none"
  )
  proq_scores_test <- prop_btw_q_h(
    scores_test, sample_name = "test", recalib_name = "none"
  )
  proq_scores_c_platt_calib <- prop_btw_q_h(
    scores_c_platt_calib, sample_name = "calib", recalib_name = "platt"
  )
  proq_scores_c_platt_test <- prop_btw_q_h(
    scores_c_platt_test, sample_name = "test", recalib_name = "platt"
  )
  proq_scores_c_iso_calib <- prop_btw_q_h(
    scores_c_iso_calib, sample_name = "calib", recalib_name = "isotonic"
  )
  proq_scores_c_iso_test <- prop_btw_q_h(
    scores_c_iso_test, sample_name = "test", recalib_name = "isotonic"
  )
   proq_scores_c_beta_calib <- prop_btw_q_h(
    scores_c_beta_calib, sample_name = "calib", recalib_name = "beta"
  )
  proq_scores_c_beta_test <- prop_btw_q_h(
    scores_c_beta_test, sample_name = "test", recalib_name = "beta"
  )
  proq_scores_c_locfit_calib <- prop_btw_q_h(
    scores_c_locfit_calib, sample_name = "calib", recalib_name = "locfit"
  )
  proq_scores_c_locfit_test <- prop_btw_q_h(
    scores_c_locfit_test, sample_name = "test", recalib_name = "locfit"
  )

  ## Dispersion Metrics----
  disp_train <- disp_metrics_dataset(
    prior = prior, scores = scores_train
  ) |> 
    mutate(sample = "train", recalib = "none")
  
  disp_valid <- disp_metrics_dataset(
    prior = prior, scores = scores_valid
  ) |>
    mutate(sample = "valid", recalib = "none")
  
  disp_calib <- disp_metrics_dataset(
    prior = prior, scores = scores_calib
  ) |> 
    mutate(sample = "calib", recalib = "none")
  
  disp_test <- disp_metrics_dataset(
    prior = prior, scores = scores_test
  ) |> 
    mutate(sample = "test", recalib = "none")
  
  disp_c_platt_calib <- disp_metrics_dataset(
    prior = prior, scores = scores_c_platt_calib
  ) |>
    mutate(sample = "calib", recalib = "platt")
  
  disp_c_platt_test <- disp_metrics_dataset(
    prior = prior, scores = scores_c_platt_test
  ) |>
    mutate(sample = "test", recalib = "platt")
  
  disp_c_iso_calib <- disp_metrics_dataset(
    prior = prior, scores = scores_c_iso_calib
  ) |>
    mutate(sample = "calib", recalib = "isotonic")
  
  disp_c_iso_test <- disp_metrics_dataset(
    prior = prior, scores = scores_c_iso_test
  ) |>
    mutate(sample = "test", recalib = "isotonic")
  
  disp_c_beta_calib <- disp_metrics_dataset(
    prior = prior, scores = scores_c_beta_calib
  ) |> 
    mutate(sample = "calib", recalib = "beta")
  
  disp_c_beta_test <- disp_metrics_dataset(
    prior = prior, scores = scores_c_beta_test
  ) |> 
    mutate(sample = "test", recalib = "beta")
  
  disp_c_locfit_calib <- disp_metrics_dataset(
    prior = prior, scores = scores_c_locfit_calib
  ) |> 
    mutate(sample = "calib", recalib = "locfit")
  
  disp_c_locfit_test <- disp_metrics_dataset(
    prior = prior, scores = scores_c_locfit_test
  ) |> 
    mutate(sample = "test", recalib = "locfit")

  # Performance and Calibration Metrics----
  # We add very small noise to predicted scores
  # otherwise the local regression may crash
  scores_train_noise <- scores_train +
    runif(n = length(scores_train), min = 0, max = 0.01)
  scores_train_noise[scores_train_noise > 1] <- 1
  metrics_train <- compute_metrics(
    obs = data_train |> pull(!!target_name),
    scores = scores_train_noise, true_probas = prior_train
  ) |> mutate(sample = "train", recalib = "none")

  scores_valid_noise <- scores_valid +
    runif(n = length(scores_valid), min = 0, max = 0.01)
  scores_valid_noise[scores_valid_noise > 1] <- 1
  metrics_valid <- compute_metrics(
    obs = data_valid |> pull(!!target_name),
    scores = scores_valid_noise, true_probas = prior_valid
  ) |> mutate(sample = "valid", recalib = "none")
  
  scores_calib_noise <- scores_calib +
    runif(n = length(scores_calib), min = 0, max = 0.01)
  scores_calib_noise[scores_calib_noise > 1] <- 1
  metrics_calib <- compute_metrics(
    obs = data_calib |> pull(!!target_name),
    scores = scores_calib_noise, true_probas = prior_calib
  ) |> mutate(sample = "calib", recalib = "none")
  
  scores_test_noise <- scores_test +
    runif(n = length(scores_test), min = 0, max = 0.01)
  scores_test_noise[scores_test_noise > 1] <- 1
  metrics_test <- compute_metrics(
    obs = data_test |> pull(!!target_name),
    scores = scores_test_noise, true_probas = prior_test
  ) |> mutate(sample = "test", recalib = "none")
  
  # With recalibrated scores (platt)
  scores_c_platt_calib_noise <- scores_c_platt_calib +
    runif(n = length(scores_calib), min = 0, max = 0.01)
  scores_c_platt_calib_noise[scores_c_platt_calib_noise > 1] <- 1
  metrics_c_platt_calib <- compute_metrics(
    obs = data_calib |> pull(!!target_name),
    scores = scores_c_platt_calib_noise, true_probas = prior_calib
  ) |> mutate(sample = "calib", recalib = "platt")
  
  scores_c_platt_test_noise <- scores_c_platt_test +
    runif(n = length(scores_test), min = 0, max = 0.01)
  scores_c_platt_test_noise[scores_c_platt_test_noise > 1] <- 1
  metrics_c_platt_test <- compute_metrics(
    obs = data_test |> pull(!!target_name),
    scores = scores_c_platt_test_noise, true_probas = prior_test
  ) |> mutate(sample = "test", recalib = "platt")
  
  # With recalibrated scores (isotonic)
  scores_c_iso_calib_noise <- scores_c_iso_calib +
    runif(n = length(scores_c_iso_calib), min = 0, max = 0.01)
  scores_c_iso_calib_noise[scores_c_iso_calib_noise > 1] <- 1
  metrics_c_iso_calib <- compute_metrics(
    obs = data_calib |> pull(!!target_name),
    scores = scores_c_iso_calib_noise, true_probas = prior_calib
  ) |> mutate(sample = "calib", recalib = "isotonic")
  
  scores_c_iso_test_noise <- scores_c_iso_test +
    runif(n = length(scores_c_iso_test), min = 0, max = 0.01)
  scores_c_iso_test_noise[scores_c_iso_test_noise > 1] <- 1
  metrics_c_iso_test <- compute_metrics(
    obs = data_test |> pull(!!target_name),
    scores = scores_c_iso_test_noise, true_probas = prior_test
  ) |> mutate(sample = "test", recalib = "isotonic")
  
 # With recalibrated scores (beta)
  scores_c_beta_calib_noise <- scores_c_beta_calib +
    runif(n = length(scores_c_beta_calib), min = 0, max = 0.01)
  scores_c_beta_calib_noise[scores_c_beta_calib_noise > 1] <- 1
  metrics_c_beta_calib <- compute_metrics(
    obs = data_calib |> pull(!!target_name),
    scores = scores_c_beta_calib_noise, true_probas = prior_calib
  ) |> mutate(sample = "calib", recalib = "beta")
  
  scores_c_beta_test_noise <- scores_c_beta_test +
    runif(n = length(scores_c_beta_test), min = 0, max = 0.01)
  scores_c_beta_test_noise[scores_c_beta_test_noise > 1] <- 1
  metrics_c_beta_test <- compute_metrics(
    obs = data_test |> pull(!!target_name),
    scores = scores_c_beta_test_noise, true_probas = prior_test
  ) |> mutate(sample = "test", recalib = "beta")
  
  # With recalibrated scores (locfit)
  scores_c_locfit_calib_noise <- scores_c_locfit_calib +
    runif(n = length(scores_c_locfit_calib), min = 0, max = 0.01)
  scores_c_locfit_calib_noise[scores_c_locfit_calib_noise > 1] <- 1
  metrics_c_locfit_calib <- compute_metrics(
    obs = data_calib |> pull(!!target_name),
    scores = scores_c_locfit_calib_noise, true_probas = prior_calib
  ) |> mutate(sample = "calib", recalib = "locfit")
  
  scores_c_locfit_test_noise <- scores_c_locfit_test +
    runif(n = length(scores_c_locfit_test), min = 0, max = 0.01)
  scores_c_locfit_test_noise[scores_c_locfit_test_noise > 1] <- 1
  metrics_c_locfit_test <- compute_metrics(
    obs = data_test |> pull(!!target_name),
    scores = scores_c_locfit_test_noise, true_probas = prior_test
  ) |> mutate(sample = "test", recalib = "locfit")
  
  # Decomposition of expected losses
  # Platt
  decomposition_platt_calib <- decomposition_metrics(
    obs = data_calib |> pull(!!target_name), scores = scores_calib, 
    calibrated_scores = scores_c_platt_calib, true_probas = prior_calib
  ) |> mutate(sample = "calib", recalib = "platt")
  
  decomposition_platt_test <- decomposition_metrics(
    obs = data_test |> pull(!!target_name), scores = scores_test, 
    calibrated_scores = scores_c_platt_test, true_probas = prior_test
  ) |> mutate(sample = "test", recalib = "platt")
  
  # Isotonic
  decomposition_iso_calib <- decomposition_metrics(
    obs = data_calib |> pull(!!target_name), scores = scores_calib, 
    calibrated_scores = scores_c_iso_calib, true_probas = prior_calib
  ) |> mutate(sample = "calib", recalib = "iso")
  
  decomposition_iso_test <- decomposition_metrics(
    obs = data_test |> pull(!!target_name), scores = scores_test, 
    calibrated_scores = scores_c_iso_test, true_probas = prior_test
  ) |> mutate(sample = "test", recalib = "iso")
  
  # Beta
  decomposition_beta_calib <- decomposition_metrics(
    obs = data_calib |> pull(!!target_name), scores = scores_calib, 
    calibrated_scores = scores_c_beta_calib, true_probas = prior_calib
  ) |> mutate(sample = "calib", recalib = "beta")
  
  decomposition_beta_test <- decomposition_metrics(
    obs = data_test |> pull(!!target_name), scores = scores_test, 
    calibrated_scores = scores_c_beta_test, true_probas = prior_test
  ) |> mutate(sample = "test", recalib = "beta")
  
  # Locfit
  decomposition_locfit_calib <- decomposition_metrics(
    obs = data_calib |> pull(!!target_name), scores = scores_calib, 
    calibrated_scores = scores_c_locfit_calib, true_probas = prior_calib
  ) |> mutate(sample = "calib", recalib = "locfit")
  
  decomposition_locfit_test <- decomposition_metrics(
    obs = data_test |> pull(!!target_name), scores = scores_test, 
    calibrated_scores = scores_c_locfit_test, true_probas = prior_test
  ) |> mutate(sample = "test", recalib = "locfit")
  
  tb_metrics <- metrics_train |>
    bind_rows(metrics_valid) |>
    bind_rows(metrics_calib) |>
    bind_rows(metrics_test) |>
    bind_rows(metrics_c_platt_calib) |>
    bind_rows(metrics_c_platt_test) |>
    bind_rows(metrics_c_iso_calib) |>
    bind_rows(metrics_c_iso_test) |>
    bind_rows(metrics_c_beta_calib) |>
    bind_rows(metrics_c_beta_test) |>
    bind_rows(metrics_c_locfit_calib) |>
    bind_rows(metrics_c_locfit_test) |>
    left_join(
      disp_train |>
        bind_rows(disp_valid) |> 
        bind_rows(disp_calib) |> 
        bind_rows(disp_test) |> 
        bind_rows(disp_c_platt_calib) |> 
        bind_rows(disp_c_platt_test) |> 
        bind_rows(disp_c_iso_calib) |> 
        bind_rows(disp_c_iso_test) |> 
        bind_rows(disp_c_beta_calib) |> 
        bind_rows(disp_c_beta_test) |> 
        bind_rows(disp_c_locfit_calib) |> 
        bind_rows(disp_c_locfit_test),
      by = c("sample", "recalib")
    ) |>
    mutate(ind = ind, nb_iter = nb_iter, max_depth = params$max_depth) 
  #|>
  #  select(-c(mse, mae))

  tb_prop_scores <- proq_scores_train |>
    bind_rows(proq_scores_valid) |>
    bind_rows(proq_scores_calib) |>
    bind_rows(proq_scores_test) |>
    bind_rows(proq_scores_c_platt_calib) |>
    bind_rows(proq_scores_c_platt_test) |>
    bind_rows(proq_scores_c_iso_calib) |>
    bind_rows(proq_scores_c_iso_test) |>
     bind_rows(proq_scores_c_beta_calib) |>
    bind_rows(proq_scores_c_beta_test) |>
    bind_rows(proq_scores_c_locfit_calib) |>
    bind_rows(proq_scores_c_locfit_test) |>
    mutate(ind = ind, nb_iter = nb_iter, max_depth = params$max_depth)
  
  tb_decomposition_loss <- decomposition_platt_calib |>
    bind_rows(decomposition_platt_test) |>
    bind_rows(decomposition_iso_calib) |>
    bind_rows(decomposition_iso_test) |>
    bind_rows(decomposition_beta_calib) |>
    bind_rows(decomposition_beta_test) |>
    bind_rows(decomposition_locfit_calib) |>
    bind_rows(decomposition_locfit_test) |>
    mutate(ind = ind, nb_iter = nb_iter, max_depth = params$max_depth)
  
   list(
    ind = ind,                         # index for grid
    nb_iter = nb_iter,                 # number of boosting iterations
    max_depth = params$max_depth,      # max depth of used trees
    tb_metrics = tb_metrics,           # table with performance/calib/divergence
    tb_prop_scores = tb_prop_scores,   # table with P(q1 < score < q2)
    tb_decomposition = tb_decomposition_loss,
    scores_hist = scores_hist          # histogram of scores
  )
}

We then define a function to compute the predictions of a fitted XGBoost model (an xgb.Booster object) on the train, validation, calibration, and test samples. The predicted scores can be obtained at any specified iteration of the boosting process:

Function predict_score_iter()
#' Predicts the scores at a given iteration of the XGB model
#'
#' @param fit_xgb estimated XGB model
#' @param tb_train_xgb train set
#' @param tb_valid_xgb validation set
#' @param tb_calib_xgb calibration set
#' @param tb_test_xgb test set
#' @param nb_iter boosting iteration to consider
#'
#' @returns A list with four elements: `scores_train`, `scores_valid`,
#'  `scores_calib`, and `scores_test`, which contain the estimated scores on
#'  the train, validation, calibration, and test sets, resp.
predict_score_iter <- function(fit_xgb,
                               tb_train_xgb,
                               tb_valid_xgb,
                               tb_calib_xgb,
                               tb_test_xgb,
                               nb_iter) {

  ## Predicted scores----
  scores_train <- predict(fit_xgb, tb_train_xgb, iterationrange = c(1, nb_iter))
  scores_valid <- predict(fit_xgb, tb_valid_xgb, iterationrange = c(1, nb_iter))
  scores_calib <- predict(fit_xgb, tb_calib_xgb, iterationrange = c(1, nb_iter))
  scores_test <- predict(fit_xgb, tb_test_xgb, iterationrange = c(1, nb_iter))

  list(
    scores_train = scores_train,
    scores_valid = scores_valid,
    scores_calib = scores_calib,
    scores_test = scores_test
  )
}

This function calculates the different metrics on train, validation, calibration and test samples for a given XGBoost algorithm at all iterations:

Function simul_xgb_helper()
#' Fits an XGB model and returns metrics based on scores. The divergence metrics are
#' obtained using the prior distributions.
#'
#' @param data_train train set
#' @param data_valid validation set
#' @param data_calib calibration set
#' @param data_test test set
#' @param prior_train train prior scores
#' @param prior_valid validation prior scores
#' @param prior_calib calibration prior scores
#' @param prior_test test prior scores
#' @param target_name name of the target variable
#' @param params tibble with hyperparameters for the current estimation
#' @param prior prior obtained with `get_beta_fit()`
#'
#' @returns A list with 4 elements:
#'  - `tb_metrics`: performance / calibration / dispersion metrics
#'  - `tb_prop_scores`: table with P(q1 < score < q2)
#'  - `tb_decomposition_scores`: decomposition of expected losses
#'  - `scores_hist`: histograms of scores
simul_xgb_helper <- function(data_train,
                             data_valid,
                             data_calib,
                             data_test,
                             prior_train,
                             prior_valid,
                             prior_calib,
                             prior_test,
                             target_name,
                             params,
                             prior) {

  ## Format data for xgboost----
  tb_train_xgb <- xgb.DMatrix(
    data = data_train |> dplyr::select(-!!target_name) |> as.matrix(),
    label = data_train |> dplyr::pull(!!target_name) |> as.matrix()
  )
  tb_valid_xgb <- xgb.DMatrix(
    data = data_valid |> dplyr::select(-!!target_name) |> as.matrix(),
    label = data_valid |> dplyr::pull(!!target_name) |> as.matrix()
  )
  tb_calib_xgb <- xgb.DMatrix(
    data = data_calib |> dplyr::select(-!!target_name) |> as.matrix(),
    label = data_calib |> dplyr::pull(!!target_name) |> as.matrix()
  )
  tb_test_xgb <- xgb.DMatrix(
    data = data_test |> dplyr::select(-!!target_name) |> as.matrix(),
    label = data_test |> dplyr::pull(!!target_name) |> as.matrix()
  )
  # Parameters for the algorithm
  param <- list(
    max_depth = params$max_depth, #Note: root node is indexed 0
    eta = params$eta,
    nthread = 1,
    objective = "binary:logistic",
    eval_metric = "auc"
  )
  watchlist <- list(train = tb_train_xgb, eval = tb_valid_xgb)
  ## Estimation----
  fit_xgb <- xgb.train(
    param, tb_train_xgb,
    nrounds = params$nb_iter_total,
    watchlist,
    verbose = 0
  )

  # First, we estimate the scores at each boosting iteration
  # As the xgb.DMatrix objects cannot be easily serialised, we first estimate
  # these scores in a classical way, without parallelism...
  scores_iter <- vector(mode = "list", length = params$nb_iter_total)
  for (i_iter in 1:params$nb_iter_total) {
    scores_iter[[i_iter]] <- predict_score_iter(
      fit_xgb = fit_xgb,
      tb_train_xgb = tb_train_xgb,
      tb_valid_xgb = tb_valid_xgb,
      tb_calib_xgb = tb_calib_xgb,
      tb_test_xgb = tb_test_xgb,
      nb_iter = i_iter)
  }

  # Then, to compute the metrics, as it is a bit slower, we can use parallelism

  ncl <- detectCores() - 1
  (cl <- makeCluster(ncl))
  clusterEvalQ(cl, {
    library(tidyverse)
    library(locfit)
    library(philentropy)
    library(betacal)
  }) |>
    invisible()

  clusterExport(cl, c(
    "scores_iter", "prior", "data_train", "data_valid", "data_calib", "data_test", 
    "prior_train", "prior_valid", "prior_calib", "prior_test", "params", "target_name"
  ), envir = environment())
  clusterExport(cl, c(
    "get_metrics_xgb_iter",
    "brier_score",
    "compute_metrics",
    "disp_metrics_dataset", "dispersion_metrics_beta",
    "recalibrate", "prop_btw_quantiles", "calculate_log_loss",
    "calculate_kl", "decomposition_metrics", "kullback_leibler"
  ))

  metrics_iter <-
    pbapply::pblapply(
      X = seq_len(params$nb_iter_total),
      FUN = function(i_iter) {
        get_metrics_xgb_iter(
          scores = scores_iter[[i_iter]],
          prior = prior,
          data_train = data_train,
          data_valid = data_valid,
          data_calib = data_calib,
          data_test = data_test,
          prior_train = prior_train,
          prior_valid = prior_valid,
          prior_calib = prior_calib,
          prior_test = prior_test,
          target_name = target_name,
          ind = params$ind,
          nb_iter = i_iter,
          params = params
        )
      },
      cl = cl
    )
  stopCluster(cl)
  
  # Merge tibbles from each iteration into a single one
  tb_metrics <-
    map(metrics_iter, "tb_metrics") |>
    list_rbind()
  tb_prop_scores <-
    map(metrics_iter, "tb_prop_scores") |>
    list_rbind()
  tb_decomposition_scores <-
    map(metrics_iter, "tb_decomposition") |>
    list_rbind()
  scores_hist <- map(metrics_iter, "scores_hist")

  list(
    tb_metrics = tb_metrics,
    tb_prop_scores = tb_prop_scores,
    tb_decomposition_scores = tb_decomposition_scores,
    scores_hist = scores_hist
  )
}

8.2 Import Data and Prior

We load the dataset “spambase” and the associated Beta prior on both train and test sets.

load("../output/real-data/tb_spambase.rda") # dataset
load("../output/real-data/priors_spambase.rda") # scores_gamsel

prior <- priors_spambase
data <- tb_spambase

The target variable is is_spam.

name <- "spambase"
target_name <- "is_spam"

8.3 Grid

We consider the following grid:

grid <- expand_grid(
  max_depth = c(2, 4, 6),
  nb_iter_total = 500,
  eta = 0.3
) |>
  mutate(ind = row_number())

The different configurations are reported in Table 8.1.

DT::datatable(grid)
Table 8.1: Grid Search Values

We define a function, simul_xgb_real(), to train the model on a dataset for every combination of hyperparameters in the grid (and to evaluate each fitted model, defined by a row of the grid, at every boosting iteration).

Function simul_xgb_real()
#' Train an XGB on a dataset for a binary task for various
#' hyperparameters and computes metrics based on scores and on a set of prior
#' distributions of the underlying probability
#'
#' @param data dataset
#' @param target_name name of the target variable
#' @param prior prior obtained with `get_beta_fit()`
#' @param seed desired seed (default to `NULL`)
#'
#' @returns A list with two elements:
#'  - `res`: results for each estimated model of the grid. Each element is a
#'  list with the following elements:
#'      - `tb_metrics`: performance / calibration metrics
#'      - `tb_disp_metrics`: disp and div metrics
#'      - `tb_prop_scores`: table with P(q1 < score < q2)
#'      - `scores_hist`: histogram of scores.
#'  - `grid`: the grid search.
simul_xgb_real <- function(data,
                           target_name,
                           prior,
                           seed = NULL) {

  if (!is.null(seed)) set.seed(seed)

  # Split data into train and test set
  data_splitted <- split_train_test(data = data, prop_train = .7, seed = seed)
  data_encoded <- encode_dataset(
    data_train = data_splitted$train,
    data_test = data_splitted$test,
    target_name = target_name,
    intercept = FALSE
  )

  # Further split train into two samples (train/valid)
  data_splitted_train <- 
    split_train_test(data = data_encoded$train, prop_train = .8, seed = seed)
  
  # Further split test into two samples (calib/test)
  data_splitted_test <- 
    split_train_test(data = data_encoded$test, prop_train = .6, seed = seed)
  
  # Split prior scores
  # Further split train into two samples (train/valid)
  prior_splitted_train <- 
    split_train_test_prior(prior = prior$scores_gamsel$scores_train, prop_train = .8, seed = seed)
  
  # Further split test into two samples (calib/test)
  prior_splitted_test <- 
    split_train_test_prior(prior = prior$scores_gamsel$scores_test, prop_train = .6, seed = seed)
  
  res_grid <- vector(mode = "list", length = nrow(grid))
  for (i_grid in 1:nrow(grid)) {
    res_grid[[i_grid]] <- simul_xgb_helper(
      data_train = data_splitted_train$train,
      data_valid = data_splitted_train$test,
      data_calib = data_splitted_test$train,
      data_test = data_splitted_test$test,
      prior_train = prior_splitted_train$train,
      prior_valid = prior_splitted_train$test,
      prior_calib = prior_splitted_test$train,
      prior_test = prior_splitted_test$test,
      target_name = target_name,
      params = grid |> dplyr::slice(i_grid),
      prior = prior
    )
  }
  
  # The metrics computed for all sets of hyperparameters (identified by `ind`)
  # and for each number of boosting iterations (`nb_iter`)
  metrics_simul <- map(res_grid, "tb_metrics") |> 
    list_rbind()
  
  # P(q_1<s(x)<q_2)
  prop_scores_simul <- map(res_grid, "tb_prop_scores") |> 
    list_rbind()
  
  # Decomposition of expected losses
  decomposition_scores_simul <- map(res_grid, "tb_decomposition_scores") |> 
    list_rbind()
  
  # Histogram of estimated scores
  scores_hist <- map(res_grid, "scores_hist")

  list(
    metrics_simul = metrics_simul,
    scores_hist = scores_hist,
    prop_scores_simul = prop_scores_simul,
    decomposition_scores_simul = decomposition_scores_simul
  )
}

8.4 Example: “spambase” dataset

We start by splitting the "spambase" dataset into train/validation/calibration/test sets.

data_splitted <- split_train_test(data = data, prop_train = .7, seed = 1234)
data_encoded <- encode_dataset(
  data_train = data_splitted$train,
  data_test = data_splitted$test,
  target_name = target_name,
  intercept = FALSE
  )

# Further split train into two samples (train/valid)
data_splitted_train <- 
  split_train_test(data = data_encoded$train, prop_train = .8, seed = 1234)
  
# Further split test into two samples (calib/test)
data_splitted_test <- 
  split_train_test(data = data_encoded$test, prop_train = .6, seed = 1234)

# The different sets
data_train = data_splitted_train$train
data_valid = data_splitted_train$test
data_calib = data_splitted_test$train
data_test = data_splitted_test$test

# Further split train into two samples (train/valid)
prior_splitted_train <- 
  split_train_test_prior(prior = prior$scores_gamsel$scores_train, prop_train = .8, seed = 1234)
  
# Further split test into two samples (calib/test)
prior_splitted_test <- 
  split_train_test_prior(prior = prior$scores_gamsel$scores_test, prop_train = .6, seed = 1234)

# The different sets
prior_train = prior_splitted_train$train
prior_valid = prior_splitted_train$test
prior_calib = prior_splitted_test$train
prior_test = prior_splitted_test$test

We fit XGBoost and compute its predictions using a single set of hyperparameters. Here, the validation set is not needed, since the hyperparameters are fixed. In practice, hyperparameters would be selected based on different metrics (calibration, performance, dispersion) computed on the validation set.

Let’s take the first row of the grid:

i_grid <- 1
params <- grid |> dplyr::slice(i_grid)

# Parameters for the algorithm
param <- list(
  max_depth = params$max_depth, #Note: root node is indexed 0
  eta = params$eta,
  nthread = 1,
  objective = "binary:logistic",
  eval_metric = "auc"
)

Next, we transform the different sets of the dataset into the format required by XGBoost:

## Format data for xgboost----
tb_train_xgb <- xgb.DMatrix(
  data = data_train |> dplyr::select(-!!target_name) |> as.matrix(),
  label = data_train |> dplyr::pull(!!target_name) |> as.matrix()
)
tb_valid_xgb <- xgb.DMatrix(
  data = data_valid |> dplyr::select(-!!target_name) |> as.matrix(),
  label = data_valid |> dplyr::pull(!!target_name) |> as.matrix()
)
tb_calib_xgb <- xgb.DMatrix(
  data = data_calib |> dplyr::select(-!!target_name) |> as.matrix(),
  label = data_calib |> dplyr::pull(!!target_name) |> as.matrix()
)
tb_test_xgb <- xgb.DMatrix(
  data = data_test |> dplyr::select(-!!target_name) |> as.matrix(),
  label = data_test |> dplyr::pull(!!target_name) |> as.matrix()
)
  
watchlist <- list(train = tb_train_xgb, eval = tb_valid_xgb)

We fit the specified algorithm on the train set:

## Estimation----
fit_xgb <- xgb.train(
  param, tb_train_xgb,
  nrounds = params$nb_iter_total,
  watchlist,
  verbose = 0
)
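Since a watchlist was supplied, the booster keeps track of the AUC on the train and validation samples at each iteration. Assuming, as in recent versions of {xgboost}, that this log is stored in the evaluation_log element of the fitted object, it can be inspected directly:

# Evaluation log recorded during training (columns: iter, train_auc, eval_auc)
head(fit_xgb$evaluation_log)
# Boosting iteration with the highest AUC on the validation (eval) sample
fit_xgb$evaluation_log[which.max(fit_xgb$evaluation_log$eval_auc), ]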

Then, we estimate the scores at each boosting iteration using the function predict_score_iter(). We obtain a list of 500 elements, corresponding to the predicted scores at each of the 500 boosting iterations of the fitted XGBoost model:

# As the xgb.DMatrix objects cannot be easily serialised, we first estimate
# these scores in a classical way, without parallelism...
scores_iter <- vector(mode = "list", length = params$nb_iter_total)
for (i_iter in 1:params$nb_iter_total) {
  scores_iter[[i_iter]] <- predict_score_iter(
    fit_xgb = fit_xgb,
    tb_train_xgb = tb_train_xgb,
    tb_valid_xgb = tb_valid_xgb,
    tb_calib_xgb = tb_calib_xgb,
    tb_test_xgb = tb_test_xgb,
    nb_iter = i_iter
  )
}

We now calculate the different metrics for a single boosting iteration (here, the first one):

i_iter <- 1
scores <- scores_iter[[i_iter]]
scores_train <- scores$scores_train
scores_valid <- scores$scores_valid
scores_calib <- scores$scores_calib
scores_test <- scores$scores_test

We recalibrate these predicted scores (on the calibration and test sets only) using Platt scaling, isotonic regression, Beta calibration, and local regression (locfit):

# Recalibration
# Platt scaling
res_recalibration_platt <- recalibrate(
  obs_calib = data_calib |> dplyr::pull(!!target_name),
  obs_test = data_test |> dplyr::pull(!!target_name), 
  pred_calib = scores_calib, 
  pred_test = scores_test, 
  method = "platt"
)
scores_c_platt_calib <- res_recalibration_platt$tb_score_c_calib$p_c
scores_c_platt_test <- res_recalibration_platt$tb_score_c_test$p_c
  
# Isotonic regression
res_recalibration_iso <- recalibrate(
  obs_calib = data_calib |> dplyr::pull(!!target_name), 
  obs_test = data_test |> dplyr::pull(!!target_name), 
  pred_calib = scores_calib, 
  pred_test = scores_test, 
  method = "isotonic"
)
scores_c_iso_calib <- res_recalibration_iso$tb_score_c_calib$p_c
scores_c_iso_test <- res_recalibration_iso$tb_score_c_test$p_c

# Beta calibration
res_recalibration_beta <- recalibrate(
  obs_calib = data_calib |> dplyr::pull(!!target_name), 
  obs_test = data_test |> dplyr::pull(!!target_name), 
  pred_calib = scores_calib, 
  pred_test = scores_test,
  method = "beta"
)
[1] -22.60554
[1] 23.41854
scores_c_beta_calib <- res_recalibration_beta$tb_score_c_calib$p_c
scores_c_beta_test <- res_recalibration_beta$tb_score_c_test$p_c
  
# Locfit regression
res_recalibration_locfit <- recalibrate(
  obs_calib = data_calib |> dplyr::pull(!!target_name), 
  obs_test = data_test |> dplyr::pull(!!target_name), 
  pred_calib = scores_calib, 
  pred_test = scores_test,
  method = "locfit"
)
scores_c_locfit_calib <- res_recalibration_locfit$tb_score_c_calib$p_c
scores_c_locfit_test <- res_recalibration_locfit$tb_score_c_test$p_c

Here are the histograms of predicted scores:

## Histogram of scores----
breaks <- seq(0, 1, by = .05)
scores_train_hist <- hist(scores_train, breaks = breaks, plot = FALSE)
scores_calib_hist <- hist(scores_calib, breaks = breaks, plot = FALSE)
scores_valid_hist <- hist(scores_valid, breaks = breaks, plot = FALSE)
scores_test_hist <- hist(scores_test, breaks = breaks, plot = FALSE)
scores_c_platt_calib_hist <- hist(scores_c_platt_calib, breaks = breaks, plot = FALSE)
scores_c_platt_test_hist <- hist(scores_c_platt_test, breaks = breaks, plot = FALSE)
scores_c_iso_calib_hist <- hist(scores_c_iso_calib, breaks = breaks, plot = FALSE)
scores_c_iso_test_hist <- hist(scores_c_iso_test, breaks = breaks, plot = FALSE)
scores_c_beta_calib_hist <- hist(scores_c_beta_calib, breaks = breaks, plot = FALSE)
scores_c_beta_test_hist <- hist(scores_c_beta_test, breaks = breaks, plot = FALSE)
scores_c_locfit_calib_hist <- hist(scores_c_locfit_calib, breaks = breaks, plot = FALSE)
scores_c_locfit_test_hist <- hist(scores_c_locfit_test, breaks = breaks, plot = FALSE)

ind <- params$ind
nb_iter <- i_iter
scores_hist <- list(
  train = scores_train_hist,
  valid = scores_valid_hist,
  calib = scores_calib_hist,
  test = scores_test_hist,
  calib_c_platt = scores_c_platt_calib_hist,
  test_c_platt = scores_c_platt_test_hist,
  calib_c_iso = scores_c_iso_calib_hist,
  test_c_iso = scores_c_iso_test_hist,
  calib_c_beta = scores_c_beta_calib_hist,
  test_c_beta = scores_c_beta_test_hist,
  calib_c_locfit = scores_c_locfit_calib_hist,
  test_c_locfit = scores_c_locfit_test_hist,
  ind = ind,
  nb_iter = nb_iter,
  max_depth = params$max_depth
)
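These histograms are stored but not plotted by the pipeline. As an illustration, here is a minimal plotting sketch (not part of the original scripts) that draws the distribution of scores on the four samples, using the colours and theme defined at the beginning of the chapter:

# Illustrative plot of the stored histograms (uncalibrated scores only)
tb_hist <- map2(
  scores_hist[c("train", "valid", "calib", "test")],
  c("Train", "Validation", "Calibration", "Test"),
  ~tibble(mids = .x$mids, prop = .x$counts / sum(.x$counts), sample = .y)
) |>
  list_rbind() |>
  mutate(sample = factor(sample, levels = names(colour_samples)))

ggplot(tb_hist, aes(x = mids, y = prop, fill = sample)) +
  geom_col(width = 0.05) +
  facet_wrap(~sample, nrow = 1) +
  scale_fill_manual(values = colour_samples, guide = "none") +
  labs(x = "Predicted score", y = "Proportion") +
  theme_paper()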

We can also estimate, for both recalibrated and non-recalibrated predicted scores, the proportion of scores lying between two quantiles, P(q1 < score < q2):

 ## Estimation of P(q1 < score < q2)----
prop_btw_q_h <- function(s, sample_name, recalib_name) {
  map(
    c(.1, .2, .3, .4),
    ~prop_btw_quantiles(s = s, q1 = .x)
    ) |>
    list_rbind() |>
    mutate(sample = sample_name, recalib = recalib_name)
}
  
proq_scores_train <- prop_btw_q_h(
  scores_train, sample_name = "train", recalib_name = "none"
)
proq_scores_valid <- prop_btw_q_h(
  scores_valid, sample_name = "valid", recalib_name = "none"
)
proq_scores_calib <- prop_btw_q_h(
  scores_calib, sample_name = "calib", recalib_name = "none"
)
proq_scores_test <- prop_btw_q_h(
  scores_test, sample_name = "test", recalib_name = "none"
)
proq_scores_c_platt_calib <- prop_btw_q_h(
  scores_c_platt_calib, sample_name = "calib", recalib_name = "platt"
)
proq_scores_c_platt_test <- prop_btw_q_h(
  scores_c_platt_test, sample_name = "test", recalib_name = "platt"
)
proq_scores_c_iso_calib <- prop_btw_q_h(
  scores_c_iso_calib, sample_name = "calib", recalib_name = "isotonic"
)
proq_scores_c_iso_test <- prop_btw_q_h(
  scores_c_iso_test, sample_name = "test", recalib_name = "isotonic"
)
proq_scores_c_beta_calib <- prop_btw_q_h(
  scores_c_beta_calib, sample_name = "calib", recalib_name = "beta"
)
proq_scores_c_beta_test <- prop_btw_q_h(
  scores_c_beta_test, sample_name = "test", recalib_name = "beta"
)
proq_scores_c_locfit_calib <- prop_btw_q_h(
  scores_c_locfit_calib, sample_name = "calib", recalib_name = "locfit"
)
proq_scores_c_locfit_test <- prop_btw_q_h(
  scores_c_locfit_test, sample_name = "test", recalib_name = "locfit"
)

And the dispersion and divergence metrics, including the Kullback-Leibler divergence between the predicted scores and the Beta prior (obtained from the GAMSEL model):

 ## Dispersion Metrics----
disp_train <- disp_metrics_dataset(
  prior = prior, scores = scores_train
  ) |> 
  mutate(sample = "train", recalib = "none")
  
disp_valid <- disp_metrics_dataset(
  prior = prior, scores = scores_valid
  ) |>
  mutate(sample = "valid", recalib = "none")
  
disp_calib <- disp_metrics_dataset(
  prior = prior, scores = scores_calib
  ) |> 
  mutate(sample = "calib", recalib = "none")
  
disp_test <- disp_metrics_dataset(
  prior = prior, scores = scores_test
  ) |> 
  mutate(sample = "test", recalib = "none")

disp_c_platt_calib <- disp_metrics_dataset(
  prior = prior, scores = scores_c_platt_calib
  ) |>
  mutate(sample = "calib", recalib = "platt")

disp_c_platt_test <- disp_metrics_dataset(
  prior = prior, scores = scores_c_platt_test
  ) |>
  mutate(sample = "test", recalib = "platt")

disp_c_iso_calib <- disp_metrics_dataset(
  prior = prior, scores = scores_c_iso_calib
  ) |>
  mutate(sample = "calib", recalib = "isotonic")
  
disp_c_iso_test <- disp_metrics_dataset(
  prior = prior, scores = scores_c_iso_test
  ) |>
  mutate(sample = "test", recalib = "isotonic")

disp_c_beta_calib <- disp_metrics_dataset(
  prior = prior, scores = scores_c_beta_calib
  ) |>
  mutate(sample = "calib", recalib = "beta")

disp_c_beta_test <- disp_metrics_dataset(
  prior = prior, scores = scores_c_beta_test
  ) |>
  mutate(sample = "test", recalib = "beta")

disp_c_locfit_calib <- disp_metrics_dataset(
  prior = prior, scores = scores_c_locfit_calib
  ) |>
  mutate(sample = "calib", recalib = "locfit")
  
disp_c_locfit_test <- disp_metrics_dataset(
  prior = prior, scores = scores_c_locfit_test
  ) |>
  mutate(sample = "test", recalib = "locfit")

The performance and calibration metrics:

# Performance and Calibration Metrics----
# We add very small noise to predicted scores
# otherwise the local regression may crash
scores_train_noise <- scores_train +
  runif(n = length(scores_train), min = 0, max = 0.01)
scores_train_noise[scores_train_noise > 1] <- 1
metrics_train <- compute_metrics(
  obs = data_train |> pull(!!target_name),
  scores = scores_train_noise, true_probas = prior_train
  ) |> mutate(sample = "train", recalib = "none")
Warning in ks.test.default(true_probas, scores): p-value will be approximate in
the presence of ties
scores_valid_noise <- scores_valid +
  runif(n = length(scores_valid), min = 0, max = 0.01)
scores_valid_noise[scores_valid_noise > 1] <- 1
metrics_valid <- compute_metrics(
  obs = data_valid |> pull(!!target_name),
  scores = scores_valid_noise, true_probas = prior_valid
  ) |> mutate(sample = "valid", recalib = "none")
Warning in ks.test.default(true_probas, scores): p-value will be approximate in
the presence of ties
scores_calib_noise <- scores_calib +
  runif(n = length(scores_calib), min = 0, max = 0.01)
scores_calib_noise[scores_calib_noise > 1] <- 1
metrics_calib <- compute_metrics(
  obs = data_calib |> pull(!!target_name),
  scores = scores_calib_noise, true_probas = prior_calib
  ) |> mutate(sample = "calib", recalib = "none")
Warning in ks.test.default(true_probas, scores): p-value will be approximate in
the presence of ties
scores_test_noise <- scores_test +
  runif(n = length(scores_test), min = 0, max = 0.01)
scores_test_noise[scores_test_noise > 1] <- 1
metrics_test <- compute_metrics(
  obs = data_test |> pull(!!target_name),
  scores = scores_test_noise, true_probas = prior_test
  ) |> mutate(sample = "test", recalib = "none")
Warning in ks.test.default(true_probas, scores): p-value will be approximate in
the presence of ties
# With recalibrated scores (platt)
scores_c_platt_calib_noise <- scores_c_platt_calib +
  runif(n = length(scores_calib), min = 0, max = 0.01)
scores_c_platt_calib_noise[scores_c_platt_calib_noise > 1] <- 1
metrics_c_platt_calib <- compute_metrics(
  obs = data_calib |> pull(!!target_name),
  scores = scores_c_platt_calib_noise, true_probas = prior_calib
  ) |> mutate(sample = "calib", recalib = "platt")
Warning in ks.test.default(true_probas, scores): p-value will be approximate in
the presence of ties
scores_c_platt_test_noise <- scores_c_platt_test +
  runif(n = length(scores_test), min = 0, max = 0.01)
scores_c_platt_test_noise[scores_c_platt_test_noise > 1] <- 1
metrics_c_platt_test <- compute_metrics(
  obs = data_test |> pull(!!target_name),
  scores = scores_c_platt_test_noise, true_probas = prior_test
  ) |> mutate(sample = "test", recalib = "platt")
Warning in ks.test.default(true_probas, scores): p-value will be approximate in
the presence of ties
# With recalibrated scores (isotonic)
scores_c_iso_calib_noise <- scores_c_iso_calib +
  runif(n = length(scores_c_iso_calib), min = 0, max = 0.01)
scores_c_iso_calib_noise[scores_c_iso_calib_noise > 1] <- 1
metrics_c_iso_calib <- compute_metrics(
  obs = data_calib |> pull(!!target_name),
  scores = scores_c_iso_calib_noise, true_probas = prior_calib
  ) |> mutate(sample = "calib", recalib = "isotonic")
Warning in ks.test.default(true_probas, scores): p-value will be approximate in
the presence of ties
scores_c_iso_test_noise <- scores_c_iso_test +
  runif(n = length(scores_c_iso_test), min = 0, max = 0.01)
scores_c_iso_test_noise[scores_c_iso_test_noise > 1] <- 1
metrics_c_iso_test <- compute_metrics(
  obs = data_test |> pull(!!target_name),
  scores = scores_c_iso_test_noise, true_probas = prior_test
  ) |> mutate(sample = "test", recalib = "isotonic")
Warning in ks.test.default(true_probas, scores): p-value will be approximate in
the presence of ties
# With recalibrated scores (beta)
scores_c_beta_calib_noise <- scores_c_beta_calib +
  runif(n = length(scores_c_beta_calib), min = 0, max = 0.01)
scores_c_beta_calib_noise[scores_c_beta_calib_noise > 1] <- 1
metrics_c_beta_calib <- compute_metrics(
  obs = data_calib |> pull(!!target_name),
  scores = scores_c_beta_calib_noise, true_probas = prior_calib
  ) |> mutate(sample = "calib", recalib = "beta")
Warning in ks.test.default(true_probas, scores): p-value will be approximate in
the presence of ties
scores_c_beta_test_noise <- scores_c_beta_test +
  runif(n = length(scores_c_beta_test), min = 0, max = 0.01)
scores_c_beta_test_noise[scores_c_beta_test_noise > 1] <- 1
metrics_c_beta_test <- compute_metrics(
  obs = data_test |> pull(!!target_name),
  scores = scores_c_beta_test_noise, true_probas = prior_test
  ) |> mutate(sample = "test", recalib = "beta")
Warning in ks.test.default(true_probas, scores): p-value will be approximate in
the presence of ties
# With recalibrated scores (locfit)
scores_c_locfit_calib_noise <- scores_c_locfit_calib +
  runif(n = length(scores_c_locfit_calib), min = 0, max = 0.01)
scores_c_locfit_calib_noise[scores_c_locfit_calib_noise > 1] <- 1
metrics_c_locfit_calib <- compute_metrics(
  obs = data_calib |> pull(!!target_name),
  scores = scores_c_locfit_calib_noise, true_probas = prior_calib
  ) |> mutate(sample = "calib", recalib = "locfit")
Warning in ks.test.default(true_probas, scores): p-value will be approximate in
the presence of ties
scores_c_locfit_test_noise <- scores_c_locfit_test +
  runif(n = length(scores_c_locfit_test), min = 0, max = 0.01)
scores_c_locfit_test_noise[scores_c_locfit_test_noise > 1] <- 1
metrics_c_locfit_test <- compute_metrics(
  obs = data_test |> pull(!!target_name),
  scores = scores_c_locfit_test_noise, true_probas = prior_test
  ) |> mutate(sample = "test", recalib = "locfit")
Warning in ks.test.default(true_probas, scores): p-value will be approximate in
the presence of ties

Finally, the decomposition of expected losses:

# Decomposition of expected losses
  # Platt
  decomposition_platt_calib <- decomposition_metrics(
    obs = data_calib |> pull(!!target_name), scores = scores_calib, 
    calibrated_scores = scores_c_platt_calib, true_probas = prior_calib
  ) |> mutate(sample = "calib", recalib = "platt")
  
  decomposition_platt_test <- decomposition_metrics(
    obs = data_test |> pull(!!target_name), scores = scores_test, 
    calibrated_scores = scores_c_platt_test, true_probas = prior_test
  ) |> mutate(sample = "test", recalib = "platt")
  
  # Isotonic
  decomposition_iso_calib <- decomposition_metrics(
    obs = data_calib |> pull(!!target_name), scores = scores_calib, 
    calibrated_scores = scores_c_iso_calib, true_probas = prior_calib
  ) |> mutate(sample = "calib", recalib = "iso")
  
  decomposition_iso_test <- decomposition_metrics(
    obs = data_test |> pull(!!target_name), scores = scores_test, 
    calibrated_scores = scores_c_iso_test, true_probas = prior_test
  ) |> mutate(sample = "test", recalib = "iso")
  
  # Beta
  decomposition_beta_calib <- decomposition_metrics(
    obs = data_calib |> pull(!!target_name), scores = scores_calib, 
    calibrated_scores = scores_c_beta_calib, true_probas = prior_calib
  ) |> mutate(sample = "calib", recalib = "beta")
  
  decomposition_beta_test <- decomposition_metrics(
    obs = data_test |> pull(!!target_name), scores = scores_test, 
    calibrated_scores = scores_c_beta_test, true_probas = prior_test
  ) |> mutate(sample = "test", recalib = "beta")
  
  # Locfit
  decomposition_locfit_calib <- decomposition_metrics(
    obs = data_calib |> pull(!!target_name), scores = scores_calib, 
    calibrated_scores = scores_c_locfit_calib, true_probas = prior_calib
  ) |> mutate(sample = "calib", recalib = "locfit")
  
  decomposition_locfit_test <- decomposition_metrics(
    obs = data_test |> pull(!!target_name), scores = scores_test, 
    calibrated_scores = scores_c_locfit_test, true_probas = prior_test
  ) |> mutate(sample = "test", recalib = "locfit")

We then gather all the metrics (calibration, dispersion, and performance) into a single table:

tb_metrics <- metrics_train |>
    bind_rows(metrics_valid) |>
    bind_rows(metrics_calib) |>
    bind_rows(metrics_test) |>
    bind_rows(metrics_c_platt_calib) |>
    bind_rows(metrics_c_platt_test) |>
    bind_rows(metrics_c_iso_calib) |>
    bind_rows(metrics_c_iso_test) |>
    bind_rows(metrics_c_beta_calib) |>
    bind_rows(metrics_c_beta_test) |>
    bind_rows(metrics_c_locfit_calib) |>
    bind_rows(metrics_c_locfit_test) |>
    left_join(
      disp_train |>
        bind_rows(disp_valid) |> 
        bind_rows(disp_calib) |> 
        bind_rows(disp_test) |> 
        bind_rows(disp_c_platt_calib) |> 
        bind_rows(disp_c_platt_test) |> 
        bind_rows(disp_c_iso_calib) |> 
        bind_rows(disp_c_iso_test) |> 
        bind_rows(disp_c_beta_calib) |> 
        bind_rows(disp_c_beta_test) |> 
        bind_rows(disp_c_locfit_calib) |> 
        bind_rows(disp_c_locfit_test),
      by = c("sample", "recalib")
    ) |>
    mutate(ind = ind, nb_iter = nb_iter, max_depth = params$max_depth) 
#|>
#    select(-c(mse, mae))

tb_prop_scores <- proq_scores_train |>
  bind_rows(proq_scores_valid) |>
  bind_rows(proq_scores_calib) |>
  bind_rows(proq_scores_test) |>
  bind_rows(proq_scores_c_platt_calib) |>
  bind_rows(proq_scores_c_platt_test) |>
  bind_rows(proq_scores_c_iso_calib) |>
  bind_rows(proq_scores_c_iso_test) |>
  mutate(ind = ind, nb_iter = nb_iter, max_depth = params$max_depth)

tb_decomposition_loss <- decomposition_platt_calib |>
    bind_rows(decomposition_platt_test) |>
    bind_rows(decomposition_iso_calib) |>
    bind_rows(decomposition_iso_test) |>
    bind_rows(decomposition_beta_calib) |>
    bind_rows(decomposition_beta_test) |>
    bind_rows(decomposition_locfit_calib) |>
    bind_rows(decomposition_locfit_test) |>
    mutate(ind = ind, nb_iter = nb_iter, max_depth = params$max_depth)

metrics_iter_1 <- list(
  ind = ind,
  nb_iter = nb_iter,
  max_depth = params$max_depth,
  tb_metrics = tb_metrics,
  tb_prop_scores = tb_prop_scores,
  tb_decomposition = tb_decomposition_loss,
  scores_hist = scores_hist
  )
metrics_iter <- list(metrics_iter_1)

tb_metrics <-
  map(metrics_iter, "tb_metrics") |>
  list_rbind()
tb_prop_scores <-
  map(metrics_iter, "tb_prop_scores") |>
  list_rbind()
tb_decomposition <-
  map(metrics_iter, "tb_decomposition") |>
  list_rbind()
scores_hist <- map(metrics_iter, "scores_hist")

res_grid <- vector(mode = "list", length = 1)
res_grid[[i_grid]] <- list(
  tb_metrics = tb_metrics,
  tb_prop_scores = tb_prop_scores,
  tb_decomposition = tb_decomposition,
  scores_hist = scores_hist
  )

We can also reproduce the previous steps using the functions defined above:

# Results for a single iteration
resul_iter_1 <- get_metrics_xgb_iter(
  scores = scores,
  prior = prior,
  data_train = data_train,
  data_valid = data_valid,
  data_calib = data_calib,
  data_test = data_test,
  prior_train = prior_train,
  prior_valid = prior_valid,
  prior_calib = prior_calib,
  prior_test = prior_test,
  target_name = target_name,
  ind = ind,
  nb_iter = i_iter,
  params = params
)
[1] -22.60554
[1] 23.41854
Warning in ks.test.default(true_probas, scores): p-value will be approximate in
the presence of ties
(The same warning is raised for each sample and recalibration method.)
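If get_metrics_xgb_iter() mirrors the manual steps above, the returned list should contain the same components as metrics_iter_1 (ind, nb_iter, max_depth, tb_metrics, tb_prop_scores, tb_decomposition, and scores_hist); a quick check:

# Components of the per-iteration results
names(resul_iter_1)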

8.5 Final results: “spambase” dataset

We calculate the performance/calibration/dispersion metrics for all iterations and all hyperparameters of the grid:

xgb_resul <- simul_xgb_real(
  data,
  target_name,
  prior,
  seed = 1234
)

We save the results for the dataset “spambase”:

save(xgb_resul, file = str_c("../output/real-data/xgb_resul_", name, ".rda"))

We can now load the results:

load(str_c("../output/real-data/xgb_resul_", name, ".rda"))
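The loaded object is a list; before reshaping it, we can take a quick look at its top-level structure (the components used below are metrics_simul and scores_hist):

# Top-level structure of the saved results
str(xgb_resul, max.level = 1)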

8.6 Results

metrics_xgb_all <- xgb_resul$metrics_simul |>
  mutate(
    sample = factor(
      sample,
      levels = c("train", "valid", "calib", "test"),
      labels = c("Train","Validation", "Calibration" ,"Test")
    ),
    recalib = factor(
      recalib,
      levels = c("none", "platt", "isotonic", "beta", "locfit"),
      labels = c("None", "Platt", "Isotonic", "Beta", "Locfit")
    )
  )
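As a quick sanity check, we can verify that all sample and recalibration method combinations are present after the recoding:

# Number of rows per sample / recalibration method
metrics_xgb_all |> count(sample, recalib)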

For each replication, we varied some of the hyperparameters. Let us identify a few models of interest:

  • smallest: model with the lowest number of boosting iterations
  • largest: model with the highest number of boosting iterations
  • largest_auc: model with the highest AUC on the validation set
  • lowest_brier: model with the lowest Brier score on the validation set
  • lowest_ici: model with the lowest ICI on the validation set
  • lowest_kl: model with the lowest KL divergence on the validation set
  • mediocre_ici: model with a rather high ICI (poorly calibrated model)
Code
# Identify the smallest tree on the validation set, when the scores are not
# recalibrated
smallest_xgb <-
  metrics_xgb_all |>
  filter(sample == "Validation", recalib == "None") |>
  arrange(nb_iter) |>
  slice_head(n = 1) |>
  select(ind, nb_iter, recalib) |>
  mutate(result_type = "smallest")

# Identify the largest tree
largest_xgb <-
  metrics_xgb_all |>
  filter(sample == "Validation", recalib == "None") |>
  arrange(desc(nb_iter)) |>
  slice_head(n = 1) |>
  select(ind, nb_iter, recalib) |>
  mutate(result_type = "largest")

# Identify the model with the highest AUC on the validation set
highest_auc_xgb <-
  metrics_xgb_all |>
  filter(sample == "Validation", recalib == "None") |>
  arrange(desc(AUC)) |>
  slice_head(n = 1) |>
  select(ind, nb_iter, recalib) |>
  mutate(result_type = "largest_auc")

# Identify tree with lowest brier
lowest_brier_xgb <-
  metrics_xgb_all |>
  filter(sample == "Validation", recalib == "None") |>
  arrange(brier) |>
  slice_head(n = 1) |>
  select(ind, nb_iter, recalib) |>
  mutate(result_type = "lowest_brier")

# Identify tree with lowest ICI
lowest_ici_xgb <-
  metrics_xgb_all |>
  filter(sample == "Validation", recalib == "None") |>
  arrange(ici) |>
  slice_head(n = 1) |>
  select(ind, nb_iter, recalib) |>
  mutate(result_type = "lowest_ici")

# Identify tree with lowest KL
lowest_kl_xgb <-
  metrics_xgb_all |>
  filter(sample == "Validation", recalib == "None") |>
  arrange(KL_20_true_probas) |>
  slice_head(n = 1) |>
  select(ind, nb_iter, recalib) |>
  mutate(result_type = "lowest_kl")

# Identify a model with a rather high ICI (poorly calibrated), selected manually
mediocre_ici_xgb <- 
  metrics_xgb_all |>
  filter(sample == "Validation", recalib == "None") |>
  filter(nb_iter == 10 & ind == 1) |>
  select(ind, nb_iter, recalib) |>
  mutate(result_type = "mediocre_ici")

# Merge these
models_of_interest_xgb <-
  smallest_xgb |>
  bind_rows(largest_xgb) |>
  bind_rows(highest_auc_xgb) |>
  bind_rows(lowest_brier_xgb) |>
  bind_rows(lowest_ici_xgb) |>
  bind_rows(lowest_kl_xgb) |> 
  bind_rows(mediocre_ici_xgb)

models_of_interest_metrics <- NULL
for (recalibration_method in c("None", "Platt", "Isotonic", "Beta", "Locfit")) {
  # Add the metrics obtained with the current recalibration method
  models_of_interest_metrics <-
    models_of_interest_metrics |>
    bind_rows(
      models_of_interest_xgb |> select(-recalib) |>
        left_join(
          metrics_xgb_all |>
            filter(
              recalib == recalibration_method,
              sample %in% c("Validation", "Test")
            ),
          by = c("ind", "nb_iter"),
          relationship = "many-to-many" # (calib, test)
        )
    )
}


models_of_interest_metrics <-
  models_of_interest_metrics |>
  mutate(
    result_type = factor(
      result_type,
      levels = c(
        "smallest", "largest", "largest_auc", "lowest_brier", 
        "lowest_ici", "lowest_kl", "mediocre_ici"),
      labels = c(
        "Smallest", "Largest", "AUC*", "Brier*", 
        "ICI*", "KL*", "High ICI"
      )
    )
  )
models_of_interest_metrics |> 
  mutate(
    across(
      c(
      "acc", "AUC", "brier", "ici", "log_loss", "inter_quantile_25_75",
      "inter_quantile_10_90", starts_with("KL_"), "ind_cov", 
      starts_with("prior_")
      ), 
      ~round(.x, 3)
    )
  ) |> 
  DT::datatable()
Table 8.2: Metrics for spambase dataset

8.6.1 Distribution of Scores

Let us extract all the histogram information computed over the simulations and put that in a single object, scores_hist_all.

scores_hist_all <- xgb_resul$scores_hist

We then define a function, plot_bp_xgb(), which plots the distribution of scores on the test set for the “spambase” dataset, together with a helper function, plot_bp_interest(), which plots the histogram of the scores at a specific boosting iteration. This lets us show the distributions at the beginning and at the end of the boosting iterations, at the iterations where the AUC is highest and where the Brier score, the ICI, and the KL divergence (between the distribution of scores on the validation set and the distribution of the true probabilities) are lowest on the validation set, and for a poorly calibrated model with a rather high ICI. We plot both the distributions of the scores returned by the classifier and those obtained after recalibration.

Functions plot_bp_interest() and plot_bp_xgb()
plot_bp_interest <- function(metrics_interest,
                             scores_hist_interest,
                             label,
                             recalib_method) {
  subtitle <- str_c(
    "Depth = ", metrics_interest$max_depth, ", ",
    "AUC = ", round(metrics_interest$AUC, 2), ", \n",
    "Brier = ", round(metrics_interest$brier, 2), ",",
    "ICI = ", round(metrics_interest$ici, 2), ", ",
    "KL = ", round(metrics_interest$KL_20_true_probas, 2)
  )

  if (recalib_method == "none") {
    plot(
      main = str_c(label, " (iter = ", metrics_interest$nb_iter,")"),
      scores_hist_interest$test,
      xlab = latex2exp::TeX("$\\hat{s}(x)$"),
      ylab = ""
    )
  } else if (recalib_method == "platt") {
    plot(
      main = str_c(label, " (iter = ", metrics_interest$nb_iter,")"),
      scores_hist_interest$test_c_platt,
      xlab = latex2exp::TeX("$\\hat{s}(x)$"),
      ylab = "",
      col = colour_recalib[["Platt"]]
    )
  } else if (recalib_method == "iso") {
    plot(
      main = str_c(label, " (iter = ", metrics_interest$nb_iter,")"),
      scores_hist_interest$test_c_iso,
      xlab = latex2exp::TeX("$\\hat{s}(x)$"),
      ylab = "",
      col = colour_recalib[["Isotonic"]]
    )
  } else if (recalib_method == "beta") {
    plot(
      main = str_c(label, " (iter = ", metrics_interest$nb_iter,")"),
      scores_hist_interest$test_c_beta,
      xlab = latex2exp::TeX("$\\hat{s}(x)$"),
      ylab = "",
      col = colour_recalib[["Beta"]]
    )
  } else if (recalib_method == "locfit") {
    plot(
      main = str_c(label, " (iter = ", metrics_interest$nb_iter,")"),
      scores_hist_interest$test_c_locfit,
      xlab = latex2exp::TeX("$\\hat{s}(x)$"),
      ylab = "",
      col = colour_recalib[["Locfit"]]
    )
  } 
  mtext(side = 3, line = -0.25, adj = .5, subtitle, cex = .5)
}

plot_bp_xgb <- function(paper_version = FALSE) {
  # Values of max_depth explored over the simulations
  max_depth_val <- map_dbl(scores_hist_all, ~.x[[1]]$max_depth)
  
  # True probabilities (scores estimated with the GAMSEL model on the train set)
  true_prob <- prior$scores_gamsel$scores_train
  
  for (recalib_method in c("none", "platt", "iso", "beta", "locfit")) {
    
    i_method <- match(recalib_method, c("none", "platt", "iso", "beta", "locfit"))
    recalib_method_lab <- c("None", "Platt", "Isotonic", "Beta", "Locfit")[i_method]
    
    # The metrics for the corresponding results, on the validation set
    metrics_xgb_current_valid <-
      metrics_xgb_all |>
      filter(
        sample == "Validation",
        recalib == "None"
      )
    # and on the test set
    metrics_xgb_current_test <-
      metrics_xgb_all |>
      filter(
        sample == "Test",
        recalib == recalib_method_lab
      )
    
    if (paper_version == FALSE) {
      hist(
        true_prob,
        breaks = seq(0, 1, by = .05),
        xlab = "p", ylab = "",
        main = "Prior Probabilities",
        xlim = c(0, 1)
      )
      mtext(
        side = 2, recalib_method_lab, line = 2.5, cex = 1,
        font = 2
      )
      # Iterations of interest----
      ## Start of iterations
      scores_hist_start <- scores_hist_all[[1]][[1]]
      metrics_start <- metrics_xgb_current_test |>
        filter(
          nb_iter == scores_hist_start$nb_iter,
          max_depth == scores_hist_start$max_depth
        )
      
      plot_bp_interest(
        metrics_interest = metrics_start,
        scores_hist_interest = scores_hist_start,
        label = "Start",
        recalib_method = recalib_method
      )
      
      ## End of iterations
      scores_hist_end <- scores_hist_all[[1]][[length(scores_hist_all[[1]])]]
      metrics_end <- metrics_xgb_current_test |>
        filter(
          nb_iter == scores_hist_end$nb_iter,
          max_depth == scores_hist_end$max_depth
        )
      plot_bp_interest(
        metrics_interest = metrics_end,
        scores_hist_interest = scores_hist_end,
        label = "End",
        recalib_method = recalib_method
      )
    }
    ## Iteration with max AUC on validation set
    metrics_valid_auc_star <-
      metrics_xgb_current_valid |> arrange(desc(AUC)) |> dplyr::slice(1)
    nb_iter_auc <- metrics_valid_auc_star$nb_iter
    max_depth_auc_star <- metrics_valid_auc_star$max_depth
    i_max_depth_auc_star <- which(max_depth_val == max_depth_auc_star)
    
    metrics_max_auc <-
      metrics_xgb_current_test |>
      filter(nb_iter == !!nb_iter_auc, max_depth == max_depth_auc_star)
    # Locate the stored histogram corresponding to that boosting iteration
    ind_auc <- which(map_dbl(scores_hist_all[[i_max_depth_auc_star]], "nb_iter") == nb_iter_auc)
    scores_hist_max_auc <- scores_hist_all[[i_max_depth_auc_star]][[ind_auc]]
    plot_bp_interest(
      metrics_interest = metrics_max_auc,
      scores_hist_interest = scores_hist_max_auc,
      label = "AUC*",
      recalib_method = recalib_method
    )
    if (paper_version == TRUE) {
      mtext(
        side = 2, recalib_method_lab, line = 2.5, cex = 1,
        font = 2
      )
    }
    
    ## Min Brier on validation set
    metrics_valid_brier_star <-
      metrics_xgb_current_valid |> arrange(brier) |> dplyr::slice(1)
    nb_iter_brier <- metrics_valid_brier_star$nb_iter
    max_depth_brier_star <- metrics_valid_brier_star$max_depth
    i_max_depth_brier_star <- which(max_depth_val == max_depth_brier_star)
    
    metrics_min_brier <-
      metrics_xgb_current_test |>
      filter(nb_iter == !!nb_iter_brier, max_depth == max_depth_brier_star)
    ind_brier <- which(map_dbl(scores_hist_all[[i_max_depth_brier_star]], "nb_iter") == nb_iter_brier)
    scores_hist_min_brier <- scores_hist_all[[i_max_depth_brier_star]][[ind_brier]]
    plot_bp_interest(
      metrics_interest = metrics_min_brier,
      scores_hist_interest = scores_hist_min_brier,
      label = "Brier*",
      recalib_method = recalib_method
    )
    
    ## Min ICI on validation set
    metrics_valid_ici_star <-
      metrics_xgb_current_valid |> arrange(ici) |> dplyr::slice(1)
    nb_iter_ici <-   metrics_valid_ici_star$nb_iter
    max_depth_ici_star <- metrics_valid_ici_star$max_depth
    i_max_depth_ici_star <- which(max_depth_val == max_depth_ici_star)
    
    metrics_min_ici <-
      metrics_xgb_current_test |>
      filter(nb_iter == !!nb_iter_ici, max_depth == max_depth_ici_star)
    ind_ici <- which(map_dbl(scores_hist_all[[i_max_depth_ici_star]], "nb_iter") == nb_iter_ici)
    scores_hist_min_ici <- scores_hist_all[[i_max_depth_ici_star]][[ind_ici]]
    plot_bp_interest(
      metrics_interest = metrics_min_ici,
      scores_hist_interest = scores_hist_min_ici,
      label = "ICI*",
      recalib_method = recalib_method
    )
    
    ## Min KL on validation set
    metrics_valid_kl_star <-
      metrics_xgb_current_valid |> arrange(KL_20_true_probas) |> dplyr::slice(1)
    nb_iter_kl <- metrics_valid_kl_star$nb_iter
    max_depth_kl_star <- metrics_valid_kl_star$max_depth
    i_max_depth_kl_star <- which(max_depth_val == max_depth_kl_star)
    
    metrics_min_kl <-
      metrics_xgb_current_test |>
      filter(nb_iter == !!nb_iter_kl, max_depth == max_depth_kl_star)
    ind_kl <- which(map_dbl(scores_hist_all[[i_max_depth_kl_star]], "nb_iter") == nb_iter_kl)
    scores_hist_min_kl <- scores_hist_all[[i_max_depth_kl_star]][[ind_kl]]
    plot_bp_interest(
      metrics_interest = metrics_min_kl,
      scores_hist_interest = scores_hist_min_kl,
      label = "KL*",
      recalib_method = recalib_method
    )
    
    ## Mediocre ICI on validation set
    identified_mici <-  mediocre_ici_xgb
    
    metrics_valid_mici_star <- metrics_xgb_current_valid |> 
      filter(ind == identified_mici$ind, nb_iter == identified_mici$nb_iter)
    nb_iter_mici <- metrics_valid_mici_star$nb_iter
    max_depth_mici_star <- metrics_valid_mici_star$max_depth
    i_max_depth_mici_star <- which(max_depth_val == max_depth_mici_star)
    
    metrics_mici <-
      metrics_xgb_current_test |>
      filter(nb_iter == !!nb_iter_mici, max_depth == max_depth_mici_star)
    ind_mici <- which(map_dbl(scores_hist_all[[i_max_depth_mici_star]], "nb_iter") == nb_iter_mici)
    scores_hist_mici <- scores_hist_all[[i_max_depth_mici_star]][[ind_mici]]
    plot_bp_interest(
      metrics_interest = metrics_mici,
      scores_hist_interest = scores_hist_mici,
      label = "Med. ICI",
      recalib_method = recalib_method
    )
  }
}
Code
par(mfrow = c(5,8), mar = c(4.1, 4, 3.5, 1.5))
plot_bp_xgb()
Figure 8.1: Distribution of scores on the test set (spambase)
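The paper_version argument of plot_bp_xgb() skips the prior histogram and the Start/End panels, keeping only the AUC*, Brier*, ICI*, KL* and Med. ICI panels for each recalibration method. A possible call for that layout (the 5 x 5 panel grid is our suggestion, not taken from the original code):

# Paper version: five panels per recalibration method
par(mfrow = c(5, 5), mar = c(4.1, 4, 3.5, 1.5))
plot_bp_xgb(paper_version = TRUE)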
table_models_interest <- 
  models_of_interest_metrics |> 
  filter(sample == "Test") |> 
  select(
    recalib, sample, result_type, 
    brier, ici, kl = KL_20_true_probas
  ) |> 
  filter(
    result_type %in% c("AUC*", "KL*", "Largest")
  )


initial_points <- table_models_interest |> 
  filter(recalib == "None")

points_after_c <- initial_points |> 
  select(result_type, ici, kl) |> 
  left_join(
    table_models_interest |> 
      filter(recalib %in% c("Platt", "Isotonic", "Beta", "Locfit")) |> 
      select(recalib, result_type, ici, kl) |> 
      rename(ici_end = ici, kl_end = kl),
    relationship = "many-to-many" # (Platt, Isotonic, Beta and Locfit)
  )
Joining with `by = join_by(result_type)`
ggplot() +
  geom_point(
    data = initial_points,
    mapping = aes(x = ici, y = kl, shape = result_type),
    size = 2
    ) +
  geom_segment(
    data = points_after_c,
    mapping = aes(
      x = ici, y = kl, xend = ici_end, yend = kl_end,
      colour = recalib, linetype = recalib
      ),
    arrow = arrow(length=unit(.2, 'cm'))
  ) +
  scale_shape_manual(
    NULL,
    values = c(
      # "Smallest" = 1,
      "Largest" = 2,
      "AUC*" = 19,
      "KL*" = 15
    )
  ) +
  labs(x = "ICI", y = "KL Divergence") +
  scale_colour_manual("Recalibration", values = colour_recalib) +
  scale_linetype_discrete("Recalibration") +
  theme_paper()