10  Real-data Results

Note

This chapter examines the XGBoost results obtained on the 10 real datasets.

library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggh4x)
library(ggrepel)
library(rpart)
library(locfit)
locfit 1.5-9.10      2024-06-24

Attaching package: 'locfit'

The following object is masked from 'package:purrr':

    none
library(philentropy)
library(stringr)
library(purrr)

# Colours for train/validation/test
colour_samples <- c(
  "Train" = "#0072B2",
  "Validation" = "#009E73",
  "Calibration" = "#CC79A7",
  "Test" = "#D55E00"
)

colour_recalib <- c(
  "None" = "#88CCEE",
  "Platt" = "#44AA99",
  "Isotonic" = "#882255",
  "Beta" = "#D55E00"#,
  #"Locfit" = "firebrick3"
)
Definition of the theme_paper() function (for ggplot2 graphs)
#' Theme for ggplot2
#'
#' @param ... arguments passed to the theme function
#' @export
#' @importFrom ggplot2 element_rect element_text element_blank element_line unit
#'   rel
theme_paper <- function (...) {
  ggthemes::theme_base() +
    theme(
      plot.background = element_blank(),
      legend.background = element_rect(
        fill = "transparent", linetype="solid", colour ="black"),
      legend.position = "bottom",
      legend.direction = "horizontal",
      legend.box = "horizontal",
      legend.key = element_blank()
    )
}

10.1 Data

The list of datasets and the name of the target variable in each:

datasets <- tribble(
  ~name, ~target_name,
  "abalone", "Sex",
  "adult", "high_income",
  "bank", "y",
  "default", "default",
  "drybean", "is_dermason",
  "coupon", "y",
  "mushroom", "edible",
  "occupancy", "Occupancy",
  "winequality", "high_quality",
  "spambase", "is_spam"
)

The priors:

for (name in datasets$name) {
  # The Prior on the distribution of the scores
  load(str_c("../output/real-data/priors_", name, ".rda"))
}

The results:

files <- str_c("../output/real-data/xgb_resul_", datasets$name, ".rda")
# Loop through each file and load it with a new name
walk2(files, datasets$name, ~{
  loaded_object_name <- load(.x)
  new_name <- str_c("xgb_resul_", .y)
  assign(new_name, get(loaded_object_name), envir = .GlobalEnv)
})
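
As a quick sanity check (not part of the original pipeline), the renamed objects created by the loop above can be listed:

# All result objects should now be available in the global environment
ls(pattern = "^xgb_resul_")
# e.g., "xgb_resul_abalone" "xgb_resul_adult" ... "xgb_resul_winequality"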

10.2 Distribution of Scores

Graph functions
plot_bp_interest <- function(metrics_interest,
                             scores_hist_interest,
                             label,
                             recalib_method) {
  subtitle <- str_c(
    "Depth = ", metrics_interest$max_depth, ", ",
    "AUC = ", round(metrics_interest$AUC, 2), ", \n",
    "Brier = ", round(metrics_interest$brier, 2), ",",
    "ICI = ", round(metrics_interest$ici, 2), ", ",
    "KL = ", round(metrics_interest$KL_20_true_probas, 2)
  )
  
  if (recalib_method == "none") {
    plot(
      main = str_c(label, " (iter = ", metrics_interest$nb_iter,")"),
      scores_hist_interest$test,
      xlab = latex2exp::TeX("$\\hat{s}(x)$"),
      ylab = ""
    )
  } else if (recalib_method == "platt") {
    plot(
      main = str_c(label, " (iter = ", metrics_interest$nb_iter,")"),
      scores_hist_interest$test_c_platt,
      xlab = latex2exp::TeX("$\\hat{s}(x)$"),
      ylab = "",
      col = colour_recalib[["Platt"]]
    )
  } else if (recalib_method == "beta") {
    plot(
      main = str_c(label, " (iter = ", metrics_interest$nb_iter,")"),
      scores_hist_interest$test_c_iso,
      xlab = latex2exp::TeX("$\\hat{s}(x)$"),
      ylab = "",
      col = colour_recalib[["Beta"]]
    )
  } else if (recalib_method == "iso") {
    plot(
      main = str_c(label, " (iter = ", metrics_interest$nb_iter,")"),
      scores_hist_interest$test_c_iso,
      xlab = latex2exp::TeX("$\\hat{s}(x)$"),
      ylab = "",
      col = colour_recalib[["Isotonic"]]
    )
  } 
  mtext(side = 3, line = -0.25, adj = .5, subtitle, cex = .5)
}


plot_bp_xgb <- function(scores_hist_all, 
                        prior, 
                        metrics_xgb_all,
                        mediocre_ici_xgb,
                        paper_version = FALSE) {
  # The max_depth value associated with each element of scores_hist_all
  max_depth_val <- map_dbl(scores_hist_all, ~.x[[1]]$max_depth)
  
  # True Probabilities
  true_prob <- prior$scores_gamsel$scores_train
  
  for (recalib_method in c("none", "platt", "beta", "iso")) {
    
    i_method <- match(recalib_method, c("none", "platt", "beta", "iso"))
    recalib_method_lab <- c("None", "Platt", "Beta", "Isotonic")[i_method]
    
    # The metrics for the corresponding results, on the validation set
    metrics_xgb_current_valid <-
      metrics_xgb_all |>
      filter(
        sample == "Validation",
        recalib == "None"
      )
    # and on the test set
    metrics_xgb_current_test <-
      metrics_xgb_all |>
      filter(
        sample == "Test",
        recalib == recalib_method_lab
      )
    
    if (paper_version == FALSE) {
      hist(
        true_prob,
        breaks = seq(0, 1, by = .05),
        xlab = "p", ylab = "",
        main = "Prior Probabilities",
        xlim = c(0, 1)
      )
      mtext(
        side = 2, recalib_method_lab, line = 2.5, cex = 1,
        font.lab = 2
      )
      # Iterations of interest----
      ## Start of iterations
      scores_hist_start <- scores_hist_all[[1]][[1]]
      metrics_start <- metrics_xgb_current_test |>
        filter(
          nb_iter == scores_hist_start$nb_iter,
          max_depth == scores_hist_start$max_depth
        )
      
      plot_bp_interest(
        metrics_interest = metrics_start,
        scores_hist_interest = scores_hist_start,
        label = "Start",
        recalib_method = recalib_method
      )
      
      ## End of iterations
      scores_hist_end <- scores_hist_all[[1]][[length(scores_hist_all[[1]])]]
      metrics_end <- metrics_xgb_current_test |>
        filter(
          nb_iter == scores_hist_end$nb_iter,
          max_depth == scores_hist_end$max_depth
        )
      plot_bp_interest(
        metrics_interest = metrics_end,
        scores_hist_interest = scores_hist_end,
        label = "End",
        recalib_method = recalib_method
      )
    }
    ## Iteration with max AUC on validation set
    metrics_valid_auc_star <-
      metrics_xgb_current_valid |> arrange(desc(AUC)) |> dplyr::slice(1)
    nb_iter_auc <- metrics_valid_auc_star$nb_iter
    max_depth_auc_star <- metrics_valid_auc_star$max_depth
    i_max_depth_auc_star <- which(max_depth_val == max_depth_auc_star)
    
    metrics_max_auc <-
      metrics_xgb_current_test |>
      filter(nb_iter == !!nb_iter_auc, max_depth == max_depth_auc_star)
    # Locate, within the list for the selected max_depth, the element whose
    # nb_iter matches the iteration selected on the validation set
    ind_auc <- which(map_dbl(scores_hist_all[[i_max_depth_auc_star]], "nb_iter") == nb_iter_auc)
    scores_hist_max_auc <- scores_hist_all[[i_max_depth_auc_star]][[ind_auc]]
    plot_bp_interest(
      metrics_interest = metrics_max_auc,
      scores_hist_interest = scores_hist_max_auc,
      label = "AUC*",
      recalib_method = recalib_method
    )
    if (paper_version == TRUE) {
      mtext(
        side = 2, recalib_method_lab, line = 2.5, cex = 1,
        font.lab = 2
      )
    }
    
    ## Min Brier on validation set
    metrics_valid_brier_star <-
      metrics_xgb_current_valid |> arrange(brier) |> dplyr::slice(1)
    nb_iter_brier <- metrics_valid_brier_star$nb_iter
    max_depth_brier_star <- metrics_valid_brier_star$max_depth
    i_max_depth_brier_star <- which(max_depth_val == max_depth_brier_star)
    
    metrics_min_brier <-
      metrics_xgb_current_test |>
      filter(nb_iter == !!nb_iter_brier, max_depth == max_depth_brier_star)
    ind_brier <- which(map_dbl(scores_hist_all[[i_max_depth_brier_star]], "nb_iter") == nb_iter_brier)
    scores_hist_min_brier <- scores_hist_all[[i_max_depth_brier_star]][[ind_brier]]
    plot_bp_interest(
      metrics_interest = metrics_min_brier,
      scores_hist_interest = scores_hist_min_brier,
      label = "Brier*",
      recalib_method = recalib_method
    )
    
    ## Min ICI on validation set
    metrics_valid_ici_star <-
      metrics_xgb_current_valid |> arrange(ici) |> dplyr::slice(1)
    nb_iter_ici <-   metrics_valid_ici_star$nb_iter
    max_depth_ici_star <- metrics_valid_ici_star$max_depth
    i_max_depth_ici_star <- which(max_depth_val == max_depth_ici_star)
    
    metrics_min_ici <-
      metrics_xgb_current_test |>
      filter(nb_iter == !!nb_iter_ici, max_depth == max_depth_ici_star)
    ind_ici <- which(map_dbl(scores_hist_all[[i_max_depth_ici_star]], "nb_iter") == nb_iter_ici)
    scores_hist_min_ici <- scores_hist_all[[i_max_depth_ici_star]][[ind_ici]]
    plot_bp_interest(
      metrics_interest = metrics_min_ici,
      scores_hist_interest = scores_hist_min_ici,
      label = "ICI*",
      recalib_method = recalib_method
    )
    
    ## Min KL on validation set
    metrics_valid_kl_star <-
      metrics_xgb_current_valid |> arrange(KL_20_true_probas) |> dplyr::slice(1)
    nb_iter_kl <- metrics_valid_kl_star$nb_iter
    max_depth_kl_star <- metrics_valid_kl_star$max_depth
    i_max_depth_kl_star <- which(max_depth_val == max_depth_kl_star)
    
    metrics_min_kl <-
      metrics_xgb_current_test |>
      filter(nb_iter == !!nb_iter_kl, max_depth == max_depth_kl_star)
    ind_kl <- which(map_dbl(scores_hist_all[[i_max_depth_kl_star]], "nb_iter") == nb_iter_kl)
    scores_hist_min_kl <- scores_hist_all[[i_max_depth_kl_star]][[ind_kl]]
    plot_bp_interest(
      metrics_interest = metrics_min_kl,
      scores_hist_interest = scores_hist_min_kl,
      label = "KL*",
      recalib_method = recalib_method
    )
    
    ## Mediocre ICI on validation set
    identified_mici <-  mediocre_ici_xgb
    
    metrics_valid_mici_star <- metrics_xgb_current_valid |> 
      filter(ind == identified_mici$ind, nb_iter == identified_mici$nb_iter)
    nb_iter_mici <- metrics_valid_mici_star$nb_iter
    max_depth_mici_star <- metrics_valid_mici_star$max_depth
    i_max_depth_mici_star <- which(max_depth_val == max_depth_mici_star)
    
    metrics_mici <-
      metrics_xgb_current_test |>
      filter(nb_iter == !!nb_iter_mici, max_depth == max_depth_mici_star)
    ind_mici <- which(map_dbl(scores_hist_all[[i_max_depth_mici_star]], "nb_iter") == nb_iter_mici)
    scores_hist_mici <- scores_hist_all[[i_max_depth_mici_star]][[ind_mici]]
    plot_bp_interest(
      metrics_interest = metrics_mici,
      scores_hist_interest = scores_hist_mici,
      label = "Med. ICI",
      recalib_method = recalib_method
    )
  }
}


plot_bp_xgb_dataset <- function(xgb_resul, prior) {
  scores_hist_all <- xgb_resul$scores_hist
  
  metrics_xgb_all <- xgb_resul$metrics_simul |>
    mutate(
      sample = factor(
        sample,
        levels = c("train", "valid", "calib", "test"),
        labels = c("Train","Validation", "Calibration" ,"Test")
      ),
      recalib = factor(
        recalib,
        levels = c("none", "platt", "beta", "isotonic"),
        labels = c("None", "Platt", "Beta", "Isotonic")
      )
    )
  
  mediocre_ici_xgb <- 
    metrics_xgb_all |>
    filter(sample == "Validation", recalib == "None") |>
    # Select a model with mediocre calibration: among the configurations whose
    # ICI exceeds the average ICI by more than one standard deviation,
    mutate(
      mean_ici = mean(ici),
      sd_ici = sd(ici),
      upb_ici = mean_ici + sd_ici
    ) |> 
    filter(ici > upb_ici) |> 
    # keep the one with the highest AUC
    arrange(desc(AUC)) |>
    slice_head(n = 1) |> 
    select(ind, nb_iter, recalib) |>
    mutate(result_type = "mediocre_ici")
  
  plot_bp_xgb(
    scores_hist_all = scores_hist_all, 
    prior = prior, 
    metrics_xgb_all = metrics_xgb_all,
    mediocre_ici_xgb = mediocre_ici_xgb)
}
Code
par(mfrow = c(4,8), mar = c(4.1, 4, 3.5, 1.5))
plot_bp_xgb_dataset(xgb_resul = xgb_resul_abalone, prior = priors_abalone)
Figure 10.1: Distribution of scores on the test set (abalone)
Code
par(mfrow = c(4,8), mar = c(4.1, 4, 3.5, 1.5))
plot_bp_xgb_dataset(xgb_resul = xgb_resul_adult, prior = priors_adult)
Figure 10.2: Distribution of scores on the test set (adult)
Code
par(mfrow = c(4,8), mar = c(4.1, 4, 3.5, 1.5))
plot_bp_xgb_dataset(xgb_resul = xgb_resul_bank, prior = priors_bank)
Figure 10.3: Distribution of scores on the test set (bank)
Code
par(mfrow = c(4,8), mar = c(4.1, 4, 3.5, 1.5))
plot_bp_xgb_dataset(xgb_resul = xgb_resul_default, prior = priors_default)
Figure 10.4: Distribution of scores on the test set (default)
Code
par(mfrow = c(4,8), mar = c(4.1, 4, 3.5, 1.5))
plot_bp_xgb_dataset(xgb_resul = xgb_resul_drybean, prior = priors_drybean)
Figure 10.5: Distribution of scores on the test set (drybean)
Code
par(mfrow = c(4,8), mar = c(4.1, 4, 3.5, 1.5))
plot_bp_xgb_dataset(xgb_resul = xgb_resul_coupon, prior = priors_coupon)
Figure 10.6: Distribution of scores on the test set (coupon)
Code
par(mfrow = c(4,8), mar = c(4.1, 4, 3.5, 1.5))
plot_bp_xgb_dataset(xgb_resul = xgb_resul_mushroom, prior = priors_mushroom)
Figure 10.7: Distribution of scores on the test set (mushroom)
Code
par(mfrow = c(4,8), mar = c(4.1, 4, 3.5, 1.5))
plot_bp_xgb_dataset(xgb_resul = xgb_resul_occupancy, prior = priors_occupancy)
Figure 10.8: Distribution of scores on the test set (occupancy)
Code
par(mfrow = c(4,8), mar = c(4.1, 4, 3.5, 1.5))
plot_bp_xgb_dataset(xgb_resul = xgb_resul_winequality, prior = priors_winequality)
Figure 10.9: Distribution of scores on the test set (winequality)
Code
par(mfrow = c(4,8), mar = c(4.1, 4, 3.5, 1.5))
plot_bp_xgb_dataset(xgb_resul = xgb_resul_spambase, prior = priors_spambase)
Figure 10.10: Distribution of scores on the test set (spambase)

10.3 Before vs. After Recalibration

Let us now examine how the ICI and the KL divergence evolve before and after recalibration of the scores with the three methods (Platt scaling, Beta calibration, and isotonic regression). We consider three models for each dataset: one whose hyperparameters were selected by maximizing the AUC (AUC*), one whose hyperparameters were selected by minimizing the Kullback-Leibler divergence between the distribution of the estimated scores and the prior distribution (KL*), and one with a relatively high ICI, i.e., with poor calibration (High ICI).
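
To make the comparison concrete, here is a minimal, self-contained sketch of the three recalibration maps, fitted on a calibration set and applied to test scores. This is not the exact pipeline used to produce the results loaded above: the scores and outcomes are simulated for illustration, and Beta calibration is assumed to be provided by the betacal package.

# Illustrative sketch of the three recalibration maps (simulated data)
set.seed(123)
n <- 1000
s_calib <- runif(n)                                  # scores on the calibration set
d_calib <- rbinom(n, 1, plogis(4 * (s_calib - .5)))  # observed binary outcomes
s_test  <- runif(n)                                  # scores on the test set

# Platt scaling: logistic regression of the outcome on the score
platt_fit <- glm(d_calib ~ s_calib, family = binomial())
s_test_platt <- predict(
  platt_fit, newdata = data.frame(s_calib = s_test), type = "response"
)

# Isotonic regression: monotone step function fitted on the calibration set
iso_fit <- isoreg(x = s_calib, y = d_calib)
s_test_iso <- as.stepfun(iso_fit)(s_test)

# Beta calibration (assuming the betacal package is installed)
# beta_fit    <- betacal::beta_calibration(s_calib, d_calib, parameters = "abm")
# s_test_beta <- betacal::beta_predict(s_test, beta_fit)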

Functions to extract the metrics from the results
#' Function to extract the results for models of interest
gather_metrics <- function(xgb_resul) {
  
  metrics_xgb_all <- xgb_resul$metrics_simul |>
    mutate(
      sample = factor(
        sample,
        levels = c("train", "valid", "calib", "test"),
        labels = c("Train","Validation", "Calibration" ,"Test")
      ),
      recalib = factor(
        recalib,
        levels = c("none", "platt", "beta", "isotonic"),
        labels = c("None", "Platt", "Beta", "Isotonic")
      )
    )
  
  # Identify the model with the fewest boosting iterations on the validation
  # set, when the scores are not recalibrated
  smallest_xgb <-
    metrics_xgb_all |>
    filter(sample == "Validation", recalib == "None") |>
    arrange(nb_iter) |>
    slice_head(n = 1) |>
    select(ind, nb_iter, recalib) |>
    mutate(result_type = "smallest")
  
  # Identify the model with the most boosting iterations
  largest_xgb <-
    metrics_xgb_all |>
    filter(sample == "Validation", recalib == "None") |>
    arrange(desc(nb_iter)) |>
    slice_head(n = 1) |>
    select(ind, nb_iter, recalib) |>
    mutate(result_type = "largest")
  
  # Identify the model with the highest AUC on the validation set
  highest_auc_xgb <-
    metrics_xgb_all |>
    filter(sample == "Validation", recalib == "None") |>
    arrange(desc(AUC)) |>
    slice_head(n = 1) |>
    select(ind, nb_iter, recalib) |>
    mutate(result_type = "largest_auc")
  
  # Identify the model with the lowest Brier score
  lowest_brier_xgb <-
    metrics_xgb_all |>
    filter(sample == "Validation", recalib == "None") |>
    arrange(brier) |>
    slice_head(n = 1) |>
    select(ind, nb_iter, recalib) |>
    mutate(result_type = "lowest_brier")
  
  # Identify the model with the lowest ICI
  lowest_ici_xgb <-
    metrics_xgb_all |>
    filter(sample == "Validation", recalib == "None") |>
    arrange(ici) |>
    slice_head(n = 1) |>
    select(ind, nb_iter, recalib) |>
    mutate(result_type = "lowest_ici")
  
  # Identify the model with the lowest KL divergence
  lowest_kl_xgb <-
    metrics_xgb_all |>
    filter(sample == "Validation", recalib == "None") |>
    arrange(KL_20_true_probas) |>
    slice_head(n = 1) |>
    select(ind, nb_iter, recalib) |>
    mutate(result_type = "lowest_kl")
  
  mediocre_ici_xgb <- 
  metrics_xgb_all |>
  filter(sample == "Validation", recalib == "None") |>
  mutate(
    mean_ici = mean(ici),
    sd_ici = sd(ici),
    upb_ici = mean_ici + sd_ici,
  ) |> 
  filter(ici >  upb_ici) |> 
  arrange(desc(AUC)) |>
  slice_head(n = 1) |> 
  select(ind, nb_iter, recalib) |>
  mutate(result_type = "mediocre_ici")
  
  # Merge these
  models_of_interest_xgb <-
    smallest_xgb |>
    bind_rows(largest_xgb) |>
    bind_rows(highest_auc_xgb) |>
    bind_rows(lowest_brier_xgb) |>
    bind_rows(lowest_ici_xgb) |>
    bind_rows(lowest_kl_xgb) |> 
    bind_rows(mediocre_ici_xgb)
  
  models_of_interest_metrics <- NULL
  for (recalibration_method in c("None", "Platt", "Beta", "Isotonic")) {
    # Add metrics now
    models_of_interest_metrics <-
      models_of_interest_metrics |>
      bind_rows(
        models_of_interest_xgb |> select(-recalib) |>
          left_join(
            metrics_xgb_all |>
              filter(
                recalib == recalibration_method,
                sample %in% c("Validation", "Test")
              ),
            by = c("ind", "nb_iter"),
            relationship = "many-to-many" # one row per sample (Validation, Test)
          )
      )
  }
  
  models_of_interest_metrics <-
    models_of_interest_metrics |>
    mutate(
      result_type = factor(
        result_type,
        levels = c(
          "smallest", "largest", "lowest_mse", "largest_auc",
          "lowest_brier", "lowest_ici", "lowest_kl", "mediocre_ici"),
        labels = c(
          "Smallest", "Largest", "MSE*", "AUC*",
          "Brier*", "ICI*", "KL*", "High ICI"
        )
      )
    )
  models_of_interest_metrics
}

#' Function to format results for selected models for later use in ggplot2
get_data_plot_before_after_recalib <- function(xgb_resul, dataset_name) {
  models_of_interest_metrics <- gather_metrics(xgb_resul = xgb_resul)
  
  table_models_interest <- 
    models_of_interest_metrics |> 
    filter(sample == "Test") |> 
    select(
      recalib, sample, result_type, 
      AUC, brier, ici, kl = KL_20_true_probas
    ) |> 
    filter(
      result_type %in% c("AUC*", "KL*", "High ICI")
    ) |> 
    mutate(dataset = dataset_name)
  
  
  initial_points <- table_models_interest |> 
    filter(recalib == "None") |> 
    mutate(dataset = dataset_name)
  
  points_after_c <- initial_points |> 
    select(result_type, ici, kl) |> 
    left_join(
      table_models_interest |> 
        filter(recalib %in% c("Platt", "Beta", "Isotonic")) |> 
        select(recalib, result_type, ici, kl) |> 
        rename(ici_end = ici, kl_end = kl),
      by = "result_type",
      relationship = "many-to-many" # (Platt, Isotonic and Beta)
    ) |> 
    mutate(dataset = dataset_name)
  
  list(
    table_models_interest = table_models_interest,
    initial_points = initial_points,
    points_after_c = points_after_c
  )
}


# The models of interest for all datasets
data_plot_before_after <- map2(
  .x = str_c("xgb_resul_", datasets$name), 
  .y = datasets$name, 
  .f = ~get_data_plot_before_after_recalib(xgb_resul = get(.x), dataset_name = .y)
)

# The metrics for these models
table_models_interest <- 
  map(data_plot_before_after, "table_models_interest") |> 
  list_rbind()

# The metrics for the three models of interest in the subsequent plot
# before recalibration
initial_points <- 
  map(data_plot_before_after, "initial_points") |> 
  list_rbind()

# and after recalibration
points_after_c <- 
  map(data_plot_before_after, "points_after_c") |> 
  list_rbind()

10.3.1 Tables

Display code to create the table
digits <- 2

table_models_interest |> 
  relocate(dataset, .before = recalib) |> 
  select(-sample, -AUC) |> 
  pivot_longer(cols = c(brier, ici, kl)) |> 
  pivot_wider(names_from = c(recalib, name), values_from = value) |> 
  knitr::kable(
    align = str_c("l", str_c(rep("c", 3*4), collapse = ""), collapse = ""),
    escape = FALSE, booktabs = T, digits = 3, format = "markdown",
    col.names = c(
      "Dataset", "Optim.",
      rep(c("BS", "ICI", "KL"), 4)
    )
  ) |> 
  kableExtra::collapse_rows(columns = 1, valign = "top") |> 
  kableExtra::add_header_above(
    c(" " = 2,
      "None" = 3,
      "Platt" = 3,
       "Beta" = 3,
      "Isotonic" = 3
    )
  ) |> 
  kableExtra::scroll_box(fixed_thead = TRUE, height = "500px")
Table 10.1: Performance and calibration metrics (Brier score, Integrated Calibration Index, Kullback-Leibler divergence) computed on the test set, on scores returned by the model (column ‘None’), and on scores recalibrated using Platt scaling (column ‘Platt’), Beta calibration (column ‘Beta’), or isotonic regression (column ‘Isotonic’)
| Dataset | Optim. | BS (None) | ICI (None) | KL (None) | BS (Platt) | ICI (Platt) | KL (Platt) | BS (Beta) | ICI (Beta) | KL (Beta) | BS (Isotonic) | ICI (Isotonic) | KL (Isotonic) |
|:---|:---|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|
| abalone | AUC* | 0.214 | 0.069 | 0.397 | 0.209 | 0.044 | 0.411 | 0.208 | 0.042 | 0.492 | 0.206 | 0.031 | 1.169 |
| abalone | KL* | 0.210 | 0.057 | 0.320 | 0.208 | 0.052 | 0.475 | 0.208 | 0.048 | 0.453 | 0.205 | 0.037 | 1.239 |
| abalone | High ICI | 0.258 | 0.180 | 4.513 | 0.219 | 0.075 | 0.383 | 0.216 | 0.057 | 0.074 | 0.214 | 0.033 | 0.770 |
| adult | AUC* | 0.090 | 0.008 | 0.325 | 0.092 | 0.039 | 0.461 | 0.090 | 0.008 | 0.328 | 0.090 | 0.008 | 0.639 |
| adult | KL* | 0.102 | 0.036 | 0.092 | 0.101 | 0.039 | 0.301 | 0.100 | 0.015 | 0.288 | 0.100 | 0.008 | 0.680 |
| adult | High ICI | 0.100 | 0.045 | 0.641 | 0.102 | 0.061 | 1.475 | 0.096 | 0.012 | 0.295 | 0.096 | 0.011 | 0.514 |
| bank | AUC* | 0.062 | 0.017 | 0.485 | 0.066 | 0.047 | 0.650 | 0.062 | 0.009 | 0.489 | 0.062 | 0.009 | 0.594 |
| bank | KL* | 0.070 | 0.039 | 0.062 | 0.071 | 0.037 | 0.441 | 0.069 | 0.010 | 0.315 | 0.069 | 0.004 | 0.734 |
| bank | High ICI | 0.068 | 0.040 | 0.437 | 0.070 | 0.040 | 0.496 | 0.067 | 0.012 | 0.453 | 0.067 | 0.005 | 0.546 |
| default | AUC* | 0.128 | 0.025 | 0.353 | 0.129 | 0.036 | 0.799 | 0.128 | 0.018 | 0.275 | 0.128 | 0.014 | 0.581 |
| default | KL* | 0.129 | 0.009 | 0.349 | 0.130 | 0.027 | 0.698 | 0.129 | 0.016 | 0.234 | 0.129 | 0.014 | 0.774 |
| default | High ICI | 0.142 | 0.095 | 1.498 | 0.133 | 0.036 | 1.317 | 0.131 | 0.015 | 0.660 | 0.131 | 0.013 | 0.892 |
| drybean | AUC* | 0.029 | 0.011 | 0.687 | 0.031 | 0.025 | 0.871 | 0.028 | 0.008 | 0.650 | 0.029 | 0.009 | 0.870 |
| drybean | KL* | 0.036 | 0.042 | 0.370 | 0.040 | 0.039 | 0.843 | 0.037 | 0.030 | 0.680 | 0.034 | 0.019 | 0.731 |
| drybean | High ICI | 0.036 | 0.070 | 2.149 | 0.034 | 0.026 | 0.832 | 0.032 | 0.021 | 0.742 | 0.031 | 0.012 | 0.771 |
| coupon | AUC* | 0.158 | 0.041 | 1.625 | 0.158 | 0.038 | 0.879 | 0.157 | 0.028 | 0.540 | 0.158 | 0.025 | 1.137 |
| coupon | KL* | 0.192 | 0.038 | 0.048 | 0.190 | 0.025 | 0.194 | 0.190 | 0.021 | 0.132 | 0.191 | 0.022 | 0.659 |
| coupon | High ICI | 0.162 | 0.075 | 2.354 | 0.159 | 0.054 | 1.007 | 0.157 | 0.028 | 0.522 | 0.157 | 0.023 | 1.169 |
| mushroom | AUC* | 0.000 | 0.003 | 1.399 | 0.000 | 0.003 | 1.399 | 0.000 | 0.002 | 1.399 | 0.000 | 0.003 | 1.399 |
| mushroom | KL* | 0.016 | 0.063 | 0.616 | 0.010 | 0.020 | 1.291 | 0.010 | 0.019 | 1.284 | 0.006 | 0.003 | 1.332 |
| mushroom | High ICI | 0.010 | 0.038 | 0.750 | 0.006 | 0.022 | 1.315 | 0.006 | 0.018 | 1.315 | 0.001 | 0.003 | 1.399 |
| occupancy | AUC* | 0.007 | 0.005 | 1.064 | 0.006 | 0.006 | 1.175 | 0.007 | 0.006 | 1.081 | 0.007 | 0.006 | 1.069 |
| occupancy | KL* | 0.009 | 0.044 | 0.864 | 0.007 | 0.006 | 1.136 | 0.008 | 0.006 | 1.109 | 0.008 | 0.006 | 1.055 |
| occupancy | High ICI | 0.009 | 0.033 | 0.919 | 0.007 | 0.006 | 1.157 | 0.008 | 0.007 | 1.115 | 0.008 | 0.008 | 1.052 |
| winequality | AUC* | 0.153 | 0.110 | 4.927 | 0.143 | 0.047 | 1.862 | 0.137 | 0.017 | 1.090 | 0.136 | 0.027 | 1.872 |
| winequality | KL* | 0.170 | 0.057 | 0.118 | 0.166 | 0.040 | 0.418 | 0.166 | 0.026 | 0.365 | 0.166 | 0.018 | 1.009 |
| winequality | High ICI | 0.153 | 0.110 | 4.927 | 0.143 | 0.047 | 1.862 | 0.137 | 0.017 | 1.090 | 0.136 | 0.027 | 1.872 |
| spambase | AUC* | 0.032 | 0.011 | 0.844 | 0.035 | 0.023 | 1.141 | 0.032 | 0.014 | 0.708 | 0.034 | 0.007 | 1.109 |
| spambase | KL* | 0.053 | 0.055 | 0.260 | 0.049 | 0.013 | 0.560 | 0.049 | 0.013 | 0.437 | 0.050 | 0.011 | 0.858 |
| spambase | High ICI | 0.040 | 0.035 | 0.432 | 0.041 | 0.023 | 0.865 | 0.039 | 0.015 | 0.677 | 0.040 | 0.011 | 0.975 |

10.3.2 Figures

Code to create the figure
plot_before_after <- ggplot() +
  geom_point(
    data = initial_points,
    mapping = aes(x = ici, y = kl, shape = result_type),
    size = 2
  ) +
  geom_segment(
    data = points_after_c,
    mapping = aes(
      x = ici, y = kl, xend = ici_end, yend = kl_end,
      colour = recalib, linetype = recalib
    ),
    arrow = arrow(length=unit(.2, 'cm'))
  ) +
  scale_shape_manual(
    NULL,
    values = c(
      "AUC*" = 19,
      "KL*" = 15,
      "High ICI" = 17
    )
  ) +
  ggh4x::facet_wrap2(~dataset, ncol = 5, scales = "free", axes = "all") +
  labs(x = "ICI", y = "KL Divergence") +
  scale_colour_manual("Recalibration", values = colour_recalib) +
  scale_linetype_discrete("Recalibration") +
  theme_paper()

ggsave(
  plot_before_after, file = "../figs/real-dataset-before-after-calib.pdf",
  width = 12, height = 5
)

plot_before_after
Figure 10.11: KL divergence and ICI before and after recalibration of the estimated scores, for the three selected models of each dataset.