10  Real-data Results

Note

This chapter examines the XGBoost results obtained on the 10 real datasets.

library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggh4x)
library(ggrepel)
library(rpart)
library(locfit)
locfit 1.5-9.10      2024-06-24

Attaching package: 'locfit'

The following object is masked from 'package:purrr':

    none
library(philentropy)
library(stringr)
library(purrr)

# Colours for train/validation/test
colour_samples <- c(
  "Train" = "#0072B2",
  "Validation" = "#009E73",
  "Calibration" = "#CC79A7",
  "Test" = "#D55E00"
)

colour_recalib <- c(
  "None" = "#88CCEE",
  "Platt" = "#44AA99",
  "Isotonic" = "#882255",
  "Beta" = "#D55E00"#,
  #"Locfit" = "firebrick3"
)
Definition of the theme_paper() function (for ggplot2 graphs)
#' Theme for ggplot2
#'
#' @param ... arguments passed to the theme function
#' @export
#' @importFrom ggplot2 element_rect element_text element_blank element_line unit
#'   rel
theme_paper <- function (...) {
  ggthemes::theme_base() +
    theme(
      plot.background = element_blank(),
      legend.background = element_rect(
        fill = "transparent", linetype="solid", colour ="black"),
      legend.position = "bottom",
      legend.direction = "horizontal",
      legend.box = "horizontal",
      legend.key = element_blank()
    )
}

10.1 Data

The list of datasets and the name of the target variable in each:

datasets <- tribble(
  ~name, ~target_name,
  "abalone", "Sex",
  "adult", "high_income",
  "bank", "y",
  "default", "default",
  "drybean", "is_dermason",
  "coupon", "y",
  "mushroom", "edible",
  "occupancy", "Occupancy",
  "winequality", "high_quality",
  "spambase", "is_spam"
)

The priors:

for (name in datasets$name) {
  # The Prior on the distribution of the scores
  load(str_c("../output/real-data/priors_", name, ".rda"))
}

The results:

files <- str_c("../output/real-data/xgb_resul_", datasets$name, ".rda")
# Loop through each file and load it with a new name
walk2(files, datasets$name, ~{
  loaded_object_name <- load(.x)
  new_name <- str_c("xgb_resul_", .y)
  assign(new_name, get(loaded_object_name), envir = .GlobalEnv)
})
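
As a quick sanity check (not part of the original pipeline), the renamed objects created by the loop above can be listed:

# All result objects should now be available in the global environment
ls(pattern = "^xgb_resul_")
# e.g., "xgb_resul_abalone" "xgb_resul_adult" ... "xgb_resul_winequality"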

10.2 Distribution of Scores

Graph functions
plot_bp_interest <- function(metrics_interest,
                             scores_hist_interest,
                             label,
                             recalib_method) {
  subtitle <- str_c(
    "Depth = ", metrics_interest$max_depth, ", ",
    "AUC = ", round(metrics_interest$AUC, 2), ", \n",
    "Brier = ", round(metrics_interest$brier, 2), ",",
    "ICI = ", round(metrics_interest$ici, 2), ", ",
    "KL = ", round(metrics_interest$KL_20_true_probas, 2)
  )
  
  if (recalib_method == "none") {
    plot(
      main = str_c(label, " (iter = ", metrics_interest$nb_iter,")"),
      scores_hist_interest$test,
      xlab = latex2exp::TeX("$\\hat{s}(x)$"),
      ylab = ""
    )
  } else if (recalib_method == "platt") {
    plot(
      main = str_c(label, " (iter = ", metrics_interest$nb_iter,")"),
      scores_hist_interest$test_c_platt,
      xlab = latex2exp::TeX("$\\hat{s}(x)$"),
      ylab = "",
      col = colour_recalib[["Platt"]]
    )
  } else if (recalib_method == "beta") {
    plot(
      main = str_c(label, " (iter = ", metrics_interest$nb_iter,")"),
      scores_hist_interest$test_c_iso,
      xlab = latex2exp::TeX("$\\hat{s}(x)$"),
      ylab = "",
      col = colour_recalib[["Beta"]]
    )
  } else if (recalib_method == "iso") {
    plot(
      main = str_c(label, " (iter = ", metrics_interest$nb_iter,")"),
      scores_hist_interest$test_c_iso,
      xlab = latex2exp::TeX("$\\hat{s}(x)$"),
      ylab = "",
      col = colour_recalib[["Isotonic"]]
    )
  } 
  mtext(side = 3, line = -0.25, adj = .5, subtitle, cex = .5)
}


plot_bp_xgb <- function(scores_hist_all, 
                        prior, 
                        metrics_xgb_all,
                        mediocre_ici_xgb,
                        paper_version = FALSE) {
  # The max_depth value associated with each element of scores_hist_all
  max_depth_val <- map_dbl(scores_hist_all, ~.x[[1]]$max_depth)
  
  # True Probabilities
  true_prob <- prior$scores_gamsel$scores_train
  
  for (recalib_method in c("none", "platt", "beta", "iso")) {
    
    i_method <- match(recalib_method, c("none", "platt", "beta", "iso"))
    recalib_method_lab <- c("None", "Platt", "Beta", "Isotonic")[i_method]
    
    # The metrics for the corresponding results, on the validation set
    metrics_xgb_current_valid <-
      metrics_xgb_all |>
      filter(
        sample == "Validation",
        recalib == "None"
      )
    # and on the test set
    metrics_xgb_current_test <-
      metrics_xgb_all |>
      filter(
        sample == "Test",
        recalib == recalib_method_lab
      )
    
    if (paper_version == FALSE) {
      hist(
        true_prob,
        breaks = seq(0, 1, by = .05),
        xlab = "p", ylab = "",
        main = "Prior Probabilities",
        xlim = c(0, 1)
      )
      mtext(
        side = 2, recalib_method_lab, line = 2.5, cex = 1,
        font.lab = 2
      )
      # Iterations of interest----
      ## Start of iterations
      scores_hist_start <- scores_hist_all[[1]][[1]]
      metrics_start <- metrics_xgb_current_test |>
        filter(
          nb_iter == scores_hist_start$nb_iter,
          max_depth == scores_hist_start$max_depth
        )
      
      plot_bp_interest(
        metrics_interest = metrics_start,
        scores_hist_interest = scores_hist_start,
        label = "Start",
        recalib_method = recalib_method
      )
      
      ## End of iterations
      scores_hist_end <- scores_hist_all[[1]][[length(scores_hist_all[[1]])]]
      metrics_end <- metrics_xgb_current_test |>
        filter(
          nb_iter == scores_hist_end$nb_iter,
          max_depth == scores_hist_end$max_depth
        )
      plot_bp_interest(
        metrics_interest = metrics_end,
        scores_hist_interest = scores_hist_end,
        label = "End",
        recalib_method = recalib_method
      )
    }
    ## Iteration with max AUC on validation set
    metrics_valid_auc_star <-
      metrics_xgb_current_valid |> arrange(desc(AUC)) |> dplyr::slice(1)
    nb_iter_auc <- metrics_valid_auc_star$nb_iter
    max_depth_auc_star <- metrics_valid_auc_star$max_depth
    i_max_depth_auc_star <- which(max_depth_val == max_depth_auc_star)
    
    metrics_max_auc <-
      metrics_xgb_current_test |>
      filter(nb_iter == !!nb_iter_auc, max_depth == max_depth_auc_star)
    # Locate, within the list for the selected max_depth, the element whose
    # nb_iter matches the iteration selected on the validation set
    ind_auc <- which(map_dbl(scores_hist_all[[i_max_depth_auc_star]], "nb_iter") == nb_iter_auc)
    scores_hist_max_auc <- scores_hist_all[[i_max_depth_auc_star]][[ind_auc]]
    plot_bp_interest(
      metrics_interest = metrics_max_auc,
      scores_hist_interest = scores_hist_max_auc,
      label = "AUC*",
      recalib_method = recalib_method
    )
    if (paper_version == TRUE) {
      mtext(
        side = 2, recalib_method_lab, line = 2.5, cex = 1,
        font.lab = 2
      )
    }
    
    ## Min Brier on validation set
    metrics_valid_brier_star <-
      metrics_xgb_current_valid |> arrange(brier) |> dplyr::slice(1)
    nb_iter_brier <- metrics_valid_brier_star$nb_iter
    max_depth_brier_star <- metrics_valid_brier_star$max_depth
    i_max_depth_brier_star <- which(max_depth_val == max_depth_brier_star)
    
    metrics_min_brier <-
      metrics_xgb_current_test |>
      filter(nb_iter == !!nb_iter_brier, max_depth == max_depth_brier_star)
    ind_brier <- which(map_dbl(scores_hist_all[[i_max_depth_brier_star]], "nb_iter") == nb_iter_brier)
    scores_hist_min_brier <- scores_hist_all[[i_max_depth_brier_star]][[ind_brier]]
    plot_bp_interest(
      metrics_interest = metrics_min_brier,
      scores_hist_interest = scores_hist_min_brier,
      label = "Brier*",
      recalib_method = recalib_method
    )
    
    ## Min ICI on validation set
    metrics_valid_ici_star <-
      metrics_xgb_current_valid |> arrange(ici) |> dplyr::slice(1)
    nb_iter_ici <-   metrics_valid_ici_star$nb_iter
    max_depth_ici_star <- metrics_valid_ici_star$max_depth
    i_max_depth_ici_star <- which(max_depth_val == max_depth_ici_star)
    
    metrics_min_ici <-
      metrics_xgb_current_test |>
      filter(nb_iter == !!nb_iter_ici, max_depth == max_depth_ici_star)
    ind_ici <- which(map_dbl(scores_hist_all[[i_max_depth_ici_star]], "nb_iter") == nb_iter_ici)
    scores_hist_min_ici <- scores_hist_all[[i_max_depth_ici_star]][[ind_ici]]
    plot_bp_interest(
      metrics_interest = metrics_min_ici,
      scores_hist_interest = scores_hist_min_ici,
      label = "ICI*",
      recalib_method = recalib_method
    )
    
    ## Min KL on validation set
    metrics_valid_kl_star <-
      metrics_xgb_current_valid |> arrange(KL_20_true_probas) |> dplyr::slice(1)
    nb_iter_kl <- metrics_valid_kl_star$nb_iter
    max_depth_kl_star <- metrics_valid_kl_star$max_depth
    i_max_depth_kl_star <- which(max_depth_val == max_depth_kl_star)
    
    metrics_min_kl <-
      metrics_xgb_current_test |>
      filter(nb_iter == !!nb_iter_kl, max_depth == max_depth_kl_star)
    ind_kl <- which(map_dbl(scores_hist_all[[i_max_depth_kl_star]], "nb_iter") == nb_iter_kl)
    scores_hist_min_kl <- scores_hist_all[[i_max_depth_kl_star]][[ind_kl]]
    plot_bp_interest(
      metrics_interest = metrics_min_kl,
      scores_hist_interest = scores_hist_min_kl,
      label = "KL*",
      recalib_method = recalib_method
    )
    
    ## Mediocre ICI on validation set
    identified_mici <-  mediocre_ici_xgb
    
    metrics_valid_mici_star <- metrics_xgb_current_valid |> 
      filter(ind == identified_mici$ind, nb_iter == identified_mici$nb_iter)
    nb_iter_mici <- metrics_valid_mici_star$nb_iter
    max_depth_mici_star <- metrics_valid_mici_star$max_depth
    i_max_depth_mici_star <- which(max_depth_val == max_depth_mici_star)
    
    metrics_mici <-
      metrics_xgb_current_test |>
      filter(nb_iter == !!nb_iter_mici, max_depth == max_depth_mici_star)
    ind_mici <- which(map_dbl(scores_hist_all[[i_max_depth_mici_star]], "nb_iter") == nb_iter_mici)
    scores_hist_mici <- scores_hist_all[[i_max_depth_mici_star]][[ind_mici]]
    plot_bp_interest(
      metrics_interest = metrics_mici,
      scores_hist_interest = scores_hist_mici,
      label = "Med. ICI",
      recalib_method = recalib_method
    )
  }
}


plot_bp_xgb_dataset <- function(xgb_resul, prior) {
  scores_hist_all <- xgb_resul$scores_hist
  
  metrics_xgb_all <- xgb_resul$metrics_simul |>
    mutate(
      sample = factor(
        sample,
        levels = c("train", "valid", "calib", "test"),
        labels = c("Train","Validation", "Calibration" ,"Test")
      ),
      recalib = factor(
        recalib,
        levels = c("none", "platt", "beta", "isotonic"),
        labels = c("None", "Platt", "Beta", "Isotonic")
      )
    )
  
  mediocre_ici_xgb <- 
    metrics_xgb_all |>
    filter(sample == "Validation", recalib == "None") |>
    # Select a model with mediocre calibration: among the configurations whose
    # ICI exceeds the average ICI by more than one standard deviation,
    mutate(
      mean_ici = mean(ici),
      sd_ici = sd(ici),
      upb_ici = mean_ici + sd_ici
    ) |> 
    filter(ici > upb_ici) |> 
    # keep the one with the highest AUC
    arrange(desc(AUC)) |>
    slice_head(n = 1) |> 
    select(ind, nb_iter, recalib) |>
    mutate(result_type = "mediocre_ici")
  
  plot_bp_xgb(
    scores_hist_all = scores_hist_all, 
    prior = prior, 
    metrics_xgb_all = metrics_xgb_all,
    mediocre_ici_xgb = mediocre_ici_xgb)
}
Code
par(mfrow = c(4,8), mar = c(4.1, 4, 3.5, 1.5))
plot_bp_xgb_dataset(xgb_resul = xgb_resul_abalone, prior = priors_abalone)
Figure 10.1: Distribution of scores on the test set (abalone)
Code
par(mfrow = c(4,8), mar = c(4.1, 4, 3.5, 1.5))
plot_bp_xgb_dataset(xgb_resul = xgb_resul_adult, prior = priors_adult)
Figure 10.2: Distribution of scores on the test set (adult)
Code
par(mfrow = c(4,8), mar = c(4.1, 4, 3.5, 1.5))
plot_bp_xgb_dataset(xgb_resul = xgb_resul_bank, prior = priors_bank)
Figure 10.3: Distribution of scores on the test set (bank)
Code
par(mfrow = c(4,8), mar = c(4.1, 4, 3.5, 1.5))
plot_bp_xgb_dataset(xgb_resul = xgb_resul_default, prior = priors_default)
Figure 10.4: Distribution of scores on the test set (default)
Code
par(mfrow = c(4,8), mar = c(4.1, 4, 3.5, 1.5))
plot_bp_xgb_dataset(xgb_resul = xgb_resul_drybean, prior = priors_drybean)
Figure 10.5: Distribution of scores on the test set (drybean)
Code
par(mfrow = c(4,8), mar = c(4.1, 4, 3.5, 1.5))
plot_bp_xgb_dataset(xgb_resul = xgb_resul_coupon, prior = priors_coupon)
Figure 10.6: Distribution of scores on the test set (coupon)
Code
par(mfrow = c(4,8), mar = c(4.1, 4, 3.5, 1.5))
plot_bp_xgb_dataset(xgb_resul = xgb_resul_mushroom, prior = priors_mushroom)
Figure 10.7: Distribution of scores on the test set (mushroom)
Code
par(mfrow = c(4,8), mar = c(4.1, 4, 3.5, 1.5))
plot_bp_xgb_dataset(xgb_resul = xgb_resul_occupancy, prior = priors_occupancy)
Figure 10.8: Distribution of scores on the test set (occupancy)
Code
par(mfrow = c(4,8), mar = c(4.1, 4, 3.5, 1.5))
plot_bp_xgb_dataset(xgb_resul = xgb_resul_winequality, prior = priors_winequality)
Figure 10.9: Distribution of scores on the test set (winequality)
Code
par(mfrow = c(4,8), mar = c(4.1, 4, 3.5, 1.5))
plot_bp_xgb_dataset(xgb_resul = xgb_resul_spambase, prior = priors_spambase)
Figure 10.10: Distribution of scores on the test set (spambase)

10.3 Before vs. After Recalibration

Let us now examine how the ICI and the KL divergence evolve before and after recalibration of the scores with the three methods (Platt scaling, Beta calibration, and isotonic regression). We consider three models for each dataset: one whose hyperparameters were selected by maximizing the AUC (AUC*), one whose hyperparameters were selected by minimizing the Kullback-Leibler divergence between the distribution of the estimated scores and the prior distribution (KL*), and one with a relatively high ICI, i.e., with poor calibration (High ICI).
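
To make the comparison concrete, here is a minimal, self-contained sketch of the three recalibration maps, fitted on a calibration set and applied to test scores. This is not the exact pipeline used to produce the results loaded above: the scores and outcomes are simulated for illustration, and Beta calibration is assumed to be provided by the betacal package.

# Illustrative sketch of the three recalibration maps (simulated data)
set.seed(123)
n <- 1000
s_calib <- runif(n)                                  # scores on the calibration set
d_calib <- rbinom(n, 1, plogis(4 * (s_calib - .5)))  # observed binary outcomes
s_test  <- runif(n)                                  # scores on the test set

# Platt scaling: logistic regression of the outcome on the score
platt_fit <- glm(d_calib ~ s_calib, family = binomial())
s_test_platt <- predict(
  platt_fit, newdata = data.frame(s_calib = s_test), type = "response"
)

# Isotonic regression: monotone step function fitted on the calibration set
iso_fit <- isoreg(x = s_calib, y = d_calib)
s_test_iso <- as.stepfun(iso_fit)(s_test)

# Beta calibration (assuming the betacal package is installed)
# beta_fit    <- betacal::beta_calibration(s_calib, d_calib, parameters = "abm")
# s_test_beta <- betacal::beta_predict(s_test, beta_fit)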

Functions to extract the metrics from the results
#' Function to extract the results for models of interest
gather_metrics <- function(xgb_resul) {
  
  metrics_xgb_all <- xgb_resul$metrics_simul |>
    mutate(
      sample = factor(
        sample,
        levels = c("train", "valid", "calib", "test"),
        labels = c("Train","Validation", "Calibration" ,"Test")
      ),
      recalib = factor(
        recalib,
        levels = c("none", "platt", "beta", "isotonic"),
        labels = c("None", "Platt", "Beta", "Isotonic")
      )
    )
  
  # Identify the model with the fewest boosting iterations on the validation
  # set, when the scores are not recalibrated
  smallest_xgb <-
    metrics_xgb_all |>
    filter(sample == "Validation", recalib == "None") |>
    arrange(nb_iter) |>
    slice_head(n = 1) |>
    select(ind, nb_iter, recalib) |>
    mutate(result_type = "smallest")
  
  # Identify the model with the most boosting iterations
  largest_xgb <-
    metrics_xgb_all |>
    filter(sample == "Validation", recalib == "None") |>
    arrange(desc(nb_iter)) |>
    slice_head(n = 1) |>
    select(ind, nb_iter, recalib) |>
    mutate(result_type = "largest")
  
  # Identify the model with the highest AUC on the validation set
  highest_auc_xgb <-
    metrics_xgb_all |>
    filter(sample == "Validation", recalib == "None") |>
    arrange(desc(AUC)) |>
    slice_head(n = 1) |>
    select(ind, nb_iter, recalib) |>
    mutate(result_type = "largest_auc")
  
  # Identify the model with the lowest Brier score
  lowest_brier_xgb <-
    metrics_xgb_all |>
    filter(sample == "Validation", recalib == "None") |>
    arrange(brier) |>
    slice_head(n = 1) |>
    select(ind, nb_iter, recalib) |>
    mutate(result_type = "lowest_brier")
  
  # Identify the model with the lowest ICI
  lowest_ici_xgb <-
    metrics_xgb_all |>
    filter(sample == "Validation", recalib == "None") |>
    arrange(ici) |>
    slice_head(n = 1) |>
    select(ind, nb_iter, recalib) |>
    mutate(result_type = "lowest_ici")
  
  # Identify the model with the lowest KL divergence
  lowest_kl_xgb <-
    metrics_xgb_all |>
    filter(sample == "Validation", recalib == "None") |>
    arrange(KL_20_true_probas) |>
    slice_head(n = 1) |>
    select(ind, nb_iter, recalib) |>
    mutate(result_type = "lowest_kl")
  
  mediocre_ici_xgb <- 
  metrics_xgb_all |>
  filter(sample == "Validation", recalib == "None") |>
  mutate(
    mean_ici = mean(ici),
    sd_ici = sd(ici),
    upb_ici = mean_ici + sd_ici,
  ) |> 
  filter(ici >  upb_ici) |> 
  arrange(desc(AUC)) |>
  slice_head(n = 1) |> 
  select(ind, nb_iter, recalib) |>
  mutate(result_type = "mediocre_ici")
  
  # Merge these
  models_of_interest_xgb <-
    smallest_xgb |>
    bind_rows(largest_xgb) |>
    bind_rows(highest_auc_xgb) |>
    bind_rows(lowest_brier_xgb) |>
    bind_rows(lowest_ici_xgb) |>
    bind_rows(lowest_kl_xgb) |> 
    bind_rows(mediocre_ici_xgb)
  
  models_of_interest_metrics <- NULL
  for (recalibration_method in c("None", "Platt", "Beta", "Isotonic")) {
    # Add metrics now
    models_of_interest_metrics <-
      models_of_interest_metrics |>
      bind_rows(
        models_of_interest_xgb |> select(-recalib) |>
          left_join(
            metrics_xgb_all |>
              filter(
                recalib == recalibration_method,
                sample %in% c("Validation", "Test")
              ),
            by = c("ind", "nb_iter"),
            relationship = "many-to-many" # one row per sample (Validation, Test)
          )
      )
  }
  
  models_of_interest_metrics <-
    models_of_interest_metrics |>
    mutate(
      result_type = factor(
        result_type,
        levels = c(
          "smallest", "largest", "lowest_mse", "largest_auc",
          "lowest_brier", "lowest_ici", "lowest_kl", "mediocre_ici"),
        labels = c(
          "Smallest", "Largest", "MSE*", "AUC*",
          "Brier*", "ICI*", "KL*", "High ICI"
        )
      )
    )
  models_of_interest_metrics
}

#' Function to format results for selected models for later use in ggplot2
get_data_plot_before_after_recalib <- function(xgb_resul, dataset_name) {
  models_of_interest_metrics <- gather_metrics(xgb_resul = xgb_resul)
  
  table_models_interest <- 
    models_of_interest_metrics |> 
    filter(sample == "Test") |> 
    select(
      recalib, sample, result_type, 
      AUC, brier, ici, kl = KL_20_true_probas
    ) |> 
    filter(
      result_type %in% c("AUC*", "KL*", "High ICI")
    ) |> 
    mutate(dataset = dataset_name)
  
  
  initial_points <- table_models_interest |> 
    filter(recalib == "None") |> 
    mutate(dataset = dataset_name)
  
  points_after_c <- initial_points |> 
    select(result_type, ici, kl) |> 
    left_join(
      table_models_interest |> 
        filter(recalib %in% c("Platt", "Beta", "Isotonic")) |> 
        select(recalib, result_type, ici, kl) |> 
        rename(ici_end = ici, kl_end = kl),
      by = "result_type",
      relationship = "many-to-many" # (Platt, Isotonic and Beta)
    ) |> 
    mutate(dataset = dataset_name)
  
  list(
    table_models_interest = table_models_interest,
    initial_points = initial_points,
    points_after_c = points_after_c
  )
}


# The models of interest for all datasets
data_plot_before_after <- map2(
  .x = str_c("xgb_resul_", datasets$name), 
  .y = datasets$name, 
  .f = ~get_data_plot_before_after_recalib(xgb_resul = get(.x), dataset_name = .y)
)

# The metrics for these models
table_models_interest <- 
  map(data_plot_before_after, "table_models_interest") |> 
  list_rbind()

# The metrics for the three models of interest in the subsequent plot
# before recalibration
initial_points <- 
  map(data_plot_before_after, "initial_points") |> 
  list_rbind()

# and after recalibration
points_after_c <- 
  map(data_plot_before_after, "points_after_c") |> 
  list_rbind()

10.3.1 Tables

Display code to create the table
digits <- 2

table_models_interest |> 
  relocate(dataset, .before = recalib) |> 
  select(-sample, -AUC) |> 
  pivot_longer(cols = c(brier, ici, kl)) |> 
  pivot_wider(names_from = c(recalib, name), values_from = value) |> 
  knitr::kable(
    align = str_c("l", str_c(rep("c", 3*4), collapse = ""), collapse = ""),
    escape = FALSE, booktabs = T, digits = 3, format = "markdown",
    col.names = c(
      "Dataset", "Optim.",
      rep(c("BS", "ICI", "KL"), 4)
    )
  ) |> 
  kableExtra::collapse_rows(columns = 1, valign = "top") |> 
  kableExtra::add_header_above(
    c(" " = 2,
      "None" = 3,
      "Platt" = 3,
       "Beta" = 3,
      "Isotonic" = 3
    )
  ) |> 
  kableExtra::scroll_box(fixed_thead = TRUE, height = "500px")
Table 10.1: Performance and calibration metrics (Brier score, Integrated Calibration Index, Kullback-Leibler divergence) computed on the test set, on scores returned by the model (column ‘None’), and on scores recalibrated using Platt scaling (column ‘Platt’), Beta calibration (column ‘Beta’), or isotonic regression (column ‘Isotonic’)
| Dataset | Optim. | BS (None) | ICI (None) | KL (None) | BS (Platt) | ICI (Platt) | KL (Platt) | BS (Beta) | ICI (Beta) | KL (Beta) | BS (Isotonic) | ICI (Isotonic) | KL (Isotonic) |
|:---|:---|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|
| abalone | AUC* | 0.214 | 0.069 | 0.397 | 0.209 | 0.044 | 0.411 | 0.208 | 0.042 | 0.492 | 0.206 | 0.031 | 1.169 |
| abalone | KL* | 0.210 | 0.057 | 0.320 | 0.208 | 0.052 | 0.475 | 0.208 | 0.048 | 0.453 | 0.205 | 0.037 | 1.239 |
| abalone | High ICI | 0.258 | 0.180 | 4.513 | 0.219 | 0.075 | 0.383 | 0.216 | 0.057 | 0.074 | 0.214 | 0.033 | 0.770 |
| adult | AUC* | 0.090 | 0.008 | 0.325 | 0.092 | 0.039 | 0.461 | 0.090 | 0.008 | 0.328 | 0.090 | 0.008 | 0.639 |
| adult | KL* | 0.102 | 0.036 | 0.092 | 0.101 | 0.039 | 0.301 | 0.100 | 0.015 | 0.288 | 0.100 | 0.008 | 0.680 |
| adult | High ICI | 0.100 | 0.045 | 0.641 | 0.102 | 0.061 | 1.475 | 0.096 | 0.012 | 0.295 | 0.096 | 0.011 | 0.514 |
| bank | AUC* | 0.062 | 0.017 | 0.485 | 0.066 | 0.047 | 0.650 | 0.062 | 0.009 | 0.489 | 0.062 | 0.009 | 0.594 |
| bank | KL* | 0.070 | 0.039 | 0.062 | 0.071 | 0.037 | 0.441 | 0.069 | 0.010 | 0.315 | 0.069 | 0.004 | 0.734 |
| bank | High ICI | 0.068 | 0.040 | 0.437 | 0.070 | 0.040 | 0.496 | 0.067 | 0.012 | 0.453 | 0.067 | 0.005 | 0.546 |
| default | AUC* | 0.128 | 0.025 | 0.353 | 0.129 | 0.036 | 0.799 | 0.128 | 0.018 | 0.275 | 0.128 | 0.014 | 0.581 |
| default | KL* | 0.129 | 0.009 | 0.349 | 0.130 | 0.027 | 0.698 | 0.129 | 0.016 | 0.234 | 0.129 | 0.014 | 0.774 |
| default | High ICI | 0.142 | 0.095 | 1.498 | 0.133 | 0.036 | 1.317 | 0.131 | 0.015 | 0.660 | 0.131 | 0.013 | 0.892 |
| drybean | AUC* | 0.029 | 0.011 | 0.687 | 0.031 | 0.025 | 0.871 | 0.028 | 0.008 | 0.650 | 0.029 | 0.009 | 0.870 |
| drybean | KL* | 0.036 | 0.042 | 0.370 | 0.040 | 0.039 | 0.843 | 0.037 | 0.030 | 0.680 | 0.034 | 0.019 | 0.731 |
| drybean | High ICI | 0.036 | 0.070 | 2.149 | 0.034 | 0.026 | 0.832 | 0.032 | 0.021 | 0.742 | 0.031 | 0.012 | 0.771 |
| coupon | AUC* | 0.158 | 0.041 | 1.625 | 0.158 | 0.038 | 0.879 | 0.157 | 0.028 | 0.540 | 0.158 | 0.025 | 1.137 |
| coupon | KL* | 0.192 | 0.038 | 0.048 | 0.190 | 0.025 | 0.194 | 0.190 | 0.021 | 0.132 | 0.191 | 0.022 | 0.659 |
| coupon | High ICI | 0.162 | 0.075 | 2.354 | 0.159 | 0.054 | 1.007 | 0.157 | 0.028 | 0.522 | 0.157 | 0.023 | 1.169 |
| mushroom | AUC* | 0.000 | 0.003 | 1.399 | 0.000 | 0.003 | 1.399 | 0.000 | 0.002 | 1.399 | 0.000 | 0.003 | 1.399 |
| mushroom | KL* | 0.016 | 0.063 | 0.616 | 0.010 | 0.020 | 1.291 | 0.010 | 0.019 | 1.284 | 0.006 | 0.003 | 1.332 |
| mushroom | High ICI | 0.010 | 0.038 | 0.750 | 0.006 | 0.022 | 1.315 | 0.006 | 0.018 | 1.315 | 0.001 | 0.003 | 1.399 |
| occupancy | AUC* | 0.007 | 0.005 | 1.064 | 0.006 | 0.006 | 1.175 | 0.007 | 0.006 | 1.081 | 0.007 | 0.006 | 1.069 |
| occupancy | KL* | 0.009 | 0.044 | 0.864 | 0.007 | 0.006 | 1.136 | 0.008 | 0.006 | 1.109 | 0.008 | 0.006 | 1.055 |
| occupancy | High ICI | 0.009 | 0.033 | 0.919 | 0.007 | 0.006 | 1.157 | 0.008 | 0.007 | 1.115 | 0.008 | 0.008 | 1.052 |
| winequality | AUC* | 0.153 | 0.110 | 4.927 | 0.143 | 0.047 | 1.862 | 0.137 | 0.017 | 1.090 | 0.136 | 0.027 | 1.872 |
| winequality | KL* | 0.170 | 0.057 | 0.118 | 0.166 | 0.040 | 0.418 | 0.166 | 0.026 | 0.365 | 0.166 | 0.018 | 1.009 |
| winequality | High ICI | 0.153 | 0.110 | 4.927 | 0.143 | 0.047 | 1.862 | 0.137 | 0.017 | 1.090 | 0.136 | 0.027 | 1.872 |
| spambase | AUC* | 0.032 | 0.011 | 0.844 | 0.035 | 0.023 | 1.141 | 0.032 | 0.014 | 0.708 | 0.034 | 0.007 | 1.109 |
| spambase | KL* | 0.053 | 0.055 | 0.260 | 0.049 | 0.013 | 0.560 | 0.049 | 0.013 | 0.437 | 0.050 | 0.011 | 0.858 |
| spambase | High ICI | 0.040 | 0.035 | 0.432 | 0.041 | 0.023 | 0.865 | 0.039 | 0.015 | 0.677 | 0.040 | 0.011 | 0.975 |

10.3.2 Figures

Code to create the figure
plot_before_after <- ggplot() +
  geom_point(
    data = initial_points,
    mapping = aes(x = ici, y = kl, shape = result_type),
    size = 2
  ) +
  geom_segment(
    data = points_after_c,
    mapping = aes(
      x = ici, y = kl, xend = ici_end, yend = kl_end,
      colour = recalib, linetype = recalib
    ),
    arrow = arrow(length=unit(.2, 'cm'))
  ) +
  scale_shape_manual(
    NULL,
    values = c(
      "AUC*" = 19,
      "KL*" = 15,
      "High ICI" = 17
    )
  ) +
  ggh4x::facet_wrap2(~dataset, ncol = 5, scales = "free", axes = "all") +
  labs(x = "ICI", y = "KL Divergence") +
  scale_colour_manual("Recalibration", values = colour_recalib) +
  scale_linetype_discrete("Recalibration") +
  theme_paper()

ggsave(
  plot_before_after, file = "../figs/real-dataset-before-after-calib.pdf",
  width = 12, height = 5
)

plot_before_after
Figure 10.11: KL divergence and ICI before and after recalibration of the estimated scores, for the three selected models of each dataset.