9  Estimations

Note

This chapter estimates models for the binary events from the datasets introduced in Chapter 7, using extreme gradient boosting. We perform a grid search over the hyperparameters; for each set of hyperparameters, we compute the scores estimated by the model and calculate performance, calibration, and divergence metrics. For the divergence metrics, we assume that the underlying probabilities follow a Beta distribution whose parameters were estimated in Chapter 7.
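
To fix ideas, the short sketch below draws the density of such a Beta prior. The shape values are hypothetical stand-ins for the Chapter 7 estimates (which are loaded later in this chapter, e.g., in the priors_<name> objects); only base R is used.

# Hypothetical shape parameters standing in for the Chapter 7 estimates
shape_1 <- 1.5
shape_2 <- 4
curve(
  dbeta(x, shape_1, shape_2), from = 0, to = 1,
  xlab = "True probability", ylab = "Density"
)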

library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(philentropy)
library(ranger)
library(xgboost)

Attaching package: 'xgboost'

The following object is masked from 'package:dplyr':

    slice
library(pbapply)
library(parallel)
library(locfit)
library(gam)
Loading required package: splines
Loading required package: foreach

Attaching package: 'foreach'

The following objects are masked from 'package:purrr':

    accumulate, when

Loaded gam 1.22-5
library(gamsel)
Loaded gamsel 1.8-5
# Colours for train/validation/calibration/test
colour_samples <- c(
  "Train" = "#0072B2",
  "Validation" = "#009E73",
  "Calibration" = "#CC79A7",
  "Test" = "#D55E00"
)

colour_recalib <- c(
  "None" = "#88CCEE",
  "Platt" = "#44AA99",
  "Isotonic" = "#882255",
  "Beta" = "#D55E00",
  "Locfit" = "firebrick3"
)

# Functions
source("../scripts/functions/real-data.R")
source("../scripts/functions/metrics.R")
Definition of the theme_paper() function (for ggplot2 graphs)
#' Theme for ggplot2
#'
#' @param ... arguments passed to the theme function
#' @export
#' @importFrom ggplot2 element_rect element_text element_blank element_line unit
#'   rel
theme_paper <- function (...) {
  ggthemes::theme_base() +
    theme(
      plot.background = element_blank(),
      legend.background = element_rect(
        fill = "transparent", linetype="solid", colour ="black"),
      legend.position = "bottom",
      legend.direction = "horizontal",
      legend.box = "horizontal",
      legend.key = element_blank()
    )
}
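
As a quick illustration (assuming the ggthemes package is installed, since theme_paper() builds on ggthemes::theme_base()), the theme can be applied to any ggplot2 object:

# Illustrative use of theme_paper() on a dummy plot
ggplot(tibble(x = c(0, 1), y = c(0, 1)), aes(x = x, y = y)) +
  geom_line() +
  theme_paper()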

9.1 Functions

9.1.1 Recalibration

#' Recalibrates scores using a calibration set
#' 
#' @param obs_calib vector of observed events in the calibration set
#' @param obs_test vector of observed events in the test set
#' @param pred_calib vector of predicted probabilities in the calibration set
#' @param pred_test vector of predicted probabilities in the test set
#' @param method recalibration method (`"platt"` for Platt scaling, 
#'   `"isotonic"` for isotonic regression, `"beta"` for Beta calibration, 
#'   `"locfit"` for local regression)
#' @returns list of two elements: `tb_score_c_calib` (recalibrated scores on
#'   the calibration set) and `tb_score_c_test` (recalibrated scores on the
#'   test set)
recalibrate <- function(obs_calib,
                        obs_test,
                        pred_calib,
                        pred_test,
                        method = c("platt", "isotonic", "beta", "locfit")) {
  data_calib <- tibble(d = obs_calib, scores = pred_calib)
  data_test <- tibble(d = obs_test, scores = pred_test)
  
  if (method == "platt") {
    lr <- glm(d ~ scores, family = binomial(link = 'logit'), data = data_calib)
    score_c_calib <- predict(lr, newdata = data_calib, type = "response")
    score_c_test <- predict(lr, newdata = data_test, type = "response")
  } else if (method == "isotonic") {
    iso <- isoreg(x = data_calib$scores, y = data_calib$d)
    fit_iso <- as.stepfun(iso)
    score_c_calib <- fit_iso(data_calib$scores)
    score_c_test <- fit_iso(data_test$scores)
  } else if (method == "beta") {
    fit_beta <- betacal::beta_calibration(
      data_calib$scores, data_calib$d, parameters = "abm"
    )
    score_c_calib <- betacal::beta_predict(data_calib$scores, fit_beta)
    score_c_test <- betacal::beta_predict(data_test$scores, fit_beta)
  } else if (method == "locfit") {
    noise_scores <- data_calib$scores + rnorm(nrow(data_calib), 0, 0.01)
    noise_data_calib <- data_calib %>% mutate(scores = noise_scores)
    locfit_reg <- locfit(
      formula = d ~ lp(scores, nn = 0.15, deg = 0),
      kern = "rect", maxk = 200, data = noise_data_calib
    )
    score_c_calib <- predict(locfit_reg, newdata = data_calib)
    score_c_test <- predict(locfit_reg, newdata = data_test)  
  } else {
    stop("Unrecognized method: platt or isotonic only")
  }
  # Format results in tibbles:
  # For calibration set
  tb_score_c_calib <- tibble(
    d = obs_calib,
    p_u = pred_calib,
    p_c = score_c_calib
  )
  # For test set
  tb_score_c_test <- tibble(
    d = obs_test,
    p_u = pred_test,
    p_c = score_c_test
  )
  
  list(
    tb_score_c_calib = tb_score_c_calib,
    tb_score_c_test = tb_score_c_test
  )
}
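
A minimal sketch of how recalibrate() is meant to be called, on synthetic data (both the data and the distortion are made up for illustration): the raw scores deliberately overestimate the true probabilities, and Platt scaling pulls them back towards the observed frequencies.

set.seed(123)
p_true <- rbeta(1000, 2, 5)
d <- rbinom(1000, size = 1, prob = p_true)
# Distorted scores: systematically too high on the logit scale
s <- plogis(qlogis(p_true) + 1)
res <- recalibrate(
  obs_calib = d[1:500], obs_test = d[501:1000],
  pred_calib = s[1:500], pred_test = s[501:1000],
  method = "platt"
)
head(res$tb_score_c_test)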

9.1.2 Dispersion Metrics with a Beta Prior

Function dispersion_metrics_beta()
#' Computes the dispersion and divergence metrics for a vector of scores and
#' a Beta distribution
#'
#' @param shape_1 first parameter of the beta distribution
#' @param shape_2 second parameter of the beta distribution
#' @param scores predicted scores
#'
#' @returns
#' \itemize{
#'   \item \code{inter_quantile_25_75}: ratio of the inter-quantile range (25%-75%) of the scores to that of the Beta prior
#'   \item \code{inter_quantile_10_90}: ratio of the inter-quantile range (10%-90%) of the scores to that of the Beta prior
#'   \item \code{KL_10_true_probas}: KL divergence of predicted scores w.r.t. true probabilities, with 10 bins
#'   \item \code{KL_10_scores}: KL divergence of true probabilities w.r.t. predicted scores, with 10 bins
#'   \item \code{KL_20_true_probas}: KL divergence of predicted scores w.r.t. true probabilities, with 20 bins
#'   \item \code{KL_20_scores}: KL divergence of true probabilities w.r.t. predicted scores, with 20 bins
#'   \item \code{ind_cov}: absolute difference between the variance of the true probabilities and the covariance between true probabilities and predicted scores
#' }
dispersion_metrics_beta <- function(shape_1 = 1, shape_2 = 1, scores){

  # Inter-quantiles
  inter_q_80 <- diff(quantile(scores, c(.9, .1))) /
    diff(qbeta(c(.9, .1), shape_1, shape_2))
  inter_q_50 <- diff(quantile(scores, c(.75, .25))) /
    diff(qbeta(c(.75, .25), shape_1, shape_2))

  # KL divergences
  m <- 10 # Number of bins
  h_phat <- hist(scores, breaks = seq(0, 1, length = m + 1), plot = FALSE)
  h_p <- list(breaks = h_phat$breaks, mids = h_phat$mids)
  h_p$density = diff(pbeta(h_p$breaks, shape_1, shape_2))
  h_p$counts =  h_p$density*length(scores)

  # Densities
  h1 <- rbind(h_phat$density / m, h_p$density) # Reference: true probabilities
  h2 <- rbind(h_p$density, h_phat$density / m) # Reference: predicted scores
  KL_10_true_probas <- distance(
    h1, method = "kullback-leibler", unit = "log2", mute.message = TRUE)
  KL_10_scores <- distance(
    h2, method = "kullback-leibler", unit = "log2", mute.message = TRUE)


  m <- 20 # Number of bins
  h_phat <- hist(scores, breaks = seq(0, 1, length = m + 1), plot = FALSE)
  h_p <- list(breaks = h_phat$breaks, mids = h_phat$mids)
  h_p$density = diff(pbeta(h_p$breaks, shape_1, shape_2))
  h_p$counts =  h_p$density * length(scores)
  # Densities
  h1 <- rbind(h_phat$density / m, h_p$density) # Reference : true probabilities
  h2 <- rbind(h_p$density, h_phat$density / m) # Reference : predicted scores
  KL_20_true_probas <- distance(
    h1, method = "kullback-leibler", unit = "log2", mute.message = TRUE)
  KL_20_scores <- distance(
    h2, method = "kullback-leibler", unit = "log2", mute.message = TRUE)

  # Indicator of the difference between variance and covariance
  var_p <- shape_1 * shape_2 / ((shape_1 + shape_2)^2 * (shape_1 + shape_2 + 1))
  cov_p_phat <- cov(
    qbeta(
      rank(scores, ties.method = "average") / (1 + length(scores)),
      shape_1,
      shape_2),
    scores
  )
  ind_cov <- abs(cov_p_phat - var_p)

  # Collection
  dispersion_metrics <- tibble(
    "inter_quantile_25_75" = as.numeric(inter_q_50),
    "inter_quantile_10_90" = as.numeric(inter_q_80),
    "KL_10_true_probas" = as.numeric(KL_10_true_probas),
    "KL_10_scores" = as.numeric(KL_10_scores),
    "KL_20_true_probas" = as.numeric(KL_20_true_probas),
    "KL_20_scores" = as.numeric(KL_20_scores),
    "ind_cov" = ind_cov
  )

  dispersion_metrics
}
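
As a sanity check (with illustrative shape values), scores drawn from the prior itself should yield inter-quantile ratios close to 1 and KL divergences close to 0:

set.seed(123)
scores_sim <- rbeta(10000, shape1 = 2, shape2 = 5)
dispersion_metrics_beta(shape_1 = 2, shape_2 = 5, scores = scores_sim)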

disp_metrics_dataset <- function(prior, scores) {
  # GAMSEL prior
  shape_1 <- prior$mle_gamsel$estimate["shape1"]
  shape_2 <- prior$mle_gamsel$estimate["shape2"]

  # Divergence metrics
  dist_prior_gamsel <- dispersion_metrics_beta(
    shape_1 = shape_1, shape_2 = shape_2, scores = scores
  )

  dist_prior_gamsel |>
    mutate(
      prior = "gamsel", shape_1 = shape_1, shape_2 = shape_2
      )
}
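
The function only touches prior$mle_gamsel$estimate, a named vector with entries "shape1" and "shape2" (the format of the Chapter 7 MLE fits). A mock prior with made-up values is therefore enough to call it:

mock_prior <- list(
  mle_gamsel = list(estimate = c(shape1 = 2, shape2 = 5))
)
disp_metrics_dataset(prior = mock_prior, scores = rbeta(1000, 2, 5))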

9.1.3 Extreme Gradient Boosting

Function get_metrics_xgb_iter()
#' Get the metrics based on scores estimated at a given boosting iteration
#'
#' @param scores scores estimated at boosting iteration `nb_iter` (list with
#'   train, validation, calibration, and test scores, returned by
#'   `predict_score_iter()`)
#' @param prior prior obtained with `get_beta_fit()`
#' @param data_train train set
#' @param data_valid validation set
#' @param data_calib calibration set
#' @param data_test test set
#' @param prior_train train prior scores
#' @param prior_valid validation prior scores
#' @param prior_calib calibration prior scores
#' @param prior_test test prior scores
#' @param target_name name of the target variable
#' @param ind index of the grid search
#' @param nb_iter boosting iteration to consider
#' @param params hyperparameters to consider
#'
#' @returns A list with the following elements:
#'  - `ind`: index of the grid search
#'  - `nb_iter`: boosting iteration considered
#'  - `max_depth`: maximum depth of the trees
#'  - `tb_metrics`: performance / calibration / divergence metrics
#'  - `tb_prop_scores`: table with P(q1 < score < q2)
#'  - `tb_decomposition`: decomposition of expected losses
#'  - `scores_hist`: histograms of scores
get_metrics_xgb_iter <- function(scores,
                                 prior,
                                 data_train,
                                 data_valid,
                                 data_calib,
                                 data_test,
                                 prior_train,
                                 prior_valid,
                                 prior_calib,
                                 prior_test,
                                 target_name,
                                 ind,
                                 nb_iter,
                                 params) {

  scores_train <- scores$scores_train
  scores_valid <- scores$scores_valid
  scores_calib <- scores$scores_calib
  scores_test <- scores$scores_test
  
  # Recalibration
  # Platt scaling
  res_recalibration_platt <- recalibrate(
    obs_calib = data_calib |> dplyr::pull(!!target_name), 
    obs_test = data_test |> dplyr::pull(!!target_name), 
    pred_calib = scores_calib, 
    pred_test = scores_test, 
    method = "platt"
  )
  scores_c_platt_calib <- res_recalibration_platt$tb_score_c_calib$p_c
  scores_c_platt_test <- res_recalibration_platt$tb_score_c_test$p_c
  
  # Isotonic regression
  res_recalibration_iso <- recalibrate(
    obs_calib = data_calib |> dplyr::pull(!!target_name), 
    obs_test = data_test |> dplyr::pull(!!target_name), 
    pred_calib = scores_calib, 
    pred_test = scores_test, 
    method = "isotonic"
  )
  scores_c_iso_calib <- res_recalibration_iso$tb_score_c_calib$p_c
  scores_c_iso_test <- res_recalibration_iso$tb_score_c_test$p_c
  
  # Beta calibration
  res_recalibration_beta <- recalibrate(
    obs_calib = data_calib |> dplyr::pull(!!target_name), 
    obs_test = data_test |> dplyr::pull(!!target_name), 
    pred_calib = scores_calib, 
    pred_test = scores_test,
    method = "beta"
  )
  scores_c_beta_calib <- res_recalibration_beta$tb_score_c_calib$p_c
  scores_c_beta_test <- res_recalibration_beta$tb_score_c_test$p_c
  
  # Locfit regression
  res_recalibration_locfit <- recalibrate(
    obs_calib = data_calib |> dplyr::pull(!!target_name), 
    obs_test = data_test |> dplyr::pull(!!target_name), 
    pred_calib = scores_calib, 
    pred_test = scores_test,
    method = "locfit"
  )
  scores_c_locfit_calib <- res_recalibration_locfit$tb_score_c_calib$p_c
  scores_c_locfit_test <- res_recalibration_locfit$tb_score_c_test$p_c

  ## Metrics----
  ## Histogram of scores----
  breaks <- seq(0, 1, by = .05)
  scores_train_hist <- hist(scores_train, breaks = breaks, plot = FALSE)
  scores_calib_hist <- hist(scores_calib, breaks = breaks, plot = FALSE)
  scores_valid_hist <- hist(scores_valid, breaks = breaks, plot = FALSE)
  scores_test_hist <- hist(scores_test, breaks = breaks, plot = FALSE)
  scores_c_platt_calib_hist <- hist(scores_c_platt_calib, breaks = breaks, plot = FALSE)
  scores_c_platt_test_hist <- hist(scores_c_platt_test, breaks = breaks, plot = FALSE)
  scores_c_iso_calib_hist <- hist(scores_c_iso_calib, breaks = breaks, plot = FALSE)
  scores_c_iso_test_hist <- hist(scores_c_iso_test, breaks = breaks, plot = FALSE)
  scores_c_beta_calib_hist <- hist(scores_c_beta_calib, breaks = breaks, plot = FALSE)
  scores_c_beta_test_hist <- hist(scores_c_beta_test, breaks = breaks, plot = FALSE)
  scores_c_locfit_calib_hist <- hist(scores_c_locfit_calib, breaks = breaks, plot = FALSE)
  scores_c_locfit_test_hist <- hist(scores_c_locfit_test, breaks = breaks, plot = FALSE)
  
  scores_hist <- list(
    train = scores_train_hist,
    valid = scores_valid_hist,
    calib = scores_calib_hist,
    test = scores_test_hist,
    calib_c_platt = scores_c_platt_calib_hist,
    test_c_platt = scores_c_platt_test_hist,
    calib_c_iso = scores_c_iso_calib_hist,
    test_c_iso = scores_c_iso_test_hist,
    calib_c_beta = scores_c_beta_calib_hist,
    test_c_beta = scores_c_beta_test_hist,
    calib_c_locfit = scores_c_locfit_calib_hist,
    test_c_locfit = scores_c_locfit_test_hist,
    ind = ind,
    nb_iter = nb_iter,
    max_depth = params$max_depth
  )
  
  ## Estimation of P(q1 < score < q2)----
  prop_btw_q_h <- function(s, sample_name, recalib_name) {
    map(
      c(.1, .2, .3, .4),
      ~prop_btw_quantiles(s = s, q1 = .x)
    ) |>
      list_rbind() |>
      mutate(sample = sample_name, recalib = recalib_name)
  }
  
  proq_scores_train <- prop_btw_q_h(
    scores_train, sample_name = "train", recalib_name = "none"
  )
  proq_scores_valid <- prop_btw_q_h(
    scores_valid, sample_name = "valid", recalib_name = "none"
  )
  proq_scores_calib <- prop_btw_q_h(
    scores_calib, sample_name = "calib", recalib_name = "none"
  )
  proq_scores_test <- prop_btw_q_h(
    scores_test, sample_name = "test", recalib_name = "none"
  )
  proq_scores_c_platt_calib <- prop_btw_q_h(
    scores_c_platt_calib, sample_name = "calib", recalib_name = "platt"
  )
  proq_scores_c_platt_test <- prop_btw_q_h(
    scores_c_platt_test, sample_name = "test", recalib_name = "platt"
  )
  proq_scores_c_iso_calib <- prop_btw_q_h(
    scores_c_iso_calib, sample_name = "calib", recalib_name = "isotonic"
  )
  proq_scores_c_iso_test <- prop_btw_q_h(
    scores_c_iso_test, sample_name = "test", recalib_name = "isotonic"
  )
  proq_scores_c_beta_calib <- prop_btw_q_h(
    scores_c_beta_calib, sample_name = "calib", recalib_name = "beta"
  )
  proq_scores_c_beta_test <- prop_btw_q_h(
    scores_c_beta_test, sample_name = "test", recalib_name = "beta"
  )
  proq_scores_c_locfit_calib <- prop_btw_q_h(
    scores_c_locfit_calib, sample_name = "calib", recalib_name = "locfit"
  )
  proq_scores_c_locfit_test <- prop_btw_q_h(
    scores_c_locfit_test, sample_name = "test", recalib_name = "locfit"
  )

  ## Dispersion Metrics----
  disp_train <- disp_metrics_dataset(
    prior = prior, scores = scores_train
  ) |> 
    mutate(sample = "train", recalib = "none")
  
  disp_valid <- disp_metrics_dataset(
    prior = prior, scores = scores_valid
  ) |>
    mutate(sample = "valid", recalib = "none")
  
  disp_calib <- disp_metrics_dataset(
    prior = prior, scores = scores_calib
  ) |> 
    mutate(sample = "calib", recalib = "none")
  
  disp_test <- disp_metrics_dataset(
    prior = prior, scores = scores_test
  ) |> 
    mutate(sample = "test", recalib = "none")
  
  disp_c_platt_calib <- disp_metrics_dataset(
    prior = prior, scores = scores_c_platt_calib
  ) |>
    mutate(sample = "calib", recalib = "platt")
  
  disp_c_platt_test <- disp_metrics_dataset(
    prior = prior, scores = scores_c_platt_test
  ) |>
    mutate(sample = "test", recalib = "platt")
  
  disp_c_iso_calib <- disp_metrics_dataset(
    prior = prior, scores = scores_c_iso_calib
  ) |>
    mutate(sample = "calib", recalib = "isotonic")
  
  disp_c_iso_test <- disp_metrics_dataset(
    prior = prior, scores = scores_c_iso_test
  ) |>
    mutate(sample = "test", recalib = "isotonic")
  
  disp_c_beta_calib <- disp_metrics_dataset(
    prior = prior, scores = scores_c_beta_calib
  ) |> 
    mutate(sample = "calib", recalib = "beta")
  
  disp_c_beta_test <- disp_metrics_dataset(
    prior = prior, scores = scores_c_beta_test
  ) |> 
    mutate(sample = "test", recalib = "beta")
  
  disp_c_locfit_calib <- disp_metrics_dataset(
    prior = prior, scores = scores_c_locfit_calib
  ) |> 
    mutate(sample = "calib", recalib = "locfit")
  
  disp_c_locfit_test <- disp_metrics_dataset(
    prior = prior, scores = scores_c_locfit_test
  ) |> 
    mutate(sample = "test", recalib = "locfit")

  # Performance and Calibration Metrics----
  # We add a very small amount of noise to the predicted scores,
  # otherwise the local regression used by some metrics may fail
  scores_train_noise <- scores_train +
    runif(n = length(scores_train), min = 0, max = 0.01)
  scores_train_noise[scores_train_noise > 1] <- 1
  metrics_train <- compute_metrics(
    obs = data_train |> pull(!!target_name),
    scores = scores_train_noise, true_probas = prior_train
  ) |> mutate(sample = "train", recalib = "none")

  scores_valid_noise <- scores_valid +
    runif(n = length(scores_valid), min = 0, max = 0.01)
  scores_valid_noise[scores_valid_noise > 1] <- 1
  metrics_valid <- compute_metrics(
    obs = data_valid |> pull(!!target_name),
    scores = scores_valid_noise, true_probas = prior_valid
  ) |> mutate(sample = "valid", recalib = "none")
  
  scores_calib_noise <- scores_calib +
    runif(n = length(scores_calib), min = 0, max = 0.01)
  scores_calib_noise[scores_calib_noise > 1] <- 1
  metrics_calib <- compute_metrics(
    obs = data_calib |> pull(!!target_name),
    scores = scores_calib_noise, true_probas = prior_calib
  ) |> mutate(sample = "calib", recalib = "none")
  
  scores_test_noise <- scores_test +
    runif(n = length(scores_test), min = 0, max = 0.01)
  scores_test_noise[scores_test_noise > 1] <- 1
  metrics_test <- compute_metrics(
    obs = data_test |> pull(!!target_name),
    scores = scores_test_noise, true_probas = prior_test
  ) |> mutate(sample = "test", recalib = "none")
  
  # With recalibrated scores (platt)
  scores_c_platt_calib_noise <- scores_c_platt_calib +
    runif(n = length(scores_calib), min = 0, max = 0.01)
  scores_c_platt_calib_noise[scores_c_platt_calib_noise > 1] <- 1
  metrics_c_platt_calib <- compute_metrics(
    obs = data_calib |> pull(!!target_name),
    scores = scores_c_platt_calib_noise, true_probas = prior_calib
  ) |> mutate(sample = "calib", recalib = "platt")
  
  scores_c_platt_test_noise <- scores_c_platt_test +
    runif(n = length(scores_test), min = 0, max = 0.01)
  scores_c_platt_test_noise[scores_c_platt_test_noise > 1] <- 1
  metrics_c_platt_test <- compute_metrics(
    obs = data_test |> pull(!!target_name),
    scores = scores_c_platt_test_noise, true_probas = prior_test
  ) |> mutate(sample = "test", recalib = "platt")
  
  # With recalibrated scores (isotonic)
  scores_c_iso_calib_noise <- scores_c_iso_calib +
    runif(n = length(scores_c_iso_calib), min = 0, max = 0.01)
  scores_c_iso_calib_noise[scores_c_iso_calib_noise > 1] <- 1
  metrics_c_iso_calib <- compute_metrics(
    obs = data_calib |> pull(!!target_name),
    scores = scores_c_iso_calib_noise, true_probas = prior_calib
  ) |> mutate(sample = "calib", recalib = "isotonic")
  
  scores_c_iso_test_noise <- scores_c_iso_test +
    runif(n = length(scores_c_iso_test), min = 0, max = 0.01)
  scores_c_iso_test_noise[scores_c_iso_test_noise > 1] <- 1
  metrics_c_iso_test <- compute_metrics(
    obs = data_test |> pull(!!target_name),
    scores = scores_c_iso_test_noise, true_probas = prior_test
  ) |> mutate(sample = "test", recalib = "isotonic")
  
  # With recalibrated scores (beta)
  scores_c_beta_calib_noise <- scores_c_beta_calib +
    runif(n = length(scores_c_beta_calib), min = 0, max = 0.01)
  scores_c_beta_calib_noise[scores_c_beta_calib_noise > 1] <- 1
  metrics_c_beta_calib <- compute_metrics(
    obs = data_calib |> pull(!!target_name),
    scores = scores_c_beta_calib_noise, true_probas = prior_calib
  ) |> mutate(sample = "calib", recalib = "beta")
  
  scores_c_beta_test_noise <- scores_c_beta_test +
    runif(n = length(scores_c_beta_test), min = 0, max = 0.01)
  scores_c_beta_test_noise[scores_c_beta_test_noise > 1] <- 1
  metrics_c_beta_test <- compute_metrics(
    obs = data_test |> pull(!!target_name),
    scores = scores_c_beta_test_noise, true_probas = prior_test
  ) |> mutate(sample = "test", recalib = "beta")
  
  # With recalibrated scores (locfit)
  scores_c_locfit_calib_noise <- scores_c_locfit_calib +
    runif(n = length(scores_c_locfit_calib), min = 0, max = 0.01)
  scores_c_locfit_calib_noise[scores_c_locfit_calib_noise > 1] <- 1
  metrics_c_locfit_calib <- compute_metrics(
    obs = data_calib |> pull(!!target_name),
    scores = scores_c_locfit_calib_noise, true_probas = prior_calib
  ) |> mutate(sample = "calib", recalib = "locfit")
  
  scores_c_locfit_test_noise <- scores_c_locfit_test +
    runif(n = length(scores_c_locfit_test), min = 0, max = 0.01)
  scores_c_locfit_test_noise[scores_c_locfit_test_noise > 1] <- 1
  metrics_c_locfit_test <- compute_metrics(
    obs = data_test |> pull(!!target_name),
    scores = scores_c_locfit_test_noise, true_probas = prior_test
  ) |> mutate(sample = "test", recalib = "locfit")
  
  # Decomposition of expected losses
  # Platt
  decomposition_platt_calib <- decomposition_metrics(
    obs = data_calib |> pull(!!target_name), scores = scores_calib, 
    calibrated_scores = scores_c_platt_calib, true_probas = prior_calib
  ) |> mutate(sample = "calib", recalib = "platt")
  
  decomposition_platt_test <- decomposition_metrics(
    obs = data_test |> pull(!!target_name), scores = scores_test, 
    calibrated_scores = scores_c_platt_test, true_probas = prior_test
  ) |> mutate(sample = "test", recalib = "platt")
  
  # Isotonic
  decomposition_iso_calib <- decomposition_metrics(
    obs = data_calib |> pull(!!target_name), scores = scores_calib, 
    calibrated_scores = scores_c_iso_calib, true_probas = prior_calib
  ) |> mutate(sample = "calib", recalib = "iso")
  
  decomposition_iso_test <- decomposition_metrics(
    obs = data_test |> pull(!!target_name), scores = scores_test, 
    calibrated_scores = scores_c_iso_test, true_probas = prior_test
  ) |> mutate(sample = "test", recalib = "iso")
  
  # Beta
  decomposition_beta_calib <- decomposition_metrics(
    obs = data_calib |> pull(!!target_name), scores = scores_calib, 
    calibrated_scores = scores_c_beta_calib, true_probas = prior_calib
  ) |> mutate(sample = "calib", recalib = "beta")
  
  decomposition_beta_test <- decomposition_metrics(
    obs = data_test |> pull(!!target_name), scores = scores_test, 
    calibrated_scores = scores_c_beta_test, true_probas = prior_test
  ) |> mutate(sample = "test", recalib = "beta")
  
  # Locfit
  decomposition_locfit_calib <- decomposition_metrics(
    obs = data_calib |> pull(!!target_name), scores = scores_calib, 
    calibrated_scores = scores_c_locfit_calib, true_probas = prior_calib
  ) |> mutate(sample = "calib", recalib = "locfit")
  
  decomposition_locfit_test <- decomposition_metrics(
    obs = data_test |> pull(!!target_name), scores = scores_test, 
    calibrated_scores = scores_c_locfit_test, true_probas = prior_test
  ) |> mutate(sample = "test", recalib = "locfit")
  
  tb_metrics <- metrics_train |>
    bind_rows(metrics_valid) |>
    bind_rows(metrics_calib) |>
    bind_rows(metrics_test) |>
    bind_rows(metrics_c_platt_calib) |>
    bind_rows(metrics_c_platt_test) |>
    bind_rows(metrics_c_iso_calib) |>
    bind_rows(metrics_c_iso_test) |>
    bind_rows(metrics_c_beta_calib) |>
    bind_rows(metrics_c_beta_test) |>
    bind_rows(metrics_c_locfit_calib) |>
    bind_rows(metrics_c_locfit_test) |>
    left_join(
      disp_train |>
        bind_rows(disp_valid) |> 
        bind_rows(disp_calib) |> 
        bind_rows(disp_test) |> 
        bind_rows(disp_c_platt_calib) |> 
        bind_rows(disp_c_platt_test) |> 
        bind_rows(disp_c_iso_calib) |> 
        bind_rows(disp_c_iso_test) |> 
        bind_rows(disp_c_beta_calib) |> 
        bind_rows(disp_c_beta_test) |> 
        bind_rows(disp_c_locfit_calib) |> 
        bind_rows(disp_c_locfit_test),
      by = c("sample", "recalib")
    ) |>
    mutate(ind = ind, nb_iter = nb_iter, max_depth = params$max_depth) 

  tb_prop_scores <- proq_scores_train |>
    bind_rows(proq_scores_valid) |>
    bind_rows(proq_scores_calib) |>
    bind_rows(proq_scores_test) |>
    bind_rows(proq_scores_c_platt_calib) |>
    bind_rows(proq_scores_c_platt_test) |>
    bind_rows(proq_scores_c_iso_calib) |>
    bind_rows(proq_scores_c_iso_test) |>
    bind_rows(proq_scores_c_beta_calib) |>
    bind_rows(proq_scores_c_beta_test) |>
    bind_rows(proq_scores_c_locfit_calib) |>
    bind_rows(proq_scores_c_locfit_test) |>
    mutate(ind = ind, nb_iter = nb_iter, max_depth = params$max_depth)
  
  tb_decomposition_loss <- decomposition_platt_calib |>
    bind_rows(decomposition_platt_test) |>
    bind_rows(decomposition_iso_calib) |>
    bind_rows(decomposition_iso_test) |>
    bind_rows(decomposition_beta_calib) |>
    bind_rows(decomposition_beta_test) |>
    bind_rows(decomposition_locfit_calib) |>
    bind_rows(decomposition_locfit_test) |>
    mutate(ind = ind, nb_iter = nb_iter, max_depth = params$max_depth)
  
  list(
    ind = ind,                         # index for grid
    nb_iter = nb_iter,                 # number of boosting iterations
    max_depth = params$max_depth,      # max depth of used trees
    tb_metrics = tb_metrics,           # table with performance/calib/divergence
    tb_prop_scores = tb_prop_scores,   # table with P(q1 < score < q2)
    tb_decomposition = tb_decomposition_loss, # decomposition of expected losses
    scores_hist = scores_hist          # histograms of scores
  )
}
Function predict_score_iter()
#' Predicts the scores at a given iteration of the XGB model
#'
#' @param fit_xgb estimated XGB model
#' @param tb_train_xgb train set
#' @param tb_valid_xgb validation set
#' @param tb_calib_xgb calibration set
#' @param tb_test_xgb test set
#' @param nb_iter boosting iteration to consider
#'
#' @returns A list with four elements: `scores_train`, `scores_valid`,
#'  `scores_calib`, and `scores_test`, which contain the scores estimated on
#'  the train, validation, calibration, and test sets, resp.
predict_score_iter <- function(fit_xgb,
                               tb_train_xgb,
                               tb_valid_xgb,
                               tb_calib_xgb,
                               tb_test_xgb,
                               nb_iter) {

  ## Predicted scores----
  scores_train <- predict(fit_xgb, tb_train_xgb, iterationrange = c(1, nb_iter))
  scores_valid <- predict(fit_xgb, tb_valid_xgb, iterationrange = c(1, nb_iter))
  scores_calib <- predict(fit_xgb, tb_calib_xgb, iterationrange = c(1, nb_iter))
  scores_test <- predict(fit_xgb, tb_test_xgb, iterationrange = c(1, nb_iter))

  list(
    scores_train = scores_train,
    scores_valid = scores_valid,
    scores_calib = scores_calib,
    scores_test = scores_test
  )
}
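A minimal sketch of predict_score_iter() on synthetic data, reusing a single DMatrix for the four samples (illustration only; the real pipeline builds separate matrices in simul_xgb_helper()):

set.seed(123)
X <- matrix(rnorm(600), ncol = 3)
y <- rbinom(200, size = 1, prob = plogis(X[, 1]))
d_xgb <- xgb.DMatrix(data = X, label = y)
fit <- xgb.train(
  params = list(objective = "binary:logistic", max_depth = 2, eta = .3, nthread = 1),
  data = d_xgb, nrounds = 10, verbose = 0
)
predict_score_iter(
  fit_xgb = fit,
  tb_train_xgb = d_xgb, tb_valid_xgb = d_xgb,
  tb_calib_xgb = d_xgb, tb_test_xgb = d_xgb,
  nb_iter = 5
) |> str()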
Function simul_xgb_helper()
#' Fit an XGB and returns metrics based on scores. The divergence metrics are
#' obtained using the prior distributions.
#'
#' @param data_train train set
#' @param data_valid validation set
#' @param data_calib calibration set
#' @param data_test test set
#' @param prior_train train prior scores
#' @param prior_valid validation prior scores
#' @param prior_calib calibration prior scores
#' @param prior_test test prior scores
#' @param target_name name of the target variable
#' @param params tibble with hyperparameters for the current estimation
#' @param prior prior obtained with `get_beta_fit()`
#'
#' @returns A list with 4 elements:
#'  - `tb_metrics`: performance / calibration / divergence metrics
#'  - `tb_prop_scores`: table with P(q1 < score < q2)
#'  - `tb_decomposition_scores`: decomposition of expected losses
#'  - `scores_hist`: histograms of scores
simul_xgb_helper <- function(data_train,
                             data_valid,
                             data_calib,
                             data_test,
                             prior_train,
                             prior_valid,
                             prior_calib,
                             prior_test,
                             target_name,
                             params,
                             prior) {

  ## Format data for xgboost----
  tb_train_xgb <- xgb.DMatrix(
    data = data_train |> dplyr::select(-!!target_name) |> as.matrix(),
    label = data_train |> dplyr::pull(!!target_name) |> as.matrix()
  )
  tb_valid_xgb <- xgb.DMatrix(
    data = data_valid |> dplyr::select(-!!target_name) |> as.matrix(),
    label = data_valid |> dplyr::pull(!!target_name) |> as.matrix()
  )
  tb_calib_xgb <- xgb.DMatrix(
    data = data_calib |> dplyr::select(-!!target_name) |> as.matrix(),
    label = data_calib |> dplyr::pull(!!target_name) |> as.matrix()
  )
  tb_test_xgb <- xgb.DMatrix(
    data = data_test |> dplyr::select(-!!target_name) |> as.matrix(),
    label = data_test |> dplyr::pull(!!target_name) |> as.matrix()
  )
  # Parameters for the algorithm
  param <- list(
    max_depth = params$max_depth, # Note: root node is indexed 0
    eta = params$eta,
    nthread = 1,
    objective = "binary:logistic",
    eval_metric = "auc"
  )
  watchlist <- list(train = tb_train_xgb, eval = tb_valid_xgb)
  ## Estimation----
  fit_xgb <- xgb.train(
    param, tb_train_xgb,
    nrounds = params$nb_iter_total,
    watchlist,
    verbose = 0
  )

  # First, we estimate the scores at each boosting iteration.
  # As xgb.DMatrix objects cannot easily be serialised, we estimate
  # these scores sequentially, without parallelism...
  scores_iter <- vector(mode = "list", length = params$nb_iter_total)
  for (i_iter in 1:params$nb_iter_total) {
    scores_iter[[i_iter]] <- predict_score_iter(
      fit_xgb = fit_xgb,
      tb_train_xgb = tb_train_xgb,
      tb_valid_xgb = tb_valid_xgb,
      tb_calib_xgb = tb_calib_xgb,
      tb_test_xgb = tb_test_xgb,
      nb_iter = i_iter)
  }

  # Then, to compute the metrics, which is slower, we use parallelism

  ncl <- detectCores() - 1
  cl <- makeCluster(ncl)
  clusterEvalQ(cl, {
    library(tidyverse)
    library(locfit)
    library(philentropy)
    library(betacal)
  }) |>
    invisible()

  clusterExport(cl, c(
    "scores_iter", "prior", "data_train", "data_valid", "data_calib", "data_test", 
    "prior_train", "prior_valid", "prior_calib", "prior_test", "params", "target_name"
  ), envir = environment())
  clusterExport(cl, c(
    "get_metrics_xgb_iter",
    "brier_score",
    "compute_metrics",
    "disp_metrics_dataset", "dispersion_metrics_beta",
    "recalibrate", "prop_btw_quantiles", "calculate_log_loss",
    "calculate_kl", "decomposition_metrics"
  ))

  metrics_iter <-
    pbapply::pblapply(
      X = seq_len(params$nb_iter_total),
      FUN = function(i_iter) {
        get_metrics_xgb_iter(
          scores = scores_iter[[i_iter]],
          prior = prior,
          data_train = data_train,
          data_valid = data_valid,
          data_calib = data_calib,
          data_test = data_test,
          prior_train = prior_train,
          prior_valid = prior_valid,
          prior_calib = prior_calib,
          prior_test = prior_test,
          target_name = target_name,
          ind = params$ind,
          nb_iter = i_iter,
          params = params
        )
      },
      cl = cl
    )
  stopCluster(cl)
  
  # Merge tibbles from each iteration into a single one
  tb_metrics <-
    map(metrics_iter, "tb_metrics") |>
    list_rbind()
  tb_prop_scores <-
    map(metrics_iter, "tb_prop_scores") |>
    list_rbind()
  tb_decomposition_scores <-
    map(metrics_iter, "tb_decomposition") |>
    list_rbind()
  scores_hist <- map(metrics_iter, "scores_hist")

  list(
    tb_metrics = tb_metrics,
    tb_prop_scores = tb_prop_scores,
    tb_decomposition_scores = tb_decomposition_scores,
    scores_hist = scores_hist
  )
}
Function simul_xgb_real()
#' Trains an XGB model on a dataset for a binary task, for various
#' hyperparameters, and computes metrics based on the scores and on a prior
#' distribution of the underlying probability
#'
#' @param data dataset
#' @param target_name name of the target variable
#' @param prior prior obtained with `get_beta_fit()`
#' @param seed desired seed (default to `NULL`)
#'
#' @note The hyperparameter grid, `grid`, is read from the calling
#'   environment (it is defined in Section 9.2).
#'
#' @returns A list with four elements:
#'  - `metrics_simul`: performance / calibration / divergence metrics for each
#'    set of hyperparameters (identified by `ind`) and boosting iteration
#'  - `scores_hist`: histograms of estimated scores
#'  - `prop_scores_simul`: table with P(q1 < score < q2)
#'  - `decomposition_scores_simul`: decomposition of expected losses
simul_xgb_real <- function(data,
                           target_name,
                           prior,
                           seed = NULL) {

  if (!is.null(seed)) set.seed(seed)

  # Split data into train and test set
  data_splitted <- split_train_test(data = data, prop_train = .7, seed = seed)
  data_encoded <- encode_dataset(
    data_train = data_splitted$train,
    data_test = data_splitted$test,
    target_name = target_name,
    intercept = FALSE
  )

  # Further split train into two samples (train/valid),
  # i.e., 56% / 14% of the full dataset
  data_splitted_train <- 
    split_train_test(data = data_encoded$train, prop_train = .8, seed = seed)
  
  # Further split test into two samples (calib/test),
  # i.e., 18% / 12% of the full dataset
  data_splitted_test <- 
    split_train_test(data = data_encoded$test, prop_train = .6, seed = seed)
  
  # Split prior scores
  # Further split train into two samples (train/valid)
  prior_splitted_train <- 
    split_train_test_prior(prior = prior$scores_gamsel$scores_train, prop_train = .8, seed = seed)
  
  # Further split test into two samples (calib/test)
  prior_splitted_test <- 
    split_train_test_prior(prior = prior$scores_gamsel$scores_test, prop_train = .6, seed = seed)
  
  res_grid <- vector(mode = "list", length = nrow(grid))
  for (i_grid in 1:nrow(grid)) {
    res_grid[[i_grid]] <- simul_xgb_helper(
      data_train = data_splitted_train$train,
      data_valid = data_splitted_train$test,
      data_calib = data_splitted_test$train,
      data_test = data_splitted_test$test,
      prior_train = prior_splitted_train$train,
      prior_valid = prior_splitted_train$test,
      prior_calib = prior_splitted_test$train,
      prior_test = prior_splitted_test$test,
      target_name = target_name,
      params = grid |> dplyr::slice(i_grid),
      prior = prior
    )
  }
  
  # The metrics computed for all sets of hyperparameters (identified by `ind`)
  # and for each number of boosting iterations (`nb_iter`)
  metrics_simul <- map(res_grid, "tb_metrics") |> 
    list_rbind()
  
  # P(q_1<s(x)<q_2)
  prop_scores_simul <- map(res_grid, "tb_prop_scores") |> 
    list_rbind()
  
  # Decomposition of expected losses
  decomposition_scores_simul <- map(res_grid, "tb_decomposition_scores") |> 
    list_rbind()
  
  # Histogram of estimated scores
  scores_hist <- map(res_grid, "scores_hist")

  list(
    metrics_simul = metrics_simul,
    scores_hist = scores_hist,
    prop_scores_simul = prop_scores_simul,
    decomposition_scores_simul = decomposition_scores_simul
  )
}

9.2 Grid

We consider the following grid:

grid <- expand_grid(
  max_depth = c(2, 4, 6),
  nb_iter_total = 500,
  eta = 0.3
) |>
  mutate(ind = row_number())

The different configurations are reported in Table 9.1.

DT::datatable(grid)
Table 9.1: Grid Search Values

9.3 Data

In Chapter 7, we estimated the shapes of Beta distributions using fitted scores from a GAMSEL model applied to various datasets from the UCI Machine Learning Repository. We saved these estimated priors and the datasets in R data files, which can now be easily loaded.

The list of datasets and the names of their target variables:

datasets <- tribble(
  ~name, ~target_name,
  "abalone", "Sex",
  "adult", "high_income",
  "bank", "y",
  "default", "default",
  "drybean", "is_dermason",
  "coupon", "y",
  "mushroom", "edible",
  "occupancy", "Occupancy",
  "winequality", "high_quality",
  "spambase", "is_spam"
)
for (name in datasets$name) {
  # The data
  load(str_c("../output/real-data/tb_", name, ".rda"))
  # The Prior on the distribution of the scores
  load(str_c("../output/real-data/priors_", name, ".rda"))
}
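
After this loop, each dataset is available as tb_<name> and its estimated prior as priors_<name>. A quick check on one of them (assuming the .rda files follow the naming used above):

exists("tb_adult")
priors_adult$mle_gamsel$estimate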

9.4 Estimations: Extreme Gradient Boosting

The models are estimated in parallel. The number of available cores can be determined using the following command:

library(future)
nb_cores <- future::availableCores() - 1

We use the following loop to estimate all the models on each dataset:

datasets <- datasets[-1, ] # remove the first dataset (abalone) from the loop
seed <- 1234
for (name in datasets$name) {
  plan(multisession, workers = nb_cores)
  current_data <- get(str_c("tb_", name))
  current_priors <- get(str_c("priors_", name))
  current_target_name <- datasets |>
    filter(name == !!name) |> pull(target_name)
  ## Extreme Gradient Boosting----
  xgb_resul <- simul_xgb_real(data = current_data,
                              target_name = current_target_name,
                              prior = current_priors,
                              seed = seed)
  save(xgb_resul, file = str_c("../output/real-data/xgb_resul_", name, ".rda"))
}