This chapter estimates the binary events from the datasets introduced in Chapter 7 using extreme gradient boosting models. For this model, we perform a grid search. For each set of hyperparameters of the grid search, we compute the estimated scores by the model and calculate performance, calibration, and divergence metrics. For the divergence metrics, we assume that the underlying probabilities are distributed according to a Beta distribution whose parameters were estimated in Chapter 7.
library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.4 ✔ readr 2.1.5
✔ forcats 1.0.0 ✔ stringr 1.5.1
✔ ggplot2 3.5.1 ✔ tibble 3.2.1
✔ lubridate 1.9.4 ✔ tidyr 1.3.1
✔ purrr 1.0.2
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
Attaching package: 'xgboost'
The following object is masked from 'package:dplyr':
slice
library(pbapply)library(parallel)library(gam)
Loading required package: splines
Loading required package: foreach
Attaching package: 'foreach'
The following objects are masked from 'package:purrr':
accumulate, when
Loaded gam 1.22-5
library(gamsel)
Loaded gamsel 1.8-5
# Colours for train/validation/calibration/testcolour_samples <-c("Train"="#0072B2","Validation"="#009E73","Calibration"="#CC79A7","Test"="#D55E00")colour_recalib <-c("None"="#88CCEE","Platt"="#44AA99","Isotonic"="#882255","Beta"="#D55E00","Locfit"="firebrick3")# Functionssource("../scripts/functions/real-data.R")source("../scripts/functions/metrics.R")
definition of the theme_paper() function (for ggplot2 graphs)
#' Predicts the scores at a given iteration of the XGB model#'#' @param fit_xgb estimated XGB model#' @param tb_train_xgb train set#' @param tb_valid_xgb validation set#' @param tb_test_xgb test set#' @param ind index of the grid search#' @param nb_iter boosting iteration to consider#'#' @returns A list with three elements: `scores_train`, `scores_valid`, and#' `scores_train` which contain the estimated scores on the train and on the #' test score, resp.predict_score_iter <-function(fit_xgb, tb_train_xgb, tb_valid_xgb, tb_calib_xgb, tb_test_xgb, nb_iter) {## Predicted scores---- scores_train <-predict(fit_xgb, tb_train_xgb, iterationrange =c(1, nb_iter)) scores_valid <-predict(fit_xgb, tb_valid_xgb, iterationrange =c(1, nb_iter)) scores_calib <-predict(fit_xgb, tb_calib_xgb, iterationrange =c(1, nb_iter)) scores_test <-predict(fit_xgb, tb_test_xgb, iterationrange =c(1, nb_iter))list(scores_train = scores_train,scores_valid = scores_valid,scores_calib = scores_calib,scores_test = scores_test )}
Function simul_xgb_helper()
#' Fit an XGB and returns metrics based on scores. The divergence metrics are#' obtained using the prior distributions.#'#' @param data_train train set#' @param data_valid validation set#' @param data_calib calibration set#' @param data_test test set#' @param prior_train train prior scores#' @param prior_valid validation prior scores#' @param prior_calib calibration prior scores#' @param prior_test test prior scores#' @param target_name name of the target variable#' @param parms tibble with hyperparameters for the current estimation#' @param prior prior obtained with `get_beta_fit()`#'#' @returns A list with 4 elements:#' - `tb_metrics`: performance / calibration metrics#' - `tb_disp_metrics`: disp and div metrics#' - `tb_prop_scores`: table with P(q1 < score < q2)#' - `scores_hist`: histogram of scoressimul_xgb_helper <-function(data_train, data_valid, data_calib, data_test, prior_train, prior_valid, prior_calib, prior_test, target_name, params, prior) {## Format data for xgboost---- tb_train_xgb <-xgb.DMatrix(data = data_train |> dplyr::select(-!!target_name) |>as.matrix(),label = data_train |> dplyr::pull(!!target_name) |>as.matrix() ) tb_valid_xgb <-xgb.DMatrix(data = data_valid |> dplyr::select(-!!target_name) |>as.matrix(),label = data_valid |> dplyr::pull(!!target_name) |>as.matrix() ) tb_calib_xgb <-xgb.DMatrix(data = data_calib |> dplyr::select(-!!target_name) |>as.matrix(),label = data_calib |> dplyr::pull(!!target_name) |>as.matrix() ) tb_test_xgb <-xgb.DMatrix(data = data_test |> dplyr::select(-!!target_name) |>as.matrix(),label = data_test |> dplyr::pull(!!target_name) |>as.matrix() )# Parameters for the algorithm param <-list(max_depth = params$max_depth, #Note: root node is indexed 0eta = params$eta,nthread =1,objective ="binary:logistic",eval_metric ="auc" ) watchlist <-list(train = tb_train_xgb, eval = tb_valid_xgb)## Estimation---- fit_xgb <-xgb.train( param, tb_train_xgb,nrounds = params$nb_iter_total, watchlist,verbose =0 )# First, we estimate the scores at each boosting iteration# As the xgb.Dmatrix objects cannot be easily serialised, we first estimate# these scores in a classical way, without parallelism... scores_iter <-vector(mode ="list", length = params$nb_iter_total)for (i_iter in1:params$nb_iter_total) { scores_iter[[i_iter]] <-predict_score_iter(fit_xgb = fit_xgb,tb_train_xgb = tb_train_xgb,tb_valid_xgb = tb_valid_xgb,tb_calib_xgb = tb_calib_xgb,tb_test_xgb = tb_test_xgb,nb_iter = i_iter) }# Then, to compute the metrics, as it is a bit slower, we can use parallelism ncl <-detectCores() -1 (cl <-makeCluster(ncl))clusterEvalQ(cl, {library(tidyverse)library(locfit)library(philentropy)library(betacal) }) |>invisible()clusterExport(cl, c("scores_iter", "prior", "data_train", "data_valid", "data_calib", "data_test", "prior_train", "prior_valid", "prior_calib", "prior_test", "params", "target_name" ), envir =environment())clusterExport(cl, c("get_metrics_xgb_iter","brier_score","compute_metrics","disp_metrics_dataset", "dispersion_metrics_beta","recalibrate", "prop_btw_quantiles", "calculate_log_loss","calculate_kl", "decomposition_metrics" )) metrics_iter <- pbapply::pblapply(X =seq_len(params$nb_iter_total),FUN =function(i_iter) {get_metrics_xgb_iter(scores = scores_iter[[i_iter]],prior = prior,data_train = data_train,data_valid = data_valid,data_calib = data_calib,data_test = data_test,prior_train = prior_train,prior_valid = prior_valid,prior_calib = prior_calib,prior_test = prior_test,target_name = target_name,ind = params$ind,nb_iter = i_iter,params = params ) },cl = cl )stopCluster(cl)# Merge tibbles from each iteration into a single one tb_metrics <-map(metrics_iter, "tb_metrics") |>list_rbind() tb_prop_scores <-map(metrics_iter, "tb_prop_scores") |>list_rbind() tb_decomposition_scores <-map(metrics_iter, "tb_decomposition") |>list_rbind() scores_hist <-map(metrics_iter, "scores_hist")list(tb_metrics = tb_metrics,tb_prop_scores = tb_prop_scores,tb_decomposition_scores = tb_decomposition_scores,scores_hist = scores_hist )}
Function simul_xgb_real()
#' Train an XGB on a dataset for a binary task for various#' hyperparameters and computes metrics based on scores and on a set of prior#' distributions of the underlying probability#'#' @param data dataset#' @param target_name name of the target variable#' @param prior prior obtained with `get_beta_fit()`#' @param seed desired seed (default to `NULL`)#'#' @returns A list with two elements:#' - `res`: results for each estimated model of the grid. Each element is a#' list with the following elements:#' - `tb_metrics`: performance / calibration metrics#' - `tb_disp_metrics`: disp and div metrics#' - `tb_prop_scores`: table with P(q1 < score < q2)#' - `scores_hist`: histogram of scores.#' - `grid`: the grid search.simul_xgb_real <-function(data, target_name, prior,seed =NULL) {if (!is.null(seed)) set.seed(seed)# Split data into train and test set data_splitted <-split_train_test(data = data, prop_train = .7, seed = seed) data_encoded <-encode_dataset(data_train = data_splitted$train,data_test = data_splitted$test,target_name = target_name,intercept =FALSE )# Further split train into two samples (train/valid) data_splitted_train <-split_train_test(data = data_encoded$train, prop_train = .8, seed = seed)# Further split test into two samples (calib/test) data_splitted_test <-split_train_test(data = data_encoded$test, prop_train = .6, seed = seed)# Split prior scores# Further split train into two samples (train/valid) prior_splitted_train <-split_train_test_prior(prior = prior$scores_gamsel$scores_train, prop_train = .8, seed = seed)# Further split test into two samples (calib/test) prior_splitted_test <-split_train_test_prior(prior = prior$scores_gamsel$scores_test, prop_train = .6, seed = seed) res_grid <-vector(mode ="list", length =nrow(grid))for (i_grid in1:nrow(grid)) { res_grid[[i_grid]] <-simul_xgb_helper(data_train = data_splitted_train$train,data_valid = data_splitted_train$test,data_calib = data_splitted_test$train,data_test = data_splitted_test$test,prior_train = prior_splitted_train$train,prior_valid = prior_splitted_train$test,prior_calib = prior_splitted_test$train,prior_test = prior_splitted_test$test,target_name = target_name,params = grid |> dplyr::slice(i_grid),prior = prior ) }# The metrics computed for all set of hyperparameters (identified with `ind`)# and for each number of boosting iterations (`nb_iter`) metrics_simul <-map(res_grid, "tb_metrics") |>list_rbind()# P(q_1<s(x)<q_2) prop_scores_simul <-map(res_grid, "tb_prop_scores") |>list_rbind()# Decomposition of expected losses decomposition_scores_simul <-map(res_grid, "tb_decomposition_scores") |>list_rbind()# Histogram of estimated scores scores_hist <-map(res_grid, "scores_hist")list(metrics_simul = metrics_simul,scores_hist = scores_hist,prop_scores_simul = prop_scores_simul,decomposition_scores_simul = decomposition_scores_simul )}
The different configurations are reported in Table 9.1.
DT::datatable(grid)
9.3 Data
In Chapter 7, we estimated the shapes of Beta distributions using fitted scores from a GAMSEL model applied to various datasets from the UCI Machine Learning Repository. We saved these estimated priors and the datasets in R data files, which can now be easily loaded.
The list of datasets and the name of the target variable:
for (name in datasets$name) {# The dataload(str_c("../output/real-data/tb_", name, ".rda"))# The Prior on the distribution of the scoresload(str_c("../output/real-data/priors_", name, ".rda"))}
9.4 Estimations: Extreme Gradient Boosting
The models are estimated in parallel. The number of available cores can be determined using the following command: