8  Simulation Metrics and Viz

library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(randomForest)
randomForest 4.7-1.1
Type rfNews() to see new features/changes/bug fixes.

Attaching package: 'randomForest'

The following object is masked from 'package:dplyr':

    combine

The following object is masked from 'package:ggplot2':

    margin
library(future)
library(binom)
library(locfit)
locfit 1.5-9.9   2024-03-01

Attaching package: 'locfit'

The following object is masked from 'package:purrr':

    none
library(pROC)
Type 'citation("pROC")' for a citation.

Attaching package: 'pROC'

The following objects are masked from 'package:stats':

    cov, smooth, var
library(betacal)

colours_samples <- c("Train" = "#0072B2", "Calibration" = "#D55E00", "Test" = "#009E73")
colours_calib <- c(
  # "#332288", 
  "#117733",
  "#44AA99", "#88CCEE",
  "#DDCC77", "#CC6677", "#AA4499", "#882255") |> rev()

colours_test <- adjustcolor(colours_calib, alpha.f = .5)

# Interleave the opaque and semi-transparent versions of each colour
# (one pair per recalibration method)
colours <- NULL
for (k in seq_along(colours_calib)) {
  colours <- c(colours, colours_calib[k], colours_test[k])
}

In Chapter 7, we trained 200 random forest classifiers and 200 random forest regressors. Each replication corresponded to a different split of the data into a calibration set and a test set. Note that all the forests were estimated on the same training set.
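
Schematically, a replication only redraws the calibration/test partition of the held-out data, then recalibrates and evaluates the already-trained forest. The sketch below uses a hypothetical helper name (one_replication), hypothetical arguments, and an illustrative 50/50 split; the actual code is the one from Chapter 7.

# Sketch of one replication (hypothetical names; see Chapter 7 for the real code)
one_replication <- function(seed, forest, data_rest) {
  set.seed(seed)
  ind_calib <- sample(nrow(data_rest), size = floor(nrow(data_rest) / 2))
  data_calib <- data_rest[ind_calib, ]   # used to fit the recalibrators
  data_test  <- data_rest[-ind_calib, ]  # used to evaluate them
  # ... recalibrate the forest scores on data_calib (Platt, isotonic, beta, locfit)
  # ... compute calibration and goodness-of-fit metrics on both samples
}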

In this chapter, we present the calibration and goodness-of-fit metrics computed during the simulations.

8.1 Load Results

Let us load the results from Chapter 7.

For the regression forests:

load("output/simul-rf/metrics_rf_tuned_reg.rda")

For the classification forests:

load("output/simul-rf/metrics_rf_tuned_class.rda")

8.2 Metrics

We extract the summary metrics from these.

summary_metrics_tuned_reg <- 
  map(metrics_rf_tuned_reg, "summary_metrics") |> 
  list_rbind()
summary_metrics_tuned_class <- map(metrics_rf_tuned_class, "summary_metrics") |> 
  list_rbind()

Let us bind the results in a single tibble.

metrics_rf <- summary_metrics_tuned_reg |> 
  mutate(model = "Regression") |> 
  bind_rows(
    summary_metrics_tuned_class |> 
      mutate(model = "Classification")
  ) |> 
  filter(sample %in% c("calibration", "test")) |> 
  mutate(
    AUC = 1-AUC,
    model = factor(
      model, 
      levels = c("Regression", "Classification")
    ),
    method = factor(
      method,
      levels = c("No Calibration", "platt", "isotonic", "beta", 
                 "locfit_0", "locfit_1", "locfit_2") |> rev(),
      labels = c("No Calibration", "Platt Scaling", "Isotonic", "Beta", 
                 "Locfit (deg=0)", "Locfit (deg=1)", "Locfit (deg=2)") |> rev()
    ),
    sample = factor(
      sample,
      levels = c("calibration", "test") |> rev(),
      labels = c("Calibration", "Test") |> rev())
  )
metrics_rf
# A tibble: 5,600 × 17
   sample  model mse   accuracy missclass_rate sensitivity specificity threshold
   <fct>   <fct> <lgl>    <dbl>          <dbl>       <dbl>       <dbl>     <dbl>
 1 Calibr… Regr… NA       0.852          0.148       0.758       0.923       0.5
 2 Test    Regr… NA       0.852          0.148       0.755       0.923       0.5
 3 Calibr… Regr… NA       0.860          0.140       0.804       0.902       0.5
 4 Calibr… Regr… NA       0.861          0.139       0.812       0.897       0.5
 5 Calibr… Regr… NA       0.860          0.140       0.825       0.887       0.5
 6 Calibr… Regr… NA       0.860          0.140       0.813       0.895       0.5
 7 Calibr… Regr… NA       0.860          0.140       0.822       0.890       0.5
 8 Calibr… Regr… NA       0.860          0.140       0.818       0.892       0.5
 9 Test    Regr… NA       0.857          0.143       0.794       0.903       0.5
10 Test    Regr… NA       0.858          0.142       0.803       0.899       0.5
# ℹ 5,590 more rows
# ℹ 9 more variables: FPR <dbl>, AUC <dbl>, locfit_score <dbl>, brier <dbl>,
#   ece <dbl>, qmse <dbl>, wmse <dbl>, seed <int>, method <fct>
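
Among these columns, brier and ece are standard calibration summaries. As a reminder, here is a minimal sketch of how they are typically computed, on a toy vector of scores and outcomes (illustrative only; the number of bins and the exact implementations used in the simulations are those defined in the earlier chapters).

# Toy illustration of the Brier score and the ECE (not simulation data)
set.seed(123)
p <- runif(1000)          # predicted scores
d <- rbinom(1000, 1, p)   # observed binary events
# Brier score: mean squared distance between scores and outcomes
brier <- mean((p - d)^2)
# ECE: weighted average gap between mean score and event rate, by score bin
bins <- cut(p, breaks = seq(0, 1, by = .1), include.lowest = TRUE)
ece <- sum(
  (tapply(p, bins, length) / length(p)) *
    abs(tapply(p, bins, mean) - tapply(d, bins, mean))
)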

8.3 Boxplots

Let us define two functions to display the goodness-of-fit metrics (boxplot_metrics_gof()) and the calibration metrics (boxplot_metrics_calib()) in boxplots.

boxplot_metrics_gof <- function(data_metrics){
  
  gof_metrics <- c("accuracy", "sensitivity", "specificity")
  gof_metrics_lab <- c("Accuracy", "Sensitivity", "Specificity")
  
  data_metrics_plot <- 
    data_metrics |> 
    select(-mse) |> 
    select(sample, model, method, seed, !!!syms(gof_metrics)) |> 
    pivot_longer(
      cols = !!gof_metrics, 
      names_to = "metric", values_to = "value"
    )
  
  mat <- matrix(
    c(1:6, rep(7, 3)), ncol = 3, byrow = TRUE
  )
  layout(mat, heights = c(3, 3, 1))
  
  range_val <- data_metrics_plot |> 
    group_by(metric) |> 
    summarise(
      min_val = min(value),
      max_val = max(value)
    )
  
  # Goodness of Fit----
  for (i_model in 1:length(levels(data_metrics_plot$model))) {
    model <- levels(data_metrics_plot$model)[i_model]
    data_metrics_gof <- data_metrics_plot |> 
      filter(model == !!model)
    
    for (i_metric in 1:length(gof_metrics)) {
      
      metric <- gof_metrics[i_metric]
      metric_lab <- gof_metrics_lab[i_metric]
      val_min <- range_val |> filter(metric == !!metric) |> pull(min_val)
      val_max <- range_val |> filter(metric == !!metric) |> pull(max_val)
      title <- ""
      if (i_metric == 2) {
        title <- str_c(model, "\n", metric_lab)
      } else {
        title <- str_c("\n", metric_lab)
      }
      
      nb_methods <- data_metrics_gof$method |> levels() |> length()
      
      data_metrics_gof_current <- 
        data_metrics_gof |> 
        filter(metric == !!metric)
      
      par(mar = c(2.5, 1, 4.1, 1))
      boxplot(
        value ~ sample + method,
        data = data_metrics_gof_current,
        col = colours,
        horizontal = TRUE,
        las = 1, xlab = "", ylab = "",
        main = title,
        border = c("black", adjustcolor("black", alpha.f = .5)),
        yaxt = "n",
        ylim = c(val_min, val_max)
      )
      # Horizontal lines
      for (i in seq(3, nb_methods * 2, by = 2) - .5) {
        abline(h = i, lty = 1, col = "gray")
      }
    }
    
  }
  
  # Legend----
  par(mar = c(0, 4.3, 0, 4.3))
  plot.new()
  legend(
    "center", 
    legend = rev(levels(data_metrics_plot$method)),
    fill = rev(colours_calib),
    # lwd = 2,
    xpd = TRUE, ncol = 4
  )
}
boxplot_metrics_calib <- function(data_metrics){
  
  calib_metrics <- c("brier", "ece", "locfit_score")
  calib_metrics_lab <- c("Brier Score", "ECE", "LCS")
  
  data_metrics_plot <- 
    data_metrics |> 
    select(-mse) |> 
    select(sample, model, method, seed, !!!syms(calib_metrics)) |> 
    pivot_longer(
      cols = !!calib_metrics, 
      names_to = "metric", values_to = "value"
    )
  
  range_val <- data_metrics_plot |> 
    group_by(metric) |> 
    summarise(
      min_val = min(value),
      max_val = max(value)
    )
  
  mat <- matrix(
    c(1:6, rep(7, 3)), ncol = 3, byrow = TRUE
  )
  layout(mat, heights = c(3, 3, 1))
  
  # Calibration Metrics----
  for (i_model in 1:length(levels(data_metrics_plot$model))) {
    model <- levels(data_metrics_plot$model)[i_model]
    data_metrics_calib <- data_metrics_plot |> 
      filter(model == !!model)
    
    for (i_metric in 1:length(calib_metrics)) {
      metric <- calib_metrics[i_metric]
      metric_lab <- calib_metrics_lab[i_metric]
      val_min <- range_val |> filter(metric == !!metric) |> pull(min_val)
      val_max <- range_val |> filter(metric == !!metric) |> pull(max_val)
      if (i_metric == 2) {
        title <- str_c(model, "\n", metric_lab)
      } else {
        title <- str_c("\n", metric_lab)
      }
      
      nb_methods <- data_metrics_calib$method |> levels() |> length()
      
      data_metrics_calib_current <- 
        data_metrics_calib |> 
        filter(metric == !!metric)
      
      par(mar = c(2.5, 1, 4.1, 1))
      boxplot(
        value ~ sample + method,
        data = data_metrics_calib_current,
        col = colours,
        horizontal = TRUE,
        las = 1, xlab = "", ylab = "",
        main = title,
        border = c("black", adjustcolor("black", alpha.f = .5)),
        yaxt = "n",
        ylim = c(val_min, val_max)
      )
      # Horizontal lines
      for (i in seq(3, nb_methods * 2, by = 2) - .5) {
        abline(h = i, lty = 1, col = "gray")
      }
    }
    
  }
  
  # Legend----
  par(mar = c(0, 4.3, 0, 4.3))
  plot.new()
  legend(
    "center", 
    legend = rev(levels(data_metrics_plot$method)),
    fill = rev(colours_calib),
    # lwd = 2,
    xpd = TRUE, ncol = 4
  )
}
boxplot_metrics_gof(metrics_rf)
Figure 8.1: Goodness-of-fit Metrics Computed on 200 Replications, for the Regression Random Forest (top) and for the Classification Random Forest (bottom), on the Calibration Set (transparent colors) and on the Test Set (full colors). A probability threshold of \(\tau=0.5\) was used to compute the sensitivity and the specificity.
boxplot_metrics_calib(metrics_rf)
Figure 8.2: Calibration Metrics Computed on 200 Replications, for the Regression Random Forest (top) and for the Classification Random Forest (bottom), on the Calibration Set (transparent colors) and on the Test Set (full colors).

8.4 Calibration vs. Performance

Let us load the results obtained in Section 7.4.

compare_class <- read.csv("output/compare_cal_gof_class.csv")
compare_reg <- read.csv("output/compare_cal_gof_reg.csv")
compare_class <- compare_class |> 
  arrange(err_oob) |> 
  mutate(
    AUC_train = 1 - AUC_train,
    AUC_rest = 1 - AUC_rest
  )
compare_reg <- compare_reg |> 
  arrange(mse_oob) |> 
  mutate(
    AUC_train = 1 - AUC_train,
    AUC_rest = 1 - AUC_rest
  )

Some cosmetics:

samples <- c("train", "test")
sample_labels <- c("Train", "Test")

colours_samples <- c(
  "Train" = "#0072B2", "Test" = "#009E73"
)

We create a function to plot the calibration metrics against the goodness-of-fit metrics. Each point represents the result obtained in one simulation, computed either on the train set or on the test set.

plot_compare <- function(gof_metric, calib_metric) {
  
  # Allowed metric names
  gof_metrics <- c("sensitivity", "specificity", "AUC", "accuracy")
  calib_metrics <- c("brier", "LCS", "ece", "qmse", "wmse")
  stopifnot(gof_metric %in% gof_metrics, calib_metric %in% calib_metrics)
  types <- c("regression", "classification")
  types_labs <- c("Regression", "Classification")
  
  mat <- c(1, 2, 3, 3) |> matrix(ncol = 2, byrow = TRUE)
  layout(mat, heights = c(3, .5))
  
  for (i_type in 1:length(types)) {
    type <- types[i_type]
    type_lab <- types_labs[i_type]
    
    if (type == "regression"){
      data <- compare_reg
    } else if(type == "classification") {
      data <- compare_class
    } else {
      stop("wrong type")
    }
    
    train_gof <- paste(gof_metric, "train", sep = "_")
    rest_gof <- paste(gof_metric, "rest", sep = "_")
    train_calib <- paste(calib_metric, "train", sep = "_")
    rest_calib <- paste(calib_metric, "rest", sep = "_")
    
    # Plot boundaries
    xmin <- min(data[[train_gof]], data[[rest_gof]])
    xmax <- max(data[[train_gof]], data[[rest_gof]])
    ymin <- min(data[[train_calib]], data[[rest_calib]])
    ymax <- max(data[[train_calib]], data[[rest_calib]])
    
    
    par(mar = c(4.1, 4.1, 2.1, 1))
    
    # Empty plotting frame; points and segments are added below
    plot(
      NULL,
      xlim = c(xmin, xmax), ylim = c(ymin, ymax),
      xlab = gof_metric, ylab = calib_metric,
      main = type_lab
    )
    # Segments joining the train and test values of the same model
    segments(
      x0 = data[[train_gof]], 
      y0 = data[[train_calib]], 
      x1 = data[[rest_gof]], 
      y1 = data[[rest_calib]],
      col = adjustcolor("gray", alpha.f = .3)
    )
    
    # Train set
    points(
      x = data[[train_gof]], 
      y = data[[train_calib]],
      pch = 19, cex = .8,
      col = adjustcolor(colours_samples[["Train"]], alpha.f = .3)
    )
    # Loess trend on the train set
    lw_train <- loess(data[[train_calib]] ~ data[[train_gof]])
    ord <- order(data[[train_gof]])
    lines(
      data[[train_gof]][ord], lw_train$fitted[ord],
      col = colours_samples[["Train"]], lwd = 3, lty = 3
    )
    # Test set
    points(
      x = data[[rest_gof]], 
      y = data[[rest_calib]],
      col = adjustcolor(colours_samples[["Test"]], alpha.f = .3),
      pch = 19, cex = .8
    )
    # Loess trend on the test set
    lw_rest <- loess(data[[rest_calib]] ~ data[[rest_gof]])
    ord <- order(data[[rest_gof]])
    lines(
      data[[rest_gof]][ord], lw_rest$fitted[ord],
      col = colours_samples[["Test"]], lwd = 3, lty = 3
    )
    
  }
  # Legend
  par(mar = c(0, 4.3, 0, 4.3))
  plot.new()
  legend(
    "center", 
    legend = c("Train", "Test"),
    col = colours_samples,
    pch = 19,
    xpd = TRUE, ncol = 2
  )
  
}
plot_compare(gof_metric = "AUC", calib_metric = "LCS")
Figure 8.3: LCS vs. AUC for Random Forest Regression (left) and Random Forest Classification (right). Each point represents an estimation obtained from a set of hyperparameters. The gray lines connect, for each set of hyperparameters, the estimation obtained on the train set to the one obtained on the test set.
plot_compare(gof_metric = "AUC", calib_metric = "brier")
Figure 8.4: Brier Score vs. AUC for Random Forest Regression (left) and Random Forest Classification (right). Each point represents an estimation obtained from a set of hyperparameters. The gray lines connect, for each set of hyperparameters, the estimation obtained on the train set to the one obtained on the test set.
plot_compare(gof_metric = "AUC", calib_metric = "ece")
Figure 8.5: Expected Calibration Error vs. AUC for Random Forest Regression (left) and Random Forest Classification (right). Each point represents an estimation obtained from a set of hyperparameters. The gray lines connect, for each set of hyperparameters, the estimation obtained on the train set to the one obtained on the test set.

8.5 Calibration Curves

Let us extract the calibration curves computed in Chapter 7.

curves_reg <- 
  map(metrics_rf_tuned_reg, "recalibration_curves") |> 
  list_rbind()
curves_class <- 
  map(metrics_rf_tuned_class, "recalibration_curves") |> 
  list_rbind()

We merge these into a single tibble and, at each point of the grid, compute the mean curve together with the 2.5% and 97.5% quantiles across the 200 replications:

calib_curves <- 
  curves_reg |> 
  bind_rows(curves_class) |> 
  group_by(xlim, sample, type, method) |> 
  summarise(
    mean = mean(locfit_pred),
    lower = quantile(locfit_pred, probs = .025),
    upper = quantile(locfit_pred, probs = 0.975),
    .groups = "drop"
  )

Some cosmetics:

methods <- c(
  "No Calibration",
  "platt", "isotonic", "beta", 
  "locfit_0", "locfit_1", "locfit_2"
)
methods_labs <- c(
  "No Calibration", "Platt", "Isotonic", "Beta", 
  "Locfit (deg=0)", "Locfit (deg=1)", "Locfit (deg=2)"
)

And we apply them:

calib_curves <- 
  calib_curves |> 
  filter(sample %in% c("calibration", "test")) |> 
  mutate(
    sample = factor(
      sample, 
      levels = c("calibration", "test"), 
      labels = c("Calibration", "Test")
    ),
    type = factor(
      type,
      levels = c("regression", "classification"),
      labels = c("Regression", "Classification")
    ),
    method = factor(
      method,
      levels = methods,
      labels = methods_labs
    )
  )

We need to retrieve the scores in order to add histograms of their distribution on top of the calibration curves.

## Uncalibrated scores----

### Train set----
scores_no_calib_reg_train <- 
  map(metrics_rf_tuned_reg, "scores") |> 
  map(~.x$"scores_train")
scores_no_calib_class_train <- 
  map(metrics_rf_tuned_class, "scores") |> 
  map(~.x$"scores_train")

### Calibration set----
scores_no_calib_reg_calib <- 
  map(metrics_rf_tuned_reg, "scores") |> 
  map(~.x$"scores_calib")
scores_no_calib_class_calib <- 
  map(metrics_rf_tuned_class, "scores") |> 
  map(~.x$"scores_calib")

### Test set----
scores_no_calib_reg_test <- 
  map(metrics_rf_tuned_reg, "scores") |> 
  map(~.x$"scores_test")
scores_no_calib_class_test <- 
  map(metrics_rf_tuned_class, "scores") |> 
  map(~.x$"scores_test")


Now, we partition the [0,1] interval into equal-sized bins. For each sample of each simulation (and each recalibration technique), we count the number of observations whose scores fall into each bin.
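
To illustrate the binning mechanism on a toy vector of scores (hypothetical values, not taken from the simulations):

# Toy example: count scores in 20 bins of width 0.05 (not simulation data)
breaks <- seq(0, 1, by = .05)
scores_toy <- runif(50)
table(cut(scores_toy, breaks = breaks))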

8.5.0.1 Helper Functions

First, the count_scores() function, which counts the number of observations in each bin, for a vector of scores.

#' For a vector of scores, compute the number of obs in each bin
#' The bins are defined on evenly separated values in [0,1]
#' 
count_scores <- function(scores) {
  breaks <- seq(0, 1, by = .05)
  # Labels of the 20 bins, in increasing order
  bins <- levels(cut(numeric(0), breaks = breaks))
  
  if (!is.null(scores)) {
    n_bins <- as.vector(table(cut(scores, breaks = breaks)))
  } else {
    # No scores provided: keep the bins but flag the counts as missing
    n_bins <- NA_integer_
  }
  tibble(
    bins = bins,
    n_bins = n_bins
  )
}

Then, another function, count_scores_simul(), applies count_scores() to the non-recalibrated scores of each simulation and averages the bin counts across simulations.

#' Applies the count_scores() function to a list of simulations
#' 
count_scores_simul <- function(scores_simul) {
  map(scores_simul, count_scores) |> 
    list_rbind() |> 
    group_by(bins) |> 
    summarise(
      n_bins = mean(n_bins)
    )
}

Lastly, the count_scores_simul_method() function does the same for all the recalibrated scores.

#' Extract the recalibrated scores from a list of simulations and computes
#' the number of scores in bins (see count_scores())
#' For each simulation, there are multiple correction techniques
#' 
count_scores_simul_method <- function(scores_simul_methods) {
  map(scores_simul_methods, "scores_c") |> 
    map(
      .f = function(simul){
        map(
          simul, 
          .f = function(sim_method) {
            count_scores_simul(sim_method$tb_score_c_train) |> 
              mutate(sample = "train") |> 
              bind_rows(
                count_scores_simul(sim_method$tb_score_c_calib) |> 
                  mutate(sample = "calibration")
              ) |> 
              bind_rows(
                count_scores_simul(sim_method$tb_score_c_test) |> 
                  mutate(sample = "test")
              )
          }
        ) |> 
          list_rbind(names_to = "method")
      },
      .progress = TRUE
    ) |> 
    list_rbind() |> 
    group_by(method, sample, bins) |> 
    summarise(
      n_bins = mean(n_bins),
      .groups = "drop"
    )
}

Let us get those values!

## Regression----
### No Calibration----
n_bins_no_calib_reg <- 
  count_scores_simul(scores_no_calib_reg_train) |> 
  mutate(sample = "train", method = "No Calibration", type = "Regression") |> 
  bind_rows(
    count_scores_simul(scores_no_calib_reg_calib) |> 
      mutate(sample = "calibration", method = "No Calibration", type = "Regression")
  ) |> 
  bind_rows(
    count_scores_simul(scores_no_calib_reg_test) |> 
      mutate(sample = "test", method = "No Calibration", type = "Regression")
  )
### With Recalibration methods----
n_bins_recalib_reg <- count_scores_simul_method(
  scores_simul_methods = metrics_rf_tuned_reg) |> 
  mutate(type = "Regression")

## Classification----
### No Calibration----
n_bins_no_calib_class <- 
  count_scores_simul(scores_no_calib_class_train) |> 
  mutate(sample = "train", method = "No Calibration", type = "Classification") |> 
  bind_rows(
    count_scores_simul(scores_no_calib_class_calib) |> 
      mutate(sample = "calibration", method = "No Calibration", type = "Classification")
  ) |> 
  bind_rows(
    count_scores_simul(scores_no_calib_class_test) |> 
      mutate(sample = "test", method = "No Calibration", type = "Classification")
  )
### With Recalibration methods----
n_bins_recalib_class <- count_scores_simul_method(
  scores_simul_methods = metrics_rf_tuned_class) |> 
  mutate(type = "Classification")

## Merge all simulations----
n_bins <- 
  n_bins_no_calib_reg |> 
  bind_rows(n_bins_recalib_reg) |> 
  bind_rows(n_bins_no_calib_class) |> 
  bind_rows(n_bins_recalib_class)

n_bins <- 
  n_bins |> 
  filter(sample %in% c("calibration", "test")) |> 
  mutate(
    sample = factor(
      sample, 
      levels = c("calibration", "test"), 
      labels = c("Calibration", "Test")
    ),
    type = factor(
      type,
      levels = c("Regression", "Classification"),
      labels = c("Regression", "Classification")
    ),
    method = factor(
      method,
      levels = !!methods,
      labels = !!methods_labs
    )
  )
load("output/n_bins.rda")

8.5.1 Visualization

Some cosmetics:

colours_samples <- c(
  "Train" = "#0072B2", "Calibration" = "#D55E00", "Test" = "#009E73"
)
colours_calib <- c(
  # "#332288", 
  "#117733",
  "#44AA99", "#88CCEE",
  "#DDCC77", "#CC6677", "#AA4499", "#882255")

The plot_calib_locfit_simuls() function allows us to visualize the calibration curves (on the calibration set and on the test set) for a given type of forest (regression or classification), both for uncalibrated and recalibrated scores.

plot_calib_locfit_simuls <- function(calib_curve, type) {
  tb_calib_curve <- calib_curve |> filter(type == !!type)
  tb_n_bins <- n_bins |> filter(type == !!type)
  
  nb_methods <- tb_calib_curve$method |> unique() |> length()
  
  mat_init <- c(1, 1, 2, 4, 3, 5) |> matrix(ncol = 2, byrow = TRUE)
  mat <- mat_init
  for (j in 2:(ceiling(nb_methods/2))) {
    mat <- rbind(mat, mat_init + (5*(j-1)))
  }
  mat <- cbind(mat, mat + max(mat))
  
  layout(mat, heights = rep(c(.35, 1, 3), ceiling(nb_methods / 2)))
  
  y_lim <- c(0, 1)
  
  samples <- c("Calibration", "Test")
  
  for (i_method in 1:length(methods)) {
    method <- methods_labs[i_method]
    # Title for the models
    par(mar = c(0, 0.5, 0, 0.5))
    plot(
      0:1, 0:1,
      col = NULL,
      type="n",
      xlim = c(0,2), ylim = 0:1,
      xlab = "",
      ylab = "",
      main = "",
      xaxt = "n",
      yaxt = "n",
      new = TRUE,
      frame.plot = FALSE
    )
    # Add the method name, centred in the title strip
    text(
      x = 1, y = mean(par("usr")[3:4]),
      labels = method, col = colours_calib[i_method],
      cex = 1, font = 2
    )
    for(i_sample in 1:length(samples)) {
      sample <- samples[i_sample]
      tb_plot <- tb_calib_curve |> 
        filter(
          sample == !!sample,
          method == !!method,
          type == !!type
        )
      n_bins_current <- tb_n_bins |> 
        filter(
          sample == !!sample,
          method == !!method,
          type == !!type
        )
      n_bins_no_calib_current <- tb_n_bins |> 
        filter(
          sample == !!sample,
          method == "No Calibration",
          type == !!type
        )
      # Barplots on top
      par(mar = c(0.5, 4.3, 1.0, 0.5))
      y_lim_bp <- range(
        c(n_bins_no_calib_current$n_bins, n_bins_current$n_bins),
        na.rm = TRUE
      )
      barplot(
        n_bins_no_calib_current$n_bins,
        col = adjustcolor("gray", alpha.f = .3),
        ylim = y_lim_bp,
        border = "white",
        axes = FALSE,
        xlab = "", ylab = "", main = ""
      )
      barplot(
        n_bins_current$n_bins,
        col = adjustcolor("#0072B2", alpha.f = .3),
        ylim = y_lim_bp,
        border = "white",
        axes = FALSE,
        xlab = "", ylab = "", main = "",
        add = TRUE
      )
      
      # Calib curve
      par(mar = c(4.1, 4.3, 0.5, 0.5), mgp = c(2, 1, 0))
      plot(
        tb_plot$xlim, tb_plot$mean,
        type = "l", col = colours_samples[sample],
        xlim = 0:1, ylim = 0:1,
        xlab = latex2exp::TeX("$p^u$"), 
        ylab = latex2exp::TeX("$E(D | p^u = p^c)$"),
        main = ""
      )
      polygon(
        c(tb_plot$xlim, rev(tb_plot$xlim)),
        c(tb_plot$lower, rev(tb_plot$upper)),
        col = adjustcolor(col = colours_samples[sample], alpha.f = .4),
        border = NA
      )
      segments(0, 0, 1, 1, col = "black", lty = 2)
    }
  }
  
}
plot_calib_locfit_simuls(
  calib_curve = calib_curves, type = "Regression"
)
Figure 8.6: Calibration Curves Obtained with Local Regression, for the Regression Random Forest, for the Calibration Set and for the Test Set. The curves are the average values obtained over 200 different splits of the calibration and test datasets, and the color bands are the 95% bootstrap confidence intervals. The histograms on top of each graph show the distribution of the uncalibrated scores and that of the recalibrated scores.
plot_calib_locfit_simuls(
  calib_curve = calib_curves, type = "Classification"
)
Figure 8.7: Calibration Curves Obtained with Local Regression, for the Classification Random Forest, for the Calibration Set and for the Test Set. The curves are the average values obtained over 200 different splits of the calibration and test datasets, and the color bands are the 95% bootstrap confidence intervals. The histograms on top of each graph show the distribution of the uncalibrated scores and that of the recalibrated scores.