Display the required packages and the colour schemes.
library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.4 ✔ readr 2.1.5
✔ forcats 1.0.0 ✔ stringr 1.5.1
✔ ggplot2 3.5.1 ✔ tibble 3.2.1
✔ lubridate 1.9.3 ✔ tidyr 1.3.1
✔ purrr 1.0.2
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(randomForest)
randomForest 4.7-1.1
Type rfNews() to see new features/changes/bug fixes.
Attaching package: 'randomForest'
The following object is masked from 'package:dplyr':
combine
The following object is masked from 'package:ggplot2':
margin
library(future)
library(binom)
library(locfit)
locfit 1.5-9.9 2024-03-01
Attaching package: 'locfit'
The following object is masked from 'package:purrr':
none
library(pROC)
Type 'citation("pROC")' for a citation.
Attaching package: 'pROC'
The following objects are masked from 'package:stats':
cov, smooth, var
In Chapter 6, we prepared the data on which we will run some simulations, and we obtained a set of hyperparameters for a regression forest and for a classification forest that optimize a goodness-of-fit criterion. The forests were trained on a training set, and 50% of the observations were set aside for the simulations we run in this chapter. We take this remaining data and randomly create 200 different splits into a calibration set and a test set. On each split, we train a recalibrator (as in Chapter 5) on the calibration set, and we then compute calibration and goodness-of-fit metrics. We also compute the calibration curves (using local regression only).
7.1 Helper Functions
Before running the simulations, we need some helper functions.
7.1.1 Calibration/Test Splits
We define get_samples(), which partitions the data into a calibration set and a test set.
#' Get calibration/test samples from the DGP
#'
#' @param seed seed to use to generate the split
#' @param data dataset to split into a calibration set and a test set
get_samples <- function(seed, data) {
  set.seed(seed)
  n_obs <- nrow(data)
  # Calibration/test sets----
  calib_index <- sample(1:nrow(data), size = .5 * nrow(data), replace = FALSE)
  tb_calib <- data |> slice(calib_index)
  tb_test <- data |> slice(-calib_index)
  list(
    data = data,
    tb_calib = tb_calib,
    tb_test = tb_test,
    calib_index = calib_index,
    seed = seed,
    n_obs = n_obs
  )
}
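As a quick illustration (not part of the simulations themselves), a single split can be obtained as follows, using a small fake dataset in place of the observations set aside in Chapter 6; the object fake_remaining is purely hypothetical.
# Hypothetical example: a fake dataset standing in for the remaining observations
fake_remaining <- tibble(x1 = rnorm(100), d = rbinom(100, size = 1, prob = .5))
split_1 <- get_samples(seed = 1, data = fake_remaining)
nrow(split_1$tb_calib) # 50 observations in the calibration set
nrow(split_1$tb_test)  # 50 observations in the test set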
7.1.2 Standard Metrics For Classification / Regression
We define the compute_gof() function, which computes multiple goodness-of-fit criteria.
Display the codes used to define the standard metrics
#' Computes goodness of fit metrics
#'
#' @param true_prob true probabilities. If `NULL` (default), True MSE is not
#'  computed and the `NA` value is returned for this metric
#' @param obs observed values (binary outcome)
#' @param pred predicted scores
#' @param threshold classification threshold (default to `.5`)
#' @returns tibble with MSE, accuracy, misclassification rate, sensitivity
#'  (TPR), specificity (TNR), FPR, and used threshold
compute_gof <- function(true_prob = NULL, obs, pred, threshold = .5) {
  # MSE
  if (!is.null(true_prob)) {
    mse <- mean((true_prob - pred)^2)
  } else {
    mse <- NA
  }

  pred_class <- as.numeric(pred > threshold)
  confusion_tb <- tibble(
    obs = obs,
    pred = pred_class
  ) |>
    count(obs, pred)

  TN <- confusion_tb |> filter(obs == 0, pred == 0) |> pull(n)
  TP <- confusion_tb |> filter(obs == 1, pred == 1) |> pull(n)
  FP <- confusion_tb |> filter(obs == 0, pred == 1) |> pull(n)
  FN <- confusion_tb |> filter(obs == 1, pred == 0) |> pull(n)

  if (length(TN) == 0) TN <- 0
  if (length(TP) == 0) TP <- 0
  if (length(FP) == 0) FP <- 0
  if (length(FN) == 0) FN <- 0

  n_pos <- sum(obs == 1)
  n_neg <- sum(obs == 0)

  # Accuracy
  acc <- (TP + TN) / (n_pos + n_neg)
  # Misclassification rate
  missclass_rate <- 1 - acc
  # Sensitivity (True positive rate)
  # proportion of actual positives that are correctly identified as such
  TPR <- TP / n_pos
  # Specificity (True negative rate)
  # proportion of actual negatives that are correctly identified as such
  TNR <- TN / n_neg
  # False positive rate
  FPR <- FP / n_neg
  # AUC
  AUC <- as.numeric(pROC::auc(obs, pred, levels = c("0", "1"), direction = ">"))
  # roc <- pROC::roc(obs, pred, levels = c("0", "1"), direction = ">")
  # AUC <- as.numeric(pROC::auc(roc))

  tibble(
    mse = mse,
    accuracy = acc,
    missclass_rate = missclass_rate,
    sensitivity = TPR,
    specificity = TNR,
    threshold = threshold,
    FPR = FPR,
    AUC = AUC
  )
}
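For illustration purposes only (this example is not part of the simulations), the function can be applied to simulated outcomes and scores.
# Hypothetical example with simulated probabilities, outcomes, and noisy scores
set.seed(123)
p <- runif(1000)                              # "true" probabilities
d <- rbinom(1000, size = 1, prob = p)         # observed binary outcomes
s <- pmin(pmax(p + rnorm(1000, 0, .1), 0), 1) # noisy predicted scores
compute_gof(true_prob = p, obs = d, pred = s, threshold = .5)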
We define functions to estimate calibration, as in Section 1.2 from Chapter 1 (brier_score(), get_summary_bins(), e_calib_error(), qmse_error(), local_ci_scores(), weighted_mse(), local_calib_score(), and a wrapper function, compute_metrics()).
Display the codes used to define calibration metrics functions.
## Brier Score----
brier_score <- function(obs, scores) mean((scores - obs)^2)

## Expected Calibration Error (ECE)----
#' Computes summary statistics for binomial observed data and predicted scores
#' returned by a model
#'
#' @param obs vector of observed events
#' @param scores vector of predicted probabilities
#' @param k number of classes to create (quantiles, default to `10`)
#' @param threshold classification threshold (default to `.5`)
#' @return a tibble where each row corresponds to a bin, and the columns are:
#'  - `score_class`: level of the decile that the bin represents
#'  - `nb`: number of observations
#'  - `mean_obs`: average of obs (proportion of positive events)
#'  - `mean_score`: average predicted score (confidence)
#'  - `sum_obs`: number of positive events
#'  - `accuracy`: accuracy (share of correctly predicted, using the threshold)
get_summary_bins <- function(obs, scores, k = 10, threshold = .5) {
  breaks <- quantile(scores, probs = (0:k) / k)
  tb_breaks <- tibble(breaks = breaks, labels = 0:k) |>
    group_by(breaks) |>
    slice_tail(n = 1) |>
    ungroup()

  x_with_class <- tibble(
    obs = obs,
    score = scores,
  ) |>
    mutate(
      score_class = cut(
        score,
        breaks = tb_breaks$breaks,
        labels = tb_breaks$labels[-1],
        include.lowest = TRUE
      ),
      pred_class = ifelse(score > threshold, 1, 0),
      correct_pred = obs == pred_class
    )

  x_with_class |>
    group_by(score_class) |>
    summarise(
      nb = n(),
      mean_obs = mean(obs),
      mean_score = mean(score), # confidence
      sum_obs = sum(obs),
      accuracy = mean(correct_pred)
    ) |>
    ungroup() |>
    mutate(
      score_class = as.character(score_class) |> as.numeric()
    ) |>
    arrange(score_class)
}

#' Expected Calibration Error
#'
#' @param obs vector of observed events
#' @param scores vector of predicted probabilities
#' @param k number of classes to create (quantiles, default to `10`)
#' @param threshold classification threshold (default to `.5`)
e_calib_error <- function(obs, scores, k = 10, threshold = .5) {
  summary_bins <- get_summary_bins(
    obs = obs, scores = scores, k = k, threshold = threshold
  )
  summary_bins |>
    mutate(ece_bin = nb * abs(accuracy - mean_score)) |>
    summarise(ece = 1 / sum(nb) * sum(ece_bin)) |>
    pull(ece)
}

## Quantile-based Mean Squared Error----
#' Quantile-Based MSE
#'
#' @param obs vector of observed events
#' @param scores vector of predicted probabilities
#' @param k number of classes to create (quantiles, default to `10`)
#' @param threshold classification threshold (default to `.5`)
qmse_error <- function(obs, scores, k = 10, threshold = .5) {
  summary_bins <- get_summary_bins(
    obs = obs, scores = scores, k = k, threshold = threshold
  )
  summary_bins |>
    mutate(qmse_bin = nb * (mean_obs - mean_score)^2) |>
    summarise(qmse = 1 / sum(nb) * sum(qmse_bin)) |>
    pull(qmse)
}

## Weighted Mean Squared Error----
#' @param obs vector of observed events
#' @param scores vector of predicted probabilities
#' @param tau value at which to compute the confidence interval
#' @param nn fraction of nearest neighbors
#' @param prob level of the confidence interval (default to `.95`)
#' @param method Which method to use to construct the interval. Any combination
#'  of c("exact", "ac", "asymptotic", "wilson", "prop.test", "bayes", "logit",
#'  "cloglog", "probit") is allowed. Default is "all".
#' @return a tibble with a single row that corresponds to estimations made in
#'  the neighborhood of a probability $p = \tau$, using the fraction `nn` of
#'  neighbors, where the columns are:
#'  - `xlim`: score tau in the neighborhood of which statistics are computed
#'  - `mean`: estimation of $E(d | s(x) = \tau)$
#'  - `lower`: lower bound of the confidence interval
#'  - `upper`: upper bound of the confidence interval
#' @importFrom binom binom.confint
local_ci_scores <- function(obs, scores, tau, nn, prob = .95, method = "probit") {
  # Identify the k nearest neighbors based on hat{p}
  k <- round(length(scores) * nn)
  rgs <- rank(abs(scores - tau), ties.method = "first")
  idx <- which(rgs <= k)

  binom.confint(
    x = sum(obs[idx]),
    n = length(idx),
    conf.level = prob,
    methods = method
  )[, c("mean", "lower", "upper")] |>
    tibble() |>
    mutate(xlim = tau) |>
    relocate(xlim, .before = mean)
}

#' Compute the Weighted Mean Squared Error to assess the calibration of a model
#'
#' @param local_scores tibble with expected scores obtained with the
#'  `local_ci_scores()` function
#' @param scores vector of raw predicted probabilities
weighted_mse <- function(local_scores, scores) {
  # To account for border bias (support is [0,1])
  scores_reflected <- c(-scores, scores, 2 - scores)
  dens <- density(
    x = scores_reflected, from = 0, to = 1,
    n = length(local_scores$xlim)
  )
  # The weights
  weights <- dens$y

  local_scores |>
    mutate(
      wmse_p = (xlim - mean)^2,
      weight = !!weights
    ) |>
    summarise(wmse = sum(weight * wmse_p) / sum(weight)) |>
    pull(wmse)
}

## Local regression score----
#' Calibration score using Local Regression
#'
#' @param obs vector of observed events
#' @param scores vector of predicted probabilities
local_calib_score <- function(obs, scores) {
  # Add a little noise to the scores, to avoid crashing R
  scores <- scores + rnorm(length(scores), 0, .001)

  locfit_0 <- locfit(
    formula = d ~ lp(scores, nn = 0.15, deg = 0),
    kern = "rect", maxk = 200,
    data = tibble(
      d = obs,
      scores = scores
    )
  )
  # Predictions on [0,1]
  linspace_raw <- seq(0, 1, length.out = 100)
  # Restricting this space to the range of observed scores
  keep_linspace <- which(linspace_raw >= min(scores) & linspace_raw <= max(scores))
  linspace <- linspace_raw[keep_linspace]

  locfit_0_linspace <- predict(locfit_0, newdata = linspace)
  locfit_0_linspace[locfit_0_linspace > 1] <- 1
  locfit_0_linspace[locfit_0_linspace < 0] <- 0

  # Squared difference between predicted value and the bisector,
  # weighted by the density of values
  scores_reflected <- c(-scores, scores, 2 - scores)
  dens <- density(
    x = scores_reflected, from = 0, to = 1,
    n = length(linspace_raw)
  )
  # The weights
  weights <- dens$y[keep_linspace]

  weighted.mean((linspace - locfit_0_linspace)^2, weights)
}

## Wrapper----
#' Computes the calibration metrics for a set of observed and predicted
#' probabilities
#'  - mse: Mean Squared Error based on true probabilities
#'  - brier: Brier score
#'  - ece: Expected Calibration Error
#'  - qmse: MSE on bins defined by the quantiles of the predicted scores
#'  - wmse: MSE weighted by the density of the predicted scores
#'
#' @param obs observed events
#' @param scores predicted scores
#' @param true_probas true probabilities from the DGP (to compute MSE)
#' @param linspace vector of values at which to compute the WMSE
#' @param k number of classes (bins) to create (default to `10`)
compute_metrics <- function(obs, scores, true_probas = NULL, linspace = NULL, k = 10) {
  ## True MSE
  if (!is.null(true_probas)) {
    mse <- mean((true_probas - scores)^2)
  } else {
    mse <- NA
  }

  brier <- brier_score(obs = obs, scores = scores)
  if (length(unique(scores)) > 1) {
    ece <- e_calib_error(obs = obs, scores = scores, k = k, threshold = .5)
    qmse <- qmse_error(obs = obs, scores = scores, k = k, threshold = .5)
  } else {
    ece <- NA
    qmse <- NA
  }
  locfit_score <- local_calib_score(obs = obs, scores = scores)

  if (is.null(linspace)) linspace <- seq(0, 1, length.out = 101)
  expected_events <- map(
    .x = linspace,
    .f = ~local_ci_scores(
      obs = obs, scores = scores,
      tau = .x, nn = .15, prob = .95, method = "probit"
    )
  ) |>
    bind_rows()
  wmse <- weighted_mse(local_scores = expected_events, scores = scores)

  tibble(
    mse = mse,
    locfit_score = locfit_score,
    brier = brier,
    ece = ece,
    qmse = qmse,
    wmse = wmse
  )
}
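These functions apply directly to vectors of observed events and scores. For instance (a hypothetical example, not part of the simulations), taking the scores equal to the true probabilities gives perfectly calibrated scores, so the QMSE and the local-regression score should be small.
# Hypothetical example: scores taken equal to the true probabilities
set.seed(456)
p <- runif(2000)
d <- rbinom(2000, size = 1, prob = p)
brier_score(obs = d, scores = p)
e_calib_error(obs = d, scores = p, k = 10)
qmse_error(obs = d, scores = p, k = 10)
local_calib_score(obs = d, scores = p)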
7.1.3 Functions to Train Random Forests
We define two functions to train a random forest on the training set: apply_rf() (for a regression forest) and apply_rf_vote() (for a classification forest). Both functions return the estimated scores on the training set, the calibration set, and the test set.
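The definitions of apply_rf() and apply_rf_vote() are not displayed here. As a rough indication only, a minimal sketch of the regression version could look as follows; the name apply_rf_sketch and its hyperparameter arguments (mtry, nodesize, num_trees) are ours, and the actual functions used in the simulations may differ.
# Minimal sketch (not the actual definition): train a regression forest on the
# train set (outcome d assumed to be numeric 0/1) and return the scores on the
# train, calibration, and test sets
apply_rf_sketch <- function(tb_train, tb_calib, tb_test,
                            mtry, nodesize, num_trees = 500) {
  rf <- randomForest(
    d ~ ., data = tb_train,
    mtry = mtry, nodesize = nodesize, ntree = num_trees
  )
  list(
    scores_train = predict(rf, newdata = tb_train),
    scores_calib = predict(rf, newdata = tb_calib),
    scores_test  = predict(rf, newdata = tb_test)
  )
}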
7.1.4 Calibration Curves
We define the calib_curve_locfit_simul_rf() function to compute the calibration curve for a single simulation.
Code
#' Get the calibration curve for one simulation for the random forest,
#' using the local regression approach
#'
#' @param simul results of a simulation obtained with simul_rf
#' @param linspace values at which to compute the mean observed event when
#'  computing the WMSE
calib_curve_locfit_simul_rf <- function(simul, linspace = NULL) {
  if (is.null(linspace)) linspace <- seq(0, 1, length.out = 101)
  n_obs <- simul$n_obs
  seed <- simul$seed
  # Get the data used to train the forest
  data <- get_samples(n_obs = n_obs, seed = seed)
  tb_train <- data$tb_train
  tb_calib <- data$tb_calib
  tb_test <- data$tb_test
  # Scores estimated by the RF
  scores_train <- simul$scores$scores_train
  scores_calib <- simul$scores$scores_calib
  scores_test <- simul$scores$scores_test

  locfit_0_train <- locfit(
    formula = d ~ lp(score, nn = 0.15, deg = 0),
    kern = "rect", maxk = 200,
    data = tibble(d = tb_train$d, score = scores_train)
  )
  locfit_0_calib <- locfit(
    formula = d ~ lp(score, nn = 0.15, deg = 0),
    kern = "rect", maxk = 200,
    data = tibble(d = tb_calib$d, score = scores_calib)
  )
  locfit_0_test <- locfit(
    formula = d ~ lp(score, nn = 0.15, deg = 0),
    kern = "rect", maxk = 200,
    data = tibble(d = tb_test$d, score = scores_test)
  )

  score_c_locfit_0_train <- predict(locfit_0_train, newdata = linspace)
  score_c_locfit_0_calib <- predict(locfit_0_calib, newdata = linspace)
  score_c_locfit_0_test <- predict(locfit_0_test, newdata = linspace)

  # Make sure to have values in [0,1]
  score_c_locfit_0_train[score_c_locfit_0_train > 1] <- 1
  score_c_locfit_0_train[score_c_locfit_0_train < 0] <- 0
  score_c_locfit_0_calib[score_c_locfit_0_calib > 1] <- 1
  score_c_locfit_0_calib[score_c_locfit_0_calib < 0] <- 0
  score_c_locfit_0_test[score_c_locfit_0_test > 1] <- 1
  score_c_locfit_0_test[score_c_locfit_0_test < 0] <- 0

  res_train <- tibble(
    xlim = linspace,
    locfit_pred = score_c_locfit_0_train,
    sample = "train"
  )
  res_calib <- tibble(
    xlim = linspace,
    locfit_pred = score_c_locfit_0_calib,
    sample = "calibration"
  )
  res_test <- tibble(
    xlim = linspace,
    locfit_pred = score_c_locfit_0_test,
    sample = "test"
  )

  res_train |>
    bind_rows(res_calib) |>
    bind_rows(res_test) |>
    mutate(
      n_obs = n_obs,
      seed = seed
    )
}
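The returned tibble is in long format (columns xlim, locfit_pred, and sample), which makes it easy to draw the calibration curves with ggplot2. For instance, assuming calib_curve holds the output of calib_curve_locfit_simul_rf() for one simulation (a hypothetical object name):
# Hypothetical plotting example for the output of calib_curve_locfit_simul_rf()
ggplot(calib_curve, aes(x = xlim, y = locfit_pred, colour = sample)) +
  geom_line() +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
  labs(x = "Predicted score", y = "Mean observed event")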
7.1.5 Recalibration
We define a core function, recalibrate(), which recalibrates the scores using a recalibrator (as in Chapter 2, we use the following recalibrators: Platt scaling, isotonic regression, beta calibration, and local regression).
#' Recalibrates scores using a calibrator
#'
#' @param obs_train vector of observed events in the train set
#' @param pred_train vector of predicted probabilities in the train set
#' @param obs_calib vector of observed events in the calibration set
#' @param pred_calib vector of predicted probabilities in the calibration set
#' @param obs_test vector of observed events in the test set
#' @param pred_test vector of predicted probabilities in the test set
#' @param method recalibration method (`"platt"` for Platt scaling,
#'  `"isotonic"` for isotonic regression, `"beta"` for beta calibration,
#'  `"locfit"` for local regression)
#' @param params list of named parameters to use in the local regression
#'  (`nn` for fraction of nearest neighbors to use, `deg` for degree)
#' @param linspace vector of values at which to compute the recalibrated scores
#' @returns list of four elements: recalibrated scores on the train set, on the
#'  calibration set, on the test set, and on a segment of values
recalibrate <- function(obs_train, pred_train,
                        obs_calib, pred_calib,
                        obs_test, pred_test,
                        method = c("platt", "isotonic", "beta", "locfit"),
                        params = NULL,
                        linspace = NULL) {
  if (is.null(linspace)) linspace <- seq(0, 1, length.out = 101)
  data_train <- tibble(d = obs_train, scores = pred_train)
  data_calib <- tibble(d = obs_calib, scores = pred_calib)
  data_test <- tibble(d = obs_test, scores = pred_test)

  # Recalibrator trained on calibration data
  if (method == "platt") {
    # Recalibrator
    lr <- glm(
      d ~ scores, family = binomial(link = 'logit'), data = data_calib
    )
    # Recalibrated scores on train/calib/test sets
    score_c_train <- predict(lr, newdata = data_train, type = "response")
    score_c_calib <- predict(lr, newdata = data_calib, type = "response")
    score_c_test <- predict(lr, newdata = data_test, type = "response")
    # Recalibrated scores on [0,1]
    score_c_linspace <- predict(
      lr, newdata = tibble(scores = linspace), type = "response"
    )
  } else if (method == "isotonic") {
    iso <- isoreg(x = data_calib$scores, y = data_calib$d)
    fit_iso <- as.stepfun(iso)
    score_c_train <- fit_iso(data_train$scores)
    score_c_calib <- fit_iso(data_calib$scores)
    score_c_test <- fit_iso(data_test$scores)
    score_c_linspace <- fit_iso(linspace)
  } else if (method == "beta") {
    capture.output({
      bc <- try(beta_calibration(
        p = data_calib$scores,
        y = data_calib$d,
        parameters = "abm" # 3 parameters a, b & m
      ))
    })
    if (!inherits(bc, "try-error")) {
      score_c_train <- beta_predict(p = data_train$scores, bc)
      score_c_calib <- beta_predict(p = data_calib$scores, bc)
      score_c_test <- beta_predict(p = data_test$scores, bc)
      score_c_linspace <- beta_predict(p = linspace, bc)
    } else {
      score_c_train <- score_c_calib <- score_c_test <- score_c_linspace <- NA
    }
  } else if (method == "locfit") {
    noise_scores <- data_calib$scores + rnorm(nrow(data_calib), 0, 0.01)
    noise_data_calib <- data_calib %>% mutate(scores = noise_scores)
    locfit_reg <- locfit(
      formula = d ~ lp(scores, nn = params$nn, deg = params$deg),
      kern = "rect", maxk = 200,
      data = noise_data_calib
    )
    score_c_train <- predict(locfit_reg, newdata = data_train)
    score_c_train[score_c_train < 0] <- 0
    score_c_train[score_c_train > 1] <- 1
    score_c_calib <- predict(locfit_reg, newdata = data_calib)
    score_c_calib[score_c_calib < 0] <- 0
    score_c_calib[score_c_calib > 1] <- 1
    score_c_test <- predict(locfit_reg, newdata = data_test)
    score_c_test[score_c_test < 0] <- 0
    score_c_test[score_c_test > 1] <- 1
    score_c_linspace <- predict(locfit_reg, newdata = linspace)
    score_c_linspace[score_c_linspace < 0] <- 0
    score_c_linspace[score_c_linspace > 1] <- 1
  } else {
    stop(str_c(
      'Wrong method. Use one of the following: ',
      '"platt", "isotonic", "beta", "locfit"'
    ))
  }

  # Format results in tibbles:
  # For train set
  tb_score_c_train <- tibble(
    d = obs_train,
    p_u = pred_train,
    p_c = score_c_train
  )
  # For calibration set
  tb_score_c_calib <- tibble(
    d = obs_calib,
    p_u = pred_calib,
    p_c = score_c_calib
  )
  # For test set
  tb_score_c_test <- tibble(
    d = obs_test,
    p_u = pred_test,
    p_c = score_c_test
  )
  # For linear space
  tb_score_c_linspace <- tibble(
    linspace = linspace,
    p_c = score_c_linspace
  )

  list(
    tb_score_c_train = tb_score_c_train,
    tb_score_c_calib = tb_score_c_calib,
    tb_score_c_test = tb_score_c_test,
    tb_score_c_linspace = tb_score_c_linspace
  )
}
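As an illustration (a hypothetical example, not part of the simulations), Platt scaling can be applied to deliberately miscalibrated scores:
# Hypothetical example: recalibrate miscalibrated scores with Platt scaling
set.seed(789)
p <- runif(3000)
d <- rbinom(3000, size = 1, prob = p)
s <- p^2 # deliberately miscalibrated scores
idx <- rep(c("train", "calib", "test"), each = 1000)
res_platt <- recalibrate(
  obs_train = d[idx == "train"], pred_train = s[idx == "train"],
  obs_calib = d[idx == "calib"], pred_calib = s[idx == "calib"],
  obs_test = d[idx == "test"], pred_test = s[idx == "test"],
  method = "platt"
)
head(res_platt$tb_score_c_test)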
7.1.6 Wrapper Function
Lastly, we define a wrapper function that performs a single replication of the simulations for a given type of problem (regression or classification).
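Its definition is not displayed here. In outline, and only as a sketch with hypothetical names (simulate_one_sketch, data_remaining, rf_fit, tb_train, scores_train), a single replication chains the helpers defined above: split the remaining data, compute the forest's scores on the calibration and test sets, recalibrate, and compute the metrics.
# Sketch of a single replication (hypothetical, not the actual wrapper):
# `data_remaining` holds the observations set aside in Chapter 6, `rf_fit` the
# forest trained in Chapter 6, `tb_train`/`scores_train` its training data and
# in-sample scores, and `d` the binary outcome
simulate_one_sketch <- function(seed, data_remaining, rf_fit,
                                tb_train, scores_train) {
  split <- get_samples(seed = seed, data = data_remaining)
  scores_calib <- predict(rf_fit, newdata = split$tb_calib)
  scores_test <- predict(rf_fit, newdata = split$tb_test)
  # Recalibrate on the calibration set (Platt scaling here, as an example)
  res_c <- recalibrate(
    obs_train = tb_train$d, pred_train = scores_train,
    obs_calib = split$tb_calib$d, pred_calib = scores_calib,
    obs_test = split$tb_test$d, pred_test = scores_test,
    method = "platt"
  )
  # Calibration and goodness-of-fit metrics on the recalibrated test scores
  list(
    calibration = compute_metrics(obs = split$tb_test$d,
                                  scores = res_c$tb_score_c_test$p_c),
    gof = compute_gof(obs = split$tb_test$d,
                      pred = res_c$tb_score_c_test$p_c)
  )
}
The 200 replications then amount to applying such a function to 200 different seeds, possibly in parallel with the future framework loaded above.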
We now explore the relationship between calibration and goodness-of-fit.
Let us consider different sets of hyperparameters for the random forests. We will train a forest on each set and compute calibration and goodness-of-fit metrics on both the training set and the remaining set (we no longer split the remaining data into a calibration set and a test set).
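For example, the sets of hyperparameters could be stored in a grid (the values below are purely illustrative) and then looped over:
# Purely illustrative grid of hyperparameters for the forests
grid <- expand_grid(
  mtry = c(1, 2, 4),
  nodesize = c(5, 10, 20),
  num_trees = c(100, 300, 500)
)
# Each row would then be passed to the training/evaluation routine,
# e.g. with purrr::pmap()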