14  Compas

Objectives

In this page, we calculate the average total causal effect, the average natural direct effect and the causal mediation effect (or indirect effect) of COMPAS dataset assuming a known causal graph. We compare three methodologies to estimate those effects:

  • Using Optimal Transport (OT) to first derive counterfactuals at the individual level and then, averaging over all the individuals in the dataset,
  • Using Sequential Transport (ST) to first derive counterfactuals at the individual level and then, averaging over all the individuals in the dataset,
  • Using LSEM from causal mediation analysis that fits linear models to estimate the different average causal effects.
library(tidyverse)
# remotes::install_github(
#   repo = "fer-agathe/sequential_transport", subdir = "seqtransfairness"
# )
library(seqtransfairness)
# remotes::install_github(repo = "fer-agathe/transport-simplex")
library(transportsimplex)
library(randomForest)
library(fairadapt)
library(grf)
library(cluster)

# Also required:
# install.packages(mlr3fairness)

\[ \definecolor{wongBlack}{RGB}{0,0,0} \definecolor{wongGold}{RGB}{230, 159, 0} \definecolor{wongLightBlue}{RGB}{86, 180, 233} \definecolor{wongGreen}{RGB}{0, 158, 115} \definecolor{wongYellow}{RGB}{240, 228, 66} \definecolor{wongBlue}{RGB}{0, 114, 178} \definecolor{wongOrange}{RGB}{213, 94, 0} \definecolor{wongPurple}{RGB}{204, 121, 167} \definecolor{colA}{RGB}{255, 221, 85} \definecolor{colB}{RGB}{148, 78, 223} \definecolor{colC}{RGB}{63, 179, 178} \definecolor{colGpeZero}{RGB}{127, 23, 14} \definecolor{colGpeUn}{RGB}{27, 149, 224} \]

Codes for graphical parameters.
library(extrafont, quietly = TRUE)
font_family <- "CMU Serif"


path <- "./figs/"
if (!dir.exists(path)) dir.create(path)

We load the functions that will allow us to build the counterfactuals (see Chapter 4):

source("../scripts/functions.R")

14.1 Data

We load the COMPAS dataset that is available in the {fairadapt} package.

data(compas, package = "fairadapt")

The outcome variable is binary here: whether the defendant is rearrested at any time. The “treatment” \(A\) will be the sensitive attribute, race. In the data compiled in {fairadapt}, race is defined as White (which we will consider as \(A=1\)) and Non-White (which we will consider as \(A=0\)). The idea is to build counterfactuals for non-white people to ask questions such as “had this non-white individual been white, what would the prediction of an algorithm modeling recidivism be?”.

To train the predictive model of recidivism, we will use the following covariates: the age, the prior criminal records of defendants, and the charge degree (felony or misdemeanor).

tb <- compas |> 
  as_tibble() |> 
  select(
    race, # sensitive
    age, 
    priors_count, # The prior criminal records of defendants. 
    c_charge_degree, # F: Felony M: Misdemeanor
    is_recid = two_year_recid # outcome
  ) |> 
  mutate(
   race = ifelse(race == "Non-White", 0, 1), # non-white indiv. as "untreated"
  )
dim(tb)
[1] 7214    5
summary(tb)
      race             age         priors_count    c_charge_degree
 Min.   :0.0000   Min.   :18.00   Min.   : 0.000   F:4666         
 1st Qu.:0.0000   1st Qu.:25.00   1st Qu.: 0.000   M:2548         
 Median :0.0000   Median :31.00   Median : 2.000                  
 Mean   :0.3402   Mean   :34.82   Mean   : 3.472                  
 3rd Qu.:1.0000   3rd Qu.:42.00   3rd Qu.: 5.000                  
 Max.   :1.0000   Max.   :96.00   Max.   :38.000                  
    is_recid     
 Min.   :0.0000  
 1st Qu.:0.0000  
 Median :0.0000  
 Mean   :0.4507  
 3rd Qu.:1.0000  
 Max.   :1.0000  
#write.csv(tb, "compas.csv", row.names = FALSE)

We assume the DAG shown in Figure 14.1.

variables <- c("race", 
               "age", "priors_count", "c_charge_degree", 
               "is_recid")
# Row: outgoing arrow
adj <- matrix(
  # S  1  2  3  Y
  c(0, 1, 1, 1, 1,# S
    0, 0, 1, 1, 1,# 1 (age)
    0, 0, 0, 0, 1,# 2 (priors_count)
    0, 0, 0, 0, 1,# 3 (c_charge_degree)
    0, 0, 0, 0, 0 # Y
  ),
  ncol = length(variables),
  dimnames = rep(list(variables), 2),
  byrow = TRUE
)

causal_graph <- fairadapt::graphModel(adj)
plot(causal_graph)
Figure 14.1: Assumed structural model for the probability of recidivism.

14.2 Counterfactuals

Let us set a seed for reproducibility.

seed <- 1234
set.seed(seed)
A_name <- "race" # treatment name
Y_name <- "is_recid" # outcome name
A_untreated <- 0
A <- tb[[A_name]]
ind_untreated <- which(A == A_untreated)
tb_untreated <- tb[ind_untreated, ]
tb_treated <- tb[-ind_untreated, ]

Let us follow the DAG from Figure 14.1 and build the counterfactuals of non-white individuals: we thus transport individuals from \(A=0\) to \(A=1\), using the predictions on the test set.

14.2.1 Multivariate Optimal Transport

We apply multivariate optimal transport (OT), following the methodology developed in De Lara et al. (2024).

tb_untreated_wo_A <- tb_untreated[ , !(names(tb_untreated) %in% A_name)]
tb_treated_wo_A <- tb_treated[ , !(names(tb_treated) %in% A_name)]
n0 <- nrow(tb_untreated_wo_A)
n1 <- nrow(tb_treated_wo_A)
y0 <- tb_untreated_wo_A[[Y_name]]
y1 <- tb_treated_wo_A[[Y_name]]
X0 <- tb_untreated_wo_A[ , !(names(tb_untreated_wo_A) %in% Y_name)]
X1 <- tb_treated_wo_A[ , !(names(tb_treated_wo_A) %in% Y_name)]

To apply Optimal Transport on the COMPAS dataset, we first need to one-hot the categorical variables.

num_cols <- names(X0)[sapply(X0, is.numeric)]
cat_cols <- names(X0)[sapply(X0, function(col) is.factor(col) || is.character(col))]
X0_num <- X0[ , num_cols]
X1_num <- X1[ , num_cols]
X0_cat <- X0[ , cat_cols]
X1_cat <- X1[ , cat_cols]

cat_counts <- sapply(X0[ , cat_cols], function(col) length(unique(col)))

Categorical variables are one-hot encoded:

library(caret)
X0_cat_encoded <- list()
X1_cat_encoded <- list()
for (col in cat_cols) {
  # One-hot encoding with dummyVars
  formula <- as.formula(paste("~", col))
  dummies <- dummyVars(formula, data = X0_cat)
  
  # Dummy variable
  dummy_0 <- predict(dummies, newdata = X0_cat) %>% as.data.frame()
  dummy_1 <- predict(dummies, newdata = X1_cat) %>% as.data.frame()
  
  # Scaling
  dummy_0_scaled <- scale(dummy_0)
  dummy_1_scaled <- scale(dummy_1)

  dummy_0_df <- as.data.frame(dummy_0_scaled)
  dummy_1_df <- as.data.frame(dummy_1_scaled)
  
  # Aling categories in both treated/untreated groups
  all_cols <- union(colnames(dummy_0_df), colnames(dummy_1_df))
  dummy_0_df <- dummy_0_df %>% mutate(across(.fns = identity)) %>% select(all_of(all_cols)) %>% replace(is.na(.), 0)
  dummy_1_df <- dummy_1_df %>% mutate(across(.fns = identity)) %>% select(all_of(all_cols)) %>% replace(is.na(.), 0)
  
  # Sauvegarde dans les listes
  X0_cat_encoded[[col]] <- dummy_0_df
  X1_cat_encoded[[col]] <- dummy_1_df
}

We calculate Euclidean distance for numerical variables.

# library(proxy)
num_dist <- proxy::dist(x = X0_num, y = X1_num, method = "Euclidean")
num_dist <- as.matrix(num_dist)

For categorical variables, we use the Hamming distance.

cat_dists <- list()
for (col in cat_cols) {
  mat_0 <- as.matrix(X0_cat_encoded[[col]])
  mat_1 <- as.matrix(X1_cat_encoded[[col]])
  dist_mat <- proxy::dist(x = mat_0, y = mat_1, method = "Euclidean")
  cat_dists[[col]] <- as.matrix(dist_mat)
}

Then we need to combine the two distance matrices. We use weights equal to the proportion of numerical variables and the proportion of categorical variables, respectively for distances based on numerical and categorical variables.

combined_cost <- num_dist
for (i in seq_along(cat_dists)) {
  combined_cost <- combined_cost + cat_dists[[i]]
}

Then, we can compute the transport map:

# Uniform weights (equal mass)
w0 <- rep(1 / n0, n0)
w1 <- rep(1 / n1, n1)
# Compute transport plan
transport_res <- transport::transport(
  a = w0,
  b = w1,
  costm = combined_cost,
  method = "shortsimplex"
)

transport_plan <- matrix(0, nrow = n0, ncol = n1)
for(i in seq_len(nrow(transport_res))) {
  transport_plan[transport_res$from[i], transport_res$to[i]] <- transport_res$mass[i]
}

We first transport the numerical variables.

num_transported <- n0 * (transport_plan %*% as.matrix(X1_num))

Then, we transport the categorical variables with label reconstruction (not perfect here).

cat_transported <- list()
for (col in cat_cols) {
  cat_probs <- transport_plan %*% as.matrix(X1_cat_encoded[[col]])
  cat_encoded_columns <- colnames(X1_cat_encoded[[col]])
  # For each obs., we take the index with the maximum value (approx. proba)
  max_indices <- apply(cat_probs, 1, which.max)
  prefix_pattern <- paste0("^", col, "\\.")
  cat_transported[[col]] <- sapply(max_indices, function(x) sub(prefix_pattern, "", cat_encoded_columns[x]))
}

We can now store the results into a tibble.

tb_ot_transported <- as_tibble(num_transported)
for (col in cat_cols) {
  tb_ot_transported[[col]] <- cat_transported[[col]]
}

save(tb_ot_transported, file = "../output/ot-compas.rda")
# Load tb_ot_transported
load("../output/ot-compas.rda")
tb_ot_transported <- tb_ot_transported |>
  mutate(c_charge_degree = as.factor(c_charge_degree))
tb_ot_transported <- as.list(tb_ot_transported)

14.2.2 Penalized Optimal Transport

We can directly compute the transport map using Sinkhorn penalty.

# Compute transport plan
sinkhorn_transport_res <- T4transport::sinkhornD(
  combined_cost, wx = w0, wy = w1, lambda = 0.1
)

sinkhorn_transport_plan <- sinkhorn_transport_res$plan

We first transport the numerical variables.

num_sinkhorn_transported <- n0 * (sinkhorn_transport_plan %*% as.matrix(X1_num))

Then, we transport the categorical variables with label reconstruction (not perfect here).

cat_sinkhorn_transported <- list()
for (col in cat_cols) {
  cat_probs <- sinkhorn_transport_plan %*% as.matrix(X1_cat_encoded[[col]])
  cat_encoded_columns <- colnames(X1_cat_encoded[[col]])
  # For each obs., we take the index with the maximum value (approx. proba)
  max_indices <- apply(cat_probs, 1, which.max)
  prefix_pattern <- paste0("^", col, "\\.")
  cat_sinkhorn_transported[[col]] <- sapply(max_indices, function(x) sub(prefix_pattern, "", cat_encoded_columns[x]))
}

We can now store the results into a tibble.

tb_sinkhorn_transported <- as_tibble(num_sinkhorn_transported)
for (col in cat_cols) {
  tb_sinkhorn_transported[[col]] <- cat_sinkhorn_transported[[col]]
}

save(tb_sinkhorn_transported, file = "../output/sinkhorn-compas.rda")
# Load tb_sinkhorn_transported
load("../output/sinkhorn-compas.rda")
tb_sinkhorn_transported <- tb_sinkhorn_transported |>
  mutate(c_charge_degree = as.factor(c_charge_degree))
tb_sinkhorn_transported <- as.list(tb_sinkhorn_transported)

14.2.3 Sequential Transport

We call the seq_trans() function (see Chapter 4) function to build the counterfactuals of Black individuals. The estimations are done using parallel computation.

library(pbapply)
library(parallel)
ncl <- detectCores()-1
(cl <- makeCluster(ncl))
socket cluster with 9 nodes on host 'localhost'
clusterEvalQ(cl, {
  library(transportsimplex)
}) |>
  invisible()


A_name <- "race" # treatment name
Y_name <- "is_recid" # outcome name

sequential_transport <- seq_trans(
  data = tb, 
  adj = adj, 
  s = A_name, 
  S_0 = 0, # source: untreated
  y = Y_name, 
  num_neighbors = 50, 
  num_neighbors_q = NULL,
  silent = FALSE,
  cl = cl
)
Transporting  age 
Transporting  priors_count 
Transporting  c_charge_degree 
save(sequential_transport, file = "../output/seq-t-compas.rda")

stopCluster(cl)
# Load sequential_transport
load("../output/seq-t-compas.rda")

14.2.4 Fairadapt

# RF
fpt_model_rf <- fairadapt(
    priors_count ~ ., 
    train.data = tb,
    prot.attr = A_name, adj.mat = adj,
    quant.method = rangerQuants
)

adapt_df_rf <- adaptedData(fpt_model_rf)
adapt_df_rf_untreated <- adapt_df_rf[ind_untreated,]

# Linear model
fpt_model_lin <- fairadapt(
    priors_count ~ ., 
    train.data = tb,
    prot.attr = A_name, adj.mat = adj,
    quant.method = linearQuants
)

adapt_df_lin <- adaptedData(fpt_model_lin)
adapt_df_lin_untreated <- adapt_df_lin[ind_untreated,]

14.3 Measuring the Causal Effect

14.3.1 With Causal Mediation Analysis

Let us use the multimed() function from {mediation} to estimate the direct effect:

  • race -> is_recid (ir), and the different indirect effects:
  • race -> age -> is_recid,
  • race -> priors_count (pc) -> is_recid,
  • race -> c_charge_degree (ccd) -> is_recid,
  • race -> age -> priors_count -> is_recid,
  • race -> age -> c_charge_degree -> is_recid.
# library(mediation) # we do not load it
#  otherwise it masks a lot of useful functions

# We encode the categorical variable as for optimal transport
tb_med <- tb |> 
  mutate(
    c_charge_degree = ifelse(c_charge_degree == "F", 0, 1)
  )

med_mod_age <- mediation::multimed(
  outcome = "is_recid", 
  med.main = "age", 
  med.alt = c("priors_count", "c_charge_degree"), 
  treat = "race", 
  data = tb_med
)
# Indirect effect for age: race -> age -> ir
delta_0_med_age <- mean((med_mod_age$d0.lb + med_mod_age$d0.ub) / 2)
# Direct + Other indirect effects: race -> ir, race -> pc -> ir, race -> ccd -> ir, 
# race -> age -> pc -> ir, race -> age -> ccd -> ir
zeta_1_med_age <- mean((med_mod_age$z1.lb + med_mod_age$z1.ub) / 2)
# Total effect
tot_effect_med_age <- delta_0_med_age + zeta_1_med_age

med_mod_pc <- mediation::multimed(
  outcome = "is_recid", 
  med.main = "priors_count", 
  med.alt = c("age", "c_charge_degree"), 
  treat = "race", 
  data = tb_med
)
# Indirect effect for pc: race -> pc -> ir, race -> age -> pc -> ir
delta_0_med_pc <- mean((med_mod_pc$d0.lb + med_mod_pc$d0.ub) / 2)
# Direct + Other indirect effects: race -> ir, race -> age -> ir, race -> ccd -> ir, 
# race -> age -> ccd -> ir
zeta_1_med_pc <- mean((med_mod_pc$z1.lb + med_mod_pc$z1.ub) / 2)
# Total effect
tot_effect_med_pc <- delta_0_med_pc + zeta_1_med_pc

med_mod_ccd <- mediation::multimed(
  outcome = "is_recid", 
  med.main = "c_charge_degree", 
  med.alt = c("age", "priors_count"), 
  treat = "race", 
  data = tb_med
)
# Indirect effect for ccd: race -> ccd -> ir, race -> age -> ccd -> ir
delta_0_med_ccd <- mean((med_mod_ccd$d0.lb + med_mod_ccd$d0.ub) / 2)
# Direct + Other indirect effects: race -> ir, race -> age -> ir, race -> pc -> ir, 
# race -> age -> pc -> ir
zeta_1_med_ccd <- mean((med_mod_ccd$z1.lb + med_mod_ccd$z1.ub) / 2)
# Total effect
tot_effect_med_ccd <- delta_0_med_ccd + zeta_1_med_ccd

The estimated values:

# Total effect
tot_effect_med <- tot_effect_med_age
# Indirect effects
delta_0_med <- delta_0_med_age + delta_0_med_pc + delta_0_med_ccd
# Direct effect 
zeta_1_med <- tot_effect_med - delta_0_med
cbind(delta_0 = delta_0_med, zeta_1 = zeta_1_med, tot_effect = tot_effect_med)
         delta_0      zeta_1  tot_effect
[1,] -0.09138055 0.004981563 -0.08639899

14.3.2 With Optimal Transport

library(randomForest)

We use a random forest to estimate the outcome model (see causal_effects_cf() in Chapter 4).

causal_effects_ot <- causal_effects_cf(
  data_untreated = tb_untreated, 
  data_treated = tb_treated,
  data_cf_untreated = as_tibble(tb_ot_transported), 
  data_cf_treated = NULL, 
  Y_name = Y_name, 
  A_name = A_name, 
  A_untreated = A_untreated # 0
)

cbind(
  delta_0 = causal_effects_ot$delta_0,
  zeta_1 = causal_effects_ot$zeta_1, 
  tot_effect = causal_effects_ot$tot_effect
)
         delta_0      zeta_1  tot_effect
[1,] -0.05991528 -0.02702946 -0.08694475

14.3.3 With Penalized Optimal Transport

causal_effects_sink_ot <- causal_effects_cf(
  data_untreated = tb_untreated, 
  data_treated = tb_treated,
  data_cf_untreated = as_tibble(tb_sinkhorn_transported), 
  Y_name = Y_name, 
  A_name = A_name, 
  A_untreated = A_untreated # 0
)

cbind(
  delta_0 = causal_effects_sink_ot$delta_0,
  zeta_1 = causal_effects_sink_ot$zeta_1, 
  tot_effect = causal_effects_sink_ot$tot_effect
)
        delta_0      zeta_1    tot_effect
[1,] 0.03842991 -0.03926703 -0.0008371113

14.3.4 With Sequential Transport

causal_effects_st <- causal_effects_cf(
  data_untreated = tb_untreated, 
  data_treated = tb_treated,
  data_cf_untreated = as_tibble(sequential_transport$transported), 
  Y_name = Y_name, 
  A_name = A_name, 
  A_untreated = A_untreated
)

cbind(
  delta_0 = causal_effects_st$delta_0,
  zeta_1 = causal_effects_st$zeta_1, 
  tot_effect = causal_effects_st$tot_effect
)
         delta_0      zeta_1  tot_effect
[1,] -0.05742751 -0.02291117 -0.08033868

14.3.5 With fairadapt

causal_effects_fairadapt_rf <- causal_effects_cf(
  data_untreated = tb_untreated, 
  data_treated = tb_treated,
  data_cf_untreated = as_tibble(adapt_df_rf_untreated |> select(-c(!!A_name, !!Y_name))), 
  Y_name = Y_name, 
  A_name = A_name, 
  A_untreated = A_untreated
)

causal_effects_fairadapt_lin <- causal_effects_cf(
  data_untreated = tb_untreated, 
  data_treated = tb_treated,
  data_cf_untreated = as_tibble(adapt_df_lin_untreated |> select(-c(!!A_name, !!Y_name))), 
  Y_name = Y_name, 
  A_name = A_name, 
  A_untreated = A_untreated
)

cbind(
  delta_0_rf = causal_effects_fairadapt_rf$delta_0,
  zeta_1_rf = causal_effects_fairadapt_rf$zeta_1, 
  tot_effect_rf = causal_effects_fairadapt_rf$tot_effect,
  delta_0_lin = causal_effects_fairadapt_lin$delta_0,
  zeta_1_lin = causal_effects_fairadapt_lin$zeta_1, 
  tot_effect_lin = causal_effects_fairadapt_lin$tot_effect
)
       delta_0_rf   zeta_1_rf tot_effect_rf  delta_0_lin  zeta_1_lin
[1,] 7.209485e-05 -0.03145709     -0.031385 0.0001355641 -0.03182883
     tot_effect_lin
[1,]    -0.03169326

Let us visualize the distribution on the individuals effects for each transport-based method, in Figure 14.2 (frequency on the y-axis).

Codes to create the Figure.
library(tikzDevice)
source("../scripts/utils.R")

plot_hist_effects <- function(x, 
                              var_name, 
                              tikz = FALSE,
                              fill = "red",
                              printed_method = "",
                              x_lim = NULL,
                              print_main = TRUE,
                              print_x_axis = TRUE) {
  if (print_main == TRUE) {
    name_effect <- case_when(
      str_detect(var_name, "^delta_0") ~ "$\\delta_i$",#"$\\delta_i(0)$",
      str_detect(var_name, "^zeta_1") ~ "$\\zeta_i$",#"$\\zeta_i(1)$",
      str_detect(var_name, "^tot_effect") ~ "$\\tau_i$",#"$\\tau_i(1)$",
      TRUE ~ "other"
    )
    if (tikz == FALSE) name_effect <- latex2exp::TeX(name_effect)
  } else {
    name_effect <- ""
  }
  
  
  if (var_name == "tot_effect") {
    data_plot <- x[["delta_0_i"]] + x[["zeta_1_i"]]
  } else {
    data_plot <- x[[var_name]]
  }
  
  if (is.null(x_lim)) {
    hist(
      data_plot, 
      main = "", xlab = "", ylab = "", family = font_family,
      col = fill, axes = FALSE
    )
  } else {
    hist(
      data_plot, 
      main = "", xlab = "", ylab = "", family = font_family,
      col = fill, xlim = x_lim, axes = FALSE
    )
  }
  
  if (print_x_axis) axis(1, family = font_family)
  axis(2, family = font_family)
  
  title(
    main = name_effect, cex.main = 1, family = font_family
  )
  
  if (printed_method != "") {
    title(
      ylab = printed_method, line = 2, 
      cex.lab = 1, family = font_family
    )
  }
  abline(v = mean(data_plot), col = "darkred", lty = 2, lwd = 2)
}


colour_methods <- c(
   "fairadapt_rf" = "#9966FF",
  # "OT" = "#CC79A7",
  "OT-M" = "#009E73",
  "skh" = "darkgray",
  "seq_1" = "#0072B2"
  #"seq_2" = "#D55E00",
)


export_tikz <- FALSE

scale <- 1.42
file_name <- "compas-dist-indiv-effects"
width_tikz <- 2.25*scale
height_tikz <- 1.4*scale
if (export_tikz == TRUE)
  tikz(paste0("figs/", file_name, ".tex"), width = width_tikz, height = height_tikz)

layout(
  matrix(1:12, byrow = TRUE, ncol = 3),
  widths = c(1, rep(.9, 2)), heights = c(1, rep(.72, 2))
)

for (i in 1:4) {
  x <- case_when(
    i == 1 ~ causal_effects_fairadapt_rf,
    i == 2 ~ causal_effects_ot,
    i == 3 ~ causal_effects_sink_ot,
    i == 4 ~ causal_effects_st
    
  )
  method <- case_when(
    i == 1 ~ "FPT-RF",
    i == 2 ~ "OT-M",
    i == 3 ~ "SKH",
    i == 4 ~ "ST",
  )
  
  for (var_name in c("delta_0_i", "zeta_1_i", "tot_effect")) {
    mar_bottom <- ifelse(i == 4, 2.1, .6)
    mar_left <- ifelse(var_name == "delta_0_i", 3.1, 2.1)
    mar_top <- ifelse(i == 1, 2.1, .1)
    mar_right <- .4
    printed_method <- ifelse(
      var_name == "delta_0_i", 
      ifelse(method == "OT-M", "OT", method),
      ""
    )
    
    par(mar = c(mar_bottom, mar_left, mar_top, mar_right))
    x_lim_list <- list(
      "delta_0_i" = c(-.5, .5),
      "zeta_1_i" = c(-.4, .4),
      "tot_effect" = c(-.6, .6)
    )
    
    plot_hist_effects(
      x = x, var_name = var_name, tikz = export_tikz, 
      fill = colour_methods[i],
      printed_method = printed_method, 
      x_lim = x_lim_list[[var_name]],
      print_main = i == 1,
      print_x_axis = i == 4
    )
  }
}


if (export_tikz == TRUE) {
  dev.off()
  plot_to_pdf(
    filename = file_name, 
    path = "./figs/", keep_tex = FALSE, crop = T
  )
}
Figure 14.2: Distribution of individual direct effect (\(\delta_i(0)\)), indirect effect (\(\zeta_i(0)\)), and total causal effect (\(\tau_i\)) estimated with transport-based counterfactuals with optimal transport (OT-M), penalized transport (SKH), and sequential transport (ST).

14.3.6 With Normalizing Flows

We export the dataset to estimate the causal effect using normalizing flows with the python library medflow (see the Github page of medflow). The python script is provided below but is not run during the compilation of this page. Note that this procedure does not provide individual effects, but aggregated ones.

save(tb, file = "../output/compas.rda")
write_csv(tb, file = "../output/compas.csv")
import numpy as np
import os
from medflow import train_med, sim_med
import pandas as pd

df2 = pd.read_csv("compas.csv")
df2["c_charge_degree"] = (df2["c_charge_degree"] == "F").astype(int)

df2.to_csv("compas_cleaned.csv", index=False)

base_path = './'  # Define the base path for file operations.
folder = '.'  # Define the folder where files will be stored.
path = os.path.join(base_path, folder, '')  # Combines the base path and folder into a complete path.
dataset_name = 'compas_cleaned'  # Define the name of the dataset.

if not (os.path.isdir(path)):  # checks if a directory with the name 'path' exists.
    os.makedirs(path)  # if not, creates a new directory with this name. This is where the logs and model weights will be saved.
    
## MODEL TRAINING (~25 min)
train_med(path=path, 
  dataset_name=dataset_name, 
  treatment='race', 
  confounder=[], 
  mediator=["age", "priors_count","c_charge_degree"], 
  outcome='is_recid', 
  test_size=0.2, 
  cat_var=["c_charge_degree"], 
  sens_corr=None, 
  seed_split=1,
  model_name=path + 'seed_1', 
  trn_batch_size=64, 
  val_batch_size=512, 
  learning_rate=1e-4, 
  seed=1,
  nb_epoch=3000, 
  nb_estop=20, 
  val_freq=1, 
  emb_net=[164, 48, 32], 
  int_net=[32, 24, 16]
)

## EFFECT ESTIMATION (~3 minutes)
# Path-specific effects
sim_med(
  path=path, 
  dataset_name=dataset_name, 
  model_name=path + 'seed_1', 
  n_mce_samples=10000, seed=1, 
  inv_datafile_name='1_path_100k_v2',
  cat_list=[0, 1], 
  moderator=None
)

In this page, we load the results:

nf_estim <- read.csv("../output/1_path_100k_v2_results.csv")

We compute the direct, indirect and total effects

nf_tot_effect <- 
  nf_estim |> filter(Potential.Outcome == "E[is_recid(race=1)]") |> pull("Value") -
  nf_estim |> filter(Potential.Outcome == "E[is_recid(race=0)]") |> pull("Value")

# Indirect
nf_delta_0 <- 
  nf_estim |> filter(
    Potential.Outcome == "E[is_recid(race=0, age(race=1), priors_count(race=1), c_charge_degree(race=1))]"
    ) |> pull("Value") -
  nf_estim |> filter(
    Potential.Outcome == "E[is_recid(race=0)]"
  ) |> pull("Value")
  
# Direct
nf_zeta_1 <- nf_tot_effect - nf_delta_0

# Sanity check for direct effect
nf_estim |> filter(
  Potential.Outcome == "E[is_recid(race=1)]"
) |> pull("Value")-
nf_estim |> filter(
  Potential.Outcome == "E[is_recid(race=0, age(race=1), priors_count(race=1), c_charge_degree(race=1))]"
) |> pull("Value")
[1] -0.0004484355

14.3.7 Summary

tibble::tribble(
  ~method, ~delta_0, ~zeta_1, ~tot_effect,
  "CM", delta_0_med, zeta_1_med, tot_effect_med,
  "FPT-RF", causal_effects_fairadapt_rf$delta_0, causal_effects_fairadapt_rf$zeta_1, causal_effects_fairadapt_rf$tot_effect,
  "FPT-LIN", causal_effects_fairadapt_lin$delta_0, causal_effects_fairadapt_lin$zeta_1, causal_effects_fairadapt_lin$tot_effect,
  "NF", nf_delta_0, nf_zeta_1, nf_tot_effect,
  "OT", causal_effects_ot$delta_0, causal_effects_ot$zeta_1, causal_effects_ot$tot_effect,
  "SKH", causal_effects_sink_ot$delta_0, causal_effects_sink_ot$zeta_1, causal_effects_sink_ot$tot_effect,
  "ST", causal_effects_st$delta_0, causal_effects_st$zeta_1, causal_effects_st$tot_effect
) |> 
  knitr::kable(digits = 3)
method delta_0 zeta_1 tot_effect
CM -0.091 0.005 -0.086
FPT-RF 0.000 -0.031 -0.031
FPT-LIN 0.000 -0.032 -0.032
NF -0.061 0.000 -0.061
OT -0.060 -0.027 -0.087
SKH 0.038 -0.039 -0.001
ST -0.057 -0.023 -0.080

14.3.8 Decomposition of the Indirect Effect

Due to the iterative nature of the sequential transport approach, which follows the topological order of the mediators in the causal graph, it is possible to further decompose the causal influence of each mediator on the indirect effect at the individual level.

# Load sequential_transport
load("../output/seq-t-compas.rda")

Total indirect effect with the Sequential Transport approach:

causal_effects_st$delta_0
[1] -0.05742751

Before decomposing the indirect effect, we need to retrieve the fitted RF model of the total indirect effect.

# Observations
data_untreated <- tb_untreated
# All variables transported
data_cf_untreated <- as_tibble(sequential_transport$transported)

Y_name <- Y_name
A_name <- A_name

# Outcome model for untreated
mu_untreated_model <- randomForest(
  x = data_untreated |> dplyr::select(-!!Y_name, -!!A_name),
  y = pull(data_untreated, !!Y_name)
)

Now we can modify the causal_effects_cf() to take the fitted RF model as an argument to be able to decompose the indirect effect and we return only the indirect effect at the individual level, delta_0_i and the average delta_0.

modified_causal_effects_cf <- function(data_untreated,
                                       data_cf_untreated,
                                       mu_untreated_model) {
  
  n_untreated <- nrow(data_untreated)
  
  # Natural Indirect Effect, using predictions
  delta_0_i <- predict(mu_untreated_model, newdata = data_cf_untreated) -
      predict(mu_untreated_model, newdata = data_untreated)
  delta_0 <- mean(delta_0_i)
  
  list(
    delta_0_i = delta_0_i,
    delta_0 = delta_0
  )
}

The total indirect effect is:

modified_causal_effects_cf(data_untreated, data_cf_untreated, mu_untreated_model)$delta_0
[1] -0.05608915

14.3.8.1 Influence of Age

cf_treated_first <- data_untreated |> 
  mutate(age = as_tibble(sequential_transport$transported)$age)

indirect_age <- modified_causal_effects_cf(
  data_untreated = data_untreated, 
  data_cf_untreated = cf_treated_first, 
  mu_untreated_model = mu_untreated_model
)

cbind(
  delta_0_age = indirect_age$delta_0
)
     delta_0_age
[1,] -0.03466058

14.3.8.2 Influence of Prior Counts

cf_treated_second <- cf_treated_first |> 
  mutate(priors_count = as_tibble(sequential_transport$transported)$priors_count)

indirect_priors_count <- modified_causal_effects_cf(
  data_untreated = cf_treated_first, 
  data_cf_untreated = cf_treated_second, 
  mu_untreated_model = mu_untreated_model
)

cbind(
  delta_0_priors_count = indirect_priors_count$delta_0
)
     delta_0_priors_count
[1,]          -0.01750712

14.3.8.3 Influence of Charge Degree

cf_treated_third <- cf_treated_second |> 
  mutate(c_charge_degree = as_tibble(sequential_transport$transported)$c_charge_degree)

indirect_c_charge_degree <- modified_causal_effects_cf(
  data_untreated = cf_treated_second, 
  data_cf_untreated = cf_treated_third, 
  mu_untreated_model = mu_untreated_model
)

cbind(
  delta_0_c_charge_degree = indirect_c_charge_degree$delta_0
)
     delta_0_c_charge_degree
[1,]             -0.00392145