14 Datasets and Priors

Note

This chapter provides priors for the distribution of the true probability of binary events from the real-world datasets used in the paper. It uses the functions defined in Chapter 13. We assume that the true probabilities are distributed according to a Beta distribution. The parameters of that distribution are obtained by fitting a Beta distribution by maximum likelihood on the scores estimated using three models: a Generalized Linear Model (GLM), a Generalized Additive Model (GAM), and a Generalized Additive Model with model selection (GAMSEL).

library(tidyverse)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(gam)

Loading required package: splines
Loading required package: foreach

Attaching package: 'foreach'

The following objects are masked from 'package:purrr':

    accumulate, when

Loaded gam 1.22-3

library(gamsel)

Loaded gamsel 1.8-4

# Named palette: one colour per sample (train and test)
colour_samples <- setNames(
  c("#0072B2", "#D55E00"),
  c("Train", "Test")
)

Let us load the functions defined in Chapter 13:

source("functions/real-data.R")

We will store the results obtained from the estimations in the output/real-data/ folder.

# Create the folder where the results are stored. `recursive = TRUE` also
# creates the parent `output/` folder when it does not exist yet (without it,
# `dir.create()` fails on a fresh clone).
if (!dir.exists("output/real-data/")) {
  dir.create("output/real-data/", recursive = TRUE)
}

14.1 Datasets

All the datasets used here are from the UCI Machine Learning Repository.

14.1.1 Abalone

URL to the data: https://archive.ics.uci.edu/dataset/1/abalone
Description: Predict the age of abalone from physical measurements.
Number of instances: 4,177
Features: 8
Reference: Nash et al. (1995)

# Short identifier of the dataset, reused below to build the download URL and
# the path to the zip archive
name <- "abalone"

The dataset needs to be downloaded.

Code to download the data

# Make sure the local data folder exists before fetching the archive
if (!dir.exists("data")) {
  dir.create("data")
}
# Fetch the zip archive and store it as data/<name>.zip
download.file(
  url = paste0("https://archive.ics.uci.edu/static/public/1/", name, ".zip"),
  destfile = paste0("data/", name, ".zip")
)

Then, we can import the dataset:

Code to import the data

# Import the data file straight from the zip archive. Column names are
# supplied explicitly, so the first line of the file is read as data.
tb_abalone <- read_csv(
  file = unz(
    description = paste0("data/", name, ".zip"),
    filename = paste0(name, ".data")
  ),
  col_names = c(
    "Sex", "Length", "Diameter", "Height",
    "Whole_weight", "Shucked_weight", "Viscera_weight", "Shell_weight",
    "Rings"
  ),
  show_col_types = FALSE
)

The target variable is sex. Let us turn it into a $\{0,1\}$ variable (1 for male, 0 otherwise).

# Binary target: 1 when Sex is "M", 0 for every other value
target_name <- "Sex"
tb_abalone <- mutate(tb_abalone, Sex = ifelse(Sex == "M", 1, 0))

Let us call the get_beta_fit() from Chapter 13 to get our priors.

# Get the priors with get_beta_fit() (loaded from functions/real-data.R);
# the seed is fixed for reproducibility
priors_abalone <- get_beta_fit(
  dataset = tb_abalone,
  target_name = target_name,
  seed = 1234
)

Let us save the results and the dataset:

# Store the priors and the prepared dataset in the output folder
save(priors_abalone, file = "output/real-data/priors_abalone.rda")
save(tb_abalone, file = "output/real-data/tb_abalone.rda")

# Figure: estimated scores and fitted Beta distributions for this dataset
plot_hist_scores_beta(priors_abalone, "abalone")

Figure 14.1: Distribution of estimated probabilities by the GAMSEL model and Beta distribution fitted to the scores of each of the three models, for the Abalone dataset.

14.1.2 Adult

URL to the data: https://archive.ics.uci.edu/dataset/2/adult
Description: Predict whether income exceeds $50K/yr based on census data. Also known as “Census Income” dataset.
Number of instances: 48,842
Features: 14
Reference: Becker and Kohavi (1996)

# Short identifier of the dataset, reused below to build the download URL and
# the paths to the files of the zip archive
name <- "adult"

The dataset needs to be downloaded.

Code to download the data

# Make sure the local data folder exists before fetching the archive
if (!dir.exists("data")) {
  dir.create("data")
}
download.file(
  url = paste0("https://archive.ics.uci.edu/static/public/2/", name, ".zip"),
  destfile = paste0("data/", name, ".zip")
)

# Read the description file shipped in the archive (split on newlines)
info_data <- scan(
  file = unz(paste0("data/", name, ".zip"), paste0(name, ".names")),
  what = "character",
  sep = "\n"
)
# Print the names for this dataset (not very convenient...): from element 94
# onwards, keep the text before the last colon of each line and print it as a
# quoted, comma-terminated string ready to be pasted in `col_names`
info_data[94:length(info_data)] |>
  str_extract("^(.*):") |>
  str_remove(":$") |>
  (function(.x) str_c('"', .x, '",'))() |>
  cat()

Then, we can import the dataset:

Code to import the data

# Import the data file straight from the zip archive. Column names are
# supplied explicitly, so the first line of the file is read as data.
tb_adult <- read_csv(
  file = unz(
    description = paste0("data/", name, ".zip"),
    filename = paste0(name, ".data")
  ),
  col_names = c(
    "age", "workclass", "fnlwgt", "education", "education-num",
    "marital-status", "occupation", "relationship", "race", "sex",
    "capital-gain", "capital-loss", "hours-per-week", "native-country",
    "income"
  ),
  show_col_types = FALSE
)

The target variable is income. Let us turn it into a $\{0,1\}$ variable and call it high_income.

# Binary target: 1 when income is ">50K", 0 otherwise; the original income
# column is then dropped
target_name <- "high_income"
tb_adult <- tb_adult |>
  mutate(high_income = ifelse(income == ">50K", 1, 0)) |>
  dplyr::select(!income)

Let us call the get_beta_fit() from Chapter 13 to get our priors.

# Get the priors with get_beta_fit() (loaded from functions/real-data.R);
# the seed is fixed for reproducibility
priors_adult <- get_beta_fit(
  dataset = tb_adult,
  target_name = target_name,
  seed = 1234
)

Let us save the results and the dataset:

# Store the priors and the prepared dataset in the output folder
save(priors_adult, file = "output/real-data/priors_adult.rda")
save(tb_adult, file = "output/real-data/tb_adult.rda")

# Figure: estimated scores and fitted Beta distributions for this dataset
plot_hist_scores_beta(priors_adult, "adult")

Figure 14.2: Distribution of estimated probabilities by the GAMSEL model and Beta distribution fitted to the scores of each of the three models, for the Adult dataset.

14.1.3 Bank Marketing

URL to the data: https://archive.ics.uci.edu/dataset/222/bank+marketing
Description: The data is related with direct marketing campaigns (phone calls) of a Portuguese banking institution. The classification goal is to predict if the client will subscribe a term deposit (variable y).
Number of instances: 45,211
Features: 16
Reference: Moro, Rita, and Cortez (2012)

# Short identifier of the dataset, reused below to build the paths to the zip
# archives
name <- "bank"

The dataset needs to be downloaded.

Code to download the data

# Make sure the local data folder exists before fetching the archive
if (!dir.exists("data")) {
  dir.create("data")
}
# Fetch the zip archive and store it as data/<name>.zip
download.file(
  url = "https://archive.ics.uci.edu/static/public/222/bank+marketing.zip",
  destfile = paste0("data/", name, ".zip")
)

Then, we can import the dataset:

Code to import the data

# The downloaded archive contains an inner archive, `bank.zip`. Extract the
# outer archive with base R (`utils::unzip()`), which works on every platform
# and does not rely on an external `unzip` binary.
dir.create("data/bank", showWarnings = FALSE)
utils::unzip("data/bank.zip", exdir = "data/bank")
# `bank-full.csv` is read directly from the inner archive through `unz()`, so
# the inner archive does not need to be extracted to disk.
# The file is ";"-delimited; its header line is skipped and replaced with the
# column names given here.
tb_bank <- read_csv2(
  file = unz(str_c("data/bank/", name, ".zip"), str_c("bank-full.csv")),
  skip = 1,
  col_names = c(
    "age", "job", "marital", "education", "default", "balance", "housing",
    "loan", "contact", "day", "month", "duration", "campaign", "pdays",
    "previous", "poutcome", "y"
  ),
  show_col_types = FALSE
)

ℹ Using "','" as decimal and "'.'" as grouping mark. Use `read_delim()` for more control.

Code to import the data

# Remove the temporary extraction folder (portable replacement for `rm -rf`)
unlink("data/bank", recursive = TRUE)

The target variable is y (whether the client will subscribe to a term deposit). Let us turn it into a $\{0,1\}$ variable.

# Binary target: 1 when y is "yes", 0 otherwise
target_name <- "y"
tb_bank <- mutate(tb_bank, y = ifelse(y == "yes", 1, 0))

Let us call the get_beta_fit() from Chapter 13 to get our priors.

# Get the priors with get_beta_fit() (loaded from functions/real-data.R);
# the seed is fixed for reproducibility
priors_bank <- get_beta_fit(
  dataset = tb_bank,
  target_name = target_name,
  seed = 1234
)

Let us save the results and the dataset:

# Store the priors and the prepared dataset in the output folder
save(priors_bank, file = "output/real-data/priors_bank.rda")
save(tb_bank, file = "output/real-data/tb_bank.rda")

# Figure: estimated scores and fitted Beta distributions for this dataset
plot_hist_scores_beta(priors_bank, "bank")

Figure 14.3: Distribution of estimated probabilities by the GAMSEL model and Beta distribution fitted to the scores of each of the three models, for the Bank Marketing dataset.

14.1.4 Default of Credit Card Clients

URL to the data: https://archive.ics.uci.edu/dataset/350/default+of+credit+card+clients
Description: This research aimed at the case of customers’ default payments in Taiwan and compares the predictive accuracy of probability of default among six data mining methods.
Number of instances: 30,000
Features: 23
Reference: Yeh (2016)

# Short identifier of the dataset, reused below to build the path to the zip
# archive
name <- "default"

The dataset needs to be downloaded.

Code to download the data

# Make sure the local data folder exists before fetching the archive
if (!dir.exists("data")) {
  dir.create("data")
}
# Fetch the zip archive and store it as data/<name>.zip
download.file(
  url = paste0(
    "https://archive.ics.uci.edu/static/public/350/",
    "default+of+credit+card+clients.zip"
  ),
  destfile = paste0("data/", name, ".zip")
)

Then, we can import the dataset:

Code to import the data

# Extract the archive with base R (`utils::unzip()`), which works on every
# platform and does not rely on an external `unzip` binary
dir.create("data/default", showWarnings = FALSE)
utils::unzip("data/default.zip", exdir = "data/default")
# Import the Excel file, skipping its first row, and drop the ID column
tb_default <- readxl::read_excel(
  path = "data/default/default of credit card clients.xls",
  skip = 1
) |>
  select(-ID)
# Remove the temporary extraction folder (portable replacement for `rm -rf`)
unlink("data/default", recursive = TRUE)

The target variable is default (1 if default, 0 otherwise).

# Prepare the predictors and give the target a shorter name
target_name <- "default"
tb_default <- tb_default |>
  rename(default = `default payment next month`) |>
  # Categorical variables stored as numbers are turned into factors
  mutate(
    across(
      all_of(c(
        "SEX", "EDUCATION", "MARRIAGE",
        "PAY_0", "PAY_2", "PAY_3", "PAY_4", "PAY_5", "PAY_6"
      )),
      as.factor
    )
  ) |>
  # Lump together the levels representing less than 5% of the observations
  mutate(
    across(
      all_of(c(
        "EDUCATION", "PAY_0", "PAY_2", "PAY_3", "PAY_4", "PAY_5", "PAY_6"
      )),
      \(x) fct_lump(x, prop = .05)
    )
  )

Let us call the get_beta_fit() from Chapter 13 to get our priors.

# Get the priors with get_beta_fit() (loaded from functions/real-data.R);
# the seed is fixed for reproducibility
priors_default <- get_beta_fit(
  dataset = tb_default,
  target_name = target_name,
  seed = 1234
)

Let us save the results and the dataset:

# Store the priors and the prepared dataset in the output folder
save(priors_default, file = "output/real-data/priors_default.rda")
save(tb_default, file = "output/real-data/tb_default.rda")

# Figure: estimated scores and fitted Beta distributions for this dataset
plot_hist_scores_beta(priors_default, "default")

Figure 14.4: Distribution of estimated probabilities by the GAMSEL model and Beta distribution fitted to the scores of each of the three models, for the Default of Credit Card Clients dataset.

14.1.5 Dry Bean

URL to the data: https://archive.ics.uci.edu/dataset/602/dry+bean+dataset
Description: Images of 13,611 grains of 7 different registered dry beans were taken with a high-resolution camera. A total of 16 features; 12 dimensions and 4 shape forms, were obtained from the grains.
Number of instances: 13,611
Features: 16
References: “Dry Bean” (2020)

# Short identifier of the dataset, reused below to build the path to the zip
# archive
name <- "drybean"

The dataset needs to be downloaded.

Code to download the data

# Make sure the local data folder exists before fetching the archive
if (!dir.exists("data")) {
  dir.create("data")
}
# Fetch the zip archive and store it as data/<name>.zip
download.file(
  url = "https://archive.ics.uci.edu/static/public/602/dry+bean+dataset.zip",
  destfile = paste0("data/", name, ".zip")
)

Then, we can import the dataset:

Code to import the data

# Extract the archive with base R (`utils::unzip()`), which works on every
# platform and does not rely on an external `unzip` binary
dir.create("data/drybean", showWarnings = FALSE)
utils::unzip("data/drybean.zip", exdir = "data/drybean")
# Import the Excel file found in the extracted folder
tb_drybean <- readxl::read_excel(
  path = "data/drybean/DryBeanDataset/Dry_Bean_Dataset.xlsx"
)
# Remove the temporary extraction folder (portable replacement for `rm -rf`)
unlink("data/drybean", recursive = TRUE)

The target variable is Class. Let us turn it into a $\{0,1\}$ variable, is_dermason, which equals 1 for beans of the DERMASON class and 0 otherwise.

# Binary target: 1 when Class is "DERMASON", 0 otherwise; the original Class
# column is then dropped
target_name <- "is_dermason"
tb_drybean <- tb_drybean |>
  mutate(is_dermason = ifelse(Class == "DERMASON", 1, 0)) |>
  select(!Class)

Let us call the get_beta_fit() from Chapter 13 to get our priors.

# Get the priors with get_beta_fit() (loaded from functions/real-data.R);
# the seed is fixed for reproducibility
priors_drybean <- get_beta_fit(
  dataset = tb_drybean,
  target_name = target_name,
  seed = 1234
)

Let us save the results and the dataset:

# Store the priors and the prepared dataset in the output folder
save(priors_drybean, file = "output/real-data/priors_drybean.rda")
save(tb_drybean, file = "output/real-data/tb_drybean.rda")

# Figure: estimated scores and fitted Beta distributions for this dataset
plot_hist_scores_beta(priors_drybean, "drybean")

Figure 14.5: Distribution of estimated probabilities by the GAMSEL model and Beta distribution fitted to the scores of each of the three models, for the Dry Bean dataset.

14.1.6 In-Vehicle Coupon Recommendation

URL to the data: https://archive.ics.uci.edu/dataset/603/in+vehicle+coupon+recommendation
Description: This data studies whether a person will accept the coupon recommended to him in different driving scenarios.
Number of instances: 12,684
Features: 25
References: “In-Vehicle Coupon Recommendation” (2020)

# Short identifier of the dataset, reused below to build the path to the zip
# archive
name <- "coupon"

The dataset needs to be downloaded.

Code to download the data

# Make sure the local data folder exists before fetching the archive
if (!dir.exists("data")) {
  dir.create("data")
}
# Fetch the zip archive and store it as data/<name>.zip
download.file(
  url = paste0(
    "https://archive.ics.uci.edu/static/public/603/",
    "in+vehicle+coupon+recommendation.zip"
  ),
  destfile = paste0("data/", name, ".zip")
)

Then, we can import the dataset:

Code to import the data

# Import the CSV file straight from the zip archive (column names are taken
# from the first line of the file)
tb_coupon <- read_csv(
  file = unz(
    description = paste0("data/", name, ".zip"),
    filename = "in-vehicle-coupon-recommendation.csv"
  ),
  show_col_types = FALSE
)

The target variable is y (1 if the person accepted the coupon, 0 otherwise).

# Prepare the dataset:
# - numerically-coded categorical variables are turned into factors,
# - three columns are discarded,
# - the target `Y` is renamed to `y`,
# - rows with missing values are removed.
target_name <- "y"

tb_coupon <- tb_coupon |>
  mutate(
    across(
      all_of(c(
        "temperature", "has_children", "toCoupon_GEQ15min",
        "toCoupon_GEQ25min", "direction_same"
      )),
      as.factor
    )
  ) |>
  select(-toCoupon_GEQ5min, -direction_opp, -car) |>
  rename(y = Y) |>
  na.omit()

Let us call the get_beta_fit() from Chapter 13 to get our priors.

# Get the priors with get_beta_fit() (loaded from functions/real-data.R);
# the seed is fixed for reproducibility
priors_coupon <- get_beta_fit(
  dataset = tb_coupon,
  target_name = target_name,
  seed = 1234
)

Let us save the results and the dataset:

# Store the priors and the prepared dataset in the output folder
save(priors_coupon, file = "output/real-data/priors_coupon.rda")
save(tb_coupon, file = "output/real-data/tb_coupon.rda")

# Figure: estimated scores and fitted Beta distributions for this dataset
plot_hist_scores_beta(priors_coupon, "coupon")

Figure 14.6: Distribution of estimated probabilities by the GAMSEL model and Beta distribution fitted to the scores of each of the three models, for the In-Vehicle Coupon Recommendation dataset.

14.1.7 Mushroom

URL to the data: https://archive.ics.uci.edu/dataset/73/mushroom
Description: From Audobon Society Field Guide; mushrooms described in terms of physical characteristics; classification: poisonous or edible.
Number of instances: 8,124
Features: 22
References: “Mushroom” (1987)

# Short identifier of the dataset, reused below to build the path to the zip
# archive
name <- "mushroom"

The dataset needs to be downloaded.

Code to download the data

# Make sure the local data folder exists before fetching the archive
if (!dir.exists("data")) {
  dir.create("data")
}
# Fetch the zip archive and store it as data/<name>.zip
download.file(
  url = "https://archive.ics.uci.edu/static/public/73/mushroom.zip",
  destfile = paste0("data/", name, ".zip")
)

Then, we can import the dataset:

Code to import the data

# Import the data file straight from the zip archive. Column names are
# supplied explicitly, so the first line of the file is read as data.
tb_mushroom <- read_csv(
  file = unz(
    description = paste0("data/", name, ".zip"),
    filename = "agaricus-lepiota.data"
  ),
  col_names = c(
    "edible", "cap_shape", "cap_surface", "cap_color", "bruises", "odor",
    "gill_attachment", "gill_spacing", "gill_size", "gill_color",
    "stalk_shape", "stalk_root",
    "stalk_surface_above_ring", "stalk_surface_below_ring",
    "stalk_color_above_ring", "stalk_color_below_ring",
    "veil_type", "veil_color", "ring_number", "ring_type",
    "spore_print_color", "population", "habitat"
  ),
  show_col_types = FALSE
)

The target variable is edible. Let us turn it into a $\{0,1\}$ variable.

# Prepare the dataset:
# - `bruises` is recoded as "yes"/"no" (NOTE(review): the comparison with TRUE
#   presumes the column was imported as a logical -- confirm),
# - binary target: 1 when `edible` is "e", 0 otherwise,
# - `veil_type` is discarded.
target_name <- "edible"
tb_mushroom <- tb_mushroom |>
  mutate(
    bruises = ifelse(bruises == TRUE, "yes", "no"),
    edible = ifelse(edible == "e", 1, 0)
  ) |>
  select(!veil_type)

Let us call the get_beta_fit() from Chapter 13 to get our priors.

# Get the priors with get_beta_fit() (loaded from functions/real-data.R);
# the seed is fixed for reproducibility
priors_mushroom <- get_beta_fit(
  dataset = tb_mushroom,
  target_name = target_name,
  seed = 1234
)

Let us save the results and the dataset:

# Store the priors and the prepared dataset in the output folder
save(priors_mushroom, file = "output/real-data/priors_mushroom.rda")
save(tb_mushroom, file = "output/real-data/tb_mushroom.rda")

# Figure: estimated scores and fitted Beta distributions for this dataset
plot_hist_scores_beta(priors_mushroom, "mushroom")

Figure 14.7: Distribution of estimated probabilities by the GAMSEL model and Beta distribution fitted to the scores of each of the three models, for the Mushroom dataset.

14.1.8 Occupancy Detection

URL to the data: https://archive.ics.uci.edu/dataset/357/occupancy+detection
Description: Predict room occupancy (binary classification) from temperature, humidity, light and CO2 measurements.
Number of instances: 20,560
Features: 6
References: Candanedo (2016)

# Short identifier of the dataset, reused below to build the path to the zip
# archive
name <- "occupancy"

The dataset needs to be downloaded.

Code to download the data

# Make sure the local data folder exists before fetching the archive
if (!dir.exists("data")) {
  dir.create("data")
}
# Fetch the zip archive and store it as data/<name>.zip
download.file(
  url = paste0(
    "https://archive.ics.uci.edu/static/public/357/",
    "occupancy+detection.zip"
  ),
  destfile = paste0("data/", name, ".zip")
)

Then, we can import the dataset:

Code to import the data

# The archive contains three files that are imported the same way: the header
# line is skipped and replaced with the column names given here. The three
# tables are then stacked and the `id` column is dropped.
tb_occupancy <- c("datatraining.txt", "datatest.txt", "datatest2.txt") |>
  map(
    \(file_name) read_csv(
      file = unz(str_c("data/", name, ".zip"), file_name),
      col_names = c(
        "id", "date", "Temperature", "Humidity", "Light", "CO2",
        "HumidityRatio", "Occupancy"
      ),
      show_col_types = FALSE,
      skip = 1
    )
  ) |>
  bind_rows() |>
  select(-id)

The target variable is Occupancy.

# The target is used as is; only the date column is discarded
target_name <- "Occupancy"
tb_occupancy <- select(tb_occupancy, !date)

Let us call the get_beta_fit() from Chapter 13 to get our priors.

# Get the priors with get_beta_fit() (loaded from functions/real-data.R);
# the seed is fixed for reproducibility
priors_occupancy <- get_beta_fit(
  dataset = tb_occupancy,
  target_name = target_name,
  seed = 1234
)

Let us save the results and the dataset:

# Store the priors and the prepared dataset in the output folder
save(priors_occupancy, file = "output/real-data/priors_occupancy.rda")
save(tb_occupancy, file = "output/real-data/tb_occupancy.rda")

# Figure: estimated scores and fitted Beta distributions for this dataset
plot_hist_scores_beta(priors_occupancy, "occupancy")

Figure 14.8: Distribution of estimated probabilities by the GAMSEL model and Beta distribution fitted to the scores of each of the three models, for the Occupancy Detection dataset.

14.1.9 Wine Quality

URL to the data: https://archive.ics.uci.edu/dataset/186/wine+quality
Description: Two datasets are included, related to red and white vinho verde wine samples, from the north of Portugal. The goal is to model wine quality based on physicochemical tests (see [Cortez et al., 2009], http://www3.dsi.uminho.pt/pcortez/wine/).
Number of instances: 4,898
Features: 11
References: Cortez et al. (2009)

# Short identifier of the dataset, reused below to build the path to the zip
# archive
name <- "winequality"

The dataset needs to be downloaded.

Code to download the data

# Make sure the local data folder exists before fetching the archive
if (!dir.exists("data")) {
  dir.create("data")
}
# Fetch the zip archive and store it as data/<name>.zip
download.file(
  url = paste0(
    "https://archive.ics.uci.edu/static/public/186/",
    "wine+quality.zip"
  ),
  destfile = paste0("data/", name, ".zip")
)

Then, we can import the dataset:

Code to import the data

# The two wine files are ";"-delimited but use "." as the decimal mark.
# `read_csv2()` must not be used here: it assumes "," as the decimal mark and
# "." as the grouping mark, so a value such as "7.4" is parsed as 74, while
# columns whose values start with "0." are imported as character strings
# (hence the parsing warnings and the need for `as.numeric()` conversions).
# `read_delim()` with `delim = ";"` and the default locale ("." as decimal
# mark) parses every measurement correctly.

# Red wines, flagged with `wine_type = "red"`
red_wine <- read_delim(
  file = unz(str_c("data/", name, ".zip"), "winequality-red.csv"),
  delim = ";",
  show_col_types = FALSE
) |>
  mutate(wine_type = "red")

# White wines, flagged with `wine_type = "white"`
white_wine <- read_delim(
  file = unz(str_c("data/", name, ".zip"), "winequality-white.csv"),
  delim = ";",
  show_col_types = FALSE
) |>
  mutate(wine_type = "white") |>
  # Kept so that the column is numeric whatever the guessed type
  mutate(`residual sugar` = as.numeric(`residual sugar`))

ℹ Using "','" as decimal and "'.'" as grouping mark. Use `read_delim()` for more control.

The target variable is quality. Let us use it to define a $\{0,1\}$ variable. We define the variable high_quality, which equals 1 if the quality is greater than or equal to 6, and 0 otherwise.

# Stack red and white wines, build the binary target (1 when quality is at
# least 6, 0 otherwise), make sure the listed measurements are numeric, drop
# the original quality column, and remove rows with missing values
target_name <- "high_quality"
tb_winequality <- bind_rows(red_wine, white_wine) |>
  mutate(high_quality = ifelse(quality >= 6, 1, 0)) |>
  mutate(
    across(
      all_of(c(
        "density", "chlorides", "volatile acidity", "sulphates", "citric acid"
      )),
      as.numeric
    )
  ) |>
  select(!quality) |>
  na.omit()

Let us call the get_beta_fit() from Chapter 13 to get our priors.

# Get the priors with get_beta_fit() (loaded from functions/real-data.R);
# the seed is fixed for reproducibility
priors_winequality <- get_beta_fit(
  dataset = tb_winequality,
  target_name = target_name,
  seed = 1234
)

Let us save the results and the dataset:

# Store the priors and the prepared dataset in the output folder
save(priors_winequality, file = "output/real-data/priors_winequality.rda")
save(tb_winequality, file = "output/real-data/tb_winequality.rda")

# Figure: estimated scores and fitted Beta distributions for this dataset
plot_hist_scores_beta(priors_winequality, "winequality")

Figure 14.9: Distribution of estimated probabilities by the GAMSEL model and Beta distribution fitted to the scores of each of the three models, for the Wine Quality dataset.

14.1.10 Spambase

URL to the data: https://archive.ics.uci.edu/dataset/94/spambase
Description: Classifying Email as Spam or Non-Spam
Number of instances: 4,601
Features: 57
References: Hopkins et al. (1999)

# Short identifier of the dataset, reused below to build the download URL and
# the paths to the files of the zip archive
name <- "spambase"

The dataset needs to be downloaded.

Code to download the data

# Make sure the local data folder exists before fetching the archive
if (!dir.exists("data")) dir.create("data")
# Spambase is dataset 94 of the UCI repository (id 2 is the Adult dataset, so
# the archive must not be requested under `public/2/`)
download.file(
  url = str_c("https://archive.ics.uci.edu/static/public/94/", name, ".zip"),
  destfile = str_c("data/", name, ".zip")
)

# Read the description file shipped in the archive (split on newlines)
info_data <- scan(
  unz(str_c("data/", name, ".zip"), str_c(name, ".names")),
  what = "character", sep = "\n"
)
# Print the names for this dataset (not very convenient...)
# NOTE(review): the starting position 94 is the same as the one used for the
# Adult dataset; confirm that it matches the layout of `spambase.names`.
str_extract(info_data[94:length(info_data)], "^(.*):") |>
  str_remove(":$") |>
  (\(.x) str_c('"', .x, '",'))() |>
  cat()

Then, we can import the dataset:

Code to import the data

# Import the data file straight from the zip archive. Column names are
# supplied explicitly, so the first line of the file is read as data.
# The names are built from their common prefixes: 48 word frequencies,
# 6 character frequencies, 3 capital-run-length statistics, and the target.
tb_spambase <- read_csv(
  file = unz(
    description = paste0("data/", name, ".zip"),
    filename = paste0(name, ".data")
  ),
  col_names = c(
    paste0(
      "word_freq_",
      c(
        "make", "address", "all", "3d", "our", "over", "remove", "internet",
        "order", "mail", "receive", "will", "people", "report", "addresses",
        "free", "business", "email", "you", "credit", "your", "font", "000",
        "money", "hp", "hpl", "george", "650", "lab", "labs", "telnet",
        "857", "data", "415", "85", "technology", "1999", "parts", "pm",
        "direct", "cs", "meeting", "original", "project", "re", "edu",
        "table", "conference"
      )
    ),
    paste0("char_freq_", c(";", "(", "[", "!", "$", "#")),
    paste0("capital_run_length_", c("average", "longest", "total")),
    "is_spam"
  ),
  show_col_types = FALSE
)

The target variable:

# Name of the target column of tb_spambase (used as is, without recoding)
target_name <- "is_spam"

Let us call the get_beta_fit() from Chapter 13 to get our priors.

# Get the priors with get_beta_fit() (loaded from functions/real-data.R);
# the seed is fixed for reproducibility
priors_spambase <- get_beta_fit(
  dataset = tb_spambase,
  target_name = target_name,
  seed = 1234
)

Let us save the results and the dataset:

# Store the priors and the prepared dataset in the output folder
save(priors_spambase, file = str_c("output/real-data/priors_spambase.rda"))
save(tb_spambase, file = "output/real-data/tb_spambase.rda")

# Figure: estimated scores and fitted Beta distributions for this dataset
plot_hist_scores_beta(priors_spambase, "spambase")

Figure 14.10: Distribution of estimated probabilities by the GAMSEL model and Beta distribution fitted to the scores of each of the three models, for the spambase dataset.

14.2 Summary

Code to get the key characteristics of the datasets

# One row per dataset: short name (suffix of the corresponding `tb_*` object),
# name of the binary target column, and citation key of the reference
datasets <- tribble(
  ~name, ~target_name, ~reference,
  "abalone", "Sex", "@misc_abalone_1",
  "adult", "high_income", "@misc_adult_2",
  "bank", "y", "@misc_bank_marketing_222",
  "default", "default", "@misc_default_of_credit_card_clients_350",
  "drybean", "is_dermason", "@misc_dry_bean_602",
  "coupon", "y", "@misc_vehicle_coupon_recommendation_603",
  "mushroom", "edible", "@misc_mushroom_73",
  "occupancy", "Occupancy", "@misc_occupancy_detection__357",
  "winequality", "high_quality", "@misc_wine_quality_186",
  "spambase", "is_spam", "@misc_spambase_94"
)

# Compute the key characteristics of each dataset (one tibble row each)
dataset_info <- vector(mode = "list", length = nrow(datasets))
# `seq_len()` yields an empty sequence when there are no rows, whereas
# `1:nrow()` would iterate over c(1, 0)
for (i in seq_len(nrow(datasets))) {
  name <- datasets$name[i]
  target_name <- datasets$target_name[i]
  # Retrieve the prepared dataset from its name
  current_data <- get(str_c("tb_", name))
  # The column name is stored in a string: extract it with `[[`
  current_target <- current_data[[target_name]]
  current_ref <- datasets$reference[i]
  n <- nrow(current_data)
  n_col <- ncol(current_data)
  # Number of numeric predictors (the target is excluded first)
  n_numeric <- current_data |>
    select(-all_of(target_name)) |>
    select(where(is.numeric)) |>
    ncol()
  dataset_info[[i]] <- tibble(
    Dataset = name,
    n = n,
    # All columns but the target are features
    `# features` = n_col - 1,
    `# numeric features` = n_numeric,
    `Prop. target = 1` = round(sum(current_target == 1) / n, 2),
    Reference = current_ref
  )
}

# Stack the rows and print the table
dataset_info <- list_rbind(dataset_info)
knitr::kable(dataset_info, booktabs = TRUE, format.args = list(big.mark = ","))

Table 14.1: Key characteristics of the datasets.

Dataset	n	# features	# numeric features	Prop. target = 1	Reference
abalone	4,177	8	8	0.37	Nash et al. (1995)
adult	32,561	14	6	0.24	Becker and Kohavi (1996)
bank	45,211	16	7	0.12	Moro, Rita, and Cortez (2012)
default	30,000	23	14	0.22	Yeh (2016)
drybean	13,611	16	16	0.26	“Dry Bean” (2020)
coupon	12,079	22	0	0.57	“In-Vehicle Coupon Recommendation” (2020)
mushroom	8,124	21	0	0.52	“Mushroom” (1987)
occupancy	20,560	5	5	0.23	Candanedo (2016)
winequality	6,495	12	11	0.63	Cortez et al. (2009)
spambase	4,601	57	57	0.39	Hopkins et al. (1999)