14  Datasets and Priors

Note

This chapter provides priors for the distribution of the true probability of binary events from the real-world datasets used in the paper. It uses the functions defined in Chapter 13. We assume that the true probabilities are distributed according to a Beta distribution. The parameters of that distribution are obtained by fitting a Beta distribution by maximum likelihood on the scores estimated using three models: a Generalized Linear Model (GLM), a Generalized Additive Model (GAM), and a Generalized Additive Model with model selection (GAMSEL).

library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(gam)
Loading required package: splines
Loading required package: foreach

Attaching package: 'foreach'

The following objects are masked from 'package:purrr':

    accumulate, when

Loaded gam 1.22-3
library(gamsel)
Loaded gamsel 1.8-4
# Colours for train/test
colour_samples <- c(
  "Train" = "#0072B2",
  "Test" = "#D55E00"
)

Let us load the functions defined in Chapter 13:

source("functions/real-data.R")

We will store the results obtained from the estimations in the output/real-data/ folder.

# Create the results folder if needed. `recursive = TRUE` also creates the
# parent `output/` folder, which would otherwise make `dir.create()` fail on a
# fresh checkout.
if (!dir.exists("output/real-data/")) {
  dir.create("output/real-data/", recursive = TRUE)
}

14.1 Datasets

All the datasets used here are from the UCI Machine Learning Repository.

14.1.1 Abalone

name <- "abalone"

The dataset needs to be downloaded.

Code to download the data
if (!dir.exists("data")) dir.create("data")
download.file(
  url = str_c("https://archive.ics.uci.edu/static/public/1/", name, ".zip"), 
  destfile = str_c("data/", name, ".zip")
)

Then, we can import the dataset:

Code to import the data
tb_abalone <- read_csv(
  file = unz(str_c("data/", name, ".zip"), str_c(name, ".data")), 
  col_names = c(
    "Sex", "Length", "Diameter", "Height", "Whole_weight", 
  "Shucked_weight", "Viscera_weight", "Shell_weight", "Rings"),
  show_col_types = FALSE
)

The target variable is Sex. Let us turn it into a \(\{0,1\}\) variable (1 if Sex is "M", 0 otherwise).

tb_abalone <- tb_abalone |> 
  mutate(Sex = ifelse(Sex == "M", 1, 0)) 
target_name <- "Sex"

Let us call the get_beta_fit() from Chapter 13 to get our priors.

priors_abalone <- get_beta_fit(
  dataset = tb_abalone, target_name = target_name, seed = 1234
)

Let us save the results and the dataset:

save(priors_abalone, file = "output/real-data/priors_abalone.rda")
save(tb_abalone, file = "output/real-data/tb_abalone.rda")
plot_hist_scores_beta(priors_abalone, "abalone")
Figure 14.1: Distribution of estimated probabilities by the GAMSEL model and Beta distribution fitted to the scores of each of the three models, for the Abalone dataset.

14.1.2 Adult

  • URL to the data: https://archive.ics.uci.edu/dataset/2/adult
  • Description: Predict whether income exceeds $50K/yr based on census data. Also known as “Census Income” dataset.
  • Number of instances: 48,842
  • Features: 14
  • Reference: Becker and Kohavi (1996)
name <- "adult"

The dataset needs to be downloaded.

Code to download the data
if (!dir.exists("data")) dir.create("data")
download.file(
  url = str_c("https://archive.ics.uci.edu/static/public/2/", name, ".zip"), 
  destfile = str_c("data/", name, ".zip")
)

info_data <- scan(
  unz(str_c("data/", name, ".zip"), str_c(name, ".names")), 
  what = "character", sep = "\n"
)
# Print the names for this dataset (not very convenient...)
str_extract(info_data[94:length(info_data)], "^(.*):") |> 
  str_remove(":$") |> 
  (\(.x) str_c('"', .x, '",'))() |> 
  cat()

Then, we can import the dataset:

Code to import the data
tb_adult <- read_csv(
  file = unz(str_c("data/", name, ".zip"), str_c(name, ".data")), 
  col_names = c(
    "age", "workclass", "fnlwgt", "education", "education-num",
    "marital-status", "occupation", "relationship", "race", "sex",
    "capital-gain", "capital-loss", "hours-per-week", "native-country",
    "income"
  ),
  show_col_types = FALSE
)

The target variable is income. Let us turn it into a \(\{0,1\}\) variable and call it high_income.

tb_adult <- tb_adult |> 
  mutate(high_income = ifelse(income == ">50K", 1, 0)) |> 
  dplyr::select(-income)
target_name <- "high_income"

Let us call the get_beta_fit() from Chapter 13 to get our priors.

priors_adult <- get_beta_fit(
  dataset = tb_adult, target_name = target_name, seed = 1234
)

Let us save the results and the dataset:

save(priors_adult, file = "output/real-data/priors_adult.rda")
save(tb_adult, file = "output/real-data/tb_adult.rda")
plot_hist_scores_beta(priors_adult, "adult")
Figure 14.2: Distribution of estimated probabilities by the GAMSEL model and Beta distribution fitted to the scores of each of the three models, for the Adult dataset.

14.1.3 Bank Marketing

  • URL to the data: https://archive.ics.uci.edu/dataset/222/bank+marketing
  • Description: The data is related with direct marketing campaigns (phone calls) of a Portuguese banking institution. The classification goal is to predict if the client will subscribe a term deposit (variable y).
  • Number of instances: 45,211
  • Features: 16
  • Reference: Moro, Rita, and Cortez (2012)
name <- "bank"

The dataset needs to be downloaded.

Code to download the data
if (!dir.exists("data")) dir.create("data")
download.file(
  url = "https://archive.ics.uci.edu/static/public/222/bank+marketing.zip", 
  destfile = str_c("data/", name, ".zip")
)

Then, we can import the dataset:

Code to import the data
# The downloaded archive itself contains `bank.zip`. Extract the outer archive
# with base R's `unzip()` so the chunk does not depend on a system `unzip`
# binary being available.
dir.create("data/bank", showWarnings = FALSE)
unzip("data/bank.zip", exdir = "data/bank")
# `bank-full.csv` is read directly from the inner archive through `unz()`, so
# the inner archive does not need to be extracted to disk.
# `skip = 1` drops the file's own header line; column names are given here.
tb_bank <- read_csv2(
  file = unz(str_c("data/bank/", name, ".zip"), "bank-full.csv"),
  skip = 1,
  col_names = c(
    "age", "job", "marital", "education", "default", "balance", "housing",
    "loan", "contact", "day", "month", "duration", "campaign", "pdays",
    "previous", "poutcome", "y"
  ),
  show_col_types = FALSE
)
ℹ Using "','" as decimal and "'.'" as grouping mark. Use `read_delim()` for more control.
Code to import the data
# Remove the temporary extraction folder (portable replacement for `rm -rf`).
unlink("data/bank", recursive = TRUE)

The target variable is y (whether the client will subscribe a term deposit). Let us turn it in a \(\{0,1\}\) variable.

tb_bank <- tb_bank |> 
  mutate(y = ifelse(y == "yes", 1, 0)) 
target_name <- "y"

Let us call the get_beta_fit() from Chapter 13 to get our priors.

priors_bank <- get_beta_fit(
  dataset = tb_bank, target_name = target_name, seed = 1234
)

Let us save the results and the dataset:

save(priors_bank, file = "output/real-data/priors_bank.rda")
save(tb_bank, file = "output/real-data/tb_bank.rda")
plot_hist_scores_beta(priors_bank, "bank")
Figure 14.3: Distribution of estimated probabilities by the GAMSEL model and Beta distribution fitted to the scores of each of the three models, for the Bank Marketing dataset.

14.1.4 Default of Credit Card Clients

name <- "default"

The dataset needs to be downloaded.

Code to download the data
if (!dir.exists("data")) dir.create("data")
download.file(
  url = str_c("https://archive.ics.uci.edu/static/public/350/",
              "default+of+credit+card+clients.zip"
  ), 
  destfile = str_c("data/", name, ".zip")
)

Then, we can import the dataset:

Code to import the data
# Extract the archive with base R's `unzip()` so the chunk does not depend on
# system `unzip` / `rm` binaries.
dir.create("data/default", showWarnings = FALSE)
unzip("data/default.zip", exdir = "data/default")
# `skip = 1` drops the first row of the sheet; the column names (e.g. `ID`)
# are then taken from the following row.
tb_default <- readxl::read_excel(
  path = "data/default/default of credit card clients.xls",
  skip = 1
) |>
  select(-ID)
# Remove the temporary extraction folder.
unlink("data/default", recursive = TRUE)

The target variable is default (1 if default, 0 otherwise).

tb_default <- 
  tb_default |> 
  mutate(
    across(all_of(c(
      "SEX", "EDUCATION", "MARRIAGE", 
      "PAY_0", "PAY_2", "PAY_3", "PAY_4", "PAY_5", "PAY_6")), as.factor)
  ) |> 
  mutate(
    across(all_of(c(
      "EDUCATION", "PAY_0", "PAY_2", "PAY_3", "PAY_4", "PAY_5", "PAY_6"
    )), ~fct_lump(.x, prop = .05)
    )
  ) |> 
  rename(default = `default payment next month`)
target_name <- "default"

Let us call the get_beta_fit() from Chapter 13 to get our priors.

priors_default <- get_beta_fit(
  dataset = tb_default, target_name = target_name, seed = 1234
)

Let us save the results and the dataset:

save(priors_default, file = "output/real-data/priors_default.rda")
save(tb_default, file = "output/real-data/tb_default.rda")
plot_hist_scores_beta(priors_default, "default")
Figure 14.4: Distribution of estimated probabilities by the GAMSEL model and Beta distribution fitted to the scores of each of the three models, for the Default of Credit Card Clients dataset.

14.1.5 Dry Bean

  • URL to the data: https://archive.ics.uci.edu/dataset/602/dry+bean+dataset
  • Description: Images of 13,611 grains of 7 different registered dry beans were taken with a high-resolution camera. A total of 16 features; 12 dimensions and 4 shape forms, were obtained from the grains.
  • Number of instances: 13,611
  • Features: 16
  • References: Dry Bean (2020)
name <- "drybean"

The dataset needs to be downloaded.

Code to download the data
if (!dir.exists("data")) dir.create("data")
download.file(
  url = "https://archive.ics.uci.edu/static/public/602/dry+bean+dataset.zip", 
  destfile = str_c("data/", name, ".zip")
)

Then, we can import the dataset:

Code to import the data
# Extract the archive with base R's `unzip()` so the chunk does not depend on
# system `unzip` / `rm` binaries.
dir.create("data/drybean", showWarnings = FALSE)
unzip("data/drybean.zip", exdir = "data/drybean")
tb_drybean <- readxl::read_excel(
  path = "data/drybean/DryBeanDataset/Dry_Bean_Dataset.xlsx"
)
# Remove the temporary extraction folder.
unlink("data/drybean", recursive = TRUE)

The target variable is Class. Let us use it to define a \(\{0,1\}\) variable, is_dermason, which equals 1 if the class is DERMASON and 0 otherwise.

tb_drybean <- tb_drybean |> 
  mutate(is_dermason = ifelse(Class == "DERMASON", 1, 0)) |> 
  select(-Class)
target_name <- "is_dermason"

Let us call the get_beta_fit() from Chapter 13 to get our priors.

priors_drybean <- get_beta_fit(
  dataset = tb_drybean, target_name = target_name, seed = 1234
)

Let us save the results and the dataset:

save(priors_drybean, file = "output/real-data/priors_drybean.rda")
save(tb_drybean, file = "output/real-data/tb_drybean.rda")
plot_hist_scores_beta(priors_drybean, "drybean")
Figure 14.5: Distribution of estimated probabilities by the GAMSEL model and Beta distribution fitted to the scores of each of the three models, for the Dry Bean dataset.

14.1.6 In-Vehicle Coupon Recommendation

name <- "coupon"

The dataset needs to be downloaded.

Code to download the data
if (!dir.exists("data")) dir.create("data")
download.file(
  url = str_c("https://archive.ics.uci.edu/static/public/603/", 
              "in+vehicle+coupon+recommendation.zip"), 
  destfile = str_c("data/", name, ".zip")
)

Then, we can import the dataset:

Code to import the data
tb_coupon <- read_csv(
  file = unz(str_c("data/", name, ".zip"), "in-vehicle-coupon-recommendation.csv"),
  show_col_types = FALSE
)

The target variable is y (1 if the person accepted the coupon, 0 otherwise).

tb_coupon <- 
  tb_coupon |> 
  mutate(
    temperature = as.factor(temperature),
    has_children = as.factor(has_children),
    toCoupon_GEQ15min = as.factor(toCoupon_GEQ15min),
    toCoupon_GEQ25min = as.factor(toCoupon_GEQ25min),
    direction_same = as.factor(direction_same)
  ) |> 
  select(-toCoupon_GEQ5min, -direction_opp, -car) |> 
  rename(y = Y)

tb_coupon <- na.omit(tb_coupon)

target_name <- "y"

Let us call the get_beta_fit() from Chapter 13 to get our priors.

priors_coupon <- get_beta_fit(
  dataset = tb_coupon, target_name = target_name, seed = 1234
)

Let us save the results and the dataset:

save(priors_coupon, file = "output/real-data/priors_coupon.rda")
save(tb_coupon, file = "output/real-data/tb_coupon.rda")
plot_hist_scores_beta(priors_coupon, "coupon")
Figure 14.6: Distribution of estimated probabilities by the GAMSEL model and Beta distribution fitted to the scores of each of the three models, for the In-Vehicle Coupon Recommendation dataset.

14.1.7 Mushroom

  • URL to the data: https://archive.ics.uci.edu/dataset/73/mushroom
  • Description: From Audobon Society Field Guide; mushrooms described in terms of physical characteristics; classification: poisonous or edible.
  • Number of instances: 8,124
  • Features: 22
  • References: Mushroom (1987)
name <- "mushroom"

The dataset needs to be downloaded.

Code to download the data
if (!dir.exists("data")) dir.create("data")
download.file(
  url = str_c("https://archive.ics.uci.edu/static/public/73/mushroom.zip"), 
  destfile = str_c("data/", name, ".zip")
)

Then, we can import the dataset:

Code to import the data
tb_mushroom <- read_csv(
  file = unz(str_c("data/", name, ".zip"), "agaricus-lepiota.data"), 
  col_names = c(
    "edible",
    "cap_shape", "cap_surface", "cap_color", "bruises", "odor", 
    "gill_attachment", "gill_spacing", "gill_size", "gill_color", 
    "stalk_shape", "stalk_root", "stalk_surface_above_ring",
    "stalk_surface_below_ring", "stalk_color_above_ring", 
    "stalk_color_below_ring", "veil_type", "veil_color", "ring_number", 
    "ring_type", "spore_print_color", "population", "habitat"
  ),
  show_col_types = FALSE
)

The target variable is edible. Let us turn it into a \(\{0,1\}\) variable.

tb_mushroom <- tb_mushroom |> 
  mutate(bruises = ifelse(bruises == TRUE, "yes", "no")) |> 
  mutate(edible = ifelse(edible == "e", 1, 0)) |> 
  select(-veil_type)
target_name <- "edible"

Let us call the get_beta_fit() from Chapter 13 to get our priors.

priors_mushroom <- get_beta_fit(
  dataset = tb_mushroom, target_name = target_name, seed = 1234
)

Let us save the results and the dataset:

save(priors_mushroom, file = "output/real-data/priors_mushroom.rda")
save(tb_mushroom, file = "output/real-data/tb_mushroom.rda")
plot_hist_scores_beta(priors_mushroom, "mushroom")
Figure 14.7: Distribution of estimated probabilities by the GAMSEL model and Beta distribution fitted to the scores of each of the three models, for the Mushroom dataset.

14.1.8 Occupancy Detection

name <- "occupancy"

The dataset needs to be downloaded.

Code to download the data
if (!dir.exists("data")) dir.create("data")
download.file(
  url = str_c("https://archive.ics.uci.edu/static/public/357/",
              "occupancy+detection.zip"), 
  destfile = str_c("data/", name, ".zip")
)

Then, we can import the dataset:

Code to import the data
# The three files of the archive are read with the same settings: the file's
# own header line is skipped (`skip = 1`) and column names are supplied
# explicitly. The column spec is declared once instead of being repeated for
# each file.
occupancy_cols <- c(
  "id", "date", "Temperature", "Humidity", "Light", "CO2",
  "HumidityRatio", "Occupancy"
)
occupancy_files <- c("datatraining.txt", "datatest.txt", "datatest2.txt")

# Read each file from the zip, stack them in order, and drop the row id.
tb_occupancy <- occupancy_files |>
  map(
    \(file_name) read_csv(
      file = unz(str_c("data/", name, ".zip"), file_name),
      col_names = occupancy_cols,
      show_col_types = FALSE,
      skip = 1
    )
  ) |>
  list_rbind() |>
  select(-id)

The target variable is Occupancy.

tb_occupancy <- tb_occupancy |> 
  select(-date)
target_name <- "Occupancy"

Let us call the get_beta_fit() from Chapter 13 to get our priors.

priors_occupancy <- get_beta_fit(
  dataset = tb_occupancy, target_name = target_name, seed = 1234
)

Let us save the results and the dataset:

save(priors_occupancy, file = "output/real-data/priors_occupancy.rda")
save(tb_occupancy, file = "output/real-data/tb_occupancy.rda")
plot_hist_scores_beta(priors_occupancy, "occupancy")
Figure 14.8: Distribution of estimated probabilities by the GAMSEL model and Beta distribution fitted to the scores of each of the three models, for the Occupancy Detection dataset.

14.1.9 Wine Quality

  • URL to the data: https://archive.ics.uci.edu/dataset/186/wine+quality
  • Description: Two datasets are included, related to red and white vinho verde wine samples, from the north of Portugal. The goal is to model wine quality based on physicochemical tests (see [Cortez et al., 2009], http://www3.dsi.uminho.pt/pcortez/wine/).
  • Number of instances: 4,898
  • Features: 11
  • References: Cortez et al. (2009)
name <- "winequality"

The dataset needs to be downloaded.

Code to download the data
if (!dir.exists("data")) dir.create("data")
download.file(
  url = str_c("https://archive.ics.uci.edu/static/public/186/",
              "wine+quality.zip"), 
  destfile = str_c("data/", name, ".zip")
)

Then, we can import the dataset:

Code to import the data
# The wine files are read as semicolon-separated with "." as the decimal mark.
# `read_csv2()` assumes "," as decimal mark and "." as grouping mark, which
# triggers parsing issues and leaves some numeric columns as character.
# `read_delim(delim = ";")` keeps the default locale ("." decimal mark).
# NOTE(review): this changes the parsed feature values compared with the
# previous `read_csv2()` import; re-run the downstream estimations and refresh
# the reported figures/table.
red_wine <- read_delim(
  file = unz(str_c("data/", name, ".zip"), "winequality-red.csv"),
  delim = ";",
  show_col_types = FALSE
) |>
  mutate(wine_type = "red")
white_wine <- read_delim(
  file = unz(str_c("data/", name, ".zip"), "winequality-white.csv"),
  delim = ";",
  show_col_types = FALSE
) |>
  mutate(wine_type = "white") |>
  # Kept for safety so that both tables can always be row-bound.
  mutate(`residual sugar` = as.numeric(`residual sugar`))
ℹ Using "','" as decimal and "'.'" as grouping mark. Use `read_delim()` for more control.

The target variable is quality. Let us use it to define a \(\{0,1\}\) variable. We define the variable high_quality which equals 1 if the quality is greater than or equal to 6, and 0 otherwise.

tb_winequality <- red_wine |> bind_rows(white_wine) |> 
  mutate(high_quality = ifelse(quality >= 6, 1, 0)) |> 
  mutate(across(all_of(c(
    "density", "chlorides", "volatile acidity", "sulphates", "citric acid"
    )), ~as.numeric(.x))) |> 
  select(-quality)
tb_winequality <- na.omit(tb_winequality)
target_name <- "high_quality"

Let us call the get_beta_fit() from Chapter 13 to get our priors.

priors_winequality <- get_beta_fit(
  dataset = tb_winequality, target_name = target_name, seed = 1234
)

Let us save the results and the dataset:

save(priors_winequality, file = "output/real-data/priors_winequality.rda")
save(tb_winequality, file = "output/real-data/tb_winequality.rda")
plot_hist_scores_beta(priors_winequality, "winequality")
Figure 14.9: Distribution of estimated probabilities by the GAMSEL model and Beta distribution fitted to the scores of each of the three models, for the Wine Quality dataset.

14.1.10 Spambase

name <- "spambase"

The dataset needs to be downloaded.

Code to download the data
if (!dir.exists("data")) dir.create("data")
# Spambase is dataset 94 in the UCI repository (id 2 is the Adult dataset),
# so the archive must be fetched from `static/public/94/`.
download.file(
  url = str_c("https://archive.ics.uci.edu/static/public/94/", name, ".zip"),
  destfile = str_c("data/", name, ".zip")
)

# Read the `.names` description file from the archive, one element per line.
info_data <- scan(
  unz(str_c("data/", name, ".zip"), str_c(name, ".names")), 
  what = "character", sep = "\n"
)
# Print the names for this dataset (not very convenient...)
# Each line is cut at its last ":" and printed as a quoted, comma-terminated
# string, ready to be pasted into `col_names`.
# NOTE(review): the start index 94 is identical to the one used for the Adult
# dataset — confirm it matches the layout of this dataset's `.names` file.
str_extract(info_data[94:length(info_data)], "^(.*):") |> 
  str_remove(":$") |> 
  (\(.x) str_c('"', .x, '",'))() |> 
  cat()

Then, we can import the dataset:

Code to import the data
tb_spambase <- read_csv(
  file = unz(str_c("data/", name, ".zip"), str_c(name, ".data")),
  col_names = c(
    "word_freq_make", "word_freq_address", "word_freq_all", "word_freq_3d",
    "word_freq_our", "word_freq_over", "word_freq_remove", "word_freq_internet",
    "word_freq_order", "word_freq_mail", "word_freq_receive", "word_freq_will",
    "word_freq_people", "word_freq_report", "word_freq_addresses",
    "word_freq_free", "word_freq_business", "word_freq_email", "word_freq_you",
    "word_freq_credit", "word_freq_your", "word_freq_font", "word_freq_000",
    "word_freq_money", "word_freq_hp", "word_freq_hpl", "word_freq_george",
    "word_freq_650", "word_freq_lab", "word_freq_labs", "word_freq_telnet",
    "word_freq_857", "word_freq_data", "word_freq_415", "word_freq_85",
    "word_freq_technology", "word_freq_1999", "word_freq_parts", "word_freq_pm",
    "word_freq_direct", "word_freq_cs", "word_freq_meeting",
    "word_freq_original", "word_freq_project", "word_freq_re", "word_freq_edu",
    "word_freq_table", "word_freq_conference", "char_freq_;", "char_freq_(",
    "char_freq_[", "char_freq_!", "char_freq_$", "char_freq_#",
    "capital_run_length_average", "capital_run_length_longest",
    "capital_run_length_total", "is_spam"
  ),
  show_col_types = FALSE
)

The target variable:

target_name <- "is_spam"

Let us call the get_beta_fit() from Chapter 13 to get our priors.

priors_spambase <- get_beta_fit(
  dataset = tb_spambase, target_name = target_name, seed = 1234
)

Let us save the results and the dataset:

save(priors_spambase, file = str_c("output/real-data/priors_spambase.rda"))
save(tb_spambase, file = "output/real-data/tb_spambase.rda")
plot_hist_scores_beta(priors_spambase, "spambase")
Figure 14.10: Distribution of estimated probabilities by the GAMSEL model and Beta distribution fitted to the scores of each of the three models, for the spambase dataset.

14.2 Summary

Code to get the key characteristics of the datasets
# One row per dataset: short name (matching the `tb_<name>` object created
# above), name of the binary target column, and citation key.
datasets <- tribble(
  ~name, ~target_name, ~reference,
  "abalone", "Sex", "@misc_abalone_1",
  "adult", "high_income", "@misc_adult_2",
  "bank", "y", "@misc_bank_marketing_222",
  "default", "default", "@misc_default_of_credit_card_clients_350",
  "drybean", "is_dermason", "@misc_dry_bean_602",
  "coupon", "y", "@misc_vehicle_coupon_recommendation_603",
  "mushroom", "edible", "@misc_mushroom_73",
  "occupancy", "Occupancy", "@misc_occupancy_detection__357",
  "winequality", "high_quality", "@misc_wine_quality_186",
  "spambase", "is_spam", "@misc_spambase_94"
)

# Build one summary row per dataset. `seq_len()` is used instead of `1:nrow()`
# so that the loop is skipped cleanly if `datasets` is empty.
dataset_info <- vector(mode = "list", length = nrow(datasets))
for (i in seq_len(nrow(datasets))) {
  name <- datasets$name[i]
  target_name <- datasets$target_name[i]
  current_data <- get(str_c("tb_", name))
  # Column names are held as strings: extract with `[[` and select with
  # `all_of()`.
  current_target <- current_data[[target_name]]
  current_ref <- datasets$reference[i]
  n <- nrow(current_data)
  n_col <- ncol(current_data)
  # Number of numeric columns, target excluded
  n_numeric <- current_data |>
    select(-all_of(target_name)) |>
    select(where(is.numeric)) |>
    ncol()
  dataset_info[[i]] <- tibble(
    Dataset = name,
    n = n,
    `# features` = n_col - 1,
    `# numeric features` = n_numeric,
    `Prop. target = 1` = round(sum(current_target == 1) / n, 2),
    Reference = current_ref
  )
}

dataset_info <- list_rbind(dataset_info)
knitr::kable(dataset_info, booktabs = TRUE, format.args = list(big.mark = ","))
Table 14.1: Key characteristics of the datasets.
Dataset n # features # numeric features Prop. target = 1 Reference
abalone 4,177 8 8 0.37 Nash et al. (1995)
adult 32,561 14 6 0.24 Becker and Kohavi (1996)
bank 45,211 16 7 0.12 Moro, Rita, and Cortez (2012)
default 30,000 23 14 0.22 Yeh (2016)
drybean 13,611 16 16 0.26 Dry Bean (2020)
coupon 12,079 22 0 0.57 In-Vehicle Coupon Recommendation (2020)
mushroom 8,124 21 0 0.52 Mushroom (1987)
occupancy 20,560 5 5 0.23 Candanedo (2016)
winequality 6,495 12 11 0.63 Cortez et al. (2009)
spambase 4,601 57 57 0.39 Hopkins et al. (1999)