
Overview

flexBART now supports fitting logistic models with Bayesian ensembles of regression trees. Specifically, it can be used to fit models of the form \[ Y \sim \text{Bernoulli}(p), \] where the log-odds of \(p\) is modeled as a linear combination of covariates \(Z_{1}, \ldots, Z_{R-1}\) with covariate-dependent coefficients \[ \log\left(\frac{p}{1-p}\right) = \beta_{0}(X) + \beta_{1}(X) \times Z_{1} + \cdots + \beta_{R-1}(X) \times Z_{R-1} \] and each function \(\beta_{r}(X)\) is approximated with an ensemble of regression trees. flexBART allows the size of each ensemble (specified using the optional argument M_vec or M), the tree prior hyperparameters (specified using the optional arguments alpha_vec and beta_vec), and the variables used for splitting (specified via the formula argument) to vary across ensembles.
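
For instance, a call combining these options might look like the sketch below. The argument names M_vec, alpha_vec, and beta_vec are those described above, but the data frame my_train and the specific values shown are illustrative placeholders, not package defaults.

# A sketch of a logistic varying-coefficient fit with per-ensemble settings.
# my_train is a hypothetical data frame containing Y, Z1, and the X's.
fit <- flexBART::flexBART(
  formula = Y ~ bart(.) + Z1 * bart(.),  # one tree ensemble per coefficient
  family = binomial(link = "logit"),
  train_data = my_train,
  M_vec = c(200, 50),         # assumed values: ensemble sizes for beta0 and beta1
  alpha_vec = c(0.95, 0.95),  # assumed values: tree prior hyperparameters
  beta_vec = c(2, 2))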

Example

We first demonstrate logit_flexBART’s functionality using data generated from the model \(\log\left(p/(1-p)\right) = \beta_{0}(X) + \beta_{1}(X)Z_{1} + \beta_{2}(X)Z_{2}\), \(Y \sim \text{Bernoulli}(p)\), where \(X\) contains \(p_{\text{cont}} = 10\) continuous and \(p_{\text{cat}} = 10\) categorical covariates. Each categorical covariate takes on 10 different values.

We generate \(n_{\text{train}} = 10000\) training observations and \(n_{\text{test}} = 1000\) testing observations.

set.seed(107)
n_train <- 10000
n_test <- 1000
p_cont <- 10
p_cat <- 10

train_data <- data.frame(Y = rep(NA, times = n_train))
test_data <- data.frame(Y = rep(NA, times = n_test))
# continuous covariates X1, ..., X10, drawn uniformly from [-1,1]
for(j in 1:p_cont){
  train_data[,paste0("X",j)] <- 
    runif(n = n_train, min = -1, max = 1)
  test_data[,paste0("X",j)] <- 
    runif(n = n_test, min = -1, max = 1)
}
# categorical covariates X11, ..., X20, each with 10 levels (0, ..., 9)
for(j in 1:p_cat){
  train_data[,paste0("X",j+p_cont)] <- 
    factor(sample(0:9, size = n_train, replace = TRUE), levels = 0:9)
  test_data[,paste0("X",j+p_cont)] <-
    factor(sample(0:9, size = n_test, replace = TRUE), levels = 0:9)
}

train_data[,"Z1"] <-
  rnorm(n = n_train, mean = 0, sd = 1)
test_data[,"Z1"] <-
  rnorm(n = n_test, mean = 0, sd = 1)

train_data[,"Z2"] <-
  rnorm(n = n_train, mean = 0, sd = 1)
test_data[,"Z2"] <-
  rnorm(n = n_test, mean = 0, sd = 1)

We define the true coefficient functions below.

beta0_true <- function(df){
  tmp_X1 <- (1+df[,"X1"])/2 # re-scale X1 to [0,1]
  tmp_X11 <- 1*(df[,"X11"] %in% c(0, 1, 4, 5, 6, 8, 9))
  return(3 * tmp_X1 + (2 - 5 * (tmp_X11)) * sin(pi * tmp_X1) - 2 * (tmp_X11))
}

# build beta1 from D random Fourier features
D <- 5000
omega0 <- rnorm(n = D, mean = 0, sd = 1.5)    # feature frequencies
b0 <- 2 * pi * runif(n = D, min = 0, max = 1) # feature phases
weights <- rnorm(n = D, mean = 0, sd = 2*1/sqrt(D))
beta1_true <- function(df){
  u <- 5 * df[,"X1"] # re-scale X1 to [-5,5]
  if(length(u) == 1){
    phi <- sqrt(2) * cos(b0 + omega0 * u)
    out <- sum(weights * phi)
  } else{
    phi_mat <- matrix(nrow = length(u), ncol = D)
    for(d in 1:D){
      phi_mat[,d] <- sqrt(2) * cos(b0[d] + omega0[d] * u)
    }
    out <- phi_mat %*% weights
  }
  return(out)
}
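
Because \(\mathbb{E}\left[2\cos(b_{d} + \omega_{d}u)\cos(b_{d} + \omega_{d}u')\right] = \exp\left(-1.5^{2}(u-u')^{2}/2\right)\) when \(b_{d} \sim \text{Uniform}(0, 2\pi)\) and \(\omega_{d} \sim \mathcal{N}(0, 1.5^{2}),\) this random Fourier feature construction approximately draws \(\beta_{1}\) from a mean-zero Gaussian process with a squared exponential covariance kernel, yielding a smooth but wiggly function of \(X_{1}.\)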

beta2_true <- function(df){
  tmp_X <- (1 + df[,"X1"]) / 2 # re-scale X1 to [0,1]
  return((3 - 3 * cos(6*pi*tmp_X) * tmp_X^2) * (tmp_X > 0.6) - 
           (10 * sqrt(tmp_X)) * (tmp_X < 0.25))
}

We now evaluate the true coefficient functions, the linear predictor, and the response surface \(\mathbb{E}[Y \vert X, Z] = p\) at the training and testing observations.

beta0_train <- beta0_true(train_data)
beta0_test <- beta0_true(test_data)

beta1_train <- beta1_true(train_data)
beta1_test <- beta1_true(test_data)

beta2_train <- beta2_true(train_data)
beta2_test <- beta2_true(test_data)

mu_train <-
  beta0_train + train_data[,"Z1"] * beta1_train + train_data[,"Z2"] * beta2_train

mu_test <- 
  beta0_test + test_data[,"Z1"] * beta1_test + test_data[,"Z2"] * beta2_test

# apply the inverse logit to recover the success probabilities
prob_train <- 1/(1 + exp(-mu_train))
prob_test <- 1/(1 + exp(-mu_test))
train_data[,"Y"] <- rbinom(n = length(mu_train), size = 1, prob = prob_train)

We’re now ready to fit our model.

fit <-
  flexBART::flexBART(formula = Y ~ bart(.) + Z1 * bart(.) + Z2 * bart(.),
                     family = binomial(link = "logit"),
                     train_data = train_data,
                     test_data = test_data)

flexBART::flexBART returns the posterior mean estimate and posterior samples of each \(\beta_{r}(x)\) evaluation (in the outputs whose names contain beta) and of the success probability (outputs whose names contain prob). We first plot the posterior mean evaluations of each \(\beta_{r}\) and of the response surface for both the training and testing data.

# Okabe-Ito colorblind friendly palette
oi_colors <- c("#999999", "#E69F00", "#56B4E9", "#009E73", 
               "#F0E442", "#0072B2", "#D55E00", "#CC79A7")
par(mar = c(3,3,2,1), mgp = c(1.8, 0.5, 0), mfrow = c(2,4))

plot(prob_train, fit$prob.train.mean, pch = 16, cex = 0.5,
     xlab = "Actual", ylab = "Posterior Mean", main = "Response surface (train)")
abline(a = 0, b = 1, col = oi_colors[3])
plot(beta0_train, fit$beta.train.mean[,1], pch = 16, cex = 0.5,
     xlab = "Actual", ylab = "Posterior Mean", main = "Intercept (train)")
abline(a = 0, b = 1, col = oi_colors[3])
plot(beta1_train, fit$beta.train.mean[,2], pch = 16, cex = 0.5,
     xlab = "Actual", ylab = "Posterior Mean", main = "beta1 (train)")
abline(a = 0, b = 1, col = oi_colors[3])
plot(beta2_train, fit$beta.train.mean[,3], pch = 16, cex = 0.5,
     xlab = "Actual", ylab = "Posterior Mean", main = "beta2 (train)")
abline(a = 0, b = 1, col = oi_colors[3])


plot(prob_test, fit$prob.test.mean, pch = 16, cex = 0.5,
     xlab = "Actual", ylab = "Posterior Mean", main = "Response surface (test)")
abline(a = 0, b = 1, col = oi_colors[3])

plot(beta0_test, fit$beta.test.mean[,1], pch = 16, cex = 0.5,
     xlab = "Actual", ylab = "Posterior Mean", main = "Intercept (test)")
abline(a = 0, b = 1, col = oi_colors[3])

plot(beta1_test, fit$beta.test.mean[,2], pch = 16, cex = 0.5,
     xlab = "Actual", ylab = "Posterior Mean", main = "beta1 (test)")
abline(a = 0, b = 1, col = oi_colors[3])

plot(beta2_test, fit$beta.test.mean[,3], pch = 16, cex = 0.5,
     xlab = "Actual", ylab = "Posterior Mean", main = "beta2 (test)")
abline(a = 0, b = 1, col = oi_colors[3])

Pointwise uncertainty

We can also assess the coverage of the pointwise 95% credible intervals.

coverage <- 
  matrix(nrow = 2, ncol = 4,
         dimnames = list(c("train", "test"),
                         c("p", "beta0", "beta1", "beta2")))

p_train_l95 <- 
  apply(fit$prob.train, MARGIN = 2, FUN = quantile, probs = 0.025)
p_train_u95 <- 
  apply(fit$prob.train, MARGIN = 2, FUN = quantile, probs = 0.975)
beta_train_l95 <-
  apply(fit$beta.train, MARGIN = c(2,3), FUN = quantile, probs = 0.025)
beta_train_u95 <-
  apply(fit$beta.train, MARGIN = c(2,3), FUN = quantile, probs = 0.975)

p_test_l95 <- 
  apply(fit$prob.test, MARGIN = 2, FUN = quantile, probs = 0.025)
p_test_u95 <- 
  apply(fit$prob.test, MARGIN = 2, FUN = quantile, probs = 0.975)
beta_test_l95 <-
  apply(fit$beta.test, MARGIN = c(2,3), FUN = quantile, probs = 0.025)
beta_test_u95 <-
  apply(fit$beta.test, MARGIN = c(2,3), FUN = quantile, probs = 0.975)

coverage["train", "p"] <-
  mean(prob_train >= p_train_l95 & prob_train <= p_train_u95)
coverage["train", "beta0"] <-
  mean(beta0_train >= beta_train_l95[,1] & beta0_train <= beta_train_u95[,1])
coverage["train", "beta1"] <-
  mean(beta1_train >= beta_train_l95[,2] & beta1_train <= beta_train_u95[,2])
coverage["train", "beta2"] <-
  mean(beta2_train >= beta_train_l95[,3] & beta2_train <= beta_train_u95[,3])


coverage["test", "p"] <-
  mean(prob_test >= p_test_l95 & prob_test <= p_test_u95)
coverage["test", "beta0"] <-
  mean(beta0_test >= beta_test_l95[,1] & beta0_test <= beta_test_u95[,1])
coverage["test", "beta1"] <-
  mean(beta1_test >= beta_test_l95[,2] & beta1_test <= beta_test_u95[,2])
coverage["test", "beta2"] <-
  mean(beta2_test >= beta_test_l95[,3] & beta2_test <= beta_test_u95[,3])

print(round(coverage, digits = 3))
#>           p beta0 beta1 beta2
#> train 0.374 0.310 0.368 0.137
#> test  0.364 0.279 0.357 0.158

At least for this run, the pointwise 95% intervals cover the true evaluations far less often than their nominal level, both in- and out-of-sample.

Variable selection

By default flexBART::flexBART uses Linero (2018)’s sparsity-inducing Dirichlet prior on the splitting variables. Following the recommendations in Section 3.2 of that paper, we can compute the posterior probability that each \(X_{j}\) is used at least once in the ensemble for \(\beta_{r}(X)\) and then select those covariates for which this probability exceeds 50%. The varcounts element of the output is an array holding, for each posterior sample and each ensemble, the number of decision rules that split on each covariate.

dim(fit$varcounts)
#> [1] 4000   20    3

The first dimension of fit$varcounts indexes MCMC samples. The second indexes covariates (i.e., elements of \(X\)). The third indexes ensembles (i.e., \(\beta_{0}, \ldots, \beta_{R-1}\)).

The selected variables are

marginal_split_prob <- 
  apply(fit$varcounts >= 1, MARGIN = c(2,3), FUN = mean)
which(marginal_split_prob >= 0.5, arr.ind = TRUE)
#>     row col
#> X1    1   1
#> X11  11   1
#> X1    1   2
#> X1    1   3

We see that \(X_{1}\) and \(X_{11}\) are selected for \(\beta_{0};\) \(X_{1}\) is selected for \(\beta_{1};\) and \(X_{1}\) is selected for \(\beta_{2}.\) So, at least for this run, there are no false positive and no false negative identifications.
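
As a quick programmatic check, we can compare the selected set to the true supports of \(\beta_{0}\), \(\beta_{1}\), and \(\beta_{2}\). This is a sketch that assumes the covariates in varcounts are ordered X1, ..., X20, consistent with the row indices printed above.

# true supports: beta0 depends on X1 and X11; beta1 and beta2 depend only on X1
true_support <- matrix(FALSE, nrow = 20, ncol = 3,
                       dimnames = list(paste0("X", 1:20),
                                       c("beta0", "beta1", "beta2")))
true_support["X1", ] <- TRUE
true_support["X11", "beta0"] <- TRUE
# TRUE when the 50% rule recovers exactly the true supports
identical(unname(marginal_split_prob >= 0.5), unname(true_support))

For the run shown above, this comparison should return TRUE.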