Fitting varying coefficient models with flexBART • flexBART

Overview

flexBART now supports fitting varying coefficient models with Bayesian ensembles of regression trees. Specifically, it can be used to fit models of the form \[ Y = \beta_{0}(X) + \beta_{1}(X) \times Z_{1} + \cdots + \beta_{R-1}(X) \times Z_{R-1} + \sigma \epsilon \] where each function $\beta_{r}(X)$ is approximated with an ensemble of regression trees. flexBART allows the size (specified using the optional argument M_vec or M), the tree prior hyperparameters (specified using the optional arguments alpha_vec and beta_vec), and the variables used for splitting (specified via the formula argument) to vary in each ensemble. It further allows for redundancy/non-identifiability in which some of the $Z_{r}$’s are exactly equal to one another. This is useful if one wishes to fit, say, an additive model with BART (e.g., $Y = f_{1}(X_{1}) + f_{2}(X_{2})$ with BART priors on $f_{1}$ and $f_{2}$).

Identified example

We first demonstrate flexBART’s functionality using data generated from the model $Y = \beta_{0}(X) + \beta_{1}(X)Z_{1} + \beta_{2}(X)Z_{2} + \sigma\epsilon$ where $X$ contains $p_{\text{cont}} = 10$ continuous and $p_{\text{cat}} = 10$ categorical covariates. Each categorical covariate will take on 10 different values.

We generate $n = 10000$ training observations and $n = 5000$ testing observations.

set.seed(727)
n_train <- 10000
n_test <- 1000
p_cont <- 10
p_cat <- 10
sigma <- 1

train_data <- data.frame(Y = rep(NA, times = n_train))
test_data <- data.frame(Y = rep(NA, times = n_test))
for(j in 1:p_cont){
  train_data[,paste0("X",j)] <- 
    runif(n = n_train, min = -1, max = 1)
  test_data[,paste0("X",j)] <- 
    runif(n = n_test, min = -1, max = 1)
}
for(j in 1:p_cat){
  train_data[,paste0("X",j+p_cont)] <- 
    factor(sample(0:9, size = n_train, replace = TRUE), levels = 0:9)
  test_data[,paste0("X",j+p_cont)] <-
    factor(sample(0:9, size = n_test, replace = TRUE), levels = 0:9)
}

train_data[,"Z1"] <-
  rnorm(n = n_train, mean = 0, sd = 1)
test_data[,"Z1"] <-
  rnorm(n = n_test, mean = 0, sd = 1)

train_data[,"Z2"] <-
  rnorm(n = n_train, mean = 0, sd = 1)
test_data[,"Z2"] <-
  rnorm(n = n_test, mean = 0, sd = 1)

We define the true coefficient functions below.

beta0_true <- function(df){
  tmp_X1 <- (1+df[,"X1"])/2 # re-scale X1 to [0,1]
  tmp_X11 <- 1*(df[,"X11"] %in% c(0, 1, 4,5,6,8,9))
  return(3 * tmp_X1 + (2 - 5 * (tmp_X11)) * sin(pi * tmp_X1) - 2 * (tmp_X11))
}

set.seed(517)
D <- 5000
omega0 <- rnorm(n = D, mean = 0, sd = 1.5)
b0 <- 2 * pi * runif(n = D, min = 0, max = 1)
weights <- rnorm(n = D, mean = 0, sd = 2*1/sqrt(D))
beta1_true <- function(df){
  u <- 5*df[,"X1"] # re-scale x1 to [-5,5]
  if(length(u) == 1){
    phi <- sqrt(2) * cos(b0 + omega0*u)
    out <- sum(weights * phi)
  } else{
    phi_mat <- matrix(nrow = length(u), ncol = D)
    for(d in 1:D){
      phi_mat[,d] <- sqrt(2) * cos(b0[d] + omega0[d] * u)
    }
    out <- phi_mat %*% weights
  }
  return(out)
}

beta2_true <- function(df){
  tmp_X <- (1+df[,"X1"])/2 #re-scale X1 to [0,1]
  return(3 - 3*cos(6*pi*tmp_X) * tmp_X^2) * (tmp_X > 0.6) - 
    (10 * sqrt(tmp_X)) * (tmp_X < 0.25)
}

We now evaluate the true slopes and response surface function $\mathbb{E}[Y \vert X, Z]$ on the training and testing observations

beta0_train <- beta0_true(train_data)
beta0_test <- beta0_true(test_data)

beta1_train <- beta1_true(train_data)
beta1_test <- beta1_true(test_data)

beta2_train <- beta2_true(train_data)
beta2_test <- beta2_true(test_data)

mu_train <-
  beta0_train + train_data[,"Z1"] * beta1_train + train_data[,"Z2"] * beta2_train

mu_test <- 
  beta0_test + test_data[,"Z1"] * beta1_test + test_data[,"Z2"] * beta2_test

train_data[,"Y"] <-
  mu_train + sigma * rnorm(n = n_train, mean = 0, sd = 1)

test_data <- test_data[,colnames(test_data) != "Y"]

We’re now ready to fit our model.

set.seed(99)
fit <-
  flexBART::flexBART(formula = Y ~ bart(.) + Z1 * bart(.) + Z2 * bart(.),
                     train_data = train_data,
                     test_data = test_data)

flexBART::flexBART returns the posterior mean estimate and posterior samples for each $\beta_{j}(x)$ evaluation (outputs named beta) and for the whole response surface (outputs named yhat). We first plot the posterior mean evaluations of each $\beta_{j}$ and the response surface for both training and testing.

# Okabe-Ito colorblind friendly palette
oi_colors <- c("#999999", "#E69F00", "#56B4E9", "#009E73", 
               "#F0E442", "#0072B2", "#D55E00", "#CC79A7")
par(mar = c(3,3,2,1), mgp = c(1.8, 0.5, 0), mfrow = c(2,4))

plot(mu_train, fit$yhat.train.mean, pch = 16, cex = 0.5,
     xlab = "Actual", ylab = "Posterior Mean", main = "Response surface (train)")
abline(a = 0, b = 1, col = oi_colors[3])
plot(beta0_train, fit$beta.train.mean[,1], pch = 16, cex = 0.5,
     xlab = "Actual", ylab = "Posterior Mean", main = "Intercept (train)")
abline(a = 0, b = 1, col = oi_colors[3])
plot(beta1_train, fit$beta.train.mean[,2], pch = 16, cex = 0.5,
     xlab = "Actual", ylab = "Posterior Mean", main = "beta1 (train)")
abline(a = 0, b = 1, col = oi_colors[3])
plot(beta2_train, fit$beta.train.mean[,3], pch = 16, cex = 0.5,
     xlab = "Actual", ylab = "Posterior Mean", main = "beta2 (train)")
abline(a = 0, b = 1, col = oi_colors[3])


plot(mu_test, fit$yhat.test.mean, pch = 16, cex = 0.5,
     xlab = "Actual", ylab = "Posterior Mean", main = "Response surface (test)")
abline(a = 0, b = 1, col = oi_colors[3])

plot(beta0_test, fit$beta.test.mean[,1], pch = 16, cex = 0.5,
     xlab = "Actual", ylab = "Posterior Mean", main = "Intercept (test)")
abline(a = 0, b = 1, col = oi_colors[3])

plot(beta1_test, fit$beta.test.mean[,2], pch = 16, cex = 0.5,
     xlab = "Actual", ylab = "Posterior Mean", main = "beta1 (test)")
abline(a = 0, b = 1, col = oi_colors[3])

plot(beta2_test, fit$beta.test.mean[,3], pch = 16, cex = 0.5,
     xlab = "Actual", ylab = "Posterior Mean", main = "beta2 (test)")
abline(a = 0, b = 1, col = oi_colors[3])

### Pointwise uncertainty We can also assess the coverage of the pointwise credible intervals

coverage <- 
  matrix(nrow = 2, ncol = 4,
         dimnames = list(c("train", "test"),
                         c("mu", "beta0", "beta1", "beta2")))

mu_train_l95 <- 
  apply(fit$yhat.train, MARGIN = 2, FUN = quantile, probs = 0.025)
mu_train_u95 <- 
  apply(fit$yhat.train, MARGIN = 2, FUN = quantile, probs = 0.975)
beta_train_l95 <-
  apply(fit$beta.train, MARGIN = c(2,3), FUN = quantile, probs = 0.025)
beta_train_u95 <-
  apply(fit$beta.train, MARGIN = c(2,3), FUN = quantile, probs = 0.975)

mu_test_l95 <- 
  apply(fit$yhat.test, MARGIN = 2, FUN = quantile, probs = 0.025)
mu_test_u95 <- 
  apply(fit$yhat.test, MARGIN = 2, FUN = quantile, probs = 0.975)
beta_test_l95 <-
  apply(fit$beta.test, MARGIN = c(2,3), FUN = quantile, probs = 0.025)
beta_test_u95 <-
  apply(fit$beta.test, MARGIN = c(2,3), FUN = quantile, probs = 0.975)

coverage["train", "mu"] <-
  mean(mu_train >= mu_train_l95 & mu_train <= mu_train_u95)
coverage["train", "beta0"] <-
  mean(beta0_train >= beta_train_l95[,1] & beta0_train <= beta_train_u95[,1])
coverage["train", "beta1"] <-
  mean(beta1_train >= beta_train_l95[,2] & beta1_train <= beta_train_u95[,2])
coverage["train", "beta2"] <-
  mean(beta2_train >= beta_train_l95[,3] & beta2_train <= beta_train_u95[,3])


coverage["test", "mu"] <-
  mean(mu_test >= mu_test_l95 & mu_test <= mu_test_u95)
coverage["test", "beta0"] <-
  mean(beta0_test >= beta_test_l95[,1] & beta0_test <= beta_test_u95[,1])
coverage["test", "beta1"] <-
  mean(beta1_test >= beta_test_l95[,2] & beta1_test <= beta_test_u95[,2])
coverage["test", "beta2"] <-
  mean(beta2_test >= beta_test_l95[,3] & beta2_test <= beta_test_u95[,3])

print(round(coverage, digits = 3))
#>          mu beta0 beta1 beta2
#> train 0.970  0.97 0.953 0.963
#> test  0.979  0.97 0.960 0.959

Variable selection

By default flexBART::flexBART uses Linero (2018)’s sparsity-inducing Dirichlet prior on splitting variables. Following the recommendations from Section 3.2 of that paper, we can compute the posterior probability that each $X_{j}$ is used at least once in the ensemble for each $\beta_{r}(X).$ We can then select those covariates for which this probability exceeds 50%. The varcounts element of flexBART::flexBART’s output is an array holding the posterior samples of the counts of rules based on covariate.

dim(fit$varcounts)
#> [1] 4000   20    3

The first dimension of fit$varcounts indexes MCMC samples. The second indexes covariates (i.e., elements of $X$). The third indexes ensembles (i.e., $\beta_{0}$, , $\beta_{R-1}$).

The selected variables are

marginal_split_prob <- 
  apply(fit$varcounts >= 1, MARGIN = c(2,3), FUN = mean)
which(marginal_split_prob >= 0.5, arr.ind = TRUE)
#>     row col
#> X1    1   1
#> X2    2   1
#> X11  11   1
#> X1    1   2
#> X1    1   3

We see that $X_{1}, X_{2}$ and $X_{11}$ are selected for $\beta_{0};$ $X_{1}$ is selected for $\beta_{2};$ and $X_{1}$ is selected for $\beta_{3}.$ So, at least for this run, there is one false positive and no false negative identifications

Redundant/non-identified example

flexBART was originally envisioned to facilitate additive BART models and multi-level BART models. For instance ones with some ensembles splitting on grouping variables (to induce group-specific intercepts) and others splitting only on covariates. All of these can be viewed as special cases of varying coefficient models in which some of the $Z_{r}$’s are identical. The formula parser in flexBART::flexBART can identify those instances. When there are redundancies, the function returns the raw posterior samples of each ensemble on the standardized scale and posterior samples of the identified components on the original scale.

To illustrate, suppose we generate data from the redundant model \[ Y = [\beta_{0}(X) + \beta_{1}(X] + [\beta_{2}(X) + \beta_{3}(X)] \times Z_{1} + [\beta_{4}(X) + \beta_{5}(X)] \times Z_{2} + \sigma \epsilon. \] If we specify a formula like

Y~bart(.) + bart(.) + Z1*bart(.) + Z1*bart(.) + Z2*bart(.) + Z2*bart(),

flexBART::flexBART will return posterior samples for 6 ensembles on the standardized scale. These samples are saved in the raw_beta slot of the output. As written, although the functions $\beta_{0}$ and $\beta_{1}$ are not identified, their sum is. So, flexBART::flexBART also returns posterior draws of their sum on the standardized scale.

Generating data from a redundant VC model

To illustrate, we introduce a few more functions

beta3_true <- function(df){
  tmp_X <- (1+df[,paste0("X", 1:p_cont)])/2
  return(exp(sin((0.9 * (tmp_X[,1] + 0.48))^10)) + tmp_X[,2] * tmp_X[,3] + tmp_X[,4])
}
beta4_true <- function(df){
  tmp_X <- (1+df[,paste0("X", 1:p_cont)])/2
  return(exp(3*tmp_X[,5])*cos(7*pi*tmp_X[,5]/2))
}
beta5_true <- function(df){
  tmp_X <- (1+df[,1:p_cont])/2
  return(10*sin(pi*tmp_X[,6] * tmp_X[,7]) + 
           20 * (tmp_X[,8] - 0.5)^2 + 
           10 * tmp_X[,9] + 
           5 * tmp_X[,10])
}

We now generate $X$ and $Z$ values and evaluate the true slopes and response surface for training and testing observations.

set.seed(727)
train_data <- data.frame(Y = rep(NA, times = n_train))
test_data <- data.frame(Y = rep(NA, times = n_test))
for(j in 1:p_cont){
  train_data[,paste0("X",j)] <- 
    runif(n = n_train, min = -1, max = 1)
  test_data[,paste0("X",j)] <- 
    runif(n = n_test, min = -1, max = 1)
}
for(j in 1:p_cat){
  train_data[,paste0("X",j+p_cont)] <- 
    factor(sample(0:9, size = n_train, replace = TRUE), levels = 0:9)
  test_data[,paste0("X",j+p_cont)] <-
    factor(sample(0:9, size = n_test, replace = TRUE), levels = 0:9)
}

train_data[,"Z1"] <-
  rnorm(n = n_train, mean = 0, sd = 1)
test_data[,"Z1"] <-
  rnorm(n = n_test, mean = 0, sd = 1)

train_data[,"Z2"] <-
  rnorm(n = n_train, mean = 0, sd = 1)
test_data[,"Z2"] <-
  rnorm(n = n_test, mean = 0, sd = 1)


beta0_train <- beta0_true(train_data)
beta0_test <- beta0_true(test_data)

beta1_train <- beta1_true(train_data)
beta1_test <- beta1_true(test_data)

beta2_train <- beta2_true(train_data)
beta2_test <- beta2_true(test_data)

beta3_train <- beta3_true(train_data)
beta3_test <- beta3_true(test_data)

beta4_train <- beta4_true(train_data)
beta4_test <- beta4_true(test_data)

beta5_train <- beta5_true(train_data)
beta5_test <- beta5_true(test_data)



mu_train <-
  (beta0_train + beta1_train) +
  train_data[,"Z1"] * (beta2_train + beta3_train) + 
  train_data[,"Z2"] * (beta4_train + beta5_train)

mu_test <-
  (beta0_test + beta1_test) +
  test_data[,"Z1"] * (beta2_test + beta3_test) + 
  test_data[,"Z2"] * (beta4_test + beta5_test)

train_data[,"Y"] <-
  mu_train + sigma * rnorm(n = n_train, mean = 0, sd = 1)

test_data <- test_data[,colnames(test_data) != "Y"]

Fitting a redundant model

set.seed(99)
fit <-flexBART::flexBART(
  formula = Y ~ bart(.) + bart(.) + Z1 * bart(.) + Z1 * bart(.) + Z2 * bart(.) + Z2 * bart(.),
  train_data = train_data,
  test_data = test_data)

Evaluations of the identified parameters are available in fit$beta.train and fit$beta.test

par(mar = c(3,3,2,1), mgp = c(1.8, 0.5, 0), mfrow = c(2,4))
plot(mu_train, fit$yhat.train.mean, 
     pch = 16, cex = 0.5,
     xlab = "Actual", ylab = "Posterior Mean", 
     main = "Response surface (train)")
abline(a = 0, b = 1, col = oi_colors[3])
plot(beta0_train + beta1_train, fit$beta.train.mean[,1], 
     pch = 16, cex = 0.5,
     xlab = "Actual", ylab = "Posterior Mean", 
     main = "beta0+beta1 (train)")
abline(a = 0, b = 1, col = oi_colors[3])
plot(beta2_train + beta3_train, fit$beta.train.mean[,2], 
     pch = 16, cex = 0.5,
     xlab = "Actual", ylab = "Posterior Mean", main = "beta2+beta3 (train)")
abline(a = 0, b = 1, col = oi_colors[3])
plot(beta4_train+beta5_train, fit$beta.train.mean[,3], 
     pch = 16, cex = 0.5,
     xlab = "Actual", ylab = "Posterior Mean", main = "beta4+beta5 (train)")
abline(a = 0, b = 1, col = oi_colors[3])

plot(mu_test, fit$yhat.test.mean, 
     pch = 16, cex = 0.5,
     xlab = "Actual", ylab = "Posterior Mean", 
     main = "Response surface (test)")
abline(a = 0, b = 1, col = oi_colors[3])
plot(beta0_test + beta1_test, fit$beta.test.mean[,1], 
     pch = 16, cex = 0.5,
     xlab = "Actual", ylab = "Posterior Mean", 
     main = "beta0+beta1 (test)")
abline(a = 0, b = 1, col = oi_colors[3])
plot(beta2_test + beta3_test, fit$beta.test.mean[,2], 
     pch = 16, cex = 0.5,
     xlab = "Actual", ylab = "Posterior Mean", main = "beta2+beta3 (test)")
abline(a = 0, b = 1, col = oi_colors[3])
plot(beta4_test+beta5_test, fit$beta.test.mean[,3], 
     pch = 16, cex = 0.5,
     xlab = "Actual", ylab = "Posterior Mean", main = "beta4+beta5 (test)")
abline(a = 0, b = 1, col = oi_colors[3])