Binomial (Logistic) Regression

UCB Admissions

This is a famous (and a bit old) data set related to a gender discrimination case in the 1970s. It contains data on whether students were admitted to various graduate programs and their gender. (The data here are for the six largest graduate programs.)

data(UCBadmit)
UCB <-
  UCBadmit %>%
  mutate(
    gidx = ifelse(applicant.gender == "male", 1, 2),
    didx = as.numeric(as.factor(dept)),
    p.admit = admit / applications
  )

UCB %>% pander()

dept	applicant.gender	admit	reject	applications	gidx	didx	p.admit
A	male	512	313	825	1	1	0.6206
A	female	89	19	108	2	1	0.8241
B	male	353	207	560	1	2	0.6304
B	female	17	8	25	2	2	0.68
C	male	120	205	325	1	3	0.3692
C	female	202	391	593	2	3	0.3406
D	male	138	279	417	1	4	0.3309
D	female	131	244	375	2	4	0.3493
E	male	53	138	191	1	5	0.2775
E	female	94	299	393	2	5	0.2392
F	male	22	351	373	1	6	0.05898
F	female	24	317	341	2	6	0.07038

Let’s create a logistic regression model with the following core:

alist(
  admit ~ dbinom(applications, p),
  logit(p) <- a[gidx],
  a[gidx] ~ dnorm(0, ???)
)

Choosing the prior

What should we choose for ??? ?

logit(c(0.05, 0.95))

## [1] -2.944439  2.944439

u11.7 <-
  ulam(
    data = UCB %>% select(admit, applications, gidx, didx),
    alist(
      admit ~ dbinom(applications, p),
      logit(p) <- a[gidx],
      a[gidx] ~ dnorm(0, 1.5)
    ),
    chains = 4, log_lik = TRUE,
    file = "fits/u11.7"
  )

What does this model say?

precis(u11.7, depth = 2)

##            mean         sd       5.5%      94.5%    n_eff    Rhat4
## a[1] -0.2201425 0.03854428 -0.2815066 -0.1596877 1350.559 1.001015
## a[2] -0.8299547 0.05144294 -0.9114045 -0.7471930 1342.646 1.000958

u11.7 %>% stanfit() %>% mcmc_areas(pars = vars(-matches("lp"), -matches('log')))

Post11.7 <-
  u11.7 %>% extract.samples() %>% as.data.frame() 
Post11.7 %>%
  head(3) %>% pander()

a.1	a.2
-0.2051	-0.8662
-0.2341	-0.7979
-0.1642	-0.8512

Post11.7 <-
  Post11.7 %>%
  mutate(
    p.1 = ilogit(a.1),
    p.2 = ilogit(a.2),
    a.diff = a.1 - a.2,
    p.diff = p.1 - p.2
  )

Post11.7 %>%
  head(3) %>% pander()

a.1	a.2	p.1	p.2	a.diff	p.diff
-0.2051	-0.8662	0.4489	0.2961	0.6611	0.1529
-0.2341	-0.7979	0.4417	0.3105	0.5637	0.1313
-0.1642	-0.8512	0.459	0.2992	0.687	0.1599

Post11.7 %>% mcmc_areas(pars = vars(matches("a")))

Post11.7 %>% mcmc_areas(pars = vars(matches("p")))

Note: this model is not super profound. It has basically calculated the proportion of men and women who were admitted, and added some information about uncertainty.

UCB %>% 
  group_by(applicant.gender) %>%
  summarise(p.admit = sum(admit) / sum(applications)) %>%
  pander()

applicant.gender	p.admit
female	0.3035
male	0.4452

What is this plot, and can we make a better version ourselves?

postcheck(u11.7)

Avg11.7 <- 
  link(u11.7) %>%
  apply(2, mean_hdi) %>%
  bind_rows() %>% 
  bind_cols(UCB)
Avg11.7 %>% head(3) %>% pander()

Table continues below
y	ymin	ymax	.width	.point	.interval	dept
0.4452	0.426	0.4631	0.95	mean	hdi	A
0.3038	0.2828	0.3252	0.95	mean	hdi	A
0.4452	0.426	0.4631	0.95	mean	hdi	B

applicant.gender	admit	reject	applications	gidx	didx	p.admit
male	512	313	825	1	1	0.6206
female	89	19	108	2	1	0.8241
male	353	207	560	1	2	0.6304

Ind11.7 <- 
  sim(u11.7) %>%
  apply(2, mean_hdi) %>%
  bind_rows() %>% 
  bind_cols(UCB)

Ind11.7 %>% head(3) %>% pander()

Table continues below
y	ymin	ymax	.width	.point	.interval	dept	applicant.gender
368.4	337	399	0.95	mean	hdi	A	male
32.61	22	42	0.95	mean	hdi	A	female
249.5	225	274	0.95	mean	hdi	B	male

admit	reject	applications	gidx	didx	p.admit
512	313	825	1	1	0.6206
89	19	108	2	1	0.8241
353	207	560	1	2	0.6304

Ind11.7 <-
  Ind11.7 %>%
  mutate(
    y = y / applications,
    ymin = ymin / applications,
    ymax = ymax / applications
  )
Ind11.7 %>% head(3) %>% pander()

Table continues below
y	ymin	ymax	.width	.point	.interval	dept
0.4466	0.4085	0.4836	0.95	mean	hdi	A
0.3019	0.2037	0.3889	0.95	mean	hdi	A
0.4455	0.4018	0.4893	0.95	mean	hdi	B

applicant.gender	admit	reject	applications	gidx	didx	p.admit
male	512	313	825	1	1	0.6206
female	89	19	108	2	1	0.8241
male	353	207	560	1	2	0.6304

gf_point(p.admit ~ applicant.gender | ~ dept, data = Avg11.7, color = "steelblue") %>%
  gf_line(group = ~ 1, color = "steelblue") %>%
  gf_pointrange(y + ymin + ymax ~ applicant.gender, data = Ind11.7, size = 0.5, color = "orange") %>%
  gf_pointrange(y + ymin + ymax ~ applicant.gender, data = Avg11.7, size = 1, 
                color = "forestgreen", alpha = 0.5) %>%
  gf_labs(subtitle = "u11.7")

Another model

The pictures above suggest that department might be important to consider.

u11.8 <-
  ulam(
    data = UCB %>% select(admit, applications, gidx, didx),
    alist(
      admit ~ dbinom(applications, p),
      logit(p) <- a[gidx] + d[didx],
      a[gidx] ~ dnorm(0, 1.5),
      d[didx] ~ dnorm(0, 1.5)
    ),
    chains = 4, iter = 4000, log_lik = TRUE,
    file = "fits/u11.8"
  )

precis(u11.8, depth = 2)

##            mean        sd       5.5%      94.5%    n_eff    Rhat4
## a[1] -0.5336758 0.5609700 -1.3972806  0.3813229 491.9636 1.005323
## a[2] -0.4374360 0.5611640 -1.3009684  0.4823896 489.2978 1.005422
## d[1]  1.1142385 0.5627877  0.2028962  1.9850470 493.3756 1.005191
## d[2]  1.0699099 0.5649148  0.1542118  1.9453018 499.5685 1.005046
## d[3] -0.1443865 0.5631952 -1.0602804  0.7233960 486.5720 1.005407
## d[4] -0.1775489 0.5624797 -1.0936324  0.6959173 497.5455 1.005050
## d[5] -0.6207938 0.5657673 -1.5374129  0.2499016 501.6840 1.005434
## d[6] -2.1778875 0.5764149 -3.1025113 -1.2840076 516.3134 1.005045

u11.8 %>% stanfit() %>% mcmc_areas(pars = vars(-matches("lp"), -matches('log')))

In this model, for department x, the log-odds of admission is \(a_1 + d_x\) for men and \(a_2 + d_x\) for women. The difference in log-odds is the log of the odds ratio:

\[ a_1 - d_x - (a_2 + d_x) = a_1 - a_2 = \mbox{difference in log odds} = \mbox{log(odds ratio)} \] So

\[ e^{a_1 - a_2} = \exp(a_1 - a_2) = \mbox{odds ratio} \]

Note that in this model, the odds ratio between acceptance rates for men and women is the same for each department.

u11.8 %>% extract.samples() %>% as.data.frame() %>%
  mutate(diff_a = a.1 - a.2) %>%
  mcmc_areas(pars = vars(matches('diff')))

Odds are odd; Probability, please

If we want to work on a probability scale, we can do that too.

\[ p_1 - p_2 = \operatorname{ilogit}(a_1 + d_x) - \operatorname{ilogit}(a_2 + d_x) \] This doesn’t simplify as nicely but a little numerical calculation shows that \(p_1 - p_2\) depends on the department!

Post11.8 <- 
  u11.8 %>% extract.samples() %>% as.data.frame() %>%
  mutate(
    diff_a = a.1 - a.2,
    diff_p1 = ilogit(a.1 + d.1) - ilogit(a.2 + d.1),
    diff_p6 = ilogit(a.1 + d.6) - ilogit(a.2 + d.6)
  )

Post11.8 %>% mcmc_areas(pars = vars(matches("diff_p")))

So on the probablity scale, we have an interaction between gender and department, but on the log odds scale, we do not.

Rethinking interaction

Let \(f(x_1, x_2, \dots, x_k)\) be our model function with predictors \(x_1, x_2, \dots, x_k\). Two predictors \(x_i\) and \(x_j\) have an interaction if the amount \(f\) changes when we change \(x_i\) depends the value of \(x_j\).

For a discrete predictor (like gender and department), we can determine this by taking a finite difference (comparing \(f\) for two genders or two departments). Sometimes this can easily be done algebraically. Above, we did this numerically and saw that the difference between the model’s) probability of admission for men and for women depends on the department. That’s an interaction.

For a continuous predictor, we can look at partial derivatives. The partial derivatives \(\frac{\partial f}{\partial x_i}\) tell us how \(f\) changes as predictor \(x_i\) changes. If that partial derivative invovles another predictor \(x_j\), then we have an interaction between \(x_i\) and \(x_j\). For a GLM with a link function, this may depend on whether we are thinking on the linear model scale or the link (probability for a binomial model) scale.

Here is an interesting thing. Consider a logistic regression model with

logit(p) <- a + b * x

Then

\[ \frac{\partial \operatorname{logit}(p)}{\partial x} = b \]

But \[ \frac{\partial p}{\partial x} = \frac{\partial}{\partial x} \left( \frac{\exp(a + bx)}{1 + \exp(a+bx)}\right) = \frac{b}{2(1 + \cosh(a + b x))} \] which depends on \(x\). So on the probability scale, \(x\) interacts with itself: How much \(p\) changes as we change \(x\) a fixed amount depends on the value of \(x\).

Again, the important question for determining if you have an interaction is this: Does how much \(f\) changes when we change \(x_i\) by a fixed amount depend on the value of \(x_j\)? We can determine this using either partial derivatives (when a predictor is continuous) or finite differences.

Posterior plots for our two models

# want to make the same plot for multiple models? put it in a function!
ucb_posterior_plot <- 
  function(model, data = model@data) {
    model_name <- deparse(substitute(model))
    
    Avg <- link(model) %>%
      apply(2, mean_hdi) %>%
      bind_rows() %>% 
      mutate(case = 1:n()) %>%
      bind_cols(data)
    
    Ind <- 
      sim(model) %>%
      apply(2, mean_hdi) %>%
      bind_rows() %>% 
      mutate(case = 1:n()) %>%
      bind_cols(data) %>%
      mutate(
        y = y / applications,
        ymin = ymin / applications,
        ymax = ymax / applications
      )
    gf_point(p.admit ~ applicant.gender | ~ dept, data = Avg, color = "steelblue") %>%
      gf_line(group = ~ 1, color = "steelblue") %>%
      gf_pointrange(y + ymin + ymax ~ applicant.gender, data = Ind, 
                    size = 0.5, color = "orange", alpha = 0.6) %>%
      gf_pointrange(y + ymin + ymax ~ applicant.gender, data = Avg, 
                    size = 0.8, color = "forestgreen", alpha = 0.6) %>%
      gf_labs(subtitle = model_name)
  }

ucb_posterior_plot(u11.7, data = UCB) /
ucb_posterior_plot(u11.8, data = UCB)

With some more effort, we could get even fancier and pull additional information out of the model so this sort of thing could work for a wider range of models – more like rethinking::postcheck(). There is a trade-off between generality/ease of use and customization/suitability for a particular situation. It’s good to have some quick tools that work reasonably well in a wide range of settings, but often you will need to customize to get just what you want/need.

See the end of these notes for an example that is more general than the one above, but not as general as rethinking::postcheck().

Compairing with WAIC or PSIS

We know what to expect here based on our plots above, but let’s take a look anyway.

compare(u11.7, u11.8)

##           WAIC        SE    dWAIC      dSE      pWAIC        weight
## u11.8 108.3742  15.56611   0.0000       NA   9.314334  1.000000e+00
## u11.7 992.4804 314.56471 884.1062 327.2919 112.882930 1.044184e-192

compare(u11.7, u11.8, func = PSIS)

## Some Pareto k values are very high (>1). Set pointwise=TRUE to inspect individual points.
## Some Pareto k values are very high (>1). Set pointwise=TRUE to inspect individual points.

##           PSIS        SE    dPSIS      dSE    pPSIS        weight
## u11.8 114.5145  16.96132   0.0000       NA 12.38451  1.000000e+00
## u11.7 958.6966 313.72102 844.1821 312.3773 95.99104 4.877424e-184

compare(u11.7, u11.8) %>% plot()

But

What does “out of sample” mean in this context?
What about those PSIS warnings?

PSIS(u11.7, pointwise = TRUE)

## Some Pareto k values are very high (>1). Set pointwise=TRUE to inspect individual points.

##         PSIS        lppd    penalty std_err         k
## 1  135.92790  -67.963951 24.4026881 313.721 2.1013446
## 2  138.85420  -69.427098  8.0906499 313.721 1.3279442
## 3   99.51184  -49.755920 14.3666232 313.721 1.4541170
## 4   18.78027   -9.390137  0.2349273 313.721 0.2064426
## 5   15.07391   -7.536955  0.9441810 313.721 0.5657719
## 6   12.81941   -6.409706  1.4974470 313.721 0.8182791
## 7   33.31645  -16.658223  3.5117486 313.721 0.9727225
## 8   11.13786   -5.568932  0.8471267 313.721 0.6009900
## 9   30.04929  -15.024644  1.5533874 313.721 0.6249022
## 10  16.49344   -8.246722  1.7567106 313.721 0.6620045
## 11 313.40268 -156.701342 24.2257575 313.721 2.6281245
## 12 133.32936  -66.664678 14.5597918 313.721 1.6365865

PSIS(u11.8, pointwise = TRUE)

## Some Pareto k values are very high (>1). Set pointwise=TRUE to inspect individual points.

##         PSIS       lppd     penalty  std_err          k
## 1  14.564118  -7.282059 3.011019551 16.96132 1.12627842
## 2  22.327629 -11.163815 2.931833656 16.96132 0.63226400
## 3   8.965047  -4.482523 0.791516072 16.96132 0.86995220
## 4   3.722656  -1.861328 0.009341683 16.96132 0.06684188
## 5  10.512333  -5.256167 1.467551548 16.96132 0.91482020
## 6  10.483914  -5.241957 1.356002582 16.96132 0.89604989
## 7   7.396845  -3.698422 0.262801913 16.96132 0.59151335
## 8   7.238667  -3.619334 0.236838131 16.96132 0.54468247
## 9   8.464311  -4.232156 0.809105423 16.96132 0.43874688
## 10  9.213670  -4.606835 1.071555163 16.96132 0.79113318
## 11  5.819637  -2.909818 0.231738525 16.96132 0.59974421
## 12  5.805709  -2.902854 0.205200995 16.96132 0.56694809

Thinking with DAGs

dag1 <- 
  dagify(
    A ~ G + D,
    D ~ G,
    coords = 
      tribble(
        ~name, ~x, ~y,
        "G", 0, 0,
        "D", 0.5, 1,
        "A", 1, 0
      )
  )

gg_dag(dag1)

What are the consequences of including department in our model?
What are the consequences of excluding department from our model?

Here’s a different DAG, with an additional variable.

dag2 <- 
  dagify(
    A ~ G + D + U,
    D ~ G + U,
    coords = 
      tribble(
        ~name, ~x, ~y,
        "G", 0, 0,
        "D", 0.5, 1,
        "A", 1, 0,
        "U", 1, 1
      )
  )

gg_dag(dag2, highlight = "U")

What are the consequences of excluding \(U\) from our model? Does including or excluding \(D\) help?

Fancy Binomial Plots

Binomial plots are a little bit tricky, and it is hard to come up with one style of plot that works well in all situations. The raw data consists of 0’s and 1’s (successes and failures), but the model is based on a proportion and the data may be presented in aggregated form. If we have a lot of cases at each combination of predictor values, we can compute proportions, but if there are few observations (perhaps just one) these proportions can be pretty noisy. Different numbers of observations can mean that some proportions are representing more data than others are.

binomial_post_plot <- 
  function(model, data = model@data %>% as.data.frame(), 
           x = case_, show_ind = NULL) {
    
    # create a quosure (unevaluated expression + some info)
    x <- enquo(x)                             
    
    # for labeling the plot
    model_name <- deparse(substitute(model))  
    
    # what is the name of the response variable?
    response <- model@formula[[1]][[2]]       
    
    # variable or expression counting trials
    n_var <- model@formula[[1]][[3]][[2]]     
    
    Avg <- link(model, data = data) %>%
      apply(2, mean_hdci) %>%
      bind_rows() %>% 
      bind_cols(data) %>%
      mutate(case_ = 1:n(), x_ = !!x)    # !! deals with unevaluated expressions
    
    Ind <- 
      sim(model, data = data) %>%
      apply(2, mean_hdci) %>%
      bind_rows() %>% 
      bind_cols(data) %>%
      mutate(case_ = 1:n(), x_ = !!x) 
    
    if (any(Ind$ymax > 1)) {
      Ind <-
        Ind %>% 
        mutate(
          y = y / !!n_var,
          ymin = ymin / !!n_var,
          ymax = ymax / !!n_var
        )
      if (is.null(show_ind)) { show_ind <- TRUE }
    }
    if (is.null(show_ind)) { show_ind <- FALSE }
    
    plot_data <-
      data %>% 
      as.data.frame() %>%
      mutate(case_ = 1:n(),
             p_ = !! response / !! n_var,
             x_ = !! x)
   
    p <-
      gf_blank(p_ ~ x_, data = plot_data, color = "steelblue") 
    
    if (show_ind) {
      p <- p %>%
      gf_pointrange(y + ymin + ymax ~ x_, data = Ind, 
                    size = 0.5, color = "red", alpha = 0.6) 
    }
   
    p <- p %>%
      gf_pointrange(y + ymin + ymax ~ x_, data = Avg, 
                    size = 0.8, color = "forestgreen", alpha = 0.6) %>%
      gf_point(p_ ~ x_, data = plot_data, color = "steelblue") %>%
      gf_labs(subtitle = model_name, x = "case")
    
    p
  }

# example uses

binomial_post_plot(u11.8, data = UCB, x = applicant.gender) %>%
  gf_line(group = 1) %>%
  gf_facet_grid( ~ dept, scales = "free")

data(chimpanzees)
Chimps <-
  chimpanzees %>% 
  mutate(treatment = 1 + prosoc_left + 2 * condition) %>% 
  # this will come in handy, later for plots
  mutate(
    label = factor(treatment,
                   levels = 1:4,
                   labels = c("R/N", "L/N", "R/P", "L/P")))
u11.4 <- readRDS('fits/u11.4.rds')
binomial_post_plot(u11.4, data = Chimps, x = label) %>%
  gf_facet_grid( ~ actor)

Chimps2 <-
  Chimps %>%
  group_by(actor, treatment, label, condition, prosoc_left) %>%
  summarise(left_pulls = sum(pulled_left), pulls = n()) %>%
  mutate(prop_left = left_pulls / pulls) %>% 
  ungroup()

## `summarise()` has grouped output by 'actor', 'treatment', 'label', 'condition'. You can override using the `.groups` argument.

u11.6 <- readRDS('fits/u11.6.rds')
binomial_post_plot(u11.6, data = Chimps2, x = label) %>%
  gf_facet_grid( ~ actor) %>%
  gf_line(group = ~ prosoc_left)

binomial_post_plot(u11.6, data = Chimps2, x = label) %>%
  gf_facet_grid( ~ actor) %>%
  gf_line(group = ~ condition)

This function does OK in some situations. It could be tweaked to handle other situations better perhaps. If you are curious to see how rethinking::postcheck() works, you can look at the code with

rethinking::postcheck