Goals for today

  1. More practice with Binomial (Logistic Regression)

UCB Admissions

This is a famous (and a bit old) data set related to a gender discrimination case at UC Berkeley in the 1970s. For each of the six largest graduate departments, it records how many applicants of each gender were admitted and rejected.

data(UCBadmit)
UCB <-
  UCBadmit %>%
  mutate(
    gidx = ifelse(applicant.gender == "male", 1, 2),
    didx = as.numeric(as.factor(dept)),
    p.admit = admit / applications
  )

UCB %>% pander()
| dept | applicant.gender | admit | reject | applications | gidx | didx | p.admit |
|------|------------------|-------|--------|--------------|------|------|---------|
| A    | male             | 512   | 313    | 825          | 1    | 1    | 0.6206  |
| A    | female           | 89    | 19     | 108          | 2    | 1    | 0.8241  |
| B    | male             | 353   | 207    | 560          | 1    | 2    | 0.6304  |
| B    | female           | 17    | 8      | 25           | 2    | 2    | 0.68    |
| C    | male             | 120   | 205    | 325          | 1    | 3    | 0.3692  |
| C    | female           | 202   | 391    | 593          | 2    | 3    | 0.3406  |
| D    | male             | 138   | 279    | 417          | 1    | 4    | 0.3309  |
| D    | female           | 131   | 244    | 375          | 2    | 4    | 0.3493  |
| E    | male             | 53    | 138    | 191          | 1    | 5    | 0.2775  |
| E    | female           | 94    | 299    | 393          | 2    | 5    | 0.2392  |
| F    | male             | 22    | 351    | 373          | 1    | 6    | 0.05898 |
| F    | female           | 24    | 317    | 341          | 2    | 6    | 0.07038 |

Let’s create a logistic regression model with the following core:

alist(
  admit ~ dbinom(applications, p),
  logit(p) <- a[gidx],
  a[gidx] ~ dnorm(0, ???)
)

Choosing the prior

What should we choose for the ??? in the prior?

logit(c(0.05, 0.95))
## [1] -2.944439  2.944439
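
Since \(1.96 \times 1.5 \approx 2.94\), a Normal(0, 1.5) prior puts roughly 95% of its mass between logit(0.05) and logit(0.95). We can also check what a candidate prior implies on the probability scale by pushing prior samples through the inverse link. A minimal sketch (the wide prior is included for contrast):

# prior predictive check: what does a ~ dnorm(0, 1.5) say about p?
a_prior <- rnorm(1e4, 0, 1.5)
quantile(ilogit(a_prior), c(0.05, 0.5, 0.95))

# a much wider prior piles most of its mass near p = 0 and p = 1
a_wide <- rnorm(1e4, 0, 10)
quantile(ilogit(a_wide), c(0.05, 0.5, 0.95))
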
u11.7 <-
  ulam(
    data = UCB %>% select(admit, applications, gidx, didx),
    alist(
      admit ~ dbinom(applications, p),
      logit(p) <- a[gidx],
      a[gidx] ~ dnorm(0, 1.5)
    ),
    chains = 4, log_lik = TRUE,
    file = "fits/u11.7"
  )

What does this model say?

precis(u11.7, depth = 2)
##            mean         sd       5.5%      94.5%    n_eff    Rhat4
## a[1] -0.2201425 0.03854428 -0.2815066 -0.1596877 1350.559 1.001015
## a[2] -0.8299547 0.05144294 -0.9114045 -0.7471930 1342.646 1.000958
u11.7 %>% stanfit() %>% mcmc_areas(pars = vars(-matches("lp"), -matches('log')))

Post11.7 <-
  u11.7 %>% extract.samples() %>% as.data.frame() 
Post11.7 %>%
  head(3) %>% pander()
| a.1     | a.2     |
|---------|---------|
| -0.2051 | -0.8662 |
| -0.2341 | -0.7979 |
| -0.1642 | -0.8512 |
Post11.7 <-
  Post11.7 %>%
  mutate(
    p.1 = ilogit(a.1),
    p.2 = ilogit(a.2),
    a.diff = a.1 - a.2,
    p.diff = p.1 - p.2
  )

Post11.7 %>%
  head(3) %>% pander()
| a.1     | a.2     | p.1    | p.2    | a.diff | p.diff |
|---------|---------|--------|--------|--------|--------|
| -0.2051 | -0.8662 | 0.4489 | 0.2961 | 0.6611 | 0.1529 |
| -0.2341 | -0.7979 | 0.4417 | 0.3105 | 0.5637 | 0.1313 |
| -0.1642 | -0.8512 | 0.459  | 0.2992 | 0.687  | 0.1599 |
Post11.7 %>% mcmc_areas(pars = vars(matches("a")))

Post11.7 %>% mcmc_areas(pars = vars(matches("p")))

Note: this model is not super profound. It has basically calculated the proportion of men and women who were admitted, and added some information about uncertainty.

UCB %>% 
  group_by(applicant.gender) %>%
  summarise(p.admit = sum(admit) / sum(applications)) %>%
  pander()
| applicant.gender | p.admit |
|------------------|---------|
| female           | 0.3035  |
| male             | 0.4452  |
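
Indeed, pushing the posterior means of a[1] and a[2] from precis() back through the inverse link recovers essentially the same proportions:

# posterior means on the probability scale; compare with the table above
ilogit(c(-0.2201, -0.8300))   # approximately 0.445 and 0.304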

What is this plot, and can we make a better version ourselves?

postcheck(u11.7)

Avg11.7 <- 
  link(u11.7) %>%
  apply(2, mean_hdi) %>%
  bind_rows() %>% 
  bind_cols(UCB)
Avg11.7 %>% head(3) %>% pander()
| y      | ymin   | ymax   | .width | .point | .interval | dept | applicant.gender | admit | reject | applications | gidx | didx | p.admit |
|--------|--------|--------|--------|--------|-----------|------|------------------|-------|--------|--------------|------|------|---------|
| 0.4452 | 0.426  | 0.4631 | 0.95   | mean   | hdi       | A    | male             | 512   | 313    | 825          | 1    | 1    | 0.6206  |
| 0.3038 | 0.2828 | 0.3252 | 0.95   | mean   | hdi       | A    | female           | 89    | 19     | 108          | 2    | 1    | 0.8241  |
| 0.4452 | 0.426  | 0.4631 | 0.95   | mean   | hdi       | B    | male             | 353   | 207    | 560          | 1    | 2    | 0.6304  |
Ind11.7 <- 
  sim(u11.7) %>%
  apply(2, mean_hdi) %>%
  bind_rows() %>% 
  bind_cols(UCB)

Ind11.7 %>% head(3) %>% pander()
| y     | ymin | ymax | .width | .point | .interval | dept | applicant.gender | admit | reject | applications | gidx | didx | p.admit |
|-------|------|------|--------|--------|-----------|------|------------------|-------|--------|--------------|------|------|---------|
| 368.4 | 337  | 399  | 0.95   | mean   | hdi       | A    | male             | 512   | 313    | 825          | 1    | 1    | 0.6206  |
| 32.61 | 22   | 42   | 0.95   | mean   | hdi       | A    | female           | 89    | 19     | 108          | 2    | 1    | 0.8241  |
| 249.5 | 225  | 274  | 0.95   | mean   | hdi       | B    | male             | 353   | 207    | 560          | 1    | 2    | 0.6304  |
Ind11.7 <-
  Ind11.7 %>%
  mutate(
    y = y / applications,
    ymin = ymin / applications,
    ymax = ymax / applications
  )
Ind11.7 %>% head(3) %>% pander()
| y      | ymin   | ymax   | .width | .point | .interval | dept | applicant.gender | admit | reject | applications | gidx | didx | p.admit |
|--------|--------|--------|--------|--------|-----------|------|------------------|-------|--------|--------------|------|------|---------|
| 0.4466 | 0.4085 | 0.4836 | 0.95   | mean   | hdi       | A    | male             | 512   | 313    | 825          | 1    | 1    | 0.6206  |
| 0.3019 | 0.2037 | 0.3889 | 0.95   | mean   | hdi       | A    | female           | 89    | 19     | 108          | 2    | 1    | 0.8241  |
| 0.4455 | 0.4018 | 0.4893 | 0.95   | mean   | hdi       | B    | male             | 353   | 207    | 560          | 1    | 2    | 0.6304  |
gf_point(p.admit ~ applicant.gender | ~ dept, data = Avg11.7, color = "steelblue") %>%
  gf_line(group = ~ 1, color = "steelblue") %>%
  gf_pointrange(y + ymin + ymax ~ applicant.gender, data = Ind11.7, size = 0.5, color = "orange") %>%
  gf_pointrange(y + ymin + ymax ~ applicant.gender, data = Avg11.7, size = 1, 
                color = "forestgreen", alpha = 0.5) %>%
  gf_labs(subtitle = "u11.7")

Another model

The pictures above suggest that department might be important to consider.

u11.8 <-
  ulam(
    data = UCB %>% select(admit, applications, gidx, didx),
    alist(
      admit ~ dbinom(applications, p),
      logit(p) <- a[gidx] + d[didx],
      a[gidx] ~ dnorm(0, 1.5),
      d[didx] ~ dnorm(0, 1.5)
    ),
    chains = 4, iter = 4000, log_lik = TRUE,
    file = "fits/u11.8"
  )
precis(u11.8, depth = 2)
##            mean        sd       5.5%      94.5%    n_eff    Rhat4
## a[1] -0.5336758 0.5609700 -1.3972806  0.3813229 491.9636 1.005323
## a[2] -0.4374360 0.5611640 -1.3009684  0.4823896 489.2978 1.005422
## d[1]  1.1142385 0.5627877  0.2028962  1.9850470 493.3756 1.005191
## d[2]  1.0699099 0.5649148  0.1542118  1.9453018 499.5685 1.005046
## d[3] -0.1443865 0.5631952 -1.0602804  0.7233960 486.5720 1.005407
## d[4] -0.1775489 0.5624797 -1.0936324  0.6959173 497.5455 1.005050
## d[5] -0.6207938 0.5657673 -1.5374129  0.2499016 501.6840 1.005434
## d[6] -2.1778875 0.5764149 -3.1025113 -1.2840076 516.3134 1.005045
u11.8 %>% stanfit() %>% mcmc_areas(pars = vars(-matches("lp"), -matches('log')))

In this model, for department x, the log-odds of admission is \(a_1 + d_x\) for men and \(a_2 + d_x\) for women. The difference in log-odds is the log of the odds ratio:

\[ (a_1 + d_x) - (a_2 + d_x) = a_1 - a_2 = \mbox{difference in log odds} = \mbox{log(odds ratio)} \]

So

\[ e^{a_1 - a_2} = \exp(a_1 - a_2) = \mbox{odds ratio} \]

Note that in this model, the odds ratio comparing admission of men to admission of women is the same in every department.

u11.8 %>% extract.samples() %>% as.data.frame() %>%
  mutate(diff_a = a.1 - a.2) %>%
  mcmc_areas(pars = vars(matches('diff')))
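
To attach numbers to this plot, we can summarize the posterior of the odds ratio itself. A quick sketch using the same samples:

# posterior quantiles for the odds ratio exp(a1 - a2)
u11.8 %>% extract.samples() %>% as.data.frame() %>%
  mutate(odds_ratio = exp(a.1 - a.2)) %>%
  pull(odds_ratio) %>%
  quantile(c(0.055, 0.5, 0.945))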

Odds are odd; Probability, please

If we want to work on a probability scale, we can do that too.

\[ p_1 - p_2 = \operatorname{ilogit}(a_1 + d_x) - \operatorname{ilogit}(a_2 + d_x) \]

This doesn't simplify as nicely, but a little numerical calculation shows that \(p_1 - p_2\) depends on the department!

Post11.8 <- 
  u11.8 %>% extract.samples() %>% as.data.frame() %>%
  mutate(
    diff_a = a.1 - a.2,
    diff_p1 = ilogit(a.1 + d.1) - ilogit(a.2 + d.1),
    diff_p6 = ilogit(a.1 + d.6) - ilogit(a.2 + d.6)
  )

Post11.8 %>% mcmc_areas(pars = vars(matches("diff_p")))

So on the probability scale, we have an interaction between gender and department, but on the log odds scale, we do not.
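
If we want the full picture rather than just departments 1 and 6, the same finite difference can be computed for every department. A sketch reusing the posterior samples:

# mean difference in admission probability (men - women), by department
Samples11.8 <- u11.8 %>% extract.samples() %>% as.data.frame()
sapply(1:6, function(j) {
  d <- Samples11.8[[paste0("d.", j)]]
  mean(ilogit(Samples11.8$a.1 + d) - ilogit(Samples11.8$a.2 + d))
})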

Rethinking interaction

Let \(f(x_1, x_2, \dots, x_k)\) be our model function with predictors \(x_1, x_2, \dots, x_k\). Two predictors \(x_i\) and \(x_j\) have an interaction if the amount \(f\) changes when we change \(x_i\) depends on the value of \(x_j\).

For a discrete predictor (like gender and department), we can determine this by taking a finite difference (comparing \(f\) for two genders or two departments). Sometimes this can easily be done algebraically. Above, we did this numerically and saw that the difference between the model's probability of admission for men and for women depends on the department. That's an interaction.

For a continuous predictor, we can look at partial derivatives. The partial derivative \(\frac{\partial f}{\partial x_i}\) tells us how \(f\) changes as predictor \(x_i\) changes. If that partial derivative involves another predictor \(x_j\), then we have an interaction between \(x_i\) and \(x_j\). For a GLM with a link function, this may depend on whether we are thinking on the linear model (logit) scale or the outcome scale (probability, for a binomial model).

Here is an interesting thing. Consider a logistic regression model with

logit(p) <- a + b * x

Then

\[ \frac{\partial \operatorname{logit}(p)}{\partial x} = b \]

But

\[ \frac{\partial p}{\partial x} = \frac{\partial}{\partial x} \left( \frac{\exp(a + bx)}{1 + \exp(a+bx)}\right) = \frac{b}{2(1 + \cosh(a + b x))} \]

which depends on \(x\). So on the probability scale, \(x\) interacts with itself: how much \(p\) changes as we change \(x\) by a fixed amount depends on the value of \(x\).
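
That cosh expression is equivalent to the familiar logistic derivative \(b \, p (1 - p)\). A quick numerical check, with arbitrary (hypothetical) values of \(a\) and \(b\):

# the two forms of dp/dx agree; a and b here are arbitrary choices
a <- 0; b <- 1
x <- c(-2, 0, 2)
p <- ilogit(a + b * x)
b * p * (1 - p)                   # familiar logistic-derivative form
b / (2 * (1 + cosh(a + b * x)))   # the cosh form above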

Again, the important question for determining if you have an interaction is this: Does how much \(f\) changes when we change \(x_i\) by a fixed amount depend on the value of \(x_j\)? We can determine this using either partial derivatives (when a predictor is continuous) or finite differences.

Posterior plots for our two models

# want to make the same plot for multiple models? put it in a function!
ucb_posterior_plot <- 
  function(model, data = model@data) {
    model_name <- deparse(substitute(model))
    
    Avg <- link(model) %>%
      apply(2, mean_hdi) %>%
      bind_rows() %>% 
      mutate(case = 1:n()) %>%
      bind_cols(data)
    
    Ind <- 
      sim(model) %>%
      apply(2, mean_hdi) %>%
      bind_rows() %>% 
      mutate(case = 1:n()) %>%
      bind_cols(data) %>%
      mutate(
        y = y / applications,
        ymin = ymin / applications,
        ymax = ymax / applications
      )
    gf_point(p.admit ~ applicant.gender | ~ dept, data = Avg, color = "steelblue") %>%
      gf_line(group = ~ 1, color = "steelblue") %>%
      gf_pointrange(y + ymin + ymax ~ applicant.gender, data = Ind, 
                    size = 0.5, color = "orange", alpha = 0.6) %>%
      gf_pointrange(y + ymin + ymax ~ applicant.gender, data = Avg, 
                    size = 0.8, color = "forestgreen", alpha = 0.6) %>%
      gf_labs(subtitle = model_name)
  }
ucb_posterior_plot(u11.7, data = UCB) /
ucb_posterior_plot(u11.8, data = UCB)

With some more effort, we could get even fancier and pull additional information out of the model so this sort of thing could work for a wider range of models – more like rethinking::postcheck(). There is a trade-off between generality/ease of use and customization/suitability for a particular situation. It’s good to have some quick tools that work reasonably well in a wide range of settings, but often you will need to customize to get just what you want/need.

See the end of these notes for an example that is more general than the one above, but not as general as rethinking::postcheck().

Comparing with WAIC or PSIS

We know what to expect here based on our plots above, but let’s take a look anyway.

compare(u11.7, u11.8)
##           WAIC        SE    dWAIC      dSE      pWAIC        weight
## u11.8 108.3742  15.56611   0.0000       NA   9.314334  1.000000e+00
## u11.7 992.4804 314.56471 884.1062 327.2919 112.882930 1.044184e-192
compare(u11.7, u11.8, func = PSIS)
## Some Pareto k values are very high (>1). Set pointwise=TRUE to inspect individual points.
## Some Pareto k values are very high (>1). Set pointwise=TRUE to inspect individual points.
##           PSIS        SE    dPSIS      dSE    pPSIS        weight
## u11.8 114.5145  16.96132   0.0000       NA 12.38451  1.000000e+00
## u11.7 958.6966 313.72102 844.1821 312.3773 95.99104 4.877424e-184
compare(u11.7, u11.8) %>% plot()

But

  • What does “out of sample” mean in this context?
  • What about those PSIS warnings?
PSIS(u11.7, pointwise = TRUE)
## Some Pareto k values are very high (>1). Set pointwise=TRUE to inspect individual points.
##         PSIS        lppd    penalty std_err         k
## 1  135.92790  -67.963951 24.4026881 313.721 2.1013446
## 2  138.85420  -69.427098  8.0906499 313.721 1.3279442
## 3   99.51184  -49.755920 14.3666232 313.721 1.4541170
## 4   18.78027   -9.390137  0.2349273 313.721 0.2064426
## 5   15.07391   -7.536955  0.9441810 313.721 0.5657719
## 6   12.81941   -6.409706  1.4974470 313.721 0.8182791
## 7   33.31645  -16.658223  3.5117486 313.721 0.9727225
## 8   11.13786   -5.568932  0.8471267 313.721 0.6009900
## 9   30.04929  -15.024644  1.5533874 313.721 0.6249022
## 10  16.49344   -8.246722  1.7567106 313.721 0.6620045
## 11 313.40268 -156.701342 24.2257575 313.721 2.6281245
## 12 133.32936  -66.664678 14.5597918 313.721 1.6365865
PSIS(u11.8, pointwise = TRUE)
## Some Pareto k values are very high (>1). Set pointwise=TRUE to inspect individual points.
##         PSIS       lppd     penalty  std_err          k
## 1  14.564118  -7.282059 3.011019551 16.96132 1.12627842
## 2  22.327629 -11.163815 2.931833656 16.96132 0.63226400
## 3   8.965047  -4.482523 0.791516072 16.96132 0.86995220
## 4   3.722656  -1.861328 0.009341683 16.96132 0.06684188
## 5  10.512333  -5.256167 1.467551548 16.96132 0.91482020
## 6  10.483914  -5.241957 1.356002582 16.96132 0.89604989
## 7   7.396845  -3.698422 0.262801913 16.96132 0.59151335
## 8   7.238667  -3.619334 0.236838131 16.96132 0.54468247
## 9   8.464311  -4.232156 0.809105423 16.96132 0.43874688
## 10  9.213670  -4.606835 1.071555163 16.96132 0.79113318
## 11  5.819637  -2.909818 0.231738525 16.96132 0.59974421
## 12  5.805709  -2.902854 0.205200995 16.96132 0.56694809
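
To see which rows are driving those warnings, we can line the pointwise results up with the data. A sketch, assuming the rows of the PSIS table come back in the same order as the rows of UCB:

# attach dept and gender to the pointwise PSIS results and flag high k
PSIS(u11.7, pointwise = TRUE) %>%
  bind_cols(UCB %>% select(dept, applicant.gender)) %>%
  filter(k > 1)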

Thinking with DAGs

dag1 <- 
  dagify(
    A ~ G + D,
    D ~ G,
    coords = 
      tribble(
        ~name, ~x, ~y,
        "G", 0, 0,
        "D", 0.5, 1,
        "A", 1, 0
      )
  )

gg_dag(dag1)

  1. What are the consequences of including department in our model?

  2. What are the consequences of excluding department from our model?
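
We can also ask dagitty directly (dagify() returns a dagitty object, so these queries should apply to dag1 as-is):

library(dagitty)

# total effect of G on A: no adjustment needed (D is a mediator)
adjustmentSets(dag1, exposure = "G", outcome = "A", effect = "total")

# direct effect of G on A: condition on D
adjustmentSets(dag1, exposure = "G", outcome = "A", effect = "direct")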

Here’s a different DAG, with an additional variable.

dag2 <- 
  dagify(
    A ~ G + D + U,
    D ~ G + U,
    coords = 
      tribble(
        ~name, ~x, ~y,
        "G", 0, 0,
        "D", 0.5, 1,
        "A", 1, 0,
        "U", 1, 1
      )
  )

gg_dag(dag2, highlight = "U")

  1. What are the consequences of excluding \(U\) from our model? Does including or excluding \(D\) help?
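
The same query for dag2 shows the problem. A sketch (note: dagitty treats U as observed unless we mark it latent):

# for the direct effect of G on A, dagitty asks us to condition on U as well
adjustmentSets(dag2, exposure = "G", outcome = "A", effect = "direct")

# marking U as latent: no admissible adjustment set remains, because
# conditioning on D opens the path G -> D <- U -> A
latents(dag2) <- "U"
adjustmentSets(dag2, exposure = "G", outcome = "A", effect = "direct")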

Fancy Binomial Plots

Binomial plots are a little bit tricky, and it is hard to come up with one style of plot that works well in all situations. The raw data consist of 0s and 1s (successes and failures), but the model is based on a proportion, and the data may be presented in aggregated form. If we have many cases at each combination of predictor values, we can compute proportions, but if there are few observations (perhaps just one), these proportions can be pretty noisy. And different numbers of observations mean that some proportions represent more data than others.

binomial_post_plot <- 
  function(model, data = model@data %>% as.data.frame(), 
           x = case_, show_ind = NULL) {
    
    # create a quosure (unevaluated expression + some info)
    x <- enquo(x)                             
    
    # for labeling the plot
    model_name <- deparse(substitute(model))  
    
    # what is the name of the response variable?
    response <- model@formula[[1]][[2]]       
    
    # variable or expression counting trials
    n_var <- model@formula[[1]][[3]][[2]]     
    
    Avg <- link(model, data = data) %>%
      apply(2, mean_hdci) %>%
      bind_rows() %>% 
      bind_cols(data) %>%
      mutate(case_ = 1:n(), x_ = !!x)    # !! deals with unevaluated expressions
    
    Ind <- 
      sim(model, data = data) %>%
      apply(2, mean_hdci) %>%
      bind_rows() %>% 
      bind_cols(data) %>%
      mutate(case_ = 1:n(), x_ = !!x) 
    
    if (any(Ind$ymax > 1)) {
      Ind <-
        Ind %>% 
        mutate(
          y = y / !!n_var,
          ymin = ymin / !!n_var,
          ymax = ymax / !!n_var
        )
      if (is.null(show_ind)) { show_ind <- TRUE }
    }
    if (is.null(show_ind)) { show_ind <- FALSE }
    
    plot_data <-
      data %>% 
      as.data.frame() %>%
      mutate(case_ = 1:n(),
             p_ = !! response / !! n_var,
             x_ = !! x)
   
    p <-
      gf_blank(p_ ~ x_, data = plot_data, color = "steelblue") 
    
    if (show_ind) {
      p <- p %>%
      gf_pointrange(y + ymin + ymax ~ x_, data = Ind, 
                    size = 0.5, color = "red", alpha = 0.6) 
    }
   
    p <- p %>%
      gf_pointrange(y + ymin + ymax ~ x_, data = Avg, 
                    size = 0.8, color = "forestgreen", alpha = 0.6) %>%
      gf_point(p_ ~ x_, data = plot_data, color = "steelblue") %>%
      gf_labs(subtitle = model_name, x = "case")
    
    p
  }
# example uses

binomial_post_plot(u11.8, data = UCB, x = applicant.gender) %>%
  gf_line(group = 1) %>%
  gf_facet_grid( ~ dept, scales = "free")

data(chimpanzees)
Chimps <-
  chimpanzees %>% 
  mutate(treatment = 1 + prosoc_left + 2 * condition) %>% 
  # this will come in handy later for plots
  mutate(
    label = factor(treatment,
                   levels = 1:4,
                   labels = c("R/N", "L/N", "R/P", "L/P")))
u11.4 <- readRDS('fits/u11.4.rds')
binomial_post_plot(u11.4, data = Chimps, x = label) %>%
  gf_facet_grid( ~ actor)

Chimps2 <-
  Chimps %>%
  group_by(actor, treatment, label, condition, prosoc_left) %>%
  summarise(left_pulls = sum(pulled_left), pulls = n()) %>%
  mutate(prop_left = left_pulls / pulls) %>% 
  ungroup()
## `summarise()` has grouped output by 'actor', 'treatment', 'label', 'condition'. You can override using the `.groups` argument.
u11.6 <- readRDS('fits/u11.6.rds')
binomial_post_plot(u11.6, data = Chimps2, x = label) %>%
  gf_facet_grid( ~ actor) %>%
  gf_line(group = ~ prosoc_left)

binomial_post_plot(u11.6, data = Chimps2, x = label) %>%
  gf_facet_grid( ~ actor) %>%
  gf_line(group = ~ condition)

This function does OK in some situations, and it could perhaps be tweaked to handle other situations better. If you are curious to see how rethinking::postcheck() works, you can look at the code with

rethinking::postcheck