Binomial (Logistic) Regression

Goals for today

Learn how to model an integer response.
Learn the GLM framework.

Logistic regression: Prosocial chimpanzees.

data(chimpanzees, package = "rethinking")
Chimps <- chimpanzees
rm(chimpanzees)

The data include two experimental conditions, prosoc_left and condition, each of which has two levels. This results in four combinations.

Chimps %>% 
  distinct(prosoc_left, condition) %>% 
  mutate(description = c("Two food items on right and no partner",
                         "Two food items on left and no partner",
                         "Two food items on right and partner present",
                         "Two food items on left and partner present")) %>% 
  knitr::kable()

condition	prosoc_left	description
0	0	Two food items on right and no partner
0	1	Two food items on left and no partner
1	0	Two food items on right and partner present
1	1	Two food items on left and partner present

Chimps <-
  Chimps %>% 
  mutate(treatment = 1 + prosoc_left + 2 * condition) %>% 
  # this will come in handy, later for plots
  mutate(
    label = factor(treatment,
                   levels = 1:4,
                   labels = c("R/N", "L/N", "R/P", "L/P")))

We can use the dplyr::count() function to get a sense of the distribution of the conditions in the data.

Chimps %>% 
  count(condition, prosoc_left, treatment, label)

##   condition prosoc_left treatment label   n
## 1         0           0         1   R/N 126
## 2         0           1         2   L/N 126
## 3         1           0         3   R/P 126
## 4         1           1         4   L/P 126

This is the same as

Chimps %>% 
  group_by(condition, prosoc_left, treatment, label) %>%
  summarise(n = n())

## `summarise()` has grouped output by 'condition', 'prosoc_left', 'treatment'. You can override using the `.groups` argument.

## # A tibble: 4 x 5
## # Groups:   condition, prosoc_left, treatment [4]
##   condition prosoc_left treatment label     n
##       <int>       <int>     <dbl> <fct> <int>
## 1         0           0         1 R/N     126
## 2         0           1         2 L/N     126
## 3         1           0         3 R/P     126
## 4         1           1         4 L/P     126

We start with the simple intercept-only logistic regression model, which follows the statistical formula

\[\begin{align*} \text{pulled_left}_i & \sim {\sf Binom}(1, p_i) \\ \operatorname{logit}(p_i) & = \alpha \\ \alpha & \sim {\sf Norm}(0, w), \end{align*}\]

where \(w\) is a distribution parameter we still need to choose.

Let’s start by trying something really flat: \(w = 10\). (Spoiler alert: This will be a bad choice.)

m11.1

m11.1 <- 
  quap(
    data = Chimps, 
    alist(pulled_left ~ dbinom(1, p),
          logit(p) <- a,
          a ~ dnorm(0, 10))
  )

prior11.1 <- extract.prior(m11.1, 1e4) 
str(prior)

## function (prior, ...)

Prior11.1 <- prior11.1 %>%
  as.data.frame() %>%
  mutate(p = ilogit(a)
  )
head(Prior11.1) %>% pander()

a	p
7.095	0.9992
-1.093	0.251
-4.535	0.01061
6.059	0.9977
-18.18	1.273e-08
6.301	0.9982

Prior11.1 %>%
  gf_dhistogram(~p, bins = 100) %>%
  gf_dens(~ p, size = 1, color = "red") %>%
  gf_dens(~ p, adjust = 0.1, size = 1, color = "purple")

m11.1a <- 
  quap(
    data = Chimps, 
    alist(pulled_left ~ dbinom(1, p),
          logit(p) <- a,
          a ~ dnorm(0, 1.5))
  )
Prior11.1a <- 
  extract.prior(m11.1a, 1e4) %>%
  as.data.frame() %>%
  mutate(p = ilogit(a)
  )

gf_dens(~ p, adjust = 0.1, size = 1, color = ~"m11.1", data = Prior11.1) %>%
  gf_dens(~ p, adjust = 0.1, size = 1, color = ~"m11.1a", data = Prior11.1a)

gf_dens(~ p, adjust = 0.1, size = 1, color = ~"m11.1", data = Prior11.1) %>%
  gf_dens(~ p, adjust = 0.5, size = 1, color = ~"m11.1a", data = Prior11.1a)

m11.2

\[\begin{align*} \text{pulled_left}_i & \sim {\sf Binom}(1, p_i) \\ \operatorname{logit}(p_i) & = \alpha_{\color{#54635e}{\text{actor}}[i]} + \beta_{\color{#a4692f}{\text{treatment}}[i]} \\ \alpha_{\color{#54635e}j} & \sim \Normal(0, 1.5) \\ \beta_{\color{#a4692f}k} & \sim \Normal(0, 0.5). \end{align*}\]

Chimpanzees

m11.2 <- 
  quap(
    alist(
      pulled_left ~ dbinom(1, p),
      logit(p) <- a + b[treatment],
      a ~ dnorm(0, 1.5),
      b[treatment] ~ dnorm(0, 10)
    ),
    data = Chimps)

To explore some other options for \(w\), it is handy to write a function that creates a plot like the previous one for any \(w\).

explore_prior_for_b <- function(sd = 1) {
  
  flist <-
    eval(
      substitute(
        alist(
          pulled_left ~ dbinom(1, p),
          logit(p) <- a + b[treatment],
          a ~ dnorm(0, 1.5),
          b[treatment] ~ dnorm(0, SD)
        ), list(SD = sd)
      )
    )
  
  m <- 
    quap(
      data = Chimps,
      flist
    )
  
  Prior <-
    m %>% extract.prior(1e4) %>%
    as.data.frame() %>%
    mutate(p1 = ilogit(a + b.1),
           p2 = ilogit(a + b.2),
           p3 = ilogit(a + b.3),
           p4 = ilogit(a + b.4),
           diff12 = p2 - p1,
           diff13 = p3 - p1,
           diff14 = p4 - p1,
           diff23 = p3 - p2,
           diff24 = p4 - p2,
           diff34 = p4 - p3
    )
  
  Prior %>%
    gf_dens(~ diff12, adjust = 0.5, size = 1, color = ~ "1 vs 2") %>%
    gf_dens(~ diff13, adjust = 0.5, size = 1, color = ~ "1 vs 3") %>%
    gf_dens(~ diff14, adjust = 0.5, size = 1, color = ~ "1 vs 4") %>%
    gf_dens(~ diff23, adjust = 0.5, size = 1, color = ~ "2 vs 3") %>%
    gf_dens(~ diff24, adjust = 0.5, size = 1, color = ~ "2 vs 4") %>%
    gf_dens(~ diff34, adjust = 0.5, size = 1, color = ~ "3 vs 4") %>%
    gf_labs(title = paste("sd =", sd))
}

explore_prior_for_b(1.5)

explore_prior_for_b(0.5)

m11.3

m11.3 <- 
  quap(
    data = Chimps,
    alist(
      pulled_left ~ dbinom(1, p),
      logit(p) <- a + b[treatment],
      a ~ dnorm(0, 1.5),
      b[treatment] ~ dnorm(0, 0.5)
    )
  )

m11.4

u11.4 <- ulam(
  data = Chimps %>% select(pulled_left, actor, treatment, label),
  alist(
    pulled_left ~ dbinom(1, p),
    logit(p) <- a[actor] + b[treatment],
    a[actor] ~ dnorm(0, 1.5),
    b[treatment] ~ dnorm(0, 0.5)
  ),
  chains = 4, iter = 2500, warmup = 500, refresh = 0,
  file = "fits/u11.4", log_lik = TRUE
)
precis(u11.4, depth = 2)

##             mean        sd        5.5%       94.5%    n_eff    Rhat4
## a[1] -0.44826382 0.3277945 -0.97659455  0.07054808 3223.110 1.001626
## a[2]  3.90153809 0.7375350  2.78941094  5.14990447 4750.050 1.000794
## a[3] -0.74630681 0.3289169 -1.26413825 -0.22782240 3177.684 1.000988
## a[4] -0.74311111 0.3265333 -1.26616560 -0.21810146 3355.483 1.000886
## a[5] -0.44551017 0.3227926 -0.96079886  0.07065379 3109.572 1.001447
## a[6]  0.48087695 0.3246459 -0.02814224  1.00402295 3013.638 1.001862
## a[7]  1.95778814 0.4205367  1.29811744  2.64459279 3925.681 1.000517
## b[1] -0.04242607 0.2822240 -0.48685026  0.40699665 2769.351 1.001605
## b[2]  0.47848176 0.2791331  0.03035002  0.91565346 2868.849 1.001785
## b[3] -0.38674428 0.2834148 -0.84346836  0.06669854 2881.009 1.002533
## b[4]  0.36687030 0.2800094 -0.08312746  0.81016304 2892.438 1.001537

Post11.4 <-
  extract.samples(u11.4) 
str(Post11.4)

## List of 2
##  $ a: num [1:8000, 1:7] -0.585 -0.735 -0.31 -0.923 -0.613 ...
##  $ b: num [1:8000, 1:4] -0.0729 -0.1851 -0.2171 -0.0492 -0.2422 ...
##  - attr(*, "source")= chr "ulam posterior: 8000 samples from object"

Post11.4$p <-
  ilogit(Post11.4$a) 
str(Post11.4)

## List of 3
##  $ a: num [1:8000, 1:7] -0.585 -0.735 -0.31 -0.923 -0.613 ...
##  $ b: num [1:8000, 1:4] -0.0729 -0.1851 -0.2171 -0.0492 -0.2422 ...
##  $ p: num [1:8000, 1:7] 0.358 0.324 0.423 0.284 0.351 ...
##  - attr(*, "source")= chr "ulam posterior: 8000 samples from object"

Post11.4$p %>% 
  apply(2, mean_hdi) %>%
  bind_rows() %>%
  str()

## 'data.frame':    7 obs. of  6 variables:
##  $ y        : num  0.392 0.975 0.326 0.326 0.393 ...
##  $ ymin     : num  0.245 0.94 0.195 0.194 0.25 ...
##  $ ymax     : num  0.539 0.998 0.468 0.467 0.54 ...
##  $ .width   : num  0.95 0.95 0.95 0.95 0.95 0.95 0.95
##  $ .point   : chr  "mean" "mean" "mean" "mean" ...
##  $ .interval: chr  "hdi" "hdi" "hdi" "hdi" ...

Post11.4$p %>% 
  apply(2, mean_hdi) %>%
  bind_rows() %>%
  mutate(actor = factor(1:7)) %>%
  gf_pointrange(actor ~ y + ymin + ymax) %>%
  gf_vline(xintercept = ~ 0.5, color = "red", linetype = "dashed") %>%
  gf_lims(x = c(0, 1)) %>%
  gf_labs(x = "posterior probabilty of pulling left for each chimp")

Post11.4$b %>% 
  apply(2, mean_hdi) %>%
  bind_rows() %>%
  bind_cols(Chimps %>% select(treatment, label) %>% distinct()) %>%
  gf_pointrange(label ~ y + ymin + ymax) %>%
  gf_vline(xintercept = ~ 0.0, color = "red", linetype = "dashed") %>%
  gf_labs(y = "condition", y = "model coefficient (log odds scale)", subtitle = "u11.4")

Chimps %>%
  group_by(actor, treatment, condition, prosoc_left, label) %>%
  summarise(prop_left = mean(pulled_left))  %>% pander()

summarise() has grouped output by ‘actor’, ‘treatment’, ‘condition’, ‘prosoc_left’. You can override using the .groups argument.

actor	treatment	condition	prosoc_left	label	prop_left
1	1	0	0	R/N	0.3333
1	2	0	1	L/N	0.5
1	3	1	0	R/P	0.2778
1	4	1	1	L/P	0.5556
2	1	0	0	R/N	1
2	2	0	1	L/N	1
2	3	1	0	R/P	1
2	4	1	1	L/P	1
3	1	0	0	R/N	0.2778
3	2	0	1	L/N	0.6111
3	3	1	0	R/P	0.1667
3	4	1	1	L/P	0.3333
4	1	0	0	R/N	0.3333
4	2	0	1	L/N	0.5
4	3	1	0	R/P	0.1111
4	4	1	1	L/P	0.4444
5	1	0	0	R/N	0.3333
5	2	0	1	L/N	0.5556
5	3	1	0	R/P	0.2778
5	4	1	1	L/P	0.5
6	1	0	0	R/N	0.7778
6	2	0	1	L/N	0.6111
6	3	1	0	R/P	0.5556
6	4	1	1	L/P	0.6111
7	1	0	0	R/N	0.7778
7	2	0	1	L/N	0.8333
7	3	1	0	R/P	0.9444
7	4	1	1	L/P	1

p11.4a <-
  Chimps %>%
  group_by(actor, treatment, condition, prosoc_left, label) %>%
  summarise(prop_left = mean(pulled_left)) %>%
  gf_point(prop_left ~ treatment | ~ actor, color = ~factor(condition)) %>%
  gf_line(prop_left ~ treatment | ~ actor, group = ~prosoc_left, color = "gray50") %>%
  gf_labs(title = "observed proportions")

p11.4a

PChimps <- 
  Chimps %>%
  select(actor, treatment, label, condition, prosoc_left) %>% distinct()
PChimps %>% glimpse()

## Rows: 28
## Columns: 5
## $ actor       <int> 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5…
## $ treatment   <dbl> 1, 2, 3, 4, 2, 1, 3, 4, 1, 2, 4, 3, 1, 2, 4, 3, 2, 1, 4, 3…
## $ label       <fct> R/N, L/N, R/P, L/P, L/N, R/N, R/P, L/P, R/N, L/N, L/P, R/P…
## $ condition   <int> 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1…
## $ prosoc_left <int> 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0…

Avg11.4 <-
  link(u11.4, data = PChimps, n = 1e4) %>%
  apply(2, mean_hdi) %>%
  bind_rows() %>%
  bind_cols(PChimps)

p11.4b <-
  Avg11.4 %>%
  gf_pointrange(y + ymin + ymax ~ treatment | ~ actor, color = ~ factor(condition)) %>%
  gf_line(y ~ treatment | ~ actor, group = ~prosoc_left, color = "gray50") %>%
  gf_labs(title = "posterior predicted proportions")

p11.4a / p11.4b

u11.5

We could also fit a model that uses condition and prosocial left as separate variables. (See the book.)

Aggregated binomial: Chimpanzees again, condensed.

Chimps2 <-
  Chimps %>%
  group_by(actor, treatment, label, condition, prosoc_left) %>%
  summarise(left_pulls = sum(pulled_left), pulls = n()) %>%
  mutate(prop_left = left_pulls / pulls) %>% 
  ungroup()

Chimps2 %>% head() %>% pander()

actor	treatment	label	condition	prosoc_left	left_pulls	pulls	prop_left
1	1	R/N	0	0	6	18	0.3333
1	2	L/N	0	1	9	18	0.5
1	3	R/P	1	0	5	18	0.2778
1	4	L/P	1	1	10	18	0.5556
2	1	R/N	0	0	18	18	1
2	2	L/N	0	1	18	18	1

u11.6 <- ulam(
  data = Chimps2 %>% select(left_pulls, pulls, actor, treatment, label),
  alist(
    left_pulls ~ dbinom(pulls, p),
    logit(p) <- a[actor] + b[treatment],
    a[actor] ~ dnorm(0, 1.5),
    b[treatment] ~ dnorm(0, 0.5)
  ),
  chains = 4, iter = 2500, warmup = 500, refresh = 0,
  file = "fits/u11.6", log_lik = TRUE
)
precis(u11.6, depth = 2)

##            mean        sd        5.5%       94.5%    n_eff     Rhat4
## a[1] -0.4435676 0.3253178 -0.96516464  0.07638391 2857.998 1.0007892
## a[2]  3.9032900 0.7544974  2.76557053  5.17033948 4396.974 1.0006135
## a[3] -0.7455684 0.3302017 -1.29206316 -0.22857475 2780.720 1.0002517
## a[4] -0.7426621 0.3331159 -1.27179935 -0.19984670 2925.146 1.0002438
## a[5] -0.4419994 0.3236938 -0.95703891  0.08375442 2782.626 1.0003624
## a[6]  0.4811539 0.3365557 -0.05210568  1.02765389 2828.542 1.0009433
## a[7]  1.9652826 0.4147323  1.31679471  2.63910008 3480.137 0.9997978
## b[1] -0.0445345 0.2828582 -0.50344568  0.39988274 2484.577 1.0003110
## b[2]  0.4741794 0.2820499  0.01100776  0.92575094 2592.811 1.0006490
## b[3] -0.3920240 0.2818646 -0.84456731  0.05461812 2441.356 1.0007101
## b[4]  0.3643734 0.2760523 -0.06594405  0.81016194 2493.932 1.0005271

coeftab(u11.4, u11.6) %>% plot()

compare(u11.4, u11.6)

## Warning in compare(u11.4, u11.6): Different numbers of observations found for at least two models.
## Model comparison is valid only for models fit to exactly the same observations.
## Number of observations for each model:
## u11.4 504 
## u11.6 28

##           WAIC        SE    dWAIC      dSE    pWAIC       weight
## u11.6 112.8606  8.105077   0.0000       NA 7.750591 1.000000e+00
## u11.4 531.7652 18.909188 418.9046 39.74621 8.267414 1.086462e-91

compare(u11.4, u11.6) %>% plot()

## Warning in compare(u11.4, u11.6): Different numbers of observations found for at least two models.
## Model comparison is valid only for models fit to exactly the same observations.
## Number of observations for each model:
## u11.4 504 
## u11.6 28