Code from Statistical Rethinking modified by R Pruim is shown below. Differences to the oringal include:

Side note on lists, data frames, and matrices

As our models become more complicated, the containers used to store the data we use to fit the model and the data we generate from the model (using link(), sim(), and extract.samples(), for example) also be come more complicated. Lists, data frames, and matrices are the containers we will use the most.

Reed Frogs

Tadpoles really. Let’s return to our example about survival rates of tadpoles in various tanks.

R Code 12.1

library(rethinking)
data(reedfrogs)
Frogs <- reedfrogs %>% 
  mutate(tank = 1:n())   # make the tank cluster variable
glimpse(Frogs)
## Observations: 48
## Variables: 6
## $ density  <int> 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 1...
## $ pred     <fctr> no, no, no, no, no, no, no, no, pred, pred, pred, pr...
## $ size     <fctr> big, big, big, big, small, small, small, small, big,...
## $ surv     <int> 9, 10, 7, 10, 9, 9, 10, 9, 4, 9, 7, 6, 7, 5, 9, 9, 24...
## $ propsurv <dbl> 0.90, 1.00, 0.70, 1.00, 0.90, 0.90, 1.00, 0.90, 0.40,...
## $ tank     <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16...

R Code 12.2

# fit
m12.1 <- 
  map2stan(
    alist(
      surv ~ dbinom(density, p),
      logit(p) <- a_tank[tank],
      a_tank[tank] ~ dnorm(0, 5)
    ),
    data = Frogs,
    refresh = 0)
## 
##  Elapsed Time: 0.197381 seconds (Warm-up)
##                0.185141 seconds (Sampling)
##                0.382522 seconds (Total)
## Error in summary(fit)$summary: $ operator is invalid for atomic vectors

R Code 12.3

m12.2 <- 
  map2stan(
    alist(
      surv ~ dbinom(density, p),
      logit(p) <- a_tank[tank],
      a_tank[tank] ~ dnorm(a, sigma),
      a ~ dnorm(0, 1),
      sigma ~ dcauchy(0, 1)
    ),
    data = Frogs,
    iter = 4000,
    chains = 4,
    refresh = 0
  )
## 
##  Elapsed Time: 0.345143 seconds (Warm-up)
##                0.227562 seconds (Sampling)
##                0.572705 seconds (Total)
## 
## 
##  Elapsed Time: 0.41178 seconds (Warm-up)
##                0.233401 seconds (Sampling)
##                0.645181 seconds (Total)
## 
## 
##  Elapsed Time: 0.364448 seconds (Warm-up)
##                0.226281 seconds (Sampling)
##                0.590729 seconds (Total)
## 
## 
##  Elapsed Time: 0.333683 seconds (Warm-up)
##                0.2274 seconds (Sampling)
##                0.561083 seconds (Total)
## The following numerical problems occurred the indicated number of times on chain 4
##                                                                                 count
## Exception thrown at line 17: normal_log: Scale parameter is 0, but must be > 0!     1
## When a numerical problem occurs, the Hamiltonian proposal gets rejected.
## See http://mc-stan.org/misc/warnings.html#exception-hamiltonian-proposal-rejected
## If the number in the 'count' column is small, there is no need to ask about this message on stan-users.
## Error in summary(fit)$summary: $ operator is invalid for atomic vectors

R Code 12.4

compare(m12.1, m12.2)
## Error in compare(m12.1, m12.2): object 'm12.1' not found

R Code 12.5

# extract Stan samples
m12.2_post <- extract.samples(m12.2)
## Error in extract.samples(m12.2): object 'm12.2' not found
# compute median intercept for each tank and transform with logistic()
Frogs <- Frogs %>%
  mutate(
    propsurv.est = logistic(apply(m12.2_post$a_tank, 2, median))
  )
## Error in eval(substitute(expr), envir, enclos): object 'm12.2_post' not found
gf_point(propsurv ~ tank, data = Frogs) %>%
  gf_point(propsurv.est ~ tank, data = Frogs, shape = 1) %>%
  # mark posterior median probability across tanks
  gf_hline(yintercept = logistic(median(m12.2_post$a)), lty = 2, alpha = 0.5) %>%
  gf_facet_grid( ~ density, scale="free") %>%
  gf_labs(y = "proportion surviving") 
## Error in typeof(x): object 'm12.2_post' not found

R Code 12.6

# show first 100 populations in the posterior
p <-  
  gf_point(y ~ x, data = data_frame(y = 0, x = -4:7), color = "transparent") 

for (i in 1:100) {
  p <- 
    p %>% 
    gf_function(fun = dnorm, 
                args = list(mean = m12.2_post$a[i], sd = m12.2_post$sigma[i]),
                alpha = 0.2)
}
## Error in layer(data = data, mapping = mapping, stat = StatFunction, geom = geom, : object 'm12.2_post' not found
p

# sample 8000 imaginary tanks from the posterior distribution
sim_tanks <- rnorm(8000, m12.2_post$a, m12.2_post$sigma)
## Error in rnorm(8000, m12.2_post$a, m12.2_post$sigma): object 'm12.2_post' not found
# transform to probability and visualize
gf_dens( ~ (logistic(sim_tanks))) %>%
  gf_labs(x = "probability survive")
## Error in logistic(sim_tanks): object 'sim_tanks' not found

R Code 12.7-12.12

R Code 12

FrogSim <-
  function(
    reps = 15,
    n = c(5L, 10L, 25L, 35L),
    a = 1.4,
    sigma = 1.5
    ) {
    nponds <- reps * length(n)
    expand.grid(rep = 1:reps, n = n) %>%
      mutate(
        pond = 1:nponds,
        a_true = rnorm(nponds, mean = a, sd = sigma),
        s = rbinom(nponds, prob = logistic(a_true), size = n)
      )
  }

R Code 12.10

Stan sometimes requires integer data. In R, you need to be a bit careful to ensure that you really have integers and not “numeric” (floating point) values.

class(1:3)
## [1] "integer"
class(c(1, 2, 3))
## [1] "numeric"
class(as.integer(c(1, 2, 3)))
## [1] "integer"
class(c(1L, 2L, 3L))  # L for Integer -- go figure
## [1] "integer"

R Code 12.13

SFrogs <- FrogSim()
m12.3 <- map2stan(
  alist(
    s ~ dbinom(n, p),
    logit(p) <- a_pond[pond],
    a_pond[pond] ~ dnorm(a, sigma),
    a ~ dnorm(0, 1),
    sigma ~ dcauchy(0, 1)
  ),
  data = SFrogs,
  iter = 1e4,
  warmup = 1000,
  refresh = 0
)
## 
##  Elapsed Time: 0.243333 seconds (Warm-up)
##                1.45265 seconds (Sampling)
##                1.69599 seconds (Total)
## Error in summary(fit)$summary: $ operator is invalid for atomic vectors

R Code 12.14

precis(m12.3, depth = 2)
## Error in precis(m12.3, depth = 2): object 'm12.3' not found

R Code 12.15-12.17

By combining this code into a function, we can apply it again later without retyping. This

  • makes it clearer what is going on, and
  • makes it easier to make systematic changes.
frog_plot <- function(model, data = model@data) {
  data <-
    data %>% as.data.frame() %>% 
    mutate(
      a_pond_est = as.numeric(coef(model)[1:60]),
      p_est = logistic(a_pond_est),
      p_true = logistic(a_true),
      p_raw = s / n,
      nopool_error = abs(p_raw - p_true),
      partpool_error = abs(p_est - p_true)
    )
  gf_point(nopool_error ~ pond, data = data, shape = 16) %>%
    gf_point(partpool_error ~ pond, data = data, shape = 1) %>%
    gf_line(partpool_error ~ pond, 
            data = data %>% group_by(n) %>%
              mutate(partpool_error = mean(partpool_error)),
            linetype = "dashed", color = "navy") %>%
    gf_line(nopool_error ~ pond, 
            data = data %>% group_by(n) %>%
              mutate(nopool_error = mean(nopool_error)),
            linetype = "dotted", color = "red") %>%
    gf_facet_grid( ~ n, scale = "free") %>%
    gf_labs(y = "absolute error")
}

Generating a plot from a model is a one-liner now:

frog_plot(model = m12.3)
## Error in eval(expr, envir, enclos): object 'm12.3' not found

R Code 12.19

NewData <- FrogSim()
m12.3new <- 
  map2stan(
    m12.3,
    data = NewData,
    iter = 1e4,
    warmup = 1000)
## Error in map2stan(m12.3, data = NewData, iter = 10000, warmup = 1000): object 'm12.3' not found
frog_plot(m12.3new)
## Error in eval(expr, envir, enclos): object 'm12.3new' not found

R Code 12

y1 <- rnorm(1e4, 10, 1)
y2 <- 10 + rnorm(1e4, 0, 1)

R Code 12

library(rethinking)
data(chimpanzees)
Chimps <- chimpanzees %>% select(-recipient)   # get rid of NAs


m12.4 <- map2stan(
  alist(
    pulled_left ~ dbinom(1, p),
    logit(p) <-
      a + a_actor[actor] + (bp + bpC * condition) * prosoc_left,
    a_actor[actor] ~ dnorm(0, sigma_actor),
    a ~ dnorm(0, 10),
    bp ~ dnorm(0, 10),
    bpC ~ dnorm(0, 10),
    sigma_actor ~ dcauchy(0, 1)
  ),
  data = Chimps,
  warmup = 1000,
  iter = 5000,
  chains = 4,
  cores = 3,
  refresh = 0
)
## Error in summary(fit)$summary: $ operator is invalid for atomic vectors

R Code 12.22

m12.4_post <- extract.samples(m12.4) 
## Error in extract.samples(m12.4): object 'm12.4' not found
m12.4_post <- 
  m12.4_post %>%
  as.data.frame() %>%
  bind_cols(
    sapply(1:7, function(actor)
      m12.4_post$a + m12.4_post$a_actor[, actor]) %>%
      as.data.frame() %>%
      setNames(paste0("total_a.", 1:7))
  )
## Error in eval(expr, envir, enclos): object 'm12.4_post' not found

R Code 12.23

# prep data
Chimps <-
  Chimps %>% rename(block_id = block)   # name 'block' is reserved by Stan

m12.5 <- map2stan(
  alist(
    pulled_left ~ dbinom(1, p),
    logit(p) <- a + a_actor[actor] + a_block[block_id] +
      (bp + bpc * condition) * prosoc_left,
    a_actor[actor] ~ dnorm(0, sigma_actor),
    a_block[block_id] ~ dnorm(0, sigma_block),
    c(a, bp, bpc) ~ dnorm(0, 10),
    sigma_actor ~ dcauchy(0, 1),
    sigma_block ~ dcauchy(0, 1)
  ),
  data = Chimps,
  warmup = 1000,
  iter = 6000,
  chains = 4,
  cores = 3,
  refresh = 0
)
## Warning: There were 34 divergent transitions after warmup. Increasing adapt_delta above 0.8 may help. See
## http://mc-stan.org/misc/warnings.html#divergent-transitions-after-warmup
## Warning: Examine the pairs() plot to diagnose sampling problems
## Error in summary(fit)$summary: $ operator is invalid for atomic vectors

R Code 12.24

precis(m12.5, depth = 2) # depth=2 displays varying effects
## Error in precis(m12.5, depth = 2): object 'm12.5' not found
plot(precis(m12.5, depth = 2)) # also plot
## Error in precis(m12.5, depth = 2): object 'm12.5' not found

R Code 12.25

m12.5_post <- extract.samples(m12.5) %>% data.frame()
## Error in extract.samples(m12.5): object 'm12.5' not found
names(m12.5_post)
## Error in eval(expr, envir, enclos): object 'm12.5_post' not found
gf_dens(~sigma_block + color::"block", data = m12.5_post) %>%
  gf_dens(~sigma_actor + color::"actor", data = m12.5_post) %>%
  gf_labs(x = "sigma") %>%
  gf_lims(x = c(0,5))
## Error in gf_master(formula = gformula, data = data, geom = geom, gg_object = object, : object 'm12.5_post' not found

If we convert our posterior samples from a “wide” format (one row per sample) to a “long” format (one row for each sample of each parameter, we can plot posterior distributions for multiple parameters even more easily.

m12.5_post_long <-  m12.5_post %>% 
          tidyr::gather(param, value) 
## Error in eval(expr, envir, enclos): object 'm12.5_post' not found
m12.5_post_long %>% head(3)
## Error in eval(expr, envir, enclos): object 'm12.5_post_long' not found
gf_dens(~ value + color::param, alpha = 0.6,
        data = m12.5_post_long %>% filter(grepl("sigma", param)))
## Error in eval(expr, envir, enclos): object 'm12.5_post_long' not found
gf_dens(~ value, 
        data = m12.5_post_long %>% filter(grepl("a_actor\\.", param))) %>%
  gf_vline(xintercept = 0, color = "red", alpha = 0.5, linetype = "dashed") %>%
  gf_facet_grid( param ~ .)
## Error in eval(expr, envir, enclos): object 'm12.5_post_long' not found
gf_dens(~ value,
        data = m12.5_post_long %>% filter(grepl("a_block\\.", param))) %>%
  gf_vline(xintercept = 0, color = "red", alpha = 0.5, linetype = "dashed") %>%
  gf_facet_grid( param ~ .)
## Error in eval(expr, envir, enclos): object 'm12.5_post_long' not found

R Code 12.26

compare(m12.4, m12.5)
## Error in compare(m12.4, m12.5): object 'm12.4' not found

R Code 12.27

m12.4_pred <-
  expand.grid(
    prosoc_left = 0:1,
    condition = 0:1,
    actor = 1:7) %>%
  mutate(
    combo = paste0(prosoc_left, "/", condition)
  )
  
link.m12.4 <- link(m12.4, data = m12.4_pred)
## Error in link(m12.4, data = m12.4_pred): object 'm12.4' not found
m12.4_pred <- m12.4_pred %>% 
  mutate(
    p.pred = apply(link.m12.4, 2, mean),
    p.link.lo = apply(link.m12.4, 2, PI)[1,],
    p.link.hi = apply(link.m12.4, 2, PI)[2,]
  )
## Error in eval(substitute(expr), envir, enclos): object 'link.m12.4' not found
gf_ribbon(p.link.lo + p.link.hi ~ combo + group::"1", data = m12.4_pred) %>%
  gf_line(p.pred ~ combo + group::"1", data = m12.4_pred) %>%
  gf_facet_wrap( ~ actor)
## Error in eval(expr, envir, enclos): object 'p.link.lo' not found

R Code 12.28

m12.4_post <- extract.samples(m12.4) 
## Error in extract.samples(m12.4): object 'm12.4' not found
m12.4_postD <- m12.4_post %>% data.frame()
## Error in eval(expr, envir, enclos): object 'm12.4_post' not found
str(m12.4_post)
## Error in str(m12.4_post): object 'm12.4_post' not found
str(m12.4_postD)
## Error in str(m12.4_postD): object 'm12.4_postD' not found

R Code 12.29

gf_dens( ~ a_actor.5, data = m12.4_postD)
## Error in gf_master(formula = gformula, data = data, geom = geom, gg_object = object, : object 'm12.4_postD' not found
m12.4_post_long <- 
  m12.4_postD %>%
  tidyr::gather(param, value)
## Error in eval(expr, envir, enclos): object 'm12.4_postD' not found
gf_dens( ~ value, data = m12.4_post_long %>% filter(grepl("actor\\.", param))) %>%
  gf_vline(xintercept = 0, color = "red", alpha = 0.5, linetype = "dashed") %>%
  gf_facet_grid( param ~ .)
## Error in eval(expr, envir, enclos): object 'm12.4_post_long' not found

Kline data

R Code 12.39

# prep data
library(rethinking)
data(Kline)
Kline  <- Kline %>%
  mutate(
    logpop = log(population),
    society = 1:n()
  )
# fit model
m12.6 <- map2stan(
  alist(
    total_tools ~ dpois(mu),
    log(mu) <- a + a_society[society] + bp * logpop,
    a ~ dnorm(0, 10),
    bp ~ dnorm(0, 1),
    a_society[society] ~ dnorm(0, sigma_society),
    sigma_society ~ dcauchy(0, 1)
  ),
  data = Kline,
  iter = 4000,
  chains = 3,
  refresh = 0
)
## 
##  Elapsed Time: 0.452511 seconds (Warm-up)
##                0.429847 seconds (Sampling)
##                0.882358 seconds (Total)
## 
## 
##  Elapsed Time: 0.464029 seconds (Warm-up)
##                0.524889 seconds (Sampling)
##                0.988918 seconds (Total)
## The following numerical problems occurred the indicated number of times on chain 2
##                                                                                 count
## Exception thrown at line 17: normal_log: Scale parameter is 0, but must be > 0!     1
## When a numerical problem occurs, the Hamiltonian proposal gets rejected.
## See http://mc-stan.org/misc/warnings.html#exception-hamiltonian-proposal-rejected
## If the number in the 'count' column is small, there is no need to ask about this message on stan-users.
## 
##  Elapsed Time: 0.435656 seconds (Warm-up)
##                0.454678 seconds (Sampling)
##                0.890334 seconds (Total)
## The following numerical problems occurred the indicated number of times on chain 3
##                                                                                 count
## Exception thrown at line 17: normal_log: Scale parameter is 0, but must be > 0!     1
## When a numerical problem occurs, the Hamiltonian proposal gets rejected.
## See http://mc-stan.org/misc/warnings.html#exception-hamiltonian-proposal-rejected
## If the number in the 'count' column is small, there is no need to ask about this message on stan-users.
## Error in summary(fit)$summary: $ operator is invalid for atomic vectors

R Code 12.40

m12.6_post <- extract.samples(m12.6)
## Error in extract.samples(m12.6): object 'm12.6' not found
m12.6_pred <- 
  expand.grid(
    logpop = seq(from = 6, to = 14, length.out = 30),
    society = 1)
a_society_sims <- 
  rnorm(20000, 0, m12.6_post$sigma_society) %>%
  matrix(2000, 10)
## Error in rnorm(20000, 0, m12.6_post$sigma_society): object 'm12.6_post' not found
m12.6_link <- 
  link(
    m12.6,
    n = 2000,
    data = m12.6_pred,
    replace = list(a_society = a_society_sims)
  )
## Error in link(m12.6, n = 2000, data = m12.6_pred, replace = list(a_society = a_society_sims)): object 'm12.6' not found

R Code 12.41

m12.6_pred <- 
  m12.6_pred %>%
  mutate(
    mu.median = apply(m12.6_link, 2, median),
    link67.lo = apply(m12.6_link, 2, PI, prob = 0.67)[1,],
    link67.hi = apply(m12.6_link, 2, PI, prob = 0.67)[2,],
    link89.lo = apply(m12.6_link, 2, PI, prob = 0.89)[1,],
    link89.hi = apply(m12.6_link, 2, PI, prob = 0.89)[2,],
    link97.lo = apply(m12.6_link, 2, PI, prob = 0.97)[1,],
    link97.hi = apply(m12.6_link, 2, PI, prob = 0.97)[2,]
  )
## Error in eval(substitute(expr), envir, enclos): object 'm12.6_link' not found
gf_ribbon(link67.lo + link67.hi ~ logpop, data = m12.6_pred, alpha = 0.1) %>%
gf_ribbon(link89.lo + link89.hi ~ logpop, data = m12.6_pred, alpha = 0.1) %>%
gf_ribbon(link97.lo + link97.hi ~ logpop, data = m12.6_pred, alpha = 0.1) %>%
gf_point(total_tools ~ logpop, data = Kline) %>%
  gf_labs(x = "log population", y = "total tools") %>%
  gf_line(mu.median ~ logpop, data = m12.6_pred)
## Error in eval(expr, envir, enclos): object 'link67.lo' not found

R Code 12.42

There is one missing district number

data(bangladesh)
# would you have noticed that 54 is missing in this list?
sort(unique(bangladesh$district))
##  [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
## [24] 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46
## [47] 47 48 49 50 51 52 53 55 56 57 58 59 60 61
# it is easier to spot the 2 in this list
diff(sort(unique(bangladesh$district)))
##  [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [36] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1
# or we can use a plot to spot that there is one district number missing
gf_histogram(~ district, data = Bangladesh, binwidth = 1)
## Error in gf_master(formula = gformula, data = data, geom = geom, gg_object = object, : object 'Bangladesh' not found

R Code 12.43

Here are two ways to convert to a proper index. Notice that the results are not the same. The first is perhaps preferable since it retains the order of the original district numbers. (In the second one, 2, 3, 4, etc. come just before 20, 30, 40, etc. because “alphabetical” order is used rather than numerical order.

Bangladesh <-
  bangladesh %>% 
  mutate(
    district_id1 = as.integer(factor(district)),
    district_id2 = coerce_index(district)
  )
diff(sort(unique(Bangladesh$district_id1)))
##  [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [36] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
diff(sort(unique(Bangladesh$district_id2)))
##  [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [36] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
tally(~ (district_id1 == district_id2), data = Bangladesh)
## (district_id1 == district_id2)
##  TRUE FALSE 
##   117  1817
gf_abline(intercept = 0, slope = 1, alpha = 0.4) %>%
gf_point(district_id2 ~ district + color::"2 vs 0", data = Bangladesh) %>%
  gf_point(district_id1 ~ district + color::"1 vs 0", data = Bangladesh) %>%
  gf_labs(title = "Comparing two conversions to an index")