Statistical Rethinking (Code)

Code from Statistical Rethinking modified by R Pruim is shown below. Differences to the oringal include:

a preference for putting data into containers (data frames, mostly), rather than working with lose vectors.
use of lattice and ggplot2 rather than base graphics
use of tidyverse for data transformation
better (in my opinion) naming conventions

R code 3.1

PrPV <- 0.95
PrPM <- 0.01
PrV <- 0.001
PrP <- PrPV * PrV + PrPM * (1 - PrV)
(PrVP <- PrPV * PrV / PrP)

## [1] 0.08683729

R code 3.2

p_grid <- seq(from = 0,
              to = 1,
              length.out = 1000)
prior <- rep(1, 1000)
likelihood <- dbinom(6, size = 9, prob = p_grid)
posterior <- likelihood * prior
posterior <- posterior / sum(posterior)

# this function turns a "rasterized" kernel into a true (rasterized) density function
# useful for rescaling priors and posteriors when using the grid method

densify <- 
  function(x, y) {
    if ( diff(range(diff(x))) > 0.1 * min(abs(diff(x))) )
      stop("`x` must be (approximately) equally spaced.")
    
    width <- mean(diff(x))
    y / sum(y) / width
  }
  
Water9 <- 
  expand.grid(p = seq(from = 0, to = 1, length.out = 1000)) %>%
  mutate(
    prior = 1,
    likelihood = dbinom(6, size = 9, prob = p),
    posterior = likelihood * prior,
    posterior = posterior / sum(posterior),
    posterior.dens = densify(p, posterior)
  )

R code 3.3

samples <-
  sample(p_grid,
         prob = posterior,
         size = 1e4,
         replace = TRUE)
str(samples)

##  num [1:10000] 0.896 0.813 0.594 0.709 0.353 ...

water9.ps.p <- 
  with(Water9, sample(p, prob = posterior, size = 1e4, replace = TRUE))

R code 3.4

plot(samples)

xyplot(water9.ps.p ~ 1:length(water9.ps.p))

R code 3.5

library(rethinking)
dens(samples)

library(rethinking)
densityplot( ~ water9.ps.p, plot.points = FALSE)

plotPoints( posterior.dens ~ p, type = "l", add = TRUE, col = "red", data = Water9)

R code 3.6

# add up posterior probability where p < 0.5
sum(posterior[p_grid < 0.5])

## [1] 0.1718746

# add up posterior probability where p < 0.5; 3 different ways
with(Water9, sum(posterior[p < 0.5]))

## [1] 0.1718746

Water9 %>% summarize(sum(posterior[p < 0.5]))

##   sum(posterior[p < 0.5])
## 1               0.1718746

Water9 %>% group_by(p < 0.5) %>% summarize(sum(posterior))

## # A tibble: 2 × 2
##   `p < 0.5` `sum(posterior)`
##       <lgl>            <dbl>
## 1     FALSE        0.8281254
## 2      TRUE        0.1718746

R code 3.7

sum(samples < 0.5) / 1e4

## [1] 0.1772

# prop() is in the mosaic package and aviods need to use sum() and compute denominator
prop(water9.ps.p < 0.5)

##   TRUE 
## 0.1732

R code 3.8

sum(samples > 0.5 & samples < 0.75) / 1e4

## [1] 0.5952

prop(water9.ps.p > 0.5 & water9.ps.p < 0.75)

##  TRUE 
## 0.598

R code 3.9

quantile(samples, 0.8)

##       80% 
## 0.7617618

quantile(water9.ps.p, 0.8)

##       80% 
## 0.7627628

R code 3.10

quantile(samples, c(0.1, 0.9))

##       10%       90% 
## 0.4434434 0.8128128

quantile(water9.ps.p, c(0.1, 0.9))

##       10%       90% 
## 0.4494494 0.8148148

R code 3.11

p_grid <- 
  seq(from = 0,
      to = 1,
      length.out = 1000)
prior <- rep(1, 1000)
likelihood <- dbinom(3, size = 3, prob = p_grid)
posterior <- likelihood * prior
posterior <- posterior / sum(posterior)
samples <-
  sample(p_grid,
         size = 1e4,
         replace = TRUE,
         prob = posterior)

Water3 <-
  expand.grid(p = seq(from = 0, to = 1, length.out = 1000)) %>%
  mutate(
    prior = 1,   # recycling makes this into 1000 1's
    prior = prior / sum(prior),
    likelihood = dbinom(3, size = 3, prob = p),
    posterior = likelihood * prior,
    posterior = posterior / sum(posterior),
    posterior.dens = densify(p, posterior)    # true density
  )
water3.ps.p <- with(Water3, sample(p, size = 1e5, replace = TRUE, prob = posterior))

R code 3.12

PI(samples, prob = 0.5)

##       25%       75% 
## 0.7057057 0.9289289

PI(water3.ps.p, prob = 0.5)

##       25%       75% 
## 0.7087087 0.9309309

R code 3.13

HPDI(samples, prob = 0.5)

##      |0.5      0.5| 
## 0.8358358 0.9989990

HPDI(water3.ps.p, prob = 0.5)

##      |0.5      0.5| 
## 0.8418418 1.0000000

R code 3.14

p_grid[which.max(posterior)]

## [1] 1

Water3 %>% arrange(-posterior) %>% head(1)

##   p prior likelihood posterior posterior.dens
## 1 1 0.001          1  0.003996       3.992004

R code 3.15

chainmode(samples, adj = 0.01)

## [1] 0.9970504

chainmode(water3.ps.p, adj = 1)       # default amount of smoothing

## [1] 0.9693726

chainmode(water3.ps.p, adj = 0.1)     # less smoothing

## [1] 0.9950912

chainmode(water3.ps.p, adj = 0.01)    # lots less smoothing

## [1] 0.9985943

densityplot(water3.ps.p, adj = 1, plot.points = FALSE)

densityplot(water3.ps.p, adj = 0.1, plot.points = FALSE, )

densityplot(water3.ps.p, adj = 0.01, plot.points = FALSE)

R code 3.16

mean(samples)

## [1] 0.7984865

median(samples)

## [1] 0.8368368

favstats( ~ water3.ps.p)

##         min        Q1    median        Q3 max      mean        sd      n
##  0.05805806 0.7087087 0.8418418 0.9309309   1 0.8009678 0.1627364 100000
##  missing
##        0

R code 3.17

sum(posterior * abs(0.5 - p_grid))

## [1] 0.3128752

with(Water3, sum(posterior * abs(0.5 - p)))

## [1] 0.3128752

# We can compute this from the density too, but we need to adjust for the widths
# of the intervals.
with(Water3, sum(posterior.dens * abs(0.5 - p_grid) * ediff(p, pad.value = 0)))

## [1] 0.3128752

# easiest is to work from the posterior samples (small differences due to sampling)
mean(abs(water3.ps.p - 0.5))

## [1] 0.3131737

R code 3.18

loss <-
  sapply(p_grid, function(d)
    sum(posterior * abs(d - p_grid)))

Water3 <- 
  Water3 %>% 
  mutate(loss = 
           sapply(p_grid, function(d) {sum(posterior * abs(d - p_grid))})
  )

R code 3.19

p_grid[which.min(loss)]

## [1] 0.8408408

with(Water3, p[which.min(loss)])

## [1] 0.8408408

Water3 %>% arrange(loss) %>% head(1)

##           p prior likelihood   posterior posterior.dens      loss
## 1 0.8408408 0.001  0.5944857 0.002375565       2.373189 0.1273465

R code 3.20

dbinom(0:2, size = 2, prob = 0.7)

## [1] 0.09 0.42 0.49

R code 3.21

rbinom(1, size = 2, prob = 0.7)

## [1] 2

R code 3.22

rbinom(10, size = 2, prob = 0.7)

##  [1] 2 2 1 1 1 1 2 2 2 1

R code 3.23

dummy_w <- rbinom(1e5, size = 2, prob = 0.7)
table(dummy_w) / 1e5

## dummy_w
##       0       1       2 
## 0.08969 0.42177 0.48854

tally( ~ rbinom(1e5, size = 2, prob = 0.7), format = "proportion")

## rbinom(1e+05, size = 2, prob = 0.7)
##       0       1       2 
## 0.09106 0.41795 0.49099

R code 3.24

dummy_w <- rbinom(1e5, size = 9, prob = 0.7)
simplehist(dummy_w, xlab = "dummy water count")

histogram( ~ rbinom(1e5, size = 9, prob = 0.7), width = 1, xlab = "dummy water count")

dummy_w <- rbinom(1e5, size = 9, prob = 0.7)
histogram( ~ dummy_w, xlab = "dummy water count", width = 1)

R code 3.25

w <- rbinom(1e4, size = 9, prob = 0.6)

R code 3.26

w <- rbinom(1e4, size = 9, prob = samples)

w <- rbinom(1e4, size = 9, prob = water9.ps.p)

R code 3.27

p_grid <- seq(from = 0,
              to = 1,
              length.out = 1000)
prior <- rep(1, 1000)
likelihood <- dbinom(6, size = 9, prob = p_grid)
posterior <- likelihood * prior
posterior <- posterior / sum(posterior)
set.seed(100)
samples <-
  sample(p_grid,
         prob = posterior,
         size = 1e4,
         replace = TRUE)

Water9a <-
  expand.grid(p = seq(from = 0, to = 1, length.out = 1000)) %>%
  mutate( 
    prior = 1,
    likelihood = dbinom(6, size = 9, prob = p),
    posterior = likelihood * prior,
    posterior = posterior / sum(posterior),
    posterior.dens = densify(p, posterior)
  )
set.seed(100)
sampled_p <- 
  with(Water9a, sample(p, prob = posterior, size = 1e4, replace = TRUE))

R code 3.28

data(homeworkch3, package = "rethinking")
# command above avoids us having to type it all in by hand:
#
# birth1 <- c(1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1,
#             0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0,
#             1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1,
#             0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1)
# 
# birth2 <- c(0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1,
#             0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1,
#             1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0,
#             0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0)

# this puts the data into a data frame using a more transparent coding scheme
BirthSex <-
  data_frame(
    first = c("F", "M") [1 + birth1],
    second = c("F", "M")[1 + birth2]
  ) %>% 
  mutate(boys = (first == "M") + (second == "M"), girls = 2 - boys) 
head(BirthSex)

## # A tibble: 6 × 4
##   first second  boys girls
##   <chr>  <chr> <int> <dbl>
## 1     M      F     1     1
## 2     F      M     1     1
## 3     F      F     0     2
## 4     F      M     1     1
## 5     M      F     1     1
## 6     M      M     2     0

R code 3.29

library(rethinking)
data(homeworkch3)

R code 3.30

sum(birth1) + sum(birth2)

## [1] 111

BirthSex %>% 
  group_by(boys) %>%
  summarise(n = n()) %>%
  mutate(cumsum(n),  total.boys = boys * n,  cumsum(total.boys))

## # A tibble: 3 × 5
##    boys     n `cumsum(n)` total.boys `cumsum(total.boys)`
##   <int> <int>       <int>      <int>                <int>
## 1     0    10          10          0                    0
## 2     1    69          79         69                   69
## 3     2    21         100         42                  111

Statistical Rethinking (Code)

Chapter 3

Feburary, 2017

R code 3.1

R code 3.2

R code 3.3

R code 3.4

R code 3.5

R code 3.6

R code 3.7

R code 3.8

R code 3.9

R code 3.10

R code 3.11

R code 3.12

R code 3.13

R code 3.14

R code 3.15

R code 3.16

R code 3.17

R code 3.18

R code 3.19

R code 3.20

R code 3.21

R code 3.22

R code 3.23

R code 3.24

R code 3.25

R code 3.26

R code 3.27

R code 3.28

R code 3.29

R code 3.30