Code from Statistical Rethinking modified by R Pruim is shown below. Differences to the oringal include:
lattice
and/or ggplot2
rather than base graphicstidyverse
for data transformationThe following model suggests that divorce rates are higher in states with more Waffle Houses (per capita).
data(WaffleDivorce)
WaffleDivorce <-
WaffleDivorce %>%
mutate(
WaffleHouses.pm = WaffleHouses / Population
)
m5.0 <- map(
alist(
Divorce ~ dnorm(mu, sigma),
mu <- a + bW * WaffleHouses.pm,
a ~ dnorm(10, 10),
bW ~ dnorm(0, 1),
sigma ~ dunif(0, 10)
),
data = WaffleDivorce
)
precis(m5.0, digits = 3)
## Mean StdDev 5.5% 94.5%
## a 9.321 0.272 8.887 9.755
## bW 0.074 0.027 0.032 0.117
## sigma 1.677 0.168 1.409 1.945
Here is a plot.
require(ggformula) # need the version from rpruim on github
coef(m5.0)
## a bW sigma
## 9.32060116 0.07433683 1.67730153
gf_point(Divorce ~ WaffleHouses.pm, data = WaffleDivorce, color = rangi2) %>%
gf_coefline(coef = coef(m5.0))
## Warning in gf_coefline(., coef = coef(m5.0)): Ignoring all but first two
## values of coef.
# load data
library(rethinking)
data(WaffleDivorce)
# standardize predictor
WaffleDivorce <-
WaffleDivorce %>%
mutate(MedianAgeMarriage.s = zscore(MedianAgeMarriage))
# fit model
m5.1 <- map(
alist(
Divorce ~ dnorm(mu, sigma),
mu <- a + bA * MedianAgeMarriage.s,
a ~ dnorm(10, 10),
bA ~ dnorm(0, 1),
sigma ~ dunif(0, 10)
),
data = WaffleDivorce
)
# compute percentile interval of mean
m5.1.pred <-
data.frame(
MedianAgeMarriage.s = seq(from = -3, to = 3, by = 0.25)
)
m5.1.link <- link(m5.1, data = m5.1.pred)
## [ 100 / 1000 ]
[ 200 / 1000 ]
[ 300 / 1000 ]
[ 400 / 1000 ]
[ 500 / 1000 ]
[ 600 / 1000 ]
[ 700 / 1000 ]
[ 800 / 1000 ]
[ 900 / 1000 ]
[ 1000 / 1000 ]
m5.1.pred <-
m5.1.pred %>%
mutate(
mu.PIlo = apply(m5.1.link, 2, PI)[1,],
mu.PIhi = apply(m5.1.link, 2, PI)[2,]
)
# plot it all
gf_point(Divorce ~ MedianAgeMarriage.s + label:Loc, color = rangi2,
data = WaffleDivorce) %>%
gf_ribbon(mu.PIlo + mu.PIhi ~ MedianAgeMarriage.s, data = m5.1.pred,
alpha = 0.2) %>%
gf_coefline(coef = coef(m5.1), col = "red", alpha = 0.5) %>%
plotly::ggplotly()
## Warning: Ignoring unknown aesthetics: label
## Warning in gf_coefline(., coef = coef(m5.1), col = "red", alpha = 0.5):
## Ignoring all but first two values of coef.
WaffleDivorce <-
WaffleDivorce %>%
mutate(Marriage.s = zscore(Marriage))
m5.2 <- map(
alist(
Divorce ~ dnorm(mu, sigma),
mu <- a + bR * Marriage.s,
a ~ dnorm(10, 10),
bR ~ dnorm(0, 1),
sigma ~ dunif(0, 10)
),
data = WaffleDivorce
)
m5.2.pred <-
data.frame(
Marriage.s = seq(from = -3, to = 3, by = 0.25)
)
m5.2.link <- link(m5.2, data = m5.2.pred)
## [ 100 / 1000 ]
[ 200 / 1000 ]
[ 300 / 1000 ]
[ 400 / 1000 ]
[ 500 / 1000 ]
[ 600 / 1000 ]
[ 700 / 1000 ]
[ 800 / 1000 ]
[ 900 / 1000 ]
[ 1000 / 1000 ]
m5.2.pred <-
m5.2.pred %>%
mutate(
mu.PIlo = apply(m5.2.link, 2, PI)[1,],
mu.PIhi = apply(m5.2.link, 2, PI)[2,]
)
# plot it all
gf_point(Divorce ~ Marriage.s, data = WaffleDivorce, col = rangi2) %>%
gf_coefline(coef = coef(m5.2), col = "red", alpha = 0.5) %>%
gf_ribbon(mu.PIlo + mu.PIhi ~ Marriage.s, data = m5.2.pred, alpha = 0.1)
## Warning in gf_coefline(., coef = coef(m5.2), col = "red", alpha = 0.5):
## Ignoring all but first two values of coef.
plotly::ggplotly()
m5.3 <- map(
alist(
Divorce ~ dnorm(mu, sigma),
mu <- a + bR * Marriage.s + bA * MedianAgeMarriage.s,
a ~ dnorm(10, 10),
bR ~ dnorm(0, 1),
bA ~ dnorm(0, 1),
sigma ~ dunif(0, 10)
),
data = WaffleDivorce
)
precis(m5.3)
## Mean StdDev 5.5% 94.5%
## a 9.69 0.20 9.36 10.01
## bR -0.13 0.28 -0.58 0.31
## bA -1.13 0.28 -1.58 -0.69
## sigma 1.44 0.14 1.21 1.67
plot(precis(m5.3))
m5.4 <- map(
alist(
Marriage.s ~ dnorm(mu, sigma),
mu <- a + b * MedianAgeMarriage.s,
a ~ dnorm(0, 10),
b ~ dnorm(0, 1),
sigma ~ dunif(0, 10)
),
data = WaffleDivorce
)
# for each state, compute expected value at MAP and residual
WaffleDivorce <-
WaffleDivorce %>%
mutate(
mu.m5.4 = coef(m5.4)['a'] + coef(m5.4)['b'] * MedianAgeMarriage.s,
resid.m5.4 = Marriage.s - mu.m5.4
)
link()
takes care of computing the model value for each posterior sample so we don’t have to access the coefficients directly. As models become more complicated, this will be easier. Also, it provides us with more information, since we have a distribution of model responses for each case.
We are using the defaults here, but link()
takes two additional arguments in addition to the model.
n
: number of posterior samples to use. (The default is 1000.) Increase this to reduce sampling variability (at the cost of time).data
: optional data set to use in place of original data from map()
. Use this to make predictions using any predictor values you like.# for each state, compute expected value at MAP and residual
m5.4.link <- link(m5.4)
## [ 100 / 1000 ]
[ 200 / 1000 ]
[ 300 / 1000 ]
[ 400 / 1000 ]
[ 500 / 1000 ]
[ 600 / 1000 ]
[ 700 / 1000 ]
[ 800 / 1000 ]
[ 900 / 1000 ]
[ 1000 / 1000 ]
WaffleDivorce <-
WaffleDivorce %>%
mutate(
mu = apply(m5.4.link, 2, mean),
resid = Marriage.s - mu
)
WaffleDivorce %>% select(Loc, mu, resid) %>% head(3)
## Loc mu resid
## 1 AL 0.4289818 -0.40633776
## 2 AK 0.4858405 1.06396115
## 3 AZ 0.1446885 -0.09571417
gf_segment(mu.m5.4 + Marriage.s ~ MedianAgeMarriage.s + MedianAgeMarriage.s,
data = WaffleDivorce, color = "red", alpha = 0.5) %>%
gf_coefline(coef = coef(m5.4)) %>%
gf_point(Marriage.s ~ MedianAgeMarriage.s, col = rangi2, data = WaffleDivorce)
## Warning in gf_coefline(., coef = coef(m5.4)): Ignoring all but first two
## values of coef.
plotly::ggplotly()
# prepare new counterfactual data
m5.3.pred <-
data_frame(
Marriage.s = seq(from = -3, to = 3, by = 0.25),
MedianAgeMarriage.s = mean(WaffleDivorce$MedianAgeMarriage.s)
)
# compute counterfactual mean divorce (mu)
mu <- link(m5.3, data = m5.3.pred)
## [ 100 / 1000 ]
[ 200 / 1000 ]
[ 300 / 1000 ]
[ 400 / 1000 ]
[ 500 / 1000 ]
[ 600 / 1000 ]
[ 700 / 1000 ]
[ 800 / 1000 ]
[ 900 / 1000 ]
[ 1000 / 1000 ]
# simulate counterfactual divorce outcomes
R.sim <- sim(m5.3, data = m5.3.pred, n = 1e4)
## [ 1000 / 10000 ]
[ 2000 / 10000 ]
[ 3000 / 10000 ]
[ 4000 / 10000 ]
[ 5000 / 10000 ]
[ 6000 / 10000 ]
[ 7000 / 10000 ]
[ 8000 / 10000 ]
[ 9000 / 10000 ]
[ 10000 / 10000 ]
m5.3.pred <-
m5.3.pred %>%
mutate(
mu.mean = apply(mu, 2, mean),
mu.PIlo = apply(mu, 2, PI)[1,],
mu.PIhi = apply(mu, 2, PI)[2,],
R.PIlo = apply(R.sim, 2, PI)[1,],
R.PIhi = apply(R.sim, 2, PI)[2,]
)
gf_line(mu.mean ~ Marriage.s, data = m5.3.pred, color = "gray50") %>%
gf_ribbon(mu.PIlo + mu.PIhi ~ Marriage.s, fill = "gray50", alpha = 0.2) %>%
gf_ribbon(R.PIlo + R.PIhi ~ Marriage.s, fill = "gray50", alpha = 0.2) %>%
gf_labs( y = "Divorce rate", caption = "MedianAgeMarriage.s = 0")
m5.3.pred2 <-
data_frame(
MedianAgeMarriage.s = seq(from = -3, to = 3, by = 0.25),
Marriage.s = mean(WaffleDivorce$Marriage.s)
)
mu <- link(m5.3, data = m5.3.pred2, n = 1e4)
## [ 1000 / 10000 ]
[ 2000 / 10000 ]
[ 3000 / 10000 ]
[ 4000 / 10000 ]
[ 5000 / 10000 ]
[ 6000 / 10000 ]
[ 7000 / 10000 ]
[ 8000 / 10000 ]
[ 9000 / 10000 ]
[ 10000 / 10000 ]
A.sim <- sim(m5.3, data = m5.3.pred2, n = 1e4)
## [ 1000 / 10000 ]
[ 2000 / 10000 ]
[ 3000 / 10000 ]
[ 4000 / 10000 ]
[ 5000 / 10000 ]
[ 6000 / 10000 ]
[ 7000 / 10000 ]
[ 8000 / 10000 ]
[ 9000 / 10000 ]
[ 10000 / 10000 ]
m5.3.pred2 <-
m5.3.pred2 %>%
mutate(
mu.mean = apply(mu, 2, mean),
mu.PIlo = apply(mu, 2, PI)[1,],
mu.PIhi = apply(mu, 2, PI)[2,],
A.PIlo = apply(A.sim, 2, PI)[1,],
A.PIhi = apply(A.sim, 2, PI)[2,]
)
gf_line(mu.mean ~ MedianAgeMarriage.s, data = m5.3.pred2, color = "red") %>%
gf_ribbon(mu.PIlo + mu.PIhi ~ MedianAgeMarriage.s, alpha = 0.1) %>%
gf_ribbon(A.PIlo + A.PIhi ~ MedianAgeMarriage.s, alpha = 0.1) %>%
gf_labs( y = "Divorce rate", caption = "Marriage.s = 0")
# call link without specifying new data
# so it uses original data
divorce.mu <- link(m5.3)
## [ 100 / 1000 ]
[ 200 / 1000 ]
[ 300 / 1000 ]
[ 400 / 1000 ]
[ 500 / 1000 ]
[ 600 / 1000 ]
[ 700 / 1000 ]
[ 800 / 1000 ]
[ 900 / 1000 ]
[ 1000 / 1000 ]
# simulate observations
# again no new data, so uses original data
divorce.sim <- sim(m5.3, n = 1e4)
## [ 1000 / 10000 ]
[ 2000 / 10000 ]
[ 3000 / 10000 ]
[ 4000 / 10000 ]
[ 5000 / 10000 ]
[ 6000 / 10000 ]
[ 7000 / 10000 ]
[ 8000 / 10000 ]
[ 9000 / 10000 ]
[ 10000 / 10000 ]
# summarize samples across cases
m5.3.pred3 <-
WaffleDivorce %>%
mutate(
mu.mean = apply(divorce.mu, 2, mean),
mu.PIlo = apply(divorce.mu, 2, PI)[1,],
mu.PIhi = apply(divorce.mu, 2, PI)[2,],
divorce.PIlo = apply(divorce.sim, 2, PI)[1,],
divorce.PIhi = apply(divorce.sim, 2, PI)[2,]
)
gf_pointrange(mu.mean + divorce.PIlo + divorce.PIhi ~ Divorce, data = m5.3.pred3,
color = rangi2) %>%
gf_abline(slope = 1, intercept = 0) %>%
gf_labs(x = "Observed divorce", y = "Predicted divorce")
# make most recent ggplot interactive
# adding text to the geom_point() above makes it available on hover here.
plotly::ggplotly()
# compute residuals
m5.3.pred3 <-
m5.3.pred3 %>%
mutate(divorce.resid = Divorce - mu.mean,
state = reorder(Loc, divorce.resid))
gf_linerange((Divorce - divorce.PIlo) + (Divorce - divorce.PIhi) ~ Loc,
data = m5.3.pred3, alpha = 0.2, size = 0.9) %>%
gf_linerange((Divorce - mu.PIlo) + (Divorce - mu.PIhi) ~ Loc,
data = m5.3.pred3, alpha = 0.2, size = 1.4) %>%
gf_point(divorce.resid ~ Loc, data = m5.3.pred3) +
coord_flip()
WaffleDivorce <-
WaffleDivorce %>%
mutate(
WaffleHouses.pc = WaffleHouses / Population * 1e6,
divorce.resid = Divorce - apply(link(m5.3), 2, mean)
)
## [ 100 / 1000 ]
[ 200 / 1000 ]
[ 300 / 1000 ]
[ 400 / 1000 ]
[ 500 / 1000 ]
[ 600 / 1000 ]
[ 700 / 1000 ]
[ 800 / 1000 ]
[ 900 / 1000 ]
[ 1000 / 1000 ]
gf_point(divorce.resid ~ WaffleHouses.pc + label:Loc,
data = WaffleDivorce) %>%
gf_smooth(divorce.resid ~ WaffleHouses.pc, method = "lm", size = 0.5) %>%
plotly::ggplotly()
## Warning: Ignoring unknown aesthetics: label
### R code 5.15
n <- 100
Sim5.15Data <-
data_frame(
x_real = rnorm(n), # x_real as Gaussian with mean 0 and stddev 1
x_spur = rnorm(n, x_real), # x_spur as Gaussian with mean = x_real
y = rnorm(n, x_real) # y as Gaussian with mean=x_real
)
m_both <-
map(
alist(
y ~ dnorm(mu, sigma),
mu <- a + b_real * x_real + b_spur * x_spur,
a ~ dnorm(0, 3),
b_real ~ dnorm(0, 3),
b_spur ~ dnorm(0, 3),
sigma ~ dlnorm(0, 3)),
data = Sim5.15Data
)
m_real <-
map(
alist(
y ~ dnorm(mu, sigma),
mu <- a + b_real * x_real,
a ~ dnorm(0, 3),
b_real ~ dnorm(0, 3),
sigma ~ dlnorm(0, 3)),
data = Sim5.15Data
)
m_spur <-
map(
alist(
y ~ dnorm(mu, sigma),
mu <- a + b_spur * x_spur,
a ~ dnorm(0, 3),
b_spur ~ dnorm(0, 3),
sigma ~ dlnorm(0, 3)),
data = Sim5.15Data
)
lapply(list(both = m_both, real = m_real, spur = m_spur), precis)
## $both
## Mean StdDev 5.5% 94.5%
## a -0.29 0.10 -0.45 -0.13
## b_real 1.28 0.15 1.04 1.51
## b_spur -0.21 0.12 -0.40 -0.03
## sigma 0.98 0.07 0.87 1.09
##
## $real
## Mean StdDev 5.5% 94.5%
## a -0.30 0.10 -0.46 -0.14
## b_real 1.06 0.10 0.91 1.22
## sigma 0.99 0.07 0.88 1.10
##
## $spur
## Mean StdDev 5.5% 94.5%
## a -0.45 0.13 -0.66 -0.25
## b_spur 0.54 0.10 0.39 0.70
## sigma 1.28 0.09 1.14 1.42
gf_point(y ~ x_real, data = Sim5.15Data) %>%
gf_coefline(coef = coef(m_real))
## Warning in gf_coefline(., coef = coef(m_real)): Ignoring all but first two
## values of coef.
gf_point(y ~ x_spur, data = Sim5.15Data) %>%
gf_coefline(coef = coef(m_spur))
## Warning in gf_coefline(., coef = coef(m_spur)): Ignoring all but first two
## values of coef.
Comparative primate milk composition data, from Table 2 of Hinde and Milligan. 2011. Evolutionary Anthropology 20:9-23.
clade
: Broad taxonomic group
species
: Species name
kcal.per.g
: Kilocalories per gram of milk
perc.fat
: Percent fat
perc.protein
: Percent protein
perc.lactose
: Percent lactose
mass
: Body mass of mother, in kilograms
neocortex.perc
: Percent of brain mass that is neocortex
data(milk)
# This fails.
m5.5 <- map(
alist(
kcal.per.g ~ dnorm(mu, sigma),
mu <- a + bn * neocortex.perc,
a ~ dnorm(0, 100),
bn ~ dnorm(0, 1),
sigma ~ dunif(0, 1)
),
data = milk
)
## Error in map(alist(kcal.per.g ~ dnorm(mu, sigma), mu <- a + bn * neocortex.perc, : initial value in 'vmmin' is not finite
## The start values for the parameters were invalid. This could be caused by missing values (NA) in the data or by start values outside the parameter constraints. If there are no NA values in the data, try using explicit start values.
# Here's why the previous chunk failed -- missing data!
favstats( ~ neocortex.perc, data = milk)
## min Q1 median Q3 max mean sd n missing
## 55.16 64.54 68.85 71.26 76.3 67.57588 5.968612 17 12
# Let's grab just the rows that have no missing data.
MilkCC <- milk %>% filter(complete.cases(.))
favstats(~ neocortex.perc, data = MilkCC)
## min Q1 median Q3 max mean sd n missing
## 55.16 64.54 68.85 71.26 76.3 67.57588 5.968612 17 0
m5.5 <- map(
alist(
kcal.per.g ~ dnorm(mu, sigma),
mu <- a + bn * neocortex.perc,
a ~ dnorm(0, 100),
bn ~ dnorm(0, 1),
sigma ~ dunif(0, 1)
),
data = MilkCC
)
precis(m5.5, digits = 3)
## Mean StdDev 5.5% 94.5%
## a 0.353 0.471 -0.399 1.106
## bn 0.005 0.007 -0.007 0.016
## sigma 0.166 0.028 0.120 0.211
coef(m5.5)["bn"] * (76 - 55)
## bn
## 0.0945674
pred.data <- data.frame(neocortex.perc = 50:80)
mu <- link(m5.5, data = pred.data, n = 1e4)
## [ 1000 / 10000 ]
[ 2000 / 10000 ]
[ 3000 / 10000 ]
[ 4000 / 10000 ]
[ 5000 / 10000 ]
[ 6000 / 10000 ]
[ 7000 / 10000 ]
[ 8000 / 10000 ]
[ 9000 / 10000 ]
[ 10000 / 10000 ]
m5.5.pred <-
pred.data %>%
mutate(
mu.mean = apply(mu, 2, mean),
mu.PIlo = apply(mu, 2, PI)[1,],
mu.PIhi = apply(mu, 2, PI)[2,]
)
gf_point(kcal.per.g ~ neocortex.perc + label:species, data = MilkCC,
color = rangi2) %>%
gf_line(mu.mean ~ neocortex.perc, data = m5.5.pred) %>%
gf_ribbon(mu.PIlo + mu.PIhi ~ neocortex.perc, data = m5.5.pred,
alpha = 0.2) %>%
plotly::ggplotly()
## Warning: Ignoring unknown aesthetics: label
### R code 5.24
MilkCC <-
MilkCC %>%
mutate(log.mass = log(mass))
m5.6 <- map(
alist(
kcal.per.g ~ dnorm(mu, sigma),
mu <- a + bm * log.mass,
a ~ dnorm(0, 100),
bm ~ dnorm(0, 1),
sigma ~ dunif(0, 1)
),
data = MilkCC
)
precis(m5.6)
## Mean StdDev 5.5% 94.5%
## a 0.71 0.05 0.63 0.78
## bm -0.03 0.02 -0.06 0.00
## sigma 0.16 0.03 0.11 0.20
m5.7 <- map(
alist(
kcal.per.g ~ dnorm(mu, sigma),
mu <- a + bn * neocortex.perc + bm * log.mass,
a ~ dnorm(0, 100),
bn ~ dnorm(0, 1),
bm ~ dnorm(0, 1),
sigma ~ dunif(0, 1)
),
data = MilkCC,
start = list(a = 0, bn = 0, bm = 0, sigma = 0.50)
)
precis(m5.7)
## Mean StdDev 5.5% 94.5%
## a -1.08 0.47 -1.83 -0.34
## bn 0.03 0.01 0.02 0.04
## bm -0.10 0.02 -0.13 -0.06
## sigma 0.11 0.02 0.08 0.15
m5.7.pred <-
data_frame(
neocortex.perc = 50:80,
log.mass = mean(log(MilkCC$mass))
)
m5.7.link <- link(m5.7, data = m5.7.pred, n = 1e4)
## [ 1000 / 10000 ]
[ 2000 / 10000 ]
[ 3000 / 10000 ]
[ 4000 / 10000 ]
[ 5000 / 10000 ]
[ 6000 / 10000 ]
[ 7000 / 10000 ]
[ 8000 / 10000 ]
[ 9000 / 10000 ]
[ 10000 / 10000 ]
m5.7.pred <-
m5.7.pred %>%
mutate(
mu.mean = apply(mu, 2, mean),
mu.PIlo = apply(mu, 2, PI)[1,],
mu.PIhi = apply(mu, 2, PI)[2,]
)
gf_point(kcal.per.g ~ neocortex.perc + label:species, data = MilkCC) %>%
gf_ribbon(mu.PIlo + mu.PIhi ~ neocortex.perc, data = m5.7.pred, alpha = 0.2) %>%
gf_line(mu.mean ~ neocortex.perc, data = m5.7.pred, alpha = 0.5) %>%
plotly::ggplotly()
## Warning: Ignoring unknown aesthetics: label
# simulating a masking relationship
# n = number of cases
# rho = correlation between two variables x_pos and x_neg
sim_masking <- function(n = 100, rho = 0.7) {
data_frame(
x_pos = rnorm(n),
x_neg = rnorm(n, rho * x_pos, sd = sqrt(1 - rho^2)),
y = rnorm(n, x_pos - x_neg, sd = 1) # y equally associated to each var
)
}
MaskingData <- sim_masking()
splom(MaskingData) # splom = scatter plot matrix (lattice)
GGally::ggpairs(MaskingData) # ggplot2 version
So far we have seen examples where adding in additional variables helps us understand what is going on. But adding variables to the model can actually make thigs worse in some situations.
We start with a simulated example and then return to our milk data set.
sim_legs <- function(n = 100) {
data_frame(
height = rnorm(n, 10, 2), # units = ??
leg_prop = runif(n, 0.4, 0.5),
leg_left = leg_prop * height + rnorm(n, 0, 0.2),
leg_right = leg_prop * height + rnorm(n, 0, 0.2)
)
}
LegHeight <- sim_legs()
m5.8 <- map(
alist(
height ~ dnorm(mu, sigma),
mu <- a + bl * leg_left + br * leg_right,
a ~ dnorm(10, 100),
bl ~ dnorm(2, 10),
br ~ dnorm(2, 10),
sigma ~ dunif(0, 10)
),
data = LegHeight
)
precis(m5.8)
## Mean StdDev 5.5% 94.5%
## a 1.44 0.40 0.81 2.08
## bl 1.42 0.26 1.00 1.84
## br 0.49 0.26 0.06 0.91
## sigma 0.72 0.05 0.64 0.81
GGally::ggpairs(LegHeight)
plot(precis(m5.8))
m5.8.post <- extract.samples(m5.8)
gf_point(bl ~ br, data = m5.8.post, col = rangi2, alpha = 0.1, pch = 16)
gf_dens( ~ (bl + br), data = m5.8.post)
m5.9 <- map(alist(
height ~ dnorm(mu, sigma),
mu <- a + bl * leg_left,
a ~ dnorm(10, 100),
bl ~ dnorm(2, 10),
sigma ~ dunif(0, 10)
),
data = LegHeight)
precis(m5.9)
## Mean StdDev 5.5% 94.5%
## a 1.55 0.40 0.91 2.18
## bl 1.88 0.08 1.75 2.02
## sigma 0.74 0.05 0.65 0.82
plot(precis(m5.9))
library(rethinking)
data(milk)
# kcal.per.g regressed on perc.fat
m5.10 <- map(
alist(
kcal.per.g ~ dnorm(mu, sigma),
mu <- a + bf * perc.fat,
a ~ dnorm(0.6, 10),
bf ~ dnorm(0, 1),
sigma ~ dunif(0, 10)
),
data = milk
)
# kcal.per.g regressed on perc.lactose
m5.11 <- map(
alist(
kcal.per.g ~ dnorm(mu, sigma),
mu <- a + bl * perc.lactose,
a ~ dnorm(0.6, 10),
bl ~ dnorm(0, 1),
sigma ~ dunif(0, 10)
),
data = milk
)
lapply(list(fat = m5.10, lactose = m5.11), precis, digits = 3)
## $fat
## Mean StdDev 5.5% 94.5%
## a 0.301 0.036 0.244 0.358
## bf 0.010 0.001 0.008 0.012
## sigma 0.073 0.010 0.058 0.089
##
## $lactose
## Mean StdDev 5.5% 94.5%
## a 1.166 0.043 1.098 1.235
## bl -0.011 0.001 -0.012 -0.009
## sigma 0.062 0.008 0.049 0.075
m5.12 <- map(
alist(
kcal.per.g ~ dnorm(mu, sigma),
mu <- a + bf * perc.fat + bl * perc.lactose,
a ~ dnorm(0.6, 10),
bf ~ dnorm(0, 1),
bl ~ dnorm(0, 1),
sigma ~ dunif(0, 10)
),
data = milk
)
lapply(list(fat = m5.10, lactose = m5.11, `fat + lactose` = m5.12), precis, digits = 3)
## $fat
## Mean StdDev 5.5% 94.5%
## a 0.301 0.036 0.244 0.358
## bf 0.010 0.001 0.008 0.012
## sigma 0.073 0.010 0.058 0.089
##
## $lactose
## Mean StdDev 5.5% 94.5%
## a 1.166 0.043 1.098 1.235
## bl -0.011 0.001 -0.012 -0.009
## sigma 0.062 0.008 0.049 0.075
##
## $`fat + lactose`
## Mean StdDev 5.5% 94.5%
## a 1.007 0.200 0.688 1.327
## bf 0.002 0.002 -0.002 0.006
## bl -0.009 0.002 -0.013 -0.005
## sigma 0.061 0.008 0.048 0.074
GGally::ggpairs(milk %>% select(kcal.per.g, perc.fat, perc.lactose))
cor(milk$perc.fat, milk$perc.lactose)
## [1] -0.9416373
# alternative syntax using mosaic package
cor(perc.fat ~ perc.lactose, data = milk)
## [1] -0.9416373
library(rethinking)
data(milk)
collinearity_sim <-
function(rho = 0.9, data = milk) {
data <-
data %>%
mutate(
x = rnorm(nrow(data), mean = rho * perc.fat,
sd = sqrt((1 - rho^2) * var(perc.fat)))
)
model = lm(kcal.per.g ~ perc.fat + x, data = data)
sqrt(diag(vcov(model)))[2] # stddev of parameter
}
collinearity_sim <-
Vectorize(collinearity_sim, "rho")
SimData5.40 <-
expand.grid(
r = seq(from = 0, to = 0.99, by = 0.01),
rep = 1:100) %>%
mutate(
sd = collinearity_sim(r)
)
gf_point(sd ~ r, data = SimData5.40, alpha = 0.01) %>%
gf_spline(sd ~ r)
# simulate data where growth is inhibited by fungus, which is inhibit by soil treatments
# number of plants
SimData5.13 <-
expand.grid(
treatment = c(0, 1),
rep = 1:50 # 50 plants in each treatment group
) %>%
mutate(
height0 = rnorm(100, 10, 2), # initial heights of plants
# fungus grows in half of control group and 10% of treatment group
fungus = rbinom(100, size = 1, prob = 0.5 - 0.4 * treatment),
# mean growth is 5 without fungus, 2 with fungus
height1 = height0 + rnorm(100, 5 - 3 * fungus)
)
If we use treatment
and fungus
to predict growth, treatment
appears to have no effect. But we know it does (since we simulated it that way). The impact of treatment
is masked by the use of fungus
, since the way treatment
affects growth is by inhibiting fungus.
m5.13 <- map(
alist(
height1 ~ dnorm(mu, sigma),
mu <- a + bh * height0 + bt * treatment + bf * fungus,
a ~ dnorm(0, 100),
c(bh, bt, bf) ~ dnorm(0, 10),
sigma ~ dunif(0, 10)
),
data = SimData5.13
)
precis(m5.13)
## Mean StdDev 5.5% 94.5%
## a 5.13 0.54 4.26 6.00
## bh 1.01 0.05 0.93 1.09
## bt -0.28 0.25 -0.67 0.12
## bf -2.94 0.27 -3.37 -2.52
## sigma 1.06 0.07 0.94 1.18
If we remove fungus
, we can see the effect of treatment
.
m5.14 <- map(
alist(
height1 ~ dnorm(mu, sigma),
mu <- a + bh * height0 + bt * treatment,
a ~ dnorm(0, 100),
c(bh, bt) ~ dnorm(0, 10),
sigma ~ dunif(0, 10)
),
data = SimData5.13,
start = list(a = 2, bh = 1, bt = 1, sigma = 1)
)
precis(m5.14)
## Mean StdDev 5.5% 94.5%
## a 2.84 0.75 1.64 4.04
## bh 1.07 0.07 0.95 1.18
## bt 1.19 0.31 0.69 1.69
## sigma 1.57 0.11 1.39 1.75
data(Howell1)
m5.15 <-
map(
alist(
height ~ dnorm(mu, sigma),
mu <- a + bm * male,
a ~ dnorm(178, 100),
bm ~ dnorm(0, 10),
sigma ~ dunif(0, 50)
),
data = Howell1)
precis(m5.15)
## Mean StdDev 5.5% 94.5%
## a 134.83 1.59 132.29 137.38
## bm 7.28 2.28 3.63 10.93
## sigma 27.31 0.83 25.99 28.63
m5.15.post <-
extract.samples(m5.15) %>%
mutate(
mu.male = a + bm
)
PI(m5.15.post$mu.male)
## 5% 94%
## 139.4116 144.8732
m5.15b <-
map(
alist(
height ~ dnorm(mu, sigma),
mu <- af * (1 - male) + am * male,
af ~ dnorm(178, 100),
am ~ dnorm(178, 100),
sigma ~ dunif(0, 50)
),
data = Howell1,
start = list(af = 178, am = 178, sigma = 3)
)
Coding a \(k\)-level categorical variable as \(1, 2, 3, \dots k\), is not a good idea in most cases. (Think about what \(\mu_i = a + b x_i\) would say in that case – it’s usually not the model we want.) Instead, we need a different encoding that uses multiple predictors.
data(milk)
tally(~ clade, data = milk)
## clade
## Ape New World Monkey Old World Monkey Strepsirrhine
## 9 9 6 5
Here we create variables that are indicators for a particular level of clade
. That is, for each clade \(c\) we create a new variable \(x_c\) defined as
\[ x_c = [\![\mathrm{clade} = c ]\!] \]
milk <-
milk %>%
mutate(
clade.Ape = ifelse(clade == "Ape", 1, 0),
clade.NWM = ifelse(clade == "New World Monkey", 1, 0),
clade.OWM = ifelse(clade == "Old World Monkey", 1, 0),
clade.S = ifelse(clade == "Strepsirrhine", 1, 0)
)
milk %>% select(species, matches("clade")) %>% sample_n(5)
## species clade clade.Ape clade.NWM clade.OWM
## 18 M mulatta Old World Monkey 0 0 1
## 28 P troglodytes Ape 1 0 0
## 6 Alouatta seniculus New World Monkey 0 1 0
## 16 Miopithecus talpoin Old World Monkey 0 0 1
## 1 Eulemur fulvus Strepsirrhine 0 0 0
## clade.S
## 18 0
## 28 0
## 6 0
## 16 0
## 1 1
m5.16 <- map(
alist(
kcal.per.g ~ dnorm(mu, sigma),
mu <- a + b.NWM * clade.NWM + b.OWM * clade.OWM + b.S * clade.S,
a ~ dnorm(0.6, 10),
b.NWM ~ dnorm(0, 1),
b.OWM ~ dnorm(0, 1),
b.S ~ dnorm(0, 1),
sigma ~ dunif(0, 10)
),
data = milk
)
precis(m5.16)
## Mean StdDev 5.5% 94.5%
## a 0.55 0.04 0.49 0.61
## b.NWM 0.17 0.05 0.08 0.25
## b.OWM 0.24 0.06 0.15 0.34
## b.S -0.04 0.06 -0.14 0.06
## sigma 0.11 0.02 0.09 0.14
# sample posterior
m5.16.post <-
extract.samples(m5.16) %>%
mutate(
# compute averages for each category
mu.ape = a,
mu.NWM = a + b.NWM,
mu.OWM = a + b.OWM,
mu.S = a + b.S
)
# summarize using precis
# computes mean, sd and PI for each variable in data frame
precis(m5.16.post, digits = 3)
## Mean StdDev |0.89 0.89|
## a 0.546 0.038 0.486 0.606
## b.NWM 0.169 0.053 0.081 0.252
## b.OWM 0.242 0.061 0.148 0.341
## b.S -0.039 0.063 -0.140 0.061
## sigma 0.115 0.015 0.090 0.139
## mu.ape 0.546 0.038 0.486 0.606
## mu.NWM 0.715 0.038 0.654 0.775
## mu.OWM 0.788 0.047 0.711 0.860
## mu.S 0.507 0.051 0.425 0.588
quantile( ~ (mu.NWM - mu.OWM), data = m5.16.post, probs = c(0.025, 0.5, 0.975))
## 2.5% 50% 97.5%
## -0.19261998 -0.07340459 0.04437971
map()
provides ways to deal with all of the clade indicator variables systematically. First, we convert to the numbers \(1, 2, 3, 4\).
milk <-
milk %>%
mutate(
clade_id = coerce_index(clade)
)
milk %>% select(matches("clade")) %>% sample_n(5)
## clade clade.Ape clade.NWM clade.OWM clade.S clade_id
## 22 Ape 1 0 0 0 1
## 29 Ape 1 0 0 0 1
## 24 Ape 1 0 0 0 1
## 6 New World Monkey 0 1 0 0 2
## 13 New World Monkey 0 1 0 0 2
Now we fit a model with four coefficients a[1], a[2], a[3], a[4]
.
m5.16_alt <-
map(
alist(
kcal.per.g ~ dnorm(mu, sigma),
mu <- a[clade_id],
a[clade_id] ~ dnorm(0.6, 10),
sigma ~ dunif(0, 10)
),
data = milk)
precis(m5.16_alt, depth = 2)
## Mean StdDev 5.5% 94.5%
## a[1] 0.55 0.04 0.48 0.61
## a[2] 0.71 0.04 0.65 0.78
## a[3] 0.79 0.05 0.71 0.86
## a[4] 0.51 0.05 0.43 0.59
## sigma 0.11 0.02 0.09 0.14
Linear models can be fit in a non-Bayesian way using lm()
. Because lm()
knows the model is linear, it uses terser notation for the model – you just list the variables and lm()
supplies the coefficients to multiply by each one (and an intercept, unless you remove it). Also lm()
knows how to convert categorical data into multiple predictors, so you only have list the categorical variable and lm()
takes care of the rest.
m5.17 <- lm(y ~ 1 + x, data = d)
m5.18 <- lm(y ~ 1 + x + z + w, data = d)
m5.19a <- lm(y ~ 1 + x, data = d)
m5.19b <- lm(y ~ x, data = d)
m5.20 <- lm(y ~ 0 + x, data = d)
m5.21 <- lm(y ~ x - 1, data = d)
m5.22 <- lm(y ~ 1 + as.factor(season), data = d)
m5.23 <- lm(y ~ 1 + x + x2 + x3,
data = d %>% mutate(x2 = x^2, x3 = x^3))
m5.24 <- lm(y ~ 1 + x + I(x^2) + I(x^3), data = d)
If you know how to make a model using lm()
, glimmer()
will help you create a formula list for map()
that does the same job.
data(cars)
glimmer(dist ~ speed, data = cars)
## alist(
## dist ~ dnorm( mu , sigma ),
## mu <- Intercept +
## b_speed*speed,
## Intercept ~ dnorm(0,10),
## b_speed ~ dnorm(0,10),
## sigma ~ dcauchy(0,2)
## )
data(KidsFeet, package = "mosaicData")
glimmer(length ~ width + sex, data = KidsFeet)
## alist(
## length ~ dnorm( mu , sigma ),
## mu <- Intercept +
## b_width*width +
## b_sexG*sexG,
## Intercept ~ dnorm(0,10),
## b_width ~ dnorm(0,10),
## b_sexG ~ dnorm(0,10),
## sigma ~ dcauchy(0,2)
## )
# this shows how lm is coding the categorical variable
KidsFeet %>% head(5) %>% select(name, sex, length, width)
## name sex length width
## 1 David B 24.4 8.4
## 2 Lars B 25.4 8.8
## 3 Zach B 24.5 9.7
## 4 Josh B 25.2 9.8
## 5 Lang B 25.1 8.9
model.matrix(length ~ width + sex, data = KidsFeet) %>% head(5)
## (Intercept) width sexG
## 1 1 8.4 0
## 2 1 8.8 0
## 3 1 9.7 0
## 4 1 9.8 0
## 5 1 8.9 0
As you might guess, folks interested in a Bayesian approach to linar models (and some generalizations of linear models) have also written functions to make those models easier to describe and it. The rstanarm
and brms
packages are two such packages. Here’s a quick example using the default priors.
require(rstanarm)
## Loading required package: rstanarm
## Loading required package: Rcpp
## rstanarm (Version 2.14.1, packaged: 2017-01-16 18:47:11 UTC)
## - Do not expect the default priors to remain the same in future rstanarm versions.
## Thus, R scripts should specify priors explicitly, even if they are just the defaults.
## - For execution on a local, multicore CPU with excess RAM we recommend calling
## options(mc.cores = parallel::detectCores())
##
## Attaching package: 'rstanarm'
## The following object is masked from 'package:rethinking':
##
## se
model <- stan_lm(length ~ width + sex, data = KidsFeet, prior = NULL)
##
## SAMPLING FOR MODEL 'lm' NOW (CHAIN 1).
##
## Chain 1, Iteration: 1 / 2000 [ 0%] (Warmup)
## Chain 1, Iteration: 200 / 2000 [ 10%] (Warmup)
## Chain 1, Iteration: 400 / 2000 [ 20%] (Warmup)
## Chain 1, Iteration: 600 / 2000 [ 30%] (Warmup)
## Chain 1, Iteration: 800 / 2000 [ 40%] (Warmup)
## Chain 1, Iteration: 1000 / 2000 [ 50%] (Warmup)
## Chain 1, Iteration: 1001 / 2000 [ 50%] (Sampling)
## Chain 1, Iteration: 1200 / 2000 [ 60%] (Sampling)
## Chain 1, Iteration: 1400 / 2000 [ 70%] (Sampling)
## Chain 1, Iteration: 1600 / 2000 [ 80%] (Sampling)
## Chain 1, Iteration: 1800 / 2000 [ 90%] (Sampling)
## Chain 1, Iteration: 2000 / 2000 [100%] (Sampling)
## Elapsed Time: 0.774143 seconds (Warm-up)
## 0.778979 seconds (Sampling)
## 1.55312 seconds (Total)
##
##
## SAMPLING FOR MODEL 'lm' NOW (CHAIN 2).
##
## Chain 2, Iteration: 1 / 2000 [ 0%] (Warmup)
## Chain 2, Iteration: 200 / 2000 [ 10%] (Warmup)
## Chain 2, Iteration: 400 / 2000 [ 20%] (Warmup)
## Chain 2, Iteration: 600 / 2000 [ 30%] (Warmup)
## Chain 2, Iteration: 800 / 2000 [ 40%] (Warmup)
## Chain 2, Iteration: 1000 / 2000 [ 50%] (Warmup)
## Chain 2, Iteration: 1001 / 2000 [ 50%] (Sampling)
## Chain 2, Iteration: 1200 / 2000 [ 60%] (Sampling)
## Chain 2, Iteration: 1400 / 2000 [ 70%] (Sampling)
## Chain 2, Iteration: 1600 / 2000 [ 80%] (Sampling)
## Chain 2, Iteration: 1800 / 2000 [ 90%] (Sampling)
## Chain 2, Iteration: 2000 / 2000 [100%] (Sampling)
## Elapsed Time: 0.782155 seconds (Warm-up)
## 0.63203 seconds (Sampling)
## 1.41419 seconds (Total)
##
##
## SAMPLING FOR MODEL 'lm' NOW (CHAIN 3).
##
## Chain 3, Iteration: 1 / 2000 [ 0%] (Warmup)
## Chain 3, Iteration: 200 / 2000 [ 10%] (Warmup)
## Chain 3, Iteration: 400 / 2000 [ 20%] (Warmup)
## Chain 3, Iteration: 600 / 2000 [ 30%] (Warmup)
## Chain 3, Iteration: 800 / 2000 [ 40%] (Warmup)
## Chain 3, Iteration: 1000 / 2000 [ 50%] (Warmup)
## Chain 3, Iteration: 1001 / 2000 [ 50%] (Sampling)
## Chain 3, Iteration: 1200 / 2000 [ 60%] (Sampling)
## Chain 3, Iteration: 1400 / 2000 [ 70%] (Sampling)
## Chain 3, Iteration: 1600 / 2000 [ 80%] (Sampling)
## Chain 3, Iteration: 1800 / 2000 [ 90%] (Sampling)
## Chain 3, Iteration: 2000 / 2000 [100%] (Sampling)
## Elapsed Time: 0.904358 seconds (Warm-up)
## 0.588334 seconds (Sampling)
## 1.49269 seconds (Total)
##
##
## SAMPLING FOR MODEL 'lm' NOW (CHAIN 4).
##
## Chain 4, Iteration: 1 / 2000 [ 0%] (Warmup)
## Chain 4, Iteration: 200 / 2000 [ 10%] (Warmup)
## Chain 4, Iteration: 400 / 2000 [ 20%] (Warmup)
## Chain 4, Iteration: 600 / 2000 [ 30%] (Warmup)
## Chain 4, Iteration: 800 / 2000 [ 40%] (Warmup)
## Chain 4, Iteration: 1000 / 2000 [ 50%] (Warmup)
## Chain 4, Iteration: 1001 / 2000 [ 50%] (Sampling)
## Chain 4, Iteration: 1200 / 2000 [ 60%] (Sampling)
## Chain 4, Iteration: 1400 / 2000 [ 70%] (Sampling)
## Chain 4, Iteration: 1600 / 2000 [ 80%] (Sampling)
## Chain 4, Iteration: 1800 / 2000 [ 90%] (Sampling)
## Chain 4, Iteration: 2000 / 2000 [100%] (Sampling)
## Elapsed Time: 0.746751 seconds (Warm-up)
## 0.776023 seconds (Sampling)
## 1.52277 seconds (Total)
## Warning: There were 98 divergent transitions after warmup. Increasing adapt_delta above 0.95 may help. See
## http://mc-stan.org/misc/warnings.html#divergent-transitions-after-warmup
## Warning: Examine the pairs() plot to diagnose sampling problems
model
## stan_lm(formula = length ~ width + sex, data = KidsFeet, prior = NULL)
##
## Estimates:
## Median MAD_SD
## (Intercept) 11.1 3.2
## width 1.5 0.3
## sexG -0.1 0.4
## sigma 1.0 0.1
## log-fit_ratio 0.0 0.1
## R2 0.4 0.1
##
## Sample avg. posterior predictive
## distribution of y (X = xbar):
## Median MAD_SD
## mean_PPD 24.7 0.2