Code from Statistical Rethinking, modified by R Pruim, is shown below. Differences from the original include:

- ggplot2 (via ggformula) rather than base graphics
- tidyverse for data transformation
- Naming a data frame after one of its variables is not a very good idea, so let's rename the data Nations and add a continent variable, which is handy for plotting later.

library(rethinking)
data(rugged)
Nations <-
rugged %>%
mutate(
log_gdp = log(rgdppc_2000),
continent = ifelse(cont_africa, "Africa", "Other")
) %>%
filter(! is.na(log_gdp))
dim(Nations)
## [1] 170 53
dim(rugged)
## [1] 234 51
Here we partition the data into African and non-African nations.
# split countries into Africa and not-Africa
Nations_A1 <- Nations %>% filter(cont_africa == 1)
Nations_A0 <- Nations %>% filter(cont_africa == 0)
sapply(list(All = Nations, Africa = Nations_A1, Other = Nations_A0), nrow)
## All Africa Other
## 170 49 121
gf_point(log_gdp ~ rugged + color:continent, data = Nations)
Here’s a bad idea (because there are better ideas): We could fit models on the African and non-African countries separately.
# African nations
m7.1 <-
map(
alist(
log_gdp ~ dnorm(mu, sigma),
mu <- a + bR * rugged,
a ~ dnorm(8, 100),
bR ~ dnorm(0, 1),
sigma ~ dunif(0, 10)
),
data = Nations_A1)
# non-African nations
m7.2 <-
map(
alist(
log_gdp ~ dnorm(mu, sigma),
mu <- a + bR * rugged,
a ~ dnorm(8, 100),
bR ~ dnorm(0, 1),
sigma ~ dunif(0, 10)
),
data = Nations_A0)
This models the association between GDP and ruggedness (using all the nations).
m7.3 <-
map(
alist(
log_gdp ~ dnorm(mu, sigma),
mu <- a + bR * rugged,
a ~ dnorm(8, 100),
bR ~ dnorm(0, 1),
sigma ~ dunif(0, 10)
),
data = Nations)
This model includes an additional predictor: whether or not the country is in Africa.
m7.4 <- map(
alist(
log_gdp ~ dnorm(mu, sigma),
mu <- a + bR * rugged + bA * cont_africa,
a ~ dnorm(8, 100),
bR ~ dnorm(0, 1),
bA ~ dnorm(0, 1),
sigma ~ dunif(0, 10)
),
data = Nations
)
compare(m7.3, m7.4)
## WAIC pWAIC dWAIC weight SE dSE
## m7.4 476.3 4.4 0.0 1 15.36 NA
## m7.3 539.8 2.8 63.5 0 13.31 15.14
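The weight column here is an Akaike-style weight computed from the WAIC differences, so m7.4 gets essentially all of the weight. A small added sketch (not from the original notes) showing how the weights can be reproduced from the dWAIC column above:
# Added sketch: recompute the model weights from the dWAIC values shown above
dWAIC <- c(m7.4 = 0, m7.3 = 63.5)
round(exp(-0.5 * dWAIC) / sum(exp(-0.5 * dWAIC)), 3)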
This is quite different in approach from the code in the book. Instead of creating two separate data sets, we can do this all at once, using expand.grid() to create a sequence of rugged values both in and out of Africa. The advantage is that all of the predictions land in one data frame, which makes it easy to plot them with color and fill mapped to continent.
And let's use better names for things as well. In particular, we'll name the data frame that is collecting predictions after the thing being predicted. (We don't want to name it after the model, because we will be comparing different models' predictions.)
Loggdp_predict <-
expand.grid(
cont_africa = 0:1,
rugged = seq(from = 0, to = 8, by = 0.2)
) %>%
mutate(
continent = ifelse(cont_africa, "Africa", "Other")
)
m7.4_link <- link(m7.4, data = Loggdp_predict, refresh = 0)
# add in means and intervals using m7.4
Loggdp_predict <-
Loggdp_predict %>%
mutate(
m7.4.mu = apply(m7.4_link, 2, mean),
m7.4.lo = apply(m7.4_link, 2, PI, prob = 0.97)[1,],
m7.4.hi = apply(m7.4_link, 2, PI, prob = 0.97)[2,]
)
gf_line(m7.4.mu ~ rugged + color:continent,
data = Loggdp_predict) %>%
gf_ribbon(m7.4.lo + m7.4.hi ~ rugged + fill:continent,
data = Loggdp_predict)
With the original data overlaid:
last_plot() %>%
gf_point(log_gdp ~ rugged + color:continent, data = Nations)
A square root transformation on the x-axis helps to spread out the cluster of dots near 0.
last_plot() %>%
gf_refine(scale_x_continuous(trans = "sqrt"))
Residuals
# computing residuals is expensive, so let's save them
m7.4.resid <- resid(m7.4)
## Residuals computed assuming outcome = log_gdp
gf_point(m7.4.resid ~ rugged + color:continent, data = Nations) %>%
gf_smooth(m7.4.resid ~ rugged + color:continent, data = Nations, alpha = 0.2)
## `geom_smooth()` using method = 'loess'
last_plot() %>%
gf_refine(scale_x_continuous(trans = "sqrt"))
## `geom_smooth()` using method = 'loess'
This reveals several things:
A model with “interaction”.
m7.5 <-
map(
alist(
log_gdp ~ dnorm(mu, sigma),
mu <- a + gamma * rugged + bA * cont_africa,
gamma <- bR + bAR * cont_africa,
a ~ dnorm(8, 100),
bA ~ dnorm(0, 1),
bR ~ dnorm(0, 1),
bAR ~ dnorm(0, 1),
sigma ~ dunif(0, 10)
),
data = Nations
)
compare(m7.3, m7.4, m7.5)
## WAIC pWAIC dWAIC weight SE dSE
## m7.5 469.3 5.1 0.0 0.97 15.06 NA
## m7.4 476.0 4.2 6.8 0.03 15.25 6.10
## m7.3 539.7 2.7 70.4 0.00 13.21 15.08
An alternative coding for m7.5, saved here as m7.5b.
m7.5b <-
map(
alist(
log_gdp ~ dnorm(mu, sigma),
mu <- a + bR * rugged + bAR * rugged * cont_africa + bA * cont_africa,
a ~ dnorm(8, 100),
bA ~ dnorm(0, 1),
bR ~ dnorm(0, 1),
bAR ~ dnorm(0, 1),
sigma ~ dunif(0, 10)
),
data = Nations
)
We can add the predictions from the new model to our existing Loggdp_predict
data frame since we want the same rows – just additional columns.
m7.5_link <- link(m7.5, data = Loggdp_predict, refresh = 0)
# add in means and intervals
Loggdp_predict <-
Loggdp_predict %>%
mutate(
m7.5.mu = apply(m7.5_link, 2, mean),
m7.5.lo = apply(m7.5_link, 2, PI, prob = 0.97)[1,],
m7.5.hi = apply(m7.5_link, 2, PI, prob = 0.97)[2,]
)
This model looks quite different.
gf_line(m7.5.mu ~ rugged + color:continent,
data = Loggdp_predict) %>%
gf_ribbon(m7.5.lo + m7.5.hi ~ rugged + fill:continent,
data = Loggdp_predict)
Now let’s compare the model to the data used to fit it.
# plot African nations with regression
gf_point(log_gdp ~ rugged + color:continent,
data = Nations) %>%
gf_line(m7.5.mu ~ rugged + col:continent, data = Loggdp_predict) %>%
gf_ribbon(m7.5.lo + m7.5.hi ~ rugged + fill:continent,
data = Loggdp_predict) %>%
gf_labs(y = "log GDP year 2000", x = "Terrain Ruggedness Index") %>%
gf_facet_grid(~ continent)
precis(m7.5)
## Mean StdDev 5.5% 94.5%
## a 9.18 0.14 8.97 9.40
## bA -1.85 0.22 -2.20 -1.50
## bR -0.18 0.08 -0.31 -0.06
## bAR 0.35 0.13 0.14 0.55
## sigma 0.93 0.05 0.85 1.01
We can compute \(\gamma_i\) for each posterior sample; \(\gamma_i\) only depends on whether a country is in Africa or not.
m7.5_post <- extract.samples(m7.5) %>%
mutate(
gamma.Africa = bR + bAR * 1,
gamma.notAfrica = bR + bAR * 0
)
Means of posterior \(\gamma_i\) values.
mean(m7.5_post$gamma.Africa)
## [1] 0.1628399
mean(m7.5_post$gamma.notAfrica)
## [1] -0.1827452
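These match the precis() output above: the slope in Africa is bR + bAR, and the slope elsewhere is just bR. A quick added check using the MAP estimates (not part of the original notes):
# Added check: the posterior means of gamma should be close to these combinations
coef(m7.5)["bR"] + coef(m7.5)["bAR"]   # compare with mean(gamma.Africa)
coef(m7.5)["bR"]                       # compare with mean(gamma.notAfrica)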
Posterior distributions of \(\gamma_i\)’s.
gf_dens(~gamma.Africa + color::"Africa", data = m7.5_post) %>%
gf_dens(~gamma.notAfrica + color::"Other", data = m7.5_post) %>%
gf_labs(x = "gamma")
Posterior distribution of the difference between the two \(\gamma_i\) values.
gf_dens( ~ (gamma.Africa - gamma.notAfrica), data = m7.5_post)
prop( ~(gamma.Africa < gamma.notAfrica), data = m7.5_post)
## TRUE
## 0.0036
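We can also summarize the difference with a percentile interval rather than a single proportion; a short added sketch:
# Added sketch: 89% percentile interval for the difference in slopes
PI(m7.5_post$gamma.Africa - m7.5_post$gamma.notAfrica, prob = 0.89)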
Here’s the original code from the book.
# get minimum and maximum rugged values
q.rugged <- range(Nations$rugged)
# compute lines and confidence intervals
mu.ruggedlo <-
link(m7.5, data = data.frame(rugged = q.rugged[1],
cont_africa = 0:1),
refresh = 0)
mu.ruggedlo.mean <- apply(mu.ruggedlo, 2, mean)
mu.ruggedlo.PI <- apply(mu.ruggedlo, 2, PI)
mu.ruggedhi <-
link(m7.5,
data = data.frame(rugged = q.rugged[2],
cont_africa = 0:1),
refresh = 0)
mu.ruggedhi.mean <- apply(mu.ruggedhi, 2, mean)
mu.ruggedhi.PI <- apply(mu.ruggedhi, 2, PI)
# plot it all, splitting points at median
med.r <- median(Nations$rugged)
ox <- ifelse(Nations$rugged > med.r, 0.05,-0.05)
plot(
Nations$cont_africa + ox,
log(Nations$rgdppc_2000),
col = ifelse(Nations$rugged > med.r, rangi2, "black"),
xlim = c(-0.25, 1.25),
xaxt = "n",
ylab = "log GDP year 2000",
xlab = "Continent"
)
axis(1, at = c(0, 1), labels = c("other", "Africa"))
lines(0:1, mu.ruggedlo.mean, lty = 2)
shade(mu.ruggedlo.PI, 0:1)
lines(0:1, mu.ruggedhi.mean, col = rangi2)
shade(mu.ruggedhi.PI, 0:1, col = col.alpha(rangi2, 0.25))
We can simplify this a bit.

- We don't need to run link() again – we've already run it.
- We don't have the exact min and max of rugged in our Loggdp_predict data, but it's close. (And we could use a finer mesh if we wanted to.)
- By putting a rugged_group variable in both data sets, we can easily color/fill/group based on that value.
- By using position_dodge() and our continent variable, we can create the entire plot using just three layers. (But notice the use of the group aesthetic to override the default grouping that would have happened.)
mrugged <- median(~rugged, data = Nations); mrugged
## [1] 0.9795
Nations <-
Nations %>%
mutate(
rugged_group = ifelse( rugged > mrugged, "hi", "lo"))
Loggdp_predict <-
Loggdp_predict %>%
mutate(
rugged_group = ifelse( rugged > mrugged, "hi", "lo"))
gf_ribbon(
m7.5.lo + m7.5.hi ~ continent + fill:rugged_group + group:rugged_group,
data = Loggdp_predict %>% filter(rugged %in% c(0, 6))) %>%
gf_line(m7.5.mu ~ continent + color:rugged_group + group:rugged_group,
data = Loggdp_predict %>% filter(rugged %in% c(0, 6))) %>%
gf_point( log_gdp ~ continent + color:rugged_group, data = Nations,
position = position_dodge(width = 0.2), alpha = 0.5)
Here’s a fancier version that uses jitter and doge to minimize the issue of overlapping dots.
gf_ribbon(
m7.5.lo + m7.5.hi ~ continent + fill:rugged_group + group:rugged_group,
data = Loggdp_predict %>% filter(rugged %in% c(0, 6))) %>%
gf_line(m7.5.mu ~ continent + color:rugged_group + group:rugged_group,
data = Loggdp_predict %>% filter(rugged %in% c(0, 6))) %>%
gf_point( log_gdp ~ continent + color:rugged_group, data = Nations,
position = position_jitterdodge(jitter.width = 0.1, dodge.width = 0.2),
alpha = 0.5)
Note: This plot gives a bit of a distorted view. The dots of each color are not all at the lowest/highest value of rugged. Using the 1st and 3rd quartiles seems fairer. That way some dots of each color will be above and some below the value of ruggedness used for the fits shown in that color.
Another option would be to introduce new colors so that we are less inclined to draw the false conclusion.
quantile(Nations$rugged)
## 0% 25% 50% 75% 100%
## 0.00300 0.44225 0.97950 1.95725 6.20200
# Note: 0.4 and 2 are the prediction-grid values closest to the 1st and 3rd
# quartiles; 0.5 does not occur in the seq(0, 8, by = 0.2) grid.
gf_ribbon(
m7.5.lo + m7.5.hi ~ continent + fill:rugged_group + group:rugged_group,
data = Loggdp_predict %>% filter(rugged %in% c(0.4, 2))) %>%
gf_line(m7.5.mu ~ continent + color:rugged_group + group:rugged_group,
data = Loggdp_predict %>% filter(rugged %in% c(0.4, 2))) %>%
gf_point( log_gdp ~ continent + color:rugged_group, data = Nations,
position = position_jitterdodge(jitter.width = 0.1, dodge.width = 0.2),
alpha = 0.5)
gf_ribbon(
m7.5.lo + m7.5.hi ~ continent + fill:rugged_group + group:rugged_group,
data = Loggdp_predict %>% filter(rugged %in% c(0, 6))) %>%
gf_line(m7.5.mu ~ continent + color:"gray50" + group:rugged_group,
data = Loggdp_predict %>% filter(rugged %in% c(0, 6))) %>%
gf_point( log_gdp ~ continent + color:rugged_group, data = Nations,
position = position_jitterdodge(jitter.width = 0.1, dodge.width = 0.2),
alpha = 0.5) %>%
gf_refine(
scale_fill_manual(values = c("orange", "navy"))
)
As expected, the differences don’t look as pronounced when we use the 1st and 3rd quartiles.
library(rethinking)
data(tulips)
Tulips <- tulips # local version where we will add some variables later
glimpse(Tulips)
## Observations: 27
## Variables: 4
## $ bed <fctr> a, a, a, a, a, a, a, a, a, b, b, b, b, b, b, b, b, b, ...
## $ water <int> 1, 1, 1, 2, 2, 2, 3, 3, 3, 1, 1, 1, 2, 2, 2, 3, 3, 3, 1...
## $ shade <int> 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1...
## $ blooms <dbl> 0.00, 0.00, 111.04, 183.47, 59.16, 76.75, 224.97, 83.77...
m7.6 <- map(
alist(
blooms ~ dnorm(mu, sigma),
mu <- a + bW * water + bS * shade,
a ~ dnorm(0, 100),
bW ~ dnorm(0, 100),
bS ~ dnorm(0, 100),
sigma ~ dunif(0, 100)
),
data = Tulips
)
## Error in map(alist(blooms ~ dnorm(mu, sigma), mu <- a + bW * water + bS * : non-finite finite-difference value [4]
## Start values for parameters may be too far from MAP.
## Try better priors or use explicit start values.
## If you sampled random start values, just trying again may work.
## Start values used in this attempt:
## a = 147.288355316427
## bW = 229.137580192806
## bS = 157.52049939085
## sigma = 56.8666083738208
m7.7 <- map(
alist(
blooms ~ dnorm(mu, sigma),
mu <- a + bW * water + bS * shade + bWS * water * shade,
a ~ dnorm(0, 100),
bW ~ dnorm(0, 100),
bS ~ dnorm(0, 100),
bWS ~ dnorm(0, 100),
sigma ~ dunif(0, 100)
),
data = Tulips
)
## Error in map(alist(blooms ~ dnorm(mu, sigma), mu <- a + bW * water + bS * : non-finite finite-difference value [5]
## Start values for parameters may be too far from MAP.
## Try better priors or use explicit start values.
## If you sampled random start values, just trying again may work.
## Start values used in this attempt:
## a = 64.7330949238793
## bW = -97.859321311001
## bS = -64.7183755859978
## bWS = -177.368707970701
## sigma = 52.8894809773192
m7.6 <- map(
alist(
blooms ~ dnorm(mu, sigma),
mu <- a + bW * water + bS * shade,
a ~ dnorm(0, 100),
bW ~ dnorm(0, 100),
bS ~ dnorm(0, 100),
sigma ~ dunif(0, 100)
),
data = Tulips,
method = "Nelder-Mead",
control = list(maxit = 1e4)
)
m7.7 <- map(
alist(
blooms ~ dnorm(mu, sigma),
mu <- a + bW * water + bS * shade + bWS * water * shade,
a ~ dnorm(0, 100),
bW ~ dnorm(0, 100),
bS ~ dnorm(0, 100),
bWS ~ dnorm(0, 100),
sigma ~ dunif(0, 100)
),
data = Tulips,
method = "Nelder-Mead",
control = list(maxit = 1e4)
)
coeftab(m7.6, m7.7)
## Warning in sqrt(diag(vcov(model))): NaNs produced
## m7.6 m7.7
## a 53.60 2.62
## bW 76.31 110.85
## bS -38.94 -4.39
## sigma 57.39 98.45
## bWS NA -21.34
## nobs 27 27
compare(m7.6, m7.7)
## Error in mvrnorm(n = n, mu = mu, Sigma = vcov(object)): 'Sigma' is not positive definite
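The NaN warning from coeftab() and the error from compare() are symptoms of the same problem: the quadratic approximation did not settle on a proper (positive definite) covariance matrix. An added diagnostic sketch (not in the original notes):
# Added sketch: negative entries on the diagonal of the variance-covariance
# matrix signal a failed quadratic approximation; that is what produces the
# NaNs above and keeps compare() from simulating from the posterior.
sapply(list(m7.6 = m7.6, m7.7 = m7.7), function(m) min(diag(vcov(m))))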
Tulips <-
Tulips %>%
mutate(
shade.c = shade - mean(shade),
water.c = water - mean(water)
)
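One thing centering buys us (an added check, not part of the original notes): in this balanced design, the interaction column becomes uncorrelated with the main-effect columns, which makes models like m7.9 better behaved and the main-effect coefficients easier to interpret.
# Added check: correlation between a main effect and the interaction column,
# before and after centering the predictors
with(Tulips, cor(water, water * shade))        # raw predictors: substantial correlation
with(Tulips, cor(water.c, water.c * shade.c))  # centered: zero for this balanced design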
m7.8 <- map(
alist(
blooms ~ dnorm(mu, sigma),
mu <- a + bW * water.c + bS * shade.c,
a ~ dnorm(130, 100),
bW ~ dnorm(0, 100),
bS ~ dnorm(0, 100),
sigma ~ dunif(0, 100)
),
data = Tulips,
start = list(
a = mean(Tulips$blooms),
bW = 0,
bS = 0,
sigma = sd(Tulips$blooms)
)
)
m7.9 <- map(
alist(
blooms ~ dnorm(mu, sigma),
mu <- a + bW * water.c + bS * shade.c + bWS * water.c * shade.c,
a ~ dnorm(130, 100),
bW ~ dnorm(0, 100),
bS ~ dnorm(0, 100),
bWS ~ dnorm(0, 100),
sigma ~ dunif(0, 100)
),
data = Tulips,
start = list(
a = mean(Tulips$blooms),
bW = 0,
bS = 0,
bWS = 0,
sigma = sd(Tulips$blooms)
)
)
coeftab(m7.8, m7.9)
## m7.8 m7.9
## a 129.00 129.01
## bW 74.22 74.96
## bS -40.74 -41.14
## sigma 57.35 45.22
## bWS NA -51.87
## nobs 27 27
k <- coef(m7.7); k
## a bW bS bWS sigma
## 2.616795 110.851737 -4.390966 -21.344340 98.445401
k[1] + k[2] * 2 + k[3] * 2 + k[4] * 2 * 2
## a
## 130.161
# here's a safer/clearer way
k["a"] + k["bW"] * 2 + k["bS"] * 2 + k["bWS"] * 2 * 2
## a
## 130.161
# annoyed by the name a? (parens required here)
(k["a"] + k["bW"] * 2 + k["bS"] * 2 + k["bWS"] * 2 * 2) %>% setNames("mu.S2W2")
## mu.S2W2
## 130.161
k <- coef(m7.9); k
## a bW bS bWS sigma
## 129.00797 74.95946 -41.14054 -51.87265 45.22497
(k["a"] + k["bW"] * 0 + k["bS"] * 0 + k["bWS"] * 0 * 0) %>% setNames("mu.S0W0")
## mu.S0W0
## 129.008
precis(m7.9)
## Mean StdDev 5.5% 94.5%
## a 129.01 8.67 115.15 142.87
## bW 74.96 10.60 58.02 91.90
## bS -41.14 10.60 -58.08 -24.20
## bWS -51.87 12.95 -72.57 -31.18
## sigma 45.22 6.15 35.39 55.06
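As an aside, the fitted value computed above from coef(m7.9) at the center of the design (water.c = shade.c = 0, i.e., water = shade = 2) can also be obtained with link(), which averages over posterior samples instead of plugging in MAP estimates. An added sketch:
# Added sketch: same fitted value via link() instead of combining coefficients by hand
mu_center <- link(m7.9, data = data.frame(water.c = 0, shade.c = 0), refresh = 0)
mean(mu_center)   # should be close to 129, matching the value from coef(m7.9)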
Blooms_predict <-
expand.grid(shade.c = -1:1, water.c = -1:1)
m7.8.link <- link(m7.8, data = Blooms_predict, refresh = 0)
m7.9.link <- link(m7.9, data = Blooms_predict, refresh = 0)
# We can put two models together above and below or side by side.
# Above/below lets us use facets to compare the model predictions.
Blooms_predict <-
bind_rows(
Blooms_predict %>%
mutate(
mu = apply(m7.8.link, 2, mean),
lo = apply(m7.8.link, 2, PI, 0.97)[1,],
hi = apply(m7.8.link, 2, PI, 0.97)[2,],
model = "m7.8"
),
Blooms_predict %>%
mutate(
mu = apply(m7.9.link, 2, mean),
lo = apply(m7.9.link, 2, PI, 0.97)[1,],
hi = apply(m7.9.link, 2, PI, 0.97)[2,],
model = "m7.9"
)
)
gf_ribbon(lo + hi ~ shade.c, data = Blooms_predict, fill = "navy") %>%
gf_line(mu ~ shade.c, data = Blooms_predict) %>%
gf_point(blooms ~ shade.c, data = Tulips, alpha = 0.7, color = "navy") %>%
gf_facet_grid(model ~ paste0("water.c = ", water.c))
m7.x <- lm(blooms ~ water + shade + water:shade, data = Tulips)
coef(m7.x)
## (Intercept) water shade water:shade
## -150.81074 181.50500 64.10056 -52.85167
# shorthand notation
m7.x <- lm(blooms ~ water*shade, data = Tulips)
coef(m7.x)
## (Intercept) water shade water:shade
## -150.81074 181.50500 64.10056 -52.85167
m7.x <- lm(blooms ~ water*shade - shade, data = Tulips)
coef(m7.x)
## (Intercept) water water:shade
## -22.60963 126.56167 -25.38000
m7.x <- lm(blooms ~ water*shade*bed, data = Tulips)
coef(m7.x)
## (Intercept) water shade bedb
## -186.253333 155.971667 86.246667 137.938889
## bedc water:shade water:bedb water:bedc
## -31.611111 -50.265000 11.911667 64.688333
## shade:bedb shade:bedc water:shade:bedb water:shade:bedc
## -75.431667 8.993333 8.640000 -16.400000
x <- z <- w <- 1
colnames(model.matrix( ~ x * z * w))
## [1] "(Intercept)" "x" "z" "w" "x:z"
## [6] "x:w" "z:w" "x:z:w"
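The same expansion rules explain why water*shade - shade above keeps water and water:shade but drops the shade main effect; a small added sketch using the toy variables from the chunk above:
# Added sketch: dropping a main effect from the shorthand keeps the interaction
colnames(model.matrix(~ x * z - z))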
data(nettle)
Nettle <-
nettle %>%
mutate(lang.per.cap = num.lang / k.pop)
head(Nettle, 3)
## country num.lang area k.pop num.stations mean.growing.season
## 1 Algeria 18 2381741 25660 102 6.60
## 2 Angola 42 1246700 10303 50 6.22
## 3 Australia 234 7713364 17336 134 6.00
## sd.growing.season lang.per.cap
## 1 2.29 0.0007014809
## 2 1.87 0.0040764826
## 3 4.17 0.0134979234