Code from Statistical Rethinking modified by R Pruim is shown below. Differences to the oringal include:
ggplot2
(via ggformula
) rather than base graphicstidyverse
for data transformationThe examples below illustrate that the Binomial distributions are the maximum intropy distributions among distributions with a fixed expected number of successes.
q <- list(
A = c(0, 0, 10, 0, 0),
B = c(0, 1, 8, 1, 0),
C = c(0, 2, 6, 2, 0),
D = c(1, 2, 4, 2, 1),
E = c(2, 2, 2, 2, 2))
# convert counts to probabilities
p <- purrr::map(q, ~ .x / sum(.x))
p
## $A
## [1] 0 0 1 0 0
##
## $B
## [1] 0.0 0.1 0.8 0.1 0.0
##
## $C
## [1] 0.0 0.2 0.6 0.2 0.0
##
## $D
## [1] 0.1 0.2 0.4 0.2 0.1
##
## $E
## [1] 0.2 0.2 0.2 0.2 0.2
# Entropy function -- being careful to handle 0 probabilities correctly
H <- function(p) {
- sum(ifelse(p > 0, p * log(p), 0))
}
sapply(p, H)
## A B C D E
## 0.0000000 0.6390319 0.9502705 1.4708085 1.6094379
ways <- c(1, 90, 1260, 37800, 113400)
log(ways) / 10
## [1] 0.0000000 0.4499810 0.7138867 1.0540064 1.1638677
# build list of the candidate distributions
p <- list(
A = c(1 / 4, 1 / 4, 1 / 4, 1 / 4),
B = c(2 / 6, 1 / 6, 1 / 6, 2 / 6),
C = c(1 / 6, 2 / 6, 2 / 6, 1 / 6),
D = c(1 / 8, 4 / 8, 2 / 8, 1 / 8)
)
# compute expected value of each -- should all be the same
sapply(p, function(x) sum(x * c(0, 1, 1, 2)))
## A B C D
## 1 1 1 1
# compute entropy of each distribution
sapply(p, H)
## A B C D
## 1.386294 1.329661 1.329661 1.213008
D <- data_frame(dist = LETTERS[1:4], H = sapply(p, H))
gf_point(H ~ dist, data = D)
p <- 0.7
A <- c((1 - p) ^ 2, p * (1 - p), (1 - p) * p, p ^ 2)
A
## [1] 0.09 0.21 0.21 0.49
H(A)
## [1] 1.221729
This version of sim.p()
prepares a nice data frame with all of the results. The (calculaed) expected value is addd for good measure. It should always equal the target expected value given in the call of the function.
sim.p <- function(n = 1e4, E = 1.4) {
Q <- matrix(runif(3 *n), ncol = 3)
q4 <- (E * (Q[, 1] + Q[, 2] + Q[, 3]) - Q[, 2] - Q[, 3]) / (2 - E)
Q <- cbind(Q, q4)
S <- apply(Q, 1, sum)
P <- Q / S
data_frame(
p1 = P[, 1],
p2 = P[, 2],
p3 = P[, 3],
p4 = P[, 4],
H = apply(P, 1, H),
E = apply(P, 1, function(q) sum(q * c(0, 1, 1, 2)))
)
}
sim.p(3)
## # A tibble: 3 × 6
## p1 p2 p3 p4 H E
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.01052492 0.46332043 0.1156297 0.4105249 1.0193330 1.4
## 2 0.23407434 0.05303432 0.0788170 0.6340743 0.9847772 1.4
## 3 0.17010913 0.14255252 0.1172292 0.5701091 1.1506709 1.4
Now let’s simulate a bunch of these and compare them to our distribution A.
Sims <- sim.p(1e5)
gf_histogram( ~ H, data = Sims, binwidth = 0.01, boundary = H(A)) %>%
gf_vline(xintercept = H(A), color = "red")
Since our simulations are preparing a nice data frame, we can get the information we want very easily.
H(A)
## [1] 1.221729
Sims %>% arrange(-H) %>% head(4) %>% round(4)
## # A tibble: 4 × 6
## p1 p2 p3 p4 H E
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.0900 0.2101 0.2099 0.4900 1.2217 1.4
## 2 0.0898 0.2101 0.2103 0.4898 1.2217 1.4
## 3 0.0899 0.2104 0.2098 0.4899 1.2217 1.4
## 4 0.0904 0.2094 0.2099 0.4904 1.2217 1.4