These examples appear in our text, but the code here is a bit different. In particular, I’m appending _star
to variable names to indicate that they are calcualted for each of the resamples.
set.seed(1234)
Dimes_boot <-
do(2000) * df_stats(~ mass, data = resample(Dimes),
xbar_star = mean, s_star = sd, n = length)
head(Dimes_boot)
## response xbar_star s_star n .row .index
## 1 mass 2.253567 0.02263151 30 1 1
## 2 mass 2.249367 0.01978938 30 1 2
## 3 mass 2.254900 0.02087458 30 1 3
## 4 mass 2.261467 0.02220086 30 1 4
## 5 mass 2.254367 0.02060714 30 1 5
## 6 mass 2.254367 0.02191281 30 1 6
xbar <- mean( ~mass, data = Dimes); xbar
## [1] 2.258233
s <- sd( ~mass, data = Dimes); s
## [1] 0.02206993
SE <- sd( ~xbar_star, data = Dimes_boot); SE
## [1] 0.003925842
xbar + c(-1, 1) * qt(0.975, df= 29) * SE
## [1] 2.250204 2.266263
cdata(~xbar_star, data = Dimes_boot)
## lower upper central.p
## 2.5% 2.250666 2.265668 0.95
Note: the usual method is to use \(s\) from our original sample, not \(s^*\) from each particular bootstrap sample. This avoids one level of approximation, and for small samples, it avoids the possibility of drawing a bootstrap sample with a very small standard deviation, which would lead to a very large value of \(t^*\). In this example, we see that it would make very little difference.
xbar <- mean(~mass, data = Dimes) # sample mean estimates population mean below
s <- sd(~mass, data = Dimes) # sample sd estimates population sd below
Dimes_boot <-
Dimes_boot |>
mutate(
t = (xbar_star - xbar) / (s/sqrt(n)), # this is the usual way
t2 = (xbar_star - xbar) / (s_star/sqrt(n)), # this is not usually done
)
head(Dimes_boot)
## response xbar_star s_star n .row .index t t2
## 1 mass 2.253567 0.02263151 30 1 1 -1.1581546 -1.1294161
## 2 mass 2.249367 0.01978938 30 1 2 -2.2004938 -2.4540807
## 3 mass 2.254900 0.02087458 30 1 3 -0.8272533 -0.8746243
## 4 mass 2.261467 0.02220086 30 1 4 0.8024357 0.7977032
## 5 mass 2.254367 0.02060714 30 1 5 -0.9596138 -1.0277317
## 6 mass 2.254367 0.02191281 30 1 6 -0.9596138 -0.9664943
gf_dens(~t, data = Dimes_boot, color = ~ "t") |>
gf_dens(~t2, data = Dimes_boot, color = ~ "t2")
SE <- s / sqrt(30); SE
## [1] 0.004029399
t_star <- cdata(~t, data = Dimes_boot, p = 0.95); t_star
## lower upper central.p
## 2.5% -1.878072 1.845188 0.95
xbar - t_star[2:1] * SE
## upper lower
## 2.5% 2.250798 2.265801
Small sample, but clearly not coming from a normal population. Skewed right.
Lifetime <-
tibble(life = c(16, 34, 53, 75, 93, 120, 150, 191, 240, 339))
gf_histogram(~life, data = Lifetime, bins = 5)
gf_qq(~life, data = Lifetime, bins = 5)
Life_boot <-
do(2000) * df_stats(~life, data = resample(Lifetime),
xbar_star = mean, s_star = sd, n = length)
head(Life_boot)
## response xbar_star s_star n .row .index
## 1 life 175.9 113.42589 10 1 1
## 2 life 96.7 40.44214 10 1 2
## 3 life 91.6 67.38975 10 1 3
## 4 life 84.5 68.30365 10 1 4
## 5 life 77.9 58.51961 10 1 5
## 6 life 148.0 103.88241 10 1 6
gf_histogram(~xbar_star, data = Life_boot)
xbar <- mean(~life, data = Lifetime)
SE <- sd(~xbar_star, data = Life_boot)
xbar + c(-1, 1) * qt(0.975, df = 9) * SE
## [1] 62.30303 199.89697
cdata(~xbar_star, data = Life_boot)
## lower upper central.p
## 2.5% 76.595 194.3 0.95
xbar <- mean(~life, data = Lifetime) # sample mean estiamtes population mean below
s <- sd(~life, data = Lifetime) # sample sd estimates population sd below
# add t stat to bootstrap data
Life_boot <-
Life_boot |>
mutate(
t = (xbar - xbar_star) / (s / sqrt(n))
)
head(Life_boot)
## response xbar_star s_star n .row .index t
## 1 life 175.9 113.42589 10 1 1 -1.3972366
## 2 life 96.7 40.44214 10 1 2 1.0728781
## 3 life 91.6 67.38975 10 1 3 1.2319385
## 4 life 84.5 68.30365 10 1 4 1.4533756
## 5 life 77.9 58.51961 10 1 5 1.6592185
## 6 life 148.0 103.88241 10 1 6 -0.5270826
gf_dens( ~t, data = Life_boot, color = ~"t")
SE <- s / sqrt(10); SE
## [1] 32.06329
t_star <- cdata(~t, data = Life_boot, p = 0.95); t_star
## lower upper central.p
## 2.5% -1.971102 1.699919 0.95
xbar - t_star[2:1] * SE
## upper lower
## 2.5% 76.595 194.3