Games <- readr::read_csv("Data/kenpom2019-games.csv") %>%
filter(away %in% .$home) %>%
filter(home %in% .$away)
## Parsed with column specification:
## cols(
## date = col_character(),
## away = col_character(),
## as = col_double(),
## home = col_character(),
## hs = col_double(),
## n = col_character(),
## location = col_character()
## )
teams <- sort(unique(c(Games$home, Games$away)))
index <- 1:length(teams) %>% setNames(teams)
steams <- standardizedTeamNames(teams)
Games <-
Games %>%
mutate(
home_win = as.numeric(hs > as),
h = as.numeric(factor(home, levels = teams)),
a = as.numeric(factor(away, levels = teams)),
home_game = as.numeric(! grepl("N", n, ignore.case = TRUE))
)
We can do a little experimenting to see what sort of shape parameter corresponds to what we think about the probability of good teams beating mediocre or poor teams.
tibble(
shape = c(0.9, 1, 1.5, 2),
r1 = qgamma(299.5/330, shape = shape, rate = 0.01),
r2 = qgamma( 0.5/330, shape = shape, rate = 0.01),
r3 = qgamma( 0.5, shape = shape, rate = 0.01),
topVbottom = r1 / (r1 + r2), # best against worst in all of division 1
topVmedian = r1 / (r1 + r3) # more like 1 seed vs 16 seed
)
shape | r1 | r2 | r3 | topVbottom | topVmedian |
---|---|---|---|---|---|
0.9 | 220.2835 | 0.0705542 | 59.67430 | 0.9996798 | 0.7868454 |
1.0 | 238.1366 | 0.1516301 | 69.31472 | 0.9993637 | 0.7745506 |
1.5 | 321.5494 | 1.6051441 | 118.29869 | 0.9950329 | 0.7310465 |
2.0 | 398.8502 | 5.6084529 | 167.83470 | 0.9861334 | 0.7038306 |
The following code could be used with a predetermined shape parameter, but we’ll opt to add another prior distribution for shape and let the data guide us.
ncaa_model_g2 <- function() {
for (g in 1:Ngames) {
home_win[g] ~ dbern(Rating[h[g]] * (1 + home_game[g] * home_advantage) /
(Rating[h[g]] * (1 + home_game[g] * home_advantage) + Rating[a[g]]))
}
for (t in 1:Nteams) {
Rating[t] ~ dgamma(2, 0.01)
}
home_advantage ~ dnorm(0, 0.01)
}
ncaa_jags_g2 <- jags(
model = ncaa_model_g2,
data = list(
h = Games$h,
a = Games$a,
home_game = Games$home_game,
home_win = Games$home_win,
Nteams = max(Games$h),
Ngames = nrow(Games)
),
parameters.to.save = c("home_advantage", "Rating"),
n.iter = 20000,
n.thin = 3,
n.burnin = 5000
)
Here’s a model that uses the data to select the shape parameter.
ncaa_model_g <- function() {
for (g in 1:Ngames) {
home_win[g] ~ dbern(Rating[h[g]] * (1 + home_game[g] * home_advantage) /
(Rating[h[g]] * (1 + home_game[g] * home_advantage) + Rating[a[g]]))
}
for (t in 1:Nteams) {
Rating[t] ~ dgamma(shape, 0.01)
}
shape ~ dnorm(1.7, 1 / 0.3^2)
home_advantage ~ dnorm(0, 0.01)
}
ncaa_jags_g <- jags(
model = ncaa_model_g,
data = list(
h = Games$h,
a = Games$a,
home_game = Games$home_game,
home_win = Games$home_win,
Nteams = max(Games$h),
Ngames = nrow(Games)
),
parameters.to.save = c("shape", "home_advantage", "Rating"),
n.iter = 20000,
n.thin = 3,
n.burnin = 5000
)
saveRDS(ncaa_jags_g, file = "ncaa2019g.Rds")
Another option would be to try using a normal/t distribution for the log of the ratings.
ncaa_model_lt <- function() {
for (g in 1:Ngames) {
home_win[g] ~ dbern(Rating[h[g]] * (1 + home_game[g] * home_advantage) /
(Rating[h[g]] * (1 + home_game[g] * home_advantage) + Rating[a[g]]))
}
for (t in 1:Nteams) {
Rating[t] <- exp(rating[t])
rating[t] ~ dt(0, 1, nu)
}
home_advantage ~ dnorm(0, 0.01)
nuMinusOne ~ dexp(1/29)
nu <- nuMinusOne + 1
}
ncaa_jags_lt <- jags(
model = ncaa_model_lt,
data = list(
h = Games$h,
a = Games$a,
home_game = Games$home_game,
home_win = Games$home_win,
Nteams = max(Games$h),
Ngames = nrow(Games)
),
parameters.to.save = c("home_advantage", "Rating", "nu"),
n.iter = 5000,
n.thin = 3,
n.burnin = 2000
)
saveRDS(ncaa_jags_lt, file = "ncaa2019lt.Rds")
The tables would be pretty big for these models, so plots are better for looking at rhat and n.eff. Things look pretty good for both models.
gf_histogram(~neff(ncaa_jags_lt, regex_pars = "Rating"))
gf_histogram(~rhat(ncaa_jags_lt, regex_pars = "Rating"))
gf_histogram(~neff(ncaa_jags_g, regex_pars = "Rating"))
gf_histogram(~rhat(ncaa_jags_g, regex_pars = "Rating"))
As it turns out, the two models don’t differ a lot, but the log t model has the teams a bit more spread out in ability (so the probability of upsets is a bit lower). This makes sense since they are using the same likelihood.
Note too that the estimates are not very precise. This is because we really don’t have that much data per team (approximately 30 games), and many pairs of teams have not played. This leaves a great deal of uncertainty about the ratings and hence the win probabilities.
Post_g <- posterior(ncaa_jags_g)
Post_lt <- posterior(ncaa_jags_lt)
library(tidyr)
Post_long_g <- Post_g %>%
select(-deviance) %>%
mutate(step = 1:nrow(.)) %>%
gather(team, rating, matches("rating")) %>%
separate(team, sep = "\\.", into = c("junk", "team")) %>%
mutate(
team = readr::parse_number(team),
Team = teams[team])
Post_long_lt <- Post_lt %>%
select(-deviance) %>%
mutate(step = 1:nrow(.)) %>%
gather(team, rating, matches("rating")) %>%
separate(team, sep = "\\.", into = c("junk", "team")) %>%
mutate(
team = readr::parse_number(team),
Team = teams[team])
Ratings <- df_stats(~rating | Team , data = Post_long_lt,
mean, median, sd, quantile(c(.1, .9)))
Ratings <- Ratings %>%
mutate(rank = rank(-median_rating))
Ratings %>% arrange(rank) %>% head(20)
Team | mean_rating | median_rating | sd_rating | 10% | 90% | rank |
---|---|---|---|---|---|---|
Virginia | 34.461824 | 26.822432 | 26.884881 | 12.519593 | 63.61942 | 1 |
Duke | 26.800491 | 22.321482 | 17.053686 | 11.074870 | 46.94642 | 2 |
Houston | 20.602086 | 16.282887 | 16.589581 | 8.138635 | 36.78627 | 3 |
Gonzaga | 20.456321 | 16.061217 | 19.092830 | 7.714202 | 37.19099 | 4 |
North Carolina | 17.876193 | 14.957631 | 11.313886 | 7.833370 | 30.76834 | 5 |
Tennessee | 17.599535 | 14.738763 | 11.349885 | 7.689304 | 30.64736 | 6 |
Kentucky | 15.300444 | 13.089050 | 9.183684 | 6.867625 | 26.21275 | 7 |
Buffalo | 15.121630 | 12.326218 | 10.444856 | 6.566173 | 26.29588 | 8 |
Michigan St. | 13.784521 | 11.882641 | 7.935310 | 6.465405 | 23.07033 | 9 |
Wofford | 13.892345 | 11.407173 | 10.421499 | 5.435158 | 24.77533 | 10 |
Michigan | 12.667269 | 10.785042 | 7.574121 | 5.798902 | 22.16609 | 11 |
Florida St. | 11.687488 | 10.041714 | 6.673186 | 5.406143 | 19.66056 | 12 |
LSU | 11.303755 | 9.853212 | 6.287509 | 5.359942 | 18.77253 | 13 |
Texas Tech | 10.335331 | 8.863916 | 6.185495 | 4.700039 | 17.54763 | 14 |
Kansas | 9.862487 | 8.774954 | 4.963422 | 4.917562 | 15.99742 | 15 |
Cincinnati | 9.871869 | 8.459120 | 6.057576 | 4.536714 | 16.70414 | 16 |
Nevada | 8.627225 | 7.297923 | 5.110862 | 3.901053 | 14.70705 | 17 |
Kansas St. | 8.006619 | 7.036777 | 4.226825 | 3.974849 | 13.08161 | 18 |
UNC Greensboro | 8.093284 | 6.913681 | 4.857412 | 3.576588 | 13.84940 | 19 |
Auburn | 7.580055 | 6.710302 | 3.812031 | 3.853237 | 12.23940 | 20 |
Post_long_lt <-
Post_long_lt %>%
left_join(Ratings %>% select(Team, rank))
## Joining, by = "Team"
Post_long_lt %>%
mutate(model = "lt") %>%
filter(Team %in% c("Wisconsin", "Michigan", "Michigan St.", "Purdue")) %>%
gf_dens(~rating, color = ~ Team, adjust = 2) %>%
gf_facet_grid(model ~ .) %>%
gf_refine(scale_color_viridis_d())
Post_long_g %>%
mutate(model = "g") %>%
filter(Team %in% c("Wisconsin", "Michigan", "Michigan St.", "Purdue")) %>%
gf_dens(~rating, color = ~ Team) %>%
gf_facet_grid(model ~ .) %>%
gf_refine(scale_color_viridis_d())
Post_long_lt %>%
filter(rank <= 16) %>%
gf_dens(~rating, color = ~ Team, group = ~ Team) %>%
gf_refine(scale_color_viridis_d())
win_probs <- function(team1, team2, post) {
t1 <- if (is.numeric(team1)) team1 else which(steams == team1)
t2 <- if (is.numeric(team2)) team2 else which(steams == team2)
r1 <- post[, paste0("Rating.", t1)]
r2 <- post[, paste0("Rating.", t2)]
r1 / (r1 + r2)
}
gf_dens(~ win_probs("Michigan", "Michigan St", post = Post_g), color = ~ "gamma") %>%
gf_dens(~ win_probs("Michigan", "Michigan St", post = Post_lt), color = ~"logt") %>%
gf_vline(xintercept = 0.5, color = "red", alpha = 0.5) %>%
gf_labs(x = "win probability")
gf_dens(~ win_probs("Michigan", "Wisconsin", post = Post_g), color = ~"gamma") %>%
gf_dens(~ win_probs("Michigan", "Wisconsin", post = Post_lt), color = ~"logt") %>%
gf_vline(xintercept = 0.5, color = "red", alpha = 0.5) %>%
gf_labs(x = "win probability")
gf_dens(~ win_probs("Michigan", "Abilene Christian", post = Post_g), color = ~"gamma") %>%
gf_dens(~ win_probs("Michigan", "Abilene Christian", post = Post_lt), color = ~"logt") %>%
gf_vline(xintercept = 0.5, color = "red", alpha = 0.5) %>%
gf_labs(x = "win probability")
mean(~ win_probs("Michigan", "Wisconsin", post = Post_g))
## [1] 0.5877085
mean(~ win_probs("Michigan", "Wisconsin", post = Post_lt))
## [1] 0.6418992
mean(~ win_probs("Michigan", "Abilene Christian", post = Post_g))
## [1] 0.7281838
mean(~ win_probs("Michigan", "Abilene Christian", post = Post_lt))
## [1] 0.8089126