oilSpills <- xlsx_cells('oilSpillsUS.xlsx') %>% #convert to cell format so that tidyxl and unpivotr work
filter(row != 1) %>% #first row is title of the table
select(row, col, data_type, character, numeric) #select the columns containing data and its location
oilSpills <- oilSpills %>%
behead("up-left", Year) %>% #behead Year
filter(row < 14) %>% #last rows are metadata so filter them out
behead("up", Value) %>% #behead Incidents and Gallons spilled
behead('left', Source) %>% #behead Source
pivot_wider(names_from = Value, values_from = numeric) #pivot wider so there is a column for Incidents and Gallons spilled
oilSpills <- oilSpills %>%
select(Year, Source, Incidents, `Gallons spilled`) %>%
group_by(Year, Source) %>%
summarise_all(list(~trimws(paste(., collapse = '')))) %>% #merge rows together
ungroup()
oilSpills$Incidents <- gsub('.{2}$', '', oilSpills$Incidents) %>% #trim NAs
as.numeric() %>% #convert string to double
suppressWarnings() #suppress warnings, there should be some NAs
oilSpills$`Gallons spilled` <- gsub('^[A-Za-z]{2}', '', oilSpills$`Gallons spilled`) %>% #trim NAs
as.numeric() %>% #convert string to double
suppressWarnings() #suppress warnings, there should be some NAs
The data came from the Bureau of Transportation Statistics. Only the pollution incidents where the Coast Guard investigated as the lead agency are included in this dataset. Data on spills where the Environmental Protection Agency or any of the state authorities are the lead agency is not included. These are mostly offshore spills as the EPA usually handles inshore spills. These statistics cover yearly gallons spilled, number of incidents, and source type from 1985, 1990, 1995-2020.
oilSpills %>% filter(Source == "Vessel sources, total" |
Source == "Nonvessel sources, total"|
Source == "Mysteryc") %>%
plot_ly(x = ~Year, y =~`Gallons spilled`, color = ~Source, type="bar", colors =
c("#a6cee3", "#1f78b4", "#b2df8a"),
width = 1000) %>%
layout(barmode = 'dodge', yaxis = list(fixedrange = FALSE))
oilSpills %>% filter(Source == "Vessel sources, total"|
Source == "Nonvessel sources, total"|
Source == "Mysteryc") %>%
drop_na() %>%
plot_ly(width = 1000) %>%
add_trace(x = ~Year, y = ~Incidents, type = 'scatter', color =~Source, mode = 'lines+markers',
colors = c("#a6cee3", "#1f78b4", "#b2df8a"))
y2 <- list(
overlaying = "y",
side = "right",
title = "Number of Incidents",
automargin = T)
oilSpills %>% filter(Source != "TOTAL all spills",
Source != "Vessel sources, total",
Source != "Nonvessel sources, total") %>%
drop_na() %>%
mutate(Source = fct_reorder(Source, desc(`Gallons spilled`))) %>%
plot_ly(x = ~Year, y =~`Gallons spilled`, color = ~Source, type="bar", colors = "Accent",
width = 1000, height = 500) %>%
group_by(Year) %>%
summarise(n = sum(Incidents)) %>%
add_trace(x = ~Year, y = ~n, type = 'scatter', mode = 'markers',
name = "Incidents", yaxis = "y2", color = I("black")) %>%
layout(barmode = 'stack', yaxis2 = y2,
yaxis = list(fixedrange = FALSE))
Wilke: stacked vs grouped bar charts and the implications of connecting dots with lines
wrangling - double header excel data (code)
Improvements: double y-axis? tree-map or ribbon plot, more user interactivity - button that switches from stacked bar plot to grouped bar plot