Model pipeline
Intro
We use the {targets} package to manage the code pipeline used for running the model.
See setup for more info.
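For orientation, the pipeline is driven in the usual targets way; a minimal sketch, run from the project root:
library(targets)
tar_visnetwork()                 # inspect the dependency graph
tar_make()                       # build everything that is out of date
tar_read(voting_intention_chart) # load a built target into the session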
Globals
We first define some global options/functions common to all targets. The list of packages loaded below is what targets needs to run the pipeline - along with the targets package itself and two additional targets-related packages: tarchetypes and stantargets. If you don’t want to use renv then manually installing these should be sufficient.
library(targets)
library(tarchetypes)
library(stantargets)
options(tidyverse.quiet = TRUE)
tar_source() # grab all functions in R folder
tar_option_set(packages = c(
  "readr",
  "tidyr",
  "rvest",
  "glue",
  "dplyr",
  "stringr",
  "lubridate",
  "forcats",
  "ggplot2",
  "svglite",
  "scales",
  "janitor",
  "ggthemes",
  "nzelect",
  "purrr"
))
#> Establish _targets.R and _targets_r/globals/example-globals.R.
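If you are skipping renv, a manual install along these lines should be enough. Treat this as a sketch: cmdstanr, which stantargets uses to drive Stan, is distributed via the stan-dev R-universe rather than CRAN.
install.packages(c("targets", "tarchetypes", "stantargets"))
# cmdstanr is not on CRAN - assumes the stan-dev R-universe repo is acceptable
install.packages("cmdstanr", repos = c("https://stan-dev.r-universe.dev", getOption("repos")))
cmdstanr::install_cmdstan() # CmdStan itself also needs to be compiled locally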
Targets
The canonical location for NZ polling data appears to be Wikipedia pages. The first part of this pipeline downloads the poll results from the Wikipedia pages. It also fetches recent election results from the electoral commission.
Once the poll results have been downloaded they are filtered and marshalled into the format needed by the model.
The model is then run and a few charts are generated from the model results.
URLs
Tarchetypes has a useful function tar_url for working with URLs. When the pipeline is run it will check the last-modified header with the remote site. If this has changed, any targets that depend on the URL will be run again. Checking the URL is a little slow for regular development, so tar_cue_skip can be used to skip the check - just set skip_url_check below to TRUE.
Note that often this results in one of the poll objects being extracted again, but no further dependencies are run because the resulting data has not changed. This ensures that minor changes to the page do not trigger a model re-run.
The code to generate the URLs is defined in R/urls.R.
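Those helpers are not reproduced in this post. A plausible sketch of what they do - the Wikipedia page title pattern is real, but the electoral commission path is left as a placeholder; the actual code is in R/urls.R:
# Sketch only - see R/urls.R for the real helpers.
wikipedia_poll_url <- function(year) {
  glue::glue("https://en.wikipedia.org/wiki/Opinion_polling_for_the_{year}_New_Zealand_general_election")
}
election_results_summary <- function(year) {
  glue::glue("https://www.electionresults.govt.nz/...") # placeholder path
}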
skip_url_check <- FALSE
list(
  tar_url(polls2011_url, wikipedia_poll_url(2011), cue = tar_cue_skip(skip_url_check)),
  tar_url(polls2014_url, wikipedia_poll_url(2014), cue = tar_cue_skip(skip_url_check)),
  tar_url(polls2017_url, wikipedia_poll_url(2017), cue = tar_cue_skip(skip_url_check)),
  tar_url(polls2020_url, wikipedia_poll_url(2020), cue = tar_cue_skip(skip_url_check)),
  tar_url(polls2023_url, wikipedia_poll_url(2023), cue = tar_cue_skip(skip_url_check)),
  tar_url(results2008_url, election_results_summary(2008), cue = tar_cue_skip(skip_url_check)),
  tar_url(results2011_url, election_results_summary(2011), cue = tar_cue_skip(skip_url_check)),
  tar_url(results2014_url, election_results_summary(2014), cue = tar_cue_skip(skip_url_check)),
  tar_url(results2017_url, election_results_summary(2017), cue = tar_cue_skip(skip_url_check)),
  tar_url(results2020_url, election_results_summary(2020), cue = tar_cue_skip(skip_url_check))
)
#> Establish _targets.R and _targets_r/targets/urls.R.
Download polls
Polling for each election is extracted into its own object. Then all the objects are combined into a single consistent object.
The code that does the heavy lifting is in R/read-polls.R.
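As a rough illustration of the approach (the real extract_poll_results_* functions handle each year's table layout, footnotes and date parsing, so treat this as a sketch):
# Sketch of the general shape of an extractor in R/read-polls.R.
extract_poll_results_sketch <- function(url) {
  rvest::read_html(url) |>
    rvest::html_element("table.wikitable") |> # assumes the polls sit in the first wikitable
    rvest::html_table() |>
    janitor::clean_names()
}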
list(
  tar_target(polls2011, extract_poll_results_2011(polls2011_url)),
  tar_target(polls2014, extract_poll_results_2014(polls2014_url)),
  tar_target(polls2017, extract_poll_results_2017(polls2017_url)),
  tar_target(polls2020, extract_poll_results_2020(polls2020_url)),
  tar_target(polls2023, extract_poll_results_2023(polls2023_url)),
  tar_target(polls, combine_polls(polls2011, polls2014, polls2017, polls2020, polls2023))
)
#> Establish _targets.R and _targets_r/targets/polls.R.
The electoral commission produces unusually formatted CSV files - the code in R/fetch-results.R reads just a small portion of each results file.
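The post doesn't show fetch_results, but judging from the calls below its second argument is a per-year number of header rows to skip. A hedged sketch, with the row limit and column handling assumed:
# Sketch of R/fetch-results.R - skip/n_max handling is assumed.
fetch_results_sketch <- function(url, skip, year) {
  readr::read_csv(url, skip = skip, n_max = 25, show_col_types = FALSE) |> # n_max is a guess
    janitor::clean_names() |>
    dplyr::mutate(Election = year)
}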
list(
  tar_target(results2020, fetch_results(results2020_url, 17, 2020)),
  tar_target(results2017, fetch_results(results2017_url, 16, 2017)),
  tar_target(results2014, fetch_results(results2014_url, 15, 2014)),
  tar_target(results2011, fetch_results(results2011_url, 13, 2011)),
  tar_target(results2008, fetch_results(results2008_url, 19, 2008)),
  tar_target(
    results,
    combine_results(
      results2008,
      results2011,
      results2014,
      results2017,
      results2020
    )
  )
)
#> Establish _targets.R and _targets_r/targets/results.R.
Selecting parties and pollsters
Only include pollsters who have provided a poll for the 2023 election and who use the NZ Political Polling Code.
Include parties who currently have a seat in parliament or who have polled over 2.5% more than three times.
list(
  tar_target(pollsters, polls |>
    filter(Election == 2023) |>
    distinct(Pollster) |>
    filter(!grepl('Roy|Horizon', Pollster))),
  tar_target(parties_in_parliament,
    tibble(Party = c('ACT', 'Green', 'Labour', 'National', 'Te Pāti Māori'))),
  tar_target(parties, polls |>
    filter(Election == 2023, VotingIntention > 0.025) |>
    filter(n() > 3, .by = Party) |>
    distinct(Party) |>
    union(parties_in_parliament) |>
    arrange(Party)),
  tar_target(pollsters2020, polls |>
    filter(Election == 2020,
           Pollster != 'YouGov') |> # Only one YouGov poll
    distinct(Pollster)),
  tar_target(parties_in_parliament2020,
    tibble(Party = c('ACT', 'Green', 'Labour', 'National', 'NZ First'))),
  tar_target(parties2020, polls |>
    filter(Election == 2020, VotingIntention > 0.025) |>
    filter(n() > 3, .by = Party) |>
    distinct(Party) |>
    union(parties_in_parliament2020) |>
    arrange(Party)),
  tar_target(party_colours,
    tribble(
      ~Party, ~Colour,
      "ACT", "#ffd100",
      "Green", "#00491E",
      "Labour", "#d82c20",
      "Te Pāti Māori", "#D12C38",
      "National", "#065BAA",
      "NZ First", "#212529",
      "TOP", "#09B598",
      "Other", "#B3B3B3"
    ) |> mutate(Party = as_factor(Party)))
)
#> Establish _targets.R and _targets_r/targets/params.R.
Marshal the data
This code is very similar to Peter Ellis’ code for marshalling polls and results into an object for Stan.
list(
  tar_target(election_dates, ymd(c("2014-09-20", "2017-09-23", "2020-10-17", "2023-10-14"))),
  tar_target(election_weeks, floor_date(election_dates, unit = 'week')),
  tar_target(weeks_between_elections, as.integer(diff(election_weeks)) / 7),
  tar_target(
    elections,
    results |>
      filter(Election >= year(min(election_weeks))) |>
      mutate(
        Party = fct_other(Party, keep = parties$Party) |>
          fct_relevel(parties$Party)
      ) |>
      count(Party, Election, wt = Percentage, name = 'Percentage') |>
      arrange(Party, Election) |>
      pivot_wider(names_from = Party, values_from = Percentage, values_fill = 0) |>
      select(-Election)
  ),
  tar_target(
    polls2,
    polls |>
      filter(Pollster %in% c(pollsters$Pollster), !is.na(MidPoint)) |>
      mutate(
        MidPoint = floor_date(MidPoint, unit = 'week'),
        Party = fct_other(Party, keep = parties$Party) |>
          fct_relevel(parties$Party)
      ) |>
      filter(Party != 'Other') |>
      mutate(Polled = sum(VotingIntention), .by = c(Pollster, MidPoint)) |>
      rename(MidDate = MidPoint, ElectionYear = Election) |>
      select(
        `Date range`,
        Party,
        Pollster,
        MidDate,
        ElectionYear,
        Polled,
        VotingIntention
      ) |>
      filter(ElectionYear %in% year(election_weeks[-1])) |>
      arrange(Party, MidDate) |>
      pivot_wider(
        names_from = Party,
        values_from = VotingIntention,
        names_sort = TRUE,
        values_fill = 0
      ) |>
      mutate(Other = pmax(0, 1 - Polled),
             MidDateNumber = 1 + as.numeric(MidDate - election_weeks[1]) / 7) |>
      select(-Polled, -`Date range`)
  ),
  tar_target(
    polls3,
    polls2 |>
      arrange(Pollster, MidDate) |>
      group_split(Pollster)
  ),
  tar_target(parties_ss, names(elections)),
  tar_target(
    # estimate the standard errors. Note we are pretending they all have a sample size of 1000 -
    # which the main five do, but not some of the smaller ones. Improvement would be to better deal with this.
    all_ses,
    polls2 |>
      select(Pollster, ACT:Other) |>
      pivot_longer(ACT:Other, names_to = 'Party', values_to = 'p') |>
      summarise(
        p = mean(p, na.rm = TRUE),
        se = sqrt(p * (1 - p) / 1000),
        .by = c(Pollster, Party)
      )
  ),
  tar_target(
    ses3,
    all_ses |>
      arrange(Pollster, Party) |>
      group_split(Pollster)
  ),
  tar_target(
    d1,
    list(mu_elect1 = as.numeric(elections[1, ]),
         mu_elect2 = as.numeric(elections[2, ]),
         mu_elect3 = as.numeric(elections[3, ]),
         n_parties = length(parties_ss),
         n_weeks = weeks_between_elections,
         # multiply the variance of all polls by 2. See my blog post of 9 July 2017.
         inflator = sqrt(2),
         y1_n = nrow(polls3[[1]]),
         y1_values = polls3[[1]][, 4:11],
         y1_weeks = as.numeric(polls3[[1]]$MidDateNumber),
         y1_se = ses3[[1]]$se,
         y2_n = nrow(polls3[[2]]),
         y2_values = polls3[[2]][, 4:11],
         y2_weeks = as.numeric(polls3[[2]]$MidDateNumber),
         y2_se = ses3[[2]]$se,
         y3_n = nrow(polls3[[3]]),
         y3_values = polls3[[3]][, 4:11],
         y3_weeks = as.numeric(polls3[[3]]$MidDateNumber),
         y3_se = ses3[[3]]$se,
         y4_n = nrow(polls3[[4]]),
         y4_values = polls3[[4]][, 4:11],
         y4_weeks = as.numeric(polls3[[4]]$MidDateNumber),
         y4_se = ses3[[4]]$se,
         reid_method = as.numeric(polls3[[4]]$MidDate >= as.Date("2017-01-01")),
         n_pollsters = 4)
  )
)
#> Establish _targets.R and _targets_r/targets/prep.R.
2020 version of the model
This is a bit of a nasty copy-paste from above - it runs a version of the model for the 2020 election that we can use to check how well the model performs.
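The comparison step itself isn't shown in this post, but a sketch of how the 2020 run could be scored looks like the following, using targets defined below and assuming the mu summary rows are ordered week-within-party (the same assumption the chart data relies on):
# Sketch: line up final-week posterior means with the actual 2020 result.
# Column names in results2020 are assumed.
tar_target(
  check2020,
  model2020_summary_model2020 |>
    filter(str_starts(variable, "mu")) |>
    mutate(Party = rep(parties_ss2020, each = sum(weeks_between_elections2020)),
           week = rep(1:sum(weeks_between_elections2020), length(parties_ss2020))) |>
    filter(week == max(week)) |>
    select(Party, Modelled = mean) |>
    left_join(results2020, by = 'Party')
)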
list(
  tar_target(election_weeks2020, floor_date(ymd(
    c("2011-11-26", "2014-09-20", "2017-09-23", "2020-10-17")
  ), unit = 'week')),
  tar_target(weeks_between_elections2020, as.integer(diff(election_weeks2020)) / 7),
  tar_target(
    elections2020,
    results |>
      filter(Election >= year(min(election_weeks2020)) &
               Election < year(max(election_weeks2020))) |>
      mutate(
        Party = fct_other(Party, keep = parties2020$Party) |>
          fct_relevel(parties2020$Party)
      ) |>
      count(Party, Election, wt = Percentage, name = 'Percentage') |>
      arrange(Party, Election) |>
      pivot_wider(names_from = Party, values_from = Percentage, values_fill = 0) |>
      select(-Election)
  ),
  tar_target(
    polls22020,
    polls |>
      filter(Election %in% c(2014, 2017, 2020),
             Pollster %in% c(pollsters2020$Pollster),
             !is.na(MidPoint)) |>
      mutate(
        MidPoint = floor_date(MidPoint, unit = 'week'),
        Party = fct_other(Party, keep = parties2020$Party) |>
          fct_relevel(parties2020$Party)
      ) |>
      filter(Party != 'Other') |>
      mutate(Polled = sum(VotingIntention), .by = c(Pollster, MidPoint)) |>
      rename(MidDate = MidPoint, ElectionYear = Election) |>
      count(
        Party,
        Pollster,
        MidDate,
        ElectionYear,
        Polled,
        wt = VotingIntention,
        name = "VotingIntention"
      ) |>
      filter(MidDate <= election_weeks2020[4]) |>
      arrange(Party, MidDate) |>
      pivot_wider(
        names_from = Party,
        values_from = VotingIntention,
        names_sort = TRUE,
        values_fill = 0
      ) |>
      mutate(Other = pmax(0, 1 - Polled),
             MidDateNumber = 1 + as.numeric(MidDate - election_weeks2020[1]) / 7) |>
      select(-Polled)
  ),
  tar_target(
    polls32020,
    polls22020 |>
      arrange(Pollster, MidDate) |>
      group_split(Pollster)
  ),
  tar_target(parties_ss2020, names(elections2020)),
  tar_target(
    # estimate the standard errors. Note we are pretending they all have a sample size of 1000 -
    # which the main five do, but not some of the smaller ones. Improvement would be to better deal with this.
    all_ses2020,
    polls22020 |>
      select(Pollster, ACT:Other) |>
      pivot_longer(ACT:Other, names_to = 'Party', values_to = 'p') |>
      summarise(
        p = mean(p, na.rm = TRUE),
        se = sqrt(p * (1 - p) / 1000),
        .by = c(Pollster, Party)
      )
  ),
  tar_target(
    ses32020,
    all_ses2020 |>
      arrange(Pollster, Party) |>
      group_split(Pollster)
  ),
  tar_target(
    d12020,
    list(mu_elect1 = as.numeric(elections2020[1, ]),
         mu_elect2 = as.numeric(elections2020[2, ]),
         mu_elect3 = as.numeric(elections2020[3, ]),
         n_parties = length(parties_ss2020),
         n_weeks = weeks_between_elections2020,
         # multiply the variance of all polls by 2. See my blog post of 9 July 2017.
         inflator = sqrt(2),
         y1_n = nrow(polls32020[[1]]),
         y1_values = polls32020[[1]][, 4:9],
         y1_weeks = as.numeric(polls32020[[1]]$MidDateNumber),
         y1_se = ses32020[[1]]$se,
         y2_n = nrow(polls32020[[2]]),
         y2_values = polls32020[[2]][, 4:9],
         y2_weeks = as.numeric(polls32020[[2]]$MidDateNumber),
         y2_se = ses32020[[2]]$se,
         reid_method = as.numeric(polls32020[[2]]$MidDate >= as.Date("2017-01-01")),
         y3_n = nrow(polls32020[[3]]),
         y3_values = polls32020[[3]][, 4:9],
         y3_weeks = as.numeric(polls32020[[3]]$MidDateNumber),
         y3_se = ses32020[[3]]$se,
         n_pollsters = 3)
  )
)
#> Establish _targets.R and _targets_r/targets/prep2020.R.
Run the models
Uses stantargets to run the model - see tar_stan_mcmc.
list(
  tar_stan_mcmc(
    model2023,
    "stan/model2023.stan",
    dir = ".stan",
    data = d1,
    chains = 4,
    parallel_chains = 4,
    iter_sampling = 2000,
    max_treedepth = 20,
    deployment = "worker"
  ),
  tar_stan_mcmc(
    model2020,
    "stan/model2020.stan",
    dir = ".stan",
    data = d12020,
    chains = 4,
    parallel_chains = 4,
    iter_sampling = 2000,
    max_treedepth = 20,
    deployment = "worker"
  )
)
#> Establish _targets.R and _targets_r/targets/modelling.R.
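tar_stan_mcmc fans out into several targets per model; the ones used downstream here follow the {name}_mcmc_{stem} and {name}_summary_{stem} naming, holding the fitted CmdStanR object and its posterior summary respectively. After a build they can be inspected directly:
tar_read(model2023_summary_model2023)                   # posterior summary tibble
tar_read(model2023_mcmc_model2023)$diagnostic_summary() # divergences, treedepth, E-BFMI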
Charts
Voting intention
Current charts are more or less the same as some of Peter’s charts. Future versions will export the data to create more interactive charts.
list(
  tar_target(
    voting_intention_chartdata,
    model2023_summary_model2023 |>
      filter(str_starts(variable, "mu")) |>
      mutate(
        Party = rep(parties_ss, each = sum(weeks_between_elections)),
        week = rep(1:sum(weeks_between_elections), length(parties_ss)),
        week = min(election_weeks) + weeks(week)
      )
  ),
  tar_target(
    voting_intention_chart,
    voting_intention_chartdata |>
      ggplot(aes(
        x = week,
        y = mean,
        colour = Party,
        fill = Party
      )) +
      geom_point(
        data = gather(
          polls2,
          Party,
          VotingIntention,
          -Pollster, -MidDate, -ElectionYear, -MidDateNumber
        ) |>
          filter(VotingIntention != 0),
        aes(x = MidDate, y = VotingIntention),
        colour = "black",
        size = 0.5
      ) +
      geom_vline(
        xintercept = as.numeric(election_dates),
        colour = "grey60"
      ) +
      scale_y_continuous(labels = percent) +
      scale_x_date(breaks = ym(
        c('2016-01', '2018-01', '2020-01', '2022-01')
      ), date_labels = '%Y') +
      scale_fill_manual(values = party_colours$Colour, breaks = party_colours$Party) +
      geom_line(
        data = voting_intention_chartdata |> filter(week <= today()),
        colour = "black"
      ) +
      geom_ribbon(aes(ymin = q5, ymax = q95), alpha = 0.3, colour = NA) +
      labs(y = NULL, x = NULL) +
      theme_clean() +
      theme(
        plot.title.position = "plot",
        legend.position = 'none',
        strip.background = element_rect(fill = '#121617'),
        strip.text = element_text(colour = 'white'),
        plot.background = element_blank()
      )
  ),
  tar_file(voting_intention620, {
    f <- "output/voting_intention620.svg"
    ggsave(
      f,
      plot = voting_intention_chart +
        facet_wrap(
          ~ factor(Party, levels = party_colours$Party),
          scales = "free",
          ncol = 2
        ),
      dpi = 100,
      width = 6.2,
      height = 8
    )
    f
  }),
  tar_file(voting_intention375, {
    f <- "output/voting_intention375.svg"
    ggsave(
      f,
      plot = voting_intention_chart +
        facet_wrap(
          ~ factor(Party, levels = party_colours$Party),
          scales = "free",
          ncol = 1
        ),
      dpi = 100,
      width = 3.75,
      height = 10
    )
    f
  })
)
#> Establish _targets.R and _targets_r/targets/charts.R.
Possible coalition plots
Grab the simulation results for party vote for election night and for the current week. The current week is hard-coded at the moment - we will need to do something better.
There is also a hard-coded assumption that parties which currently hold an electorate seat will keep at least one, and that no additional parties will pick one up. Future work will make this more flexible.
We render a mobile-screen-sized plot and a desktop-sized plot and load these as SVGs - slightly clunky, but actually a good way to display ggplot2 images on the web.
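The hard-coded indices in the next chunk lean on the draws matrix columns being ordered week-within-party: with 473 modelled weeks, column p * 473 is the election week for party p, and p * 473 - 24 was the then-current week. A sketch of deriving that offset from dates instead (the column layout, and a possible off-by-one, are assumptions):
# Sketch: compute the "current week" columns rather than hard-coding - 24.
n_weeks_total <- sum(weeks_between_elections) # 473 in this pipeline
current_week  <- as.numeric(lubridate::floor_date(lubridate::today(), unit = 'week') - election_weeks[1]) / 7
cols_now      <- seq_along(parties_ss) * n_weeks_total - (n_weeks_total - current_week)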
simulate_seats <- function(sims) {
  t(sapply(1:nrow(sims), function(i) {
    allocate_seats(
      votes = as.numeric(sims[i, ]),
      electorate = c(1, 1, 1, 1, 0, 1, 0),
      parties = names(sims)
    )$seats_v
  })) |>
    as_tibble() |>
    mutate(
      NatCoal = ACT + National,
      LabGreen = Labour + Green,
      LabGreenMaori = Labour + Green + `Te Pāti Māori`,
      NatCoalMaori = NatCoal + `Te Pāti Māori`
    )
}
list(
  tar_target(
    weekly_mu,
    model2023_mcmc_model2023$draws('mu', format = 'draws_matrix')
  ),
  tar_target(
    sims_election_night,
    weekly_mu[, 1:8 * 473] |> as_tibble() |> set_names(parties_ss) |>
      select(all_of(sort(parties_ss))) |>
      select(-Other)
  ),
  tar_target(
    sims_saturday,
    weekly_mu[, 1:8 * 473 - 24] |> as_tibble() |> set_names(parties_ss) |>
      select(all_of(sort(parties_ss))) |>
      select(-Other)
  ),
  tar_target(seats_election_night, simulate_seats(sims_election_night)),
  tar_target(seats_saturday, simulate_seats(sims_saturday))
)
#> Establish _targets.R and _targets_r/targets/seats.R.
coalition_plot <- function(seats) {
  seats |>
    select(National,
           Labour,
           NatCoal,
           LabGreen,
           LabGreenMaori,
           NatCoalMaori) |>
    gather(Coalition, Seats) |>
    mutate(lab_in = ifelse(grepl("^Lab", Coalition), "Labour-based", "Nationals-based")) |>
    mutate(
      Coalition = gsub("^LabGreen", "Labour, Greens", Coalition),
      Coalition = gsub("^NatCoal", "National, ACT", Coalition),
      Coalition = gsub("Maori", ",\nTe Pāti Māori", Coalition)
    ) |>
    ggplot(aes(x = Seats, fill = lab_in)) +
    geom_histogram(
      alpha = 0.5,
      binwidth = 1,
      position = "identity",
      colour = NA
    ) +
    scale_y_continuous() +
    scale_fill_manual(values = c('#d82c20', '#065BAA')) +
    theme_clean() +
    theme(
      legend.position = 'none',
      strip.background = element_rect(fill = '#121617'),
      strip.text = element_text(colour = 'white'),
      plot.background = element_blank()
    ) +
    labs(y = NULL) +
    annotate(
      'segment',
      x = 61,
      xend = 61,
      y = 0,
      yend = Inf
    )
}
list(
  tar_target(election_night_plot, coalition_plot(seats_election_night)),
  tar_target(saturday_plot, coalition_plot(seats_saturday)),
  tar_file(election_night620, {
    f <- "output/election_night620.svg"
    ggsave(
      f,
      plot = election_night_plot +
        facet_wrap(~ factor(
          Coalition,
          levels = c(
            "Labour",
            "National",
            "Labour, Greens",
            "National, ACT",
            "Labour, Greens,\nTe Pāti Māori",
            "National, ACT,\nTe Pāti Māori"
          )
        ), ncol = 2),
      dpi = 100,
      width = 6.2,
      height = 4
    )
    f
  }),
  tar_file(election_night375, {
    f <- "output/election_night375.svg"
    ggsave(
      f,
      plot = election_night_plot +
        facet_wrap(~ Coalition, ncol = 1),
      dpi = 100,
      width = 3.75,
      height = 7
    )
    f
  }),
  tar_file(saturday620, {
    f <- "output/saturday620.svg"
    ggsave(
      f,
      plot = saturday_plot +
        facet_wrap(~ factor(
          Coalition,
          levels = c(
            "Labour",
            "National",
            "Labour, Greens",
            "National, ACT",
            "Labour, Greens,\nTe Pāti Māori",
            "National, ACT,\nTe Pāti Māori"
          )
        ), ncol = 2),
      dpi = 100,
      width = 6.2,
      height = 4
    )
    f
  }),
  tar_file(saturday375, {
    f <- "output/saturday375.svg"
    ggsave(
      f,
      plot = saturday_plot +
        facet_wrap(~ Coalition, ncol = 1),
      dpi = 100,
      width = 3.75,
      height = 7
    )
    f
  })
)
#> Establish _targets.R and _targets_r/targets/coalition-plot.R.