Model pipeline

Intro

We use the {targets} package to manage the code pipeline used for running the model.

See setup for more info.

Globals

We first define some global options/functions common to all targets. The list of packages loaded below are what targets needs to run the pipeline - along with the targets package itself and two additional targets-related packages: tarchetypes and stantargets. If you don’t want to use renv then manually installing these should be sufficient.

library(targets)
library(tarchetypes)
library(stantargets)
options(tidyverse.quiet = TRUE)
tar_source() # grab all functions in R folder
tar_option_set(packages = c(
  "readr",
  "tidyr",
  "rvest",
  "glue",
  "dplyr",
  "stringr",
  "lubridate",
  "forcats",
  "ggplot2",
  "svglite",
  "scales",
  "janitor",
  "ggthemes",
  "nzelect",
  "purrr"
))
#> Establish _targets.R and _targets_r/globals/example-globals.R.

Targets

The canonical location for NZ polling data appears to be Wikipedia pages. The first part of this pipeline downloads the poll results from the Wikipedia pages. It also fetches recent election results from the electoral commission.

Once the poll results have been downloaded they are filtered and marshalled into the format need by the model.

The model is then run and a few charts are generated from the model results.

URLs

Tarchetypes has a useful function tar_url for working with urls. When the pipeline is run it will check the last-modified header with the remote site. If this has changed any targets that depend on the url will be run again. Checking the URL is a little slow for regular development so tar_cue_skip can be used to skip the check - just set skip_url_check below to TRUE.

Note that often this results in one of the poll objects being extracted again but no further dependencies are run as the resulting data has not changed. This ensures that minor changes to the page do not trigger a model re-run.

The code to generate the urls is defined in R/urls.R

skip_url_check <- FALSE
list(
  tar_url(polls2011_url, wikipedia_poll_url(2011), cue = tar_cue_skip(skip_url_check)),
  tar_url(polls2014_url, wikipedia_poll_url(2014), cue = tar_cue_skip(skip_url_check)),
  tar_url(polls2017_url, wikipedia_poll_url(2017), cue = tar_cue_skip(skip_url_check)),
  tar_url(polls2020_url, wikipedia_poll_url(2020), cue = tar_cue_skip(skip_url_check)),
  tar_url(polls2023_url, wikipedia_poll_url(2023), cue = tar_cue_skip(skip_url_check)),
  tar_url(results2008_url, election_results_summary(2008), cue = tar_cue_skip(skip_url_check)),
  tar_url(results2011_url, election_results_summary(2011), cue = tar_cue_skip(skip_url_check)),
  tar_url(results2014_url, election_results_summary(2014), cue = tar_cue_skip(skip_url_check)),
  tar_url(results2017_url, election_results_summary(2017), cue = tar_cue_skip(skip_url_check)),
  tar_url(results2020_url, election_results_summary(2020), cue = tar_cue_skip(skip_url_check))
)
#> Establish _targets.R and _targets_r/targets/urls.R.

Download polls

Polling for each election is extracted into it’s own object. Then all the objects are combined into a single consistent object.

The code that does the heavy lifting is in R/read-polls.R

list(
  tar_target(polls2011, extract_poll_results_2011(polls2011_url)),
  tar_target(polls2014, extract_poll_results_2014(polls2014_url)),
  tar_target(polls2017, extract_poll_results_2017(polls2017_url)),
  tar_target(polls2020, extract_poll_results_2020(polls2020_url)),
  tar_target(polls2023, extract_poll_results_2023(polls2023_url)),
  tar_target(polls, combine_polls(polls2011,polls2014,polls2017,polls2020,polls2023))
)
#> Establish _targets.R and _targets_r/targets/polls.R.

The electoral commission produces usually formatted csv files - the code in R/fetch-results.R reads just a small portion of the results file.

list(
  tar_target(results2020, fetch_results(results2020_url, 17, 2020)),
  tar_target(results2017, fetch_results(results2017_url, 16, 2017)),
  tar_target(results2014, fetch_results(results2014_url, 15, 2014)),
  tar_target(results2011, fetch_results(results2011_url, 13, 2011)),
  tar_target(results2008, fetch_results(results2008_url, 19, 2008)),
  tar_target(
    results,
    combine_results(
      results2008,
      results2011,
      results2014,
      results2017,
      results2020
    )
  )
  
)
#> Establish _targets.R and _targets_r/targets/results.R.

Selecting parties and pollsters

Only include pollsters who have provided a poll for the 2023 election and who use the NZ Political Polling Code.

Include parties who currently have a seat in parliament or who have polled over 2.5% three times.

list(
  tar_target(pollsters, polls |> 
               filter(Election == 2023) |> 
               distinct(Pollster) |> 
               filter(!grepl('Roy|Horizon', Pollster))),
  tar_target(parties_in_parliament, 
             tibble(Party = c('ACT', 'Green', 'Labour', 'National', 'Te Pāti Māori'))),
  tar_target(parties,  polls |> 
               filter(Election == 2023, VotingIntention > 0.025) |>
               filter(n() > 3, .by=Party) |> 
               distinct(Party) |> 
               union(parties_in_parliament) |> 
               arrange(Party)),
  tar_target(pollsters2020, polls |> 
               filter(Election == 2020,
                      Pollster != 'YouGov') |> # Only one YouGov poll 
               distinct(Pollster)),
  tar_target(parties_in_parliament2020, 
             tibble(Party = c('ACT', 'Green', 'Labour', 'National', 'NZ First'))),
  tar_target(parties2020,  polls |> 
               filter(Election == 2020, VotingIntention > 0.025) |>
               filter(n() > 3, .by=Party) |> 
               distinct(Party) |> 
               union(parties_in_parliament2020) |> 
               arrange(Party)),
  tar_target(party_colours,
             tribble(
               ~Party, ~Colour,
               "ACT", "#ffd100",
               "Green", "#00491E",
               "Labour", "#d82c20",
               "Te Pāti Māori", "#D12C38",
               "National", "#065BAA",
               "NZ First", "#212529",
               "TOP", "#09B598",
               "Other", "#B3B3B3"
             ) |> mutate(Party = as_factor(Party)))
)
#> Establish _targets.R and _targets_r/targets/params.R.

Marshal the data

This code is very similar to Peter Ellis’ code for marshalling polls and results into a object for stan.

list(
  tar_target(election_dates, ymd(c("2014-09-20", "2017-09-23", "2020-10-17", "2023-10-14"))),
  tar_target(election_weeks, floor_date(election_dates, unit = 'week')),
  tar_target(weeks_between_elections, as.integer(diff(election_weeks)) / 7),
  tar_target(
    elections,
    results |>
      filter(Election >= year(min(election_weeks))) |>
      mutate(
        Party = fct_other(Party, keep = parties$Party) |>
          fct_relevel(parties$Party)
      ) |>
      count(Party, Election, wt = Percentage, name = 'Percentage') |>
      arrange(Party, Election) |>
      pivot_wider(names_from = Party, values_from = Percentage, values_fill = 0) |>
      select(-Election)
  ),
  tar_target(
    polls2,
    polls |>
  filter(Pollster %in% c(pollsters$Pollster), !is.na(MidPoint)) |>
  mutate(
    MidPoint = floor_date(MidPoint, unit = 'week'),
    Party = fct_other(Party, keep = parties$Party) |>
      fct_relevel(parties$Party)
  ) |>
  filter(Party != 'Other') |> 
  mutate(Polled = sum(VotingIntention), .by = c(Pollster, MidPoint)) |> 
  rename(MidDate = MidPoint, ElectionYear = Election) |>
  select(
    `Date range`,
    Party,
    Pollster,
    MidDate,
    ElectionYear,
    Polled,
    VotingIntention
  ) |>
  filter(ElectionYear %in% year(election_weeks[-1])) |>
  arrange(Party, MidDate) |>
  pivot_wider(
    names_from = Party,
    values_from = VotingIntention,
    names_sort = TRUE,
    values_fill = 0
  ) |>
  mutate(Other = pmax(0, 1 - Polled),
         MidDateNumber = 1 + as.numeric(MidDate - election_weeks[1]) / 7) |> 
  select(-Polled, -`Date range`)
),
  tar_target(
    polls3,
    polls2 |>
      arrange(Pollster, MidDate) |>
      group_split(Pollster)
  ),
  tar_target(parties_ss, names(elections)),
  tar_target(
    # estimate the standard errors.  Note we are pretending they all have a sample size of 1000 -
    # which the main five do, but not some of the smaller ones.  Improvement would be to better deal with this.
    
    all_ses,
    polls2 |>
      select(Pollster, ACT:Other) |>
      pivot_longer(ACT:Other, names_to = 'Party', values_to = 'p') |>
      summarise(
        p = mean(p, na.rm = T),
        se = sqrt(p * (1 - p) / 1000),
        .by = c(Pollster, Party)
      )
  ),
  tar_target(
    ses3,
    all_ses |>
      arrange(Pollster, Party) |>
      group_split(Pollster)
  ),
  tar_target(
    d1, list(mu_elect1 = as.numeric(elections[1, ]), 
           mu_elect2 = as.numeric(elections[2, ]),
           mu_elect3 = as.numeric(elections[3, ]), 

           n_parties = length(parties_ss),
           n_weeks = weeks_between_elections, 
           # multiply the variance of all polls by 2.  See my blog post of 9 July 2017.
           inflator = sqrt(2),
           
           y1_n = nrow(polls3[[1]]),
           y1_values = polls3[[1]][ , 4:11],
           y1_weeks = as.numeric(polls3[[1]]$MidDateNumber),
           y1_se = ses3[[1]]$se,
           
           y2_n = nrow(polls3[[2]]),
           y2_values = polls3[[2]][ , 4:11],
           y2_weeks = as.numeric(polls3[[2]]$MidDateNumber),
           y2_se = ses3[[2]]$se,
           
           y3_n = nrow(polls3[[3]]),
           y3_values = polls3[[3]][ , 4:11],
           y3_weeks = as.numeric(polls3[[3]]$MidDateNumber),
           y3_se = ses3[[3]]$se,
           
           y4_n = nrow(polls3[[4]]),
           y4_values = polls3[[4]][ , 4:11],
           y4_weeks = as.numeric(polls3[[4]]$MidDateNumber),
           y4_se = ses3[[4]]$se,
           reid_method = as.numeric(polls3[[4]]$MidDate >= as.Date("2017-01-01")),
           
           n_pollsters = 4)
  )
)

#> Establish _targets.R and _targets_r/targets/prep.R.

2020 version of the model

This is a bit of a nasty copy-paste from above - it runs a version of the model for the 2020 election that we can use to check how well the model performs.

list(
  tar_target(election_weeks2020, floor_date(ymd(
    c("2011-11-26", "2014-09-20", "2017-09-23", "2020-10-17")
  ), unit = 'week')),
  tar_target(weeks_between_elections2020, as.integer(diff(election_weeks)) / 7),
  tar_target(
    elections2020,
    results |>
      filter(Election >= year(min(election_weeks2020)) & 
               Election < year(max(election_weeks2020))) |>
      mutate(
        Party = fct_other(Party, keep = parties2020$Party) |>
          fct_relevel(parties2020$Party)
      ) |>
      count(Party, Election, wt = Percentage, name = 'Percentage') |>
      arrange(Party, Election) |>
      pivot_wider(names_from = Party, values_from = Percentage, values_fill = 0) |>
      select(-Election)
  ),
  tar_target(
    polls22020,
    polls |>
      filter(Election %in% c(2014, 2017, 2020),
             Pollster %in% c(pollsters2020$Pollster),
             !is.na(MidPoint)) |>
  mutate(
    MidPoint = floor_date(MidPoint, unit = 'week'),
    Party = fct_other(Party, keep = parties2020$Party) |>
      fct_relevel(parties2020$Party)
  ) |>
  filter(Party != 'Other') |> 
  mutate(Polled = sum(VotingIntention), .by = c(Pollster, MidPoint)) |> 
  rename(MidDate = MidPoint, ElectionYear = Election) |>
  count(
    Party,
    Pollster,
    MidDate,
    ElectionYear,
    Polled,
    wt = VotingIntention,
    name = "VotingIntention"
  ) |>
  filter(MidDate <= election_weeks2020[4]) |>
  arrange(Party, MidDate) |>
  pivot_wider(
    names_from = Party,
    values_from = VotingIntention,
    names_sort = TRUE,
    values_fill = 0
  ) |>
  mutate(Other = pmax(0, 1 - Polled),
         MidDateNumber = 1 + as.numeric(MidDate - election_weeks2020[1]) / 7) |> 
  select(-Polled)

  ),
  tar_target(
    polls32020,
    polls22020 |>
      arrange(Pollster, MidDate) |>
      group_split(Pollster)
  ),
  tar_target(parties_ss2020, names(elections2020)),
  tar_target(
    # estimate the standard errors.  Note we are pretending they all have a sample size of 1000 -
    # which the main five do, but not some of the smaller ones.  Improvement would be to better deal with this.
    
    all_ses2020,
    polls22020 |>
      select(Pollster, ACT:Other) |>
      pivot_longer(ACT:Other, names_to = 'Party', values_to = 'p') |>
      summarise(
        p = mean(p, na.rm = T),
        se = sqrt(p * (1 - p) / 1000),
        .by = c(Pollster, Party)
      )
  ),
  tar_target(
    ses32020,
    all_ses2020 |>
      arrange(Pollster, Party) |>
      group_split(Pollster)
  ),
  tar_target(
    d12020, list(mu_elect1 = as.numeric(elections2020[1, ]), 
           mu_elect2 = as.numeric(elections2020[2, ]),
           mu_elect3 = as.numeric(elections2020[3, ]), 

           n_parties = length(parties_ss2020),
           n_weeks = weeks_between_elections2020, 
           # multiply the variance of all polls by 2.  See my blog post of 9 July 2017.
           inflator = sqrt(2),
           
           y1_n = nrow(polls32020[[1]]),
           y1_values = polls32020[[1]][ , 4:9],
           y1_weeks = as.numeric(polls32020[[1]]$MidDateNumber),
           y1_se = ses32020[[1]]$se,
           
           y2_n = nrow(polls32020[[2]]),
           y2_values = polls32020[[2]][ , 4:9],
           y2_weeks = as.numeric(polls32020[[2]]$MidDateNumber),
           y2_se = ses32020[[2]]$se,
           reid_method = as.numeric(polls32020[[2]]$MidDate >= as.Date("2017-01-01")),
           
           y3_n = nrow(polls32020[[3]]),
           y3_values = polls32020[[3]][ , 4:9],
           y3_weeks = as.numeric(polls32020[[3]]$MidDateNumber),
           y3_se = ses32020[[3]]$se,
           
           n_pollsters = 3)
  )
)

#> Establish _targets.R and _targets_r/targets/prep2020.R.

Run the models

Uses stantargets to run the model - see tar_stan_mcmc

list(
  tar_stan_mcmc(
    model2023,
    "stan/model2023.stan",
    dir = ".stan",
    data = d1,
    chains = 4,
    parallel_chains = 4,
    iter_sampling = 2000,
    max_treedepth = 20,
    deployment = "worker"
  ),
    tar_stan_mcmc(
    model2020,
    "stan/model2020.stan",
    dir = ".stan",
    data = d12020,
    chains = 4,
    parallel_chains = 4,
    iter_sampling = 2000,
    max_treedepth = 20,
    deployment = "worker"
  )

)
#> Establish _targets.R and _targets_r/targets/modelling.R.

Charts

Voting intention

Current charts are more or less the same as some of Peter’s charts. Future versions will export the data to create more interactive charts.

list(
  tar_target(
    voting_intention_chartdata,
    model2023_summary_model2023 |>
      filter(str_starts(variable, "mu")) |>
      mutate(
        Party = rep(parties_ss, each = sum(weeks_between_elections)),
        week = rep(1:sum(weeks_between_elections), length(parties_ss)),
        week = min(election_weeks) + weeks(week)
      )
  ),
  tar_target(
    voting_intention_chart,
    voting_intention_chartdata |>
      ggplot(aes(
        x = week,
        y = mean,
        colour = Party,
        fill = Party
      )) +
      geom_point(
        data = gather(
          polls2,
          Party,
          VotingIntention,-Pollster,-MidDate,-ElectionYear,-MidDateNumber
        ) |>
          filter(VotingIntention != 0),
        aes(x = MidDate, y = VotingIntention),
        colour = "black",
        size = 0.5
      ) +
      geom_vline(
        xintercept = as.numeric(election_dates),
        colour = "grey60"
      ) +
      scale_y_continuous(labels = percent) +
      scale_x_date(breaks = ym(
        c('2016-01', '2018-01', '2020-01', '2022-01')
      ), date_labels = '%Y') +
      scale_fill_manual(values = party_colours$Colour, breaks = party_colours$Party) +
      geom_line(
        data = voting_intention_chartdata |> filter(week <= today()),
        colour = "black"
      ) +
      geom_ribbon(aes(ymin = q5, ymax = q95), alpha = 0.3, colour = NA) +
      labs(y = NULL, x = NULL) +
      theme_clean() +
      theme(
        plot.title.position = "plot",
        legend.position = 'none',
        strip.background = element_rect(fill = '#121617'),
        strip.text = element_text(colour = 'white'),
        plot.background = element_blank()
      )
  ),
  tar_file(voting_intention620, {
    f <- "output/voting_intention620.svg"
    ggsave(
      f,
      plot = voting_intention_chart +
        facet_wrap(
          ~ factor(Party, levels = party_colours$Party),
          scales = "free",
          ncol = 2
        ),
      dpi = 100,
      width = 6.2,
      height = 8
    )
    f
  }),
  tar_file(voting_intention375, {
    f <- "output/voting_intention375.svg"
    ggsave(
      f,
      plot = voting_intention_chart +
        facet_wrap(
          ~ factor(Party, levels = party_colours$Party),
          scales = "free",
          ncol = 1
        ),
      dpi = 100,
      width = 3.75,
      height = 10
    )
    f
  })
)
#> Establish _targets.R and _targets_r/targets/charts.R.

Possible coaltion plots

Grab the simulation results for party vote for election night and the current week. Currently hard coding the current week - but will need to do something better.

Assumption is hard-coded that parties that currently have an electorate seat will keep at least one and no additional parties will pick up an electorate seat. Future work will make this more flexible.

We render an mobile screen sized plot and the desktop sized plot and load these as SVGs - slightly clunky but is actually a good way to display ggplot2 images on the web.

simulate_seats <- function(sims) {
  t(sapply(1:nrow(sims), function(i) {
    allocate_seats(
      votes = as.numeric(sims[i,]),
      electorate = c(1, 1, 1, 1, 0, 1, 0),
      parties = names(sims)
    )$seats_v
  })) |>
    as_tibble() |>
    mutate(
      NatCoal = ACT + National,
      LabGreen = Labour + Green,
      LabGreenMaori = Labour + Green + `Te Pāti Māori`,
      NatCoalMaori = NatCoal + `Te Pāti Māori`
    )
}

list(
  tar_target(
    weekly_mu,
    model2023_mcmc_model2023$draws('mu', format = 'draws_matrix')
  ),
  tar_target(
    sims_election_night,
    weekly_mu[, 1:8 * 473] |> as_tibble() |> set_names(parties_ss) |>
      select(all_of(sort(parties_ss))) |>
      select(-Other) 
  ),
  tar_target(
    sims_saturday,
    weekly_mu[, 1:8 * 473 - 24] |> as_tibble() |> set_names(parties_ss) |>
      select(all_of(sort(parties_ss))) |>
      select(-Other) 
  ),
  tar_target(seats_election_night, simulate_seats(sims_election_night)),
  tar_target(seats_saturday, simulate_seats(sims_saturday))
)
#> Establish _targets.R and _targets_r/targets/seats.R.

coalition_plot <- function(seats) {
  seats  |>
    select(National,
           Labour,
           NatCoal,
           LabGreen,
           LabGreenMaori,
           NatCoalMaori) |>
    gather(Coalition, Seats) |>
    mutate(lab_in = ifelse(grepl("^Lab", Coalition), "Labour-based", "Nationals-based")) |>
    mutate(
      Coalition = gsub("^LabGreen", "Labour, Greens", Coalition),
      Coalition = gsub("^NatCoal", "National, ACT", Coalition),
      Coalition = gsub("Maori", ",\nTe Pāti Māori", Coalition)
    ) |>
    ggplot(aes(x = Seats, fill = lab_in)) +
    geom_histogram(
      alpha = 0.5,
      binwidth = 1,
      position = "identity",
      colour = NA
    )  +
    scale_y_continuous() +
    scale_fill_manual(values = c('#d82c20', '#065BAA')) +
    theme_clean() +
    theme(
      legend.position = 'none',
      strip.background = element_rect(fill = '#121617'),
      strip.text = element_text(colour = 'white'),
      plot.background = element_blank()
    ) +
    labs(y = NULL) +
    annotate(
      'segment',
      x = 61,
      xend = 61,
      y = 0,
      yend = Inf
    )
}

list(
  tar_target(election_night_plot, coalition_plot(seats_election_night)),
  tar_target(saturday_plot, coalition_plot(seats_saturday)),
  tar_file(election_night620, {
    f <- "output/election_night620.svg"
    ggsave(
      f,
      plot = election_night_plot +
        facet_wrap(~ factor(
          Coalition,
          levels = c(
            "Labour",
            "National",
            "Labour, Greens",
            "National, ACT",
            "Labour, Greens,\nTe Pāti Māori",
            "National, ACT,\nTe Pāti Māori"
          )
        ), ncol = 2),
      dpi = 100,
      width = 6.2,
      height = 4
    )
    f
  }),
  tar_file(election_night375, {
    f <- "output/election_night375.svg"
    ggsave(
      f,
      plot = election_night_plot +
        facet_wrap(~ Coalition,
                   ncol = 1),
      dpi = 100,
      width = 3.75,
      height = 7
    )
    f
  }),
  tar_file(saturday620, {
    f <- "output/saturday620.svg"
    ggsave(
      f,
      plot = saturday_plot +
        facet_wrap(~ factor(
          Coalition,
          levels = c(
            "Labour",
            "National",
            "Labour, Greens",
            "National, ACT",
            "Labour, Greens,\nTe Pāti Māori",
            "National, ACT,\nTe Pāti Māori"
          )
        ), ncol = 2),
      dpi = 100,
      width = 6.2,
      height = 4
    )
    f
  }),
  tar_file(saturday375, {
    f <- "output/saturday375.svg"
    ggsave(
      f,
      plot = saturday_plot +
        facet_wrap(~ Coalition,
                   ncol = 1),
      dpi = 100,
      width = 3.75,
      height = 7
    )
    f
  })
)
#> Establish _targets.R and _targets_r/targets/coalition-plot.R.