Skip to content

Instantly share code, notes, and snippets.

@lemurey
Forked from elliottmorris/election_night_live_model.R
Last active January 23, 2023 03:15
Show Gist options
  • Save lemurey/9c4dfb7fc28b64952ee48c89b73f3238 to your computer and use it in GitHub Desktop.
Save lemurey/9c4dfb7fc28b64952ee48c89b73f3238 to your computer and use it in GitHub Desktop.
A live election-night prediction model using The Economist's pre-election forecast
import pandas as pd
import numpy as np
import ipywidgets as widgets
RNG = np.random.default_rng()
def logit(x):
return np.log(x / (1 - x))
def inv_logit(x):
return 1 / (1 + np.exp(-x))
class SimRunner():
def __init__(self, target_nsim=1000, max_runs=20, rng=RNG):
self.target_nsim = target_nsim
self.max_runs = max_runs
self.rng = RNG
self.biden_states = set()
self.trump_states = set()
self.biden_state_dict = {}
self._initialize_data()
self._initialize_controls()
self.fstr = 'pr(biden wins the electoral college) = {:.0%}\n[nsim = {}; se = {:.1%}]'
self.display = widgets.HBox([self.button_container, self.display_area])
def _initialize_controls(self):
self.buttons = []
self.sliders = []
for st in self.ev.index:
b = widgets.ToggleButtons(options=['Biden', 'Trump', 'N/A'],
value='N/A',
description=st)
s = widgets.FloatRangeSlider(value=[0, 1], min=0, max=1, step=0.01,
description='', continuous_update=False)
s._state_label = st
if st in self.hides:
b.layout.display = 'none'
s.layout.display = 'none'
b.observe(self._state_button_press, names=['index'])
s.observe(self._slider_change, names=['value'])
self.buttons.append(b)
self.sliders.append(s)
self.button_container = widgets.VBox([widgets.HBox([b, s]) for b, s in zip(self.buttons, self.sliders)])
self.display_area = widgets.Output(layout=widgets.Layout(display="flex", justify_content="flex-start"))
def _initialize_data(self):
s = pd.read_csv('https://cdn.economistdatateam.com/us-2020-forecast/data/president/electoral_college_simulations.csv')
ev_state = pd.read_csv('https://raw.githubusercontent.com/TheEconomist/us-potus-model/master/data/2012.csv',
index_col='state')['ev']
sim = s.iloc[:, 3:]
sim = pd.concat([sim] * 5).reset_index(drop=True)
national = self.rng.normal(0, 0.01, size=(sim.shape[0], 1))
state = RNG.normal(0, 0.02, size=sim.shape)
sim_forecast = sim + national + state
sim_forecast[sim_forecast <= 0] = 0.0001
sim_forecast[sim_forecast >= 1] = 0.9999
self.sim_forecast = sim_forecast
self.ev = ev_state[sim_forecast.columns]
self._run_me_ne_updates()
self.sim_forecast = self.sim_forecast.sort_index(axis=1)
self.ev = self.ev[self.sim_forecast.columns]
self.ss = np.cov(logit(self.sim_forecast), rowvar=False)
self.sigma = pd.DataFrame(self.ss, index=self.sim_forecast.columns,
columns=self.sim_forecast.columns)
self.mu = pd.Series(logit(self.sim_forecast).mean(), index=self.sim_forecast.columns)
_, _, _, p = self.update_prob()
self.hides = p[p > 0.99].index.union(p[p < 0.01].index)
def _run_me_ne_updates(self):
self.ev['ME'] = 2
self.ev['NE'] = 2
self.ev['ME1'] = 1
self.ev['ME2'] = 1
self.ev['NE1'] = 1
self.ev['NE2'] = 1
self.ev['NE3'] = 1
me_ne_leans = pd.DataFrame({'state': ['ME', 'ME', 'NE', 'NE', 'NE'],
'district': [1, 2, 1, 2, 3],
'dem_cd_lean': [0.053745445221462296,
-0.060452990625671894,
0.023745883670417745,
0.10881583949611655,
-0.13915995886163904]})
for _, row in me_ne_leans.iterrows():
name = '{}{}'.format(row['state'], row['district'])
vals = self.rng.normal(row['dem_cd_lean'], 0.0075, size=self.sim_forecast.shape[0])
self.sim_forecast.loc[:, name] = self.sim_forecast[row['state']] + vals
def _state_button_press(self, change):
# self.display.layout.display = 'none'
if change['name'] in ('_property_lock', 'label', 'value'):
return
b = change['owner']
st = b.get_state()
index = st['index']
label = st['description']
if '-' in label:
label, _ = label.split('-')
if index == 0:
if label in self.trump_states:
self.trump_states.remove(label)
self.biden_states.add(label)
elif index == 1:
if label in self.biden_states:
self.biden_states.remove(label)
self.trump_states.add(label)
else:
if label in self.trump_states:
self.trump_states.remove(label)
if label in self.biden_states:
self.biden_states.remove(label)
self._update_display()
# self.display.layout.display = None
def _slider_change(self, change):
low, high = change['new']
state = change['owner']._state_label
if low == 0 and high == 1:
if state in self.biden_state_dict:
del self.biden_state_dict[state]
else:
self.biden_state_dict[state] = (low, high)
self._update_display()
def _update_display(self):
p, ev_dist, sd, state_win = self.update_prob()
with self.display_area:
self.display_area.clear_output()
print(self.fstr.format(p, ev_dist.shape[0], sd))
for child in self.buttons:
desc = child.description
if '-' in desc:
state, _ = desc.split('-')
else:
state = desc
val = state_win[state]
child.description = '{}-{: >10.0%}'.format(state, val)
def draw_samples(self, early_return):
output = pd.DataFrame()
num_runs = 0
while True:
if output.shape[0] > self.target_nsim:
break
num_runs += 1
sim = inv_logit(self.rng.multivariate_normal(self.mu, self.sigma, size=(100000)))
proposals = pd.DataFrame(sim, columns=self.mu.index)
if early_return:
return proposals, None
for state in self.biden_states:
proposals.loc[proposals[state] < 0.5, state] = np.nan
for state in self.trump_states:
proposals.loc[proposals[state] > 0.5, state] = np.nan
for state, (biden_low, biden_high) in self.biden_state_dict.items():
if biden_low > 1:
biden_low = biden_low / 100
if biden_high > 1:
biden_high = biden_high / 100
inds = (proposals[state] > biden_high) | (proposals[state] < biden_low)
proposals.loc[inds, state] = np.nan
output = pd.concat((output, proposals.dropna(how='any')), ignore_index=True)
if (output.shape[0] < self.target_nsim and
output.shape[0] / (proposals.shape[0] * num_runs) < (1 - 99.99 / 100)):
print('more than 99.88% of simulations are rejected, try relaxing some constraints and try again')
return None, None
if num_runs > self.max_runs:
print(f'more than {max_runs} runs, aborting')
return None, None
return output, output.shape[0] / (proposals.shape[0] * num_runs)
def update_prob(self, early_return=False, show_local=False):
sim, acceptance_rate = self.draw_samples(early_return=early_return)
if early_return:
return None, None, None, sim
ev_dist = ((sim > 0.5) * self.ev).sum(axis=1)
state_win = (sim > 0.5).mean()
p = (ev_dist > 269).mean()
sd = np.sqrt(p * (1 - p) / ev_dist.shape[0])
if show_local:
print('Pr(biden wins) by state:')
for st in state_win.index:
print('{}: {:.0%}'.format(st, state_win[st]))
# print(state_win)
print(self.fstr.format(p, ev_dist.shape[0], sd))
return
return p, ev_dist, sd, state_win
sr = SimRunner()
##### can ignore below if transfering to notebook
sr.update_prob(show_local=True)
sr.biden_states.add('FL')
sr.update_prob(show_local=True)
sr.biden_states.remove('FL')
sr.trump_states.add('FL')
sr.update_prob(show_local=True)
sr.trump_states.add('NC')
sr.biden_states.add('VA')
sr.biden_state_dict['MI'] = (45, 55)
sr.update_prob(show_local=True)
#' Description
#' This file runs a live election-night forecast based on The Economist's pre-election forecasting model
#' available at projects.economist.com/us-2020-forecast/president.
#' It is resampling model based on https://pkremp.github.io/update_prob.html.
#' This script does not input any real election results! You will have to enter your picks/constraints manually (scroll to the bottom of the script).
#'
#' Licence
#' This software is published by *[The Economist](https://www.economist.com)* under the [MIT licence](https://opensource.org/licenses/MIT). The data generated by *The Economist* are available under the [Creative Commons Attribution 4.0 International License](https://creativecommons.org/licenses/by/4.0/).
#' The licences include only the data and the software authored by *The Economist*, and do not cover any *Economist* content or third-party data or content made available using the software. More information about licensing, syndication and the copyright of *Economist* content can be found [here](https://www.economist.com/rights/).
rm(list=ls())
library(tidyverse)
library(mvtnorm)
# functions first ---------------------------------------------------------
logit <- function(x) log(x/(1-x))
inv_logit <- function(x) 1/(1 + exp(-x))
draw_samples <- function(biden_states = NULL, trump_states = NULL, states = NULL,
upper_biden = NULL, lower_biden = NULL, print_acceptance = FALSE, target_nsim = 1000){
sim <- matrix(NA, nr = 1, nc = length(mu))
n <- 0
while(nrow(sim) < target_nsim){
# randomly sample from the posterior distribution and reject when constraints are not met
n <- n + 1
proposals <- inv_logit(rmvnorm(1e5, mu, Sigma, method = "svd")) # "DC" is pretty much uncorrelated
colnames(proposals) <- names(mu)
if (!is.null(biden_states)) proposals[which(proposals[,biden_states] < .5)] <- NA
if (!is.null( trump_states)) proposals[which(proposals[, trump_states] > .5)] <- NA
if (!is.null( states)){
for (s in states){
proposals[which(proposals[, s] > upper_biden[s] |
proposals[, s] < lower_biden[s])] <- NA
}
}
reject <- apply(proposals, 1, function(x) any(is.na(x)))
sim <- rbind(sim, proposals[!reject,])
if (nrow(sim) < target_nsim & nrow(sim)/(nrow(proposals)*n) < 1-99.99/100){
stop(paste("rmvnorm() is working hard... but more than 99.99% of the samples are rejected; you should relax some contraints.", sep = ""))
}
}
return(list("matrix" = sim[-1,], "acceptance_rate" = nrow(sim)/(nrow(proposals)*n)))
}
update_prob <- function(biden_states = NULL, trump_states = NULL, biden_scores_list = NULL, target_nsim = 1000, show_all_states = TRUE){
states <- names(biden_scores_list)
lower_biden <- sapply(biden_scores_list, function(x) x[1]/100)
upper_biden <- sapply(biden_scores_list, function(x) x[2]/100)
sim <- draw_samples(biden_states = biden_states, trump_states = trump_states, states = states,
upper_biden = upper_biden, lower_biden = lower_biden,
target_nsim = target_nsim)
ev_dist <- (sim[["matrix"]] > .5) %*% ev
state_win <- colMeans(sim[["matrix"]] > .5)
p <- mean(ev_dist >= 270)
sd <- sqrt(p*(1-p)/length(ev_dist))
if (show_all_states){
cat("Pr(biden wins) by state, in %:\n")
print(t(round(100*state_win)))
cat("--------\n")
}
cat(paste("Pr(biden wins the electoral college) = ", round(100*p), "%\n[nsim = ", length(ev_dist), "; se = ", round(sd*100,1), "%]", sep = ""))
if (show_all_states) cat("\n--------\n")
}
# read data ---------------------------------------------------------------
# get simulations
sim_forecast <- read_csv('https://cdn.economistdatateam.com/us-2020-forecast/data/president/electoral_college_simulations.csv')
# check initial parameters
nrow(sim_forecast)
mean(sim_forecast$dem_ev > 269)
# select relevant columns and make the data frae into a matrix
sim_forecast <- sim_forecast %>% select(4:ncol(.))
sim_forecast <- list(sim_forecast,sim_forecast,sim_forecast,sim_forecast,sim_forecast) %>% bind_rows %>% as.matrix
# this bit is really hacky
# it make the simulations a little less correlated by add a bunch of random noise
# this helps our live model react to really implausible events that could happen on election night
# but will have the result of pushing the initial pre-results forecasts back toward 50-50
sim_forecast <- sim_forecast +
rnorm(nrow(sim_forecast), 0, 0.01) + # national error component
replicate(ncol(sim_forecast),rnorm(nrow(sim_forecast),0,0.02)) # state
sim_forecast <- ifelse(sim_forecast <= 0, 0.0001, sim_forecast)
sim_forecast <- ifelse(sim_forecast >= 1, 0.99999, sim_forecast)
sim_forecast <- as_tibble(sim_forecast)
# now, get electoral votes in each state
# and make sure they're in the right order
#ev_state <- read_csv('data/state_evs.csv')$ev
#names(ev_state) <- read_csv('data/state_evs.csv')$state
ev_state <- read_csv('https://raw.githubusercontent.com/TheEconomist/us-potus-model/master/data/2012.csv')$ev
names(ev_state) <- read_csv('https://raw.githubusercontent.com/TheEconomist/us-potus-model/master/data/2012.csv')$state
ev <- ev_state[colnames(sim_forecast)]
# check that the EVs and state forecasts add up to the right amounts
sim_evs <- apply(sim_forecast,
1,
function(x){
sum((x > 0.5 )* ev)
})
hist(sim_evs,breaks=538)
median(sim_evs)
mean(sim_evs)
mean(sim_evs > 269)
enframe(prop.table(table(sim_evs)),'dem_ev','pct') %>% arrange(desc(pct))
# adding ME1 and ME2, NE1 NE2 to sim_forecast matrix and ev vector
# we do this by adding the average district-level two-party dem presidential vote, relative
# to the state-level dem two-party vote, back to to our state-level forecast
# first, split up EVs
ev["ME"] <- 2
ev["NE"] <- 2
ev <- c(ev, "ME1" = 1, "ME2" = 1, "NE1" = 1, "NE2" = 1, "NE3" = 1)
sum(ev)
# create simulations for ME and NE districts
me_ne_leans <- politicaldata::pres_results_by_cd %>% filter(year >= 2012, state_abb %in% c("ME","NE")) %>%
select(-other) %>%
rename(state = state_abb) %>%
group_by(year,state) %>%
mutate(sum_pct = dem + rep,
total_votes = total_votes * sum_pct,
dem = dem /sum_pct,
rep = rep/sum_pct) %>%
mutate(dem_vote_state = sum(dem * total_votes) / sum(total_votes)) %>%
mutate(dem_cd_lean = dem - dem_vote_state) %>%
group_by(state,district) %>%
summarise(dem_cd_lean = weighted.mean(dem_cd_lean, c(0.3,0.7)))
# bind new simulation columns for the congressional districts, based on the above
sim_forecast <- bind_cols(
sim_forecast,
tibble(
ME1 = sim_forecast %>% pull(ME) +
rnorm(nrow(sim_forecast),
me_ne_leans[me_ne_leans$state == "ME" & me_ne_leans$district == 1,]$dem_cd_lean,
.0075),
ME2 = sim_forecast %>% pull(ME) +
rnorm(nrow(sim_forecast),
me_ne_leans[me_ne_leans$state == "ME" & me_ne_leans$district == 2,]$dem_cd_lean,
.0075),
NE1 = sim_forecast %>% pull(NE) +
rnorm(nrow(sim_forecast),
me_ne_leans[me_ne_leans$state == "NE" & me_ne_leans$district == 1,]$dem_cd_lean,
.0075),
NE2 = sim_forecast %>% pull(NE) +
rnorm(nrow(sim_forecast),
me_ne_leans[me_ne_leans$state == "NE" & me_ne_leans$district == 2,]$dem_cd_lean,
.0075),
NE3 = sim_forecast %>% pull(NE) +
rnorm(nrow(sim_forecast),
me_ne_leans[me_ne_leans$state == "NE" & me_ne_leans$district == 3,]$dem_cd_lean,
.0075)
)
)
sim_forecast
sim_evs <- apply(sim_forecast,
1,
function(x){
sum((x > 0.5 )* ev)
})
colMeans(sim_forecast > 0.5)
hist(sim_evs,breaks=538)
median(sim_evs)
mean(sim_evs)
mean(sim_evs > 269)
enframe(prop.table(table(sim_evs)),'dem_ev','pct') %>% arrange(desc(pct))
# final data wrangling -- this stuff getes passed into the update_prob function
# mainly we just want to make sure everything is in the right order with the right names
ev <- ev[colnames(sim_forecast)]
Sigma <- cov(logit(sim_forecast))
Sigma
mu <- colMeans(logit(sim_forecast))
names(mu) <- colnames(sim_forecast)
# run ---------------------------------------------------------------------
# for example...
# raw prob, no constraints
update_prob(biden_states = NULL,
trump_states = NULL,
biden_scores_list = NULL)
# biden wins florida
update_prob(biden_states = c("FL"),
trump_states = NULL,
biden_scores_list = NULL)
# trump wins florida
update_prob(biden_states = NULL,
trump_states = c("FL"),
biden_scores_list = NULL)
# trump wins fl and nc, biden wins va, biden margin in MI can't be outside -10 or 10
update_prob(biden_states = c("VA"),
trump_states = c("FL","NC"),
biden_scores_list = list("MI" = c(45,55)))
# now it's your turn....
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment