In [2]:
import pandas as pd
import numpy as np
from IPython.display import display, HTML, Image
In [3]:
# helper functions
def left_of_bracket(s):
    """Return the text of ``s`` that comes before its first ``'('``.

    The returned prefix has surrounding whitespace stripped, so
    ``"Braddon (Bass)"`` and ``"Braddon(Bass)"`` both give ``"Braddon"``.
    A string that contains no ``'('`` is returned unchanged (not stripped).

    Args:
        s: the string to trim.

    Returns:
        The stripped prefix before the first ``'('``, or ``s`` itself when
        there is no bracket.
    """
    needle = s.find('(')
    if needle == -1:
        return s
    # Slice up to (not one short of) the bracket: strip() already removes any
    # separating space, and slicing to needle-1 would eat a real character
    # when there is no space, or wrap around to -1 when '(' is at index 0.
    return s[:needle].strip()
In [4]:
# Read the referendum results CSV (by electorate and polling place, going by
# its filename) and preview the first three rows.
filepath = '1999_referenda_output/republic_referendum_by_electorate_by_polling_place.csv'
df_results = pd.read_csv(filepath)
display(df_results.head(3))
In [5]:
# Read the referendum results CSV (by polling place, going by its filename)
# and preview the first three rows.
filepath = '1999_referenda_output/republic_referendum_by_polling_place.csv'
df_results_by_pp = pd.read_csv(filepath)
display(df_results_by_pp.head(3))
In [6]:
# Read the geocoded polling places CSV and preview the first three rows.
filepath = '1999_referenda_output/polling_places_geocoded.csv'
df_pp = pd.read_csv(filepath)
display(df_pp.head(3))
In [7]:
# Sum yes and formal vote counts per state, derive the yes share (4 d.p.),
# and show the states from highest to lowest yes share.
r = df_results.groupby('state')[['yes_n', 'formal_n']].sum()
r['yes_p'] = (r['yes_n'] / r['formal_n']).round(4)
display(r.sort_values(by='yes_p', ascending=False))
In [8]:
# Sum yes and formal vote counts per electorate, derive the yes share (4 d.p.),
# and show the five electorates with the highest yes share.
r = df_results.groupby('electorate')[['yes_n', 'formal_n']].sum()
r['yes_p'] = (r['yes_n'] / r['formal_n']).round(4)
display(r.sort_values(by='yes_p', ascending=False).head(5))
In [9]:
# Same per-electorate aggregation as the previous cell, this time showing the
# five electorates with the lowest yes share.
r = df_results.groupby('electorate')[['yes_n', 'formal_n']].sum()
r['yes_p'] = (r['yes_n'] / r['formal_n']).round(4)
display(r.sort_values(by='yes_p', ascending=True).head(5))
In [10]:
# import geographic size of seats
# (the file's own header row is skipped and replaced with the names below)
filepath = '1999_referenda/electorate_boundaries/boundaries_republic_referendum_aus.csv'
df_area = pd.read_csv(
    filepath,
    skiprows=1,
    names=['electorate', 'area_sqkm']
)

# make df grouped by electorate
df_by_electorate = df_results[['electorate', 'yes_n', 'formal_n']].groupby('electorate').sum()
# Derive the yes share from this frame's own totals. Reading it from the
# scratch variable `r` only works if the last cell to assign `r` happened to
# group by electorate; after the by-state cell it would align on the wrong
# index and produce NaN.
df_by_electorate['yes_p'] = round(df_by_electorate['yes_n'] / df_by_electorate['formal_n'], 4)
df_by_electorate = df_by_electorate.reset_index()

# merge in area (left join: electorates without a matching area row are kept)
df_by_electorate = pd.merge(df_by_electorate, df_area, on='electorate', how='left')
display(df_by_electorate.head(5))
In [11]:
# Plotly setup for the charts below.
# NOTE(review): the star import is what brings init_notebook_mode into scope
# for the call at the end of this cell; consider importing it by name.
from plotly.offline import *
import plotly.offline as py
# NOTE(review): pyonline is only referenced from commented-out ishow() lines
# in this notebook - confirm it is still needed.
import plotly.plotly as pyonline
import plotly.graph_objs as go
init_notebook_mode(connected=True) # render plotly charts in the notebook on the fly
In [12]:
# Scatter trace: yes share (y) against electorate area (x), one marker per
# electorate, with the electorate name as hover text.
series = go.Scatter(
    x=df_by_electorate['area_sqkm'],
    y=df_by_electorate['yes_p'],
    text=df_by_electorate['electorate'],
    name='% Yes',
    mode='markers',
    marker={'size': 10, 'opacity': 0.6},
)

# Axis styling; the y axis is shown as whole percentages over a fixed range.
xaxis = {
    'title': 'Size of Electorate, SqKm',
    'titlefont': {'family': 'Open Sans', 'size': 16},
}
yaxis = {
    'title': '% Support',
    'titlefont': {'family': 'Open Sans', 'size': 16},
    'tickformat': ',.0%',
    'range': [.2, .8],
}

# Chart title and its font.
title = '1999 Republic Referendum - % Support vs. Size of Electorate'
titlefont = {'family': 'Open Sans', 'size': 22}

# Assemble the figure; it is rendered in a later cell.
layout = go.Layout(title=title, titlefont=titlefont, xaxis=xaxis, yaxis=yaxis)
data = [series]
figure01 = go.Figure(data=data, layout=layout)
In [13]:
# Render figure01 in the notebook through plotly.offline.
py.iplot(
    figure01,
    filename='figure01',
)
#pyonline.image.ishow(figure01, width=1500, height=750)
In [14]:
# Same scatter as figure01 but with a logarithmic x axis. `data`, `yaxis` and
# `titlefont` are not redefined here: they are reused from an earlier cell.
xaxis = {
    'title': 'Log of size of Electorate, SqKm',
    'titlefont': {'family': 'Open Sans', 'size': 16},
    'type': 'log',
}

title = '1999 Republic Referendum - % Support vs. log(size) of Electorate'

layout = go.Layout(title=title, titlefont=titlefont, xaxis=xaxis, yaxis=yaxis)
figure02 = go.Figure(data=data, layout=layout)
There appears to be a relatively strong relationship between % support for the republic and size of electorate.
Smaller electorates by area (i.e., more densely populated inner-urban electorates) are more likely to support the republic.
A notable outlier is the Northern Territory: although it is the second-largest electorate by area, it still had 49% support.
In [31]:
# Render figure02 in the notebook through plotly.offline.
py.iplot(
    figure02,
    filename='figure02',
)
#pyonline.image.ishow(figure02, width=1500, height=750)
In [16]:
import math
import statsmodels.formula.api as sm
# Add the natural log of each electorate's area as a new column.
df_by_electorate['area_sqkm_log'] = df_by_electorate['area_sqkm'].map(math.log)

# Fit an ordinary least squares model and show its summary table.
# NOTE(review): the formula models log(area) as a function of yes_p, while the
# surrounding narrative treats support as the outcome - confirm the intended
# direction.
model = sm.ols(formula="area_sqkm_log ~ yes_p", data=df_by_electorate)
result = model.fit()
display(result.summary())
In [17]:
# Scatter trace: yes share (y) against total votes (x), one marker per polling
# place, with the polling place name as hover text.
series = go.Scatter(
    x=df_results_by_pp['total_n'],
    y=df_results_by_pp['yes_p'],
    text=df_results_by_pp['polling_place'],
    name='% Yes',
    mode='markers',
    marker={'size': 10, 'opacity': 0.6},
)

# Axis styling; the y axis is shown as whole percentages.
xaxis = {
    'title': 'Number of Votes',
    'titlefont': {'family': 'Open Sans', 'size': 16},
}
yaxis = {
    'title': '% Support',
    'titlefont': {'family': 'Open Sans', 'size': 16},
    'tickformat': ',.0%',
}

# Chart title and its font.
title = '1999 Republic Referendum - % Support vs. Number of Votes by polling place'
titlefont = {'family': 'Open Sans', 'size': 22}

# Assemble the figure; it is rendered in a later cell.
layout = go.Layout(title=title, titlefont=titlefont, xaxis=xaxis, yaxis=yaxis)
data = [series]
figure03 = go.Figure(data=data, layout=layout)
It's exceptionally noisy, but there is some relationship between % support for the republic and size of polling place.
In [18]:
# Render figure03 in the notebook through plotly.offline.
py.iplot(
    figure03,
    filename='figure03',
)
#pyonline.image.ishow(figure03, width=1500, height=750)
In [19]:
import fiona
import geopandas as gp
from shapely.geometry import Point
%matplotlib inline
# Give every 1999 polling place a shapely Point built as (longitude, latitude),
# then wrap the frame as a GeoDataFrame and preview it.
df_pp['geometry'] = df_pp.apply(
    lambda row: Point(row.longitude, row.latitude),
    axis=1,
)
df_pp_geom = gp.GeoDataFrame(df_pp)
df_pp_geom.head(3)
Out[19]:
In [20]:
# import
filepath = 'federal_election_polling_places/pp_2016_election.csv'
df_pp_2016 = pd.read_csv(filepath)

# check
display(df_pp_2016.head(3))

# just ordinary polling places (PollingPlaceTypeID == 1)
# .copy() makes the filtered frame independent of the raw one, so the column
# assignment below is not a write to a slice (SettingWithCopyWarning).
df_pp_2016 = df_pp_2016[df_pp_2016['PollingPlaceTypeID'] == 1].copy()

# create a polling place column (without seat in the name)
lambda_polling_places = left_of_bracket
df_pp_2016['polling_place'] = df_pp_2016['PollingPlaceNm'].apply(lambda_polling_places)

# filter for relevant columns
df_pp_2016 = df_pp_2016[[
    'State',
    'polling_place',
    'Latitude',
    'Longitude'
]]

# make headers lower case
df_pp_2016.columns = [x.lower() for x in df_pp_2016.columns]

# de dup
# The index is reset BEFORE duplicates are dropped, so surviving rows keep
# their position in the filtered frame as their label. That label is written
# out as the first CSV column below and read back elsewhere in this notebook
# as 'pp_2016_index', so the order of these two steps must not change.
df_pp_2016 = df_pp_2016.reset_index(drop=True)
df_pp_2016 = df_pp_2016.drop_duplicates()

# test - is there only one braddon?
display(df_pp_2016[df_pp_2016['polling_place']=="Braddon"])

# export to csv (index included)
df_pp_2016.to_csv(
    'federal_election_polling_places/pp_2016_election_ordinary.csv'
)
In [35]:
# Give every 2016 polling place a shapely Point built as (longitude, latitude),
# then wrap the frame as a GeoDataFrame and preview it.
df_pp_2016['geometry'] = df_pp_2016.apply(
    lambda row: Point(row.longitude, row.latitude),
    axis=1,
)
df_pp_2016_geom = gp.GeoDataFrame(df_pp_2016)
df_pp_2016_geom.head(5)
Out[35]:
In [39]:
import geopandas as gpd
from shapely.geometry import Point
from shapely.ops import nearest_points
# makes geometry points for each pp
# (all 2016 polling place geometries combined into a single geometry)
pts = df_pp_2016_geom.geometry.unary_union

# for a given point return nearest polling place
def near(point, polling_places=pts):
    """Return the ``df_pp_2016_geom`` index label of the polling place nearest ``point``.

    Args:
        point: geometry to search from.
        polling_places: geometry to search in; defaults to the union of all
            2016 polling place geometries, captured when this function is
            defined.

    Returns:
        The index label of the first row of ``df_pp_2016_geom`` whose geometry
        equals the nearest point.

    Raises:
        IndexError: if no row's geometry equals the nearest point.
    """
    # nearest_points returns one point per input geometry, in input order;
    # element [1] is therefore the point taken from `polling_places`.
    nearest_geom = nearest_points(point, polling_places)[1]
    # boolean mask of the rows whose geometry is that nearest point
    is_nearest = df_pp_2016_geom.geometry == nearest_geom
    # return the index col of pp_2016
    # .index[0] yields the same label as the former .index.get_values()[0];
    # Index.get_values() no longer exists in pandas >= 1.0.
    return df_pp_2016_geom[is_nearest].index[0]
# test run, limit dataset
# .copy() makes the 10-row sample an independent frame, so adding a column
# below is not a write to a slice of the full frame (SettingWithCopyWarning).
# Note this rebinds df_pp_geom: after this cell it holds only the sample.
df_pp_geom = df_pp_geom.head(10).copy()

# run 'near' into a new column on the 1999 data frame
df_pp_geom['pp_2016_index'] = df_pp_geom.apply(lambda row: near(row.geometry), axis=1)
display(df_pp_geom.head(3))
In [38]:
# output to csv - commented out to prevent exporting by accident - the above code takes a while to run
# and the code above contains the line 'df_pp_geom = df_pp_geom.head(10)' so it can output a demo without rerunning
# df_pp_geom.to_csv('1999_referenda_output/polling_places_with_nearest_2016_polling_place.csv',index=False)
In [22]:
# Load the saved mapping from 1999 polling places to their nearest 2016
# polling place and preview the first three rows.
filepath = '1999_referenda_output/polling_places_with_nearest_2016_polling_place.csv'
df_pp_1999_nearest_2016 = pd.read_csv(filepath)
df_pp_1999_nearest_2016.head(3)
Out[22]:
In [23]:
# import swing data
filepath = '2016_federal_election_data/two_party_preferred_by_polling_place_2016.csv'
df_sw_2016 = pd.read_csv(filepath)
display(df_sw_2016.head(3))

# tidy up 2016 swing data
# make headers lower case
df_sw_2016.columns = [x.lower() for x in df_sw_2016.columns]

# make a seat-independent polling place column
lambda_polling_places = left_of_bracket
df_sw_2016['polling_place'] = df_sw_2016['pollingplace'].apply(lambda_polling_places)

# create a label for state
df_sw_2016['state'] = df_sw_2016['stateab']

# create a column for alp vote
df_sw_2016['alp_n'] = df_sw_2016['australian labor party votes']

# filter for relevant columns
# .copy() makes the column subset an independent frame, so the assignments
# below are not writes to a slice (SettingWithCopyWarning).
df_sw_2016 = df_sw_2016[[
    'state',
    'polling_place',
    'swing',
    'alp_n',
    'totalvotes'
]].copy()

# convert swing from percentage points to a fraction
df_sw_2016['swing'] = df_sw_2016['swing']/100

# before merge
print("Lets use Braddon as an example of a joint booth")
print("Before Merge:")
display(df_sw_2016[df_sw_2016['polling_place']=="Braddon"])

# make a weighted swing column (swing x votes) so that rows sharing a
# state/polling_place can be combined as a vote-weighted average
df_sw_2016['weight'] = df_sw_2016['swing'] * df_sw_2016['totalvotes']
print("With weight:")
display(df_sw_2016[df_sw_2016['polling_place']=="Braddon"])

# the raw swing must not be summed, so drop it before aggregating
del df_sw_2016['swing']
df_sw_2016 = df_sw_2016.groupby(['state','polling_place']).agg('sum')
df_sw_2016 = df_sw_2016.reset_index()

# vote-weighted swing = sum(swing x votes) / sum(votes)
df_sw_2016['swing'] = df_sw_2016['weight']/df_sw_2016['totalvotes']
print("Merged with weight:")
display(df_sw_2016[df_sw_2016['polling_place']=="Braddon"])
del df_sw_2016['weight']

# alp share of total votes
df_sw_2016['alp_p'] = df_sw_2016['alp_n'] / df_sw_2016['totalvotes']

# after merge
print("Final:")
display(df_sw_2016[df_sw_2016['polling_place']=="Braddon"])
display(df_sw_2016.head(5))
In [24]:
# Re-read the exported 2016 polling places; the first CSV column is named
# 'pp_2016_index' and the file's own header row is skipped.
filepath = 'federal_election_polling_places/pp_2016_election_ordinary.csv'
df_pp_2016_with_ids = pd.read_csv(
    filepath,
    skiprows=1,
    names=['pp_2016_index', 'state', 'polling_place', 'latitude', 'longitude'],
)

# Attach pp_2016_index to the swing rows (inner join on state + polling place).
df_sw_2016 = df_sw_2016.merge(
    df_pp_2016_with_ids,
    how='inner',
    on=['state', 'polling_place'],
)
display(df_sw_2016.head(3))

# Drop the join keys and coordinates so only the id and the swing figures remain.
for unwanted in ('state', 'polling_place', 'latitude', 'longitude'):
    del df_sw_2016[unwanted]
display(df_sw_2016.head(3))

# Bring the 2016 figures onto the 1999 polling places via the nearest-place id
# (left join: 1999 rows without a match are kept).
df_pp_1999_nearest_2016 = df_pp_1999_nearest_2016.merge(
    df_sw_2016,
    how='left',
    on=['pp_2016_index'],
)
display(df_pp_1999_nearest_2016.head(3))
In [25]:
# Add the 1999 results per polling place to the 1999/2016 frame
# (left join on state + polling place) and preview the result.
df_1999_v_2016 = df_pp_1999_nearest_2016.merge(
    df_results_by_pp,
    how='left',
    on=['state', 'polling_place'],
)
display(df_1999_v_2016.head(3))
In [26]:
# Scatter trace: 2016 swing (y) against 1999 yes share (x), one marker per row
# of df_1999_v_2016, with the polling place name as hover text.
series = go.Scatter(
    x=df_1999_v_2016['yes_p'],
    y=df_1999_v_2016['swing'],
    text=df_1999_v_2016['polling_place'],
    name='2016 Federal Election Swing v. Republic Referendum Yes Vote',
    mode='markers',
    marker={'size': 10, 'opacity': 0.6},
)

# Axis styling; both axes are shown as whole percentages.
xaxis = {
    'title': '% Yes, Republic Referendum 1999',
    'titlefont': {'family': 'Open Sans', 'size': 16},
    'tickformat': ',.0%',
}
yaxis = {
    'title': 'Swing to ALP, Federal Election 2016',
    'titlefont': {'family': 'Open Sans', 'size': 16},
    'tickformat': ',.0%',
}

# Chart title and its font.
title = '2016 Federal Election Swing v. Republic Referendum Yes Vote'
titlefont = {'family': 'Open Sans', 'size': 18}

# Assemble the figure; it is rendered in a later cell.
layout = go.Layout(title=title, titlefont=titlefont, xaxis=xaxis, yaxis=yaxis)
data = [series]
figure04 = go.Figure(data=data, layout=layout)
In [27]:
# Render figure04 in the notebook through plotly.offline.
py.iplot(
    figure04,
    filename='figure04',
)
#pyonline.image.ishow(figure04, width=1500, height=750)
In [28]:
# Scatter trace: 2016 ALP share (y) against 1999 yes share (x), one marker per
# row of df_1999_v_2016, with the polling place name as hover text.
series = go.Scatter(
    x=df_1999_v_2016['yes_p'],
    y=df_1999_v_2016['alp_p'],
    text=df_1999_v_2016['polling_place'],
    name='Labor two-party vote, Federal Election 2016',
    mode='markers',
    marker={'size': 10, 'opacity': 0.6, 'color': '#c0211a'},
)

# Only the y axis is restyled; `xaxis` and `titlefont` are not redefined here
# and are reused from an earlier cell.
yaxis = {
    'title': 'Labor two-party vote, Federal Election 2016',
    'titlefont': {'family': 'Open Sans', 'size': 16},
    'tickformat': ',.0%',
}

title = '2016 Federal Election Labor two-party Vote v. Republic Referendum Yes Vote'

# Assemble the figure; it is rendered in a later cell.
layout = go.Layout(title=title, titlefont=titlefont, xaxis=xaxis, yaxis=yaxis)
data = [series]
figure05 = go.Figure(data=data, layout=layout)
In [29]:
# Render figure05 in the notebook through plotly.offline.
py.iplot(
    figure05,
    filename='figure05',
)
#pyonline.image.ishow(figure05, width=1500, height=750)
In [30]:
# Fit an ordinary least squares model of the 1999 yes share on the 2016 ALP
# share and show its summary table.
model = sm.ols(formula="yes_p ~alp_p", data=df_1999_v_2016)
result = model.fit()
display(result.summary())