The AEC started putting co-ordinates on polling place files from 2007.
The code below matches 1999 polling places to 2007 polling places where the name of the venue is the same.
While not perfect, this should, in the vast majority of cases, yield a co-ordinate closely representing the location of the polling place in 1999.
In [6]:
import pandas as pd
import numpy as np
from IPython.display import display, HTML
import json
import googlemaps
In [7]:
def left_of_bracket(s):
    """Return the text of *s* before its first '(', right-trimmed.

    Used to normalise polling place names, e.g. 'Richmond (East)' -> 'Richmond'.
    Strings without a bracket are returned unchanged.
    """
    needle = s.find('(')
    if needle == -1:
        return s
    # Slice up to (not one short of) the bracket: the old `s[:needle-1]`
    # chopped a real character when there was no space before '(' and
    # wrapped to s[:-1] when '(' was the first character.
    return s[:needle].strip()
def state_abbreviation(state):
    """Convert a full Australian state/territory name to its abbreviation.

    The eight official names are looked up directly; anything else falls
    back to the original heuristics (initials for three-word names,
    '<first letter>A' for names containing 'Australia', otherwise the
    first three letters upper-cased).
    """
    # explicit table: clearer than the heuristic chain and removes the
    # need for the old Queensland / Northern Territory special cases
    known = {
        'New South Wales': 'NSW',
        'Victoria': 'VIC',
        'Queensland': 'QLD',
        'South Australia': 'SA',
        'Western Australia': 'WA',
        'Tasmania': 'TAS',
        'Northern Territory': 'NT',
        'Australian Capital Territory': 'ACT',
    }
    if state in known:
        return known[state]
    # fallback heuristics, kept identical to the original logic
    if state.count(' ') == 2:
        # for each word in state grab first letter
        return ''.join(word[:1].upper() for word in state.split(' '))
    if 'Australia' in state:
        return state[:1].upper() + 'A'
    return state[:3].upper()
# thin wrapper around drop_duplicates so the call site reads clearly
def dedup_df(df, keys, keep = False):
    """Remove duplicated rows from *df*, judged on the *keys* columns.

    The default keep=False discards every member of a duplicate group;
    pass keep='first' to retain one representative row instead.
    """
    return df.drop_duplicates(subset=keys, keep=keep)
Usage: import_polling_places(filepath) — returns a tidy polling-place DataFrame;
the matching key columns are ['state', 'polling_place'].
In [8]:
def import_polling_places(filepath):
    """Read an AEC polling-place CSV and return a tidy DataFrame.

    Keeps only the name/address/coordinate columns, derives a bracket-free
    'polling_place' name, renames the join columns to match the 1999 file,
    drops duplicate rows and lower-cases all headers.
    """
    # the columns I want to keep
    cols = [
        'State',
        'PollingPlaceNm',
        'PremisesNm',
        'PremisesAddress1',
        'PremisesAddress2',
        'PremisesAddress3',
        'PremisesSuburb',
        'PremisesStateAb',
        'PremisesPostCode',
        'Latitude',
        'Longitude'
    ]
    df_pp = pd.read_csv(filepath)[cols]
    # bracket-free venue name, e.g. 'Richmond (East)' -> 'Richmond'
    # (apply the function directly — no need for a lambda wrapper)
    df_pp['polling_place'] = df_pp['PollingPlaceNm'].apply(left_of_bracket)
    # rename join columns in place instead of copying them
    renames = {
        'PollingPlaceNm': 'polling_place',
        'PremisesNm': 'premises',
        'PremisesPostCode': 'postcode',
    }
    df_pp = df_pp.rename(columns={'PremisesNm': 'premises',
                                  'PremisesPostCode': 'postcode'})
    # reorder: same column order as `cols`, with the new names swapped in
    cols = [renames.get(c, c) for c in cols]
    df_pp = df_pp[cols]
    # dedup
    df_pp = df_pp.drop_duplicates()
    # make all headers lower case
    df_pp.columns = [c.lower() for c in df_pp.columns]
    return df_pp
In [9]:
# smoke-test the importer against the 2007 polling-place export
filepath = 'federal_election_polling_places/pp_2007_election.csv'
test = import_polling_places(filepath)
# report how many (deduplicated) polling places were loaded
display('Rows: ' + str(len(test.index)))
In [10]:
def import_1999_pp(filepath):
    """Load the 1999 referenda polling-place CSV, ready for matching.

    Adds empty match-metadata and lat/lng columns, then indexes the frame
    on (state, polling_place) — the unique key used by every match pass.
    """
    frame = pd.read_csv(filepath)
    # placeholders to be filled in by successive matching passes
    for column in ('match_source', 'match_type', 'latitude', 'longitude'):
        frame[column] = np.nan
    return frame.set_index(['state', 'polling_place'])
In [11]:
# load the 1999 referenda polling places and eyeball the first rows
filepath = '1999_referenda_output/polling_places.csv'
df_pp_1999 = import_1999_pp(filepath)
display(df_pp_1999.head(3))
pandas generates warnings when working with a data frame that is a copy of another: it thinks I might believe I'm changing df_pp_1999 when I'm actually working with df_pp_1999_working. I'm turning this warning off because I'm doing this on purpose, so I can keep df_pp_1999 as a 'yet to be matched' file and update it with each subsequent working file.
In [12]:
pd.set_option('chained_assignment',None)
Usage: match_polling_places(df_pp_1999, df_pp, settings), for example:

    match_polling_places(
        df_pp_1999,
        df_pp,
        dict(
            keys = ['state','premises','postcode'],
            match_source = '2007 Polling Places',
            match_type = 'Match 01 - state, premises, postcode'
        )
    )
In [13]:
def match_polling_places(df1, df2, settings):
    """Left-join lat/lng from *df2* onto *df1* using settings['keys'].

    settings also supplies 'match_source' and 'match_type' labels, which
    are stamped on every row of the working frame. Rows that find no
    co-ordinate are dropped, and rows that matched ambiguously (duplicate
    key combinations) are removed so only unambiguous matches remain.
    """
    keys = settings['keys']
    # the 1999-file columns carried through every match pass
    base_cols = [
        'state',
        'polling_place',
        'premises',
        'address',
        'suburb',
        'postcode',
        'wheelchair_access'
    ]
    df_working = df1.reset_index()[base_cols]
    # label every candidate row with where/how it was matched
    df_working['match_source'] = settings['match_source']
    df_working['match_type'] = settings['match_type']
    # from the reference frame we only need the join keys plus co-ordinates
    lookup_cols = keys + ['latitude', 'longitude']
    df_working = pd.merge(df_working, df2[lookup_cols], on=keys, how='left')
    # keep only rows that actually picked up a co-ordinate
    df_working = df_working[~df_working['latitude'].isnull()]
    # drop key combinations that matched more than once (ambiguous)
    return dedup_df(df_working, keys)
In [14]:
# test match_polling_places end-to-end: 1999 file vs the 2007 reference
filepath = '1999_referenda_output/polling_places.csv'
df1 = import_1999_pp(filepath)
filepath2 = 'federal_election_polling_places/pp_2007_election.csv'
df2 = import_polling_places(filepath2)
# run one pass with the strictest key set (state + premises + postcode)
test = match_polling_places(
df1,
df2,
dict(
keys = ['state','premises','postcode'],
match_source = '2007 Polling Places',
match_type = 'Match 01 - state, premises, postcode'
)
)
display(test.head(3))
Usage: match_unmatched_polling_places(df1, settings) — wraps match_polling_places,
running it only over rows of df1 that no earlier pass has matched.
In [15]:
def match_unmatched_polling_places(df1, settings):
    """Run a match pass over the still-unmatched rows of *df1*.

    Loads the reference polling-place file named in settings['pp_filepath'],
    matches only rows whose match_source is still empty, and folds any new
    lat/lng results back into *df1* via DataFrame.update. Note *df1* is
    modified in place as well as returned.
    """
    reference = import_polling_places(settings['pp_filepath'])
    # only attempt rows no earlier pass has matched
    unmatched = df1[df1.match_source.isnull()]
    matches = match_polling_places(unmatched, reference, settings)
    # state + polling_place is my unique key: dedup on it, then index on it
    # (set_index doubles as a check that the dedup worked)
    unique_key = ['state', 'polling_place']
    matches = dedup_df(matches, unique_key).set_index(unique_key)
    df1.update(matches)
    return df1
In [16]:
def match_status(df1):
    """Summarise match progress: a count per match_type plus the
    'Not yet matched' (NaN match_type) row count.

    Returns a two-column DataFrame: match_type, count.
    """
    # how many NaNs are in match_type?
    not_matched = int(df1['match_type'].isnull().sum())
    # single row describing the unmatched remainder
    none = pd.DataFrame(dict(
        match_type = 'Not yet matched',
        count = not_matched
    ), index=[0])
    if not_matched == len(df1.index):
        # nothing matched yet, so the groupby below would be empty
        return none
    counts = df1.groupby('match_type')['match_type'].count().reset_index(name='count')
    # pd.concat instead of DataFrame.append (removed in pandas 2.0)
    return pd.concat([counts, none])
In [17]:
# first match attempt - set up file
filepath = '1999_referenda_output/polling_places.csv'
df_pp_1999 = import_1999_pp(filepath)
# double check none are somehow magically matched yet
print('before')
display(match_status(df_pp_1999))
# configure match settings: strictest keys, 2007 file first
settings = dict(
pp_filepath = 'federal_election_polling_places/pp_2007_election.csv',
keys = ['state','premises','postcode'],
match_source = '2007 Polling Places',
match_type = 'Match 01 - state, premises, postcode'
)
# run match (updates df_pp_1999 in place and returns it)
df_pp_1999 = match_unmatched_polling_places(df_pp_1999, settings)
print('after')
display(match_status(df_pp_1999))
In [18]:
## matches 2-4: state/premises/postcode against the 2010, 2013 and 2016
## files — the three cells were identical apart from year and match number,
## so run them as one loop (output strings are unchanged)
for match_no, year in zip((2, 3, 4), (2010, 2013, 2016)):
    print('before %d' % match_no)
    display(match_status(df_pp_1999))
    # configure match settings for this year's polling-place file
    settings = dict(
        pp_filepath = 'federal_election_polling_places/pp_%d_election.csv' % year,
        keys = ['state', 'premises', 'postcode'],
        match_source = '%d Polling Places' % year,
        match_type = 'Match %02d - state, premises, postcode' % match_no
    )
    # run match (only rows still unmatched are attempted)
    df_pp_1999 = match_unmatched_polling_places(df_pp_1999, settings)
    print('after')
    display(match_status(df_pp_1999))
In [19]:
print('before 5')
display(match_status(df_pp_1999))
# configure match settings: looser key set now — polling_place name
# instead of premises name, back on the 2007 file
settings = dict(
pp_filepath = 'federal_election_polling_places/pp_2007_election.csv',
keys = ['state','polling_place','postcode'],
match_source = '2007 Polling Places',
match_type = 'Match 05 - state, polling_place, postcode'
)
# run match
df_pp_1999 = match_unmatched_polling_places(df_pp_1999, settings)
print('after')
display(match_status(df_pp_1999))
In [20]:
## matches 6-8: state/polling_place/postcode against the 2010, 2013 and
## 2016 files — identical cells apart from year and match number, so run
## them as one loop (output strings are unchanged)
for match_no, year in zip((6, 7, 8), (2010, 2013, 2016)):
    print('before %d' % match_no)
    display(match_status(df_pp_1999))
    # configure match settings for this year's polling-place file
    settings = dict(
        pp_filepath = 'federal_election_polling_places/pp_%d_election.csv' % year,
        keys = ['state', 'polling_place', 'postcode'],
        match_source = '%d Polling Places' % year,
        match_type = 'Match %02d - state, polling_place, postcode' % match_no
    )
    # run match (only rows still unmatched are attempted)
    df_pp_1999 = match_unmatched_polling_places(df_pp_1999, settings)
    print('after')
    display(match_status(df_pp_1999))
In [21]:
def get_google_api_key():
    """Fetch the Google Maps API key from config/keys.json.

    Keeping the key in a local config file keeps it out of the notebook
    (and therefore out of version control).
    """
    with open('config/keys.json') as data_file:
        return json.load(data_file)['google_maps']
In [22]:
def geocode_address(address):
    """Geocode *address* via the Google Maps API, restricted to Australia.

    Returns the client's raw list of geocode results (an empty list when
    nothing matched). Reads the API key and builds a fresh client on every
    call.
    """
    gmaps = googlemaps.Client(key=get_google_api_key())
    # pass the country restriction as the Python client's named
    # 'components' parameter — the old positional dict relied on argument
    # order, and the dead commented-out unrestricted call is removed
    return gmaps.geocode(address, components={'country': 'AU'})
In [23]:
# smoke test the geocoder (makes a live Google Maps API call)
# display(geocode_address('10/44 Lord St Richmond VIC 3121'))
display(geocode_address('Salt Creek Rd SALT CREEK 5275'))
In [24]:
def geocode_polling_place(row, match_source = 'google geocode'):
    """Geocode one polling-place record (a dict) via the Google API.

    Builds an address string from the row, geocodes it, writes the match
    metadata and lat/lng back onto the row, and returns the row wrapped in
    a one-row DataFrame. Failed geocodes get match_type 'failed' so later
    passes can retry them.
    """
    # postcode may have been parsed as a float (e.g. 3121.0) — keep 4 chars
    address = row['address'] + ' ' + row['suburb'] + ' ' + str(row['postcode'])[:4]
    geocode = geocode_address(address)
    if geocode:
        row['match_source'] = match_source
        # Google's location_type (e.g. ROOFTOP/APPROXIMATE) doubles as the
        # match type, recording how precise the geocode was
        row['match_type'] = geocode[0]['geometry']['location_type']
        row['latitude'] = geocode[0]['geometry']['location']['lat']
        row['longitude'] = geocode[0]['geometry']['location']['lng']
    else:
        # BUG FIX: the failure branch previously hard-coded 'google geocode',
        # ignoring the match_source argument — label failures with the same
        # source the caller asked for
        row['match_source'] = match_source
        row['match_type'] = 'failed'
    return pd.DataFrame(row, index=[0])
In [25]:
# test the geocoder on just three rows before spending API quota on all of them
test = df_pp_1999.head(3)
geocode_test = pd.DataFrame()
i = 0  # NOTE(review): unused leftover counter
for row in test.reset_index().to_dict('records')[:3]:
    row = geocode_polling_place(row, 'Match 09 - Google Geocode')
    if geocode_test.empty:
        geocode_test = row
    else:
        # NOTE(review): DataFrame.append was removed in pandas 2.0 — switch
        # to pd.concat if upgrading
        geocode_test = geocode_test.append(row)
geocode_test
Out[25]:
In [26]:
# geocode every row still unmatched after passes 1-8 (one API call per row)
unmatched_places = df_pp_1999[df_pp_1999.match_source.isnull()]
geocode_matches = pd.DataFrame()
for row in unmatched_places.reset_index().to_dict('records'):
    row = geocode_polling_place(row)
    if geocode_matches.empty:
        geocode_matches = row
    else:
        # NOTE(review): DataFrame.append was removed in pandas 2.0 — switch
        # to pd.concat if upgrading
        geocode_matches = geocode_matches.append(row)
geocode_matches.head(5)
Out[26]:
In [27]:
# reorder geocode results into the same column pattern as the main frame
# get all column names from the main table (index included via reset_index)
keys = df_pp_1999.reset_index().columns.values
# reorder by that
geocode_matches_ordered = geocode_matches[keys]
# restore the unique (state, polling_place) index so update() can align rows
keys = ['state','polling_place']
# df1_matches = dedup_df(df1_matches, keys)
# check that worked by making it a key now
geocode_matches_ordered = geocode_matches_ordered.set_index(keys)
# fold the geocoded lat/lng back into the main frame
df_pp_1999.update(geocode_matches_ordered)
# where are we at?
display(match_status(df_pp_1999))
In [28]:
# load ABS suburb (SSC) centroids as a last-resort co-ordinate source
filepath = 'from_abs/ssc_2016_aust_centroid.csv'
# dropped the stray `results =` alias from the original double assignment
subtown_centroids = pd.read_csv(filepath)
# create a column for abbreviated states (apply the function directly —
# no lambda wrapper needed)
subtown_centroids['state'] = subtown_centroids['STE_NAME16'].apply(state_abbreviation)
# strip the brackets out of suburb names, and make all caps, to match the 1999 file
subtown_centroids['suburb'] = subtown_centroids['SSC_NAME16'].apply(
    lambda name: left_of_bracket(name).upper()
)
display(subtown_centroids.head(5))
In [29]:
print('before suburbs')
display(match_status(df_pp_1999))
# configure match settings: keys are only state + suburb, so this matches
# to the suburb's centroid rather than the exact venue
# (pp_filepath is unused by match_polling_places, kept for the record)
settings = dict(
pp_filepath = 'from_abs/ssc_2016_aust_centroid.csv',
keys = ['state','suburb'],
match_source = 'ABS Suburb Centroids',
match_type = 'Match 10 - suburb centroid'
)
# rows to try
## try not matched rows, OR rows that the geocode failed on
df_to_match = df_pp_1999[df_pp_1999.match_source.isnull()]
# NOTE(review): DataFrame.append was removed in pandas 2.0 — use pd.concat if upgrading
df_to_match = df_to_match.append(df_pp_1999[df_pp_1999['match_type'] == 'failed'])
# get matches
df1_matches = match_polling_places(
df_to_match, # only run on empty rows
subtown_centroids,
settings
)
# dedup this file for combinations of state/polling_place (my unique key)
keys = ['state','polling_place']
# df1_matches = dedup_df(df1_matches, keys)
# check that worked by making it a key now
df1_matches = df1_matches.set_index(keys)
# update with matches
df_pp_1999.update(df1_matches)
print('after')
display(match_status(df_pp_1999))
In [30]:
# inspect whatever is still unmatched (or failed geocoding) after all passes
df_to_match = df_pp_1999[df_pp_1999.match_source.isnull()]
# NOTE(review): DataFrame.append was removed in pandas 2.0 — use pd.concat if upgrading
df_to_match = df_to_match.append(df_pp_1999[df_pp_1999['match_type'] == 'failed'])
display(df_to_match)
In [33]:
# write the geocoded 1999 polling places out to CSV
# (sep=',' is pandas' default; kept for explicitness)
df_pp_1999.to_csv(
'1999_referenda_output/polling_places_geocoded.csv',
sep = ','
)