In [161]:
import pandas as pd
import xlrd
import os
from os import listdir
from os.path import isfile, join
from IPython.display import display, HTML
In [162]:
# filter a list of files for xls files
def get_files(filepath):
return [f for f in listdir(filepath) if isfile(join(filepath, f))]
In [163]:
filepath = '1999_referenda/polling_places'
print(get_files(filepath)[:4])
In [164]:
# filter a list of files for xls files
def get_xls(files):
r = []
for file in files:
if 'xls' in file:
r.append(file)
return r
In [165]:
filepath = '1999_referenda/polling_places'
files = get_files(filepath)
files_xls = get_xls(files)
print(files_xls[:4])
get_polling_places(filename, columns)The files are a bit of a mess, they look like this:
The approach I take below is to loop through each row of the spreadsheet, and
if the row doesn't have any blank cells in the first columns of the spreadsheetif the row doesn't begin with polling place name
then append the row to a data frameThen I use the file name to add a state column
This will result in code we can run across all the states spreadsheets to end up with one file
In [166]:
def get_polling_places(filename, columns):
# get data
x = xlrd.open_workbook(file)
x_sheet = x.sheet_by_index(0)
# data frame to preserve outside loop
df_polling_places = pd.DataFrame()
# for each row in the spreadsheet
for rowindex in range(x_sheet.nrows):
row_slice = x_sheet.row_slice(
rowx=rowindex, # take the index of this loop, 0 -> max rows
start_colx=0, # we care about the first 6 columns hence 0->6
end_colx=6
)
# take keys off the row
row = []
for cell in row_slice:
row.append(cell.value)
# if there are any blank cells, this isn't a real row of data
if row.count('') == 0:
# except one of those rows is the header row, which we want to throw out as well
if row[0] != 'Polling Place Name':
# make a data frame
df = pd.DataFrame(data=[row], columns = columns)
# if the data frame is empty, create it, if not, append to it
if df_polling_places.empty:
df_polling_places = df
else:
df_polling_places = df_polling_places.append(df)
# work out state from filename
# state will be the chars from 3 chars after the final slash to .
# as the files are in the form 'ppsnt.xls'
# everything after the final slash
needle = file.rfind('/')
length = len(file)
filepart = file[needle+1:length]
# from the third char to the dot
needle = filepart.find('.')
state = filepart[3:needle].upper()
# create state column
df_polling_places['state'] = state
# put state column at front
#Make a list of all of the columns in the df
cols = list(df_polling_places.columns.values)
# remove state from the list
cols.pop(cols.index('state'))
# put state at the front of this list
df_polling_places = df_polling_places[['state']+cols]
return df_polling_places
In [167]:
file = '1999_referenda/polling_places/ppsnt.xls'
columns = [
'polling_place',
'premises',
'address',
'suburb',
'postcode',
'wheelchair_access'
]
get_polling_places(file, columns).head(5)
Out[167]:
In [168]:
filepath = '1999_referenda/polling_places/'
columns = [
'polling_place',
'premises',
'address',
'suburb',
'postcode',
'wheelchair_access'
]
# get xls files from filepath
files = get_files(filepath)
files_xls = get_xls(files)
df_polling_places = pd.DataFrame()
for file in files_xls:
file = filepath + file
df = get_polling_places(file, columns)
# if the data frame is empty, create it, if not, append to it
if df_polling_places.empty:
df_polling_places = df
else:
df_polling_places = df_polling_places.append(df)
df_polling_places.head(3)
Out[168]:
In [169]:
def left_of_bracket(s):
if '(' in s:
needle = s.find('(')
r = s[:needle-1].strip()
return r
else:
return s
In [170]:
print(left_of_bracket('Croydon Park'))
print(left_of_bracket('Croydon Park West (Adelaide)'))
In [171]:
# initial number of rows
print('inital num rows: ' + str(len(df_polling_places.index)))
# for each value, return the value of left_of_bracket
# left_of_bracket returns a whole string, or
# if it contains a (, whatever is to the left of it
lambda_polling_places = lambda x: left_of_bracket(x)
# update polling place column
df_polling_places['polling_place'] = df_polling_places['polling_place'].apply(lambda_polling_places)
# deduplicate
df_polling_places = df_polling_places.drop_duplicates()
# print final number of rows
print('final num rows: ' + str(len(df_polling_places.index)))
In [172]:
try:
tmp = pd.concat(g for _, g in df_polling_places.groupby(['state','polling_place']) if len(g) > 1)
except:
print('no duplicates found')
else:
display(tmp)
In [173]:
print('inital num rows: ' + str(len(df_polling_places.index)))
# patch the two hyde park booths
df_polling_places.loc[
(df_polling_places['polling_place'] == 'Hyde Park') &
(df_polling_places['state'] == 'NSW') &
(df_polling_places['premises'] == 'Unitarian Church Hall'),
['polling_place']
] = 'Hyde Park - Unitarian Church Hall'
df_polling_places.loc[
(df_polling_places['polling_place'] == 'Hyde Park') &
(df_polling_places['state'] == 'NSW') &
(df_polling_places['premises'] == 'Jubilee Hall, Kingsland Rd.'),
['polling_place']
] = 'Hyde Park - Jubilee Hall'
df_polling_places = df_polling_places.drop_duplicates(
['state','polling_place'],
keep='first' # where state and polling place have dups, keep the first row you find
)
print('final num rows: ' + str(len(df_polling_places.index)))
# just to be sure, run that find-duplicates code again
try:
tmp = pd.concat(g for _, g in df_polling_places.groupby(['state','polling_place']) if len(g) > 1)
except:
print('no duplicates found')
else:
display(tmp)
In [174]:
outfile = '1999_referenda_output/polling_places.csv'
df_polling_places.to_csv(
outfile,
sep=',',
index=False,
encoding='UTF8'
)