In [160]:
import os
import pandas as pd
import numpy as np
In [173]:
def percent_by_polling_district(riding, year):
print "Year: " + str(int(year)) + ", Riding: " + str(int(riding))
fileName = "pollresults_resultatsbureau" + str(int(riding)) + ".csv"
filePath = os.path.join(folderName, fileName)
# Load the data
pollData = pd.read_csv(filePath)
# Get column names, and remove French portions
colNames = list(pollData.columns.values)
colNames = [x.split('/')[0] for x in colNames]
pollData.columns = colNames
# Drop unnecessary columns
listColDrop = ['Electoral District Name_English',
'Electoral District Name_French',
'Void Poll Indicator',
'No Poll Held Indicator',
'Merge With',
'Rejected Ballots for Polling Station',
'Political Affiliation Name_French',
"Candidate's First Name",
"Candidate's Family Name",
"Candidate's Middle Name",
'Incumbent Indicator',
'Elected Candidate Indicator']
pollData = pollData.drop(listColDrop, axis=1)
# Strip the polling ID column of whitespace.
polCol = 'Polling Station Number'
s = lambda x: str(x).strip(" ")
pollData[polCol] = pollData[polCol].map(s)
# Keep the total for the polls because we later remove smaller parties
pollVoteTotals = pollData.groupby('Polling Station Number')['Candidate Poll Votes Count'].sum()
pollVoteTotals = pd.DataFrame(pollVoteTotals)
pollVoteTotals.reset_index(level=0, inplace=True)
# Remove parties that aren't one of the main ones
parties = "Liberal|Conservative|NDP|N.D.P.|Green|Bloc|Wild"
pollData = pollData[pollData['Political Affiliation Name_English'].str.contains(parties)]
# Create a pivot table of the data by polling district/candidate name
pollData = pollData.pivot(
index='Polling Station Number',
columns='Political Affiliation Name_English',
values='Candidate Poll Votes Count')
# Turn the index back into a column
pollData.reset_index(level=0, inplace=True)
# Strip the letters off polling stations since the geospatial data
# does not include these letters.
stripCharacters = "ABCDEFG"
s = lambda x: str(x).strip(stripCharacters)
statCol = 'Polling Station Number'
pollData[statCol] = pollData[statCol].map(s)
pollVoteTotals[statCol] = pollVoteTotals[statCol].map(s)
# Merge polling stations since some were split in two parts
pollData = pollData.groupby(statCol).sum()
pollVoteTotals = pollVoteTotals.groupby(statCol).sum()
pollsWithTotals = pd.concat([pollData, pollVoteTotals], axis=1)
pollsWithTotals.rename(columns={'Candidate Poll Votes Count': 'Vote Totals'}, inplace=True)
# Calculate the percent for each
# Grab the data we want converted to a percent
numColsPollData = len(pollsWithTotals.columns)
pollDataPercent = pollsWithTotals.iloc[:, range(0, numColsPollData-1)].copy()
# Divide it by the total votes for each polling district
pollDataPercent = pollDataPercent.div(pollsWithTotals['Vote Totals'], axis=0)
# Calculate the percent that every other party got
pollDataPercent['Other'] = 1 - pollDataPercent.sum(axis=1)
# For rows with no votes, the other column becomes 100%
pollDataPercent.loc[pollDataPercent['Other'] == 1, 'Other'] = None
# Round everything off
pollDataPercent = np.round(pollDataPercent*100, decimals=2)
# Rename columns
colNames = list(pollDataPercent.columns.values)
colNames = [x + " (%)" for x in colNames]
pollDataPercent.columns = colNames
# Merge it with the original data set
pollData = pd.concat([pollData, pollDataPercent], axis=1)
# Add back column for electoral district
pollData['District'] = riding
pollData.reset_index(level=0, inplace=True)
return pollData
In [174]:
# Create an array for all the years we're interested in
years = [2006, 2008, 2011]
In [175]:
for year in years:
# Get list of riding numbers
folderName = "pollresults_resultatsbureau_canada" + str(year)
# The 2006 data is missing a necessary column to automate this, so
# we need to treat it differently
if year != 2006:
# This file contains the provinces and their electoral districts
# It was originally table_tableau11.csv. I stripped the French parts
# of the names off and saved it with the UTF-8 encoding since it was causing trouble
ridingFile = os.path.join(folderName, "RidingListClean.csv")
ridingList = pd.read_csv(ridingFile)
# We only need the first three columns
ridingList = ridingList.iloc[:, [0, 2]]
ridingList = ridingList[ridingList["Province2"] == "New Brunswick"]
ridings = ridingList.ix[:, 1]
else:
ridings = [13001, 13002, 13003, 13004, 13005, 13006, 13007, 13008, 13009, 13010]
# Is this the first time looping for this year?
first = True
for riding in ridings:
pollData = percent_by_polling_district(riding, year)
if first == True:
combined = pollData
else:
combined = pd.concat([combined, pollData], axis=0, ignore_index=True)
first = False
# Reorder columns before writing
cols = list(combined)
cols.insert(0, cols.pop(cols.index('Polling Station Number')))
cols.insert(0, cols.pop(cols.index('District')))
combined = combined.ix[:, cols]
fileName = str(int(year)) + "Combined.csv"
filePath = os.path.join("Output", fileName)
combined.to_csv(filePath, index=False, encoding='utf-8')
In [ ]:
In [ ]: