In [160]:
import os
import pandas as pd
import numpy as np

In [173]:
def percent_by_polling_district(riding, year):
    print "Year: " + str(int(year)) + ", Riding: " + str(int(riding))
    fileName = "pollresults_resultatsbureau" + str(int(riding)) + ".csv"
    filePath = os.path.join(folderName, fileName)
    # Load the data
    pollData = pd.read_csv(filePath)
    
    # Get column names, and remove French portions
    colNames = list(pollData.columns.values)
    colNames = [x.split('/')[0] for x in colNames]
    pollData.columns = colNames
    
    # Drop unnecessary columns
    listColDrop = ['Electoral District Name_English',
                   'Electoral District Name_French',
                   'Void Poll Indicator',
                   'No Poll Held Indicator', 
                   'Merge With',
                   'Rejected Ballots for Polling Station',
                   'Political Affiliation Name_French',
                   "Candidate's First Name",
                   "Candidate's Family Name",
                   "Candidate's Middle Name", 
                   'Incumbent Indicator',
                   'Elected Candidate Indicator']
    pollData = pollData.drop(listColDrop, axis=1)
    
    # Strip the polling ID column of whitespace.
    polCol = 'Polling Station Number'
    s = lambda x: str(x).strip(" ")
    pollData[polCol] = pollData[polCol].map(s)
    
    # Keep the total for the polls because we later remove smaller parties
    pollVoteTotals = pollData.groupby('Polling Station Number')['Candidate Poll Votes Count'].sum()
    pollVoteTotals = pd.DataFrame(pollVoteTotals)
    pollVoteTotals.reset_index(level=0, inplace=True)
    
    # Remove parties that aren't one of the main ones
    parties = "Liberal|Conservative|NDP|N.D.P.|Green|Bloc|Wild"
    pollData = pollData[pollData['Political Affiliation Name_English'].str.contains(parties)]
    
    # Create a pivot table of the data by polling district/candidate name
    pollData = pollData.pivot(
        index='Polling Station Number',
        columns='Political Affiliation Name_English',
        values='Candidate Poll Votes Count')
    # Turn the index back into a column
    pollData.reset_index(level=0, inplace=True)
    
    # Strip the letters off polling stations since the geospatial data
    #  does not include these letters.
    stripCharacters = "ABCDEFG"
    s = lambda x: str(x).strip(stripCharacters)
    statCol = 'Polling Station Number'

    pollData[statCol] = pollData[statCol].map(s)
    pollVoteTotals[statCol] = pollVoteTotals[statCol].map(s)
    
    # Merge polling stations since some were split in two parts
    pollData = pollData.groupby(statCol).sum()
    pollVoteTotals = pollVoteTotals.groupby(statCol).sum()
    pollsWithTotals = pd.concat([pollData, pollVoteTotals], axis=1)
    pollsWithTotals.rename(columns={'Candidate Poll Votes Count': 'Vote Totals'}, inplace=True)

    # Calculate the percent for each
    # Grab the data we want converted to a percent
    numColsPollData = len(pollsWithTotals.columns)
    pollDataPercent = pollsWithTotals.iloc[:, range(0, numColsPollData-1)].copy()

    # Divide it by the total votes for each polling district
    pollDataPercent = pollDataPercent.div(pollsWithTotals['Vote Totals'], axis=0)
    
    # Calculate the percent that every other party got
    pollDataPercent['Other'] = 1 - pollDataPercent.sum(axis=1)
    # For rows with no votes, the other column becomes 100%
    pollDataPercent.loc[pollDataPercent['Other'] == 1, 'Other'] = None
    
    # Round everything off
    pollDataPercent = np.round(pollDataPercent*100, decimals=2)

    # Rename columns
    colNames = list(pollDataPercent.columns.values)
    colNames = [x + " (%)" for x in colNames]
    pollDataPercent.columns = colNames

    # Merge it with the original data set
    pollData = pd.concat([pollData, pollDataPercent], axis=1)
    
    # Add back column for electoral district
    pollData['District'] = riding
    
    pollData.reset_index(level=0, inplace=True)
    return pollData

In [174]:
# Create an array for all the years we're interested in
years = [2006, 2008, 2011]

In [175]:
for year in years:
    # Get list of riding numbers
    folderName = "pollresults_resultatsbureau_canada" + str(year)
    
    # The 2006 data is missing a necessary column to automate this, so
    # we need to treat it differently
    if year != 2006:
        # This file contains the provinces and their electoral districts
        # It was originally table_tableau11.csv. I stripped the French parts
        # of the names off and saved it with the UTF-8 encoding since it was causing trouble
        ridingFile = os.path.join(folderName, "RidingListClean.csv")
        ridingList = pd.read_csv(ridingFile)
        # We only need the first three columns
        ridingList = ridingList.iloc[:, [0, 2]]
        ridingList = ridingList[ridingList["Province2"] == "New Brunswick"]
        ridings = ridingList.ix[:, 1]
    else:
        ridings = [13001, 13002, 13003, 13004, 13005, 13006, 13007, 13008, 13009, 13010]
    
    # Is this the first time looping for this year?
    first = True
    for riding in ridings:
        pollData = percent_by_polling_district(riding, year)
        if first == True:
            combined = pollData
        else:
            combined = pd.concat([combined, pollData], axis=0, ignore_index=True)
        first = False

    # Reorder columns before writing
    cols = list(combined)
    cols.insert(0, cols.pop(cols.index('Polling Station Number')))
    cols.insert(0, cols.pop(cols.index('District')))
    combined = combined.ix[:, cols]
    
    fileName = str(int(year)) + "Combined.csv"
    filePath = os.path.join("Output", fileName)
    combined.to_csv(filePath, index=False, encoding='utf-8')


Year: 2006, Riding: 13001
Year: 2006, Riding: 13002
Year: 2006, Riding: 13003
Year: 2006, Riding: 13004
Year: 2006, Riding: 13005
Year: 2006, Riding: 13006
Year: 2006, Riding: 13007
Year: 2006, Riding: 13008
Year: 2006, Riding: 13009
Year: 2006, Riding: 13010
Year: 2008, Riding: 13001
Year: 2008, Riding: 13002
Year: 2008, Riding: 13003
Year: 2008, Riding: 13004
Year: 2008, Riding: 13005
Year: 2008, Riding: 13006
Year: 2008, Riding: 13007
Year: 2008, Riding: 13008
Year: 2008, Riding: 13009
Year: 2008, Riding: 13010
Year: 2011, Riding: 13001
Year: 2011, Riding: 13002
Year: 2011, Riding: 13003
Year: 2011, Riding: 13004
Year: 2011, Riding: 13005
Year: 2011, Riding: 13006
Year: 2011, Riding: 13007
Year: 2011, Riding: 13008
Year: 2011, Riding: 13009
Year: 2011, Riding: 13010

In [ ]:


In [ ]: