In [3]:
import pandas as pd

In [52]:
inp_df = pd.DataFrame.from_csv('data/poll/raw_polls_cut.csv')

In [62]:
states = {
        'AK': 'Alaska',
        'AL': 'Alabama',
        'AR': 'Arkansas',
        'AZ': 'Arizona',
        'CA': 'California',
        'CO': 'Colorado',
        'CT': 'Connecticut',
        'DC': 'District of Columbia',
        'DE': 'Delaware',
        'FL': 'Florida',
        'GA': 'Georgia',
        'HI': 'Hawaii',
        'IA': 'Iowa',
        'ID': 'Idaho',
        'IL': 'Illinois',
        'IN': 'Indiana',
        'KS': 'Kansas',
        'KY': 'Kentucky',
        'LA': 'Louisiana',
        'MA': 'Massachusetts',
        'MD': 'Maryland',
        'ME': 'Maine',
        'MI': 'Michigan',
        'MN': 'Minnesota',
        'MO': 'Missouri',
        'MS': 'Mississippi',
        'MT': 'Montana',
        'NC': 'North Carolina',
        'ND': 'North Dakota',
        'NE': 'Nebraska',
        'NH': 'New Hampshire',
        'NJ': 'New Jersey',
        'NM': 'New Mexico',
        'NV': 'Nevada',
        'NY': 'New York',
        'OH': 'Ohio',
        'OK': 'Oklahoma',
        'OR': 'Oregon',
        'PA': 'Pennsylvania',
        'RI': 'Rhode Island',
        'SC': 'South Carolina',
        'SD': 'South Dakota',
        'TN': 'Tennessee',
        'TX': 'Texas',
        'UT': 'Utah',
        'VA': 'Virginia',
        'VT': 'Vermont',
        'WA': 'Washington',
        'WI': 'Wisconsin',
        'WV': 'West Virginia',
        'WY': 'Wyoming'
}

parties = {
    'Clinton': 'Democrat',
    'Sanders': 'Democrat',
    'Rubio': 'Republican',
    'Cruz': 'Republican',
    'Trump': 'Republican',
}

In [86]:
res_df = pd.DataFrame(columns=['State','Party','Candidate','min','25th','median', '75th', 'max','mean', 'avg_size'])
i = 0
for state in states.keys():
    for cand in ['Clinton', 'Sanders', 'Rubio', 'Cruz', 'Trump']:
        c1_newd = inp_df[:][(inp_df['location'] == state) & (inp_df['cand1_name'] == cand)]
        c2_newd = inp_df[:][(inp_df['location'] == state) & (inp_df['cand2_name'] == cand)]
        
        res_df.set_value(i, 'State', states[state])
        res_df.set_value(i, 'Party', parties[cand])
        res_df.set_value(i, 'Candidate', cand)
        
        if len(c1_newd.index) > 0:
            res_df.set_value(i, 'min', c1_newd.cand1_pct.min())
            res_df.set_value(i, '25th', c1_newd.cand1_pct.quantile(0.25))
            res_df.set_value(i, 'median', c1_newd.cand1_pct.quantile(0.5))
            res_df.set_value(i, '75th', c1_newd.cand1_pct.quantile(0.75))
            res_df.set_value(i, 'max', c1_newd.cand1_pct.max())
            res_df.set_value(i, 'mean', c1_newd.cand1_pct.mean())
            res_df.set_value(i, 'avg_size', c1_newd.samplesize.mean())
        elif len(c2_newd.index) > 0:
            res_df.set_value(i, 'min', c2_newd.cand2_pct.min())
            res_df.set_value(i, '25th', c2_newd.cand2_pct.quantile(0.25))
            res_df.set_value(i, 'median', c2_newd.cand2_pct.quantile(0.5))
            res_df.set_value(i, '75th', c2_newd.cand2_pct.quantile(0.75))
            res_df.set_value(i, 'max', c2_newd.cand2_pct.max())
            res_df.set_value(i, 'mean', c2_newd.cand2_pct.mean())
            res_df.set_value(i, 'avg_size', c2_newd.samplesize.mean())
        else:
            res_df.set_value(i, 'min', 0)
            res_df.set_value(i, '25th', 0)
            res_df.set_value(i, 'median', 0)
            res_df.set_value(i, '75th', 0)
            res_df.set_value(i, 'max', 0)
            res_df.set_value(i, 'mean', 0)
            res_df.set_value(i, 'avg_size', 0)
        
        i += 1

In [94]:
res_df = res_df.sort_values(by='State')
res_df.to_csv('data/poll/polls_aggregated.csv')