In [1]:
import pandas as pd
Identify your problem statement, find all your datasets, identify the questions you want to answer, reach out to polling/consulting firms to work with.
Potential question--Why did these counties flip to Trump?
Explore your data to understand it--drop data that is not relevant
Look to predict something (next presidential election outcome).
Think about what would happen if more people became UNINSURED and the result that could have.
Should slice by margin of county flip (first through fourth quartiles).
Look at population counts per county.
Margin of victory/voting which way (Trump/Clinton) is more important to predict than simply which counties flipped (make that a subset).
A listing of the specific counties that flipped: http://www.npr.org/2016/11/15/502032052/lots-of-people-voted-for-obama-and-trump-heres-where-in-3-charts
Nate Silver postulates that education level is a key predictor. http://fivethirtyeight.com/features/education-not-income-predicted-who-would-vote-for-trump/?ex_cid=story-twitter
Daily Kos article: http://www.dailykos.com/story/2017/1/30/1627319/-Daily-Kos-Elections-presents-the-2016-presidential-election-results-by-congressional-district
Diversity Index source: https://www.kaggle.com/mikejohnsonjr/us-counties-diversity-index
In [2]:
# Load the 2016 election results table from a CSV in the working directory.
election = pd.read_csv('2016_election.csv')
In [3]:
# Load the 2012 election results table from a CSV in the working directory.
prev_election = pd.read_csv('2012_election.csv')
In [4]:
# Load the diversity-index table from a CSV in the working directory.
div = pd.read_csv('diversityindex.csv')
In [5]:
# Load the education table from an Excel workbook in the working directory.
edu = pd.read_excel('education_25_older_filt.xls')
Change in education the past 10 years--find the difference between them for each county
In [6]:
# Load the county population table from an Excel workbook in the working directory.
pop = pd.read_excel('us county populations.xls')
In [56]:
# Load the unemployment workbook and trim it to the columns and rows used later.
ue_rates = pd.read_excel('Unemployment Rates.xlsx')
# Drop unused columns by position. Selecting through `.columns[[...]]` states the positional
# intent explicitly; integer keys inside `df[[...]]` are treated as labels by current pandas
# and raise KeyError when the columns have string names.
ue_rates = ue_rates.drop(ue_rates.columns[[0, 1, 2, 4, 5]], axis=1)
ue_rates = ue_rates.rename(columns={
    'Unnamed: 3': 'county_state',
    'Unnamed: 6': 'labor_force',
    'Unnamed: 7': 'employed',
    'Unnamed: 8': 'unemployed',
    'Unnamed: 9': 'ue_rate',
})
# Discard the first five rows by position (presumably spreadsheet header rows -- TODO confirm).
ue_rates = ue_rates.drop(ue_rates.index[[0, 1, 2, 3, 4]])
In [7]:
# Number of rows in the education table.
len(edu)
Out[7]:
In [8]:
# Number of rows in the population table.
len(pop)
Out[8]:
In [9]:
# Column dtypes of the population table.
pop.dtypes
Out[9]:
In [10]:
# Preview the diversity table before renaming its columns.
div.head()
Out[10]:
In [11]:
# Replace the verbose source headers of the diversity table with short names.
div_column_names = {
    'Location': 'county_state',
    'Diversity-Index': 'div_index',
    'Black or African American alone, percent, 2013': 'af_am',
    'American Indian and Alaska Native alone, percent, 2013': 'native_2013',
    'Asian alone, percent, 2013': 'asian_am',
    'Native Hawaiian and Other Pacific Islander alone, percent,': 'pac_am',
    'Two or More Races, percent, 2013': 'two_or_more_races',
    'Hispanic or Latino, percent, 2013': 'hisp_lat_am',
    'White alone, not Hispanic or Latino, percent, 2013': 'white_am',
}
div = div.rename(columns=div_column_names)
In [12]:
# Preview the diversity table after the rename.
div.head()
Out[12]:
In [13]:
# Number of rows in the diversity table.
len(div)
Out[13]:
In [14]:
# Preview the 2016 election table.
election.head()
Out[14]:
In [15]:
# Count of non-missing county_name values in the 2016 election table.
election.county_name.count()
Out[15]:
In [16]:
# Alaska rows carry the state name instead of a county name, so they are removed
# from both the election table and the population table.
election = election.loc[election['county_name'] != 'Alaska']
pop = pop.loc[pop['county'] != 'Alaska']
In [17]:
# Preview the election table after removing the Alaska rows.
election.head()
Out[17]:
In [18]:
# Drop the columns at positions 0 and 10. `.columns[[...]]` makes the positional lookup
# explicit; integer keys inside `election[[...]]` are treated as labels by current pandas
# and raise KeyError on string-named columns.
election = election.drop(election.columns[[0, 10]], axis=1)
In [19]:
# Build the county_state key (county name + ', ' + state abbreviation) used for the joins below.
election['county_state'] = election['county_name'] + ', ' + election['state_abbr']
In [20]:
# Same county_state key for the 2012 table.
prev_election['county_state'] = prev_election['county_name'] + ', ' + prev_election['state_abbr']
In [21]:
# Preview the population table.
pop.head()
Out[21]:
In [22]:
# Build the county_state key for the population table from its county and state columns.
pop['county_state'] = pop['county'] + ', ' + pop['state']
In [23]:
# Preview the election table with the new county_state column.
election.head()
Out[23]:
In [24]:
# Preview the education table.
edu.head()
Out[24]:
In [25]:
# Preview the population table with the new county_state column.
pop.head()
Out[25]:
In [26]:
# Build the county_state key for the education table from its 'Area name' and 'State' columns.
edu['county_state'] = edu['Area name'] + ', ' + edu['State']
In [27]:
# Column dtypes of the education table.
edu.dtypes
Out[27]:
In [28]:
# Missing-value count per column in the education table.
edu.isnull().sum()
Out[28]:
In [29]:
# Keep only education rows with no missing values.
edu = edu.dropna()
In [30]:
# Re-check the missing-value counts after dropping incomplete rows.
edu.isnull().sum()
Out[30]:
In [31]:
# Number of education rows remaining after dropna().
len(edu)
Out[31]:
In [32]:
import seaborn as sns
import matplotlib.pyplot as plt
# Histogram (KDE curve disabled) of the per_less_high_school_diploma_2011_15 column.
ax = sns.distplot(edu['per_less_high_school_diploma_2011_15'], kde=False)
ax.set_xlabel('Percentage per county with less than a High School Diploma, 2011-2015')
ax.set_ylabel('Count')
ax.set_title('Education Across All US Counties', fontsize=16, fontname='Ubuntu')
plt.show()
In [33]:
# Histogram (KDE curve disabled) of the per_hs_diploma_only_2011_15 column.
ax = sns.distplot(edu['per_hs_diploma_only_2011_15'], kde=False)
ax.set_xlabel('Percentage per county with only High School Diploma, 2011-2015')
ax.set_ylabel('Count')
ax.set_title('Education Across All US Counties', fontsize=16, fontname='Ubuntu')
plt.show()
In [34]:
# Histogram (KDE curve disabled) of the per_less_4_years_2011_15 column.
ax = sns.distplot(edu['per_less_4_years_2011_15'], kde=False)
ax.set_xlabel('Percentage per county with less than four years of college, 2011-2015')
ax.set_ylabel('Count')
ax.set_title('Education Across All US Counties', fontsize=16, fontname='Ubuntu')
plt.show()
In [35]:
# Histogram (KDE curve disabled) of the per_four_or_higher_2011_15 column.
ax = sns.distplot(edu['per_four_or_higher_2011_15'], kde=False)
ax.set_xlabel('Percentage per county with four or more years of college, 2011-2015')
ax.set_ylabel('Count')
ax.set_title('Education Across All US Counties', fontsize=16, fontname='Ubuntu')
plt.show()
In [36]:
# Multiply the 2016 party share columns by 100.
# Not idempotent: re-running this cell multiplies the values again.
for share_col in ('per_dem', 'per_gop'):
    election[share_col] = election[share_col] * 100
In [37]:
# Multiply the 2012 party share columns by 100.
# Not idempotent: re-running this cell multiplies the values again.
for share_col in ('per_dem_2012', 'per_gop_2012'):
    prev_election[share_col] = prev_election[share_col] * 100
In [38]:
def _percent_text_to_float(text):
    """Strip leading/trailing '%' characters from ``text`` and return it as a float."""
    return float(text.strip('%'))


# Convert per_point_diff from percent-formatted text to floats, element by element.
election['per_point_diff'] = election['per_point_diff'].map(_percent_text_to_float)
In [39]:
# Making a new column for positive and negative--if per_dem is below 50%, negative. If
# above 50%, positive.
In [40]:
# Preview the election table after the percentage conversions.
election.head()
Out[40]:
In [41]:
# Signed 2016 margin: Democratic share minus Republican share
# (positive = Democratic lead, negative = Republican lead).
election['election_range'] = election['per_dem'].sub(election['per_gop'])
In [42]:
# Signed 2012 margin: Democratic share minus Republican share
# (positive = Democratic lead, negative = Republican lead).
prev_election['election_range'] = prev_election['per_dem_2012'].sub(prev_election['per_gop_2012'])
In [43]:
# Column dtypes of the 2012 election table.
prev_election.dtypes
Out[43]:
In [44]:
import seaborn as sns
import matplotlib.pyplot as plt
# Histogram (KDE curve disabled) of the 2016 per_point_diff column.
ax = sns.distplot(election['per_point_diff'], kde=False)
ax.set_xlabel("Percent Difference")
ax.set_ylabel('Count')
ax.set_title('2016 Election Margins in All US Counties', fontsize=16)
plt.show()
In [45]:
# Preview the 2012 election table.
prev_election.head()
Out[45]:
In [46]:
# Histogram (KDE curve disabled) of the 2012 per_point_diff_2012 column.
# The title now says 2012: it previously read "2016" although 2012 data is plotted.
ax = sns.distplot(prev_election.per_point_diff_2012, kde=False)
ax.set(xlabel = "Percent Difference 2012", ylabel='Count')
ax.set_title('2012 Election Margins in All US Counties', fontsize=16)
plt.show()
In [47]:
import matplotlib.pyplot
In [48]:
# Histogram (KDE curve disabled) of the signed 2016 margin (per_dem - per_gop).
ax = sns.distplot(election['election_range'], kde=False)
ax.set_xlabel("(negative=Republican, positive=Democrat, %)")
ax.set_ylabel('Count')
ax.set_title('Partisan Pattern per All US Counties, 2016', fontsize=16, fontname='Ubuntu')
plt.show()
# Democrats are in HUGE trouble. Of course, this distribution doesn't mean that they're
# necessarily losing counties, but of those they held onto in 2016, they have a far, far
# weaker grasp on them than Republicans do on their side. Also, many of the Republican
# counties are in Red States with few electoral votes. However, for Congressional voting
# this is still a dangerous sign.
In [49]:
# Same histogram for 2012, for comparison with the 2016 distribution.
ax = sns.distplot(prev_election['election_range'], kde=False)
ax.set_xlabel("(negative=Republican, positive=Democrat, %)")
ax.set_ylabel('Count')
ax.set_title('Partisan Degrees per County, 2012', fontsize=15, fontname='Ubuntu')
plt.show()
# It was already bad. But it's clearly gotten worse for Democrats.
In [50]:
# Summary statistics for the 2016 election table.
election.describe()
Out[50]:
In [51]:
# Boolean flags for the margin band each county falls in, based on election_range.
# Bands: slight (0, 10], medium (10, 25], strong (25, 50], mirrored for the GOP side.
# A margin of exactly 0, or beyond +/-50, is False in every band.
margin = election['election_range']
election['slight_dem'] = (margin > 0) & (margin <= 10)
election['slight_gop'] = (margin >= -10) & (margin < 0)
election['med_dem'] = (margin > 10) & (margin <= 25)
election['med_gop'] = (margin >= -25) & (margin < -10)
election['strong_dem'] = (margin > 25) & (margin <= 50)
election['strong_gop'] = (margin >= -50) & (margin < -25)
In [52]:
# Preview the election table with the six margin-band flags.
election.head()
Out[52]:
In [53]:
#Combine the states and counties into a single column.
# Have to find a way to join the dfs by matching up those with the same
# county names AND the same state (there are counties with the same name)
# Simply concatenating them won't work.
In [54]:
#http://www.cnbc.com/heres-a-map-of-the-us-counties-that-flipped-to-trump-from-democrats/
In [57]:
# Cast the four unemployment measures to float.
for measure in ('labor_force', 'employed', 'unemployed', 'ue_rate'):
    ue_rates[measure] = ue_rates[measure].astype(float)
In [58]:
# Confirm the dtypes of the unemployment table after the casts.
ue_rates.dtypes
Out[58]:
In [111]:
# Left-join the 2016 election table onto the unemployment table by county_state.
# Overlapping column names coming from the election side get an '_r' suffix.
right = election.set_index('county_state')
left = ue_rates.set_index('county_state')
combined = left.join(right, lsuffix='', rsuffix='_r')
# Restore county_state as a regular column. The original line read from `combined_1`,
# a name that is never assigned, so the cell raised NameError on a fresh kernel.
combined = combined.reset_index()
In [112]:
# Left-join the running combined table onto the diversity table by county_state,
# then turn county_state back into a regular column.
right = combined.set_index('county_state')
left = div.set_index('county_state')
combined_2 = left.join(right, lsuffix='', rsuffix='_r').reset_index()
In [120]:
# Preview the education table before joining it.
edu.head()
Out[120]:
In [118]:
# Column names of the second combined table.
combined_2.columns
Out[118]:
In [113]:
# Left-join the running combined table onto the education table by county_state.
right = combined_2.set_index('county_state')
left = edu.set_index('county_state')
combined_3 = left.join(right, lsuffix='', rsuffix = '_r')
# Reset the index of the join result itself. The original reset `combined_2` here, which
# overwrote combined_3 and silently discarded every education column from later steps.
combined_3 = combined_3.reset_index()
In [117]:
# Column names of the third combined table.
combined_3.columns
Out[117]:
In [114]:
# Left-join the running combined table onto the population table by county_state,
# then turn county_state back into a regular column.
right = combined_3.set_index('county_state')
left = pop.set_index('county_state')
combined_4 = left.join(right, lsuffix='', rsuffix = '_r').reset_index()
In [115]:
# Missing-value count per column after all joins.
combined_4.isnull().sum()
Out[115]:
In [71]:
# Keep only fully populated rows of the combined table.
combined_4 = combined_4.dropna()
In [76]:
# Defensive re-filter: make sure no Alaska rows survived the joins.
combined_4 = combined_4.loc[combined_4['county_name'] != 'Alaska']
In [77]:
# Preview the cleaned combined table.
combined_4.head()
Out[77]:
In [78]:
# Summary statistics for the 2016 election table.
election.describe()
Out[78]:
In [80]:
# Histogram (KDE curve disabled) of the signed 2016 margin for the rows kept in combined_4.
ax = sns.distplot(combined_4['election_range'], kde=False)
ax.set_xlabel("(negative=Republican, positive=Democrat, %)")
ax.set_ylabel('Count')
ax.set_title('Partisan Pattern per All US Counties, 2016', fontsize=16, fontname='Ubuntu')
plt.show()
Visualizations were a little dense as far as the content they were showing; make sure you slow down and better explain them, or use visualizations whose labels make them a lot easier for the audience to understand.
In [ ]:
# All counties, not including those in Alaska.
In [83]:
# Subset of the combined table for Virginia, followed by a preview.
is_virginia = combined_4['state_abbr'] == 'VA'
VA = combined_4[is_virginia]
VA.head()
Out[83]:
In [84]:
# Histogram (KDE curve disabled) of the signed 2016 margin for Virginia rows.
ax = sns.distplot(VA['election_range'], kde=False)
ax.set_xlabel("Negative=Republican, Positive=Democrat (%)")
ax.set_ylabel('Count')
ax.set_title('Partisan Degree in Virginia Counties, 2016', fontsize=16, fontname='Ubuntu')
plt.show()
In [85]:
import matplotlib.pyplot as plt
import seaborn as sns
In [89]:
# Scatter plot with fitted regression line: diversity index vs. Democratic share (Virginia).
# x/y are passed by keyword because current seaborn no longer accepts them positionally.
ax = sns.regplot(x=VA.div_index, y=VA.per_dem)
ax.set(xlabel = 'Diversity Index', ylabel = 'County Vote Percent Democrat(%)')
ax.set_title("Diversity's Contribution to Democratic Votes in Virginia Counties", fontsize=20)
plt.show()
In [90]:
# Scatter plot with fitted regression line: diversity index vs. Republican share (Virginia).
# x/y are passed by keyword because current seaborn no longer accepts them positionally.
ax = sns.regplot(x=VA.div_index, y=VA.per_gop)
ax.set(xlabel = 'Diversity Index', ylabel = 'County Vote Percent Republican(%)')
ax.set_title("Diversity's Contribution to Republican Votes in Virginia Counties", fontsize=20)
plt.show()
In [100]:
# Scatter plot with fitted regression line: est_pop_2015 vs. Democratic share (Virginia).
# x/y are passed by keyword because current seaborn no longer accepts them positionally.
ax = sns.regplot(x=VA.est_pop_2015, y=VA.per_dem)
ax.set(xlabel = 'Population Per County', ylabel = 'County Vote Percent Democrats(%)')
ax.set_title("Population Level's Contribution to Democratic Votes in Virginia Counties", fontsize=20)
plt.show()
# Not much of a contribution at all in VA
In [99]:
# Scatter plot with fitted regression line: pop_change_2015 vs. Democratic share (Virginia).
# x/y are passed by keyword because current seaborn no longer accepts them positionally.
ax = sns.regplot(x=VA.pop_change_2015, y=VA.per_dem)
ax.set(xlabel = 'Population Change Per County', ylabel = 'County Vote Percent Democrats(%)')
ax.set_title("Population Change's Contribution to Democratic Votes in Virginia Counties", fontsize=20)
plt.show()
# Again, not much
In [105]:
# Scatter plot with fitted regression line: white_am vs. Democratic share (Virginia).
# x/y are passed by keyword because current seaborn no longer accepts them positionally.
ax = sns.regplot(x=VA.white_am, y=VA.per_dem)
ax.set(xlabel = 'Whites Per County(%)', ylabel = 'County Vote For Democrats(%)')
ax.set_title("White Americans' Contribution to Democratic Votes in Virginia Counties", fontsize=20)
plt.show()
In [104]:
# Scatter plot with fitted regression line: white_am vs. Republican share (Virginia).
# x/y are passed by keyword because current seaborn no longer accepts them positionally.
ax = sns.regplot(x=VA.white_am, y=VA.per_gop)
ax.set(xlabel = 'Whites Per County(%)', ylabel = 'County Vote For Republicans(%)')
ax.set_title("White Americans' Contribution to Republican Votes in Virginia Counties", fontsize=20)
plt.show()
In [107]:
# Scatter plot with fitted regression line: af_am vs. Democratic share (Virginia).
# x/y are passed by keyword because current seaborn no longer accepts them positionally.
ax = sns.regplot(x=VA.af_am, y=VA.per_dem)
ax.set(xlabel = 'African Americans Per County(%)', ylabel = 'County Vote For Democrats(%)')
ax.set_title("African Americans' Contribution to Democratic Votes in Virginia Counties", fontsize=20)
plt.show()
In [108]:
# Scatter plot with fitted regression line: af_am vs. Republican share (Virginia).
# x/y are passed by keyword because current seaborn no longer accepts them positionally.
ax = sns.regplot(x=VA.af_am, y=VA.per_gop)
ax.set(xlabel = 'African Americans Per County(%)', ylabel = 'County Vote For Republicans(%)')
ax.set_title("African Americans' Contribution to Republican Votes in Virginia Counties", fontsize=20)
plt.show()
In [110]:
# Scatter plot with fitted regression line: asian_am vs. Democratic share (Virginia).
# x/y are passed by keyword because current seaborn no longer accepts them positionally.
ax = sns.regplot(x=VA.asian_am, y=VA.per_dem)
ax.set(xlabel = 'Asian Americans Per County(%)', ylabel = 'County Vote For Democrats(%)')
ax.set_title("Asian Americans' Contribution to Democratic Votes in Virginia Counties", fontsize=20)
plt.show()
#There's a slight correlation, but nothing too substantive. Need to id these specific counties.
In [ ]:
# Scatter plot with fitted regression line: hisp_lat_am vs. Democratic share (Virginia).
# The original cell was an unfinished template (`sns.regplot(VA., VA.)`, a syntax error)
# that still carried the Asian American labels. hisp_lat_am is the remaining demographic
# share not yet plotted for Virginia -- NOTE(review): confirm this is the intended pairing.
ax = sns.regplot(x=VA.hisp_lat_am, y=VA.per_dem)
ax.set(xlabel = 'Hispanic/Latino Per County(%)', ylabel = 'County Vote For Democrats(%)')
ax.set_title("Hispanic/Latino Americans' Contribution to Democratic Votes in Virginia Counties", fontsize=20)
plt.show()
In [85]:
# Per-state subsets for the swing states of this election, plus one stacked table.
# NOTE(review): `combined_5` is not assigned in any visible cell -- confirm where it comes from.
swing_state_abbrs = ['IA', 'WI', 'MI', 'PA', 'FL', 'NC', 'OH', 'MN']
IA, WI, MI, PA, FL, NC, OH, MN = [
    combined_5[combined_5['state_abbr'] == abbr] for abbr in swing_state_abbrs
]
# Rows are stacked state by state, in the order listed above.
swing_states = pd.concat([IA, WI, MI, PA, FL, NC, OH, MN])
In [86]:
# Preview the stacked swing-state table.
swing_states.head()
Out[86]:
In [87]:
# Histogram (KDE curve disabled) of the signed 2016 margin for swing-state rows.
ax = sns.distplot(swing_states['election_range'], kde=False)
ax.set_xlabel("Negative=Republican, Positive=Democrat (%)")
ax.set_ylabel('Count')
ax.set_title('Partisan Degree in All Swing State Counties, 2016', fontsize=16, fontname='Ubuntu')
plt.show()
# As expected, in swing states it's not AS bad for Democrats compared to the rest of the
# country but still quite dire.
In [96]:
# Scatter plot with fitted regression line: white_am vs. Democratic share (all rows).
# x/y are passed by keyword because current seaborn no longer accepts them positionally.
ax = sns.regplot(x=combined_5.white_am, y=combined_5.per_dem)
ax.set(xlabel = 'Percentage White American(%)', ylabel = 'County Vote Percent Democrat(%)')
plt.show()
In [97]:
# Scatter plot with fitted regression line: white_am vs. Republican share (all rows).
# x/y are passed by keyword because current seaborn no longer accepts them positionally.
ax = sns.regplot(x=combined_5.white_am, y=combined_5.per_gop)
ax.set(xlabel = 'Percentage White American(%)', ylabel = 'County Vote Percent Republican(%)')
plt.show()
# It's scattered, but there is still a strong correlation between percentage white
# population and Republican vote.
In [98]:
# Scatter plot with fitted regression line: af_am vs. Democratic share (all rows).
# x/y are passed by keyword because current seaborn no longer accepts them positionally.
# Also fixes the "Democrtic" typo in the title.
ax = sns.regplot(x=combined_5.af_am, y=combined_5.per_dem)
ax.set(xlabel = 'Percentage African American(%)', ylabel = 'County Vote Percent Democrat(%)')
ax.set_title('African American Influence on 2016 Democratic Vote in All US Counties', fontsize=15)
plt.show()
In [99]:
# Scatter plot with fitted regression line: af_am vs. Republican share (all rows).
# x/y are passed by keyword because current seaborn no longer accepts them positionally.
ax = sns.regplot(x=combined_5.af_am, y=combined_5.per_gop)
ax.set(xlabel = 'Percentage African American(%)', ylabel = 'County Vote Percent Republican(%)')
ax.set_title('African American Influence on 2016 Republican Vote in All US Counties', fontsize=15)
plt.show()
In [100]:
# Scatter plot with fitted regression line: hisp_lat_am vs. Democratic share (all rows).
# x/y are passed by keyword because current seaborn no longer accepts them positionally.
ax = sns.regplot(x=combined_5.hisp_lat_am, y=combined_5.per_dem)
ax.set(xlabel = 'Percentage Hispanic/Latino(%)', ylabel = 'County Vote Percent Democrat(%)')
ax.set_title('Hispanic/Latino Influence on 2016 Democratic Vote in All US Counties', fontsize=15)
plt.show()
In [101]:
# Scatter plot with fitted regression line: hisp_lat_am vs. Republican share (all rows).
# x/y are passed by keyword because current seaborn no longer accepts them positionally.
ax = sns.regplot(x=combined_5.hisp_lat_am, y=combined_5.per_gop)
ax.set(xlabel = 'Percentage Hispanic/Latino(%)', ylabel = 'County Vote Percent Republican(%)')
ax.set_title('Hispanic/Latino Influence on 2016 Republican Vote in All US Counties', fontsize=15)
plt.show()
# A correlation is there, but it's not that strong due to the sheer number of
# counties with little hispanic/latino population.
In [102]:
# Scatter plot with fitted regression line: asian_am vs. Democratic share (all rows).
# x/y are passed by keyword because current seaborn no longer accepts them positionally.
# The y-axis label now says Democrat: it read "Republican" although per_dem is plotted.
ax = sns.regplot(x=combined_5.asian_am, y=combined_5.per_dem)
ax.set(xlabel = 'Percentage Asian American(%)', ylabel = 'County Vote Percent Democrat(%)')
ax.set_title('Asian American Influence on 2016 Democratic Vote in All US Counties', fontsize=15)
plt.show()
In [103]:
# Scatter plot with fitted regression line: asian_am vs. Republican share (all rows).
# x/y are passed by keyword because current seaborn no longer accepts them positionally.
ax = sns.regplot(x=combined_5.asian_am, y=combined_5.per_gop)
ax.set(xlabel = 'Percentage Asian American(%)', ylabel = 'County Vote Percent Republican(%)')
ax.set_title('Asian American Influence on 2016 Republican Vote in All US Counties', fontsize=15)
plt.show()
In [ ]:
In [104]:
# Scatter plot with fitted regression line: diversity index vs. signed margin (swing states).
# x/y are passed by keyword because current seaborn no longer accepts them positionally.
ax = sns.regplot(x=swing_states.div_index, y=swing_states.election_range)
ax.set(xlabel = 'Diversity Index', ylabel = 'Election Range, Neg=Republican, Pos=Democrat(%)')
ax.set_title("Diversity's Effect on Swing State Votes", fontsize=20, fontname='Ubuntu')
plt.show()
In [105]:
# Scatter plot with fitted regression line: diversity index vs. Democratic share (swing states).
# x/y are passed by keyword because current seaborn no longer accepts them positionally.
ax = sns.regplot(x=swing_states.div_index, y=swing_states.per_dem)
ax.set(xlabel = 'Diversity Index', ylabel = 'County Vote Percent Democrat(%)')
ax.set_title("Diversity's Effect on Democratic Vote in Swing States", fontsize=20, fontname='Ubuntu')
plt.show()
In [106]:
# Scatter plot with fitted regression line: diversity index vs. Republican share (swing states).
# x/y are passed by keyword because current seaborn no longer accepts them positionally.
ax = sns.regplot(x=swing_states.div_index, y=swing_states.per_gop)
ax.set(xlabel = 'Diversity Index', ylabel = 'County Vote Percent Republican(%)')
ax.set_title("Diversity's Effect on Republican Vote in Swing States", fontsize=20, fontname='Ubuntu')
plt.show()
In [107]:
# Scatter plot with fitted regression line: unemployment rate vs. signed margin (swing states).
# x/y are passed by keyword because current seaborn no longer accepts them positionally.
ax = sns.regplot(x=swing_states.ue_rate, y=swing_states.election_range)
ax.set(xlabel = 'Unemployment Rate(%)', ylabel = 'Election Range(%)')
plt.show()
In [108]:
# No discernible relationship for unemployment in the swing states, just as in the overall dataset.
In [109]:
# Scatter plot with fitted regression line: white_am vs. Democratic share (swing states).
# x/y are passed by keyword because current seaborn no longer accepts them positionally.
# Also fixes the "Contribtuion" typo in the title.
ax = sns.regplot(x=swing_states.white_am, y=swing_states.per_dem)
ax.set(xlabel = 'Percentage White American(%)', ylabel = 'County Vote Percent Democrat(%)')
ax.set_title("White Americans' Contribution to 2016 Swing State Democratic Vote", fontsize=16)
plt.show()
In [110]:
# Scatter plot with fitted regression line: white_am vs. Republican share (swing states).
# x/y are passed by keyword because current seaborn no longer accepts them positionally.
# Also fixes the "Contribtuion" typo in the title.
ax = sns.regplot(x=swing_states.white_am, y=swing_states.per_gop)
ax.set(xlabel = 'Percentage White American(%)', ylabel = 'County Vote Percent Republican(%)')
ax.set_title("White Americans' Contribution to 2016 Swing State Republican Vote", fontsize=16)
plt.show()
In [111]:
# Look for how incomes of white americans influence how they vote.
In [112]:
# Scatter plot with fitted regression line: af_am vs. Democratic share (swing states).
# x/y are passed by keyword because current seaborn no longer accepts them positionally.
ax = sns.regplot(x=swing_states.af_am, y=swing_states.per_dem)
ax.set(xlabel = 'Percentage African American(%)', ylabel = 'County Vote Percent Democrat(%)')
ax.set_title('African American Influence on 2016 Democratic Vote in Swing State Counties', fontsize=15)
plt.show()
In [113]:
# Scatter plot with fitted regression line: af_am vs. Republican share (swing states).
# x/y are passed by keyword because current seaborn no longer accepts them positionally.
ax = sns.regplot(x=swing_states.af_am, y=swing_states.per_gop)
ax.set(xlabel = 'Percentage African American(%)', ylabel = 'County Vote Percent Republican(%)')
ax.set_title('African American Influence on 2016 Republican Vote in Swing State Counties', fontsize=15)
plt.show()
In [114]:
# Scatter plot with fitted regression line: hisp_lat_am vs. Democratic share (swing states).
# x/y are passed by keyword because current seaborn no longer accepts them positionally.
ax = sns.regplot(x=swing_states.hisp_lat_am, y=swing_states.per_dem)
ax.set(xlabel = 'Percentage Hispanic/Latino(%)', ylabel = 'County Vote Percent Democrat(%)')
plt.show()
In [115]:
# Again, a scattered, but strong correlation.
In [116]:
# The change in the uninsured rate does not appear to have benefitted Democrats,
# but does appear to have benefitted Republicans.
In [117]:
# Column names of the education table, for reference in the plots that follow.
edu.columns
Out[117]:
In [118]:
# Scatter plot with fitted regression line: HS-diploma-only share vs. Republican share (all rows).
# x/y are passed by keyword because current seaborn no longer accepts them positionally.
ax = sns.regplot(x=combined_5.per_hs_diploma_only_2011_15, y=combined_5.per_gop)
ax.set(xlabel = 'High School Diploma Only(%)', ylabel = 'County Vote Percent Republican(%)')
ax.set_title("Lower Education's Contribution to 2016 Republican Vote in All US Counties", fontsize=16)
plt.show()
In [119]:
# Scatter plot with fitted regression line: four-or-more-years share vs. Republican share (all rows).
# x/y are passed by keyword because current seaborn no longer accepts them positionally.
ax = sns.regplot(x=combined_5.per_four_or_higher_2011_15, y=combined_5.per_gop)
ax.set(xlabel = 'Four or more University Years(%)', ylabel = 'County Vote Percent Republican(%)')
ax.set_title("Higher Education's Contribution to 2016 Republican Vote in All US Counties", fontsize=16)
plt.show()
In [120]:
# Scatter plot with fitted regression line: HS-diploma-only share vs. Democratic share (all rows).
# x/y are passed by keyword because current seaborn no longer accepts them positionally.
ax = sns.regplot(x=combined_5.per_hs_diploma_only_2011_15, y=combined_5.per_dem)
ax.set(xlabel = 'High School Diploma Only(%)', ylabel = 'County Vote Percent Democrat(%)')
ax.set_title("Lower Education's Contribution to 2016 Democratic Vote", fontsize=16)
plt.show()
In [121]:
# Scatter plot with fitted regression line: four-or-more-years share vs. Democratic share (all rows).
# x/y are passed by keyword because current seaborn no longer accepts them positionally.
# The y-axis label now says Democrat: it read "Republican" although per_dem is plotted.
ax = sns.regplot(x=combined_5.per_four_or_higher_2011_15, y=combined_5.per_dem)
ax.set(xlabel = 'Four or more University Years(%)', ylabel = 'County Vote Percent Democrat(%)')
ax.set_title("Higher Education's Contribution to 2016 Democratic Vote in All US Counties", fontsize=16)
plt.show()
In [122]:
# Scatter plot with fitted regression line: HS-diploma-only share vs. signed margin (all rows).
# x/y are passed by keyword because current seaborn no longer accepts them positionally.
ax = sns.regplot(x=combined_5.per_hs_diploma_only_2011_15, y=combined_5.election_range)
ax.set(xlabel = 'High School Diploma Only per County(%)', ylabel = 'Election Range (neg=Rep, pos=Dem, %)')
ax.set_title("Lower Education's Contribution to 2016 Vote", fontsize=16)
plt.show()
In [123]:
# Scatter plot with fitted regression line: four-or-more-years share vs. signed margin (all rows).
# x/y are passed by keyword because current seaborn no longer accepts them positionally.
ax = sns.regplot(x=combined_5.per_four_or_higher_2011_15, y=combined_5.election_range)
ax.set(xlabel = 'Four or more University Years per County(%)', ylabel = 'Election Range (neg=Rep, pos=Dem, %)')
ax.set_title("Higher Education's Contribution to 2016 Vote in All US Counties", fontsize=16)
plt.show()
In [124]:
# Scatter plot with fitted regression line: HS-diploma-only share vs. signed margin (swing states).
# x/y are passed by keyword because current seaborn no longer accepts them positionally.
ax = sns.regplot(x=swing_states.per_hs_diploma_only_2011_15, y=swing_states.election_range)
ax.set(xlabel = 'High School Diploma Only per County(%)', ylabel = 'Election Range (neg=Rep, pos=Dem, %)')
ax.set_title("Lower Education's Contribution to 2016 Vote in Swing State Counties", fontsize=16)
plt.show()
In [125]:
# Scatter plot with fitted regression line: four-or-more-years share vs. signed margin (swing states).
# x/y are passed by keyword because current seaborn no longer accepts them positionally.
ax = sns.regplot(x=swing_states.per_four_or_higher_2011_15, y=swing_states.election_range)
ax.set(xlabel = 'Four or more University Years per County(%)', ylabel = 'Election Range (neg=Rep, pos=Dem, %)')
ax.set_title("Higher Education's Contribution to 2016 Vote in Swing State Counties", fontsize=16)
plt.show()
In [126]:
# If a county has a higher percentage of people with only a hs diploma, then more likely
# to vote Republican. If a county has a higher proportion of 4+ college degrees, then
# more likely to go Democrat. Pretty much aligns with Nate Silver's argument.
In [127]:
# Preview the labor_force column before plotting it.
combined_5.labor_force.head()
Out[127]:
In [128]:
# Scatter plot with fitted regression line: labor_force vs. signed margin (all rows).
# x/y are passed by keyword because current seaborn no longer accepts them positionally.
ax = sns.regplot(x=combined_5.labor_force, y=combined_5.election_range)
ax.set(xlabel = 'Labor Force Body per County', ylabel = 'Election Range(neg=Rep, pos=Dem, %)')
ax.set_title("Labor Force Contribution to Votes in All counties", fontsize=16)
plt.show()
In [129]:
# Show a single row to look up the available column names and sample values.
combined_5.head(1)
Out[129]:
In [130]:
# Scatter plot with fitted regression line: est_pop_2015 vs. signed margin (all rows).
# x/y are passed by keyword because current seaborn no longer accepts them positionally.
# Also fixes the "Popularion" typo in the x-axis label.
ax = sns.regplot(x=combined_5.est_pop_2015, y=combined_5.election_range)
ax.set(xlabel = 'Population per County (2015)', ylabel = '2016 Election Range(neg=Rep, pos=Dem, %)')
ax.set_title("Population Contribution to Votes in All Counties", fontsize=16)
plt.show()
In [131]:
# Population size per county does correlate with vote.
In [132]:
# Scatter plot with fitted regression line: pop_change_2015 vs. signed margin (all rows).
# x/y are passed by keyword because current seaborn no longer accepts them positionally.
ax = sns.regplot(x=combined_5.pop_change_2015, y=combined_5.election_range)
ax.set(xlabel = 'Population Change per County(2015)', ylabel = '2016 Election Range(neg=Rep, pos=Dem, %)')
ax.set_title("Population Change Contribution to Votes in All Counties", fontsize=16)
plt.show()
In [133]:
# Counties that experienced a positive change in population saw a boost for Dems.
In [134]:
# Although there is that cluster towards zero, and the correlation is broad, there
# is still something there.
Most predictive features for counties' vote found through EDA:
(note that these variables, sometimes by their nature, don't necessarily follow a normal distribution)
Percentage White American population
Percentage African American population
Percentage Asian American population
Percentage High School Diploma only
Percentage Four or more years of University
In [135]:
# Column names (and therefore positions) of combined_5, used to pick what to drop below.
combined_5.columns
Out[135]:
In [136]:
# Build the modeling table by dropping the columns at these positions of combined_5.
# `.columns[[...]]` makes the positional lookup explicit; integer keys inside
# `combined_5[[...]]` are treated as labels by current pandas and raise KeyError.
_dropped_positions = [0,1,2,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,25,34,35,36,37,38,39,40,52,53]
modeling = combined_5.drop(combined_5.columns[_dropped_positions], axis=1)
In [137]:
# Preview the modeling table.
modeling.head()
Out[137]:
In [138]:
# Missing-value count per column in the modeling table.
modeling.isnull().sum()
Out[138]:
In [139]:
# Keep only fully populated rows of the modeling table.
modeling = modeling.dropna()
#Only 46 isn't too significant.
In [140]:
# `sklearn.cross_validation` was removed from scikit-learn; the same helpers live in
# `sklearn.model_selection`. Fall back to the old module only on very old installs.
try:
    from sklearn.model_selection import train_test_split, cross_val_score
except ImportError:
    from sklearn.cross_validation import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import confusion_matrix, mean_squared_error
In [141]:
# Ordinary linear regression model with default settings; reused for both regressions below.
lr = LinearRegression()
In [142]:
# Column names (and positions) of the modeling table.
modeling.columns
Out[142]:
In [143]:
# Features: the first twelve columns of the modeling table, selected by position.
# `.iloc` is used because integer keys inside `modeling[[...]]` are treated as labels by
# current pandas and raise KeyError on string-named columns.
X = modeling.iloc[:, 0:12]
# Target: signed 2016 margin.
y = modeling['election_range']
# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=99)
In [144]:
# Show only the header row to confirm which feature columns were selected.
X.head(0)
Out[144]:
In [145]:
# Fit the linear model on the training split and predict the held-out split.
lr.fit(X_train,y_train)
y_pred = lr.predict(X_test)
In [146]:
# Actual (x) vs. predicted (y) margin on the held-out split.
# x/y are passed by keyword because current seaborn no longer accepts them positionally.
# The axis labels were swapped: y_test holds the actual values and y_pred the predictions.
ax = sns.regplot(x=y_test, y=y_pred)
ax.set(xlabel = 'Actual Election Range(neg=Rep, pos=Dem)', ylabel = 'Predicted Election Range (neg=Rep, pos=Dem)')
ax.set_title("Predicted vs. Actual Election Ranges for All Counties", fontsize=16)
plt.show()
In [147]:
# R^2 on the training split and on the held-out split, shown as (train, test).
# The training score alone overstates how well the model generalises.
lr.score(X_train, y_train), lr.score(X_test, y_test)
Out[147]:
In [246]:
# Swing-state modeling table: drop the same column positions as for the all-county table.
# `.columns[[...]]` makes the positional lookup explicit; integer keys inside
# `swing_states[[...]]` are treated as labels by current pandas and raise KeyError.
_dropped_positions = [0,1,2,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,25,34,35,36,37,38,39,40,52,53]
s_modeling = swing_states.drop(swing_states.columns[_dropped_positions], axis=1)
In [247]:
# Header row of the full swing-state table, for comparison with the trimmed one.
swing_states.head(0)
Out[247]:
In [248]:
# Header row of the swing-state modeling table.
s_modeling.head(0)
Out[248]:
In [249]:
# Features: the first twelve columns of the swing-state modeling table, selected by position.
# `.iloc` is used because integer keys inside `s_modeling[[...]]` are treated as labels by
# current pandas and raise KeyError on string-named columns.
X = s_modeling.iloc[:, 0:12]
# Target: signed 2016 margin.
y = s_modeling['election_range']
# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=99)
In [250]:
# Preview the swing-state feature matrix.
X.head()
Out[250]:
In [251]:
# Refit the same linear model object on the swing-state training split and predict the held-out split.
lr.fit(X_train,y_train)
y_pred = lr.predict(X_test)
In [253]:
# Actual (x) vs. predicted (y) margin on the held-out swing-state split.
# x/y are passed by keyword because current seaborn no longer accepts them positionally.
# The axis labels were swapped: y_test holds the actual values and y_pred the predictions.
ax = sns.regplot(x=y_test, y=y_pred)
ax.set(xlabel = 'Actual Election Range(neg=Rep, pos=Dem)', ylabel = 'Predicted Election Range (neg=Rep, pos=Dem)')
ax.set_title("Predicted vs. Actual Election Ranges for Swing State Counties", fontsize=16)
plt.show()
In [254]:
# R^2 on the training split and on the held-out split, shown as (train, test).
# The training score alone overstates how well the model generalises.
lr.score(X_train, y_train), lr.score(X_test, y_test)
# Right around the same R^2 score as all counties.
Out[254]:
Now we want to see what features classify a county as "slight_dem", "slight_gop", "med_dem", "med_gop", "strong_dem", or "strong_gop".
In [206]:
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier, RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, precision_score, recall_score, roc_curve, auc
In [159]:
# Setting the number of neighbors to the square root of number of instances is a good
# rule of thumb.
knn = KNeighborsClassifier(n_neighbors = 55)
# A fixed random_state makes the forest (and every score derived from it) reproducible
# across kernel restarts; without it each run grows different trees.
rfc = RandomForestClassifier(max_depth = 5, random_state = 0)
In [160]:
# Preview the modeling table before adding the class indicator columns.
modeling.head()
Out[160]:
In [161]:
In [162]:
# Indicator columns for the six margin bands, appended to the all-county modeling table.
# `dummies` is built here so the cell no longer depends on a variable left over from a
# removed cell (on a fresh kernel `dummies` was undefined at this point).
# NOTE(review): assumes get_dummies encodes these band columns into new *_True/*_False
# style columns, as the positional target selections further down expect -- confirm.
dummies = pd.get_dummies(modeling[['slight_dem','slight_gop','med_dem','med_gop','strong_dem','strong_gop']])
c_modeling = modeling.join(dummies)
# Renumber rows from 0 and discard the old index. This replaces the former
# "reset_index() then drop column 0" pair, which relied on integer-key column lookup.
c_modeling = c_modeling.reset_index(drop=True)
In [163]:
# Preview the classification table.
c_modeling.head()
Out[163]:
In [164]:
# Column names (and positions) of the classification table; targets are picked by position below.
c_modeling.columns
Out[164]:
In [255]:
# Swing State Classifiers
# Indicator columns for the six margin bands, appended to the swing-state modeling table.
dummies = pd.get_dummies(s_modeling[['slight_dem','slight_gop','med_dem','med_gop','strong_dem','strong_gop']])
cs_modeling = s_modeling.join(dummies)
# Renumber rows from 0 and discard the old index. The former two-step version looked up
# the column to drop through c_modeling (the all-county table) instead of cs_modeling and
# relied on integer-key column lookup.
cs_modeling = cs_modeling.reset_index(drop=True)
In [265]:
# First try KNN for just slight dem and slight gop.
# Columns are selected by position with `.iloc`; integer keys inside `df[[...]]` are
# treated as labels by current pandas and raise KeyError on string-named columns.
X = c_modeling.iloc[:, 0:12]
y = c_modeling.iloc[:, [29, 30, 31, 32]]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
In [266]:
# Preview the classifier feature matrix.
X.head()
Out[266]:
In [267]:
# Preview the selected target columns.
y.head()
Out[267]:
In [268]:
# Fit the KNN classifier on the current training split.
knn.fit(X_train, y_train)
Out[268]:
In [269]:
# Predict the held-out split with the fitted KNN classifier.
y_pred = knn.predict(X_test)
In [270]:
# Training accuracy, held-out accuracy, 5-fold CV scores and per-label report for KNN.
# print(...) with parentheses works on both Python 2 and Python 3; the bare
# print statement is a SyntaxError on Python 3.
print(knn.score(X_train,y_train))
print(accuracy_score(y_test, y_pred))
print(cross_val_score(knn, X_train, y_train, cv=5))
print(classification_report(y_test,y_pred))
In [ ]:
In [215]:
#KNN for med_dem and med_gop
# Columns are selected by position with `.iloc`; integer keys inside `df[[...]]` are
# treated as labels by current pandas and raise KeyError on string-named columns.
X = c_modeling.iloc[:, 0:12]
y = c_modeling.iloc[:, [33, 34, 35, 36]]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
# Refit on the new targets before predicting. Without this the predictions came from the
# model trained on the previous label set and were scored against these labels.
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
In [216]:
# Training accuracy, held-out accuracy, 5-fold CV scores and per-label report for KNN.
# print(...) with parentheses works on both Python 2 and Python 3; the bare
# print statement is a SyntaxError on Python 3.
print(knn.score(X_train,y_train))
print(accuracy_score(y_test, y_pred))
print(cross_val_score(knn, X_train, y_train, cv=5))
print(classification_report(y_test,y_pred))
In [ ]:
#KNN for strong dem and strong gop
# Columns are selected by position with `.iloc`; integer keys inside `df[[...]]` are
# treated as labels by current pandas and raise KeyError on string-named columns.
X = c_modeling.iloc[:, 0:12]
y = c_modeling.iloc[:, [37, 38, 39, 40]]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
# Refit on the new targets before predicting. Without this the predictions came from the
# model trained on a previous label set and were scored against these labels.
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
In [218]:
# Training accuracy, held-out accuracy, 5-fold CV scores and per-label report for KNN.
# print(...) with parentheses works on both Python 2 and Python 3; the bare
# print statement is a SyntaxError on Python 3.
print(knn.score(X_train,y_train))
print(accuracy_score(y_test, y_pred))
print(cross_val_score(knn, X_train, y_train, cv=5))
print(classification_report(y_test,y_pred))
In [256]:
#First slight dem and slight gop
# Columns are selected by position with `.iloc`; integer keys inside `df[[...]]` are
# treated as labels by current pandas and raise KeyError on string-named columns.
X = cs_modeling.iloc[:, 0:12]
y = cs_modeling.iloc[:, [29, 30, 31, 32]]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
knn.fit(X_train, y_train)
Out[256]:
In [257]:
# Predict the held-out split with the fitted KNN classifier.
y_pred = knn.predict(X_test)
In [258]:
# Training accuracy, held-out accuracy, 5-fold CV scores and per-label report for KNN.
# print(...) with parentheses works on both Python 2 and Python 3; the bare
# print statement is a SyntaxError on Python 3.
print(knn.score(X_train,y_train))
print(accuracy_score(y_test, y_pred))
print(cross_val_score(knn, X_train, y_train, cv=5))
print(classification_report(y_test,y_pred))
In [259]:
#KNN for med_dem and med_gop
# Columns are selected by position with `.iloc`; integer keys inside `df[[...]]` are
# treated as labels by current pandas and raise KeyError on string-named columns.
X = cs_modeling.iloc[:, 0:12]
y = cs_modeling.iloc[:, [33, 34, 35, 36]]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
# Fit on the new targets before predicting. The original predicted first, so y_pred came
# from the model trained on the previous label set.
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
In [260]:
# Fit the KNN classifier on the current training split.
knn.fit(X_train,y_train)
Out[260]:
In [261]:
# Training accuracy, held-out accuracy, 5-fold CV scores and per-label report for KNN.
# print(...) with parentheses works on both Python 2 and Python 3; the bare
# print statement is a SyntaxError on Python 3.
print(knn.score(X_train,y_train))
print(accuracy_score(y_test, y_pred))
print(cross_val_score(knn, X_train, y_train, cv=5))
print(classification_report(y_test,y_pred))
In [262]:
# KNN for strong dem and strong gop (swing states).
# Columns are selected by position with `.iloc`; integer keys inside `df[[...]]` are
# treated as labels by current pandas and raise KeyError on string-named columns.
X = cs_modeling.iloc[:, 0:12]
y = cs_modeling.iloc[:, [37, 38, 39, 40]]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
# Fit before predicting. The original order was predict-then-fit, so y_pred came from the
# model trained on the previous label set.
knn.fit(X_train,y_train)
y_pred = knn.predict(X_test)
Out[262]:
In [263]:
# Training accuracy, held-out accuracy, 5-fold CV scores and per-label report for KNN.
# print(...) with parentheses works on both Python 2 and Python 3; the bare
# print statement is a SyntaxError on Python 3.
print(knn.score(X_train,y_train))
print(accuracy_score(y_test, y_pred))
print(cross_val_score(knn, X_train, y_train, cv=5))
print(classification_report(y_test,y_pred))
Modeling for the "strong" counties of 25-50% is not that predictive.
In [178]:
## Random Forests
In [224]:
# Random forest, slight-margin targets (all counties).
# Columns are selected by position with `.iloc`; integer keys inside `df[[...]]` are
# treated as labels by current pandas and raise KeyError on string-named columns.
X = c_modeling.iloc[:, 0:12]
y = c_modeling.iloc[:, [29, 30, 31, 32]]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
In [225]:
# Fit the random forest on the current training split.
rfc.fit(X_train, y_train)
Out[225]:
In [226]:
# Predict the held-out split with the fitted random forest.
y_pred = rfc.predict(X_test)
In [227]:
# Training accuracy, held-out accuracy, 5-fold CV scores and per-label report for the
# random forest. The score and cross-validation lines previously used `knn`, so they
# reported the KNN model in the random-forest section.
# print(...) with parentheses works on both Python 2 and Python 3.
print(rfc.score(X_train,y_train))
print(accuracy_score(y_test, y_pred))
print(cross_val_score(rfc, X_train, y_train, cv=5))
print(classification_report(y_test,y_pred))
In [228]:
# Random forest, medium-margin targets (all counties).
# Columns are selected by position with `.iloc`; integer keys inside `df[[...]]` are
# treated as labels by current pandas and raise KeyError on string-named columns.
X = c_modeling.iloc[:, 0:12]
y = c_modeling.iloc[:, [33, 34, 35, 36]]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
In [229]:
# Fit the random forest on the current training split.
rfc.fit(X_train, y_train)
Out[229]:
In [230]:
# Predict the held-out split with the fitted random forest.
y_pred = rfc.predict(X_test)
In [231]:
# Training accuracy, held-out accuracy, 5-fold CV scores and per-label report for the
# random forest. The score and cross-validation lines previously used `knn`, so they
# reported the KNN model in the random-forest section.
# print(...) with parentheses works on both Python 2 and Python 3.
print(rfc.score(X_train,y_train))
print(accuracy_score(y_test, y_pred))
print(cross_val_score(rfc, X_train, y_train, cv=5))
print(classification_report(y_test,y_pred))
In [232]:
# Random forest, strong-margin targets (all counties).
# Columns are selected by position with `.iloc`; integer keys inside `df[[...]]` are
# treated as labels by current pandas and raise KeyError on string-named columns.
X = c_modeling.iloc[:, 0:12]
y = c_modeling.iloc[:, [37, 38, 39, 40]]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
In [233]:
# Fit the random forest on the current training split.
rfc.fit(X_train, y_train)
Out[233]:
In [234]:
# Predict the held-out split with the fitted random forest.
y_pred = rfc.predict(X_test)
In [235]:
# Training accuracy, held-out accuracy, 5-fold CV scores and per-label report for the
# random forest. The score and cross-validation lines previously used `knn`, so they
# reported the KNN model in the random-forest section.
# print(...) with parentheses works on both Python 2 and Python 3.
print(rfc.score(X_train,y_train))
print(accuracy_score(y_test, y_pred))
print(cross_val_score(rfc, X_train, y_train, cv=5))
print(classification_report(y_test,y_pred))
In [191]:
# Just like in KNN, not the best classifier for "strong counties."
In [192]:
## Problem statement: What are the economic and demographic factors we can use to predict
## whether a county votes Democrat or Republican? More specifically, how do these factors
## affect the margin of a Democrat or Republican winning the vote in a swing state county?
## Furthermore, are the parties becoming racial identity parties--how much does the data
## convey this?
In [193]:
## Potential questions down the line:
## Look closely at the election week's coverage and how to build off that
## How many misleading data-driven stories have there been? Atlantic--said most predictive
## question was whether Obama was born here (bunch of false positives)==precision vs recall
## problem. Look at HOW METRICS HAVE BEEN ABUSED.
## DEBUNK these stories.