notebook.community

Edit and run



In [181]:

    
import pandas as pd
from sklearn.cluster import KMeans

%matplotlib inline
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = (10, 10)



In [3]:

    
#Load data
county_facts = pd.read_csv('../kaggle-data/county_facts.csv', index_col=['fips', 'area_name'])
county_facts_columns = pd.read_csv('../kaggle-data/county_facts_dictionary.csv')
primary_results = pd.read_csv('../kaggle-data/primary_results.csv')



In [4]:

    
#Take a look at the columns
county_facts_columns









    Out[4]:






  
    
      
      column_name
      description
    
  
  
    
      0
      PST045214
      Population, 2014 estimate
    
    
      1
      PST040210
      Population, 2010 (April 1) estimates base
    
    
      2
      PST120214
      Population, percent change - April 1, 2010 to ...
    
    
      3
      POP010210
      Population, 2010
    
    
      4
      AGE135214
      Persons under 5 years, percent, 2014
    
    
      5
      AGE295214
      Persons under 18 years, percent, 2014
    
    
      6
      AGE775214
      Persons 65 years and over, percent, 2014
    
    
      7
      SEX255214
      Female persons, percent, 2014
    
    
      8
      RHI125214
      White alone, percent, 2014
    
    
      9
      RHI225214
      Black or African American alone, percent, 2014
    
    
      10
      RHI325214
      American Indian and Alaska Native alone, perce...
    
    
      11
      RHI425214
      Asian alone, percent, 2014
    
    
      12
      RHI525214
      Native Hawaiian and Other Pacific Islander alo...
    
    
      13
      RHI625214
      Two or More Races, percent, 2014
    
    
      14
      RHI725214
      Hispanic or Latino, percent, 2014
    
    
      15
      RHI825214
      White alone, not Hispanic or Latino, percent, ...
    
    
      16
      POP715213
      Living in same house 1 year & over, percent, 2...
    
    
      17
      POP645213
      Foreign born persons, percent, 2009-2013
    
    
      18
      POP815213
      Language other than English spoken at home, pc...
    
    
      19
      EDU635213
      High school graduate or higher, percent of per...
    
    
      20
      EDU685213
      Bachelor's degree or higher, percent of person...
    
    
      21
      VET605213
      Veterans, 2009-2013
    
    
      22
      LFE305213
      Mean travel time to work (minutes), workers ag...
    
    
      23
      HSG010214
      Housing units, 2014
    
    
      24
      HSG445213
      Homeownership rate, 2009-2013
    
    
      25
      HSG096213
      Housing units in multi-unit structures, percen...
    
    
      26
      HSG495213
      Median value of owner-occupied housing units, ...
    
    
      27
      HSD410213
      Households, 2009-2013
    
    
      28
      HSD310213
      Persons per household, 2009-2013
    
    
      29
      INC910213
      Per capita money income in past 12 months (201...
    
    
      30
      INC110213
      Median household income, 2009-2013
    
    
      31
      PVY020213
      Persons below poverty level, percent, 2009-2013
    
    
      32
      BZA010213
      Private nonfarm establishments, 2013
    
    
      33
      BZA110213
      Private nonfarm employment,  2013
    
    
      34
      BZA115213
      Private nonfarm employment, percent change, 20...
    
    
      35
      NES010213
      Nonemployer establishments, 2013
    
    
      36
      SBO001207
      Total number of firms, 2007
    
    
      37
      SBO315207
      Black-owned firms, percent, 2007
    
    
      38
      SBO115207
      American Indian- and Alaska Native-owned firms...
    
    
      39
      SBO215207
      Asian-owned firms, percent, 2007
    
    
      40
      SBO515207
      Native Hawaiian- and Other Pacific Islander-ow...
    
    
      41
      SBO415207
      Hispanic-owned firms, percent, 2007
    
    
      42
      SBO015207
      Women-owned firms, percent, 2007
    
    
      43
      MAN450207
      Manufacturers shipments, 2007 ($1,000)
    
    
      44
      WTN220207
      Merchant wholesaler sales, 2007 ($1,000)
    
    
      45
      RTN130207
      Retail sales, 2007 ($1,000)
    
    
      46
      RTN131207
      Retail sales per capita, 2007
    
    
      47
      AFN120207
      Accommodation and food services sales, 2007 ($...
    
    
      48
      BPS030214
      Building permits, 2014
    
    
      49
      LND110210
      Land area in square miles, 2010
    
    
      50
      POP060210
      Population per square mile, 2010



In [199]:

    
#Subselect rows, let's just keep state level data
#df = county_facts[county_facts.state_abbreviation.isnull()][1:]
#Subselect at the county level
df = county_facts[county_facts.state_abbreviation.notnull()]
df.head()









    Out[199]:






  
    
      
      
      state_abbreviation
      PST045214
      PST040210
      PST120214
      POP010210
      AGE135214
      AGE295214
      AGE775214
      SEX255214
      RHI125214
      ...
      SBO415207
      SBO015207
      MAN450207
      WTN220207
      RTN130207
      RTN131207
      AFN120207
      BPS030214
      LND110210
      POP060210
    
    
      fips
      area_name
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      1001
      Autauga County
      AL
      55395
      54571
      1.5
      54571
      6.0
      25.2
      13.8
      51.4
      77.9
      ...
      0.7
      31.7
      0
      0
      598175
      12003
      88157
      131
      594.44
      91.8
    
    
      1003
      Baldwin County
      AL
      200111
      182265
      9.8
      182265
      5.6
      22.2
      18.7
      51.2
      87.1
      ...
      1.3
      27.3
      1410273
      0
      2966489
      17166
      436955
      1384
      1589.78
      114.6
    
    
      1005
      Barbour County
      AL
      26887
      27457
      -2.1
      27457
      5.7
      21.2
      16.5
      46.6
      50.2
      ...
      0.0
      27.0
      0
      0
      188337
      6334
      0
      8
      884.88
      31.0
    
    
      1007
      Bibb County
      AL
      22506
      22919
      -1.8
      22915
      5.3
      21.0
      14.8
      45.9
      76.3
      ...
      0.0
      0.0
      0
      0
      124707
      5804
      10757
      19
      622.58
      36.8
    
    
      1009
      Blount County
      AL
      57719
      57322
      0.7
      57322
      6.1
      23.6
      17.0
      50.5
      96.0
      ...
      0.0
      23.2
      341544
      0
      319700
      5622
      20941
      3
      644.78
      88.9
    
  

5 rows × 52 columns



In [200]:

    
#Select 2 columns (this is for visualization purposes)
#df = df.filter(['EDU685213', 'SEX255214'])
df = df.filter(['SBO315207', 'SBO015207'])



In [201]:

    
#Rename columns to use their meaning instead their codename
col_names = dict((k, v) for k,v in county_facts_columns.itertuples(index=False, name=None))
df.rename(columns=col_names, inplace=True)
df.head()









    Out[201]:






  
    
      
      
      Black-owned firms, percent, 2007
      Women-owned firms, percent, 2007
    
    
      fips
      area_name
      
      
    
  
  
    
      1001
      Autauga County
      15.2
      31.7
    
    
      1003
      Baldwin County
      2.7
      27.3
    
    
      1005
      Barbour County
      0.0
      27.0
    
    
      1007
      Bibb County
      14.9
      0.0
    
    
      1009
      Blount County
      0.0
      23.2



In [202]:

    
df.plot.scatter(0, 1, s=100)









    Out[202]:





<matplotlib.axes._subplots.AxesSubplot at 0x1166392d0>



In [204]:

    
#Run a clustering algorithm, group in 4 clusters
model = KMeans(n_clusters=4)
results = model.fit_predict(df.values)



In [205]:

    
#Assign cluster number to our dataframe
df['cluster'] = results
#Add a color for each cluster
df['color'] = df.cluster.map({0: 'black', 1:'red', 2:'green', 3:'yellow'})



In [206]:

    
df.head()









    Out[206]:






  
    
      
      
      Black-owned firms, percent, 2007
      Women-owned firms, percent, 2007
      cluster
      color
    
    
      fips
      area_name
      
      
      
      
    
  
  
    
      1001
      Autauga County
      15.2
      31.7
      0
      black
    
    
      1003
      Baldwin County
      2.7
      27.3
      1
      red
    
    
      1005
      Barbour County
      0.0
      27.0
      1
      red
    
    
      1007
      Bibb County
      14.9
      0.0
      2
      green
    
    
      1009
      Blount County
      0.0
      23.2
      3
      yellow



In [207]:

    
df.plot.scatter(x=0, y=1, s=100, c=df.color)









    Out[207]:





<matplotlib.axes._subplots.AxesSubplot at 0x11fe351d0>

	column_name	description
0	PST045214	Population, 2014 estimate
1	PST040210	Population, 2010 (April 1) estimates base
2	PST120214	Population, percent change - April 1, 2010 to ...
3	POP010210	Population, 2010
4	AGE135214	Persons under 5 years, percent, 2014
5	AGE295214	Persons under 18 years, percent, 2014
6	AGE775214	Persons 65 years and over, percent, 2014
7	SEX255214	Female persons, percent, 2014
8	RHI125214	White alone, percent, 2014
9	RHI225214	Black or African American alone, percent, 2014
10	RHI325214	American Indian and Alaska Native alone, perce...
11	RHI425214	Asian alone, percent, 2014
12	RHI525214	Native Hawaiian and Other Pacific Islander alo...
13	RHI625214	Two or More Races, percent, 2014
14	RHI725214	Hispanic or Latino, percent, 2014
15	RHI825214	White alone, not Hispanic or Latino, percent, ...
16	POP715213	Living in same house 1 year & over, percent, 2...
17	POP645213	Foreign born persons, percent, 2009-2013
18	POP815213	Language other than English spoken at home, pc...
19	EDU635213	High school graduate or higher, percent of per...
20	EDU685213	Bachelor's degree or higher, percent of person...
21	VET605213	Veterans, 2009-2013
22	LFE305213	Mean travel time to work (minutes), workers ag...
23	HSG010214	Housing units, 2014
24	HSG445213	Homeownership rate, 2009-2013
25	HSG096213	Housing units in multi-unit structures, percen...
26	HSG495213	Median value of owner-occupied housing units, ...
27	HSD410213	Households, 2009-2013
28	HSD310213	Persons per household, 2009-2013
29	INC910213	Per capita money income in past 12 months (201...
30	INC110213	Median household income, 2009-2013
31	PVY020213	Persons below poverty level, percent, 2009-2013
32	BZA010213	Private nonfarm establishments, 2013
33	BZA110213	Private nonfarm employment, 2013
34	BZA115213	Private nonfarm employment, percent change, 20...
35	NES010213	Nonemployer establishments, 2013
36	SBO001207	Total number of firms, 2007
37	SBO315207	Black-owned firms, percent, 2007
38	SBO115207	American Indian- and Alaska Native-owned firms...
39	SBO215207	Asian-owned firms, percent, 2007
40	SBO515207	Native Hawaiian- and Other Pacific Islander-ow...
41	SBO415207	Hispanic-owned firms, percent, 2007
42	SBO015207	Women-owned firms, percent, 2007
43	MAN450207	Manufacturers shipments, 2007 ($1,000)
44	WTN220207	Merchant wholesaler sales, 2007 ($1,000)
45	RTN130207	Retail sales, 2007 ($1,000)
46	RTN131207	Retail sales per capita, 2007
47	AFN120207	Accommodation and food services sales, 2007 ($...
48	BPS030214	Building permits, 2014
49	LND110210	Land area in square miles, 2010
50	POP060210	Population per square mile, 2010

		state_abbreviation	PST045214	PST040210	PST120214	POP010210	AGE135214	AGE295214	AGE775214	SEX255214	RHI125214	...	SBO415207	SBO015207	MAN450207	WTN220207	RTN130207	RTN131207	AFN120207	BPS030214	LND110210	POP060210
fips	area_name
1001	Autauga County	AL	55395	54571	1.5	54571	6.0	25.2	13.8	51.4	77.9	...	0.7	31.7	0	0	598175	12003	88157	131	594.44	91.8
1003	Baldwin County	AL	200111	182265	9.8	182265	5.6	22.2	18.7	51.2	87.1	...	1.3	27.3	1410273	0	2966489	17166	436955	1384	1589.78	114.6
1005	Barbour County	AL	26887	27457	-2.1	27457	5.7	21.2	16.5	46.6	50.2	...	0.0	27.0	0	0	188337	6334	0	8	884.88	31.0
1007	Bibb County	AL	22506	22919	-1.8	22915	5.3	21.0	14.8	45.9	76.3	...	0.0	0.0	0	0	124707	5804	10757	19	622.58	36.8
1009	Blount County	AL	57719	57322	0.7	57322	6.1	23.6	17.0	50.5	96.0	...	0.0	23.2	341544	0	319700	5622	20941	3	644.78	88.9