In [ ]:
# Algorithm
1. Merge all data files (census, business type, and Flickr) using zip code as the primary key (a merge sketch follows this list).
2. Run linear regression (at the end of the notebook).
3. Recast the problem as classification.
4. Try multiple classifiers.
5. Evaluate the classifiers and predict next year's growth for each business type.
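
A minimal sketch of step 1, assuming the three CSVs loaded later in the notebook (art_rate.csv, census_filtered.csv, biz_growth.csv); the actual merge cells below also rename columns and clean nan/inf first, so this only shows the shape of the pipeline.

In [ ]:
# sketch of step 1: inner-join the three sources on year and zip_code
import pandas as pd

art = pd.read_csv('art_rate.csv')         # Flickr-derived art growth
cen = pd.read_csv('census_filtered.csv')  # census features
biz = pd.read_csv('biz_growth.csv')       # business growth (target)
merged = (art.merge(cen, how='inner', on=['year', 'zip_code'])
             .merge(biz, how='inner', on=['year', 'zip_code']))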

In [280]:
import requests
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import csv
import seaborn as sns
font = {'family' : 'Arial',
        'weight' : 'bold',
        'size'   : 25}
matplotlib.rc('font', **font)
from census import Census
from us import states

Cleaning and merging data


In [2]:
#biz type
"""
    11	Agriculture, Forestry, Fishing and Hunting
    21	Mining, Quarrying, and Oil and Gas Extraction
    22	Utilities
    #23	Construction
    31-33	Manufacturing
    #42	Wholesale Trade
    #44-45	Retail Trade
    48-49	Transportation and Warehousing
    #51	Information
    #52	Finance and Insurance
    #53	Real Estate and Rental and Leasing
    #54	Professional, Scientific, and Technical Services
    #55	Management of Companies and Enterprises
    56	Administrative and Support and Waste Management and Remediation Services
    61	Educational Services
    62	Health Care and Social Assistance
    71	Arts, Entertainment, and Recreation
    #72	Accommodation and Food Services
    81	Other Services (except Public Administration)
    92	Public Administration
    
    potential parameters: 23, 42, 45, 51, 52, 53, 54, 55, 72
"""
business_codes = [11, 21, 22, 23, 30, 42, 45, 49, 51, 52, 53, 54, 55, 56, 61, 62, 71, 72, 81, 99]
business_names = ['Agriculture/Fishing','Mining/Gas Extraction','Utilities','Construction',
                  'Manufacturing','Wholesale','Retail','Transportation','Media/Publishing',
                  'Finance','Real Estate','Law/Science/Ads','Management','Administrative',
                  'Education','Health','Arts','Accommodation/Food','Others','Public']

In [3]:
# zip codes covered by the business file
zip_codes = np.load('new_zip_codes.npy')
# flickr data -> no nan, no inf
df_art = pd.read_csv('art_rate.csv')
# census
df_cen = pd.read_csv('census_filtered.csv')
# predictor
df_biz = pd.read_csv('biz_growth.csv')

In [4]:
# zip code with city names
df_city = pd.read_csv('city_zip_df.csv')
df_city.head()


Out[4]:
city state zipcodes
0 Baltimore MD 21201
1 Baltimore MD 21202
2 Baltimore MD 21205
3 Baltimore MD 21206
4 Baltimore MD 21207

In [5]:
df_cen.head()


Out[5]:
TotalPop M_Total M_18to19 M_20 M_21 M_22to24 M_25to29 M_30to34 F_Total F_18to19 ... F_Total_Edu F_BA F_MA F_Prof F_Doc Total_poor M_Arts F_Arts year zip_code
0 -0.018745 0.025664 0.076923 0.256410 -0.500000 0.083160 0.031599 0.219780 -0.051964 0.132812 ... -0.076227 -0.077049 -0.040000 -0.132743 -0.020979 -0.000169 0.291925 -0.109804 2011 21201
1 0.018566 -0.004596 0.038023 0.244361 -0.004454 -0.021987 -0.053431 0.008483 0.050930 -0.060976 ... 0.058900 0.040836 0.076677 0.130435 0.112676 0.037575 -0.157895 -0.104478 2011 21202
2 0.029790 0.033835 0.067524 -0.071942 0.151316 -0.088384 0.141667 0.204819 0.026208 0.066845 ... -0.008950 0.200000 0.265734 -0.340000 NaN 0.172172 inf -0.125000 2011 21205
3 -0.020165 -0.011558 0.159664 0.231481 0.033708 -0.012547 -0.165234 0.019583 -0.027440 -0.103717 ... 0.010800 -0.129573 -0.086957 0.383562 -0.136364 0.052441 -0.247525 -0.176000 2011 21206
4 -0.010570 -0.020922 0.301708 -0.290844 -0.015152 -0.084223 -0.050765 -0.042348 -0.002171 -0.040140 ... 0.019922 0.028470 -0.016706 0.012658 0.854167 0.035589 0.026549 0.169014 2011 21207

5 rows × 31 columns


In [6]:
# map each census zip code to its city name
temp = []
for zip_code in df_cen['zip_code'].values:
    temp.append(df_city.city[zip_code == df_city.zipcodes.values].values[0])

In [9]:
df_cen['city']=temp

In [10]:
# rename to prevent confusion
df_art = df_art.rename(columns={'growth':'art_growth'})
df_biz = df_biz.rename(columns={'growth':'biz_growth'})

In [11]:
df_biz.head()


Out[11]:
business_code biz_growth year zip_code
0 11 0.000000 2011 21201
1 21 0.000000 2011 21201
2 22 0.000000 2011 21201
3 23 0.052632 2011 21201
4 30 -0.300000 2011 21201

In [12]:
print df_art.columns
print df_cen.columns
print df_biz.columns


Index([u'art_growth', u'year', u'zip_code'], dtype='object')
Index([u'TotalPop', u'M_Total', u'M_18to19', u'M_20', u'M_21', u'M_22to24',
       u'M_25to29', u'M_30to34', u'F_Total', u'F_18to19', u'F_20', u'F_21',
       u'F_22to24', u'F_25to29', u'F_30to34', u'Total_Edu', u'M_Total_Edu',
       u'M_BA', u'M_MA', u'M_Prof', u'M_Doc', u'F_Total_Edu', u'F_BA', u'F_MA',
       u'F_Prof', u'F_Doc', u'Total_poor', u'M_Arts', u'F_Arts', u'year',
       u'zip_code', u'city'],
      dtype='object')
Index([u'business_code', u'biz_growth', u'year', u'zip_code'], dtype='object')

In [13]:
# putting things together: inner join
df_feat = pd.merge(df_art, df_cen, how='inner', on=['year', 'zip_code'])
df_feat.head()


Out[13]:
art_growth year zip_code TotalPop M_Total M_18to19 M_20 M_21 M_22to24 M_25to29 ... M_Doc F_Total_Edu F_BA F_MA F_Prof F_Doc Total_poor M_Arts F_Arts city
0 0.068309 2011 21201 -0.018745 0.025664 0.076923 0.256410 -0.500000 0.083160 0.031599 ... 0.304167 -0.076227 -0.077049 -0.040000 -0.132743 -0.020979 -0.000169 0.291925 -0.109804 Baltimore
1 -0.021482 2011 21202 0.018566 -0.004596 0.038023 0.244361 -0.004454 -0.021987 -0.053431 ... -0.156398 0.058900 0.040836 0.076677 0.130435 0.112676 0.037575 -0.157895 -0.104478 Baltimore
2 0.000000 2011 21205 0.029790 0.033835 0.067524 -0.071942 0.151316 -0.088384 0.141667 ... 1.000000 -0.008950 0.200000 0.265734 -0.340000 NaN 0.172172 inf -0.125000 Baltimore
3 0.000000 2011 21206 -0.020165 -0.011558 0.159664 0.231481 0.033708 -0.012547 -0.165234 ... -0.569767 0.010800 -0.129573 -0.086957 0.383562 -0.136364 0.052441 -0.247525 -0.176000 Baltimore
4 0.000000 2011 21207 -0.010570 -0.020922 0.301708 -0.290844 -0.015152 -0.084223 -0.050765 ... 0.354430 0.019922 0.028470 -0.016706 0.012658 0.854167 0.035589 0.026549 0.169014 Baltimore

5 rows × 33 columns


In [14]:
df_corr = df_feat

In [15]:
# replace nan -> 0 and inf -> 2 in every feature column
# (df_corr and df_feat refer to the same underlying data)
columns = df_feat.columns
for column in columns[:-1]:   # skip the non-numeric 'city' column
    for i in range(len(df_feat)):
        if np.isinf(df_feat[column][i]):
            df_corr[column].values[i] = 2
        if np.isnan(df_feat[column][i]):
            df_corr[column].values[i] = 0
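
The same cleanup can be written without the nested loop; a vectorized sketch (an alternative, not the cell used here), assuming every column except 'city' is numeric:

In [ ]:
# sketch: vectorized nan/inf replacement on the numeric columns
num_cols = df_feat.columns[:-1]                        # everything except 'city'
df_corr[num_cols] = (df_feat[num_cols]
                     .replace([np.inf, -np.inf], 2)    # inf -> 2, as in the loop above
                     .fillna(0))                       # nan -> 0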

In [16]:
df_corr.head()


Out[16]:
art_growth year zip_code TotalPop M_Total M_18to19 M_20 M_21 M_22to24 M_25to29 ... M_Doc F_Total_Edu F_BA F_MA F_Prof F_Doc Total_poor M_Arts F_Arts city
0 0.068309 2011 21201 -0.018745 0.025664 0.076923 0.256410 -0.500000 0.083160 0.031599 ... 0.304167 -0.076227 -0.077049 -0.040000 -0.132743 -0.020979 -0.000169 0.291925 -0.109804 Baltimore
1 -0.021482 2011 21202 0.018566 -0.004596 0.038023 0.244361 -0.004454 -0.021987 -0.053431 ... -0.156398 0.058900 0.040836 0.076677 0.130435 0.112676 0.037575 -0.157895 -0.104478 Baltimore
2 0.000000 2011 21205 0.029790 0.033835 0.067524 -0.071942 0.151316 -0.088384 0.141667 ... 1.000000 -0.008950 0.200000 0.265734 -0.340000 0.000000 0.172172 2.000000 -0.125000 Baltimore
3 0.000000 2011 21206 -0.020165 -0.011558 0.159664 0.231481 0.033708 -0.012547 -0.165234 ... -0.569767 0.010800 -0.129573 -0.086957 0.383562 -0.136364 0.052441 -0.247525 -0.176000 Baltimore
4 0.000000 2011 21207 -0.010570 -0.020922 0.301708 -0.290844 -0.015152 -0.084223 -0.050765 ... 0.354430 0.019922 0.028470 -0.016706 0.012658 0.854167 0.035589 0.026549 0.169014 Baltimore

5 rows × 33 columns


In [17]:
# check biz column value: if inf -> 2
for i in range(len(df_biz)):
    biz_growth_num = df_biz['biz_growth'][i]
    if np.isinf(biz_growth_num):
        df_biz['biz_growth'][i]=2

In [18]:
# merge with biz
df_all = pd.merge(df_corr, df_biz, how='inner', on=['year', 'zip_code'])

In [32]:
df_all.head()


Out[32]:
year zip_code TotalPop M_Total M_18to19 M_20 M_21 M_22to24 M_25to29 M_30to34 ... F_Prof F_Doc Total_poor M_Arts F_Arts art_growth business_code biz_growth city biz_index
0 2011 21201 -0.018745 0.025664 0.076923 0.25641 -0.5 0.08316 0.031599 0.21978 ... -0.132743 -0.020979 -0.000169 0.291925 -0.109804 0.068309 11 0.000000 Baltimore 0
1 2011 21201 -0.018745 0.025664 0.076923 0.25641 -0.5 0.08316 0.031599 0.21978 ... -0.132743 -0.020979 -0.000169 0.291925 -0.109804 0.068309 21 0.000000 Baltimore 0
2 2011 21201 -0.018745 0.025664 0.076923 0.25641 -0.5 0.08316 0.031599 0.21978 ... -0.132743 -0.020979 -0.000169 0.291925 -0.109804 0.068309 22 0.000000 Baltimore 0
3 2011 21201 -0.018745 0.025664 0.076923 0.25641 -0.5 0.08316 0.031599 0.21978 ... -0.132743 -0.020979 -0.000169 0.291925 -0.109804 0.068309 23 0.052632 Baltimore 2
4 2011 21201 -0.018745 0.025664 0.076923 0.25641 -0.5 0.08316 0.031599 0.21978 ... -0.132743 -0.020979 -0.000169 0.291925 -0.109804 0.068309 30 -0.300000 Baltimore 0

5 rows × 36 columns


In [20]:
# reorder columns: identifiers and census features first, then art_growth,
# business_code, biz_growth, city, so the 30 predictors (census + art_growth)
# can later be sliced with .ix[:,2:32]
temp = df_all.ix[:,1:32]
temp['art_growth']=df_all['art_growth']
temp['business_code']=df_all['business_code']
temp['biz_growth']=df_all['biz_growth']
temp['city']=df_all['city']
df_all = temp

In [21]:
df_all.head()


Out[21]:
year zip_code TotalPop M_Total M_18to19 M_20 M_21 M_22to24 M_25to29 M_30to34 ... F_MA F_Prof F_Doc Total_poor M_Arts F_Arts art_growth business_code biz_growth city
0 2011 21201 -0.018745 0.025664 0.076923 0.25641 -0.5 0.08316 0.031599 0.21978 ... -0.04 -0.132743 -0.020979 -0.000169 0.291925 -0.109804 0.068309 11 0.000000 Baltimore
1 2011 21201 -0.018745 0.025664 0.076923 0.25641 -0.5 0.08316 0.031599 0.21978 ... -0.04 -0.132743 -0.020979 -0.000169 0.291925 -0.109804 0.068309 21 0.000000 Baltimore
2 2011 21201 -0.018745 0.025664 0.076923 0.25641 -0.5 0.08316 0.031599 0.21978 ... -0.04 -0.132743 -0.020979 -0.000169 0.291925 -0.109804 0.068309 22 0.000000 Baltimore
3 2011 21201 -0.018745 0.025664 0.076923 0.25641 -0.5 0.08316 0.031599 0.21978 ... -0.04 -0.132743 -0.020979 -0.000169 0.291925 -0.109804 0.068309 23 0.052632 Baltimore
4 2011 21201 -0.018745 0.025664 0.076923 0.25641 -0.5 0.08316 0.031599 0.21978 ... -0.04 -0.132743 -0.020979 -0.000169 0.291925 -0.109804 0.068309 30 -0.300000 Baltimore

5 rows × 35 columns


In [56]:
# save
df_all.to_csv('data_city.csv',encoding='utf-8',index=False)

0. Assign classes


In [5]:
df_all = pd.read_csv('data_city.csv')

In [6]:
# assign indices
df_all['biz_growth'].describe()


Out[6]:
count    20640.000000
mean         0.021505
std          0.316719
min         -1.000000
25%         -0.026316
50%          0.000000
75%          0.060000
max         10.000000
Name: biz_growth, dtype: float64

In [7]:
plt.hist(df_all['biz_growth'],100)


Out[7]:
(100-bin histogram of biz_growth over [-1, 10]: 323 values in the lowest bin at -1,
most of the mass in the two bins spanning roughly -0.12 to 0.10 (3478 and 11243 counts),
and a sparse right tail out to 10.)

In [8]:
# assign 3 classes: biz_growth <= 0 -> 0, (0, num] -> 1, > num -> 2
def assign_biz_multiclass1(num):
    temp = []
    for biz_growth in df_all['biz_growth'].values:
        if biz_growth <= 0.0:
            biz_index = 0
        elif biz_growth > 0.0 and biz_growth <= num:
            biz_index = 1
        elif biz_growth > num:
            biz_index = 2
        temp.append(biz_index)
    plt.hist(temp)
    return temp

In [9]:
# assign 4 classes: biz_growth <= 0 -> 0, (0, num] -> 1, (num, 2*num] -> 2, > 2*num -> 3
def assign_biz_multiclass2(num):
    temp = []
    for biz_growth in df_all['biz_growth'].values:
        if biz_growth <= 0.0:
            biz_index = 0
        elif biz_growth > 0.0 and biz_growth <= num:
            biz_index = 1
        elif biz_growth > num and biz_growth <= num*2:
            biz_index = 2
        elif biz_growth > num*2:
            biz_index = 3
        temp.append(biz_index)
    plt.hist(temp)
    return temp

In [10]:
# assign 2 classes: biz_growth <= 0 -> 0, > 0 -> 1
def assign_biz_binary():
    temp = []
    for biz_growth in df_all['biz_growth'].values:
        if biz_growth <= 0.0:
            biz_index = 0
        elif biz_growth > 0.0:
            biz_index = 1
        temp.append(biz_index)
    plt.hist(temp)
    return temp
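
The three helpers above share the same thresholding idea; a minimal equivalent sketch using pd.cut (an alternative formulation, not one of the cells actually run below):

In [ ]:
# sketch: the same binning expressed with pd.cut (bins are right-inclusive by default)
def assign_biz_classes(growth, edges):
    # edges like [0.0] (binary), [0.0, 0.05] (3 classes), or [0.0, 0.05, 0.10] (4 classes)
    bins = [-np.inf] + list(edges) + [np.inf]
    return pd.cut(growth, bins=bins, labels=False)

# e.g. assign_biz_classes(df_all['biz_growth'], [0.0, 0.05]) matches assign_biz_multiclass1(0.05)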

In [11]:
temp = assign_biz_multiclass1(0.05)
df_all['biz_index']=pd.DataFrame(temp)


Do certain businesses go together?


In [12]:
gb = df_all.groupby(('year','business_code'))

In [13]:
# compute average biz_growth per business type in a year, across zip_codes
ref_years = [2011, 2012]
biz_avr_by_zip = []
for ref_year in ref_years:
    for biz_code in business_codes:
        a = gb.get_group((ref_year,biz_code))
        biz_avr_by_zip.append(np.average(a['biz_growth'].values))
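
The same per-year averages can come straight from the groupby; a one-cell sketch (an alternative to the explicit loop above, assuming business_codes are sorted ascending so they line up with business_names):

In [ ]:
# sketch: mean biz_growth per (year, business type), reshaped to one column per year
df_av_alt = gb['biz_growth'].mean().unstack('year')
df_av_alt.index = business_names   # relies on business_codes being in ascending order
df_av_alt.head()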

In [14]:
df_av = pd.DataFrame({'2011':biz_avr_by_zip[0:len(business_codes)],'2012':biz_avr_by_zip[len(business_codes):]})
df_av.index = business_names
df_av


Out[14]:
2011 2012
Agriculture/Fishing -0.012647 -0.018897
Mining/Gas Extraction -0.014696 -0.021318
Utilities -0.002071 -0.002971
Construction 0.023427 0.029157
Manufacturing -0.008928 0.001245
Wholesale 0.013011 0.027247
Retail 0.022027 0.003472
Transportation 0.053618 0.029967
Media/Publishing 0.038387 0.044592
Finance 0.032544 0.003561
Real Estate 0.033791 0.015557
Law/Science/Ads 0.041238 0.027794
Management 0.035442 0.002267
Administrative 0.032993 0.026459
Education 0.073749 0.071657
Health 0.027851 0.011775
Arts 0.071444 0.047217
Accommodation/Food 0.043032 0.024339
Others 0.026080 0.015499
Public -0.376829 0.368108

In [15]:
df_av['biz_name'] = business_names

In [16]:
sns.barplot(x='biz_name',y='2012',data=df_av)


Out[16]:
<matplotlib.axes._subplots.AxesSubplot at 0x10aa96810>

In [18]:
x = df_av['2011'][:-1]
y = df_av['2012'][:-1]
plt.plot(df_av['2011'][:-1],df_av['2012'][:-1],'o',hold='on')
plt.plot(np.linspace(-.02,.08,100),np.linspace(-.02,.08,100),':')
# plt.axis('equal')
plt.axis([-0.03, 0.10, -0.05, 0.10])

sns.set_context("poster")
# small random offsets so the annotation labels don't sit on top of each other
mu = 0.0025
sigma = 0.003
errors = sigma * np.random.randn(len(business_names)-1) + mu

for i in range(0,len(business_codes)-1):
    biz_name = business_names[i]
    plt.annotate(biz_name, (x[i]-errors[i],y[i]+errors[i]),fontsize=15)
plt.show()


1. Logistic Regression: multi-class


In [19]:
from sklearn import linear_model, cross_validation
from sklearn.metrics import accuracy_score

temp = assign_biz_multiclass1(0.10)
df_all['biz_index']=pd.DataFrame(temp)
gb = df_all.groupby(('year','business_code'))
scores = []
for biz_code,biz_name in zip(business_codes,business_names):

    df_2011 = gb.get_group((2011,biz_code))
    df_2012 = gb.get_group((2012,biz_code))
    test_size = 0.4
    # split the 2011 (feature) and 2012 (label) frames together so rows stay paired by zip code
    X_train_df, X_test_df, y_train_df, y_test_df = cross_validation.train_test_split(df_2011, df_2012, test_size=test_size, random_state=0)    
    
    X_train = X_train_df.ix[:,2:32]
    X_test = X_test_df.ix[:,2:32]
    y_train = y_train_df.ix[:,-1]
    y_test = y_test_df.ix[:,-1]
    
    logreg = linear_model.LogisticRegression(multi_class='ovr',penalty='l1')#solver='lbfgs')
    logreg.fit(X_train, y_train)
    y_pred = logreg.predict(X_test)
    
    score = accuracy_score(y_test, y_pred)
#     print biz_name + ':' + str(score)
#     rf_eval_feat(df_2011.columns[2:32],logreg,biz_name,score)
    scores.append(score)
print sum(np.array(scores)>0.5)/float(len(business_codes))*100


55.0

2. Test all classifiers

Classification criteria


In [20]:
# Selecting classes: choose based on +/-SD
df_all[df_all.year.values==2012].biz_growth.describe()


Out[20]:
count    10320.000000
mean         0.035336
std          0.337001
min         -1.000000
25%         -0.016667
50%          0.000000
75%          0.054945
max         10.000000
Name: biz_growth, dtype: float64

In [22]:
# plot feature importances (as a confirmation)
def rf_eval_feat(feature_names,clf,title,score):
    fig = plt.figure(figsize=(15,10))
    # sort importances
    indices = np.argsort(clf.feature_importances_)
    # plot as bar chart
    plt.barh(np.arange(len(feature_names)), clf.feature_importances_[indices])
    plt.yticks(np.arange(len(feature_names)) + 0.5, np.array(feature_names)[indices])
    _ = plt.xlabel('Relative importance')
    plt.title([title + ':' + str(score)],fontsize=20)
    plt.tick_params(axis='x', labelsize=20)
    plt.tick_params(axis='y', labelsize=20)
    return indices

In [23]:
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.lda import LDA
from sklearn.qda import QDA
names = ["Nearest Neighbors", "Linear SVM", "RBF SVM", "Decision Tree",
         "Random Forest", "AdaBoost", "Naive Bayes", "LDA", "QDA"]
classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=100),
    AdaBoostClassifier(),
    GaussianNB(),
    LDA(),
    QDA()]

2.1 Binomial


In [24]:
df_all.head()


Out[24]:
year zip_code TotalPop M_Total M_18to19 M_20 M_21 M_22to24 M_25to29 M_30to34 ... F_Prof F_Doc Total_poor M_Arts F_Arts art_growth business_code biz_growth city biz_index
0 2011 21201 -0.018745 0.025664 0.076923 0.25641 -0.5 0.08316 0.031599 0.21978 ... -0.132743 -0.020979 -0.000169 0.291925 -0.109804 0.068309 11 0.000000 Baltimore 0
1 2011 21201 -0.018745 0.025664 0.076923 0.25641 -0.5 0.08316 0.031599 0.21978 ... -0.132743 -0.020979 -0.000169 0.291925 -0.109804 0.068309 21 0.000000 Baltimore 0
2 2011 21201 -0.018745 0.025664 0.076923 0.25641 -0.5 0.08316 0.031599 0.21978 ... -0.132743 -0.020979 -0.000169 0.291925 -0.109804 0.068309 22 0.000000 Baltimore 0
3 2011 21201 -0.018745 0.025664 0.076923 0.25641 -0.5 0.08316 0.031599 0.21978 ... -0.132743 -0.020979 -0.000169 0.291925 -0.109804 0.068309 23 0.052632 Baltimore 1
4 2011 21201 -0.018745 0.025664 0.076923 0.25641 -0.5 0.08316 0.031599 0.21978 ... -0.132743 -0.020979 -0.000169 0.291925 -0.109804 0.068309 30 -0.300000 Baltimore 0

5 rows × 36 columns


In [25]:
temp = assign_biz_binary()
df_binom = df_all
df_binom['biz_index']=pd.DataFrame(temp)
gb = df_binom.groupby(('year','business_code'))
for name, clf in zip(names, classifiers):
    scores = []
    for biz_code,biz_name in zip(business_codes,business_names):
        df_2011 = gb.get_group((2011,biz_code))
        df_2012 = gb.get_group((2012,biz_code))
        test_size = 0.4
        
        # features from 2011, class labels (biz_index) from 2012: predict next year's growth class
        X = df_2011.ix[:,2:32]
        y = df_2012.ix[:,-1]
        X = StandardScaler().fit_transform(X)
        X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=test_size, random_state=0)    

        clf.fit(X_train, y_train)
        score = clf.score(X_test, y_test)
        scores.append(score)
        print name+ ':' + biz_name + ':' +str(score)
    print sum(np.array(scores)>0.5)/float(len(business_codes))*100


Nearest Neighbors:Agriculture/Fishing:0.971014492754
Nearest Neighbors:Mining/Gas Extraction:0.990338164251
Nearest Neighbors:Utilities:0.951690821256
Nearest Neighbors:Construction:0.521739130435
Nearest Neighbors:Manufacturing:0.632850241546
Nearest Neighbors:Wholesale:0.507246376812
Nearest Neighbors:Retail:0.531400966184
Nearest Neighbors:Transportation:0.565217391304
Nearest Neighbors:Media/Publishing:0.574879227053
Nearest Neighbors:Finance:0.473429951691
Nearest Neighbors:Real Estate:0.51690821256
Nearest Neighbors:Law/Science/Ads:0.613526570048
Nearest Neighbors:Management:0.792270531401
Nearest Neighbors:Administrative:0.545893719807
Nearest Neighbors:Education:0.468599033816
Nearest Neighbors:Health:0.502415458937
Nearest Neighbors:Arts:0.579710144928
Nearest Neighbors:Accommodation/Food:0.565217391304
Nearest Neighbors:Others:0.589371980676
Nearest Neighbors:Public:0.623188405797
90.0
Linear SVM:Agriculture/Fishing:0.971014492754
Linear SVM:Mining/Gas Extraction:0.990338164251
Linear SVM:Utilities:0.95652173913
Linear SVM:Construction:0.56038647343
Linear SVM:Manufacturing:0.714975845411
Linear SVM:Wholesale:0.536231884058
Linear SVM:Retail:0.531400966184
Linear SVM:Transportation:0.642512077295
Linear SVM:Media/Publishing:0.541062801932
Linear SVM:Finance:0.618357487923
Linear SVM:Real Estate:0.434782608696
Linear SVM:Law/Science/Ads:0.628019323671
Linear SVM:Management:0.806763285024
Linear SVM:Administrative:0.487922705314
Linear SVM:Education:0.463768115942
Linear SVM:Health:0.473429951691
Linear SVM:Arts:0.507246376812
Linear SVM:Accommodation/Food:0.661835748792
Linear SVM:Others:0.574879227053
Linear SVM:Public:0.714975845411
80.0
RBF SVM:Agriculture/Fishing:0.971014492754
RBF SVM:Mining/Gas Extraction:0.990338164251
RBF SVM:Utilities:0.95652173913
RBF SVM:Construction:0.545893719807
RBF SVM:Manufacturing:0.714975845411
RBF SVM:Wholesale:0.531400966184
RBF SVM:Retail:0.463768115942
RBF SVM:Transportation:0.647342995169
RBF SVM:Media/Publishing:0.550724637681
RBF SVM:Finance:0.628019323671
RBF SVM:Real Estate:0.463768115942
RBF SVM:Law/Science/Ads:0.628019323671
RBF SVM:Management:0.806763285024
RBF SVM:Administrative:0.487922705314
RBF SVM:Education:0.449275362319
RBF SVM:Health:0.478260869565
RBF SVM:Arts:0.51690821256
RBF SVM:Accommodation/Food:0.671497584541
RBF SVM:Others:0.555555555556
RBF SVM:Public:0.714975845411
75.0
Decision Tree:Agriculture/Fishing:0.971014492754
Decision Tree:Mining/Gas Extraction:0.966183574879
Decision Tree:Utilities:0.942028985507
Decision Tree:Construction:0.584541062802
Decision Tree:Manufacturing:0.68115942029
Decision Tree:Wholesale:0.51690821256
Decision Tree:Retail:0.550724637681
Decision Tree:Transportation:0.536231884058
Decision Tree:Media/Publishing:0.507246376812
Decision Tree:Finance:0.570048309179
Decision Tree:Real Estate:0.478260869565
Decision Tree:Law/Science/Ads:0.574879227053
Decision Tree:Management:0.690821256039
Decision Tree:Administrative:0.458937198068
Decision Tree:Education:0.56038647343
Decision Tree:Health:0.468599033816
Decision Tree:Arts:0.51690821256
Decision Tree:Accommodation/Food:0.570048309179
Decision Tree:Others:0.594202898551
Decision Tree:Public:0.642512077295
85.0
Random Forest:Agriculture/Fishing:0.971014492754
Random Forest:Mining/Gas Extraction:0.990338164251
Random Forest:Utilities:0.95652173913
Random Forest:Construction:0.613526570048
Random Forest:Manufacturing:0.714975845411
Random Forest:Wholesale:0.541062801932
Random Forest:Retail:0.545893719807
Random Forest:Transportation:0.589371980676
Random Forest:Media/Publishing:0.56038647343
Random Forest:Finance:0.6038647343
Random Forest:Real Estate:0.526570048309
Random Forest:Law/Science/Ads:0.599033816425
Random Forest:Management:0.806763285024
Random Forest:Administrative:0.56038647343
Random Forest:Education:0.51690821256
Random Forest:Health:0.454106280193
Random Forest:Arts:0.497584541063
Random Forest:Accommodation/Food:0.657004830918
Random Forest:Others:0.661835748792
Random Forest:Public:0.719806763285
90.0
AdaBoost:Agriculture/Fishing:0.971014492754
AdaBoost:Mining/Gas Extraction:0.990338164251
AdaBoost:Utilities:0.942028985507
AdaBoost:Construction:0.594202898551
AdaBoost:Manufacturing:0.599033816425
AdaBoost:Wholesale:0.536231884058
AdaBoost:Retail:0.536231884058
AdaBoost:Transportation:0.536231884058
AdaBoost:Media/Publishing:0.589371980676
AdaBoost:Finance:0.555555555556
AdaBoost:Real Estate:0.521739130435
AdaBoost:Law/Science/Ads:0.579710144928
AdaBoost:Management:0.710144927536
AdaBoost:Administrative:0.48309178744
AdaBoost:Education:0.473429951691
AdaBoost:Health:0.468599033816
AdaBoost:Arts:0.51690821256
AdaBoost:Accommodation/Food:0.589371980676
AdaBoost:Others:0.574879227053
AdaBoost:Public:0.685990338164
85.0
Naive Bayes:Agriculture/Fishing:0.971014492754
Naive Bayes:Mining/Gas Extraction:0.700483091787
Naive Bayes:Utilities:0.3961352657
Naive Bayes:Construction:0.599033816425
Naive Bayes:Manufacturing:0.381642512077
Naive Bayes:Wholesale:0.463768115942
Naive Bayes:Retail:0.531400966184
Naive Bayes:Transportation:0.420289855072
Naive Bayes:Media/Publishing:0.531400966184
Naive Bayes:Finance:0.429951690821
Naive Bayes:Real Estate:0.579710144928
Naive Bayes:Law/Science/Ads:0.599033816425
Naive Bayes:Management:0.31884057971
Naive Bayes:Administrative:0.507246376812
Naive Bayes:Education:0.536231884058
Naive Bayes:Health:0.502415458937
Naive Bayes:Arts:0.536231884058
Naive Bayes:Accommodation/Food:0.647342995169
Naive Bayes:Others:0.579710144928
Naive Bayes:Public:0.473429951691
65.0
LDA:Agriculture/Fishing:0.961352657005
LDA:Mining/Gas Extraction:0.990338164251
LDA:Utilities:0.946859903382
LDA:Construction:0.502415458937
LDA:Manufacturing:0.685990338164
LDA:Wholesale:0.541062801932
LDA:Retail:0.531400966184
LDA:Transportation:0.594202898551
LDA:Media/Publishing:0.51690821256
LDA:Finance:0.570048309179
LDA:Real Estate:0.48309178744
LDA:Law/Science/Ads:0.594202898551
LDA:Management:0.768115942029
LDA:Administrative:0.48309178744
LDA:Education:0.541062801932
LDA:Health:0.473429951691
LDA:Arts:0.487922705314
LDA:Accommodation/Food:0.618357487923
LDA:Others:0.570048309179
LDA:Public:0.690821256039
80.0
QDA:Agriculture/Fishing:0.971014492754
QDA:Mining/Gas Extraction:0.990338164251
QDA:Utilities:0.95652173913
QDA:Construction:0.618357487923
QDA:Manufacturing:0.48309178744
QDA:Wholesale:0.48309178744
QDA:Retail:0.541062801932
QDA:Transportation:0.468599033816
QDA:Media/Publishing:0.565217391304
QDA:Finance:0.449275362319
QDA:Real Estate:0.545893719807
QDA:Law/Science/Ads:0.599033816425
QDA:Management:0.579710144928
QDA:Administrative:0.526570048309
QDA:Education:0.584541062802
QDA:Health:0.521739130435
QDA:Arts:0.536231884058
QDA:Accommodation/Food:0.676328502415
QDA:Others:0.584541062802
QDA:Public:0.632850241546
80.0
/Users/Hongsup/anaconda/lib/python2.7/site-packages/sklearn/qda.py:133: UserWarning: Variables are collinear
  warnings.warn("Variables are collinear")

2.2 Multinomial


In [26]:
df_all.head()


Out[26]:
year zip_code TotalPop M_Total M_18to19 M_20 M_21 M_22to24 M_25to29 M_30to34 ... F_Prof F_Doc Total_poor M_Arts F_Arts art_growth business_code biz_growth city biz_index
0 2011 21201 -0.018745 0.025664 0.076923 0.25641 -0.5 0.08316 0.031599 0.21978 ... -0.132743 -0.020979 -0.000169 0.291925 -0.109804 0.068309 11 0.000000 Baltimore 0
1 2011 21201 -0.018745 0.025664 0.076923 0.25641 -0.5 0.08316 0.031599 0.21978 ... -0.132743 -0.020979 -0.000169 0.291925 -0.109804 0.068309 21 0.000000 Baltimore 0
2 2011 21201 -0.018745 0.025664 0.076923 0.25641 -0.5 0.08316 0.031599 0.21978 ... -0.132743 -0.020979 -0.000169 0.291925 -0.109804 0.068309 22 0.000000 Baltimore 0
3 2011 21201 -0.018745 0.025664 0.076923 0.25641 -0.5 0.08316 0.031599 0.21978 ... -0.132743 -0.020979 -0.000169 0.291925 -0.109804 0.068309 23 0.052632 Baltimore 1
4 2011 21201 -0.018745 0.025664 0.076923 0.25641 -0.5 0.08316 0.031599 0.21978 ... -0.132743 -0.020979 -0.000169 0.291925 -0.109804 0.068309 30 -0.300000 Baltimore 0

5 rows × 36 columns


In [27]:
# 3 classes (threshold 0.30)
temp = assign_biz_multiclass1(0.30)
df_multinom = df_all
df_multinom['biz_index']=pd.DataFrame(temp)
gb = df_multinom.groupby(('year','business_code'))
for name, clf in zip(names, classifiers):
    scores = []
    for biz_code,biz_name in zip(business_codes,business_names):
        df_2011 = gb.get_group((2011,biz_code))
        df_2012 = gb.get_group((2012,biz_code))
        test_size = 0.4
        
        X = df_2011.ix[:,2:32]
        y = df_2012.ix[:,-1]
        X = StandardScaler().fit_transform(X)
        X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=test_size, random_state=0)    

        clf.fit(X_train, y_train)
        score = clf.score(X_test, y_test)
        scores.append(score)
        print name+ ':' + biz_name + ':' +str(score)
    print sum(np.array(scores)>0.33)/float(len(business_codes))*100


Nearest Neighbors:Agriculture/Fishing:0.971014492754
Nearest Neighbors:Mining/Gas Extraction:0.990338164251
Nearest Neighbors:Utilities:0.951690821256
Nearest Neighbors:Construction:0.51690821256
Nearest Neighbors:Manufacturing:0.676328502415
Nearest Neighbors:Wholesale:0.492753623188
Nearest Neighbors:Retail:0.531400966184
Nearest Neighbors:Transportation:0.531400966184
Nearest Neighbors:Media/Publishing:0.521739130435
Nearest Neighbors:Finance:0.487922705314
Nearest Neighbors:Real Estate:0.497584541063
Nearest Neighbors:Law/Science/Ads:0.599033816425
Nearest Neighbors:Management:0.806763285024
Nearest Neighbors:Administrative:0.51690821256
Nearest Neighbors:Education:0.381642512077
Nearest Neighbors:Health:0.502415458937
Nearest Neighbors:Arts:0.51690821256
Nearest Neighbors:Accommodation/Food:0.565217391304
Nearest Neighbors:Others:0.584541062802
Nearest Neighbors:Public:0.618357487923
100.0
Linear SVM:Agriculture/Fishing:0.971014492754
Linear SVM:Mining/Gas Extraction:0.990338164251
Linear SVM:Utilities:0.95652173913
Linear SVM:Construction:0.565217391304
Linear SVM:Manufacturing:0.714975845411
Linear SVM:Wholesale:0.531400966184
Linear SVM:Retail:0.51690821256
Linear SVM:Transportation:0.642512077295
Linear SVM:Media/Publishing:0.526570048309
Linear SVM:Finance:0.623188405797
Linear SVM:Real Estate:0.473429951691
Linear SVM:Law/Science/Ads:0.584541062802
Linear SVM:Management:0.806763285024
Linear SVM:Administrative:0.487922705314
Linear SVM:Education:0.449275362319
Linear SVM:Health:0.463768115942
Linear SVM:Arts:0.512077294686
Linear SVM:Accommodation/Food:0.661835748792
Linear SVM:Others:0.579710144928
Linear SVM:Public:0.714975845411
100.0
RBF SVM:Agriculture/Fishing:0.971014492754
RBF SVM:Mining/Gas Extraction:0.990338164251
RBF SVM:Utilities:0.95652173913
RBF SVM:Construction:0.531400966184
RBF SVM:Manufacturing:0.714975845411
RBF SVM:Wholesale:0.531400966184
RBF SVM:Retail:0.473429951691
RBF SVM:Transportation:0.63768115942
RBF SVM:Media/Publishing:0.521739130435
RBF SVM:Finance:0.628019323671
RBF SVM:Real Estate:0.463768115942
RBF SVM:Law/Science/Ads:0.608695652174
RBF SVM:Management:0.806763285024
RBF SVM:Administrative:0.487922705314
RBF SVM:Education:0.454106280193
RBF SVM:Health:0.468599033816
RBF SVM:Arts:0.521739130435
RBF SVM:Accommodation/Food:0.671497584541
RBF SVM:Others:0.550724637681
RBF SVM:Public:0.714975845411
100.0
Decision Tree:Agriculture/Fishing:0.966183574879
Decision Tree:Mining/Gas Extraction:0.961352657005
Decision Tree:Utilities:0.942028985507
Decision Tree:Construction:0.497584541063
Decision Tree:Manufacturing:0.579710144928
Decision Tree:Wholesale:0.492753623188
Decision Tree:Retail:0.487922705314
Decision Tree:Transportation:0.512077294686
Decision Tree:Media/Publishing:0.478260869565
Decision Tree:Finance:0.536231884058
Decision Tree:Real Estate:0.478260869565
Decision Tree:Law/Science/Ads:0.555555555556
Decision Tree:Management:0.763285024155
Decision Tree:Administrative:0.458937198068
Decision Tree:Education:0.405797101449
Decision Tree:Health:0.468599033816
Decision Tree:Arts:0.43961352657
Decision Tree:Accommodation/Food:0.676328502415
Decision Tree:Others:0.550724637681
Decision Tree:Public:0.618357487923
100.0
Random Forest:Agriculture/Fishing:0.971014492754
Random Forest:Mining/Gas Extraction:0.990338164251
Random Forest:Utilities:0.95652173913
Random Forest:Construction:0.6038647343
Random Forest:Manufacturing:0.714975845411
Random Forest:Wholesale:0.526570048309
Random Forest:Retail:0.512077294686
Random Forest:Transportation:0.613526570048
Random Forest:Media/Publishing:0.48309178744
Random Forest:Finance:0.618357487923
Random Forest:Real Estate:0.487922705314
Random Forest:Law/Science/Ads:0.594202898551
Random Forest:Management:0.806763285024
Random Forest:Administrative:0.512077294686
Random Forest:Education:0.478260869565
Random Forest:Health:0.487922705314
Random Forest:Arts:0.502415458937
Random Forest:Accommodation/Food:0.695652173913
Random Forest:Others:0.6038647343
Random Forest:Public:0.719806763285
100.0
AdaBoost:Agriculture/Fishing:0.971014492754
AdaBoost:Mining/Gas Extraction:0.990338164251
AdaBoost:Utilities:0.937198067633
AdaBoost:Construction:0.536231884058
AdaBoost:Manufacturing:0.6038647343
AdaBoost:Wholesale:0.458937198068
AdaBoost:Retail:0.512077294686
AdaBoost:Transportation:0.570048309179
AdaBoost:Media/Publishing:0.473429951691
AdaBoost:Finance:0.487922705314
AdaBoost:Real Estate:0.545893719807
AdaBoost:Law/Science/Ads:0.449275362319
AdaBoost:Management:0.734299516908
AdaBoost:Administrative:0.463768115942
AdaBoost:Education:0.429951690821
AdaBoost:Health:0.463768115942
AdaBoost:Arts:0.434782608696
AdaBoost:Accommodation/Food:0.43961352657
AdaBoost:Others:0.628019323671
AdaBoost:Public:0.685990338164
100.0
Naive Bayes:Agriculture/Fishing:0.971014492754
Naive Bayes:Mining/Gas Extraction:0.700483091787
Naive Bayes:Utilities:0.497584541063
Naive Bayes:Construction:0.521739130435
Naive Bayes:Manufacturing:0.328502415459
Naive Bayes:Wholesale:0.3961352657
Naive Bayes:Retail:0.512077294686
Naive Bayes:Transportation:0.236714975845
Naive Bayes:Media/Publishing:0.405797101449
Naive Bayes:Finance:0.429951690821
Naive Bayes:Real Estate:0.367149758454
Naive Bayes:Law/Science/Ads:0.468599033816
Naive Bayes:Management:0.24154589372
Naive Bayes:Administrative:0.463768115942
Naive Bayes:Education:0.415458937198
Naive Bayes:Health:0.492753623188
Naive Bayes:Arts:0.347826086957
Naive Bayes:Accommodation/Food:0.652173913043
Naive Bayes:Others:0.579710144928
Naive Bayes:Public:0.468599033816
85.0
LDA:Agriculture/Fishing:0.95652173913
LDA:Mining/Gas Extraction:0.990338164251
LDA:Utilities:0.917874396135
LDA:Construction:0.48309178744
LDA:Manufacturing:0.676328502415
LDA:Wholesale:0.497584541063
LDA:Retail:0.51690821256
LDA:Transportation:0.618357487923
LDA:Media/Publishing:0.497584541063
LDA:Finance:0.574879227053
LDA:Real Estate:0.463768115942
LDA:Law/Science/Ads:0.574879227053
LDA:Management:0.734299516908
LDA:Administrative:0.463768115942
LDA:Education:0.458937198068
LDA:Health:0.449275362319
LDA:Arts:0.434782608696
LDA:Accommodation/Food:0.6038647343
LDA:Others:0.56038647343
LDA:Public:0.690821256039
100.0
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-27-94ae2a635d3a> in <module>()
     16         X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=test_size, random_state=0)
     17 
---> 18         clf.fit(X_train, y_train)
     19         score = clf.score(X_test, y_test)
     20         scores.append(score)

/Users/Hongsup/anaconda/lib/python2.7/site-packages/sklearn/qda.pyc in fit(self, X, y, store_covariances, tol)
    125             if len(Xg) == 1:
    126                 raise ValueError('y has only 1 sample in class %s, covariance '
--> 127                                  'is ill defined.' % str(self.classes_[ind]))
    128             Xgc = Xg - meang
    129             # Xgc = U * S * V.T

ValueError: y has only 1 sample in class 1, covariance is ill defined.

3. Random forest

Compute guess rate


In [281]:
np.average(df_all.biz_index)


Out[281]:
0.50794573643410856
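
If biz_index is binary at this point, this mean is just the fraction of positives; more generally the chance baseline is the majority-class frequency. A sketch (an addition, not one of the original cells):

In [ ]:
# sketch: chance level = relative frequency of the most common class
class_freq = df_all['biz_index'].value_counts(normalize=True)
print class_freq
print 'majority-class guess rate:', class_freq.max()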

Binary


In [29]:
temp = assign_biz_binary()
df_binom = df_all
df_binom['biz_index']=pd.DataFrame(temp)
gb = df_binom.groupby(('year','business_code'))

clf = RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1)
scores = []
for biz_code,biz_name in zip(business_codes,business_names):
    df_2011 = gb.get_group((2011,biz_code))
    df_2012 = gb.get_group((2012,biz_code))
    test_size = 0.4

    X = df_2011.ix[:,2:32]
    y = df_2012.ix[:,-1]
    X = StandardScaler().fit_transform(X)
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=test_size, random_state=0)    

    clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test)
    scores.append(score)
    
#     print biz_name + ':' +str(score)
    # evaluate feature importance
    rf_eval_feat(df_2011.columns[2:32],clf,biz_name,score)
    
print sum(np.array(scores)>0.6)/float(len(business_codes))*100


50.0
/Users/Hongsup/anaconda/lib/python2.7/site-packages/matplotlib/pyplot.py:424: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).
  max_open_warning, RuntimeWarning)

In [30]:
# 3 classes (threshold 0.10)
temp = assign_biz_multiclass1(0.10)
df_multinom = df_all
df_multinom['biz_index']=pd.DataFrame(temp)
gb = df_multinom.groupby(('year','business_code'))
for name, clf in zip(names, classifiers):
    scores = []
    for biz_code,biz_name in zip(business_codes,business_names):
        df_2011 = gb.get_group((2011,biz_code))
        df_2012 = gb.get_group((2012,biz_code))
        test_size = 0.4
        
        X = df_2011.ix[:,2:32]
        y = df_2012.ix[:,-1]
        X = StandardScaler().fit_transform(X)
        X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=test_size, random_state=0)    

        clf.fit(X_train, y_train)
        score = clf.score(X_test, y_test)
        scores.append(score)
        print name+ ':' + biz_name + ':' +str(score)
    print sum(np.array(scores)>0.6)/float(len(business_codes))*100


Nearest Neighbors:Agriculture/Fishing:0.971014492754
Nearest Neighbors:Mining/Gas Extraction:0.990338164251
Nearest Neighbors:Utilities:0.951690821256
Nearest Neighbors:Construction:0.410628019324
Nearest Neighbors:Manufacturing:0.661835748792
Nearest Neighbors:Wholesale:0.429951690821
Nearest Neighbors:Retail:0.48309178744
Nearest Neighbors:Transportation:0.512077294686
Nearest Neighbors:Media/Publishing:0.473429951691
Nearest Neighbors:Finance:0.468599033816
Nearest Neighbors:Real Estate:0.425120772947
Nearest Neighbors:Law/Science/Ads:0.400966183575
Nearest Neighbors:Management:0.787439613527
Nearest Neighbors:Administrative:0.410628019324
Nearest Neighbors:Education:0.381642512077
Nearest Neighbors:Health:0.43961352657
Nearest Neighbors:Arts:0.429951690821
Nearest Neighbors:Accommodation/Food:0.415458937198
Nearest Neighbors:Others:0.550724637681
Nearest Neighbors:Public:0.623188405797
30.0
Linear SVM:Agriculture/Fishing:0.971014492754
Linear SVM:Mining/Gas Extraction:0.990338164251
Linear SVM:Utilities:0.95652173913
Linear SVM:Construction:0.449275362319
Linear SVM:Manufacturing:0.714975845411
Linear SVM:Wholesale:0.531400966184
Linear SVM:Retail:0.48309178744
Linear SVM:Transportation:0.642512077295
Linear SVM:Media/Publishing:0.512077294686
Linear SVM:Finance:0.628019323671
Linear SVM:Real Estate:0.454106280193
Linear SVM:Law/Science/Ads:0.429951690821
Linear SVM:Management:0.806763285024
Linear SVM:Administrative:0.478260869565
Linear SVM:Education:0.43961352657
Linear SVM:Health:0.454106280193
Linear SVM:Arts:0.51690821256
Linear SVM:Accommodation/Food:0.502415458937
Linear SVM:Others:0.512077294686
Linear SVM:Public:0.714975845411
40.0
RBF SVM:Agriculture/Fishing:0.971014492754
RBF SVM:Mining/Gas Extraction:0.990338164251
RBF SVM:Utilities:0.95652173913
RBF SVM:Construction:0.449275362319
RBF SVM:Manufacturing:0.714975845411
RBF SVM:Wholesale:0.536231884058
RBF SVM:Retail:0.48309178744
RBF SVM:Transportation:0.642512077295
RBF SVM:Media/Publishing:0.51690821256
RBF SVM:Finance:0.628019323671
RBF SVM:Real Estate:0.458937198068
RBF SVM:Law/Science/Ads:0.376811594203
RBF SVM:Management:0.806763285024
RBF SVM:Administrative:0.492753623188
RBF SVM:Education:0.454106280193
RBF SVM:Health:0.463768115942
RBF SVM:Arts:0.51690821256
RBF SVM:Accommodation/Food:0.502415458937
RBF SVM:Others:0.473429951691
RBF SVM:Public:0.714975845411
40.0
Decision Tree:Agriculture/Fishing:0.966183574879
Decision Tree:Mining/Gas Extraction:0.966183574879
Decision Tree:Utilities:0.946859903382
Decision Tree:Construction:0.371980676329
Decision Tree:Manufacturing:0.657004830918
Decision Tree:Wholesale:0.497584541063
Decision Tree:Retail:0.444444444444
Decision Tree:Transportation:0.541062801932
Decision Tree:Media/Publishing:0.497584541063
Decision Tree:Finance:0.555555555556
Decision Tree:Real Estate:0.400966183575
Decision Tree:Law/Science/Ads:0.415458937198
Decision Tree:Management:0.75845410628
Decision Tree:Administrative:0.371980676329
Decision Tree:Education:0.463768115942
Decision Tree:Health:0.400966183575
Decision Tree:Arts:0.458937198068
Decision Tree:Accommodation/Food:0.463768115942
Decision Tree:Others:0.512077294686
Decision Tree:Public:0.63768115942
30.0
Random Forest:Agriculture/Fishing:0.971014492754
Random Forest:Mining/Gas Extraction:0.990338164251
Random Forest:Utilities:0.95652173913
Random Forest:Construction:0.48309178744
Random Forest:Manufacturing:0.714975845411
Random Forest:Wholesale:0.526570048309
Random Forest:Retail:0.512077294686
Random Forest:Transportation:0.642512077295
Random Forest:Media/Publishing:0.507246376812
Random Forest:Finance:0.623188405797
Random Forest:Real Estate:0.458937198068
Random Forest:Law/Science/Ads:0.454106280193
Random Forest:Management:0.806763285024
Random Forest:Administrative:0.492753623188
Random Forest:Education:0.449275362319
Random Forest:Health:0.429951690821
Random Forest:Arts:0.507246376812
Random Forest:Accommodation/Food:0.531400966184
Random Forest:Others:0.531400966184
Random Forest:Public:0.719806763285
40.0
AdaBoost:Agriculture/Fishing:0.971014492754
AdaBoost:Mining/Gas Extraction:0.990338164251
AdaBoost:Utilities:0.942028985507
AdaBoost:Construction:0.449275362319
AdaBoost:Manufacturing:0.618357487923
AdaBoost:Wholesale:0.478260869565
AdaBoost:Retail:0.512077294686
AdaBoost:Transportation:0.512077294686
AdaBoost:Media/Publishing:0.429951690821
AdaBoost:Finance:0.531400966184
AdaBoost:Real Estate:0.454106280193
AdaBoost:Law/Science/Ads:0.444444444444
AdaBoost:Management:0.724637681159
AdaBoost:Administrative:0.429951690821
AdaBoost:Education:0.381642512077
AdaBoost:Health:0.410628019324
AdaBoost:Arts:0.420289855072
AdaBoost:Accommodation/Food:0.449275362319
AdaBoost:Others:0.454106280193
AdaBoost:Public:0.685990338164
30.0
Naive Bayes:Agriculture/Fishing:0.971014492754
Naive Bayes:Mining/Gas Extraction:0.700483091787
Naive Bayes:Utilities:0.3961352657
Naive Bayes:Construction:0.386473429952
Naive Bayes:Manufacturing:0.256038647343
Naive Bayes:Wholesale:0.309178743961
Naive Bayes:Retail:0.468599033816
Naive Bayes:Transportation:0.299516908213
Naive Bayes:Media/Publishing:0.280193236715
Naive Bayes:Finance:0.294685990338
Naive Bayes:Real Estate:0.3961352657
Naive Bayes:Law/Science/Ads:0.391304347826
Naive Bayes:Management:0.270531400966
Naive Bayes:Administrative:0.309178743961
Naive Bayes:Education:0.256038647343
Naive Bayes:Health:0.420289855072
Naive Bayes:Arts:0.333333333333
Naive Bayes:Accommodation/Food:0.51690821256
Naive Bayes:Others:0.497584541063
Naive Bayes:Public:0.473429951691
10.0
LDA:Agriculture/Fishing:0.961352657005
LDA:Mining/Gas Extraction:0.990338164251
LDA:Utilities:0.946859903382
LDA:Construction:0.415458937198
LDA:Manufacturing:0.676328502415
LDA:Wholesale:0.497584541063
LDA:Retail:0.512077294686
LDA:Transportation:0.608695652174
LDA:Media/Publishing:0.478260869565
LDA:Finance:0.618357487923
LDA:Real Estate:0.405797101449
LDA:Law/Science/Ads:0.386473429952
LDA:Management:0.75845410628
LDA:Administrative:0.449275362319
LDA:Education:0.454106280193
LDA:Health:0.415458937198
LDA:Arts:0.449275362319
LDA:Accommodation/Food:0.473429951691
LDA:Others:0.478260869565
LDA:Public:0.690821256039
40.0
QDA:Agriculture/Fishing:0.971014492754
QDA:Mining/Gas Extraction:0.990338164251
QDA:Utilities:0.95652173913
QDA:Construction:0.454106280193
QDA:Manufacturing:0.570048309179
QDA:Wholesale:0.386473429952
QDA:Retail:0.521739130435
QDA:Transportation:0.357487922705
QDA:Media/Publishing:0.405797101449
QDA:Finance:0.391304347826
QDA:Real Estate:0.444444444444
QDA:Law/Science/Ads:0.449275362319
QDA:Management:0.705314009662
QDA:Administrative:0.36231884058
QDA:Education:0.43961352657
QDA:Health:0.458937198068
QDA:Arts:0.43961352657
QDA:Accommodation/Food:0.48309178744
QDA:Others:0.550724637681
QDA:Public:0.632850241546
25.0

We settle on an accuracy threshold of 0.46 for deciding which business types are predicted acceptably.

Random forest: 4 classes and predictions


In [ ]:
feature_names = df_2011.columns[2:32]

In [159]:
temp = assign_biz_multiclass2(0.05)
df_multinom = df_all
df_multinom['biz_index']=pd.DataFrame(temp)
gb = df_multinom.groupby(('year','business_code'))

clf = RandomForestClassifier(max_depth=5, n_estimators=100)
scores = []
y_2013_list = []
art_importance = []
art_rank_list = []
for biz_code,biz_name in zip(business_codes,business_names):
    df_2011 = gb.get_group((2011,biz_code))
    df_2012 = gb.get_group((2012,biz_code))
    test_size = 0.4

    X = df_2011.ix[:,2:32]
    y = df_2012.ix[:,-1]
    X = StandardScaler().fit_transform(X)
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=test_size, random_state=0)    

    clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test)
    scores.append(score)

    # prediction: feed 2012 features into the model (trained on 2011 -> 2012) to get 2013 classes
    X_2012 = df_2012.ix[:,2:32]
    y_2013 = list(clf.predict(X_2012))
    y_2013_list.append(y_2013)
    
    # evaluate feature importance
    indices = rf_eval_feat(df_2011.columns[2:32],clf,biz_name,score)

    # check the contribution of art_growth
    art_importance.append(clf.feature_importances_[-1])
    art_rank = len(feature_names)-np.arange(len(feature_names))[feature_names[indices]=='art_growth'][0]
    art_rank_list.append(art_rank)

# merge lists of list
import itertools
y_2013_merged = list(itertools.chain(*y_2013_list))
    
print sum(np.array(scores)>0.46)/float(len(business_codes))*100


70.0

Validation: precision and recall


In [46]:
# To build a confusion matrix we need the number of data points in each class (the support),
# and then, per business type, cm = confusion_matrix(y_test, y_pred)
df_multinom.columns


Out[46]:
Index([u'year', u'zip_code', u'TotalPop', u'M_Total', u'M_18to19', u'M_20',
       u'M_21', u'M_22to24', u'M_25to29', u'M_30to34', u'F_Total', u'F_18to19',
       u'F_20', u'F_21', u'F_22to24', u'F_25to29', u'F_30to34', u'Total_Edu',
       u'M_Total_Edu', u'M_BA', u'M_MA', u'M_Prof', u'M_Doc', u'F_Total_Edu',
       u'F_BA', u'F_MA', u'F_Prof', u'F_Doc', u'Total_poor', u'M_Arts',
       u'F_Arts', u'art_growth', u'business_code', u'biz_growth', u'city',
       u'biz_index'],
      dtype='object')

In [203]:
from sklearn.metrics import confusion_matrix
from sklearn import metrics

temp = assign_biz_multiclass2(0.20)
df_multinom = df_all
df_multinom['biz_index']=pd.DataFrame(temp)
gb = df_multinom.groupby(('year','business_code'))

clf = RandomForestClassifier(max_depth=5, n_estimators=100)
vals = []
for biz_code,biz_name in zip(business_codes,business_names):
    df_2011 = gb.get_group((2011,biz_code))
    df_2012 = gb.get_group((2012,biz_code))
    test_size = 0.4

    X = df_2011.ix[:,2:32]
    y = df_2012.ix[:,-1]
    X = StandardScaler().fit_transform(X)
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=test_size, random_state=0)    

    clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test)
    y_true = y_test
    y_pred = clf.predict(X_test)   # clf is already fit above; no need to refit
#     vals.append(metrics.f1_score(y_true, y_pred, average=None))
    vals.append(metrics.precision_recall_fscore_support(y_test, y_pred, beta = 0.9, average=None))
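
The per-business confusion matrices mentioned in the comment above could be collected with the same loop; a minimal sketch (reusing gb, clf, and the 2011/2012 split pattern from the cell above):

In [ ]:
# sketch: one confusion matrix per business type, keyed by name
cms = {}
for biz_code, biz_name in zip(business_codes, business_names):
    df_2011 = gb.get_group((2011, biz_code))
    df_2012 = gb.get_group((2012, biz_code))
    X = StandardScaler().fit_transform(df_2011.ix[:, 2:32])
    y = df_2012.ix[:, -1]
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.4, random_state=0)
    y_pred = clf.fit(X_train, y_train).predict(X_test)
    cms[biz_name] = confusion_matrix(y_test, y_pred)
print cms['Finance']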



In [213]:
# vals[i] holds (precision, recall, fbeta_score, support) for business type i
# vals[i][0]: precision (precision matters most here; maximum precision = no false positives)
# vals[i][2]: F-score with beta = 0.9, which weights precision slightly more than recall
# vals[i][3]: class supports, used as weights below
vals_weighted = []
for i in range(len(vals)):
    vals_weighted.append(sum(vals[i][2]*vals[i][3]/sum(vals[i][3])))
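
For reference, sklearn can return the same support-weighted F-score directly; an equivalent call (a sketch using the y_true/y_pred left over from the last loop iteration above):

In [ ]:
# sketch: support-weighted F-score straight from sklearn, matching the manual weighting above
# (element [2] of the returned tuple is the F-beta score)
metrics.precision_recall_fscore_support(y_true, y_pred, beta=0.9, average='weighted')[2]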

In [284]:
# re-run the 4-class random-forest evaluation with the 0.05 threshold (no resampling is done here)
temp = assign_biz_multiclass2(0.05)
df_multinom = df_all
df_multinom['biz_index']=pd.DataFrame(temp)
gb = df_multinom.groupby(('year','business_code'))

clf = RandomForestClassifier(max_depth=5, n_estimators=100)
scores = []
y_2013_list = []
for biz_code,biz_name in zip(business_codes,business_names):
    df_2011 = gb.get_group((2011,biz_code))
    df_2012 = gb.get_group((2012,biz_code))
    test_size = 0.4

    X = df_2011.ix[:,2:32]
    y = df_2012.ix[:,-1]
    X = StandardScaler().fit_transform(X)
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=test_size, random_state=0)    

    clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test)
    scores.append(score)

    # prediction
    X_2012 = df_2012.ix[:,2:32]
    y_2013 = list(clf.predict(X_2012))
    y_2013_list.append(y_2013)
    
    # evaluate feature importance
    indices = rf_eval_feat(df_2011.columns[2:32],clf,biz_name,score)

    # check the contribution of art_growth
    art_importance.append(clf.feature_importances_[-1])
    art_rank = len(feature_names)-np.arange(len(feature_names))[feature_names[indices]=='art_growth'][0]
    art_rank_list.append(art_rank)

# merge lists of list
import itertools
y_2013_merged = list(itertools.chain(*y_2013_list))
    
print sum(np.array(scores)>0.46)/float(len(business_codes))*100




In [282]:
vals[0][3]


Out[282]:
array([201,   1,   3,   2])

In [214]:
vals_weighted


Out[214]:
[0.95526022175899972,
 0.98502875176359883,
 0.93308906546236936,
 0.5340412391292424,
 0.58375360006261756,
 0.46410695128099444,
 0.52205932920088793,
 0.50111256487675793,
 0.42979651612648601,
 0.4953686762616204,
 0.4577668443287275,
 0.50230439430726892,
 0.7124794406879108,
 0.42170717261363178,
 0.2779842491959032,
 0.45814997537024282,
 0.35458590873907853,
 0.60495018236922204,
 0.60296083824453095,
 0.59827844372230998]

In [215]:
fig = plt.figure(figsize=(15,10))
# sort businesses by their weighted F-score
indices = np.argsort(vals_weighted[:-1])
# plot as bar chart
plt.barh(np.arange(len(business_names[:-1])), np.array(vals_weighted)[indices])
plt.yticks(np.arange(len(business_names[:-1])) + 0.5, np.array(business_names)[indices])
_ = plt.xlabel('Weighted F-score')
# plt.title([title + ':' + str(score)],fontsize=20)
plt.tick_params(axis='x', labelsize=20)
plt.tick_params(axis='y', labelsize=20)


Art importance: absolute


In [279]:
fig = plt.figure(figsize=(15,10))
# sort importances
indices = np.argsort(art_importance[:-1])
# plot as bar chart
plt.barh(np.arange(len(business_names[:-1])), np.array(art_importance)[indices])
plt.yticks(np.arange(len(business_names[:-1])) + 0.5, np.array(business_names)[indices])
_ = plt.xlabel('Relative importance')
# plt.title([title + ':' + str(score)],fontsize=20)
plt.tick_params(axis='x', labelsize=20)
plt.tick_params(axis='y', labelsize=20)


Art importance: rank


In [523]:
art_rank_list


Out[523]:
[28, 22, 17, 9, 25, 25, 14, 21, 6, 28, 8, 11, 16, 11, 24, 12, 18, 28, 29, 28]

In [529]:
np.arange(len(business_names[:-10]))


Out[529]:
array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [536]:
np.array(art_rank_list[:-10])


Out[536]:
array([28, 22, 17,  9, 25, 25, 14, 21,  6, 28])

In [35]:
fig = plt.figure(figsize=(15,10))
# sort importances (descending)
indices = np.array(art_rank_list[:-1]).argsort()[::-1]
# indices = np.argsort(art_rank_list[:-1])
# plot as bar chart
plt.barh(np.arange(len(business_names[:-1])), np.array(art_rank_list[:-1])[indices])
plt.yticks(np.arange(len(business_names[:-1])) + 0.5, np.array(business_names[:-1])[indices])
_ = plt.xlabel('Importance Rank')
# plt.title([title + ':' + str(score)],fontsize=20)
plt.tick_params(axis='x', labelsize=20)
plt.tick_params(axis='y', labelsize=20)



In [164]:
fig = plt.figure(figsize=(15,10))
# sort the per-business accuracies and plot them as a bar chart
indices = np.argsort(scores[:-1])
plt.barh(np.arange(len(business_codes)-1), np.array(scores)[indices])
plt.yticks(np.arange(len(business_codes)-1) + 0.5, np.array(business_names)[indices])
plt.tick_params(axis='x', labelsize=20)
plt.tick_params(axis='y', labelsize=20)
sns.set_context("poster")
sns.set(rc={'axes.facecolor':'white', 'figure.facecolor':'white'})
# plt.axvline(0.44, color='red', linewidth=2)



In [45]:
plt.hist(y_2013_merged)


Out[45]:
(array([ 9216.,     0.,     0.,   840.,     0.,     0.,    13.,     0.,
            0.,   251.]),
 array([ 0. ,  0.3,  0.6,  0.9,  1.2,  1.5,  1.8,  2.1,  2.4,  2.7,  3. ]),
 <a list of 10 Patch objects>)

4.1 Random forest feature selection

  1. Start with a single business type.
  2. Run a random forest with all features.
  3. Compute accuracy.
  4. Compute feature importances.
  5. Pick the top 5 features and run it again.
  6. Compute accuracy.
  7. Compare.
  8. Loop over growing subsets of the top-ranked features and repeat steps 2-3 (a compact sketch of this loop follows the list).
  9. Plot the accuracy curve.
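
A compact sketch of steps 2-8 for one business type (assuming the X, y, clf, and cross_validation objects defined in the cells below; this helper is an addition, not one of the original cells):

In [ ]:
# sketch: accuracy as a function of the number of top-ranked features kept
def top_k_accuracy(X, y, clf, max_k, test_size=0.4):
    # rank features once, from a fit on the full feature matrix
    X_tr, X_te, y_tr, y_te = cross_validation.train_test_split(X, y, test_size=test_size, random_state=0)
    clf.fit(X_tr, y_tr)
    ranked = np.argsort(clf.feature_importances_)[::-1]   # most important first
    accs = []
    for k in range(1, max_k + 1):
        cols = ranked[:k]
        X_tr, X_te, y_tr, y_te = cross_validation.train_test_split(X[:, cols], y, test_size=test_size, random_state=0)
        clf.fit(X_tr, y_tr)
        accs.append(clf.score(X_te, y_te))
    return accs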

In [315]:
print business_codes
print business_names


[11, 21, 22, 23, 30, 42, 45, 49, 51, 52, 53, 54, 55, 56, 61, 62, 71, 72, 81, 99]
['Agriculture/Fishing', 'Mining/Gas Extraction', 'Utilities', 'Construction', 'Manufacturing', 'Wholesale', 'Retail', 'Transportation', 'Media/Publishing', 'Finance', 'Real Estate', 'Law/Science/Ads', 'Management', 'Administrative', 'Education', 'Health', 'Arts', 'Accommodation/Food', 'Others', 'Public']

In [316]:
np.array(business_codes)[np.array(business_names)=='Finance']


Out[316]:
array([52])

In [319]:
def rf_eval_feat(feature_names, clf, title, score):
    """Plot the fitted forest's feature importances and return the sorted indices (ascending)."""
    fig = plt.figure(figsize=(15,10))
    # sort importances (ascending)
    indices = np.argsort(clf.feature_importances_)
    # plot as bar chart
    plt.barh(np.arange(len(feature_names)), clf.feature_importances_[indices])
    plt.yticks(np.arange(len(feature_names)) + 0.5, np.array(feature_names)[indices])
    _ = plt.xlabel('Relative importance')
    plt.title(title + ': ' + str(score), fontsize=20)
    plt.tick_params(axis='x', labelsize=20)
    plt.tick_params(axis='y', labelsize=20)
    return indices

In [32]:
temp = assign_biz_multiclass2(0.05)
df_multinom = df_all
df_multinom['biz_index']=pd.DataFrame(temp)
gb = df_multinom.groupby(('year','business_code'))

# 0. Start with a single business type
biz_code = 52
biz_name = 'Finance'

# 1. Run random forest with all features.
clf = RandomForestClassifier(max_depth=5, n_estimators=100)
df_2011 = gb.get_group((2011,biz_code))
df_2012 = gb.get_group((2012,biz_code))
test_size = 0.4
X = df_2011.ix[:,2:32]
y = df_2012.ix[:,-1]
X = StandardScaler().fit_transform(X)
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=test_size, random_state=0)    
clf.fit(X_train, y_train)

# 2. Compute accuracy.
score = clf.score(X_test, y_test)

# 3. Compute feature importance.
feature_names = df_2011.columns[2:32]
indices = rf_eval_feat(feature_names,clf,biz_name,score)



In [508]:
np.arange(len(feature_names))[feature_names[indices]=='art_growth']


Out[508]:
array([12])

In [343]:
# these are the 5 least important features (bottom of the ranking)
feature_names[indices[:5]]


Out[343]:
Index([u'F_21', u'F_25to29', u'Total_poor', u'F_20', u'M_Total'], dtype='object')

In [342]:
# 4. Pick top 5, and run it again.
"""
    indices: ascending order
    ex. indices[:5]: the first 5 lowest
    so, top n features by index: indices[-n:]
"""
print indices
print indices[-5:]


[11 13 26 10  1 25 17 27 22 12 23 24 29  2  3 16 20 15  4  0  6  8 28 21  7
 14  5  9 18 19]
[14  5  9 18 19]

In [348]:
np.shape(X[:,indices[-5:]])


Out[348]:
(516, 5)

In [354]:
X[:,indices[-5:]]


Out[354]:
array([[-1.02620348,  0.04449427,  0.31283269, -0.88605526,  0.99599327],
       [-0.03132315, -0.20506342, -0.24513779, -0.11983613, -0.64037697],
       [ 0.41465327, -0.3626508 ,  0.12289347, -4.157093  , -0.2053299 ],
       ..., 
       [-0.19118855, -0.01087995, -0.36435163, -0.18858199, -0.2053299 ],
       [-0.19118855, -0.76601164,  0.21892195, -0.18858199,  5.4898318 ],
       [-0.19118855, -0.15287919, -0.06957186, -0.18858199, -1.24712777]])

In [367]:
# 5. Compute accuracy and compare (top 5)
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X[:,indices[-5:]], y, test_size=test_size, random_state=0)    
clf.fit(X_train, y_train)
print clf.score(X_test, y_test)


0.618357487923

In [382]:
# 6. Compare: keep only the top-i features (i = 1..20) and see how accuracy changes

indices = np.argsort(clf.feature_importances_)
scores = []
for i in range(1,21):
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(X[:,indices[-i:]], y, test_size=test_size, random_state=0)    
    clf.fit(X_train, y_train)
    scores.append(clf.score(X_test, y_test))
plt.plot(scores,':o')
plt.show()
# 7. Loop over the rest of the business types and repeat the steps above.
# 8. Plot the graph.



In [395]:
# 7. Loop over the rest of the business types and repeat the fit/score/feature-count sweep.

scores_all = []
df_feat_import = pd.DataFrame()
for biz_code,biz_name in zip(business_codes,business_names):
    df_2011 = gb.get_group((2011,biz_code))
    df_2012 = gb.get_group((2012,biz_code))
    test_size = 0.4

    X = df_2011.ix[:,2:32]
    y = df_2012.ix[:,-1]
    X = StandardScaler().fit_transform(X)
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=test_size, random_state=0)    

    clf.fit(X_train, y_train)
    score_all = clf.score(X_test, y_test)
    scores_all.append(score_all)

    scores = []
    for i in range(1,21):
        X_train, X_test, y_train, y_test = cross_validation.train_test_split(X[:,indices[-i:]], y, test_size=test_size, random_state=0)    
        clf.fit(X_train, y_train)
        scores.append(clf.score(X_test, y_test))
    df_feat_import[biz_name]=scores
    fig = plt.figure(figsize=(15,10))
    plt.plot(scores,':o')
    plt.title(biz_name)
    plt.ylim([np.average(scores)-.1, min(np.average(scores)+.2,1.0)])



In [40]:
# 8. The results depend on the train/test split and the forest's randomness. Repeat this 10 times
#    to estimate standard errors (the split seed is fixed below, so only the forest's randomness varies).
temp = assign_biz_multiclass2(0.05)
df_multinom = df_all
df_multinom['biz_index']=pd.DataFrame(temp)
gb = df_multinom.groupby(('year','business_code'))

n_runs = 10
df_feat_import_n = pd.DataFrame()
for n in range(n_runs):
    scores_all = []
    df_feat_import = pd.DataFrame()
    for biz_code,biz_name in zip(business_codes,business_names):
        df_2011 = gb.get_group((2011,biz_code))
        df_2012 = gb.get_group((2012,biz_code))
        test_size = 0.4

        X = df_2011.ix[:,2:32]
        y = df_2012.ix[:,-1]
        X = StandardScaler().fit_transform(X)
        X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=test_size, random_state=0)    

        clf.fit(X_train, y_train)
        score_all = clf.score(X_test, y_test)
        scores_all.append(score_all)

        scores = []
        for i in range(1,21):
            X_train, X_test, y_train, y_test = cross_validation.train_test_split(X[:,indices[-i:]], y, test_size=test_size, random_state=0)    
            clf.fit(X_train, y_train)
            scores.append(clf.score(X_test, y_test))
        df_feat_import['try']=range(1,21)
        df_feat_import['num_feat']=i
        df_feat_import[biz_name]=scores
    df_feat_import_n = pd.concat([df_feat_import_n, df_feat_import],ignore_index=True)
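
The summary of these repeated runs is not shown in this section; a minimal sketch of how the mean score and standard error per business type and per feature count could be computed from df_feat_import_n (grouping on the 'try' column, which records how many top features were used):

# Sketch (not part of the original analysis): aggregate the n_runs repetitions.
summary_mean = df_feat_import_n.groupby('try')[business_names].mean()
summary_sem = df_feat_import_n.groupby('try')[business_names].std() / np.sqrt(n_runs)
summary_mean.head()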


5. City level data visualization


In [99]:
gb_year = df_all.groupby('year',as_index=False)
# df_2011 = gb_year.get_group(2011)
# df_2012 = gb_year.get_group(2012)
# gb_11 = df_2011.groupby(('city','business_code'),as_index=False)
# gb_12 = df_2012.groupby(('city','business_code'),as_index=False)
cities = df_all.city.unique()

In [47]:
def plot2D_city_biz(gb_year,year,business_names,cities):
    
    # select a year
    df = gb_year.get_group(year)
    # groupby
    gb = df.groupby(('city','business_code'),as_index=False)
    # compute the average
    df_ = gb.agg(np.average)
    # simplify
    df_ = df_[['city','business_code','year','biz_growth']]
    city_biz = []
    for city in cities:
        temp = df_[df_['city']==city]
        city_biz.append(list(temp.biz_growth.values))
    city_biz_array = np.array(city_biz)
    # plot
    fig = plt.figure(figsize=(15,5))
    sns.set()
    sns.set_context("poster")
    ax = sns.heatmap(city_biz_array, xticklabels=business_names, yticklabels=cities, vmin=-.4, vmax=.4)

In [53]:
plot2D_city_biz(gb_year,2011,business_names,cities)



In [54]:
plot2D_city_biz(gb_year,2012,business_names,cities)



In [100]:
year = 2011
df = gb_year.get_group(year)
# groupby
gb = df.groupby('city',as_index=False)
# compute the average
df_ = gb.agg(np.average)
# df_ = gb.get_group('Chicago')

df_.ix[:,3:33].head()
# df_.head()


Out[100]:
TotalPop M_Total M_18to19 M_20 M_21 M_22to24 M_25to29 M_30to34 F_Total F_18to19 ... M_Doc F_Total_Edu F_BA F_MA F_Prof F_Doc Total_poor M_Arts F_Arts art_growth
0 0.001748 0.001324 0.020618 0.005246 0.028095 0.031644 0.025178 0.026844 0.003100 0.062382 ... 0.211309 0.005263 0.019255 0.016986 0.074994 0.104844 0.051404 0.274507 -0.013465 -0.197151
1 0.032049 0.030430 0.105785 0.066143 0.158681 -0.055205 0.031907 0.075017 0.037712 -0.118574 ... 0.368897 0.048242 0.063376 0.038888 0.190450 0.219049 0.024658 0.309405 0.086541 0.876392
2 0.019909 0.020937 0.054781 0.091120 0.081509 0.059768 0.048724 0.058677 0.020011 -0.020444 ... 0.066947 0.021335 0.044961 0.049016 0.108378 0.040022 0.079600 0.006944 0.121396 0.883500
3 -0.016089 -0.010526 0.068138 0.138538 0.528066 0.328112 0.015188 -0.007962 -0.020730 0.029677 ... 0.207683 -0.017705 0.007216 0.011462 0.071001 0.361591 0.033603 0.132747 0.187996 -0.120004
4 0.017494 0.016594 0.116341 0.069482 0.066660 0.090491 0.036934 0.072921 0.019226 0.059746 ... 0.110857 0.025804 0.060023 0.093117 0.189645 0.036699 0.060130 0.066459 0.062769 0.566287

5 rows × 30 columns


In [49]:
def plot2D_city_demo(gb_year,year,cities):
    
    df = gb_year.get_group(year)
    # groupby
    gb = df.groupby('city',as_index=False)
    # compute the average
    df_ = gb.agg(np.average)
    df_ = df_.ix[:,3:33]
    
    city_demo = []
    for i in range(len(cities)):
        temp = df_.ix[i,:]
        city_demo.append(list(temp.values))
    city_demo_array = np.array(city_demo)
    # plot
    fig = plt.figure(figsize=(19,5))
    sns.set()
    sns.set_context("poster")
    ax = sns.heatmap(city_demo_array, xticklabels=df_.columns, yticklabels=cities, vmin=-1,vmax=1)

In [57]:
plot2D_city_demo(gb_year,2011,cities)



In [58]:
plot2D_city_demo(gb_year,2012,cities)



In [101]:
df_all.biz_index.unique()


Out[101]:
array([0, 2, 1, 3])

In [102]:
df = df_all[['city', 'zip_code', 'business_code', 'biz_growth', 'biz_index','year']]

Plan for the interactive city maps:

  1. Dropdown menu of all cities
    • Business growth data is stored by city.
    • The selected city is queried.
    • Its growth rate data is retrieved.
      • SQL
        • This is the 2012(-2013) data.
        • Columns: city, zip code, business code, growth rate, growth index
        • SELECT ZIP_CODE, GROWTH_RATE FROM TABLE WHERE CITY = 'CHICAGO'; (see the sketch after this list)
      • Python
        • Folium: create the map
        • Folium: bind the data
        • Create two kinds of maps
          1. Map of average growth
          2. Map by business category
  2. The user picks a city, then a map appears.
  3. A heatmap of the overall growth index is shown first.
  4. Buttons or another dropdown menu let the user choose a business category.
    • The user clicks a button.
    • The map changes.
  5. The map updates based on the chosen business category.
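
The SQL step above is only pseudocode; as a minimal, hypothetical example, the query could be issued from Python with sqlite3 roughly as follows (the database file and table name are placeholders, not artifacts produced in this notebook):

import sqlite3

# Hypothetical schema: city, zip_code, business_code, growth_rate, growth_index.
# 'biz.db' and 'biz_growth_2012' are placeholder names for illustration only.
conn = sqlite3.connect('biz.db')
cur = conn.cursor()
cur.execute("SELECT zip_code, growth_rate FROM biz_growth_2012 WHERE city = ?", ('Chicago',))
rows = cur.fetchall()
conn.close()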

Bind data to folium


In [103]:
zip_codes = df.zip_code.unique()

In [341]:
# filter geojson file based on zip_codes
geo_file = 'zip_cities.json'
import json
with open(geo_file) as f:
    data = json.load(f)

In [349]:
df_feat = pd.DataFrame(data['features'])
temp = pd.DataFrame()
for i in range(len(data['features'])):
    if int(data['features'][i]['properties']['ZCTA5CE10']) in zip_codes:
        temp = pd.concat([temp,df_feat.iloc[[i]]],ignore_index=True)
dict_temp = temp.to_dict(orient='records')
data['features'] = dict_temp
with open('zip_cities_final.json', 'w') as f:
    json.dump(data, f)

In [351]:
import folium
m = folium.Map(location=[37.769959, -122.448679], zoom_start=9)
m.geo_json(geo_path='zip_cities_final.json')
m.create_map('test_.html')

In [356]:
data['features'][0]['properties']


Out[356]:
{u'AFFGEOID10': u'8600000US94132',
 u'ALAND10': 8054467,
 u'AWATER10': 1299131,
 u'GEOID10': u'94132',
 u'ZCTA5CE10': u'94132'}

In [104]:
# left-pad 4-digit zip codes with a zero so all codes are 5-character strings
temp = df.zip_code.apply(str)
zips = []
for zip_code in temp:
    if len(zip_code)==4:
        zip_code = '0' + zip_code
    zips.append(zip_code)
# attach it to the original table
# (df is a column slice of df_all, hence the SettingWithCopyWarning below)
df['zips'] = zips


/Users/Hongsup/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:10: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
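
The same padding can be done in one vectorized step, and working on an explicit copy (or assigning through .loc) avoids the SettingWithCopyWarning above. A small alternative sketch, not used in the cells below:

# Vectorized alternative: copy the slice explicitly, then left-pad to 5 characters.
df = df.copy()
df['zips'] = df['zip_code'].astype(str).str.zfill(5)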

In [105]:
df.head()


Out[105]:
city zip_code business_code biz_growth biz_index year zips
0 Baltimore 21201 11 0.000000 0 2011 21201
1 Baltimore 21201 21 0.000000 0 2011 21201
2 Baltimore 21201 22 0.000000 0 2011 21201
3 Baltimore 21201 23 0.052632 2 2011 21201
4 Baltimore 21201 30 -0.300000 0 2011 21201

In [106]:
print business_codes


[11, 21, 22, 23, 30, 42, 45, 49, 51, 52, 53, 54, 55, 56, 61, 62, 71, 72, 81, 99]

In [107]:
plt.hist(y_2013_merged)


Out[107]:
(array([ 9282.,     0.,     0.,   776.,     0.,     0.,    32.,     0.,
            0.,   230.]),
 array([ 0. ,  0.3,  0.6,  0.9,  1.2,  1.5,  1.8,  2.1,  2.4,  2.7,  3. ]),
 <a list of 10 Patch objects>)

In [118]:
# test 2012 data
import folium
geo_path = 'zip_cities_final.json'

df_2012 = df[df.year==2012]
# df_2012['predict']=y_2013_merged
gb = df_2012.groupby('business_code')

# for biz_code in business_codes:
biz_code = 11
data = gb.get_group(biz_code)

# Note: looping over business_codes here does not work because each geo_json call
# overwrites the previous one, so we build one map per business code (see create_map below).
m = folium.Map(location=[37.769959, -122.448679], zoom_start=9)
m.geo_json(geo_path=geo_path, 
             data=data,
             columns=['zips', 'biz_index'],
             threshold_scale=range(0,4),
             key_on='feature.properties.ZCTA5CE10',
             fill_color='BuPu', fill_opacity=0.7, line_opacity=0.2,
             legend_name='Business Growth Index')
m.create_map('test_bind_' + str(biz_code) + '.html')

In [132]:
# Bind data to Folium: this is for prediction data
import folium

df_2012 = df[df.year==2012]
df_2012['predict']=y_2013_merged
gb = df_2012.groupby(('year','business_code'))

# biz_code=11
# for biz_code in business_codes:

def create_map(biz_code,gb):
    geo_path = 'zip_cities_final.json'
    
    data = gb.get_group((2012,biz_code))
    m = folium.Map(location=[37.769959, -122.448679], zoom_start=9)
    m.geo_json(geo_path=geo_path, 
                data_out = 'data' + str(biz_code) + '.json',
                data=data,
                columns=['zips', 'predict'],
                threshold_scale=range(0,4),
                key_on='feature.properties.ZCTA5CE10',
                fill_color='BuPu', fill_opacity=0.7, line_opacity=0.2,
                legend_name='Business Growth Index')
    m.create_map('biz_map_' + str(biz_code) + '.html')


/Users/Hongsup/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

In [133]:
# build one map per business code (the last category, 'Public', is skipped)
for i in range(len(business_codes)-1):
    create_map(business_codes[i],gb)

City location coordinates


In [149]:
cities


Out[149]:
array(['Baltimore', 'Boston', 'Chicago', 'Detroit', 'Los Angeles',
       'New York', 'Oakland', 'San Francisco', 'Seattle', 'Washington, DC'], dtype=object)

In [152]:
import requests
# Google Maps API
GOOGLE_KEY = ''

# Convert zip codes to coordinates
city_names = ['Baltimore,+MD', 'Boston,+MA', 'Chicago,+IL', 'Detroit,+MI', 'Los+Angeles,+CA',
       'New+York,+NY', 'Oakland,+CA', 'San+Francisco,+CA', 'Seattle,+WA', 'Washington,+DC']
lats2 = []
lngs2 = []
for city_name in city_names:
    query_url = 'https://maps.googleapis.com/maps/api/geocode/json?address=%s&key=%s' % (city_name, GOOGLE_KEY)
    r = requests.get(query_url)    
    temp = r.json()
    if len(temp['results'])==0:
        lat = 'none'
        lng = 'none'    
    else:
        lat = temp['results'][0]['geometry']['location']['lat']
        lng = temp['results'][0]['geometry']['location']['lng']
    lats2.append(lat)
    lngs2.append(lng)

In [157]:
city_coordinates = zip(lats2,lngs2)
city_coordinates


Out[157]:
[(39.2903848, -76.6121893),
 (42.3600825, -71.0588801),
 (41.8781136, -87.6297982),
 (42.331427, -83.0457538),
 (34.0522342, -118.2436849),
 (40.7127837, -74.0059413),
 (37.8043637, -122.2711137),
 (37.7749295, -122.4194155),
 (47.6062095, -122.3320708),
 (38.9071923, -77.0368707)]
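
These coordinates are not used in the remaining cells; one natural use would be to drop a marker per city on an overview map. A minimal sketch, assuming a recent folium release (folium.Marker / Map.save, which differ from the older geo_json/create_map API used above):

# Sketch: one marker per city on a single overview map (modern folium API).
import folium

m = folium.Map(location=[39.8, -98.6], zoom_start=4)  # rough center of the US
for city, (lat, lng) in zip(cities, city_coordinates):
    folium.Marker(location=[lat, lng], popup=city).add_to(m)
m.save('city_markers.html')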

Linear Regression

(census + art) vs. biz growth as a vector


In [195]:
# X: 2011-2012 data, Y: 2012-2013 data
# train and test decided by zip code
gb = df_all.groupby('year')
df_2011 = gb.get_group(2011)
df_2012 = gb.get_group(2012)
from sklearn import cross_validation
X_train_df, X_test_df, y_train_df, y_test_df = cross_validation.train_test_split(df_2011, df_2012, test_size=0.2, random_state=0)

In [196]:
from sklearn import linear_model
# choose features
X_train = X_train_df.ix[:,2:32]
X_test = X_test_df.ix[:,2:32]
y_train = y_train_df.ix[:,-1]
y_test = y_test_df.ix[:,-1]
regr = linear_model.LinearRegression()
regr.fit(X_train, y_train)
regr.score(X_test, y_test)


Out[196]:
0.013475862019180718

(census+art) vs. average biz growth across all categories


In [197]:
gb = df_all.groupby(('year','zip_code'),as_index=False)
# gb = df2.groupby(('zip_code','year','business_code'),as_index=False)
gb_avr = gb.agg(np.average)
gb_avr.head()


Out[197]:
year zip_code TotalPop M_Total M_18to19 M_20 M_21 M_22to24 M_25to29 M_30to34 ... F_BA F_MA F_Prof F_Doc Total_poor M_Arts F_Arts art_growth business_code biz_growth
0 2011 2108 1.059138 1.064752 1.638889 1.500000 2.000000 0.694915 1.276265 1.013274 ... 1.090663 1.111446 1.194969 1.304348 0.967213 4.625000 0.768116 2.097561 50.5 0.912149
1 2011 2109 1.058140 1.160180 1.000000 0.923077 1.066667 0.700000 0.988304 1.449761 ... 0.924370 1.116667 0.968354 0.833333 1.039474 0.000000 1.243243 2.190722 50.5 1.178312
2 2011 2110 0.936948 0.896216 1.250000 0.000000 0.000000 1.370370 1.048780 0.857143 ... 1.058824 0.646552 2.189189 1.023810 0.819209 2.000000 0.000000 1.206266 50.5 0.908773
3 2011 2111 1.088778 1.103334 0.946429 0.936842 1.300000 0.869458 0.970721 1.317797 ... 1.090211 1.162011 1.150259 1.339286 0.936445 2.750000 0.714286 1.210425 50.5 1.032701
4 2011 2113 1.075059 1.111183 0.000000 0.936441 0.829787 1.051635 1.054224 1.159664 ... 1.250264 1.054825 0.981132 1.130435 1.008277 1.407407 0.810458 2.012000 50.5 0.683889

5 rows × 34 columns


In [198]:
gb = gb_avr.groupby('year')
df_2011 = gb.get_group(2011)
df_2012 = gb.get_group(2012)
X_train_df, X_test_df, y_train_df, y_test_df = cross_validation.train_test_split(df_2011, df_2012, test_size=0.4, random_state=0)

In [199]:
X_train = X_train_df.ix[:,2:32]
X_test = X_test_df.ix[:,2:32]
y_train = y_train_df.ix[:,-1]
y_test = y_test_df.ix[:,-1]
regr = linear_model.LinearRegression()
regr.fit(X_train, y_train)
regr.score(X_test, y_test)


Out[199]:
-2.8262020865297672

(census+art) vs. biz growth by category


In [226]:
# X
df_corr2 = gb_avr.ix[:,:-2]
# Y
gb_biz = df_biz.groupby('business_code')
# biz list
biz_codes = df_biz['business_code'].unique()

In [253]:
# they are not in the same order, so let's merge and separate.
for biz_code in biz_codes:
    temp = gb_biz.get_group(biz_code)
    df_temp = pd.merge(df_corr2, temp, how='inner', on=['year', 'zip_code'])

    # split it by year
    gb_temp = df_temp.groupby('year')
    df_2011 = gb_temp.get_group(2011)
    df_2012 = gb_temp.get_group(2012)
    X_train_df, X_test_df, y_train_df, y_test_df = cross_validation.train_test_split(df_2011, df_2012, test_size=0.4, random_state=0)    
    
    X_train = X_train_df.ix[:,2:32]
    X_test = X_test_df.ix[:,2:32]
    y_train = y_train_df.ix[:,-1]
    y_test = y_test_df.ix[:,-1]
    regr = linear_model.LinearRegression()
    regr.fit(X_train, y_train)
    print 'business code:' + str(biz_code) + ' ' + str(regr.score(X_test, y_test))


business code:11 -1.58516850264
business code:21 -1.70549632154
business code:22 -0.570283676119
business code:23 -8.50125108496
business code:30 -8.63493387383
business code:42 -1.23052868325
business code:45 -20.4915116197
business code:49 -2.63808292364
business code:51 -3.89447159679
business code:52 -1.7561748792
business code:53 -2.81563074461
business code:54 -13.7551346429
business code:55 -6.39457612636
business code:56 -4.58178498096
business code:61 -4.86618160857
business code:62 -14.0123370005
business code:71 -21.3766659716
business code:72 -4.68961323121
business code:81 -13.7954539423
business code:99 -0.935851453119

PCA(census+art) vs. biz growth as a vector


In [206]:
# We are using 2011-2012 data to predict the 2012-2013 data (business pattern)
gb_avr_year = gb_avr.groupby('year')
df_2011 = gb_avr_year.get_group(2011)
df_2012 = gb_avr_year.get_group(2012)
df_2011.head()


Out[206]:
year zip_code TotalPop M_Total M_18to19 M_20 M_21 M_22to24 M_25to29 M_30to34 ... F_BA F_MA F_Prof F_Doc Total_poor M_Arts F_Arts art_growth business_code biz_growth
0 2011 2108 1.059138 1.064752 1.638889 1.500000 2.000000 0.694915 1.276265 1.013274 ... 1.090663 1.111446 1.194969 1.304348 0.967213 4.625000 0.768116 2.097561 50.5 0.912149
1 2011 2109 1.058140 1.160180 1.000000 0.923077 1.066667 0.700000 0.988304 1.449761 ... 0.924370 1.116667 0.968354 0.833333 1.039474 0.000000 1.243243 2.190722 50.5 1.178312
2 2011 2110 0.936948 0.896216 1.250000 0.000000 0.000000 1.370370 1.048780 0.857143 ... 1.058824 0.646552 2.189189 1.023810 0.819209 2.000000 0.000000 1.206266 50.5 0.908773
3 2011 2111 1.088778 1.103334 0.946429 0.936842 1.300000 0.869458 0.970721 1.317797 ... 1.090211 1.162011 1.150259 1.339286 0.936445 2.750000 0.714286 1.210425 50.5 1.032701
4 2011 2113 1.075059 1.111183 0.000000 0.936441 0.829787 1.051635 1.054224 1.159664 ... 1.250264 1.054825 0.981132 1.130435 1.008277 1.407407 0.810458 2.012000 50.5 0.683889

5 rows × 34 columns


In [259]:
# PCA
from sklearn.decomposition import PCA

def plot_exp_var_ratio(X,N):
    """Bar plot of the explained variance ratio of the first N principal components of X; returns the component scores."""
    pca = PCA(n_components=N)
    pca.fit(X)
    print 'var: ' + str(pca.explained_variance_ratio_)
    print 'score: ' + str(pca.score(X))
    Y = pca.transform(X)
    plt.bar(range(N),pca.explained_variance_ratio_)
    plt.xticks(np.array(range(N))+0.4,range(1,N+1))
    plt.ylabel('Explained Var Ratio')
    plt.xlabel('pc #')
    plt.show()
    return Y

In [264]:
X = df_2011.ix[:,2:32]
N = 5  # number of components; matches the five ratios printed below
X_PC = plot_exp_var_ratio(X,N)


var: [ 0.63564393  0.05988715  0.04627653  0.03844989  0.03167597]
score: -15.0665809915
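
The heading above promises regressing business growth on the principal components, which is not shown before the section ends. A minimal sketch of that last step, assuming X_PC (the 2011 component scores) and df_2012 are row-aligned by zip code, as in the earlier regressions:

# Sketch (not executed above): linear regression on the principal components.
y = df_2012.ix[:,-1]
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X_PC, y, test_size=0.4, random_state=0)
regr = linear_model.LinearRegression()
regr.fit(X_train, y_train)
print regr.score(X_test, y_test)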