Data munghing for business pattern data



In [232]:

    
import requests
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import csv
import seaborn as sns
font = {'family' : 'Arial',
        'weight' : 'bold',
        'size'   : 25}
matplotlib.rc('font', **font)
from census import Census
from us import states
import csv

1. Download the table from the website

zip code, business type, # of establishment by # of employees

2. Concatenate tables for corresponding zip codes



In [427]:

    
# load zip codes
import pickle
with open('zipcode_final.txt', 'rb') as f:
    zip_codes = pickle.load(f)



In [236]:

    
# concatenate tables with different years and zip codes
df = pd.DataFrame()
years = range(2011,2014)
for year in years:
    temp = pd.read_csv('zbp'+ str(year-2000) +'detail.txt',error_bad_lines=False)
    temp['year']=year
    temp = temp.loc[temp['zip'].isin(map(int,zip_codes))]
    df = pd.concat([df,temp],ignore_index=True)
df.head()









    



Skipping line 1420726: expected 12 fields, saw 19
Skipping line 1420729: expected 12 fields, saw 16
Skipping line 1420731: expected 12 fields, saw 33
Skipping line 1420733: expected 12 fields, saw 13
Skipping line 1420735: expected 12 fields, saw 30
Skipping line 1420736: expected 12 fields, saw 18
Skipping line 1420737: expected 12 fields, saw 24
Skipping line 1420741: expected 12 fields, saw 22
Skipping line 1420745: expected 12 fields, saw 17
Skipping line 1420752: expected 12 fields, saw 26
Skipping line 1420753: expected 12 fields, saw 13
Skipping line 1420756: expected 12 fields, saw 22
Skipping line 1420762: expected 12 fields, saw 15
Skipping line 1420764: expected 12 fields, saw 13
Skipping line 1420766: expected 12 fields, saw 32
Skipping line 1420767: expected 12 fields, saw 34
Skipping line 1420769: expected 12 fields, saw 23
Skipping line 1420770: expected 12 fields, saw 19
Skipping line 1420774: expected 12 fields, saw 17
Skipping line 1420775: expected 12 fields, saw 17
Skipping line 1420777: expected 12 fields, saw 16
Skipping line 1420778: expected 12 fields, saw 14
Skipping line 1420779: expected 12 fields, saw 14
Skipping line 1420785: expected 12 fields, saw 19
Skipping line 1420786: expected 12 fields, saw 17
Skipping line 1420788: expected 12 fields, saw 22
Skipping line 1420793: expected 12 fields, saw 37
Skipping line 1420795: expected 12 fields, saw 17
Skipping line 1420796: expected 12 fields, saw 32
Skipping line 1420797: expected 12 fields, saw 23

/Users/Hongsup/anaconda/lib/python2.7/site-packages/pandas/io/parsers.py:1170: DtypeWarning: Columns (0,2,3,4,5,6,8) have mixed types. Specify dtype option on import or set low_memory=False.
  data = self._reader.read(nrows)






    Out[236]:






  
    
      
      zip
      naics
      est
      n1_4
      n5_9
      n10_19
      n20_49
      n50_99
      n100_249
      n250_499
      n500_999
      n1000
      year
    
  
  
    
      0
      2108
      ------
      1125
      575
      190
      153
      116
      47
      36
      7
      0
      1
      2011
    
    
      1
      2108
      23----
      4
      3
      0
      0
      1
      0
      0
      0
      0
      0
      2011
    
    
      2
      2108
      236115
      1
      1
      0
      0
      0
      0
      0
      0
      0
      0
      2011
    
    
      3
      2108
      236116
      2
      1
      0
      0
      1
      0
      0
      0
      0
      0
      2011
    
    
      4
      2108
      238220
      1
      1
      0
      0
      0
      0
      0
      0
      0
      0
      2011

3. Simplify the business code



In [238]:

    
"""
    11	Agriculture, Forestry, Fishing and Hunting
    21	Mining, Quarrying, and Oil and Gas Extraction
    22	Utilities
    #23	Construction
    31-33	Manufacturing
    #42	Wholesale Trade
    #44-45	Retail Trade
    48-49	Transportation and Warehousing
    #51	Information
    #52	Finance and Insurance
    #53	Real Estate and Rental and Leasing
    #54	Professional, Scientific, and Technical Services
    #55	Management of Companies and Enterprises
    56	Administrative and Support and Waste Management and Remediation Services
    61	Educational Services
    62	Health Care and Social Assistance
    71	Arts, Entertainment, and Recreation
    #72	Accommodation and Food Services
    81	Other Services (except Public Administration)
    92	Public Administration
    
    potential parameters: 23, 42, 45, 51, 52, 53, 54, 55, 72
"""
def conver_business_type(old_code):
    """
        Convert N digit nacis codes to 2 digits
        http://www.census.gov/cgi-bin/sssd/naics/naicsrch?chart=2012
    """
    # get the first two digit
    new_code = old_code[:2]
    # simplify
#     31-33	Manufacturing
#     44-45	Retail Trade
#     48-49	Transportation and Warehousing
    if new_code != '--':
        new_int = int(new_code)
        if new_int >= 31 and new_int <= 33:
            new_code = '30'
        elif new_int >= 44 and new_int <= 45:
            new_code = '45'
        elif new_int >= 48 and new_int <= 49:
            new_code = '49'
    return new_code



In [239]:

    
# create new column for simplified business_code
df['business_code']=df['naics'].apply(lambda x: conver_business_type(x))



In [240]:

    
# rename for consistence
df=df.rename(columns = {'zip':'zip_code'})
df.head()









    Out[240]:






  
    
      
      zip_code
      naics
      est
      n1_4
      n5_9
      n10_19
      n20_49
      n50_99
      n100_249
      n250_499
      n500_999
      n1000
      year
      business_code
    
  
  
    
      0
      2108
      ------
      1125
      575
      190
      153
      116
      47
      36
      7
      0
      1
      2011
      --
    
    
      1
      2108
      23----
      4
      3
      0
      0
      1
      0
      0
      0
      0
      0
      2011
      23
    
    
      2
      2108
      236115
      1
      1
      0
      0
      0
      0
      0
      0
      0
      0
      2011
      23
    
    
      3
      2108
      236116
      2
      1
      0
      0
      1
      0
      0
      0
      0
      0
      2011
      23
    
    
      4
      2108
      238220
      1
      1
      0
      0
      0
      0
      0
      0
      0
      0
      2011
      23



In [56]:

    
# Add column that show potentially good indicators
# 23	Construction
# 42	Wholesale Trade
# 45	Retail Trade
# 51	Information
# 52	Finance and Insurance
# 53	Real Estate and Rental and Leasing
# 54	Professional, Scientific, and Technical Services
# 55	Management of Companies and Enterprises
# 72	Accommodation and Food Services
# good_biz_code_list = [23,42,45,51,52,53,54,55,72]



In [243]:

    
# save
df.to_csv('biz_sum.csv',encoding='utf-8',index=False)



In [244]:

    
df = pd.read_csv('biz_sum.csv')
df.head()









    Out[244]:






  
    
      
      zip_code
      naics
      est
      n1_4
      n5_9
      n10_19
      n20_49
      n50_99
      n100_249
      n250_499
      n500_999
      n1000
      year
      business_code
    
  
  
    
      0
      2108
      ------
      1125
      575
      190
      153
      116
      47
      36
      7
      0
      1
      2011
      --
    
    
      1
      2108
      23----
      4
      3
      0
      0
      1
      0
      0
      0
      0
      0
      2011
      23
    
    
      2
      2108
      236115
      1
      1
      0
      0
      0
      0
      0
      0
      0
      0
      2011
      23
    
    
      3
      2108
      236116
      2
      1
      0
      0
      1
      0
      0
      0
      0
      0
      2011
      23
    
    
      4
      2108
      238220
      1
      1
      0
      0
      0
      0
      0
      0
      0
      0
      2011
      23

4. Fill the gaps for non-existing data



In [261]:

    
# simplify the table
df2 = df[['year','zip_code','business_code','est']]
df2.head()









    Out[261]:






  
    
      
      year
      zip_code
      business_code
      est
    
  
  
    
      0
      2011
      21201
      22
      4
    
    
      1
      2011
      21201
      23
      38
    
    
      2
      2011
      21201
      30
      20
    
    
      3
      2011
      21201
      42
      60
    
    
      4
      2011
      21201
      45
      286



In [247]:

    
# sum and normalize
# 1. Sum: use pandas groupby
gb = df2.groupby(('zip_code','year','business_code'),as_index=False)
gb_sum = gb.agg(np.sum)
gb_sum.head()









    Out[247]:






  
    
      
      zip_code
      year
      business_code
      est
    
  
  
    
      0
      2108
      2011
      --
      1125
    
    
      1
      2108
      2011
      23
      8
    
    
      2
      2108
      2011
      30
      20
    
    
      3
      2108
      2011
      42
      58
    
    
      4
      2108
      2011
      45
      220



In [288]:

    
# Compute growth rate: divide year_(N+1) by year_N (>1: growth)
gb2 = gb_sum.groupby(('zip_code','year'))



In [413]:

    
# Create empty cell if the business type does not exist.
business_codes = [11, 21, 22, 23, 30, 42, 45, 49, 51, 52, 53, 54, 55, 56, 61, 62, 71, 72, 81, 99]



In [431]:

    
# for every year and every zip code, create same list of business_codes
# if not exist, est value = 0
years = range(2011,2014)
df_biz = pd.DataFrame()
for year in years:
    for zip_code in zip_codes:
        try:
            a = gb2.get_group((int(zip_code),year))
            est_list = []
            temp = pd.DataFrame()
            for i in range(len(business_codes)):
                business_code = business_codes[i]
                if str(business_code) in a['business_code'].values:
                    est_list.append(int(a[a['business_code']==str(business_code)]['est']))
                else:
                    est_list.append(0)
            temp['business_code']=pd.Series(business_codes)
            temp['est']=pd.DataFrame(est_list)
            temp['zip_code']=zip_code
            temp['year']=year
            df_biz = pd.concat([df_biz,temp],ignore_index=True)
        except KeyError:
            continue



In [443]:

    
# save
df_biz.to_csv('biz_sum_final.csv',encoding='utf-8',index=False)

5. Predictor: growth rate per business category



In [444]:

    
df_biz.head()









    Out[444]:






  
    
      
      business_code
      est
      zip_code
      year
    
  
  
    
      0
      11
      0
      21201
      2011
    
    
      1
      21
      0
      21201
      2011
    
    
      2
      22
      4
      21201
      2011
    
    
      3
      23
      38
      21201
      2011
    
    
      4
      30
      20
      21201
      2011



In [445]:

    
gb_biz = df_biz.groupby(('year','zip_code','business_code'))



In [471]:

    
df_all









    Out[471]:






  
    
      
      business_code
      growth
      year
      zip_code
    
  
  
    
      0
      99
      [nan]
      2011
      90089



In [512]:

    
ref_years = [2011, 2012]
df_biz_growth = pd.DataFrame()
for ref_year in ref_years:
    for zip_code in zip_codes:
        for business_code in business_codes:
            try:
                prev_year = int(gb_biz.get_group((ref_year,zip_code,business_code))['est'].values)
                this_year = int(gb_biz.get_group((ref_year+1,zip_code,business_code))['est'].values)
                if prev_year == 0 & this_year > 0:
                    growth = 2
                elif prev_year == 0 & this_year == 0:
                    growth = 0
                else:
                    growth = (this_year - prev_year+.0)/prev_year
                data = [dict(year = ref_year,
                            zip_code = zip_code,
                            business_code = business_code,
                            growth = growth)]        
                temp = pd.DataFrame(data)
                df_biz_growth = pd.concat([df_biz_growth,temp],ignore_index=True)
            except KeyError:
                continue



In [513]:

    
print df_biz_growth.head()
len(df_biz_growth)









    



   business_code    growth  year zip_code
0             11  0.000000  2011    21201
1             21  0.000000  2011    21201
2             22  0.000000  2011    21201
3             23  0.052632  2011    21201
4             30 -0.300000  2011    21201






    Out[513]:





23120



In [515]:

    
df_biz_growth.to_csv('biz_growth.csv',encoding='utf-8',index=False)



In [516]:

    
gb3 = df_biz_growth.groupby(('year','zip_code'))



In [517]:

    
gb3.get_group((2012,'21201'))









    Out[517]:






  
    
      
      business_code
      growth
      year
      zip_code
    
  
  
    
      11560
      11
      0.000000
      2012
      21201
    
    
      11561
      21
      0.000000
      2012
      21201
    
    
      11562
      22
      0.000000
      2012
      21201
    
    
      11563
      23
      0.150000
      2012
      21201
    
    
      11564
      30
      0.000000
      2012
      21201
    
    
      11565
      42
      0.136364
      2012
      21201
    
    
      11566
      45
      -0.020833
      2012
      21201
    
    
      11567
      49
      0.333333
      2012
      21201
    
    
      11568
      51
      -0.054054
      2012
      21201
    
    
      11569
      52
      -0.085106
      2012
      21201
    
    
      11570
      53
      -0.032787
      2012
      21201
    
    
      11571
      54
      -0.003155
      2012
      21201
    
    
      11572
      55
      0.000000
      2012
      21201
    
    
      11573
      56
      -0.048387
      2012
      21201
    
    
      11574
      61
      0.062500
      2012
      21201
    
    
      11575
      62
      0.042857
      2012
      21201
    
    
      11576
      71
      -0.050000
      2012
      21201
    
    
      11577
      72
      0.000000
      2012
      21201
    
    
      11578
      81
      -0.038168
      2012
      21201
    
    
      11579
      99
      0.000000
      2012
      21201



In [518]:

    
# original zip code
len(zip_codes)









    Out[518]:





585



In [519]:

    
# new zip code based on available data: it's fewer
len(df_biz_growth.zip_code.unique())









    Out[519]:





578



In [520]:

    
# we're getting only these zip codes
new_zip = df_biz_growth.zip_code.unique()
np.save('new_zip_codes', new_zip)
"""
    load: np.load('new_zip_codes.npy')
"""









    Out[520]:





"\n    load: np.load('new_zip_codes.npy')\n"

	zip	naics	est	n1_4	n5_9	n10_19	n20_49	n50_99	n100_249	n250_499	n1000	year
0	2108	------	1125	575	190	153	116	47	36	7	1	2011
1	2108	23----	4	3	0	0	1	0	0	0	0	2011
2	2108	236115	1	1	0	0	0	0	0	0	0	2011
3	2108	236116	2	1	0	0	1	0	0	0	0	2011
4	2108	238220	1	1	0	0	0	0	0	0	0	2011

	year	zip_code	business_code	est
0	2011	21201	22	4
1	2011	21201	23	38
2	2011	21201	30	20
3	2011	21201	42	60
4	2011	21201	45	286

	business_code	growth	year	zip_code
11560	11	0.000000	2012	21201
11561	21	0.000000	2012	21201
11562	22	0.000000	2012	21201
11563	23	0.150000	2012	21201
11564	30	0.000000	2012	21201
11565	42	0.136364	2012	21201
11566	45	-0.020833	2012	21201
11567	49	0.333333	2012	21201
11568	51	-0.054054	2012	21201
11569	52	-0.085106	2012	21201
11570	53	-0.032787	2012	21201
11571	54	-0.003155	2012	21201
11572	55	0.000000	2012	21201
11573	56	-0.048387	2012	21201
11574	61	0.062500	2012	21201
11575	62	0.042857	2012	21201
11576	71	-0.050000	2012	21201
11577	72	0.000000	2012	21201
11578	81	-0.038168	2012	21201
11579	99	0.000000	2012	21201