Data munghing for business pattern data


In [232]:
import requests
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import csv
import seaborn as sns
font = {'family' : 'Arial',
        'weight' : 'bold',
        'size'   : 25}
matplotlib.rc('font', **font)
from census import Census
from us import states
import csv

1. Download the table from the website

  • zip code, business type, # of establishment by # of employees

2. Concatenate tables for corresponding zip codes


In [427]:
# load zip codes
import pickle
with open('zipcode_final.txt', 'rb') as f:
    zip_codes = pickle.load(f)

In [236]:
# concatenate tables with different years and zip codes
df = pd.DataFrame()
years = range(2011,2014)
for year in years:
    temp = pd.read_csv('zbp'+ str(year-2000) +'detail.txt',error_bad_lines=False)
    temp['year']=year
    temp = temp.loc[temp['zip'].isin(map(int,zip_codes))]
    df = pd.concat([df,temp],ignore_index=True)
df.head()


Skipping line 1420726: expected 12 fields, saw 19
Skipping line 1420729: expected 12 fields, saw 16
Skipping line 1420731: expected 12 fields, saw 33
Skipping line 1420733: expected 12 fields, saw 13
Skipping line 1420735: expected 12 fields, saw 30
Skipping line 1420736: expected 12 fields, saw 18
Skipping line 1420737: expected 12 fields, saw 24
Skipping line 1420741: expected 12 fields, saw 22
Skipping line 1420745: expected 12 fields, saw 17
Skipping line 1420752: expected 12 fields, saw 26
Skipping line 1420753: expected 12 fields, saw 13
Skipping line 1420756: expected 12 fields, saw 22
Skipping line 1420762: expected 12 fields, saw 15
Skipping line 1420764: expected 12 fields, saw 13
Skipping line 1420766: expected 12 fields, saw 32
Skipping line 1420767: expected 12 fields, saw 34
Skipping line 1420769: expected 12 fields, saw 23
Skipping line 1420770: expected 12 fields, saw 19
Skipping line 1420774: expected 12 fields, saw 17
Skipping line 1420775: expected 12 fields, saw 17
Skipping line 1420777: expected 12 fields, saw 16
Skipping line 1420778: expected 12 fields, saw 14
Skipping line 1420779: expected 12 fields, saw 14
Skipping line 1420785: expected 12 fields, saw 19
Skipping line 1420786: expected 12 fields, saw 17
Skipping line 1420788: expected 12 fields, saw 22
Skipping line 1420793: expected 12 fields, saw 37
Skipping line 1420795: expected 12 fields, saw 17
Skipping line 1420796: expected 12 fields, saw 32
Skipping line 1420797: expected 12 fields, saw 23

/Users/Hongsup/anaconda/lib/python2.7/site-packages/pandas/io/parsers.py:1170: DtypeWarning: Columns (0,2,3,4,5,6,8) have mixed types. Specify dtype option on import or set low_memory=False.
  data = self._reader.read(nrows)
Out[236]:
zip naics est n1_4 n5_9 n10_19 n20_49 n50_99 n100_249 n250_499 n500_999 n1000 year
0 2108 ------ 1125 575 190 153 116 47 36 7 0 1 2011
1 2108 23---- 4 3 0 0 1 0 0 0 0 0 2011
2 2108 236115 1 1 0 0 0 0 0 0 0 0 2011
3 2108 236116 2 1 0 0 1 0 0 0 0 0 2011
4 2108 238220 1 1 0 0 0 0 0 0 0 0 2011

3. Simplify the business code


In [238]:
"""
    11	Agriculture, Forestry, Fishing and Hunting
    21	Mining, Quarrying, and Oil and Gas Extraction
    22	Utilities
    #23	Construction
    31-33	Manufacturing
    #42	Wholesale Trade
    #44-45	Retail Trade
    48-49	Transportation and Warehousing
    #51	Information
    #52	Finance and Insurance
    #53	Real Estate and Rental and Leasing
    #54	Professional, Scientific, and Technical Services
    #55	Management of Companies and Enterprises
    56	Administrative and Support and Waste Management and Remediation Services
    61	Educational Services
    62	Health Care and Social Assistance
    71	Arts, Entertainment, and Recreation
    #72	Accommodation and Food Services
    81	Other Services (except Public Administration)
    92	Public Administration
    
    potential parameters: 23, 42, 45, 51, 52, 53, 54, 55, 72
"""
def conver_business_type(old_code):
    """
        Convert N digit nacis codes to 2 digits
        http://www.census.gov/cgi-bin/sssd/naics/naicsrch?chart=2012
    """
    # get the first two digit
    new_code = old_code[:2]
    # simplify
#     31-33	Manufacturing
#     44-45	Retail Trade
#     48-49	Transportation and Warehousing
    if new_code != '--':
        new_int = int(new_code)
        if new_int >= 31 and new_int <= 33:
            new_code = '30'
        elif new_int >= 44 and new_int <= 45:
            new_code = '45'
        elif new_int >= 48 and new_int <= 49:
            new_code = '49'
    return new_code

In [239]:
# create new column for simplified business_code
df['business_code']=df['naics'].apply(lambda x: conver_business_type(x))

In [240]:
# rename for consistence
df=df.rename(columns = {'zip':'zip_code'})
df.head()


Out[240]:
zip_code naics est n1_4 n5_9 n10_19 n20_49 n50_99 n100_249 n250_499 n500_999 n1000 year business_code
0 2108 ------ 1125 575 190 153 116 47 36 7 0 1 2011 --
1 2108 23---- 4 3 0 0 1 0 0 0 0 0 2011 23
2 2108 236115 1 1 0 0 0 0 0 0 0 0 2011 23
3 2108 236116 2 1 0 0 1 0 0 0 0 0 2011 23
4 2108 238220 1 1 0 0 0 0 0 0 0 0 2011 23

In [56]:
# Add column that show potentially good indicators
# 23	Construction
# 42	Wholesale Trade
# 45	Retail Trade
# 51	Information
# 52	Finance and Insurance
# 53	Real Estate and Rental and Leasing
# 54	Professional, Scientific, and Technical Services
# 55	Management of Companies and Enterprises
# 72	Accommodation and Food Services
# good_biz_code_list = [23,42,45,51,52,53,54,55,72]

In [243]:
# save
df.to_csv('biz_sum.csv',encoding='utf-8',index=False)

In [244]:
df = pd.read_csv('biz_sum.csv')
df.head()


Out[244]:
zip_code naics est n1_4 n5_9 n10_19 n20_49 n50_99 n100_249 n250_499 n500_999 n1000 year business_code
0 2108 ------ 1125 575 190 153 116 47 36 7 0 1 2011 --
1 2108 23---- 4 3 0 0 1 0 0 0 0 0 2011 23
2 2108 236115 1 1 0 0 0 0 0 0 0 0 2011 23
3 2108 236116 2 1 0 0 1 0 0 0 0 0 2011 23
4 2108 238220 1 1 0 0 0 0 0 0 0 0 2011 23

4. Fill the gaps for non-existing data


In [261]:
# simplify the table
df2 = df[['year','zip_code','business_code','est']]
df2.head()


Out[261]:
year zip_code business_code est
0 2011 21201 22 4
1 2011 21201 23 38
2 2011 21201 30 20
3 2011 21201 42 60
4 2011 21201 45 286

In [247]:
# sum and normalize
# 1. Sum: use pandas groupby
gb = df2.groupby(('zip_code','year','business_code'),as_index=False)
gb_sum = gb.agg(np.sum)
gb_sum.head()


Out[247]:
zip_code year business_code est
0 2108 2011 -- 1125
1 2108 2011 23 8
2 2108 2011 30 20
3 2108 2011 42 58
4 2108 2011 45 220

In [288]:
# Compute growth rate: divide year_(N+1) by year_N (>1: growth)
gb2 = gb_sum.groupby(('zip_code','year'))

In [413]:
# Create empty cell if the business type does not exist.
business_codes = [11, 21, 22, 23, 30, 42, 45, 49, 51, 52, 53, 54, 55, 56, 61, 62, 71, 72, 81, 99]

In [431]:
# for every year and every zip code, create same list of business_codes
# if not exist, est value = 0
years = range(2011,2014)
df_biz = pd.DataFrame()
for year in years:
    for zip_code in zip_codes:
        try:
            a = gb2.get_group((int(zip_code),year))
            est_list = []
            temp = pd.DataFrame()
            for i in range(len(business_codes)):
                business_code = business_codes[i]
                if str(business_code) in a['business_code'].values:
                    est_list.append(int(a[a['business_code']==str(business_code)]['est']))
                else:
                    est_list.append(0)
            temp['business_code']=pd.Series(business_codes)
            temp['est']=pd.DataFrame(est_list)
            temp['zip_code']=zip_code
            temp['year']=year
            df_biz = pd.concat([df_biz,temp],ignore_index=True)
        except KeyError:
            continue

In [443]:
# save
df_biz.to_csv('biz_sum_final.csv',encoding='utf-8',index=False)

5. Predictor: growth rate per business category


In [444]:
df_biz.head()


Out[444]:
business_code est zip_code year
0 11 0 21201 2011
1 21 0 21201 2011
2 22 4 21201 2011
3 23 38 21201 2011
4 30 20 21201 2011

In [445]:
gb_biz = df_biz.groupby(('year','zip_code','business_code'))

In [471]:
df_all


Out[471]:
business_code growth year zip_code
0 99 [nan] 2011 90089

In [512]:
ref_years = [2011, 2012]
df_biz_growth = pd.DataFrame()
for ref_year in ref_years:
    for zip_code in zip_codes:
        for business_code in business_codes:
            try:
                prev_year = int(gb_biz.get_group((ref_year,zip_code,business_code))['est'].values)
                this_year = int(gb_biz.get_group((ref_year+1,zip_code,business_code))['est'].values)
                if prev_year == 0 & this_year > 0:
                    growth = 2
                elif prev_year == 0 & this_year == 0:
                    growth = 0
                else:
                    growth = (this_year - prev_year+.0)/prev_year
                data = [dict(year = ref_year,
                            zip_code = zip_code,
                            business_code = business_code,
                            growth = growth)]        
                temp = pd.DataFrame(data)
                df_biz_growth = pd.concat([df_biz_growth,temp],ignore_index=True)
            except KeyError:
                continue

In [513]:
print df_biz_growth.head()
len(df_biz_growth)


   business_code    growth  year zip_code
0             11  0.000000  2011    21201
1             21  0.000000  2011    21201
2             22  0.000000  2011    21201
3             23  0.052632  2011    21201
4             30 -0.300000  2011    21201
Out[513]:
23120

In [515]:
df_biz_growth.to_csv('biz_growth.csv',encoding='utf-8',index=False)

In [516]:
gb3 = df_biz_growth.groupby(('year','zip_code'))

In [517]:
gb3.get_group((2012,'21201'))


Out[517]:
business_code growth year zip_code
11560 11 0.000000 2012 21201
11561 21 0.000000 2012 21201
11562 22 0.000000 2012 21201
11563 23 0.150000 2012 21201
11564 30 0.000000 2012 21201
11565 42 0.136364 2012 21201
11566 45 -0.020833 2012 21201
11567 49 0.333333 2012 21201
11568 51 -0.054054 2012 21201
11569 52 -0.085106 2012 21201
11570 53 -0.032787 2012 21201
11571 54 -0.003155 2012 21201
11572 55 0.000000 2012 21201
11573 56 -0.048387 2012 21201
11574 61 0.062500 2012 21201
11575 62 0.042857 2012 21201
11576 71 -0.050000 2012 21201
11577 72 0.000000 2012 21201
11578 81 -0.038168 2012 21201
11579 99 0.000000 2012 21201

In [518]:
# original zip code
len(zip_codes)


Out[518]:
585

In [519]:
# new zip code based on available data: it's fewer
len(df_biz_growth.zip_code.unique())


Out[519]:
578

In [520]:
# we're getting only these zip codes
new_zip = df_biz_growth.zip_code.unique()
np.save('new_zip_codes', new_zip)
"""
    load: np.load('new_zip_codes.npy')
"""


Out[520]:
"\n    load: np.load('new_zip_codes.npy')\n"