In [232]:
import requests
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import csv
import seaborn as sns
font = {'family' : 'Arial',
'weight' : 'bold',
'size' : 25}
matplotlib.rc('font', **font)
from census import Census
from us import states
import csv
In [427]:
# load zip codes
import pickle
with open('zipcode_final.txt', 'rb') as f:
zip_codes = pickle.load(f)
In [236]:
# concatenate tables with different years and zip codes
df = pd.DataFrame()
years = range(2011,2014)
for year in years:
temp = pd.read_csv('zbp'+ str(year-2000) +'detail.txt',error_bad_lines=False)
temp['year']=year
temp = temp.loc[temp['zip'].isin(map(int,zip_codes))]
df = pd.concat([df,temp],ignore_index=True)
df.head()
Out[236]:
In [238]:
"""
11 Agriculture, Forestry, Fishing and Hunting
21 Mining, Quarrying, and Oil and Gas Extraction
22 Utilities
#23 Construction
31-33 Manufacturing
#42 Wholesale Trade
#44-45 Retail Trade
48-49 Transportation and Warehousing
#51 Information
#52 Finance and Insurance
#53 Real Estate and Rental and Leasing
#54 Professional, Scientific, and Technical Services
#55 Management of Companies and Enterprises
56 Administrative and Support and Waste Management and Remediation Services
61 Educational Services
62 Health Care and Social Assistance
71 Arts, Entertainment, and Recreation
#72 Accommodation and Food Services
81 Other Services (except Public Administration)
92 Public Administration
potential parameters: 23, 42, 45, 51, 52, 53, 54, 55, 72
"""
def conver_business_type(old_code):
"""
Convert N digit nacis codes to 2 digits
http://www.census.gov/cgi-bin/sssd/naics/naicsrch?chart=2012
"""
# get the first two digit
new_code = old_code[:2]
# simplify
# 31-33 Manufacturing
# 44-45 Retail Trade
# 48-49 Transportation and Warehousing
if new_code != '--':
new_int = int(new_code)
if new_int >= 31 and new_int <= 33:
new_code = '30'
elif new_int >= 44 and new_int <= 45:
new_code = '45'
elif new_int >= 48 and new_int <= 49:
new_code = '49'
return new_code
In [239]:
# create new column for simplified business_code
df['business_code']=df['naics'].apply(lambda x: conver_business_type(x))
In [240]:
# rename for consistence
df=df.rename(columns = {'zip':'zip_code'})
df.head()
Out[240]:
In [56]:
# Add column that show potentially good indicators
# 23 Construction
# 42 Wholesale Trade
# 45 Retail Trade
# 51 Information
# 52 Finance and Insurance
# 53 Real Estate and Rental and Leasing
# 54 Professional, Scientific, and Technical Services
# 55 Management of Companies and Enterprises
# 72 Accommodation and Food Services
# good_biz_code_list = [23,42,45,51,52,53,54,55,72]
In [243]:
# save
df.to_csv('biz_sum.csv',encoding='utf-8',index=False)
In [244]:
df = pd.read_csv('biz_sum.csv')
df.head()
Out[244]:
In [261]:
# simplify the table
df2 = df[['year','zip_code','business_code','est']]
df2.head()
Out[261]:
In [247]:
# sum and normalize
# 1. Sum: use pandas groupby
gb = df2.groupby(('zip_code','year','business_code'),as_index=False)
gb_sum = gb.agg(np.sum)
gb_sum.head()
Out[247]:
In [288]:
# Compute growth rate: divide year_(N+1) by year_N (>1: growth)
gb2 = gb_sum.groupby(('zip_code','year'))
In [413]:
# Create empty cell if the business type does not exist.
business_codes = [11, 21, 22, 23, 30, 42, 45, 49, 51, 52, 53, 54, 55, 56, 61, 62, 71, 72, 81, 99]
In [431]:
# for every year and every zip code, create same list of business_codes
# if not exist, est value = 0
years = range(2011,2014)
df_biz = pd.DataFrame()
for year in years:
for zip_code in zip_codes:
try:
a = gb2.get_group((int(zip_code),year))
est_list = []
temp = pd.DataFrame()
for i in range(len(business_codes)):
business_code = business_codes[i]
if str(business_code) in a['business_code'].values:
est_list.append(int(a[a['business_code']==str(business_code)]['est']))
else:
est_list.append(0)
temp['business_code']=pd.Series(business_codes)
temp['est']=pd.DataFrame(est_list)
temp['zip_code']=zip_code
temp['year']=year
df_biz = pd.concat([df_biz,temp],ignore_index=True)
except KeyError:
continue
In [443]:
# save
df_biz.to_csv('biz_sum_final.csv',encoding='utf-8',index=False)
In [444]:
df_biz.head()
Out[444]:
In [445]:
gb_biz = df_biz.groupby(('year','zip_code','business_code'))
In [471]:
df_all
Out[471]:
In [512]:
ref_years = [2011, 2012]
df_biz_growth = pd.DataFrame()
for ref_year in ref_years:
for zip_code in zip_codes:
for business_code in business_codes:
try:
prev_year = int(gb_biz.get_group((ref_year,zip_code,business_code))['est'].values)
this_year = int(gb_biz.get_group((ref_year+1,zip_code,business_code))['est'].values)
if prev_year == 0 & this_year > 0:
growth = 2
elif prev_year == 0 & this_year == 0:
growth = 0
else:
growth = (this_year - prev_year+.0)/prev_year
data = [dict(year = ref_year,
zip_code = zip_code,
business_code = business_code,
growth = growth)]
temp = pd.DataFrame(data)
df_biz_growth = pd.concat([df_biz_growth,temp],ignore_index=True)
except KeyError:
continue
In [513]:
print df_biz_growth.head()
len(df_biz_growth)
Out[513]:
In [515]:
df_biz_growth.to_csv('biz_growth.csv',encoding='utf-8',index=False)
In [516]:
gb3 = df_biz_growth.groupby(('year','zip_code'))
In [517]:
gb3.get_group((2012,'21201'))
Out[517]:
In [518]:
# original zip code
len(zip_codes)
Out[518]:
In [519]:
# new zip code based on available data: it's fewer
len(df_biz_growth.zip_code.unique())
Out[519]:
In [520]:
# we're getting only these zip codes
new_zip = df_biz_growth.zip_code.unique()
np.save('new_zip_codes', new_zip)
"""
load: np.load('new_zip_codes.npy')
"""
Out[520]: