In [1]:
import requests
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import csv
import seaborn as sns
font = {'family' : 'Arial',
'weight' : 'bold',
'size' : 25}
matplotlib.rc('font', **font)
from census import Census
from us import states
import csv
In [56]:
# load the zip code file
import pickle
with open('zipcode_final.txt', 'rb') as f:
zip_codes = pickle.load(f)
In [69]:
# table list: total demographics by age
tables = ['B01001_001E',
'B01001_002E',
'B01001_007E',
'B01001_008E',
'B01001_009E',
'B01001_010E',
'B01001_011E',
'B01001_012E',
'B01001_026E',
'B01001_031E',
'B01001_032E',
'B01001_033E',
'B01001_034E',
'B01001_035E',
'B01001_036E',
'B15002_001E',
'B15002_002E',
'B15002_015E',
'B15002_016E',
'B15002_017E',
'B15002_018E',
'B15002_019E',
'B15002_032E',
'B15002_033E',
'B15002_034E',
'B15002_035E',
'B17001_002E',
'C24010_015E',
'C24010_051E']
len(tables)
Out[69]:
In [228]:
columns = ['TotalPop',
'M_Total',
'M_18to19',
'M_20',
'M_21',
'M_22to24',
'M_25to29',
'M_30to34',
'F_Total',
'F_18to19',
'F_20',
'F_21',
'F_22to24',
'F_25to29',
'F_30to34',
'Total_Edu',
'M_Total_Edu',
'M_BA',
'M_MA',
'M_Prof',
'M_Doc',
'F_Total_Edu',
'F_BA',
'F_MA',
'F_Prof',
'F_Doc',
'Total_poor',
'M_Arts',
'F_Arts']
len(columns)
Out[228]:
In [74]:
# estimate time to collect the data
len(zip_codes)*len(tables)/3600*3
Out[74]:
In [90]:
# per year
year = 2013
columns = tables + ['zip_code','year']
df_ = pd.DataFrame(columns=columns)
df = pd.DataFrame(columns=tables)
c = Census(api_key,year=year)
for table in tables:
data = [dict(d = c.acs.zipcode(table,zip_code)[0][table]) for zip_code in zip_codes]
temp = pd.DataFrame(data)
df[table]=temp
# print data
df['year']=year
df_ = pd.concat([df_,df],ignore_index=True)
# If you want to use a for loop...
"""# df for census data
years = range(2011,2014)
columns = tables + ['zip_code','year']
df_ = pd.DataFrame(columns=columns)
for year in years:
df = pd.DataFrame(columns=tables)
c = Census(api_key,year=year)
for table in tables:
data = [dict(zip_code = c.acs.zipcode(table,zip_code)[0][table]) for zip_code in zip_codes]
temp = pd.DataFrame(data)
df[table]=temp
df['year']=year
df_ = pd.concat([df_,df],ignore_index=True)"""
In [91]:
df_2013 = df_
In [98]:
df_2013['zip_code']=pd.DataFrame(zip_codes)
df_2012['zip_code']=pd.DataFrame(zip_codes)
df_2011['zip_code']=pd.DataFrame(zip_codes)
In [97]:
df_2013.head()
Out[97]:
In [89]:
df_2012.head()
Out[89]:
In [88]:
df_2011.head()
Out[88]:
In [99]:
df_census = pd.concat([df_2011,df_2012,df_2013],ignore_index=True)
In [101]:
df_census.head()
Out[101]:
In [102]:
# save
df_census.to_csv('census_final.csv',encoding='utf-8',index=False)
In [290]:
df2 = pd.read_csv('census_final.csv')
In [291]:
len(df2.columns)
column_names = columns + ['year','zip_code']
In [292]:
df2.columns = column_names
df2.head()
Out[292]:
In [ ]:
# deal with nan
np.isnan(df_new.M_18to19.values)
In [308]:
# normalize: do I have to?
df_new = pd.DataFrame()
# male age group
for i in range(2,8):
df_new[columns[i]]=df2.ix[:,i].values/df2.ix[:,1]
# female age group
for i in range(9,15):
df_new[columns[i]]=df2.ix[:,i].values/df2.ix[:,8]
# male edu
for i in range(17,21):
df_new[columns[i]]=df2.ix[:,i].values/df2.ix[:,16]
# female edu
for i in range(22,26):
df_new[columns[i]]=df2.ix[:,i].values/df2.ix[:,21]
# poor
df_new[columns[26]]=df2.ix[:,26].values/df2.ix[:,0]
# male art
df_new[columns[27]]=df2.ix[:,27].values/df2.ix[:,1]
# female art
df_new[columns[28]]=df2.ix[:,27].values/df2.ix[:,8]
In [309]:
df_new.head()
Out[309]:
In [314]:
gb2 = df2.groupby(('year','zip_code'))
In [315]:
(prev_year+.0)/this_year
Out[315]:
In [316]:
pd.DataFrame((this_year - prev_year+.0)/prev_year)
Out[316]:
In [319]:
growth = pd.DataFrame((this_year - prev_year+.0)/prev_year)
growth
Out[319]:
In [322]:
gb2.get_group((ref_year,int(zip_code)))[]
Out[322]:
In [325]:
ref_years = [2011, 2012]
df_cen_growth = pd.DataFrame()
for ref_year in ref_years:
for zip_code in zip_codes:
try:
prev_year = gb2.get_group((ref_year,int(zip_code))).ix[:,:-2].values
this_year = gb2.get_group((ref_year+1,int(zip_code))).ix[:,:-2].values
growth = pd.DataFrame((this_year - prev_year+.0)/prev_year)
growth.columns = columns
growth['year']=ref_year
growth['zip_code']=zip_code
df_cen_growth = pd.concat([df_cen_growth,growth],ignore_index=True)
except KeyError:
continue
In [327]:
df_cen_growth.head()
Out[327]:
In [328]:
# Selecting non-nan rows (total population)
df_cen_growth = df_cen_growth[np.isfinite(df_cen_growth['TotalPop'])]
In [330]:
df_cen_growth.head()
Out[330]:
In [380]:
# filtering
df_cen_growth2 = df_cen_growth[np.isfinite(df_cen_growth['F_Total_Edu'])]
df_cen_growth2 = df_cen_growth2[np.isfinite(df_cen_growth2['M_Total_Edu'])]
df_cen_growth2 = df_cen_growth2[np.isfinite(df_cen_growth2['M_Total'])]
df_cen_growth2 = df_cen_growth2[np.isfinite(df_cen_growth2['F_Total'])]
len(df_cen_growth2)
Out[380]:
In [378]:
df_cen_growth2
Out[378]:
In [381]:
df_cen_growth2.to_csv('census_filtered.csv',encoding='utf-8',index=False)