In [7]:
%matplotlib inline
import pandas as pd
import json
In [8]:
#File locations
acs_file = "/home/russ/Documents/DDL/Projects/03-censusables/source/Data/raw_files/acs5yr.csv"
zillow_HVI_file = "/home/russ/Documents/DDL/Projects/03-censusables/source/Data/raw_files/Zip_Zhvi_AllHomes_HomeValueIndex.csv"
zillow_RI_file = "/home/russ/Documents/DDL/Projects/03-censusables/source/Data/raw_files/Zip_Zri_AllHomes_RentIndex.csv"
urbanization_zip = "/home/russ/Documents/DDL/Projects/03-censusables/source/Data/raw_files/zcta2010_txt.csv"
ZCTA = "/home/russ/Documents/DDL/Projects/03-censusables/source/Data/raw_files/ZCTA.csv"
In [9]:
acs = pd.read_csv(acs_file)
In [10]:
diversity = acs[['zip5','pop','race_white','race_black','race_asian','race_indian','race_other','hisp']].copy(deep=True)
In [11]:
# Hispanic white share; the pop factors cancel, so this reduces to race_white * hisp.
diversity['white_hisp'] = ((diversity['pop']*diversity['race_white'])*diversity['hisp'])/diversity['pop']
In [12]:
# Non-Hispanic white share; likewise reduces to race_white * (1 - hisp).
diversity['white_nonhisp'] = ((diversity['pop']*diversity['race_white'])*(1-diversity['hisp']))/diversity['pop']
In [13]:
# Gini-Simpson diversity index: 1 minus the sum of squared group shares.
diversity['div_index'] = 1 - (diversity['race_black']**2 + diversity['white_hisp']**2 + diversity['white_nonhisp']**2 + diversity['race_asian']**2 + diversity['race_indian']**2)
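This is the Gini-Simpson index: 0 for a homogeneous population, approaching 1 as groups are evenly mixed. A minimal sketch of the same calculation on toy shares (the function name and example values are illustrative, not from the source data):
In [ ]:
def gini_simpson(proportions):
    """Gini-Simpson index: 1 - sum of squared group shares."""
    return 1 - sum(p**2 for p in proportions)

# A 50/50 split of two groups scores 0.5; a single group scores 0.
gini_simpson([0.5, 0.5]), gini_simpson([1.0])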
In [14]:
diversity_index = diversity[['zip5','div_index']].dropna(axis=0,how='any',subset=['zip5','div_index'])
In [15]:
import numpy as np
# NaN never compares equal to anything (including itself), so == np.nan always
# returns an empty frame; use isnull() to find missing values.
diversity_index[diversity_index['div_index'].isnull()]
Out[15]:
In [16]:
urban = pd.read_csv(urbanization_zip)
In [17]:
urban.rename(columns={'Zip5':'zip5'},inplace=True)
In [18]:
urban['zip5'] = urban['zip5'].astype(float)
In [19]:
# If POPULATION carries thousands separators, strip them first:
# urban['pop'] = urban['POPULATION'].str.replace(',', '').astype(int)
urban['pop'] = urban['POPULATION'].astype(int)
In [20]:
# Strip the leading "Z" from the zip-code text column.
urban['ZCTA5'] = urban['ZCTA5'].str[1:]
In [196]:
urban_index.min()
Out[196]:
In [199]:
urban_index.idxmax()
Out[199]:
In [197]:
urban_index[(urban_index['zip5']==21230) | (urban_index['zip5']==59736)]
Out[197]:
In [192]:
urban_index[urban_index['urban_index']==np.inf]
Out[192]:
In [185]:
urban.dtypes
Out[185]:
In [22]:
# Land area per resident: higher values indicate more rural ZCTAs; zero-population
# rows divide by zero and come out as inf.
urban['urban_index'] = urban['LANDSQMT']/urban['pop']
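Zero-population ZCTAs divide by zero here and surface as inf (the np.inf query above finds them). A small sketch of one way to neutralize those rows so the dropna below also discards them (the replace-with-NaN choice is an assumption, not in the original):
In [ ]:
import numpy as np

# Turn +/-inf into NaN so dropna(subset=['urban_index']) removes those rows too.
urban['urban_index'] = urban['urban_index'].replace([np.inf, -np.inf], np.nan)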
In [23]:
urban_index = urban[['ZCTA5','zip5','urban_index']].dropna(axis=0,how='any',subset=['zip5','urban_index'])
In [ ]:
urban_index
In [24]:
urban_index.head(1)
Out[24]:
In [25]:
zillow_HVI = pd.read_csv(zillow_HVI_file)
In [26]:
zillow_HVI = zillow_HVI[['RegionName','2014-01','2014-07','2015-01','2015-07']]
In [27]:
zillow_HVI.rename(columns={'RegionName':'zip5'},inplace=True)
zillow_HVI.head()
Out[27]:
In [28]:
zillow_RI = pd.read_csv(zillow_RI_file)
In [29]:
len(zillow_HVI)
Out[29]:
In [30]:
zillow_RI.head(1)
Out[30]:
In [31]:
# Deep copy the slice (matches the .copy(deep=True) used for the ACS frame above).
zillow_RI = zillow_RI[['RegionName','2014-01','2014-07','2015-01','2015-07']].copy()
In [32]:
zillow_RI.rename(columns={'RegionName':'zip5'},inplace=True)
In [33]:
zillow_RI.head()
Out[33]:
In [34]:
housing_index = pd.merge(zillow_HVI, zillow_RI, how='inner', on='zip5').dropna(axis=0, how='all')
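Both frames carry the same month columns, so pandas disambiguates the overlap with its default _x/_y suffixes; that is where the '2014-07_x' names used below come from. A sketch with explicit suffixes would make those references self-documenting (shown under a hypothetical name, since the cells below assume the _x/_y labels):
In [ ]:
# Same merge with readable suffixes instead of the default _x/_y.
housing_index_labeled = pd.merge(zillow_HVI, zillow_RI, how='inner', on='zip5',
                                 suffixes=('_hvi', '_ri')).dropna(axis=0, how='all')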
In [35]:
#housing_index = zillow_HVI.dropna(axis=0,how='all')
In [36]:
acs.head(1).to_csv("ACS_Headers.csv")
In [37]:
#housing_index = pd.merge(housing_index, acs[['zip5','rent_median']], how='inner', on='zip5',copy=False)
In [38]:
income_index = acs[['zip5','inc_median','poverty','snap','gini_index']].dropna(axis=0,how='all')
In [189]:
len(income_index.columns)
Out[189]:
In [39]:
income_index[income_index['zip5']==90210]
Out[39]:
In [40]:
# Beginning PCA analysis (reference: http://sebastianraschka.com/Articles/2015_pca_in_3_steps.html)
X_div = diversity_index.iloc[:, 1].values
y_div = diversity_index.iloc[:, 0].values
In [41]:
y_div
Out[41]:
In [48]:
# Standardization
from sklearn.preprocessing import StandardScaler
# StandardScaler expects a 2-D array, so reshape the single feature column.
X_div_std = StandardScaler().fit_transform(X_div.reshape(-1, 1))
In [49]:
X_div_std
Out[49]:
In [50]:
#Same PCA analysis using scikit-learn
from sklearn.decomposition import PCA as sklearnPCA
sklearn_div_pca = sklearnPCA(n_components=1)
Y_div_sklearn = sklearn_div_pca.fit_transform(X_div_std)
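With a single standardized feature, PCA has nothing to rotate: the lone component is the input up to sign, and the explained variance ratio is exactly 1.0. A quick sanity check (the assertion is illustrative, not from the original notebook):
In [ ]:
import numpy as np

# One feature in, one component out: the scores match the input up to sign.
assert np.allclose(np.abs(Y_div_sklearn), np.abs(X_div_std))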
In [51]:
sklearn_div_pca.explained_variance_ratio_
In [52]:
len(X_div)
Out[52]:
In [53]:
df_div = pd.DataFrame({'zip5':y_div,'diversity_index':X_div})
In [54]:
df_div.head()
Out[54]:
In [55]:
# Beginning PCA analysis (reference: http://sebastianraschka.com/Articles/2015_pca_in_3_steps.html)
X_urb = urban_index['urban_index'].values
y_urb = urban_index['zip5'].values
In [56]:
# Standardization
from sklearn.preprocessing import StandardScaler
X_urb_std = StandardScaler().fit_transform(X_urb.reshape(-1, 1))
In [57]:
X_urb_std
Out[57]:
In [58]:
#Same PCA analysis using scikit-learn
from sklearn.decomposition import PCA as sklearnPCA
sklearn_urb_pca = sklearnPCA(n_components=1)
Y_urb_sklearn = sklearn_urb_pca.fit_transform(X_urb_std)
In [59]:
df_urb = pd.DataFrame({'zip5': y_urb, 'urban_index': X_urb})
In [186]:
df_urb.dtypes
Out[186]:
In [60]:
df_urb.head(1)
Out[60]:
In [61]:
housing_index['2014-07_x']
Out[61]:
In [62]:
housing_index.dtypes
Out[62]:
In [63]:
housing_index[housing_index['2014-07_x'].isnull()]
#income_index[income_index['zip5']==90210]
Out[63]:
In [184]:
#housing_index.isnull()==True
housing_index[housing_index.isnull().any(axis=1)]
Out[184]:
In [65]:
housing_index[housing_index['zip5']== 21211]
Out[65]:
In [183]:
# Backfill missing 2014-07 home values with the 2014-01 values from the same rows.
housing_index.loc[housing_index['2014-07_x'].isnull(),'2014-07_x'] = housing_index['2014-01_x']
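fillna with a parallel column performs the same aligned backfill in one call (an equivalent sketch):
In [ ]:
# Equivalent backfill: take 2014-01 values wherever 2014-07 is missing.
housing_index['2014-07_x'] = housing_index['2014-07_x'].fillna(housing_index['2014-01_x'])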
In [67]:
housing_index.head(1)
Out[67]:
In [68]:
# Beginning PCA analysis (reference: http://sebastianraschka.com/Articles/2015_pca_in_3_steps.html)
X_hou = housing_index.iloc[:, 1:9].values
zip5_hou = housing_index.iloc[:, 0].values
In [69]:
#Standardization
from sklearn.preprocessing import StandardScaler
X_hou_std = StandardScaler().fit_transform(X_hou)
In [70]:
#Same PCA analysis using scikit-learn
from sklearn.decomposition import PCA as sklearnPCA
#sklearn_hou_pca = sklearnPCA(n_components=8)
sklearn_hou_pca = sklearnPCA(n_components=1)
Y_hou_sklearn = sklearn_hou_pca.fit_transform(X_hou_std)
In [71]:
sklearn_hou_pca
Out[71]:
In [72]:
expl_hou_var = sklearn_hou_pca.explained_variance_ratio_
In [73]:
sklearn_hou_pca.explained_variance_ratio_
Out[73]:
In [74]:
X_hou_std
Out[74]:
In [75]:
sklearn_hou_pca.get_covariance()
Out[75]:
In [76]:
sklearn_hou_pca.explained_variance_
Out[76]:
In [77]:
sklearn_hou_pca.explained_variance_ratio_
Out[77]:
In [78]:
expl_hou_val = sklearn_hou_pca.explained_variance_ratio_
In [79]:
ev = expl_hou_val.tolist()
ls = ['PC%s' % i for i in range(1, len(ev) + 1)]  # label components PC1..PCn
df_hou_explainedValue = pd.DataFrame(ev, columns=['Value'], index=ls)
In [80]:
df_hou_explainedValue.plot(kind='bar')
Out[80]:
In [81]:
sklearn_hou_pca.components_
Out[81]:
In [82]:
df_hou = pd.DataFrame({'zip5':zip5_hou,'housing_index':Y_hou_sklearn[:,0]})
In [83]:
# PCA component signs are arbitrary; flip this component's orientation.
df_hou['housing_index'] = df_hou['housing_index'] * -1
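The hard-coded flip works for this particular fit, but PCA signs can change between runs. A sketch of a data-driven convention (the choice of reference column is an assumption, and df_hou_oriented is a hypothetical name): orient the component so it correlates positively with the first raw housing column.
In [ ]:
import numpy as np

# Flip the component only if it moves opposite to the first raw input column.
sign = np.sign(np.corrcoef(Y_hou_sklearn[:, 0], X_hou[:, 0])[0, 1])
df_hou_oriented = pd.DataFrame({'zip5': zip5_hou,
                                'housing_index': sign * Y_hou_sklearn[:, 0]})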
In [84]:
df_hou[df_hou['zip5']==21222]
Out[84]:
In [85]:
# Beginning PCA analysis (reference: http://sebastianraschka.com/Articles/2015_pca_in_3_steps.html)
X_inc = income_index.iloc[:, 1:5].values
zip5_inc = income_index.iloc[:, 0].values
In [86]:
#Standardization
from sklearn.preprocessing import StandardScaler
X_inc_std = StandardScaler().fit_transform(X_inc)
In [87]:
#Same PCA analysis using scikit-learn
from sklearn.decomposition import PCA as sklearnPCA
#sklearn_inc_pca = sklearnPCA(n_components=4)
sklearn_inc_pca = sklearnPCA(n_components=1)
Y_inc_sklearn = sklearn_inc_pca.fit_transform(X_inc_std)
In [88]:
X_inc_std
Out[88]:
In [89]:
sklearn_inc_pca.explained_variance_ratio_
Out[89]:
In [90]:
sklearn_inc_pca.explained_variance_
Out[90]:
In [91]:
expl_inc_var = sklearn_inc_pca.explained_variance_ratio_
ev = expl_inc_var.tolist()
ls = ['PC%s' % i for i in range(1, len(ev) + 1)]  # label components PC1..PCn
df_inc_explainedValue = pd.DataFrame(ev, columns=['Value'], index=ls)
In [92]:
df_inc_explainedValue.plot(kind='bar')
Out[92]:
In [93]:
df_inc = pd.DataFrame({'zip5':zip5_inc,'income_index':Y_inc_sklearn[:,0]})
In [94]:
df_inc[df_inc['zip5']==21223]
Out[94]:
In [95]:
df_final = pd.merge(df_inc, df_hou, on='zip5')
In [96]:
df_final = pd.merge(df_final, df_urb, on='zip5')
In [97]:
df_final = pd.merge(df_final, df_div, on='zip5')
In [98]:
df_final.head(1)
Out[98]:
In [99]:
df_final[df_final['zip5']==90210]
Out[99]:
In [100]:
# Rescale the four indexes so they share a common range.
from sklearn import preprocessing
index_cols = ['income_index', 'housing_index', 'urban_index', 'diversity_index']
zip5_final = df_final['zip5'].values
std_scale = preprocessing.StandardScaler().fit(df_final[index_cols])
df_std = std_scale.transform(df_final[index_cols])
minmax_scale = preprocessing.MinMaxScaler().fit(df_final[index_cols])
df_minmax = minmax_scale.transform(df_final[index_cols])
In [101]:
df_minmax_final = pd.DataFrame({'zip5':zip5_final,'income_index':df_minmax[:,0],'housing_index':df_minmax[:,1],'urban_index':df_minmax[:,2],'diversity_index':df_minmax[:,3]})
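The positional indexes above must track the column order passed to the scaler; an equivalent construction reuses the index_cols list so the names cannot drift out of sync:
In [ ]:
# Column names come straight from index_cols, so positions cannot drift.
df_minmax_final = pd.DataFrame(df_minmax, columns=index_cols).assign(zip5=zip5_final)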
In [102]:
df_minmax_final[df_minmax_final['zip5']==21230]
Out[102]:
In [190]:
df_minmax_final[df_minmax_final['zip5']==90210]
Out[190]:
In [103]:
ZCTA = pd.read_csv(ZCTA)  # note: rebinds the path string to the DataFrame
In [104]:
ZCTA.head(1)
Out[104]:
In [105]:
df_all_final = pd.merge(df_minmax_final, ZCTA[['zcta5','ZIPName','State']], left_on='zip5', right_on='zcta5', copy=False)
In [106]:
del df_all_final['zcta5']
In [107]:
df_all_final = pd.merge(df_all_final,urban[['zip5','ZCTA5']],copy=False)
In [108]:
df_all_final.to_csv('/home/russ/Documents/DDL/Projects/03-censusables/source/Data/final_files/Final.csv')
In [110]:
# Preference vector for [diversity, housing, income, urban]; the fit score below
# is the L1 distance from it, so lower fit = closer match.
fit = [.5, .5, .5, .5]
#fit = [0, 0, 1, 0]
df_all_final['fit'] = df_all_final.apply(lambda x: abs(fit[0]-x['diversity_index'])+abs(fit[1]-x['housing_index'])+\
                                                   abs(fit[2]-x['income_index'])+abs(fit[3]-x['urban_index']),axis=1)
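The fit score is the L1 (Manhattan) distance between each zip code's four scaled indexes and the preference vector. A vectorized sketch of the same score (the prefs Series is an illustrative restatement of fit):
In [ ]:
# L1 distance from the preference vector, computed column-wise.
prefs = pd.Series({'diversity_index': 0.5, 'housing_index': 0.5,
                   'income_index': 0.5, 'urban_index': 0.5})
df_all_final['fit'] = (df_all_final[prefs.index] - prefs).abs().sum(axis=1)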
In [111]:
state = 'MD'
if state:
    df_display = df_all_final[df_all_final['State']==state].sort_values('fit').head(10)
else:
    df_display = df_all_final.sort_values('fit').head(10)
df_display
Out[111]:
In [112]:
df_all_final.sort_values('fit').head(10)
Out[112]:
In [113]:
# Rebuild the same preference vector from percentage-style inputs (0-100 scale).
fit = [float(v)/100 for v in (50, 50, 50, 50)]
df_all_final['fit'] = df_all_final.apply(lambda x: abs(fit[0]-x['diversity_index'])+abs(fit[1]-x['housing_index'])+\
                                                   abs(fit[2]-x['income_index'])+abs(fit[3]-x['urban_index']),axis=1)
In [537]:
import vincent
from vincent import AxisProperties, PropertySet, ValueRef
vincent.core.initialize_notebook()
In [593]:
zip_topo = r'zips_us_topo.json'
state_topo = r'us_states.topo.json'
geo_data = [{'name': 'states',
'url': state_topo,
'feature': 'us_states.geo'},
{'name': 'zip_codes',
'url': zip_topo,
'feature': 'zip_codes_for_the_usa'}]
vis = vincent.Map(data=df_all_final, geo_data=geo_data, scale=800, projection='albersUsa',
data_bind='fit', data_key='zip5',brew='YlOrRd',
map_key={'zip_codes': 'properties.zip'})
del vis.marks[0].properties.update
#del vis.marks[1].properties.update
#vis.marks[1].properties.update.fill.value = '#FFFFFF'
#vis.marks[1].properties.enter.stroke.value = '#CCCCFF'
vis.marks[1].properties.enter.stroke_opacity = ValueRef(value=0.05)
vis.marks[0].properties.enter.stroke.value = '#C0C0C0'
#vis.marks[1].properties.hover.fill.value = 'red'
vis.legend(title='Preferred ZipCode')
vis.display()
vis.to_json("USA_Preferred.json")
In [541]:
zip_topo = r'zips_us_topo.json'
state_topo = r'us_states.topo.json'
geo_data2 = [{'name': 'zip_codes',
'url': zip_topo,
'feature': 'zip_codes_for_the_usa'},
{'name': 'states',
'url': state_topo,
'feature': 'us_states.geo'}]
geo_data = [{'name': 'states',
'url': state_topo,
'feature': 'us_states.geo'},
{'name': 'zip_codes',
'url': zip_topo,
'feature': 'zip_codes_for_the_usa'}]
# The string zip column is ZCTA5 (zip5 is numeric), so filter and key on it.
vis = vincent.Map(data=urban[urban['ZCTA5']=='83211'], geo_data=geo_data, scale=1100, projection='albersUsa',
                  data_bind='POPULATION', data_key='ZCTA5', brew='PuRd',
                  map_key={'zip_codes': 'properties.zip'})
del vis.marks[0].properties.update
#del vis.marks[1].properties.update
vis.marks[1].properties.update.fill.value = '#C390D4'
#vis.marks[1].properties.enter.stroke.value = '#CCCCFF'
#vis.marks[1].properties.enter.stroke_opacity = ValueRef(value=0.1)
vis.marks[0].properties.enter.stroke.value = '#FF0000'
vis.legend(title='POPULATION')
vis.display()
vis.to_json("USA.json")
In [4]:
import csv
with open("/home/russ/Documents/DDL/Projects/03-censusables/source/Data/raw_files/state_landarea_rank.csv") as f:
    f.readline()  # skip the header line
    land_area = dict(csv.reader(f, delimiter=','))  # assumes two columns per row: state, land area
print(land_area)
In [146]:
ziplist = json.loads(df_all_final[['ZCTA5','ZIPName','fit']].head(5).to_json())
ziplist
Out[146]:
In [140]:
ziplist['ZCTA5']['0']
Out[140]:
In [150]:
ziplist = json.loads(df_all_final[['ZCTA5','ZIPName','fit']].sort_values('fit').reset_index().head(5).to_json())
In [151]:
ziplist
Out[151]:
In [153]:
table_data = []
for i in range(5):
    dict_row = {}
    dict_row['index'] = i
    dict_row['ZCTA5'] = ziplist['ZCTA5'][str(i)]
    dict_row['ZIPName'] = ziplist['ZIPName'][str(i)]
    table_data.append(dict_row)
print(table_data)
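The loop reshapes column-oriented JSON into a list of row dicts; pandas can emit that shape directly with orient='records' (a one-call sketch, minus the explicit 'index' key; the variable name is hypothetical):
In [ ]:
# Each row becomes {'ZCTA5': ..., 'ZIPName': ..., 'fit': ...}.
table_data_records = df_all_final[['ZCTA5', 'ZIPName', 'fit']].sort_values('fit').head(5).to_dict(orient='records')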
In [160]:
json_export = json.dumps(table_data)
In [164]:
json_export
Out[164]:
In [170]:
df_all_final.head(1)
Out[170]:
In [179]:
state = "MD"
df_all_final[['ZCTA5','ZIPName','fit']][df_all_final['State']==state]
Out[179]:
In [182]:
state = "VT"
df_all_final[['ZCTA5','ZIPName','fit']][df_all_final['State']==state].sort_values('fit').reset_index().head(5)
Out[182]:
In [ ]: