In [2]:
%matplotlib inline
import pandas as pd
import json
In [5]:
#File locations -- every input lives under one data directory; edit DATA_DIR to relocate.
DATA_DIR = "/home/russ/Documents/DDL/Data/JeffData/PCA"
acs_file = DATA_DIR + "/Jeff_data_acs5yr.csv"
zillow_HVI_file = DATA_DIR + "/Zip_Zhvi_AllHomes_HomeValueIndex.csv"
zillow_RI_file = DATA_DIR + "/Zip_Zri_AllHomes_RentIndex.csv"
FDIC_deposits_100K_file = DATA_DIR + "/FDIC_All_Reports_20150630/All_Reports_20150630_Deposits Based on the $100,000 Reporting Threshold.csv"
FDIC_deposits_250K_file = DATA_DIR + "/FDIC_All_Reports_20150630/All_Reports_20150630_Deposits Based on the $250,000 Reporting Threshold.csv"
library_file = DATA_DIR + "/Public_Libraries_Survey_FY_2013_-_Outlet.csv"
complaints_file = DATA_DIR + "/Consumer_Complaints.csv"
urbanization_zip = DATA_DIR + "/zcta2010_txt.csv"
In [6]:
# Load the 5-year ACS extract; downstream cells treat it as one row per zip code.
acs = pd.read_csv(acs_file)
In [7]:
acs.head()
Out[7]:
In [8]:
# Deep copy so the derived diversity columns never mutate the original acs frame.
diversity = acs[['zip5','pop','race_white','race_black','race_asian','race_indian','race_other','hisp']].copy(deep=True)
In [9]:
# Hispanic-white share. The pop terms cancel algebraically, so this equals
# race_white*hisp -- except rows with pop == 0 become NaN (0/0).
# Assumes race_* and hisp are population fractions rather than counts -- TODO confirm.
diversity['white_hisp'] = ((diversity['pop']*diversity['race_white'])*diversity['hisp'])/diversity['pop']
In [10]:
# Non-Hispanic-white share, same construction as above.
diversity['white_nonhisp'] = ((diversity['pop']*diversity['race_white'])*(1-diversity['hisp']))/diversity['pop']
In [11]:
# Simpson/Herfindahl-style diversity index: 1 minus the sum of squared group shares.
# NOTE(review): race_other is selected above but not included in this sum --
# confirm whether that omission is intentional.
diversity['div_index'] = 1- (diversity['race_black']**2 + diversity['white_hisp']**2 + diversity['white_nonhisp']**2 + diversity['race_asian']**2 + diversity['race_indian']**2)
In [12]:
# Spot-check a single zip code.
diversity[diversity['zip5']==21093]
Out[12]:
Out[12]:
In [13]:
# Zillow Home Value Index: one row per zip with monthly 'YYYY-MM' value columns.
zillow_HVI = pd.read_csv(zillow_HVI_file)
In [14]:
# Zillow Rent Index, same layout as the HVI file.
zillow_RI = pd.read_csv(zillow_RI_file)
In [15]:
# 2010 ZCTA file with POPULATION, ZCTA5 and land-area fields.
urban = pd.read_csv(urbanization_zip)
In [16]:
#del urban['pop']
# Inspect dtypes before the POPULATION / ZCTA5 conversions below.
urban.dtypes
Out[16]:
Out[16]:
In [17]:
#urban['pop'] = urban.apply(lambda x: int(x['POPULATION'].replace(',','')),axis=1)
#alternate
#urban['pop'] = urban['POPULATION'].apply(lambda x: int(x.replace(',','')))
urban['pop'] = urban.apply(lambda x: int(x['POPULATION']),axis=1)
In [18]:
#strip Z from Zip Code Text
urban['ZCTA5'] = urban.apply(lambda x: x['ZCTA5'][1:],axis=1)
In [19]:
urban.head()
Out[19]:
In [20]:
# Land area per person: larger values mean more land per resident (more rural).
# NOTE(review): rows with pop == 0 produce inf here; units of LANDSQMT are not
# shown in this file -- verify against the ZCTA source documentation.
urban['urban_index'] = urban['LANDSQMT']/urban['pop']
In [21]:
# Keep the zip identifier plus the semiannual (January/July) home-value columns
# from 1996-07 through 2015-07, generated instead of listed out by hand.
hvi_cols = ['RegionName'] + [
    '%d-%s' % (year, month)
    for year in range(1996, 2016)
    for month in ('01', '07')
    if not (year == 1996 and month == '01')   # series starts at 1996-07
]
zillow_HVI = zillow_HVI[hvi_cols]
In [22]:
# Align the zip-code column name with the other data sets (assignment form,
# no in-place mutation).
zillow_HVI = zillow_HVI.rename(columns={'RegionName': 'zip5'})
zillow_HVI.head()
Out[22]:
In [23]:
# Row count of the trimmed home-value frame.
len(zillow_HVI)
Out[23]:
In [24]:
# Peek at the rent-index layout before trimming its columns.
zillow_RI.head(1)
Out[24]:
In [25]:
# Keep the zip identifier plus the semiannual rent-index columns (the rent
# series only goes back to 2011). Shallow copy, as in the original.
ri_cols = ['RegionName',
           '2011-01', '2011-07', '2012-01', '2012-07', '2013-01',
           '2013-07', '2014-01', '2014-07', '2015-01', '2015-07']
zillow_RI = zillow_RI[ri_cols].copy(deep=False)
In [26]:
# Align the rent-index zip column name with the other data sets.
zillow_RI.rename(columns={'RegionName':'zip5'},inplace=True)
In [27]:
zillow_RI.head()
Out[27]:
In [28]:
# FDIC deposit report, June 2015, $250K reporting threshold.
deposits_250K = pd.read_csv(FDIC_deposits_250K_file)
In [29]:
#deposits_250K = deposits_250K[['zip'],['IDdepsmb'],['DEPSMRA'],['DEPSMRN'],['NTRCDSMJ'],['IDdeplam'],['IDdeplgb'],['DEPLGRA'],['DEPLGRN'],['NTRTMLGJ']]
# Keep zip plus the deposit-code columns. The FDIC field codes presumably map to
# amount/count/retirement/time-deposit figures below ("small") and above
# ("large") the threshold -- verify against the FDIC data dictionary.
deposits_250K = deposits_250K[['zip','IDdepsam','IDdepsmb','DEPSMRA','DEPSMRN','NTRCDSMJ','IDdeplam','IDdeplgb','DEPLGRA','DEPLGRN','NTRTMLGJ']]
In [30]:
# Position-based rename to readable names; order must match the selection above.
deposits_250K.columns = ['zip5','dep_amt_low','dep_count_low','retirement_amt_low','retirement_count_low','time_deposits_low','dep_amt_high','dep_count_high','retirement_amt_high','retirement_count_high','time_deposits_high']
In [31]:
deposits_250K.head()
Out[31]:
In [32]:
# Mean high-bucket deposit amount per zip code (grouping on the frame rather
# than Series-by-Series; the result has columns ['zip5', 'dep_amt_high']).
deposits_zip = deposits_250K.groupby('zip5')['dep_amt_high'].mean().reset_index()
In [33]:
deposits_zip.head()
Out[33]:
In [34]:
# IMLS public-library outlet survey (FY2013); one row per library outlet.
library = pd.read_csv(library_file)
In [35]:
#Slice field value based on hard coded State
#Next step to substitute ['STABR'] for State text to dynamically find location of each state within address field
#library_zip.apply(lambda x : x['Location'][0:15], axis =1)
#library_zip.apply(lambda x : x['Location'][x['Location'].find(', AK')+5:x['Location'].find(', AK')+10], axis =1)
#Strip Zip Code From Location Column
#library_zip = library[['Location','STABR']]
#library_zip['zip'] = library_zip.apply(lambda x : x['Location'][x['Location'].find(', ' + x['STABR'])+5:x['Location'].find(', ' + x['STABR'])+10], axis =1)
In [36]:
#Parse out Zip Code from Location field
# Takes the 5 characters following the LAST occurrence of ', <STATE>' in the
# Location string (rfind skips earlier occurrences of the state code that may
# appear inside the street address).
# NOTE(review): if ', <STATE>' is absent, rfind returns -1 and this silently
# slices characters 4:9 of the address instead -- confirm every Location matches.
library['zip'] = library.apply(lambda x : x['Location'][x['Location'].rfind(', ' + x['STABR'])+5:x['Location'].rfind(', ' + x['STABR'])+10], axis =1)
In [37]:
library.head(2)
Out[37]:
In [38]:
#Change to your local path
library.to_csv("/home/russ/Documents/DDL/Data/JeffData/PCA/Library_ZipCode.csv")
In [39]:
# Outlets per zip: count rows (via the always-present STABR column) for each
# parsed zip string. Result columns: ['zip', 'STABR'].
library_zip = library.groupby('zip')['STABR'].count().reset_index()
In [40]:
library_zip['zip5'] = library_zip.apply(lambda x: int(x['zip']),axis=1)
In [41]:
# Position-based rename: zip (string), LibraryCount (outlets per zip),
# zip5 (integer join key).
library_zip.columns = ['zip','LibraryCount','zip5']
In [42]:
library_zip.head(1)
Out[42]:
In [43]:
# Bug fix: `zillow` was never defined in this notebook -- the frame with the
# '2015-07' home-value column is zillow_HVI (the same selection the
# zillow_combined cell uses).
combined = pd.merge(acs[['zip5','snap','inc_median','poverty']],zillow_HVI[['zip5','2015-07']], on='zip5',copy=False)
In [ ]:
combined.head()
In [ ]:
combined[combined['zip5']==90210]
In [ ]:
# Inner-join home values and rents on zip. Both inputs contribute a '2015-07'
# column, so the merge suffixes them (_x from HVI, _y from RI) before the
# positional rename below.
zillow_combined = pd.merge(zillow_HVI[['zip5','2015-07']],zillow_RI[['zip5','2015-07']], on='zip5',copy=False)
In [ ]:
# Position-based rename of the suffixed value columns to HVI / RI.
zillow_combined.columns = ['zip5','HVI','RI']
In [ ]:
#Beginning PCA Analysis (reference: http://sebastianraschka.com/Articles/2015_pca_in_3_steps.html)
X = combined.ix[:,1:5].values
y = combined.ix[:,0].values
In [44]:
#Standardization
# Scale each feature to zero mean / unit variance so no single feature's scale
# dominates the principal components.
from sklearn.preprocessing import StandardScaler
X_std = StandardScaler().fit_transform(X)
In [ ]:
X_std
In [ ]:
#Same PCA analysis using scikit-learn
from sklearn.decomposition import PCA as sklearnPCA
# Project the standardized features onto the first two principal components.
sklearn_pca = sklearnPCA(n_components=2)
Y_sklearn = sklearn_pca.fit_transform(X_std)
In [ ]:
Y_sklearn
In [ ]:
# Two PCA coordinates per row, positionally aligned with `combined`.
df = pd.DataFrame({'X':Y_sklearn[:,0],'Y':Y_sklearn[:,1]})
In [72]:
# Attach zip5 by row index (valid because df rows are positionally aligned
# with `combined` from the PCA projection above).
df = combined[['zip5']].merge(df,left_index=True, right_index=True)
In [73]:
df[df['zip5']==90210]
Out[73]:
In [74]:
#Top of arc = High Housing, low Income
#Negative = High Housing, High Income
#Positive = Low Housing, Low Income
df.plot(kind='scatter',x='X',y='Y')
Out[74]:
In [77]:
# Inspect outliers in PCA space (extreme PC2, extreme PC1, points near the
# origin), then look up the raw features for a few zips of interest.
df[df['Y']<-10]
Out[77]:
In [79]:
df[df['X']>8]
Out[79]:
In [94]:
df[(df['X']>-.5) & (df['X']<.5) & (df['Y']<0)]
Out[94]:
In [80]:
df[df['Y']>0]
Out[80]:
In [84]:
combined[combined['zip5']== 28202]
Out[84]:
In [83]:
combined[combined['zip5']== 43240]
Out[83]:
In [82]:
combined[combined['zip5']== 90210]
Out[82]:
In [81]:
combined[combined['zip5']== 99901]
Out[81]:
In [100]:
combined[combined['zip5']== 58420]
Out[100]:
In [85]:
df[df['zip5']== 90210]
Out[85]:
Out[85]:
In [37]:
traces = []
# Renamed from `list`, which shadowed the Python builtin of the same name.
zip_codes = combined['zip5'].values.tolist()
zip_subset = zip_codes[0:200]
# NOTE(review): this appends the STRING '90210' while zip5 values appear to be
# numeric, so the y == name comparison below will never match it -- confirm the
# intended type before relying on this trace.
zip_subset.append('90210')
# One scatter trace per zip code so each zip gets its own legend entry/color.
# (Scatter/Marker/Line/Data/Layout/XAxis/YAxis/Figure and `py` are presumably
# plotly graph objects imported in a cell not shown here -- verify.)
for name in zip_subset:
    trace = Scatter(
        x=Y_sklearn[y == name, 0],
        y=Y_sklearn[y == name, 1],
        mode='markers',
        name=name,
        marker=Marker(
            size=12,
            line=Line(
                color='rgba(217, 217, 217, 0.14)',
                width=0.5),
            opacity=0.8))
    traces.append(trace)
data = Data(traces)
layout = Layout(xaxis=XAxis(title='PC1', showline=False),
                yaxis=YAxis(title='PC2', showline=False))
fig = Figure(data=data, layout=layout)
py.iplot(fig)
Out[37]:
In [38]:
#Beginning PCA Analysis (reference: http://sebastianraschka.com/Articles/2015_pca_in_3_steps.html)
X = zillow_combined.ix[:,1:3].values
y = zillow_combined.ix[:,0].values
In [40]:
#Standardization
# Repeat the standardize + 2-component PCA pipeline for the HVI/RI features.
from sklearn.preprocessing import StandardScaler
X_std = StandardScaler().fit_transform(X)
#Same PCA analysis using scikit-learn
from sklearn.decomposition import PCA as sklearnPCA
sklearn_pca = sklearnPCA(n_components=2)
Y_sklearn = sklearn_pca.fit_transform(X_std)
In [ ]:
# Fraction of total variance captured by each principal component.
expl_var = sklearn_pca.explained_variance_ratio_
In [64]:
ev = expl_var.tolist()
# Labels start at PC0 because the range begins at 0 -- conventionally these
# would be PC1, PC2.
ls = ['PC%s' %i for i in range(0,len(ev))]
df_explainedValue = pd.DataFrame(ev,columns=['Value'],index=ls)
In [82]:
Out[82]:
In [83]:
# Bar chart of explained variance per component.
df_explainedValue.plot(kind='bar')
Out[83]:
In [43]:
# NOTE(review): `df` here is whatever an earlier cell left behind (the execution
# counts are non-sequential) and is joined purely by row index -- confirm its
# rows actually align with zillow_combined before trusting this merge.
df = zillow_combined[['zip5']].merge(df,left_index=True, right_index=True)
In [44]:
df.plot(kind='scatter',x='X',y='Y')
Out[44]:
In [84]:
# Income-related ACS features; shallow copy of the selected columns
# (explicit deep=False rather than the positional False).
income_combined = acs[['zip5', 'snap', 'inc_median', 'poverty']].copy(deep=False)
In [105]:
#Beginning PCA Analysis (reference: http://sebastianraschka.com/Articles/2015_pca_in_3_steps.html)
# .ix was deprecated and removed from pandas; .iloc gives the same
# position-based slicing used here.
X = income_combined.iloc[:,1:4].values   # feature matrix: snap, inc_median, poverty
y = income_combined.iloc[:,0].values     # zip5 label for each row
#Standardization
from sklearn.preprocessing import StandardScaler
X_std = StandardScaler().fit_transform(X)
#Same PCA analysis using scikit-learn
from sklearn.decomposition import PCA as sklearnPCA
# NOTE(review): n_components=1 means Y_sklearn has a single column -- any cell
# that still reads Y_sklearn[:,1] will raise IndexError; confirm intent.
sklearn_pca = sklearnPCA(n_components=1)
Y_sklearn = sklearn_pca.fit_transform(X_std)
# Fraction of total variance captured by the single retained component.
expl_var = sklearn_pca.explained_variance_ratio_
ev = expl_var.tolist()
ls = ['PC%s' %i for i in range(0,len(ev))]   # labels start at PC0
df_explainedValue = pd.DataFrame(ev,columns=['Value'],index=ls)
In [106]:
# Single-component fit, so this is a one-element ratio array.
sklearn_pca.explained_variance_ratio_
Out[106]:
In [107]:
df_explainedValue.plot(kind='bar')
Out[107]:
In [108]:
# NOTE(review): the PCA above was fit with n_components=1, so Y_sklearn has a
# single column and Y_sklearn[:,1] raises IndexError on a fresh run -- this cell
# (no execution count survives) looks stale, superseded by the Income_Level
# cell below; confirm before re-running.
df = pd.DataFrame({'X':Y_sklearn[:,0],'Y':Y_sklearn[:,1]})
df = income_combined[['zip5']].merge(df,left_index=True, right_index=True)
df.plot(kind='scatter',x='X',y='Y')
In [110]:
# Single PCA coordinate per row, labelled Income_Level and joined to zip5 by
# row index (positionally aligned with income_combined).
df = pd.DataFrame({'Income_Level':Y_sklearn[:,0]})
df = income_combined[['zip5']].merge(df,left_index=True, right_index=True)
# Bug fix: the frame has no 'X' column -- the PCA coordinate was named
# 'Income_Level' above, so plot that against zip5.
df.plot(kind='scatter',x='zip5',y='Income_Level')
Out[110]:
In [112]:
df.sort(['Income_Level'], ascending=[False])
Out[112]:
In [114]:
df[df['zip5']== 10001]
Out[114]:
In [75]:
# Rename the zip-code column to match the map cells' 'zip' join key
# (assignment form instead of in-place mutation).
urban = urban.rename(columns={'ZCTA5': 'zip'})
In [46]:
# Choropleth of zip-level POPULATION over a US state/zip topojson base, drawn
# with the vincent (Vega) library.
import vincent
from vincent import AxisProperties, PropertySet, ValueRef
vincent.core.initialize_notebook()
# Topojson sources, expected in the notebook's working directory.
zip_topo = r'zips_us_topo.json'
state_topo = r'us_states.topo.json'
# Alternate layer ordering (zips under states) -- defined but unused below.
geo_data2 = [{'name': 'zip_codes',
'url': zip_topo,
'feature': 'zip_codes_for_the_usa'},
{'name': 'states',
'url': state_topo,
'feature': 'us_states.geo'}]
# Layer order actually used: states first (marks[0]), zip codes on top (marks[1]).
geo_data = [{'name': 'states',
'url': state_topo,
'feature': 'us_states.geo'},
{'name': 'zip_codes',
'url': zip_topo,
'feature': 'zip_codes_for_the_usa'}]
# Bind POPULATION keyed on the 'zip' column (requires the ZCTA5 -> zip rename above).
vis = vincent.Map(data=urban, geo_data=geo_data, scale=1100, projection='albersUsa',
data_bind='POPULATION', data_key='zip',brew='PuRd',
map_key={'zip_codes': 'properties.zip'})
# Drop the update properties on the state layer -- presumably so states keep
# their enter styling; verify against vincent.Map's mark defaults.
del vis.marks[0].properties.update
#del vis.marks[1].properties.update
#vis.marks[1].properties.update.fill.value = '#FFFFFF'
#vis.marks[1].properties.enter.stroke.value = '#CCCCFF'
# Faint zip borders, red state outlines, highlight fill on hover.
vis.marks[1].properties.enter.stroke_opacity = ValueRef(value=0.1)
vis.marks[0].properties.enter.stroke.value = '#FF0000'
vis.marks[1].properties.enter.hover.fill.value = '#f84525'
vis.legend(title='POPULATION')
vis.display()
In [122]:
# Same map as above, but the bound data is filtered to a single zip code
# ('83211') -- effectively a highlight map for that one zip.
zip_topo = r'zips_us_topo.json'
state_topo = r'us_states.topo.json'
# Alternate layer ordering -- defined but unused below.
geo_data2 = [{'name': 'zip_codes',
'url': zip_topo,
'feature': 'zip_codes_for_the_usa'},
{'name': 'states',
'url': state_topo,
'feature': 'us_states.geo'}]
geo_data = [{'name': 'states',
'url': state_topo,
'feature': 'us_states.geo'},
{'name': 'zip_codes',
'url': zip_topo,
'feature': 'zip_codes_for_the_usa'}]
# NOTE(review): binding POPULATION for one zip leaves a single-value color
# scale; the fixed update fill below is what actually colors the zip.
vis = vincent.Map(data=urban[urban['zip']== '83211'], geo_data=geo_data, scale=1100, projection='albersUsa',
data_bind='POPULATION', data_key='zip',brew='PuRd',
map_key={'zip_codes': 'properties.zip'})
# Drop the state layer's update properties (as in the map above).
del vis.marks[0].properties.update
#del vis.marks[1].properties.update
# Fixed fill for the zip layer's update state.
vis.marks[1].properties.update.fill.value = '#C390D4'
#vis.marks[1].properties.enter.stroke.value = '#CCCCFF'
#vis.marks[1].properties.enter.stroke_opacity = ValueRef(value=0.1)
vis.marks[0].properties.enter.stroke.value = '#FF0000'
vis.legend(title='POPULATION')
vis.display()
In [ ]: