In [1]:
%matplotlib inline
import pandas as pd
import json
In [2]:
import os

#File locations — every input lives under one base directory, so build the
#paths from a single constant instead of repeating the prefix nine times.
#Change DATA_DIR (and nothing else) when the notebook moves machines.
DATA_DIR = "/home/russ/Documents/DDL/Data/JeffData/PCA"
FDIC_DIR = os.path.join(DATA_DIR, "FDIC_All_Reports_20150630")

acs_file = os.path.join(DATA_DIR, "Jeff_data_acs5yr.csv")
zillow_HVI_file = os.path.join(DATA_DIR, "Zip_Zhvi_AllHomes_HomeValueIndex.csv")
zillow_RI_file = os.path.join(DATA_DIR, "Zip_Zri_AllHomes_RentIndex.csv")
FDIC_deposits_100K_file = os.path.join(FDIC_DIR, "All_Reports_20150630_Deposits Based on the $100,000 Reporting Threshold.csv")
FDIC_deposits_250K_file = os.path.join(FDIC_DIR, "All_Reports_20150630_Deposits Based on the $250,000 Reporting Threshold.csv")
library_file = os.path.join(DATA_DIR, "Public_Libraries_Survey_FY_2013_-_Outlet.csv")
complaints_file = os.path.join(DATA_DIR, "Consumer_Complaints.csv")
urbanization_zip = os.path.join(DATA_DIR, "zcta2010_txt.csv")
In [3]:
acs = pd.read_csv(acs_file)
In [4]:
acs.head()
Out[4]:
In [5]:
# Independent copy of the ACS demographic columns, so edits below never
# touch the original acs frame (.copy() is deep by default).
race_cols = ['zip5', 'pop', 'race_white', 'race_black', 'race_asian', 'race_indian', 'race_other', 'hisp']
diversity = acs[race_cols].copy()
In [6]:
# Share of population that is white AND Hispanic.  The original expression
# multiplied by 'pop' and then divided by it again — a mathematical no-op
# that produced NaN/inf for ZIPs with pop == 0 — so the factor is dropped.
diversity['white_hisp'] = diversity['race_white'] * diversity['hisp']
In [7]:
# Share of population that is white and NOT Hispanic.  As with white_hisp,
# the original multiplied by 'pop' and divided by it again (NaN/inf for
# pop == 0), so the redundant factor is removed.
diversity['white_nonhisp'] = diversity['race_white'] * (1 - diversity['hisp'])
In [8]:
# Diversity index = 1 - sum of squared group shares (a Simpson/Herfindahl-style
# index: higher = more evenly mixed population).
# NOTE(review): 'race_other' is selected above but excluded from this sum —
# confirm that is intentional.
diversity['div_index'] = 1- (diversity['race_black']**2 + diversity['white_hisp']**2 + diversity['white_nonhisp']**2 + diversity['race_asian']**2 + diversity['race_indian']**2)
In [9]:
zillow_HVI = pd.read_csv(zillow_HVI_file)
In [10]:
zillow_RI = pd.read_csv(zillow_RI_file)
In [11]:
urban = pd.read_csv(urbanization_zip)
In [12]:
# Vectorized cast of POPULATION to int — far faster than the row-wise
# apply it replaces, with identical results for plain-digit strings.
# NOTE(review): earlier drafts stripped thousands separators (',') first;
# the active code casts directly, so POPULATION is assumed to contain no
# commas — confirm against the source CSV.
urban['pop'] = urban['POPULATION'].astype(int)
In [13]:
#strip Z from Zip Code Text (e.g. 'Z12345' -> '12345').  The vectorized
#.str accessor replaces the per-row apply: same slice, much faster, and it
#propagates NaN instead of raising on missing values.
urban['ZCTA5'] = urban['ZCTA5'].str[1:]
In [14]:
urban.head()
Out[14]:
In [15]:
# Land area (sq. meters, per LANDSQMT) divided by population — i.e. area per
# person, so LARGER values mean LESS densely populated ZIPs despite the name.
# NOTE(review): pop == 0 yields inf here — confirm downstream handling.
urban['urban_index'] = urban['LANDSQMT']/urban['pop']
In [16]:
# Keep the ZIP column plus the semi-annual (January/July) home-value
# snapshots from 1996-07 through 2015-07, generated instead of hand-typed.
hvi_cols = ['RegionName']
for year in range(1996, 2016):
    for month in (1, 7):
        if (year, month) != (1996, 1):   # series starts at 1996-07
            hvi_cols.append('%d-%02d' % (year, month))
zillow_HVI = zillow_HVI[hvi_cols]
In [17]:
# Rename the Zillow region column to match the ACS join key.  Reassignment
# instead of inplace=True: inplace on a freshly-sliced frame can trigger
# SettingWithCopyWarning and blocks method chaining.
zillow_HVI = zillow_HVI.rename(columns={'RegionName': 'zip5'})
zillow_HVI.head()
Out[17]:
In [18]:
len(zillow_HVI)
Out[18]:
In [19]:
zillow_RI.head(1)
Out[19]:
In [20]:
# Keep ZIP plus the semi-annual rent-index snapshots.  The original
# .copy(False) made a SHALLOW copy that still shared data with the parent
# frame; a true copy decouples the slice so later edits cannot leak back.
zillow_RI = zillow_RI[['RegionName','2011-01','2011-07','2012-01','2012-07',
                       '2013-01','2013-07','2014-01','2014-07','2015-01','2015-07']].copy()
In [21]:
# Rename to the shared join key; reassignment instead of inplace=True
# (consistent with the HVI rename, avoids chained-assignment warnings).
zillow_RI = zillow_RI.rename(columns={'RegionName': 'zip5'})
In [22]:
zillow_RI.head()
Out[22]:
In [23]:
deposits_250K = pd.read_csv(FDIC_deposits_250K_file)
In [24]:
# Dead first draft kept for reference — note its indexing syntax ([..],[..]) was invalid:
#deposits_250K = deposits_250K[['zip'],['IDdepsmb'],['DEPSMRA'],['DEPSMRN'],['NTRCDSMJ'],['IDdeplam'],['IDdeplgb'],['DEPLGRA'],['DEPLGRN'],['NTRTMLGJ']]
# Keep only ZIP plus the FDIC small-deposit (sm*) and large-deposit (lg*)
# amount/count/retirement/time-deposit fields used below.
deposits_250K = deposits_250K[['zip','IDdepsam','IDdepsmb','DEPSMRA','DEPSMRN','NTRCDSMJ','IDdeplam','IDdeplgb','DEPLGRA','DEPLGRN','NTRTMLGJ']]
In [25]:
# Replace the FDIC field codes with readable names.  This is a POSITIONAL
# assignment — the order must exactly match the column selection above.
deposits_250K.columns = ['zip5','dep_amt_low','dep_count_low','retirement_amt_low','retirement_count_low','time_deposits_low','dep_amt_high','dep_count_high','retirement_amt_high','retirement_count_high','time_deposits_high']
In [26]:
deposits_250K.head()
Out[26]:
In [27]:
# Mean large-deposit amount per ZIP code (one row per zip5).
deposits_zip = deposits_250K.groupby('zip5')['dep_amt_high'].mean().reset_index()
In [28]:
deposits_zip.head()
Out[28]:
In [29]:
library = pd.read_csv(library_file)
In [30]:
#Slice field value based on hard coded State
#Next step to substitute ['STABR'] for State text to dynamically find location of each state within address field
#library_zip.apply(lambda x : x['Location'][0:15], axis =1)
#library_zip.apply(lambda x : x['Location'][x['Location'].find(', AK')+5:x['Location'].find(', AK')+10], axis =1)
#Strip Zip Code From Location Column
#library_zip = library[['Location','STABR']]
#library_zip['zip'] = library_zip.apply(lambda x : x['Location'][x['Location'].find(', ' + x['STABR'])+5:x['Location'].find(', ' + x['STABR'])+10], axis =1)
In [31]:
#Parse out Zip Code from Location field: find the LAST occurrence of
#", <STATE>" in the address, skip 5 characters (", XX "), and take the next
#5 characters as the ZIP.
#NOTE(review): if the ", STABR" pattern is absent, rfind returns -1 and the
#slice silently becomes [4:9] — garbage, not an error.  Confirm every
#Location contains the state abbreviation, or validate the result.
library['zip'] = library.apply(lambda x : x['Location'][x['Location'].rfind(', ' + x['STABR'])+5:x['Location'].rfind(', ' + x['STABR'])+10], axis =1)
In [32]:
library.head(2)
Out[32]:
In [33]:
#Change to your local path
library.to_csv("/home/russ/Documents/DDL/Data/JeffData/PCA/Library_ZipCode.csv")
In [34]:
# Number of library outlets per ZIP (each non-null STABR row counts as one).
library_zip = library.groupby('zip')['STABR'].count().reset_index()
In [35]:
# Vectorized cast of the parsed ZIP string to int (replaces row-wise apply).
# NOTE(review): this raises on empty/non-numeric strings left by the
# Location parsing above — confirm the column is clean.
library_zip['zip5'] = library_zip['zip'].astype(int)
In [36]:
library_zip.columns = ['zip','LibraryCount','zip5']
In [37]:
library_zip.head(1)
Out[37]:
In [38]:
# Join ACS income indicators to the latest Zillow home-value snapshot.
# pd.merge defaults to an INNER join, so ZIPs missing from either source drop out.
combined = pd.merge(acs[['zip5','snap','inc_median','poverty']],zillow_HVI[['zip5','2015-07']], on='zip5',copy=False)
In [39]:
combined.head()
Out[39]:
In [40]:
combined[combined['zip5']==90210]
Out[40]:
In [41]:
# Inner-join home-value and rent indexes on ZIP, then give the two
# 2015-07 columns distinct names (positional assignment).
zillow_combined = pd.merge(zillow_HVI[['zip5','2015-07']],zillow_RI[['zip5','2015-07']], on='zip5',copy=False)
In [42]:
zillow_combined.columns = ['zip5','HVI','RI']
In [43]:
#Beginning PCA Analysis (reference: http://sebastianraschka.com/Articles/2015_pca_in_3_steps.html)
# .ix was deprecated and then removed from pandas; positional .iloc is the
# equivalent here: features = columns 1-4, label (zip5) = column 0.
X = combined.iloc[:, 1:5].values
y = combined.iloc[:, 0].values
In [44]:
#Standardization — rescale every feature to zero mean / unit variance so
#PCA is not dominated by the feature with the largest raw scale.
from sklearn.preprocessing import StandardScaler
X_std = StandardScaler().fit_transform(X)
In [45]:
X_std
Out[45]:
In [46]:
#Same PCA analysis using scikit-learn: project the standardized features
#onto the top two principal components.
from sklearn.decomposition import PCA as sklearnPCA
sklearn_pca = sklearnPCA(n_components=2)
Y_sklearn = sklearn_pca.fit_transform(X_std)
In [47]:
Y_sklearn
Out[47]:
In [48]:
# First two principal-component scores as named columns.
df = pd.DataFrame(Y_sklearn[:, :2], columns=['X', 'Y'])
In [49]:
# Attach zip5 by ROW POSITION — both frames carry the same default RangeIndex,
# so an index-on-index merge lines each PC score up with its ZIP.
df = combined[['zip5']].merge(df,left_index=True, right_index=True)
In [50]:
df[df['zip5']==90210]
Out[50]:
In [51]:
# Interpretation of the PC1/PC2 scatter below:
#Top of arc = High Housing, low Income
#Negative = High Housing, High Income
#Positive = Low Housing, Low Income
df.plot(kind='scatter',x='X',y='Y')
Out[51]:
In [52]:
df[df['Y']<-10]
Out[52]:
In [53]:
df[df['X']>8]
Out[53]:
In [54]:
df[(df['X']>-.5) & (df['X']<.5) & (df['Y']<0)]
Out[54]:
In [55]:
df[df['Y']>0]
Out[55]:
In [56]:
#Beginning PCA Analysis (reference: http://sebastianraschka.com/Articles/2015_pca_in_3_steps.html)
# .ix was deprecated/removed from pandas; positional .iloc is the
# equivalent: features = HVI and RI (columns 1-2), label = zip5 (column 0).
X = zillow_combined.iloc[:, 1:3].values
y = zillow_combined.iloc[:, 0].values
In [57]:
#Standardization (zero mean / unit variance per feature).
# NOTE(review): these imports repeat the ones higher up — harmless, but
# notebook convention is a single import cell at the top.
from sklearn.preprocessing import StandardScaler
X_std = StandardScaler().fit_transform(X)
#Same PCA analysis using scikit-learn (two components for HVI vs RI).
from sklearn.decomposition import PCA as sklearnPCA
sklearn_pca = sklearnPCA(n_components=2)
Y_sklearn = sklearn_pca.fit_transform(X_std)
In [58]:
expl_var = sklearn_pca.explained_variance_ratio_
In [59]:
# Bar-chart-ready frame: one row per principal component, value = share of
# variance that component explains.
ev = expl_var.tolist()
pc_labels = ['PC{0}'.format(idx) for idx in range(len(ev))]
df_explainedValue = pd.DataFrame(ev, columns=['Value'], index=pc_labels)
In [60]:
df_explainedValue.plot(kind='bar')
Out[60]:
In [61]:
df = zillow_combined[['zip5']].merge(df,left_index=True, right_index=True)
In [62]:
df.plot(kind='scatter',x='X',y='Y')
Out[62]:
In [63]:
# Income-related ACS columns.  The original .copy(False) was a SHALLOW copy
# that still shared data with acs; a true copy decouples the slice.
income_combined = acs[['zip5','snap','inc_median','poverty']].copy()
In [64]:
#Beginning PCA Analysis (reference: http://sebastianraschka.com/Articles/2015_pca_in_3_steps.html)
# .ix was deprecated/removed from pandas; .iloc is the positional equivalent:
# features = snap/inc_median/poverty (columns 1-3), label = zip5 (column 0).
X = income_combined.iloc[:, 1:4].values
y = income_combined.iloc[:, 0].values
#Standardization (zero mean / unit variance per feature)
from sklearn.preprocessing import StandardScaler
X_std = StandardScaler().fit_transform(X)
#Same PCA analysis using scikit-learn.
#NOTE: n_components=1 here — Y_sklearn has a SINGLE column, which later
#cells must respect.
from sklearn.decomposition import PCA as sklearnPCA
sklearn_pca = sklearnPCA(n_components=1)
Y_sklearn = sklearn_pca.fit_transform(X_std)
expl_var = sklearn_pca.explained_variance_ratio_
ev = expl_var.tolist()
ls = ['PC%s' %i for i in range(0,len(ev))]
df_explainedValue = pd.DataFrame(ev,columns=['Value'],index=ls)
In [65]:
sklearn_pca.explained_variance_ratio_
Out[65]:
In [66]:
df_explainedValue.plot(kind='bar')
Out[66]:
In [67]:
# Bug fix: sklearn_pca above was fit with n_components=1, so Y_sklearn has a
# single column and the original Y_sklearn[:, 1] raised IndexError.  Plot the
# lone principal-component score against ZIP instead.
df = pd.DataFrame({'X': Y_sklearn[:, 0]})
df = income_combined[['zip5']].merge(df, left_index=True, right_index=True)
df.plot(kind='scatter', x='zip5', y='X')
In [68]:
# Single PCA score per ZIP, labeled as an income-level index.
df = pd.DataFrame({'Income_Level':Y_sklearn[:,0]})
df = income_combined[['zip5']].merge(df,left_index=True, right_index=True)
# Bug fix: the score column is named 'Income_Level', not 'X' — the original
# y='X' raised KeyError.
df.plot(kind='scatter',x='zip5',y='Income_Level')
In [69]:
# DataFrame.sort was deprecated in pandas 0.17 and removed in 0.20;
# sort_values is the modern equivalent (same ordering, same output).
df.sort_values('Income_Level', ascending=False)
Out[69]:
In [70]:
df[df['zip5']== 10001]
Out[70]:
In [71]:
# Rename ZCTA5 to the join key used by the map cells below; reassignment
# instead of inplace=True (consistent with the other renames).
urban = urban.rename(columns={'ZCTA5': 'zip'})
In [72]:
import vincent
from vincent import AxisProperties, PropertySet, ValueRef
vincent.core.initialize_notebook()
In [73]:
# Choropleth of POPULATION by ZIP over a US state outline, rendered with
# vincent/Vega.  States are drawn first so ZIP polygons sit on top.
zip_topo = r'zips_us_topo.json'
state_topo = r'us_states.topo.json'
# NOTE(review): geo_data2 (ZIPs-first layer order) is built but never used.
geo_data2 = [{'name': 'zip_codes',
'url': zip_topo,
'feature': 'zip_codes_for_the_usa'},
{'name': 'states',
'url': state_topo,
'feature': 'us_states.geo'}]
geo_data = [{'name': 'states',
'url': state_topo,
'feature': 'us_states.geo'},
{'name': 'zip_codes',
'url': zip_topo,
'feature': 'zip_codes_for_the_usa'}]
vis = vincent.Map(data=urban, geo_data=geo_data, scale=1100, projection='albersUsa',
data_bind='POPULATION', data_key='zip',brew='PuRd',
map_key={'zip_codes': 'properties.zip'})
# Drop the update properties of the first mark (the state layer) so it keeps
# its enter styling; the commented lines are styling experiments kept for reference.
del vis.marks[0].properties.update
#del vis.marks[1].properties.update
#vis.marks[1].properties.update.fill.value = '#FFFFFF'
#vis.marks[1].properties.enter.stroke.value = '#CCCCFF'
# Nearly invisible ZIP borders; red state outlines.
vis.marks[1].properties.enter.stroke_opacity = ValueRef(value=0.1)
vis.marks[0].properties.enter.stroke.value = '#FF0000'
#vis.marks[1].properties.hover.fill.value = 'red'
vis.legend(title='POPULATION')
vis.display()
vis.to_json("USA_population.json")
In [195]:
# Same map as above but data-bound to a single ZIP (83211) to check that one
# polygon highlights correctly.
zip_topo = r'zips_us_topo.json'
state_topo = r'us_states.topo.json'
# NOTE(review): geo_data2 is again built but unused.
geo_data2 = [{'name': 'zip_codes',
'url': zip_topo,
'feature': 'zip_codes_for_the_usa'},
{'name': 'states',
'url': state_topo,
'feature': 'us_states.geo'}]
geo_data = [{'name': 'states',
'url': state_topo,
'feature': 'us_states.geo'},
{'name': 'zip_codes',
'url': zip_topo,
'feature': 'zip_codes_for_the_usa'}]
vis = vincent.Map(data=urban[urban['zip']== '83211'], geo_data=geo_data, scale=1100, projection='albersUsa',
data_bind='POPULATION', data_key='zip',brew='PuRd',
map_key={'zip_codes': 'properties.zip'})
del vis.marks[0].properties.update
#del vis.marks[1].properties.update
# Flat purple fill for all ZIP polygons; red state outlines.
vis.marks[1].properties.update.fill.value = '#C390D4'
#vis.marks[1].properties.enter.stroke.value = '#CCCCFF'
#vis.marks[1].properties.enter.stroke_opacity = ValueRef(value=0.1)
vis.marks[0].properties.enter.stroke.value = '#FF0000'
vis.legend(title='POPULATION')
vis.display()
vis.to_json("USA.json")
In [ ]:
In [253]:
# Render the Maryland topojson on its own (default projection/scale);
# the commented lines are styling/scaling experiments kept for reference.
zip_topo = r'topo_files/Maryland.topo.json'
state_topo = r'us_states.topo.json'
geo_data = [{'name': 'Maryland',
'url': zip_topo,
'feature': 'Maryland.geo'}]
vis = vincent.Map(geo_data=geo_data)
del vis.marks[0].properties.update
#del vis.marks[1].properties.update
#vis.marks[1].properties.update.fill.value = '#C390D4'
#vis.marks[1].properties.enter.stroke.value = '#CCCCFF'
#vis.marks[1].properties.enter.stroke_opacity = ValueRef(value=0.1)
#vis.marks[0].properties.enter.stroke.value = '#FF0000'
#vis.legend(title='POPULATION')
#vis.marks[0].properties.enter.scale=500
#vis.data[0].transform[0].projection[0].scale = 1000
#vis.marks[0].properties.hover.text.value = 'help'
#vis.marks[0].properties.update.fill.rule.predicate.value = "red"
vis.display()
# Dump the Vega grammar for inspection, then save the spec to disk.
vis.grammar()
vis.to_json('maryland_scaled.json')
In [252]:
# Oregon counties topojson with an equirectangular projection.
# NOTE(review): the geo_data layer is still labeled 'Maryland' — cosmetic
# leftover from the previous cell.
zip_topo = r'topo_files/or_counties.topo_copy.json'
state_topo = r'us_states.topo.json'
geo_data = [{'name': 'Maryland',
'url': zip_topo,
'feature': 'or_counties.geo'}]
vis = vincent.Map(geo_data=geo_data, projection='equirectangular')
del vis.marks[0].properties.update
#del vis.marks[1].properties.update
#vis.marks[1].properties.update.fill.value = '#C390D4'
#vis.marks[1].properties.enter.stroke.value = '#CCCCFF'
#vis.marks[1].properties.enter.stroke_opacity = ValueRef(value=0.1)
#vis.marks[0].properties.enter.stroke.value = '#FF0000'
#vis.legend(title='POPULATION')
#vis.marks[0].properties.enter.scale=500
#vis.data[0].transform[0].projection[0].scale = 1000
#vis.marks[0].properties.hover.text.value = 'help'
#vis.marks[0].properties.update.fill.rule.predicate.value = "red"
vis.display()
vis.grammar()
#vis.to_json('maryland.json')
Out[252]:
In [250]:
0.0010128706393299272*10
Out[250]:
In [251]:
0.0005370434828103437*10
Out[251]:
In [256]:
# Same Oregon counties map, experimenting with the projection argument.
# NOTE(review): projection="" passes an empty projection name to Vega —
# confirm this is intentional and not a placeholder.
zip_topo = r'topo_files/or_counties.topo_copy.json'
state_topo = r'us_states.topo.json'
geo_data = [{'name': 'Maryland',
'url': zip_topo,
'feature': 'or_counties.geo'}]
vis = vincent.Map(geo_data=geo_data, projection="")
del vis.marks[0].properties.update
#del vis.marks[1].properties.update
#vis.marks[1].properties.update.fill.value = '#C390D4'
#vis.marks[1].properties.enter.stroke.value = '#CCCCFF'
#vis.marks[1].properties.enter.stroke_opacity = ValueRef(value=0.1)
#vis.marks[0].properties.enter.stroke.value = '#FF0000'
#vis.legend(title='POPULATION')
#vis.marks[0].properties.enter.scale=500
#vis.data[0].transform[0].projection[0].scale = 1000
#vis.marks[0].properties.hover.text.value = 'help'
#vis.marks[0].properties.update.fill.rule.predicate.value = "red"
vis.display()
vis.grammar()
#vis.to_json('maryland.json')
Out[256]:
In [270]:
# Maryland again, zoomed in via a large scale with an equirectangular projection.
zip_topo = r'topo_files/Maryland.topo.json'
geo_data = [{'name': 'Maryland',
'url': zip_topo,
'feature': 'Maryland.geo'}]
vis = vincent.Map(geo_data=geo_data,scale=8000,projection='equirectangular')
vis.display()
In [ ]: