In [1]:
%matplotlib inline
import pandas as pd
import json
In [2]:
#File locations
acs_file = "/home/russ/Documents/DDL/Data/JeffData/PCA/Jeff_data_acs5yr.csv"
zillow_HVI_file = "/home/russ/Documents/DDL/Data/JeffData/PCA/Zip_Zhvi_AllHomes_HomeValueIndex.csv"
FDIC_deposits_100K_file = "/home/russ/Documents/DDL/Data/JeffData/PCA/FDIC_All_Reports_20150630/All_Reports_20150630_Deposits Based on the $100,000 Reporting Threshold.csv"
FDIC_deposits_250K_file = "/home/russ/Documents/DDL/Data/JeffData/PCA/FDIC_All_Reports_20150630/All_Reports_20150630_Deposits Based on the $250,000 Reporting Threshold.csv"
library_file = "/home/russ/Documents/DDL/Data/JeffData/PCA/Public_Libraries_Survey_FY_2013_-_Outlet.csv"
complaints_file = "/home/russ/Documents/DDL/Data/JeffData/PCA/Consumer_Complaints.csv"
In [3]:
acs = pd.read_csv(acs_file)
In [4]:
acs.head()
Out[4]:
In [60]:
zillow = pd.read_csv(zillow_HVI_file)
In [61]:
zillow = zillow[['RegionName','1996-07','1997-01','1997-07','1998-01','1998-07','1999-01','1999-07','2000-01','2000-07'\
,'2001-01','2001-07','2002-01','2002-07','2003-01','2003-07','2004-01','2004-07','2005-01','2005-07','2006-01','2006-07'\
,'2007-01','2007-07','2008-01','2008-07','2009-01','2009-07','2010-01','2010-07','2011-01','2011-07','2012-01','2012-07'\
,'2013-01','2013-07','2014-01','2014-07','2015-01','2015-07']]
In [62]:
zillow.rename(columns={'RegionName':'zip5'},inplace=True)
zillow.head()
Out[62]:
In [8]:
deposits_250K = pd.read_csv(FDIC_deposits_250K_file)
In [9]:
#deposits_250K = deposits_250K[['zip'],['IDdepsmb'],['DEPSMRA'],['DEPSMRN'],['NTRCDSMJ'],['IDdeplam'],['IDdeplgb'],['DEPLGRA'],['DEPLGRN'],['NTRTMLGJ']]
deposits_250K = deposits_250K[['zip','IDdepsam','IDdepsmb','DEPSMRA','DEPSMRN','NTRCDSMJ','IDdeplam','IDdeplgb','DEPLGRA','DEPLGRN','NTRTMLGJ']]
In [10]:
deposits_250K.columns = ['zip5','dep_amt_low','dep_count_low','retirement_amt_low','retirement_count_low','time_deposits_low','dep_amt_high','dep_count_high','retirement_amt_high','retirement_count_high','time_deposits_high']
In [11]:
deposits_250K.head()
Out[11]:
In [12]:
deposits_zip = deposits_250K['dep_amt_high'].groupby(deposits_250K['zip5']).mean().reset_index()
In [13]:
deposits_zip.head()
Out[13]:
In [14]:
library = pd.read_csv(library_file)
In [14]:
#Slice field value based on hard coded State
#Next step to substitute ['STABR'] for State text to dynamically find location of each state within address field
#library_zip.apply(lambda x : x['Location'][0:15], axis =1)
#library_zip.apply(lambda x : x['Location'][x['Location'].find(', AK')+5:x['Location'].find(', AK')+10], axis =1)
#Strip Zip Code From Location Column
#library_zip = library[['Location','STABR']]
#library_zip['zip'] = library_zip.apply(lambda x : x['Location'][x['Location'].find(', ' + x['STABR'])+5:x['Location'].find(', ' + x['STABR'])+10], axis =1)
In [15]:
#Parse out Zip Code from Location field
library['zip'] = library.apply(lambda x : x['Location'][x['Location'].rfind(', ' + x['STABR'])+5:x['Location'].rfind(', ' + x['STABR'])+10], axis =1)
In [16]:
library.head(2)
Out[16]:
In [9]:
#Change to your local path
library.to_csv("/home/russ/Documents/DDL/Data/JeffData/PCA/Library_ZipCode.csv")
In [17]:
library_zip = library['STABR'].groupby(library['zip']).count().reset_index()
In [18]:
library_zip['zip5'] = library_zip.apply(lambda x: int(x['zip']),axis=1)
In [19]:
library_zip.columns = ['zip','LibraryCount','zip5']
In [20]:
library_zip.head(10)
Out[20]:
In [63]:
combined = pd.merge(acs[['zip5','snap','inc_median','poverty']],zillow[['zip5','2015-07']], on='zip5',copy=False)
In [64]:
combined.head()
Out[64]:
In [65]:
combined[combined['zip5']==90210]
Out[65]:
In [66]:
#Beginning PCA Analysis (reference: http://sebastianraschka.com/Articles/2015_pca_in_3_steps.html)
X = combined.ix[:,1:5].values
y = combined.ix[:,0].values
In [67]:
#Standardization
from sklearn.preprocessing import StandardScaler
X_std = StandardScaler().fit_transform(X)
In [68]:
X_std
Out[68]:
In [69]:
#Same PCA analysis using scikit-learn
from sklearn.decomposition import PCA as sklearnPCA
sklearn_pca = sklearnPCA(n_components=2)
Y_sklearn = sklearn_pca.fit_transform(X_std)
In [70]:
Y_sklearn
Out[70]:
In [71]:
df = pd.DataFrame({'X':Y_sklearn[:,0],'Y':Y_sklearn[:,1]})
In [72]:
df = combined[['zip5']].merge(df,left_index=True, right_index=True)
In [73]:
df[df['zip5']==90210]
Out[73]:
In [74]:
#Top of arc = High Housing, low Income
#Negative = High Housing, High Income
#Positive = Low Housing, Low Income
df.plot(kind='scatter',x='X',y='Y')
Out[74]:
In [77]:
df[df['Y']<-10]
Out[77]:
In [79]:
df[df['X']>8]
Out[79]:
In [94]:
df[(df['X']>-.5) & (df['X']<.5) & (df['Y']<0)]
Out[94]:
In [80]:
df[df['Y']>0]
Out[80]:
In [84]:
combined[combined['zip5']== 28202]
Out[84]:
In [83]:
combined[combined['zip5']== 43240]
Out[83]:
In [82]:
combined[combined['zip5']== 90210]
Out[82]:
In [81]:
combined[combined['zip5']== 99901]
Out[81]:
In [100]:
combined[combined['zip5']== 58420]
Out[100]:
In [85]:
df[df['zip5']== 90210]
Out[85]:
In [37]:
traces = []
list = combined['zip5'].values.tolist()
list_short = list[0:200]
list_short.append('90210')
for name in (list_short):
trace = Scatter(
x=Y_sklearn[y==name,0],
y=Y_sklearn[y==name,1],
mode='markers',
name=name,
marker=Marker(
size=12,
line=Line(
color='rgba(217, 217, 217, 0.14)',
width=0.5),
opacity=0.8))
traces.append(trace)
data = Data(traces)
layout = Layout(xaxis=XAxis(title='PC1', showline=False),
yaxis=YAxis(title='PC2', showline=False))
fig = Figure(data=data, layout=layout)
py.iplot(fig)
Out[37]:
In [ ]: