In [1]:
%matplotlib inline
import pandas as pd
import json

In [2]:
#File locations
acs_file = "/home/russ/Documents/DDL/Data/JeffData/PCA/Jeff_data_acs5yr.csv"
zillow_HVI_file = "/home/russ/Documents/DDL/Data/JeffData/PCA/Zip_Zhvi_AllHomes_HomeValueIndex.csv"
FDIC_deposits_100K_file = "/home/russ/Documents/DDL/Data/JeffData/PCA/FDIC_All_Reports_20150630/All_Reports_20150630_Deposits Based on the $100,000 Reporting Threshold.csv"
FDIC_deposits_250K_file = "/home/russ/Documents/DDL/Data/JeffData/PCA/FDIC_All_Reports_20150630/All_Reports_20150630_Deposits Based on the $250,000 Reporting Threshold.csv"
library_file = "/home/russ/Documents/DDL/Data/JeffData/PCA/Public_Libraries_Survey_FY_2013_-_Outlet.csv"
complaints_file = "/home/russ/Documents/DDL/Data/JeffData/PCA/Consumer_Complaints.csv"

In [3]:
acs = pd.read_csv(acs_file)

In [4]:
acs.head()


Out[4]:
zip5 pop race_white race_black race_indian race_asian race_pac_is race_other race_multi hisp ... marital_yes30to34 marital_yes35to39 marital_yes40to44 marital_yes45to49 marital_yes50to54 marital_yes55to59 marital_yes60to64 marital_yes65to74 marital_yes75to84 marital_yes85p
0 601 18450 0.946179 0.007805 0.000217 0.000000 0 0.042005 0.007588 0.994580 ... 0.038743 0.030818 0.049919 0.051477 0.053644 0.038607 0.049038 0.068274 0.029193 0.005690
1 602 41302 0.635611 0.035010 0.000654 0.001719 0 0.030676 0.592659 0.935838 ... 0.039094 0.037999 0.051465 0.049600 0.058420 0.054276 0.053892 0.067683 0.025244 0.002545
2 603 53683 0.775217 0.032953 0.000335 0.008345 0 0.030643 0.305013 0.958609 ... 0.037783 0.037464 0.033750 0.039014 0.042386 0.046215 0.056538 0.077891 0.031152 0.003168
3 606 6591 0.953421 0.011227 0.000000 0.000000 0 0.022151 0.026400 0.998635 ... 0.004145 0.024496 0.032410 0.029772 0.049934 0.034483 0.061428 0.052007 0.012436 0.003580
4 610 28963 0.740013 0.029141 0.000000 0.000725 0 0.167041 0.126161 0.992542 ... 0.032814 0.031205 0.067025 0.049411 0.052079 0.041282 0.048353 0.078161 0.026590 0.006563

5 rows × 77 columns


In [6]:
zillow = pd.read_csv(zillow_HVI_file)

In [12]:
zillow = zillow[['RegionName','1996-07','1997-01','1997-07','1998-01','1998-07','1999-01','1999-07','2000-01','2000-07'\
    ,'2001-01','2001-07','2002-01','2002-07','2003-01','2003-07','2004-01','2004-07','2005-01','2005-07','2006-01','2006-07'\
    ,'2007-01','2007-07','2008-01','2008-07','2009-01','2009-07','2010-01','2010-07','2011-01','2011-07','2012-01','2012-07'\
    ,'2013-01','2013-07','2014-01','2014-07','2015-01','2015-07']]
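
In [ ]:
#The half-yearly column list above could also be built programmatically (a sketch producing
#the same names: January and July of each year, from 1996-07 through 2015-07):
zhvi_cols = ['RegionName'] + ['%d-%02d' % (yr, mo)
                              for yr in range(1996, 2016)
                              for mo in (1, 7)
                              if not (yr == 1996 and mo == 1)]
#zillow = zillow[zhvi_cols] would select the same columns as the cell above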

In [7]:
zillow.rename(columns={'RegionName':'zip'},inplace=True)
zillow.head()


Out[7]:
zip City State Metro CountyName 1996-04 1996-05 1996-06 1996-07 1996-08 ... 2014-10 2014-11 2014-12 2015-01 2015-02 2015-03 2015-04 2015-05 2015-06 2015-07
0 10025 New York NY New York New York NaN NaN NaN NaN NaN ... 1020000 1019600 1024100 1029600 1035800 1047600 1063000 1073100 1084100 1097100
1 60657 Chicago IL Chicago Cook 142500 142100 141900 141900 142400 ... 302900 304300 305000 304500 303600 302900 303300 304400 305900 307000
2 60614 Chicago IL Chicago Cook 197700 194800 193300 193200 193500 ... 381200 380700 379900 378900 378100 379500 383300 387900 391000 393000
3 79936 El Paso TX El Paso El Paso 70800 71000 70900 71100 71000 ... 112400 112200 111800 111600 111300 111200 111300 111600 111800 112100
4 10002 New York NY New York New York NaN NaN NaN NaN NaN ... 869300 881300 894800 903200 905900 899000 894500 901300 925500 951600

5 rows × 237 columns


In [5]:
deposits_250K = pd.read_csv(FDIC_deposits_250K_file)

In [6]:
#deposits_250K = deposits_250K[['zip'],['IDdepsmb'],['DEPSMRA'],['DEPSMRN'],['NTRCDSMJ'],['IDdeplam'],['IDdeplgb'],['DEPLGRA'],['DEPLGRN'],['NTRTMLGJ']]
deposits_250K = deposits_250K[['zip','IDdepsam','IDdepsmb','DEPSMRA','DEPSMRN','NTRCDSMJ','IDdeplam','IDdeplgb','DEPLGRA','DEPLGRN','NTRTMLGJ']]

In [7]:
deposits_250K.columns = ['zip5','dep_amt_low','dep_count_low','retirement_amt_low','retirement_count_low','time_deposits_low','dep_amt_high','dep_count_high','retirement_amt_high','retirement_count_high','time_deposits_high']

In [8]:
deposits_250K.head()


Out[8]:
zip5 dep_amt_low dep_count_low retirement_amt_low retirement_count_low time_deposits_low dep_amt_high dep_count_high retirement_amt_high retirement_count_high time_deposits_high
0 21613 105111 9871 11265 712 40000 49156 70 781 2 13062
1 21202 0 0 0 0 0 500 1 0 0 500
2 63376 42888 1250 1552 84 28590 21793 32 0 0 4088
3 59317 30205 2340 743 57 7043 11090 30 0 0 0
4 74728 82686 8762 6146 208 26712 28517 37 360 1 1787

In [9]:
deposits_zip = deposits_250K['dep_amt_high'].groupby(deposits_250K['zip5']).mean().reset_index()

In [10]:
deposits_zip.head()


Out[10]:
zip5 dep_amt_high
0 802 67941
1 820 36914
2 909 2542958
3 917 2690626
4 918 1567600
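
In [ ]:
#Equivalent aggregation written with the column name as the groupby key (a sketch):
deposits_zip_alt = deposits_250K.groupby('zip5')['dep_amt_high'].mean().reset_index()
deposits_zip_alt.head()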

In [11]:
library = pd.read_csv(library_file)

In [12]:
#Earlier attempts: slice the field based on a hard-coded state, then substitute ['STABR'] for the
#hard-coded state text to find each state's position within the address field dynamically
#library_zip.apply(lambda x : x['Location'][0:15], axis=1)
#library_zip.apply(lambda x : x['Location'][x['Location'].find(', AK')+5:x['Location'].find(', AK')+10], axis=1)
#Strip the zip code from the Location column
#library_zip = library[['Location','STABR']]
#library_zip['zip'] = library_zip.apply(lambda x : x['Location'][x['Location'].find(', ' + x['STABR'])+5:x['Location'].find(', ' + x['STABR'])+10], axis=1)

In [160]:
#Parse out Zip Code from Location field
library['zip'] = library.apply(lambda x : x['Location'][x['Location'].rfind(', ' + x['STABR'])+5:x['Location'].rfind(', ' + x['STABR'])+10], axis =1)

In [161]:
library.head(2)


Out[161]:
STABR FSCSKEY FSCS_SEQ LIBID LIBNAME CNTY PHONE C_OUT_TY C_MSA SQ_FEET ... CENTRACT CENBLOCK CDCODE CBSA MICROF GAL GALMS POSTMS Location zip
0 AK AK0001 2 AK0001-002 ANCHOR POINT PUBLIC LIBRARY KENAI PENINSULA 9072355692 CE NO 1287 ... 8.00 3014 200 0 0 house STD NND 72551 MILO FRITZ AVENUE\nANCHOR POINT, AK 9955... 99556
1 AK AK0002 7 AK0002-007 CHUGIAK/EAGLE RIVER NEIGHBORHOOD LIBRARY ANCHORAGE 9073431530 BR CC 17888 ... 2.01 2021 200 11260 0 addresspoint STD NND 12001 BUSINESS BOULEVARD #176\nEAGLE RIVER, AK... 99577

2 rows × 37 columns
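
In [ ]:
#A small illustration of the Location parsing above, on a made-up address string:
#rfind locates the last ", <STATE>"; the comma, space, two-letter state, and trailing space
#take 5 characters, so the zip starts at rfind(...) + 5 and runs for 5 characters.
loc = '72551 MILO FRITZ AVENUE\nANCHOR POINT, AK 99556'  #hypothetical example
state = 'AK'
print loc[loc.rfind(', ' + state) + 5 : loc.rfind(', ' + state) + 10]  #-> 99556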


In [9]:
#Change to your local path
library.to_csv("/home/russ/Documents/DDL/Data/JeffData/PCA/Library_ZipCode.csv")

In [162]:
library_zip = library['STABR'].groupby(library['zip']).count().reset_index()

In [163]:
library_zip['zip5'] = library_zip.apply(lambda x: int(x['zip']),axis=1)

In [166]:
library_zip.columns = ['zip','LibraryCount','zip5']

In [167]:
library_zip.head(10)


Out[167]:
zip LibraryCount zip5
0 00602 1 602
1 00603 1 603
2 00605 1 605
3 00610 1 610
4 00612 1 612
5 00613 2 613
6 00617 2 617
7 00624 1 624
8 00627 4 627
9 00638 1 638
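
In [ ]:
#An equivalent, vectorized construction of library_zip (a sketch; assumes every parsed
#zip string is numeric, otherwise astype(int) raises just as int() would above)
library_zip_alt = library.groupby('zip').size().reset_index()
library_zip_alt.columns = ['zip', 'LibraryCount']
library_zip_alt['zip5'] = library_zip_alt['zip'].astype(int)
library_zip_alt.head()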

In [13]:
combined = pd.merge(acs[['zip5','snap','inc_median','poverty']],deposits_zip[['zip5','dep_amt_high']], on='zip5',copy=False)

In [14]:
combined.head()


Out[14]:
zip5 snap inc_median poverty dep_amt_high
0 909 0.291393 16891 0.543534 2542958
1 917 0.328958 16802 0.531585 2690626
2 918 0.217590 26573 0.496521 1567600
3 926 0.173707 35308 0.262210 1874329
4 1005 0.029427 68644 0.037358 25720
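
In [ ]:
#pd.merge defaults to an inner join, so combined keeps only the zip codes present in both
#acs and deposits_zip; a quick sketch comparing row counts to see the overlap
print len(acs), len(deposits_zip), len(combined)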

In [15]:
combined[combined['zip5']==90210]


Out[15]:
zip5 snap inc_median poverty dep_amt_high
4759 90210 0.008544 132254 0.077309 23634809

In [16]:
#Beginning PCA Analysis (reference: http://sebastianraschka.com/Articles/2015_pca_in_3_steps.html)

#Features: snap, inc_median, poverty, dep_amt_high (columns 1-4); labels: zip5 (column 0)
X = combined.iloc[:,1:5].values
y = combined.iloc[:,0].values

In [17]:
y


Out[17]:
array([  909,   917,   918, ..., 99510, 99701, 99901])

In [18]:
#Standardization
from sklearn.preprocessing import StandardScaler
X_std = StandardScaler().fit_transform(X)

In [19]:
X_std


Out[19]:
array([[ 2.08209341, -1.72499341,  4.1443789 ,  0.13070951],
       [ 2.53961831, -1.72944417,  4.02510502,  0.14089822],
       [ 1.18319901, -1.24080997,  3.67508955,  0.06341232],
       ..., 
       [-1.46693953, -2.56968897, -1.28130945,  0.02365456],
       [ 0.0727278 , -0.04000304, -0.28411344, -0.03919475],
       [ 0.18459999,  0.5567998 , -0.37878253, -0.03200704]])
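
In [ ]:
#Quick cross-check (a sketch): StandardScaler is just the column-wise z-score, computed with
#the population standard deviation (ddof=0), so a manual computation should match X_std.
import numpy as np
X_manual = (X - X.mean(axis=0)) / X.std(axis=0)
print np.allclose(X_std, X_manual)  #expected: True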

In [20]:
#Covariance Matrix
#Roughly speaking, the covariance of two features tells us how those random variables vary with respect to each other. 
#If two variables tend to increase (or decrease) together, the covariance is positive; 
#if one tends to increase while the other decreases, the covariance is negative. 
#However, we have to be careful not to draw any conclusions from the magnitudes of the covariance values 
#at this point, since they are not normalized yet.
#
#Now, if we calculate the covariance between every feature pair, we get the symmetric covariance matrix:


import numpy as np
mean_vec = np.mean(X_std, axis=0)
cov_mat = (X_std - mean_vec).T.dot((X_std - mean_vec)) / (X_std.shape[0]-1)
print 'Covariance matrix \n%s' %cov_mat


Covariance matrix 
[[ 1.00020222 -0.63663469  0.61415715 -0.0019517 ]
 [-0.63663469  1.00020222 -0.50314101  0.0237246 ]
 [ 0.61415715 -0.50314101  1.00020222  0.01121106]
 [-0.0019517   0.0237246   0.01121106  1.00020222]]

In [21]:
#Same As Above, simplified
print 'NumPy covariance matrix: \n%s' %np.cov(X_std.T)


NumPy covariance matrix: 
[[ 1.00020222 -0.63663469  0.61415715 -0.0019517 ]
 [-0.63663469  1.00020222 -0.50314101  0.0237246 ]
 [ 0.61415715 -0.50314101  1.00020222  0.01121106]
 [-0.0019517   0.0237246   0.01121106  1.00020222]]

In [22]:
#Perform an eigendecomposition of the covariance matrix:
cov_mat = np.cov(X_std.T)

eig_vals, eig_vecs = np.linalg.eig(cov_mat)

print 'Eigenvectors \n%s' %eig_vecs
print '\nEigenvalues \n%s' %eig_vals


Eigenvectors 
[[ 0.60266076 -0.79578274  0.05883835  0.00823883]
 [-0.56835161 -0.47946993 -0.66786708  0.0322272 ]
 [ 0.56011182  0.36981944 -0.74038439  0.03659621]
 [-0.00715556  0.00848469  0.04819296  0.99877637]]

Eigenvalues 
[ 2.17141336  0.33122832  0.49680479  1.00136242]
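
In [ ]:
#Sanity check (a sketch): each eigenpair should satisfy cov_mat . v = lambda * v
for i in range(len(eig_vals)):
    v = eig_vecs[:, i]  #eigenvectors are the columns of eig_vecs
    np.testing.assert_array_almost_equal(cov_mat.dot(v), eig_vals[i] * v)
print 'All eigenpairs check out.'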

In [23]:
#Correlation Matrix
cor_mat1 = np.corrcoef(X.T)
cor_mat1


Out[23]:
array([[ 1.        , -0.63650597,  0.61403298, -0.0019513 ],
       [-0.63650597,  1.        , -0.50303928,  0.0237198 ],
       [ 0.61403298, -0.50303928,  1.        ,  0.0112088 ],
       [-0.0019513 ,  0.0237198 ,  0.0112088 ,  1.        ]])

In [24]:
#Since the correlation matrix is a "normalized version" of the covariance matrix, 
#it is not affected by standardization of the features: the correlation matrix of 
#the standardized data below matches the correlation matrix of the raw data above.
cor_mat2 = np.corrcoef(X_std.T)
cor_mat2


Out[24]:
array([[ 1.        , -0.63650597,  0.61403298, -0.0019513 ],
       [-0.63650597,  1.        , -0.50303928,  0.0237198 ],
       [ 0.61403298, -0.50303928,  1.        ,  0.0112088 ],
       [-0.0019513 ,  0.0237198 ,  0.0112088 ,  1.        ]])

In [25]:
#If we perform an eigendecomposition on the correlation matrix, we expect the same 
#eigenvectors as from the covariance matrix of the standardized features earlier; 
#the eigenvalues differ only by the small n/(n-1) factor that np.cov applies:

eig_vals, eig_vecs = np.linalg.eig(cor_mat1)

print 'Eigenvectors \n%s' %eig_vecs 
print '\nEigenvalues \n%s' %eig_vals


Eigenvectors 
[[ 0.60266076 -0.79578274  0.05883835  0.00823883]
 [-0.56835161 -0.47946993 -0.66786708  0.0322272 ]
 [ 0.56011182  0.36981944 -0.74038439  0.03659621]
 [-0.00715556  0.00848469  0.04819296  0.99877637]]

Eigenvalues 
[ 2.17097434  0.33116136  0.49670434  1.00115997]

In [26]:
#Singular Value Decomposition
#While the eigendecomposition of the covariance or correlation matrix may be more intuitive,
#most PCA implementations perform a Singular Value Decomposition (SVD) to improve computational 
#efficiency. So, let us perform an SVD to confirm that the results are indeed the same:

u,s,v = np.linalg.svd(X_std.T)
u


Out[26]:
array([[-0.60266076,  0.00823883, -0.05883835,  0.79578274],
       [ 0.56835161,  0.0322272 ,  0.66786708,  0.47946993],
       [-0.56011182,  0.03659621,  0.74038439, -0.36981944],
       [ 0.00715556,  0.99877637, -0.04819296, -0.00848469]])
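
In [ ]:
#The SVD and the eigendecomposition are linked: the singular values of X_std.T satisfy
#lambda = s**2 / (n - 1), where lambda are the covariance eigenvalues, and the columns of u
#match the covariance eigenvectors up to sign and ordering. A quick sketch of the eigenvalue check:
n_samples = X_std.shape[0]
print np.allclose(np.sort(s**2 / (n_samples - 1)),
                  np.sort(np.linalg.eigvalsh(np.cov(X_std.T))))  #expected: True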

In [27]:
#Sorting Eigenpairs
#The typical goal of a PCA is to reduce the dimensionality of the original 
#feature space by projecting it onto a smaller subspace, where the eigenvectors 
#will form the axes. However, the eigenvectors only define the directions of the new axes, 
#since they all have the same unit length of 1, which can be confirmed by the following lines of code:

for ev in eig_vecs.T:  #the eigenvectors are the columns of eig_vecs
    np.testing.assert_array_almost_equal(1.0, np.linalg.norm(ev))
print 'Everything ok!'


Everything ok!

In [28]:
#In order to decide which eigenvector(s) can be dropped without losing too much 
#information for the construction of the lower-dimensional subspace, we need 
#to inspect the corresponding eigenvalues: the eigenvectors with the lowest 
#eigenvalues bear the least information about the distribution of the data, 
#and those are the ones that can be dropped.
#The common approach is to rank the eigenvalues from highest 
#to lowest and choose the top k eigenvectors.

# Make a list of (eigenvalue, eigenvector) tuples
eig_pairs = [(np.abs(eig_vals[i]), eig_vecs[:,i]) for i in range(len(eig_vals))]

# Sort the (eigenvalue, eigenvector) tuples from high to low
eig_pairs.sort()
eig_pairs.reverse()

# Visually confirm that the list is correctly sorted by decreasing eigenvalues
print('Eigenvalues in descending order:')
for i in eig_pairs:
    print(i[0])


Eigenvalues in descending order:
2.17097433694
1.00115996519
0.496704342456
0.331161355414
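
In [ ]:
#Sorting (eigenvalue, eigenvector) tuples works, but an equivalent and slightly more direct
#approach (a sketch) is to argsort the eigenvalues and reorder the eigenvector columns to match;
#for a covariance/correlation matrix the eigenvalues are non-negative, so abs() is not needed.
order = np.argsort(eig_vals)[::-1]
eig_vals_sorted = eig_vals[order]
eig_vecs_sorted = eig_vecs[:, order]
print eig_vals_sorted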

In [29]:
import plotly.plotly as py
from plotly.graph_objs import *
import plotly.tools as tls

tot = sum(eig_vals)
var_exp = [(i / tot)*100 for i in sorted(eig_vals, reverse=True)]
cum_var_exp = np.cumsum(var_exp)

trace1 = Bar(
        x=['PC %s' %i for i in range(1,5)],
        y=var_exp,
        showlegend=False)

trace2 = Scatter(
        x=['PC %s' %i for i in range(1,5)],
        y=cum_var_exp,
        name='cumulative explained variance')

data = Data([trace1, trace2])

layout=Layout(
        yaxis=YAxis(title='Explained variance in percent'),
        title='Explained variance by different principal components')

fig = Figure(data=data, layout=layout)
py.iplot(fig)


Out[29]:
[interactive plotly figure: explained variance by different principal components]
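
In [ ]:
#Cross-check of the explained-variance figures above (a sketch): scikit-learn's PCA exposes
#the same ratios directly via explained_variance_ratio_, fit here on the standardized data.
from sklearn.decomposition import PCA
pca_check = PCA(n_components=4).fit(X_std)
print pca_check.explained_variance_ratio_ * 100  #should closely match var_exp above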

In [30]:
#Projection Matrix
"""
It's about time to get to the really interesting part: The construction of 
the projection matrix that will be used to transform the Iris data onto the new 
feature subspace. Although, the name "projection matrix" has a nice ring to it, 
it is basically just a matrix of our concatenated top k eigenvectors.

Here, we are reducing the 4-dimensional feature space to a 2-dimensional feature subspace, 
by choosing the "top 2" eigenvectors with the highest eigenvalues to construct our d×k-dimensional 
eigenvector matrix W.
"""

matrix_w = np.hstack((eig_pairs[0][1].reshape(4,1),
                      eig_pairs[1][1].reshape(4,1)))

print 'Matrix W:\n', matrix_w


Matrix W:
[[ 0.60266076  0.00823883]
 [-0.56835161  0.0322272 ]
 [ 0.56011182  0.03659621]
 [-0.00715556  0.99877637]]
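
In [ ]:
#Sanity check (a sketch): the eigenvector columns of W are orthonormal,
#so W^T W should be the 2x2 identity matrix
print np.allclose(matrix_w.T.dot(matrix_w), np.eye(2))  #expected: True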

In [31]:
#Projection Onto the New Feature Space

#In this last step we will use the 4×2-dimensional projection matrix W 
#to transform our samples onto the new subspace via the equation Y=X×W, 
#where Y is an n×2 matrix of our transformed samples (one row per zip code in combined).

Y = X_std.dot(matrix_w)
Y


Out[31]:
array([[ 4.55557909,  0.24378043],
       [ 4.76696138,  0.25321774],
       [ 3.4762913 ,  0.16758941],
       ..., 
       [-0.14142586, -0.1181652 ],
       [-0.09228885, -0.05023425],
       [-0.41713844, -0.0263649 ]])

In [36]:
traces = []

#Keep the zip codes as ints (zip5 is an integer column) so the y == name masks below match,
#and avoid shadowing the built-in list()
zip_list = combined['zip5'].values.tolist()
zip_list_short = zip_list[0:200]
zip_list_short.append(90210)

for name in zip_list_short:

    trace = Scatter(
        x=Y[y==name,0],
        y=Y[y==name,1],
        mode='markers',
        name=name,
        marker=Marker(
            size=12,
            line=Line(
                color='rgba(217, 217, 217, 0.14)',
                width=0.5),
            opacity=0.8))
    traces.append(trace)


data = Data(traces)
layout = Layout(showlegend=True,
                scene=Scene(xaxis=XAxis(title='PC1'),
                yaxis=YAxis(title='PC2'),))

fig = Figure(data=data, layout=layout)
py.iplot(fig)


Out[36]:
[interactive plotly scatter: first 200 zip codes plus 90210 projected onto PC1/PC2]

In [37]:
#Same PCA analysis using scikit-learn

from sklearn.decomposition import PCA as sklearnPCA
sklearn_pca = sklearnPCA(n_components=2)
Y_sklearn = sklearn_pca.fit_transform(X_std)

traces = []

#Same as above: keep the zip codes as ints and avoid shadowing the built-in list()
zip_list = combined['zip5'].values.tolist()
zip_list_short = zip_list[0:200]
zip_list_short.append(90210)

for name in zip_list_short:

    trace = Scatter(
        x=Y_sklearn[y==name,0],
        y=Y_sklearn[y==name,1],
        mode='markers',
        name=name,
        marker=Marker(
            size=12,
            line=Line(
                color='rgba(217, 217, 217, 0.14)',
                width=0.5),
            opacity=0.8))
    traces.append(trace)


data = Data(traces)
layout = Layout(xaxis=XAxis(title='PC1', showline=False),
                yaxis=YAxis(title='PC2', showline=False))
fig = Figure(data=data, layout=layout)
py.iplot(fig)


Out[37]:
[interactive plotly scatter: scikit-learn projection onto PC1/PC2]
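
In [ ]:
#The manual projection and scikit-learn's can differ by a sign flip per component
#(both are valid PCA solutions), so compare absolute values; a quick sketch assuming
#Y from In [31] and Y_sklearn from the cell above are still in scope.
print np.allclose(np.abs(Y), np.abs(Y_sklearn))  #expected: True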

In [ ]: