In [1]:
%matplotlib inline
import pandas as pd
import json
In [2]:
#File locations
acs_file = "/home/russ/Documents/DDL/Data/JeffData/PCA/Jeff_data_acs5yr.csv"
zillow_HVI_file = "/home/russ/Documents/DDL/Data/JeffData/PCA/Zip_Zhvi_AllHomes_HomeValueIndex.csv"
FDIC_deposits_100K_file = "/home/russ/Documents/DDL/Data/JeffData/PCA/FDIC_All_Reports_20150630/All_Reports_20150630_Deposits Based on the $100,000 Reporting Threshold.csv"
FDIC_deposits_250K_file = "/home/russ/Documents/DDL/Data/JeffData/PCA/FDIC_All_Reports_20150630/All_Reports_20150630_Deposits Based on the $250,000 Reporting Threshold.csv"
library_file = "/home/russ/Documents/DDL/Data/JeffData/PCA/Public_Libraries_Survey_FY_2013_-_Outlet.csv"
complaints_file = "/home/russ/Documents/DDL/Data/JeffData/PCA/Consumer_Complaints.csv"
In [3]:
acs = pd.read_csv(acs_file)
In [4]:
acs.head()
Out[4]:
In [6]:
zillow = pd.read_csv(zillow_HVI_file)
In [12]:
zillow = zillow[['RegionName',
                 '1996-07','1997-01','1997-07','1998-01','1998-07','1999-01','1999-07','2000-01','2000-07',
                 '2001-01','2001-07','2002-01','2002-07','2003-01','2003-07','2004-01','2004-07','2005-01','2005-07',
                 '2006-01','2006-07','2007-01','2007-07','2008-01','2008-07','2009-01','2009-07','2010-01','2010-07',
                 '2011-01','2011-07','2012-01','2012-07','2013-01','2013-07','2014-01','2014-07','2015-01','2015-07']]
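In [ ]:
#Hedged alternative to the hand-typed list above: select the January/July
#columns programmatically. Assumes the Zillow header uses the 'YYYY-MM'
#format seen above; zillow_jan_jul is just an illustrative name.
date_cols = [c for c in zillow.columns
             if len(c) == 7 and c[:4].isdigit() and c[5:] in ('01', '07')]
zillow_jan_jul = zillow[['RegionName'] + sorted(date_cols)]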
In [7]:
zillow.rename(columns={'RegionName':'zip'},inplace=True)
zillow.head()
Out[7]:
In [5]:
deposits_250K = pd.read_csv(FDIC_deposits_250K_file)
In [6]:
deposits_250K = deposits_250K[['zip','IDdepsam','IDdepsmb','DEPSMRA','DEPSMRN','NTRCDSMJ','IDdeplam','IDdeplgb','DEPLGRA','DEPLGRN','NTRTMLGJ']]
In [7]:
deposits_250K.columns = ['zip5',
                         'dep_amt_low','dep_count_low','retirement_amt_low','retirement_count_low','time_deposits_low',
                         'dep_amt_high','dep_count_high','retirement_amt_high','retirement_count_high','time_deposits_high']
In [8]:
deposits_250K.head()
Out[8]:
In [9]:
deposits_zip = deposits_250K['dep_amt_high'].groupby(deposits_250K['zip5']).mean().reset_index()
In [10]:
deposits_zip.head()
Out[10]:
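In [ ]:
#Hedged sketch: the same groupby idea extended to average several of the
#renamed deposit columns per ZIP in one pass (deposits_zip_all is an
#illustrative name; the column names are the ones defined above).
deposits_zip_all = deposits_250K.groupby('zip5')[
    ['dep_amt_low', 'dep_amt_high', 'retirement_amt_low', 'retirement_amt_high']
].mean().reset_index()
deposits_zip_all.head()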
In [11]:
library = pd.read_csv(library_file)
In [12]:
#Earlier attempts: slice the Location field using a hard-coded state abbreviation,
#then substitute each row's ['STABR'] value for the state text so the ZIP code can be
#located dynamically within the address field.
#library_zip.apply(lambda x : x['Location'][0:15], axis=1)
#library_zip.apply(lambda x : x['Location'][x['Location'].find(', AK')+5:x['Location'].find(', AK')+10], axis=1)
#Strip the ZIP code from the Location column
#library_zip = library[['Location','STABR']]
#library_zip['zip'] = library_zip.apply(lambda x : x['Location'][x['Location'].find(', ' + x['STABR'])+5:x['Location'].find(', ' + x['STABR'])+10], axis=1)
In [160]:
#Parse out Zip Code from Location field
library['zip'] = library.apply(lambda x : x['Location'][x['Location'].rfind(', ' + x['STABR'])+5:x['Location'].rfind(', ' + x['STABR'])+10], axis =1)
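In [ ]:
#Hedged alternative to the slicing above: pull the 5-digit group that follows
#a two-letter state code with a regular expression. Assumes Location strings
#look like '..., AK 99501 ...'; zip_re is just an illustrative column name.
import re
def zip_from_location(loc):
    #first 5-digit run after a ', XX ' state abbreviation; None if no match
    m = re.search(r',\s*[A-Z]{2}\s+(\d{5})', loc)
    return m.group(1) if m else None
library['zip_re'] = library['Location'].apply(zip_from_location)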
In [161]:
library.head(2)
Out[161]:
In [9]:
#Change to your local path
library.to_csv("/home/russ/Documents/DDL/Data/JeffData/PCA/Library_ZipCode.csv")
In [162]:
library_zip = library['STABR'].groupby(library['zip']).count().reset_index()
In [163]:
library_zip['zip5'] = library_zip.apply(lambda x: int(x['zip']),axis=1)
In [166]:
library_zip.columns = ['zip','LibraryCount','zip5']
In [167]:
library_zip.head(10)
Out[167]:
In [13]:
combined = pd.merge(acs[['zip5','snap','inc_median','poverty']],deposits_zip[['zip5','dep_amt_high']], on='zip5',copy=False)
In [14]:
combined.head()
Out[14]:
In [15]:
combined[combined['zip5']==90210]
Out[15]:
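In [ ]:
#Hedged sketch: the same merge pattern extended with the library counts built
#above, so a LibraryCount feature could join the PCA. A left join leaves NaN
#for ZIPs with no library outlet, which would need to be filled (e.g. with 0)
#before standardizing. combined_alt is just an illustrative name.
combined_alt = pd.merge(combined, library_zip[['zip5','LibraryCount']],
                        on='zip5', how='left', copy=False)
combined_alt.head()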
In [16]:
#Beginning PCA Analysis (reference: http://sebastianraschka.com/Articles/2015_pca_in_3_steps.html)
#X holds the four numeric features (snap, inc_median, poverty, dep_amt_high); y holds the zip5 labels
X = combined.iloc[:, 1:5].values
y = combined.iloc[:, 0].values
In [17]:
y
Out[17]:
In [18]:
#Standardization
from sklearn.preprocessing import StandardScaler
X_std = StandardScaler().fit_transform(X)
In [19]:
X_std
Out[19]:
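In [ ]:
#Sanity check (hedged): StandardScaler should be equivalent to the usual
#column-wise z-score (x - mean) / std with the biased (ddof=0) standard deviation.
import numpy as np
X_manual = (X - X.mean(axis=0)) / X.std(axis=0)
print(np.allclose(X_std, X_manual))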
In [20]:
#Covariance Matrix
#Roughly speaking, the covariance of two features tells us how those random variables vary with respect to each other.
#For example, if two variables tend to increase (or decrease) together, the covariance is positive;
#the covariance is negative if one feature increases while the other decreases.
#However, we have to be careful not to draw any conclusions from the magnitudes of the covariance values
#at this point, since they are not normalized yet.
#
#Now, if we calculate the covariance between every feature pair, we will get the symmetric covariance matrix:
import numpy as np
mean_vec = np.mean(X_std, axis=0)
cov_mat = (X_std - mean_vec).T.dot((X_std - mean_vec)) / (X_std.shape[0]-1)
print 'Covariance matrix \n%s' %cov_mat
In [21]:
#Same As Above, simplified
print 'NumPy covariance matrix: \n%s' %np.cov(X_std.T)
In [22]:
#Generate eigendecomposition on the covariance matrix:
cov_mat = np.cov(X_std.T)
eig_vals, eig_vecs = np.linalg.eig(cov_mat)
print 'Eigenvectors \n%s' %eig_vecs
print '\nEigenvalues \n%s' %eig_vals
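In [ ]:
#Sanity check: each eigenpair should satisfy cov_mat . v = lambda * v
for i in range(len(eig_vals)):
    v = eig_vecs[:, i]
    np.testing.assert_array_almost_equal(cov_mat.dot(v), eig_vals[i] * v)
print('Eigenpairs check out.')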
In [23]:
#Correlation Matrix
cor_mat1 = np.corrcoef(X.T)
cor_mat1
Out[23]:
In [24]:
#Since the correlation matrix is a "normalized version" of the covariance matrix,
#it is not affected by standardization of the features, which is demonstrated below:
cor_mat2 = np.corrcoef(X_std.T)
cor_mat2
Out[24]:
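In [ ]:
#Quick confirmation of the statement above: the correlation matrix is unchanged
#by standardizing the features.
print(np.allclose(cor_mat1, cor_mat2))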
In [25]:
#Now, if we perform an eigendecomposition on the correlation matrix,
#we expect and observe the same results as for the eigendecomposition of
#the covariance matrix with standardized features earlier:
eig_vals, eig_vecs = np.linalg.eig(cor_mat1)
print 'Eigenvectors \n%s' %eig_vecs
print '\nEigenvalues \n%s' %eig_vals
In [26]:
#Singular Value Decomposition
#While the eigendecomposition of the covariance or correlation matrix may be more intuitive,
#most PCA implementations perform a Singular Value Decomposition (SVD) to improve computational
#efficiency. So, let us perform an SVD to confirm that the results are indeed the same:
u,s,v = np.linalg.svd(X_std.T)
u
Out[26]:
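In [ ]:
#Hedged check that the SVD agrees with the eigendecomposition of the covariance
#matrix: the squared singular values of X_std.T divided by (n - 1) reproduce the
#covariance eigenvalues (and the columns of u are the same principal directions,
#up to sign and ordering).
n_samples = X_std.shape[0]
print(np.allclose(np.sort(s**2 / (n_samples - 1)),
                  np.sort(np.linalg.eigvalsh(np.cov(X_std.T)))))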
In [27]:
#Sorting Eigenpairs
#The typical goal of a PCA is to reduce the dimensionality of the original
#feature space by projecting it onto a smaller subspace, where the eigenvectors
#will form the axes. However, the eigenvectors only define the directions of the new axes,
#since they all have the same unit length of 1, which can be confirmed by the following two lines of code:
for ev in eig_vecs.T:
    np.testing.assert_array_almost_equal(1.0, np.linalg.norm(ev))
print 'Everything ok!'
In [28]:
#In order to decide which eigenvector(s) can be dropped without losing too much
#information for the construction of the lower-dimensional subspace, we need
#to inspect the corresponding eigenvalues: the eigenvectors with the lowest
#eigenvalues bear the least information about the distribution of the data,
#and those are the ones that can be dropped.
#To do so, the common approach is to rank the eigenvalues from highest
#to lowest and choose the top k eigenvectors.
# Make a list of (eigenvalue, eigenvector) tuples
eig_pairs = [(np.abs(eig_vals[i]), eig_vecs[:,i]) for i in range(len(eig_vals))]
# Sort the (eigenvalue, eigenvector) tuples from high to low
eig_pairs.sort()
eig_pairs.reverse()
# Visually confirm that the list is correctly sorted by decreasing eigenvalues
print('Eigenvalues in descending order:')
for i in eig_pairs:
    print(i[0])
In [29]:
import plotly.plotly as py
from plotly.graph_objs import *
import plotly.tools as tls
tot = sum(eig_vals)
var_exp = [(i / tot)*100 for i in sorted(eig_vals, reverse=True)]
cum_var_exp = np.cumsum(var_exp)
trace1 = Bar(
    x=['PC %s' %i for i in range(1,5)],
    y=var_exp,
    showlegend=False)
trace2 = Scatter(
    x=['PC %s' %i for i in range(1,5)],
    y=cum_var_exp,
    name='cumulative explained variance')
data = Data([trace1, trace2])
layout = Layout(
    yaxis=YAxis(title='Explained variance in percent'),
    title='Explained variance by different principal components')
fig = Figure(data=data, layout=layout)
py.iplot(fig)
Out[29]:
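In [ ]:
#plotly.plotly requires online credentials; since %matplotlib inline is already
#loaded, this is a hedged local alternative for the same explained-variance plot.
import matplotlib.pyplot as plt
plt.figure(figsize=(6, 4))
plt.bar(range(1, 5), var_exp, alpha=0.5, align='center',
        label='individual explained variance')
plt.step(range(1, 5), cum_var_exp, where='mid',
         label='cumulative explained variance')
plt.xlabel('Principal components')
plt.ylabel('Explained variance in percent')
plt.legend(loc='best')
plt.show()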
In [30]:
#Projection Matrix
"""
It's about time to get to the really interesting part: the construction of
the projection matrix that will be used to transform our standardized features onto the new
feature subspace. Although the name "projection matrix" has a nice ring to it,
it is basically just a matrix of our concatenated top k eigenvectors.
Here, we are reducing the 4-dimensional feature space to a 2-dimensional feature subspace
by choosing the "top 2" eigenvectors with the highest eigenvalues to construct our d×k-dimensional
eigenvector matrix W.
"""
matrix_w = np.hstack((eig_pairs[0][1].reshape(4,1),
                      eig_pairs[1][1].reshape(4,1)))
print 'Matrix W:\n', matrix_w
In [31]:
#Projection Onto the New Feature Space
#In this last step we will use the 4×2-dimensional projection matrix W
#to transform our samples onto the new subspace via the equation Y=X×W,
#where Y is an n×2 matrix of our transformed samples (one row per ZIP code).
Y = X_std.dot(matrix_w)
Y
Out[31]:
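In [ ]:
#Sanity check: the (ddof=0) variance of each projected component should equal
#the eigenvalue of the eigenvector it was built from.
print(Y.var(axis=0))
print([pair[0] for pair in eig_pairs[:2]])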
In [36]:
traces = []
zip_list = combined['zip5'].values.tolist()
zip_list_short = zip_list[0:200]
#zip5 is stored as an integer, so append 90210 as an int (a string '90210' would never match y)
zip_list_short.append(90210)
for name in zip_list_short:
    trace = Scatter(
        x=Y[y==name,0],
        y=Y[y==name,1],
        mode='markers',
        name=name,
        marker=Marker(
            size=12,
            line=Line(
                color='rgba(217, 217, 217, 0.14)',
                width=0.5),
            opacity=0.8))
    traces.append(trace)
data = Data(traces)
#Axis titles go directly on the 2D layout (Scene is only for 3D plots)
layout = Layout(showlegend=True,
                xaxis=XAxis(title='PC1'),
                yaxis=YAxis(title='PC2'))
fig = Figure(data=data, layout=layout)
py.iplot(fig)
Out[36]:
In [37]:
#Same PCA analysis using scikit-learn
from sklearn.decomposition import PCA as sklearnPCA
sklearn_pca = sklearnPCA(n_components=2)
Y_sklearn = sklearn_pca.fit_transform(X_std)
traces = []
zip_list = combined['zip5'].values.tolist()
zip_list_short = zip_list[0:200]
#as above, append 90210 as an int so it matches the integer zip5 values in y
zip_list_short.append(90210)
for name in zip_list_short:
    trace = Scatter(
        x=Y_sklearn[y==name,0],
        y=Y_sklearn[y==name,1],
        mode='markers',
        name=name,
        marker=Marker(
            size=12,
            line=Line(
                color='rgba(217, 217, 217, 0.14)',
                width=0.5),
            opacity=0.8))
    traces.append(trace)
data = Data(traces)
layout = Layout(xaxis=XAxis(title='PC1', showline=False),
                yaxis=YAxis(title='PC2', showline=False))
fig = Figure(data=data, layout=layout)
py.iplot(fig)
Out[37]:
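In [ ]:
#Hedged comparison of the scikit-learn projection with the manual one: the
#scores should agree component-wise up to a sign flip, and the explained
#variance ratios should match the var_exp percentages computed earlier.
for j in range(2):
    match = (np.allclose(Y_sklearn[:, j], Y[:, j]) or
             np.allclose(Y_sklearn[:, j], -Y[:, j]))
    print('PC%d matches the manual projection (up to sign): %s' % (j + 1, match))
print(sklearn_pca.explained_variance_ratio_ * 100)
print(var_exp[:2])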
In [ ]: