In [1]:
%matplotlib inline
import pandas as pd
import json
In [2]:
#File locations
acs_file = "/home/russ/Documents/DDL/Data/JeffData/PCA/Jeff_data_acs5yr.csv"
zillow_HVI_file = "/home/russ/Documents/DDL/Data/JeffData/PCA/Zip_Zhvi_AllHomes_HomeValueIndex.csv"
FDIC_deposits_100K_file = "/home/russ/Documents/DDL/Data/JeffData/PCA/FDIC_All_Reports_20150630/All_Reports_20150630_Deposits Based on the $100,000 Reporting Threshold.csv"
FDIC_deposits_250K_file = "/home/russ/Documents/DDL/Data/JeffData/PCA/FDIC_All_Reports_20150630/All_Reports_20150630_Deposits Based on the $250,000 Reporting Threshold.csv"
library_file = "/home/russ/Documents/DDL/Data/JeffData/PCA/Public_Libraries_Survey_FY_2013_-_Outlet.csv"
complaints_file = "/home/russ/Documents/DDL/Data/JeffData/PCA/Consumer_Complaints.csv"
In [3]:
acs = pd.read_csv(acs_file)
In [4]:
acs.head()
Out[4]:
In [6]:
zillow = pd.read_csv(zillow_HVI_file)
In [12]:
zillow = zillow[['RegionName',
                 '1996-07','1997-01','1997-07','1998-01','1998-07','1999-01','1999-07','2000-01','2000-07',
                 '2001-01','2001-07','2002-01','2002-07','2003-01','2003-07','2004-01','2004-07','2005-01','2005-07',
                 '2006-01','2006-07','2007-01','2007-07','2008-01','2008-07','2009-01','2009-07','2010-01','2010-07',
                 '2011-01','2011-07','2012-01','2012-07','2013-01','2013-07','2014-01','2014-07','2015-01','2015-07']]
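In [ ]:
#Hedged alternative to the hand-typed list above: select the January/July
#columns programmatically. Assumes the Zillow header uses the 'YYYY-MM'
#format seen above; zillow_jan_jul is just an illustrative name.
date_cols = [c for c in zillow.columns
             if len(c) == 7 and c[:4].isdigit() and c[5:] in ('01', '07')]
zillow_jan_jul = zillow[['RegionName'] + sorted(date_cols)]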
In [7]:
zillow.rename(columns={'RegionName':'zip'},inplace=True)
zillow.head()
Out[7]:
In [5]:
deposits_250K = pd.read_csv(FDIC_deposits_250K_file)
In [6]:
deposits_250K = deposits_250K[['zip','IDdepsam','IDdepsmb','DEPSMRA','DEPSMRN','NTRCDSMJ','IDdeplam','IDdeplgb','DEPLGRA','DEPLGRN','NTRTMLGJ']]
In [7]:
deposits_250K.columns = ['zip5',
                         'dep_amt_low','dep_count_low','retirement_amt_low','retirement_count_low','time_deposits_low',
                         'dep_amt_high','dep_count_high','retirement_amt_high','retirement_count_high','time_deposits_high']
In [8]:
deposits_250K.head()
Out[8]:
In [9]:
deposits_zip = deposits_250K['dep_amt_high'].groupby(deposits_250K['zip5']).mean().reset_index()
In [10]:
deposits_zip.head()
Out[10]:
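In [ ]:
#Hedged sketch: the same groupby idea extended to average several of the
#renamed deposit columns per ZIP in one pass (deposits_zip_all is an
#illustrative name; the column names are the ones defined above).
deposits_zip_all = deposits_250K.groupby('zip5')[
    ['dep_amt_low', 'dep_amt_high', 'retirement_amt_low', 'retirement_amt_high']
].mean().reset_index()
deposits_zip_all.head()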
In [11]:
library = pd.read_csv(library_file)
In [12]:
#Earlier attempts: slice the Location field using a hard-coded state abbreviation,
#then substitute each row's ['STABR'] value for the state text so the ZIP code can be
#located dynamically within the address field.
#library_zip.apply(lambda x : x['Location'][0:15], axis=1)
#library_zip.apply(lambda x : x['Location'][x['Location'].find(', AK')+5:x['Location'].find(', AK')+10], axis=1)
#Strip the ZIP code from the Location column
#library_zip = library[['Location','STABR']]
#library_zip['zip'] = library_zip.apply(lambda x : x['Location'][x['Location'].find(', ' + x['STABR'])+5:x['Location'].find(', ' + x['STABR'])+10], axis=1)
In [160]:
#Parse out Zip Code from Location field
library['zip'] = library.apply(lambda x : x['Location'][x['Location'].rfind(', ' + x['STABR'])+5:x['Location'].rfind(', ' + x['STABR'])+10], axis =1)
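In [ ]:
#Hedged alternative to the slicing above: pull the 5-digit group that follows
#a two-letter state code with a regular expression. Assumes Location strings
#look like '..., AK 99501 ...'; zip_re is just an illustrative column name.
import re
def zip_from_location(loc):
    #first 5-digit run after a ', XX ' state abbreviation; None if no match
    m = re.search(r',\s*[A-Z]{2}\s+(\d{5})', loc)
    return m.group(1) if m else None
library['zip_re'] = library['Location'].apply(zip_from_location)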
In [161]:
library.head(2)
Out[161]:
In [9]:
#Change to your local path
library.to_csv("/home/russ/Documents/DDL/Data/JeffData/PCA/Library_ZipCode.csv")
In [162]:
library_zip = library['STABR'].groupby(library['zip']).count().reset_index()
In [163]:
library_zip['zip5'] = library_zip.apply(lambda x: int(x['zip']),axis=1)
In [166]:
library_zip.columns = ['zip','LibraryCount','zip5']
In [167]:
library_zip.head(10)
Out[167]:
In [13]:
combined = pd.merge(acs[['zip5','snap','inc_median','poverty']],deposits_zip[['zip5','dep_amt_high']], on='zip5',copy=False)
In [14]:
combined.head()
Out[14]:
In [15]:
combined[combined['zip5']==90210]
Out[15]:
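In [ ]:
#Hedged sketch: the same merge pattern extended with the library counts built
#above, so a LibraryCount feature could join the PCA. A left join leaves NaN
#for ZIPs with no library outlet, which would need to be filled (e.g. with 0)
#before standardizing. combined_alt is just an illustrative name.
combined_alt = pd.merge(combined, library_zip[['zip5','LibraryCount']],
                        on='zip5', how='left', copy=False)
combined_alt.head()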
In [16]:
#Beginning PCA Analysis (reference: http://sebastianraschka.com/Articles/2015_pca_in_3_steps.html)
#X holds the four numeric features (snap, inc_median, poverty, dep_amt_high); y holds the zip5 labels
X = combined.iloc[:, 1:5].values
y = combined.iloc[:, 0].values
In [17]:
y
Out[17]:
In [18]:
#Standardization
from sklearn.preprocessing import StandardScaler
X_std = StandardScaler().fit_transform(X)
In [19]:
X_std
Out[19]:
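In [ ]:
#Sanity check (hedged): StandardScaler should be equivalent to the usual
#column-wise z-score (x - mean) / std with the biased (ddof=0) standard deviation.
import numpy as np
X_manual = (X - X.mean(axis=0)) / X.std(axis=0)
print(np.allclose(X_std, X_manual))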
In [20]:
#Covariance Matrix
#Roughly speaking, the covariance of two features tells us how those random variables vary with respect to each other.
#For example, if two variables tend to increase (or decrease) together, the covariance is positive;
#the covariance is negative if one feature increases while the other decreases.
#However, we have to be careful not to draw any conclusions from the magnitudes of the covariance values
#at this point, since they are not normalized yet.
#
#Now, if we calculate the covariance between every feature pair, we will get the symmetric covariance matrix:
import numpy as np
mean_vec = np.mean(X_std, axis=0)
cov_mat = (X_std - mean_vec).T.dot((X_std - mean_vec)) / (X_std.shape[0]-1)
print 'Covariance matrix \n%s' %cov_mat
In [21]:
#Same As Above, simplified
print 'NumPy covariance matrix: \n%s' %np.cov(X_std.T)
In [22]:
#Generate eigendecomposition on the covariance matrix:
cov_mat = np.cov(X_std.T)
eig_vals, eig_vecs = np.linalg.eig(cov_mat)
print 'Eigenvectors \n%s' %eig_vecs
print '\nEigenvalues \n%s' %eig_vals
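In [ ]:
#Sanity check: each eigenpair should satisfy cov_mat . v = lambda * v
for i in range(len(eig_vals)):
    v = eig_vecs[:, i]
    np.testing.assert_array_almost_equal(cov_mat.dot(v), eig_vals[i] * v)
print('Eigenpairs check out.')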
In [23]:
#Correlation Matrix
cor_mat1 = np.corrcoef(X.T)
cor_mat1
Out[23]:
In [24]:
#Since the correlation matrix is a "normalized version" of the covariance matrix,
#it is not affected by standardization of the features, which is demonstrated below:
cor_mat2 = np.corrcoef(X_std.T)
cor_mat2
Out[24]:
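In [ ]:
#Quick confirmation of the statement above: the correlation matrix is unchanged
#by standardizing the features.
print(np.allclose(cor_mat1, cor_mat2))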
In [25]:
#Now, if we perform an eigendecomposition on the correlation matrix,
#we expect and observe the same results as for the eigendecomposition of
#the covariance matrix with standardized features earlier:
eig_vals, eig_vecs = np.linalg.eig(cor_mat1)
print 'Eigenvectors \n%s' %eig_vecs
print '\nEigenvalues \n%s' %eig_vals
In [26]:
#Singular Value Decomposition
#While the eigendecomposition of the covariance or correlation matrix may be more intuitive,
#most PCA implementations perform a Singular Value Decomposition (SVD) to improve computational
#efficiency. So, let us perform an SVD to confirm that the results are indeed the same:
u,s,v = np.linalg.svd(X_std.T)
u
Out[26]:
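In [ ]:
#Hedged check that the SVD agrees with the eigendecomposition of the covariance
#matrix: the squared singular values of X_std.T divided by (n - 1) reproduce the
#covariance eigenvalues (and the columns of u are the same principal directions,
#up to sign and ordering).
n_samples = X_std.shape[0]
print(np.allclose(np.sort(s**2 / (n_samples - 1)),
                  np.sort(np.linalg.eigvalsh(np.cov(X_std.T)))))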
In [27]:
#Sorting Eigenpairs
#The typical goal of a PCA is to reduce the dimensionality of the original
#feature space by projecting it onto a smaller subspace, where the eigenvectors
#will form the axes. However, the eigenvectors only define the directions of the new axes,
#since they all have the same unit length of 1, which can be confirmed by the following two lines of code:
for ev in eig_vecs.T:
    np.testing.assert_array_almost_equal(1.0, np.linalg.norm(ev))
print 'Everything ok!'
In [28]:
#In order to decide which eigenvector(s) can be dropped without losing too much
#information for the construction of the lower-dimensional subspace, we need
#to inspect the corresponding eigenvalues: the eigenvectors with the lowest
#eigenvalues bear the least information about the distribution of the data,
#and those are the ones that can be dropped.
#To do so, the common approach is to rank the eigenvalues from highest
#to lowest and choose the top k eigenvectors.
# Make a list of (eigenvalue, eigenvector) tuples
eig_pairs = [(np.abs(eig_vals[i]), eig_vecs[:,i]) for i in range(len(eig_vals))]
# Sort the (eigenvalue, eigenvector) tuples from high to low
eig_pairs.sort()
eig_pairs.reverse()
# Visually confirm that the list is correctly sorted by decreasing eigenvalues
print('Eigenvalues in descending order:')
for i in eig_pairs:
    print(i[0])
In [29]:
import plotly.plotly as py
from plotly.graph_objs import *
import plotly.tools as tls
tot = sum(eig_vals)
var_exp = [(i / tot)*100 for i in sorted(eig_vals, reverse=True)]
cum_var_exp = np.cumsum(var_exp)
trace1 = Bar(
    x=['PC %s' %i for i in range(1,5)],
    y=var_exp,
    showlegend=False)
trace2 = Scatter(
    x=['PC %s' %i for i in range(1,5)],
    y=cum_var_exp,
    name='cumulative explained variance')
data = Data([trace1, trace2])
layout = Layout(
    yaxis=YAxis(title='Explained variance in percent'),
    title='Explained variance by different principal components')
fig = Figure(data=data, layout=layout)
py.iplot(fig)
Out[29]:
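In [ ]:
#plotly.plotly requires online credentials; since %matplotlib inline is already
#loaded, this is a hedged local alternative for the same explained-variance plot.
import matplotlib.pyplot as plt
plt.figure(figsize=(6, 4))
plt.bar(range(1, 5), var_exp, alpha=0.5, align='center',
        label='individual explained variance')
plt.step(range(1, 5), cum_var_exp, where='mid',
         label='cumulative explained variance')
plt.xlabel('Principal components')
plt.ylabel('Explained variance in percent')
plt.legend(loc='best')
plt.show()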
In [30]:
#Projection Matrix
"""
It's about time to get to the really interesting part: the construction of
the projection matrix that will be used to transform our standardized features onto the new
feature subspace. Although the name "projection matrix" has a nice ring to it,
it is basically just a matrix of our concatenated top k eigenvectors.
Here, we are reducing the 4-dimensional feature space to a 2-dimensional feature subspace
by choosing the "top 2" eigenvectors with the highest eigenvalues to construct our d×k-dimensional
eigenvector matrix W.
"""
matrix_w = np.hstack((eig_pairs[0][1].reshape(4,1),
                      eig_pairs[1][1].reshape(4,1)))
print 'Matrix W:\n', matrix_w
In [31]:
#Projection Onto the New Feature Space
#In this last step we will use the 4×2-dimensional projection matrix W
#to transform our samples onto the new subspace via the equation Y=X×W,
#where Y is an n×2 matrix of our transformed samples (one row per ZIP code).
Y = X_std.dot(matrix_w)
Y
Out[31]:
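In [ ]:
#Sanity check: the (ddof=0) variance of each projected component should equal
#the eigenvalue of the eigenvector it was built from.
print(Y.var(axis=0))
print([pair[0] for pair in eig_pairs[:2]])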
In [36]:
traces = []
zip_list = combined['zip5'].values.tolist()
zip_list_short = zip_list[0:200]
#zip5 is stored as an integer, so append 90210 as an int (a string '90210' would never match y)
zip_list_short.append(90210)
for name in zip_list_short:
    trace = Scatter(
        x=Y[y==name,0],
        y=Y[y==name,1],
        mode='markers',
        name=name,
        marker=Marker(
            size=12,
            line=Line(
                color='rgba(217, 217, 217, 0.14)',
                width=0.5),
            opacity=0.8))
    traces.append(trace)
data = Data(traces)
#Axis titles go directly on the 2D layout (Scene is only for 3D plots)
layout = Layout(showlegend=True,
                xaxis=XAxis(title='PC1'),
                yaxis=YAxis(title='PC2'))
fig = Figure(data=data, layout=layout)
py.iplot(fig)
Out[36]:
In [37]:
#Same PCA analysis using scikit-learn
from sklearn.decomposition import PCA as sklearnPCA
sklearn_pca = sklearnPCA(n_components=2)
Y_sklearn = sklearn_pca.fit_transform(X_std)
traces = []
zip_list = combined['zip5'].values.tolist()
zip_list_short = zip_list[0:200]
#as above, append 90210 as an int so it matches the integer zip5 values in y
zip_list_short.append(90210)
for name in zip_list_short:
    trace = Scatter(
        x=Y_sklearn[y==name,0],
        y=Y_sklearn[y==name,1],
        mode='markers',
        name=name,
        marker=Marker(
            size=12,
            line=Line(
                color='rgba(217, 217, 217, 0.14)',
                width=0.5),
            opacity=0.8))
    traces.append(trace)
data = Data(traces)
layout = Layout(xaxis=XAxis(title='PC1', showline=False),
                yaxis=YAxis(title='PC2', showline=False))
fig = Figure(data=data, layout=layout)
py.iplot(fig)
Out[37]:
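In [ ]:
#Hedged comparison of the scikit-learn projection with the manual one: the
#scores should agree component-wise up to a sign flip, and the explained
#variance ratios should match the var_exp percentages computed earlier.
for j in range(2):
    match = (np.allclose(Y_sklearn[:, j], Y[:, j]) or
             np.allclose(Y_sklearn[:, j], -Y[:, j]))
    print('PC%d matches the manual projection (up to sign): %s' % (j + 1, match))
print(sklearn_pca.explained_variance_ratio_ * 100)
print(var_exp[:2])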
In [ ]: