In [2]:
%matplotlib inline
import pandas as pd
import json
In [5]:
#File locations -- every input lives under one data directory; edit DATA_DIR to relocate.
DATA_DIR = "/home/russ/Documents/DDL/Data/JeffData/PCA"
acs_file = DATA_DIR + "/Jeff_data_acs5yr.csv"
zillow_HVI_file = DATA_DIR + "/Zip_Zhvi_AllHomes_HomeValueIndex.csv"
zillow_RI_file = DATA_DIR + "/Zip_Zri_AllHomes_RentIndex.csv"
FDIC_deposits_100K_file = DATA_DIR + "/FDIC_All_Reports_20150630/All_Reports_20150630_Deposits Based on the $100,000 Reporting Threshold.csv"
FDIC_deposits_250K_file = DATA_DIR + "/FDIC_All_Reports_20150630/All_Reports_20150630_Deposits Based on the $250,000 Reporting Threshold.csv"
library_file = DATA_DIR + "/Public_Libraries_Survey_FY_2013_-_Outlet.csv"
complaints_file = DATA_DIR + "/Consumer_Complaints.csv"
urbanization_zip = DATA_DIR + "/zcta2010_txt.csv"
In [6]:
# Load the 5-year ACS extract; downstream cells treat it as one row per zip code.
acs = pd.read_csv(acs_file)
In [7]:
acs.head()
Out[7]:
In [8]:
# Deep copy so the derived diversity columns never mutate the original acs frame.
diversity = acs[['zip5','pop','race_white','race_black','race_asian','race_indian','race_other','hisp']].copy(deep=True)
In [9]:
# Hispanic-white share. The pop terms cancel algebraically, so this equals
# race_white*hisp -- except rows with pop == 0 become NaN (0/0).
# Assumes race_* and hisp are population fractions rather than counts -- TODO confirm.
diversity['white_hisp'] = ((diversity['pop']*diversity['race_white'])*diversity['hisp'])/diversity['pop']
In [10]:
# Non-Hispanic-white share, same construction as above.
diversity['white_nonhisp'] = ((diversity['pop']*diversity['race_white'])*(1-diversity['hisp']))/diversity['pop']
In [11]:
# Simpson/Herfindahl-style diversity index: 1 minus the sum of squared group shares.
# NOTE(review): race_other is selected above but not included in this sum --
# confirm whether that omission is intentional.
diversity['div_index'] = 1- (diversity['race_black']**2 + diversity['white_hisp']**2 + diversity['white_nonhisp']**2 + diversity['race_asian']**2 + diversity['race_indian']**2)
In [12]:
# Spot-check a single zip code.
diversity[diversity['zip5']==21093]
Out[12]:
Out[12]:
In [13]:
# Zillow Home Value Index: one row per zip with monthly 'YYYY-MM' value columns.
zillow_HVI = pd.read_csv(zillow_HVI_file)
In [14]:
# Zillow Rent Index, same layout as the HVI file.
zillow_RI = pd.read_csv(zillow_RI_file)
In [15]:
# 2010 ZCTA file with POPULATION, ZCTA5 and land-area fields.
urban = pd.read_csv(urbanization_zip)
In [16]:
#del urban['pop']
# Inspect dtypes before the POPULATION / ZCTA5 conversions below.
urban.dtypes
Out[16]:
Out[16]:
In [17]:
#urban['pop'] = urban.apply(lambda x: int(x['POPULATION'].replace(',','')),axis=1)
#alternate
#urban['pop'] = urban['POPULATION'].apply(lambda x: int(x.replace(',','')))
urban['pop'] = urban.apply(lambda x: int(x['POPULATION']),axis=1)
In [18]:
#strip Z from Zip Code Text
urban['ZCTA5'] = urban.apply(lambda x: x['ZCTA5'][1:],axis=1)
In [19]:
urban.head()
Out[19]:
In [20]:
# Land area per person: larger values mean more land per resident (more rural).
# NOTE(review): rows with pop == 0 produce inf here; units of LANDSQMT are not
# shown in this file -- verify against the ZCTA source documentation.
urban['urban_index'] = urban['LANDSQMT']/urban['pop']
In [21]:
# Keep the zip identifier plus the semiannual (January/July) home-value columns
# from 1996-07 through 2015-07, generated instead of listed out by hand.
hvi_cols = ['RegionName'] + [
    '%d-%s' % (year, month)
    for year in range(1996, 2016)
    for month in ('01', '07')
    if not (year == 1996 and month == '01')   # series starts at 1996-07
]
zillow_HVI = zillow_HVI[hvi_cols]
In [22]:
# Align the zip-code column name with the other data sets (assignment form,
# no in-place mutation).
zillow_HVI = zillow_HVI.rename(columns={'RegionName': 'zip5'})
zillow_HVI.head()
Out[22]:
In [23]:
# Row count of the trimmed home-value frame.
len(zillow_HVI)
Out[23]:
In [24]:
# Peek at the rent-index layout before trimming its columns.
zillow_RI.head(1)
Out[24]:
In [25]:
# Keep the zip identifier plus the semiannual rent-index columns (the rent
# series only goes back to 2011). Shallow copy, as in the original.
ri_cols = ['RegionName',
           '2011-01', '2011-07', '2012-01', '2012-07', '2013-01',
           '2013-07', '2014-01', '2014-07', '2015-01', '2015-07']
zillow_RI = zillow_RI[ri_cols].copy(deep=False)
In [26]:
# Align the rent-index zip column name with the other data sets.
zillow_RI.rename(columns={'RegionName':'zip5'},inplace=True)
In [27]:
zillow_RI.head()
Out[27]:
In [28]:
# FDIC deposit report, June 2015, $250K reporting threshold.
deposits_250K = pd.read_csv(FDIC_deposits_250K_file)
In [29]:
#deposits_250K = deposits_250K[['zip'],['IDdepsmb'],['DEPSMRA'],['DEPSMRN'],['NTRCDSMJ'],['IDdeplam'],['IDdeplgb'],['DEPLGRA'],['DEPLGRN'],['NTRTMLGJ']]
# Keep zip plus the deposit-code columns. The FDIC field codes presumably map to
# amount/count/retirement/time-deposit figures below ("small") and above
# ("large") the threshold -- verify against the FDIC data dictionary.
deposits_250K = deposits_250K[['zip','IDdepsam','IDdepsmb','DEPSMRA','DEPSMRN','NTRCDSMJ','IDdeplam','IDdeplgb','DEPLGRA','DEPLGRN','NTRTMLGJ']]
In [30]:
# Position-based rename to readable names; order must match the selection above.
deposits_250K.columns = ['zip5','dep_amt_low','dep_count_low','retirement_amt_low','retirement_count_low','time_deposits_low','dep_amt_high','dep_count_high','retirement_amt_high','retirement_count_high','time_deposits_high']
In [31]:
deposits_250K.head()
Out[31]:
In [32]:
# Mean high-bucket deposit amount per zip code (grouping on the frame rather
# than Series-by-Series; the result has columns ['zip5', 'dep_amt_high']).
deposits_zip = deposits_250K.groupby('zip5')['dep_amt_high'].mean().reset_index()
In [33]:
deposits_zip.head()
Out[33]:
In [34]:
# IMLS public-library outlet survey (FY2013); one row per library outlet.
library = pd.read_csv(library_file)
In [35]:
#Slice field value based on hard coded State
#Next step to substitute ['STABR'] for State text to dynamically find location of each state within address field
#library_zip.apply(lambda x : x['Location'][0:15], axis =1)
#library_zip.apply(lambda x : x['Location'][x['Location'].find(', AK')+5:x['Location'].find(', AK')+10], axis =1)
#Strip Zip Code From Location Column
#library_zip = library[['Location','STABR']]
#library_zip['zip'] = library_zip.apply(lambda x : x['Location'][x['Location'].find(', ' + x['STABR'])+5:x['Location'].find(', ' + x['STABR'])+10], axis =1)
In [36]:
#Parse out Zip Code from Location field
# Takes the 5 characters following the LAST occurrence of ', <STATE>' in the
# Location string (rfind skips earlier occurrences of the state code that may
# appear inside the street address).
# NOTE(review): if ', <STATE>' is absent, rfind returns -1 and this silently
# slices characters 4:9 of the address instead -- confirm every Location matches.
library['zip'] = library.apply(lambda x : x['Location'][x['Location'].rfind(', ' + x['STABR'])+5:x['Location'].rfind(', ' + x['STABR'])+10], axis =1)
In [37]:
library.head(2)
Out[37]:
In [38]:
#Change to your local path
library.to_csv("/home/russ/Documents/DDL/Data/JeffData/PCA/Library_ZipCode.csv")
In [39]:
# Outlets per zip: count rows (via the always-present STABR column) for each
# parsed zip string. Result columns: ['zip', 'STABR'].
library_zip = library.groupby('zip')['STABR'].count().reset_index()
In [40]:
library_zip['zip5'] = library_zip.apply(lambda x: int(x['zip']),axis=1)
In [41]:
# Position-based rename: zip (string), LibraryCount (outlets per zip),
# zip5 (integer join key).
library_zip.columns = ['zip','LibraryCount','zip5']
In [42]:
library_zip.head(1)
Out[42]:
In [43]:
# Bug fix: `zillow` was never defined in this notebook -- the frame with the
# '2015-07' home-value column is zillow_HVI (the same selection the
# zillow_combined cell uses).
combined = pd.merge(acs[['zip5','snap','inc_median','poverty']],zillow_HVI[['zip5','2015-07']], on='zip5',copy=False)
In [ ]:
combined.head()
In [ ]:
combined[combined['zip5']==90210]
In [ ]:
# Inner-join home values and rents on zip. Both inputs contribute a '2015-07'
# column, so the merge suffixes them (_x from HVI, _y from RI) before the
# positional rename below.
zillow_combined = pd.merge(zillow_HVI[['zip5','2015-07']],zillow_RI[['zip5','2015-07']], on='zip5',copy=False)
In [ ]:
# Position-based rename of the suffixed value columns to HVI / RI.
zillow_combined.columns = ['zip5','HVI','RI']
In [ ]:
#Beginning PCA Analysis (reference: http://sebastianraschka.com/Articles/2015_pca_in_3_steps.html)
X = combined.ix[:,1:5].values
y = combined.ix[:,0].values
In [44]:
#Standardization
# Scale each feature to zero mean / unit variance so no single feature's scale
# dominates the principal components.
from sklearn.preprocessing import StandardScaler
X_std = StandardScaler().fit_transform(X)
In [ ]:
X_std
In [ ]:
#Same PCA analysis using scikit-learn
from sklearn.decomposition import PCA as sklearnPCA
# Project the standardized features onto the first two principal components.
sklearn_pca = sklearnPCA(n_components=2)
Y_sklearn = sklearn_pca.fit_transform(X_std)
In [ ]:
Y_sklearn
In [ ]:
# Two PCA coordinates per row, positionally aligned with `combined`.
df = pd.DataFrame({'X':Y_sklearn[:,0],'Y':Y_sklearn[:,1]})
In [72]:
# Attach zip5 by row index (valid because df rows are positionally aligned
# with `combined` from the PCA projection above).
df = combined[['zip5']].merge(df,left_index=True, right_index=True)
In [73]:
df[df['zip5']==90210]
Out[73]:
In [74]:
#Top of arc = High Housing, low Income
#Negative = High Housing, High Income
#Positive = Low Housing, Low Income
df.plot(kind='scatter',x='X',y='Y')
Out[74]:
In [77]:
# Inspect outliers in PCA space (extreme PC2, extreme PC1, points near the
# origin), then look up the raw features for a few zips of interest.
df[df['Y']<-10]
Out[77]:
In [79]:
df[df['X']>8]
Out[79]:
In [94]:
df[(df['X']>-.5) & (df['X']<.5) & (df['Y']<0)]
Out[94]:
In [80]:
df[df['Y']>0]
Out[80]:
In [84]:
combined[combined['zip5']== 28202]
Out[84]:
In [83]:
combined[combined['zip5']== 43240]
Out[83]:
In [82]:
combined[combined['zip5']== 90210]
Out[82]:
In [81]:
combined[combined['zip5']== 99901]
Out[81]:
In [100]:
combined[combined['zip5']== 58420]
Out[100]:
In [85]:
df[df['zip5']== 90210]
Out[85]:
Out[85]:
In [37]:
traces = []
# Renamed from `list`, which shadowed the Python builtin of the same name.
zip_codes = combined['zip5'].values.tolist()
zip_subset = zip_codes[0:200]
# NOTE(review): this appends the STRING '90210' while zip5 values appear to be
# numeric, so the y == name comparison below will never match it -- confirm the
# intended type before relying on this trace.
zip_subset.append('90210')
# One scatter trace per zip code so each zip gets its own legend entry/color.
# (Scatter/Marker/Line/Data/Layout/XAxis/YAxis/Figure and `py` are presumably
# plotly graph objects imported in a cell not shown here -- verify.)
for name in zip_subset:
    trace = Scatter(
        x=Y_sklearn[y == name, 0],
        y=Y_sklearn[y == name, 1],
        mode='markers',
        name=name,
        marker=Marker(
            size=12,
            line=Line(
                color='rgba(217, 217, 217, 0.14)',
                width=0.5),
            opacity=0.8))
    traces.append(trace)
data = Data(traces)
layout = Layout(xaxis=XAxis(title='PC1', showline=False),
                yaxis=YAxis(title='PC2', showline=False))
fig = Figure(data=data, layout=layout)
py.iplot(fig)
Out[37]:
In [38]:
#Beginning PCA Analysis (reference: http://sebastianraschka.com/Articles/2015_pca_in_3_steps.html)
X = zillow_combined.ix[:,1:3].values
y = zillow_combined.ix[:,0].values
In [40]:
#Standardization
# Repeat the standardize + 2-component PCA pipeline for the HVI/RI features.
from sklearn.preprocessing import StandardScaler
X_std = StandardScaler().fit_transform(X)
#Same PCA analysis using scikit-learn
from sklearn.decomposition import PCA as sklearnPCA
sklearn_pca = sklearnPCA(n_components=2)
Y_sklearn = sklearn_pca.fit_transform(X_std)
In [ ]:
# Fraction of total variance captured by each principal component.
expl_var = sklearn_pca.explained_variance_ratio_
In [64]:
ev = expl_var.tolist()
# Labels start at PC0 because the range begins at 0 -- conventionally these
# would be PC1, PC2.
ls = ['PC%s' %i for i in range(0,len(ev))]
df_explainedValue = pd.DataFrame(ev,columns=['Value'],index=ls)
In [82]:
Out[82]:
In [83]:
# Bar chart of explained variance per component.
df_explainedValue.plot(kind='bar')
Out[83]:
In [43]:
# NOTE(review): `df` here is whatever an earlier cell left behind (the execution
# counts are non-sequential) and is joined purely by row index -- confirm its
# rows actually align with zillow_combined before trusting this merge.
df = zillow_combined[['zip5']].merge(df,left_index=True, right_index=True)
In [44]:
df.plot(kind='scatter',x='X',y='Y')
Out[44]:
In [84]:
# Income-related ACS features; shallow copy of the selected columns
# (explicit deep=False rather than the positional False).
income_combined = acs[['zip5', 'snap', 'inc_median', 'poverty']].copy(deep=False)
In [105]:
#Beginning PCA Analysis (reference: http://sebastianraschka.com/Articles/2015_pca_in_3_steps.html)
# .ix was deprecated and removed from pandas; .iloc gives the same
# position-based slicing used here.
X = income_combined.iloc[:,1:4].values   # feature matrix: snap, inc_median, poverty
y = income_combined.iloc[:,0].values     # zip5 label for each row
#Standardization
from sklearn.preprocessing import StandardScaler
X_std = StandardScaler().fit_transform(X)
#Same PCA analysis using scikit-learn
from sklearn.decomposition import PCA as sklearnPCA
# NOTE(review): n_components=1 means Y_sklearn has a single column -- any cell
# that still reads Y_sklearn[:,1] will raise IndexError; confirm intent.
sklearn_pca = sklearnPCA(n_components=1)
Y_sklearn = sklearn_pca.fit_transform(X_std)
# Fraction of total variance captured by the single retained component.
expl_var = sklearn_pca.explained_variance_ratio_
ev = expl_var.tolist()
ls = ['PC%s' %i for i in range(0,len(ev))]   # labels start at PC0
df_explainedValue = pd.DataFrame(ev,columns=['Value'],index=ls)
In [106]:
# Single-component fit, so this is a one-element ratio array.
sklearn_pca.explained_variance_ratio_
Out[106]:
In [107]:
df_explainedValue.plot(kind='bar')
Out[107]:
In [108]:
# NOTE(review): the PCA above was fit with n_components=1, so Y_sklearn has a
# single column and Y_sklearn[:,1] raises IndexError on a fresh run -- this cell
# (no execution count survives) looks stale, superseded by the Income_Level
# cell below; confirm before re-running.
df = pd.DataFrame({'X':Y_sklearn[:,0],'Y':Y_sklearn[:,1]})
df = income_combined[['zip5']].merge(df,left_index=True, right_index=True)
df.plot(kind='scatter',x='X',y='Y')
In [110]:
# Single PCA coordinate per row, labelled Income_Level and joined to zip5 by
# row index (positionally aligned with income_combined).
df = pd.DataFrame({'Income_Level':Y_sklearn[:,0]})
df = income_combined[['zip5']].merge(df,left_index=True, right_index=True)
# Bug fix: the frame has no 'X' column -- the PCA coordinate was named
# 'Income_Level' above, so plot that against zip5.
df.plot(kind='scatter',x='zip5',y='Income_Level')
Out[110]:
In [112]:
df.sort(['Income_Level'], ascending=[False])
Out[112]:
In [114]:
df[df['zip5']== 10001]
Out[114]:
In [75]:
# Rename the zip-code column to match the map cells' 'zip' join key
# (assignment form instead of in-place mutation).
urban = urban.rename(columns={'ZCTA5': 'zip'})
In [46]:
# Choropleth of zip-level POPULATION over a US state/zip topojson base, drawn
# with the vincent (Vega) library.
import vincent
from vincent import AxisProperties, PropertySet, ValueRef
vincent.core.initialize_notebook()
# Topojson sources, expected in the notebook's working directory.
zip_topo = r'zips_us_topo.json'
state_topo = r'us_states.topo.json'
# Alternate layer ordering (zips under states) -- defined but unused below.
geo_data2 = [{'name': 'zip_codes',
'url': zip_topo,
'feature': 'zip_codes_for_the_usa'},
{'name': 'states',
'url': state_topo,
'feature': 'us_states.geo'}]
# Layer order actually used: states first (marks[0]), zip codes on top (marks[1]).
geo_data = [{'name': 'states',
'url': state_topo,
'feature': 'us_states.geo'},
{'name': 'zip_codes',
'url': zip_topo,
'feature': 'zip_codes_for_the_usa'}]
# Bind POPULATION keyed on the 'zip' column (requires the ZCTA5 -> zip rename above).
vis = vincent.Map(data=urban, geo_data=geo_data, scale=1100, projection='albersUsa',
data_bind='POPULATION', data_key='zip',brew='PuRd',
map_key={'zip_codes': 'properties.zip'})
# Drop the update properties on the state layer -- presumably so states keep
# their enter styling; verify against vincent.Map's mark defaults.
del vis.marks[0].properties.update
#del vis.marks[1].properties.update
#vis.marks[1].properties.update.fill.value = '#FFFFFF'
#vis.marks[1].properties.enter.stroke.value = '#CCCCFF'
# Faint zip borders, red state outlines, highlight fill on hover.
vis.marks[1].properties.enter.stroke_opacity = ValueRef(value=0.1)
vis.marks[0].properties.enter.stroke.value = '#FF0000'
vis.marks[1].properties.enter.hover.fill.value = '#f84525'
vis.legend(title='POPULATION')
vis.display()
In [122]:
# Same map as above, but the bound data is filtered to a single zip code
# ('83211') -- effectively a highlight map for that one zip.
zip_topo = r'zips_us_topo.json'
state_topo = r'us_states.topo.json'
# Alternate layer ordering -- defined but unused below.
geo_data2 = [{'name': 'zip_codes',
'url': zip_topo,
'feature': 'zip_codes_for_the_usa'},
{'name': 'states',
'url': state_topo,
'feature': 'us_states.geo'}]
geo_data = [{'name': 'states',
'url': state_topo,
'feature': 'us_states.geo'},
{'name': 'zip_codes',
'url': zip_topo,
'feature': 'zip_codes_for_the_usa'}]
# NOTE(review): binding POPULATION for one zip leaves a single-value color
# scale; the fixed update fill below is what actually colors the zip.
vis = vincent.Map(data=urban[urban['zip']== '83211'], geo_data=geo_data, scale=1100, projection='albersUsa',
data_bind='POPULATION', data_key='zip',brew='PuRd',
map_key={'zip_codes': 'properties.zip'})
# Drop the state layer's update properties (as in the map above).
del vis.marks[0].properties.update
#del vis.marks[1].properties.update
# Fixed fill for the zip layer's update state.
vis.marks[1].properties.update.fill.value = '#C390D4'
#vis.marks[1].properties.enter.stroke.value = '#CCCCFF'
#vis.marks[1].properties.enter.stroke_opacity = ValueRef(value=0.1)
vis.marks[0].properties.enter.stroke.value = '#FF0000'
vis.legend(title='POPULATION')
vis.display()
In [ ]: