In [1]:
%matplotlib inline
import pandas as pd
import json

In [2]:
#File locations
acs_file = "/home/russ/Documents/DDL/Data/JeffData/PCA/Jeff_data_acs5yr.csv"
zillow_HVI_file = "/home/russ/Documents/DDL/Data/JeffData/PCA/Zip_Zhvi_AllHomes_HomeValueIndex.csv"
zillow_RI_file = "/home/russ/Documents/DDL/Data/JeffData/PCA/Zip_Zri_AllHomes_RentIndex.csv"
FDIC_deposits_100K_file = "/home/russ/Documents/DDL/Data/JeffData/PCA/FDIC_All_Reports_20150630/All_Reports_20150630_Deposits Based on the $100,000 Reporting Threshold.csv"
FDIC_deposits_250K_file = "/home/russ/Documents/DDL/Data/JeffData/PCA/FDIC_All_Reports_20150630/All_Reports_20150630_Deposits Based on the $250,000 Reporting Threshold.csv"
library_file = "/home/russ/Documents/DDL/Data/JeffData/PCA/Public_Libraries_Survey_FY_2013_-_Outlet.csv"
complaints_file = "/home/russ/Documents/DDL/Data/JeffData/PCA/Consumer_Complaints.csv"
urbanization_zip = "/home/russ/Documents/DDL/Data/JeffData/PCA/zcta2010_txt.csv"

In [3]:
acs = pd.read_csv(acs_file)

In [4]:
acs.head()


Out[4]:
zip5 pop race_white race_black race_indian race_asian race_pac_is race_other race_multi hisp ... marital_yes30to34 marital_yes35to39 marital_yes40to44 marital_yes45to49 marital_yes50to54 marital_yes55to59 marital_yes60to64 marital_yes65to74 marital_yes75to84 marital_yes85p
0 601 18450 0.946179 0.007805 0.000217 0.000000 0 0.042005 0.007588 0.994580 ... 0.038743 0.030818 0.049919 0.051477 0.053644 0.038607 0.049038 0.068274 0.029193 0.005690
1 602 41302 0.635611 0.035010 0.000654 0.001719 0 0.030676 0.592659 0.935838 ... 0.039094 0.037999 0.051465 0.049600 0.058420 0.054276 0.053892 0.067683 0.025244 0.002545
2 603 53683 0.775217 0.032953 0.000335 0.008345 0 0.030643 0.305013 0.958609 ... 0.037783 0.037464 0.033750 0.039014 0.042386 0.046215 0.056538 0.077891 0.031152 0.003168
3 606 6591 0.953421 0.011227 0.000000 0.000000 0 0.022151 0.026400 0.998635 ... 0.004145 0.024496 0.032410 0.029772 0.049934 0.034483 0.061428 0.052007 0.012436 0.003580
4 610 28963 0.740013 0.029141 0.000000 0.000725 0 0.167041 0.126161 0.992542 ... 0.032814 0.031205 0.067025 0.049411 0.052079 0.041282 0.048353 0.078161 0.026590 0.006563

5 rows × 77 columns


In [5]:
diversity = acs[['zip5','pop','race_white','race_black','race_asian','race_indian','race_other','hisp']].copy(deep=True)

In [6]:
diversity['white_hisp'] = ((diversity['pop']*diversity['race_white'])*diversity['hisp'])/diversity['pop']

In [7]:
diversity['white_nonhisp'] = ((diversity['pop']*diversity['race_white'])*(1-diversity['hisp']))/diversity['pop']

In [8]:
diversity['div_index'] = 1- (diversity['race_black']**2 + diversity['white_hisp']**2 + diversity['white_nonhisp']**2 + diversity['race_asian']**2 + diversity['race_indian']**2)

In [9]:
zillow_HVI = pd.read_csv(zillow_HVI_file)

In [10]:
zillow_RI = pd.read_csv(zillow_RI_file)

In [11]:
urban = pd.read_csv(urbanization_zip)

In [12]:
#urban['pop'] = urban.apply(lambda x: int(x['POPULATION'].replace(',','')),axis=1)
#alternate
#urban['pop'] = urban['POPULATION'].apply(lambda x: int(x.replace(',','')))
urban['pop'] = urban.apply(lambda x: int(x['POPULATION']),axis=1)

In [13]:
#strip Z from Zip Code Text
urban['ZCTA5'] = urban.apply(lambda x: x['ZCTA5'][1:],axis=1)

In [14]:
urban.head()


Out[14]:
ZCTA5 Zip5 LANDSQMT WATERSQMT LANDSQMI WATERSQMI POPULATION HSGUNITS INTPTLAT INTPTLON pop
0 00601 601 166659789 799296 64.35 0.31 18570 7744 18.180556 -66.749961 18570
1 00602 602 79288158 4446273 30.61 1.72 41520 18073 18.362268 -67.176130 41520
2 00603 603 81880442 183425 31.61 0.07 54689 25653 18.455183 -67.119887 54689
3 00606 606 109580061 12487 42.31 0.00 6615 2877 18.158345 -66.932911 6615
4 00610 610 93021467 4172001 35.92 1.61 29016 12618 18.290955 -67.125868 29016

In [15]:
urban['urban_index'] = urban['LANDSQMT']/urban['pop']

In [16]:
zillow_HVI = zillow_HVI[['RegionName','1996-07','1997-01','1997-07','1998-01','1998-07','1999-01','1999-07','2000-01','2000-07'\
    ,'2001-01','2001-07','2002-01','2002-07','2003-01','2003-07','2004-01','2004-07','2005-01','2005-07','2006-01','2006-07'\
    ,'2007-01','2007-07','2008-01','2008-07','2009-01','2009-07','2010-01','2010-07','2011-01','2011-07','2012-01','2012-07'\
    ,'2013-01','2013-07','2014-01','2014-07','2015-01','2015-07']]

In [17]:
zillow_HVI.rename(columns={'RegionName':'zip5'},inplace=True)
zillow_HVI.head()


Out[17]:
zip5 1996-07 1997-01 1997-07 1998-01 1998-07 1999-01 1999-07 2000-01 2000-07 ... 2011-01 2011-07 2012-01 2012-07 2013-01 2013-07 2014-01 2014-07 2015-01 2015-07
0 10025 NaN NaN NaN NaN NaN NaN NaN NaN NaN ... 758600 751200 737500 772400 825700 908100 998300 1001700 1029600 1097100
1 60657 141900 144600 154500 161400 165300 179000 188700 201900 220300 ... 288300 274600 258900 264000 267900 279900 294700 297000 304500 307000
2 60614 193200 197200 199200 200800 216500 228400 244900 261900 276700 ... 351800 348500 332900 332900 335700 354000 386800 382300 378900 393000
3 79936 71100 72600 74000 76000 76200 75600 77100 78400 78900 ... 110100 109200 108700 109000 109900 111300 109600 111900 111600 112100
4 10002 NaN NaN NaN NaN NaN NaN NaN NaN NaN ... 704800 688900 696000 700000 730300 791900 849400 863900 903200 951600

5 rows × 40 columns


In [18]:
len(zillow_HVI)


Out[18]:
12988

In [19]:
zillow_RI.head(1)


Out[19]:
RegionName City State Metro CountyName 2010-11 2010-12 2011-01 2011-02 2011-03 ... 2014-10 2014-11 2014-12 2015-01 2015-02 2015-03 2015-04 2015-05 2015-06 2015-07
0 10025 New York NY New York New York NaN NaN NaN NaN NaN ... 3511 3488 3500 3520 3557 3588 3585 3586 3581 3599

1 rows × 62 columns


In [20]:
zillow_RI = zillow_RI[['RegionName','2011-01','2011-07','2012-01','2012-07'\
    ,'2013-01','2013-07','2014-01','2014-07','2015-01','2015-07']].copy(False)

In [21]:
zillow_RI.rename(columns={'RegionName':'zip5'},inplace=True)

In [22]:
zillow_RI.head()


Out[22]:
zip5 2011-01 2011-07 2012-01 2012-07 2013-01 2013-07 2014-01 2014-07 2015-01 2015-07
0 10025 NaN NaN 3008 3394 3538 3485 3477 3484 3520 3599
1 60657 1453 1243 1698 2053 1957 1917 2005 2142 1969 2050
2 10023 NaN NaN 3911 4118 3960 4180 4251 4312 4370 4620
3 60614 1482 1379 1920 2184 2171 2223 2199 2343 2232 2361
4 79936 1003 1038 1088 1079 1031 996 1029 1029 1022 1057

In [23]:
deposits_250K = pd.read_csv(FDIC_deposits_250K_file)

In [24]:
#deposits_250K = deposits_250K[['zip'],['IDdepsmb'],['DEPSMRA'],['DEPSMRN'],['NTRCDSMJ'],['IDdeplam'],['IDdeplgb'],['DEPLGRA'],['DEPLGRN'],['NTRTMLGJ']]
deposits_250K = deposits_250K[['zip','IDdepsam','IDdepsmb','DEPSMRA','DEPSMRN','NTRCDSMJ','IDdeplam','IDdeplgb','DEPLGRA','DEPLGRN','NTRTMLGJ']]

In [25]:
deposits_250K.columns = ['zip5','dep_amt_low','dep_count_low','retirement_amt_low','retirement_count_low','time_deposits_low','dep_amt_high','dep_count_high','retirement_amt_high','retirement_count_high','time_deposits_high']

In [26]:
deposits_250K.head()


Out[26]:
zip5 dep_amt_low dep_count_low retirement_amt_low retirement_count_low time_deposits_low dep_amt_high dep_count_high retirement_amt_high retirement_count_high time_deposits_high
0 21613 105111 9871 11265 712 40000 49156 70 781 2 13062
1 21202 0 0 0 0 0 500 1 0 0 500
2 63376 42888 1250 1552 84 28590 21793 32 0 0 4088
3 59317 30205 2340 743 57 7043 11090 30 0 0 0
4 74728 82686 8762 6146 208 26712 28517 37 360 1 1787

In [27]:
deposits_zip = deposits_250K['dep_amt_high'].groupby(deposits_250K['zip5']).mean().reset_index()

In [28]:
deposits_zip.head()


Out[28]:
zip5 dep_amt_high
0 802 67941
1 820 36914
2 909 2542958
3 917 2690626
4 918 1567600

In [29]:
library = pd.read_csv(library_file)

In [30]:
#Slice field value based on hard coded State
#Next step to substitute ['STABR'] for State text to dynamically find location of each state within address field
#library_zip.apply(lambda x : x['Location'][0:15], axis =1)
#library_zip.apply(lambda x : x['Location'][x['Location'].find(', AK')+5:x['Location'].find(', AK')+10], axis =1)
#Strip Zip Code From Location Column
#library_zip = library[['Location','STABR']]
#library_zip['zip'] = library_zip.apply(lambda x : x['Location'][x['Location'].find(', ' + x['STABR'])+5:x['Location'].find(', ' + x['STABR'])+10], axis =1)

In [31]:
#Parse out Zip Code from Location field
library['zip'] = library.apply(lambda x : x['Location'][x['Location'].rfind(', ' + x['STABR'])+5:x['Location'].rfind(', ' + x['STABR'])+10], axis =1)

In [32]:
library.head(2)


Out[32]:
STABR FSCSKEY FSCS_SEQ LIBID LIBNAME CNTY PHONE C_OUT_TY C_MSA SQ_FEET ... CENTRACT CENBLOCK CDCODE CBSA MICROF GAL GALMS POSTMS Location zip
0 AK AK0001 2 AK0001-002 ANCHOR POINT PUBLIC LIBRARY KENAI PENINSULA 9072355692 CE NO 1287 ... 8.00 3014 200 0 0 house STD NND 72551 MILO FRITZ AVENUE\nANCHOR POINT, AK 9955... 99556
1 AK AK0002 7 AK0002-007 CHUGIAK/EAGLE RIVER NEIGHBORHOOD LIBRARY ANCHORAGE 9073431530 BR CC 17888 ... 2.01 2021 200 11260 0 addresspoint STD NND 12001 BUSINESS BOULEVARD #176\nEAGLE RIVER, AK... 99577

2 rows × 37 columns


In [33]:
#Change to your local path
library.to_csv("/home/russ/Documents/DDL/Data/JeffData/PCA/Library_ZipCode.csv")

In [34]:
library_zip = library['STABR'].groupby(library['zip']).count().reset_index()

In [35]:
library_zip['zip5'] = library_zip.apply(lambda x: int(x['zip']),axis=1)

In [36]:
library_zip.columns = ['zip','LibraryCount','zip5']

In [37]:
library_zip.head(1)


Out[37]:
zip LibraryCount zip5
0 00602 1 602

In [38]:
combined = pd.merge(acs[['zip5','snap','inc_median','poverty']],zillow_HVI[['zip5','2015-07']], on='zip5',copy=False)

In [39]:
combined.head()


Out[39]:
zip5 snap inc_median poverty 2015-07
0 1001 0.089002 58733 0.071132 180800
1 1002 0.086397 54422 0.392859 306400
2 1005 0.029427 68644 0.037358 188900
3 1007 0.088479 71875 0.090516 254300
4 1008 0.043956 71635 0.013986 218600

In [40]:
combined[combined['zip5']==90210]


Out[40]:
zip5 snap inc_median poverty 2015-07
11183 90210 0.008544 132254 0.077309 4635700

In [41]:
zillow_combined = pd.merge(zillow_HVI[['zip5','2015-07']],zillow_RI[['zip5','2015-07']], on='zip5',copy=False)

In [42]:
zillow_combined.columns = ['zip5','HVI','RI']

In [43]:
#Beginning PCA Analysis (reference: http://sebastianraschka.com/Articles/2015_pca_in_3_steps.html)

X = combined.ix[:,1:5].values
y = combined.ix[:,0].values

In [44]:
#Standardization
from sklearn.preprocessing import StandardScaler
X_std = StandardScaler().fit_transform(X)

In [45]:
X_std


Out[45]:
array([[-0.23503473, -0.09204204, -0.49427454, -0.26963345],
       [-0.26560554, -0.26560066,  2.92775955,  0.26227518],
       [-0.93412112,  0.30696967, -0.85351083, -0.23533043],
       ..., 
       [-0.98195932,  0.9688758 , -0.92370737, -0.05068699],
       [-0.31585168,  0.88465296, -0.70153587,  0.33553967],
       [ 0.31176179,  0.06038035, -0.28919535, -0.0350177 ]])

In [46]:
#Same PCA analysis using scikit-learn

from sklearn.decomposition import PCA as sklearnPCA
sklearn_pca = sklearnPCA(n_components=2)
Y_sklearn = sklearn_pca.fit_transform(X_std)

In [47]:
Y_sklearn


Out[47]:
array([[-0.20961048,  0.55531939],
       [ 1.32597703, -1.68874645],
       [-0.99892206,  0.83740701],
       ..., 
       [-1.49940979,  0.63459819],
       [-1.13784378,  0.05522838],
       [ 0.00880996,  0.09525603]])

In [48]:
df = pd.DataFrame({'X':Y_sklearn[:,0],'Y':Y_sklearn[:,1]})

In [49]:
df = combined[['zip5']].merge(df,left_index=True, right_index=True)

In [50]:
df[df['zip5']==90210]


Out[50]:
zip5 X Y
11183 90210 -9.901957 -14.466638

In [51]:
#Top of arc = High Housing, low Income
#Negative = High Housing, High Income
#Positive = Low Housing, Low Income
df.plot(kind='scatter',x='X',y='Y')


Out[51]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f17ad950e90>

In [52]:
df[df['Y']<-10]


Out[52]:
zip5 X Y
5201 33109 -7.667834 -10.175178
11183 90210 -9.901957 -14.466638
11220 90402 -7.893375 -10.070110
11866 94027 -13.161680 -17.342531

In [53]:
df[df['X']>8]


Out[53]:
zip5 X Y
6183 37915 8.058904 -4.534861
6952 45232 8.904873 -5.158492

In [54]:
df[(df['X']>-.5) & (df['X']<.5) & (df['Y']<0)]


Out[54]:
zip5 X Y
21 1053 0.064608 -0.245104
62 1240 -0.257232 -0.021684
63 1245 -0.279738 -0.975474
69 1262 -0.353235 -0.176648
90 1375 0.306770 -0.183014
104 1464 -0.307914 -0.007825
119 1518 -0.037178 -0.082075
132 1537 0.226301 -0.377547
163 1702 0.364843 -0.195228
241 1970 0.435376 -0.476600
285 2127 -0.145804 -1.266902
288 2130 -0.296941 -1.234145
289 2131 0.046681 -0.797299
293 2136 -0.048670 -0.168980
297 2141 -0.257093 -1.413339
302 2148 0.321340 -0.468417
308 2169 -0.119179 -0.194188
316 2189 -0.003236 -0.185794
340 2368 0.145833 -0.122926
389 2630 -0.471313 -1.453163
405 2657 0.162036 -0.779767
470 2840 -0.344479 -0.208771
484 2882 -0.243592 -0.851685
498 2906 -0.171606 -1.020005
566 3253 0.004081 -0.246357
602 3461 0.181132 -0.051355
651 3824 0.020621 -0.989868
685 3874 -0.018771 -0.135976
719 5443 0.468227 -0.042621
743 5651 0.225654 -0.401573
... ... ... ...
12730 98261 -0.123482 -0.724112
12731 98262 -0.465071 -0.225020
12735 98270 0.114647 -0.186863
12736 98271 -0.137044 -0.040737
12739 98274 0.032027 -0.077227
12745 98292 -0.494928 -0.291293
12754 98321 -0.251319 -0.143365
12776 98366 0.455906 -0.052767
12777 98367 -0.368170 -0.111199
12781 98372 0.134852 -0.204046
12782 98373 0.308655 -0.202166
12791 98390 0.347422 -0.030591
12794 98394 0.134061 -0.078051
12796 98403 -0.239239 -0.157914
12799 98406 0.222205 -0.370056
12800 98407 -0.237965 -0.084691
12811 98466 0.177025 -0.067364
12812 98467 0.179890 -0.387212
12818 98506 0.144386 -0.066760
12821 98516 -0.348927 -0.061699
12838 98580 -0.077394 -0.243003
12866 98671 -0.192109 -0.202126
12867 98674 0.154816 -0.192993
12870 98683 0.024403 -0.033818
12871 98684 0.385029 -0.035259
12873 98686 -0.437751 -0.139231
12877 98816 0.484283 -0.157040
12880 98826 0.232554 -0.162735
12882 98831 0.299221 -0.050048
12971 99654 -0.252891 -0.038480

597 rows × 3 columns


In [55]:
df[df['Y']>0]


Out[55]:
zip5 X Y
0 1001 -0.209610 0.555319
2 1005 -0.998922 0.837407
3 1007 -0.529983 0.114181
4 1008 -1.144768 0.811163
5 1010 -0.971439 0.569256
6 1011 -0.341337 0.577316
8 1020 0.309196 0.520136
10 1026 -0.552681 0.720066
11 1027 -0.514904 0.554378
12 1028 -0.983329 0.449181
13 1030 -0.414434 0.293022
14 1033 -1.117019 0.777389
15 1034 -1.064903 0.387531
17 1036 -1.346627 0.566806
18 1038 -0.713128 0.253932
20 1050 0.029264 0.535282
23 1056 -0.558750 0.710186
24 1057 -0.211522 0.191307
26 1062 -0.638163 0.351628
27 1068 -1.274841 0.398266
28 1069 0.365876 0.419715
29 1071 -1.081674 0.770970
30 1072 -0.824851 0.564430
31 1073 -1.282759 0.450727
32 1075 -0.582318 0.684515
33 1077 -1.349392 0.784897
35 1081 -0.217643 0.656071
36 1082 0.558238 0.177666
37 1083 0.871581 0.289997
38 1085 0.224208 0.145490
... ... ... ...
12936 99216 0.957613 0.076701
12939 99223 -0.205127 0.264317
12940 99224 0.068346 0.049541
12942 99320 0.303124 0.169849
12943 99323 -0.460195 0.423740
12946 99337 -0.471959 0.433329
12947 99338 -1.669890 0.446592
12949 99352 -0.749953 0.239548
12950 99353 -0.859055 0.326252
12951 99354 0.056020 0.204216
12953 99402 0.523632 0.028952
12954 99403 0.827492 0.369117
12956 99502 -1.244004 0.122944
12958 99504 -0.593647 0.166664
12959 99507 -1.278218 0.121707
12961 99515 -1.500315 0.217164
12963 99517 -0.686718 0.117141
12964 99518 -0.837455 0.388418
12965 99567 -1.708138 0.196757
12966 99577 -1.830426 0.197862
12968 99611 -0.504399 0.385261
12969 99615 -0.817403 0.129799
12970 99645 -0.735283 0.256060
12972 99669 -0.341638 0.435116
12973 99701 0.366613 0.344208
12974 99705 -0.864881 0.529043
12975 99709 -0.773324 0.433899
12976 99712 -1.499410 0.634598
12977 99801 -1.137844 0.055228
12978 99901 0.008810 0.095256

8093 rows × 3 columns


In [56]:
#Beginning PCA Analysis (reference: http://sebastianraschka.com/Articles/2015_pca_in_3_steps.html)

X = zillow_combined.ix[:,1:3].values
y = zillow_combined.ix[:,0].values

In [57]:
#Standardization
from sklearn.preprocessing import StandardScaler
X_std = StandardScaler().fit_transform(X)

#Same PCA analysis using scikit-learn

from sklearn.decomposition import PCA as sklearnPCA
sklearn_pca = sklearnPCA(n_components=2)
Y_sklearn = sklearn_pca.fit_transform(X_std)


/home/russ/anaconda/lib/python2.7/site-packages/sklearn/utils/validation.py:498: UserWarning: StandardScaler assumes floating point values as input, got int64
  "got %s" % (estimator, X.dtype))

In [58]:
expl_var = sklearn_pca.explained_variance_ratio_

In [59]:
ev = expl_var.tolist()
ls =  ['PC%s' %i for i in range(0,len(ev))]
df_explainedValue = pd.DataFrame(ev,columns=['Value'],index=ls)

In [60]:
df_explainedValue.plot(kind='bar')


Out[60]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f17ad789810>

In [61]:
df = zillow_combined[['zip5']].merge(df,left_index=True, right_index=True)

In [62]:
df.plot(kind='scatter',x='X',y='Y')


Out[62]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f17ad807cd0>

In [63]:
income_combined = acs[['zip5','snap','inc_median','poverty']].copy(False)

In [64]:
#Beginning PCA Analysis (reference: http://sebastianraschka.com/Articles/2015_pca_in_3_steps.html)

X = income_combined.ix[:,1:4].values
y = income_combined.ix[:,0].values

#Standardization
from sklearn.preprocessing import StandardScaler
X_std = StandardScaler().fit_transform(X)

#Same PCA analysis using scikit-learn

from sklearn.decomposition import PCA as sklearnPCA
sklearn_pca = sklearnPCA(n_components=1)
Y_sklearn = sklearn_pca.fit_transform(X_std)

expl_var = sklearn_pca.explained_variance_ratio_
ev = expl_var.tolist()
ls =  ['PC%s' %i for i in range(0,len(ev))]
df_explainedValue = pd.DataFrame(ev,columns=['Value'],index=ls)

In [65]:
sklearn_pca.explained_variance_ratio_


Out[65]:
array([ 0.60302841])

In [66]:
df_explainedValue.plot(kind='bar')


Out[66]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f17ad877f10>

In [67]:
df = pd.DataFrame({'X':Y_sklearn[:,0],'Y':Y_sklearn[:,1]})
df = income_combined[['zip5']].merge(df,left_index=True, right_index=True)
df.plot(kind='scatter',x='X',y='Y')


---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
<ipython-input-67-636b9daf0cc6> in <module>()
----> 1 df = pd.DataFrame({'X':Y_sklearn[:,0],'Y':Y_sklearn[:,1]})
      2 df = income_combined[['zip5']].merge(df,left_index=True, right_index=True)
      3 df.plot(kind='scatter',x='X',y='Y')

IndexError: index 1 is out of bounds for axis 1 with size 1

In [68]:
df = pd.DataFrame({'Income_Level':Y_sklearn[:,0]})
df = income_combined[['zip5']].merge(df,left_index=True, right_index=True)
df.plot(kind='scatter',x='zip5',y='X')


---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-68-e817b1d48cd0> in <module>()
      1 df = pd.DataFrame({'Income_Level':Y_sklearn[:,0]})
      2 df = income_combined[['zip5']].merge(df,left_index=True, right_index=True)
----> 3 df.plot(kind='scatter',x='zip5',y='X')

/home/russ/anaconda/lib/python2.7/site-packages/pandas/tools/plotting.pyc in plot_frame(data, x, y, kind, ax, subplots, sharex, sharey, layout, figsize, use_index, title, grid, legend, style, logx, logy, loglog, xticks, yticks, xlim, ylim, rot, fontsize, colormap, table, yerr, xerr, secondary_y, sort_columns, **kwds)
   2486                  yerr=yerr, xerr=xerr,
   2487                  secondary_y=secondary_y, sort_columns=sort_columns,
-> 2488                  **kwds)
   2489 
   2490 

/home/russ/anaconda/lib/python2.7/site-packages/pandas/tools/plotting.pyc in _plot(data, x, y, subplots, ax, kind, **kwds)
   2322         plot_obj = klass(data, subplots=subplots, ax=ax, kind=kind, **kwds)
   2323 
-> 2324     plot_obj.generate()
   2325     plot_obj.draw()
   2326     return plot_obj.result

/home/russ/anaconda/lib/python2.7/site-packages/pandas/tools/plotting.pyc in generate(self)
    912         self._compute_plot_data()
    913         self._setup_subplots()
--> 914         self._make_plot()
    915         self._add_table()
    916         self._make_legend()

/home/russ/anaconda/lib/python2.7/site-packages/pandas/tools/plotting.pyc in _make_plot(self)
   1455         else:
   1456             label = None
-> 1457         scatter = ax.scatter(data[x].values, data[y].values, c=c_values,
   1458                              label=label, cmap=cmap, **self.kwds)
   1459         if cb:

/home/russ/anaconda/lib/python2.7/site-packages/pandas/core/frame.pyc in __getitem__(self, key)
   1795             return self._getitem_multilevel(key)
   1796         else:
-> 1797             return self._getitem_column(key)
   1798 
   1799     def _getitem_column(self, key):

/home/russ/anaconda/lib/python2.7/site-packages/pandas/core/frame.pyc in _getitem_column(self, key)
   1802         # get column
   1803         if self.columns.is_unique:
-> 1804             return self._get_item_cache(key)
   1805 
   1806         # duplicate columns & possible reduce dimensionaility

/home/russ/anaconda/lib/python2.7/site-packages/pandas/core/generic.pyc in _get_item_cache(self, item)
   1082         res = cache.get(item)
   1083         if res is None:
-> 1084             values = self._data.get(item)
   1085             res = self._box_item_values(item, values)
   1086             cache[item] = res

/home/russ/anaconda/lib/python2.7/site-packages/pandas/core/internals.pyc in get(self, item, fastpath)
   2849 
   2850             if not isnull(item):
-> 2851                 loc = self.items.get_loc(item)
   2852             else:
   2853                 indexer = np.arange(len(self.items))[isnull(self.items)]

/home/russ/anaconda/lib/python2.7/site-packages/pandas/core/index.pyc in get_loc(self, key, method)
   1570         """
   1571         if method is None:
-> 1572             return self._engine.get_loc(_values_from_object(key))
   1573 
   1574         indexer = self.get_indexer([key], method=method)

pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:3824)()

pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:3704)()

pandas/hashtable.pyx in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:12280)()

pandas/hashtable.pyx in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:12231)()

KeyError: 'X'

In [69]:
df.sort(['Income_Level'], ascending=[False])


Out[69]:
zip5 Income_Level
6592 21405 5.833637
27494 79033 5.833637
18961 56210 5.833637
31890 97028 5.833637
28246 81335 5.833637
32320 98050 5.833637
28332 82063 5.833637
2263 7970 5.833637
21848 63073 5.569037
1236 4575 5.371800
2626 10282 5.323790
6503 21056 5.291953
31036 94528 5.224450
20444 60043 5.201251
30419 92145 5.173233
2067 7311 5.125283
26662 77010 5.124102
2028 7078 5.111521
2265 7976 4.960067
2712 10577 4.958065
6801 22066 4.892476
2564 10007 4.877787
6075 19437 4.874672
30954 94027 4.852093
629 2672 4.791172
1953 6883 4.758191
3280 12453 4.751529
6539 21153 4.637779
6694 21738 4.582035
1937 6820 4.512731
... ... ...
6239 20052 -7.695458
6484 21031 -7.695458
21534 62523 -7.729769
15122 46379 -7.836499
19590 57622 -7.871984
19589 57621 -7.999738
8197 26366 -8.043925
13628 42321 -8.193175
24642 71662 -8.218272
27279 78565 -8.241577
13375 41390 -8.693122
12717 39061 -8.782287
24290 70580 -8.786269
11825 36727 -8.902769
29367 86011 -9.274077
32277 97909 -9.432400
33079 99774 -9.479919
3628 13102 -9.675718
15954 48411 -9.726297
13575 42124 -9.736086
22478 64858 -9.898691
4198 14614 -10.234640
31693 96049 -10.314945
11595 36031 -10.389852
12579 38677 -10.650890
4653 15616 -10.925601
29302 85654 -10.971303
22837 66019 -11.661508
14968 46047 -13.280264
8291 26684 -13.280264

33120 rows × 2 columns


In [70]:
df[df['zip5']== 10001]


Out[70]:
zip5 Income_Level
2558 10001 0.651156

In [71]:
urban.rename(columns={'ZCTA5':'zip'},inplace=True)

In [72]:
import vincent
from vincent import AxisProperties, PropertySet, ValueRef
vincent.core.initialize_notebook()



In [73]:
zip_topo = r'zips_us_topo.json'
state_topo = r'us_states.topo.json'

geo_data2 = [{'name': 'zip_codes',
             'url': zip_topo,
             'feature': 'zip_codes_for_the_usa'},
            {'name': 'states',
             'url': state_topo,
             'feature': 'us_states.geo'}]

geo_data = [{'name': 'states',
             'url': state_topo,
             'feature': 'us_states.geo'},
            {'name': 'zip_codes',
             'url': zip_topo,
             'feature': 'zip_codes_for_the_usa'}]

vis = vincent.Map(data=urban, geo_data=geo_data, scale=1100, projection='albersUsa',
          data_bind='POPULATION', data_key='zip',brew='PuRd',
          map_key={'zip_codes': 'properties.zip'})
del vis.marks[0].properties.update
#del vis.marks[1].properties.update
#vis.marks[1].properties.update.fill.value = '#FFFFFF'
#vis.marks[1].properties.enter.stroke.value = '#CCCCFF'
vis.marks[1].properties.enter.stroke_opacity = ValueRef(value=0.1)
vis.marks[0].properties.enter.stroke.value = '#FF0000'
#vis.marks[1].properties.hover.fill.value = 'red'
vis.legend(title='POPULATION')
vis.display()
vis.to_json("USA_population.json")



In [195]:
zip_topo = r'zips_us_topo.json'
state_topo = r'us_states.topo.json'

geo_data2 = [{'name': 'zip_codes',
             'url': zip_topo,
             'feature': 'zip_codes_for_the_usa'},
            {'name': 'states',
             'url': state_topo,
             'feature': 'us_states.geo'}]

geo_data = [{'name': 'states',
             'url': state_topo,
             'feature': 'us_states.geo'},
            {'name': 'zip_codes',
             'url': zip_topo,
             'feature': 'zip_codes_for_the_usa'}]

vis = vincent.Map(data=urban[urban['zip']== '83211'], geo_data=geo_data, scale=1100, projection='albersUsa',
          data_bind='POPULATION', data_key='zip',brew='PuRd',
          map_key={'zip_codes': 'properties.zip'})
del vis.marks[0].properties.update
#del vis.marks[1].properties.update
vis.marks[1].properties.update.fill.value = '#C390D4'
#vis.marks[1].properties.enter.stroke.value = '#CCCCFF'
#vis.marks[1].properties.enter.stroke_opacity = ValueRef(value=0.1)
vis.marks[0].properties.enter.stroke.value = '#FF0000'
vis.legend(title='POPULATION')
vis.display()
vis.to_json("USA.json")



In [ ]:


In [253]:
zip_topo = r'topo_files/Maryland.topo.json'
state_topo = r'us_states.topo.json'


geo_data = [{'name': 'Maryland',
             'url': zip_topo,
             'feature': 'Maryland.geo'}]



vis = vincent.Map(geo_data=geo_data)
del vis.marks[0].properties.update
#del vis.marks[1].properties.update
#vis.marks[1].properties.update.fill.value = '#C390D4'
#vis.marks[1].properties.enter.stroke.value = '#CCCCFF'
#vis.marks[1].properties.enter.stroke_opacity = ValueRef(value=0.1)
#vis.marks[0].properties.enter.stroke.value = '#FF0000'
#vis.legend(title='POPULATION')
#vis.marks[0].properties.enter.scale=500
#vis.data[0].transform[0].projection[0].scale = 1000
#vis.marks[0].properties.hover.text.value = 'help'
#vis.marks[0].properties.update.fill.rule.predicate.value = "red"
vis.display()
vis.grammar()
vis.to_json('maryland_scaled.json')



In [252]:
zip_topo = r'topo_files/or_counties.topo_copy.json'
state_topo = r'us_states.topo.json'


geo_data = [{'name': 'Maryland',
             'url': zip_topo,
             'feature': 'or_counties.geo'}]



vis = vincent.Map(geo_data=geo_data, projection='equirectangular')
del vis.marks[0].properties.update
#del vis.marks[1].properties.update
#vis.marks[1].properties.update.fill.value = '#C390D4'
#vis.marks[1].properties.enter.stroke.value = '#CCCCFF'
#vis.marks[1].properties.enter.stroke_opacity = ValueRef(value=0.1)
#vis.marks[0].properties.enter.stroke.value = '#FF0000'
#vis.legend(title='POPULATION')
#vis.marks[0].properties.enter.scale=500
#vis.data[0].transform[0].projection[0].scale = 1000
#vis.marks[0].properties.hover.text.value = 'help'
#vis.marks[0].properties.update.fill.rule.predicate.value = "red"
vis.display()
vis.grammar()
#vis.to_json('maryland.json')


Out[252]:
{u'axes': [],
 u'data': [{u'format': {u'feature': u'or_counties.geo', u'type': u'topojson'},
   u'name': u'Maryland',
   u'transform': [{u'projection': u'equirectangular',
     u'translate': [480, 250],
     u'type': u'geopath',
     u'value': u'data'}],
   u'url': u'topo_files/or_counties.topo_copy.json'}],
 u'height': 500,
 u'legends': [],
 u'marks': [{u'from': {u'data': u'Maryland'},
   u'properties': {u'enter': {u'path': {u'field': u'path'},
     u'stroke': {u'value': u'#000000'}}},
   u'type': u'path'}],
 u'padding': u'auto',
 u'scales': [],
 u'width': 960}

In [250]:
0.0010128706393299272*10


Out[250]:
0.010128706393299273

In [251]:
0.0005370434828103437*10


Out[251]:
0.005370434828103438

In [256]:
zip_topo = r'topo_files/or_counties.topo_copy.json'
state_topo = r'us_states.topo.json'


geo_data = [{'name': 'Maryland',
             'url': zip_topo,
             'feature': 'or_counties.geo'}]



vis = vincent.Map(geo_data=geo_data, projection="")
del vis.marks[0].properties.update
#del vis.marks[1].properties.update
#vis.marks[1].properties.update.fill.value = '#C390D4'
#vis.marks[1].properties.enter.stroke.value = '#CCCCFF'
#vis.marks[1].properties.enter.stroke_opacity = ValueRef(value=0.1)
#vis.marks[0].properties.enter.stroke.value = '#FF0000'
#vis.legend(title='POPULATION')
#vis.marks[0].properties.enter.scale=500
#vis.data[0].transform[0].projection[0].scale = 1000
#vis.marks[0].properties.hover.text.value = 'help'
#vis.marks[0].properties.update.fill.rule.predicate.value = "red"
vis.display()
vis.grammar()
#vis.to_json('maryland.json')


Out[256]:
{u'axes': [],
 u'data': [{u'format': {u'feature': u'or_counties.geo', u'type': u'topojson'},
   u'name': u'Maryland',
   u'transform': [{u'translate': [480, 250],
     u'type': u'geopath',
     u'value': u'data'}],
   u'url': u'topo_files/or_counties.topo_copy.json'}],
 u'height': 500,
 u'legends': [],
 u'marks': [{u'from': {u'data': u'Maryland'},
   u'properties': {u'enter': {u'path': {u'field': u'path'},
     u'stroke': {u'value': u'#000000'}}},
   u'type': u'path'}],
 u'padding': u'auto',
 u'scales': [],
 u'width': 960}

In [270]:
zip_topo = r'topo_files/Maryland.topo.json'

geo_data = [{'name': 'Maryland',
             'url': zip_topo,
             'feature': 'Maryland.geo'}]

vis = vincent.Map(geo_data=geo_data,scale=8000,projection='equirectangular')
vis.display()



In [ ]: