In [7]:
%matplotlib inline
import pandas as pd
import json

In [8]:
#File locations
acs_file = "/home/russ/Documents/DDL/Projects/03-censusables/source/Data/raw_files/acs5yr.csv"
zillow_HVI_file = "/home/russ/Documents/DDL/Projects/03-censusables/source/Data/raw_files/Zip_Zhvi_AllHomes_HomeValueIndex.csv"
zillow_RI_file = "/home/russ/Documents/DDL/Projects/03-censusables/source/Data/raw_files/Zip_Zri_AllHomes_RentIndex.csv"
urbanization_zip = "/home/russ/Documents/DDL/Projects/03-censusables/source/Data/raw_files/zcta2010_txt.csv"
ZCTA = "/home/russ/Documents/DDL/Projects/03-censusables/source/Data/raw_files/ZCTA.csv"

In [9]:
acs = pd.read_csv(acs_file)

In [10]:
diversity = acs[['zip5','pop','race_white','race_black','race_asian','race_indian','race_other','hisp']].copy(deep=True)

In [11]:
diversity['white_hisp'] = ((diversity['pop']*diversity['race_white'])*diversity['hisp'])/diversity['pop']

In [12]:
diversity['white_nonhisp'] = ((diversity['pop']*diversity['race_white'])*(1-diversity['hisp']))/diversity['pop']

In [13]:
diversity['div_index'] = 1- (diversity['race_black']**2 + diversity['white_hisp']**2 + diversity['white_nonhisp']**2 + diversity['race_asian']**2 + diversity['race_indian']**2)

In [14]:
diversity_index = diversity[['zip5','div_index']].dropna(axis=0,how='any',subset=['zip5','div_index'])

In [15]:
import numpy as np
diversity_index[diversity_index['div_index']==np.nan]


Out[15]:
zip5 div_index

In [16]:
urban = pd.read_csv(urbanization_zip)

In [17]:
urban.rename(columns={'Zip5':'zip5'},inplace=True)

In [18]:
urban['zip5'] = urban.apply(lambda x: float(x['zip5']),axis=1)

In [19]:
#urban['pop'] = urban.apply(lambda x: int(x['POPULATION'].replace(',','')),axis=1)
#alternate
#urban['pop'] = urban['POPULATION'].apply(lambda x: int(x.replace(',','')))
urban['pop'] = urban.apply(lambda x: int(x['POPULATION']),axis=1)

In [20]:
#strip Z from Zip Code Text
urban['ZCTA5'] = urban.apply(lambda x: x['ZCTA5'][1:],axis=1)

In [196]:
urban_index.min()


Out[196]:
ZCTA5          601.000000
zip5           601.000000
urban_index     15.025532
dtype: float64

In [199]:
urban_index.idxmax()


Out[199]:
ZCTA5          33119
zip5           33119
urban_index    33034
dtype: int64

In [197]:
urban_index[(urban_index['zip5']==21230) | (urban_index['zip5']==59736)]


Out[197]:
ZCTA5 zip5 urban_index
6577 21230 21230 486.343393
20321 59736 59736 8617968.842520

In [192]:
urban_index[urban_index['urban_index']==np.inf]


Out[192]:
ZCTA5 zip5 urban_index
206 01199 1199 inf
514 02203 2203 inf
1552 05481 5481 inf
1687 06020 6020 inf
1709 06059 6059 inf
1752 06160 6160 inf
2105 07495 7495 inf
2575 10020 10020 inf
2601 10110 10110 inf
2602 10111 10111 inf
2603 10112 10112 inf
2604 10115 10115 inf
2607 10152 10152 inf
2608 10153 10153 inf
2609 10154 10154 inf
2612 10167 10167 inf
2613 10168 10168 inf
2614 10169 10169 inf
2616 10171 10171 inf
2617 10172 10172 inf
2619 10174 10174 inf
2620 10177 10177 inf
2622 10271 10271 inf
2623 10278 10278 inf
2624 10279 10279 inf
2637 10311 10311 inf
2862 11351 11351 inf
2868 11359 11359 inf
2880 11371 11371 inf
2902 11424 11424 inf
... ... ... ...
27896 80293 80293 inf
27897 80294 80294 inf
27906 80419 80419 inf
28120 80939 80939 inf
28427 82715 82715 inf
28879 84144 84144 inf
28880 84180 84180 inf
29319 85723 85723 inf
29320 85724 85724 inf
29321 85726 85726 inf
29989 89702 89702 inf
30073 90079 90079 inf
30075 90090 90090 inf
30128 90506 90506 inf
30166 90747 90747 inf
30180 90831 90831 inf
30268 91608 91608 inf
30414 92132 92132 inf
30770 93410 93410 inf
31125 94850 94850 inf
31281 95314 95314 inf
31861 96850 96850 inf
31974 97208 97208 inf
32364 98154 98154 inf
32366 98158 98158 inf
32370 98174 98174 inf
32374 98195 98195 inf
32712 98929 98929 inf
32817 99164 99164 inf
32891 99513 99513 inf

144 rows × 3 columns


In [185]:
urban.dtypes


Out[185]:
ZCTA5           object
zip5           float64
LANDSQMT         int64
WATERSQMT        int64
LANDSQMI       float64
WATERSQMI      float64
POPULATION       int64
HSGUNITS         int64
INTPTLAT       float64
INTPTLON       float64
pop              int64
urban_index    float64
dtype: object

In [22]:
urban['urban_index'] = urban['LANDSQMT']/urban['pop']

In [23]:
urban_index = urban[['ZCTA5','zip5','urban_index']].dropna(axis=0,how='any',subset=['zip5','urban_index'])

In [ ]:
urban_index

In [24]:
urban_index.head(1)


Out[24]:
ZCTA5 zip5 urban_index
0 00601 601 8974.678998

In [25]:
zillow_HVI = pd.read_csv(zillow_HVI_file)

In [26]:
zillow_HVI = zillow_HVI[['RegionName','2014-01','2014-07','2015-01','2015-07']]

In [27]:
zillow_HVI.rename(columns={'RegionName':'zip5'},inplace=True)
zillow_HVI.head()


Out[27]:
zip5 2014-01 2014-07 2015-01 2015-07
0 10025 998300 1001700 1029600 1097100
1 60657 294700 297000 304500 307000
2 60614 386800 382300 378900 393000
3 79936 109600 111900 111600 112100
4 10002 849400 863900 903200 951600

In [28]:
zillow_RI = pd.read_csv(zillow_RI_file)

In [29]:
len(zillow_HVI)


Out[29]:
12988

In [30]:
zillow_RI.head(1)


Out[30]:
RegionName City State Metro CountyName 2010-11 2010-12 2011-01 2011-02 2011-03 ... 2014-10 2014-11 2014-12 2015-01 2015-02 2015-03 2015-04 2015-05 2015-06 2015-07
0 10025 New York NY New York New York NaN NaN NaN NaN NaN ... 3511 3488 3500 3520 3557 3588 3585 3586 3581 3599

1 rows × 62 columns


In [31]:
zillow_RI = zillow_RI[['RegionName','2014-01','2014-07','2015-01','2015-07']].copy(False)

In [32]:
zillow_RI.rename(columns={'RegionName':'zip5'},inplace=True)

In [33]:
zillow_RI.head()


Out[33]:
zip5 2014-01 2014-07 2015-01 2015-07
0 10025 3477 3484 3520 3599
1 60657 2005 2142 1969 2050
2 10023 4251 4312 4370 4620
3 60614 2199 2343 2232 2361
4 79936 1029 1029 1022 1057

In [34]:
housing_index = pd.merge (zillow_HVI, zillow_RI,how='inner', on='zip5').dropna(axis=0,how='all')

In [35]:
#housing_index = zillow_HVI.dropna(axis=0,how='all')

In [36]:
acs.head(1).to_csv("ACS_Headers.csv")

In [37]:
#housing_index = pd.merge(housing_index, acs[['zip5','rent_median']], how='inner', on='zip5',copy=False)

In [38]:
income_index = acs[['zip5','inc_median','poverty','snap','gini_index']].dropna(axis=0,how='all')

In [189]:
len(income_index.columns)


Out[189]:
5

In [39]:
income_index[income_index['zip5']==90210]


Out[39]:
zip5 inc_median poverty snap gini_index
30079 90210 132254 0.077309 0.008544 0.635

In [40]:
#Beginning PCA Analysis (reference: http://sebastianraschka.com/Articles/2015_pca_in_3_steps.html)

X_div = diversity_index.ix[:,1].values
y_div = diversity_index.ix[:,0].values

In [41]:
y_div


Out[41]:
array([  601,   602,   603, ..., 99926, 99927, 99929])

In [48]:
#Standardization
from sklearn.preprocessing import StandardScaler
X_div_std = StandardScaler().fit_transform(X_div)

In [49]:
X_div_std


Out[49]:
array([-0.68146139,  1.60062294,  0.74761118, ...,  0.11210504,
        0.21154735,  1.06348835])

In [50]:
#Same PCA analysis using scikit-learn

from sklearn.decomposition import PCA as sklearnPCA
sklearn_div_pca = sklearnPCA(n_components=1)
Y_div_sklearn = sklearn_div_pca.fit_transform(X_div_std)


/home/russ/anaconda/lib/python2.7/site-packages/numpy/core/_methods.py:59: RuntimeWarning: Mean of empty slice.
  warnings.warn("Mean of empty slice.", RuntimeWarning)

In [51]:
sklearn_pca.explained_variance_ratio_


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-51-276c2c932926> in <module>()
----> 1 sklearn_pca.explained_variance_ratio_

NameError: name 'sklearn_pca' is not defined

In [52]:
len(X_div)


Out[52]:
32784

In [53]:
df_div = pd.DataFrame({'zip5':y_div,'diversity_index':X_div})

In [54]:
df_div.head()


Out[54]:
diversity_index zip5
0 0.114337 601
1 0.643286 602
2 0.445572 603
3 0.093341 606
4 0.459638 610

In [55]:
#Beginning PCA Analysis (reference: http://sebastianraschka.com/Articles/2015_pca_in_3_steps.html)

X_urb = urban_index.ix[:,1].values
y_urb = urban_index.ix[:,0].values

In [56]:
#Standardization
from sklearn.preprocessing import StandardScaler
X_urb_std = StandardScaler().fit_transform(X_urb)

In [57]:
X_urb_std


Out[57]:
array([-1.7800184 , -1.77998212, -1.77994584, ...,  1.82334699,
        1.82338327,  1.82345583])

In [58]:
#Same PCA analysis using scikit-learn

from sklearn.decomposition import PCA as sklearnPCA
sklearn_urb_pca = sklearnPCA(n_components=1)
Y_urb_sklearn = sklearn_pca.fit_transform(X_urb_std)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-58-751625d30aeb> in <module>()
      3 from sklearn.decomposition import PCA as sklearnPCA
      4 sklearn_urb_pca = sklearnPCA(n_components=1)
----> 5 Y_urb_sklearn = sklearn_pca.fit_transform(X_urb_std)

NameError: name 'sklearn_pca' is not defined

In [59]:
df_urb = pd.DataFrame({'zip5':y_div,'urban_index':X_div})

In [186]:
df_urb.dtypes


Out[186]:
urban_index    float64
zip5             int64
dtype: object

In [60]:
df_urb.head(1)a


Out[60]:
urban_index zip5
0 0.114337 601

In [61]:
housing_index['2014-07_x']


Out[61]:
0        1001700
1         297000
2         382300
3         111900
4         863900
5         300500
6         130100
7         911000
8         581300
9         190400
10        758500
11        579600
12        222100
13        458300
14        152600
15       1039500
16        137800
17        271400
18        130200
19        710100
20        317000
21        167200
22       1093600
23        151000
24         79800
25       1417400
26        284400
27        106300
28        246800
29        135400
          ...   
12228     155600
12229     224200
12230     162300
12231     447200
12232     248500
12233     103300
12234     372000
12235     237500
12236     236000
12237      76500
12238     252500
12239     168800
12240     215800
12241     110300
12242      82500
12243     235700
12244     134500
12245     167500
12246     196100
12247     121600
12248     247000
12249     211200
12250     144300
12251     163500
12252     222900
12253     512700
12254     303500
12255     206000
12256     830100
12257     165700
Name: 2014-07_x, dtype: float64

In [62]:
housing_index.dtypes


Out[62]:
zip5           int64
2014-01_x      int64
2014-07_x    float64
2015-01_x      int64
2015-07_x      int64
2014-01_y      int64
2014-07_y      int64
2015-01_y      int64
2015-07_y      int64
dtype: object

In [63]:
housing_index[housing_index['2014-07_x'].isnull()]
#income_index[income_index['zip5']==90210]


Out[63]:
zip5 2014-01_x 2014-07_x 2015-01_x 2015-07_x 2014-01_y 2014-07_y 2015-01_y 2015-07_y
5154 29341 63900 NaN 74700 72600 1000 910 880 935

In [184]:
#housing_index.isnull()==True
housing_index[housing_index.isnull().any(axis=1)]


Out[184]:
zip5 2014-01_x 2014-07_x 2015-01_x 2015-07_x 2014-01_y 2014-07_y 2015-01_y 2015-07_y

In [65]:
housing_index[housing_index['zip5']== 21211]


Out[65]:
zip5 2014-01_x 2014-07_x 2015-01_x 2015-07_x 2014-01_y 2014-07_y 2015-01_y 2015-07_y
4766 21211 167300 176900 179200 188100 1325 1377 1389 1473

In [183]:
housing_index.loc[housing_index['2014-07_x'].isnull(),'2014-07_x'] = housing_index['2014-01_x']

In [67]:
housing_index.head(1)


Out[67]:
zip5 2014-01_x 2014-07_x 2015-01_x 2015-07_x 2014-01_y 2014-07_y 2015-01_y 2015-07_y
0 10025 998300 1001700 1029600 1097100 3477 3484 3520 3599

In [68]:
#Beginning PCA Analysis (reference: http://sebastianraschka.com/Articles/2015_pca_in_3_steps.html)

X_hou = housing_index.ix[:,1:9].values
zip5_hou = housing_index.ix[:,0].values

In [69]:
#Standardization
from sklearn.preprocessing import StandardScaler
X_hou_std = StandardScaler().fit_transform(X_hou)

In [70]:
#Same PCA analysis using scikit-learn

from sklearn.decomposition import PCA as sklearnPCA
#sklearn_hou_pca = sklearnPCA(n_components=8)
sklearn_hou_pca = sklearnPCA(n_components=1)
Y_hou_sklearn = sklearn_hou_pca.fit_transform(X_hou_std)

In [71]:
sklearn_hou_pca


Out[71]:
PCA(copy=True, n_components=1, whiten=False)

In [72]:
expl_hou_var = sklearn_hou_pca.explained_variance_ratio_

In [73]:
sklearn_hou_pca.explained_variance_ratio_


Out[73]:
array([ 0.96098779])

In [74]:
X_hou_std


Out[74]:
array([[ 3.7929565 ,  3.65054733,  3.64774133, ...,  2.86499164,
         2.7859207 ,  2.73825957],
       [ 0.33423702,  0.30410089,  0.31021787, ...,  0.92538362,
         0.61158254,  0.65113861],
       [ 0.7869773 ,  0.70916955,  0.6526696 , ...,  1.21589123,
         0.98028076,  1.07017968],
       ..., 
       [-0.07229524, -0.12803565, -0.14500357, ...,  0.20272787,
         0.16017255,  0.24018194],
       [ 2.62448348,  2.83566126,  2.81738795, ...,  1.52807851,
         1.66019953,  1.69537279],
       [-0.37903785, -0.31941041, -0.36179761, ..., -0.91738854,
        -0.85760276, -0.847169  ]])

In [75]:
sklearn_hou_pca.get_covariance()


Out[75]:
array([[ 1.00455373,  0.96063112,  0.95844352,  0.95600854,  0.94955811,
         0.95597756,  0.95986476,  0.96100498],
       [ 0.96063112,  1.00587974,  0.95910524,  0.95666858,  0.95021369,
         0.95663758,  0.96052746,  0.96166846],
       [ 0.95844352,  0.95910524,  1.0015065 ,  0.95449   ,  0.94804982,
         0.95445907,  0.9583401 ,  0.9594785 ],
       [ 0.95600854,  0.95666858,  0.95449   ,  0.99665045,  0.94564124,
         0.95203422,  0.95590538,  0.95704089],
       [ 0.94955811,  0.95021369,  0.94804982,  0.94564124,  0.98384615,
         0.9456106 ,  0.94945564,  0.9505835 ],
       [ 0.95597756,  0.95663758,  0.95445907,  0.95203422,  0.9456106 ,
         0.99658876,  0.95587441,  0.95700988],
       [ 0.95986476,  0.96052746,  0.9583401 ,  0.95590538,  0.94945564,
         0.95587441,  1.00434657,  0.96090128],
       [ 0.96100498,  0.96166846,  0.9594785 ,  0.95704089,  0.9505835 ,
         0.95700988,  0.96090128,  1.00662811]])

In [76]:
sklearn_hou_pca.explained_variance_


Out[76]:
array([ 7.6879023])

In [77]:
sklearn_hou_pca.explained_variance_ratio_


Out[77]:
array([ 0.96098779])

In [78]:
expl_hou_val = sklearn_hou_pca.explained_variance_ratio_

In [79]:
ev = expl_hou_val.tolist()
ls =  ['PC%s' %i for i in range(0,len(ev))]
df_hou_explainedValue = pd.DataFrame(ev,columns=['Value'],index=ls)

In [80]:
df_hou_explainedValue.plot(kind='bar')


Out[80]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fb448058c50>

In [81]:
sklearn_hou_pca.components_


Out[81]:
array([[-0.35439495, -0.35463963, -0.35383202, -0.35293309, -0.35055176,
        -0.35292166, -0.35435671, -0.35477764]])

In [82]:
df_hou = pd.DataFrame({'zip5':zip5_hou,'housing_index':Y_hou_sklearn[:,0]})

In [83]:
df_hou['housing_index'] = df_hou.apply(lambda x: x['housing_index']*-1,axis=1)

In [84]:
df_hou[df_hou['zip5']==21222]


Out[84]:
housing_index zip5
275 -1.014197 21222

In [85]:
#Beginning PCA Analysis (reference: http://sebastianraschka.com/Articles/2015_pca_in_3_steps.html)

X_inc = income_index.ix[:,1:5].values
zip5_inc = income_index.ix[:,0].values

In [86]:
#Standardization
from sklearn.preprocessing import StandardScaler
X_inc_std = StandardScaler().fit_transform(X_inc)

In [87]:
#Same PCA analysis using scikit-learn

from sklearn.decomposition import PCA as sklearnPCA
#sklearn_inc_pca = sklearnPCA(n_components=4)
sklearn_inc_pca = sklearnPCA(n_components=1)
Y_inc_sklearn = sklearn_inc_pca.fit_transform(X_inc_std)

In [88]:
X_inc_std


Out[88]:
array([[-1.59359348,  4.4150441 ,  3.50806255,  1.95198116],
       [-1.44234668,  3.29998401,  2.80016619,  1.23438494],
       [-1.44977957,  3.13114555,  2.87822525,  1.60594258],
       ..., 
       [-0.00633725, -0.14431335,  2.2477343 ,  0.12175354],
       [-1.30300086, -0.86957667,  0.99043425,  0.58926288],
       [-0.1901969 , -0.30098952,  0.45777452,  0.24730736]])

In [89]:
sklearn_inc_pca.explained_variance_ratio_


Out[89]:
array([ 0.46377581])

In [90]:
sklearn_inc_pca.explained_variance_


Out[90]:
array([ 1.85510322])

In [91]:
expl_inc_var = sklearn_inc_pca.explained_variance_ratio_
ev = expl_inc_var.tolist()
ls =  ['PC%s' %i for i in range(0,len(ev))]
df_inc_explainedValue = pd.DataFrame(ev,columns=['Value'],index=ls)

In [92]:
df_inc_explainedValue.plot(kind='bar')


Out[92]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fb448058b10>

In [93]:
df_inc = pd.DataFrame({'zip5':zip5_inc,'income_index':Y_inc_sklearn[:,0]})

In [94]:
df_inc[df_inc['zip5']==21223]


Out[94]:
income_index zip5
6570 -3.13511 21223

In [95]:
df_final = pd.merge (df_inc,df_hou,on='zip5')

In [96]:
df_final= pd.merge (df_final,df_urb,on='zip5')

In [97]:
df_final= pd.merge (df_final,df_div,on='zip5')

In [98]:
df_final.head(1)


Out[98]:
income_index zip5 housing_index urban_index diversity_index
0 0.60221 1001 -0.570902 0.233848 0.233848

In [99]:
df_final[df_final['zip5']==90210]


Out[99]:
income_index zip5 housing_index urban_index diversity_index
10505 1.942892 90210 49.762782 0.319288 0.319288

In [100]:
#Rescale indexes
from sklearn import preprocessing

zip5_final = df_final['zip5'].values
std_scale = preprocessing.StandardScaler().fit(df_final[['income_index', 'housing_index','urban_index','diversity_index']])
df_std = std_scale.transform(df_final[['income_index', 'housing_index','urban_index','diversity_index']])

minmax_scale = preprocessing.MinMaxScaler().fit(df_final[['income_index', 'housing_index','urban_index','diversity_index']])
df_minmax = minmax_scale.transform(df_final[['income_index', 'housing_index','urban_index','diversity_index']])

In [101]:
df_minmax_final = pd.DataFrame({'zip5':zip5_final,'income_index':df_minmax[:,0],'housing_index':df_minmax[:,1],'urban_index':df_minmax[:,2],'diversity_index':df_minmax[:,3]})

In [102]:
df_minmax_final[df_minmax_final['zip5']==21230]


Out[102]:
diversity_index housing_index income_index urban_index zip5
3141 0.542558 0.064618 0.575462 0.542558 21230

In [190]:
df_minmax_final[df_minmax_final['zip5']==90210]


Out[190]:
diversity_index housing_index income_index urban_index zip5
10505 0.327081 1 0.73986 0.327081 90210

In [103]:
ZCTA = pd.read_csv(ZCTA)

In [104]:
ZCTA.head(1)


Out[104]:
zcta5 ZIPName State FIPS state County County2 PlaceFP PlaceFP2 cbsa MetDiv csa CBSAType CBSAName MetDivName CSAName NectaName UAName Puma12Name
0 35004 Moody, AL AL 1 St. Clair AL NaN 51096 46696 13820 NaN 142 Metro Birmingham-Hoover, AL NaN Birmingham-Hoover-Talladega, AL NaN Birmingham, AL Urbanized Area St. Clair & Blount Counties

In [105]:
df_all_final = pd.merge (df_minmax_final,ZCTA[['zcta5','ZIPName','State']],left_on='zip5',right_on='zcta5',copy=False)

In [106]:
del df_all_final['zcta5']

In [107]:
df_all_final = pd.merge(df_all_final,urban[['zip5','ZCTA5']],copy=False)

In [108]:
df_all_final.to_csv('/home/russ/Documents/DDL/Projects/03-censusables/source/Data/final_files/Final.csv')

In [110]:
fit = [.5,.5,.5,.5]
#fit = [0,0,1,0]
df_all_final['fit'] = df_all_final.apply(lambda x: abs(fit[0]-x['diversity_index'])+abs(fit[1]-x['housing_index'])+\
                                        abs(fit[2]-x['income_index'])+abs(fit[3]-x['urban_index']),axis=1)

In [111]:
state = 'MD'
if state:
    df_display = df_all_final[df_all_final['State']==state].sort(['fit']).head(10)
else:
    df_display = df_all_final.sort(['fit']).head(10)
df_display


Out[111]:
diversity_index housing_index income_index urban_index zip5 ZIPName State ZCTA5 fit
2972 0.499313 0.061582 0.540120 0.499313 20710 Bladensburg, MD MD 20710 0.479912
3258 0.469100 0.040419 0.498020 0.469100 21875 Delmar, MD MD 21875 0.523362
3117 0.491175 0.044540 0.552835 0.491175 21206 Baltimore, MD MD 21206 0.525945
3182 0.514728 0.040767 0.537463 0.514728 21643 Hurlock, MD MD 21643 0.526152
3237 0.516602 0.038326 0.547926 0.516602 21804 Salisbury, MD MD 21804 0.542804
3132 0.513988 0.054983 0.571036 0.513988 21221 Essex, MD MD 21221 0.544029
3243 0.513848 0.034980 0.562139 0.513848 21826 Fruitland, MD MD 21826 0.554855
2960 0.505270 0.081946 0.633642 0.505270 20678 Prince Frederick, MD MD 20678 0.562236
2973 0.461209 0.105898 0.596931 0.461209 20711 Lothian, MD MD 20711 0.568616
3191 0.522953 0.127711 0.657794 0.522953 21662 Royal Oak, MD MD 21662 0.575989

In [112]:
df_all_final.sort(['fit']).head(10)


Out[112]:
diversity_index housing_index income_index urban_index zip5 ZIPName State ZCTA5 fit
849 0.510518 0.402507 0.707100 0.510518 6830 Greenwich, CT CT 06830 0.325628
10487 0.482534 0.263484 0.593316 0.482534 90046 Los Angeles, CA CA 90046 0.364764
10506 0.450808 0.376718 0.645668 0.450808 90211 Beverly Hills, CA CA 90211 0.367333
11171 0.503706 0.482497 0.844349 0.503706 94024 Los Altos, CA CA 94024 0.369264
10507 0.426886 0.449827 0.675198 0.426886 90212 Beverly Hills, CA CA 90212 0.371599
10500 0.482723 0.278766 0.631285 0.482723 90068 Los Angeles, CA CA 90068 0.387072
1356 0.470625 0.322967 0.653689 0.470625 10003 New York, NY NY 10003 0.389471
11948 0.511741 0.157938 0.466941 0.511741 98105 Seattle, WA WA 98105 0.398604
11197 0.563598 0.236204 0.516794 0.563598 94108 San Francisco, CA CA 94108 0.407786
11183 0.484693 0.371334 0.755528 0.484693 94062 Redwood City, CA CA 94062 0.414807

In [113]:
fit = []
fit.append(float(50)/100)
fit.append(float(50)/100)
fit.append(float(50)/100)
fit.append(float(50)/100)        
df_all_final['fit'] = df_all_final.apply(lambda x: abs(fit[0]-x['diversity_index'])+abs(fit[1]-x['housing_index'])+\
                                        abs(fit[2]-x['income_index'])+abs(fit[3]-x['urban_index']),axis=1)

In [537]:
import vincent
from vincent import AxisProperties, PropertySet, ValueRef
vincent.core.initialize_notebook()



In [593]:
zip_topo = r'zips_us_topo.json'
state_topo = r'us_states.topo.json'

geo_data = [{'name': 'states',
             'url': state_topo,
             'feature': 'us_states.geo'},
            {'name': 'zip_codes',
             'url': zip_topo,
             'feature': 'zip_codes_for_the_usa'}]

vis = vincent.Map(data=df_all_final, geo_data=geo_data, scale=800, projection='albersUsa',
          data_bind='fit', data_key='zip5',brew='YlOrRd',
          map_key={'zip_codes': 'properties.zip'})
del vis.marks[0].properties.update
#del vis.marks[1].properties.update
#vis.marks[1].properties.update.fill.value = '#FFFFFF'
#vis.marks[1].properties.enter.stroke.value = '#CCCCFF'
vis.marks[1].properties.enter.stroke_opacity = ValueRef(value=0.05)
vis.marks[0].properties.enter.stroke.value = '#C0C0C0'
#vis.marks[1].properties.hover.fill.value = 'red'
vis.legend(title='Preferred ZipCode')
vis.display()
vis.to_json("USA_Preferred.json")



In [541]:
zip_topo = r'zips_us_topo.json'
state_topo = r'us_states.topo.json'

geo_data2 = [{'name': 'zip_codes',
             'url': zip_topo,
             'feature': 'zip_codes_for_the_usa'},
            {'name': 'states',
             'url': state_topo,
             'feature': 'us_states.geo'}]

geo_data = [{'name': 'states',
             'url': state_topo,
             'feature': 'us_states.geo'},
            {'name': 'zip_codes',
             'url': zip_topo,
             'feature': 'zip_codes_for_the_usa'}]

vis = vincent.Map(data=urban[urban['zip']== '83211'], geo_data=geo_data, scale=1100, projection='albersUsa',
          data_bind='POPULATION', data_key='zip',brew='PuRd',
          map_key={'zip_codes': 'properties.zip'})
del vis.marks[0].properties.update
#del vis.marks[1].properties.update
vis.marks[1].properties.update.fill.value = '#C390D4'
#vis.marks[1].properties.enter.stroke.value = '#CCCCFF'
#vis.marks[1].properties.enter.stroke_opacity = ValueRef(value=0.1)
vis.marks[0].properties.enter.stroke.value = '#FF0000'
vis.legend(title='POPULATION')
vis.display()
vis.to_json("USA.json")


---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-541-13c50b6d908d> in <module>()
     16              'feature': 'zip_codes_for_the_usa'}]
     17 
---> 18 vis = vincent.Map(data=urban[urban['zip']== '83211'], geo_data=geo_data, scale=1100, projection='albersUsa',
     19           data_bind='POPULATION', data_key='zip',brew='PuRd',
     20           map_key={'zip_codes': 'properties.zip'})

/home/russ/anaconda/lib/python2.7/site-packages/pandas/core/frame.pyc in __getitem__(self, key)
   1795             return self._getitem_multilevel(key)
   1796         else:
-> 1797             return self._getitem_column(key)
   1798 
   1799     def _getitem_column(self, key):

/home/russ/anaconda/lib/python2.7/site-packages/pandas/core/frame.pyc in _getitem_column(self, key)
   1802         # get column
   1803         if self.columns.is_unique:
-> 1804             return self._get_item_cache(key)
   1805 
   1806         # duplicate columns & possible reduce dimensionaility

/home/russ/anaconda/lib/python2.7/site-packages/pandas/core/generic.pyc in _get_item_cache(self, item)
   1082         res = cache.get(item)
   1083         if res is None:
-> 1084             values = self._data.get(item)
   1085             res = self._box_item_values(item, values)
   1086             cache[item] = res

/home/russ/anaconda/lib/python2.7/site-packages/pandas/core/internals.pyc in get(self, item, fastpath)
   2849 
   2850             if not isnull(item):
-> 2851                 loc = self.items.get_loc(item)
   2852             else:
   2853                 indexer = np.arange(len(self.items))[isnull(self.items)]

/home/russ/anaconda/lib/python2.7/site-packages/pandas/core/index.pyc in get_loc(self, key, method)
   1570         """
   1571         if method is None:
-> 1572             return self._engine.get_loc(_values_from_object(key))
   1573 
   1574         indexer = self.get_indexer([key], method=method)

pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:3824)()

pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:3704)()

pandas/hashtable.pyx in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:12280)()

pandas/hashtable.pyx in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:12231)()

KeyError: 'zip'

In [4]:
import csv
with open("/home/russ/Documents/DDL/Projects/03-censusables/source/Data/raw_files/state_landarea_rank.csv") as f:
    f.readline() # ignore first line (header)
    land_area = dict(csv.reader(f, delimiter=','))
print land_area


{'Mississippi': '31', 'Oklahoma': '19', 'Wyoming': '9', 'Minnesota': '14', 'Illinois': '24', 'Arkansas': '27', 'Ohio': '35', 'Indiana': '38', 'Maryland': '42', 'Louisiana': '33', 'New_Hampshire': '44', 'Idaho': '11', 'New_York': '30', 'Arizona': '6', 'Iowa': '23', 'South_Carolina': '40', 'Michigan': '22', 'Kansas': '13', 'Utah': '12', 'Virginia': '37', 'Oregon': '10', 'District_of_Columbia': '51', 'Connecticut': '48', 'Montana': '4', 'California': '3', 'Texas': '2', 'New_Mexico': '5', 'South_Dakota': '16', 'Massachusetts': '45', 'Vermont': '43', 'Georgia': '21', 'Pennsylvania': '32', 'Florida': '26', 'Alaska': '1', 'North_Dakota': '17', 'Hawaii': '47', 'Nebraska': '15', 'Kentucky': '36', 'Missouri': '18', 'Wisconsin': '25', 'Alabama': '28', 'New_Jersey': '46', 'Colorado': '8', 'Washington': '20', 'West_Virginia': '41', 'Tennessee': '34', 'Rhode_Island': '50', 'North_Carolina': '29', 'Nevada': '7', 'Delaware': '49', 'Maine': '39'}

In [146]:
ziplist = json.loads(df_all_final[['ZCTA5','ZIPName','fit']].head(5).to_json())
ziplist


Out[146]:
{u'ZCTA5': {u'0': u'01001',
  u'1': u'01002',
  u'2': u'01005',
  u'3': u'01007',
  u'4': u'01008'},
 u'ZIPName': {u'0': u'Agawam Town, MA',
  u'1': u'Amherst Center, MA',
  u'2': u'Barre, MA',
  u'3': u'Belchertown, MA',
  u'4': u'Blandford, MA'},
 u'fit': {u'0': 1.1006146364,
  u'1': 0.5398473703,
  u'2': 1.4791458348,
  u'3': 1.2032138701,
  u'4': 1.5669620785}}

In [140]:
ziplist['ZCTA5']['0']


Out[140]:
u'01001'

In [150]:
ziplist = json.loads(df_all_final[['ZCTA5','ZIPName','fit']].sort(['fit']).reset_index().head(5).to_json())

In [151]:
ziplist


Out[151]:
{u'ZCTA5': {u'0': u'06830',
  u'1': u'90046',
  u'2': u'90211',
  u'3': u'94024',
  u'4': u'90212'},
 u'ZIPName': {u'0': u'Greenwich, CT',
  u'1': u'Los Angeles, CA',
  u'2': u'Beverly Hills, CA',
  u'3': u'Los Altos, CA',
  u'4': u'Beverly Hills, CA'},
 u'fit': {u'0': 0.3256283618,
  u'1': 0.3647637893,
  u'2': 0.3673332581,
  u'3': 0.3692640451,
  u'4': 0.3715989326},
 u'index': {u'0': 849, u'1': 10487, u'2': 10506, u'3': 11171, u'4': 10507}}

In [153]:
table_data = []

for i in range (5):
    dict_row = {}
    dict_row['index'] = i
    dict_row['ZCTA5'] = ziplist['ZCTA5'][str(i)]
    dict_row['ZIPName'] = ziplist['ZIPName'][str(i)]
    table_data.append(dict_row)
print table_data


[{'ZCTA5': u'06830', 'index': 0, 'ZIPName': u'Greenwich, CT'}, {'ZCTA5': u'90046', 'index': 1, 'ZIPName': u'Los Angeles, CA'}, {'ZCTA5': u'90211', 'index': 2, 'ZIPName': u'Beverly Hills, CA'}, {'ZCTA5': u'94024', 'index': 3, 'ZIPName': u'Los Altos, CA'}, {'ZCTA5': u'90212', 'index': 4, 'ZIPName': u'Beverly Hills, CA'}]

In [160]:
json_export = json.dumps(table_data)

In [164]:
json_export


Out[164]:
'[{"ZCTA5": "06830", "index": 0, "ZIPName": "Greenwich, CT"}, {"ZCTA5": "90046", "index": 1, "ZIPName": "Los Angeles, CA"}, {"ZCTA5": "90211", "index": 2, "ZIPName": "Beverly Hills, CA"}, {"ZCTA5": "94024", "index": 3, "ZIPName": "Los Altos, CA"}, {"ZCTA5": "90212", "index": 4, "ZIPName": "Beverly Hills, CA"}]'

In [170]:
df_all_final.head(1)


Out[170]:
diversity_index housing_index income_index urban_index zip5 ZIPName State ZCTA5 fit
0 0.239556 0.052897 0.632623 0.239556 1001 Agawam Town, MA MA 01001 1.100615

In [179]:
state = "MD"
df_all_final[['ZCTA5','ZIPName','fit']][df_all_final['State']==state]


Out[179]:
ZCTA5 ZIPName fit
2929 20601 Waldorf, MD 0.823020
2930 20602 Waldorf, MD 0.833981
2931 20603 Waldorf, MD 0.874615
2932 20607 Accokeek, MD 0.779023
2933 20608 Aquasco, MD 0.680266
2934 20609 Avenue, MD 1.260702
2935 20616 Bryans Road, MD 0.792749
2936 20618 Bushwood, MD 0.677027
2937 20619 California, MD 0.909049
2938 20620 Callaway, MD 0.613723
2939 20621 Chaptico, MD 0.703706
2940 20622 Charlotte Hall, MD 0.991696
2941 20623 Rosaryville, MD 0.743587
2942 20625 Cobb Island, MD 1.283090
2943 20634 California, MD 0.710945
2944 20636 Hollywood, MD 1.191136
2945 20637 Hughesville, MD 0.879244
2946 20639 Huntingtown, MD 0.905863
2947 20640 Indian Head, MD 0.801099
2948 20646 La Plata, MD 0.741235
2949 20650 Leonardtown, MD 0.968928
2950 20653 Lexington Park, MD 0.736164
2951 20657 Chesapeake Ranch Estates, MD 0.881315
2952 20658 Marbury, MD 0.754521
2953 20659 Mechanicsville, MD 1.159966
2954 20662 Nanjemoy, MD 0.673161
2955 20664 Newburg, MD 0.706848
2956 20674 Piney Point, MD 1.321042
2957 20675 Bensville, MD 0.852792
2958 20676 Port Republic, MD 1.127359
... ... ... ...
3238 21811 Ocean Pines, MD 0.946528
3239 21813 Bishopville, MD 1.225039
3240 21817 Crisfield, MD 0.715068
3241 21821 Deal Island, MD 1.283110
3242 21822 Eden, MD 0.870842
3243 21826 Fruitland, MD 0.554855
3244 21830 Hebron, MD 0.851687
3245 21837 Mardela Springs, MD 0.823912
3246 21841 Newark, MD 1.291431
3247 21842 Ocean City, MD 1.140685
3248 21849 Parsonsburg, MD 0.843340
3249 21850 Pittsville, MD 1.427987
3250 21851 Pocomoke City, MD 0.580918
3251 21853 Princess Anne, MD 0.688110
3252 21856 Quantico, MD 0.661147
3253 21861 Sharptown, MD 1.048766
3254 21863 Snow Hill, MD 0.652626
3255 21869 Vienna, MD 0.855466
3256 21871 Westover, MD 0.612537
3257 21874 Willards, MD 1.304704
3258 21875 Delmar, MD 0.523362
3259 21901 North East, MD 1.089752
3260 21903 Perryville, MD 0.591037
3261 21904 Port Deposit, MD 0.973152
3262 21911 Rising Sun, MD 1.384978
3263 21912 Warwick, MD 1.301058
3264 21915 Chesapeake City, MD 1.310506
3265 21917 Colora, MD 1.510552
3266 21918 Conowingo, MD 1.463528
3267 21921 Elkton, MD 0.950810

339 rows × 3 columns


In [182]:
state = "VT"
df_all_final[['ZCTA5','ZIPName','fit']][df_all_final['State']==state].sort(['fit']).reset_index().head(5)


Out[182]:
index ZCTA5 ZIPName fit
0 628 05101 Bellows Falls, VT 1.213078

In [ ]: