notebook.community

Edit and run



In [7]:

    
%matplotlib inline
import pandas as pd
import json



In [8]:

    
#File locations
acs_file = "/home/russ/Documents/DDL/Projects/03-censusables/source/Data/raw_files/acs5yr.csv"
zillow_HVI_file = "/home/russ/Documents/DDL/Projects/03-censusables/source/Data/raw_files/Zip_Zhvi_AllHomes_HomeValueIndex.csv"
zillow_RI_file = "/home/russ/Documents/DDL/Projects/03-censusables/source/Data/raw_files/Zip_Zri_AllHomes_RentIndex.csv"
urbanization_zip = "/home/russ/Documents/DDL/Projects/03-censusables/source/Data/raw_files/zcta2010_txt.csv"
ZCTA = "/home/russ/Documents/DDL/Projects/03-censusables/source/Data/raw_files/ZCTA.csv"



In [9]:

    
acs = pd.read_csv(acs_file)



In [10]:

    
diversity = acs[['zip5','pop','race_white','race_black','race_asian','race_indian','race_other','hisp']].copy(deep=True)



In [11]:

    
diversity['white_hisp'] = ((diversity['pop']*diversity['race_white'])*diversity['hisp'])/diversity['pop']



In [12]:

    
diversity['white_nonhisp'] = ((diversity['pop']*diversity['race_white'])*(1-diversity['hisp']))/diversity['pop']



In [13]:

    
diversity['div_index'] = 1- (diversity['race_black']**2 + diversity['white_hisp']**2 + diversity['white_nonhisp']**2 + diversity['race_asian']**2 + diversity['race_indian']**2)



In [14]:

    
diversity_index = diversity[['zip5','div_index']].dropna(axis=0,how='any',subset=['zip5','div_index'])



In [15]:

    
import numpy as np
diversity_index[diversity_index['div_index']==np.nan]









    Out[15]:






  
    
      
      zip5
      div_index



In [16]:

    
urban = pd.read_csv(urbanization_zip)



In [17]:

    
urban.rename(columns={'Zip5':'zip5'},inplace=True)



In [18]:

    
urban['zip5'] = urban.apply(lambda x: float(x['zip5']),axis=1)



In [19]:

    
#urban['pop'] = urban.apply(lambda x: int(x['POPULATION'].replace(',','')),axis=1)
#alternate
#urban['pop'] = urban['POPULATION'].apply(lambda x: int(x.replace(',','')))
urban['pop'] = urban.apply(lambda x: int(x['POPULATION']),axis=1)



In [20]:

    
#strip Z from Zip Code Text
urban['ZCTA5'] = urban.apply(lambda x: x['ZCTA5'][1:],axis=1)



In [21]:

    
urban.head(1)









    Out[21]:






  
    
      
      ZCTA5
      zip5
      LANDSQMT
      WATERSQMT
      LANDSQMI
      WATERSQMI
      POPULATION
      HSGUNITS
      INTPTLAT
      INTPTLON
      pop
    
  
  
    
      0
      00601
      601
      166659789
      799296
      64.35
      0.31
      18570
      7744
      18.180556
      -66.749961
      18570



In [22]:

    
urban['urban_index'] = urban['LANDSQMT']/urban['pop']



In [23]:

    
urban_index = urban[['ZCTA5','zip5','urban_index']].dropna(axis=0,how='any',subset=['zip5','urban_index'])



In [24]:

    
urban_index.head(1)









    Out[24]:






  
    
      
      ZCTA5
      zip5
      urban_index
    
  
  
    
      0
      00601
      601
      8974.678998



In [25]:

    
zillow_HVI = pd.read_csv(zillow_HVI_file)



In [26]:

    
zillow_HVI = zillow_HVI[['RegionName','2014-01','2014-07','2015-01','2015-07']]



In [27]:

    
zillow_HVI.rename(columns={'RegionName':'zip5'},inplace=True)
zillow_HVI.head()



In [28]:

    
zillow_RI = pd.read_csv(zillow_RI_file)



In [29]:

    
len(zillow_HVI)









    Out[29]:





12988



In [30]:

    
zillow_RI.head(1)









    Out[30]:






  
    
      
      RegionName
      City
      State
      Metro
      CountyName
      2010-11
      2010-12
      2011-01
      2011-02
      2011-03
      ...
      2014-10
      2014-11
      2014-12
      2015-01
      2015-02
      2015-03
      2015-04
      2015-05
      2015-06
      2015-07
    
  
  
    
      0
      10025
      New York
      NY
      New York
      New York
      NaN
      NaN
      NaN
      NaN
      NaN
      ...
      3511
      3488
      3500
      3520
      3557
      3588
      3585
      3586
      3581
      3599
    
  

1 rows × 62 columns



In [31]:

    
zillow_RI = zillow_RI[['RegionName','2014-01','2014-07','2015-01','2015-07']].copy(False)



In [32]:

    
zillow_RI.rename(columns={'RegionName':'zip5'},inplace=True)



In [33]:

    
zillow_RI.head()



In [34]:

    
housing_index = pd.merge (zillow_HVI, zillow_RI,how='inner', on='zip5').dropna(axis=0,how='all')



In [35]:

    
#housing_index = zillow_HVI.dropna(axis=0,how='all')



In [36]:

    
acs.head(1).to_csv("ACS_Headers.csv")



In [37]:

    
#housing_index = pd.merge(housing_index, acs[['zip5','rent_median']], how='inner', on='zip5',copy=False)



In [38]:

    
income_index = acs[['zip5','inc_median','poverty','snap','gini_index']].dropna(axis=0,how='all')



In [39]:

    
income_index[income_index['zip5']==90210]









    Out[39]:






  
    
      
      zip5
      inc_median
      poverty
      snap
      gini_index
    
  
  
    
      30079
      90210
      132254
      0.077309
      0.008544
      0.635



In [40]:

    
#Beginning PCA Analysis (reference: http://sebastianraschka.com/Articles/2015_pca_in_3_steps.html)

X_div = diversity_index.ix[:,1].values
y_div = diversity_index.ix[:,0].values



In [41]:

    
y_div









    Out[41]:





array([  601,   602,   603, ..., 99926, 99927, 99929])



In [48]:

    
#Standardization
from sklearn.preprocessing import StandardScaler
X_div_std = StandardScaler().fit_transform(X_div)



In [49]:

    
X_div_std









    Out[49]:





array([-0.68146139,  1.60062294,  0.74761118, ...,  0.11210504,
        0.21154735,  1.06348835])



In [50]:

    
#Same PCA analysis using scikit-learn

from sklearn.decomposition import PCA as sklearnPCA
sklearn_div_pca = sklearnPCA(n_components=1)
Y_div_sklearn = sklearn_div_pca.fit_transform(X_div_std)









    



/home/russ/anaconda/lib/python2.7/site-packages/numpy/core/_methods.py:59: RuntimeWarning: Mean of empty slice.
  warnings.warn("Mean of empty slice.", RuntimeWarning)



In [51]:

    
sklearn_pca.explained_variance_ratio_









    



---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-51-276c2c932926> in <module>()
----> 1 sklearn_pca.explained_variance_ratio_

NameError: name 'sklearn_pca' is not defined



In [52]:

    
len(X_div)









    Out[52]:





32784



In [53]:

    
df_div = pd.DataFrame({'zip5':y_div,'diversity_index':X_div})



In [54]:

    
df_div.head()









    Out[54]:






  
    
      
      diversity_index
      zip5
    
  
  
    
      0
      0.114337
      601
    
    
      1
      0.643286
      602
    
    
      2
      0.445572
      603
    
    
      3
      0.093341
      606
    
    
      4
      0.459638
      610



In [55]:

    
#Beginning PCA Analysis (reference: http://sebastianraschka.com/Articles/2015_pca_in_3_steps.html)

X_urb = urban_index.ix[:,1].values
y_urb = urban_index.ix[:,0].values



In [56]:

    
#Standardization
from sklearn.preprocessing import StandardScaler
X_urb_std = StandardScaler().fit_transform(X_urb)



In [57]:

    
X_urb_std









    Out[57]:





array([-1.7800184 , -1.77998212, -1.77994584, ...,  1.82334699,
        1.82338327,  1.82345583])



In [58]:

    
#Same PCA analysis using scikit-learn

from sklearn.decomposition import PCA as sklearnPCA
sklearn_urb_pca = sklearnPCA(n_components=1)
Y_urb_sklearn = sklearn_pca.fit_transform(X_urb_std)









    



---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-58-751625d30aeb> in <module>()
      3 from sklearn.decomposition import PCA as sklearnPCA
      4 sklearn_urb_pca = sklearnPCA(n_components=1)
----> 5 Y_urb_sklearn = sklearn_pca.fit_transform(X_urb_std)

NameError: name 'sklearn_pca' is not defined



In [59]:

    
df_urb = pd.DataFrame({'zip5':y_div,'urban_index':X_div})



In [60]:

    
df_urb.head(1)









    Out[60]:






  
    
      
      urban_index
      zip5
    
  
  
    
      0
      0.114337
      601



In [61]:

    
housing_index['2014-07_x']









    Out[61]:





0        1001700
1         297000
2         382300
3         111900
4         863900
5         300500
6         130100
7         911000
8         581300
9         190400
10        758500
11        579600
12        222100
13        458300
14        152600
15       1039500
16        137800
17        271400
18        130200
19        710100
20        317000
21        167200
22       1093600
23        151000
24         79800
25       1417400
26        284400
27        106300
28        246800
29        135400
          ...   
12228     155600
12229     224200
12230     162300
12231     447200
12232     248500
12233     103300
12234     372000
12235     237500
12236     236000
12237      76500
12238     252500
12239     168800
12240     215800
12241     110300
12242      82500
12243     235700
12244     134500
12245     167500
12246     196100
12247     121600
12248     247000
12249     211200
12250     144300
12251     163500
12252     222900
12253     512700
12254     303500
12255     206000
12256     830100
12257     165700
Name: 2014-07_x, dtype: float64



In [62]:

    
housing_index.dtypes









    Out[62]:





zip5           int64
2014-01_x      int64
2014-07_x    float64
2015-01_x      int64
2015-07_x      int64
2014-01_y      int64
2014-07_y      int64
2015-01_y      int64
2015-07_y      int64
dtype: object



In [63]:

    
housing_index[housing_index['2014-07_x'].isnull()]
#income_index[income_index['zip5']==90210]



In [64]:

    
#housing_index.isnull()==True
housing_index[housing_index.isnull().any(axis=1)]



In [65]:

    
housing_index[housing_index['zip5']== 21211]



In [66]:

    
housing_index.loc[housing_index['2014-07_x'].isnull(),'2014-07_x'] = housing_index['2014-01_x']



In [67]:

    
housing_index.head(1)



In [68]:

    
#Beginning PCA Analysis (reference: http://sebastianraschka.com/Articles/2015_pca_in_3_steps.html)

X_hou = housing_index.ix[:,1:9].values
zip5_hou = housing_index.ix[:,0].values



In [69]:

    
#Standardization
from sklearn.preprocessing import StandardScaler
X_hou_std = StandardScaler().fit_transform(X_hou)



In [70]:

    
#Same PCA analysis using scikit-learn

from sklearn.decomposition import PCA as sklearnPCA
#sklearn_hou_pca = sklearnPCA(n_components=8)
sklearn_hou_pca = sklearnPCA(n_components=1)
Y_hou_sklearn = sklearn_hou_pca.fit_transform(X_hou_std)



In [71]:

    
sklearn_hou_pca









    Out[71]:





PCA(copy=True, n_components=1, whiten=False)



In [72]:

    
expl_hou_var = sklearn_hou_pca.explained_variance_ratio_



In [73]:

    
sklearn_hou_pca.explained_variance_ratio_









    Out[73]:





array([ 0.96098779])



In [74]:

    
X_hou_std









    Out[74]:





array([[ 3.7929565 ,  3.65054733,  3.64774133, ...,  2.86499164,
         2.7859207 ,  2.73825957],
       [ 0.33423702,  0.30410089,  0.31021787, ...,  0.92538362,
         0.61158254,  0.65113861],
       [ 0.7869773 ,  0.70916955,  0.6526696 , ...,  1.21589123,
         0.98028076,  1.07017968],
       ..., 
       [-0.07229524, -0.12803565, -0.14500357, ...,  0.20272787,
         0.16017255,  0.24018194],
       [ 2.62448348,  2.83566126,  2.81738795, ...,  1.52807851,
         1.66019953,  1.69537279],
       [-0.37903785, -0.31941041, -0.36179761, ..., -0.91738854,
        -0.85760276, -0.847169  ]])



In [75]:

    
sklearn_hou_pca.get_covariance()









    Out[75]:





array([[ 1.00455373,  0.96063112,  0.95844352,  0.95600854,  0.94955811,
         0.95597756,  0.95986476,  0.96100498],
       [ 0.96063112,  1.00587974,  0.95910524,  0.95666858,  0.95021369,
         0.95663758,  0.96052746,  0.96166846],
       [ 0.95844352,  0.95910524,  1.0015065 ,  0.95449   ,  0.94804982,
         0.95445907,  0.9583401 ,  0.9594785 ],
       [ 0.95600854,  0.95666858,  0.95449   ,  0.99665045,  0.94564124,
         0.95203422,  0.95590538,  0.95704089],
       [ 0.94955811,  0.95021369,  0.94804982,  0.94564124,  0.98384615,
         0.9456106 ,  0.94945564,  0.9505835 ],
       [ 0.95597756,  0.95663758,  0.95445907,  0.95203422,  0.9456106 ,
         0.99658876,  0.95587441,  0.95700988],
       [ 0.95986476,  0.96052746,  0.9583401 ,  0.95590538,  0.94945564,
         0.95587441,  1.00434657,  0.96090128],
       [ 0.96100498,  0.96166846,  0.9594785 ,  0.95704089,  0.9505835 ,
         0.95700988,  0.96090128,  1.00662811]])



In [76]:

    
sklearn_hou_pca.explained_variance_









    Out[76]:





array([ 7.6879023])



In [77]:

    
sklearn_hou_pca.explained_variance_ratio_









    Out[77]:





array([ 0.96098779])



In [78]:

    
expl_hou_val = sklearn_hou_pca.explained_variance_ratio_



In [79]:

    
ev = expl_hou_val.tolist()
ls =  ['PC%s' %i for i in range(0,len(ev))]
df_hou_explainedValue = pd.DataFrame(ev,columns=['Value'],index=ls)



In [80]:

    
df_hou_explainedValue.plot(kind='bar')









    Out[80]:





<matplotlib.axes._subplots.AxesSubplot at 0x7fb448058c50>



In [81]:

    
sklearn_hou_pca.components_









    Out[81]:





array([[-0.35439495, -0.35463963, -0.35383202, -0.35293309, -0.35055176,
        -0.35292166, -0.35435671, -0.35477764]])



In [82]:

    
df_hou = pd.DataFrame({'zip5':zip5_hou,'housing_index':Y_hou_sklearn[:,0]})



In [83]:

    
df_hou['housing_index'] = df_hou.apply(lambda x: x['housing_index']*-1,axis=1)



In [84]:

    
df_hou[df_hou['zip5']==21222]









    Out[84]:






  
    
      
      housing_index
      zip5
    
  
  
    
      275
      -1.014197
      21222



In [85]:

    
#Beginning PCA Analysis (reference: http://sebastianraschka.com/Articles/2015_pca_in_3_steps.html)

X_inc = income_index.ix[:,1:5].values
zip5_inc = income_index.ix[:,0].values



In [86]:

    
#Standardization
from sklearn.preprocessing import StandardScaler
X_inc_std = StandardScaler().fit_transform(X_inc)



In [87]:

    
#Same PCA analysis using scikit-learn

from sklearn.decomposition import PCA as sklearnPCA
#sklearn_inc_pca = sklearnPCA(n_components=4)
sklearn_inc_pca = sklearnPCA(n_components=1)
Y_inc_sklearn = sklearn_inc_pca.fit_transform(X_inc_std)



In [88]:

    
X_inc_std









    Out[88]:





array([[-1.59359348,  4.4150441 ,  3.50806255,  1.95198116],
       [-1.44234668,  3.29998401,  2.80016619,  1.23438494],
       [-1.44977957,  3.13114555,  2.87822525,  1.60594258],
       ..., 
       [-0.00633725, -0.14431335,  2.2477343 ,  0.12175354],
       [-1.30300086, -0.86957667,  0.99043425,  0.58926288],
       [-0.1901969 , -0.30098952,  0.45777452,  0.24730736]])



In [89]:

    
sklearn_inc_pca.explained_variance_ratio_









    Out[89]:





array([ 0.46377581])



In [90]:

    
sklearn_inc_pca.explained_variance_









    Out[90]:





array([ 1.85510322])



In [91]:

    
expl_inc_var = sklearn_inc_pca.explained_variance_ratio_
ev = expl_inc_var.tolist()
ls =  ['PC%s' %i for i in range(0,len(ev))]
df_inc_explainedValue = pd.DataFrame(ev,columns=['Value'],index=ls)



In [92]:

    
df_inc_explainedValue.plot(kind='bar')









    Out[92]:





<matplotlib.axes._subplots.AxesSubplot at 0x7fb448058b10>



In [93]:

    
df_inc = pd.DataFrame({'zip5':zip5_inc,'income_index':Y_inc_sklearn[:,0]})



In [94]:

    
df_inc[df_inc['zip5']==21223]









    Out[94]:






  
    
      
      income_index
      zip5
    
  
  
    
      6570
      -3.13511
      21223



In [95]:

    
df_final = pd.merge (df_inc,df_hou,on='zip5')



In [96]:

    
df_final= pd.merge (df_final,df_urb,on='zip5')



In [97]:

    
df_final= pd.merge (df_final,df_div,on='zip5')



In [98]:

    
df_final.head(1)









    Out[98]:






  
    
      
      income_index
      zip5
      housing_index
      urban_index
      diversity_index
    
  
  
    
      0
      0.60221
      1001
      -0.570902
      0.233848
      0.233848



In [99]:

    
df_final[df_final['zip5']==90210]









    Out[99]:






  
    
      
      income_index
      zip5
      housing_index
      urban_index
      diversity_index
    
  
  
    
      10505
      1.942892
      90210
      49.762782
      0.319288
      0.319288



In [100]:

    
#Rescale indexes
from sklearn import preprocessing

zip5_final = df_final['zip5'].values
std_scale = preprocessing.StandardScaler().fit(df_final[['income_index', 'housing_index','urban_index','diversity_index']])
df_std = std_scale.transform(df_final[['income_index', 'housing_index','urban_index','diversity_index']])

minmax_scale = preprocessing.MinMaxScaler().fit(df_final[['income_index', 'housing_index','urban_index','diversity_index']])
df_minmax = minmax_scale.transform(df_final[['income_index', 'housing_index','urban_index','diversity_index']])



In [101]:

    
df_minmax_final = pd.DataFrame({'zip5':zip5_final,'income_index':df_minmax[:,0],'housing_index':df_minmax[:,1],'urban_index':df_minmax[:,2],'diversity_index':df_minmax[:,3]})



In [102]:

    
df_minmax_final[df_minmax_final['zip5']==21230]









    Out[102]:






  
    
      
      diversity_index
      housing_index
      income_index
      urban_index
      zip5
    
  
  
    
      3141
      0.542558
      0.064618
      0.575462
      0.542558
      21230



In [103]:

    
ZCTA = pd.read_csv(ZCTA)



In [104]:

    
ZCTA.head(1)









    Out[104]:






  
    
      
      zcta5
      ZIPName
      State
      FIPS state
      County
      County2
      PlaceFP
      PlaceFP2
      cbsa
      MetDiv
      csa
      CBSAType
      CBSAName
      MetDivName
      CSAName
      NectaName
      UAName
      Puma12Name
    
  
  
    
      0
      35004
      Moody, AL
      AL
      1
      St. Clair AL
      NaN
      51096
      46696
      13820
      NaN
      142
      Metro
      Birmingham-Hoover, AL
      NaN
      Birmingham-Hoover-Talladega, AL
      NaN
      Birmingham, AL Urbanized Area
      St. Clair & Blount Counties



In [105]:

    
df_all_final = pd.merge (df_minmax_final,ZCTA[['zcta5','ZIPName','State']],left_on='zip5',right_on='zcta5',copy=False)



In [106]:

    
del df_all_final['zcta5']



In [107]:

    
df_all_final = pd.merge(df_all_final,urban[['zip5','ZCTA5']],copy=False)



In [108]:

    
df_all_final.to_csv('/home/russ/Documents/DDL/Projects/03-censusables/source/Data/final_files/Final.csv')



In [110]:

    
fit = [.5,.5,.5,.5]
#fit = [0,0,1,0]
df_all_final['fit'] = df_all_final.apply(lambda x: abs(fit[0]-x['diversity_index'])+abs(fit[1]-x['housing_index'])+\
                                        abs(fit[2]-x['income_index'])+abs(fit[3]-x['urban_index']),axis=1)



In [111]:

    
state = 'MD'
if state:
    df_display = df_all_final[df_all_final['State']==state].sort(['fit']).head(10)
else:
    df_display = df_all_final.sort(['fit']).head(10)
df_display









    Out[111]:






  
    
      
      diversity_index
      housing_index
      income_index
      urban_index
      zip5
      ZIPName
      State
      ZCTA5
      fit
    
  
  
    
      2972
      0.499313
      0.061582
      0.540120
      0.499313
      20710
      Bladensburg, MD
      MD
      20710
      0.479912
    
    
      3258
      0.469100
      0.040419
      0.498020
      0.469100
      21875
      Delmar, MD
      MD
      21875
      0.523362
    
    
      3117
      0.491175
      0.044540
      0.552835
      0.491175
      21206
      Baltimore, MD
      MD
      21206
      0.525945
    
    
      3182
      0.514728
      0.040767
      0.537463
      0.514728
      21643
      Hurlock, MD
      MD
      21643
      0.526152
    
    
      3237
      0.516602
      0.038326
      0.547926
      0.516602
      21804
      Salisbury, MD
      MD
      21804
      0.542804
    
    
      3132
      0.513988
      0.054983
      0.571036
      0.513988
      21221
      Essex, MD
      MD
      21221
      0.544029
    
    
      3243
      0.513848
      0.034980
      0.562139
      0.513848
      21826
      Fruitland, MD
      MD
      21826
      0.554855
    
    
      2960
      0.505270
      0.081946
      0.633642
      0.505270
      20678
      Prince Frederick, MD
      MD
      20678
      0.562236
    
    
      2973
      0.461209
      0.105898
      0.596931
      0.461209
      20711
      Lothian, MD
      MD
      20711
      0.568616
    
    
      3191
      0.522953
      0.127711
      0.657794
      0.522953
      21662
      Royal Oak, MD
      MD
      21662
      0.575989



In [112]:

    
df_all_final.sort(['fit']).head(10)









    Out[112]:






  
    
      
      diversity_index
      housing_index
      income_index
      urban_index
      zip5
      ZIPName
      State
      ZCTA5
      fit
    
  
  
    
      849
      0.510518
      0.402507
      0.707100
      0.510518
      6830
      Greenwich, CT
      CT
      06830
      0.325628
    
    
      10487
      0.482534
      0.263484
      0.593316
      0.482534
      90046
      Los Angeles, CA
      CA
      90046
      0.364764
    
    
      10506
      0.450808
      0.376718
      0.645668
      0.450808
      90211
      Beverly Hills, CA
      CA
      90211
      0.367333
    
    
      11171
      0.503706
      0.482497
      0.844349
      0.503706
      94024
      Los Altos, CA
      CA
      94024
      0.369264
    
    
      10507
      0.426886
      0.449827
      0.675198
      0.426886
      90212
      Beverly Hills, CA
      CA
      90212
      0.371599
    
    
      10500
      0.482723
      0.278766
      0.631285
      0.482723
      90068
      Los Angeles, CA
      CA
      90068
      0.387072
    
    
      1356
      0.470625
      0.322967
      0.653689
      0.470625
      10003
      New York, NY
      NY
      10003
      0.389471
    
    
      11948
      0.511741
      0.157938
      0.466941
      0.511741
      98105
      Seattle, WA
      WA
      98105
      0.398604
    
    
      11197
      0.563598
      0.236204
      0.516794
      0.563598
      94108
      San Francisco, CA
      CA
      94108
      0.407786
    
    
      11183
      0.484693
      0.371334
      0.755528
      0.484693
      94062
      Redwood City, CA
      CA
      94062
      0.414807



In [113]:

    
fit = []
fit.append(float(50)/100)
fit.append(float(50)/100)
fit.append(float(50)/100)
fit.append(float(50)/100)        
df_all_final['fit'] = df_all_final.apply(lambda x: abs(fit[0]-x['diversity_index'])+abs(fit[1]-x['housing_index'])+\
                                        abs(fit[2]-x['income_index'])+abs(fit[3]-x['urban_index']),axis=1)



In [537]:

    
import vincent
from vincent import AxisProperties, PropertySet, ValueRef
vincent.core.initialize_notebook()



In [593]:

    
zip_topo = r'zips_us_topo.json'
state_topo = r'us_states.topo.json'

geo_data = [{'name': 'states',
             'url': state_topo,
             'feature': 'us_states.geo'},
            {'name': 'zip_codes',
             'url': zip_topo,
             'feature': 'zip_codes_for_the_usa'}]

vis = vincent.Map(data=df_all_final, geo_data=geo_data, scale=800, projection='albersUsa',
          data_bind='fit', data_key='zip5',brew='YlOrRd',
          map_key={'zip_codes': 'properties.zip'})
del vis.marks[0].properties.update
#del vis.marks[1].properties.update
#vis.marks[1].properties.update.fill.value = '#FFFFFF'
#vis.marks[1].properties.enter.stroke.value = '#CCCCFF'
vis.marks[1].properties.enter.stroke_opacity = ValueRef(value=0.05)
vis.marks[0].properties.enter.stroke.value = '#C0C0C0'
#vis.marks[1].properties.hover.fill.value = 'red'
vis.legend(title='Preferred ZipCode')
vis.display()
vis.to_json("USA_Preferred.json")



In [541]:

    
zip_topo = r'zips_us_topo.json'
state_topo = r'us_states.topo.json'

geo_data2 = [{'name': 'zip_codes',
             'url': zip_topo,
             'feature': 'zip_codes_for_the_usa'},
            {'name': 'states',
             'url': state_topo,
             'feature': 'us_states.geo'}]

geo_data = [{'name': 'states',
             'url': state_topo,
             'feature': 'us_states.geo'},
            {'name': 'zip_codes',
             'url': zip_topo,
             'feature': 'zip_codes_for_the_usa'}]

vis = vincent.Map(data=urban[urban['zip']== '83211'], geo_data=geo_data, scale=1100, projection='albersUsa',
          data_bind='POPULATION', data_key='zip',brew='PuRd',
          map_key={'zip_codes': 'properties.zip'})
del vis.marks[0].properties.update
#del vis.marks[1].properties.update
vis.marks[1].properties.update.fill.value = '#C390D4'
#vis.marks[1].properties.enter.stroke.value = '#CCCCFF'
#vis.marks[1].properties.enter.stroke_opacity = ValueRef(value=0.1)
vis.marks[0].properties.enter.stroke.value = '#FF0000'
vis.legend(title='POPULATION')
vis.display()
vis.to_json("USA.json")









    



---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-541-13c50b6d908d> in <module>()
     16              'feature': 'zip_codes_for_the_usa'}]
     17 
---> 18 vis = vincent.Map(data=urban[urban['zip']== '83211'], geo_data=geo_data, scale=1100, projection='albersUsa',
     19           data_bind='POPULATION', data_key='zip',brew='PuRd',
     20           map_key={'zip_codes': 'properties.zip'})

/home/russ/anaconda/lib/python2.7/site-packages/pandas/core/frame.pyc in __getitem__(self, key)
   1795             return self._getitem_multilevel(key)
   1796         else:
-> 1797             return self._getitem_column(key)
   1798 
   1799     def _getitem_column(self, key):

/home/russ/anaconda/lib/python2.7/site-packages/pandas/core/frame.pyc in _getitem_column(self, key)
   1802         # get column
   1803         if self.columns.is_unique:
-> 1804             return self._get_item_cache(key)
   1805 
   1806         # duplicate columns & possible reduce dimensionaility

/home/russ/anaconda/lib/python2.7/site-packages/pandas/core/generic.pyc in _get_item_cache(self, item)
   1082         res = cache.get(item)
   1083         if res is None:
-> 1084             values = self._data.get(item)
   1085             res = self._box_item_values(item, values)
   1086             cache[item] = res

/home/russ/anaconda/lib/python2.7/site-packages/pandas/core/internals.pyc in get(self, item, fastpath)
   2849 
   2850             if not isnull(item):
-> 2851                 loc = self.items.get_loc(item)
   2852             else:
   2853                 indexer = np.arange(len(self.items))[isnull(self.items)]

/home/russ/anaconda/lib/python2.7/site-packages/pandas/core/index.pyc in get_loc(self, key, method)
   1570         """
   1571         if method is None:
-> 1572             return self._engine.get_loc(_values_from_object(key))
   1573 
   1574         indexer = self.get_indexer([key], method=method)

pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:3824)()

pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:3704)()

pandas/hashtable.pyx in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:12280)()

pandas/hashtable.pyx in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:12231)()

KeyError: 'zip'



In [4]:

    
import csv
with open("/home/russ/Documents/DDL/Projects/03-censusables/source/Data/raw_files/state_landarea_rank.csv") as f:
    f.readline() # ignore first line (header)
    land_area = dict(csv.reader(f, delimiter=','))
print land_area









    



{'Mississippi': '31', 'Oklahoma': '19', 'Wyoming': '9', 'Minnesota': '14', 'Illinois': '24', 'Arkansas': '27', 'Ohio': '35', 'Indiana': '38', 'Maryland': '42', 'Louisiana': '33', 'New_Hampshire': '44', 'Idaho': '11', 'New_York': '30', 'Arizona': '6', 'Iowa': '23', 'South_Carolina': '40', 'Michigan': '22', 'Kansas': '13', 'Utah': '12', 'Virginia': '37', 'Oregon': '10', 'District_of_Columbia': '51', 'Connecticut': '48', 'Montana': '4', 'California': '3', 'Texas': '2', 'New_Mexico': '5', 'South_Dakota': '16', 'Massachusetts': '45', 'Vermont': '43', 'Georgia': '21', 'Pennsylvania': '32', 'Florida': '26', 'Alaska': '1', 'North_Dakota': '17', 'Hawaii': '47', 'Nebraska': '15', 'Kentucky': '36', 'Missouri': '18', 'Wisconsin': '25', 'Alabama': '28', 'New_Jersey': '46', 'Colorado': '8', 'Washington': '20', 'West_Virginia': '41', 'Tennessee': '34', 'Rhode_Island': '50', 'North_Carolina': '29', 'Nevada': '7', 'Delaware': '49', 'Maine': '39'}



In [146]:

    
ziplist = json.loads(df_all_final[['ZCTA5','ZIPName','fit']].head(5).to_json())
ziplist









    Out[146]:





{u'ZCTA5': {u'0': u'01001',
  u'1': u'01002',
  u'2': u'01005',
  u'3': u'01007',
  u'4': u'01008'},
 u'ZIPName': {u'0': u'Agawam Town, MA',
  u'1': u'Amherst Center, MA',
  u'2': u'Barre, MA',
  u'3': u'Belchertown, MA',
  u'4': u'Blandford, MA'},
 u'fit': {u'0': 1.1006146364,
  u'1': 0.5398473703,
  u'2': 1.4791458348,
  u'3': 1.2032138701,
  u'4': 1.5669620785}}



In [140]:

    
ziplist['ZCTA5']['0']









    Out[140]:





u'01001'



In [150]:

    
ziplist = json.loads(df_all_final[['ZCTA5','ZIPName','fit']].sort(['fit']).reset_index().head(5).to_json())



In [151]:

    
ziplist









    Out[151]:





{u'ZCTA5': {u'0': u'06830',
  u'1': u'90046',
  u'2': u'90211',
  u'3': u'94024',
  u'4': u'90212'},
 u'ZIPName': {u'0': u'Greenwich, CT',
  u'1': u'Los Angeles, CA',
  u'2': u'Beverly Hills, CA',
  u'3': u'Los Altos, CA',
  u'4': u'Beverly Hills, CA'},
 u'fit': {u'0': 0.3256283618,
  u'1': 0.3647637893,
  u'2': 0.3673332581,
  u'3': 0.3692640451,
  u'4': 0.3715989326},
 u'index': {u'0': 849, u'1': 10487, u'2': 10506, u'3': 11171, u'4': 10507}}



In [153]:

    
table_data = []

for i in range (5):
    dict_row = {}
    dict_row['index'] = i
    dict_row['ZCTA5'] = ziplist['ZCTA5'][str(i)]
    dict_row['ZIPName'] = ziplist['ZIPName'][str(i)]
    table_data.append(dict_row)
print table_data









    



[{'ZCTA5': u'06830', 'index': 0, 'ZIPName': u'Greenwich, CT'}, {'ZCTA5': u'90046', 'index': 1, 'ZIPName': u'Los Angeles, CA'}, {'ZCTA5': u'90211', 'index': 2, 'ZIPName': u'Beverly Hills, CA'}, {'ZCTA5': u'94024', 'index': 3, 'ZIPName': u'Los Altos, CA'}, {'ZCTA5': u'90212', 'index': 4, 'ZIPName': u'Beverly Hills, CA'}]



In [ ]:

	zip5	2014-01	2014-07	2015-01	2015-07
0	10025	998300	1001700	1029600	1097100
1	60657	294700	297000	304500	307000
2	60614	386800	382300	378900	393000
3	79936	109600	111900	111600	112100
4	10002	849400	863900	903200	951600

	zip5	2014-01	2014-07	2015-01	2015-07
0	10025	3477	3484	3520	3599
1	60657	2005	2142	1969	2050
2	10023	4251	4312	4370	4620
3	60614	2199	2343	2232	2361
4	79936	1029	1029	1022	1057

	diversity_index	zip5
0	0.114337	601
1	0.643286	602
2	0.445572	603
3	0.093341	606
4	0.459638	610

	diversity_index	housing_index	income_index	urban_index	zip5	ZIPName	State	ZCTA5	fit
2972	0.499313	0.061582	0.540120	0.499313	20710	Bladensburg, MD	MD	20710	0.479912
3258	0.469100	0.040419	0.498020	0.469100	21875	Delmar, MD	MD	21875	0.523362
3117	0.491175	0.044540	0.552835	0.491175	21206	Baltimore, MD	MD	21206	0.525945
3182	0.514728	0.040767	0.537463	0.514728	21643	Hurlock, MD	MD	21643	0.526152
3237	0.516602	0.038326	0.547926	0.516602	21804	Salisbury, MD	MD	21804	0.542804
3132	0.513988	0.054983	0.571036	0.513988	21221	Essex, MD	MD	21221	0.544029
3243	0.513848	0.034980	0.562139	0.513848	21826	Fruitland, MD	MD	21826	0.554855
2960	0.505270	0.081946	0.633642	0.505270	20678	Prince Frederick, MD	MD	20678	0.562236
2973	0.461209	0.105898	0.596931	0.461209	20711	Lothian, MD	MD	20711	0.568616
3191	0.522953	0.127711	0.657794	0.522953	21662	Royal Oak, MD	MD	21662	0.575989

	diversity_index	housing_index	income_index	urban_index	zip5	ZIPName	State	ZCTA5	fit
849	0.510518	0.402507	0.707100	0.510518	6830	Greenwich, CT	CT	06830	0.325628
10487	0.482534	0.263484	0.593316	0.482534	90046	Los Angeles, CA	CA	90046	0.364764
10506	0.450808	0.376718	0.645668	0.450808	90211	Beverly Hills, CA	CA	90211	0.367333
11171	0.503706	0.482497	0.844349	0.503706	94024	Los Altos, CA	CA	94024	0.369264
10507	0.426886	0.449827	0.675198	0.426886	90212	Beverly Hills, CA	CA	90212	0.371599
10500	0.482723	0.278766	0.631285	0.482723	90068	Los Angeles, CA	CA	90068	0.387072
1356	0.470625	0.322967	0.653689	0.470625	10003	New York, NY	NY	10003	0.389471
11948	0.511741	0.157938	0.466941	0.511741	98105	Seattle, WA	WA	98105	0.398604
11197	0.563598	0.236204	0.516794	0.563598	94108	San Francisco, CA	CA	94108	0.407786
11183	0.484693	0.371334	0.755528	0.484693	94062	Redwood City, CA	CA	94062	0.414807