In [7]:
%matplotlib inline
import pandas as pd
import json

In [8]:
#File locations
acs_file = "/home/russ/Documents/DDL/Projects/03-censusables/source/Data/raw_files/acs5yr.csv"
zillow_HVI_file = "/home/russ/Documents/DDL/Projects/03-censusables/source/Data/raw_files/Zip_Zhvi_AllHomes_HomeValueIndex.csv"
zillow_RI_file = "/home/russ/Documents/DDL/Projects/03-censusables/source/Data/raw_files/Zip_Zri_AllHomes_RentIndex.csv"
urbanization_zip = "/home/russ/Documents/DDL/Projects/03-censusables/source/Data/raw_files/zcta2010_txt.csv"
ZCTA = "/home/russ/Documents/DDL/Projects/03-censusables/source/Data/raw_files/ZCTA.csv"

In [9]:
acs = pd.read_csv(acs_file)

In [10]:
diversity = acs[['zip5','pop','race_white','race_black','race_asian','race_indian','race_other','hisp']].copy(deep=True)

In [11]:
diversity['white_hisp'] = ((diversity['pop']*diversity['race_white'])*diversity['hisp'])/diversity['pop']

In [12]:
diversity['white_nonhisp'] = ((diversity['pop']*diversity['race_white'])*(1-diversity['hisp']))/diversity['pop']

In [13]:
diversity['div_index'] = 1- (diversity['race_black']**2 + diversity['white_hisp']**2 + diversity['white_nonhisp']**2 + diversity['race_asian']**2 + diversity['race_indian']**2)

In [14]:
diversity_index = diversity[['zip5','div_index']].dropna(axis=0,how='any',subset=['zip5','div_index'])

In [15]:
import numpy as np
diversity_index[diversity_index['div_index']==np.nan]


Out[15]:
zip5 div_index

In [16]:
urban = pd.read_csv(urbanization_zip)

In [17]:
urban.rename(columns={'Zip5':'zip5'},inplace=True)

In [18]:
urban['zip5'] = urban.apply(lambda x: float(x['zip5']),axis=1)

In [19]:
#urban['pop'] = urban.apply(lambda x: int(x['POPULATION'].replace(',','')),axis=1)
#alternate
#urban['pop'] = urban['POPULATION'].apply(lambda x: int(x.replace(',','')))
urban['pop'] = urban.apply(lambda x: int(x['POPULATION']),axis=1)

In [20]:
#strip Z from Zip Code Text
urban['ZCTA5'] = urban.apply(lambda x: x['ZCTA5'][1:],axis=1)

In [21]:
urban.head(1)


Out[21]:
ZCTA5 zip5 LANDSQMT WATERSQMT LANDSQMI WATERSQMI POPULATION HSGUNITS INTPTLAT INTPTLON pop
0 00601 601 166659789 799296 64.35 0.31 18570 7744 18.180556 -66.749961 18570

In [22]:
urban['urban_index'] = urban['LANDSQMT']/urban['pop']

In [23]:
urban_index = urban[['ZCTA5','zip5','urban_index']].dropna(axis=0,how='any',subset=['zip5','urban_index'])

In [24]:
urban_index.head(1)


Out[24]:
ZCTA5 zip5 urban_index
0 00601 601 8974.678998

In [25]:
zillow_HVI = pd.read_csv(zillow_HVI_file)

In [26]:
zillow_HVI = zillow_HVI[['RegionName','2014-01','2014-07','2015-01','2015-07']]

In [27]:
zillow_HVI.rename(columns={'RegionName':'zip5'},inplace=True)
zillow_HVI.head()


Out[27]:
zip5 2014-01 2014-07 2015-01 2015-07
0 10025 998300 1001700 1029600 1097100
1 60657 294700 297000 304500 307000
2 60614 386800 382300 378900 393000
3 79936 109600 111900 111600 112100
4 10002 849400 863900 903200 951600

In [28]:
zillow_RI = pd.read_csv(zillow_RI_file)

In [29]:
len(zillow_HVI)


Out[29]:
12988

In [30]:
zillow_RI.head(1)


Out[30]:
RegionName City State Metro CountyName 2010-11 2010-12 2011-01 2011-02 2011-03 ... 2014-10 2014-11 2014-12 2015-01 2015-02 2015-03 2015-04 2015-05 2015-06 2015-07
0 10025 New York NY New York New York NaN NaN NaN NaN NaN ... 3511 3488 3500 3520 3557 3588 3585 3586 3581 3599

1 rows × 62 columns


In [31]:
zillow_RI = zillow_RI[['RegionName','2014-01','2014-07','2015-01','2015-07']].copy(False)

In [32]:
zillow_RI.rename(columns={'RegionName':'zip5'},inplace=True)

In [33]:
zillow_RI.head()


Out[33]:
zip5 2014-01 2014-07 2015-01 2015-07
0 10025 3477 3484 3520 3599
1 60657 2005 2142 1969 2050
2 10023 4251 4312 4370 4620
3 60614 2199 2343 2232 2361
4 79936 1029 1029 1022 1057

In [34]:
housing_index = pd.merge (zillow_HVI, zillow_RI,how='inner', on='zip5').dropna(axis=0,how='all')

In [35]:
#housing_index = zillow_HVI.dropna(axis=0,how='all')

In [36]:
acs.head(1).to_csv("ACS_Headers.csv")

In [37]:
#housing_index = pd.merge(housing_index, acs[['zip5','rent_median']], how='inner', on='zip5',copy=False)

In [38]:
income_index = acs[['zip5','inc_median','poverty','snap','gini_index']].dropna(axis=0,how='all')

In [39]:
income_index[income_index['zip5']==90210]


Out[39]:
zip5 inc_median poverty snap gini_index
30079 90210 132254 0.077309 0.008544 0.635

In [40]:
#Beginning PCA Analysis (reference: http://sebastianraschka.com/Articles/2015_pca_in_3_steps.html)

X_div = diversity_index.ix[:,1].values
y_div = diversity_index.ix[:,0].values

In [41]:
y_div


Out[41]:
array([  601,   602,   603, ..., 99926, 99927, 99929])

In [48]:
#Standardization
from sklearn.preprocessing import StandardScaler
X_div_std = StandardScaler().fit_transform(X_div)

In [49]:
X_div_std


Out[49]:
array([-0.68146139,  1.60062294,  0.74761118, ...,  0.11210504,
        0.21154735,  1.06348835])

In [50]:
#Same PCA analysis using scikit-learn

from sklearn.decomposition import PCA as sklearnPCA
sklearn_div_pca = sklearnPCA(n_components=1)
Y_div_sklearn = sklearn_div_pca.fit_transform(X_div_std)


/home/russ/anaconda/lib/python2.7/site-packages/numpy/core/_methods.py:59: RuntimeWarning: Mean of empty slice.
  warnings.warn("Mean of empty slice.", RuntimeWarning)

In [51]:
sklearn_pca.explained_variance_ratio_


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-51-276c2c932926> in <module>()
----> 1 sklearn_pca.explained_variance_ratio_

NameError: name 'sklearn_pca' is not defined

In [52]:
len(X_div)


Out[52]:
32784

In [53]:
df_div = pd.DataFrame({'zip5':y_div,'diversity_index':X_div})

In [54]:
df_div.head()


Out[54]:
diversity_index zip5
0 0.114337 601
1 0.643286 602
2 0.445572 603
3 0.093341 606
4 0.459638 610

In [55]:
#Beginning PCA Analysis (reference: http://sebastianraschka.com/Articles/2015_pca_in_3_steps.html)

X_urb = urban_index.ix[:,1].values
y_urb = urban_index.ix[:,0].values

In [56]:
#Standardization
from sklearn.preprocessing import StandardScaler
X_urb_std = StandardScaler().fit_transform(X_urb)

In [57]:
X_urb_std


Out[57]:
array([-1.7800184 , -1.77998212, -1.77994584, ...,  1.82334699,
        1.82338327,  1.82345583])

In [58]:
#Same PCA analysis using scikit-learn

from sklearn.decomposition import PCA as sklearnPCA
sklearn_urb_pca = sklearnPCA(n_components=1)
Y_urb_sklearn = sklearn_pca.fit_transform(X_urb_std)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-58-751625d30aeb> in <module>()
      3 from sklearn.decomposition import PCA as sklearnPCA
      4 sklearn_urb_pca = sklearnPCA(n_components=1)
----> 5 Y_urb_sklearn = sklearn_pca.fit_transform(X_urb_std)

NameError: name 'sklearn_pca' is not defined

In [59]:
df_urb = pd.DataFrame({'zip5':y_div,'urban_index':X_div})

In [60]:
df_urb.head(1)


Out[60]:
urban_index zip5
0 0.114337 601

In [61]:
housing_index['2014-07_x']


Out[61]:
0        1001700
1         297000
2         382300
3         111900
4         863900
5         300500
6         130100
7         911000
8         581300
9         190400
10        758500
11        579600
12        222100
13        458300
14        152600
15       1039500
16        137800
17        271400
18        130200
19        710100
20        317000
21        167200
22       1093600
23        151000
24         79800
25       1417400
26        284400
27        106300
28        246800
29        135400
          ...   
12228     155600
12229     224200
12230     162300
12231     447200
12232     248500
12233     103300
12234     372000
12235     237500
12236     236000
12237      76500
12238     252500
12239     168800
12240     215800
12241     110300
12242      82500
12243     235700
12244     134500
12245     167500
12246     196100
12247     121600
12248     247000
12249     211200
12250     144300
12251     163500
12252     222900
12253     512700
12254     303500
12255     206000
12256     830100
12257     165700
Name: 2014-07_x, dtype: float64

In [62]:
housing_index.dtypes


Out[62]:
zip5           int64
2014-01_x      int64
2014-07_x    float64
2015-01_x      int64
2015-07_x      int64
2014-01_y      int64
2014-07_y      int64
2015-01_y      int64
2015-07_y      int64
dtype: object

In [63]:
housing_index[housing_index['2014-07_x'].isnull()]
#income_index[income_index['zip5']==90210]


Out[63]:
zip5 2014-01_x 2014-07_x 2015-01_x 2015-07_x 2014-01_y 2014-07_y 2015-01_y 2015-07_y
5154 29341 63900 NaN 74700 72600 1000 910 880 935

In [64]:
#housing_index.isnull()==True
housing_index[housing_index.isnull().any(axis=1)]


Out[64]:
zip5 2014-01_x 2014-07_x 2015-01_x 2015-07_x 2014-01_y 2014-07_y 2015-01_y 2015-07_y
5154 29341 63900 NaN 74700 72600 1000 910 880 935

In [65]:
housing_index[housing_index['zip5']== 21211]


Out[65]:
zip5 2014-01_x 2014-07_x 2015-01_x 2015-07_x 2014-01_y 2014-07_y 2015-01_y 2015-07_y
4766 21211 167300 176900 179200 188100 1325 1377 1389 1473

In [66]:
housing_index.loc[housing_index['2014-07_x'].isnull(),'2014-07_x'] = housing_index['2014-01_x']

In [67]:
housing_index.head(1)


Out[67]:
zip5 2014-01_x 2014-07_x 2015-01_x 2015-07_x 2014-01_y 2014-07_y 2015-01_y 2015-07_y
0 10025 998300 1001700 1029600 1097100 3477 3484 3520 3599

In [68]:
#Beginning PCA Analysis (reference: http://sebastianraschka.com/Articles/2015_pca_in_3_steps.html)

X_hou = housing_index.ix[:,1:9].values
zip5_hou = housing_index.ix[:,0].values

In [69]:
#Standardization
from sklearn.preprocessing import StandardScaler
X_hou_std = StandardScaler().fit_transform(X_hou)

In [70]:
#Same PCA analysis using scikit-learn

from sklearn.decomposition import PCA as sklearnPCA
#sklearn_hou_pca = sklearnPCA(n_components=8)
sklearn_hou_pca = sklearnPCA(n_components=1)
Y_hou_sklearn = sklearn_hou_pca.fit_transform(X_hou_std)

In [71]:
sklearn_hou_pca


Out[71]:
PCA(copy=True, n_components=1, whiten=False)

In [72]:
expl_hou_var = sklearn_hou_pca.explained_variance_ratio_

In [73]:
sklearn_hou_pca.explained_variance_ratio_


Out[73]:
array([ 0.96098779])

In [74]:
X_hou_std


Out[74]:
array([[ 3.7929565 ,  3.65054733,  3.64774133, ...,  2.86499164,
         2.7859207 ,  2.73825957],
       [ 0.33423702,  0.30410089,  0.31021787, ...,  0.92538362,
         0.61158254,  0.65113861],
       [ 0.7869773 ,  0.70916955,  0.6526696 , ...,  1.21589123,
         0.98028076,  1.07017968],
       ..., 
       [-0.07229524, -0.12803565, -0.14500357, ...,  0.20272787,
         0.16017255,  0.24018194],
       [ 2.62448348,  2.83566126,  2.81738795, ...,  1.52807851,
         1.66019953,  1.69537279],
       [-0.37903785, -0.31941041, -0.36179761, ..., -0.91738854,
        -0.85760276, -0.847169  ]])

In [75]:
sklearn_hou_pca.get_covariance()


Out[75]:
array([[ 1.00455373,  0.96063112,  0.95844352,  0.95600854,  0.94955811,
         0.95597756,  0.95986476,  0.96100498],
       [ 0.96063112,  1.00587974,  0.95910524,  0.95666858,  0.95021369,
         0.95663758,  0.96052746,  0.96166846],
       [ 0.95844352,  0.95910524,  1.0015065 ,  0.95449   ,  0.94804982,
         0.95445907,  0.9583401 ,  0.9594785 ],
       [ 0.95600854,  0.95666858,  0.95449   ,  0.99665045,  0.94564124,
         0.95203422,  0.95590538,  0.95704089],
       [ 0.94955811,  0.95021369,  0.94804982,  0.94564124,  0.98384615,
         0.9456106 ,  0.94945564,  0.9505835 ],
       [ 0.95597756,  0.95663758,  0.95445907,  0.95203422,  0.9456106 ,
         0.99658876,  0.95587441,  0.95700988],
       [ 0.95986476,  0.96052746,  0.9583401 ,  0.95590538,  0.94945564,
         0.95587441,  1.00434657,  0.96090128],
       [ 0.96100498,  0.96166846,  0.9594785 ,  0.95704089,  0.9505835 ,
         0.95700988,  0.96090128,  1.00662811]])

In [76]:
sklearn_hou_pca.explained_variance_


Out[76]:
array([ 7.6879023])

In [77]:
sklearn_hou_pca.explained_variance_ratio_


Out[77]:
array([ 0.96098779])

In [78]:
expl_hou_val = sklearn_hou_pca.explained_variance_ratio_

In [79]:
ev = expl_hou_val.tolist()
ls =  ['PC%s' %i for i in range(0,len(ev))]
df_hou_explainedValue = pd.DataFrame(ev,columns=['Value'],index=ls)

In [80]:
df_hou_explainedValue.plot(kind='bar')


Out[80]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fb448058c50>

In [81]:
sklearn_hou_pca.components_


Out[81]:
array([[-0.35439495, -0.35463963, -0.35383202, -0.35293309, -0.35055176,
        -0.35292166, -0.35435671, -0.35477764]])

In [82]:
df_hou = pd.DataFrame({'zip5':zip5_hou,'housing_index':Y_hou_sklearn[:,0]})

In [83]:
df_hou['housing_index'] = df_hou.apply(lambda x: x['housing_index']*-1,axis=1)

In [84]:
df_hou[df_hou['zip5']==21222]


Out[84]:
housing_index zip5
275 -1.014197 21222

In [85]:
#Beginning PCA Analysis (reference: http://sebastianraschka.com/Articles/2015_pca_in_3_steps.html)

X_inc = income_index.ix[:,1:5].values
zip5_inc = income_index.ix[:,0].values

In [86]:
#Standardization
from sklearn.preprocessing import StandardScaler
X_inc_std = StandardScaler().fit_transform(X_inc)

In [87]:
#Same PCA analysis using scikit-learn

from sklearn.decomposition import PCA as sklearnPCA
#sklearn_inc_pca = sklearnPCA(n_components=4)
sklearn_inc_pca = sklearnPCA(n_components=1)
Y_inc_sklearn = sklearn_inc_pca.fit_transform(X_inc_std)

In [88]:
X_inc_std


Out[88]:
array([[-1.59359348,  4.4150441 ,  3.50806255,  1.95198116],
       [-1.44234668,  3.29998401,  2.80016619,  1.23438494],
       [-1.44977957,  3.13114555,  2.87822525,  1.60594258],
       ..., 
       [-0.00633725, -0.14431335,  2.2477343 ,  0.12175354],
       [-1.30300086, -0.86957667,  0.99043425,  0.58926288],
       [-0.1901969 , -0.30098952,  0.45777452,  0.24730736]])

In [89]:
sklearn_inc_pca.explained_variance_ratio_


Out[89]:
array([ 0.46377581])

In [90]:
sklearn_inc_pca.explained_variance_


Out[90]:
array([ 1.85510322])

In [91]:
expl_inc_var = sklearn_inc_pca.explained_variance_ratio_
ev = expl_inc_var.tolist()
ls =  ['PC%s' %i for i in range(0,len(ev))]
df_inc_explainedValue = pd.DataFrame(ev,columns=['Value'],index=ls)

In [92]:
df_inc_explainedValue.plot(kind='bar')


Out[92]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fb448058b10>

In [93]:
df_inc = pd.DataFrame({'zip5':zip5_inc,'income_index':Y_inc_sklearn[:,0]})

In [94]:
df_inc[df_inc['zip5']==21223]


Out[94]:
income_index zip5
6570 -3.13511 21223

In [95]:
df_final = pd.merge (df_inc,df_hou,on='zip5')

In [96]:
df_final= pd.merge (df_final,df_urb,on='zip5')

In [97]:
df_final= pd.merge (df_final,df_div,on='zip5')

In [98]:
df_final.head(1)


Out[98]:
income_index zip5 housing_index urban_index diversity_index
0 0.60221 1001 -0.570902 0.233848 0.233848

In [99]:
df_final[df_final['zip5']==90210]


Out[99]:
income_index zip5 housing_index urban_index diversity_index
10505 1.942892 90210 49.762782 0.319288 0.319288

In [100]:
#Rescale indexes
from sklearn import preprocessing

zip5_final = df_final['zip5'].values
std_scale = preprocessing.StandardScaler().fit(df_final[['income_index', 'housing_index','urban_index','diversity_index']])
df_std = std_scale.transform(df_final[['income_index', 'housing_index','urban_index','diversity_index']])

minmax_scale = preprocessing.MinMaxScaler().fit(df_final[['income_index', 'housing_index','urban_index','diversity_index']])
df_minmax = minmax_scale.transform(df_final[['income_index', 'housing_index','urban_index','diversity_index']])

In [101]:
df_minmax_final = pd.DataFrame({'zip5':zip5_final,'income_index':df_minmax[:,0],'housing_index':df_minmax[:,1],'urban_index':df_minmax[:,2],'diversity_index':df_minmax[:,3]})

In [102]:
df_minmax_final[df_minmax_final['zip5']==21230]


Out[102]:
diversity_index housing_index income_index urban_index zip5
3141 0.542558 0.064618 0.575462 0.542558 21230

In [103]:
ZCTA = pd.read_csv(ZCTA)

In [104]:
ZCTA.head(1)


Out[104]:
zcta5 ZIPName State FIPS state County County2 PlaceFP PlaceFP2 cbsa MetDiv csa CBSAType CBSAName MetDivName CSAName NectaName UAName Puma12Name
0 35004 Moody, AL AL 1 St. Clair AL NaN 51096 46696 13820 NaN 142 Metro Birmingham-Hoover, AL NaN Birmingham-Hoover-Talladega, AL NaN Birmingham, AL Urbanized Area St. Clair & Blount Counties

In [105]:
df_all_final = pd.merge (df_minmax_final,ZCTA[['zcta5','ZIPName','State']],left_on='zip5',right_on='zcta5',copy=False)

In [106]:
del df_all_final['zcta5']

In [107]:
df_all_final = pd.merge(df_all_final,urban[['zip5','ZCTA5']],copy=False)

In [108]:
df_all_final.to_csv('/home/russ/Documents/DDL/Projects/03-censusables/source/Data/final_files/Final.csv')

In [110]:
fit = [.5,.5,.5,.5]
#fit = [0,0,1,0]
df_all_final['fit'] = df_all_final.apply(lambda x: abs(fit[0]-x['diversity_index'])+abs(fit[1]-x['housing_index'])+\
                                        abs(fit[2]-x['income_index'])+abs(fit[3]-x['urban_index']),axis=1)

In [111]:
state = 'MD'
if state:
    df_display = df_all_final[df_all_final['State']==state].sort(['fit']).head(10)
else:
    df_display = df_all_final.sort(['fit']).head(10)
df_display


Out[111]:
diversity_index housing_index income_index urban_index zip5 ZIPName State ZCTA5 fit
2972 0.499313 0.061582 0.540120 0.499313 20710 Bladensburg, MD MD 20710 0.479912
3258 0.469100 0.040419 0.498020 0.469100 21875 Delmar, MD MD 21875 0.523362
3117 0.491175 0.044540 0.552835 0.491175 21206 Baltimore, MD MD 21206 0.525945
3182 0.514728 0.040767 0.537463 0.514728 21643 Hurlock, MD MD 21643 0.526152
3237 0.516602 0.038326 0.547926 0.516602 21804 Salisbury, MD MD 21804 0.542804
3132 0.513988 0.054983 0.571036 0.513988 21221 Essex, MD MD 21221 0.544029
3243 0.513848 0.034980 0.562139 0.513848 21826 Fruitland, MD MD 21826 0.554855
2960 0.505270 0.081946 0.633642 0.505270 20678 Prince Frederick, MD MD 20678 0.562236
2973 0.461209 0.105898 0.596931 0.461209 20711 Lothian, MD MD 20711 0.568616
3191 0.522953 0.127711 0.657794 0.522953 21662 Royal Oak, MD MD 21662 0.575989

In [112]:
df_all_final.sort(['fit']).head(10)


Out[112]:
diversity_index housing_index income_index urban_index zip5 ZIPName State ZCTA5 fit
849 0.510518 0.402507 0.707100 0.510518 6830 Greenwich, CT CT 06830 0.325628
10487 0.482534 0.263484 0.593316 0.482534 90046 Los Angeles, CA CA 90046 0.364764
10506 0.450808 0.376718 0.645668 0.450808 90211 Beverly Hills, CA CA 90211 0.367333
11171 0.503706 0.482497 0.844349 0.503706 94024 Los Altos, CA CA 94024 0.369264
10507 0.426886 0.449827 0.675198 0.426886 90212 Beverly Hills, CA CA 90212 0.371599
10500 0.482723 0.278766 0.631285 0.482723 90068 Los Angeles, CA CA 90068 0.387072
1356 0.470625 0.322967 0.653689 0.470625 10003 New York, NY NY 10003 0.389471
11948 0.511741 0.157938 0.466941 0.511741 98105 Seattle, WA WA 98105 0.398604
11197 0.563598 0.236204 0.516794 0.563598 94108 San Francisco, CA CA 94108 0.407786
11183 0.484693 0.371334 0.755528 0.484693 94062 Redwood City, CA CA 94062 0.414807

In [113]:
fit = []
fit.append(float(50)/100)
fit.append(float(50)/100)
fit.append(float(50)/100)
fit.append(float(50)/100)        
df_all_final['fit'] = df_all_final.apply(lambda x: abs(fit[0]-x['diversity_index'])+abs(fit[1]-x['housing_index'])+\
                                        abs(fit[2]-x['income_index'])+abs(fit[3]-x['urban_index']),axis=1)

In [537]:
import vincent
from vincent import AxisProperties, PropertySet, ValueRef
vincent.core.initialize_notebook()



In [593]:
zip_topo = r'zips_us_topo.json'
state_topo = r'us_states.topo.json'

geo_data = [{'name': 'states',
             'url': state_topo,
             'feature': 'us_states.geo'},
            {'name': 'zip_codes',
             'url': zip_topo,
             'feature': 'zip_codes_for_the_usa'}]

vis = vincent.Map(data=df_all_final, geo_data=geo_data, scale=800, projection='albersUsa',
          data_bind='fit', data_key='zip5',brew='YlOrRd',
          map_key={'zip_codes': 'properties.zip'})
del vis.marks[0].properties.update
#del vis.marks[1].properties.update
#vis.marks[1].properties.update.fill.value = '#FFFFFF'
#vis.marks[1].properties.enter.stroke.value = '#CCCCFF'
vis.marks[1].properties.enter.stroke_opacity = ValueRef(value=0.05)
vis.marks[0].properties.enter.stroke.value = '#C0C0C0'
#vis.marks[1].properties.hover.fill.value = 'red'
vis.legend(title='Preferred ZipCode')
vis.display()
vis.to_json("USA_Preferred.json")



In [541]:
zip_topo = r'zips_us_topo.json'
state_topo = r'us_states.topo.json'

geo_data2 = [{'name': 'zip_codes',
             'url': zip_topo,
             'feature': 'zip_codes_for_the_usa'},
            {'name': 'states',
             'url': state_topo,
             'feature': 'us_states.geo'}]

geo_data = [{'name': 'states',
             'url': state_topo,
             'feature': 'us_states.geo'},
            {'name': 'zip_codes',
             'url': zip_topo,
             'feature': 'zip_codes_for_the_usa'}]

vis = vincent.Map(data=urban[urban['zip']== '83211'], geo_data=geo_data, scale=1100, projection='albersUsa',
          data_bind='POPULATION', data_key='zip',brew='PuRd',
          map_key={'zip_codes': 'properties.zip'})
del vis.marks[0].properties.update
#del vis.marks[1].properties.update
vis.marks[1].properties.update.fill.value = '#C390D4'
#vis.marks[1].properties.enter.stroke.value = '#CCCCFF'
#vis.marks[1].properties.enter.stroke_opacity = ValueRef(value=0.1)
vis.marks[0].properties.enter.stroke.value = '#FF0000'
vis.legend(title='POPULATION')
vis.display()
vis.to_json("USA.json")


---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-541-13c50b6d908d> in <module>()
     16              'feature': 'zip_codes_for_the_usa'}]
     17 
---> 18 vis = vincent.Map(data=urban[urban['zip']== '83211'], geo_data=geo_data, scale=1100, projection='albersUsa',
     19           data_bind='POPULATION', data_key='zip',brew='PuRd',
     20           map_key={'zip_codes': 'properties.zip'})

/home/russ/anaconda/lib/python2.7/site-packages/pandas/core/frame.pyc in __getitem__(self, key)
   1795             return self._getitem_multilevel(key)
   1796         else:
-> 1797             return self._getitem_column(key)
   1798 
   1799     def _getitem_column(self, key):

/home/russ/anaconda/lib/python2.7/site-packages/pandas/core/frame.pyc in _getitem_column(self, key)
   1802         # get column
   1803         if self.columns.is_unique:
-> 1804             return self._get_item_cache(key)
   1805 
   1806         # duplicate columns & possible reduce dimensionaility

/home/russ/anaconda/lib/python2.7/site-packages/pandas/core/generic.pyc in _get_item_cache(self, item)
   1082         res = cache.get(item)
   1083         if res is None:
-> 1084             values = self._data.get(item)
   1085             res = self._box_item_values(item, values)
   1086             cache[item] = res

/home/russ/anaconda/lib/python2.7/site-packages/pandas/core/internals.pyc in get(self, item, fastpath)
   2849 
   2850             if not isnull(item):
-> 2851                 loc = self.items.get_loc(item)
   2852             else:
   2853                 indexer = np.arange(len(self.items))[isnull(self.items)]

/home/russ/anaconda/lib/python2.7/site-packages/pandas/core/index.pyc in get_loc(self, key, method)
   1570         """
   1571         if method is None:
-> 1572             return self._engine.get_loc(_values_from_object(key))
   1573 
   1574         indexer = self.get_indexer([key], method=method)

pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:3824)()

pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:3704)()

pandas/hashtable.pyx in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:12280)()

pandas/hashtable.pyx in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:12231)()

KeyError: 'zip'

In [4]:
import csv
with open("/home/russ/Documents/DDL/Projects/03-censusables/source/Data/raw_files/state_landarea_rank.csv") as f:
    f.readline() # ignore first line (header)
    land_area = dict(csv.reader(f, delimiter=','))
print land_area


{'Mississippi': '31', 'Oklahoma': '19', 'Wyoming': '9', 'Minnesota': '14', 'Illinois': '24', 'Arkansas': '27', 'Ohio': '35', 'Indiana': '38', 'Maryland': '42', 'Louisiana': '33', 'New_Hampshire': '44', 'Idaho': '11', 'New_York': '30', 'Arizona': '6', 'Iowa': '23', 'South_Carolina': '40', 'Michigan': '22', 'Kansas': '13', 'Utah': '12', 'Virginia': '37', 'Oregon': '10', 'District_of_Columbia': '51', 'Connecticut': '48', 'Montana': '4', 'California': '3', 'Texas': '2', 'New_Mexico': '5', 'South_Dakota': '16', 'Massachusetts': '45', 'Vermont': '43', 'Georgia': '21', 'Pennsylvania': '32', 'Florida': '26', 'Alaska': '1', 'North_Dakota': '17', 'Hawaii': '47', 'Nebraska': '15', 'Kentucky': '36', 'Missouri': '18', 'Wisconsin': '25', 'Alabama': '28', 'New_Jersey': '46', 'Colorado': '8', 'Washington': '20', 'West_Virginia': '41', 'Tennessee': '34', 'Rhode_Island': '50', 'North_Carolina': '29', 'Nevada': '7', 'Delaware': '49', 'Maine': '39'}

In [146]:
ziplist = json.loads(df_all_final[['ZCTA5','ZIPName','fit']].head(5).to_json())
ziplist


Out[146]:
{u'ZCTA5': {u'0': u'01001',
  u'1': u'01002',
  u'2': u'01005',
  u'3': u'01007',
  u'4': u'01008'},
 u'ZIPName': {u'0': u'Agawam Town, MA',
  u'1': u'Amherst Center, MA',
  u'2': u'Barre, MA',
  u'3': u'Belchertown, MA',
  u'4': u'Blandford, MA'},
 u'fit': {u'0': 1.1006146364,
  u'1': 0.5398473703,
  u'2': 1.4791458348,
  u'3': 1.2032138701,
  u'4': 1.5669620785}}

In [140]:
ziplist['ZCTA5']['0']


Out[140]:
u'01001'

In [150]:
ziplist = json.loads(df_all_final[['ZCTA5','ZIPName','fit']].sort(['fit']).reset_index().head(5).to_json())

In [151]:
ziplist


Out[151]:
{u'ZCTA5': {u'0': u'06830',
  u'1': u'90046',
  u'2': u'90211',
  u'3': u'94024',
  u'4': u'90212'},
 u'ZIPName': {u'0': u'Greenwich, CT',
  u'1': u'Los Angeles, CA',
  u'2': u'Beverly Hills, CA',
  u'3': u'Los Altos, CA',
  u'4': u'Beverly Hills, CA'},
 u'fit': {u'0': 0.3256283618,
  u'1': 0.3647637893,
  u'2': 0.3673332581,
  u'3': 0.3692640451,
  u'4': 0.3715989326},
 u'index': {u'0': 849, u'1': 10487, u'2': 10506, u'3': 11171, u'4': 10507}}

In [153]:
table_data = []

for i in range (5):
    dict_row = {}
    dict_row['index'] = i
    dict_row['ZCTA5'] = ziplist['ZCTA5'][str(i)]
    dict_row['ZIPName'] = ziplist['ZIPName'][str(i)]
    table_data.append(dict_row)
print table_data


[{'ZCTA5': u'06830', 'index': 0, 'ZIPName': u'Greenwich, CT'}, {'ZCTA5': u'90046', 'index': 1, 'ZIPName': u'Los Angeles, CA'}, {'ZCTA5': u'90211', 'index': 2, 'ZIPName': u'Beverly Hills, CA'}, {'ZCTA5': u'94024', 'index': 3, 'ZIPName': u'Los Altos, CA'}, {'ZCTA5': u'90212', 'index': 4, 'ZIPName': u'Beverly Hills, CA'}]

In [ ]: