In [260]:
import os
import geopandas as gpd
import pandas as pd
import pickle
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import json
from sklearn import cluster, datasets
from sklearn.neighbors import kneighbors_graph
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import scale
from sklearn.preprocessing import normalize
from itertools import islice
from sklearn.cluster import SpectralClustering
from sklearn import preprocessing
import time
In [261]:
directory = 'analysis/histograms'
if not os.path.exists(directory):
    os.makedirs(directory)
In [262]:
# import merged puma pums data
df_pp = pd.read_pickle('data/puma_pums/merged2.pickle')
gdf_pp = gpd.GeoDataFrame(df_pp)
gdf_pp.head(3)
Out[262]:
In [263]:
gdf_pp['geometry'] = gdf_pp['geometry'].centroid
gdf_pp.head(3)
Out[263]:
In [264]:
def getXY(pt):
    return (pt.x, pt.y)

gdf_pp['troid'] = gdf_pp['geometry'].centroid
gdf_pp['x'], gdf_pp['y'] = [list(t) for t in zip(*map(getXY, gdf_pp['geometry']))]
gdf_pp.head()
Out[264]:
In [265]:
# WORKFLOW
# select features
# convert to numpy array
# weight the numpy array
# remove the weight column
# normalize array
# run sklearn spectral clustering
In [266]:
# select features
gdf_pp_trunc = gdf_pp[['WGTP', 'total', 'child', 'adult', 'cit', 'employ', 'hi', 'hs', 'l_h', 'a', 'b', 'na','o', 'w', 'inc_percap', 'price', 'w_use_percap', 'x', 'y']]
gdf_pp_trunc.head(3)
Out[266]:
In [267]:
# CORRELATION TABLES
samples = 100000
gdf_pp_trunc_samp = gdf_pp_trunc.sample(n = samples, replace=True, weights=gdf_pp_trunc['WGTP'])
gdf_pp_trunc_samp.head(3)
Out[267]:
In [268]:
# generate correlation matrix and format
corr = gdf_pp_trunc_samp.drop('WGTP', axis=1)
corr = corr.corr(min_periods=2)
corr = corr.round(decimals=2)
# blank out the upper triangle (including the diagonal) with a sentinel value
data_2_idx = corr.index.tolist()
for row in range(corr.shape[0]):
    for col in range(corr.shape[1]):
        if row <= col:
            corr.iloc[row, col] = float('inf')
def bcolor_divergent(val):
    """
    Uses colorbrewer2's divergent color scheme for ten intervals
    """
    if val == float('inf'):
        bcolor = '#FFFFFF'
    elif val < -0.8:
        bcolor = '#8e0152'
    elif val < -0.6:
        bcolor = '#c51b7d'
    elif val < -0.4:
        bcolor = '#de77ae'
    elif val < -0.2:
        bcolor = '#f1b6da'
    elif val < 0.0:
        bcolor = '#fde0ef'
    elif val < 0.2:
        bcolor = '#e6f5d0'
    elif val < 0.4:
        bcolor = '#b8e186'
    elif val < 0.6:
        bcolor = '#7fbc41'
    elif val < 0.8:
        bcolor = '#4d9221'
    else:
        bcolor = '#276419'
    return 'background-color: %s' % bcolor
def font_white(val):
    """
    blank out cells with value of inf
    """
    color = ''
    if val == float('inf'):
        color = '#FFFFFF'
    return 'color: %s' % color
pd.set_option('display.width', 200)
pd.set_option('display.max_colwidth', 200)
styled = corr.style.applymap(bcolor_divergent)
styled2 = styled.applymap(font_white)
styled2.set_properties(**{'border-color':'white'})
#corr.applymap(remove_upper_right)
#new = corr.iloc[:3, 0:3].style.applymap(color_divergent)
#new = corr
#new.iloc[0][0].style.applymap(color_divergent)
Out[268]:
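The styled table above only lives in the notebook output; to keep it alongside the other presentation assets, one option is to write the rendered HTML to disk. A minimal sketch, assuming this pandas version's Styler.render() method and a hypothetical analysis/corr_table.html output path:
# hypothetical: save the styled correlation matrix as a standalone HTML fragment
with open('analysis/corr_table.html', 'w') as f:
    f.write(styled2.render())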
In [269]:
# HISTOGRAM OUTPUTS TO HTML / JS
# take samples of 1000
samples2 = 1000
gdf_pp_trunc_samp2 = gdf_pp_trunc.sample(n = samples2, replace=True, weights=gdf_pp_trunc['WGTP'])
gdf_pp_trunc_samp2.shape
gdf_hist = gdf_pp_trunc_samp2.copy()
gdf_hist.drop('WGTP', inplace=True, axis=1)
gdf_hist_list = gdf_hist.columns.values.tolist()
print len(gdf_hist_list)
print gdf_hist_list
In [270]:
np_hist = gdf_hist.as_matrix()
np_hist = (np_hist - np_hist.min(0)) / np_hist.ptp(0)
np_hist = np.round(np_hist, decimals=2)
np.set_printoptions(precision=2, suppress=True)
np_hist_format = np_hist.copy()
np_hist_format.shape
Out[270]:
In [271]:
# export to json
with open('presentation/js-d3_viz/hist.json', 'w') as outfile:
json.dump(np_hist_format.transpose().tolist(), outfile)
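A quick round-trip check (a sketch, not part of the original run) confirms the exported file has the features-by-samples orientation the d3 histograms expect:
# hypothetical sanity check: reload the JSON and inspect its shape
with open('presentation/js-d3_viz/hist.json') as infile:
    hist_back = json.load(infile)
print len(hist_back), len(hist_back[0])  # expect 18 features x 1000 samples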
In [272]:
# convert to numpy array
np_pp_trunc = gdf_pp_trunc.as_matrix()
np_pp_trunc[:3]
Out[272]:
In [273]:
# weight the numpy array by expanding
weights = np_pp_trunc[:,0].astype('int64')
np_pp_expand = np.repeat(np_pp_trunc, weights, axis=0)
np_pp_wt_col_dropped = np_pp_expand[:,1:]
print np_pp_expand.shape
print np_pp_wt_col_dropped.shape
In [274]:
# randomly choose 500k samples
sample_size = 500000
random_selection = np.random.randint(0, high=np_pp_wt_col_dropped.shape[0], size=sample_size)
rand_sample = np_pp_wt_col_dropped[random_selection,:]
rand_sample
Out[274]:
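Expanding by WGTP and then drawing uniformly is equivalent to sampling the original rows with probability proportional to their weights; a more memory-friendly sketch of the same draw (assuming the np_pp_trunc array, weights, and sample_size defined above) would be:
# hypothetical alternative: weighted draw without materializing the expanded array
probs = weights.astype('float64') / weights.sum()
weighted_idx = np.random.choice(np_pp_trunc.shape[0], size=sample_size, p=probs)
rand_sample_alt = np_pp_trunc[weighted_idx, 1:]  # drop the WGTP column, as above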
In [275]:
# normalize the array so that each col's values range from 0 to 1
normed = (rand_sample - rand_sample.min(0)) / rand_sample.ptp(0)
normed
Out[275]:
In [276]:
#trial_0 definition: all 18 features
trial_0 = np.copy(normed)
print trial_0.shape
trial_0
Out[276]:
In [277]:
# trial_1 definition
# total[0], child[1], employ[4], hs[6], l_h[7], inc_percap[13], price[14], w_use_percap[15], y[17]
trial_1 = np.copy(normed)
trial_1 = trial_1[:,[0, 1, 4, 6, 7, 13, 14, 15, 17]]
print trial_1.shape
trial_1
Out[277]:
In [278]:
# trial_2 definition
# total[0], child[1], employ[4], inc_percap[13], price[14], w_use_percap[15]
trial_2 = np.copy(normed)
trial_2 = trial_2[:,[0, 1, 4, 13, 14, 15]]
print trial_2.shape
trial_2
Out[278]:
In [313]:
# trial_3 definition
# total[0], child[1], employ[4], price[14], w_use_percap[15]
trial_3 = np.copy(normed)
trial_3 = trial_3[:,[0, 1, 4, 14, 15]]
print trial_3.shape
trial_3
Out[313]:
In [279]:
# trial_0 modeling
inertia_list_0 = []
maxk = 15
for k in range(1,maxk):
    km_0 = cluster.KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=1)
    km_0.fit(scale(trial_0, with_mean=False))
    inertia_list_0.append(km_0.inertia_)
plt.plot(range(1,maxk), inertia_list_0)
Out[279]:
In [280]:
# trial_1 modeling
# total[0], child[1], employ[4], hs[6], l_h[7], inc_percap[13], price[14], w_use_percap[15], y[17]
inertia_list_1 = []
maxk = 15
for k in range(1,maxk):
    km_1 = cluster.KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=1)
    km_1.fit(scale(trial_1, with_mean=False))
    inertia_list_1.append(km_1.inertia_)
plt.plot(range(1,maxk), inertia_list_1)
Out[280]:
In [281]:
# trial_2 modeling
# total[0], child[1], employ[4], inc_percap[13], price[14], w_use_percap[15]
inertia_list_2 = []
maxk = 15
for k in range(1,maxk):
    km_2 = cluster.KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=1)
    km_2.fit(scale(trial_2, with_mean=False))
    inertia_list_2.append(km_2.inertia_)
plt.plot(range(1,maxk), inertia_list_2)
Out[281]:
In [314]:
# trial_3 modeling
# total[0], child[1], employ[4], price[14], w_use_percap[15]
inertia_list_3 = []
maxk = 15
for k in range(1,maxk):
    km_3 = cluster.KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=1)
    km_3.fit(scale(trial_3, with_mean=False))
    inertia_list_3.append(km_3.inertia_)
plt.plot(range(1,maxk), inertia_list_3)
Out[314]:
In [282]:
# using 3 clusters for 6 features (selected trial_2 model)
nk = 3
km = cluster.KMeans(n_clusters=nk, init='k-means++', max_iter=100, n_init=10)
clusters = km.fit_predict(scale(trial_2, with_mean=False))
# total[0], child[1], employ[4], inc_percap[13], price[14], w_use_percap[15]
clusters_dict = {}
for i in range(nk):
    clusters_dict[i] = []
for idx, a_cluster in enumerate(clusters):
    clusters_dict[a_cluster].append(trial_2[idx])
In [283]:
# convert clusters np array to dataframe
df_clusters = pd.DataFrame(clusters, columns=['cluster'])
# format as string, create new column, drop old column
df_clusters['Cluster'] = ('Cluster ' + df_clusters['cluster'].astype('str'))
df_clusters = df_clusters.drop('cluster', axis=1)
df_clusters.head(3)
Out[283]:
In [284]:
# recover rand_sample, which aligns with trial_2, and convert to dataframe
df_rand_sample = pd.DataFrame(rand_sample, columns=gdf_hist_list)
df_rand_sample.head(3)
Out[284]:
In [304]:
# select subset of columns based on results of trial_2 kmeans clustering
features = ['total', 'child', 'employ', 'inc_percap', 'price', 'w_use_percap']
df_features = df_rand_sample[features].copy()  # copy so the formatted columns below don't trigger chained-assignment warnings
# format columns
df_features['Water Usage'] = (df_features['w_use_percap']/1000).astype('int')
df_features['Income'] = (df_features['inc_percap']/1000).astype('int')
df_features['Price'] = (df_features['price']*100).round(2)
df_features['Household Size'] = df_features['total'].astype('int')
df_features['Children'] = df_features['child'].round(1)
df_features['Employ'] = df_features['employ'].round(1)
# drop columns
df_features = df_features.drop(['w_use_percap', 'inc_percap', 'price', 'total', 'child', 'employ'], axis=1)
df_features.head(3)
Out[304]:
In [309]:
# merge features dataframe to clusters dataframe
df_final = pd.merge(df_features, df_clusters, left_index=True, right_index=True)
df_final.head(3)
Out[309]:
In [310]:
# export to csv for incorporation into d3 scatterplot matrix visualization
df_final = df_final.sample(150)
df_final.shape
Out[310]:
In [311]:
df_final.to_csv('presentation/scatter/scatter.csv', columns=['Water Usage', 'Income', 'Price', 'Household Size', 'Cluster'], index=False)
In [ ]: