In [1]:
import os
import pandas as pd
import csv
import json
import numpy as np
import re
import bisect
In [2]:
directory = 'analysis'
if not os.path.exists(directory):
    os.makedirs(directory)
In [3]:
# import pums-lite
df = pd.read_pickle('data/puma_pums/merged.pickle')
df.head(3)
Out[3]:
In [4]:
def get_usage(dataframe, col_wt, col_val):
    # expand each household record by its PUMS weight (col_wt),
    # repeating the per-capita value (col_val) once per weighted household
    usage = []
    df_subset = dataframe[[col_wt, col_val]]
    tuples = df_subset.values.tolist()
    for a_tuple in tuples:
        for weight in range(int(a_tuple[0])):
            usage.append(int(a_tuple[1]))
    return usage

water_usage = get_usage(df, 'WGTP', 'w_use_percap')
type(water_usage)
Out[4]:
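Aside: the row-expansion loop above can also be written with np.repeat, which is usually much faster on a large PUMS extract. A minimal sketch, assuming the same WGTP and w_use_percap columns and integer-like dtypes:

# vectorized equivalent of get_usage (sketch): repeat each per-capita value by its household weight
weights = df['WGTP'].astype(int).values
values = df['w_use_percap'].astype(int).values
water_usage_alt = np.repeat(values, weights).tolist()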
In [5]:
water_usage_slice = np.random.choice(water_usage, 100000)
water_usage_slice = water_usage_slice.tolist()
type(water_usage_slice)
Out[5]:
In [6]:
with open('analysis/js/nums.json', 'w') as outfile:
    json.dump({'numbers': water_usage_slice}, outfile, indent=0)
In [7]:
# jenks breaks were computed in javascript
# read the outputs back in
with open('analysis/js/breaks', 'r') as f:
    breaks = f.readlines()
breaks[0]
Out[7]:
In [8]:
def clean_and_split(string_list):
    # strip newlines, replace non-word characters with spaces,
    # collapse repeated spaces, then parse each token as an int
    num_list = []
    for string in string_list:
        string = string.replace('\n', '')
        string = re.sub(r'[^\w]', ' ', string)
        strings = re.sub(' +', ' ', string).strip().split()
        nums = map(int, strings)
        num_list.append(nums)
    return num_list

lower_bounds = clean_and_split(breaks)
len(lower_bounds)
Out[8]:
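For reference, a hypothetical example of what clean_and_split does to one line of the JavaScript output (the exact format of the breaks file is not shown here; this assumes one bracketed array per line):

# hypothetical input line -> cleaned list of ints
clean_and_split(['[0, 12, 45, 80]\n'])
# returns [[0, 12, 45, 80]]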
In [9]:
# sort sample list to use bisection algorithms
# see http://www.ehdp.com/vitalnet/breaks-1-prn.htm
sample = sorted(np.random.choice(water_usage, 10000).tolist())
sample[:5]
Out[9]:
In [10]:
# define bounding functions
def find_lb_idx(a, x):
    'Find index corresponding to leftmost value greater than or equal to x'
    i = bisect.bisect_left(a, x)
    if i != len(a):
        return i
    else:
        return 0

def find_ub_idx(a, x):
    'Find index corresponding to rightmost value less than or equal to x'
    i = bisect.bisect_right(a, x)
    if i:
        return i-1
    else:
        return 0

# test bounding functions
lb_in = 0
ub_in = 5
a_list = [0, 1, 2, 3, 4.1, 4.2, 4.3, 5, 5.7, 6, 7, 8, 9, 10]
lb_idx = find_lb_idx(a_list, lb_in)  # index of lower bound
ub_idx = find_ub_idx(a_list, ub_in)  # index of upper bound
lb = a_list[lb_idx]
ub = a_list[ub_idx]
a_slice = a_list[lb_idx:ub_idx]

# print bounding outputs
print 'input x ranges from %s to %s' % (lb_in, ub_in)
print 'indices range from %s to %s' % (lb_idx, ub_idx)
print 'tested x ranges from %s to %s' % (lb, ub)
print 'new sliced list is %s' % a_slice
In [11]:
# function to build nice list of upper bounds
def get_upper_bounds(lower_bounds):
    # each class's upper bound is the next class's lower bound;
    # the last class is unbounded above
    upper_bounds = []
    for row in lower_bounds:
        upper_bounds_row = []
        for idx, val in enumerate(row):
            if idx < (len(row)-1):
                upper_bounds_row.append(row[idx+1])
            else:
                upper_bounds_row.append(float('inf'))
        upper_bounds.append(upper_bounds_row)
    return upper_bounds

upper_bounds = get_upper_bounds(lower_bounds)
zipped_bounds = zip(lower_bounds, upper_bounds)
bounds = []
for row in zipped_bounds:
    bounds.append(zip(row[0], row[1]))
print 'zipped lower & upper bounds %s' % bounds[:2]
In [12]:
# build function to get slices of data list
def get_indices(data, bounds):
    # for each class in each break set, find the slice indices of the
    # sorted data that fall between its lower and upper bounds
    indices = []
    for li in bounds:
        temp = []
        for tu in li:
            index_tuple = (find_lb_idx(data, tu[0]), find_ub_idx(data, tu[1]))
            temp.append(index_tuple)
        indices.append(temp)
    return indices

indices = get_indices(sample, bounds)
indices[:2]
Out[12]:
In [13]:
# build function to get data binned into lists
def bin_data(data, indices):
    # slice the sorted data into one list per class using the index tuples
    sample_list = []
    for li in indices:
        temp_list = []
        for tu in li:
            temp = data[tu[0]:tu[1]]
            temp_list.append(temp)
        sample_list.append(temp_list)
    return sample_list

sample_list = bin_data(sample, indices)
sample_list[0][0][:10]
Out[13]:
In [14]:
# compute global stats
samp_mean = np.mean(sample)
# SDAM: sum of squared deviations from the array (global) mean
sdam = np.sum((np.array(sample) - samp_mean)**2)
In [15]:
np.sum(np.subtract([1, 2, 3], np.mean([1, 2, 3]))**2)  # quick check of the formula: should equal 2.0
Out[15]:
In [16]:
# compute class stats
# SCDM: sum of squared class deviations from the class means, per break set
def sse(data):
    # sum of squared deviations from the mean of data
    return np.sum((np.subtract(data, np.mean(data))**2))

def calc_scdm(sample_list):
    scdm_list = []
    for li in sample_list:
        cluster_sses = []
        for cluster in li:
            cluster_sses.append(sse(cluster))
        scdm_list.append(np.sum(cluster_sses))
    return scdm_list

def calc_gvf(sdam, scdm):
    # goodness of variance fit: (SDAM - SCDM) / SDAM
    return (sdam - scdm) / sdam

scdm = calc_scdm(sample_list)
gvf = calc_gvf(sdam, np.array(scdm))
gvf
Out[16]:
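As a quick sanity check on the GVF calculation, a toy example: classing [1, 1, 10, 10] perfectly into two clusters leaves no within-class deviation, so GVF should come out to 1.0.

# toy GVF check: perfect two-class split of [1, 1, 10, 10]
toy = [1, 1, 10, 10]
toy_sdam = np.sum((np.array(toy) - np.mean(toy))**2)   # 81.0
toy_scdm = sse([1, 1]) + sse([10, 10])                 # 0.0
calc_gvf(toy_sdam, toy_scdm)                           # 1.0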
In [17]:
# select the five-class jenks breaks (bounds[2]), which give gvf ~0.9
jenks5 = bounds[2]
jenks5
Out[17]:
In [18]:
# calculate quintiles
quintile_lb = np.percentile(sample, [0, 20, 40, 60, 80])
quintile_ub = np.percentile(sample, [20, 40, 60, 80])
quintile_ub = np.append(quintile_ub, float('inf'))
quintile = zip(quintile_lb, quintile_ub)
quintile
Out[18]:
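If one wants to compare classing schemes on the same footing, the same helpers can be reused on the quintile bounds. A sketch, wrapping quintile in a list because get_indices expects a list of break sets:

# GVF of the quintile scheme, computed with the same helpers (sketch)
quintile_indices = get_indices(sample, [quintile])
quintile_bins = bin_data(sample, quintile_indices)
quintile_scdm = calc_scdm(quintile_bins)
calc_gvf(sdam, quintile_scdm[0])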
In [19]:
# calculate equal interval
minimum = min(sample)
maximum = max(sample)
difference = maximum - minimum
step = difference / 5.0  # float division so np.arange yields exactly five class widths
lb = np.arange(minimum, maximum, step)
ub = np.append(lb[1:], float('inf'))
equal_interval = zip(lb, ub)
equal_interval
Out[19]: