In [1]:
import os
import pandas as pd
import csv
import json
import numpy as np
import re
import bisect

In [2]:
directory = 'analysis'
if not os.path.exists(directory):
    os.makedirs(directory)

In [3]:
# load the merged PUMA/PUMS data (pums-lite)
df = pd.read_pickle('data/puma_pums/merged.pickle')
df.head(3)


Out[3]:
state_puma inc_percap WGTP geometry price w_use_percap
0 1-1302 8850.000000 121 POLYGON ((-86.85779499999998 33.53612699999999... 0.007806 3843.115486
1 1-1302 9100.000000 62 POLYGON ((-86.85779499999998 33.53612699999999... 0.007806 89672.694664
2 1-1302 31933.333333 59 POLYGON ((-86.85779499999998 33.53612699999999... 0.007806 38431.154856

In [4]:
def get_usage(dataframe, col_wt, col_val):
    'Expand each record by its weight: one value per weighted person'
    usage = []
    df_subset = dataframe[[col_wt, col_val]]
    tuples = df_subset.values.tolist()
    for a_tuple in tuples:
        for weight in range(int(a_tuple[0])):
            usage.append(int(a_tuple[1]))
    return usage
water_usage = get_usage(df, 'WGTP', 'w_use_percap')
type(water_usage)


Out[4]:
list
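
The double loop above appends each value once per unit of weight, which is slow for large weights. A minimal vectorized sketch of the same expansion via np.repeat (water_usage_vec is just an illustrative name):

weights = df['WGTP'].astype(int).values
values = df['w_use_percap'].astype(int).values
water_usage_vec = np.repeat(values, weights).tolist()  # one entry per weighted person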

In [5]:
# draw a 100,000-value subsample (np.random.choice samples with replacement by default)
water_usage_slice = np.random.choice(water_usage, 100000)
water_usage_slice = water_usage_slice.tolist()
type(water_usage_slice)


Out[5]:
list

In [6]:
with open('analysis/js/nums.json', 'w') as outfile:
    json.dump({'numbers':water_usage_slice}, outfile, indent=0)
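
A quick round-trip check (a sketch against the same path) confirms the dumped integers parse back unchanged:

with open('analysis/js/nums.json', 'r') as infile:
    assert json.load(infile)['numbers'] == water_usage_slice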

In [7]:
# Jenks breaks were computed externally in JavaScript; each line of the
# output file holds the lower bounds for one classification
with open('analysis/js/breaks', 'r') as f:
    breaks = f.readlines()
breaks[0]


Out[7]:
'[18, 88349, 589738]\n'

In [8]:
def clean_and_split(string_list):
    'Parse each bracketed line of breaks into a list of ints'
    num_list = []
    for string in string_list:
        string = string.replace('\n', '')
        string = re.sub(r'[^\w]', ' ', string)  # replace brackets and commas with spaces
        strings = re.sub(' +', ' ', string).strip().split()
        nums = map(int, strings)
        num_list.append(nums)
    return num_list

lower_bounds = clean_and_split(breaks)
len(lower_bounds)


Out[8]:
18
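
Since each line of the breaks file is itself a valid JSON array (see Out[7]), json.loads offers a simpler parse, assuming every line is well formed:

lower_bounds_alt = [json.loads(line) for line in breaks]
assert lower_bounds_alt == lower_bounds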

In [9]:
# sort sample list to use bisection algorithms
# see http://www.ehdp.com/vitalnet/breaks-1-prn.htm
sample = sorted(np.random.choice(water_usage,10000).tolist())
sample[:5]


Out[9]:
[201, 245, 325, 327, 335]

In [10]:
# define bounding functions
def find_lb_idx(a, x):
    'Find index of the leftmost value greater than or equal to x'
    i = bisect.bisect_left(a, x)
    if i != len(a):
        return i
    else:
        return 0  # fall back to 0 if x exceeds every element

def find_ub_idx(a, x):
    'Find index of the rightmost value less than or equal to x'
    i = bisect.bisect_right(a, x)
    if i:
        return i-1
    else:
        return 0  # fall back to 0 if x is below every element

# test bounding functions
lb_in = 0
ub_in = 5
a_list = [0, 1, 2, 3, 4.1, 4.2, 4.3, 5, 5.7, 6, 7, 8, 9, 10]
lb_idx = find_lb_idx(a_list, lb_in) # index of lower bound
ub_idx = find_ub_idx(a_list, ub_in) # index of upper bound
lb = a_list[lb_idx]
ub = a_list[ub_idx]
a_slice = a_list[lb_idx:ub_idx]  # note: the slice excludes the element at ub_idx itself

# print bounding outputs
print 'input x ranges from %s to %s' % (lb_in, ub_in)
print 'indices range from %s to %s' % (lb_idx, ub_idx)
print 'tested x ranges from %s to %s' % (lb, ub)
print 'new sliced list is %s'         % a_slice


input x ranges from 0 to 5
indices range from 0 to 7
tested x ranges from 0 to 5
new sliced list is [0, 1, 2, 3, 4.1, 4.2, 4.3]
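
The two helpers lean on bisect's asymmetry around equal elements: bisect_left returns the insertion point before any run of values equal to x, bisect_right the point after it. For example:

bisect.bisect_left([10, 20, 30], 20)   # 1: index of the leftmost value >= 20
bisect.bisect_right([10, 20, 30], 20)  # 2: one past the rightmost value <= 20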

In [11]:
# build the matching list of upper bounds (inf closes the final class of each row)
def get_upper_bounds(lower_bounds):
    upper_bounds = []
    for row in lower_bounds:
        upper_bounds_row = []
        for idx, val in enumerate(row):
            if idx < (len(row)-1):
                upper_bounds_row.append(row[idx+1])
            else:
                upper_bounds_row.append(float('inf'))
        upper_bounds.append(upper_bounds_row)
    return upper_bounds

upper_bounds = get_upper_bounds(lower_bounds)
zipped_bounds = zip(lower_bounds, upper_bounds)
bounds = []
for row in zipped_bounds:
    bounds.append(zip(row[0], row[1]))

print 'zipped lower & upper bounds %s' % bounds[:2]


zipped lower & upper bounds [[(18, 88349), (88349, 589738), (589738, inf)], [(18, 42459), (42459, 130605), (130605, 589738), (589738, inf)]]
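
Each row of bounds is one candidate classification: bounds[0] has three classes, bounds[1] four, and in general bounds[i] has i + 3, so the 18 rows from Out[8] cover 3 through 20 classes.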

In [12]:
# find the sample index range spanned by each class
def get_indices(data, bounds):
    indices = []
    for li in bounds:
        temp = []
        for tu in li:
            index_tuple = (find_lb_idx(data, tu[0]), find_ub_idx(data, tu[1]))
            temp.append(index_tuple)
        indices.append(temp)
    return indices
indices = get_indices(sample, bounds)
indices[:2]


Out[12]:
[[(0, 7931), (7932, 9979), (9980, 9999)],
 [(0, 5026), (5027, 8957), (8957, 9979), (9980, 9999)]]
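
Note the shared index 8957 in the second row: the break value 130605 evidently occurs verbatim in the sample, making it both the rightmost element <= 130605 and the leftmost element >= 130605. Because the slices taken next exclude their stop index, that element lands in the upper class only.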

In [13]:
# slice the sample into class bins using the index ranges
def bin_data(data, indices):
    sample_list = []
    for li in indices:
        temp_list = []
        for tu in li:
            lower = tu[0]
            upper = tu[1]
            # note: Python slices exclude the stop index, so the element at
            # the upper-bound index itself is left out of each bin
            temp = data[lower:upper]
            temp_list.append(temp)
        sample_list.append(temp_list)
    return sample_list

sample_list = bin_data(sample, indices)

sample_list[0][0][:10]


Out[13]:
[201, 245, 325, 327, 335, 391, 462, 463, 464, 485]
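
A quick sanity check (a sketch, using the three-class row whose first break is 88349): every value in its first bin should sit at or below that break.

all(v <= 88349 for v in sample_list[0][0])  # expect True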

In [14]:
# compute global stats
# SDAM: sum of squared deviations from the array mean
samp_mean = np.mean(sample)
sdam = np.sum((np.asarray(sample) - samp_mean)**2)

In [15]:
# sanity check: sum of squared deviations of [1, 2, 3] about its mean (2) is 1 + 0 + 1 = 2
np.sum(np.subtract([1,2,3], np.mean([1,2,3]))**2)


Out[15]:
2.0

In [16]:
# compute class stats
# SCDM: sum of squared class deviations from the class means
# GVF (goodness of variance fit) = (SDAM - SCDM) / SDAM

def sse(data):
    return np.sum((np.subtract(data, np.mean(data))**2))

def calc_scdm(sample_list):
    scdm_list = []
    for li in sample_list:
        class_sses = []
        for cluster in li:
            class_sses.append(sse(cluster))
        scdm_list.append(np.sum(class_sses))
    return scdm_list

def calc_gvf(sdam, scdm):
    return (sdam - scdm) / sdam

scdm = calc_scdm(sample_list)
gvf = calc_gvf(sdam, np.asarray(scdm))
gvf


Out[16]:
array([ 0.6958851 ,  0.82400687,  0.89156585,  0.91747381,  0.92753122,
        0.94838306,  0.95355712,  0.95606449,  0.95861116,  0.96035283,
        0.96123053,  0.9621615 ,  0.96275417,  0.96331449,  0.96365511,
        0.96398704,  0.97448984,  0.97473725])
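
To make the GVF arithmetic concrete, a toy sketch: split [1, 2, 10, 11] into classes [1, 2] and [10, 11]. The global mean is 6, so SDAM = 25 + 16 + 16 + 25 = 82, while SCDM = 0.5 + 0.5 = 1.0, giving GVF ~0.988:

toy_sdam = sse([1, 2, 10, 11])
toy_scdm = sse([1, 2]) + sse([10, 11])
(toy_sdam - toy_scdm) / toy_sdam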

In [17]:
# select the five-class breaks (bounds[2]), whose GVF is ~0.89
jenks5 = bounds[2]
jenks5


Out[17]:
[(18, 29527), (29527, 89073), (89073, 175182), (175182, 589738), (589738, inf)]

In [18]:
# calculate quintiles
quintile_lb = np.percentile(sample, [0, 20, 40, 60, 80])
quintile_ub = np.percentile(sample, [20, 40, 60, 80])
quintile_ub = np.append(quintile_ub, float('inf'))
quintile = zip(quintile_lb, quintile_ub)
quintile


Out[18]:
[(201.0, 12694.600000000002),
 (12694.600000000002, 32195.0),
 (32195.0, 54390.999999999985),
 (54390.999999999985, 91114.600000000006),
 (91114.600000000006, inf)]
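
The fractional bounds (e.g. 12694.6) are expected: by default np.percentile linearly interpolates between the two nearest order statistics when a requested percentile falls between sample values.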

In [19]:
# calculate equal interval
minimum = min(sample)
maximum = max(sample)
difference = maximum - minimum
# note: with ints, difference/5 floors the step, so np.arange below emits a
# sixth partial class (see the output); difference/5.0 would give exactly five
step = difference/5
lb = np.arange(minimum, maximum, step)
ub = np.append(lb[1:], float('inf'))
equal_interval = zip(lb, ub)
equal_interval


Out[19]:
[(201, 251116.0),
 (251116, 502031.0),
 (502031, 752946.0),
 (752946, 1003861.0),
 (1003861, 1254776.0),
 (1254776, inf)]
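
np.linspace sidesteps the step arithmetic above entirely; a minimal sketch that always yields exactly five equal-width classes over the same range:

edges = np.linspace(minimum, maximum, 6)  # 6 edges bound 5 classes
equal_interval_alt = zip(edges[:-1], np.append(edges[1:-1], float('inf')))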