In [1]:
import os
import pandas as pd
import csv
import json
import numpy as np
import re
import bisect
In [2]:
directory = 'analysis'
if not os.path.exists(directory):
    os.makedirs(directory)
In [3]:
# import pums-lite
df = pd.read_pickle('data/puma_pums/merged.pickle')
df.head(3)
Out[3]:
In [4]:
def get_usage(dataframe, col_wt, col_val):
    # expand each household record by its PUMS weight (col_wt),
    # repeating the per-capita value (col_val) once per weighted household
    usage = []
    df_subset = dataframe[[col_wt, col_val]]
    tuples = df_subset.values.tolist()
    for a_tuple in tuples:
        for weight in range(int(a_tuple[0])):
            usage.append(int(a_tuple[1]))
    return usage

water_usage = get_usage(df, 'WGTP', 'w_use_percap')
type(water_usage)
Out[4]:
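Aside: the row-expansion loop above can also be written with np.repeat, which is usually much faster on a large PUMS extract. A minimal sketch, assuming the same WGTP and w_use_percap columns and integer-like dtypes:

# vectorized equivalent of get_usage (sketch): repeat each per-capita value by its household weight
weights = df['WGTP'].astype(int).values
values = df['w_use_percap'].astype(int).values
water_usage_alt = np.repeat(values, weights).tolist()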
In [5]:
water_usage_slice = np.random.choice(water_usage, 100000)
water_usage_slice = water_usage_slice.tolist()
type(water_usage_slice)
Out[5]:
In [6]:
with open('analysis/js/nums.json', 'w') as outfile:
    json.dump({'numbers': water_usage_slice}, outfile, indent=0)
In [7]:
# jenks breaks were computed in javascript
# read the outputs back in
with open('analysis/js/breaks', 'r') as f:
    breaks = f.readlines()
breaks[0]
Out[7]:
In [8]:
def clean_and_split(string_list):
    # strip newlines, replace non-word characters with spaces,
    # collapse repeated spaces, then parse each token as an int
    num_list = []
    for string in string_list:
        string = string.replace('\n', '')
        string = re.sub(r'[^\w]', ' ', string)
        strings = re.sub(' +', ' ', string).strip().split()
        nums = map(int, strings)
        num_list.append(nums)
    return num_list

lower_bounds = clean_and_split(breaks)
len(lower_bounds)
Out[8]:
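For reference, a hypothetical example of what clean_and_split does to one line of the JavaScript output (the exact format of the breaks file is not shown here; this assumes one bracketed array per line):

# hypothetical input line -> cleaned list of ints
clean_and_split(['[0, 12, 45, 80]\n'])
# returns [[0, 12, 45, 80]]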
In [9]:
# sort sample list to use bisection algorithms
# see http://www.ehdp.com/vitalnet/breaks-1-prn.htm
sample = sorted(np.random.choice(water_usage, 10000).tolist())
sample[:5]
Out[9]:
In [10]:
# define bounding functions
def find_lb_idx(a, x):
    'Find index corresponding to leftmost value greater than or equal to x'
    i = bisect.bisect_left(a, x)
    if i != len(a):
        return i
    else:
        return 0

def find_ub_idx(a, x):
    'Find index corresponding to rightmost value less than or equal to x'
    i = bisect.bisect_right(a, x)
    if i:
        return i-1
    else:
        return 0

# test bounding functions
lb_in = 0
ub_in = 5
a_list = [0, 1, 2, 3, 4.1, 4.2, 4.3, 5, 5.7, 6, 7, 8, 9, 10]
lb_idx = find_lb_idx(a_list, lb_in)  # index of lower bound
ub_idx = find_ub_idx(a_list, ub_in)  # index of upper bound
lb = a_list[lb_idx]
ub = a_list[ub_idx]
a_slice = a_list[lb_idx:ub_idx]

# print bounding outputs
print 'input x ranges from %s to %s' % (lb_in, ub_in)
print 'indices range from %s to %s' % (lb_idx, ub_idx)
print 'tested x ranges from %s to %s' % (lb, ub)
print 'new sliced list is %s' % a_slice
In [11]:
# function to build nice list of upper bounds
def get_upper_bounds(lower_bounds):
    # each class's upper bound is the next class's lower bound;
    # the last class is unbounded above
    upper_bounds = []
    for row in lower_bounds:
        upper_bounds_row = []
        for idx, val in enumerate(row):
            if idx < (len(row)-1):
                upper_bounds_row.append(row[idx+1])
            else:
                upper_bounds_row.append(float('inf'))
        upper_bounds.append(upper_bounds_row)
    return upper_bounds

upper_bounds = get_upper_bounds(lower_bounds)
zipped_bounds = zip(lower_bounds, upper_bounds)
bounds = []
for row in zipped_bounds:
    bounds.append(zip(row[0], row[1]))
print 'zipped lower & upper bounds %s' % bounds[:2]
In [12]:
# build function to get slices of data list
def get_indices(data, bounds):
    # for each class in each break set, find the slice indices of the
    # sorted data that fall between its lower and upper bounds
    indices = []
    for li in bounds:
        temp = []
        for tu in li:
            index_tuple = (find_lb_idx(data, tu[0]), find_ub_idx(data, tu[1]))
            temp.append(index_tuple)
        indices.append(temp)
    return indices

indices = get_indices(sample, bounds)
indices[:2]
Out[12]:
In [13]:
# build function to get data binned into lists
def bin_data(data, indices):
    # slice the sorted data into one list per class using the index tuples
    sample_list = []
    for li in indices:
        temp_list = []
        for tu in li:
            temp = data[tu[0]:tu[1]]
            temp_list.append(temp)
        sample_list.append(temp_list)
    return sample_list

sample_list = bin_data(sample, indices)
sample_list[0][0][:10]
Out[13]:
In [14]:
# compute global stats
samp_mean = np.mean(sample)
# SDAM: sum of squared deviations from the array (global) mean
sdam = np.sum((np.array(sample) - samp_mean)**2)
In [15]:
np.sum(np.subtract([1, 2, 3], np.mean([1, 2, 3]))**2)  # quick check of the formula: should equal 2.0
Out[15]:
In [16]:
# compute class stats
# SCDM: sum of squared class deviations from the class means, per break set
def sse(data):
    # sum of squared deviations from the mean of data
    return np.sum((np.subtract(data, np.mean(data))**2))

def calc_scdm(sample_list):
    scdm_list = []
    for li in sample_list:
        cluster_sses = []
        for cluster in li:
            cluster_sses.append(sse(cluster))
        scdm_list.append(np.sum(cluster_sses))
    return scdm_list

def calc_gvf(sdam, scdm):
    # goodness of variance fit: (SDAM - SCDM) / SDAM
    return (sdam - scdm) / sdam

scdm = calc_scdm(sample_list)
gvf = calc_gvf(sdam, np.array(scdm))
gvf
Out[16]:
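As a quick sanity check on the GVF calculation, a toy example: classing [1, 1, 10, 10] perfectly into two clusters leaves no within-class deviation, so GVF should come out to 1.0.

# toy GVF check: perfect two-class split of [1, 1, 10, 10]
toy = [1, 1, 10, 10]
toy_sdam = np.sum((np.array(toy) - np.mean(toy))**2)   # 81.0
toy_scdm = sse([1, 1]) + sse([10, 10])                 # 0.0
calc_gvf(toy_sdam, toy_scdm)                           # 1.0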
In [17]:
# select the five-class jenks breaks (bounds[2]), which give gvf ~0.9
jenks5 = bounds[2]
jenks5
Out[17]:
In [18]:
# calculate quintiles
quintile_lb = np.percentile(sample, [0, 20, 40, 60, 80])
quintile_ub = np.percentile(sample, [20, 40, 60, 80])
quintile_ub = np.append(quintile_ub, float('inf'))
quintile = zip(quintile_lb, quintile_ub)
quintile
Out[18]:
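If one wants to compare classing schemes on the same footing, the same helpers can be reused on the quintile bounds. A sketch, wrapping quintile in a list because get_indices expects a list of break sets:

# GVF of the quintile scheme, computed with the same helpers (sketch)
quintile_indices = get_indices(sample, [quintile])
quintile_bins = bin_data(sample, quintile_indices)
quintile_scdm = calc_scdm(quintile_bins)
calc_gvf(sdam, quintile_scdm[0])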
In [19]:
# calculate equal interval
minimum = min(sample)
maximum = max(sample)
difference = maximum - minimum
step = difference / 5.0  # float division so np.arange yields exactly five class widths
lb = np.arange(minimum, maximum, step)
ub = np.append(lb[1:], float('inf'))
equal_interval = zip(lb, ub)
equal_interval
Out[19]: