In [1]:
import numpy as np
import scipy.stats
import scipy
import bisect
In [2]:
# One-off conversion that produced the compressed archive read below:
#   raw1 = np.loadtxt("2sum.txt", dtype=int)
#   np.savez_compressed("2sum.txt.npz", data=raw1)
with np.load('2sum.txt.npz') as data:
    # U ("universe") and raw1 are two names for the same array object, not copies.
    U = raw1 = data['data']
# Bare expression in the middle of the cell: evaluated but not displayed.
U.shape
In [3]:
# Excess (Fisher) kurtosis of the input, with the library defaults spelled out.
scipy.stats.kurtosis(U, axis=0, fisher=True, bias=True)
Out[3]:
In [4]:
# Summary statistics of the input.  NumPy's reductions are called directly:
# scipy.mean / scipy.std were only re-exports of the NumPy functions in the
# top-level scipy namespace, and those re-exports are deprecated.
U_mean = np.mean(U)
U_std = np.std(U)
# A single pre-formatted argument prints the same text whether `print` is the
# Python 2 statement or the Python 3 function.
print("Mean: %s" % U_mean)
print("Standard Deviation: %s" % U_std)
In [5]:
# Sort the universe in place (ascending).  U was bound to the same object as
# raw1 when the data was loaded, so raw1 is reordered as well.
U.sort()
In [80]:
def find_yrange(U, i, i_mag, mag_fac=100, mag_zoom=20, bounds=None):
    """Bracket the y indices whose sum with ``U[i]`` can fall inside the bounds.

    This function takes advantage of the fact that the numbers are uniformly
    distributed, so the partner of ``U[i]`` should simply be on the "opposite"
    side of the distribution.  ``U`` must be sorted in ascending order.

    Up to 21 negative indices (counted from the end of ``U``), spaced
    ``mag_fac`` apart and centred on ``i_mag``, are sampled.  The samples that
    straddle the bounds are located with ``bisect`` and the search is repeated
    around their midpoint with the spacing divided by ``mag_zoom``, for as
    long as the new spacing is at least 5.  Only the sampled window is ever
    examined, so matches lying outside it are not reported.

    Parameters
    ----------
    U : universe of numbers; a sorted 1-D numpy array (it is indexed with a
        list of positions).
    i : x index.
    i_mag : y index search median (a negative index).
    mag_fac : number of indices between range checks.
    mag_zoom : divisor applied to ``mag_fac`` for the next, finer pass;
        should be greater than 1 so that the recursion terminates.
    bounds : optional ``(low, high)`` pair for the sum.  When omitted, the
        module-level ``t_bounds`` is used.

    Returns
    -------
    ``None`` when no sampled index is valid, or when every sampled sum lies
    below ``low`` or every one lies above ``high``.  Otherwise a tuple
    ``(l_bound, r_bound)`` of negative indices such that every position of the
    sampled window whose sum lies within the bounds is inside
    ``range(l_bound, r_bound + 1)``.
    """
    if bounds is None:
        # Fall back to the notebook-level search interval.
        bounds = t_bounds
    low, high = bounds[0], bounds[1]
    x_i = U[i]
    n = len(U)

    # Keep only positions that are valid negative indices into U.
    candidates = (mag_fac * k + i_mag for k in range(-10, 11))
    y_indices = [idx for idx in candidates if 0 < -idx < n]
    if not y_indices:
        return None

    # U is ascending and y_indices is ascending, so t is ascending too.
    t = x_i + U[y_indices]

    # check if the max or min is outside of the bounds
    if t[-1] < low or t[0] > high:
        return None

    # Last sample strictly below `low` and first sample strictly above `high`.
    # Both positions are clamped to the sampled window: without the clamp the
    # left position could become -1 (silently selecting the last sample) and
    # the right position could run one past the end.
    l_pos = max(bisect.bisect_left(t, low) - 1, 0)
    r_pos = min(bisect.bisect_right(t, high), len(t) - 1)
    l_bound = y_indices[l_pos]
    r_bound = y_indices[r_pos]

    # Floor division keeps the spacing an integer, so the sampled positions
    # stay usable as indices.
    mag_fac //= mag_zoom
    if mag_fac >= 5:
        # Midpoint of the current bracket, truncated toward zero.
        m_bound = int((l_bound + r_bound) / 2.0)
        refined = find_yrange(U, i, m_bound, mag_fac, mag_zoom, bounds)
        if refined is not None:
            # Tighten a side only when the refined index really lies outside
            # the bounds; a refined index that was clamped to the edge of the
            # finer window is not a bracket and must not shrink the range.
            if x_i + U[refined[0]] < low:
                l_bound = max(l_bound, refined[0])
            if x_i + U[refined[1]] > high:
                r_bound = min(r_bound, refined[1])
    return l_bound, r_bound
# Trial run over ten x indices; each search starts at the mirrored position
# -(i + 1) at the other end of U.
t_bounds = (-10000, 10000)
t_set = set()
for i in range(1000, 1010):
    bounds = find_yrange(U, i, -(i + 1))
    if not bounds:
        continue
    first_y, last_y = bounds[0], bounds[1]
    pair_sums = (U[i] + U[i_inv] for i_inv in range(first_y, last_y + 1))
    # NOTE(review): the comparison is strict, so a sum exactly equal to either
    # bound is not collected -- confirm that this is the intended interval.
    t_i = [s for s in pair_sums if t_bounds[0] < s < t_bounds[1]]
    t_set.update(t_i)
print(t_set)
# set([8195, 5308, 3445])
In [70]:
# Full run: gather every distinct in-range sum over all x indices of U.
t_bounds = (-10000, 10000)
t_low, t_high = t_bounds
t_set = set()
for i in range(len(U)):
    # The search for a partner starts at the mirrored index on the far side of U.
    bounds = find_yrange(U, i, -(i + 1))
    if bounds:
        y_start, y_stop = bounds[0], bounds[1] + 1
        t_i = []
        for i_inv in range(y_start, y_stop):
            pair_sum = U[i] + U[i_inv]
            # Strict on both sides: sums equal to a bound are left out.
            if t_low < pair_sum < t_high:
                t_i.append(pair_sum)
        if t_i:
            t_set.update(t_i)
print(t_set)
In [66]:
# Number of distinct sums collected in t_set.
print(len(t_set))
The goal of this problem is to implement the "Median Maintenance" algorithm (covered in the Week 5 lecture on heap applications). The text file contains a list of the integers from 1 to 10000 in unsorted order; you should treat this as a stream of numbers, arriving one by one. Letting $x_i$ denote the $i$th number of the file, the $k$th median $m_k$ is defined as the median of the numbers $x_1, \ldots, x_k$. (So, if $k$ is odd, then $m_k$ is the $((k+1)/2)$th smallest number among $x_1, \ldots, x_k$; if $k$ is even, then $m_k$ is the $(k/2)$th smallest number among $x_1, \ldots, x_k$.)
In the box below you should type the sum of these 10000 medians, modulo 10000 (i.e., only the last 4 digits). That is, you should compute $(m_1 + m_2 + m_3 + \cdots + m_{10000}) \bmod 10000$.
OPTIONAL EXERCISE: Compare the performance achieved by heap-based and search-tree-based implementations of the algorithm.
In [ ]: