In [1]:
import numpy as np
import scipy.stats
import scipy
import bisect

In [2]:
#raw1 = np.loadtxt("2sum.txt", dtype=int)
#np.savez_compressed("2sum.txt.npz", data=raw1)
with np.load('2sum.txt.npz') as data:
    raw1 = data['data']
raw1.shape
U = raw1

In [3]:
scipy.stats.kurtosis(U)


Out[3]:
-1.1990247437581012

In [4]:
U_mean = scipy.mean(U)
U_std  = scipy.std(U)

print "Mean:", U_mean
print "Standard Deviation:", U_std


Mean: 1322010.15214
Standard Deviation: 57723915587.6

In [5]:
U.sort()

In [80]:
# this function takes advantage of the fact the numbers are uniformly distributed
# thus the answer should simple be on the "opposite" side of the distribution
# the function searches the other side of the distribution for a range of possible 
# y indexes for a given x index
def find_yrange(U, i, i_mag, mag_fac = 100, mag_zoom = 20):
    # U: universe of numbers
    # i: x index
    # i_mag: y index search median
    # mag_fac: number of indices between range checks
    global t_bounds
    x_i = U[i]
    y_indices = [mag_fac*k + i_mag for k in range(-10, 11) if -1*(mag_fac*k + i_mag) > 0 and -1*(mag_fac*k + i_mag) < len(U)]
    y = U[y_indices]
    t = x_i + y
    t_min = t[0]
    t_max = t[-1]
    # check if the max or min is outside of the bounds
    if t_max < t_bounds[0] or t_min > t_bounds[1]:
        return None
    l_bound = y_indices[bisect.bisect_left(t, -t_bounds[0])-1]
    r_bound = y_indices[bisect.bisect_right(t, t_bounds[1])]
    m_bound = np.mean([l_bound, r_bound], dtype=int)
    mag_fac /= mag_zoom
    if mag_fac >= 5:
        l_bound, r_bound = find_yrange(U, i, m_bound, mag_fac)
    return l_bound, r_bound

t_bounds = (-10000,10000)
t_set = set()
for i in range(1000,1010):
    bounds = find_yrange(U,i, -1*(i+1))
    if bounds:
        t_i = [U[i] + U[i_inv] for i_inv in range(bounds[0], bounds[1]+1) if U[i] + U[i_inv] > t_bounds[0] and U[i] + U[i_inv] < t_bounds[1]]
        if t_i: t_set.update(t_i)
print t_set
# set([8195, 5308, 3445])


set([8195, 5308, 3445])

In [70]:
t_bounds = (-10000,10000)
t_set = set()
for i in range(len(U)):
    bounds = find_yrange(U,i, -1*(i+1))
    if bounds:
        t_i = [U[i] + U[i_inv] for i_inv in range(bounds[0], bounds[1]+1) if U[i] + U[i_inv] > t_bounds[0] and U[i] + U[i_inv] < t_bounds[1]]
        if t_i: t_set.update(t_i)
print t_set


set([2048, 4097, 6146, 8195, 6145, 8194, 7171, -7637, -2795, 1024, -6613, -2794, 6890, -8104, -8103, -4006, -4005, 92, 2141, 6238, 8287, 8288, -747, 9125, 6891, -5588, -746, 3073, 4096, -9873, -8848, -4751, -2702, -653, -8011, -5962, -5961, -1864, 185, 2234, 2235, 6332, 8381, 2421, 4470, 9591, -3354, 5122, -1492, 2327, -9967, -7918, -7917, -5868, -3819, -1770, 279, 4376, 6425, 8474, 8475, 2328, 3351, -4286, -9127, -9500, -4285, 557, -6427, -4378, -1305, -9874, -7825, -5776, -3727, -3726, 371, 2420, 4469, 6518, 6519, 8568, 558, 3818, 7915, 7916, -2237, -6055, -9780, -7731, -5682, -1585, 464, 2513, 2514, 4563, 8660, 8661, 2607, 7449, -5030, -1211, -188, 3631, 7450, -7080, -5031, -6054, -1957, 1116, -7638, -5589, -3540, -1491, 2606, 4655, 4656, 8753, 8754, -8382, 3166, 6239, 9312, 838, 1395, 5681, 6704, -9594, -7545, -5496, -5495, -1398, 651, 2700, 4749, 8846, -5775, -1956, -6147, -933, 7728, 2887, -8569, -6707, -6706, -3633, -1584, -9501, -7452, -5403, -5402, 1489, 2792, 2793, 4842, 8939, 8940, 4562, -7824, 6611, 9777, 6612, -7544, 1117, 4936, -3541, -1678, 5960, 6983, -9408, -9407, -7358, -5309, -1212, 837, 2886, 4935, 9032, 9033, -1677, 2142, 4189, -7359, 1023, 4190, -5310, -3261, -3260, -9315, -9314, 1861, -5216, -3167, -1118, 931, 2980, 1862, 9126, -3447, 372, 5959, 6984, 8009, -7265, -3446, 1396, 5215, -9222, -7173, -5124, -3075, -3074, -1025, 3072, 5121, 7170, 9219, 9220, 3444, -5217, -1397, -6240, -6986, 650, -3913, -3912, 1209, -9128, -7079, -2982, -2981, -932, 3165, 5214, 7263, 7264, 9313, 5307, 5493, 5308, 7357, -2144, -6985, -2143, 1676, 2699, -9035, -9034, -4937, -2888, -839, 3258, 3259, 7356, 9405, 9406, -4938, 7542, -9779, -9687, -8662, -94, 8567, -4564, -2515, -8942, -8941, -4844, -466, 1302, 1303, 5400, 5401, 9498, 9499, -2889, 930, 3632, -8010, 6705, -7730, -5683, 1955, 5774, 6797, -8849, -6800, -6799, -4750, -2701, -652, 3445, 5494, 7543, 9592, -1863, -840, -7266, -4657, -3634, -6241, 7823, -3168, -1119, -8756, -8755, -4658, -2609, -560, -559, 3538, 2979, 9684, 9685, 186, 6052, 5028, 8101, -4191, 7078, -7451, -6428, 1210, 5029, -2608, -8663, -6614, -4565, -2516, -467, 1582, 1583, 5680, 7729, 9778, 7077, -9966, -6893, 4283, -3820, -1771, 278, -8570, -6521, -6520, -2423, -374, -373, 3724, 5773, 7822, 9871, 9872, 3352, 465, 4284, 4377, 6426, 93, -7172, -3353, -2329, 1490, -8477, -8476, -4379, -2330, -281, 1768, 1769, 5866, 5867, 9964, 9965, -1304, 8380, -9593, -280, 3539, -4472, -4471, -2422, -8383, -6334, 1675, -2236, -187, 3910, 3911, 8008, 4748, 744, 3725, 5586, 6798, 8847, 745, 5587, -6892, -5869, -2050, -8290, -8289, -4192, -95, 1954, 4003, 4004, 6053, 8102, -2049, -1026, 7635, -9686, 3817, 7636, -9221, -4843, -8196, 4841, -5123, -4098, -8197, -6148, -4099, -1, 2047])

In [66]:
print len(t_set)


427

The goal of this problem is to implement the "Median Maintenance" algorithm (covered in the Week 5 lecture on heap applications). The text file contains a list of the integers from 1 to 10000 in unsorted order; you should treat this as a stream of numbers, arriving one by one. Letting xi denote the ith number of the file, the kth median mk is defined as the median of the numbers x1,…,xk. (So, if k is odd, then mk is ((k+1)/2)th smallest number among x1,…,xk; if k is even, then mk is the (k/2)th smallest number among x1,…,xk.)

In the box below you should type the sum of these 10000 medians, modulo 10000 (i.e., only the last 4 digits). That is, you should compute (m1+m2+m3+⋯+m10000)mod10000.

OPTIONAL EXERCISE: Compare the performance achieved by heap-based and search-tree-based implementations of the algorithm.


In [ ]: