The goal of this problem is to implement the "Median Maintenance" algorithm (covered in the Week 3 lecture on heap applications). The text file contains a list of the integers from 1 to 10000 in unsorted order; you should treat this as a stream of numbers, arriving one by one. Letting xi denote the ith number of the file, the kth median mk is defined as the median of the numbers x1,…,xk. (So, if k is odd, then mk is ((k+1)/2)th smallest number among x1,…,xk; if k is even, then mk is the (k/2)th smallest number among x1,…,xk.)
In the box below you should type the sum of these 10000 medians, modulo 10000 (i.e., only the last 4 digits). That is, you should compute (m1+m2+m3+⋯+m10000)mod10000.
OPTIONAL EXERCISE: Compare the performance achieved by heap-based and search-tree-based implementations of the algorithm.
In [14]:
# heap structure. a binary tree where the key values of children is larger than the key value of the parent
class myHeapArray(object):
def __init__(self, minheap=True):
self.heap = []
self.minheap = minheap
def bubbleDown(self):
n = len(self.heap)-1
p = 0
while (2*p <= n):
c1 = 2*p
if 2*p + 1 <= n:
c2 = 2*p + 1
# Second child exists
cdt = (self.heap[c1] < self.heap[c2]) if self.minheap else (self.heap[c1] > self.heap[c2])
if cdt:
c_val, c = (self.heap[c1], c1)
else:
c_val, c = (self.heap[c2], c2)
else:
c_val, c = (self.heap[c1], c1)
# Swap parent with the child with the smallest key value
cdt = (self.heap[p] > c_val) if self.minheap else (self.heap[p] < c_val)
if cdt:
tmp = self.heap[p]
self.heap[p] = c_val
self.heap[c] = tmp
p = c
else:
break
def extractMin(self):
if not self.heap:
return None
val = self.heap[0]
# copy last value to root, then remove and discard.
last = len(self.heap) - 1
self.heap[0] = self.heap[last]
self.heap.pop()
self.bubbleDown()
return val
def bubbleUp(self):
# for parent at node i, children is at node 2*i and 2*i + 1
# for child i, parent is at i/2 if i is even or floor(i/2) if i is odd
# Bubble Up starting the newly added key at the end of the heap
c = len(self.heap) - 1
while c!= 0:
p = int(c / 2)
cdt = (self.heap[p] > self.heap[c]) if self.minheap else (self.heap[p] < self.heap[c])
if cdt:
tmp = self.heap[p]
self.heap[p] = self.heap[c]
self.heap[c] = tmp
c = p
else:
break
def insert(self,elem):
self.heap.append(elem)
self.bubbleUp()
def insertList(self, elemList):
for elem in elemList:
self.heap.append(elem)
self.bubbleUp()
def get_ordered_list(self):
ordered = []
while True:
n = self.extractMin()
if n:
ordered.append(n)
else:
break
return ordered
In [17]:
arr = [1 , 10, 9, 15, 5]
elem = [str(i) for i in arr]
h = myHeapArray(minheap=True)
h.insertList(arr)
print (h.get_ordered_list())
h = myHeapArray(minheap=False)
h.insertList(arr)
print (h.get_ordered_list())
In [39]:
def median_maintenance(stream):
lower_heap = myHeapArray(minheap=False) # maintain max value
upper_heap = myHeapArray(minheap=True)
median_list = []
for i in stream:
if not lower_heap.heap:
# Should run only once
lower_heap.insert(i)
median_list.append(i)
print ("median:", i)
continue
minMedian = lower_heap.heap[0]
if not upper_heap.heap:
# Should run only once
if i >= minMedian:
upper_heap.insert(i)
median_list.append(minMedian)
print ("median:", minMedian)
else:
v = lower_heap.extractMin()
upper_heap.insert(v)
lower_heap.insert(i)
median_list.append(i)
print ("median:", i)
continue
maxMedian = upper_heap.heap[0]
if i >= maxMedian:
upper_heap.insert(i)
else:
lower_heap.insert(i)
# Rebalance heaps if needed
if len(lower_heap.heap) > len(upper_heap.heap) + 1:
v = lower_heap.extractMin()
upper_heap.insert(v)
elif len(upper_heap.heap) > len(lower_heap.heap) + 1:
v = upper_heap.extractMin()
lower_heap.insert(v)
# Get Median
if (len(lower_heap.heap) == len(upper_heap.heap)) or (len(lower_heap.heap) > len(upper_heap.heap)):
median = lower_heap.heap[0]
else:
median = upper_heap.heap[0]
#print ("median", median)
median_list.append(median)
return median_list
def getdataStream(FILE):
fp = open(FILE, "r")
for i in fp.readlines():
yield int(i)
FILE = "median.txt"
median_list = median_maintenance(getdataStream(FILE))
print (sum(median_list)%10000)
In [ ]: