Median Maintenance

The goal of this problem is to implement the "Median Maintenance" algorithm (covered in the Week 3 lecture on heap applications). The text file contains a list of the integers from 1 to 10000 in unsorted order; you should treat this as a stream of numbers, arriving one by one. Letting xi denote the ith number of the file, the kth median mk is defined as the median of the numbers x1,…,xk. (So, if k is odd, then mk is ((k+1)/2)th smallest number among x1,…,xk; if k is even, then mk is the (k/2)th smallest number among x1,…,xk.)

In the box below you should type the sum of these 10000 medians, modulo 10000 (i.e., only the last 4 digits). That is, you should compute (m1+m2+m3+⋯+m10000)mod10000.

OPTIONAL EXERCISE: Compare the performance achieved by heap-based and search-tree-based implementations of the algorithm.


In [14]:
# heap structure. a binary tree where the key values of children is larger than the key value of the parent        
class myHeapArray(object):
    
    def __init__(self, minheap=True):        
        self.heap = []
        self.minheap = minheap
        
    def bubbleDown(self):
        
        n = len(self.heap)-1        
        p = 0
        
        while (2*p <= n):
            c1 = 2*p
               
            if 2*p + 1 <= n: 
                c2 = 2*p + 1
                # Second child exists
                
                cdt = (self.heap[c1] < self.heap[c2]) if self.minheap else (self.heap[c1] > self.heap[c2])
                if cdt:
                    c_val, c = (self.heap[c1], c1)
                else:
                    c_val, c = (self.heap[c2], c2)
            else:
                c_val, c = (self.heap[c1], c1)


            # Swap parent with the child with the smallest key value
            cdt = (self.heap[p] >  c_val) if self.minheap else (self.heap[p] <  c_val)
            if cdt:
                tmp = self.heap[p]
                self.heap[p] = c_val
                self.heap[c] = tmp
                p = c
            else:
                break         
        
        
    def extractMin(self):
        if not self.heap:
            return None
        
        val = self.heap[0]
        
        # copy last value to root, then remove and discard.
        last = len(self.heap) - 1
        self.heap[0] = self.heap[last]        
        self.heap.pop()
        
        self.bubbleDown()
        
        return val
    
    def bubbleUp(self):
        # for parent at node i, children is at node 2*i and 2*i + 1
        # for child i, parent is at i/2 if i is even or floor(i/2) if i is odd
        
        # Bubble Up starting the newly added key at the end of the heap
        
        c = len(self.heap) - 1
        while c!= 0:
            p = int(c / 2)
            
            
            cdt = (self.heap[p] > self.heap[c]) if self.minheap else (self.heap[p] < self.heap[c])
            if cdt:
                tmp = self.heap[p]
                self.heap[p] = self.heap[c]
                self.heap[c] = tmp
                c = p
            else:
                break                        
        
    def insert(self,elem):
        self.heap.append(elem)
        
        self.bubbleUp()
        
    def insertList(self, elemList):
        for elem in elemList:
            self.heap.append(elem)
            self.bubbleUp()

    def get_ordered_list(self):
        ordered = []        

        while True:
            n = self.extractMin()
            if n:
                ordered.append(n)
            else:
                break

        return ordered

In [17]:
arr = [1 , 10, 9, 15, 5]

elem = [str(i) for i in arr]

h = myHeapArray(minheap=True)
h.insertList(arr)

print (h.get_ordered_list())

        
h = myHeapArray(minheap=False)
h.insertList(arr)

print (h.get_ordered_list())


[1, 5, 9, 10, 15]
[15, 10, 9, 5, 1]

In [39]:
def median_maintenance(stream):
    lower_heap  = myHeapArray(minheap=False) # maintain max value
    upper_heap  = myHeapArray(minheap=True)
    
    median_list = []
    
    for i in stream:
        
        if not lower_heap.heap:
            # Should run only once
            lower_heap.insert(i)
            median_list.append(i)
            print ("median:", i)
            continue
                
        minMedian = lower_heap.heap[0]

        if not upper_heap.heap:
            # Should run only once
            if i >=  minMedian:
                upper_heap.insert(i)
                median_list.append(minMedian)
                print ("median:", minMedian)


            else:
                v = lower_heap.extractMin()
                upper_heap.insert(v)
                lower_heap.insert(i)  
                median_list.append(i)
                print ("median:", i)
            continue
                    
        maxMedian = upper_heap.heap[0]

        if i >= maxMedian:
            upper_heap.insert(i)
        else:
            lower_heap.insert(i)
            
        # Rebalance heaps if needed
        if len(lower_heap.heap) > len(upper_heap.heap) + 1:
            v = lower_heap.extractMin()
            upper_heap.insert(v)
        elif len(upper_heap.heap) > len(lower_heap.heap) + 1:
            v = upper_heap.extractMin()
            lower_heap.insert(v)

        # Get Median
        if (len(lower_heap.heap) == len(upper_heap.heap)) or (len(lower_heap.heap) > len(upper_heap.heap)):
            median = lower_heap.heap[0]
        else:
            median = upper_heap.heap[0]

        #print ("median", median)
        median_list.append(median)
    
    return median_list

def getdataStream(FILE):
    fp = open(FILE, "r")
    
    for i in fp.readlines():
        yield int(i)

FILE = "median.txt"  

median_list = median_maintenance(getdataStream(FILE))

print (sum(median_list)%10000)


median: 6331
median: 2793
1213

In [ ]: