In [1]:

    
from __future__ import division, print_function

from collections import defaultdict
import os

import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

Approximation



In [2]:

    
approx = defaultdict(int)
mafs = []
for fname in os.listdir('.'):
    if not fname.startswith('tsi-') or not fname.endswith('.frq'):
        continue
    f = open(fname)
    f.readline()
    for cnt, l in enumerate(f):
        toks = [tok for tok in l.rstrip().split(' ') if tok != '']
        maf = float(toks[-2])
        mafs.append(maf)
        approx[maf] += 1
    f.close()



In [3]:

    
print(len(mafs), type(mafs))
print(len(approx))









    



1222126 <type 'list'>
417



In [4]:

    
%time print(np.median(mafs))









    



0.2216
CPU times: user 60 ms, sys: 4 ms, total: 64 ms
Wall time: 65.9 ms



In [5]:

    
fig, axs = plt.subplots(2, 2, sharex=True, figsize=(16, 9))

xs, ys = zip(*approx.items())
axs[0, 0].plot(xs, ys, '.')

xs = list(approx.keys())
xs.sort()
ys = [approx[x] for x in xs]
axs[0, 1].bar(xs, ys, 0.005)

def get_bins(my_dict, nbins):
    acumulator = [0] * nbins
    xmin, xmax = xs[0], xs[-1]
    interval = (xmax - xmin) / nbins
    bin_xs = [xmin + i * interval + interval / 2 for i in range(nbins)]
    #the starting point, midpoint woudl take more interval / 2
    curr_bin = 0
    for x in xs:
        y_cnt = approx[x]
        while curr_bin + 1 != nbins and abs(x - bin_xs[curr_bin]) > abs(x - bin_xs[curr_bin + 1]):
            curr_bin += 1
        acumulator[curr_bin] += y_cnt
    return bin_xs, acumulator, interval

bin_xs, acumulator, interval = get_bins(approx, 10)
axs[1, 0].bar(np.array(bin_xs) - interval / 2, acumulator, 0.5 / 10)

bin_xs, acumulator, interval = get_bins(approx, 20)
axs[1, 1].bar(np.array(bin_xs) - interval / 2, acumulator, 0.5 / 20)
axs[1, 1].set_xlim(0, 0.5)

fig.tight_layout()



In [6]:

    
def compute_median_from_dictionary(my_dict):
    xs = list(my_dict.keys())
    xs.sort()
    x_cnt = [my_dict[x] for x in xs]
    start = 0
    end = len(xs) - 1
    while start != end:
        if start == end - 1 and x_cnt[start] == x_cnt[end]:
            return (xs[start] + xs[end]) / 2
        if x_cnt[start] > x_cnt[end]:
            x_cnt[start] -= x_cnt[end]
            end -= 1
        elif x_cnt[start] < x_cnt[end]:
            x_cnt[end] -= x_cnt[start]
            start += 1
        else:
            start += 1
            end -= 1
    return xs[start]



In [7]:

    
print(compute_median_from_dictionary(approx))



In [8]:

    
#do not put this, but discuss
for my_round in [1, 2, 3]:
    my_approx = defaultdict(int)
    for val, cnt in approx.items():
        my_approx[round(val, my_round)] += cnt
    print(len(my_approx), compute_median_from_dictionary(my_approx))

Subsampling



In [9]:

    
import random
import pandas as pd
arr = np.ndarray(shape=(5, 6), dtype=float)
samp_sizes = [1, 10, 100, 1000, 10000]
for rep in range(3):
    for si, samp_size in enumerate([1, 10, 100, 1000, 10000]):
        my_vals = random.sample(mafs, samp_size)
        arr[si, rep] = np.median(my_vals)
        arr[si, rep + 3] = max(my_vals)
df = pd.DataFrame(arr, index=samp_sizes,
                  columns=['Mean #1', 'Mean #2', 'Mean #3', 'Max #1', 'Max #2', 'Max #3'])
df



In [10]:

    
#bill gates effect

	Mean #1	Mean #2	Mean #3	Max #1	Max #2	Max #3
1	0.23300	0.3977	0.48860	0.2330	0.3977	0.4886
10	0.22160	0.1932	0.22725	0.3807	0.3693	0.3977
100	0.26420	0.1733	0.21590	0.4943	0.4886	0.4830
1000	0.22445	0.2159	0.22160	0.5000	0.5000	0.5000
10000	0.22160	0.2159	0.22160	0.5000	0.5000	0.5000