In [36]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from collections import Counter
import sys
from scipy import signal
from collections import defaultdict

In [2]:
filename = "../data/hgmm_1k_S1_L001_R1_001.fastq"

In [3]:
with open(filename) as f:
    num = 0
    count = 1
    bc = []
    for line in f:
        if num == 1 and 'N' not in line:
            bc.append(line.strip()[:16])
            count += 1
        num = (num+1)%4
        if (count%100000 == 0):
            print "\r\r Done: "+ str(count),


 Done: 7900000                                                                             

In [4]:
ct = Counter(bc)

In [5]:
vals = sorted(ct.values(), reverse=True)

In [6]:
t = []
for ind,x in enumerate(vals):
    if ind == 0:
        t.append(x)
        continue
    t.append(x+t[-1])

In [30]:
# vals[-1]
# plt.plot(np.log(t[:1200]))

In [31]:
# plt.plot(vals[:10000])

In [32]:
cdf = t[:100000]

In [141]:
def local_lin_fit(y, window_len=10):
    from scipy.optimize import curve_fit
    num_windows = len(y) - window_len
    slopes = []
    x = []
    for window_start in range(0, num_windows):
        window_x = np.array(range(window_start, window_start + window_len))
        window_y = np.array(y[window_start : window_start + window_len])
        n = len(window_x)
        s_x = sum(window_x)
        s_y = sum(window_y)
        s_xx = sum(window_x * window_x)
        s_xy = sum(window_x * window_y)
        slopes.append(-((n * s_xy - s_x * s_y) / (n * s_xx - s_x * s_x)))
        x.append(window_start + window_len / 2)
    return slopes, x


WINDOW = [100, 5000]
LOCAL_WINDOW_LEN = 25

y=cdf

slopes, x = local_lin_fit(y, window_len=LOCAL_WINDOW_LEN)
savCoeff = signal.savgol_coeffs(251, 4)
savgol = []

for ll in range(len(slopes)-251):
    savgol.append(sum(slopes[ll:ll+251] * savCoeff))

threshold =  int(WINDOW[0] + x[0] + 125)

diff = []
ind = 0
history = savgol[WINDOW[0]]
for x in savgol[WINDOW[0]+1:WINDOW[1]]:
    diff.append(abs(history-x))
    history = x

for i in range(0,len(diff)-9):
    l = diff[i:i+10]
    flag = False
    for x in l:
        if x > 2:
            flag = True
            break
    if not flag:
        ind = i-1
        break
            
threshold += ind

print threshold


1153

In [103]:
plt.plot(savgol[1040:1070])


Out[103]:
[<matplotlib.lines.Line2D at 0x1389b32d0>]

In [ ]:


In [ ]: