In [36]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from collections import Counter
import sys
from scipy import signal
from collections import defaultdict
In [2]:
filename = "../data/hgmm_1k_S1_L001_R1_001.fastq"
In [3]:
with open(filename) as f:
num = 0
count = 1
bc = []
for line in f:
if num == 1 and 'N' not in line:
bc.append(line.strip()[:16])
count += 1
num = (num+1)%4
if (count%100000 == 0):
print "\r\r Done: "+ str(count),
In [4]:
ct = Counter(bc)
In [5]:
vals = sorted(ct.values(), reverse=True)
In [6]:
t = []
for ind,x in enumerate(vals):
if ind == 0:
t.append(x)
continue
t.append(x+t[-1])
In [30]:
# vals[-1]
# plt.plot(np.log(t[:1200]))
In [31]:
# plt.plot(vals[:10000])
In [32]:
cdf = t[:100000]
In [141]:
def local_lin_fit(y, window_len=10):
from scipy.optimize import curve_fit
num_windows = len(y) - window_len
slopes = []
x = []
for window_start in range(0, num_windows):
window_x = np.array(range(window_start, window_start + window_len))
window_y = np.array(y[window_start : window_start + window_len])
n = len(window_x)
s_x = sum(window_x)
s_y = sum(window_y)
s_xx = sum(window_x * window_x)
s_xy = sum(window_x * window_y)
slopes.append(-((n * s_xy - s_x * s_y) / (n * s_xx - s_x * s_x)))
x.append(window_start + window_len / 2)
return slopes, x
WINDOW = [100, 5000]
LOCAL_WINDOW_LEN = 25
y=cdf
slopes, x = local_lin_fit(y, window_len=LOCAL_WINDOW_LEN)
savCoeff = signal.savgol_coeffs(251, 4)
savgol = []
for ll in range(len(slopes)-251):
savgol.append(sum(slopes[ll:ll+251] * savCoeff))
threshold = int(WINDOW[0] + x[0] + 125)
diff = []
ind = 0
history = savgol[WINDOW[0]]
for x in savgol[WINDOW[0]+1:WINDOW[1]]:
diff.append(abs(history-x))
history = x
for i in range(0,len(diff)-9):
l = diff[i:i+10]
flag = False
for x in l:
if x > 2:
flag = True
break
if not flag:
ind = i-1
break
threshold += ind
print threshold
In [103]:
plt.plot(savgol[1040:1070])
Out[103]:
In [ ]:
In [ ]: