notebook.community

Edit and run



In [1]:

    
import pandas as pd
import numpy as np
import matplotlib as mpl
%matplotlib inline
import matplotlib.pyplot as plt
from collections import Counter
from collections import defaultdict
from sklearn.neighbors.kde import KernelDensity
from scipy.stats.kde import gaussian_kde
from scipy import signal
from scipy.ndimage.filters import correlate1d



In [2]:

    
barcodes = open('../data/barcodes.txt').readlines()
barcodes = [x.strip() for x in barcodes if 'N' not in x and 'n' not in x]
temp = barcodes[:]
barcodeFreq = Counter(barcodes)



In [3]:

    
# barcodes = temp[:]
# #bootstrapping
# # bins=400
# d = []
# numberOfBoootstrapsamples = 1000
# for i in range(numberOfBoootstrapsamples):
#     bc = barcodes.pop()
#     d.append(bc)



In [4]:

    
# len(barcodes), len(temp)



In [29]:

    
#knee detection
def local_lin_fit(y, window_len=10):
    from scipy.optimize import curve_fit
    num_windows = len(y) - window_len
    slopes = []
    x = []
    for window_start in range(0, num_windows):
        window_x = np.array(range(window_start, window_start + window_len))
        window_y = np.array(y[window_start : window_start + window_len])
        n = len(window_x)
        s_x = sum(window_x)
        s_y = sum(window_y)
        s_xx = sum(window_x * window_x)
        s_xy = sum(window_x * window_y)
        slopes.append(-((n * s_xy - s_x * s_y) / (n * s_xx - s_x * s_x)))
        x.append(window_start + window_len / 2)
    return slopes, x


# letter_counts = Counter(barcodes)
# freqCounts = np.array(letter_counts.values())
# freqCounts.sort()
# f=freqCounts[::-1]
# WINDOW = [100, 5000]
# LOCAL_WINDOW_LEN = 25

y=test

slopes, x = local_lin_fit(y, window_len=LOCAL_WINDOW_LEN)
savCoeff = signal.savgol_coeffs(251, 4)
savgol = []

for ll in range(len(slopes)-251):
    savgol.append(sum(slopes[ll:ll+251] * savCoeff))

threshold =  int(WINDOW[0] + x[0] + 125)

diff = []
ind = 0
history = savgol[WINDOW[0]]
for x in savgol[WINDOW[0]+1:WINDOW[1]]:
    diff.append(abs(history-x))
    history = x

for i in range(0,len(diff)-9):
    l = diff[i:i+10]
    flag = False
    for x in l:
        if x > 18:
            flag = True
            break
    if not flag:
        ind = i-1
        break
            
threshold += ind

print threshold



In [15]:

    
plt.plot(test)









    Out[15]:





[<matplotlib.lines.Line2D at 0x11c5d3750>]



In [ ]:

    
#phew!!! So the problem is numpy and scipy handles the edges in convolution diffferently so we simplt ignore the edges.



In [ ]:



In [15]:

    
f[566]









    Out[15]:





2158



In [4]:

    
#read eqClass files with barcodes
eqFile="../data/eq_classes.txt"
eqClasses = []
with open(eqFile) as ifile:
    numTran = int(ifile.readline().rstrip())
    numEq = int(ifile.readline().rstrip())
    for i in xrange(numTran):
        ifile.readline()
    for i in xrange(numEq):
        toks = ifile.readline().rstrip().split('\t')
        nt = int(toks[0])
        ##########################
        #parse barcode map
        #nt+1 gives number of barcodes ignoring for now
        barcodeData = toks[nt+2:-1]
        temp = {}
        for barcode, count in zip(barcodeData[::2], barcodeData[1::2]):
            if barcode not in temp:
                temp[barcode]=count
            else:
                print "error"
                print temp
                print barcode
                print toks
                sys.exit(0)
        ###########################
        eqClasses.append(temp)
barcodeFreq = defaultdict(int)
for eClass in eqClasses:
    for bcode, count in eClass.items():
        barcodeFreq[bcode] += int(count)



In [ ]:



In [ ]:



In [ ]:

    
#28th May
#Question does eq Barcodes and readbarcodes have difference?



In [ ]:

    
readDict = Counter(barcodes)



In [ ]:

    
len(barcodeFreq)



In [ ]:

    
len(readDict)



In [ ]:

    
readBcs = set(readDict.keys())
eqBcs = set(barcodeFreq.keys())



In [ ]:

    
eqOnly = eqBcs - readBcs
readOnly = readBcs - eqBcs



In [ ]:

    
len(eqOnly), len(readOnly)



In [ ]:



In [ ]:

    
count = []
for x in (readBcs-set(readOnly))-set(trBc):
    count.append(readDict[x])
print sum(count)



In [ ]:



In [ ]:

    
#Ans: Yes ~800k in readBarcode ~200k in eqBarcodes, ~600k thrown out with less than threshold density
#but there is a problem we are throwing 1M ambient reads but around 3-4M actual reads.



In [ ]:



In [10]:

    
freqCounts = np.array(barcodeFreq.values())
freqCounts.sort()
f=freqCounts[::-1]
WINDOW = [100, 5000]
LOCAL_WINDOW_LEN = 25

y=f[1:100001]

slopes, x = local_lin_fit(np.log10(y), window_len=LOCAL_WINDOW_LEN)

savgol = signal.savgol_filter(slopes, 251, 4)

threshold = int(np.argmax(savgol[WINDOW[0]:WINDOW[1]]) + WINDOW[0] + x[0])

print threshold, f[578]



In [17]:

    
trBc = [k for k,v in barcodeFreq.items() if v>2158]



In [3]:

    
import pickle
trBc = pickle.load( open( "../data/true.p", "rb" ) )



In [ ]:

    
#So, I clipped out true barcodes using knee method and looked for barcodes which are having more than 10 reads 
#and is max 1 edit-distance away from this true set and I got just 5710 barcodes out of 35345 passing this 
#criteria while overall barcodes were ~200k and 570 true.
#Overall with 1 edit-distance and more than 10 reads density barcode filter we are gaining 280k reads.



In [19]:

    
import distance

def min_edit_dist(bc, tr):
    for t in tr:
        dist = distance.levenshtein(bc,t, max_dist=1)
        if dist == 1 or dist == 0:#adjust accordingly
             return 1
    return 0
    
count = 0
pr_count = 0
tot_count = 0
read_count = 0
for k,v in barcodeFreq.items():
    pr_count += 1
    print ("\r count = "+str(pr_count)),
    if v > 5:
        tot_count += 1
        if min_edit_dist(k,trBc):
            count += 1
            read_count += v
        else:
            continue
print tot_count, count, read_count
#38529 7192 221452 for 7.389
#257882 17798 255030 for 0









    



 count = 53786                                                                                 





    



---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-19-8fbcca29c589> in <module>()
     17     if v > 5:
     18         tot_count += 1
---> 19         if min_edit_dist(k,trBc):
     20             count += 1
     21             read_count += v

<ipython-input-19-8fbcca29c589> in min_edit_dist(bc, tr)
      3 def min_edit_dist(bc, tr):
      4     for t in tr:
----> 5         dist = distance.levenshtein(bc,t, max_dist=1)
      6         if dist == 1 or dist == 0:#adjust accordingly
      7              return 1

/Users/avisrivastava/miniconda2/lib/python2.7/site-packages/distance/_levenshtein.pyc in levenshtein(seq1, seq2, max_dist, normalized)
     58                 for y in range(1, len2 + 1):
     59                         old = column[y]
---> 60                         cost = int(seq1[x - 1] != seq2[y - 1])
     61                         column[y] = min(column[y] + 1, column[y - 1] + 1, last + cost)
     62                         last = old

KeyboardInterrupt:



In [ ]:

    
plt.hist(np.log(barcodeFreq.values()), log=True)



In [ ]:

    
#Q: for a given barcode how many true barcodes is edit distance 1 from it.
import distance

def min_edit_dist(bc, tr):
    count = 0
    for t in tr:
        dist = distance.levenshtein(bc,t, max_dist=1)
        if dist == 1:#adjust accordingly
             count+=1
    return count
    
count = []
pr_count = 0
for k,v in barcodeFreq.items():
    pr_count += 1
    print ("\r count = "+str(pr_count)),
    if v > 7.38:
        count.append(min_edit_dist(k,trBc))
print Counter(count)
#Ans: Counter({0: 31337, 1: 7183, 2: 5, 3: 4}) with 7.38
#Ans: Counter({0: 240084, 1: 17784, 2: 10, 3: 4})with 0
#Ans: Counter({0: 69394, 1: 15208, 2: 10, 3: 4}) with 1



In [ ]:

    
#Q: get counts for CDF from before mapping but barcodes after mapping
hypoDict = {}
for k,v in readDict.items():
    if k in eqBcs:
        hypoDict[k] = v



In [ ]:

    
freqCounts = np.array(hypoDict.values())
freqCounts.sort()
f=freqCounts[::-1]
WINDOW = [100, 5000]
LOCAL_WINDOW_LEN = 25

y=f[1:100001]

slopes, x = local_lin_fit(np.log10(y), window_len=LOCAL_WINDOW_LEN)

savgol = signal.savgol_filter(slopes, 251, 4)

threshold = int(np.argmax(savgol[WINDOW[0]:WINDOW[1]]) + WINDOW[0] + x[0])

print threshold, f[threshold]



In [ ]:



In [ ]:

    
#Q: Probability that a randomly chosen barcode is 1edit distance from true BC
import random
import distance

def min_edit_dist(bc, tr):
    count = 0
    for t in tr:
        dist = distance.levenshtein(bc,t, max_dist=1)
        if dist == 1:#adjust accordingly
             count+=1
    return count
nt_map={1:'A', 2:'T', 3:'C', 4:'G'}
count = []
for i in range(10000):
    bc = ''.join([nt_map[random.randint(1, 4)] for x in range(12)])
    count.append(min_edit_dist(bc, trBc))
    print "\r"+str(i),

Counter(count)
#Ans:.0012



In [ ]:



In [ ]:

    
#Difference in barcodes freq before and after rapmap
new_trBc = [k for k,v in hypoDictDictpoDictpoDict.items() if v>1846]



In [ ]:

    
for x in set(new_trBc)-set(trBc):
    print x, hypoDict[x], barcodeFreq[x]



In [ ]:

    
len(trBc), len(new_trBc)



In [56]:

    
#Q: Verify the output of Alevin wrt normal parsing
data = pd.read_table('../data/test.dump', header=None).to_dict()[0]



In [3]:

    
new_nt_map = {0:'A', 1:'C', 2: 'T', 3:'G'}
rev_nt_map = {'A':0, 'C':1, 'T':2, 'G':3}
def getBC(num):
    bc = []
    if num > 16777215:
        print num 
        return False
    while (num>0):
        bc.append(new_nt_map[num%4])
        num = num/4
    while len(bc) < 12:
        bc.append('A')
    return ''.join(bc)[::-1]
def getIn(bc):
    index = 0
    for ind, c in enumerate(bc):
        index += (4**(12-ind-1))*rev_nt_map[c]
    return index



In [ ]:

    
newDict = {}
for k,v in data.items():
    if v!=0:
        newDict[getBC(k)] = v



In [ ]:

    
barcodeFreq =  Counter(barcodes)



In [ ]:

    
len(barcodeFreq), len(newDict)



In [ ]:

    
nonSet = set(barcodeFreq.values())-set(newDict.values())
#Ans: Yes It is working



In [7]:

    
#Q: Verify Z matrix; naive model with top 570 barcode selected /10000 is working
tr = pd.read_table(open('../data/true.dump'), header=None).to_dict()[0]
trBc = [getBC(x) for x in tr.values()]
# set([getBC(k) for k in data.values()]) - set(trBc)



In [54]:

    
def dataFromBarMap(name):
    data = pd.read_table(open('../data/test.dump'), header=None).to_dict()[0]
    barcodeMapAlevin = {}
    for bc, tr in data.items():
        if tr != 4294967295:
            barcodeMapAlevin[getBC(bc)] = getBC(tr)
    return barcodeMapAlevin
barcodeMapAlevin = dataFromBarMap(open('../data/test.dump'))



In [5]:

    
barcodeFreq = Counter(barcodes)
freqs = barcodeFreq.values()
freqs.sort(reverse=True)



In [ ]:



In [8]:

    
count = 0
barcodeMap = defaultdict(set)
for bc in trBc:
    barcodeMap[bc].add(bc)
    for ind,nt in enumerate(bc):
        if nt == 'A':
            newNts = 'CGT'
        elif nt == 'C':
            newNts = 'AGT'
        elif nt == 'G':
            newNts = 'ACT'
        elif nt == 'T':
            newNts = 'ACG'
        else:
            print "error"
            exit(0)
        for newNt in newNts:
            count += 1
            cp = bc[:ind] + newNt + bc[ind+1:]
            if barcodeFreq[cp] != 0 and cp != bc:
                barcodeMap[cp].add(bc)
        for newNt in 'ACGT':
            if ind!=len(bc)-1:
                count +=2
                cp = bc[:ind] + newNt + bc[ind:-1]
                if barcodeFreq[cp] > 0 and cp != bc:
                    barcodeMap[cp].add(bc)
                cp = bc[:ind] + bc[ind+1:] + newNt
                if barcodeFreq[cp] > 0 and cp != bc:
                    barcodeMap[cp].add(bc)
print count



In [18]:



In [55]:

    
def dataFromZmat(name):
    data = name.readlines()
    barcodeMapAlevin = {}
    for line in data:
        bc = int(line.strip().split('\t')[0])
        trs = [ int(x) for x in line.strip().split('\t')[1:] ]
        for x in trs:
            barcodeMapAlevin[getBC(bc)] = set([getBC(x) for x in trs])
    return barcodeMapAlevin
barcodeMapAlevin = dataFromZmat(open('../data/ZMat.dump'))



In [ ]:



In [ ]:



In [35]:

    
def bcSelection(bcs, t):
    num = []
    for b in bcs:
        for ind, i in enumerate(b):
            if i != t[ind]:
                if ind == len(t)-1 or b[ind+1] == t[ind+1]:
                    num.append(0)
                    break
                if i == t[ind+1]:
                    num.append(1)
                    break
                if b[ind+1] == t[ind]:
                    num.append(2)
                    break
    if len(num) != len(bcs):
        print "error"
    else:
        return bcs[np.argmin(num)]



In [36]:

    
newbarcodeMap = {}
count = 0
for k,v in barcodeMap.items():
    if k in trBc:
        newbarcodeMap[k] = k
        continue
    if barcodeFreq[k] > 1: 
        if len(v)>1:
            count += 1
            maxVal = 0
            Lv = list(v)
            newbarcodeMap[k] = bcSelection(Lv, k)
        else:
            newbarcodeMap[k] = list(v)[0]
print count



In [61]:

    
len(newbarcodeMap)









    Out[61]:





29055



In [56]:

    
match = 0
noMatch = 0
notIn = 0
for k,v in data.items():
    if v!= 4294967295:
        bc = getBC(k)
        tr = getBC(v)
        if bc in newbarcodeMap:
            if tr == newbarcodeMap[bc]:
                match += 1
            else:
#                 print bc, tr, newbarcodeMap[bc]
#                 break
                noMatch += 1
        else:
            notIn += 1



In [57]:

    
print match, noMatch, notIn



In [62]:

    
neg= set(newbarcodeMap.keys()) - set(barcodeMapAlevin.keys())



In [63]:

    
len(neg)









    Out[63]:





0



In [ ]:

    
#A: True Barcode Correct
#A1: No subs/indel correct
#A2: only subs correct 20520/570:36 19675 entries
#A3: all subs and indels 70680/570:(44+44+36) 29152 entries