In [1]:
import pandas as pd
import numpy as np
import matplotlib as mpl
%matplotlib inline
import matplotlib.pyplot as plt
from collections import Counter
from collections import defaultdict
from sklearn.neighbors.kde import KernelDensity
from scipy.stats.kde import gaussian_kde
from scipy import signal
from scipy.ndimage.filters import correlate1d
In [2]:
barcodes = open('../data/barcodes.txt').readlines()
barcodes = [x.strip() for x in barcodes if 'N' not in x and 'n' not in x]
temp = barcodes[:]
barcodeFreq = Counter(barcodes)
In [3]:
# barcodes = temp[:]
# #bootstrapping
# # bins=400
# d = []
# numberOfBoootstrapsamples = 1000
# for i in range(numberOfBoootstrapsamples):
# bc = barcodes.pop()
# d.append(bc)
In [4]:
# len(barcodes), len(temp)
In [29]:
#knee detection
def local_lin_fit(y, window_len=10):
from scipy.optimize import curve_fit
num_windows = len(y) - window_len
slopes = []
x = []
for window_start in range(0, num_windows):
window_x = np.array(range(window_start, window_start + window_len))
window_y = np.array(y[window_start : window_start + window_len])
n = len(window_x)
s_x = sum(window_x)
s_y = sum(window_y)
s_xx = sum(window_x * window_x)
s_xy = sum(window_x * window_y)
slopes.append(-((n * s_xy - s_x * s_y) / (n * s_xx - s_x * s_x)))
x.append(window_start + window_len / 2)
return slopes, x
# letter_counts = Counter(barcodes)
# freqCounts = np.array(letter_counts.values())
# freqCounts.sort()
# f=freqCounts[::-1]
# WINDOW = [100, 5000]
# LOCAL_WINDOW_LEN = 25
y=test
slopes, x = local_lin_fit(y, window_len=LOCAL_WINDOW_LEN)
savCoeff = signal.savgol_coeffs(251, 4)
savgol = []
for ll in range(len(slopes)-251):
savgol.append(sum(slopes[ll:ll+251] * savCoeff))
threshold = int(WINDOW[0] + x[0] + 125)
diff = []
ind = 0
history = savgol[WINDOW[0]]
for x in savgol[WINDOW[0]+1:WINDOW[1]]:
diff.append(abs(history-x))
history = x
for i in range(0,len(diff)-9):
l = diff[i:i+10]
flag = False
for x in l:
if x > 18:
flag = True
break
if not flag:
ind = i-1
break
threshold += ind
print threshold
In [15]:
plt.plot(test)
Out[15]:
In [ ]:
#phew!!! So the problem is numpy and scipy handles the edges in convolution diffferently so we simplt ignore the edges.
In [ ]:
In [15]:
f[566]
Out[15]:
In [4]:
#read eqClass files with barcodes
eqFile="../data/eq_classes.txt"
eqClasses = []
with open(eqFile) as ifile:
numTran = int(ifile.readline().rstrip())
numEq = int(ifile.readline().rstrip())
for i in xrange(numTran):
ifile.readline()
for i in xrange(numEq):
toks = ifile.readline().rstrip().split('\t')
nt = int(toks[0])
##########################
#parse barcode map
#nt+1 gives number of barcodes ignoring for now
barcodeData = toks[nt+2:-1]
temp = {}
for barcode, count in zip(barcodeData[::2], barcodeData[1::2]):
if barcode not in temp:
temp[barcode]=count
else:
print "error"
print temp
print barcode
print toks
sys.exit(0)
###########################
eqClasses.append(temp)
barcodeFreq = defaultdict(int)
for eClass in eqClasses:
for bcode, count in eClass.items():
barcodeFreq[bcode] += int(count)
In [ ]:
In [ ]:
In [ ]:
#28th May
#Question does eq Barcodes and readbarcodes have difference?
In [ ]:
readDict = Counter(barcodes)
In [ ]:
len(barcodeFreq)
In [ ]:
len(readDict)
In [ ]:
readBcs = set(readDict.keys())
eqBcs = set(barcodeFreq.keys())
In [ ]:
eqOnly = eqBcs - readBcs
readOnly = readBcs - eqBcs
In [ ]:
len(eqOnly), len(readOnly)
In [ ]:
In [ ]:
count = []
for x in (readBcs-set(readOnly))-set(trBc):
count.append(readDict[x])
print sum(count)
In [ ]:
In [ ]:
#Ans: Yes ~800k in readBarcode ~200k in eqBarcodes, ~600k thrown out with less than threshold density
#but there is a problem we are throwing 1M ambient reads but around 3-4M actual reads.
In [ ]:
In [10]:
freqCounts = np.array(barcodeFreq.values())
freqCounts.sort()
f=freqCounts[::-1]
WINDOW = [100, 5000]
LOCAL_WINDOW_LEN = 25
y=f[1:100001]
slopes, x = local_lin_fit(np.log10(y), window_len=LOCAL_WINDOW_LEN)
savgol = signal.savgol_filter(slopes, 251, 4)
threshold = int(np.argmax(savgol[WINDOW[0]:WINDOW[1]]) + WINDOW[0] + x[0])
print threshold, f[578]
In [17]:
trBc = [k for k,v in barcodeFreq.items() if v>2158]
In [3]:
import pickle
trBc = pickle.load( open( "../data/true.p", "rb" ) )
In [ ]:
#So, I clipped out true barcodes using knee method and looked for barcodes which are having more than 10 reads
#and is max 1 edit-distance away from this true set and I got just 5710 barcodes out of 35345 passing this
#criteria while overall barcodes were ~200k and 570 true.
#Overall with 1 edit-distance and more than 10 reads density barcode filter we are gaining 280k reads.
In [19]:
import distance
def min_edit_dist(bc, tr):
for t in tr:
dist = distance.levenshtein(bc,t, max_dist=1)
if dist == 1 or dist == 0:#adjust accordingly
return 1
return 0
count = 0
pr_count = 0
tot_count = 0
read_count = 0
for k,v in barcodeFreq.items():
pr_count += 1
print ("\r count = "+str(pr_count)),
if v > 5:
tot_count += 1
if min_edit_dist(k,trBc):
count += 1
read_count += v
else:
continue
print tot_count, count, read_count
#38529 7192 221452 for 7.389
#257882 17798 255030 for 0
In [ ]:
plt.hist(np.log(barcodeFreq.values()), log=True)
In [ ]:
#Q: for a given barcode how many true barcodes is edit distance 1 from it.
import distance
def min_edit_dist(bc, tr):
count = 0
for t in tr:
dist = distance.levenshtein(bc,t, max_dist=1)
if dist == 1:#adjust accordingly
count+=1
return count
count = []
pr_count = 0
for k,v in barcodeFreq.items():
pr_count += 1
print ("\r count = "+str(pr_count)),
if v > 7.38:
count.append(min_edit_dist(k,trBc))
print Counter(count)
#Ans: Counter({0: 31337, 1: 7183, 2: 5, 3: 4}) with 7.38
#Ans: Counter({0: 240084, 1: 17784, 2: 10, 3: 4})with 0
#Ans: Counter({0: 69394, 1: 15208, 2: 10, 3: 4}) with 1
In [ ]:
#Q: get counts for CDF from before mapping but barcodes after mapping
hypoDict = {}
for k,v in readDict.items():
if k in eqBcs:
hypoDict[k] = v
In [ ]:
freqCounts = np.array(hypoDict.values())
freqCounts.sort()
f=freqCounts[::-1]
WINDOW = [100, 5000]
LOCAL_WINDOW_LEN = 25
y=f[1:100001]
slopes, x = local_lin_fit(np.log10(y), window_len=LOCAL_WINDOW_LEN)
savgol = signal.savgol_filter(slopes, 251, 4)
threshold = int(np.argmax(savgol[WINDOW[0]:WINDOW[1]]) + WINDOW[0] + x[0])
print threshold, f[threshold]
In [ ]:
In [ ]:
#Q: Probability that a randomly chosen barcode is 1edit distance from true BC
import random
import distance
def min_edit_dist(bc, tr):
count = 0
for t in tr:
dist = distance.levenshtein(bc,t, max_dist=1)
if dist == 1:#adjust accordingly
count+=1
return count
nt_map={1:'A', 2:'T', 3:'C', 4:'G'}
count = []
for i in range(10000):
bc = ''.join([nt_map[random.randint(1, 4)] for x in range(12)])
count.append(min_edit_dist(bc, trBc))
print "\r"+str(i),
Counter(count)
#Ans:.0012
In [ ]:
In [ ]:
#Difference in barcodes freq before and after rapmap
new_trBc = [k for k,v in hypoDictDictpoDictpoDict.items() if v>1846]
In [ ]:
for x in set(new_trBc)-set(trBc):
print x, hypoDict[x], barcodeFreq[x]
In [ ]:
len(trBc), len(new_trBc)
In [56]:
#Q: Verify the output of Alevin wrt normal parsing
data = pd.read_table('../data/test.dump', header=None).to_dict()[0]
In [3]:
new_nt_map = {0:'A', 1:'C', 2: 'T', 3:'G'}
rev_nt_map = {'A':0, 'C':1, 'T':2, 'G':3}
def getBC(num):
bc = []
if num > 16777215:
print num
return False
while (num>0):
bc.append(new_nt_map[num%4])
num = num/4
while len(bc) < 12:
bc.append('A')
return ''.join(bc)[::-1]
def getIn(bc):
index = 0
for ind, c in enumerate(bc):
index += (4**(12-ind-1))*rev_nt_map[c]
return index
In [ ]:
newDict = {}
for k,v in data.items():
if v!=0:
newDict[getBC(k)] = v
In [ ]:
barcodeFreq = Counter(barcodes)
In [ ]:
len(barcodeFreq), len(newDict)
In [ ]:
nonSet = set(barcodeFreq.values())-set(newDict.values())
#Ans: Yes It is working
In [7]:
#Q: Verify Z matrix; naive model with top 570 barcode selected /10000 is working
tr = pd.read_table(open('../data/true.dump'), header=None).to_dict()[0]
trBc = [getBC(x) for x in tr.values()]
# set([getBC(k) for k in data.values()]) - set(trBc)
In [54]:
def dataFromBarMap(name):
data = pd.read_table(open('../data/test.dump'), header=None).to_dict()[0]
barcodeMapAlevin = {}
for bc, tr in data.items():
if tr != 4294967295:
barcodeMapAlevin[getBC(bc)] = getBC(tr)
return barcodeMapAlevin
barcodeMapAlevin = dataFromBarMap(open('../data/test.dump'))
In [5]:
barcodeFreq = Counter(barcodes)
freqs = barcodeFreq.values()
freqs.sort(reverse=True)
In [ ]:
In [8]:
count = 0
barcodeMap = defaultdict(set)
for bc in trBc:
barcodeMap[bc].add(bc)
for ind,nt in enumerate(bc):
if nt == 'A':
newNts = 'CGT'
elif nt == 'C':
newNts = 'AGT'
elif nt == 'G':
newNts = 'ACT'
elif nt == 'T':
newNts = 'ACG'
else:
print "error"
exit(0)
for newNt in newNts:
count += 1
cp = bc[:ind] + newNt + bc[ind+1:]
if barcodeFreq[cp] != 0 and cp != bc:
barcodeMap[cp].add(bc)
for newNt in 'ACGT':
if ind!=len(bc)-1:
count +=2
cp = bc[:ind] + newNt + bc[ind:-1]
if barcodeFreq[cp] > 0 and cp != bc:
barcodeMap[cp].add(bc)
cp = bc[:ind] + bc[ind+1:] + newNt
if barcodeFreq[cp] > 0 and cp != bc:
barcodeMap[cp].add(bc)
print count
In [18]:
In [55]:
def dataFromZmat(name):
data = name.readlines()
barcodeMapAlevin = {}
for line in data:
bc = int(line.strip().split('\t')[0])
trs = [ int(x) for x in line.strip().split('\t')[1:] ]
for x in trs:
barcodeMapAlevin[getBC(bc)] = set([getBC(x) for x in trs])
return barcodeMapAlevin
barcodeMapAlevin = dataFromZmat(open('../data/ZMat.dump'))
In [ ]:
In [ ]:
In [35]:
def bcSelection(bcs, t):
num = []
for b in bcs:
for ind, i in enumerate(b):
if i != t[ind]:
if ind == len(t)-1 or b[ind+1] == t[ind+1]:
num.append(0)
break
if i == t[ind+1]:
num.append(1)
break
if b[ind+1] == t[ind]:
num.append(2)
break
if len(num) != len(bcs):
print "error"
else:
return bcs[np.argmin(num)]
In [36]:
newbarcodeMap = {}
count = 0
for k,v in barcodeMap.items():
if k in trBc:
newbarcodeMap[k] = k
continue
if barcodeFreq[k] > 1:
if len(v)>1:
count += 1
maxVal = 0
Lv = list(v)
newbarcodeMap[k] = bcSelection(Lv, k)
else:
newbarcodeMap[k] = list(v)[0]
print count
In [61]:
len(newbarcodeMap)
Out[61]:
In [56]:
match = 0
noMatch = 0
notIn = 0
for k,v in data.items():
if v!= 4294967295:
bc = getBC(k)
tr = getBC(v)
if bc in newbarcodeMap:
if tr == newbarcodeMap[bc]:
match += 1
else:
# print bc, tr, newbarcodeMap[bc]
# break
noMatch += 1
else:
notIn += 1
In [57]:
print match, noMatch, notIn
In [62]:
neg= set(newbarcodeMap.keys()) - set(barcodeMapAlevin.keys())
In [63]:
len(neg)
Out[63]:
In [ ]:
#A: True Barcode Correct
#A1: No subs/indel correct
#A2: only subs correct 20520/570:36 19675 entries
#A3: all subs and indels 70680/570:(44+44+36) 29152 entries