In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from collections import Counter
import sys
from collections import defaultdict
from scipy import signal
In [2]:
eqFile="../data/eq_classes.txt"
In [3]:
#read eqClass files with barcodes
eqClasses = []
with open(eqFile) as ifile:
numTran = int(ifile.readline().rstrip())
numEq = int(ifile.readline().rstrip())
for i in xrange(numTran):
ifile.readline()
for i in xrange(numEq):
toks = ifile.readline().rstrip().split('\t')
nt = int(toks[0])
##########################
#parse barcode map
#nt+1 gives number of barcodes ignoring for now
barcodeData = toks[nt+2:-1]
temp = {}
for barcode, count in zip(barcodeData[::2], barcodeData[1::2]):
if barcode not in temp:
temp[barcode]=count
else:
print "error"
print temp
print barcode
print toks
sys.exit(0)
###########################
eqClasses.append(temp)
In [6]:
len(eqClasses)
Out[6]:
In [5]:
#barcode level map
barcodeFreq = defaultdict(int)
for eClass in eqClasses:
for bcode, count in eClass.items():
barcodeFreq[bcode] += int(count)
In [6]:
sum(barcodeFreq.values())
Out[6]:
In [18]:
len(barcodeFreq)
Out[18]:
In [19]:
# with open('barcodes.txt', 'w') as f:
# count = 0
# for x in barcodeFreq.keys():
# count += 1
# f.write(x+","+str(count)+"\n")
In [20]:
freqCounts = np.array(barcodeFreq.values())
freqCounts.sort()
f=freqCounts[::-1]
# for i in range(len(f)):
# f[i] += f[i-1]
In [ ]:
In [21]:
def local_lin_fit(y, window_len=10):
from scipy.optimize import curve_fit
num_windows = len(y) - window_len
slopes = []
x = []
for window_start in range(0, num_windows):
window_x = range(window_start, window_start + window_len)
window_y = y[window_start : window_start + window_len]
coeff, var_matrix = curve_fit(
linear,
window_x,
window_y,
p0=[window_y[-1] - window_y[0], window_y[0]])
(slope, intercept) = coeff
slopes.append(-slope)
x.append(window_start + window_len / 2)
return slopes, x
def linear(x, *p):
(slope, intercept) = p
return slope*x + intercept
WINDOW = [100, 5000]
LOCAL_WINDOW_LEN = 25
y=f[1:100001]
slopes, x = local_lin_fit(np.log10(y), window_len=LOCAL_WINDOW_LEN)
savgol = signal.savgol_filter(slopes, 251, 4)
threshold = int(np.argmax(savgol[WINDOW[0]:WINDOW[1]]) + WINDOW[0] + x[0])
print threshold
In [22]:
f[578]
Out[22]:
In [234]:
import operator
x = barcodeFreq
sorted_x = sorted(x.items(), key=operator.itemgetter(1), reverse=True)
In [237]:
corrBarcodes = []
for x,_ in sorted_x[1:579]:
corrBarcodes.append(x)
In [238]:
len(corrBarcodes)
Out[238]:
In [252]:
from itertools import izip
import distance
def getMinHamDistance(bcode):
dist=1000
for tb in corrBarcodes:
dist = min(dist, distance.hamming(tb, bcode))
return dist
In [253]:
freqDist = []
for count, (k,v) in enumerate(barcodeFreq.items()):
freqDist.append(getMinHamDistance(k))
if count%1000 == 0:
print "\r" + str(count),
In [6]:
from lshash import LSHash
In [16]:
lsh = LSHash(8, 12, 10)
ntMap = {'A':1, 'C':2, 'G':3, 'T':4}
In [17]:
freqs = barcodeFreq.values()
freqs.sort(reverse=True)
freqs[579]
Out[17]:
In [18]:
for k,v in barcodeFreq.items():
if v >= 814:
lsh.index([ntMap[x] for x in k])
In [ ]:
dist = []
countN = 0;
count=1
for k,v in barcodeFreq.items():
count += 1
if count%1000 == 0:
print "\r" + str(count),
if 'N' in k:
countN += 1
continue
if v < 814:
try:
query = lsh.query([ntMap[x] for x in k], 1)
dist.append(query[0][1])
except:
break
print countN
In [32]:
dist = []
countN = 0;
count=1
for k,v in barcodeFreq.items():
count += 1
if count%1000 == 0:
print "\r" + str(count),
if 'N' in k:
countN += 1
continue
if v < 814:
try:
query = lsh.query([ntMap[x] for x in k], 1, "hamming")
print query
break
dist.append(query[0][1])
except:
break
print countN
In [31]:
back=dist
In [30]:
plt.hist(dist)
Out[30]:
In [ ]: