In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from collections import Counter
import sys
from collections import defaultdict
from scipy import signal

In [2]:
eqFile="../data/eq_classes.txt"

In [3]:
#read eqClass files with barcodes
eqClasses = []
with open(eqFile) as ifile:
    numTran = int(ifile.readline().rstrip())
    numEq = int(ifile.readline().rstrip())
    for i in xrange(numTran):
        ifile.readline()
    for i in xrange(numEq):
        toks = ifile.readline().rstrip().split('\t')
        nt = int(toks[0])
        ##########################
        #parse barcode map
        #nt+1 gives number of barcodes ignoring for now
        barcodeData = toks[nt+2:-1]
        temp = {}
        for barcode, count in zip(barcodeData[::2], barcodeData[1::2]):
            if barcode not in temp:
                temp[barcode]=count
            else:
                print "error"
                print temp
                print barcode
                print toks
                sys.exit(0)
        ###########################
        eqClasses.append(temp)

In [6]:
len(eqClasses)


Out[6]:
162936

In [5]:
#barcode level map
barcodeFreq = defaultdict(int)
for eClass in eqClasses:
    for bcode, count in eClass.items():
        barcodeFreq[bcode] += int(count)

In [6]:
sum(barcodeFreq.values())


Out[6]:
4923897

In [18]:
len(barcodeFreq)


Out[18]:
261094

In [19]:
# with open('barcodes.txt', 'w') as f:
#      count = 0
#      for x in barcodeFreq.keys():
#          count += 1
#          f.write(x+","+str(count)+"\n")

In [20]:
freqCounts = np.array(barcodeFreq.values())
freqCounts.sort()
f=freqCounts[::-1]
# for i in range(len(f)):
#     f[i] += f[i-1]

In [ ]:


In [21]:
def local_lin_fit(y, window_len=10):
	from scipy.optimize import curve_fit
	num_windows = len(y) - window_len
	slopes = []
	x = []
	for window_start in range(0, num_windows):
		window_x = range(window_start, window_start + window_len)
		window_y = y[window_start : window_start + window_len]
		coeff, var_matrix = curve_fit(
			linear,
			window_x,
			window_y,
			p0=[window_y[-1] - window_y[0], window_y[0]])
		(slope, intercept) = coeff
		slopes.append(-slope)
		x.append(window_start + window_len / 2)
	return slopes, x

def linear(x, *p):
	(slope, intercept) = p
	return slope*x + intercept



WINDOW = [100, 5000]
LOCAL_WINDOW_LEN = 25

y=f[1:100001]

slopes, x = local_lin_fit(np.log10(y), window_len=LOCAL_WINDOW_LEN)

savgol = signal.savgol_filter(slopes, 251, 4)

threshold = int(np.argmax(savgol[WINDOW[0]:WINDOW[1]]) + WINDOW[0] + x[0])

print threshold


578

In [22]:
f[578]


Out[22]:
832

In [234]:
import operator
x = barcodeFreq
sorted_x = sorted(x.items(), key=operator.itemgetter(1), reverse=True)

In [237]:
corrBarcodes = []
for x,_ in sorted_x[1:579]:
    corrBarcodes.append(x)

In [238]:
len(corrBarcodes)


Out[238]:
578

In [252]:
from itertools import izip
import distance

def getMinHamDistance(bcode):
    dist=1000
    for tb in corrBarcodes:
        dist = min(dist, distance.hamming(tb, bcode))
    return dist

In [253]:
freqDist = []
for count, (k,v) in enumerate(barcodeFreq.items()):
    freqDist.append(getMinHamDistance(k))
    if count%1000 == 0:
        print "\r" + str(count),


261000                                                                                                                                                                                                                                                                  

lsh


In [6]:
from lshash import LSHash

In [16]:
lsh = LSHash(8, 12, 10)
ntMap = {'A':1, 'C':2, 'G':3, 'T':4}

In [17]:
freqs = barcodeFreq.values()
freqs.sort(reverse=True)
freqs[579]


Out[17]:
814

In [18]:
for k,v in barcodeFreq.items():
    if v >= 814:
        lsh.index([ntMap[x] for x in k])

In [ ]:
dist = []
countN = 0;
count=1
for k,v in barcodeFreq.items():
    count += 1
    if count%1000 == 0:
        print "\r" + str(count),
    if 'N' in k:
        countN += 1
        continue
    if v < 814:
        try:
            query = lsh.query([ntMap[x] for x in k], 1)
            dist.append(query[0][1])
        except:
            break
        
print countN


261000                                                                                                                                                                                                                                                                   3212

In [32]:
dist = []
countN = 0;
count=1
for k,v in barcodeFreq.items():
    count += 1
    if count%1000 == 0:
        print "\r" + str(count),
    if 'N' in k:
        countN += 1
        continue
    if v < 814:
        try:
            query = lsh.query([ntMap[x] for x in k], 1, "hamming")
            print query
            break
            dist.append(query[0][1])
        except:
            break

print countN


[((1, 1, 2, 2, 4, 3, 4, 1, 3, 3, 4, 3), 8)]
0

In [31]:
back=dist

In [30]:
plt.hist(dist)


Out[30]:
(array([  1.60650000e+04,   3.26020000e+04,   3.94980000e+04,
          9.57130000e+04,   3.61190000e+04,   3.17780000e+04,
          5.03200000e+03,   3.76000000e+02,   1.14000000e+02,
          5.00000000e+00]),
 array([  1. ,   2.6,   4.2,   5.8,   7.4,   9. ,  10.6,  12.2,  13.8,
         15.4,  17. ]),
 <a list of 10 Patch objects>)
<matplotlib.figure.Figure at 0x7f01fa9b52d0>

In [ ]: