In [1]:
import pandas as pd
import numpy as np
import matplotlib as mpl
%matplotlib inline
import matplotlib.pyplot as plt
from collections import Counter
from collections import defaultdict
import pysam
from pyfasta import Fasta

In [3]:
new_nt_map = {0:'A', 1:'C', 2: 'T', 3:'G'}
rev_nt_map = {'A':0, 'C':1, 'T':2, 'G':3}
def getBC(num):
    bc = []
    if num > 16777215:
        print num 
        return False
    while (num>0):
        bc.append(new_nt_map[num%4])
        num = num/4
    while len(bc) < 12:
        bc.append('A')
    return ''.join(bc)[::-1]
def getIn(bc):
    index = 0
    for ind, c in enumerate(bc):
        index += (4**(12-ind-1))*rev_nt_map[c]
    return index

In [30]:
tr = pd.read_table(open('../data/true.dump'), header=None).to_dict()[0]
trBc = tr.values()

In [40]:
#read eqClass files with barcodes
eqFile="../data/eq_classes.txt"
barcodeCount = defaultdict(int)
with open(eqFile) as ifile:
    numTran = int(ifile.readline().rstrip())
    numEq = int(ifile.readline().rstrip())
    for i in xrange(numTran):
        ifile.readline()
    for i in xrange(numEq):
        toks = ifile.readline().rstrip().split('\t')
        nt = int(toks[0])
        ##########################
        #parse barcode map
        #nt+1 gives number of barcodes ignoring for now
        nb = toks[nt+1]
        nr = int(toks[-1])
        verify = 0
        for i in range(nt+2, len(toks)-1, 2):
            bc = int(toks[i].split(',')[0].lstrip('('))
            count = int(toks[i+1])
            barcodeCount[bc] += int(count)
            verify += count
        if verify != nr:
            print "error"
            break

In [41]:
tCount = 0
ntCount = 0
for k,v in barcodeCount.items():
    if k in trBc:
        tCount += barcodeCount[k]
    else:
        ntCount += barcodeCount[k]

In [39]:
#before using our knee correction
# #reads from true txp and #reads from non true
print tCount, ntCount


3426976 1488761

In [42]:
#After using our knee correction
# #reads from true txp and #reads from non true
print tCount, ntCount


3702880 1212857

In [43]:
#Total Reads remapped
1488761 - 1212857


Out[43]:
275904

In [44]:
#Q: Running  Star in txptome and genome what are the differences

In [3]:
tx = set(pd.read_table(open('../data/txReads.txt'), header=None)[0].tolist())
ge = set(pd.read_table(open('../data/geReads.txt'), header=None)[0].tolist())

In [4]:
len(tx) , len(ge), len(tx-ge), len(ge-tx)


Out[4]:
(6582636, 7196940, 307922, 922226)

In [5]:
onlyGe = ge-tx
onlyTx = tx-ge

In [90]:
#Q: onlyTx testing, first hypo
ref = Fasta("../data/ref.fa")
txSam = pysam.AlignmentFile("../data/txp.sam", "r")
lens = txSam.lengths

In [91]:
txPairs = defaultdict(list)
for read in txSam:
    if read.qname in onlyTx:
        txPairs[read.qname].append((read.pos, lens[read.rname]))

In [100]:
len(txPairs), sum([len(x) for x in txPairs.values()])
#lens of unique reads are consistent: check


Out[100]:
(307922, 1024397)

In [101]:
#get min of the distance from 5' and 3'
dis = []
for _,pair in txPairs.items():
    minDist = 200000
    for (k,v) in pair:
        minDist = min(v-k, k, minDist)
    dis.append(minDist)

In [102]:
plt.hist(dis, log=True)


Out[102]:
(array([  3.07474000e+05,   1.83000000e+02,   3.60000000e+01,
          3.80000000e+01,   3.20000000e+01,   3.00000000e+00,
          1.44000000e+02,   0.00000000e+00,   6.00000000e+00,
          6.00000000e+00]),
 array([     0. ,   5736.7,  11473.4,  17210.1,  22946.8,  28683.5,
         34420.2,  40156.9,  45893.6,  51630.3,  57367. ]),
 <a list of 10 Patch objects>)

In [103]:
#get min of the distance from 5' and 3'
dis = []
for _,pair in txPairs.items():
    minDist = 200000
    for (k,v) in pair:
        minDist = min(v-k, k, minDist)
    if minDist < 1000:
        dis.append(minDist)

In [104]:
plt.hist(dis, log=True)


Out[104]:
(array([ 76595.,  85656.,  45750.,  21937.,  13899.,   8168.,   7465.,
          6169.,   4214.,   3773.]),
 array([   0. ,   99.9,  199.8,  299.7,  399.6,  499.5,  599.4,  699.3,
         799.2,  899.1,  999. ]),
 <a list of 10 Patch objects>)

In [ ]:
#Ans: Significant number of reads are getting map very distant from either end

In [105]:


In [9]:
#Q: onlyGe testing, second hypo
ref = Fasta("../data/genome.fa")
geSam = pysam.AlignmentFile("../data/gene.sam", "r")
lens = geSam.lengths
wiSam = pysam.AlignmentFile("../data/onlyGene.sam", "w", template=geSam)

In [ ]:


In [10]:
gePairs = []
for read in geSam.fetch():
    if read.qname in onlyGe:
        wiSam.write(read)
#             print read.qname
#         gePairs.append((read.pos, read.reference_name))

In [54]:
len(gePairs)#, sum([len(x) for x in gePairs.values()])
#lens of unique reads are consistent: check


Out[54]:
1531201

In [3]:
import pickle
# pickle.dump(gePairs, open( "../data/gePairs.p", "wb" ) )
gePairs = pickle.load(open( "../data/gePairs.p", "rb" ) )

In [52]:
gtf = defaultdict(lambda : defaultdict(list))
with open('../data/mohu.gtf') as f:
    for line in f:
        if line[0] != '#':
            toks = line.strip().split('\t')
            gene = toks[0]
            feature = toks[2]
            start = toks[3]
            end = toks[4]
            gtf[gene][feature].append((start,end, toks[-1]))

In [83]:
it = 0
counts = defaultdict(int)
for p,r in gePairs:
    it += 1
    for feature in gtf[r]:
        for (start, stop) in gtf[r][feature]:
            if start < p and p < stop:
                counts[feature] += 1
    print ("\r count = "+str(it)),
print counts


defaultdict(<type 'int'>, {})

In [110]:
for feat in gtf['mchr16'].keys():
    for s,e,x in gtf['mchr16'][feat]:
        if int(s)<30570147 and int(e)>30570147:
            print (feat, s,e, 30570147-int(s), int(e)-int(s))


('gene', '30560494', '30587592', 9653, 27098)
('transcript', '30560494', '30587592', 9653, 27098)
('transcript', '30561371', '30587589', 8776, 26218)
('transcript', '30564678', '30587563', 5469, 22885)
('transcript', '30568013', '30587592', 2134, 19579)

In [93]:
t1="GGAGAAACCGGAAGCCTCGCGTGCTGAGCAGTGGTTTGTGCTGTTGATGCGTGGTTATGGGTCGGAGACGAGCGCCGGGTGGTGGGTCCCTGGGCCGAGTACTTATCCGGCAGCAGACTCAGCGAAGCCGCAGCCACCGTCATACTGACTCCTGGCTGCATACAAGTGAACTCAATGATGGCTATGACTGGGGTCGTCTGAATCTTCAGTCTGTGACAGAGCAGAGCTCCCTGGAGGACTTTCTTGCAACTGCAGAACTTGCAGGGACCGAGTTTGTGGCTGAGAAGCTGAATATTAAGTTTGTACCCCCTGAGGCTAGAACCGGACTGCTGTCTTTTGAGGAGAGCCAGAGAATTAAGAAACTCCACGAAGAAAACAGACAGTTCTTGTGTATACCAAGGAGGCCAAATTGGGACAGAAAAACTAGCCCAGAAGAACTGAAACAAGCAGAGAAAGATAACTTCCTGAAATGGAGGCGCCAGCTTGTGCGGCTAGAAGAGGAACAGAAGTTGATATTGACTCCCTTTGAACGGAACTTGGACTTCTGGCGCCAGCTCTGGAGGGTCATTGAGAGAAGTGATATTGTGGTCCAGATCGTAGATGCTCGAAACCCGCTCCTGTTTAGATGTGAGGATTTGGAGTGTTACGTGAAAGAGATCGATGCTGCCAAGGAGAACGTGATCCTGATAAACAAGGCAGACTTGCTGACTGCTGAGCAGCGGTTTGCCTGGGCTGTGCACTTTGAAAAGGAAGGGGTAAAGGTTATCTTCTGGTCAGCTTTGGCAGAAACTGATCACCTAAATGGTGACTTGAAGGAGGAAGTAGACAGTGTTGCTGGAGACACCAACAAAACAGAGTCTGAAAGCTCCAGTTTGGATGCGAATGAAATCCCACACAGAGACTTGATCTCGCTTAGTGAGGAGTCCGCAAGTGACAGTGGTGACAGCAAATATGAGGACTGTCAGGAGGACGAAGAGGAAGATTGGCAGACGTGTTCAGAAGAAGACAGCGTCCCTGAGGAGGAGGAGGGCTGTAATGCGGATTCTGAGACTCAGAACAGAAAAAACGCAGAAAATCAACAAGTGAACAATGATAGCTACCTGGTGTCGAAACAGGAATTACTGGAACTCTTTAAGAAACTCCACACTGGAAAGAAAGTGAAAGATGGACAGCTAACTGTTGGGCTGGTGGGCTACCCTAATGTTGGTAAGAGCTCAACCATCAACACCATCATGGGCAACAAGAAAGTATCGGTGTCTGCCACGCCAGGCCACACCAAGCATTTCCAGACCCTGTACGTGGAGCCTGGCCTCTGCCTGTGTGACTGCCCGGGCCTGGTGATGCCATCTTTCGTGTCCACCAAAGCAGAGATGATTTGCAATGGGATCCTCCCTATCGACCAGATGAGAGACCACGTTCCACCCGTATCACTAGTTTGCCAGAATATCCCACGGCGAGTTTTGGAAGTGACATATGGTATTAACATTATAAAGCCTAGAGAAGATGAAGACCCCTACCGACCTCCAACCTCAGAAGAACTGTTGACGGCTTATGGATGCATGAGAGGATTCATGACAGCACATGGGCAGCCAGACCAGCCTCGATCTGCACGGTACATCCTAAAGGACTATGTTGGAGGTAAACTACTGTATTGCCACCCTCCTCCTGGAAAAGACCCTGTGGCGTTCCAGCACCAACACCAGCAGCTCCTAGAGAGCAAAGTAAAAGGTGGAGAACTGAGACTTCAGCCGGGCAAAGGCCGAAAAGCCAAACAGATCGAAAATGTAGTCGACAAAACTTTTTTCCATCAGGAGAACGTCAGAGCACTGACCAAAGGTGTCCAGGCTGTCATGGGTTACAAGCCCGGGCATGGCCTGGTGACTGCGGCTGCTGCGAGCGCTGAGAATGTACCTGGGAAGCCCTGGAAAAAACACGGTAACAGAAACAAAAAAGAGAAAAGCCGTAGGCTCTACAAGCACCTGGATGTGTGAGCTCGGCTGAACAGAGACGGCAGGCAGCTGCGCTGTGCGGGAGGACACGAGCTCAAACCGCCACTGCTGCTGTCGCTGCCGTGCTGCTCTCCTGACCACTGCACTGTGGACTGGACCCTGCTCTCCTGGAGAGCACAGTCGCACCAGAAGTCTTGACATCAAGACAGCCTTTCGGAAGCACCTGCTGTGGCCGTGGTAGTCATGGGGTGCCCCAGAAGTCTCATCCTGCAGAAGGATGGTGGATATGTGTACAAGCATACAAATGCACATTGACTTTGAGGTTTTGGGAAGTTATTAAAATTAATTTTCCACTAGTTGATATTCTTATTGTCTTAAGTAGGCAAGCTGCCATTTATACATTTAATGTTGACCCTCTAGGAAGGGTCTGGAGATGGCTCAGCAGTTAGGAGCATTGGTTGCCCTTTCAGAGGACCCCAGTGTGGTTCCTAGCACCTACATGGCAGCTCATAGCCATCGATAACTCCAGTGTTAGGGCATCTAATGACTTCTTCTGGCTCTGATGGGTACCAGACACACAAATGGTACGCAGACATTCATGCAGGCAAAACACCCATACACATAAACAGCACACATTCACTCAGGACTCCAGTCTCCCTGGGCTTCCTCGGCTCCCTAGAGGTGACTGTGAGAGGTGGTTGAACGAGAATGGCCTGCAGAGGGCTTATGGATTTGCACACTTGGTTCCACCTCTGCCCCCTTTTAAACATAAAGTTTTAAAAAAGGAAATACTAAAAATAAACTATGACTAGGACTTGAATTAATTTCCAGACACATATCTAACGTATCCCTGATATGGCAGAGGGCTAGCAGGGGTGGTTTGGTAAGAGCTGGGCTGGACTGGTGAGACCATTGAGTGAAAGCTACTGTCACACAAACCTGCTGACCTTAGTCTGGTCTCTGGAGTCTACCAAAGCGGGAGGACTCTGACCAGAGCATTCAAGCGCACACTAATTACAGAATGGGACACTCCAGAAAGCATCACCCCATGGGGGAAGCACTTGCTCAAACCCATAGAAACGAAAAAAGAGCCAGGTCAGCTGCTAGGCTTTATTTCAGACATGAATTTTTGATGTTATAGATGTTTGGGGGTAAAGTTCTACAGAAAAATACGCATTTCTGCAAACAGTTTTGTAAACTAGAATTCATTAAAGCCTCTGTCGCAACCATG"
t2="GAAACCGGAAGCCTCGCGTGCTGAGCAGTGGTTTGTGCTGTTGATGCGTGGTTATGGGTCGGAGACGAGCGCCGGGTGGTGGGTCCCTGGGCCGAGTACTTATCCGGCAGCAGACTCAGCGAAGCCGCAGCCACCGTCATACTGACTCCTGGAGAAGCTGAATATTAAGTTTGTACCCCCTGAGGCTAGAACCGGACTGCTGTCTTTTGAGGAGAGCCAGAGAATTAAGAAACTCCACGAAGAAAACAGACAGTTCTTGTGTATACCAAGGAGGCCAAATTGGGACAGAAAAACTAGCCCAGAAGAACTGAAACAAGCAGAGAAAGATAACTTCCTGAAATGGAGGCGCCAGCTTGTGCGGCTAGAAGAGGAACAGAAGTTGATATTGACTCCCTTTGAACGGAACTTGGACTTCTGGCGCCAGCTCTGGAGGGTCATTGAGAGAAGATGGCCTGGAACTCAACTCTAGACCAGCCTGGACCTGAAGTCACAGAGATCTGCTGCTTCAGTCTCCCAAGTACTGGGATTAAAGACCTGTGCCAACAATGATATTGTGGTCCAGATCGTAGATGCTCGAAACCCGCTCCTGTTTAGATGTGAGGATTTGGAGTGTTACGTGAAAGAGATCGATGCTGCCAAGGAGAACGTGATCCTGATAAACAAGGCAGACTTGCTGACTGCTGAGCAGCGGTTTGCCTGGGCTGTGCACTTTGAAAAGGAAGGGGTAAAGGTTATCTTCTGGTCAGCTTTGGCAGAAACTGATCACCTAAATGGTGACTTGAAGGAGGAAGTAGACAGTGTTGCTGGAGACACCAACAAAACAGAGTCTGAAAGCTCCAGTTTGGATGCGAATGAAATCCCACACAGAGACTTGATCTCGCTTAGTGAGGAGTCCGCAAGTGACAGTGGTGACAGCAAATATGAGGACTGTCAGGAGGACGAAGAGGAAGATTGGCAGACGTGTTCAGAAGAAGACAGCGTCCCTGAGGAGGAGGAGGGCTGTAATGCGGATTCTGAGACTCAGAACAGAAAAAACGCAGAAAATCAACAAGTGAACAATGATAGCTACCTGGTGTCGAAACAGGAATTACTGGAACTCTTTAAGAAACTCCACACTGGAAAGAAAGTGAAAGATGGACAGCTAACTGTTGGGCTGGTGGGCTACCCTAATGTTGGTAAGAGCTCAACCATCAACACCATCATGGGCAACAAGAAAGTATCGGTGTCTGCCACGCCAGGCCACACCAAGCATTTCCAGACCCTGTACGTGGAGCCTGGCCTCTGCCTGTGTGACTGCCCGGGCCTGGTGATGCCATCTTTCGTGTCCACCAAAGCAGAGATGATTTGCAATGGGATCCTCCCTATCGACCAGATGAGAGACCACGTTCCACCCGTATCACTAGTTTGCCAGAATATCCCACGGCGAGTTTTGGAAGTGACATATGGTATTAACATTATAAAGCCTAGAGAAGATGAAGACCCCTACCGACCTCCAACCTCAGAAGAACTGTTGACGGCTTATGGATGCATGAGAGGATTCATGACAGCACATGGGCAGCCAGACCAGCCTCGATCTGCACGGTACATCCTAAAGGACTATGTTGGAGGTAAACTACTGTATTGCCACCCTCCTCCTGGAAAAGACCCTGTGGCGTTCCAGCACCAACACCAGCAGCTCCTAGAGAGCAAAGTAAAAGGTGGAGAACTGAGACTTCAGCCGGGCAAAGGCCGAAAAGCCAAACAGATCGAAAATGTAGTCGACAAAACTTTTTTCCATCAGGAGAACGTCAGAGCACTGACCAAAGGTGTCCAGGCTGTCATGGGTTACAAGCCCGGGCATGGCCTGGTGACTGCGGCTGCTGCGAGCGCTGAGAATGTACCTGGGAAGCCCTGGAAAAAACACGGTAACAGAAACAAAAAAGAGAAAAGCCGTAGGCTCTACAAGCACCTGGATGTGTGAGCTCGGCTGAACAGAGACGGCAGGCAGCTGCGCTGTGCGGGAGGACACGAGCTCAAACCGCCACTGCTGCTGTCGCTGCCGTGCTGCTCTCCTGACCACTGCACTGTGGACTGGACCCTGCTCTCCTGGAGAGCACAGTCGCACCAGAAGTCTTGACATCAAGACAGCCTTTCGGAAGCACCTGCTGTGGCCGTGGTAGTCATGGGGTGCCCCAGAAGTCTCATCCTGCAGAAGGATGGTGGATATGTGTACAAGCATACAAATGCACATTGACTTTGAGGTTTTGGGAAGTTATTAAAATTAATTTTCCAC"
t3="AGTGGTTTGTGCTGTTGATGCGTGGTTATGGGTCGGAGACGAGCGCCGGGTGGTGGGTCCCTGGGCCGAGTACTTATCCGGCAGCAGACTCAGCGAAGCCGCAGCCACCGTCATACTGACTCCTGGCTGCATACAAGTGAACTCAATGATGGCTATGACTGGGGTCGTCTGAATCTTCAGTCTGTGACAGAGCAGAGCTCCCTGGAGGACTTTCTTGCAACTGCAGAACTTGCAGGGACCGAGTTTGTGGCTGAGAAGCTGAATATTAAGTTTGTACCCCCTGAGGCTAGAACCGGACTGCTGTCTTTTGAGGAGAGCCAGAGAATTAAGAAACTCCACGAAGAAAACAGACAGTTCTTGTGTATACCAAGGAGGCCAAATTGGGACAGAAAAACTAGCCCAGAAGAACTGAAACAAGCAGAGAAAGATAACTTCCTGAAATGGAGGCGCCAGCTTGTGCGGCTAGAAGAGGAACAGAAGTTGATATTGACTCCCTTTGAACGGAACTTGGACTTCTGGCGCCAGCTCTGGAGGGTCATTGAGAGAAGTGATATTGTGGTCCAGATCGTAGATGCTCGAAACCCGCTCCTGTTTAGATGTGAGGATTTGGAGTGTTACGTGAAAGAGATCGATGCTGCCAAGGAGAACGTGATCCTGATAAACAAGGCAGACTTGCTGACTGCTGAGCAGCGGTTTGCCTGGGCTGTGCACTTTGAAAAGGAAGGGGTAAAGGTTATCTTCTGGTCAGCTTTGGCAGAAACTGATCACCTAAATGGTGACTTGAAGGAGGAAGTAGACAGTGTTGCTGGAGACACCAACAAAACAGAGTCTGAAAGCTCCAGTTTGGATGCGAATGAAATCCCACACAGAGACTTGATCTCGCTTAGTGAGGAGTCCGCAAGTGACAGTGGTGACAGCAAATATGAGGACTGTCAGGAGGACGAAGAGGAAGATTGGCAGACGTGTTCAGAAGAAGACAGCGTCCCTGAGGAGGAGGAGGGCTGTAATGCGGATTCTGAGACTCAGAACAGAAAAAACGCAGAAAATCAACAAGTGAACAATGATAGCTACCTGGTGTCGAAACAGGAATTACTGGAACTCTTTAAGAAACTCCACACTGGAAAGAAAGTGAAAGATGGACAGCTAACTGTTGGGCTGGTGGGCTACCCTAATGTTGGTAAGAGCTCAACCATCAACACCATCATGGGCAACAAGAAAGTATCGGTGTCTGCCACGCCAGGCCACACCAAGCATTTCCAGACCCTGTACGTGGAGCCTGGCCTCTGCCTGTGTGACTGCCCGGGCCTGGTGATGCCATCTTTCGTGTCCACCAAAGCAGAGATGATTTGCAATGGGATCCTCCCTATCGACCAGATGAGAGACCACGTTCCACCCGTATCACTAGTTTGCCAGAATATCCCACGGCGAGTTTTGGAAGTGACATATGGTATTAACATTATAAAGCCTAGAGAAGATGAAGACCCCTACCGACCTCCAACCTCAGAAGAACTGTTGACGGCTTATGGATGCATGAGAGGATTCATGACAGCACATGGGCAGCCAGACCAGCCTCGATCTGCACGGTACATCCTAAAGGACTATGTTGGAGTGAGTGAACCTGGTCCTGATAGAATGAGCAAGTTTGCTTTTCCCTGCACTGTGTTCTCAGTAATGTCAACCTGTTAAAAAGATGTTTTCTCATTGAGATGGACTGATACCCAAGTGGAGTGTCTTGTTTTTAGTGATGCTAGGGATTGACCTGTGGGCCTTAAACTGAGCCCCAGCCCCGGCCCTCAGAAACAATCTTGATTTCAGACTCTAACCTAGTATTCCCAGGTTTAGTGTAAAGAATACTGAAGAAGCCAGGCATGGTGGTGCACGCCTTTAATCCCAGCACTTGGGAGGCAGAGGCAGGCAGATTTCTGAGTTCCAGGACAGCCAGAGCTACACAGAGAAACCCTGTCTCGAAAAACCAAAAAAAAGAATACTGAAGAATACTTAGTAGAAATGAAGTACACGTGAAAAGGCAGGCCACAGAGAAGGCAAATTGCGTGTAAGCTCTGGGCCCTGGACGGGCTAGTATGTGAAAGTTAGGGCAGTTACTGAAGCTCCACTTGCCACAGCTGCTTCAGCCTGTGCCGGGTTACTACATACTGCCTTGAGGACCTTTGCCCTCCTGGGGTGACACAGGTACACTGTCACTCTGCTCTCTCTGCAGGGTAAACTACTGTATTGCCACCCTCCTCCTGGAAAAGACCCTGTGGCGTTCCAGCACCAACACCAGCAGCTCCTAGAGAGCAAAGTAAAAGGTGGAGAACTGAGACTTCAGCCGGGCAAAGGCCGAAAAGCCAAACAGATCG"
t4="GGAGAAACCGGAAGCCTCGCGTGCTGAGCAGTGGTTTGTGCTGTTGATGCGTGGTTATGGGTCGGAGACGAGCGCCGGGTGGTGGGTCCCTGGGCCGAGTACTTATCCGGCAGCAGACTCAGCGAAGCCGCAGCCACCGTCATACTGACTCCTGGCTGCATACAAGTGAACTCAATGATGGCTATGACTGGGGTCGTCTGAATCTTCAGTCTGTGACAGAGCAGAGCTCCCTGGAGGACTTTCTTGCAACTGCAGAACTTGCAGGGACCGAGTTTGTGGCTGAGAAGCTGAATATTAAGTTTGTACCCCCTGAGGCTAGAACCGGACTGCTGTCTTTTGAGGAGAGCCAGAGAATTAAGAAACTCCACGAAGAAAACAGACAGTTCTTGTGTATACCAAGGAGGCCAAATTGGGACAGAAAAACTAGCCCAGAAGAACTGAAACAAGCAGAGAAAGATAACTTCCTGAAATGGAGGCGCCAGCTTGTGCGGCTAGAAGAGGAACAGAAGTTGATATTGACTCCCTTTGAACGGAACTTGGACTTCTGGCGCCAGCTCTGGAGGGTCATTGAGAGAAGATGGCCTGGAACTCAACTCTAGACCAGCCTGGACCTGAAGTCACAGAGATCTGCTGCTTCAGTCTCCCAAGTACTGGGATTAAAGACCTGTGCCAACAAGTATAGCCTTAATGTTTTTCCAAAAGGATTCAGTTTCATCCTGACTGCATAACACCTTCAAGTCCTGCTTAAAAAAAAAAAAATCTTAAATAGCTTAAAGTGATATTGTGGTCCAGATCGTAGATGCTCGAAACCCGCTCCTGTTTAGATGTGAGGATTTGGAGTGTTACGTGAAAGAGATCGATGCTGCCAAGGAGAACGTGATCCTGATAAACAAGGCAGACTTGCTGACTGCTGAGCAGCGGTTTGCCTGGGCTGTGCACTTTGAAAAGGAAGGGGTAAAGGTTATCTTCTGGTCAGCTTTGGCAGAAACTGATCACCTAAATGGTGACTTGAAGGAGGAAGTAGACAGTGTTGCTGGAGACACCAACAAAACAGAGTCTGAAAGCTCCAGTTTGGATGCGAATGAAATCCCACACAGAGACTTGATCTCGCTTAGTGAGGAGTCCGCAAGTGACAGTGGTGACAGCAAATATGAGGACTGTCAGGAGGACGAAGAGGAAGATTGGCAGACGTGTTCAGAAGAAGACAGCGTCCCTGAGGAGGAGGAGGGCTGTAATGCGGATTCTGAGACTCAGAACAGAAAAAACGCAGAAAATCAACAAGTGAACAATGATAGCTACCTGGTGTCGAAACAGGAATTACTGGAACTCTTTAAGAAACTCCACACTGGAAAGAAAGTGAAAGATGGACAGCTAACTGTTGGGCTGGTGGGCTACCCTAATGTTGGTAAGAGCTCAACCATCAACACCATCATGGGCAACAAGAAAGTATCGGTGTCTGCCACGCCAGGCCACACCAAGCATTTCCAGACCCTGTACGTGGAGCCTGGCCTCTGCCTGTGTGACTGCCCGGGCCTGGTGATGCCATCTTTCGTGTCCACCAAAGCAGAGATGATTTGCAATGGGATCCTCCCTATCGACCAGATGAGAGACCACGTTCCACCCGTATCACTAATATCCTTTCTGCTTCAGTCCACTCTGGTTATTGATGTGAGTGAGGGCTCTGGGGATTTTCAGGGCACTGCCTTCAAACAGCATAGTTATAGGCATTAGGGACAAAGGTGTGGTTCTTTGTATCTCTGTTAGGATTATCTCTCAAAGGTAAGAGTTTGTTTATAATAATGTTCCTTAAGGTAATGTAGTTAAGATCTATCTGTTTTAGATTTGTTGATTTTTATTTTATGTGTGTGTGTTTCCCTGTACCTTTATCTGTACACTACTCACATGTCTGTTGCCCTTGACGGTCAGAAGAGAGCATTGGATCCCCTGAACTGGAGTTACAGACAGTTGTGAACCTCTTTGTAGGTGCTGGGAATCGAACCCTGGTCCTGCAAGAGAAGCAAGTGTGATTACCCACTGAACTATCTTTCTAGCCCCAGAGATAATATTTTTTGAAACTTTTAAAAATTTTTTTAATTTATTTTTTACACTCTATATTCCATTCCCCACCCCCACCCCCGAAATAATATTTTAATGATTGTTTTATGTACGTTTAGAGACTAGGCTTTACTATGTTGCTCAGGCTAGCCCAGAATGCACTGTGTTACCCAAACTAGCCTTAAACTCAGACTCCTCCTCAACCTTCCTAGTGCTGAAATTACAGGTATGAGTTACCATGCCTGGCTTGGTTTTTGGTTGGTTTGGTTCAGGGTTTTTTTGGTTTGGGGAGGGAGTGTTGGGAGGGTCTGGGGGAGTTGAGGGAGCTGTGCGAAGGTTAAGACAGTTTTTCTATGTAGCCCTAGCTGTCCTGGAACTCGATCTGTAGACTACCTGGCCTCAAACTCACAGAGATCTCCCTGCCTCTGCCTCCGAGTGCTGGGATTAAAGGCATGGGCCGGGCAGGAGAGATGGCTCAGCAGTTAAGAACACTGACTGTTCTTCCAGTGGTCCTGAGTTCAAAAATCCAGCAACCATATGGTGGCTCACAACCATCTGTAATGAGACCTGACGCCTTCTTCTGGTGTGTCTGAAAACAGCTACAGTGTATTCATATACGTTAAATAAATAATAATTTAAATAAATAAATAAAGGTATGGGCCACCACCG"

In [94]:
a="GTGTACTTACATATAATAATAAATAAATAATAAATAAATCTTAAAAAAAAAAAAAAAAAA"

In [91]:
a="GTGTA"

In [102]:
def query(a, t):
    ind = []
    for x in range(len(t)-len(a)):
        if t[x:x+len(a)] == a:
            ind.append(x)
    for x in ind:
        print t[x:x+60]
        print "\n"

In [113]:
query("GTGTA", t4)


GTGTATACCAAGGAGGCCAAATTGGGACAGAAAAACTAGCCCAGAAGAACTGAAACAAGC


GTGTATTCATATACGTTAAATAAATAATAATTTAAATAAATAAATAAAGGTATGGGCCAC



In [46]:
gtf['mchr16'].keys()


Out[46]:
['gene',
 'Selenocysteine',
 'UTR',
 'exon',
 'stop_codon',
 'CDS',
 'start_codon',
 'transcript']

In [ ]:


In [3]:
#new txptome testing

In [2]:
tx = set(pd.read_table(open('../data/reReads.txt'), header=None)[0].tolist())
ge = set(pd.read_table(open('../data/geReads.txt'), header=None)[0].tolist())

In [5]:
onlyGe = ge-tx
onlyTx = tx-ge

ref = Fasta("../data/ref.fa")
txSam = pysam.AlignmentFile("../data/rsem.sam", "r")
lens = txSam.lengths

txPairs = defaultdict(list)
for read in txSam:
    if read.qname in onlyTx:
        txPairs[read.qname].append((read.pos, lens[read.rname]))

In [6]:
len(txPairs), sum([len(x) for x in txPairs.values()])


Out[6]:
(280551, 1009259)

In [8]:
#get min of the distance from 5' and 3'
dis = []
for _,pair in txPairs.items():
    minDist = 200000
    for (k,v) in pair:
        minDist = min(v-k, k, minDist)
    dis.append(minDist)
plt.hist(dis, log=True)


Out[8]:
(array([  2.80155000e+05,   1.15000000e+02,   4.10000000e+01,
          9.50000000e+01,   7.20000000e+01,   1.40000000e+01,
          5.00000000e+00,   3.00000000e+00,   0.00000000e+00,
          5.10000000e+01]),
 array([     0. ,   9481.4,  18962.8,  28444.2,  37925.6,  47407. ,
         56888.4,  66369.8,  75851.2,  85332.6,  94814. ]),
 <a list of 10 Patch objects>)

In [9]:
#get min of the distance from 5' and 3'
dis = []
for _,pair in txPairs.items():
    minDist = 200000
    for (k,v) in pair:
        minDist = min(v-k, k, minDist)
    if minDist < 1000:
        dis.append(minDist)
plt.hist(dis, log=True)


Out[9]:
(array([ 79918.,  78082.,  38268.,  18820.,  10829.,   6577.,   5282.,
          4783.,   3458.,   2879.]),
 array([   0. ,   99.9,  199.8,  299.7,  399.6,  499.5,  599.4,  699.3,
         799.2,  899.1,  999. ]),
 <a list of 10 Patch objects>)

In [11]:
geSam = pysam.AlignmentFile("../data/gene.sam", "r")
wiSam = pysam.AlignmentFile("../data/onlyReGene.sam", "w", template=geSam)
gePairs = []
for read in geSam.fetch():
    if read.qname in onlyGe:
        wiSam.write(read)

In [3]:
onlyGe = ge-tx
onlyTx = tx-ge

In [4]:
with open('../data/reOnly.txt', 'w') as f:
    for x in onlyGe:
        f.write(x+"\n")

In [5]:
len(onlyGe)


Out[5]:
984879

In [ ]:


In [ ]: