In [1]:
import pandas as pd
import numpy as np
import matplotlib as mpl
%matplotlib inline
import matplotlib.pyplot as plt
from collections import Counter
from collections import defaultdict
import pysam
from pyfasta import Fasta
from intervaltree import IntervalTree
In [2]:
gtf = defaultdict(lambda : defaultdict(list))
with open('../data/mohu.gtf') as f:
for line in f:
if line[0] != '#':
toks = line.strip().split('\t')
chro = toks[0]
feature = toks[2]
start = toks[3]
end = toks[4]
if feature in ['gene', 'transcript', 'exon']:
#extract start, end , id of the feature
gtf[chro][feature].append((start, end, ''.join(\
[ x.replace(feature+'_id','').replace('"','').strip() \
for x in toks[-1].split(';') \
if feature+'_id' in x] )))
In [3]:
exonDict = {}
for x in gtf:
tree = IntervalTree()
for (s,e, det) in gtf[x]['exon']:
tree[int(s):int(e)+1] = det
exonDict[x] = tree
In [ ]:
In [ ]:
In [ ]:
#txp.bam
In [3]:
count = defaultdict(int)
with pysam.AlignmentFile('../data/tx_de_sort.bam', 'rb') as geFile:
count_align = 0
for alignment in geFile:
count_align += 1
if (count_align%1000 == 0):
print "\r processed #alignments:" + str(count_align),
try:
count[alignment.qname] += 1
except:
print 'NH tag needed'
exit(0)
In [4]:
Counter(count.values()),
Out[4]:
In [3]:
data = pd.read_table('../data/reOnly.txt', header=None)[0].tolist()
In [7]:
len(data),
Out[7]:
In [26]:
geneMap = pd.read_table('../data/geneMap.tsv', header=None).set_index(1).to_dict()[0]
In [12]:
tx_gene_counts = defaultdict(int)
ge_gene_counts = defaultdict(int)
In [7]:
txpList = defaultdict(list)
with pysam.AlignmentFile('../data/tx_de_sort.bam', 'rb') as geFile:
count_align = 0
for alignment in geFile:
count_align += 1
if (count_align%1000 == 0):
print "\r processed #alignments:" + str(count_align),
try:
nh = count[alignment.qname]
txpList[nh].append(alignment.reference_name)
except:
print 'NH tag needed'
exit(0)
In [8]:
sameGeneCount = defaultdict(int)
if 0 in txpList.keys():
del txpList[0]
for key, v in txpList.items():
for x in range(0,len(v), key):
t = set([])
for y in range(key):
t.add(geneMap[v[x+y]])
if len(t) == 1:
tx_gene_counts[list(t)[0]] += 1
sameGeneCount[key] += 1
recoverable = 0
for k,v in sameGeneCount.items():
recoverable += v
print str(k) + ":" + str((float(k) * v) / len(txpList[k]))
print recoverable
In [9]:
len(tx_gene_counts)
Out[9]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
#gene.bam
In [4]:
count = defaultdict(int)
with pysam.AlignmentFile('../data/ge_de_sort.bam', 'rb') as geFile:
count_align = 0
for alignment in geFile:
count_align += 1
if (count_align%1000 == 0):
print "\r processed #alignments:" + str(count_align),
try:
count[alignment.qname] += 1
except:
print 'NH tag needed'
exit(0)
In [5]:
Counter(count.values())
Out[5]:
In [6]:
exonMap = pd.read_table('../data/exonMap.tsv', header=None).set_index(0).to_dict()[1]
In [7]:
geList = defaultdict(list)
with pysam.AlignmentFile('../data/ge_de_sort.bam', 'rb') as geFile:
count_align = 0
for alignment in geFile:
count_align += 1
if (count_align%1000 == 0):
print "\r processed #alignments:" + str(count_align),
try:
nh = count[alignment.qname]
geList[nh].append((alignment.reference_name, alignment.pos))
except:
print 'NH tag needed'
exit(0)
In [17]:
errors = []
sameGeneCount = defaultdict(int)
ge_gene_counts = defaultdict(int)
count_align = 0
if 0 in geList.keys():
del geList[0]
for key, v in geList.items():
for x in range(0,len(v), key):
count_align += 1
if (count_align%10000 == 0):
print "\r processed #alignments:" + str(count_align),
t = set([])
for y in range(key):
intv = []
name, pos = v[x+y]
try:
intv = exonDict[name][pos]
except:
errors.append(name)
if len(intv) > 0:
t.add(list(intv)[0].data)
if len(t) == 1:
ge_gene_counts[list(t)[0]] += 1
sameGeneCount[key] += 1
recoverable = 0
for k,v in sameGeneCount.items():
recoverable += v
print str(k) + ":" + str((float(k) * v) / len(geList[k]))
print recoverable, len(ge_gene_counts)
In [62]:
x = Counter(errors)
sumv = 0
for k, v in x.items():
sumv += v
print sumv
In [63]:
x
Out[63]:
In [33]:
len(exonMap), geneMap.values()[-1]
Out[33]:
In [47]:
v = ge_gene_counts
for x in ge_gene_counts:
if str(x).isdigit():
del v[x]
In [60]:
1000000000 in ge_gene_counts.keys()
Out[60]:
In [ ]: