In [1]:
import pandas as pd
import numpy as np
import matplotlib as mpl
%matplotlib inline
import matplotlib.pyplot as plt
from collections import Counter
from collections import defaultdict
import pysam
from pyfasta import Fasta
from intervaltree import IntervalTree
In [2]:
gtf = defaultdict(lambda : defaultdict(list))
with open('../data/mohu.gtf') as f:
    for line in f:
        if line[0] != '#':
            toks = line.strip().split('\t')
            chro = toks[0]
            feature = toks[2]
            start = toks[3]
            end = toks[4]
            if feature in ['gene', 'transcript', 'exon']:
                # extract start, end, id of the feature
                gtf[chro][feature].append((start, end, ''.join(
                    [x.replace(feature+'_id', '').replace('"', '').strip()
                     for x in toks[-1].split(';')
                     if feature+'_id' in x])))
In [3]:
exonDict = {}
for x in gtf:
    tree = IntervalTree()
    for (s, e, det) in gtf[x]['exon']:
        tree[int(s):int(e)+1] = det
    exonDict[x] = tree
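In [ ]:
# Sketch (not part of the original analysis): the interval trees built above can
# be queried with a single genomic position; tree[pos] returns the set of exon
# intervals covering that coordinate, each carrying the exon id in .data.
# The chromosome and position below are placeholders.
chrom = list(exonDict.keys())[0]
pos = 100000
for iv in exonDict[chrom][pos]:
    print chrom, iv.begin, iv.end, iv.data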
In [ ]:
#txp.bam
In [4]:
count = defaultdict(int)
with pysam.AlignmentFile('../data/txp_nameSorted_subsample.bam', 'rb') as geFile:
    count_align = 0
    for alignment in geFile:
        count_align += 1
        if (count_align%1000 == 0):
            print "\r processed #alignments:" + str(count_align),
        try:
            count[alignment.qname] += 1
        except:
            print 'NH tag needed'
            exit(0)
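In [ ]:
# Sketch (assumes the BAM carries an NH tag): the multiplicity distribution can
# also be read straight from the aligner-reported NH tag instead of tallying
# qnames; the two may differ if alignments were filtered after mapping.
nhDist = Counter()
with pysam.AlignmentFile('../data/txp_nameSorted_subsample.bam', 'rb') as bf:
    for aln in bf:
        try:
            nhDist[aln.get_tag('NH')] += 1
        except KeyError:
            print 'NH tag missing for ' + aln.qname
            break
print nhDist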
In [5]:
Counter(count.values())
Out[5]:
In [6]:
geneMap = pd.read_table('../data/geneMap.tsv', header=None).set_index(1).to_dict()[0]
In [7]:
tx_gene_counts = defaultdict(int)
In [8]:
txpList = defaultdict(list)
with pysam.AlignmentFile('../data/txp_nameSorted_subsample.bam', 'rb') as geFile:
    count_align = 0
    for alignment in geFile:
        count_align += 1
        if (count_align%1000 == 0):
            print "\r processed #alignments:" + str(count_align),
        try:
            nh = count[alignment.qname]
            txpList[nh].append(alignment.reference_name)
        except:
            print 'NH tag needed'
            exit(0)
In [9]:
sameGeneCount = defaultdict(int)
if 0 in txpList.keys():
    del txpList[0]
for key, v in txpList.items():
    for x in range(0, len(v), key):
        t = set([])
        for y in range(key):
            t.add(geneMap[v[x+y]])
        if len(t) == 1:
            tx_gene_counts[list(t)[0]] += 1
            sameGeneCount[key] += 1
recoverable = 0
for k, v in sameGeneCount.items():
    recoverable += v
    print str(k) + ":" + str((float(k) * v) / len(txpList[k]))
print recoverable
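In [ ]:
# Toy example (sketch, made-up names): txpList[k] is a flat list in which each
# consecutive block of k entries holds the k transcript hits of one read (the
# BAM is name sorted, so a read's alignments are adjacent). A block is
# "recoverable" when all k transcripts belong to the same gene.
toyList = {2: ['txA1', 'txA2', 'txB1', 'txC1']}
toyMap = {'txA1': 'geneA', 'txA2': 'geneA', 'txB1': 'geneB', 'txC1': 'geneC'}
for k, v in toyList.items():
    for i in range(0, len(v), k):
        genes = set(toyMap[t] for t in v[i:i+k])
        print v[i:i+k], 'recoverable' if len(genes) == 1 else 'ambiguous'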
In [10]:
len(tx_gene_counts)
Out[10]:
In [11]:
totGene = 0
for chro in gtf:
    for l in gtf[chro]:
        if l == "gene":
            totGene += len(gtf[chro][l])
print totGene
In [ ]:
#gene.bam
In [12]:
count = defaultdict(int)
with pysam.AlignmentFile('../data/gene_nameSorted_subsample.bam', 'rb') as geFile:
    count_align = 0
    for alignment in geFile:
        count_align += 1
        if (count_align%1000 == 0):
            print "\r processed #alignments:" + str(count_align),
        try:
            count[alignment.qname] += 1
        except:
            print 'NH tag needed'
            exit(0)
In [13]:
Counter(count.values())
Out[13]:
In [14]:
ge_gene_counts = defaultdict(int)
In [15]:
exonMap = pd.read_table('../data/exonMap.tsv', header=None).set_index(0).to_dict()[1]
In [16]:
geList = defaultdict(list)
with pysam.AlignmentFile('../data/gene_nameSorted_subsample.bam', 'rb') as geFile:
    count_align = 0
    for alignment in geFile:
        count_align += 1
        if (count_align%1000 == 0):
            print "\r processed #alignments:" + str(count_align),
        try:
            nh = count[alignment.qname]
            geList[nh].append((alignment.reference_name, alignment.pos))
        except:
            print 'NH tag needed'
            exit(0)
In [18]:
errors = []
sameGeneCount = defaultdict(int)
ge_gene_counts = defaultdict(int)
count_align = 0
if 0 in geList.keys():
    del geList[0]
for key, v in geList.items():
    for x in range(0, len(v), key):
        count_align += 1
        if (count_align%10000 == 0):
            print "\r processed #alignments:" + str(count_align),
        t = set([])
        for y in range(key):
            intv = []
            name, pos = v[x+y]
            try:
                intv = exonDict[name][pos]
            except:
                errors.append(name)
            if len(intv) > 0:
                t.add(exonMap[list(intv)[0].data])
        if len(t) == 1:
            ge_gene_counts[list(t)[0]] += 1
            sameGeneCount[key] += 1
recoverable = 0
for k, v in sameGeneCount.items():
    recoverable += v
    print str(k) + ":" + str((float(k) * v) / len(geList[k]))
print recoverable, len(ge_gene_counts)
In [20]:
x = Counter(errors)
sumv = 0
for k, v in x.items():
    sumv += v
print sumv
In [30]:
#Analysis
In [21]:
geDf = pd.DataFrame.from_dict(ge_gene_counts.items()).set_index(0).rename(columns={1:'gene'})
txDf = pd.DataFrame.from_dict(tx_gene_counts.items()).set_index(0).rename(columns={1:'txp'})
In [22]:
combineCounts = pd.concat([geDf, txDf], axis=1)
In [23]:
combineCounts['diff'] = combineCounts['gene'] - combineCounts['txp']
In [24]:
diff = combineCounts.sort_values(by='diff', ascending=False).dropna()
In [25]:
fc = diff.div(diff['gene'], axis='index').sort_values(by='diff', ascending=False)
In [26]:
goi = fc[fc['diff']>0.95].index
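In [ ]:
# Worked example of the 0.95 cutoff (sketch): dividing every column of diff by
# the gene-level count turns 'diff' into the relative excess of gene-level over
# transcript-level counts, so a gene counted 100 times at the gene level but
# only 3 times at the transcript level scores (100 - 3) / 100 = 0.97 and is
# kept, while 100 vs 20 scores 0.80 and is dropped.
print (100 - 3) / 100.0, (100 - 20) / 100.0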
In [27]:
hGoi = []
mGoi = []
for g in goi:
    if 'MUSG' in g:
        mGoi.append(g)
    else:
        hGoi.append(g)
print len(mGoi), len(hGoi), len(goi)
In [42]:
geneId2Name = defaultdict(set)
with open('../data/mohu.gtf') as f:
    for line in f:
        if line[0] != '#':
            toks = line.strip().split('\t')
            feature = toks[2]
            if feature == 'gene':
                details = toks[-1].split(";")
                if 'gene_id' not in details[0] or 'gene_name' not in details[2]:
                    print "unexpected attribute order"
                    break
                gId = details[0].replace('gene_id "', "").rstrip('"').strip()
                gName = details[2].replace('gene_name "', "").rstrip('"').strip()
                geneId2Name[gId].add(gName)
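In [ ]:
# More defensive variant (sketch): parse the attribute column into a key -> value
# dict instead of relying on attribute positions, so the lookup still works if
# gene_name is not the third attribute. parseAttributes and the example string
# below are illustrative, not part of the original pipeline.
def parseAttributes(attrField):
    attrs = {}
    for item in attrField.strip().split(';'):
        item = item.strip()
        if not item:
            continue
        key, _, val = item.partition(' ')
        attrs[key] = val.strip('"')
    return attrs

print parseAttributes('gene_id "ENSG000DEMO"; gene_name "DEMO1";')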
In [29]:
for x in hGoi:
    print list(geneId2Name[x])[0]
In [30]:
for x in mGoi:
    print list(geneId2Name[x])[0]
In [31]:
hGenes = []
mGenes = []
for k, v in geneId2Name.items():
    if 'MUSG' in k:
        mGenes.append(list(v)[0])
    else:
        hGenes.append(list(v)[0])
In [32]:
len(hGenes), len(mGenes), len(hGenes) + len(mGenes)
Out[32]:
In [33]:
with open("hGenes.txt", 'w') as f:
    for x in hGenes:
        f.write(x + '\n')
with open("mGenes.txt", 'w') as f:
    for x in mGenes:
        f.write(x + '\n')
In [161]:
#import htseq
In [45]:
htseq = pd.read_table('../data/gene_subsample.htseq', header=None).set_index(0).rename(columns={1:'gene'})
In [55]:
htseq.sum()
Out[55]:
In [46]:
combineCounts = pd.concat([htseq, txDf], axis=1).fillna(0) + 1
In [47]:
combineCounts['fc'] = combineCounts['gene'] / combineCounts['txp']
In [48]:
fc = np.log(combineCounts.sort_values(by='fc', ascending=False))
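In [ ]:
# Worked example of the pseudocount (pc=1, sketch): a gene with 9 htseq reads
# and 0 transcript-level reads gets log((9 + 1) / (0 + 1)) = log(10) ~ 2.30,
# comfortably above the 0.5 log-fold-change cutoff applied below.
print np.log((9 + 1.0) / (0 + 1.0))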
In [49]:
goi = fc[fc['fc']>0.5].index
In [50]:
hGoi = []
mGoi = []
for g in goi:
    if 'MUSG' in g:
        mGoi.append(g)
    else:
        hGoi.append(g)
print len(mGoi), len(hGoi), len(goi)
In [51]:
for x in hGoi:
    print list(geneId2Name[x])[0]
In [52]:
fc
Out[52]:
In [53]:
plt.plot(fc['fc'].values)
plt.xlabel('fc sorted gene id')
plt.ylabel('log fold change (pc=1)')
Out[53]:
In [54]:
plt.hist(fc['fc'].values)
Out[54]:
In [ ]:
#subsample the BAM: keep only the best alignment (most matched bases) per read, toss the rest
In [14]:
def dumpBestAlignment(wfile, aligmns):
    # write the single alignment, or the one with the most matched (M) bases
    if len(aligmns) == 1:
        wfile.write(aligmns[0])
    elif len(aligmns) != 0:
        cigar = []
        for aln in aligmns:
            cig = aln.cigartuples
            cigSum = 0
            for k, v in cig:
                if k == 0:  # CIGAR op 0 == M (alignment match)
                    cigSum += v
            cigar.append(cigSum)
        if len(cigar) != len(aligmns):
            print "cigar error"
        maxInd = np.argmax(cigar)
        wfile.write(aligmns[maxInd])

def subsampleSam(name):
    nh = 0
    aligmns = []
    with pysam.AlignmentFile(name, 'rb') as geFile, pysam.AlignmentFile(name[:-4]+'_subsample.bam', 'wb', template=geFile) as wFile:
        count_align = 0
        for alignment in geFile:
            count_align += 1
            if (count_align%1000 == 0):
                print "\r processed #alignments:" + str(count_align),
            if nh == 0:
                # previous read group is complete; keep its best alignment
                dumpBestAlignment(wFile, aligmns)
                nh = alignment.get_tag('NH') - 1
                curReadName = alignment.qname
                aligmns = [alignment]
            else:
                if curReadName != alignment.qname:
                    print "error"
                    break
                nh -= 1
                aligmns.append(alignment)
        # flush the final read group, which would otherwise be dropped
        dumpBestAlignment(wFile, aligmns)

# subsampleSam('../data/txp_nameSorted.bam')
subsampleSam('../data/gene_nameSorted.bam')
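In [ ]:
# Quick check (sketch, assuming single-end reads as the grouping above does):
# after subsampling, each read name should be left with exactly one alignment in
# the output BAM. The file name follows from the call above
# (name[:-4] + '_subsample.bam').
seen = Counter()
with pysam.AlignmentFile('../data/gene_nameSorted_subsample.bam', 'rb') as bf:
    for aln in bf:
        seen[aln.qname] += 1
dup = sum(1 for c in seen.values() if c > 1)
print 'read names with more than one retained alignment:', dup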