In [1]:
import pandas as pd
import numpy as np
import matplotlib as mpl
%matplotlib inline
import matplotlib.pyplot as plt
from collections import Counter
from collections import defaultdict
import pysam
from pyfasta import Fasta
In [3]:
new_nt_map = {0:'A', 1:'C', 2: 'T', 3:'G'}
rev_nt_map = {'A':0, 'C':1, 'T':2, 'G':3}
def getBC(num):
bc = []
if num > 16777215:
print num
return False
while (num>0):
bc.append(new_nt_map[num%4])
num = num/4
while len(bc) < 12:
bc.append('A')
return ''.join(bc)[::-1]
def getIn(bc):
index = 0
for ind, c in enumerate(bc):
index += (4**(12-ind-1))*rev_nt_map[c]
return index
In [30]:
tr = pd.read_table(open('../data/true.dump'), header=None).to_dict()[0]
trBc = tr.values()
In [40]:
#read eqClass files with barcodes
eqFile="../data/eq_classes.txt"
barcodeCount = defaultdict(int)
with open(eqFile) as ifile:
numTran = int(ifile.readline().rstrip())
numEq = int(ifile.readline().rstrip())
for i in xrange(numTran):
ifile.readline()
for i in xrange(numEq):
toks = ifile.readline().rstrip().split('\t')
nt = int(toks[0])
##########################
#parse barcode map
#nt+1 gives number of barcodes ignoring for now
nb = toks[nt+1]
nr = int(toks[-1])
verify = 0
for i in range(nt+2, len(toks)-1, 2):
bc = int(toks[i].split(',')[0].lstrip('('))
count = int(toks[i+1])
barcodeCount[bc] += int(count)
verify += count
if verify != nr:
print "error"
break
In [41]:
tCount = 0
ntCount = 0
for k,v in barcodeCount.items():
if k in trBc:
tCount += barcodeCount[k]
else:
ntCount += barcodeCount[k]
In [39]:
#before using our knee correction
# #reads from true txp and #reads from non true
print tCount, ntCount
In [42]:
#After using our knee correction
# #reads from true txp and #reads from non true
print tCount, ntCount
In [43]:
#Total Reads remapped
1488761 - 1212857
Out[43]:
In [44]:
#Q: Running Star in txptome and genome what are the differences
In [3]:
tx = set(pd.read_table(open('../data/txReads.txt'), header=None)[0].tolist())
ge = set(pd.read_table(open('../data/geReads.txt'), header=None)[0].tolist())
In [4]:
len(tx) , len(ge), len(tx-ge), len(ge-tx)
Out[4]:
In [5]:
onlyGe = ge-tx
onlyTx = tx-ge
In [90]:
#Q: onlyTx testing, first hypo
ref = Fasta("../data/ref.fa")
txSam = pysam.AlignmentFile("../data/txp.sam", "r")
lens = txSam.lengths
In [91]:
txPairs = defaultdict(list)
for read in txSam:
if read.qname in onlyTx:
txPairs[read.qname].append((read.pos, lens[read.rname]))
In [100]:
len(txPairs), sum([len(x) for x in txPairs.values()])
#lens of unique reads are consistent: check
Out[100]:
In [101]:
#get min of the distance from 5' and 3'
dis = []
for _,pair in txPairs.items():
minDist = 200000
for (k,v) in pair:
minDist = min(v-k, k, minDist)
dis.append(minDist)
In [102]:
plt.hist(dis, log=True)
Out[102]:
In [103]:
#get min of the distance from 5' and 3'
dis = []
for _,pair in txPairs.items():
minDist = 200000
for (k,v) in pair:
minDist = min(v-k, k, minDist)
if minDist < 1000:
dis.append(minDist)
In [104]:
plt.hist(dis, log=True)
Out[104]:
In [ ]:
#Ans: Significant number of reads are getting map very distant from either end
In [105]:
In [9]:
#Q: onlyGe testing, second hypo
ref = Fasta("../data/genome.fa")
geSam = pysam.AlignmentFile("../data/gene.sam", "r")
lens = geSam.lengths
wiSam = pysam.AlignmentFile("../data/onlyGene.sam", "w", template=geSam)
In [ ]:
In [10]:
gePairs = []
for read in geSam.fetch():
if read.qname in onlyGe:
wiSam.write(read)
# print read.qname
# gePairs.append((read.pos, read.reference_name))
In [54]:
len(gePairs)#, sum([len(x) for x in gePairs.values()])
#lens of unique reads are consistent: check
Out[54]:
In [3]:
import pickle
# pickle.dump(gePairs, open( "../data/gePairs.p", "wb" ) )
gePairs = pickle.load(open( "../data/gePairs.p", "rb" ) )
In [52]:
gtf = defaultdict(lambda : defaultdict(list))
with open('../data/mohu.gtf') as f:
for line in f:
if line[0] != '#':
toks = line.strip().split('\t')
gene = toks[0]
feature = toks[2]
start = toks[3]
end = toks[4]
gtf[gene][feature].append((start,end, toks[-1]))
In [83]:
it = 0
counts = defaultdict(int)
for p,r in gePairs:
it += 1
for feature in gtf[r]:
for (start, stop) in gtf[r][feature]:
if start < p and p < stop:
counts[feature] += 1
print ("\r count = "+str(it)),
print counts
In [110]:
for feat in gtf['mchr16'].keys():
for s,e,x in gtf['mchr16'][feat]:
if int(s)<30570147 and int(e)>30570147:
print (feat, s,e, 30570147-int(s), int(e)-int(s))
In [93]:
t1="GGAGAAACCGGAAGCCTCGCGTGCTGAGCAGTGGTTTGTGCTGTTGATGCGTGGTTATGGGTCGGAGACGAGCGCCGGGTGGTGGGTCCCTGGGCCGAGTACTTATCCGGCAGCAGACTCAGCGAAGCCGCAGCCACCGTCATACTGACTCCTGGCTGCATACAAGTGAACTCAATGATGGCTATGACTGGGGTCGTCTGAATCTTCAGTCTGTGACAGAGCAGAGCTCCCTGGAGGACTTTCTTGCAACTGCAGAACTTGCAGGGACCGAGTTTGTGGCTGAGAAGCTGAATATTAAGTTTGTACCCCCTGAGGCTAGAACCGGACTGCTGTCTTTTGAGGAGAGCCAGAGAATTAAGAAACTCCACGAAGAAAACAGACAGTTCTTGTGTATACCAAGGAGGCCAAATTGGGACAGAAAAACTAGCCCAGAAGAACTGAAACAAGCAGAGAAAGATAACTTCCTGAAATGGAGGCGCCAGCTTGTGCGGCTAGAAGAGGAACAGAAGTTGATATTGACTCCCTTTGAACGGAACTTGGACTTCTGGCGCCAGCTCTGGAGGGTCATTGAGAGAAGTGATATTGTGGTCCAGATCGTAGATGCTCGAAACCCGCTCCTGTTTAGATGTGAGGATTTGGAGTGTTACGTGAAAGAGATCGATGCTGCCAAGGAGAACGTGATCCTGATAAACAAGGCAGACTTGCTGACTGCTGAGCAGCGGTTTGCCTGGGCTGTGCACTTTGAAAAGGAAGGGGTAAAGGTTATCTTCTGGTCAGCTTTGGCAGAAACTGATCACCTAAATGGTGACTTGAAGGAGGAAGTAGACAGTGTTGCTGGAGACACCAACAAAACAGAGTCTGAAAGCTCCAGTTTGGATGCGAATGAAATCCCACACAGAGACTTGATCTCGCTTAGTGAGGAGTCCGCAAGTGACAGTGGTGACAGCAAATATGAGGACTGTCAGGAGGACGAAGAGGAAGATTGGCAGACGTGTTCAGAAGAAGACAGCGTCCCTGAGGAGGAGGAGGGCTGTAATGCGGATTCTGAGACTCAGAACAGAAAAAACGCAGAAAATCAACAAGTGAACAATGATAGCTACCTGGTGTCGAAACAGGAATTACTGGAACTCTTTAAGAAACTCCACACTGGAAAGAAAGTGAAAGATGGACAGCTAACTGTTGGGCTGGTGGGCTACCCTAATGTTGGTAAGAGCTCAACCATCAACACCATCATGGGCAACAAGAAAGTATCGGTGTCTGCCACGCCAGGCCACACCAAGCATTTCCAGACCCTGTACGTGGAGCCTGGCCTCTGCCTGTGTGACTGCCCGGGCCTGGTGATGCCATCTTTCGTGTCCACCAAAGCAGAGATGATTTGCAATGGGATCCTCCCTATCGACCAGATGAGAGACCACGTTCCACCCGTATCACTAGTTTGCCAGAATATCCCACGGCGAGTTTTGGAAGTGACATATGGTATTAACATTATAAAGCCTAGAGAAGATGAAGACCCCTACCGACCTCCAACCTCAGAAGAACTGTTGACGGCTTATGGATGCATGAGAGGATTCATGACAGCACATGGGCAGCCAGACCAGCCTCGATCTGCACGGTACATCCTAAAGGACTATGTTGGAGGTAAACTACTGTATTGCCACCCTCCTCCTGGAAAAGACCCTGTGGCGTTCCAGCACCAACACCAGCAGCTCCTAGAGAGCAAAGTAAAAGGTGGAGAACTGAGACTTCAGCCGGGCAAAGGCCGAAAAGCCAAACAGATCGAAAATGTAGTCGACAAAACTTTTTTCCATCAGGAGAACGTCAGAGCACTGACCAAAGGTGTCCAGGCTGTCATGGGTTACAAGCCCGGGCATGGCCTGGTGACTGCGGCTGCTGCGAGCGCTGAGAATGTACCTGGGAAGCCCTGGAAAAAACACGGTAACAGAAACAAAAAAGAGAAAAGCCGTAGGCTCTACAAGCACCTGGATGTGTGAGCTCGGCTGAACAGAGACGGCAGGCAGCTGCGCTGTGCGGGAGGACACGAGCTCAAACCGCCACTGCTGCTGTCGCTGCCGTGCTGCTCTCCTGACCACTGCACTGTGGACTGGACCCTGCTCTCCTGGAGAGCACAGTCGCACCAGAAGTCTTGACATCAAGACAGCCTTTCGGAAGCACCTGCTGTGGCCGTGGTAGTCATGGGGTGCCCCAGAAGTCTCATCCTGCAGAAGGATGGTGGATATGTGTACAAGCATACAAATGCACATTGACTTTGAGGTTTTGGGAAGTTATTAAAATTAATTTTCCACTAGTTGATATTCTTATTGTCTTAAGTAGGCAAGCTGCCATTTATACATTTAATGTTGACCCTCTAGGAAGGGTCTGGAGATGGCTCAGCAGTTAGGAGCATTGGTTGCCCTTTCAGAGGACCCCAGTGTGGTTCCTAGCACCTACATGGCAGCTCATAGCCATCGATAACTCCAGTGTTAGGGCATCTAATGACTTCTTCTGGCTCTGATGGGTACCAGACACACAAATGGTACGCAGACATTCATGCAGGCAAAACACCCATACACATAAACAGCACACATTCACTCAGGACTCCAGTCTCCCTGGGCTTCCTCGGCTCCCTAGAGGTGACTGTGAGAGGTGGTTGAACGAGAATGGCCTGCAGAGGGCTTATGGATTTGCACACTTGGTTCCACCTCTGCCCCCTTTTAAACATAAAGTTTTAAAAAAGGAAATACTAAAAATAAACTATGACTAGGACTTGAATTAATTTCCAGACACATATCTAACGTATCCCTGATATGGCAGAGGGCTAGCAGGGGTGGTTTGGTAAGAGCTGGGCTGGACTGGTGAGACCATTGAGTGAAAGCTACTGTCACACAAACCTGCTGACCTTAGTCTGGTCTCTGGAGTCTACCAAAGCGGGAGGACTCTGACCAGAGCATTCAAGCGCACACTAATTACAGAATGGGACACTCCAGAAAGCATCACCCCATGGGGGAAGCACTTGCTCAAACCCATAGAAACGAAAAAAGAGCCAGGTCAGCTGCTAGGCTTTATTTCAGACATGAATTTTTGATGTTATAGATGTTTGGGGGTAAAGTTCTACAGAAAAATACGCATTTCTGCAAACAGTTTTGTAAACTAGAATTCATTAAAGCCTCTGTCGCAACCATG"
t2="GAAACCGGAAGCCTCGCGTGCTGAGCAGTGGTTTGTGCTGTTGATGCGTGGTTATGGGTCGGAGACGAGCGCCGGGTGGTGGGTCCCTGGGCCGAGTACTTATCCGGCAGCAGACTCAGCGAAGCCGCAGCCACCGTCATACTGACTCCTGGAGAAGCTGAATATTAAGTTTGTACCCCCTGAGGCTAGAACCGGACTGCTGTCTTTTGAGGAGAGCCAGAGAATTAAGAAACTCCACGAAGAAAACAGACAGTTCTTGTGTATACCAAGGAGGCCAAATTGGGACAGAAAAACTAGCCCAGAAGAACTGAAACAAGCAGAGAAAGATAACTTCCTGAAATGGAGGCGCCAGCTTGTGCGGCTAGAAGAGGAACAGAAGTTGATATTGACTCCCTTTGAACGGAACTTGGACTTCTGGCGCCAGCTCTGGAGGGTCATTGAGAGAAGATGGCCTGGAACTCAACTCTAGACCAGCCTGGACCTGAAGTCACAGAGATCTGCTGCTTCAGTCTCCCAAGTACTGGGATTAAAGACCTGTGCCAACAATGATATTGTGGTCCAGATCGTAGATGCTCGAAACCCGCTCCTGTTTAGATGTGAGGATTTGGAGTGTTACGTGAAAGAGATCGATGCTGCCAAGGAGAACGTGATCCTGATAAACAAGGCAGACTTGCTGACTGCTGAGCAGCGGTTTGCCTGGGCTGTGCACTTTGAAAAGGAAGGGGTAAAGGTTATCTTCTGGTCAGCTTTGGCAGAAACTGATCACCTAAATGGTGACTTGAAGGAGGAAGTAGACAGTGTTGCTGGAGACACCAACAAAACAGAGTCTGAAAGCTCCAGTTTGGATGCGAATGAAATCCCACACAGAGACTTGATCTCGCTTAGTGAGGAGTCCGCAAGTGACAGTGGTGACAGCAAATATGAGGACTGTCAGGAGGACGAAGAGGAAGATTGGCAGACGTGTTCAGAAGAAGACAGCGTCCCTGAGGAGGAGGAGGGCTGTAATGCGGATTCTGAGACTCAGAACAGAAAAAACGCAGAAAATCAACAAGTGAACAATGATAGCTACCTGGTGTCGAAACAGGAATTACTGGAACTCTTTAAGAAACTCCACACTGGAAAGAAAGTGAAAGATGGACAGCTAACTGTTGGGCTGGTGGGCTACCCTAATGTTGGTAAGAGCTCAACCATCAACACCATCATGGGCAACAAGAAAGTATCGGTGTCTGCCACGCCAGGCCACACCAAGCATTTCCAGACCCTGTACGTGGAGCCTGGCCTCTGCCTGTGTGACTGCCCGGGCCTGGTGATGCCATCTTTCGTGTCCACCAAAGCAGAGATGATTTGCAATGGGATCCTCCCTATCGACCAGATGAGAGACCACGTTCCACCCGTATCACTAGTTTGCCAGAATATCCCACGGCGAGTTTTGGAAGTGACATATGGTATTAACATTATAAAGCCTAGAGAAGATGAAGACCCCTACCGACCTCCAACCTCAGAAGAACTGTTGACGGCTTATGGATGCATGAGAGGATTCATGACAGCACATGGGCAGCCAGACCAGCCTCGATCTGCACGGTACATCCTAAAGGACTATGTTGGAGGTAAACTACTGTATTGCCACCCTCCTCCTGGAAAAGACCCTGTGGCGTTCCAGCACCAACACCAGCAGCTCCTAGAGAGCAAAGTAAAAGGTGGAGAACTGAGACTTCAGCCGGGCAAAGGCCGAAAAGCCAAACAGATCGAAAATGTAGTCGACAAAACTTTTTTCCATCAGGAGAACGTCAGAGCACTGACCAAAGGTGTCCAGGCTGTCATGGGTTACAAGCCCGGGCATGGCCTGGTGACTGCGGCTGCTGCGAGCGCTGAGAATGTACCTGGGAAGCCCTGGAAAAAACACGGTAACAGAAACAAAAAAGAGAAAAGCCGTAGGCTCTACAAGCACCTGGATGTGTGAGCTCGGCTGAACAGAGACGGCAGGCAGCTGCGCTGTGCGGGAGGACACGAGCTCAAACCGCCACTGCTGCTGTCGCTGCCGTGCTGCTCTCCTGACCACTGCACTGTGGACTGGACCCTGCTCTCCTGGAGAGCACAGTCGCACCAGAAGTCTTGACATCAAGACAGCCTTTCGGAAGCACCTGCTGTGGCCGTGGTAGTCATGGGGTGCCCCAGAAGTCTCATCCTGCAGAAGGATGGTGGATATGTGTACAAGCATACAAATGCACATTGACTTTGAGGTTTTGGGAAGTTATTAAAATTAATTTTCCAC"
t3="AGTGGTTTGTGCTGTTGATGCGTGGTTATGGGTCGGAGACGAGCGCCGGGTGGTGGGTCCCTGGGCCGAGTACTTATCCGGCAGCAGACTCAGCGAAGCCGCAGCCACCGTCATACTGACTCCTGGCTGCATACAAGTGAACTCAATGATGGCTATGACTGGGGTCGTCTGAATCTTCAGTCTGTGACAGAGCAGAGCTCCCTGGAGGACTTTCTTGCAACTGCAGAACTTGCAGGGACCGAGTTTGTGGCTGAGAAGCTGAATATTAAGTTTGTACCCCCTGAGGCTAGAACCGGACTGCTGTCTTTTGAGGAGAGCCAGAGAATTAAGAAACTCCACGAAGAAAACAGACAGTTCTTGTGTATACCAAGGAGGCCAAATTGGGACAGAAAAACTAGCCCAGAAGAACTGAAACAAGCAGAGAAAGATAACTTCCTGAAATGGAGGCGCCAGCTTGTGCGGCTAGAAGAGGAACAGAAGTTGATATTGACTCCCTTTGAACGGAACTTGGACTTCTGGCGCCAGCTCTGGAGGGTCATTGAGAGAAGTGATATTGTGGTCCAGATCGTAGATGCTCGAAACCCGCTCCTGTTTAGATGTGAGGATTTGGAGTGTTACGTGAAAGAGATCGATGCTGCCAAGGAGAACGTGATCCTGATAAACAAGGCAGACTTGCTGACTGCTGAGCAGCGGTTTGCCTGGGCTGTGCACTTTGAAAAGGAAGGGGTAAAGGTTATCTTCTGGTCAGCTTTGGCAGAAACTGATCACCTAAATGGTGACTTGAAGGAGGAAGTAGACAGTGTTGCTGGAGACACCAACAAAACAGAGTCTGAAAGCTCCAGTTTGGATGCGAATGAAATCCCACACAGAGACTTGATCTCGCTTAGTGAGGAGTCCGCAAGTGACAGTGGTGACAGCAAATATGAGGACTGTCAGGAGGACGAAGAGGAAGATTGGCAGACGTGTTCAGAAGAAGACAGCGTCCCTGAGGAGGAGGAGGGCTGTAATGCGGATTCTGAGACTCAGAACAGAAAAAACGCAGAAAATCAACAAGTGAACAATGATAGCTACCTGGTGTCGAAACAGGAATTACTGGAACTCTTTAAGAAACTCCACACTGGAAAGAAAGTGAAAGATGGACAGCTAACTGTTGGGCTGGTGGGCTACCCTAATGTTGGTAAGAGCTCAACCATCAACACCATCATGGGCAACAAGAAAGTATCGGTGTCTGCCACGCCAGGCCACACCAAGCATTTCCAGACCCTGTACGTGGAGCCTGGCCTCTGCCTGTGTGACTGCCCGGGCCTGGTGATGCCATCTTTCGTGTCCACCAAAGCAGAGATGATTTGCAATGGGATCCTCCCTATCGACCAGATGAGAGACCACGTTCCACCCGTATCACTAGTTTGCCAGAATATCCCACGGCGAGTTTTGGAAGTGACATATGGTATTAACATTATAAAGCCTAGAGAAGATGAAGACCCCTACCGACCTCCAACCTCAGAAGAACTGTTGACGGCTTATGGATGCATGAGAGGATTCATGACAGCACATGGGCAGCCAGACCAGCCTCGATCTGCACGGTACATCCTAAAGGACTATGTTGGAGTGAGTGAACCTGGTCCTGATAGAATGAGCAAGTTTGCTTTTCCCTGCACTGTGTTCTCAGTAATGTCAACCTGTTAAAAAGATGTTTTCTCATTGAGATGGACTGATACCCAAGTGGAGTGTCTTGTTTTTAGTGATGCTAGGGATTGACCTGTGGGCCTTAAACTGAGCCCCAGCCCCGGCCCTCAGAAACAATCTTGATTTCAGACTCTAACCTAGTATTCCCAGGTTTAGTGTAAAGAATACTGAAGAAGCCAGGCATGGTGGTGCACGCCTTTAATCCCAGCACTTGGGAGGCAGAGGCAGGCAGATTTCTGAGTTCCAGGACAGCCAGAGCTACACAGAGAAACCCTGTCTCGAAAAACCAAAAAAAAGAATACTGAAGAATACTTAGTAGAAATGAAGTACACGTGAAAAGGCAGGCCACAGAGAAGGCAAATTGCGTGTAAGCTCTGGGCCCTGGACGGGCTAGTATGTGAAAGTTAGGGCAGTTACTGAAGCTCCACTTGCCACAGCTGCTTCAGCCTGTGCCGGGTTACTACATACTGCCTTGAGGACCTTTGCCCTCCTGGGGTGACACAGGTACACTGTCACTCTGCTCTCTCTGCAGGGTAAACTACTGTATTGCCACCCTCCTCCTGGAAAAGACCCTGTGGCGTTCCAGCACCAACACCAGCAGCTCCTAGAGAGCAAAGTAAAAGGTGGAGAACTGAGACTTCAGCCGGGCAAAGGCCGAAAAGCCAAACAGATCG"
t4="GGAGAAACCGGAAGCCTCGCGTGCTGAGCAGTGGTTTGTGCTGTTGATGCGTGGTTATGGGTCGGAGACGAGCGCCGGGTGGTGGGTCCCTGGGCCGAGTACTTATCCGGCAGCAGACTCAGCGAAGCCGCAGCCACCGTCATACTGACTCCTGGCTGCATACAAGTGAACTCAATGATGGCTATGACTGGGGTCGTCTGAATCTTCAGTCTGTGACAGAGCAGAGCTCCCTGGAGGACTTTCTTGCAACTGCAGAACTTGCAGGGACCGAGTTTGTGGCTGAGAAGCTGAATATTAAGTTTGTACCCCCTGAGGCTAGAACCGGACTGCTGTCTTTTGAGGAGAGCCAGAGAATTAAGAAACTCCACGAAGAAAACAGACAGTTCTTGTGTATACCAAGGAGGCCAAATTGGGACAGAAAAACTAGCCCAGAAGAACTGAAACAAGCAGAGAAAGATAACTTCCTGAAATGGAGGCGCCAGCTTGTGCGGCTAGAAGAGGAACAGAAGTTGATATTGACTCCCTTTGAACGGAACTTGGACTTCTGGCGCCAGCTCTGGAGGGTCATTGAGAGAAGATGGCCTGGAACTCAACTCTAGACCAGCCTGGACCTGAAGTCACAGAGATCTGCTGCTTCAGTCTCCCAAGTACTGGGATTAAAGACCTGTGCCAACAAGTATAGCCTTAATGTTTTTCCAAAAGGATTCAGTTTCATCCTGACTGCATAACACCTTCAAGTCCTGCTTAAAAAAAAAAAAATCTTAAATAGCTTAAAGTGATATTGTGGTCCAGATCGTAGATGCTCGAAACCCGCTCCTGTTTAGATGTGAGGATTTGGAGTGTTACGTGAAAGAGATCGATGCTGCCAAGGAGAACGTGATCCTGATAAACAAGGCAGACTTGCTGACTGCTGAGCAGCGGTTTGCCTGGGCTGTGCACTTTGAAAAGGAAGGGGTAAAGGTTATCTTCTGGTCAGCTTTGGCAGAAACTGATCACCTAAATGGTGACTTGAAGGAGGAAGTAGACAGTGTTGCTGGAGACACCAACAAAACAGAGTCTGAAAGCTCCAGTTTGGATGCGAATGAAATCCCACACAGAGACTTGATCTCGCTTAGTGAGGAGTCCGCAAGTGACAGTGGTGACAGCAAATATGAGGACTGTCAGGAGGACGAAGAGGAAGATTGGCAGACGTGTTCAGAAGAAGACAGCGTCCCTGAGGAGGAGGAGGGCTGTAATGCGGATTCTGAGACTCAGAACAGAAAAAACGCAGAAAATCAACAAGTGAACAATGATAGCTACCTGGTGTCGAAACAGGAATTACTGGAACTCTTTAAGAAACTCCACACTGGAAAGAAAGTGAAAGATGGACAGCTAACTGTTGGGCTGGTGGGCTACCCTAATGTTGGTAAGAGCTCAACCATCAACACCATCATGGGCAACAAGAAAGTATCGGTGTCTGCCACGCCAGGCCACACCAAGCATTTCCAGACCCTGTACGTGGAGCCTGGCCTCTGCCTGTGTGACTGCCCGGGCCTGGTGATGCCATCTTTCGTGTCCACCAAAGCAGAGATGATTTGCAATGGGATCCTCCCTATCGACCAGATGAGAGACCACGTTCCACCCGTATCACTAATATCCTTTCTGCTTCAGTCCACTCTGGTTATTGATGTGAGTGAGGGCTCTGGGGATTTTCAGGGCACTGCCTTCAAACAGCATAGTTATAGGCATTAGGGACAAAGGTGTGGTTCTTTGTATCTCTGTTAGGATTATCTCTCAAAGGTAAGAGTTTGTTTATAATAATGTTCCTTAAGGTAATGTAGTTAAGATCTATCTGTTTTAGATTTGTTGATTTTTATTTTATGTGTGTGTGTTTCCCTGTACCTTTATCTGTACACTACTCACATGTCTGTTGCCCTTGACGGTCAGAAGAGAGCATTGGATCCCCTGAACTGGAGTTACAGACAGTTGTGAACCTCTTTGTAGGTGCTGGGAATCGAACCCTGGTCCTGCAAGAGAAGCAAGTGTGATTACCCACTGAACTATCTTTCTAGCCCCAGAGATAATATTTTTTGAAACTTTTAAAAATTTTTTTAATTTATTTTTTACACTCTATATTCCATTCCCCACCCCCACCCCCGAAATAATATTTTAATGATTGTTTTATGTACGTTTAGAGACTAGGCTTTACTATGTTGCTCAGGCTAGCCCAGAATGCACTGTGTTACCCAAACTAGCCTTAAACTCAGACTCCTCCTCAACCTTCCTAGTGCTGAAATTACAGGTATGAGTTACCATGCCTGGCTTGGTTTTTGGTTGGTTTGGTTCAGGGTTTTTTTGGTTTGGGGAGGGAGTGTTGGGAGGGTCTGGGGGAGTTGAGGGAGCTGTGCGAAGGTTAAGACAGTTTTTCTATGTAGCCCTAGCTGTCCTGGAACTCGATCTGTAGACTACCTGGCCTCAAACTCACAGAGATCTCCCTGCCTCTGCCTCCGAGTGCTGGGATTAAAGGCATGGGCCGGGCAGGAGAGATGGCTCAGCAGTTAAGAACACTGACTGTTCTTCCAGTGGTCCTGAGTTCAAAAATCCAGCAACCATATGGTGGCTCACAACCATCTGTAATGAGACCTGACGCCTTCTTCTGGTGTGTCTGAAAACAGCTACAGTGTATTCATATACGTTAAATAAATAATAATTTAAATAAATAAATAAAGGTATGGGCCACCACCG"
In [94]:
a="GTGTACTTACATATAATAATAAATAAATAATAAATAAATCTTAAAAAAAAAAAAAAAAAA"
In [91]:
a="GTGTA"
In [102]:
def query(a, t):
ind = []
for x in range(len(t)-len(a)):
if t[x:x+len(a)] == a:
ind.append(x)
for x in ind:
print t[x:x+60]
print "\n"
In [113]:
query("GTGTA", t4)
In [46]:
gtf['mchr16'].keys()
Out[46]:
In [ ]:
In [3]:
#new txptome testing
In [2]:
tx = set(pd.read_table(open('../data/reReads.txt'), header=None)[0].tolist())
ge = set(pd.read_table(open('../data/geReads.txt'), header=None)[0].tolist())
In [5]:
onlyGe = ge-tx
onlyTx = tx-ge
ref = Fasta("../data/ref.fa")
txSam = pysam.AlignmentFile("../data/rsem.sam", "r")
lens = txSam.lengths
txPairs = defaultdict(list)
for read in txSam:
if read.qname in onlyTx:
txPairs[read.qname].append((read.pos, lens[read.rname]))
In [6]:
len(txPairs), sum([len(x) for x in txPairs.values()])
Out[6]:
In [8]:
#get min of the distance from 5' and 3'
dis = []
for _,pair in txPairs.items():
minDist = 200000
for (k,v) in pair:
minDist = min(v-k, k, minDist)
dis.append(minDist)
plt.hist(dis, log=True)
Out[8]:
In [9]:
#get min of the distance from 5' and 3'
dis = []
for _,pair in txPairs.items():
minDist = 200000
for (k,v) in pair:
minDist = min(v-k, k, minDist)
if minDist < 1000:
dis.append(minDist)
plt.hist(dis, log=True)
Out[9]:
In [11]:
geSam = pysam.AlignmentFile("../data/gene.sam", "r")
wiSam = pysam.AlignmentFile("../data/onlyReGene.sam", "w", template=geSam)
gePairs = []
for read in geSam.fetch():
if read.qname in onlyGe:
wiSam.write(read)
In [3]:
onlyGe = ge-tx
onlyTx = tx-ge
In [4]:
with open('../data/reOnly.txt', 'w') as f:
for x in onlyGe:
f.write(x+"\n")
In [5]:
len(onlyGe)
Out[5]:
In [ ]:
In [ ]: