RNA-Seq detects many novel splice variants :: Analysis

Prerequisite

Before running the pipeline please make sure all required software packages are installed.

Please see README.md for a list of required software.

Seup a working directory and others


In [1]:
%matplotlib inline

In [5]:
import matplotlib.pyplot as plt
import matplotlib

In [11]:
import json
s = json.load( open("bmh_matplotlibrc.json") )
matplotlib.rcParams.update(s)

In [3]:
cd ../results


/Users/Likit/projects/gimme/results

Sizes of transcripts


In [12]:
%run ../protocol/get_total_seq_size.py line6u_global.fa.clean.nr


90705	77454439

In [13]:
%run ../protocol/get_total_seq_size.py line6i_global.fa.clean.nr


104785	86622623

In [14]:
%run ../protocol/get_total_seq_size.py line7u_global.fa.clean.nr


90125	76566717

In [15]:
%run ../protocol/get_total_seq_size.py line7i_global.fa.clean.nr


92192	74957624

In [16]:
%run ../protocol/get_total_seq_size.py line6u_local.fa.clean.nr


68845	36662830

In [17]:
%run ../protocol/get_total_seq_size.py line6i_local.fa.clean.nr


70191	37877766

In [18]:
%run ../protocol/get_total_seq_size.py line7u_local.fa.clean.nr


63302	33571348

In [19]:
%run ../protocol/get_total_seq_size.py line7i_local.fa.clean.nr


67097	33824849

Unique sequences between global and local assembly


In [20]:
%run ../protocol/get_total_seq_size.py line6u_global.fa.clean.nr.uniq


17754	3686835

In [21]:
%run ../protocol/get_total_seq_size.py line6i_global.fa.clean.nr.uniq


18883	4157541

In [22]:
%run ../protocol/get_total_seq_size.py line7u_global.fa.clean.nr.uniq


18129	4180202

In [23]:
%run ../protocol/get_total_seq_size.py line7i_global.fa.clean.nr.uniq


19734	4242922

In [24]:
%run ../protocol/get_total_seq_size.py line6u_local.fa.clean.nr.uniq


1634	307975

In [25]:
%run ../protocol/get_total_seq_size.py line6i_local.fa.clean.nr.uniq


2550	400702

In [26]:
%run ../protocol/get_total_seq_size.py line7u_local.fa.clean.nr.uniq


2007	365850

In [27]:
%run ../protocol/get_total_seq_size.py line7i_local.fa.clean.nr.uniq


1619	326169

Homology search for unique sequences

To find if a unique region from global or local assembly are part of a gene, we search for homologous sequences in mouse proteins using BLASTX.

To obtain a significant hit, we only select unique regions longer than 300bp.


In [28]:
def blast_match(aligned_bases, total_bases):
    print 'Total long unique sequences matched with mouse proteins = %d/%d (%.2f%%)' % \
                (aligned_bases, total_bases, float(aligned_bases)/total_bases*100.0)

In [29]:
def get_match_bases(infile):
    bases = 0
    for line in open(infile):
        base = line.split('\t')[-1]
        bases += int(base)
    return bases

In [30]:
line6u_global_uniq_match = get_match_bases('line6u_global.fa.clean.nr.uniq.long.xml.out')
line6i_global_uniq_match = get_match_bases('line6i_global.fa.clean.nr.uniq.long.xml.out')
line7u_global_uniq_match = get_match_bases('line7u_global.fa.clean.nr.uniq.long.xml.out')
line7i_global_uniq_match = get_match_bases('line7i_global.fa.clean.nr.uniq.long.xml.out')

In [31]:
line6u_local_uniq_match = get_match_bases('line6u_local.fa.clean.nr.uniq.long.xml.out')
line6i_local_uniq_match = get_match_bases('line6i_local.fa.clean.nr.uniq.long.xml.out')
line7u_local_uniq_match = get_match_bases('line7u_local.fa.clean.nr.uniq.long.xml.out')
line7i_local_uniq_match = get_match_bases('line7i_local.fa.clean.nr.uniq.long.xml.out')

In [32]:
line6u_global_uniq_long = !python ../protocol/get_total_seq_size.py line6u_global.fa.clean.nr.uniq.long
line6i_global_uniq_long = !python ../protocol/get_total_seq_size.py line6i_global.fa.clean.nr.uniq.long
line7u_global_uniq_long = !python ../protocol/get_total_seq_size.py line7u_global.fa.clean.nr.uniq.long
line7i_global_uniq_long = !python ../protocol/get_total_seq_size.py line7i_global.fa.clean.nr.uniq.long

In [33]:
line6u_local_uniq_long = !python ../protocol/get_total_seq_size.py line6u_local.fa.clean.nr.uniq.long
line6i_local_uniq_long = !python ../protocol/get_total_seq_size.py line6i_local.fa.clean.nr.uniq.long
line7u_local_uniq_long = !python ../protocol/get_total_seq_size.py line7u_local.fa.clean.nr.uniq.long
line7i_local_uniq_long = !python ../protocol/get_total_seq_size.py line7i_local.fa.clean.nr.uniq.long

In [34]:
line6u_global_uniq_long = int(line6u_global_uniq_long[0].split('\t')[1])
line6i_global_uniq_long = int(line6i_global_uniq_long[0].split('\t')[1])
line7u_global_uniq_long = int(line7u_global_uniq_long[0].split('\t')[1])
line7i_global_uniq_long = int(line7i_global_uniq_long[0].split('\t')[1])

In [35]:
line6u_local_uniq_long = int(line6u_local_uniq_long[0].split('\t')[1])
line6i_local_uniq_long = int(line6i_local_uniq_long[0].split('\t')[1])
line7u_local_uniq_long = int(line7u_local_uniq_long[0].split('\t')[1])
line7i_local_uniq_long = int(line7i_local_uniq_long[0].split('\t')[1])

In [36]:
blast_match(line6u_global_uniq_match, line6u_global_uniq_long)


Total long unique sequences matched with mouse proteins = 260321/1285929 (20.24%)

In [37]:
blast_match(line6i_global_uniq_match, line6i_global_uniq_long)


Total long unique sequences matched with mouse proteins = 312849/1631356 (19.18%)

In [38]:
blast_match(line7u_global_uniq_match, line7u_global_uniq_long)


Total long unique sequences matched with mouse proteins = 349346/1800634 (19.40%)

In [39]:
blast_match(line7i_global_uniq_match, line7i_global_uniq_long)


Total long unique sequences matched with mouse proteins = 296915/1611354 (18.43%)

In [40]:
blast_match(line6u_local_uniq_match, line6u_local_uniq_long)


Total long unique sequences matched with mouse proteins = 9413/96830 (9.72%)

In [41]:
blast_match(line6i_local_uniq_match, line6i_local_uniq_long)


Total long unique sequences matched with mouse proteins = 5132/59813 (8.58%)

In [42]:
blast_match(line7u_local_uniq_match, line7u_local_uniq_long)


Total long unique sequences matched with mouse proteins = 9883/104229 (9.48%)

In [43]:
blast_match(line7i_local_uniq_match, line7i_local_uniq_long)


Total long unique sequences matched with mouse proteins = 9381/125640 (7.47%)

Splice junctions analysis

Different Isoforms from different hash length

The analysis compares splice junctions found in transcripts from k=21 and k=31.


In [44]:
k21_junctions = !cut -f 1 global_k21.fa.clean.nr.psl.best_all_sp.txt
k31_junctions = !cut -f 1 global_k31.fa.clean.nr.psl.best_all_sp.txt
k21_junctions = set(k21_junctions)
k31_junctions = set(k31_junctions)
print "Total junctions from k21 = %d, k31 = %d" % (len(k21_junctions), len(k31_junctions))
diff_junctions = list(k21_junctions.difference(k31_junctions))
print "Different junctions between k21 and k31 = %d junctions" % len(diff_junctions)


Total junctions from k21 = 103127, k31 = 97523
Different junctions between k21 and k31 = 10335 junctions

Oases-M vs conventional assembly


In [45]:
est_junctions = !cut -f 1 est.psl.best_all_sp.txt
est_junctions = set(est_junctions)

In [46]:
oasesM_junctions = !cut -f 1 oases-M.psl.best.all_sp.txt
oasesM_junctions = set(oasesM_junctions)

In [47]:
assembly_junctions = !cut -f 1 global_assembly.clean.nr.psl.best_all_sp.txt
assembly_junctions = set(assembly_junctions)

In [48]:
print 'Total junctions from Oases M = %d' % len(oasesM_junctions)
print 'Total junctions from global assembly = %d' % len(assembly_junctions)


Total junctions from Oases M = 111273
Total junctions from global assembly = 112708

In [49]:
print 'Unique Oases M junctions = %d' % (len(oasesM_junctions.difference(assembly_junctions)))


Unique Oases M junctions = 6860

In [50]:
print 'Unique global assembly junctions = %d' % (len(assembly_junctions.difference(oasesM_junctions)))


Unique global assembly junctions = 8295

In [51]:
print 'Unique Oases M junctions supported by ESTs = %d' % len(oasesM_junctions.difference(assembly_junctions).intersection(est_junctions))


Unique Oases M junctions supported by ESTs = 420

In [52]:
print 'Unique global assembly junctions supported by ESTs = %d' % \
    len(assembly_junctions.difference(oasesM_junctions).intersection(est_junctions))


Unique global assembly junctions supported by ESTs = 1608

Load all splice junctions data


In [53]:
gimme_junctions = !cut -f 1 all_assembly_models.bed_all_sp.txt
gimme_junctions = set(gimme_junctions)

In [54]:
ensembl_junctions = !cut -f 1 Gallus_gallus.WASHUC2.64.bed_all_sp.txt
ensembl_junctions = set(ensembl_junctions)

In [55]:
gimme_cuff_junctions = !cut -f 1 all_assembly_cufflinks_models.bed_all_sp.txt
gimme_cuff_junctions = set(gimme_cuff_junctions)

In [56]:
gimme_cuff_ensembl_junctions = !cut -f 1 all_assembly_cufflinks_ensembl_models.bed_all_sp.txt
gimme_cuff_ensembl_junctions = set(gimme_cuff_ensembl_junctions)

In [57]:
cufflinks_junctions = !cut -f 1 cufflinks.bed_all_sp.txt
cufflinks_junctions = set(cufflinks_junctions)

In [58]:
global_junctions = !cut -f 1 global_assembly_models.bed_all_sp.txt
global_junctions = set(global_junctions)
local_junctions = !cut -f 1 local_assembly_models.bed_all_sp.txt
local_junctions = set(local_junctions)

In [59]:
mrna_junctions = !cut -f 1 mrna.psl.best_all_sp.txt
mrna_junctions = set(mrna_junctions)

In [60]:
selected_chroms = ['chr1','chr10','chr11','chr12','chr13',
                   'chr14','chr15','chr16','chr17','chr18',
                   'chr19','chr2','chr3','chr4','chr5','chr6',
                   'chr7','chr8','chr9','chrX']

In [61]:
mus_gimme_junctions = !cut -f 1 mouse_assembly_models.bed_all_sp.txt
tmp = []
for ss in mus_gimme_junctions:
    chrom = ss.split(':')[0]
    if chrom in selected_chroms:
        tmp.append(ss)
mus_gimme_junctions = set(tmp)

In [62]:
mus_cuff_junctions = !cut -f 1 mouse_cufflinks.bed_all_sp.txt
mus_cuff_junctions = set(mus_cuff_junctions)

In [63]:
mus_ensembl_junctions = !cut -f 1 Mus_musculus.NCBIM37.64.bed_all_sp.txt
tmp = []
for ss in mus_ensembl_junctions:
    chrom = ss.split(':')[0]
    if chrom in selected_chroms:
        tmp.append(ss)

mus_ensembl_junctions = set(tmp)

All Venn diagram


In [64]:
from matplotlib_venn import venn3, venn3_circles, venn2, venn2_circles
from collections import Counter

In [65]:
union = cufflinks_junctions.union(gimme_junctions).union(ensembl_junctions)
indicators = ['%d%d%d' % (a in cufflinks_junctions, a in gimme_junctions, a in ensembl_junctions) for a in union]
subsets = Counter(indicators)

In [73]:
figsize(8,16)
v = venn3([ensembl_junctions, gimme_junctions, cufflinks_junctions],
          set_labels=('', '', ''))

fontsize = 16
v.get_label_by_id('100').set_fontsize(fontsize)
v.get_label_by_id('110').set_fontsize(fontsize)
v.get_label_by_id('101').set_fontsize(fontsize)
v.get_label_by_id('111').set_fontsize(fontsize)
v.get_label_by_id('011').set_fontsize(fontsize)
v.get_label_by_id('001').set_fontsize(fontsize)
v.get_label_by_id('010').set_fontsize(fontsize)

c = venn3_circles([ensembl_junctions, gimme_junctions, cufflinks_junctions],
                  linestyle='solid', lw=2.5, color='white')
font = {'size':18}
text(-0.9, 0.3, 'Ensembl', fontdict=font)
text(0.35, -0.5, 'Cufflinks', fontdict=font)
text(0.3, 0.45, 'Gimme', fontdict=font)

legend_ls = {'linewidth':12, 'alpha':0.5}
legend_font = {'size':16}
hlines(0.3,1.0,0.6, color=v.get_patch_by_id('100').get_facecolor(), **legend_ls)
hlines(0.25,1.0,0.6, color=v.get_patch_by_id('110').get_facecolor(), **legend_ls)
hlines(0.2,1.0,0.6, color=v.get_patch_by_id('101').get_facecolor(), **legend_ls)
hlines(0.15,1.0,0.6, color=v.get_patch_by_id('111').get_facecolor(), **legend_ls)
hlines(0.1,1.0,0.6, color=v.get_patch_by_id('011').get_facecolor(), **legend_ls)
hlines(0.05,1.0,0.6, color=v.get_patch_by_id('001').get_facecolor(), **legend_ls)
hlines(0.0,1.0,0.6, color=v.get_patch_by_id('010').get_facecolor(), **legend_ls)

text(0.7,0.29, 'Ensembl', fontdict=legend_font)
text(0.7,0.24, 'Ensembl + Gimme', fontdict=legend_font)
text(0.7,0.19, 'Ensembl + Cufflinks', fontdict=legend_font)
text(0.7, 0.14, 'Ensembl + Cufflinks + Gimme', fontdict=legend_font)
text(0.7, 0.09, 'Cufflinks + Gimme', fontdict=legend_font)
text(0.7, 0.04, 'Cufflinks', fontdict=legend_font)
text(0.7, -0.01, 'Gimme', fontdict=legend_font)

savefig('chick_venn.pdf',bbox_inches='tight', pad_inches=0.1)


Mouse data


In [74]:
fig = plt.figure(0,figsize=(12,8))
ax1 = fig.add_subplot(1,2,1)
ax2 = fig.add_subplot(1,2,2)
fig.tight_layout()

v1 = venn2([mus_ensembl_junctions, mus_cuff_junctions],
           set_labels=('', ''), ax=ax1)
v1.get_label_by_id('10').set_fontsize(18)
v1.get_label_by_id('01').set_fontsize(18)
v1.get_label_by_id('11').set_fontsize(18)
v1.get_patch_by_id('10').set_color('green')
v1.get_patch_by_id('01').set_color('blue')
v1.get_patch_by_id('11').set_color('yellow')

c1 = venn2_circles([mus_ensembl_junctions, mus_cuff_junctions],
                   linestyle='solid', lw=2.5, ax=ax1, color='white')

font = {'size':18}
#ax1.text(0, 0.65, 'A', fontdict=font)
ax1.text(-0.6, -0.55, 'Ensembl', fontdict=font)
ax1.text(0.55, -0.1, 'Cufflinks', fontdict=font)

v2 = venn2([mus_ensembl_junctions, mus_gimme_junctions],
           set_labels=('', ''), ax=ax2)
v2.get_patch_by_id('10').set_color('green')
v2.get_patch_by_id('01').set_color('blue')
v2.get_patch_by_id('11').set_color('red')
v2.get_label_by_id('10').set_fontsize(16)
v2.get_label_by_id('01').set_fontsize(16)
v2.get_label_by_id('11').set_fontsize(16)
c2 = venn2_circles([mus_ensembl_junctions, mus_gimme_junctions],
                   linestyle='solid', lw=2.5, ax=ax2, color='white')

#ax2.text(0, 0.65, 'B', fontdict=font)
ax2.text(-0.4, -0.6, 'Ensembl', fontdict=font)
ax2.text(0.3, -0.45, 'Gimme', fontdict=font)
plt.savefig('mouse_venn.pdf',bbox_inches='tight', pad_inches=0.1)



In [68]:
est_mrna_not_in_ensembl_junctions = est_junctions.union(mrna_junctions).difference(ensembl_junctions)

In [73]:
fig = plt.figure(figsize=(8,16))
v = venn3([cufflinks_junctions, gimme_junctions, est_junctions.union(mrna_junctions)],
          set_labels=('', '', ''))

fontsize = 16
v.get_label_by_id('100').set_fontsize(fontsize)
v.get_label_by_id('110').set_fontsize(fontsize)
v.get_label_by_id('101').set_fontsize(fontsize)
v.get_label_by_id('111').set_fontsize(fontsize)
v.get_label_by_id('011').set_fontsize(fontsize)
v.get_label_by_id('001').set_fontsize(fontsize)
v.get_label_by_id('010').set_fontsize(fontsize)

c = venn3_circles([cufflinks_junctions, gimme_junctions, est_junctions.union(mrna_junctions)],
                  linestyle='solid', lw=2.5, color='white')
font = {'size':18}
plt.text(-0.6, 0.4, 'Cufflinks', fontdict=font)
plt.text(0.3, -0.7, 'ESTs or mRNAs', fontdict=font)
plt.text(0.45, 0.40, 'Gimme', fontdict=font)

legend_ls = {'linewidth':12, 'alpha':0.5}
legend_font = {'size':16}

x = 0.2
plt.hlines(0.3-x,1.0,0.6, color=v.get_patch_by_id('100').get_facecolor(), **legend_ls)
plt.hlines(0.25-x,1.0,0.6, color=v.get_patch_by_id('110').get_facecolor(), **legend_ls)
plt.hlines(0.2-x,1.0,0.6, color=v.get_patch_by_id('101').get_facecolor(), **legend_ls)
plt.hlines(0.15-x,1.0,0.6, color=v.get_patch_by_id('111').get_facecolor(), **legend_ls)
plt.hlines(0.1-x,1.0,0.6, color=v.get_patch_by_id('011').get_facecolor(), **legend_ls)
plt.hlines(0.05-x,1.0,0.6, color=v.get_patch_by_id('001').get_facecolor(), **legend_ls)
plt.hlines(0.0-x,1.0,0.6, color=v.get_patch_by_id('010').get_facecolor(), **legend_ls)

plt.text(0.65,0.29-x, 'Cufflinks', fontdict=legend_font)
plt.text(0.65,0.24-x, 'Cufflinks + Gimme', fontdict=legend_font)
plt.text(0.65,0.19-x, 'Cufflinks + ESTs', fontdict=legend_font)
plt.text(0.65,0.14-x, 'Cufflinks + Gimme + ESTs', fontdict=legend_font)
plt.text(0.65,0.09-x, 'Gimme + ESTs', fontdict=legend_font)
plt.text(0.65,0.04-x, 'ESTs', fontdict=legend_font)
plt.text(0.65,-0.01-x, 'Gimme', fontdict=legend_font)

plt.savefig('chick_est_venn.pdf',bbox_inches='tight', pad_inches=0.1)



In [75]:
len(cufflinks_junctions.union(gimme_junctions).intersection(est_mrna_not_in_ensembl_junctions))


Out[75]:
11989

In [76]:
len(est_mrna_not_in_ensembl_junctions)


Out[76]:
101847

In [77]:
len(est_junctions.difference(ensembl_junctions)), len(mrna_junctions.difference(ensembl_junctions))


Out[77]:
(97740, 13987)

In [78]:
fig = plt.figure(0, figsize=(12,8))
ax1 = fig.add_subplot(1,2,1)
ax2 = fig.add_subplot(1,2,2)
fig.tight_layout()

v1 = venn2([cufflinks_junctions, est_mrna_not_in_ensembl_junctions],
           set_labels=('', ''), ax=ax1)
v1.get_label_by_id('10').set_fontsize(18)
v1.get_label_by_id('01').set_fontsize(18)
v1.get_label_by_id('11').set_fontsize(18)
v1.get_patch_by_id('10').set_color('green')
v1.get_patch_by_id('01').set_color('blue')
v1.get_patch_by_id('11').set_color('yellow')

c1 = venn2_circles([cufflinks_junctions, est_mrna_not_in_ensembl_junctions],
                   linestyle='dashed', lw=0.9, ax=ax1)

font = {'size':16}
ax1.text(0, 0.65, 'A', fontdict=font)
ax1.text(-0.6, -0.5, 'Cufflinks', fontdict=font)
ax1.text(0.1, -0.6, 'ESTs or mRNAs\n(not in Ensembl)', fontdict=font)

v2 = venn2([gimme_junctions, est_mrna_not_in_ensembl_junctions],
           set_labels=('', ''), ax=ax2)
v2.get_patch_by_id('10').set_color('orange')
v2.get_patch_by_id('01').set_color('blue')
v2.get_patch_by_id('11').set_color('red')
v2.get_label_by_id('10').set_fontsize(18)
v2.get_label_by_id('01').set_fontsize(18)
v2.get_label_by_id('11').set_fontsize(18)
c2 = venn2_circles([gimme_junctions, est_mrna_not_in_ensembl_junctions],
                   linestyle='dashed', lw=0.9, ax=ax2)

font = {'size':16}
ax2.text(0, 0.65, 'B', fontdict=font)
ax2.text(-0.6, -0.5, 'Gimme', fontdict=font)
ax2.text(0.1, -0.6, 'EST or mRNA\n(not in Ensembl)', fontdict=font)
plt.savefig('chick_est_mrna_venn.pdf',bbox_inches='tight', pad_inches=0.1)



In [79]:
fig = plt.figure(0, figsize=(8,16))
ax1 = fig.add_subplot(1,2,1)
ax2 = fig.add_subplot(1,2,2)

fig.tight_layout()
fontsize = 14
font = {'size':16}

c = venn3_circles([global_junctions, ensembl_junctions, est_junctions.union(mrna_junctions)],
                  linestyle='solid', lw=2.5, ax=ax1, color='white')
c[0].set_lw(2.0)
c[0].set_ls('dashed')
v = venn3([global_junctions, ensembl_junctions, est_junctions.union(mrna_junctions)],
          set_labels=('', '', ''), ax=ax1)
v.get_label_by_id('100').set_fontsize(fontsize)
v.get_label_by_id('110').set_fontsize(fontsize)
v.get_label_by_id('101').set_fontsize(fontsize)
v.get_label_by_id('111').set_fontsize(fontsize)
v.get_label_by_id('011').set_fontsize(fontsize)
v.get_label_by_id('001').set_fontsize(fontsize)
v.get_label_by_id('010').set_fontsize(fontsize)
ax1.text(-0.8, 0.5, 'Global', fontdict=font)
ax1.text(0.3, -0.7, 'ESTs or mRNAs', fontdict=font)
ax1.text(0.45, 0.40, 'Ensembl', fontdict=font)
ax1.text(0, -0.8, 'A', fontdict={'size':18})

c = venn3_circles([global_junctions.union(local_junctions), ensembl_junctions, est_junctions.union(mrna_junctions)],
                  linestyle='solid', lw=2.5, ax=ax2, color='white')
c[0].set_lw('2.0')
c[0].set_ls('dashed')
v = venn3([global_junctions.union(local_junctions), ensembl_junctions, est_junctions.union(mrna_junctions)],
          set_labels=('', '', ''), ax=ax2)
v.get_label_by_id('100').set_fontsize(fontsize)
v.get_label_by_id('110').set_fontsize(fontsize)
v.get_label_by_id('101').set_fontsize(fontsize)
v.get_label_by_id('111').set_fontsize(fontsize)
v.get_label_by_id('011').set_fontsize(fontsize)
v.get_label_by_id('001').set_fontsize(fontsize)
v.get_label_by_id('010').set_fontsize(fontsize)
ax2.text(-0.8, 0.5, 'Global+local', fontdict=font)
ax2.text(0.3, -0.7, 'ESTs or mRNAs', fontdict=font)
ax2.text(0.45, 0.40, 'Ensembl', fontdict=font)
ax2.text(0, -0.8, 'B', fontdict={'size':18})
plt.savefig('global_local_junctions_venn.pdf',bbox_inches='tight', pad_inches=0.1)



In [80]:
fig = plt.figure(0, figsize=(8,8))
ax1 = fig.add_subplot(2,2,1)
ax2 = fig.add_subplot(2,2,2)
ax3 = fig.add_subplot(2,2,3)
ax4 = fig.add_subplot(2,2,4)

fig.tight_layout()
fontsize = 14
font = {'size':16}

v = venn3([gimme_junctions, ensembl_junctions, est_junctions.union(mrna_junctions)],
          set_labels=('', '', ''), ax=ax1)
v.get_label_by_id('100').set_fontsize(fontsize)
v.get_label_by_id('110').set_fontsize(fontsize)
v.get_label_by_id('101').set_fontsize(fontsize)
v.get_label_by_id('111').set_fontsize(fontsize)
v.get_label_by_id('011').set_fontsize(fontsize)
v.get_label_by_id('001').set_fontsize(fontsize)
v.get_label_by_id('010').set_fontsize(fontsize)
ax1.text(-0.8, 0.5, 'Cufflinks', fontdict=font)
ax1.text(0.3, -0.7, 'ESTs or mRNAs', fontdict=font)
ax1.text(0.45, 0.40, 'Ensembl', fontdict=font)
ax1.text(0, -0.8, 'A', fontdict={'size':18})

c = venn3_circles([gimme_junctions, ensembl_junctions, est_junctions.union(mrna_junctions)],
                  linestyle='solid', lw=1.5, ax=ax1, color='white')
#c[0].set_lw('2.0')
#c[0].set_ls('dashed')

ax3.set_xlim(0,10)
ax3.set_ylim(0,1.0)
x = 0.1
ax3.hlines(1.0-x,2.0,1.5, color=v.get_patch_by_id('100').get_facecolor(), **legend_ls)
ax3.hlines(0.94-x,2.0,1.5, color=v.get_patch_by_id('110').get_facecolor(), **legend_ls)
ax3.hlines(0.88-x,2.0,1.5, color=v.get_patch_by_id('101').get_facecolor(), **legend_ls)
ax3.hlines(0.82-x,2.0,1.5, color=v.get_patch_by_id('111').get_facecolor(), **legend_ls)
ax3.hlines(0.76-x,2.0,1.5, color=v.get_patch_by_id('011').get_facecolor(), **legend_ls)
ax3.hlines(0.70-x,2.0,1.5, color=v.get_patch_by_id('001').get_facecolor(), **legend_ls)
ax3.hlines(0.64-x,2.0,1.5, color=v.get_patch_by_id('010').get_facecolor(), **legend_ls)

xx=0.02
ax3.text(2.3, 1.0-x-xx, 'Cufflinks', size=16)
ax3.text(2.3, 0.94-x-xx, 'Cufflinks+Ensembl', size=16)
ax3.text(2.3, 0.88-x-xx, 'Cufflinks+ESTs', size=16)
ax3.text(2.3, 0.82-x-xx, 'Cufflinks+Ensembl+ESTs', size=16)
ax3.text(2.3, 0.76-x-xx, 'Ensembl+ESTs', size=16)
ax3.text(2.3, 0.70-x-xx, 'ESTs', size=16)
ax3.text(2.3, 0.64-x-xx, 'Ensembl', size=16)
ax3.axis('off')

v = venn3([gimme_cuff_junctions, ensembl_junctions, est_junctions.union(mrna_junctions)],
          set_labels=('', '', ''), ax=ax2)
v.get_label_by_id('100').set_fontsize(fontsize)
v.get_label_by_id('110').set_fontsize(fontsize)
v.get_label_by_id('101').set_fontsize(fontsize)
v.get_label_by_id('111').set_fontsize(fontsize)
v.get_label_by_id('011').set_fontsize(fontsize)
v.get_label_by_id('001').set_fontsize(fontsize)
v.get_label_by_id('010').set_fontsize(fontsize)

c = venn3_circles([gimme_cuff_junctions, ensembl_junctions, est_junctions.union(mrna_junctions)],
                  linestyle='solid', lw=1.5, ax=ax2, color='white')
#c[0].set_lw(2.0)
#c[0].set_ls('dashed')

ax2.text(-0.8, 0.5, 'Cufflinks+Gimme', fontdict=font)
ax2.text(0.3, -0.7, 'ESTs or mRNAs', fontdict=font)
ax2.text(0.45, 0.4, 'Ensembl', fontdict=font)
ax2.text(0, -0.8, 'B', fontdict={'size':18})

ax4.set_xlim(0,10)
ax4.set_ylim(0,1.0)
x = 0.1
ax4.hlines(1.0-x,2.0,1.5, color=v.get_patch_by_id('100').get_facecolor(), **legend_ls)
ax4.hlines(0.94-x,2.0,1.5, color=v.get_patch_by_id('110').get_facecolor(), **legend_ls)
ax4.hlines(0.88-x,2.0,1.5, color=v.get_patch_by_id('101').get_facecolor(), **legend_ls)
ax4.hlines(0.82-x,2.0,1.5, color=v.get_patch_by_id('111').get_facecolor(), **legend_ls)
ax4.hlines(0.76-x,2.0,1.5, color=v.get_patch_by_id('011').get_facecolor(), **legend_ls)
ax4.hlines(0.70-x,2.0,1.5, color=v.get_patch_by_id('001').get_facecolor(), **legend_ls)
ax4.hlines(0.64-x,2.0,1.5, color=v.get_patch_by_id('010').get_facecolor(), **legend_ls)

xx=0.02
ax4.text(2.3, 1.0-x-xx, 'Cufflinks+Gimme', size=16)
ax4.text(2.3, 0.94-x-xx, 'Cufflinks+Gimme+Ensembl', size=16)
ax4.text(2.3, 0.88-x-xx, 'Cufflinks+Gimme+ESTs', size=16)
ax4.text(2.3, 0.82-x-xx, 'Cufflinks+Gimme+Ensembl+ESTs', size=16)
ax4.text(2.3, 0.76-x-xx, 'Ensembl+ESTs', size=16)
ax4.text(2.3, 0.70-x-xx, 'ESTs', size=16)
ax4.text(2.3, 0.64-x-xx, 'Ensembl', size=16)
ax4.axis('off')
plt.savefig('cuff_gimme_junctions_venn.pdf',bbox_inches='tight', pad_inches=0.1)
plt.savefig('cuff_gimme_junctions_venn.eps',bbox_inches='tight', pad_inches=0.1, format='eps')


CDF plots


In [81]:
from scipy.stats import cumfreq
from glob import glob
from collections import defaultdict
from mpl_toolkits.axes_grid1.inset_locator import zoomed_inset_axes
from mpl_toolkits.axes_grid1.inset_locator import mark_inset, inset_axes

In [82]:
def get_read_count(filename, junctions_db):
    '''Read read counts and return a dictionary

    containing the number of reads of each splice
    junction.

    '''

    with open(filename, 'r') as infile:
        for line in infile:
            junction, reads = line.strip().split('\t')
            chrom = junction.split(':')[0]
            jstart, jend = junction.split(':')[1].split('-')
            jstart = int(jstart) + 1  # convert the end of an exon to the start of an intron
            jend = int(jend) - 1  # convert the start of an exon to the end of an intron
            if jend - jstart > 50: # min intron size default value in compare_splice_junctions script
                junction = '%s:%d-%d' % (chrom, jstart, jend)
                junctions_db[junction] += int(reads)

In [83]:
def get_normfreqs(inputfiles, junctions_db):
    for countfile in glob(inputfiles):
        get_read_count(countfile, junctions_db)

    sorted_counts = sorted([v for v in junctions_db.itervalues()])
    cumfreqs, lowlim, binsize, extrapoints = cumfreq(sorted_counts,
                                                        max(sorted_counts))
    norm_cumfreqs = cumfreqs / max(cumfreqs)
    poor_junctions = [i for i in sorted_counts if i < 4]
    print 'Total junctions with no or fewer than four reads = %d (%.4f%%)' % \
                            (len(poor_junctions), len(poor_junctions)/float(len(sorted_counts))*100)
    return norm_cumfreqs

In [84]:
gimme_se_junctions_counts = defaultdict(int)
gimme_se_norm_cumfreqs = get_normfreqs('*gimme.sorted.bam.reads_counts', gimme_se_junctions_counts)
gimme_pe_junctions_counts = defaultdict(int)
gimme_pe_norm_cumfreqs = get_normfreqs('*gimme.pe.sorted.bam.reads_counts', gimme_pe_junctions_counts)


Total junctions with no or fewer than four reads = 6699 (6.3521%)
Total junctions with no or fewer than four reads = 7621 (7.2264%)

In [85]:
cuff_se_junctions_counts = defaultdict(int)
cuff_se_norm_cumfreqs = get_normfreqs('*cufflinks.sorted.bam.reads_counts', cuff_se_junctions_counts)
cuff_pe_junctions_counts = defaultdict(int)
cuff_pe_norm_cumfreqs = get_normfreqs('*cufflinks.pe.sorted.bam.reads_counts', cuff_pe_junctions_counts)


Total junctions with no or fewer than four reads = 2721 (2.9551%)
Total junctions with no or fewer than four reads = 3290 (3.5731%)

In [87]:
fig, ax = plt.subplots(figsize=(12,8))
ax.plot(cuff_se_norm_cumfreqs, linewidth=2.0, color='blue', label='Cufflinks SE')
ax.plot(cuff_pe_norm_cumfreqs, linewidth=2.0, color='red', label='Cufflinks PE')
ax.plot(gimme_se_norm_cumfreqs, linewidth=2.0, color='green', label='Gimme SE')
ax.plot(gimme_pe_norm_cumfreqs, linewidth=2.0, color='orange', label='Gimme PE')
ax.set_xscale('log')

axins = inset_axes(ax, 2.8, 2.8, loc=1)
axins.plot(cuff_se_norm_cumfreqs, linewidth=3.0, color='blue')
axins.plot(cuff_pe_norm_cumfreqs, linewidth=3.0, color='red')
axins.plot(gimme_se_norm_cumfreqs, linewidth=3.0, color='green')
axins.plot(gimme_pe_norm_cumfreqs, linewidth=3.0, color='orange')
axins.set_xlim(0,3)
axins.set_ylim(0,0.1)
axins.set_xticks([0, 1, 2, 3])
axins.set_yticks([0,0.1])
for label in (axins.get_xticklabels() + axins.get_yticklabels()):
    label.set_fontsize(13)

#axins.set_xscale('log')
#ax.hlines(0.09, 1, 3, linestyles='solid', lw=3.0, color='k')
#ax.vlines(3, 0, 0.085, linestyles='solid', lw=3.0, color='k')
#axins.hlines(cuff_pe_norm_cumfreqs[4], 0, 4, linestyles='solid', lw=1.0, color='k')
#axins.vlines(4, 0, cuff_pe_norm_cumfreqs[4], linestyles='solid', lw=1.0, color='k')

axins.set_axis_bgcolor('white')
ax.legend(loc='upper left', fontsize=16)
ax.set_xlabel("Number of spliced reads", size=18)
ax.set_ylabel("Number of splice junction", size=18)

# change the font size of x and y ticks
for label in (ax.get_xticklabels() + ax.get_yticklabels()):
    label.set_fontsize(18)

plt.savefig('cdf_splice.pdf', bbox_inches='tight')



In [88]:
# CDF of Cufflinks junctions at 3 or fewer reads
cuff_se_norm_cumfreqs[4], cuff_pe_norm_cumfreqs[3]


Out[88]:
(0.036882174701608435, 0.035730964301617124)

In [89]:
# CDF of Gimme junctions at 3 or fewer reads
gimme_se_norm_cumfreqs[4], gimme_pe_norm_cumfreqs[3]


Out[89]:
(0.072254198234418407, 0.072263680412664399)

In [90]:
combined_junctions_counts = {}
for j in gimme_pe_junctions_counts:
    combined_junctions_counts[j] = gimme_pe_junctions_counts[j] + gimme_se_junctions_counts[j]
    
print 'Junctions with no spliced reads = ', len([k for k,v
                                in combined_junctions_counts.iteritems() if v == 0 and 'random' in k])
print 'Junctions with fewer than 4 spliced reads = ', len([k for k,v
                                in combined_junctions_counts.iteritems() if v < 4])


Junctions with no spliced reads =  710
Junctions with fewer than 4 spliced reads =  4626

In [91]:
combined_cuff_junctions_counts = {}
for j in cuff_pe_junctions_counts:
    combined_cuff_junctions_counts[j] = cuff_pe_junctions_counts[j] + cuff_se_junctions_counts[j]
    
print 'Junctions with no spliced reads = ', len([k for k,v
                                in combined_cuff_junctions_counts.iteritems() if v == 0 and 'random' in k])
print 'Junctions with fewer than 4 spliced reads = ', len([k for k,v
                                in combined_cuff_junctions_counts.iteritems() if v < 4])


Junctions with no spliced reads =  26
Junctions with fewer than 4 spliced reads =  1010

In [92]:
len(combined_junctions_counts), len(gimme_pe_junctions_counts)


Out[92]:
(105461, 105461)

In [93]:
len(combined_cuff_junctions_counts)


Out[93]:
92077

In [94]:
print 'Junctions with no spliced reads = ', len([k for k,v
                                in gimme_pe_junctions_counts.iteritems() if v == 0])
print 'Junctions with no spliced reads = ', len([k for k,v
                                in gimme_se_junctions_counts.iteritems() if v == 0])


Junctions with no spliced reads =  3603
Junctions with no spliced reads =  3046

In [95]:
print 'Junctions with no spliced reads = ', len([k for k,v
                                in gimme_pe_junctions_counts.iteritems() if v == 0 and 'random' in k])
print 'Junctions with no spliced reads = ', len([k for k,v
                                in gimme_se_junctions_counts.iteritems() if v == 0 and 'random' in k])


Junctions with no spliced reads =  1041
Junctions with no spliced reads =  1006

In [96]:
print 'Junctions with no spliced reads = ', len([k for k,v
                                in gimme_pe_junctions_counts.iteritems() if v < 4])
print 'Junctions with no spliced reads = ', len([k for k,v
                                in gimme_se_junctions_counts.iteritems() if v < 4])


Junctions with no spliced reads =  7621
Junctions with no spliced reads =  6699

In [97]:
print 'Junctions with no spliced reads = ', len([k for k,v
                                in cuff_pe_junctions_counts.iteritems() if v == 0])
print 'Junctions with no spliced reads = ', len([k for k,v
                                in cuff_se_junctions_counts.iteritems() if v == 0])


Junctions with no spliced reads =  902
Junctions with no spliced reads =  852

In [98]:
print 'Junctions with no spliced reads = ', len([k for k,v
                                in cuff_pe_junctions_counts.iteritems() if v == 0 and 'random' in k])
print 'Junctions with no spliced reads = ', len([k for k,v
                                in cuff_se_junctions_counts.iteritems() if v == 0 and 'random' in k])


Junctions with no spliced reads =  89
Junctions with no spliced reads =  48

Reads mapping statistics


In [99]:
figure, axes = plt.subplots(nrows=2, sharex=True, figsize=(10,10))

bar_width = 0.35
axes[0].bar(range(4), [58.76, 55.61, 56.94, 57.37],
    width=bar_width, label='Ensembl', color='#660000')
axes[0].bar([bar_width + i for i in range(4)], [81.96, 75.62, 78.93, 82.01],
    width=bar_width, label='Gimme', color='#006600')
axes[0].set_ylabel('Percent mapped reads', size=16)
axes[0].legend(loc='lower right', fontsize=16)
axes[0].set_title('Single-end reads', size=16)
axes[0].set_yticklabels(range(0,100,10), fontsize=16)

axes[1].bar(range(4), [52.35, 51.65, 53.05, 52.40],
    width=bar_width, label='Ensembl', color='#660000')
axes[1].bar([bar_width + i for i in range(4)], [74.40, 75.13, 74.76, 74.86],
    width=bar_width, label='Gimme', color='#006600')
axes[1].set_ylabel('Percent mapped reads', size=16)
axes[1].set_title('Paired-end reads', size=16)
axes[1].set_yticklabels(range(0,100,10), fontsize=16)

axes[1].legend(loc='lower right', fontsize=16)
plt.xticks([0.25+i for i in range(4)],
       ['Line 6 (Ctrl)', 'Line 6 (Inf)', 'Line 7 (Ctrl)', 'Line 7 (Inf)'],
       size=14)
plt.savefig('mapped-reads.pdf', bbox_inches='tight')


Mouse homology


In [100]:
from Bio import SeqIO
from collections import defaultdict

In [101]:
genes_length = defaultdict(int)
seqs_length = {}

for rec in SeqIO.parse('all_assembly_models.bed.faa', 'fasta'):
    seqid = rec.id.strip(';')
    seqs_length[seqid] = len(rec)
    geneid = seqid.split('.')[0]
    if len(rec) > genes_length[geneid]:
        genes_length[geneid] = len(rec)
        
cufflinks_genes_length = defaultdict(int)
cufflinks_seqs_length = {}

for rec in SeqIO.parse('cufflinks.bed.faa', 'fasta'):
    seqid = rec.id.strip(';')
    cufflinks_seqs_length[seqid] = len(rec)
    geneid = seqid.split('.')[0]
    if len(rec) > cufflinks_genes_length[geneid]:
        cufflinks_genes_length[geneid] = len(rec)

asm_cuff_genes_length = defaultdict(int)
asm_cuff_seqs_length = {}
for rec in SeqIO.parse('all_assembly_cufflinks_models.bed.faa', 'fasta'):
    seqid = rec.id.strip(';')
    asm_cuff_seqs_length[seqid] = len(rec)
    geneid = seqid.split('.')[0]
    if len(rec) > asm_cuff_genes_length[geneid]:
        asm_cuff_genes_length[geneid] = len(rec)

In [102]:
genes_match = defaultdict(float)
seqs_match = []
genes_length_match = {}
isoforms_length = []
for line in open('all_assembly_models.bed.faa.blastp.out'):
    seqid, bits, homolog = line.split('\t')
    bits = float(bits)
    seqid = seqid.strip(';')
    geneid = seqid.split('.')[0]
    seqlen = seqs_length[seqid]
    seqs_match.append(bits/seqlen)
    ratio = bits/genes_length[geneid]
    isoforms_length.append(seqlen)
    if ratio > genes_match[geneid]:
        genes_match[geneid] = ratio
        genes_length_match[geneid] = genes_length[geneid]

asm_cuff_genes_match = defaultdict(float)
asm_cuff_seqs_match = []
asm_cuff_genes_length_match = {}
asm_cuff_isoforms_length = []
for line in open('all_assembly_cufflinks_models.bed.faa.blastp.out'):
    seqid, bits, homolog = line.split('\t')
    bits = float(bits)
    seqid = seqid.strip(';')
    geneid = seqid.split('.')[0]
    seqlen = asm_cuff_seqs_length[seqid]
    asm_cuff_seqs_match.append(bits/seqlen)
    ratio = bits/asm_cuff_genes_length[geneid]
    asm_cuff_isoforms_length.append(seqlen)
    if ratio > asm_cuff_genes_match[geneid]:
        asm_cuff_genes_match[geneid] = ratio
        asm_cuff_genes_length_match[geneid] = asm_cuff_genes_length[geneid]
        
        
cufflinks_genes_match = defaultdict(float)
cufflinks_seqs_match = []
cufflinks_genes_length_match = {}
cufflinks_isoforms_length = []
for line in open('cufflinks.bed.faa.blastp.out'):
    seqid, bits, homolog = line.split('\t')
    bits = float(bits)
    seqid = seqid.strip(';')
    geneid = seqid.split('.')[0]
    seqlen = cufflinks_seqs_length[seqid]
    cufflinks_seqs_match.append(bits/seqlen)
    cufflinks_isoforms_length.append(seqlen)
    ratio = bits/cufflinks_genes_length[geneid]
    if ratio > cufflinks_genes_match[geneid]:
        cufflinks_genes_match[geneid] = ratio
        cufflinks_genes_length_match[geneid] = cufflinks_genes_length[geneid]

In [106]:
fig = plt.figure(0, figsize=(12,8))
plt.subplot(211)
plt.boxplot([[v for v in cufflinks_genes_match.itervalues()],
          [v for v in genes_match.itervalues()],
          [v for v in asm_cuff_genes_match.itervalues()]
          , cufflinks_seqs_match, seqs_match, asm_cuff_seqs_match])
plt.ylabel('bits/length ratio', size=15)
plt.xticks([1,2,3,4], [''] * 4)
plt.yticks(size=14)
plt.title('A', size=18)
#yscale('log')

plt.subplot(212)
plt.boxplot([[v for v in cufflinks_genes_length.itervalues()],
         [v for v in genes_length.itervalues()],
        [v for v in asm_cuff_genes_length.itervalues()],
        cufflinks_isoforms_length, isoforms_length, asm_cuff_isoforms_length])
plt.xticks([1,2,3,4,5,6], ['Gene\nCufflinks', 'Gene\nGimme',
                       'Gene\nCufflinks+Gimme', 'Isoform\nCufflinks',
                       'Isoform\nGimme', 'Isoform\nGimme+Cufflinks'],
       size=16)

plt.title('B', size=18)
plt.yticks(size=14)
plt.ylabel('size (bp)', size=15)
plt.yscale('log')
plt.savefig('mouse-homolog-boxplot.pdf',bbox_inches='tight')



In [107]:
len(cufflinks_seqs_match), len(seqs_match)


Out[107]:
(22460, 22991)

In [108]:
len(cufflinks_genes_match), len(genes_match)


Out[108]:
(16731, 12945)

In [109]:
mlength = defaultdict(int)
mrat = {}

for rec in SeqIO.parse('mouse.sample.faa', 'fasta'):
    mlength[rec.id] = len(rec)

for line in open('mouse.sample.faa.blastp.out'):
    seqid, bits, homolog = line.split('\t')
    bits = float(bits)
    seqlen = mlength[seqid]
    mrat[seqid] = bits/seqlen

In [110]:
clength = defaultdict(int)
crat = {}

for rec in SeqIO.parse('chick.sample.faa', 'fasta'):
    clength[rec.id] = len(rec)

for line in open('chick-vs-mouse.sample.faa.blastp.out'):
    seqid, bits, homolog = line.split('\t')
    bits = float(bits)
    seqlen = clength[seqid]
    crat[seqid] = bits/seqlen

Canonical Splice Site Analysis


In [113]:
!cat all_assembly_models.bed_all_sp.txt | python ../protocol/canonical-splice-site.py \
../../chick.fa > all_assembly_models.noncanonical


Total splice sites = 105461
Total major canonical sites (GT,AG) = 94399 (89.51)
Total minor canonical sites (GC,AG) = 552 (0.52)
Total non canonical sites = 10510 (9.97)

In [114]:
!cat all_assembly_cufflinks_ensembl_models.bed_all_sp.txt | python ../protocol/canonical-splice-site.py \
../../chick.fa > all_assembly_cufflinks_ensembl_models.noncanonical


Total splice sites = 181714
Total major canonical sites (GT,AG) = 155963 (85.83)
Total minor canonical sites (GC,AG) = 1593 (0.88)
Total non canonical sites = 24158 (13.29)

In [115]:
!cat Gallus_gallus.WASHUC2.64.bed_all_sp.txt | python ../protocol/canonical-splice-site.py \
../../chick.fa > Gallus_gallus.WASHUC2.64.noncanonical


Total splice sites = 152794
Total major canonical sites (GT,AG) = 138099 (90.38)
Total minor canonical sites (GC,AG) = 1161 (0.76)
Total non canonical sites = 13534 (8.86)

In [116]:
!cat cufflinks-ref.bed_all_sp.txt | python ../protocol/canonical-splice-site.py \
../../chick.fa > cufflinks-ref.canonical


Total splice sites = 167509
Total major canonical sites (GT,AG) = 151119 (90.22)
Total minor canonical sites (GC,AG) = 1411 (0.84)
Total non canonical sites = 14979 (8.94)

In [117]:
!cat cufflinks.bed_all_sp.txt | python ../protocol/canonical-splice-site.py \
../../chick.fa > cufflinks.canonical


Total splice sites = 92078
Total major canonical sites (GT,AG) = 89468 (97.17)
Total minor canonical sites (GC,AG) = 530 (0.58)
Total non canonical sites = 2080 (2.26)

In [118]:
!cat est.psl.best_all_sp.txt | python ../protocol/canonical-splice-site.py \
../../chick.fa > est.psl.best.noncanonical


Total splice sites = 222770
Total major canonical sites (GT,AG) = 155273 (69.70)
Total minor canonical sites (GC,AG) = 1879 (0.84)
Total non canonical sites = 65618 (29.46)

In [119]:
!cat mrna.psl.best_all_sp.txt | python ../protocol/canonical-splice-site.py \
../../chick.fa > mrna.psl.noncanonical


Total splice sites = 74514
Total major canonical sites (GT,AG) = 67170 (90.14)
Total minor canonical sites (GC,AG) = 421 (0.56)
Total non canonical sites = 6923 (9.29)

In [120]:
!cat mouse_cufflinks.bed_all_sp.txt | python ../protocol/canonical-splice-site.py \
mm9.fa > mouse_cufflinks.noncanonical


Total splice sites = 87981
Total major canonical sites (GT,AG) = 85512 (97.19)
Total minor canonical sites (GC,AG) = 400 (0.45)
Total non canonical sites = 2069 (2.35)

In [121]:
muschroms = !cat ../protocol/mouse.list.txt
op = open('Mus_musculus.NCBIM37.64.bed_all_sp.flt.txt', 'w') # remove some chromosomes not in Cufflinks
for line in open('Mus_musculus.NCBIM37.64.bed_all_sp.txt'):
    chro = line.split(':')[0]
    if chro in muschroms:
        print >> op, line.strip('\n')
op.close()

!cat Mus_musculus.NCBIM37.64.bed_all_sp.flt.txt | python ../protocol/canonical-splice-site.py \
mm9.fa > mouse_ensembl_cufflinks.noncanonical


Total splice sites = 240510
Total major canonical sites (GT,AG) = 222724 (92.60)
Total minor canonical sites (GC,AG) = 1276 (0.53)
Total non canonical sites = 16510 (6.86)

In [122]:
gimme_noncano = !cat all_assembly_cufflinks_ensembl_models.noncanonical
gimme_noncano = set(gimme_noncano)

In [123]:
ensbl_noncano = !cat Gallus_gallus.WASHUC2.64.noncanonical
ensbl_noncano = set(ensbl_noncano)

In [124]:
est_noncano = !cat est.psl.best.noncanonical
mrna_noncano = !cat mrna.psl.noncanonical
est_noncano = set(est_noncano)
est_noncano.update(set(mrna_noncano))

In [125]:
cuff_noncano = !cat cufflinks-ref.canonical
cuff_noncano = set(cuff_noncano)

In [126]:
# non-canonical splice site from Gimme models supported by Ensembl
print len(gimme_noncano.intersection(ensbl_noncano))
print len(gimme_noncano.intersection(ensbl_noncano))/float(len(gimme_noncano))*100


13364
55.3191489362

In [127]:
# non-canonical splice site from Gimme models supported by ESTs or mRNAs.
print len(gimme_noncano.intersection(est_noncano)), 'sites',
print '%.2f%%' % (len(gimme_noncano.intersection(est_noncano))/float(len(gimme_noncano))*100)


4117 sites 17.04%

In [128]:
# non-canonical splice site from Cufflinks-ref models supported by ESTs or mRNAs.
print len(cuff_noncano.intersection(est_noncano)), 'sites',
print '%.2f%%' % (len(cuff_noncano.intersection(est_noncano))/float(len(cuff_noncano))*100)


1735 sites 11.59%

In [129]:
# non-canonical splice site from Cufflinks-ref models supported by ESTs or mRNAs.
print len(cuff_noncano.intersection(ensbl_noncano)), 'sites',
print '%.2f%%' % (len(cuff_noncano.intersection(ensbl_noncano))/float(len(cuff_noncano))*100)


13531 sites 90.39%

In [130]:
tmp = !cut -f 1 all_assembly_cufflinks_ensembl_models.bed_all_sp.txt
gimme_cano = set(tmp)

In [131]:
tmp = !cut -f 1 est.psl.best_all_sp.txt
est_cano = set(tmp)
tmp = !cut -f 1 mrna.psl.best_all_sp.txt
est_cano.update(set(tmp))

In [132]:
# canonical splice site from Gimme models supported by ESTs or mRNAs.
print len(gimme_cano.intersection(est_cano)), 'sites',
print '%.2f%%' % (len(gimme_cano.intersection(est_cano))/float(len(gimme_cano))*100)


115534 sites 63.58%

In [134]:
from collections import defaultdict

chromcounts = defaultdict(int)

for line in open('all_assembly_models.noncanonical'):
    chrom = line.split(':')[0]
    chromcounts[chrom] += 1

chromlist = sorted(chromcounts.keys())
plt.bar(range(len(chromlist)), [chromcounts[c] for c in chromlist])
plt.ylabel('Non-Canonical Splice Site', size=14)
plt.xticks(range(len(chromlist)), chromlist, rotation=90, size=14)
plt.yticks(size=13)
plt.savefig('noncanonical.pdf')



In [135]:
chromcounts['chrUn_random']


Out[135]:
1872

Reads Error Profile


In [141]:
import numpy as np
from numpy import arange

In [138]:
err6u = np.loadtxt('line6u.profile', delimiter=',', unpack=True)
err6i = np.loadtxt('line6i.profile', delimiter=',', unpack=True)
err7u = np.loadtxt('line7u.profile', delimiter=',', unpack=True)
err7i = np.loadtxt('line7i.profile', delimiter=',', unpack=True)

In [142]:
plt.subplot(221)
plt.plot(arange(1,76), err6u[:,0], 'blue', label='A')
plt.plot(arange(1,76), err6u[:,1], 'red', label='C')
plt.plot(arange(1,76), err6u[:,2], 'green', label='G')
plt.plot(arange(1,76), err6u[:,3], 'orange', label='T')
plt.legend(loc='upper left')
plt.ylabel('Mismatch', size=14)
plt.title('Line 6 Control', size=14)
plt.xticks(size=13)
plt.yticks(size=13)

plt.subplot(222)
plt.plot(arange(1,76), err6i[:,0], 'blue', label='A')
plt.plot(arange(1,76), err6i[:,1], 'red', label='C')
plt.plot(arange(1,76), err6i[:,2], 'green', label='G')
plt.plot(arange(1,76), err6i[:,3], 'orange', label='T')
plt.legend(loc='upper left')
plt.title('Line 6 Infected', size=14)
plt.xticks(size=13)
plt.yticks(size=13)

plt.subplot(223)
plt.plot(arange(1,76), err7u[:,0], 'blue', label='A')
plt.plot(arange(1,76), err7u[:,1], 'red', label='C')
plt.plot(arange(1,76), err7u[:,2], 'green', label='G')
plt.plot(arange(1,76), err7u[:,3], 'orange', label='T')
plt.legend(loc='upper left')
plt.ylabel('Mismatch', size=14)
plt.xlabel('Nucleotide Position', size=14)
plt.title('Line 7 Control', size=14)
plt.xticks(size=13)
plt.yticks(size=13)

plt.subplot(224)
plt.plot(arange(1,76), err7i[:,0], 'blue', label='A')
plt.plot(arange(1,76), err7i[:,1], 'red', label='C')
plt.plot(arange(1,76), err7i[:,2], 'green', label='G')
plt.plot(arange(1,76), err7i[:,3], 'orange', label='T')
plt.legend(loc='upper left')
plt.xlabel('Nucleotide Position', size=14)
plt.title('Line 7 Infected', size=14)
plt.xticks(size=13)
plt.yticks(size=13)
plt.savefig('error_profile.pdf')



In [ ]: