In [10]:
%matplotlib inline
from matplotlib import pyplot as plt
import pandas as pd
import screed
import numpy as np
import seaborn
from matplotlib_venn import venn3, venn3_circles

Making sense of the calculations, before starting!


In [2]:
v = venn3((1,1,1,1,1,1,1), set_labels=('Ref', 'Moleculo', 'RNA'))
v.get_label_by_id('100').set_text('Only in ref')
v.get_label_by_id('010').set_text('Only in mol')
v.get_label_by_id('001').set_text('Only in RNA')

v.get_label_by_id('011').set_text('*')
v.get_label_by_id('101').set_text('A')
v.get_label_by_id('110').set_text('B')

v.get_label_by_id('111').set_text('C')


Blat ref/rna $= A + C$

Blat mol/rna $= C + *$

BWA mem ref/moleculo $= B + C$

$C = (A + C) \cap (C + *)$

$* = $ Blat mol/rna $ - C$

$A = $ Blat ref/rna $-C$


In [17]:
! cd .. && make outputs/reference/galGal4.fa_screed outputs/reference/galGal5.fa_screed 
! cd .. && make -j10 outputs/moleculo/galGal5.unmapped_reads outputs/moleculo/galGal4.unmapped_reads
! cd .. && make -j10 workdirs/blat/minlen200.h5
! cd .. && make outputs/rna/moleculo/galGal4/summary.csv
! cd .. && make outputs/rna/moleculo/galGal5/summary.csv


make: `outputs/reference/galGal4.fa_screed' is up to date.
make: `outputs/reference/galGal5.fa_screed' is up to date.
make: `outputs/moleculo/galGal5.unmapped_reads' is up to date.
make: `outputs/moleculo/galGal4.unmapped_reads' is up to date.
make: `workdirs/blat/minlen200.h5' is up to date.
make: `outputs/rna/moleculo/galGal4/summary.csv' is up to date.
make: `outputs/rna/moleculo/galGal5/summary.csv' is up to date.

In [18]:
def venn_diagram(only_rna, only_rna_seq, only_rna_ref, intersection, ref="galGal4", seq="Moleculo"):
    v = venn3((1,1,1,1,1,1,1), set_labels=(ref, seq, 'RNA'))
    v.get_label_by_id('100').set_text('') # 'Only in ref'
    v.get_label_by_id('010').set_text('') # 'Only in seq'
    v.get_label_by_id('001').set_text('D\n%.1fK' % (float(only_rna) / 1000))

    v.get_label_by_id('011').set_text('C\n%.1fK' % (float(only_rna_seq) / 1000))
    v.get_label_by_id('101').set_text('A\n%.1fK' % (float(only_rna_ref) / 1000))
    v.get_label_by_id('110').set_text('') # 'Only in ref and seq'

    v.get_label_by_id('111').set_text('B\n%.1fK' % (float(intersection) / 1000))

galGal4 diagram


In [19]:
summary = pd.Series.from_csv("../outputs/rna/moleculo/galGal4/summary.csv")
venn_diagram(*[summary[s] for s in ('Only in RNA', 'RNA/Seq', 'RNA/Ref', 'Intersection')],
             ref="galGal4", seq='Moleculo')


galGal5 diagram


In [20]:
summary = pd.Series.from_csv("../outputs/rna/moleculo/galGal5/summary.csv")
venn_diagram(*[summary[s] for s in ('Only in RNA', 'RNA/Seq', 'RNA/Ref', 'Intersection')],
             ref="galGal5", seq='Moleculo')