In [36]:
%matplotlib inline

import os

from matplotlib import pyplot as plt
import pandas as pd
import screed
import numpy as np
import seaborn
from matplotlib_venn import venn3, venn3_circles

Making sense of the calculations, before starting!


In [2]:
v = venn3((1,1,1,1,1,1,1), set_labels=('Ref', 'PacBio', 'RNA'))
v.get_label_by_id('100').set_text('Only in ref')
v.get_label_by_id('010').set_text('Only in PacBio')
v.get_label_by_id('001').set_text('Only in RNA')

v.get_label_by_id('011').set_text('*')
v.get_label_by_id('101').set_text('A')
v.get_label_by_id('110').set_text('B')

v.get_label_by_id('111').set_text('C')


Blat ref/rna $= A + C$

Blat PacBio/rna $= C + *$

BLASR ref/PacBio $= B + C$

$C = (A + C) \cap (C + *)$

$* = $ Blat PacBio/rna $ - C$

$A = $ Blat ref/rna $-C$


In [166]:
! cd .. && make outputs/reference/galGal4.fa_screed
! cd .. && make outputs/pacbio/galGal4.unmapped_reads
! cd .. && make workdirs/blat/pacbio_minlen200.h5
! cd .. && make outputs/rna/pacbio/galGal4/summary.csv


make: `outputs/reference/galGal4.fa_screed' is up to date.
make: `outputs/pacbio/galGal4.unmapped_reads' is up to date.
make: `workdirs/blat/pacbio_minlen200.h5' is up to date.
make: `outputs/rna/pacbio/galGal4/summary.csv' is up to date.

galGal4 diagram


In [167]:
def venn_diagram(only_rna, only_rna_seq, only_rna_ref, intersection, ref="galGal4", seq="PacBio"):
    v = venn3((1,1,1,1,1,1,1), set_labels=(ref, seq, 'RNA'))
    v.get_label_by_id('100').set_text('') # 'Only in ref', len(only_ref)
    v.get_label_by_id('010').set_text('') # 'Only in seq', len(only_in_seq)
    v.get_label_by_id('001').set_text('D\n%.1fK' % (float(only_rna) / 1000)) # 121.3K

    v.get_label_by_id('011').set_text('C\n%.1fK' % (float(only_rna_seq) / 1000)) # 14.4K
    v.get_label_by_id('101').set_text('A\n%.1fK' % (float(only_rna_ref) / 1000)) # 35.2K
    v.get_label_by_id('110').set_text('')

    v.get_label_by_id('111').set_text('B\n%.1fK' % (float(intersection) / 1000)) # 248.2K

In [168]:
summary = pd.Series.from_csv("../outputs/rna/pacbio/galGal4/summary.csv")
venn_diagram(*[summary[s] for s in ('Only in RNA', 'RNA/Seq', 'RNA/Ref', 'Intersection')],
             ref="galGal4", seq='PacBio')