In [1]:
%matplotlib inline
from matplotlib import pyplot as plt
import pandas as pd
import tables
import screed
import numpy as np
import seaborn
from matplotlib_venn import venn3, venn3_circles

Making sense of the calculations, before starting!


In [2]:
v = venn3((1,1,1,1,1,1,1), set_labels=('Ref', 'MSU', 'RNA'))
v.get_label_by_id('100').set_text('Only in ref')
v.get_label_by_id('010').set_text('Only in MSU')
v.get_label_by_id('001').set_text('Only in RNA')

v.get_label_by_id('011').set_text('*')
v.get_label_by_id('101').set_text('A')
v.get_label_by_id('110').set_text('B')

v.get_label_by_id('111').set_text('C')


Blat ref/rna $= A + C$

Blat MSU/rna $= C + *$

BWA mem ref/MSU $= B + C$

$C = (A + C) \cap (C + *)$

$* = $ Blat MSU/rna $ - C$

$A = $ Blat ref/rna $-C$


In [6]:
! cd .. && make outputs/reference/galGal4.fa_screed
! cd .. && make outputs/msu/galGal4.latest.unmapped_reads
! cd .. && make workdirs/blat/msu_latest_minlen200.h5


make: `outputs/reference/galGal4.fa_screed' is up to date.
make: `outputs/msu/galGal4.latest.unmapped_reads' is up to date.
make: `workdirs/blat/msu_latest_minlen200.h5' is up to date.

In [7]:
ALIGNMENTS_HDF5 = '../workdirs/blat/msu_latest_minlen200.h5'

with pd.get_store(ALIGNMENTS_HDF5, mode='r') as store:
    print(store)


<class 'pandas.io.pytables.HDFStore'>
File path: ../workdirs/blat/msu_latest_minlen200.h5
/msu_latest                   frame        (shape->[881304,13]) 
/reference_galGal4            frame        (shape->[1469272,13])

In [12]:
# Load chick mRNA data from Likit
with open('../inputs/chicken_transcripts/global_merged.fa.clean.nr') as seqfile:
    rna = {seq['name'] for seq in screed.fasta.fasta_iter(seqfile)}

# Load mRNA data to a DataFrame
with open('../inputs/chicken_transcripts/global_merged.fa.clean.nr') as seqfile:
    chick_rna_df = pd.DataFrame(index=rna)
    chick_rna_df['sequence'] = pd.Series({seq['name']: str(seq['sequence'])
                                         for seq in screed.fasta.fasta_iter(seqfile)})

Load msu/rna reads and calculate intersection (C)


In [13]:
def calc_intersection(ref_rna_set, ref_rna_90_set):
    intersection = set()
    intersection_90 = set()
    
    msu_reads = pd.read_hdf(ALIGNMENTS_HDF5, '/msu_latest', mode='r')
    intersection |= ref_rna_set & set(msu_reads['Query id'])

    msu_reads = msu_reads[msu_reads['% identity'] > 90]
    intersection_90 |= ref_rna_set & set(msu_reads['Query id'])

    return intersection, intersection_90

$* = $ Blat msu/rna $- C = $ only_rna_msu


In [14]:
def calc_only_rna_msu(intersection, intersection_90):
    only_rna_msu = set()
    only_rna_msu_90 = set()

    msu_reads = pd.read_hdf(ALIGNMENTS_HDF5, '/msu_latest', mode='r')
    only_rna_msu |= set(msu_reads['Query id']) - intersection

    msu_reads = msu_reads[msu_reads['% identity'] > 90]
    only_rna_msu_90 |= set(msu_reads['Query id']) - intersection_90

    #msu_matches = msu_reads[~(msu_reads['Query id'].isin(ref_matches['Query id']))]
    return only_rna_msu, only_rna_msu_90

def calc_only_in_msu(unmapped_reads):
    only_in_msu = unmapped_reads.copy()
    only_in_msu_90 = unmapped_reads.copy()

    msu_reads = pd.read_hdf(ALIGNMENTS_HDF5, '/msu_latest', mode='r')
    only_in_msu -= set(msu_reads['Subject id'])

    msu_reads = msu_reads[msu_reads['% identity'] > 90]
    only_in_msu_90 -= set(msu_reads['Subject id'])

    msu_reads = None
    #msu_matches = msu_reads[~(msu_reads['Query id'].isin(ref_matches['Query id']))]
    return only_in_msu, only_in_msu_90

galGal4 data


In [15]:
# Load galGal4 data
chick_galGal4 = screed.ScreedDB('../outputs/galGal4/galGal4.fa_screed')
galGal4 = {chick_galGal4[seq]['name'] for seq in chick_galGal4}

# Load galGal4 data to a DataFrame
chick_galGal4_df = pd.DataFrame(index=galGal4)
chick_galGal4_df['sequence'] = pd.Series({seq: str(chick_galGal4[seq]['sequence']) for seq in chick_galGal4})

# Load all alignments for galGal4.x.mrnaseq
galGal4_rna = pd.read_hdf(ALIGNMENTS_HDF5, "/reference_galGal4", mode='r')
galGal4_rna_set = set(galGal4_rna['Query id'])

# Select only alignments with % identity > 90
galGal4_rna_90 = galGal4_rna[galGal4_rna['% identity'] > 90]
galGal4_rna_set_90 = set(galGal4_rna_90['Query id'])

# Load unmapped msu reads (to galGal4)
with open('../outputs/msu/galGal4.unmapped_reads', 'r') as f:
    unmapped_reads_galGal4 = {line.strip() for line in f if line.startswith('>')}

In [16]:
def summarize(rna, ref, ref_rna_set, ref_rna_set_90, unmapped_reads):
    intersection, intersection_90 = calc_intersection(ref_rna_set, ref_rna_set_90)
    only_rna_msu, only_rna_msu_90 = calc_only_rna_msu(intersection, intersection_90)
    only_in_msu, only_in_msu_90 = calc_only_in_msu(unmapped_reads)
    
    # $A = $ Blat ref/rna $-C =$ only_rna_ref
    only_rna_ref = ref_rna_set - intersection
    only_rna_ref_90 = ref_rna_set - intersection_90
    
    only_rna = rna - only_rna_ref - only_rna_msu - intersection
    only_ref = ref - only_rna_ref - intersection # - only_msu_ref
    
    return only_rna, only_rna_msu, only_rna_ref, intersection

def create_dataframes(chick_rna_df, only_rna_msu, only_rna_msu_90, only_rna_ref, only_rna_ref_90):
    # Create new DataFrames with only_rna_msu\* sequences
    only_rna_msu_90_df = chick_rna_df[chick_rna_df.index.isin(only_rna_msu_90)]
    only_rna_msu_df = chick_rna_df[chick_rna_df.index.isin(only_rna_msu)]
    
    # Create new DataFrames with only_rna_ref\* sequences
    only_rna_ref_90_df = chick_rna_df[chick_rna_df.index.isin(only_rna_ref_90)]
    only_rna_ref_df = chick_rna_df[chick_rna_df.index.isin(only_rna_ref)]
    
    # Checking sequences lengths
    lens_all = chick_rna_df['sequence'].apply(len)
    lens_only_rna_msu = only_rna_msu_df['sequence'].apply(len)
    lens_only_rna_msu_90 = only_rna_msu_90_df['sequence'].apply(len)
    lens_only_rna_ref = only_rna_ref_df['sequence'].apply(len)
    lens_only_rna_ref_90 = only_rna_ref_90_df['sequence'].apply(len)

    lens_df = pd.DataFrame({
      'mRNAseq': {
        'total': lens_all.size,
        'mean': lens_all.mean(),
        'max': lens_all.max(),
        'min': lens_all.min()
      },
      'in MSU and not in reference (min len >= 200)' : {
        'total': lens_only_rna_msu.size,
        'mean': lens_only_rna_msu.mean(),
        'max': lens_only_rna_msu.max(),
        'min': lens_only_rna_msu.min()
      },
      'in MSU and not in reference (min len >= 200, min align size > 90% contig or MSU read)': {
        'total': lens_only_rna_msu_90.size,
        'mean': lens_only_rna_msu_90.mean(),
        'max': lens_only_rna_msu_90.max(),
        'min': lens_only_rna_msu_90.min(),
      },
      'in reference and not in MSU (min len >= 200)': {
        'total': lens_only_rna_ref.size,
        'mean': lens_only_rna_ref.mean(),
        'max': lens_only_rna_ref.max(),
        'min': lens_only_rna_ref.min()
      },
      'in reference and not in MSU (min len >= 200, min align size > 90% contig or MSU read))': {
        'total': lens_only_rna_ref_90.size,
        'mean': lens_only_rna_ref_90.mean(),
        'max': lens_only_rna_ref_90.max(),
        'min': lens_only_rna_ref_90.min()
      }
    })

    return lens_df.T

def plot_histograms():
    plt.figure()
    lens_all.hist(bins=range(0, 5000, 500), label='all')
    lens_only_rna_msu.hist(bins=range(0, 5000, 500), label='RNA seqs in msu and not in reference')
    lens_only_rna_msu_90.hist(bins=range(0, 5000, 500), label='RNA seqs in msu and not in reference > 90%')
    plt.legend()

    plt.figure()
    lens_only_rna_msu.hist(bins=range(0, 5000, 500), label='RNA seqs in msu and not in reference')
    lens_only_rna_msu_90.hist(bins=range(0, 5000, 500), label='RNA seqs in msu and not in reference > 90%')
    plt.legend()

    plt.figure()
    lens_all.hist(bins=range(0, 5000, 500), label='all')
    lens_only_rna_ref.hist(bins=range(0, 5000, 500), label='RNA seqs in reference and not in msu')
    lens_only_rna_ref_90.hist(bins=range(0, 5000, 500), label='RNA seqs in reference and not in msu > 90%')
    plt.legend()
    
    plt.figure()
    lens_only_rna_msu.hist(bins=range(0, 5000, 500), label='RNA seqs in reference and not in msu')
    lens_only_rna_msu_90.hist(bins=range(0, 5000, 500), label='RNA seqs in reference and not in msu > 90%')
    plt.legend()

In [17]:
def venn_diagram(only_rna, only_rna_msu, only_rna_ref, intersection, refname="galGal4"):
    v = venn3((1,1,1,1,1,1,1), set_labels=(refname, 'MSU', 'RNA'))
    v.get_label_by_id('100').set_text('') # 'Only in ref', len(only_ref)
    v.get_label_by_id('010').set_text('') # 'Only in msu', len(only_in_msu)
    v.get_label_by_id('001').set_text('D\n%.1fK' % (float(only_rna) / 1000)) # 121.3K

    v.get_label_by_id('011').set_text('C\n%.1fK' % (float(only_rna_msu) / 1000)) # 14.4K
    v.get_label_by_id('101').set_text('A\n%.1fK' % (float(only_rna_ref) / 1000)) # 35.2K
    v.get_label_by_id('110').set_text('')

    v.get_label_by_id('111').set_text('B\n%.1fK' % (float(intersection) / 1000)) # 248.2K

galGal4 diagram


In [18]:
galGal4_summary = summarize(rna, galGal4, galGal4_rna_set, galGal4_rna_set_90, unmapped_reads_galGal4)
#create_dataframes()
#plot_histograms()

In [19]:
venn_diagram(len(galGal4_summary[0]), len(galGal4_summary[1]), len(galGal4_summary[2]), len(galGal4_summary[3]), refname="galGal4")



In [53]:
def save_sequences(chick_rna_df, only_rna, only_rna_msu, only_rna_ref, intersection, ref='galGal4'):
    !mkdir -p ../outputs/rna/msu/${ref}

    with open('../outputs/rna/msu/%s/only_rna.fa' % ref, 'w') as f:
        for name, seq in chick_rna_df[chick_rna_df.index.isin(only_rna)]['sequence'].iteritems():
            f.write(">%s\n%s\n" % (name, "\n".join(seq[60 * i: 60 * i + 60]
                                                   for i in range(0, (len(seq) / 60) + 1))))
    with open('../outputs/rna/msu/%s/only_rna_msu.fa' % ref, 'w') as f:
        for name, seq in chick_rna_df[chick_rna_df.index.isin(only_rna_msu)]['sequence'].iteritems():
            f.write(">%s\n%s\n" % (name, "\n".join(seq[60 * i: 60 * i + 60]
                                                   for i in range(0, (len(seq) / 60) + 1))))
    with open('../outputs/rna/msu/%s/only_rna_ref.fa' % ref, 'w') as f:
        for name, seq in chick_rna_df[chick_rna_df.index.isin(only_rna_ref)]['sequence'].iteritems():
            f.write(">%s\n%s\n" % (name, "\n".join(seq[60 * i: 60 * i + 60]
                                                   for i in range(0, (len(seq) / 60) + 1))))
    with open('../outputs/rna/msu/%s/intersection.fa' % ref, 'w') as f:
        for name, seq in chick_rna_df[chick_rna_df.index.isin(intersection)]['sequence'].iteritems():
            f.write(">%s\n%s\n" % (name, "\n".join(seq[60 * i: 60 * i + 60]
                                                   for i in range(0, (len(seq) / 60) + 1))))

In [54]:
save_sequences(chick_rna_df, *galGal4_summary, ref='galGal4')