In [1]:
%matplotlib inline
from matplotlib import pyplot as plt
import pandas as pd
import tables
import screed
import numpy as np
import seaborn
from matplotlib_venn import venn3, venn3_circles
In [2]:
v = venn3((1,1,1,1,1,1,1), set_labels=('Ref', 'MSU', 'RNA'))
v.get_label_by_id('100').set_text('Only in ref')
v.get_label_by_id('010').set_text('Only in MSU')
v.get_label_by_id('001').set_text('Only in RNA')
v.get_label_by_id('011').set_text('*')
v.get_label_by_id('101').set_text('A')
v.get_label_by_id('110').set_text('B')
v.get_label_by_id('111').set_text('C')
Blat ref/rna $= A + C$
Blat MSU/rna $= C + *$
BWA mem ref/MSU $= B + C$
$C = (A + C) \cap (C + *)$
$* = $ Blat MSU/rna $ - C$
$A = $ Blat ref/rna $-C$
In [3]:
! cd .. && make outputs/reference/galGal4.fa_screed outputs/reference/galGal5.fa_screed
! cd .. && make outputs/msu/galGal5.unmapped_reads outputs/msu/galGal4.unmapped_reads
! cd .. && make workdirs/blat/msu_minlen200.h5
In [4]:
ALIGNMENTS_HDF5 = '../workdirs/blat/msu_minlen200.h5'
with pd.get_store(ALIGNMENTS_HDF5, mode='r') as store:
print(store)
In [5]:
# Load chick mRNA data from Likit
chick_rna = screed.ScreedDB('../inputs/chicken_transcripts/global_merged.fa.clean.nr_screed')
rna = {chick_rna[seq]['name'] for seq in chick_rna}
# Load mRNA data to a DataFrame
chick_rna_df = pd.DataFrame(index=rna)
chick_rna_df['sequence'] = pd.Series({seq: str(chick_rna[seq]['sequence']) for seq in chick_rna})
In [34]:
def calc_intersection(ref_rna_set, ref_rna_90_set):
intersection = set()
intersection_90 = set()
msu_reads = pd.read_hdf(ALIGNMENTS_HDF5, '/msu', mode='r')
intersection |= ref_rna_set & set(msu_reads['Query id'])
msu_reads = msu_reads[msu_reads['% identity'] > 90]
intersection_90 |= ref_rna_set & set(msu_reads['Query id'])
return intersection, intersection_90
In [35]:
def calc_only_rna_msu(intersection, intersection_90):
only_rna_msu = set()
only_rna_msu_90 = set()
msu_reads = pd.read_hdf(ALIGNMENTS_HDF5, '/msu', mode='r')
only_rna_msu |= set(msu_reads['Query id']) - intersection
msu_reads = msu_reads[msu_reads['% identity'] > 90]
only_rna_msu_90 |= set(msu_reads['Query id']) - intersection_90
#msu_matches = msu_reads[~(msu_reads['Query id'].isin(ref_matches['Query id']))]
return only_rna_msu, only_rna_msu_90
def calc_only_in_msu(unmapped_reads):
only_in_msu = unmapped_reads.copy()
only_in_msu_90 = unmapped_reads.copy()
msu_reads = pd.read_hdf(ALIGNMENTS_HDF5, '/msu', mode='r')
only_in_msu -= set(msu_reads['Subject id'])
msu_reads = msu_reads[msu_reads['% identity'] > 90]
only_in_msu_90 -= set(msu_reads['Subject id'])
msu_reads = None
#msu_matches = msu_reads[~(msu_reads['Query id'].isin(ref_matches['Query id']))]
return only_in_msu, only_in_msu_90
In [36]:
# Load galGal4 data
chick_galGal4 = screed.ScreedDB('../outputs/galGal4/galGal4.fa_screed')
galGal4 = {chick_galGal4[seq]['name'] for seq in chick_galGal4}
# Load galGal4 data to a DataFrame
chick_galGal4_df = pd.DataFrame(index=galGal4)
chick_galGal4_df['sequence'] = pd.Series({seq: str(chick_galGal4[seq]['sequence']) for seq in chick_galGal4})
# Load all alignments for galGal4.x.mrnaseq
galGal4_rna = pd.read_hdf(ALIGNMENTS_HDF5, "/reference_galGal4", mode='r')
galGal4_rna_set = set(galGal4_rna['Query id'])
# Select only alignments with % identity > 90
galGal4_rna_90 = galGal4_rna[galGal4_rna['% identity'] > 90]
galGal4_rna_set_90 = set(galGal4_rna_90['Query id'])
# Load unmapped msu reads (to galGal4)
with open('../outputs/msu/galGal4.unmapped_reads', 'r') as f:
unmapped_reads_galGal4 = {line.strip() for line in f if line.startswith('>')}
In [37]:
# Load galGal5 data
chick_galGal5 = screed.ScreedDB('../outputs/galGal5/galGal5.fa_screed')
galGal5 = {chick_galGal5[seq]['name'] for seq in chick_galGal5}
# Load galGal5 data to a DataFrame
chick_galGal5_df = pd.DataFrame(index=galGal5)
chick_galGal5_df['sequence'] = pd.Series({seq: str(chick_galGal5[seq]['sequence']) for seq in chick_galGal5})
# Load all alignments for galGal5.x.mrnaseq
galGal5_rna = pd.read_hdf(ALIGNMENTS_HDF5, "/reference_galGal5", mode='r')
galGal5_rna_set = set(galGal5_rna['Query id'])
# Select only alignments with % identity > 90
galGal5_rna_90 = galGal5_rna[galGal5_rna['% identity'] > 90]
galGal5_rna_set_90 = set(galGal5_rna_90['Query id'])
# Load unmapped msu reads (to galGal5)
with open('../outputs/msu/galGal5.unmapped_reads', 'r') as f:
unmapped_reads_galGal5 = {line.strip() for line in f}
In [38]:
def summarize(rna, ref, ref_rna_set, ref_rna_set_90, unmapped_reads):
intersection, intersection_90 = calc_intersection(ref_rna_set, ref_rna_set_90)
only_rna_msu, only_rna_msu_90 = calc_only_rna_msu(intersection, intersection_90)
only_in_msu, only_in_msu_90 = calc_only_in_msu(unmapped_reads)
# $A = $ Blat ref/rna $-C =$ only_rna_ref
only_rna_ref = ref_rna_set - intersection
only_rna_ref_90 = ref_rna_set - intersection_90
only_rna = rna - only_rna_ref - only_rna_msu - intersection
only_ref = ref - only_rna_ref - intersection # - only_msu_ref
return only_rna, only_rna_msu, only_rna_ref, intersection
def create_dataframes(chick_rna_df, only_rna_msu, only_rna_msu_90, only_rna_ref, only_rna_ref_90):
# Create new DataFrames with only_rna_msu\* sequences
only_rna_msu_90_df = chick_rna_df[chick_rna_df.index.isin(only_rna_msu_90)]
only_rna_msu_df = chick_rna_df[chick_rna_df.index.isin(only_rna_msu)]
# Create new DataFrames with only_rna_ref\* sequences
only_rna_ref_90_df = chick_rna_df[chick_rna_df.index.isin(only_rna_ref_90)]
only_rna_ref_df = chick_rna_df[chick_rna_df.index.isin(only_rna_ref)]
# Checking sequences lengths
lens_all = chick_rna_df['sequence'].apply(len)
lens_only_rna_msu = only_rna_msu_df['sequence'].apply(len)
lens_only_rna_msu_90 = only_rna_msu_90_df['sequence'].apply(len)
lens_only_rna_ref = only_rna_ref_df['sequence'].apply(len)
lens_only_rna_ref_90 = only_rna_ref_90_df['sequence'].apply(len)
lens_df = pd.DataFrame({
'mRNAseq': {
'total': lens_all.size,
'mean': lens_all.mean(),
'max': lens_all.max(),
'min': lens_all.min()
},
'in MSU and not in reference (min len >= 200)' : {
'total': lens_only_rna_msu.size,
'mean': lens_only_rna_msu.mean(),
'max': lens_only_rna_msu.max(),
'min': lens_only_rna_msu.min()
},
'in MSU and not in reference (min len >= 200, min align size > 90% contig or MSU read)': {
'total': lens_only_rna_msu_90.size,
'mean': lens_only_rna_msu_90.mean(),
'max': lens_only_rna_msu_90.max(),
'min': lens_only_rna_msu_90.min(),
},
'in reference and not in MSU (min len >= 200)': {
'total': lens_only_rna_ref.size,
'mean': lens_only_rna_ref.mean(),
'max': lens_only_rna_ref.max(),
'min': lens_only_rna_ref.min()
},
'in reference and not in MSU (min len >= 200, min align size > 90% contig or MSU read))': {
'total': lens_only_rna_ref_90.size,
'mean': lens_only_rna_ref_90.mean(),
'max': lens_only_rna_ref_90.max(),
'min': lens_only_rna_ref_90.min()
}
})
return lens_df.T
def plot_histograms():
plt.figure()
lens_all.hist(bins=range(0, 5000, 500), label='all')
lens_only_rna_msu.hist(bins=range(0, 5000, 500), label='RNA seqs in msu and not in reference')
lens_only_rna_msu_90.hist(bins=range(0, 5000, 500), label='RNA seqs in msu and not in reference > 90%')
plt.legend()
plt.figure()
lens_only_rna_msu.hist(bins=range(0, 5000, 500), label='RNA seqs in msu and not in reference')
lens_only_rna_msu_90.hist(bins=range(0, 5000, 500), label='RNA seqs in msu and not in reference > 90%')
plt.legend()
plt.figure()
lens_all.hist(bins=range(0, 5000, 500), label='all')
lens_only_rna_ref.hist(bins=range(0, 5000, 500), label='RNA seqs in reference and not in msu')
lens_only_rna_ref_90.hist(bins=range(0, 5000, 500), label='RNA seqs in reference and not in msu > 90%')
plt.legend()
plt.figure()
lens_only_rna_msu.hist(bins=range(0, 5000, 500), label='RNA seqs in reference and not in msu')
lens_only_rna_msu_90.hist(bins=range(0, 5000, 500), label='RNA seqs in reference and not in msu > 90%')
plt.legend()
In [39]:
def venn_diagram(only_rna, only_rna_msu, only_rna_ref, intersection, refname="galGal4"):
v = venn3((1,1,1,1,1,1,1), set_labels=(refname, 'MSU', 'RNA'))
v.get_label_by_id('100').set_text('') # 'Only in ref', len(only_ref)
v.get_label_by_id('010').set_text('') # 'Only in msu', len(only_in_msu)
v.get_label_by_id('001').set_text('D\n%.1fK' % (float(only_rna) / 1000)) # 121.3K
v.get_label_by_id('011').set_text('C\n%.1fK' % (float(only_rna_msu) / 1000)) # 14.4K
v.get_label_by_id('101').set_text('A\n%.1fK' % (float(only_rna_ref) / 1000)) # 35.2K
v.get_label_by_id('110').set_text('')
v.get_label_by_id('111').set_text('B\n%.1fK' % (float(intersection) / 1000)) # 248.2K
In [40]:
galGal4_summary = summarize(rna, galGal4, galGal4_rna_set, galGal4_rna_set_90, unmapped_reads_galGal4)
#create_dataframes()
#plot_histograms()
In [41]:
venn_diagram(len(galGal4_summary[0]), len(galGal4_summary[1]), len(galGal4_summary[2]), len(galGal4_summary[3]), refname="galGal4")
In [42]:
galGal5_summary = summarize(rna, galGal5, galGal5_rna_set, galGal5_rna_set_90, unmapped_reads_galGal5)
In [43]:
venn_diagram(len(galGal5_summary[0]), len(galGal5_summary[1]), len(galGal5_summary[2]), len(galGal5_summary[3]), refname="galGal5")
In [53]:
def save_sequences(chick_rna_df, only_rna, only_rna_msu, only_rna_ref, intersection, ref='galGal4'):
!mkdir -p ../outputs/rna/msu/${ref}
with open('../outputs/rna/msu/%s/only_rna.fa' % ref, 'w') as f:
for name, seq in chick_rna_df[chick_rna_df.index.isin(only_rna)]['sequence'].iteritems():
f.write(">%s\n%s\n" % (name, "\n".join(seq[60 * i: 60 * i + 60]
for i in range(0, (len(seq) / 60) + 1))))
with open('../outputs/rna/msu/%s/only_rna_msu.fa' % ref, 'w') as f:
for name, seq in chick_rna_df[chick_rna_df.index.isin(only_rna_msu)]['sequence'].iteritems():
f.write(">%s\n%s\n" % (name, "\n".join(seq[60 * i: 60 * i + 60]
for i in range(0, (len(seq) / 60) + 1))))
with open('../outputs/rna/msu/%s/only_rna_ref.fa' % ref, 'w') as f:
for name, seq in chick_rna_df[chick_rna_df.index.isin(only_rna_ref)]['sequence'].iteritems():
f.write(">%s\n%s\n" % (name, "\n".join(seq[60 * i: 60 * i + 60]
for i in range(0, (len(seq) / 60) + 1))))
with open('../outputs/rna/msu/%s/intersection.fa' % ref, 'w') as f:
for name, seq in chick_rna_df[chick_rna_df.index.isin(intersection)]['sequence'].iteritems():
f.write(">%s\n%s\n" % (name, "\n".join(seq[60 * i: 60 * i + 60]
for i in range(0, (len(seq) / 60) + 1))))
In [54]:
save_sequences(chick_rna_df, *galGal4_summary, ref='galGal4')
save_sequences(chick_rna_df, *galGal5_summary, ref='galGal4')