CVI and IconMAP dataset analysis

Reference: Analysis of biobanked serum from a bovine Mycobacterium avium subsp paratuberculosis infection model confirms the remarkable stability of circulating miRNA profiles and defines a bovine serum miRNA repertoire.

Ronan Shaughnessy, Damien Farrell, Karel Riepema, Douwe Bakker, Stephen V. Gordon


In [4]:
#douwe data analysis
import glob,os
import pandas as pd
import numpy as np
import mirnaseq.mirdeep2 as mdp
import mirnaseq.srnabench as srb
from mirnaseq import base, analysis, ensembl
pd.set_option('display.width', 300)
%matplotlib inline
import pylab as plt
import seaborn as sns
sns.set_style("ticks", {'axes.facecolor': '#F7F7F7','legend.frameon': True})
sns.set_context("notebook", font_scale=1.8)
plt.rcParams['savefig.dpi']=150

In [2]:
reload(mdp)
path='/opt/mirnaseq/data/douwe/'
os.chdir(path)
path1='analysis/douwe_mirdeep_rnafiltered'
path2='analysis/douwe_srnabench_rnafiltered'
df = mdp.getResults(path1)
#df = df.set_index('#miRNA')
k = df[df.novel==False]
n = df[df.novel==True]
n.to_csv('douwe_novel_all.csv')

In [3]:
#overlap between our miRs and other circulating studies?
mirbase = base.fasta2DataFrame('mature_btau_alt.fa').reset_index()
mb = mirbase.name.str.replace('bta-','')
k = mdp.filterExprResults(k,meanreads=150)
a = k['#miRNA'].str.replace('bta-','')
mdla = pd.read_csv('/opt/mirnaseq/analysis/miRandola_1.6.csv')
h1 = mdla[mdla.miRNA_type=='circulating'].name.str.replace('hsa-','')
f,ax=plt.subplots(1,1,figsize=(10,8))
#print set(h)-set(a)
v=base.venndiagram([mb,a,h1],['mirbase cow','douwe serum','human miRandola'],ax, set_colors=['r','y','g'], alpha=0.6)
ax.set_title('miRNA family overlap',fontsize=20)
plt.tight_layout()
f.savefig('known_overlap.png')



In [ ]:
#convservation of families for known results
known = pd.read_csv('known_mirdeep.csv')
#m = mdp.getResults(path)
#known = m[m.novel==False]
#known = mdp.filterExprResults(known,meanreads=50)
mbf = analysis.getmiFam()
fams = mbf.groupby('family').agg({'name':np.size}).reset_index()
fams=fams.rename(columns={'name':'size'})
names = pd.Series(known.precursor).str.lower()
x = mbf[mbf.name.isin(names)]
x=x.merge(fams,on='family')
print x.sort('size')[:10]

isoall = pd.read_csv('srnabench_isomirs_all.csv')
g = isoall.groupby('name', as_index=False)
l = g.agg({'length':np.size})
#print l

In [5]:
#novel
novel = pd.read_csv('novel_conserved.csv')
#known = pd.read_csv('known_mirdeep.csv').set_index('#miRNA')
mirbase = base.fasta2DataFrame('mature.fa')
#print isoall[:1]
for i,r in novel.iterrows():
    name=r['#miRNA']
    ms=r['consensus mature sequence']
    ref = r['mirbase'].replace('hsa','bta')
    if ref in mirbase.index:
        refsq =  mirbase.ix[ref]['sequence'].lower()
    else:
        refsq = ''
    print name,ms,ref,refsq


---------------------------------------------------------------------------
IOError                                   Traceback (most recent call last)
<ipython-input-5-fdeb19f56c09> in <module>()
      1 #novel
----> 2 novel = pd.read_csv('novel_conserved.csv')
      3 #known = pd.read_csv('known_mirdeep.csv').set_index('#miRNA')
      4 mirbase = base.fasta2DataFrame('mature.fa')
      5 #print isoall[:1]

/usr/local/lib/python2.7/dist-packages/pandas-0.16.0-py2.7-linux-x86_64.egg/pandas/io/parsers.pyc in parser_f(filepath_or_buffer, sep, dialect, compression, doublequote, escapechar, quotechar, quoting, skipinitialspace, lineterminator, header, index_col, names, prefix, skiprows, skipfooter, skip_footer, na_values, na_fvalues, true_values, false_values, delimiter, converters, dtype, usecols, engine, delim_whitespace, as_recarray, na_filter, compact_ints, use_unsigned, low_memory, buffer_lines, warn_bad_lines, error_bad_lines, keep_default_na, thousands, comment, decimal, parse_dates, keep_date_col, dayfirst, date_parser, memory_map, float_precision, nrows, iterator, chunksize, verbose, encoding, squeeze, mangle_dupe_cols, tupleize_cols, infer_datetime_format, skip_blank_lines)
    468                     skip_blank_lines=skip_blank_lines)
    469 
--> 470         return _read(filepath_or_buffer, kwds)
    471 
    472     parser_f.__name__ = name

/usr/local/lib/python2.7/dist-packages/pandas-0.16.0-py2.7-linux-x86_64.egg/pandas/io/parsers.pyc in _read(filepath_or_buffer, kwds)
    244 
    245     # Create the parser.
--> 246     parser = TextFileReader(filepath_or_buffer, **kwds)
    247 
    248     if (nrows is not None) and (chunksize is not None):

/usr/local/lib/python2.7/dist-packages/pandas-0.16.0-py2.7-linux-x86_64.egg/pandas/io/parsers.pyc in __init__(self, f, engine, **kwds)
    560             self.options['has_index_names'] = kwds['has_index_names']
    561 
--> 562         self._make_engine(self.engine)
    563 
    564     def _get_options_with_defaults(self, engine):

/usr/local/lib/python2.7/dist-packages/pandas-0.16.0-py2.7-linux-x86_64.egg/pandas/io/parsers.pyc in _make_engine(self, engine)
    697     def _make_engine(self, engine='c'):
    698         if engine == 'c':
--> 699             self._engine = CParserWrapper(self.f, **self.options)
    700         else:
    701             if engine == 'python':

/usr/local/lib/python2.7/dist-packages/pandas-0.16.0-py2.7-linux-x86_64.egg/pandas/io/parsers.pyc in __init__(self, src, **kwds)
   1064         kwds['allow_leading_cols'] = self.index_col is not False
   1065 
-> 1066         self._reader = _parser.TextReader(src, **kwds)
   1067 
   1068         # XXX

/usr/local/lib/python2.7/dist-packages/pandas-0.16.0-py2.7-linux-x86_64.egg/pandas/parser.so in pandas.parser.TextReader.__cinit__ (pandas/parser.c:3163)()

/usr/local/lib/python2.7/dist-packages/pandas-0.16.0-py2.7-linux-x86_64.egg/pandas/parser.so in pandas.parser.TextReader._setup_parser_source (pandas/parser.c:5779)()

IOError: File novel_conserved.csv does not exist

In [23]:
#novel conservation
reload(ensembl)
df = pd.read_csv('novel_mirdeep.csv')
ort = pd.read_csv('novel_orthologs.csv')
#print ort[ort['#miRNA']=='14_5598']
#ensembl.getmiRNAOrthologs(df)
x=ensembl.summarise(df)
print x[x.columns[2:11]]


             energy      tu  seedcons         biotype     gene  read_count  miRDeep2 score mirbase seed match     precursor coordinate
#miRNA                                                                                                                                
20_11630 -33.200001  intron         1  protein_coding   ZSWIM6         718             5.1      hsa-miR-1255a  20:17904153..17904232:-
28_16560 -30.100000     NaN         2             NaN      NaN         471             5.3    hsa-miR-4766-5p    28:9347067..9347134:-
12_3861  -20.400000    exon         2           snRNA       U2       15465             4.0       hsa-miR-1246  12:69698007..69698073:-
14_5598  -11.000000  intron         2  protein_coding  C8orf37        1917           852.3                  -  14:71406692..71406740:-
9_24296  -18.299999     NaN         2             NaN      NaN         304             5.1     hsa-miR-142-5p   9:86842891..86842959:+
13_4103         NaN  intron         0  protein_coding  SLC24A3         308           165.4                  -  13:39299094..39299164:+

In [6]:
def getfiltered(path, cut=150):
    df = mdp.getResults(path)
    print df.read_count.sum()
    df = df[df.novel==False]
    df = mdp.filterExprResults(df,meanreads=cut)
    return df

douwe = getfiltered(path1)
#iconmap
iconmap = getfiltered('/opt/mirnaseq/data/iconmap/analysis/iconmap_results_mirdeep')
#nick
nick = getfiltered('/opt/mirnaseq/data/nick/analysis/nick_results_mirdeep')
#vegh
vegh = getfiltered('/opt/mirnaseq/data/vegh_13/analysis/vegh_results_mirdeep')
#hooten paper results
hooten = getfiltered('/opt/mirnaseq/data/hooten_13/analysis/hooten13_results_mirdeep')


2135533.0
10684392.0
6201514.0
40229562.0
4527911.0

In [7]:
d = douwe['#miRNA']
ic = iconmap['#miRNA']
n = nick['#miRNA']
v = vegh['#miRNA']
plt.rcParams['font.size']=16
f,ax=plt.subplots(1,1,figsize=(8,8))
base.venndiagram([d,ic,v],['CVI','UCD','macrophage'],ax, set_colors=['g','r','b'], alpha=0.6)
ax.set_title('core bovine serum and macrophage miRNA',fontsize=20)
#b.plot(y='mean',kind='barh',figsize=(10,10))
f.savefig('oldvsfresh_overlap.png')



In [38]:
#per mirna expression comparison between fresh and old

def getexprdata(df, norm=True):
    #get variance data
    cols,normcols=mdp.getColumnNames(df)
    if norm==True:
        c=normcols
    else:
        c=cols
    d=df[['#miRNA','mean_norm']]
    d['std'] = df[c].std(1)
    return d

d = getexprdata(douwe)
ic = getexprdata(iconmap)

#c=['#miRNA']+normcols
#t = pd.melt(x[:20][c],id_vars='#miRNA',var_name='sample',value_name='mean')
#g=sns.factorplot('#miRNA','mean',data=t, kind="bar",aspect=1.8,palette='Spectral')

x = pd.merge(ic,d,on='#miRNA')
f,ax=plt.subplots(1,1,figsize=(8,8))
x.plot('mean_norm_x','mean_norm_y',kind='scatter',logx=True, grid=False,
       logy=True,s=30,ax=ax)
ax.errorbar(x=x.mean_norm_x,y=x.mean_norm_y, xerr=x.std_x,yerr=x.std_y,fmt='.',ecolor='gray', elinewidth=1)
ax.plot([0, 1], [0, 1], transform=ax.transAxes,color='black',alpha=0.7)
ax.set_xlabel('fresh')
ax.set_ylabel('old')
ax.set_title('mean reads comparison')
f.savefig('oldvsfresh_correlation.png')
print 'cc=%s' %np.corrcoef(np.log2(x.mean_norm_x),np.log2(x.mean_norm_y))[0][1]


cc=0.866716177407

In [30]:
#top comparisons
top=x.set_index('#miRNA')[:8]
vals = top[['mean_norm_x','mean_norm_y']]
errs = top[['std_x','std_y']]
errs=errs.rename(columns={'std_x':'mean_norm_x','std_y':'mean_norm_y'})
ax=vals.plot(yerr=errs,kind='bar',grid=False,figsize=(9,6))
plt.ylabel('mean normalised count')
plt.legend(['UCD','CVI'])
ax.set_xticklabels(x['#miRNA'],rotation=60)
plt.tight_layout()
plt.savefig('oldvsfresh_top.png')



In [12]:
a = douwe['precursor'].str.replace('bta-','')
b = hooten['precursor'].str.replace('hsa-','')
c = vegh['precursor'].str.replace('bta-','')

f,ax=plt.subplots(1,1,figsize=(8,8))
v=base.venndiagram([a,b,c],['bovine serum','human serum','bovine macrophage'], ax, set_colors=['y','b','g'], alpha=0.6)



In [35]:
#ncrna iconmap and douwe compare..
nc1 = pd.read_csv('ncrna_mapped.csv',index_col=0)
nc2 = pd.read_csv('iconmap_ncrna.csv',index_col=0)
nc1['name']='CVI'
nc2['name']='UCD'
x=pd.concat([nc1,nc2])
x=x.drop(['total'],1)
x['unmapped'] = 1-x.sum(1)
catnames = {'mirbase-hairpin':'miRBase','Rfam_btau':'rRNA','bosTau6-tRNAs':'tRNA','unmapped':'unmapped',
        'bostau-snRNA':'sRNA','noncodev4_btau':'noncode','bos_taurus_alt':'UMD3.1'}
x = x.rename(columns=catnames)
t = pd.melt(x,id_vars=['name'],var_name='cat',value_name='total')
g=sns.factorplot('cat','total','name', t, kind="bar",aspect=1.8,size=6,palette='Set1',legend=False)
plt.legend(loc=2)
g.set_xticklabels(rotation=40)
g.set(ylabel='fraction')
plt.tight_layout()
plt.savefig('oldvsfresh_ncrna.png')
#plot just mirnas
t = t[t.cat=='miRBase']
g=sns.factorplot('cat','total','name', t,kind="box",size=3.5,aspect=.8,palette='Set1',legend=False)
g.axes[0]
g.set(ylabel='fraction')
plt.savefig('oldvsfresh_mirna.png')