Reference: Analysis of biobanked serum from a bovine Mycobacterium avium subsp paratuberculosis infection model confirms the remarkable stability of circulating miRNA profiles and defines a bovine serum miRNA repertoire.
Ronan Shaughnessy, Damien Farrell, Karel Riepema, Douwe Bakker, Stephen V. Gordon
In [4]:
#douwe data analysis
import glob,os
import pandas as pd
import numpy as np
import mirnaseq.mirdeep2 as mdp
import mirnaseq.srnabench as srb
from mirnaseq import base, analysis, ensembl
pd.set_option('display.width', 300)
%matplotlib inline
import pylab as plt
import seaborn as sns
sns.set_style("ticks", {'axes.facecolor': '#F7F7F7','legend.frameon': True})
sns.set_context("notebook", font_scale=1.8)
plt.rcParams['savefig.dpi']=150
In [2]:
reload(mdp)
path='/opt/mirnaseq/data/douwe/'
os.chdir(path)
path1='analysis/douwe_mirdeep_rnafiltered'
path2='analysis/douwe_srnabench_rnafiltered'
df = mdp.getResults(path1)
#df = df.set_index('#miRNA')
k = df[df.novel==False]
n = df[df.novel==True]
n.to_csv('douwe_novel_all.csv')
In [3]:
#overlap between our miRs and other circulating studies?
mirbase = base.fasta2DataFrame('mature_btau_alt.fa').reset_index()
mb = mirbase.name.str.replace('bta-','')
k = mdp.filterExprResults(k,meanreads=150)
a = k['#miRNA'].str.replace('bta-','')
mdla = pd.read_csv('/opt/mirnaseq/analysis/miRandola_1.6.csv')
h1 = mdla[mdla.miRNA_type=='circulating'].name.str.replace('hsa-','')
f,ax=plt.subplots(1,1,figsize=(10,8))
#print set(h)-set(a)
v=base.venndiagram([mb,a,h1],['mirbase cow','douwe serum','human miRandola'],ax, set_colors=['r','y','g'], alpha=0.6)
ax.set_title('miRNA family overlap',fontsize=20)
plt.tight_layout()
f.savefig('known_overlap.png')
In [ ]:
#convservation of families for known results
known = pd.read_csv('known_mirdeep.csv')
#m = mdp.getResults(path)
#known = m[m.novel==False]
#known = mdp.filterExprResults(known,meanreads=50)
mbf = analysis.getmiFam()
fams = mbf.groupby('family').agg({'name':np.size}).reset_index()
fams=fams.rename(columns={'name':'size'})
names = pd.Series(known.precursor).str.lower()
x = mbf[mbf.name.isin(names)]
x=x.merge(fams,on='family')
print x.sort('size')[:10]
isoall = pd.read_csv('srnabench_isomirs_all.csv')
g = isoall.groupby('name', as_index=False)
l = g.agg({'length':np.size})
#print l
In [5]:
#novel
novel = pd.read_csv('novel_conserved.csv')
#known = pd.read_csv('known_mirdeep.csv').set_index('#miRNA')
mirbase = base.fasta2DataFrame('mature.fa')
#print isoall[:1]
for i,r in novel.iterrows():
name=r['#miRNA']
ms=r['consensus mature sequence']
ref = r['mirbase'].replace('hsa','bta')
if ref in mirbase.index:
refsq = mirbase.ix[ref]['sequence'].lower()
else:
refsq = ''
print name,ms,ref,refsq
In [23]:
#novel conservation
reload(ensembl)
df = pd.read_csv('novel_mirdeep.csv')
ort = pd.read_csv('novel_orthologs.csv')
#print ort[ort['#miRNA']=='14_5598']
#ensembl.getmiRNAOrthologs(df)
x=ensembl.summarise(df)
print x[x.columns[2:11]]
In [6]:
def getfiltered(path, cut=150):
df = mdp.getResults(path)
print df.read_count.sum()
df = df[df.novel==False]
df = mdp.filterExprResults(df,meanreads=cut)
return df
douwe = getfiltered(path1)
#iconmap
iconmap = getfiltered('/opt/mirnaseq/data/iconmap/analysis/iconmap_results_mirdeep')
#nick
nick = getfiltered('/opt/mirnaseq/data/nick/analysis/nick_results_mirdeep')
#vegh
vegh = getfiltered('/opt/mirnaseq/data/vegh_13/analysis/vegh_results_mirdeep')
#hooten paper results
hooten = getfiltered('/opt/mirnaseq/data/hooten_13/analysis/hooten13_results_mirdeep')
In [7]:
d = douwe['#miRNA']
ic = iconmap['#miRNA']
n = nick['#miRNA']
v = vegh['#miRNA']
plt.rcParams['font.size']=16
f,ax=plt.subplots(1,1,figsize=(8,8))
base.venndiagram([d,ic,v],['CVI','UCD','macrophage'],ax, set_colors=['g','r','b'], alpha=0.6)
ax.set_title('core bovine serum and macrophage miRNA',fontsize=20)
#b.plot(y='mean',kind='barh',figsize=(10,10))
f.savefig('oldvsfresh_overlap.png')
In [38]:
#per mirna expression comparison between fresh and old
def getexprdata(df, norm=True):
#get variance data
cols,normcols=mdp.getColumnNames(df)
if norm==True:
c=normcols
else:
c=cols
d=df[['#miRNA','mean_norm']]
d['std'] = df[c].std(1)
return d
d = getexprdata(douwe)
ic = getexprdata(iconmap)
#c=['#miRNA']+normcols
#t = pd.melt(x[:20][c],id_vars='#miRNA',var_name='sample',value_name='mean')
#g=sns.factorplot('#miRNA','mean',data=t, kind="bar",aspect=1.8,palette='Spectral')
x = pd.merge(ic,d,on='#miRNA')
f,ax=plt.subplots(1,1,figsize=(8,8))
x.plot('mean_norm_x','mean_norm_y',kind='scatter',logx=True, grid=False,
logy=True,s=30,ax=ax)
ax.errorbar(x=x.mean_norm_x,y=x.mean_norm_y, xerr=x.std_x,yerr=x.std_y,fmt='.',ecolor='gray', elinewidth=1)
ax.plot([0, 1], [0, 1], transform=ax.transAxes,color='black',alpha=0.7)
ax.set_xlabel('fresh')
ax.set_ylabel('old')
ax.set_title('mean reads comparison')
f.savefig('oldvsfresh_correlation.png')
print 'cc=%s' %np.corrcoef(np.log2(x.mean_norm_x),np.log2(x.mean_norm_y))[0][1]
In [30]:
#top comparisons
top=x.set_index('#miRNA')[:8]
vals = top[['mean_norm_x','mean_norm_y']]
errs = top[['std_x','std_y']]
errs=errs.rename(columns={'std_x':'mean_norm_x','std_y':'mean_norm_y'})
ax=vals.plot(yerr=errs,kind='bar',grid=False,figsize=(9,6))
plt.ylabel('mean normalised count')
plt.legend(['UCD','CVI'])
ax.set_xticklabels(x['#miRNA'],rotation=60)
plt.tight_layout()
plt.savefig('oldvsfresh_top.png')
In [12]:
a = douwe['precursor'].str.replace('bta-','')
b = hooten['precursor'].str.replace('hsa-','')
c = vegh['precursor'].str.replace('bta-','')
f,ax=plt.subplots(1,1,figsize=(8,8))
v=base.venndiagram([a,b,c],['bovine serum','human serum','bovine macrophage'], ax, set_colors=['y','b','g'], alpha=0.6)
In [35]:
#ncrna iconmap and douwe compare..
nc1 = pd.read_csv('ncrna_mapped.csv',index_col=0)
nc2 = pd.read_csv('iconmap_ncrna.csv',index_col=0)
nc1['name']='CVI'
nc2['name']='UCD'
x=pd.concat([nc1,nc2])
x=x.drop(['total'],1)
x['unmapped'] = 1-x.sum(1)
catnames = {'mirbase-hairpin':'miRBase','Rfam_btau':'rRNA','bosTau6-tRNAs':'tRNA','unmapped':'unmapped',
'bostau-snRNA':'sRNA','noncodev4_btau':'noncode','bos_taurus_alt':'UMD3.1'}
x = x.rename(columns=catnames)
t = pd.melt(x,id_vars=['name'],var_name='cat',value_name='total')
g=sns.factorplot('cat','total','name', t, kind="bar",aspect=1.8,size=6,palette='Set1',legend=False)
plt.legend(loc=2)
g.set_xticklabels(rotation=40)
g.set(ylabel='fraction')
plt.tight_layout()
plt.savefig('oldvsfresh_ncrna.png')
#plot just mirnas
t = t[t.cat=='miRBase']
g=sns.factorplot('cat','total','name', t,kind="box",size=3.5,aspect=.8,palette='Set1',legend=False)
g.axes[0]
g.set(ylabel='fraction')
plt.savefig('oldvsfresh_mirna.png')
In [16]:
def DEbytime(df, condmap, times, cut=1.5):
elisa=['N','P']
res=[]
print times
for e in elisa:
c = condmap[condmap.elisa==e]
c = c[c.timegroup.isin(times)]
x = pd.Categorical(c['timegroup'],categories=times)
c['factor'] = x.labels
c['label'] = c.apply(lambda x: x.id+'_'+str(x.factor),1)
c = c.sort('factor')
scols = c.id
data = df[scols]
data.columns = c.label
data.to_csv('decounts.csv')
de = base.runEdgeR('decounts.csv', cut)
de['elisa']=e
res.append(de)
print de
detime = pd.concat(res)
detime.to_csv('de_time.csv')
return detime
In [17]:
#diffr expression
times = [['START','EARLY'],['START','LATE']]
df = mdp.getResults(path1)
k=df[df.novel==False]
k = mdp.filterExprResults(k,meanreads=150)
cm1 = mdp.getLabelMap(path1, 'mapdata_labels.csv')
k=k.set_index('#miRNA')
for t in times:
deknown = DEbytime(k, cm1, t)
#isomir diffr expression
cm2 = srb.getLabelMap(path2, 'mapdata_labels.csv')
iso = pd.read_csv('analysis/srnabench_isomirs_all.csv')
df = iso[(iso.mean_norm>50) & (iso.freq>0.9)]
#create unique names for isomirs
df['tag'] = df.name+'_'+df.variant+'_'+df.pos.astype(str)+'_'+df.index.astype(str)
df = df.set_index('tag')
res=[]
for t in times:
d = DEbytime(df, cm2, t, cut=2)
res.append(d)
deiso=pd.concat(res)
kiso=df
In [18]:
def meltdata(df, labels, cols):
t=df.T
t.index = cols
t = t.merge(labels,left_index=True,right_on='id')
tm = pd.melt(t,id_vars=list(labels.columns),
var_name='miRNA',value_name='read count')
return tm
In [34]:
names = deknown.name
c,normcols=mdp.getColumnNames(k)
#names = ['bta-miR-22-3p']
df = k.ix[names][normcols]
tm = meltdata(df,cm1,c)
g = sns.factorplot('month','read count','elisa', tm, col='miRNA', kind="point",
col_wrap=3,size=3,aspect=1,legend_out=True,sharey=False)
plt.savefig('de_known.png')
names = deiso.name.unique()[:12]
#names = ['bta-miR-22-3p_lv3p_-1_53','bta-miR-22-3p_lv3p_-2_18']
c2,normcols2=srb.getColumnNames(kiso)
df = kiso.ix[names][normcols2]
tm = meltdata(df,cm2,c2)
sns.set_context("notebook", font_scale=1.2)
g = sns.factorplot('month','read count','elisa', tm, col='miRNA', kind="point",
col_wrap=3,size=3,aspect=1.2,legend_out=True,sharey=False)
plt.savefig('de_isomirs.png')
In [22]:
inames = deiso.name.apply( lambda x : x.split('_')[0])
#print deiso.sort('name')
#domiso = pd.read_csv('analysis/srnabench_isomirs_dominant.csv')
#print domiso[domiso.name.isin(inames)]
def plotIsoDist(iso,name):
#add this to srnabench
x=iso[iso.name==name]
x=x[x.perc>0.01]
print x[['name','isoClass','mean_norm','perc']]
cols,normcols=srb.getColumnNames(iso)
x['std'] = x[normcols].std(1)
x = x.sort('mean_norm')
x.plot(x='isoClass',y='perc',kind='barh',grid=False,legend=False,figsize=(7,4))
plt.xlabel('fraction of total')
plt.title(name)
plt.tight_layout()
plt.savefig('%s_isomirs.png' %name)
plotIsoDist(iso,'bta-miR-22-3p')
#plotIsoDist(iso,'bta-miR-23a')
In [167]:
def meancc(df):
cc = df.corr()
print cc
cc.values[np.tril_indices_from(cc)] = np.nan
print cc.unstack().mean()
In [188]:
#variation between samples
df = mdp.getResults(path1)
df = mdp.filterExprResults(df,meanreads=150,score=0)
df=df[df>0]
#cols,normcols=mdp.getColumnNames(df)
cm = mdp.getLabelMap(path1, 'mapdata_labels.csv')
c1 = cm[(cm.month==0) & (cm.elisa=='N')].id
#c1 = cm[(cm.animal==1366)].id
nc1 = [i+'(norm)' for i in c1]
print nc1
a=nc1[0]
b=nc1[2]
ax=pd.scatter_matrix(np.log2(df[nc1]),figsize=(10,10))
meancc(df[nc1])
#df.plot(x=a,y=b,kind='scatter',logx=True,logy=True,figsize=(8,8),
# s=20,alpha=0.9,grid=False)
In [187]:
#isomir variation between samples
cm = srb.getLabelMap(path2, 'mapdata_labels.csv')
#get bio 'replicates'
c2 = cm[(cm.month==0) & (cm.elisa=='N')].id
c2 = cm[(cm.animal==1361)].id
nc2 = [i+'_norm' for i in c2]
#iso=iso[iso.mean_norm>150]
x = iso[nc2]
x = x[x>0]
ax=pd.scatter_matrix(np.log2(x),figsize=(10,10))
meancc(x)
In [39]:
#normalisation stuff
#stability of each miRNA over samples?
d = getexprdata(douwe, norm=False)
#d = getexprdata(iconmap, norm=False)
cols,normcols=mdp.getColumnNames(douwe)
#need to re-add mean,std and cv to mirdeep getResults so this is simplified
d['mean']=douwe[cols].mean(1)
d['cv'] = d['std']/d['mean']
d = d.sort('cv')[:10]
print d
names=d['#miRNA']
d=douwe.set_index('#miRNA')
d=d[d.index.isin(names)]
x=d[cols].T
ax=x.plot(kind='box',grid=False,figsize=(12,6),logy=True)
ax.set_xticklabels(d.index,rotation=60)
plt.ylabel('mean normalised count')
ax=x.hist(bins=20,grid=False,figsize=(12,6))
In [ ]:
#isomiR dependency on elisa?