isomiR analysis for miRNA datasets

references:

D. Farrell, R. G. Shaughnessy, L. Britton, D. E. MacHugh, B. Markey, and S. V. Gordon, “The Identification of Circulating MiRNA in Bovine Serum and Their Potential as Novel Biomarkers of Early Mycobacterium avium subsp paratuberculosis Infection,” PLoS One, vol. 10, no. 7, p. e0134310, 2015.

P. Vegh, A. B. K. Foroushani, D. a Magee, M. S. McCabe, J. a Browne, N. C. Nalpas, K. M. Conlon, S. V Gordon, D. G. Bradley, D. E. MacHugh, and D. J. Lynn, “Profiling microRNA expression in bovine alveolar macrophages using RNA-seq.,” Vet. Immunol. Immunopathol., vol. 155, no. 4, pp. 238–44, Oct. 2013.

N. Hooten, M. Fitzpatrick, and W. W. 3rd, “Age-related changes in microRNA levels in serum,” Aging (Albany …, vol. 5, no. 10, pp. 725–740, 2013.

M. K. McGahon, J. M. Yarham, A. Daly, J. Guduric-Fuchs, L. J. Ferguson, D. a. Simpson, and A. Collins, “Distinctive Profile of IsomiR Expression and Novel MicroRNAs in Rat Heart Left Ventricle,” PLoS One, vol. 8, no. 6, 2013.

A. Wojcicka, M. Swierniak, O. Kornasiewicz, W. Gierlikowski, M. Maciag, M. Kolanowska, M. Kotlarek, B. Gornicka, L. Koperski, G. Niewinski, M. Krawczyk, and K. Jazdzewski, “Next generation sequencing reveals microRNA isoforms in liver cirrhosis and hepatocellular carcinoma.,” Int. J. Biochem. Cell Biol., vol. 53, pp. 208–17, 2014.


In [1]:
#general isomir analysis
import glob,os
import pandas as pd
import numpy as np
import mirnaseq.mirdeep2 as mdp
import mirnaseq.srnabench as srb
from mirnaseq import base, analysis, ensembl
pd.set_option('display.width', 300)
%matplotlib inline
import matplotlib as mpl
import pylab as plt
import seaborn as sns
base.seabornsetup()
from scipy.stats import norm,rayleigh
sns.set_style("ticks", {'axes.facecolor': '#F7F7F7','legend.frameon': True})
sns.set_context("notebook", font_scale=1.5)
plt.rcParams['savefig.dpi']=150
plt.rcParams['legend.fontsize']=25

In [2]:
dwiso = pd.read_csv('douwe_isomirs_all.csv')
iciso = pd.read_csv('iconmap_isomirs_all.csv')
nkiso = pd.read_csv('nick_isomirs_all.csv')
viso = pd.read_csv('vegh_isomirs_all.csv')
htiso = pd.read_csv('hooten_isomirs_all.csv')
mciso = pd.read_csv('mcgahon_isomirs_all.csv')
wjiso = pd.read_csv('wojcicka_isomirs_all.csv')
raiso = pd.read_csv('rau_isomirs_all.csv')

In [3]:
#isomir distr comparisons
dw = pd.read_csv('douwe_isomirs_dominant.csv')
ic = pd.read_csv('iconmap_isomirs_dominant.csv')
v = pd.read_csv('vegh_isomirs_dominant.csv')
ht = pd.read_csv('hooten_isomirs_dominant.csv')
mc = pd.read_csv('mcgahon_isomirs_dominant.csv')
wj = pd.read_csv('wojcicka_isomirs_dominant.csv')
ra =  pd.read_csv('rau_isomirs_dominant.csv')
nk = pd.read_csv('nick_isomirs_dominant.csv')
datasets = {'douwe':dw,'iconmap':ic,'vegh':v,'hooten':ht}#'nick':nk,'mcgahon':mc}#,'wojcicka':wj,'rau':ra}
l=len(datasets)

#filter out low abundance isomirs
for d in datasets:
    f=datasets[d]
    datasets[d] = f[f.total>200]
fig,ax=plt.subplots(l,1,figsize=(6,8))
grid=ax.flat
bins = np.linspace(0, 1, 20)
clrs = sns.color_palette("Set2", l)
i=0
for d in datasets:
    g=sns.distplot(datasets[d].domisoperc,bins=bins,ax=grid[i],color=clrs[i],kde=True)#kde_kws={"lw": 2})
    g.set_title(d)
    g.set_xlabel('')
    i+=1
plt.tight_layout()
'''fig,ax=plt.subplots(1,1,figsize=(10,8))
i=0
for d in datasets:
    f=datasets[d]   
    g=sns.kdeplot(f.domisoperc,ax=ax,color=clrs[i],lw=3,label=d)
    i+=1
fig.suptitle('dominant isomir percentages',fontsize=20)'''
fig.savefig('isomir_domperc_dists.png')



In [4]:
ic.read
fig,ax=plt.subplots(1,1,figsize=(6,6))
vd=base.venndiagram([dw.read,ic.read,v.read],['douwe','iconmap','vegh'],set_colors=['r','y','b'], alpha=0.6,ax=ax)
ax.set_title('isomir overlap')
plt.savefig('isomir_overlap.png')



In [7]:
#dominant isomir classes

def countclasses(dom):
    dom['iclass'] = dom.apply(lambda x: x.variant+' '+str(x.pos),1)
    #x=dom.iclass.value_counts()/len(dom) 
    x=dom.variant.value_counts()/len(dom) 
    return x

d1=countclasses(ic)
d2=countclasses(dw)
d3=countclasses(ht)
d4=countclasses(v)
x=pd.concat([d1,d2,d3,d4],1).fillna(0)
x= x.rename(columns={0:'bovine serum (UCD)',1:'bovine serum (CVI)',3:'bovine macrophage',2:'human serum'})
x.plot(kind='barh',width=.8,grid=False,figsize=(8,6))
plt.xlabel('fraction of total')
plt.ylabel('class')
plt.tight_layout()
plt.savefig('isomir_domclasses.png')



In [8]:
#compare isomirs between data

def diagonal(ax):    
    ax.set_ylim(ax.get_xlim())    
    ax.plot([0, 1], [0, 1], transform=ax.transAxes,color='red',alpha=0.7)

def compareIsomirs(x,y,a,b,name='',xlabel='',ylabel='',ax=None):
    x=x[x.perc>0.05]
    y=y[y.perc>0.05]
    #fig,ax=plt.subplots(1,2,figsize=(11,6),sharex=False)
    #grid=ax.flat
    '''ax=grid[0]
    d = a.merge(b,on=['read'],how='inner')
    #how much do dominant match?    
    d.plot('isomirs_x','isomirs_y',kind='scatter',logx=True,logy=True,s=40,grid=False,ax=ax)
    #ax.set_xlim((0,1))
    diagonal(ax)'''
    
    b = x.merge(y,on=['read'],how='inner')
    
    #ax=grid[0]
    b.plot('mean_norm_x','mean_norm_y',kind='scatter',logx=True,logy=True,alpha=0.8,grid=False,ax=ax)
    #b.plot('perc_x','perc_y',kind='scatter',alpha=0.8,grid=False,ax=ax)
    ax.set_xlim((1,5e5))
    #ax.set_xlim((0,1))
    diagonal(ax)
    print 'cc=%s' %np.corrcoef(np.log10(b.mean_norm_x),np.log10(b.mean_norm_y))[0][1]

    #ax.set_title(name) #'isomir normalised counts')
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)

#compareIsomirs(iciso,nkiso,ic,nk,'iconmap nick')
fig,ax=plt.subplots(1,3,figsize=(14.5,5),sharex=False)
grid=ax.flat
compareIsomirs(iciso,dwiso,ic,dw,xlabel='UCD',ylabel='CVI',ax=grid[0])
compareIsomirs(iciso,viso,ic,v,xlabel='serum (UCD)',ylabel='macrophage',ax=grid[1])
compareIsomirs(iciso,htiso,ic,ht,xlabel='serum (UCD)',ylabel='human serum',ax=grid[2])
#grid[0].set_xlabel('mean normalised counts')
#grid[0].set_ylabel('mean normalised counts')
plt.tight_layout()
fig.suptitle('isomiR mean normalised counts compared',fontsize=22)
fig.subplots_adjust(top=0.9)
plt.savefig('isomirs_compared_counts.png')


cc=0.834924671182
cc=0.456473797097
cc=0.711359989779

In [6]:
#percentage is not dependant on total NB
x=iciso[iciso.variant=='exact']

iciso.plot('mean_norm','perc',kind='scatter',logx=True,alpha=.6,s=30,c='seagreen',
         grid=False,figsize=(8,6),by='variant')

plt.title('isomiR fraction as function of mean counts')
plt.savefig('isomir_percvscounts.png')



In [16]:
#counts of dominant and sub-dominant dists
def binIsomirs(iso):
    g=iso.groupby('name').agg({'perc':lambda x: (x > .06).sum(), 
                                  'variant': lambda x: base.first(x),'total': lambda x: base.first(x)})    
    return g

cols = ['read','total','perc','length','isoClass']
g1=binIsomirs(dwiso)
g1['tag']='CVI (10-15 yr)'
g2=binIsomirs(iciso)
g2['tag']='UCD (fresh)'
g3=binIsomirs(htiso)
g3['tag']='human'
g4=binIsomirs(viso)
g4['tag']='macrophage'

#print dwiso[dwiso.name=='bta-miR-342'][cols]
subdom = g1.sort('perc',ascending=False)[:6]
onedom = g1.sort('perc')[1:7]
#print subdom
#print onedom
fig,ax=plt.subplots(1,1,figsize=(10,4))
bins=np.arange(0,8)+0.6
g=pd.concat([g1,g2])
h=g.hist('perc',width=0.8,bins=bins,color='deepskyblue',grid=False,ax=ax,by='tag',normed=True)
#sns.kdeplot(g1.perc,kernel='gau',bw=1,ax=h[0])
#sns.kdeplot(g2.perc,kernel='gau',bw=1,ax=h[1])
h[0].set_xlabel('no. dominant isomiRs (>5%)')
h[0].set_ylabel('normalised count')
plt.suptitle('distribution of isomiRs with >5% of total reads',fontsize=14)
plt.subplots_adjust(top=0.85)
plt.savefig('isomir_subdom_breakdown.png')

g.plot('perc','total',kind='scatter',logy=True,s=50,marker='s',alpha=0.6,c='darkblue',figsize=(6,6))
plt.title('isomiR single/sub dominance vs total counts')
plt.xlabel('no. dominant isomiRs (>5%)')
plt.tight_layout() 
plt.savefig('isomir_domperc_counts.png')
#types of variant by no. of sub/dominant isomirs
#print g2.groupby('perc').apply(lambda x: x.variant.value_counts())



In [22]:
x=iciso; y=dwiso
b = x.merge(y,on=['isoClass','read'],how='left')
#tmp=b[b.name_x=='bta-miR-23a']
#tmp=tmp[tmp.perc_x>0.02]
#print tmp[['isoClass','perc_x','perc_y','mean_norm_x','mean_norm_y','read']]

#names = b.name_x.unique()[:9]
names1 = onedom.index
names2 = subdom.index

def plotisodists(d,names):
    fig,ax=plt.subplots(2,3,figsize=(10,5)) 
    
    grid=ax.flat
    i=0
    for name in names:
        x = d[d.name_x==name]
        if len(x)==0:
            continue
        x=x[x.perc_x>0.02]
        #print name
        #print x[['isoClass_x','perc_x','perc_y']]       
        ax=grid[i]
        x = x.sort('perc_x')
        x.plot(x='variant_x',y=['perc_x','perc_y'],kind='barh',ax=ax,#sharex=False,
               grid=False,legend=False,width=0.8)  
        ax.set_title(name,fontsize=14)
        ax.set_ylabel('')
        ax.set_xlabel('fraction of reads')
        #ax.set_xticklabels('') 
        #ax.set_yticklabels('')
        #ax.set_xlim(0,1) 
        #bars=filter(lambda x: isinstance(x, mpl.patches.Rectangle), ax.get_children())
        #print bars[0]
        i+=1
    grid[0].set_ylabel('isomiR')    
    plt.legend(['UCD','CVI'])    
    plt.tight_layout()
    plt.savefig('isoperc_dists.png')

#mpl.rc('ytick', labelsize=12) 

sns.set_context("notebook", font_scale=1.1)
plotisodists(b,names1)
plotisodists(b,names2)
#other = ['bta-miR-205','bta-miR-92a']
#plotisodists(b,other)