ICONMAP project miRNA data analysis

Reference:

D. Farrell, R. G. Shaughnessy, L. Britton, D. E. MacHugh, B. Markey, and S. V. Gordon, “The Identification of Circulating MiRNA in Bovine Serum and Their Potential as Novel Biomarkers of Early Mycobacterium avium subsp paratuberculosis Infection,” PLoS One, vol. 10, no. 7, p. e0134310, 2015.


In [4]:
import glob,os
import pandas as pd
import numpy as np
import mirnaseq.mirdeep2 as mdp
import mirnaseq.srnabench as srb
from mirnaseq import base, analysis, ensembl
pd.set_option('display.width', 130)
%matplotlib inline
import pylab as plt
base.seabornsetup()
plt.rcParams['savefig.dpi']=300
plt.rcParams['font.size']=16

In [2]:
reload(analysis)
reload(mdp)
path='/opt/mirnaseq/data/iconmap/'
os.chdir(path)
#mirdeep
path1 = 'analysis/iconmap_results_mirdeep'
ic = mdp.getResults(path1)
ic = mdp.filterExprResults(ic,meanreads=150,freq=0.8)
k = ic[ic.novel==False]
n = ic[ic.novel==True]
n=n[n['miRDeep2 score']>=4]
condmap1 = mdp.getLabelMap(path1,'samplelabels.csv')

#srnabench
path2 = 'analysis/iconmap_results_srnabench'
sk,sn,iso = srb.getResults(path2)
sk = srb.filterExprResults(sk,meanreads=150)
condmap2 = srb.getLabelMap(path2,'samplelabels.csv')

In [9]:
def DEbytime(df, condmap):
    status=['control','infected']
    res=[]
    for s in status:
        c = condmap[condmap.status==s]
        x = pd.Categorical(c['month'],categories=[0,6])
        c['factor'] = x.labels
        c['label'] = c.apply(lambda x: x.id+'_'+str(x.factor),1)
        c = c.sort('factor')    
        scols = c.id
        data = df[scols]    
        data.columns = c.label       
        data.to_csv('decounts.csv')
        de = base.runEdgeR('decounts.csv', 1.5)
        de['status']=s
        res.append(de)
        print de
    detime = pd.concat(res)    
    detime.to_csv('de_time.csv')
    return detime

In [4]:
#diff expr by time
df1 = pd.concat([k,n])
df1 = df1.set_index('#miRNA')
df2 = sk.set_index('name')
detime = DEbytime(df1, condmap1)


/usr/local/lib/python2.7/dist-packages/pandas-0.16.0-py2.7-linux-x86_64.egg/pandas/core/categorical.py:396: FutureWarning: 'labels' is deprecated. Use 'codes' instead
  warnings.warn("'labels' is deprecated. Use 'codes' instead", FutureWarning)
-c:7: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
-c:8: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
 [1] "0" "0" "0" "0" "0" "0" "1" "1" "1" "1" "1" "1"

               name     logFC     logCPM    PValue       FDR   status
0       bta-miR-205  2.323181   8.635382  0.000002  0.000176  control
2    bta-miR-126-5p  1.706727   8.520117  0.000300  0.003967  control
3       bta-miR-143  1.702237  10.203733  0.000005  0.000184  control
4    bta-miR-27a-3p  1.633126  10.343266  0.000010  0.000287  control
5       bta-miR-92b  1.598428  10.845843  0.000074  0.001102  control
6       bta-miR-10b  1.570112  14.530603  0.000022  0.000429  control
114     bta-miR-127 -1.618129  11.239548  0.000027  0.000465  control
115     bta-miR-432 -1.916355   7.621927  0.000015  0.000351  control
116         9_25151 -1.966658  10.038651  0.000003  0.000176  control
117         13_4632 -2.044966   7.816529  0.004710  0.043461  control
118         4_20321 -2.347488   4.746192  0.000423  0.005028  control
 [1] "0" "0" "0" "0" "0" "0" "1" "1" "1" "1" "1" "1"

            name     logFC    logCPM        PValue           FDR    status
0    bta-miR-205  2.184026  8.876098  8.964759e-14  1.066806e-11  infected
117      4_20321 -2.057422  4.200245  9.175926e-04  1.213261e-02  infected
118  bta-miR-432 -2.242852  7.549442  3.021257e-11  1.797648e-09  infected

In [5]:
def meltdata(df, labels):
    t=df.T
    t.index = cols    
    t = t.merge(labels,left_index=True,right_on='id')   
    tm = pd.melt(t,id_vars=list(labels.columns),
                 var_name='miRNA',value_name='read count')
    return tm

In [9]:
df = ic.set_index('#miRNA')
names = detime.name
#names=df.index[10:30]
cols,normcols = mdp.getColumnNames(df)
df = df.ix[names][normcols]

tm = meltdata(df,condmap1)

g = base.sns.factorplot('month','read count','status', tm, col='miRNA', kind="point",
                            col_wrap=4,size=4,aspect=0.9,legend_out=True,sharey=False)
plt.savefig('iconmap_DE_time.png')
#g = base.sns.factorplot('animal','read count', 'month',data=tm, col='miRNA', kind="bar",
#                         col_wrap=2,size=4,aspect=2,legend_out=True,sharey=False,palette='Set2')
#plt.savefig('iconmap_DE_time_animals.png')



In [39]:
def DEbystatus(df,condmap):
    #diff expr by infection
    status=['control','infected']
    months=[0,6]    
    res=[]
    for m in months:
        c = condmap[condmap.month==m]
        x = pd.Categorical(c['status'],categories=status)
        c['factor'] = x.labels
        c['label'] = c.apply(lambda x: x.id+'_'+str(x.factor),1)
        c = c.sort('factor')   
        scols = c.id   
        data = df[scols]    
        data.columns = c.label    
        data.to_csv('decounts.csv')
        de = base.runEdgeR('decounts.csv', 1.5)
        de['month']=m
        res.append(de)
        print de        
    deall = pd.concat(res) 
    return deall

In [ ]:
destatus = DEbystatus(df1, condmap1)

df = ic.set_index('#miRNA')
names = destatus.name
#names=df.index[10:30]
cols,normcols = mdp.getColumnNames(df)
df = df.ix[names][normcols]
tm = meltdata(df,condmap1)

g = base.sns.factorplot('month','read count','status', tm, col='miRNA', kind="box",
                            col_wrap=2,size=4,aspect=1,legend_out=True,sharey=False)
plt.savefig('iconmap_DE_status.png')
g = base.sns.factorplot('animal','read count','month', tm, col='miRNA', kind="bar",
                            col_wrap=2,size=3,aspect=2,legend_out=True,sharey=False)

In [10]:
#isomir diff expr
df3 = iso[(iso.mean_norm>50) & (iso.freq>0.9)]
df3['tag'] = df3.name+'_'+df3.variant+'_'+df3.index.astype(str) # +iso.read
df3 = df3.set_index('tag')
res = DEbytime(df3, condmap2)


 [1] "0" "0" "0" "0" "0" "0" "1" "1" "1" "1" "1" "1"

                           name     logFC     logCPM        PValue           FDR   status
0        bta-miR-205_lv3p_38283  2.583610   7.028488  2.111344e-10  4.338812e-08  control
1       bta-miR-205_exact_38284  2.435571   6.564262  2.559840e-05  4.102623e-04  control
2        bta-miR-205_lv3p_38279  2.281452   6.637006  6.139603e-08  4.587958e-06  control
3     bta-miR-27a-3p_lv3p_47828  2.243228   7.188335  2.058058e-08  2.679327e-06  control
4        bta-miR-27b_lv3p_47829  2.243225   7.188335  2.088398e-08  2.679327e-06  control
5     bta-miR-27a-3p_lv3p_47838  2.196640   7.927045  5.486711e-08  4.510753e-06  control
6      bta-miR-21-5p_lv3p_30254  2.167486   6.074332  1.173502e-07  6.890135e-06  control
7        bta-miR-10b_lv3p_28871  2.129061   8.733686  1.084041e-07  6.854474e-06  control
8       bta-miR-143_nta#T_42032  2.107948   8.182706  7.452607e-07  2.663497e-05  control
9        bta-miR-10b_lv3p_30855  2.105019   8.955312  1.337919e-07  7.331799e-06  control
10       bta-miR-10b_lv3p_29196  2.091197   7.168629  2.347153e-07  1.205850e-05  control
11       bta-miR-10b_lv3p_28562  1.973810  13.614865  1.898132e-06  5.778758e-05  control
12       bta-miR-10b_lv3p_48274  1.972305   6.174363  4.399565e-06  9.272929e-05  control
13       bta-miR-10b_lv3p_29129  1.963152   8.081788  3.513356e-06  8.022162e-05  control
14      bta-miR-92b_exact_31439  1.932861   9.187374  2.272987e-06  6.227984e-05  control
15       bta-miR-10b_lv3p_28152  1.930771  10.680200  1.321143e-06  4.343919e-05  control
16   bta-miR-27a-3p_exact_47841  1.925955   9.088669  6.040780e-07  2.257055e-05  control
17   bta-miR-126-5p_exact_17930  1.909945   8.033158  3.567698e-04  3.858747e-03  control
18       bta-miR-10b_lv3p_28633  1.904506   8.668793  2.609803e-06  6.500781e-05  control
19       bta-miR-143_lv3p_42020  1.858273   8.821548  2.127725e-06  6.030999e-05  control
20          bta-miR-10b_mv_7770  1.844054   9.734953  2.864557e-06  6.727616e-05  control
21        bta-miR-133a_mv_48655  1.832058   6.441703  1.635824e-03  1.430476e-02  control
22      bta-miR-10b_nta#A_28549  1.811426   7.620949  9.584499e-06  1.750769e-04  control
23       bta-miR-92b_lv3p_31420  1.805084   7.680534  1.124637e-06  3.851883e-05  control
24       bta-miR-10b_lv3p_29062  1.794482   6.809576  3.700932e-06  8.222070e-05  control
25      bta-miR-125a_lv3p_34052  1.662301   6.537086  4.295882e-04  4.469892e-03  control
27       bta-miR-10b_lv3p_28627  1.605167   6.587326  2.738527e-06  6.620793e-05  control
28      bta-miR-92b_nta#T_31479  1.602334   6.746163  2.463671e-05  4.050275e-04  control
29          bta-miR-10b_mv_7748  1.588137   7.866790  5.990831e-05  8.346547e-04  control
30       bta-miR-10b_lv3p_30849  1.583429   6.926705  3.416959e-05  5.299510e-04  control
..                          ...       ...        ...           ...           ...      ...
35       bta-miR-143_lv3p_42016  1.535331   7.676596  4.923360e-05  7.358185e-04  control
36       bta-miR-92a_lv3p_29354  1.510527   8.261004  5.937955e-04  5.742352e-03  control
794      bta-miR-221_lv3p_10691 -1.542311  10.319272  1.229256e-03  1.086504e-02  control
795     bta-miR-486_nta#T_36429 -1.586621   8.010386  6.288090e-06  1.230669e-04  control
796     bta-miR-486_nta#A_35838 -1.597661   8.372941  4.045100e-04  4.262913e-03  control
797     bta-miR-486_nta#A_35920 -1.599061   7.135872  5.777191e-05  8.331317e-04  control
798     bta-miR-486_nta#T_36796 -1.609493  11.374710  2.595333e-05  4.102623e-04  control
799     bta-miR-486_nta#T_36812 -1.617696   8.304346  5.686175e-04  5.631369e-03  control
800     bta-miR-486_nta#T_36738 -1.645620  13.076498  4.220893e-06  9.130458e-05  control
801     bta-miR-486_nta#T_34751 -1.692636   6.926356  2.486735e-06  6.387800e-05  control
802     bta-miR-486_nta#T_37899 -1.744536   8.369333  2.120820e-06  6.030999e-05  control
803     bta-miR-486_nta#T_40756 -1.748943   7.632061  1.191139e-05  2.039825e-04  control
804     bta-miR-486_nta#T_36395 -1.757572  11.694112  2.853166e-07  1.379590e-05  control
805     bta-miR-486_nta#T_37839 -1.763992   7.033812  3.320810e-07  1.516503e-05  control
806     bta-miR-486_nta#A_36338 -1.778327   6.305328  2.371309e-06  6.287794e-05  control
807     bta-miR-486_nta#T_34360 -1.780049   6.651851  9.011506e-06  1.683513e-04  control
808     bta-miR-486_nta#T_35887 -1.804862   7.544679  4.787530e-06  9.838374e-05  control
809     bta-miR-486_nta#A_36508 -1.844849   7.946535  3.824852e-07  1.557259e-05  control
810     bta-miR-486_nta#T_40765 -1.852273   5.840994  1.360896e-05  2.282973e-04  control
811     bta-miR-486_nta#T_36441 -1.940158   6.258355  5.512122e-06  1.105113e-04  control
812     bta-miR-486_nta#T_40700 -1.977098   6.230027  2.281665e-08  2.679327e-06  control
813     bta-miR-486_nta#A_36241 -1.993517   9.311372  7.933625e-08  5.434533e-06  control
814     bta-miR-486_nta#T_36301 -2.043170   7.975606  3.978399e-07  1.557259e-05  control
815     bta-miR-486_nta#T_36591 -2.089019   8.687974  5.487534e-08  4.510753e-06  control
816     bta-miR-154c_exact_9682 -2.115983   7.070673  1.484938e-06  4.694690e-05  control
817     bta-miR-127_exact_38849 -2.574804   7.261388  4.179556e-08  4.294494e-06  control
818      bta-miR-127_lv3p_38813 -2.743162   6.827637  3.771124e-07  1.557259e-05  control
819     bta-miR-127_nta#T_38863 -3.268735   6.332052  1.008246e-10  3.842574e-08  control
820     bta-miR-127_nta#T_38892 -3.740155   6.893733  1.402399e-10  3.842574e-08  control
821     bta-miR-127_nta#T_38878 -5.178622   6.129684  4.638858e-12  3.813141e-09  control

[64 rows x 6 columns]
 [1] "0" "0" "0" "0" "0" "0" "1" "1" "1" "1" "1" "1"

                           name     logFC    logCPM        PValue           FDR    status
0        bta-miR-205_lv3p_38283  2.751287  7.081101  1.256730e-19  1.033032e-16  infected
1        bta-miR-205_lv3p_38279  2.173987  6.970024  4.521719e-12  7.433706e-10  infected
2       bta-miR-205_exact_38284  1.976079  6.933956  1.020205e-07  9.317873e-06  infected
3     bta-miR-27a-3p_lv3p_47838  1.704128  7.556267  1.411370e-14  2.900366e-12  infected
4    bta-miR-126-5p_exact_17930  1.561230  7.217108  1.135058e-06  5.488339e-05  infected
819     bta-miR-127_nta#T_38863 -2.144145  6.141942  6.052057e-11  8.291318e-09  infected
820     bta-miR-127_nta#T_38878 -2.498069  5.821830  4.867040e-16  2.000353e-13  infected
821     bta-miR-127_nta#T_38892 -3.100544  6.690599  6.421183e-15  1.759404e-12  infected
-c:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

In [ ]:
#res = res[res.logFC<2]
res = res.sort('logFC',ascending=False)
names=res.name[:30]
#names=df.index[10:30]
cols,normcols = srb.getColumnNames(df3)
df = df3.ix[names][normcols]
tm = meltdata(df,condmap2)
g = base.sns.factorplot('month','read count','status', tm, col='miRNA', kind="point",
                            col_wrap=4,size=3,aspect=1.5,legend_out=True,sharey=False)
g = base.sns.factorplot('animal','read count', 'month',data=tm, col='miRNA', kind="bar",
                         col_wrap=3,size=3,aspect=3,legend_out=True,sharey=False,palette='Set2')

#compare methods reload(analysis) analysis.compareMethods('analysis/iconmap_results_mirdeep', 'analysis/iconmap_results_srnabench')


In [ ]:
#isomir diversity comparisons
reload(srb)
#cols,normcols = srb.getColumnNames(iso)
#normcols.append('name')
df = iso.set_index('name')
#x=srb.getTopIsomirs(iso)

status=['control','infected']
month = [0,6]
for s in status:
    for m in month:        
        c = condmap2[(condmap2.month==m) & (condmap2.status==s)]
        scols = c.id#+'_norm'          
        data = df[scols].reset_index()
        #data = data[data.mean(1)>50]
        print data[:10]
        
        print s,m,len(data)
        x=srb.getTopIsomirs(data)
        print x[:2]

In [7]:
#compare methods reload(analysis) 
plt.rcParams['font.size']=18
analysis.compareMethods('analysis/iconmap_results_mirdeep', 
                        'analysis/iconmap_results_srnabench')


            #miRNA  total1  total2
38  bta-miR-99a-5p   19041   24132
42     bta-miR-99b   15819   24821
71     bta-miR-101    4240    9760

In [8]:
#overlap between our miRs and other circulating 
mirbase = base.fasta2DataFrame('mature_btau_alt.fa').reset_index()
mb = mirbase.name.str.replace('bta-','')
a=k['#miRNA'].str.replace('bta-','')
mdla = pd.read_csv('/opt/mirnaseq/analysis/miRandola_1.6.csv')
h1 = mdla[mdla.miRNA_type=='circulating'].name.str.replace('hsa-','')
f,ax=plt.subplots(1,1,figsize=(10,8))
v=base.venndiagram([mb,a,h1],['miRBase cow','bovine serum','human miRandola'],ax, set_colors=['r','y','g'], alpha=0.6)
ax.set_title('miRNA family overlap',fontsize=20)
plt.tight_layout()
f.savefig('known_overlap.png')



In [30]:
#ks test
reload(analysis)
c1 = condmap1[(condmap1.status=='infected') & (condmap1.month==6)]
pairs = zip(*[iter(c1.id)]*2)
print c1
print pairs
df = mdp.getResults(path1)
analysis.KStest(df, ids=pairs)
#pd.scatter_matrix(np.log(df[c1.id].replace(0,.001)))


    index  animal  month    status      label            filename   id
5      42    2393      6  infected   sample_6   sample_6_combined  s21
6      43    2152      6  infected   sample_7   sample_7_combined  s22
12      1    2155      6  infected  sample_13  sample_13_combined  s04
17      6    2402      6  infected  sample_18  sample_18_combined  s09
22     11    2388      6  infected  sample_23  sample_23_combined  s15
23     12    2390      6  infected  sample_24  sample_24_combined  s16
[('s21', 's22'), ('s04', 's09'), ('s15', 's16')]
0    128
1      0
2    158
dtype: int64

In [41]:
reload(ensembl)
pd.set_option('display.max_colwidth', 200)
#novel
df = mdp.getResults(path1)
df = mdp.filterExprResults(df,meanreads=150,freq=0.8)
n = df[df.novel==True]
n = n[n['miRDeep2 score']>=4]
print len(n)
#print n[mdp.mirdeepcols[:9]]
#sn = n[n['#miRNA'].str.contains('26_')]
#print sn[['#miRNA','consensus precursor sequence']].sort('#miRNA')

dnov = pd.read_csv('douwe_novel_all.csv')
dnov = dnov[['precursor coordinate','total','miRDeep2 score']]

#ensembl.getmiRNAOrthologs(n)
cons = ensembl.summarise(n)
cons = cons.reset_index().merge(dnov,on='precursor coordinate',how='left',suffixes=['','_douwe'])
print cons

cons.to_csv('novel_conserved.csv',index=False)


31
      #miRNA  ident  seq      energy      tu  seedcons         biotype         gene  read_count  miRDeep2 score  \
0   20_11724  1.000   11  -34.599998  intron        11  protein_coding       DROSHA         223           133.9   
1   22_13473  1.000   11  -22.200001    exon         7           miRNA  bta-mir-191         257           133.1   
2    15_6321  0.952   10  -49.200001     NaN         4             NaN          NaN          74            79.7   
3    3_18437  0.992   10 -105.000000     NaN         0  protein_coding      SLC35D2         219           136.1   
4    3_18843  0.863    9  -42.299999     NaN         3             NaN          NaN          70            71.7   
5   28_17041  1.000    9  -24.500000     NaN         2             NaN          NaN         256             5.3   
6    13_4632  0.990    8  -91.000000  intron         4  protein_coding         SNX5        1498             5.7   
7    10_1684  0.941    7  -22.200001     NaN         0             NaN          NaN          59             4.9   
8    11_2549  0.984    7  -28.100000    exon         4  protein_coding        PTGS1          59            33.7   
9    13_4213  0.763    5  -36.299999    exon         1  protein_coding         SRMS         253           160.0   
10  28_17046  0.873    5  -30.100000     NaN         2             NaN          NaN        3763             5.3   
11   18_8057  0.673    3  -19.600000     NaN         2             NaN          NaN         177             4.2   
12   9_24948  0.910    2  -18.299999     NaN         2             NaN          NaN         192             5.1   
13   6_21896  0.966    2  -10.300000  intron         2  protein_coding        PDCL2         235           120.6   
14   5_20820  0.968    2  -28.100000  intron         2  protein_coding        ABCC9          71             5.2   
15   4_20321  1.000    2  -16.400000     NaN         2             NaN          NaN         186             4.8   
16   3_18517  0.966    2  -21.600000  intron         2  protein_coding         ORC1        1771          1374.1   
17   10_1480  0.948    2  -25.100000  intron         1  protein_coding        DAAM1         193            38.7   
18   3_18032  0.965    2  -31.799999     NaN         2             NaN          NaN          54             5.6   
19  26_16197  0.763    2  -25.900000  intron         1  protein_coding        DMBT1        1572           887.3   
20  26_16101  0.915    2  -19.600000     NaN         1             NaN          NaN         143            99.0   
21   9_25151  0.957    2  -22.900000     NaN         2             NaN          NaN        7363             4.2   
22  29_18010    NaN    1         NaN  intron         0  protein_coding        BRSK2         152             5.9   
23   4_20156    NaN    1         NaN  intron         0  protein_coding        FAM3C         197           102.5   
24  24_14535    NaN    1  -19.799999     NaN         1             NaN          NaN         227           198.4   
25   9_24602    NaN    1         NaN    exon         0        misc_RNA  Metazoa_SRP         491             5.5   
26     1_987    NaN    1  -23.000000     NaN         1             NaN          NaN          51             4.6   
27   4_19879    NaN  NaN         NaN     NaN       NaN             NaN          NaN        2175             5.1   
28  25_15966    NaN  NaN         NaN     NaN       NaN             NaN          NaN         173             6.2   
29   3_18831    NaN  NaN         NaN     NaN       NaN             NaN          NaN         141            95.2   
30   13_4152    NaN  NaN         NaN     NaN       NaN             NaN          NaN          41             4.8   

   mirbase seed match      precursor coordinate     seed consensus mature sequence      freq  total  miRDeep2 score_douwe  
0         hsa-miR-635   20:42077865..42077925:+  cuugggc   ucuugggccccacccccggagac  1.000000    NaN                   NaN  
1                   -   22:51543482..51543545:-  aacgaaa    gaacgaaauccaagcgcagcug  1.000000    NaN                   NaN  
2                   -   15:36372561..36372621:-  ggcucgg     aggcucgggcugggccccggg  0.916667      7                  57.7  
3                   -    3:78601739..78601806:+  cggcccc   ccggccccgcgagcuccccgcga  1.000000    NaN                   NaN  
4        hsa-miR-7977    3:20198831..20198891:-  ucccagc    aucccagccgggucgagggaca  0.833333     14                   2.2  
5        hsa-miR-1469     28:8597865..8597927:-  ucggcgc    uucggcgccaccacccugcggg  1.000000    NaN                   NaN  
6        hsa-miR-4532   13:38543580..38543652:-  cccgggg       ccccggggagcccggcggu  1.000000    664                   2.1  
7      hsa-miR-423-5p   10:13002726..13002777:-  gaggggc      ugaggggcagagagugagaa  0.875000     33                   4.9  
8                   -   11:93244929..93244990:+  agccacc   aagccaccagagcucuguccuca  0.833333    NaN                   NaN  
9                   -   13:54591461..54591519:+  ggcgcgc    cggcgcgcggaggagcugagcg  1.000000     58                  40.3  
10    hsa-miR-4766-5p     28:9347067..9347134:-  cugaaag        ccugaaagcucuccaaac  1.000000    471                   5.3  
11      hsa-miR-22-3p     18:1864501..1864552:+  agcugcc       aagcugccaguggaagaag  0.916667     53                   2.8  
12     hsa-miR-142-5p    9:86842891..86842958:+  auaaagu     aauaaaguuuguuuggguuuu  0.958333    NaN                   NaN  
13                  -    6:72695823..72695882:+  aaaaagu     caaaaaguucguccagauuuu  1.000000    NaN                   NaN  
14    hsa-miR-6747-5p    5:88730950..88731013:+  ggggugu        ggggguguagcucagagg  0.833333     49                   5.2  
15    hsa-miR-6827-5p  4:115125346..115125387:-  gggagcc        agggagcccuggaauggg  0.875000    NaN                   NaN  
16                  -    3:94548590..94548649:+  aaaaccu    aaaaaccugaaugacccuuuug  1.000000    NaN                   NaN  
17                  -   10:71809966..71810024:+  aaacccg     aaaacccgaaugaacuuuuug  0.958333    NaN                   NaN  
18     hsa-miR-561-3p      3:2852303..2852360:+  aaaguuu       gaaaguuuguugggguuuu  0.833333    NaN                   NaN  
19                  -   26:42790257..42790316:+  cgagccu     ccgagccugacagaucacaca  1.000000    NaN                   NaN  
20   hsa-miR-548at-3p   26:26545665..26545724:+  aaaaccg     aaaaaccgaaugaacuuuuug  0.958333    NaN                   NaN  
21    hsa-miR-6873-5p    9:27844229..27844299:-  agaggga        gagagggagccugagaaa  1.000000    NaN                   NaN  
22      hsa-let-7i-3p   29:51095161..51095211:-  ugcgcaa     cugcgcaaagcgucgggcccc  1.000000    NaN                   NaN  
23                  -    4:86669291..86669351:-  aaacccu    aaaacccugaugaacuuuuuga  0.958333     19                  11.2  
24                  -     24:6436128..6436188:+  uaaccgg    auaaccggaaugaacuuuuuga  1.000000    NaN                   NaN  
25    hsa-miR-6829-5p      9:2105903..2105987:+  gggcugc     ugggcugcagugcgcuaugcc  1.000000    270                   1.7  
26    hsa-miR-6890-3p  1:135841882..135841950:-  cacugcc        ccacugccccaggugcug  0.916667    NaN                   NaN  
27     hsa-miR-515-5p          4:15729..15810:-  ucuccaa  cucuccaaucgcgacggguaucuc  1.000000    NaN                   NaN  
28     hsa-miR-132-5p   25:42673013..42673075:-  ccguggc       cccguggccccggcccgcc  0.958333     45                   6.2  
29        hsa-miR-637    3:19127469..19127551:-  cuggggg     acuggggguugagaaugucgc  1.000000     25                  19.2  
30    hsa-miR-4750-3p   13:45064441..45064507:+  cugaccc      acugacccaggugcugcugu  0.875000    NaN                   NaN  

In [74]:
no = pd.read_csv('novel_orthologs.csv',index_col=0)
no=no.drop(['seq','gene_loc'],1)
#no.columns.insert(-1,'seq')
print no[no['#miRNA']=='26_16197']


       species    #miRNA     seed  ident  seedcons mirbase  genes                location         biotype  trscpt  energy
9   Bos taurus  26_16197  CGAGCCT    NaN        38       -  DMBT1  26:42790257-42790316:1  protein_coding  intron   -25.9
10  Ovis aries  26_16197  CGAGCCT  0.763        -1       -  DMBT1  22:41368900-41368957:1  protein_coding  intron   -11.7

In [15]:
reload(ensembl)
#res = ensembl.getHostGenes(k)
#gk.to_csv('known_hostgenes.csv')
gk=pd.read_csv('known_hostgenes.csv')
#print gk
gk = gk.merge(k,on='#miRNA',how='right')
x = gk.tu.replace(np.nan,'intergenic').value_counts()
print x/x.sum()
x.plot(kind='pie')
plt.title('percent in host genes')


intergenic    0.647727
intron        0.329545
exon          0.022727
dtype: float64
Out[15]:
<matplotlib.text.Text at 0x7f65702f81d0>

In [5]:
#biomart
from biomart import BiomartServer
#server = BiomartServer( "http://www.biomart.org/biomart" )
#print server.show_databases()

from biomart import BiomartDataset
d = BiomartDataset( "http://www.biomart.org/biomart", name = 'uniprot' )
#print d.show_filters()
#print d.show_attributes()

In [12]:
#genenames = list(no.dropna().genes.unique())
genenames = gk.dropna().gene.unique()
res=[]
for g in genenames:
    #print g
    term = {'filters': { 'gene_name': g,'proteome_name': 'Bos taurus'},
            'attributes':['accession','ensembl_id','name','go_name']}
    response = d.search(term,header=1)   
    items = [i.split('\t') for i in response.iter_lines()]
    if len(items)<2: continue
    x = pd.DataFrame(items[1:],columns=items[0])   
    res.append(x)
res = pd.concat(res)  
res['domain'] = res['GO name'].apply(lambda x: x.split(':')[0])

res = res[res.domain=='C']
ccomp = res['GO name'].value_counts()

In [11]:
ccomp[:10].plot('bar',cmap='Spectral')


Out[11]:
<matplotlib.axes.AxesSubplot at 0x7f6570618890>

In [19]:
#customise ncrna plot
reload(analysis)
df = pd.read_csv('ncrna_mapped.csv',index_col=0)
catnames = {'mirbase-hairpin':'miRBase','Rfam_btau':'rRNA','bosTau6-tRNAs':'tRNA','unmapped':'unmapped',
        'bostau-snRNA':'sRNA','noncodev4_btau':'noncode','bos_taurus_alt':'UMD3.1'}
samples = pd.read_csv('samplelabels.csv')
df = df.rename(columns=catnames)
cols = df.columns
x = df.merge(samples,left_on='name',right_on='filename')
x['name'] = x.apply(lambda r: str(r.animal)+'_'+r.status+'_'+str(r.month),1)
x=x.sort('name')
#for i,g in x.groupby('month'):
analysis.plotRNAmapped(x[cols])


                 miRBase   rRNA   tRNA   sRNA  noncode  UMD3.1  unmapped
name                                                                    
2152_infected_0    0.036  0.019  0.608  0.005    0.040   0.082     0.210
2152_infected_6    0.041  0.017  0.566  0.006    0.034   0.104     0.232
2155_infected_0    0.041  0.027  0.599  0.007    0.034   0.098     0.194
2155_infected_6    0.031  0.020  0.599  0.006    0.053   0.103     0.188
2168_control_0     0.065  0.030  0.550  0.005    0.042   0.144     0.164
2168_control_6     0.066  0.035  0.529  0.004    0.058   0.130     0.178
2215_control_0     0.018  0.005  0.702  0.001    0.022   0.053     0.199
2215_control_6     0.051  0.035  0.582  0.003    0.039   0.128     0.162
2218_control_0     0.063  0.020  0.552  0.004    0.055   0.114     0.192
2218_control_6     0.041  0.025  0.536  0.005    0.052   0.101     0.240
2388_infected_0    0.025  0.022  0.601  0.005    0.036   0.083     0.228
2388_infected_6    0.038  0.018  0.604  0.006    0.062   0.087     0.185
2389_control_0     0.035  0.021  0.653  0.006    0.028   0.094     0.163
2389_control_6     0.047  0.032  0.556  0.007    0.048   0.118     0.192
2390_infected_0    0.032  0.015  0.630  0.006    0.031   0.085     0.201
2390_infected_6    0.026  0.012  0.638  0.006    0.036   0.097     0.185
2393_infected_0    0.049  0.023  0.619  0.007    0.033   0.086     0.183
2393_infected_6    0.034  0.017  0.580  0.008    0.037   0.113     0.211
2401_control_0     0.046  0.023  0.598  0.004    0.042   0.109     0.178
2401_control_6     0.043  0.038  0.532  0.005    0.045   0.113     0.224
2402_infected_0    0.045  0.025  0.546  0.006    0.055   0.096     0.227
2402_infected_6    0.043  0.019  0.589  0.008    0.050   0.105     0.186
2409_control_0     0.082  0.025  0.551  0.005    0.038   0.150     0.149
2409_control_6     0.039  0.034  0.615  0.005    0.040   0.129     0.138

In [21]:
reload(analysis)
infile = '/opt/mirnaseq/data/iconmap_feb15/combined/ncrna_map/sample_1_combined_cut.fastq'
analysis.mirnaDiscoveryTest(infile)


    reads  total  frac     rcm
0    0.05     41    41  464288
1    0.10     56    56    6406
2    0.20     73    73    3412
3    0.50     93    93    1399
4    1.00    118   118     636
5    1.50    129   129     357
6    2.00    140   140     216
7    2.50    151   151     159
8    3.00    163   163     146
9    3.50    167   167      65
10   4.00    167   167     NaN
11   4.50    171   171      65
12   5.00    176   176      56
13   5.50    178   178      38
14   6.00    181   181      53
15   6.50    182   182      38
    reads  total  frac     rcm
0    0.05     46    46  496131
1    0.10     63    63    6466
2    0.20     82    82    3442
3    0.50    108   108    1383
4    1.00    134   134     569
5    1.50    146   146     357
6    2.00    160   160     217
7    2.50    172   172     159
8    3.00    184   184     146
9    3.50    190   190      81
10   4.00    191   191      67
11   4.50    197   197      65
12   5.00    203   203      61
13   5.50    205   205      38
14   6.00    207   207      53
15   6.50    211   211      44
/usr/lib/pymodules/python2.7/matplotlib/figure.py:1595: UserWarning: This figure includes Axes that are not compatible with tight_layout, so its results might be incorrect.
  warnings.warn("This figure includes Axes that are not "

In [66]:
#rna structures
reload(base)
names=['20_11724','22_13473','15_6321','3_18437','3_18517','26_16197']
x=n.set_index('#miRNA')
for i,r in x.iterrows():
    if i not in names:
        continue
    seq = r['consensus precursor sequence']
    mature = r['consensus mature sequence']
    star = r['consensus star sequence']
    print i, mature, star    
    struct = base.plotRNA(seq, path=os.getcwd(), subseqs=[mature,star], name=i)


3_18517 aaaaaccugaaugacccuuuug aaaagguucauuuggguuuc
26_16197 ccgagccugacagaucacaca gugugaccugccaggcacc
22_13473 gaacgaaauccaagcgcagcug gcugcuuuugggauuccguugc
20_11724 ucuugggccccacccccggagac aucugggguagggccuggga
3_18437 ccggccccgcgagcuccccgcga cggggagccgaggcggccagc
15_6321 aggcucgggcugggccccggg ccgggccugggccgagcccca

In [ ]: