Reference:
D. Farrell, R. G. Shaughnessy, L. Britton, D. E. MacHugh, B. Markey, and S. V. Gordon, “The Identification of Circulating MiRNA in Bovine Serum and Their Potential as Novel Biomarkers of Early Mycobacterium avium subsp paratuberculosis Infection,” PLoS One, vol. 10, no. 7, p. e0134310, 2015.
In [4]:
import glob,os
import pandas as pd
import numpy as np
import mirnaseq.mirdeep2 as mdp
import mirnaseq.srnabench as srb
from mirnaseq import base, analysis, ensembl
pd.set_option('display.width', 130)
%matplotlib inline
import pylab as plt
base.seabornsetup()
plt.rcParams['savefig.dpi']=300
plt.rcParams['font.size']=16
In [2]:
reload(analysis)
reload(mdp)
path='/opt/mirnaseq/data/iconmap/'
os.chdir(path)
#mirdeep
path1 = 'analysis/iconmap_results_mirdeep'
ic = mdp.getResults(path1)
ic = mdp.filterExprResults(ic,meanreads=150,freq=0.8)
k = ic[ic.novel==False]
n = ic[ic.novel==True]
n=n[n['miRDeep2 score']>=4]
condmap1 = mdp.getLabelMap(path1,'samplelabels.csv')
#srnabench
path2 = 'analysis/iconmap_results_srnabench'
sk,sn,iso = srb.getResults(path2)
sk = srb.filterExprResults(sk,meanreads=150)
condmap2 = srb.getLabelMap(path2,'samplelabels.csv')
In [9]:
def DEbytime(df, condmap):
status=['control','infected']
res=[]
for s in status:
c = condmap[condmap.status==s]
x = pd.Categorical(c['month'],categories=[0,6])
c['factor'] = x.labels
c['label'] = c.apply(lambda x: x.id+'_'+str(x.factor),1)
c = c.sort('factor')
scols = c.id
data = df[scols]
data.columns = c.label
data.to_csv('decounts.csv')
de = base.runEdgeR('decounts.csv', 1.5)
de['status']=s
res.append(de)
print de
detime = pd.concat(res)
detime.to_csv('de_time.csv')
return detime
In [4]:
#diff expr by time
df1 = pd.concat([k,n])
df1 = df1.set_index('#miRNA')
df2 = sk.set_index('name')
detime = DEbytime(df1, condmap1)
In [5]:
def meltdata(df, labels):
t=df.T
t.index = cols
t = t.merge(labels,left_index=True,right_on='id')
tm = pd.melt(t,id_vars=list(labels.columns),
var_name='miRNA',value_name='read count')
return tm
In [9]:
df = ic.set_index('#miRNA')
names = detime.name
#names=df.index[10:30]
cols,normcols = mdp.getColumnNames(df)
df = df.ix[names][normcols]
tm = meltdata(df,condmap1)
g = base.sns.factorplot('month','read count','status', tm, col='miRNA', kind="point",
col_wrap=4,size=4,aspect=0.9,legend_out=True,sharey=False)
plt.savefig('iconmap_DE_time.png')
#g = base.sns.factorplot('animal','read count', 'month',data=tm, col='miRNA', kind="bar",
# col_wrap=2,size=4,aspect=2,legend_out=True,sharey=False,palette='Set2')
#plt.savefig('iconmap_DE_time_animals.png')
In [39]:
def DEbystatus(df,condmap):
#diff expr by infection
status=['control','infected']
months=[0,6]
res=[]
for m in months:
c = condmap[condmap.month==m]
x = pd.Categorical(c['status'],categories=status)
c['factor'] = x.labels
c['label'] = c.apply(lambda x: x.id+'_'+str(x.factor),1)
c = c.sort('factor')
scols = c.id
data = df[scols]
data.columns = c.label
data.to_csv('decounts.csv')
de = base.runEdgeR('decounts.csv', 1.5)
de['month']=m
res.append(de)
print de
deall = pd.concat(res)
return deall
In [ ]:
destatus = DEbystatus(df1, condmap1)
df = ic.set_index('#miRNA')
names = destatus.name
#names=df.index[10:30]
cols,normcols = mdp.getColumnNames(df)
df = df.ix[names][normcols]
tm = meltdata(df,condmap1)
g = base.sns.factorplot('month','read count','status', tm, col='miRNA', kind="box",
col_wrap=2,size=4,aspect=1,legend_out=True,sharey=False)
plt.savefig('iconmap_DE_status.png')
g = base.sns.factorplot('animal','read count','month', tm, col='miRNA', kind="bar",
col_wrap=2,size=3,aspect=2,legend_out=True,sharey=False)
In [10]:
#isomir diff expr
df3 = iso[(iso.mean_norm>50) & (iso.freq>0.9)]
df3['tag'] = df3.name+'_'+df3.variant+'_'+df3.index.astype(str) # +iso.read
df3 = df3.set_index('tag')
res = DEbytime(df3, condmap2)
In [ ]:
#res = res[res.logFC<2]
res = res.sort('logFC',ascending=False)
names=res.name[:30]
#names=df.index[10:30]
cols,normcols = srb.getColumnNames(df3)
df = df3.ix[names][normcols]
tm = meltdata(df,condmap2)
g = base.sns.factorplot('month','read count','status', tm, col='miRNA', kind="point",
col_wrap=4,size=3,aspect=1.5,legend_out=True,sharey=False)
g = base.sns.factorplot('animal','read count', 'month',data=tm, col='miRNA', kind="bar",
col_wrap=3,size=3,aspect=3,legend_out=True,sharey=False,palette='Set2')
In [ ]:
#isomir diversity comparisons
reload(srb)
#cols,normcols = srb.getColumnNames(iso)
#normcols.append('name')
df = iso.set_index('name')
#x=srb.getTopIsomirs(iso)
status=['control','infected']
month = [0,6]
for s in status:
for m in month:
c = condmap2[(condmap2.month==m) & (condmap2.status==s)]
scols = c.id#+'_norm'
data = df[scols].reset_index()
#data = data[data.mean(1)>50]
print data[:10]
print s,m,len(data)
x=srb.getTopIsomirs(data)
print x[:2]
In [7]:
#compare methods reload(analysis)
plt.rcParams['font.size']=18
analysis.compareMethods('analysis/iconmap_results_mirdeep',
'analysis/iconmap_results_srnabench')
In [8]:
#overlap between our miRs and other circulating
mirbase = base.fasta2DataFrame('mature_btau_alt.fa').reset_index()
mb = mirbase.name.str.replace('bta-','')
a=k['#miRNA'].str.replace('bta-','')
mdla = pd.read_csv('/opt/mirnaseq/analysis/miRandola_1.6.csv')
h1 = mdla[mdla.miRNA_type=='circulating'].name.str.replace('hsa-','')
f,ax=plt.subplots(1,1,figsize=(10,8))
v=base.venndiagram([mb,a,h1],['miRBase cow','bovine serum','human miRandola'],ax, set_colors=['r','y','g'], alpha=0.6)
ax.set_title('miRNA family overlap',fontsize=20)
plt.tight_layout()
f.savefig('known_overlap.png')
In [30]:
#ks test
reload(analysis)
c1 = condmap1[(condmap1.status=='infected') & (condmap1.month==6)]
pairs = zip(*[iter(c1.id)]*2)
print c1
print pairs
df = mdp.getResults(path1)
analysis.KStest(df, ids=pairs)
#pd.scatter_matrix(np.log(df[c1.id].replace(0,.001)))
In [41]:
reload(ensembl)
pd.set_option('display.max_colwidth', 200)
#novel
df = mdp.getResults(path1)
df = mdp.filterExprResults(df,meanreads=150,freq=0.8)
n = df[df.novel==True]
n = n[n['miRDeep2 score']>=4]
print len(n)
#print n[mdp.mirdeepcols[:9]]
#sn = n[n['#miRNA'].str.contains('26_')]
#print sn[['#miRNA','consensus precursor sequence']].sort('#miRNA')
dnov = pd.read_csv('douwe_novel_all.csv')
dnov = dnov[['precursor coordinate','total','miRDeep2 score']]
#ensembl.getmiRNAOrthologs(n)
cons = ensembl.summarise(n)
cons = cons.reset_index().merge(dnov,on='precursor coordinate',how='left',suffixes=['','_douwe'])
print cons
cons.to_csv('novel_conserved.csv',index=False)
In [74]:
no = pd.read_csv('novel_orthologs.csv',index_col=0)
no=no.drop(['seq','gene_loc'],1)
#no.columns.insert(-1,'seq')
print no[no['#miRNA']=='26_16197']
In [15]:
reload(ensembl)
#res = ensembl.getHostGenes(k)
#gk.to_csv('known_hostgenes.csv')
gk=pd.read_csv('known_hostgenes.csv')
#print gk
gk = gk.merge(k,on='#miRNA',how='right')
x = gk.tu.replace(np.nan,'intergenic').value_counts()
print x/x.sum()
x.plot(kind='pie')
plt.title('percent in host genes')
Out[15]:
In [5]:
#biomart
from biomart import BiomartServer
#server = BiomartServer( "http://www.biomart.org/biomart" )
#print server.show_databases()
from biomart import BiomartDataset
d = BiomartDataset( "http://www.biomart.org/biomart", name = 'uniprot' )
#print d.show_filters()
#print d.show_attributes()
In [12]:
#genenames = list(no.dropna().genes.unique())
genenames = gk.dropna().gene.unique()
res=[]
for g in genenames:
#print g
term = {'filters': { 'gene_name': g,'proteome_name': 'Bos taurus'},
'attributes':['accession','ensembl_id','name','go_name']}
response = d.search(term,header=1)
items = [i.split('\t') for i in response.iter_lines()]
if len(items)<2: continue
x = pd.DataFrame(items[1:],columns=items[0])
res.append(x)
res = pd.concat(res)
res['domain'] = res['GO name'].apply(lambda x: x.split(':')[0])
res = res[res.domain=='C']
ccomp = res['GO name'].value_counts()
In [11]:
ccomp[:10].plot('bar',cmap='Spectral')
Out[11]:
In [19]:
#customise ncrna plot
reload(analysis)
df = pd.read_csv('ncrna_mapped.csv',index_col=0)
catnames = {'mirbase-hairpin':'miRBase','Rfam_btau':'rRNA','bosTau6-tRNAs':'tRNA','unmapped':'unmapped',
'bostau-snRNA':'sRNA','noncodev4_btau':'noncode','bos_taurus_alt':'UMD3.1'}
samples = pd.read_csv('samplelabels.csv')
df = df.rename(columns=catnames)
cols = df.columns
x = df.merge(samples,left_on='name',right_on='filename')
x['name'] = x.apply(lambda r: str(r.animal)+'_'+r.status+'_'+str(r.month),1)
x=x.sort('name')
#for i,g in x.groupby('month'):
analysis.plotRNAmapped(x[cols])
In [21]:
reload(analysis)
infile = '/opt/mirnaseq/data/iconmap_feb15/combined/ncrna_map/sample_1_combined_cut.fastq'
analysis.mirnaDiscoveryTest(infile)
In [66]:
#rna structures
reload(base)
names=['20_11724','22_13473','15_6321','3_18437','3_18517','26_16197']
x=n.set_index('#miRNA')
for i,r in x.iterrows():
if i not in names:
continue
seq = r['consensus precursor sequence']
mature = r['consensus mature sequence']
star = r['consensus star sequence']
print i, mature, star
struct = base.plotRNA(seq, path=os.getcwd(), subseqs=[mature,star], name=i)
In [ ]: