In [8]:
%load_ext autoreload
%autoreload 2
from eden.util import configure_logging
import logging
DEBUG=False
NJOBS=4
if DEBUG:
NJOBS=1
configure_logging(logging.getLogger(),verbosity=1+DEBUG)
%matplotlib inline
!pwd
!date
In [9]:
# get data
from itertools import islice
from eden.converter.fasta import fasta_to_sequence
from eden.converter.rna.rnafold import rnafold_to_eden
from eden.converter.fasta import fasta_to_sequence
import itertools
from eden.graph import Vectorizer
def rfam_uri(family_id):
return 'http://rfam.xfam.org/family/%s/alignment?acc=%s&format=fastau&download=0'%(family_id,family_id)
def rfam_uri(family_id):
return '%s.fa'%(family_id)
def get_sequences(size=9999,withoutnames=False):
sequences = itertools.islice( fasta_to_sequence("../toolsdata/RF00005.fa"), size)
if withoutnames:
return [ b for (a,b) in sequences ]
return sequences
In [10]:
from graphlearn.utils import draw
import graphlearn.abstract_graphs.rna.rnatransform as learned
from eden.graph import Vectorizer
from sklearn.cluster import KMeans
import graphlearn.abstract_graphs.rna.rnadecomposer as decompose
v=Vectorizer()
make_decomposer = lambda x,y: decompose.RnaDecomposer(x,y,
include_base=False,
base_thickness_list=[2])
if True:
# if we are not interested in what the pp does we can skip this because it is slow
# it shows how the created abstractions look like.
pp=learned.GraphTransformerRNA(shape_cluster=KMeans(n_clusters=2),structure_mod=False)
pp.set_param(v)
gm=[make_decomposer(v,x) for x in pp.fit_transform(get_sequences(40))]
for i in range(2):
draw.graphlearn([gm[i+5].pre_vectorizer_graph(),gm[i+5].base_graph(),gm[i+5].abstract_graph()],size=13,contract=True, abstract_color='red')
In [ ]:
In [11]:
%%time
import graphlearn.abstract_graphs.rna.rnatransform as learned
import graphlearn.abstract_graphs.rna.infernal as infernal
import graphlearn.abstract_graphs.rna.rnafeasibility as feas
from graphlearn import feasibility
import graphlearn.estimate as estimate
#get some graphs
graphs = get_sequences(40)
#estimator wrapper... (default has nu=0.5)
estimator=estimate.OneClassEstimator( nu=.33, cv=2, n_jobs=NJOBS)
#feasibility checker can decide if a graph is a valid rna
feas=feasibility.FeasibilityChecker(checklist=[feasibility.default_check,feas.is_rna])
#preprocessor for rna.. it will take care of consensus folding etc
pp=learned.GraphTransformerRNA(shape_cluster=KMeans(n_clusters=2) ,structure_mod=False)
# create a sampler and fit
sampler=infernal.AbstractSampler(radius_list=[0,1],
thickness_list=[1],
min_cip_count=1,
min_interface_count=2,
feasibility_checker=feas,
graphtransformer=pp,
decomposergen=make_decomposer,
estimator=estimator)
sampler.fit(graphs,grammar_n_jobs=NJOBS,grammar_batch_size=1)
In [12]:
#draw production rules
from graphlearn.utils import draw
draw.draw_grammar(sampler.lsgg.productions,n_productions=5,n_graphs_per_production=5,
n_graphs_per_line=5, size=9, contract=False,
colormap='Paired', invert_colormap=False,node_border=1,
vertex_alpha=0.6, edge_alpha=0.5, node_size=450, abstract_interface=True)
In [13]:
%%time
import graphlearn.utils.draw as draw
import itertools
#parameters
sequences = get_sequences(withoutnames=True)
id_start=66
id_end=id_start+3
sequences = sequences[id_start:id_end]
n_steps=20
# sample
graphs = sampler.sample(sequences,
n_samples=4,
batch_size=1,
n_steps=n_steps,
n_jobs=NJOBS,
quick_skip_orig_cip=True,
probabilistic_core_choice=True,
burnin=0,
improving_threshold=0.9,
improving_linear_start=0.1,
max_size_diff=20,
select_cip_max_tries=40,
keep_duplicates=False,
accept_min_similarity=0.55,
include_seed=True,
backtrack=10,
monitor=True)
# collect data and print graph samples
scores=[]
sequences=[]
ids=range(id_start,id_end)
for i,path_graphs in enumerate(graphs):
print 'Graph id: %d'%(ids[i])
scores.append(sampler.monitors[i].sampling_info['score_history'])
#path_graphs = graph
sequences.append(sampler.monitors[i].sampling_info['notes'])
draw.graphlearn(path_graphs,contract=True,
n_graphs_per_line=5, size=10,
colormap='Paired', invert_colormap=False,node_border=0.5, vertex_color='color_level',
vertex_alpha=0.5, edge_alpha=0.7, node_size=350,edge_label='label',layout="RNA"
)
In [14]:
colors=['b','g','r','c','m','y','k','w']
'''
plot scores
'''
%matplotlib inline
from itertools import islice
import matplotlib.pyplot as plt
import numpy as np
step=1
num_graphs_per_plot=3
num_plots=np.ceil([len(scores)/num_graphs_per_plot])
for i in range(num_plots):
plt.figure(figsize=(10,5))
for j,score in enumerate(scores[i*num_graphs_per_plot:i*num_graphs_per_plot+num_graphs_per_plot]):
data = list(islice(score,None, None, step))
plt.plot(data,ls='-',color=colors[j], label='graph %d'%(j+i*num_graphs_per_plot+id_start))
# okok now we need to add the infernal evaluation
seqs=sequences[i*num_graphs_per_plot+j]
seqs=seqs.split('n')
data2= infernal.infernal_checker(seqs,cmfile='../rf00005.cm')
#draw line
plt.plot(data2,ls='--',color=colors[j], label='graph %d infernal score'%(j+i*num_graphs_per_plot+id_start))
plt.legend(loc='lower right')
plt.grid()
plt.ylim(-0.1,1.1)
plt.show()
In [ ]: