In [1]:
%matplotlib inline
set an interface for data acquisition, either from file or from remote
In [2]:
def rfam_uri(family_id):
return 'http://rfam.xfam.org/family/%s/alignment?acc=%s&format=fastau&download=0'%(family_id,family_id)
In [3]:
def rfam_uri(family_id):
return '%s.fa'%(family_id)
In [4]:
rfam_id = 'RF00005' #tRNA
In [5]:
rfam_id = 'RF00871' #microRNA mir-689
In [6]:
rfam_id = 'RF02275' #Hammerhead_HH9
write a parser for FASTA format
In [7]:
import re
from eden import util
def fasta_to_fasta( input ):
for m in re.finditer(r"^(>[^\n]+)\n+([^>]+)",'\n'.join(util.read( input )), re.MULTILINE):
if m:
header, seq = m.groups()
seq = re.sub('\n','',seq)
yield header
yield seq
In [8]:
iterable = fasta_to_fasta(rfam_uri(rfam_id))
[line for line in iterable]
Out[8]:
write a converter that takes a parsed fasta file in input, calls RNAfold, parses the output and builds a graph
In [9]:
import networkx as nx
def sequence_dotbracket_to_graph(seq_info, seq_struct):
G = nx.Graph()
lifo = list()
for i,(c,b) in enumerate( zip(seq_info, seq_struct) ):
G.add_node(i, label = c)
if i > 0:
#add backbone edges
G.add_edge(i,i-1, label='-')
if b == '(':
lifo.append(i)
if b == ')':
#when a closing bracket is found, add a basepair edge with the corresponding opening bracket
j = lifo.pop()
G.add_edge(i,j, label='=')
return G
import subprocess as sp
def pre_process(input):
lines = fasta_to_fasta(input)
for line in lines:
#get a header+sequence
header = line
seq = lines.next()
#invoke RNAfold
cmd = 'echo "%s" | RNAfold --noPS' % seq
out = sp.check_output(cmd, shell = True)
#parse the output
text = out.strip().split('\n')
seq_info = text[0]
seq_struct = text[1].split()[0]
#make a graph
G = sequence_dotbracket_to_graph(seq_info, seq_struct)
G.graph['id'] = header
yield G
display the graphs
In [10]:
from eden.util.display import draw_graph
import itertools
graphs = pre_process(rfam_uri(rfam_id))
for graph in itertools.islice(graphs,2):
draw_graph(graph, size=11, node_size=200, node_border=False)
Setup the vectorizer object
In [11]:
from eden.graph import Vectorizer
vectorizer = Vectorizer( complexity=2 )
In [12]:
def describe(X):
print 'Instances: %d ; Features: %d with an avg of %d features per instance' % (X.shape[0], X.shape[1], X.getnnz()/X.shape[0])
process the fasta, build the graphs, transform them with the vectorizer into sparse vectors
In [13]:
graphs = pre_process(rfam_uri(rfam_id))
Xp = vectorizer.transform( graphs )
describe(Xp)
create a custom fasta parser that shuffles the sequences; these will be used as negative examples
In [14]:
import random
def shuffle_fasta( input , times=1, order=1):
iterable = fasta_to_fasta(input)
for line in iterable:
#get pairs of header-sequence
header = line
seq = iterable.next()
#shuffle
for i in range(times):
#split the seqeunce in substrings of length 'order'
kmers = [ seq[i:i+order] for i in range(0,len(seq),order) ]
#shuffle and join
random.shuffle(kmers)
seq_out = ''.join(kmers)
yield header
yield seq_out
In [15]:
iterable = shuffle_fasta( rfam_uri(rfam_id), times=2, order=2)
graphs = pre_process( iterable )
Xn = vectorizer.transform( graphs )
describe(Xn)
build a data matrix by stacking the data matrix for positive and for negative instances, build a target vector with +1 for positive ids and -1 for negative ids
In [16]:
import numpy as np
from scipy.sparse import vstack
yp = [1] * Xp.shape[0]
yn = [-1] * Xn.shape[0]
y = np.array(yp + yn)
X = vstack( [Xp,Xn] , format = "csr")
fit a binary classifier from the scikit ML library
In [17]:
from sklearn.linear_model import SGDClassifier
estimator = SGDClassifier(class_weight='auto', shuffle = True )
estimator.fit(X,y)
Out[17]:
evaluate the quality of the classifier using the cross-validation technique
In [18]:
from sklearn import cross_validation
print 'Predictive performance:'
#assess the generalization capacity of the model via a k-fold cross validation
for scoring in ['accuracy','precision', 'recall', 'f1', 'average_precision', 'roc_auc']:
scores = cross_validation.cross_val_score( estimator, X, y, cv = 3, scoring = scoring )
print( '%20s: %.3f +- %.3f' % ( scoring, np.mean( scores ), np.std( scores ) ) )
use the decision_function of the classifier to get an indication on its confidence for each instance
In [19]:
graphs = pre_process(rfam_uri(rfam_id))
Xp = vectorizer.transform( graphs )
predictions = estimator.decision_function(Xp)
iterable = fasta_to_fasta(rfam_uri(rfam_id))
headers = [line for line in itertools.islice(iterable, 0, None, 2)]
for prediction,header in sorted(zip(predictions, headers), reverse=True):
print "Score: %0.3f %s " % (prediction, header)
In [20]:
%%time
from sklearn.cluster import KMeans
kmeans = KMeans( n_clusters=4 )
predictions = kmeans.fit_predict(Xp)
#get the headers of the FASTA files
iterable = fasta_to_fasta(rfam_uri(rfam_id))
headers = [line for line in itertools.islice(iterable, 0, None, 2)]
for prediction,header in sorted(zip(predictions, headers)):
print "cluster: %d %s " % (prediction, header)
In [21]:
graphs = itertools.islice( pre_process( rfam_uri( rfam_id ) ), 50 )
from eden.util.display import dendrogram
dendrogram( graphs, vectorizer )
In [22]:
%%time
graphs = itertools.islice( pre_process( rfam_uri( rfam_id ) ), 450 )
from eden.util.display import embed2D
embed2D( graphs, vectorizer )