In [1]:
import itertools
def join_pre_processes( data, pre_processes, weights):
    if hasattr( data, '__iter__' ):
        iterable = data
    else: #if not then process url or file with fasta_to_fasta
        from eden.modifier.fasta import fasta_to_fasta
        iterable = fasta_to_fasta( data )
    from eden import util
    return util.join_pre_processes(iterable, pre_processes=pre_processes, weights=weights)

In [2]:
def pre_process_rnashapes( iterable ):
    from eden.converter.fasta import fasta_to_sequence
    seqs = fasta_to_sequence(iterable)
    
    from eden.converter.rna.rnashapes import rnashapes_to_eden
    graphs = rnashapes_to_eden( seqs, shape_type = 5, energy_range = 35, max_num = 3 )
    return graphs

In [3]:
def pre_process_fasta( iterable ):
    from eden.converter.fasta import fasta_to_sequence
    seqs = fasta_to_sequence(iterable)
    
    from eden.converter.fasta import sequence_to_eden
    graphs = sequence_to_eden(seqs)
    return graphs

In [4]:
def pre_process_structure( iterable ):
    from eden.converter.fasta import fasta_to_sequence
    seqs = fasta_to_sequence(iterable)

    from eden.converter.rna.rnashapes_struct import rnashapes_struct_to_eden
    graphs = rnashapes_struct_to_eden(seqs, shape=True, dotbracket=False, energy=False, shape_type=5, energy_range=35, max_num=3)
    return graphs

In [5]:
def pre_process_energy( iterable ):
    from eden.converter.fasta import fasta_to_sequence
    seqs = fasta_to_sequence(iterable)

    from eden.converter.rna.rnashapes_struct import rnashapes_struct_to_eden
    graphs = rnashapes_struct_to_eden(seqs, energy=True, dotbracket=False, shape=False, shape_type=5, energy_range=35, max_num=3)
    return graphs

In [6]:
def pre_process_contraction( iterable ):
    from eden.converter.fasta import fasta_to_sequence
    seqs = fasta_to_sequence(iterable)

    from eden.converter.rna.rnashapes import rnashapes_to_eden
    graphs = rnashapes_to_eden( seqs, shape_type=5, energy_range=35, max_num=3 )
    
    #annotate in node attribute 'type' the incident edges' labels
    from eden.modifier.graph import vertex_attributes
    graphs = vertex_attributes.incident_edge_label(graphs, level=1, output_attribute='type', separator='.')
    
    from eden.modifier.graph.structure import contraction, contraction_modifier
    label_modifier = contraction_modifier(attribute_in='type', attribute_out='label', reduction='set_categorical')
    
    #reduce all 'weight' attributes of contracted nodes using a sum to be written in the 'weight' attribute of the resulting graph 
    weight_modifier = contraction_modifier(attribute_in='weight', attribute_out='weight', reduction='sum')
    modifiers = [label_modifier, weight_modifier]
    
    #contract the graph on the 'type' attribute
    graphs = contraction(graphs, contraction_attribute='type', modifiers=modifiers)
    
    return graphs

In [7]:
def pre_process( data ):
    pre_processes = [pre_process_energy, pre_process_rnashapes, pre_process_fasta, pre_process_structure, pre_process_contraction]
    weights = [0.4, 0.2, 0.05, 0.3, 0.05]
    return join_pre_processes( data, pre_processes, weights)

In [8]:
def pre_process( data ):
    pre_processes = [pre_process_energy, pre_process_structure]
    weights = [0.2, 0.8]
    return join_pre_processes( data, pre_processes, weights)

In [9]:
def describe(X):
    print 'Instances: %d ; Features: %d (with an avg of %d features per instance)' % (X.shape[0], X.shape[1],  X.getnnz()/X.shape[0])

In [10]:
def model(uri, pre_process):

    from eden.graph import Vectorizer
    vectorizer = Vectorizer( complexity=2 )

    graphs = pre_process( uri )
    X1 = vectorizer.transform( graphs )
    describe(X1)
    
    from eden.modifier.fasta import fasta_to_fasta, shuffle_modifier
    graphs = pre_process( fasta_to_fasta( uri , modifier=shuffle_modifier, times=2, order=2) )
    X2 = vectorizer.transform( graphs )
    describe(X2)
    
    from eden.util import fit_estimator
    estimator = fit_estimator( positive_data_matrix=X1, negative_data_matrix=X2, cv=5 )
    
    return estimator, vectorizer

In [11]:
def list_model(uri, pre_process):

    from eden.graph import ListVectorizer
    vectorizer = ListVectorizer( complexity=2 )

    graphs_list, weights = pre_process( uri )
    X1 = vectorizer.transform( graphs_list, weights=weights )
    describe(X1)
    
    from eden.modifier.fasta import fasta_to_fasta, shuffle_modifier
    graphs_list, weights = pre_process( fasta_to_fasta( uri , modifier=shuffle_modifier, times=2, order=2) )
    X2 = vectorizer.transform( graphs_list, weights=weights )
    describe(X2)
    
    from eden.util import fit_estimator
    estimator = fit_estimator( positive_data_matrix=X1, negative_data_matrix=X2, cv=5 )
    
    return estimator, vectorizer

In [12]:
def rfam_uri(family_id):
    return 'http://rfam.xfam.org/family/%s/alignment?acc=%s&format=fastau&download=0'%(family_id,family_id)
def rfam_uri(family_id):
    return '%s.fa'%(family_id)

In [13]:
rfam_id = 'RF00871' #microRNA mir-689
rfam_id = 'RF02275' #Hammerhead_HH9
rfam_id = 'RF00005' #tRNA

In [14]:
%%time
estimator, vectorizer = model(rfam_uri( rfam_id ), pre_process_rnashapes)


Instances: 954 ; Features: 1048577 (with an avg of 919 features per instance)
Instances: 1908 ; Features: 1048577 (with an avg of 916 features per instance)
Classifier:
SGDClassifier(alpha=6.44322168128e-05, average=True, class_weight='auto',
       epsilon=0.1, eta0=0.116417790071, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', n_iter=18, n_jobs=-1,
       penalty='l2', power_t=0.709345550534, random_state=None,
       shuffle=True, verbose=0, warm_start=False)
--------------------------------------------------------------------------------
Predictive performance:
            accuracy: 0.972 +- 0.011
           precision: 0.967 +- 0.016
              recall: 0.948 +- 0.021
                  f1: 0.957 +- 0.017
   average_precision: 0.990 +- 0.005
             roc_auc: 0.994 +- 0.003
--------------------------------------------------------------------------------
CPU times: user 5min 41s, sys: 54.4 s, total: 6min 36s
Wall time: 13min 12s

In [15]:
#%%time
estimator_str, vectorizer_str = list_model(rfam_uri( rfam_id ), pre_process)


Instances: 954 ; Features: 1048577 (with an avg of 81 features per instance)
Instances: 1908 ; Features: 1048577 (with an avg of 70 features per instance)
Classifier:
SGDClassifier(alpha=1.69737258883e-05, average=True, class_weight='auto',
       epsilon=0.1, eta0=0.0755825999753, fit_intercept=True,
       l1_ratio=0.15, learning_rate='optimal', loss='hinge', n_iter=96,
       n_jobs=-1, penalty='l2', power_t=0.879701120972, random_state=None,
       shuffle=True, verbose=0, warm_start=False)
--------------------------------------------------------------------------------
Predictive performance:
            accuracy: 0.927 +- 0.004
           precision: 0.864 +- 0.031
              recall: 0.931 +- 0.031
                  f1: 0.895 +- 0.003
   average_precision: 0.924 +- 0.012
             roc_auc: 0.962 +- 0.010
--------------------------------------------------------------------------------

In [16]:
def extract_motives(uri, motif_support_threshold = 2, min_subarray_size=5, max_subarray_size=18, estimator = estimator, vectorizer = vectorizer):
    
    #annotate graphs with node importance 
    graphs = pre_process_rnashapes( uri )
    graphs = vectorizer.annotate( graphs, estimator )

    #use node importance and 'position' attribute to identify max_subarrays of a specific size
    #use compute_max_subarrays to return an iterator over motives 
    from eden.util.iterated_maximum_subarray import compute_max_subarrays

    motives = set()
    for graph in graphs:
        subarrays = compute_max_subarrays(graph=graph, min_subarray_size=min_subarray_size, max_subarray_size=max_subarray_size)
        for subarray in subarrays:
            motives.add(subarray['subarray_string'])

    #count occurrences of motives in original dataset to determine support of each motif
    from collections import defaultdict

    from eden.modifier.fasta import fasta_to_fasta, one_line_modifier
    iterable = fasta_to_fasta( uri, modifier=one_line_modifier, sequence_only=True)

    motif_counter = defaultdict(int)
    for seq in iterable:
        for motif in motives:
            if seq.find(motif) != -1:
                motif_counter[motif] += 1
    
    #select only motives with support higher than a user defined threshold
    selcted_motives = [motif for motif in motif_counter if motif_counter[motif] >= motif_support_threshold]   

    #remove motives that are specializations of smaller motifs
    blacklist=set()
    for motif1 in selcted_motives:
        for motif2 in selcted_motives:
            if motif1 != motif2 and motif1 in motif2:
                blacklist.add(motif2)
    selcted_motives=list(set(selcted_motives).difference(blacklist))

    print 'Selected %d motives'%len(selcted_motives)
    #build a regex expression that matches the occurrence of at least one of the motives
    regex=''
    for m in sorted(selcted_motives): regex += '|' + m
    regex = regex[1:]
    return regex

In [17]:
%%time
regex = extract_motives(rfam_uri( rfam_id ), motif_support_threshold = 10, min_subarray_size=5, max_subarray_size=15, estimator = estimator, vectorizer = vectorizer)
print "regex: ", regex


Selected 115 motives
regex:  AAAGCA|AAAUCCC|AAAUCCU|AAAUCUC|AGAAAU|AGAAGU|AGAGAU|AGAGCUU|AGCCCA|AGCUGGU|AGCUUAA|AGCUUC|AGUAAG|AGUUCGA|AUUAGAU|AUUCCU|CAAAAAU|CAAAUCC|CAAAUCU|CAAGGCA|CAAUUGG|CACCCA|CACCCCU|CAUCCG|CCCCCA|CCCGCU|CCUCCCU|CGAUCCC|CUUCCG|CUUCCU|CUUUUAAA|GAAACA|GAACCA|GAAGCU|GAAUAGU|GAAUCCU|GACGCA|GAGAAUA|GCAAAG|GCACCC|GCAGAU|GCAUUGG|GCCCAA|GCCCCA|GCCCCC|GCCCGA|GCCCUU|GCGGAU|GCUCAAU|GCUUUUAA|GGAAGC|GGAGAG|GGUAAAAU|GGUAAAG|GGUCCU|GGUUAAC|GGUUAAU|GGUUCAAG|GGUUCGA|GUAGAAC|GUAGAGC|GUCCCG|GUCCUU|GUUAAAC|GUUAAAGC|GUUAAAU|GUUCAAU|GUUCGAAC|GUUCGAU|GUUGGUU|GUUUCC|UAAAAGUA|UAAAAUCC|UAAACA|UAAAUCC|UAAAUCU|UAACCCA|UAACCUU|UAACUAA|UAACUCCA|UAAGCA|UAAGCUA|UAAUUGGU|UAGAAU|UAGAGUA|UAGAUA|UAGAUU|UAGCUUA|UAUAGCUC|UAUAGUUUAA|UCCAAAUAAAAG|UCCCCA|UCCCUA|UCCGCA|UCCGCU|UCCUAG|UCCUUA|UCUCUUC|UCUUUCC|UGGGCU|UGGUAGA|UGGUAGCG|UGGUUAA|UGGUUAGAGC|UUAAAAC|UUAAAAU|UUAACCGA|UUAAUUG|UUCAAAU|UUCCCG|UUCGAAU|UUGGUAG|UUGGUUCA|UUUCCG|UUUCUA
CPU times: user 1min 48s, sys: 6.78 s, total: 1min 55s
Wall time: 2min 22s

In [18]:
def motives_discriminative_performance_evaluation(data, regex):
    #compute statistics when using the motives as indicators for the target family on the original data 
    #and on a randomly permuted dataset
    
    from eden.modifier.fasta import fasta_to_fasta, keep_modifier, shuffle_modifier
    iterable = fasta_to_fasta( data , modifier=keep_modifier, regex=regex )
    true_positive_count = sum(1 for x in iterable)
    iterable = fasta_to_fasta( data )
    positive_count = sum(1 for x in iterable)
    iterable = fasta_to_fasta( data , modifier=shuffle_modifier, times=10, order=2)
    it1,it2 = itertools.tee(iterable)
    negative_count = sum(1 for x in it1)
    iterable = fasta_to_fasta( it2 , modifier=keep_modifier, regex=regex )
    false_positive_count = sum(1 for x in iterable)
    false_negative_count =  positive_count - true_positive_count
    true_negative_count = negative_count - false_negative_count

    recall = true_positive_count/float(true_positive_count + false_negative_count)
    precision = true_positive_count/float(true_positive_count + false_positive_count)
    f1 = 2 * precision * recall / (precision + recall)

    print 'pos:%d neg:%d TP:%d FP:%d TN:%d FN:%d' % (positive_count, negative_count, true_positive_count, false_positive_count, true_negative_count, false_negative_count)
    print 'precision:%0.3f recall:%0.3f f1:%0.3f' % (precision, recall, f1)
    print 'dataset reduction: from %d to %d = %0.3f' % (negative_count, false_positive_count, false_positive_count/float(negative_count))

In [19]:
motives_discriminative_performance_evaluation(rfam_uri( rfam_id ), regex)


pos:1908 neg:19080 TP:1854 FP:12338 TN:19026 FN:54
precision:0.131 recall:0.972 f1:0.230
dataset reduction: from 19080 to 12338 = 0.647

In [20]:
def lenght_stats(uri, lower_percentile=25, upper_percentile=75):
    #extract statistics on the average lenght of the sequences in the given Rfam family
    #use this information to determine the size of the window when scanning the genome
    import numpy
    from eden.modifier.fasta import fasta_to_fasta, one_line_modifier

    iterable = fasta_to_fasta( uri, modifier=one_line_modifier, sequence_only=True)
    len_seqs = [len(seq) for seq in iterable]
    lower_quantile = int(numpy.percentile(len_seqs, lower_percentile))
    upper_quantile = int(numpy.percentile(len_seqs, upper_percentile))

    num_seqs = len(len_seqs)

    print 'Rfam family %s has %d sequences' % (rfam_id, num_seqs)
    print 'more than %d%% of the sequences have lenght smaller than: %d ' % (upper_percentile, upper_quantile)
    print 'less then %d%% of the sequences have lenght smaller than: %d ' % (lower_percentile, lower_quantile)
    
    return lower_quantile, upper_quantile

In [21]:
lower_quantile, upper_quantile = lenght_stats(rfam_uri( rfam_id ), lower_percentile=15, upper_percentile=85)


Rfam family RF00005 has 954 sequences
more than 85% of the sequences have lenght smaller than: 77 
less then 15% of the sequences have lenght smaller than: 69 

In [22]:
def out_of_core_list_filter( iterable, estimator=None, vectorizer=None, pre_process=None, threshold=1 ):
    #define a filter that uses the predictive model
    import itertools
    iterable_headers, iterable_seqs = itertools.tee( iterable )
    
    #1. extract oneline fasta
    from eden.modifier.fasta import fasta_to_fasta, one_line_modifier
    header_seqs = fasta_to_fasta( iterable_headers, modifier=one_line_modifier, one_line=True )

    #2. extract graphs
    graphs_list, weights = pre_process( iterable_seqs )
    predictions = vectorizer.predict(graphs_list, estimator=estimator, weights=weights)

    #1+2
    for header_seq, prediction in itertools.izip(header_seqs, predictions):
        if prediction >= float(threshold):
            lines = header_seq.split('\t')
            header = lines[0] + ' SCORE: %.3f '%prediction
            seq = lines[1]
            yield header
            yield seq

In [23]:
def out_of_core_filter( iterable, estimator=None, vectorizer=None, pre_process=None, threshold=1 ):
    #define a filter that uses the predictive model
    import itertools
    iterable_headers, iterable_seqs = itertools.tee( iterable )
    
    #1. extract oneline fasta
    from eden.modifier.fasta import fasta_to_fasta, one_line_modifier
    header_seqs = fasta_to_fasta( iterable_headers, modifier=one_line_modifier, one_line=True )

    #2. extract graphs
    graphs = pre_process( iterable_seqs )
    predictions = vectorizer.predict(graphs, estimator=estimator)

    #1+2
    for header_seq, prediction in itertools.izip(header_seqs, predictions):
        if prediction >= float(threshold):
            lines = header_seq.split('\t')
            header = lines[0] + ' SCORE: %.3f '%prediction
            seq = lines[1]
            yield header
            yield seq

In [24]:
#parameters choice
uri = 'ecoli.fa'
min_lenght = lower_quantile
max_lenght = upper_quantile
threshold_1 = 0.0
threshold_2 = 0.0


#compose all modifiers and filters
from eden.modifier.fasta import fasta_to_fasta, one_line_modifier, replace_modifier, split_regex_modifier, split_modifier, keep_modifier

#replace Ts with Us
iterable = fasta_to_fasta(uri, modifier=replace_modifier, regex='T', replacement='U')

#split out sequences that are not stretches of Ns  
#iterable = fasta_to_fasta(iterable, modifier=split_regex_modifier, regex="([^N]+)" )

#filter out small sequences, i.e. smaller than lower_quantile
#iterable = fasta_to_fasta(iterable, modifier=keep_modifier, regex="(.{%d,})"%min_lenght )

#split large sequences into overlapping windows of size comparable with the Rfam average seq lenght
iterable = fasta_to_fasta( iterable, modifier=split_modifier, window=max_lenght, step=5 )

#keep only the sequences that are matched by the regex
iterable = fasta_to_fasta(iterable, modifier=keep_modifier, regex=regex )

#filter sequences using predictive model on structures
iterable = out_of_core_list_filter( iterable, threshold=threshold_1, pre_process=pre_process, estimator=estimator_str, vectorizer=vectorizer_str )

#filter sequences using predictive model on seq+structure
iterable = out_of_core_filter( iterable, threshold=threshold_2, pre_process=pre_process_rnashapes, estimator=estimator, vectorizer=vectorizer )

#extract one line
iterable = fasta_to_fasta( iterable, modifier=one_line_modifier, one_line=True )

In [25]:
%%time
#run the filter on the genome and save results

from datetime import datetime
import sys

results = []

f = open('result','a')
print >> f, '-'*80
print >> f, rfam_id
print >> f, datetime.now() 

#display and save results    
for line in iterable:
    print line
    sys.stdout.flush()
    print >> f, line
    f.flush()
    results.append(line)


>gi|556503834|ref|NC_000913.3| Escherichia coli str. K-12 substr. MG1655, complete genome START: 000015200 END: 000015277 LEN: 77 SCORE: 1.264 SCORE: 0.029	AAGAAAGCUUCGGUGGCCCAACCGGCGAGCACAACAGCCCGCGCUCAAAGAGCUUCUUUGAUGGUGUGAAGAAGUUU
>gi|556503834|ref|NC_000913.3| Escherichia coli str. K-12 substr. MG1655, complete genome START: 000019190 END: 000019267 LEN: 77 SCORE: 1.264 SCORE: 0.376	GGAAGGCCUGUUCUCCGUGAGAAUUGGCGAAUGUGGCGUGAGUUUCUGGUGUACAAAUCCACCACCAGAAAAACCGU
>gi|556503834|ref|NC_000913.3| Escherichia coli str. K-12 substr. MG1655, complete genome START: 000020940 END: 000021017 LEN: 77 SCORE: 1.742 SCORE: 0.809	AGCAGCUUUGUCGCCAGCUUCGAUAGCUGCGUAUACUUUCUUGAUGAAAGUACGCAUCAUAGAGCGACGGCUUGCGU
>gi|556503834|ref|NC_000913.3| Escherichia coli str. K-12 substr. MG1655, complete genome START: 000052805 END: 000052882 LEN: 77 SCORE: 1.168 SCORE: 0.244	AGAAUGCGCGGUUCGGCAAUACCAAAUUUGGUCCGCAAAUCGUGAUGCAAAAUAGCAAUCACUUCGUGCAAAAGUGC
>gi|556503834|ref|NC_000913.3| Escherichia coli str. K-12 substr. MG1655, complete genome START: 000054190 END: 000054267 LEN: 77 SCORE: 1.657 SCORE: 0.328	CUGGCGUCGUUUUGGUUACCCACCUGCUGCGCCAGGGAUUCGACUUCCUGCGGCAGGAUGGUGAUGCGACGACGCAC
>gi|556503834|ref|NC_000913.3| Escherichia coli str. K-12 substr. MG1655, complete genome START: 000076160 END: 000076237 LEN: 77 SCORE: 1.164 SCORE: 0.241	GCCGGGAACCACAGUUGCUGGUACUGUUCCUCAGCGAAAUAGACCAGAUUAGUUGGAGAAAGCACAUAGCUUACCCA
>gi|556503834|ref|NC_000913.3| Escherichia coli str. K-12 substr. MG1655, complete genome START: 000076170 END: 000076247 LEN: 77 SCORE: 1.216 SCORE: 0.374	ACAGUUGCUGGUACUGUUCCUCAGCGAAAUAGACCAGAUUAGUUGGAGAAAGCACAUAGCUUACCCAGUCCCUGACU
>gi|556503834|ref|NC_000913.3| Escherichia coli str. K-12 substr. MG1655, complete genome START: 000104210 END: 000104287 LEN: 77 SCORE: 1.667 SCORE: 0.190	CUUCGGUAUAUCUGGCGCUUUCUGGUAAGCACAUCAGCUGCCAGAAUGAAAUUGGUAUGGUGCCUAUUUCUGAAGAA
>gi|556503834|ref|NC_000913.3| Escherichia coli str. K-12 substr. MG1655, complete genome START: 000104225 END: 000104302 LEN: 77 SCORE: 1.162 SCORE: 0.022	CGCUUUCUGGUAAGCACAUCAGCUGCCAGAAUGAAAUUGGUAUGGUGCCUAUUUCUGAAGAAGAAGUGACGCAAGAA
>gi|556503834|ref|NC_000913.3| Escherichia coli str. K-12 substr. MG1655, complete genome START: 000122575 END: 000122652 LEN: 77 SCORE: 1.519 SCORE: 0.060	ACAAUGUGGUUCUGCUUCAUCUGCUAAGGUGUAUGGAGCCGAUGUUGGCCCAGAAUGUCCGCCAGAACUUCGAAUUG
>gi|556503834|ref|NC_000913.3| Escherichia coli str. K-12 substr. MG1655, complete genome START: 000127730 END: 000127807 LEN: 77 SCORE: 1.264 SCORE: 0.229	GGUGGUGGUUAGGGUAUUACUUCACAUACCCUAUGGAUUUCUGGGUGCAGCAAGGUAGCAAGCGCCAGAAUCCCCAG
>gi|556503834|ref|NC_000913.3| Escherichia coli str. K-12 substr. MG1655, complete genome START: 000134875 END: 000134952 LEN: 77 SCORE: 1.539 SCORE: 0.409	AAGUCUUCCGGUUUGGUGUGGAACAUGUAGUGCUUAAGGUCGAACUCUUUAAGCAACAUCUUGGUAUGGAAGAUAUU
>gi|556503834|ref|NC_000913.3| Escherichia coli str. K-12 substr. MG1655, complete genome START: 000134895 END: 000134972 LEN: 77 SCORE: 1.209 SCORE: 0.064	GAACAUGUAGUGCUUAAGGUCGAACUCUUUAAGCAACAUCUUGGUAUGGAAGAUAUUUUCCUGAUAGACGUUCACAU
>gi|556503834|ref|NC_000913.3| Escherichia coli str. K-12 substr. MG1655, complete genome START: 000139505 END: 000139582 LEN: 77 SCORE: 1.620 SCORE: 0.055	GGGUGAAAAUGCCUUCAUAGCGCAUCUGGUGGAACAUCACGCGGCACACCAGUUGGUCAAACAUGGUGGCUCCCCAC
>gi|556503834|ref|NC_000913.3| Escherichia coli str. K-12 substr. MG1655, complete genome START: 000181270 END: 000181347 LEN: 77 SCORE: 1.667 SCORE: 0.473	AACCACGUUGUUGAUAACGCGACGGUCAUUAAAGUUCAACUGAGCGAUGGCCGUAAGUUCGACGCGAAGAUGGUUGG
>gi|556503834|ref|NC_000913.3| Escherichia coli str. K-12 substr. MG1655, complete genome START: 000190020 END: 000190097 LEN: 77 SCORE: 1.422 SCORE: 0.238	UUCAACGAAGCUCUGGCUGAACUGAACAAGAUUGCUUCUCGCAAAGGUAAAAUCCUUUUCGUUGGUACUAAACGCGC
>gi|556503834|ref|NC_000913.3| Escherichia coli str. K-12 substr. MG1655, complete genome START: 000190035 END: 000190112 LEN: 77 SCORE: 1.422 SCORE: 0.071	GCUGAACUGAACAAGAUUGCUUCUCGCAAAGGUAAAAUCCUUUUCGUUGGUACUAAACGCGCUGCAAGCGAAGCGGU
>gi|556503834|ref|NC_000913.3| Escherichia coli str. K-12 substr. MG1655, complete genome START: 000190555 END: 000190632 LEN: 77 SCORE: 0.954 SCORE: 0.663	AUCUGGCUUCCCAGGCGGAAGAAAGCUUCGUAGAAGCUGAGUAAUAAGGCUUGAUAACUCCCCCAAAAUAGUUCGAG
>gi|556503834|ref|NC_000913.3| Escherichia coli str. K-12 substr. MG1655, complete genome START: 000196930 END: 000197007 LEN: 77 SCORE: 0.298 SCORE: 0.136	CGGUGGUUGGCGAAAUAGCAGCCAAUUCGAUAGCUGCGGAAGCACAAAUUGCACCAGGUACGGAACUAAAAGCCGUA
>gi|556503834|ref|NC_000913.3| Escherichia coli str. K-12 substr. MG1655, complete genome START: 000196940 END: 000197017 LEN: 77 SCORE: 1.226 SCORE: 0.312	CGAAAUAGCAGCCAAUUCGAUAGCUGCGGAAGCACAAAUUGCACCAGGUACGGAACUAAAAGCCGUAGAUGGUAUCG
>gi|556503834|ref|NC_000913.3| Escherichia coli str. K-12 substr. MG1655, complete genome START: 000218360 END: 000218437 LEN: 77 SCORE: 1.539 SCORE: 0.299	ACUUGGUCUGGAUCUGAUAGAAGUUCAGCGGCAGCUGUUUGUAAGAGCUAAGCUCGUUACGAAUCAGGUCAGUGAUA
>gi|556503834|ref|NC_000913.3| Escherichia coli str. K-12 substr. MG1655, complete genome START: 000218375 END: 000218452 LEN: 77 SCORE: 1.519 SCORE: 0.236	GAUAGAAGUUCAGCGGCAGCUGUUUGUAAGAGCUAAGCUCGUUACGAAUCAGGUCAGUGAUAACUUCUUCAUGAGUU
>gi|556503834|ref|NC_000913.3| Escherichia coli str. K-12 substr. MG1655, complete genome START: 000225375 END: 000225452 LEN: 77 SCORE: 1.617 SCORE: 0.729	UCUACAGGCUUGUAGCUCAGGUGGUUAGAGCGCACCCCUGAUAAGGGUGAGGUCGGUGGUUCAAGUCCACUCAGGCC
>gi|556503834|ref|NC_000913.3| Escherichia coli str. K-12 substr. MG1655, complete genome START: 000225380 END: 000225457 LEN: 77 SCORE: 1.693 SCORE: 1.818	AGGCUUGUAGCUCAGGUGGUUAGAGCGCACCCCUGAUAAGGGUGAGGUCGGUGGUUCAAGUCCACUCAGGCCUACCA
>gi|556503834|ref|NC_000913.3| Escherichia coli str. K-12 substr. MG1655, complete genome START: 000225405 END: 000225482 LEN: 77 SCORE: 1.270 SCORE: 0.379	CGCACCCCUGAUAAGGGUGAGGUCGGUGGUUCAAGUCCACUCAGGCCUACCAAAUUUGCACGGCAAAUUUGAAGAGG
>gi|556503834|ref|NC_000913.3| Escherichia coli str. K-12 substr. MG1655, complete genome START: 000225495 END: 000225572 LEN: 77 SCORE: 1.807 SCORE: 0.357	UUAUGGGGCUAUAGCUCAGCUGGGAGAGCGCCUGCUUUGCACGCAGGAGGUCUGCGGUUCGAUCCCGCAUAGCUCCA
>gi|556503834|ref|NC_000913.3| Escherichia coli str. K-12 substr. MG1655, complete genome START: 000228925 END: 000229002 LEN: 77 SCORE: 2.311 SCORE: 0.514	GUGGAGCGGUAGUUCAGUCGGUUAGAAUACCUGCCUGUCACGCAGGGGGUCGCGGGUUCGAGUCCCGUCCGUUCCGC
>gi|556503834|ref|NC_000913.3| Escherichia coli str. K-12 substr. MG1655, complete genome START: 000228930 END: 000229007 LEN: 77 SCORE: 1.264 SCORE: 0.807	GCGGUAGUUCAGUCGGUUAGAAUACCUGCCUGUCACGCAGGGGGUCGCGGGUUCGAGUCCCGUCCGUUCCGCCACUU
>gi|556503834|ref|NC_000913.3| Escherichia coli str. K-12 substr. MG1655, complete genome START: 000236305 END: 000236382 LEN: 77 SCORE: 1.539 SCORE: 0.002	CGAAGUAGCCGAUGAGUUCAUGGACUAUAUUCGCGGCGCGGAGUUGGUGAUCCAUAACGCAGCGUUCGAUAUCGGCU
>gi|556503834|ref|NC_000913.3| Escherichia coli str. K-12 substr. MG1655, complete genome START: 000236930 END: 000237007 LEN: 77 SCORE: 2.369 SCORE: 0.466	GGAGCGGUAGUUCAGUCGGUUAGAAUACCUGCCUGUCACGCAGGGGGUCGCGGGUUCGAGUCCCGUCCGUUCCGCCA
>gi|556503834|ref|NC_000913.3| Escherichia coli str. K-12 substr. MG1655, complete genome START: 000236935 END: 000237012 LEN: 77 SCORE: 1.183 SCORE: 0.219	GGUAGUUCAGUCGGUUAGAAUACCUGCCUGUCACGCAGGGGGUCGCGGGUUCGAGUCCCGUCCGUUCCGCCACUAUU
>gi|556503834|ref|NC_000913.3| Escherichia coli str. K-12 substr. MG1655, complete genome START: 000237525 END: 000237602 LEN: 77 SCORE: 1.566 SCORE: 0.004	GGAUAUUCAAGCUGCGCUUGAUGGAGAUGAGUUCCGCGUUCCCCUUAAUUCUCCUGUAAUACUUGUUCAAUCCGGCA
>gi|556503834|ref|NC_000913.3| Escherichia coli str. K-12 substr. MG1655, complete genome START: 000262825 END: 000262902 LEN: 77 SCORE: 0.335 SCORE: 0.042	AACCUUGUCCACCGUGAUUCACGUUCGUGAACAUGUCCUUUCAGGGCCGAUAUAGCUCAGUUGGUAGAGCAGCGCAU
>gi|556503834|ref|NC_000913.3| Escherichia coli str. K-12 substr. MG1655, complete genome START: 000262850 END: 000262927 LEN: 77 SCORE: 1.309 SCORE: 0.200	CGUGAACAUGUCCUUUCAGGGCCGAUAUAGCUCAGUUGGUAGAGCAGCGCAUUCGUAAUGCGAAGGUCGUAGGUUCG
>gi|556503834|ref|NC_000913.3| Escherichia coli str. K-12 substr. MG1655, complete genome START: 000262860 END: 000262937 LEN: 77 SCORE: 0.610 SCORE: 0.038	UCCUUUCAGGGCCGAUAUAGCUCAGUUGGUAGAGCAGCGCAUUCGUAAUGCGAAGGUCGUAGGUUCGACUCCUAUUA
>gi|556503834|ref|NC_000913.3| Escherichia coli str. K-12 substr. MG1655, complete genome START: 000262865 END: 000262942 LEN: 77 SCORE: 1.521 SCORE: 0.322	UCAGGGCCGAUAUAGCUCAGUUGGUAGAGCAGCGCAUUCGUAAUGCGAAGGUCGUAGGUUCGACUCCUAUUAUCGGC
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-25-dc4f7ca1eb28> in <module>()
----> 1 get_ipython().run_cell_magic(u'time', u'', u"#run the filter on the genome and save results\n\nfrom datetime import datetime\nimport sys\n\nresults = []\n\nf = open('result','a')\nprint >> f, '-'*80\nprint >> f, rfam_id\nprint >> f, datetime.now() \n\n#display and save results    \nfor line in iterable:\n    print line\n    sys.stdout.flush()\n    print >> f, line\n    f.flush()\n    results.append(line)")

/Library/Python/2.7/site-packages/IPython/core/interactiveshell.pyc in run_cell_magic(self, magic_name, line, cell)
   2160             magic_arg_s = self.var_expand(line, stack_depth)
   2161             with self.builtin_trap:
-> 2162                 result = fn(magic_arg_s, cell)
   2163             return result
   2164 

/Library/Python/2.7/site-packages/IPython/core/magics/execution.pyc in time(self, line, cell, local_ns)

/Library/Python/2.7/site-packages/IPython/core/magic.pyc in <lambda>(f, *a, **k)
    191     # but it's overkill for just that one bit of state.
    192     def magic_deco(arg):
--> 193         call = lambda f, *a, **k: f(*a, **k)
    194 
    195         if callable(arg):

/Library/Python/2.7/site-packages/IPython/core/magics/execution.pyc in time(self, line, cell, local_ns)
   1127         else:
   1128             st = clock2()
-> 1129             exec(code, glob, local_ns)
   1130             end = clock2()
   1131             out = None

<timed exec> in <module>()

/Users/costa/Desktop/BTSync/Projects/EDeN/EDeN/eden/modifier/fasta.pyc in fasta_to_fasta(input, modifier, **options)
     25     for line in iterable:
     26         header = line
---> 27         seq = iterable.next()
     28         if normalize:
     29             seq.upper()

/Users/costa/Desktop/BTSync/Projects/EDeN/EDeN/eden/modifier/fasta.pyc in _fasta_to_fasta(input)
     36 def _fasta_to_fasta(input):
     37     seq = ""
---> 38     for line in util.read(input):
     39         if line:
     40             if line[0] == '>':

<ipython-input-23-052fa88c3448> in out_of_core_filter(iterable, estimator, vectorizer, pre_process, threshold)
     13 
     14     #1+2
---> 15     for header_seq, prediction in itertools.izip(header_seqs, predictions):
     16         if prediction >= float(threshold):
     17             lines = header_seq.split('\t')

/Users/costa/Desktop/BTSync/Projects/EDeN/EDeN/eden/modifier/fasta.pyc in fasta_to_fasta(input, modifier, **options)
     25     for line in iterable:
     26         header = line
---> 27         seq = iterable.next()
     28         if normalize:
     29             seq.upper()

/Users/costa/Desktop/BTSync/Projects/EDeN/EDeN/eden/modifier/fasta.pyc in _fasta_to_fasta(input)
     36 def _fasta_to_fasta(input):
     37     seq = ""
---> 38     for line in util.read(input):
     39         if line:
     40             if line[0] == '>':

<ipython-input-22-3917d0504051> in out_of_core_list_filter(iterable, estimator, vectorizer, pre_process, threshold)
     13 
     14     #1+2
---> 15     for header_seq, prediction in itertools.izip(header_seqs, predictions):
     16         if prediction >= float(threshold):
     17             lines = header_seq.split('\t')

/Users/costa/Desktop/BTSync/Projects/EDeN/EDeN/eden/graph.pyc in predict(self, G_iterators_list, estimator, weights)
    956         try:
    957             while True:
--> 958                 graphs = [G_iterator.next() for G_iterator in G_iterators_list]
    959                 yield self._predict(graphs, weights)
    960         except StopIteration:

/Users/costa/Desktop/BTSync/Projects/EDeN/EDeN/eden/converter/rna/rnashapes_struct.pyc in rnashapes_struct_to_eden(iterable, **options)
    104     for header, seq in iterable:
    105         try:
--> 106             for G in string_to_networkx(header, seq, **options):
    107                 yield G
    108         except Exception as e:

/Users/costa/Desktop/BTSync/Projects/EDeN/EDeN/eden/converter/rna/rnashapes_struct.pyc in string_to_networkx(header, sequence, **options)
     61     dotbracket = options.get('dotbracket', True)
     62     split_components = options.get('split_components', False)
---> 63     seq_info, seq_struct_list = rnashapes_wrapper(sequence, shape_type=shape_type, energy_range=energy_range, max_num=max_num)
     64     if split_components:
     65         for shape_str, energy_str, dotbracket_str in seq_struct_list:

/Users/costa/Desktop/BTSync/Projects/EDeN/EDeN/eden/converter/rna/rnashapes_struct.pyc in rnashapes_wrapper(sequence, shape_type, energy_range, max_num)
     12     # command line
     13     cmd = 'echo "%s" | RNAshapes -t %d -c %d -# %d' % (sequence, shape_type, energy_range, max_num)
---> 14     out = sp.check_output(cmd, shell=True)
     15     text = out.strip().split('\n')
     16     seq_info = text[0]

/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/subprocess.pyc in check_output(*popenargs, **kwargs)
    565         raise ValueError('stdout argument not allowed, it will be overridden.')
    566     process = Popen(stdout=PIPE, *popenargs, **kwargs)
--> 567     output, unused_err = process.communicate()
    568     retcode = process.poll()
    569     if retcode:

/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/subprocess.pyc in communicate(self, input)
    788                 self.stdin.close()
    789             elif self.stdout:
--> 790                 stdout = _eintr_retry_call(self.stdout.read)
    791                 self.stdout.close()
    792             elif self.stderr:

/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/subprocess.pyc in _eintr_retry_call(func, *args)
    474     while True:
    475         try:
--> 476             return func(*args)
    477         except (OSError, IOError) as e:
    478             if e.errno == errno.EINTR:

KeyboardInterrupt: 

In [ ]:
from ipy_table import * 

matr=[('ID','Pos','Conf1','Conf2','Seq')]

th0=1.8
th1=th2=0
counter=1
for line in results:
    lines=line.split()
    seq,sc1,sc2,pos = lines[-1], float(lines[-2]),float(lines[-4]),lines[-10]
    if sc1 + sc2 > th0 and sc2 > th2 and sc1 > th1:
        matr += [(counter, pos, sc1,sc2, seq)]
        counter += 1

make_table(matr)
apply_theme('basic')
set_global_style(float_format = '%0.3f')

In [ ]:
#reference tRNA
fname='trna_ecoli'
def parse(fname):
    with open(fname) as f:
        for line in f:
            text = ' '.join([line.strip(), f.next().strip(), f.next().strip(), f.next().strip()])
            lines=text.split()
            yield (int(lines[6].replace(",", "")),lines[0], lines[5])

from ipy_table import * 

mat=[('ID','Name','Pos')]

counter=1            
for t in sorted(parse(fname)):
    pos,name,strand = t
    name=name.replace("T","U")
    if '+' in strand:
        mat += [(counter, name, pos)]
        counter += 1
        
make_table(mat)
apply_theme('basic')
set_global_style(float_format = '%0.3f')

In [ ]:
#validate results
tolerance = 50
offset = 2000
name_set = set()
for counter_r, pos_r, sc1_r,sc2_r, seq_r in matr[1:]:
    for counter, name, pos in mat[1:]:
        if int(pos_r) - offset > int(pos) - tolerance and int(pos_r) - offset < int(pos) + tolerance: 
            print name,pos, pos_r, counter_r
            name_set.add(name)
print len(name_set)

In [ ]:
def fasta_results(mat):
    for counter, pos, sc1,sc2, seq in mat:
        yield '>Ecoli:%s'%pos
        yield seq

In [ ]:
def pre_process(data):
    from eden.converter.rna.rnashapes import rnashapes_to_eden
    graphs = rnashapes_to_eden(data, shape_type=5, energy_range=35, max_num=3)
    return graphs

In [ ]:
%matplotlib inline
#parameters for visualization
from eden.util.display import draw_graph
size = 14
node_size = 200
font_size = 9
opts={'size':size, 'node_border':False, 'node_size':node_size, 'font_size':font_size, 'vertex_alpha':0.6}

for graph in itertools.islice(pre_process(fasta_results(matr)),29,31):
    draw_graph(graph, **opts)