In [1]:
# Notebook setup: plotting defaults, imports, database handle and ontology.
# Render matplotlib figures inline in the notebook output.
%matplotlib inline  
import pylab
# Default figure size (width, height) for every plot produced below.
pylab.rcParams['figure.figsize'] = (10.0, 8.0)
# NOTE(review): `os` and `np` are not referenced by any cell visible here.
import os
import numpy as np

import matplotlib.pyplot as plt

# Wildcard import: the baseline helpers called by the later cells
# (e.g. `evaluate_performance`) presumably come from here — confirm.
from src.python.baselines import *

from tempfile import gettempdir
# System temporary directory; not referenced by any cell visible here.
tmp_dir = gettempdir()

# Second wildcard import. Keep it after the `baselines` one: with two star
# imports, a name exported by both modules resolves to the later import.
# NOTE(review): `datetime`, `get_ontology`, `load_training_and_validation` and
# `get_random_training_and_validation_streams` are never imported by name in
# this notebook, so they presumably arrive through one of the two star
# imports — confirm which module exports each.
from src.python.preprocess2 import *

from pymongo import MongoClient

# Ontology aspect code passed to every loader/evaluator call below.
asp = 'F'  # default: Molecular Function

# Connection to a MongoDB server on the local machine (default port).
client = MongoClient('mongodb://localhost:27017/')

# Database handle handed to the data-loading and evaluation helpers.
db = client['prot2vec']

# Ontology object for the selected aspect; not used by the cells visible here.
onto = get_ontology(asp)


/home/yotamfr/development/prot2vec/virtualenv/lib/python3.6/site-packages/Bio/SearchIO/__init__.py:211: BiopythonExperimentalWarning: Bio.SearchIO is an experimental submodule which may undergo significant changes prior to its future official release.
  BiopythonExperimentalWarning)

In [2]:
# Evaluate the "naive" and "blast" baselines on the 2014 Jan-Sep data set.
# t0 and t1 are the two date boundaries handed to the loader; how it divides
# records into training and validation around them is defined in
# `load_training_and_validation`.
t0 = datetime(2014, 1, 1, 0, 0)
t1 = datetime(2014, 9, 1, 0, 0)
lim = None  # forwarded as the loader's last argument (presumably a record cap — confirm)

seqs_train, annots_train, seqs_valid, annots_valid = load_training_and_validation(db, t0, t1, asp, lim)

# A bare expression in the middle of a cell is evaluated and then discarded
# (only a cell's final expression is echoed), so print the sizes explicitly.
print("train sequences: %d, validation sequences: %d" % (len(seqs_train), len(seqs_valid)))

data = (seqs_train, annots_train, seqs_valid, annots_valid)
# NOTE(review): the saved output of this cell ends in an AssertionError raised
# by `assert child.returncode == 0` inside `_blast` (baselines.py) partway
# through the BLAST pass; the cause is in the BLAST invocation, not in this cell.
pred3, perf3 = evaluate_performance(db, ["naive", "blast"], asp, train_and_validation_data=data, filename="2014-jan-sep")


100%
Finished loading 81630 mappings!
100%
Finished loading 44839 sequences!
100%
Finished loading 7909 mappings!
100%
Finished loading 5528 sequences!
49%%
targets processed: 100%|██████████| 2941/2941 [00:00<00:00, 102350.18it/s]
100%
targets processed:  74%|███████▍  | 2178/2941 [53:33<31:54,  2.51s/it]  
---------------------------------------------------------------------------
AssertionError                            Traceback (most recent call last)
<ipython-input-2-0d130bb40eed> in <module>()
      8 
      9 data = (seqs_train, annots_train, seqs_valid, annots_valid)
---> 10 pred3, perf3 = evaluate_performance(db, ["naive", "blast"], asp, train_and_validation_data=data, filename="2014-jan-sep")

~/development/prot2vec/src/python/baselines.py in evaluate_performance(db, methods, asp, train_and_validation_data, filename, plot)
    321     perf = {}
    322     for meth in methods:
--> 323         pred = predict(seqs_train, annots_train, seqs_valid, meth, filename)
    324         perf[meth] = performance(pred, annots_valid)
    325     if plot == 1:

~/development/prot2vec/src/python/baselines.py in predict(reference_seqs, reference_annots, target_seqs, method, basename)
    251             return np.load(pred_path).item()
    252         _prepare_blast(reference_seqs)
--> 253         predictions = _predict(reference_annots, target_seqs, _blast)
    254         np.save(pred_path, predictions)
    255         return predictions

~/development/prot2vec/src/python/baselines.py in _predict(reference_annots, target_seqs, func_predict, binary_mode)
    175         predictions = {}
    176         for _, (seqid, seq) in enumerate(target_seqs.items()):
--> 177             predictions[seqid] = func_predict(SeqRecord(Seq(seq), seqid), reference_annots)
    178             if pbar: pbar.update(1)
    179     if pbar: pbar.close()

~/development/prot2vec/src/python/baselines.py in _blast(target_fasta, reference, topn, choose_max_prob)
    127 
    128     handle, _ = child.communicate()
--> 129     assert child.returncode == 0
    130 
    131     blast_qresult = SearchIO.read(output_pth, 'blast-xml')

AssertionError: 

In [ ]:
# Evaluate the "naive" and "blast" baselines on data from 2017 up to now.
# t0 and t1 are the two date boundaries handed to the loader; how it divides
# records into training and validation around them is defined in
# `load_training_and_validation`.
t0 = datetime(2017, 1, 1, 0, 0)
t1 = datetime.utcnow()  # naive UTC timestamp taken when the cell runs, so results vary between runs
lim = None  # forwarded as the loader's last argument (presumably a record cap — confirm)

seqs_train, annots_train, seqs_valid, annots_valid = load_training_and_validation(db, t0, t1, asp, lim)

# A bare expression in the middle of a cell is evaluated and then discarded
# (only a cell's final expression is echoed), so print the sizes explicitly.
print("train sequences: %d, validation sequences: %d" % (len(seqs_train), len(seqs_valid)))

data = (seqs_train, annots_train, seqs_valid, annots_valid)
pred1, perf1 = evaluate_performance(db, ["naive", "blast"], asp, train_and_validation_data=data, filename="2017-now")

In [ ]:
# Evaluate the "naive" and "blast" baselines on a random split (ratio=0.2).
trn_stream, tst_stream = get_random_training_and_validation_streams(db, asp, ratio=0.2)

# Materialise each stream into its (sequences, annotations) pair.
seqs_train, annots_train = trn_stream.to_dictionaries()
seqs_valid, annots_valid = tst_stream.to_dictionaries()

# A bare expression in the middle of a cell is evaluated and then discarded
# (only a cell's final expression is echoed), so print the sizes explicitly.
print("train sequences: %d, validation sequences: %d" % (len(seqs_train), len(seqs_valid)))

data = (seqs_train, annots_train, seqs_valid, annots_valid)
pred2, perf2 = evaluate_performance(db, ["naive", "blast"], asp, train_and_validation_data=data, filename="random_0.2")

In [ ]: