In [2]:
from src.python.deepseq0 import *


/home/yotamfr/development/prot2vec/virtualenv/lib/python3.6/site-packages/Bio/SearchIO/__init__.py:211: BiopythonExperimentalWarning: Bio.SearchIO is an experimental submodule which may undergo significant changes prior to its future official release.
  BiopythonExperimentalWarning)
/home/yotamfr/development/prot2vec/virtualenv/lib/python3.6/importlib/_bootstrap.py:219: RuntimeWarning: compiletime version 3.5 of module 'tensorflow.python.framework.fast_tensor_util' does not match runtime version 3.6
  return f(*args, **kwds)
Using TensorFlow backend.

In [3]:
from keras.models import model_from_json

from src.python import baselines

ASPECT = 'F'  # Molecular Function
baselines.init_GO(asp=ASPECT)

client = MongoClient("mongodb://localhost:27017/")

db = client['prot2vec']

print("Loading Ontology...")
onto = get_ontology(ASPECT)

model_str_deeperseq = "deeperseq-19-0.00224-0.57"
model_str_inception = "inception-14-0.00269-0.51"

# classes = [c for c in np.load('../../checkpoints/%s.npy' % model_str)]
classes = onto.classes
classes.remove(onto.root)
assert onto.root not in classes

with open('../../checkpoints/%s.json' % model_str_deeperseq, 'r') as f:
    deeperseq = model_from_json(f.read())
    
with open('../../checkpoints/%s.json' % model_str_inception, 'r') as f:
    inception = model_from_json(f.read())


Loading Ontology...
WARNING:tensorflow:From /home/yotamfr/development/prot2vec/virtualenv/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:1208: calling reduce_max (from tensorflow.python.ops.math_ops) with keep_dims is deprecated and will be removed in a future version.
Instructions for updating:
keep_dims is deprecated, use keepdims instead
WARNING:tensorflow:From /home/yotamfr/development/prot2vec/virtualenv/lib/python3.6/site-packages/tensorflow/python/ops/nn_impl.py:666: calling reduce_mean (from tensorflow.python.ops.math_ops) with keep_dims is deprecated and will be removed in a future version.
Instructions for updating:
keep_dims is deprecated, use keepdims instead
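
Both architectures are restored from their JSON descriptions above; the matching HDF5 weights are loaded in the next cell. A quick optional sanity check on the restored graphs (a sketch assuming each model has a single output head with one unit per GO class):

In [ ]:
# Sketch: print parameter counts and output shapes of the restored models.
# Assumes single-output models whose last dimension equals len(classes).
for name, model in [("deeperseq", deeperseq), ("inception", inception)]:
    print("%s: %d params, output shape %s" % (name, model.count_params(), model.output_shape))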

In [3]:
inception.load_weights("../../checkpoints/%s.hdf5" % model_str_inception)
print("Loaded model from disk")

deeperseq.load_weights("../../checkpoints/%s.hdf5" % model_str_deeperseq)
print("Loaded model from disk")


Loaded model from disk
Loaded model from disk

In [4]:
lim = None
trn_stream, tst_stream = get_training_and_validation_streams(db, onto, classes, limit=lim)


100%
Finished loading 81702 mappings!
100%
Finished loading 9049 mappings!

In [5]:
y_true, y_deeperseq = predict(deeperseq, batch_generator(tst_stream), len(tst_stream), classes)


Predicting...:  97%|█████████▋| 6355/6525 [12:05<00:18,  9.11it/s]

In [6]:
perf_deeperseq = baselines.performance(y_deeperseq, y_true, classes)
baselines.plot_precision_recall({"deeperseq": perf_deeperseq})



In [1]:
import pandas as pd
prs, rcs, f1s = perf_deeperseq
pd.DataFrame({"precision": prs, "recall": rcs, "F_max": f1s}).head(20)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-1-d5c637e78141> in <module>()
      1 import pandas as pd
----> 2 prs, rcs, f1s = perf_deeperseq
      3 pd.DataFrame({"precision": prs, "recall": rcs, "F_max": f1s}).head(20)

NameError: name 'perf_deeperseq' is not defined

In [8]:
lim = None
trn_stream, tst_stream = get_training_and_validation_streams(db, onto, classes, limit=lim)


100%
Finished loading 81702 mappings!
100%
Finished loading 9049 mappings!

In [9]:
y_true, y_inception = predict(inception, batch_generator(tst_stream), len(tst_stream), classes)


Predicting...:  97%|█████████▋| 6355/6525 [02:38<00:04, 41.78it/s]

In [10]:
perf_inception = baselines.performance(y_inception, y_true, classes)
baselines.plot_precision_recall({"inception": perf_inception})



In [11]:
import pandas as pd
prs, rcs, f1s = perf_inception
pd.DataFrame({"precision": prs, "recall": rcs, "F_max": f1s}).head(10)


Out[11]:
      F_max  precision    recall
0  0.189652   0.110897  0.654331
1  0.281893   0.185830  0.583557
2  0.345911   0.253336  0.545107
3  0.368412   0.289332  0.506982
4  0.393576   0.328197  0.491483
5  0.403175   0.354504  0.467339
6  0.410842   0.381756  0.444724
7  0.414303   0.400833  0.428710
8  0.411240   0.417713  0.404964
9  0.412047   0.429774  0.395725
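
The "F_max" column here is presumably the harmonic mean of precision and recall at each decision threshold; the single reported F_max is its maximum over the threshold sweep (the CAFA protein-centric score). A minimal sketch of that reduction, assuming baselines.performance returns per-threshold (precision, recall, F1) arrays as the unpacking above implies:

In [ ]:
import numpy as np

def f_max(precisions, recalls):
    # Harmonic mean of precision and recall at each threshold; F_max is its peak.
    prs_, rcs_ = np.asarray(precisions), np.asarray(recalls)
    f1 = 2 * prs_ * rcs_ / np.maximum(prs_ + rcs_, 1e-12)
    return np.nanmax(f1)

f_max(prs, rcs)  # prs, rcs were unpacked from perf_inception above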

In [9]:
def unpad_seq(padded_seq):
    # Map index-encoded residues back to amino-acid letters, dropping PAD tokens.
    return ''.join([AA.index2aa[ix] for ix in padded_seq if ix != PAD])

_, tst_stream = get_training_and_validation_streams(db, onto, classes, limit=lim)
seqs_valid = {k: unpad_seq(seq) for k, seq, _ in tst_stream}
annots_valid = tst_stream._seq2go

trn_stream, _ = get_training_and_validation_streams(db, onto, classes, limit=None)
seqs_train = {k: unpad_seq(seq) for k, seq, _ in trn_stream}
annots_train = trn_stream._seq2go


100%
Finished loading 81702 mappings!
100%
Finished loading 9049 mappings!
100%
Finished loading 81702 mappings!
100%
Finished loading 9049 mappings!

In [13]:
y_naive = baselines.predict(seqs_train, annots_train, seqs_valid, "naive", load_file=0)
perf_naive = baselines.performance(y_naive, annots_valid)
baselines.plot_precision_recall({"naive": perf_naive})


targets processed: 100%|██████████| 6355/6355 [00:00<00:00, 106039.81it/s]

In [18]:
import pandas as pd
prs, rcs, f1s = perf_naive
pd.DataFrame({"precision": prs, "recall": rcs, "F_max": f1s}).head(20)


Out[18]:
       F_max  precision    recall
0   0.225797   0.253973  0.203248
1   0.225797   0.253973  0.203248
2   0.225797   0.253973  0.203248
3   0.225797   0.253973  0.203248
4   0.225797   0.253973  0.203248
5   0.225797   0.253973  0.203248
6        NaN   0.000000  0.000000
7        NaN   0.000000  0.000000
8        NaN   0.000000  0.000000
9        NaN   0.000000  0.000000
10       NaN   0.000000  0.000000
11       NaN   0.000000  0.000000
12       NaN   0.000000  0.000000
13       NaN   0.000000  0.000000
14       NaN   0.000000  0.000000
15       NaN   0.000000  0.000000
16       NaN   0.000000  0.000000
17       NaN   0.000000  0.000000
18       NaN   0.000000  0.000000
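
The NaN rows likely correspond to score thresholds above the largest term frequency, where nothing is predicted and the harmonic mean is undefined. The naive baseline is presumably the usual CAFA one: every target receives every GO term, scored by that term's relative frequency in the training annotations. A hypothetical sketch of that idea (term_freq and the dict layout are illustrative, not the actual baselines.predict implementation):

In [ ]:
from collections import Counter

# Hypothetical naive predictor: score each GO term by its training frequency.
term_counts = Counter(go for terms in annots_train.values() for go in terms)
n_train = len(annots_train)
term_freq = {go: c / n_train for go, c in term_counts.items()}

# Every validation target gets the same frequency-based score vector.
y_naive_sketch = {seq_id: dict(term_freq) for seq_id in seqs_valid}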

In [11]:
y_blast = baselines.predict(seqs_train, annots_train, seqs_valid, "blast", load_file=1)
perf_blast = baselines.performance(y_blast, annots_valid)
baselines.plot_precision_recall({"blast": perf_blast})



In [16]:
%matplotlib inline  
import pylab
pylab.rcParams['figure.figsize'] = (10.0, 8.0)

perf = {"naive": perf_naive, "blast": perf_blast, "deeperseq": perf_deeperseq, "inception": perf_inception}
baselines.plot_precision_recall(perf)
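
If a standalone overlay of the four methods is ever needed outside baselines.plot_precision_recall, the same perf dictionary can be drawn directly with matplotlib; a minimal sketch under the same (precision, recall, per-threshold F1) assumption as before:

In [ ]:
import numpy as np
import matplotlib.pyplot as plt

plt.figure(figsize=(10.0, 8.0))
for name, (p, r, f1) in perf.items():
    # One precision-recall curve per method, labelled with its F_max.
    plt.plot(r, p, label="%s (F_max=%.3f)" % (name, np.nanmax(np.asarray(f1))))
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.legend(loc="upper right")
plt.show()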



In [ ]:
from functools import reduce  # reduce is not a builtin in Python 3

s1 = reduce(lambda x, y: set(x) | set(y), trn_stream._seq2go.values(), set())
s2 = reduce(lambda x, y: set(x) | set(y), tst_stream._seq2go.values(), set())

len(s1), len(s2), len(s1 | s2), len(s1 & s2), len(s1 - s2), len(s2 - s1)
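
This last cell measures how much the GO label sets of the two streams overlap: s1 and s2 are the unions of all terms annotated in the training and validation splits, and the tuple reports their sizes, union, intersection, and the terms unique to each side. A toy illustration of the same set algebra (the GO IDs are made up):

In [ ]:
s1 = {"GO:0003674", "GO:0003824", "GO:0016787"}
s2 = {"GO:0003824", "GO:0005215"}
len(s1), len(s2), len(s1 | s2), len(s1 & s2), len(s1 - s2), len(s2 - s1)
# -> (3, 2, 4, 1, 2, 1)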

In [ ]: