In [1]:
import os
os.getcwd()


Out[1]:
'/home/yotamfr/development/prot2vec/src/python'

In [2]:
from src.python.digo import *

asp = 'F'
year = 2014


/home/yotamfr/development/prot2vec/virtualenv/lib/python3.6/site-packages/Bio/SearchIO/__init__.py:211: BiopythonExperimentalWarning: Bio.SearchIO is an experimental submodule which may undergo significant changes prior to its future official release.
  BiopythonExperimentalWarning)

In [3]:
%matplotlib inline
%load_ext autoreload   
%autoreload 2

import pylab
pylab.rcParams['figure.figsize'] = (10.0, 8.0)

def plot_histograms(pos, neg, val_range, pos_label='same-leaf', neg_label='cousins'):
    plt.hist(pos, bins=100, alpha=0.8, label=pos_label, normed=1, cumulative=True, histtype='step', range=val_range)
    plt.hist(neg, bins=100, alpha=0.8, label=neg_label, normed=1, cumulative=True, histtype='step', range=val_range)
    plt.legend(loc='upper right')
    plt.title("Distribution of  bitscore")
    plt.ylabel("Frequency")
    plt.grid(True)
    plt.show()

In [4]:
pos = load_object("../../Data/digo_%s_ks_pos_data_%d" % (GoAspect(asp), year))
neg = load_object("../../Data/digo_%s_ks_neg_data_%d" % (GoAspect(asp), year))

In [5]:
pos_bitscore = [h.bitscore for h in pos]
neg_bitscore = [h.bitscore for h in neg]

ks_2samp(pos_bitscore, neg_bitscore)


Out[5]:
Ks_2sampResult(statistic=0.42820000000000003, pvalue=0.0)

In [6]:
plot_histograms(pos_bitscore, neg_bitscore, (0, 500))



In [7]:
pos_pident = [h.pident for h in pos]
neg_pident = [h.pident for h in neg]

ks_2samp(pos_pident, neg_pident)


Out[7]:
Ks_2sampResult(statistic=0.13350000000000006, pvalue=4.3016933400341578e-78)

In [8]:
plot_histograms(pos_pident, neg_pident, (0, 100))



In [9]:
hits_per_uid = load_object("../../Data/blast_%s_hsp_%d" % (GoAspect(asp), year))
gt_per_uid = load_object("../../Data/gt_%s_%d" % (GoAspect(asp), year))
len(hits_per_uid), len(gt_per_uid)


Out[9]:
(2893, 2893)

In [10]:
from digo import *
graph = load_object("../../Data/digo_%s_graph_%d" % (GoAspect(asp), year))
for node in graph: compute_node_prior(node, graph, grace=0.0)

In [11]:
def bar_plot_with_labels(data, labels):
    indx = range(1, len(data)+1)
    plt.bar(indx, data, align='center')
    plt.xticks(indx, labels)
    plt.xticks(rotation=90)
    plt.show()

In [12]:
total_gt = sum([len(terms) for terms in gt_per_uid.values()])
reversed_gt = {}
for uid, terms in gt_per_uid.items():
    for go in terms:
        if go in reversed_gt:
            reversed_gt[go].append(uid)
        else:
            reversed_gt[go] = [uid]

gt_prior_arr = [(go, len(reversed_gt[go])/total_gt) for go in reversed_gt]
gt_prior_arr.sort(key=lambda pair: -pair[1])
labels, data = zip(*gt_prior_arr[:20])
bar_plot_with_labels(data, labels)



In [13]:
nodes_prior = {node.go: node.prior for node in graph}
del nodes_prior[graph.root.go]
nodes_prior_arr = list(nodes_prior.items())
nodes_prior_arr.sort(key=lambda pair: -pair[1])
    
labels, data = zip(*nodes_prior_arr[:20])
bar_plot_with_labels(data, labels)



In [14]:
import pandas as pd
from baselines import *

thresholds = [0.02, 0.05, 0.10, 0.15, 0.20, 0.25, 0.30, 0.35, 0.40, 0.45, 0.50, 0.55, 0.60, 0.65]
preds_per_uid = {uid: nodes_prior for uid in hits_per_uid.keys()}
perf_naive = ths, prs, rcs, f1s = performance(preds_per_uid, gt_per_uid, ths=thresholds)
pd.DataFrame({"Threshold":ths, "Precision":prs, "Recall":rcs, "F1":f1s}).head(20)


Out[14]:
F1 Precision Recall Threshold
0 0.101411 0.054933 0.658830 0.02
1 0.191831 0.117249 0.527154 0.05
2 0.303822 0.235127 0.429223 0.10
3 0.337799 0.298767 0.388561 0.15
4 0.382398 0.448784 0.333121 0.20
5 0.382398 0.448784 0.333121 0.25
6 0.382398 0.448784 0.333121 0.30
7 0.382398 0.448784 0.333121 0.35
8 0.382398 0.448784 0.333121 0.40
9 0.351613 0.485482 0.275614 0.45
10 0.237978 0.577255 0.149884 0.50
11 0.237978 0.577255 0.149884 0.55
12 0.237978 0.577255 0.149884 0.60
13 0.000020 1.000000 0.000000 0.65

In [15]:
plot_precision_recall({"naive": perf_naive})



In [16]:
from digo import *
graph = load_object("../../Data/digo_%s_graph_%d" % (GoAspect(asp), year))
for node in graph: compute_node_prior(node, graph, grace=0.1)

In [17]:
def preds_by_attr(hits_per_uid, attr, nb=None):
    preds = {}
    pbar = tqdm(range(len(hits_per_uid)), desc="sequences processed")
    for uid, hits in hits_per_uid.items():
        pbar.update(1)
        if len(hits) == 0:
            continue
        preds[uid] = {}
        for go, hits in hits.items():
            assert go != graph.root.go
            hs =[getattr(h, attr) for h in hits if h.evalue < 0.001]
            if len(hs) == 0:
                continue
            if nb:
                preds[uid][go] = nb.infer(max(hs), graph[go].prior)
            else:
                preds[uid][go] = max(hs)
    pbar.close()
    return preds

def smart_infer(go2score, max_leaves=4):
    leaves = list(get_leaves([graph[go] for go in go2score.keys()]))
    leaves.sort(key=lambda node: go2score[node.go], reverse=True)
    fathers = {}
    for leaf in leaves[:max_leaves]:
        prop_set = digo.propagate(leaf)
        assert graph.root not in prop_set
        assert leaf in prop_set
        for father in prop_set:
            try:
                fathers[father.go] = max(go2score[leaf.go], go2score[father.go])
            except KeyError:
                fathers[father.go] = go2score[leaf.go]
    return fathers

In [18]:
import pandas as pd
from baselines import *

nb = NaiveBayes(Distribution(pos_bitscore), Distribution(neg_bitscore))

thresholds = [-5, -4.5, -4, -3.5, -3, -2.5, -2, -1, -0.5, 0.0, 0.5, 1.0, 1.5, 2]
preds_per_uid = preds_by_attr(hits_per_uid, "bitscore", nb)
# preds_per_uid = {uid: smart_infer(graph, go2score) for uid, go2score in preds_per_uid.items()}
perf_bitscore_nb = ths, prs, rcs, f1s = performance(preds_per_uid, gt_per_uid, ths=thresholds)
pd.DataFrame({"Threshold":ths, "Precision":prs, "Recall":rcs, "F1":f1s}).head(20)


sequences processed: 100%|██████████| 2893/2893 [01:47<00:00, 26.82it/s]
Out[18]:
F1 Precision Recall Threshold
0 0.483036 0.355319 0.754087 -5.0
1 0.482987 0.358007 0.742032 -4.5
2 0.484515 0.390486 0.638191 -4.0
3 0.504330 0.477697 0.534109 -3.5
4 0.518385 0.565392 0.478594 -3.0
5 0.493349 0.650567 0.397329 -2.5
6 0.465066 0.758492 0.335339 -2.0
7 0.414597 0.823668 0.277018 -1.0
8 0.396101 0.837368 0.259404 -0.5
9 0.296178 0.871544 0.178402 0.0
10 0.214646 0.901921 0.121818 0.5
11 0.175019 0.921955 0.096687 1.0
12 0.097050 0.942149 0.051160 1.5
13 0.009018 0.997523 0.004529 2.0

In [19]:
# import pandas as pd
# from baselines import *

# thresholds = [20, 25, 30, 40, 50, 75, 100, 150, 200, 250, 500, 750, 1000, 1250, 1500, 2000, 2500, 3000]
# preds_per_uid = preds_by_attr(hits_per_uid, "bitscore")
# smart_preds_per_uid = {uid: smart_infer(graph, preds) for uid, preds in preds_per_uid.items()}
# perf_bitscore = performance(preds_per_uid, gt_per_uid, ths=thresholds)
# perf_smart_bitscore = performance(smart_preds_per_uid, gt_per_uid, ths=thresholds)
# ths1, prs1, rcs1, f1s1 = perf_bitscore
# ths2, prs2, rcs2, f1s2 = perf_smart_bitscore
# pd.DataFrame({"Threshold":ths1, "Precision":prs1, "Recall":rcs1, "F1":f1s1,
#               "Threshold_smart":ths2, "Precision_smart":prs2, "Recall_smart":rcs2, "F1_smart":f1s2}).head(20)

In [20]:
import pandas as pd
from baselines import *

thresholds = [15, 20, 25, 30, 40, 50, 75, 100, 150, 200, 250, 500, 750, 1000, 1250, 1500, 2000, 2500, 3000]
preds_per_uid = preds_by_attr(hits_per_uid, "bitscore")
perf_bitscore = ths, prs, rcs, f1s = performance(preds_per_uid, gt_per_uid, ths=thresholds)
pd.DataFrame({"Threshold":ths, "Precision":prs, "Recall":rcs, "F1":f1s}).head(20)


sequences processed: 100%|██████████| 2893/2893 [00:03<00:00, 836.17it/s]
Out[20]:
F1 Precision Recall Threshold
0 0.483036 0.355319 0.754087 15
1 0.483036 0.355319 0.754087 20
2 0.483036 0.355319 0.754087 25
3 0.483036 0.355319 0.754087 30
4 0.483964 0.356655 0.752610 40
5 0.511507 0.397763 0.716356 50
6 0.556314 0.482970 0.655921 75
7 0.572498 0.546068 0.601617 100
8 0.569726 0.641213 0.512580 150
9 0.549064 0.694932 0.453809 200
10 0.521332 0.741318 0.402030 250
11 0.344421 0.872550 0.214556 500
12 0.198050 0.930914 0.110813 750
13 0.120228 0.960517 0.064127 1000
14 0.075106 0.978716 0.039052 1250
15 0.053157 0.985623 0.027315 1500
16 0.023401 0.991771 0.011840 2000
17 0.009396 0.996696 0.004720 2500
18 0.004552 0.998185 0.002281 3000

In [21]:
# import pandas as pd
# from baselines import *
# thresholds = [25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 97, 99]
# preds_per_uid = preds_by_attr(hits_per_uid, "pident")
# smart_preds_per_uid = {uid: smart_infer(graph, preds) for uid, preds in preds_per_uid.items()}
# perf_pident = performance(preds_per_uid, gt_per_uid, ths=thresholds)
# perf_smart_pident = performance(smart_preds_per_uid, gt_per_uid, ths=thresholds)
# ths1, prs1, rcs1, f1s1 = perf_pident
# ths2, prs2, rcs2, f1s2 = perf_smart_pident
# pd.DataFrame({"Threshold":ths1, "Precision":prs1, "Recall":rcs1, "F1":f1s1,
#              "Threshold_smart":ths2, "Precision_smart":prs2, "Recall_smart":rcs2, "F1_smart":f1s2}).head(20)

In [44]:
import pandas as pd
from baselines import *

thresholds = [15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95]
preds_per_uid = preds_by_attr(hits_per_uid, "pident")
perf_pident = ths, prs, rcs, f1s = performance(preds_per_uid, gt_per_uid, ths=thresholds)
pd.DataFrame({"Threshold":ths, "Precision":prs, "Recall":rcs, "F1":f1s}).head(20)


sequences processed: 100%|██████████| 2893/2893 [00:03<00:00, 817.40it/s]
Out[44]:
F1 Precision Recall Threshold
0 0.483036 0.355319 0.754087 15
1 0.483393 0.355764 0.753828 20
2 0.500546 0.378258 0.739676 25
3 0.539997 0.446235 0.683641 30
4 0.566457 0.529046 0.609562 35
5 0.570589 0.613483 0.533301 40
6 0.555086 0.686787 0.465768 45
7 0.514175 0.740657 0.393767 50
8 0.460506 0.788010 0.325306 55
9 0.418604 0.818704 0.281188 60
10 0.365001 0.846238 0.232680 65
11 0.310140 0.870504 0.188681 70
12 0.271303 0.889208 0.160071 75
13 0.222028 0.912017 0.126400 80
14 0.185062 0.928280 0.102776 85
15 0.141342 0.945844 0.076377 90
16 0.086501 0.968834 0.045272 95

In [23]:
# plot_precision_recall({"blast_bitscore": perf_bitscore, "blast_%identity": perf_pident,
#                       "smart_blast_bitscore": perf_smart_bitscore, "smart_blast_%identity": perf_smart_pident})

In [24]:
plot_precision_recall({"bitscore_nb": perf_bitscore_nb, "bitscore": perf_bitscore, 
                       "pident": perf_pident, "naive": perf_naive})


Questions

  1. If bitscore is such a good discriminator between sequences coming from the same node and sequences coming from cousins how come that it achieves equivalent results to percentage of idendity?
  2. Could I have higher precision w/o affecting the recall?

In [25]:
onto = get_ontology(asp)

In [26]:
def least_common_ancestor(G, go1, go2):
    ancestors1 = nx.shortest_path_length(G, source=go1)
    ancestors2 = nx.shortest_path_length(G, source=go2)
    common = list(set(ancestors1.keys()) & set(ancestors2.keys()))
    assert len(common) > 0
    c = common[np.argmin([ancestors1[c] + ancestors2[c] for c in common])]
    return c, ancestors1[c] + ancestors2[c]

wrong_hist = []
pbar = tqdm(range(len(hits_per_uid)), desc="sequences processed")
for uid, hits in hits_per_uid.items():
    pbar.update(1)
    if len(hits) == 0:
        continue
    for go, hits in hits.items():
        assert go != graph.root.go
        hs =[h for h in hits if h.evalue < 0.001]
        if len(hs) == 0:
            continue
        uid_annots = gt_per_uid[uid]
        if go in uid_annots:
            continue
        shortest = [least_common_ancestor(onto._graph, go, correct)[1] for correct in uid_annots]
        wrong_hist.extend(shortest)
pbar.close()


sequences processed: 100%|██████████| 2893/2893 [01:36<00:00, 29.86it/s]

In [27]:
plt.hist(wrong_hist, bins=50, alpha=0.8, label="wrong calssification LCA", normed=1, cumulative=False)
plt.legend(loc='upper right')
plt.title("Distribution of  LCA")
plt.ylabel("Frequency")
plt.grid(True)
plt.show()



In [28]:
# from src.python.deepseq0 import *

# from keras.models import model_from_json

# client = MongoClient("mongodb://localhost:27017/")

# db = client['prot2vec']

# print("Loading Ontology...")
# onto = get_ontology(ASPECT)

# model_str_deeperseq = "deeperseq-20-0.00241-0.56"

# classes = onto.classes
# classes.remove(onto.root)
# assert onto.root not in classes

# with open('../../checkpoints/%s.json' % model_str_deeperseq, 'r') as f:
#     deeperseq = model_from_json(f.read())
    
# deeperseq.load_weights("../../checkpoints/%s.hdf5" % model_str_deeperseq)
# print("Loaded model from disk")

In [29]:
# query = {"_id": {"$in": list(gt_per_uid.keys())}}
# count = db.uniprot.count(query)
# source = db.uniprot.find(query).batch_size(10)
# stream_tst = DataStream(source, count, gt_per_uid, onto, classes)

In [30]:
# y_gt, y_ds = predict(deeperseq, batch_generator(stream_tst), len(stream_tst), classes)
# perf_deepseq = performance(y_ds, y_gt, classes)

In [31]:
# y_gt, y_dumb = predict_dummy(deeperseq, batch_generator(stream_tst), len(stream_tst), classes)
# perf_dummy = performance(y_ds, y_dumb, classes)

In [48]:
digo_preds_ks = load_object("../../Data/digo_F_preds_0.001_ks")

In [45]:
import pandas as pd
from baselines import *

thresholds = [0.20, 0.25, 0.30, 0.35, 0.40, 0.45, 0.50, 0.55, 0.60, 0.65, 0.70, 0.75, 0.80, 0.85, 0.90, 0.95]
perf_ks = ths, prs, rcs, f1s = performance(digo_preds_ks_blast, gt_per_uid, ths=thresholds)
pd.DataFrame({"Threshold":ths, "Precision":prs, "Recall":rcs, "F1":f1s}).head(20)


Out[45]:
F1 Precision Recall Threshold
0 0.464220 0.535325 0.409789 0.20
1 0.457826 0.543486 0.395491 0.25
2 0.451377 0.553819 0.380917 0.30
3 0.436414 0.565702 0.355229 0.35
4 0.429951 0.572747 0.344148 0.40
5 0.420306 0.584831 0.328025 0.45
6 0.413042 0.598023 0.315463 0.50
7 0.397914 0.613517 0.294441 0.55
8 0.388872 0.624899 0.282261 0.60
9 0.380629 0.638562 0.271118 0.65
10 0.364877 0.653582 0.253083 0.70
11 0.353803 0.665554 0.240943 0.75
12 0.336725 0.682781 0.223465 0.80
13 0.323288 0.700232 0.210158 0.85
14 0.293395 0.733388 0.183378 0.90
15 0.263861 0.769181 0.159244 0.95

In [46]:
plot_precision_recall({"bitscore_nb": perf_bitscore_nb, "bitscore": perf_bitscore, "K-S": perf_ks, "naive": perf_naive})



In [50]:
predictions_pindent = preds_by_attr(hits_per_uid, "pident")

for tgt in gt_per_uid.keys():
    predictions = digo_preds_ks[tgt]
    for go, pident in predictions_pindent[tgt].items():
        prob = pident / 100
        if go in predictions:
            predictions[go] = 1 - (1 - prob) * (1 - predictions[go])
        else:
            predictions[go] = prob
    digo_preds_ks[tgt] = predictions


sequences processed: 100%|██████████| 2893/2893 [00:03<00:00, 836.26it/s]

In [51]:
import pandas as pd
from baselines import *

thresholds = [0.20, 0.25, 0.30, 0.35, 0.40, 0.45, 0.50, 0.55, 0.60, 0.65, 0.70, 0.75, 0.80, 0.85, 0.90, 0.95]
digo_preds_ks = ths, prs, rcs, f1s = performance(digo_preds_ks, gt_per_uid, ths=thresholds)
pd.DataFrame({"Threshold":ths, "Precision":prs, "Recall":rcs, "F1":f1s}).head(20)


Out[51]:
F1 Precision Recall Threshold
0 0.482687 0.354788 0.754778 0.20
1 0.482695 0.354798 0.754778 0.25
2 0.482696 0.354798 0.754778 0.30
3 0.482906 0.355026 0.754778 0.35
4 0.485101 0.358032 0.751988 0.40
5 0.495927 0.372239 0.742720 0.45
6 0.513436 0.398467 0.721652 0.50
7 0.526891 0.424648 0.693981 0.55
8 0.536449 0.450286 0.663388 0.60
9 0.543505 0.477003 0.631555 0.65
10 0.544246 0.501963 0.594308 0.70
11 0.539896 0.527973 0.552370 0.75
12 0.528823 0.554719 0.505237 0.80
13 0.510955 0.580365 0.456374 0.85
14 0.485662 0.619162 0.399519 0.90
15 0.439556 0.667042 0.327773 0.95

In [53]:
plot_precision_recall({"bitscore_nb": perf_bitscore_nb, "bitscore": perf_bitscore, "K-S": digo_preds_ks, "naive": perf_naive})



In [ ]: