In [1]:
import os
# Display the kernel's working directory: the relative "../../Data/..." paths
# used by the later cells are resolved against it.
os.getcwd()
Out[1]:
In [2]:
# Star import: the cells below use names that are not explicitly imported in the
# visible cells (load_object, GoAspect, np, plt, ks_2samp, compute_node_prior);
# presumably they come from here -- verify against src/python/digo2.
from src.python.digo2 import *
# Aspect code handed to GoAspect(...) when the data file names are built below.
# NOTE(review): 'F' presumably selects the molecular-function aspect -- confirm.
asp = 'F'
# Year suffix used in the graph / prediction / ground-truth file names below.
year = 2014
In [3]:
# Render matplotlib figures inline in the cell output.
%matplotlib inline
# Reload edited modules automatically before each cell is executed.
%load_ext autoreload
%autoreload 2
import pylab
# Default figure size (width, height) for every plot in this notebook.
pylab.rcParams['figure.figsize'] = (10.0, 8.0)
def plot_histograms(pos, neg, val_range, pos_label='same-leaf', neg_label='cousins',
                    bins=100, title="Distribution of CosineSimilarity"):
    """Draw normalised cumulative step histograms of two samples on one figure.

    Args:
        pos: sequence of values for the first sample.
        neg: sequence of values for the second sample.
        val_range: (low, high) tuple passed to ``plt.hist`` as ``range``.
        pos_label: legend label for ``pos``.
        neg_label: legend label for ``neg``.
        bins: number of histogram bins (defaults to the previously hard-coded 100).
        title: figure title (defaults to the previously hard-coded title).

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    # `density=True` replaces the `normed=1` keyword, which was deprecated in
    # matplotlib 2.1 and removed in 3.1 (passing it now raises an error).
    hist_style = dict(bins=bins, alpha=0.8, density=True, cumulative=True,
                      histtype='step', range=val_range)
    plt.hist(pos, label=pos_label, **hist_style)
    plt.hist(neg, label=neg_label, **hist_style)
    plt.legend(loc='upper right')
    plt.title(title)
    plt.ylabel("Frequency")
    plt.grid(True)
    plt.show()
In [4]:
# Load the stored positive records for the selected aspect and keep only the
# first field of each record.
pos_records = load_object("../../Data/dingo_%s_ks_pos_data" % (GoAspect(asp),))
pos = [record[0] for record in pos_records]

# Same projection for the negative records.
neg_records = load_object("../../Data/dingo_%s_ks_neg_data" % (GoAspect(asp),))
neg = [record[0] for record in neg_records]

# Sample sizes and means of both groups, shown as the cell output.
len(pos), len(neg), np.mean(pos), np.mean(neg)
Out[4]:
In [5]:
# Compare the cumulative distributions of the two samples over the range [0, 1].
plot_histograms(pos, neg, (0, 1))
In [6]:
# Two-sample test on the positive vs. negative values; the result is shown as the
# cell output.
# NOTE(review): ks_2samp is presumably scipy.stats.ks_2samp re-exported through the
# star import above -- confirm.
ks_2samp(pos, neg)
Out[6]:
In [7]:
# Load the stored graph for the selected aspect and year.
graph_path = "../../Data/dingo_%s_graph_%d" % (GoAspect(asp), year)
graph = load_object(graph_path)

# Run compute_node_prior on every node the graph yields, with no grace.
for node in graph:
    compute_node_prior(node, graph, grace=0.0)
In [8]:
# Stored per-uid predictions of the two methods for the selected aspect and year.
# Later cells index them as preds[uid][go] -> score.
preds_per_uid_blast = load_object("../../Data/blast_%s_preds_%d" % (GoAspect(asp), year))
preds_per_uid_dingo = load_object("../../Data/dingo_%s_preds_%d" % (GoAspect(asp), year))
# Disabled: the HSP hits are not used by any of the visible cells.
# hits_per_uid = load_object("../../Data/blast_%s_hsp_%d" % (GoAspect(asp), year))
# Ground truth the predictions are evaluated against.
gt_per_uid = load_object("../../Data/gt_%s_%d" % (GoAspect(asp), year))
# Sanity check: number of entries in each mapping (they may differ).
len(preds_per_uid_blast), len(gt_per_uid), len(preds_per_uid_dingo)
Out[8]:
In [9]:
import pandas as pd
from baselines import *

# Score cut-offs at which the dingo predictions are evaluated.
# Two fixes relative to the previous list:
#   * `10e-3` is numerically equal to 0.01, so that threshold was evaluated twice
#     (duplicate table row / curve point); the small cut-off is now written 1e-3.
#   * 0.70 and 0.75 were missing, unlike in the blast and combined grids that
#     this curve is plotted against.
thresholds = [0, 1e-3, 0.01, 0.02, 0.05, 0.10, 0.15, 0.20, 0.25, 0.30, 0.35, 0.40,
              0.45, 0.50, 0.55, 0.60, 0.65, 0.70, 0.75, 0.80, 0.85, 0.90, 0.95, 0.99]

# perf_dingo keeps the whole (thresholds, precisions, recalls, f1s) tuple for the
# final precision/recall plot; the unpacked names feed the table below.
perf_dingo = ths, prs, rcs, f1s = performance(preds_per_uid_dingo, gt_per_uid, ths=thresholds)
pd.DataFrame({"Threshold": ths, "Precision": prs, "Recall": rcs, "F1": f1s}).head(30)
Out[9]:
In [10]:
import pandas as pd
from baselines import *
# Cut-offs for the BLAST predictions. They are on a 0-100 scale, unlike the 0-1
# cut-offs used for dingo: the BLAST scores are divided by 100 where the two
# methods are combined further down.
thresholds = [15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95]
# perf_pident keeps the whole result tuple for the final precision/recall plot;
# the unpacked names feed the table below.
perf_pident = ths, prs, rcs, f1s = performance(preds_per_uid_blast, gt_per_uid, ths=thresholds)
pd.DataFrame({"Threshold":ths, "Precision":prs, "Recall":rcs, "F1":f1s}).head(20)
Out[10]:
In [11]:
import pandas as pd
from baselines import *

# Combine the two predictors: for every GO term that BLAST predicts for a uid,
# use the dingo score when dingo also scored that term, otherwise fall back to
# the BLAST percentage rescaled to [0, 1]. Terms predicted only by dingo are
# not carried over (unchanged behaviour).
preds_per_uid_dingo_blast = {}
for uid, blast_scores in preds_per_uid_blast.items():
    # A uid may be missing from the dingo predictions; treat that as "no dingo
    # scores" instead of failing with a KeyError.
    dingo_scores = preds_per_uid_dingo.get(uid, {})
    preds_per_uid_dingo_blast[uid] = {
        # 100.0 forces true division, so integer percentages are not truncated
        # to 0 when the notebook runs under Python 2.
        go: dingo_scores[go] if go in dingo_scores else percent / 100.0
        for go, percent in blast_scores.items()
    }

thresholds = [0.15, 0.20, 0.25, 0.30, 0.35, 0.40, 0.45, 0.50, 0.55, 0.60, 0.65, 0.70, 0.75, 0.80, 0.85, 0.90, 0.95]
# perf_combined keeps the whole result tuple for the final precision/recall plot;
# the unpacked names feed the table below.
perf_combined = ths, prs, rcs, f1s = performance(preds_per_uid_dingo_blast, gt_per_uid, ths=thresholds)
pd.DataFrame({"Threshold": ths, "Precision": prs, "Recall": rcs, "F1": f1s}).head(20)
Out[11]:
In [12]:
# Plot the three result tuples computed above, keyed by the label of each method.
plot_precision_recall({"dingo": perf_dingo, "blast": perf_pident, "combined": perf_combined})
In [ ]: