In [1]:
%matplotlib inline
import pylab
import matplotlib.pyplot as plt
# Default figure size (inches) for every plot in this notebook. Set through
# pyplot's rcParams, which is the same mapping the pylab namespace exposes.
plt.rcParams['figure.figsize'] = (5.0, 4.0)
from src.python.preprocess2 import *
from pymongo import MongoClient
# Notebook-wide configuration: ontology aspect, database handle and ontology
# object. Later cells read `asp`, `db` and `onto` as globals.
asp = 'F' # default: Molecular Function
# NOTE(review): the connection string is hard-coded to a MongoDB instance on
# localhost:27017; the notebook will not run unless one is reachable there.
client = MongoClient('mongodb://localhost:27017/')
db = client['prot2vec']
# `get_ontology` comes from the star import of src.python.preprocess2.
onto = get_ontology(asp)
In [2]:
# Time-based split: `t0` and `t1` are the two cut-off timestamps handed to
# get_training_and_validation_streams (exact semantics live in preprocess2).
# Alternative, fixed window:
# t0 = datetime(2014, 1, 1, 0, 0)
# t1 = datetime(2014, 9, 1, 0, 0)
t0 = datetime(2017, 1, 1)
# NOTE(review): `utcnow()` makes the split depend on when the cell is run, so
# a later "Restart & Run All" may yield different data. It is also deprecated
# in recent Python versions -- consider pinning an explicit timestamp.
t1 = datetime.utcnow()

trn_stream, tst_stream = get_training_and_validation_streams(
    db, t0, t1, asp, profile=False
)
# Alternative, random split:
# trn_stream, tst_stream = get_random_training_and_validation_streams(db, asp, ratio=0.2)

# NOTE(review): `_seq2go` is a private attribute of the stream objects;
# presumably a mapping from sequence id to its GO annotations -- confirm.
seq2go_trn, seq2go_tst = trn_stream._seq2go, tst_stream._seq2go
In [3]:
# Summary of the split, displayed as a tuple:
# (#train entries, #test entries, test fraction, #keys present in both).
n_trn = len(seq2go_trn)
n_tst = len(seq2go_tst)
keys_in_both = set(seq2go_trn.keys()) & set(seq2go_tst.keys())
n_trn, n_tst, n_tst / (n_trn + n_tst), len(keys_in_both)
Out[3]:
In [4]:
# Print the first 21 (key, annotations) pairs of the test mapping as a quick
# visual sanity check.
for position, (key, annos) in enumerate(seq2go_tst.items()):
    if position >= 21:
        break
    print(key, annos)
In [5]:
# Run get_classes (from preprocess2) on each split with the shared ontology:
# test mapping first, then training mapping.
cls_tst, cls_trn = (
    get_classes(seq2go, onto) for seq2go in (seq2go_tst, seq2go_trn)
)
In [6]:
# Class overlap between the splits, displayed as a tuple:
# (#train classes, #test classes, #shared, #train-only, #test-only).
trn_class_set = set(cls_trn)
tst_class_set = set(cls_tst)
(
    len(cls_trn),
    len(cls_tst),
    len(trn_class_set & tst_class_set),
    len(trn_class_set - tst_class_set),
    len(tst_class_set - trn_class_set),
)
Out[6]:
In [7]:
# Number of annotations attached to each entry, per split.
lengths_trn = [len(annos) for annos in seq2go_trn.values()]
lengths_tst = [len(annos) for annos in seq2go_tst.values()]

# Draw both histograms on ONE figure with two stacked panels.
# The previous version called plt.show() after the first plt.subplot(), which
# (with the inline backend) flushes and closes the figure, so the second panel
# ended up alone on a brand-new figure. It also called plt.tight_layout()
# before any axes existed, where it has nothing to adjust, and gave both
# panels the same title so the splits could not be told apart.
fig, axes = plt.subplots(2, 1)
panels = (
    ("Annotations per sequence (train)", lengths_trn),
    ("Annotations per sequence (test)", lengths_tst),
)
for ax, (panel_title, lengths) in zip(axes, panels):
    # Same binning as before: 100 bins over [1, 100]; counts above 100 are
    # not shown.
    ax.hist(lengths, bins=100, range=(1, 100))
    ax.set_title(panel_title)
    ax.set_ylabel("Frequency")
    ax.grid(True)
axes[-1].set_xlabel("Number of annotations")
# Layout is fixed up only after all artists exist, then shown exactly once.
fig.tight_layout()
plt.show()

# 95th percentile of annotations per sequence for (train, test); swap 95 for
# e.g. 90 or 80 to inspect other cut-offs.
np.percentile(lengths_trn, 95), np.percentile(lengths_tst, 95)
Out[7]:
In [8]:
# Spot-check of the ontology object on a single hard-coded GO term; the
# result is shown as the cell output.
# NOTE(review): presumably `propagate` expands the given terms with their
# ancestors in the ontology -- confirm against the Ontology implementation.
onto.propagate(["GO:0005391"])
Out[8]:
In [ ]: