In [1]:
import os

# Show the notebook's working directory so the relative imports below are unambiguous.
current_dir = os.getcwd()
print(current_dir)
In [2]:
%matplotlib inline
import pylab
pylab.rcParams['figure.figsize'] = (10.0, 8.0)
import matplotlib.pyplot as plt
import numpy as np
from src.python.preprocess import *
from pymongo import MongoClient
client = MongoClient("mongodb://127.0.0.1:27017")
# client = MongoClient("mongodb://192.168.1.14:27017")
db = client['prot2vec']
goa_collection = db.goa_uniprot
# Restrict to experimentally-evidenced UniProtKB annotations
# (exp_codes comes from the src.python.preprocess wildcard import).
q = {'Evidence': {'$in': exp_codes}, 'DB': 'UniProtKB'}
# count_documents() replaces Collection.count(), deprecated and removed in PyMongo 4.
num_go = goa_collection.count_documents(q)
src_go = goa_collection.find(q)
def bar_plot_with_labels(data, labels):
    """Draw a bar chart of `data` with one tick label per bar and display it."""
    positions = list(range(1, len(data) + 1))
    plt.bar(positions, data, align='center')
    plt.xticks(positions, labels)
    plt.show()
def pretty_print(codes, counts):
    """Print each (code, count) pair as a tab-separated row, then a total line."""
    total = 0
    for code, count in zip(codes, counts):
        print("%s\t:\t%d" % (code, count))
        total += count
    print("Total\t:\t%d" % total)
# Per-evidence-code annotation counts across the whole GOA collection.
codes = goa_collection.distinct('Evidence')
# count_documents() replaces Collection.count(), deprecated and removed in PyMongo 4.
counts = [goa_collection.count_documents({'Evidence': {'$in': [c]}}) for c in codes]
bar_plot_with_labels(counts, codes)
pretty_print(codes, counts)
# dbs = db.goa_uniprot.distinct('DB')
# counts = list(map(lambda d: db.goa_uniprot.count({'DB': d}), dbs))
# bar_plot_with_labels(counts, dbs)
# pretty_print(dbs, counts)
# Load the (sequence id -> GO ids) and (GO id -> sequence ids) maps for aspect 'P'.
seqid2goid, goid2seqid = GoAnnotationCollectionLoader(src_go, num_go, 'P').load()
# Fetch only the sequences that actually received an annotation above.
query = {"_id": {"$in": unique(list(seqid2goid.keys())).tolist()}}
# count_documents() replaces Collection.count(), deprecated and removed in PyMongo 4.
num_seq = db.uniprot.count_documents(query)
src_seq = db.uniprot.find(query)
seqid2seq = UniprotCollectionLoader(src_seq, num_seq).load()
In [3]:
# Distribution of GO annotations per protein sequence.
plt.hist([len(annos) for annos in seqid2goid.values()], bins=100, range=(1, 100))
plt.title("Annotations per sequence")
plt.xlabel("#Annotations")
plt.ylabel("Frequency")
plt.grid(True)
plt.show()

# Distribution of annotated sequences per GO term.
plt.hist([len(seqs) for seqs in goid2seqid.values()], bins=50, range=(1, 50))
plt.title("Sequences per GO-term")
plt.xlabel("#Sequences")
plt.ylabel("Frequency")
plt.grid(True)
plt.show()
In [4]:
# Build the aspect-'P' dataset from the loaded sequences and annotations.
onto = get_ontology('P')
dataset = Dataset(seqid2seq, seqid2goid, onto)
print(dataset)

# 99th percentile of sequence length, used below to bound plot axes.
seq_lengths = [len(record.seq) for record in dataset.records]
p99 = np.percentile(seq_lengths, 99)
print("99 percentile:\t%d" % p99)
In [5]:
# Sequence-length distribution, cut off at twice the 99th percentile.
seq_lengths = [len(record.seq) for record in dataset.records]
plt.hist(seq_lengths, bins=1000, range=(0, p99 * 2))
plt.title("Sequence length")
plt.ylabel("Frequency")
plt.grid(True)
plt.show()

# pnts = np.random.choice(dataset.records, 5000, replace=False)
pnts = dataset.records
x = [len(record.seq) for record in pnts]
y = [len(record.lbl) for record in pnts]
plt.scatter(x, y)
plt.xlim(0, 2 * p99)
plt.title("#Annotations vs. Sequence length")
plt.xlabel("Sequence length")
plt.ylabel("#Annotations")
plt.grid(True)
plt.show()
In [6]:
from src.python.geneontology import *
# Build the Gene Ontology graph (nx comes from the wildcard import above).
go_graph = initialize_go()
# Sanity check: GO must be a DAG; downstream subgraph/ancestor logic assumes it.
assert nx.is_directed_acyclic_graph(go_graph)
In [7]:
# Use the public NodesView rather than the private `_node` attribute;
# go_graph.nodes[go_id]['name'] is the supported NetworkX accessor and
# behaves identically for lookups.
attr = go_graph.nodes

def pretty(labels):
    """Render GO ids with their human-readable names, one id per line."""
    return '\n'.join("%s\t%s" % (lbl, attr[lbl]['name']) for lbl in labels)

print("\n--------------> leaf-only\n")
print(dataset)
print("\n")
record = dataset.records[0]
print(pretty(record.lbl))
In [8]:
# NOTE(review): augment() presumably propagates each record's GO labels to
# ancestor terms in the ontology — confirm in src.python.preprocess.
dataset.augment()
print("\n--------------> augmented\n")
print(dataset)
print("\n")
record = dataset.records[0]
print(pretty(record.lbl))
# Visualize the induced subgraph over this record's (now augmented) GO terms.
sub_graph = go_graph.subgraph(record.lbl)
nx.draw(sub_graph)
plt.show()
In [9]:
_, seqid2goid = Dataset.to_dictionaries(dataset.records)
In [17]:
# Per-sequence annotation counts after augmentation (compare with the
# pre-augmentation histogram earlier in the notebook).
plt.hist([len(annos) for annos in seqid2goid.values()], bins=100, range=(1, 100))
plt.title("Annotations per sequence")
plt.xlabel("#Annotations")
plt.ylabel("Frequency")
plt.grid(True)
plt.show()
In [11]:
from preprocess2 import *
In [15]:
def get_random_training_and_validation_streams(db, asp, profile=False):
    """Randomly split experimentally-annotated sequences for aspect `asp`
    into training and held-out streams (~80/20).

    :param db: pymongo database with goa_uniprot and uniprot/pssm collections
    :param asp: GO aspect code (e.g. 'F', 'P', 'C')
    :param profile: if True stream PSSM profiles, otherwise raw sequences
    :return: (training stream, validation stream)
    """
    Stream = ProfileStream if profile else SequenceStream
    collection = db.pssm if profile else db.uniprot
    q_valid = {'DB': 'UniProtKB', 'Evidence': {'$in': exp_codes}, 'Aspect': asp}
    # count_documents() replaces Collection.count(), removed in PyMongo 4.
    seq2go, _ = GoAnnotationCollectionLoader(db.goa_uniprot.find(q_valid),
                                             db.goa_uniprot.count_documents(q_valid),
                                             asp).load()
    # replace=False is essential: the default (sampling WITH replacement)
    # yields duplicate keys, silently shrinking the held-out set below 20%.
    tst_keys = np.random.choice(list(seq2go.keys()), size=len(seq2go) // 5,
                                replace=False)
    seq2go_tst = {k: seq2go[k] for k in tst_keys}
    seq2go_trn = {k: v for k, v in seq2go.items() if k not in seq2go_tst}
    stream_tst = Stream(seq2go_tst, collection)
    stream_trn = Stream(seq2go_trn, collection)
    return stream_trn, stream_tst

trn_stream, tst_stream = get_random_training_and_validation_streams(db, 'F')
In [22]:
(len(tst_stream._seq2go), len(trn_stream._seq2go), len(set([k for k in tst_stream._seq2go]) & set([k for k in trn_stream._seq2go])))
Out[22]:
In [23]:
# Per-sequence annotation counts for the held-out (validation) split.
plt.hist([len(annos) for annos in tst_stream._seq2go.values()], bins=100, range=(1, 100))
plt.title("Annotations per sequence (validation)")
plt.xlabel("#Annotations")
plt.ylabel("Frequency")
plt.grid(True)
plt.show()

# FIX: this second plot also shows per-sequence annotation counts (from the
# training split's _seq2go map), but was mislabeled "Sequences per GO-term" —
# a copy-paste of an earlier cell that used goid2seqid.
plt.hist([len(annos) for annos in trn_stream._seq2go.values()], bins=50, range=(1, 50))
plt.title("Annotations per sequence (train)")
plt.xlabel("#Annotations")
plt.ylabel("Frequency")
plt.grid(True)
plt.show()
In [ ]: