In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
import pylab
pylab.rcParams['figure.figsize'] = (10.0, 8.0)
from src.python.geneontology import *
go_graph = initialize_go()
assert nx.is_directed_acyclic_graph(go_graph)
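# Note on orientation (inferred from the shortest-path calls below): edges in go_graph
# run from a child term to its parent, so directed paths lead towards the namespace
# roots; the level and ancestor computations further down rely on this direction.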
In [2]:
import os
from src.python import node2vec
from gensim.models.word2vec import Word2Vec
from gensim.models.keyedvectors import KeyedVectors
p = 1.0
q = 1.0
num_walks = 10 # default is 10
walk_length = 12 # default is 80
dim = 256
win = 6
sg = 1
model_fname = '/tmp/GO-%s-dim%d-win%d-len%d-iter%d.emb' % ("sg" if sg==1 else "cbow", dim, win, walk_length, num_walks)
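# p (return) and q (in-out) are the node2vec bias parameters: a large p discourages
# immediately revisiting the previous node, q > 1 keeps walks local (BFS-like) and
# q < 1 pushes them outward (DFS-like). With p = q = 1 the biased walks reduce to
# plain uniform random walks. The Word2Vec keyword names used below (size, iter)
# follow the gensim 3.x API.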
def embedding(graph):
    G = node2vec.Graph(graph, True, p, q)  # True for directed
    G.preprocess_transition_probs()
    walks = G.simulate_walks(num_walks, walk_length)
    walks = [list(map(str, walk)) for walk in walks]
    model = Word2Vec(walks, size=dim, window=win, min_count=0, sg=sg, workers=4, iter=num_walks)
    model.wv.save(model_fname)
    return model.wv

if os.path.exists(model_fname):
    word_vectors = KeyedVectors.load(model_fname)
else:
    word_vectors = embedding(go_graph)
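# Quick sanity check (illustrative): the walk tokens are stringified node IDs, so any
# term of go_graph can be queried directly against the learned embedding.
sample_term = str(next(iter(go_graph.nodes())))
print(word_vectors.most_similar(sample_term, topn=5))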
In [3]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
def tsne(embeddings):
    tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000, method='exact')
    return tsne.fit_transform(embeddings[:, :])

def pca(embeddings):
    pca = PCA(n_components=2)
    return pca.fit_transform(embeddings)

def lda(embeddings, labels):
    lda = LinearDiscriminantAnalysis(n_components=2)
    return lda.fit(embeddings, labels).transform(embeddings)

def plot(title, X_r):
    for color, asp in zip(colors, aspects):
        plt.scatter(X_r[y == asp, 0], X_r[y == asp, 1], color=color, alpha=.8, lw=lw,
                    label=asp)
    plt.title(title)
    plt.legend(loc='best', shadow=False, scatterpoints=1)
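# PCA and t-SNE are unsupervised projections; LDA is supervised and is fitted on the
# class labels passed to it, so its 2D separation is not directly comparable to theirs.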
X = np.array([word_vectors[n] for n in go_graph.nodes()])
nrow = len(X)
indx = np.arange(nrow) % 10 == 0  # subsample: keep every 10th node for plotting
X = X[indx, :]
colors = ['navy', 'turquoise', 'darkorange']
aspects = ['biological_process', 'molecular_function', 'cellular_component']
lw = 2
y = np.array([attr['namespace'] for _, attr in go_graph._node.items()])[indx]
X_pca = pca(X)
plot('PCA of GeneOntology', X_pca)
plt.show()
X_lda = lda(X, y)
plot('LDA of GeneOntology', X_lda)
plt.show()
# X_tsne = tsne(X)
# plot('t-SNE of GeneOntology', X_tsne)
# plt.show()
In [4]:
# num_walks = 10 # default is 10
# walk_length = 12 # default is 80
# dim = 256
# win = 6
# sg = 1
# model_fname = '/tmp/MFO-%s-dim%d-win%d-len%d-iter%d.emb' % ("sg" if sg==1 else "cbow", dim, win, walk_length, num_walks)
# def get_ontology_graph(aspect):
#     asp_graph = go_graph.copy()
#     for n, attr in go_graph._node.items():
#         if attr['namespace'] != aspect:
#             asp_graph.remove_node(n)
#     return asp_graph
# onto_graph = get_ontology_graph('molecular_function')
# if os.path.exists(model_fname):
# word_vectors = KeyedVectors.load(model_fname)
# else:
# word_vectors = embedding(onto_graph)
In [5]:
import os
import numpy as np
from gensim.models.poincare import PoincareModel, PoincareKeyedVectors
mfo = get_ontology('F')
print(mfo)
bpo = get_ontology('P')
print(bpo)
cco = get_ontology('C')
print(cco)
onto_root = mfo.root
onto_graph = mfo._graph
word_vectors = mfo._kv
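# Each per-aspect ontology object exposes a root term, an ontology graph and keyed
# vectors (root, _graph, _kv); the molecular_function aspect is the one used for the
# evaluation below.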
In [6]:
print(mfo.root)
print(cco.root)
print(bpo.root)
In [7]:
import sys
hsv = plt.get_cmap('hsv')
def get_level(n, G, root):
    anc = nx.shortest_path_length(G, target=root)
    s = set([k for k, v in anc.items() if (v == n)])
    return s
rollups = get_level(1, onto_graph, onto_root)
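# rollups holds the level-1 terms (direct children of the ontology root); they are
# used below as coarse class labels for the LDA projection.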
def eval_function(word_vectors, graph):
    y = []
    X = []
    nn_sim = []
    nn_rank = []
    nn_level = []
    rand_sim = []
    N = len(graph._node)
    nodes = list(graph.nodes())
    for i, (n, _) in enumerate(graph._node.items()):
        sys.stdout.write("\r{0:.0f}%".format(100.0 * i/N))
        if n in rollups | {onto_root}:
            continue
        X.append(word_vectors[n])
        desc = nx.descendants(graph, n)
        nbr = list(graph.neighbors(n))
        sample = np.random.choice(nodes, len(nbr))
        rand_sim.append(np.mean([word_vectors.similarity(n, v) for v in sample]))
        nn_sim.append(np.mean([word_vectors.similarity(n, u) for u in nbr]))
        # nn_rank.append(np.mean([word_vectors.rank(n, u) for u in nbr]))
        k = nx.shortest_path_length(graph, source=n, target=onto_root)
        nn_level.append(k)
        for go in [go for go in rollups if go in desc]:
            rollup = graph._node[go]['name']
            y.append(rollup)
            break
    return X, y, rand_sim, nn_sim, nn_rank, nn_level
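# For every non-root term, eval_function records its embedding, its level-1 rollup
# label, the mean similarity to its graph neighbours vs. an equally sized random
# sample of terms, and its depth (distance to the root).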
X, y, RAND_sim, NN_sim, NN_rank, NN_lvl = eval_function(word_vectors, onto_graph)
aspects = np.unique(y)
print("\n" + "\n".join(aspects))
colors = hsv(np.linspace(0, 1.0, len(aspects)))
lw = 2
N = len(y)
indx = np.arange(N) % 2 == 0  # just plot half of them
X = np.array(X)[indx, :]
y = np.array(y)[indx]
X_lda = lda(X, y)
plot('LDA of GeneOntology', X_lda)
plt.show()
# X_tsne = tsne(X)
# plot('t-SNE of GeneOntology', X_tsne)
# plt.show()
plt.hist(RAND_sim, bins=100, alpha=0.8, label='rand')
plt.hist(NN_sim, bins=100, alpha=0.8, label='nbr')
plt.legend(loc='upper right')
plt.title("Avg. Similarity of Neighbors")
plt.ylabel("Frequency")
plt.grid(True)
plt.show()
# plt.hist(NN_rank, bins=200, range=(0, np.percentile(NN_rank, 95)))
# plt.title("Avg. Rank of Neighbors")
# plt.ylabel("Frequency")
# plt.grid(True)
# plt.show()
NN_lvl = np.array(NN_lvl)
NN_sim = np.array(NN_sim)
levels = np.unique(NN_lvl)
y = list(map(lambda lvl: np.mean(NN_sim[NN_lvl == lvl]), levels))
plt.scatter(levels, y)
plt.title("Avg. Similarity of Neighbors vs. Level in Ontology")
plt.xlabel("Level")
plt.ylabel("Avg. Similarity")
plt.grid(True)
plt.show()
In [8]:
from src.python.preprocess import *
from pymongo import MongoClient
client = MongoClient("mongodb://127.0.0.1:27017")
# client = MongoClient("mongodb://192.168.1.14:27017")
db = client['prot2vec']
dataset, _ = load_data(db, 'F', limit=200000)
dataset.augment()
print(dataset)
In [11]:
from functools import reduce
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import precision_recall_fscore_support
augment_nn = True
cls = dataset.onto.classes
n_cls = len(cls)
N = len(dataset)
y_truth = np.zeros((N, n_cls))
y_pred = np.zeros((N, n_cls))
macro, micro = {}, {}
tops = [1, 3, 5, 10]
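# Nearest-neighbour prediction: embed each record's annotated GO terms, take their
# centroid, and predict the top-k terms closest to that centroid in embedding space;
# with augment_nn the predicted set is also propagated up to its ancestor terms.
# Scores are then computed per class (macro) and pooled over all classes (micro).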
for top in tops:
    print("\ntop%d\n" % top)
    for i, r in enumerate(dataset.records):
        sys.stdout.write("\r{0:.0f}%".format(100.0 * i/N))
        labels = r.lbl
        if len(labels) == 0:
            continue
        y_truth[i, :] = dataset.onto.binarize([labels])
        label_vectors = [word_vectors[go] for go in labels if go in word_vectors]
        centroid = np.mean(label_vectors, axis=0)
        nn = word_vectors.most_similar(centroid, topn=top)
        nn = set([go for go, _ in nn if go in cls])
        if augment_nn:
            # edges run child -> parent, so nx.descendants yields the ancestor terms
            anc = map(lambda go: nx.descendants(onto_graph, go), nn)
            nn = reduce(lambda x, y: x | y, anc, nn)
        y_pred[i, :] = dataset.onto.binarize([nn])
    y_truth = np.array(y_truth)
    y_pred = np.array(y_pred)
    score = np.zeros((n_cls, 3))
    for j in range(n_cls):
        sys.stdout.write("\r%d\t/\t%d" % (j, n_cls))
        pr, rc, f1, _ = precision_recall_fscore_support(y_truth[:, j], y_pred[:, j], beta=1.0, average="binary")
        score[j, :] = np.array([pr, rc, f1])
    ### macro
    macro[top] = np.mean(score, axis=0)  # compute the macro score
    ### micro
    micro[top] = precision_recall_fscore_support(y_truth, y_pred, beta=1.0, average="micro")
    print("\nmicro\tpr=%.2f\trc=%.2f\tf1=%.2f" % (micro[top][0], micro[top][1], micro[top][2]))
    print("\nmacro\tpr=%.2f\trc=%.2f\tf1=%.2f" % (macro[top][0], macro[top][1], macro[top][2]))
In [12]:
### https://matplotlib.org/examples/api/barchart_demo.html
def plot_pr_rc_f1(title, scores):
    N = len(scores)
    pr = [v[0] for v in scores.values()]
    rc = [v[1] for v in scores.values()]
    f1 = [v[2] for v in scores.values()]
    ind = np.arange(N)  # the x locations for the groups
    width = 0.2         # the width of the bars
    fig, ax = plt.subplots()
    rects1 = ax.bar(ind, pr, width, color='r')
    rects2 = ax.bar(ind + width, rc, width, color='g')
    rects3 = ax.bar(ind + width * 2, f1, width, color='b')
    # add some text for labels, title and axes ticks
    ax.set_ylabel('Scores')
    ax.set_title(title)
    ax.set_xticks(ind + width / 2)
    ax.set_xticklabels(list(scores.keys()))
    ax.legend((rects1[0], rects2[0], rects3[0]), ('prec', 'recall', 'f1'))
    autolabel(rects1, ax)
    autolabel(rects2, ax)
    autolabel(rects3, ax)
def autolabel(rects, ax):
    """Attach a text label above each bar displaying its height."""
    for rect in rects:
        height = rect.get_height()
        ax.text(rect.get_x() + rect.get_width()/2., 1.05*height,
                '%.2f' % height,
                ha='center', va='bottom')
In [13]:
plot_pr_rc_f1("macro", macro)
plot_pr_rc_f1("micro", micro)
plt.show()
In [ ]: