Amino Acids W2V Evaluations

Evaluate various vector representations of amino acids.

Imports & Consts


In [1]:
%matplotlib inline  

import os
import sys

module_path = os.path.abspath(os.path.join('src/python'))
if module_path not in sys.path:
    sys.path.append(module_path)

print(sys.version)
print(sys.path)


import torch
import torch.nn as nn
import torchvision.datasets as dsets
import torchvision.transforms as transforms
    
import word2vec
from word2vec import Word2VecImpl
from word2vec import Word2VecAPI

c_stats = word2vec.clstr_stats
stats = word2vec.nn_stats
plot = word2vec.plot
pca = word2vec.pca

n_clstr = 5


3.6.3 |Anaconda, Inc.| (default, Nov 20 2017, 20:41:42) 
[GCC 7.2.0]
['', '/home/yotamfr/development/prot2vec', '/home/yotamfr/development/prot2vec/virtualenv/lib/python36.zip', '/home/yotamfr/development/prot2vec/virtualenv/lib/python3.6', '/home/yotamfr/development/prot2vec/virtualenv/lib/python3.6/lib-dynload', '/home/yotamfr/development/prot2vec/virtualenv/lib/python3.6/site-packages', '/home/yotamfr/development/prot2vec/virtualenv/lib/python3.6/site-packages/recurrentshop-1.0.0-py3.6.egg', '/home/yotamfr/development/prot2vec/virtualenv/lib/python3.6/site-packages/torchvision-0.1.9-py3.6.egg', '/home/yotamfr/development/prot2vec/virtualenv/lib/python3.6/site-packages/IPython/extensions', '/home/yotamfr/.ipython', '/home/yotamfr/development/prot2vec/src/python/src/python']
{
 "A": 0,
 "R": 1,
 "N": 2,
 "D": 3,
 "C": 4,
 "E": 5,
 "Q": 6,
 "G": 7,
 "H": 8,
 "I": 9,
 "L": 10,
 "K": 11,
 "M": 12,
 "F": 13,
 "P": 14,
 "O": 15,
 "U": 16,
 "S": 17,
 "T": 18,
 "W": 19,
 "Y": 20,
 "V": 21,
 "X": 22,
 "B": 23,
 "Z": 24
}
WARNING! Deprecated!

Evaluate SkipGram


In [2]:
w2v = Word2VecImpl(5)
checkpoint = torch.load('../../data/trained/w2v_sg_best.tar')
w2v.load_state_dict(checkpoint['state_dict'])

api = Word2VecAPI(w2v)

print(stats(api))

print(c_stats(api, n_clstr))

plot(pca(api.embeddings))


Nearest to A: Q, H, I,
Nearest to R: P, V, T,
Nearest to N: H, P, L,
Nearest to D: F, S, G,
Nearest to C: S, V, R,
Nearest to E: G, V, F,
Nearest to Q: A, I, H,
Nearest to G: F, E, V,
Nearest to H: L, I, P,
Nearest to I: L, H, A,
Nearest to L: I, H, P,
Nearest to K: Y, V, L,
Nearest to M: X, D, S,
Nearest to F: G, D, V,
Nearest to P: R, H, L,
Nearest to O: R, T, P,
Nearest to U: F, G, T,
Nearest to S: Y, R, D,
Nearest to T: R, V, O,
Nearest to W: L, A, Q,
Nearest to Y: S, I, A,
Nearest to V: R, G, E,
Nearest to X: M, R, P,
Nearest to B: E, Z, G,
Nearest to Z: E, G, V,
None
cluster 0: A N Q H I L P W Y
cluster 1: D C M S X
cluster 2: E G F B Z
cluster 3: R K O T V
cluster 4: U
highest similarity: sim(I, L)=0.991650473562
lowest similarity: sim(M, B)=0.379447995595
average similarity: 0.837380660094

Evaluate CBOW


In [3]:
w2v = Word2VecImpl(5)
checkpoint = torch.load('../../data/trained/w2v_cbow_best.tar')
w2v.load_state_dict(checkpoint['state_dict'])

api = Word2VecAPI(w2v)

print(stats(api))

print(c_stats(api, n_clstr))

plot(pca(api.embeddings))


Nearest to A: W, Z, O,
Nearest to R: I, H, V,
Nearest to N: V, C, T,
Nearest to D: H, T, V,
Nearest to C: F, N, Q,
Nearest to E: K, W, Q,
Nearest to Q: E, V, K,
Nearest to G: X, K, Y,
Nearest to H: D, V, I,
Nearest to I: R, H, B,
Nearest to L: M, F, C,
Nearest to K: E, U, G,
Nearest to M: L, F, Q,
Nearest to F: C, L, M,
Nearest to P: T, S, N,
Nearest to O: B, W, A,
Nearest to U: K, Z, Y,
Nearest to S: E, V, H,
Nearest to T: D, N, P,
Nearest to W: B, A, E,
Nearest to Y: B, U, G,
Nearest to V: H, N, Q,
Nearest to X: G, B, Y,
Nearest to B: W, O, Y,
Nearest to Z: A, U, W,
None
cluster 0: A O W Z
cluster 1: C L M F
cluster 2: N E Q G K U Y V X B
cluster 3: D H P S T
cluster 4: R I
highest similarity: sim(L, M)=0.956542689714
lowest similarity: sim(F, Z)=-0.234628294837
average similarity: 0.514138536987

Evaluate Vocabulary SoftMax


In [4]:
w2v = Word2VecImpl(5)
checkpoint = torch.load('../../data/trained/w2v_sf_best.tar')
w2v.load_state_dict(checkpoint['state_dict'])

api = Word2VecAPI(w2v)

print(stats(api))

print(c_stats(api, n_clstr))

plot(pca(api.embeddings))


Nearest to A: M, V, T,
Nearest to R: E, W, L,
Nearest to N: Y, I, K,
Nearest to D: I, L, M,
Nearest to C: B, G, W,
Nearest to E: K, L, M,
Nearest to Q: L, K, E,
Nearest to G: C, V, M,
Nearest to H: W, L, B,
Nearest to I: D, L, M,
Nearest to L: W, D, I,
Nearest to K: I, E, N,
Nearest to M: V, D, W,
Nearest to F: Y, B, D,
Nearest to P: T, S, H,
Nearest to O: U, A, P,
Nearest to U: O, A, Z,
Nearest to S: B, F, T,
Nearest to T: V, I, D,
Nearest to W: L, M, D,
Nearest to Y: F, D, N,
Nearest to V: M, T, D,
Nearest to X: Z, Q, H,
Nearest to B: F, S, Y,
Nearest to Z: Q, R, X,
None
cluster 0: A C G H M P S T W V
cluster 1: O U B Z
cluster 2: N D I L K F Y
cluster 3: X
cluster 4: R E Q
highest similarity: sim(D, I)=0.99737761576
lowest similarity: sim(N, U)=-0.590581170856
average similarity: 0.58375842981

Evaluate aapred CNN


In [7]:
from src.python.aa_predict import GoodOldCNN

w2v = GoodOldCNN(5, 10)
checkpoint = torch.load('../../data/trained/aapred_cnn_latest.tar', map_location=lambda storage, loc: storage)
w2v.load_state_dict(checkpoint['state_dict'])

api = Word2VecAPI(w2v)

print(stats(api))

print(c_stats(api, n_clstr))

plot(pca(api.embeddings))


Nearest to A: Y, I, M,
Nearest to R: X, I, E,
Nearest to N: E, R, W,
Nearest to D: P, F, Z,
Nearest to C: U, I, L,
Nearest to E: G, V, W,
Nearest to Q: P, M, K,
Nearest to G: E, V, I,
Nearest to H: V, K, B,
Nearest to I: C, R, E,
Nearest to L: C, I, R,
Nearest to K: U, H, V,
Nearest to M: B, Q, P,
Nearest to F: X, Q, S,
Nearest to P: X, D, Q,
Nearest to O: Z, L, W,
Nearest to U: C, Y, K,
Nearest to S: X, K, F,
Nearest to T: V, K, E,
Nearest to W: E, V, I,
Nearest to Y: U, A, P,
Nearest to V: E, H, T,
Nearest to X: P, F, S,
Nearest to B: M, Q, H,
Nearest to Z: D, G, O,
None
cluster 0: R C I L
cluster 1: D Q F P S X
cluster 2: E G K T V
cluster 3: A M Y
cluster 4: N H O U W B Z
highest similarity: sim(C, U)=0.853412758767
lowest similarity: sim(K, O)=-0.926911171186
average similarity: 0.148256318851

Evaluate Gensim


In [10]:
from gensim.models.word2vec import Word2Vec


model_filename = "../../models/kmer/uniprot_1-mer_dim20_win5_mc2.emb"
w2v = Word2Vec.load(model_filename)
print(c_stats(w2v.wv, n_clstr))
print(w2v.similarity('A', 'V'))
print(w2v.similarity('R', 'L'))
print(w2v.similarity('F', 'D'))
print(w2v.similarity('H', 'C'))

model_filename = "../../models/kmer/uniprot_1-mer_dim20_win10_mc2.emb"
w2v = Word2Vec.load(model_filename)
print(c_stats(w2v.wv, n_clstr))
print(w2v.similarity('A', 'V'))
print(w2v.similarity('R', 'L'))
print(w2v.similarity('F', 'D'))
print(w2v.similarity('H', 'C'))


cluster 0: L Y R E K A N F S M T I H Q W D C G P V
cluster 1: Z B
cluster 2: X
cluster 3: U
cluster 4: O
highest similarity: sim(Z, B)=0.961280105136
lowest similarity: sim(U, X)=-0.616557742535
average similarity: 0.528280574761
0.920351996943
0.911896837896
0.915705890587
0.784250521464
cluster 0: E I G M R Q F V P K L D H C Y W N T A S
cluster 1: U
cluster 2: O
cluster 3: X
cluster 4: B Z
highest similarity: sim(F, D)=0.959639293265
lowest similarity: sim(U, X)=-0.741377679027
average similarity: 0.603392330821
0.932760972414
0.894943136196
0.959639293265
0.818763128181
/home/yotamfr/development/prot2vec/virtualenv/lib/python3.6/site-packages/ipykernel_launcher.py:7: DeprecationWarning: Call to deprecated `similarity` (Method will be removed in 4.0.0, use self.wv.similarity() instead).
  import sys
/home/yotamfr/development/prot2vec/virtualenv/lib/python3.6/site-packages/ipykernel_launcher.py:8: DeprecationWarning: Call to deprecated `similarity` (Method will be removed in 4.0.0, use self.wv.similarity() instead).
  
/home/yotamfr/development/prot2vec/virtualenv/lib/python3.6/site-packages/ipykernel_launcher.py:9: DeprecationWarning: Call to deprecated `similarity` (Method will be removed in 4.0.0, use self.wv.similarity() instead).
  if __name__ == '__main__':
/home/yotamfr/development/prot2vec/virtualenv/lib/python3.6/site-packages/ipykernel_launcher.py:10: DeprecationWarning: Call to deprecated `similarity` (Method will be removed in 4.0.0, use self.wv.similarity() instead).
  # Remove the CWD from sys.path while we load stuff.
/home/yotamfr/development/prot2vec/virtualenv/lib/python3.6/site-packages/ipykernel_launcher.py:15: DeprecationWarning: Call to deprecated `similarity` (Method will be removed in 4.0.0, use self.wv.similarity() instead).
  from ipykernel import kernelapp as app
/home/yotamfr/development/prot2vec/virtualenv/lib/python3.6/site-packages/ipykernel_launcher.py:16: DeprecationWarning: Call to deprecated `similarity` (Method will be removed in 4.0.0, use self.wv.similarity() instead).
  app.launch_new_instance()
/home/yotamfr/development/prot2vec/virtualenv/lib/python3.6/site-packages/ipykernel_launcher.py:17: DeprecationWarning: Call to deprecated `similarity` (Method will be removed in 4.0.0, use self.wv.similarity() instead).
/home/yotamfr/development/prot2vec/virtualenv/lib/python3.6/site-packages/ipykernel_launcher.py:18: DeprecationWarning: Call to deprecated `similarity` (Method will be removed in 4.0.0, use self.wv.similarity() instead).

In [11]:
import numpy as np

a = [[1, 2], [3, 4]]
np.lib.pad(a, ((0, 2), (0, 0)), mode='constant', constant_values=0.)


Out[11]:
array([[1, 2],
       [3, 4],
       [0, 0],
       [0, 0]])

In [ ]: