First, let's do word2vec with 'normal' text



In [1]:

    
from gensim.models.word2vec import Word2Vec
from gensim.models import word2vec
import pandas as pd
import re
import string
from nltk.corpus import stopwords
import numpy as np 

STOP = set(stopwords.words("english"))
REMOVE = set(["!","(",")",":",".",";",",",'"',"?","-",">","_"])


df = pd.read_csv('../Goodreads_visualization/goodreads_export.csv')
cleaned_df = df[df["My Rating"] != 0]


html_clean = re.compile('<.*?>')
gr_clean = re.compile('\[.*?\]')

all_my_words = []

reviews = cleaned_df["My Review"]

num_reviews = 0
num_words = 0
for row in reviews:
    if pd.isnull(row):
        continue
    review = row.lower()
    if not review:
        # empty review
        continue
    # clean strings
    cleaned_review = re.sub(html_clean, '', review)
    cleaned_review = re.sub(gr_clean, '', cleaned_review)
    new_review = []
    for x in cleaned_review.split(' '):
        if x in STOP: continue
        if x in REMOVE: continue
        new_review.append(x)
    new_review = ' '.join(new_review)
    
    all_my_words += new_review.split('.')
    num_reviews += 1

let's put these sentences into a dumb text file for the helper, just as an example



In [2]:

    
text = open('simple_text.txt','w')
for element in all_my_words:
    text.write('%s.\n'%element)



In [3]:

    
from gensim.models.word2vec import LineSentence
sentences = LineSentence(text.name) 
model = Word2Vec(sentences, size=100, window=4, min_count=5)



In [4]:

    
vocab = list(model.vocab.keys())
vocab[:10]









    Out[4]:





['confusing',
 '8',
 'anything,',
 'them,',
 'highly',
 'novella',
 'took',
 'history,',
 'escaped',
 'description']



In [5]:

    
model['physics']









    Out[5]:





array([ 0.02590463,  0.05325259,  0.00275839, -0.02335821, -0.0021469 ,
        0.00749045, -0.04389399,  0.01700118, -0.00811095,  0.031775  ,
       -0.02971643, -0.02754352,  0.00948112, -0.01186801, -0.0017621 ,
        0.0011061 , -0.00807445,  0.01756881,  0.061256  ,  0.02924409,
        0.0099775 ,  0.03155666, -0.01586835, -0.01067527, -0.04608267,
        0.02817214,  0.00196514,  0.02461084, -0.00582369,  0.11048271,
        0.02120005, -0.04655095, -0.0654763 ,  0.05405751,  0.05760092,
       -0.00156674, -0.05828641,  0.02093396,  0.02653268, -0.0292282 ,
       -0.06003998, -0.03734865, -0.04022611,  0.01959871,  0.00404786,
       -0.00465561, -0.02315675,  0.05392883, -0.02473005, -0.01621924,
       -0.00241546, -0.00201591, -0.01784706,  0.01396601,  0.01072223,
       -0.00744795,  0.03499661,  0.01595387,  0.05096153,  0.02115347,
       -0.0189    ,  0.00876939,  0.00991265,  0.00485152, -0.07494608,
       -0.03111615,  0.01724795,  0.00794527, -0.00816187, -0.02802069,
       -0.01804733, -0.021119  , -0.03714184, -0.06643627, -0.03084962,
        0.0117041 ,  0.01596848, -0.03023775,  0.04834134, -0.01046323,
        0.02554929,  0.00931484,  0.00155813, -0.00014995, -0.0297344 ,
       -0.02444706,  0.01184977, -0.02338641,  0.01341958, -0.03774538,
       -0.07839838,  0.01400146, -0.00590791,  0.01397337,  0.02610124,
       -0.02448988,  0.02835801, -0.00709735,  0.00664044,  0.01345898], dtype=float32)



In [6]:

    
model.most_similar_cosmul(positive=['nazis', 'japan'])









    Out[6]:





[('the', 0.9963245987892151),
 ('around', 0.9962689280509949),
 ('less', 0.996261715888977),
 ('years', 0.9962410926818848),
 ('things', 0.9962348937988281),
 ('lot', 0.9962292909622192),
 ('need', 0.9962226748466492),
 ('story', 0.9962186813354492),
 ('starts', 0.9962095022201538),
 ('take', 0.996208906173706)]



In [7]:

    
model.similarity('book', 'novel')









    Out[7]:





0.99948381242476159

Exciting! Are other pairs more dissimilar?



In [8]:

    
import random
similarities = []
for i in range(1,100):
    one, other = random.choice(vocab), random.choice(vocab)

    similarities.append(model.similarity(one, other))



In [9]:

    
np.mean(similarities), np.median(similarities)









    Out[9]:





(0.97340578733299954, 0.98706711297229521)

Welp that's not good. I guess the training corpus is too small...



In [10]:

    
model.syn0









    Out[10]:





array([[  3.00974160e-01,   6.87568307e-01,   2.25740969e-02, ...,
         -1.06893435e-01,   8.04017931e-02,   1.47196159e-01],
       [  2.77204245e-01,   6.41609013e-01,   2.73348484e-02, ...,
         -1.00759573e-01,   6.77796975e-02,   1.36412725e-01],
       [  2.74293721e-01,   6.22721732e-01,   2.16171890e-02, ...,
         -1.01019360e-01,   6.91562518e-02,   1.27807602e-01],
       ..., 
       [  1.27614932e-02,   2.32411809e-02,   1.53399154e-03, ...,
         -7.35794660e-03,   3.31353222e-04,   2.68854876e-03],
       [  1.42780505e-02,   2.22719088e-02,  -1.50333496e-03, ...,
         -4.20227501e-04,  -1.96805643e-03,   1.83127099e-03],
       [  1.00261848e-02,   3.60426456e-02,   3.44293192e-03, ...,
         -9.25957877e-03,   4.07849299e-03,   9.70157422e-03]], dtype=float32)

The above are the weights you can use in embeddings

Now let's look at DNA!



In [11]:

    
import dna2vec

First a look at how dna2vec does it:



In [12]:

    
class Learner:
    def __init__(self, out_fileroot, context_halfsize, gensim_iters, vec_dim):
        self.logger = logbook.Logger(self.__class__.__name__)
        assert(word2vec.FAST_VERSION >= 0)
        self.logger.info('word2vec.FAST_VERSION (should be >= 0): {}'.format(word2vec.FAST_VERSION))
        self.model = None
        self.out_fileroot = out_fileroot
        self.context_halfsize = context_halfsize
        self.gensim_iters = gensim_iters
        self.use_skipgram = 1
        self.vec_dim = vec_dim

        self.logger.info('Context window half size: {}'.format(self.context_halfsize))
        self.logger.info('Use skipgram: {}'.format(self.use_skipgram))
        self.logger.info('gensim_iters: {}'.format(self.gensim_iters))
        self.logger.info('vec_dim: {}'.format(self.vec_dim))

    def train(self, kmer_seq_generator):
        self.model = word2vec.Word2Vec(
            sentences=kmer_seq_generator,
            size=self.vec_dim,
            window=self.context_halfsize,
            min_count=5,
            workers=4,
            sg=self.use_skipgram,
            iter=self.gensim_iters)

        # self.logger.info(model.vocab)

    def write_vec(self):
        out_filename = '{}.w2v'.format(self.out_fileroot)
        self.model.save_word2vec_format(out_filename, binary=False)

The trained table looks like this (some random input data from my projects)

1344 12
AAA 0.798623 0.340167 -0.106002 0.479023 -0.512316 -0.204932 -0.909642 0.929776 -0.526895 -0.487418 0.652579 -0.041673
TTT -0.430355 0.507353 0.204868 -0.396864 0.594459 -0.879607 -0.070906 1.065970 -0.216547 0.540595 0.742848 -0.213119
AAAA 0.916474 0.360498 -0.201165 0.450726 -0.627372 -0.232655 -1.043633 1.079020 -0.585594 -0.505746 0.719241 0.046239
TTTT -0.604012 0.578508 0.240181 -0.476954 0.605583 -0.960840 -0.079009 1.184651 -0.243861 0.608393 0.795853 -0.286772
TTTTT -0.625304 0.602461 0.296689 -0.482694 0.649928 -0.997988 -0.065473 1.091690 -0.250700 0.741902 0.868796 -0.313275
AAAAA 1.029531 0.364190 -0.265436 0.437347 -0.723385 -0.299899 -1.087821 1.122777 -0.636950 -0.578345 0.761875 0.069213
ATT -0.490559 0.496848 -0.300972 -0.190906 0.170407 -0.613530 -0.456763 0.833760 -0.632226 0.541257 0.759477 -0.018878
AAT 0.028495 0.360433 -0.458956 0.125764 -0.013972 -0.417648 -0.925049 0.953009 -0.534955 -0.186829 0.600540 0.346013

Now load this into python, and later Keras



In [13]:

    
from gensim.models import word2vec
word_vectors = word2vec.Word2Vec.load_word2vec_format('./dna2vec-20170621-0833-k3to5-12d-4c-6Mbp-sliding-nqR.w2v')



In [14]:

    
word_vectors.syn0









    Out[14]:





array([[ 0.79862303,  0.34016699, -0.106002  , ..., -0.487418  ,
         0.65257901, -0.041673  ],
       [-0.43035501,  0.50735301,  0.204868  , ...,  0.540595  ,
         0.74284798, -0.213119  ],
       [ 0.91647398,  0.36049801, -0.20116501, ..., -0.50574601,
         0.71924102,  0.046239  ],
       ..., 
       [ 0.76107103,  0.83726299, -0.28360999, ...,  0.170651  ,
         0.103405  , -0.71069998],
       [ 0.89855403,  0.8168    , -0.37794799, ...,  0.32508701,
         0.61397099, -0.39332399],
       [ 0.814578  ,  0.86901098, -0.498676  , ...,  0.21636599,
         0.474002  , -0.49750999]], dtype=float32)

That's the weights for Keras!



In [15]:

    
word_vectors.syn0.shape









    Out[15]:





(1344, 12)



In [16]:

    
word_vectors.most_similar(positive=['AAA'], negative=['TTT'])









    Out[16]:





[('AAAAA', 0.6307621002197266),
 ('AAAA', 0.6196656227111816),
 ('AAAGG', 0.597085177898407),
 ('GAAAA', 0.5835936069488525),
 ('AGGAA', 0.5639663338661194),
 ('AAAAG', 0.5594942569732666),
 ('CAAAA', 0.5461493730545044),
 ('GAAA', 0.5357614159584045),
 ('AAAAC', 0.5209873914718628),
 ('GGAAA', 0.5187864303588867)]



In [17]:

    
word_vectors.most_similar(positive=['AAA'])









    Out[17]:





[('AAAA', 0.997001051902771),
 ('AAAAA', 0.9936331510543823),
 ('AATAA', 0.9240623712539673),
 ('TAAAA', 0.9036520719528198),
 ('TAAA', 0.8960134983062744),
 ('GAAAA', 0.8902114629745483),
 ('AAAT', 0.8800619840621948),
 ('AAAAT', 0.8783645629882812),
 ('GAAA', 0.8714821338653564),
 ('CAAAA', 0.869722843170166)]



In [18]:

    
word_vectors.similarity('AGAAT','AAGTA')









    Out[18]:





0.37256660513788192

More promising! At least no 99% as above....

Let's compare with Needleman Wunsch, an alignment algorithm



In [19]:

    
from Bio import pairwise2
pairwise2.align.globalxx("AGAAT", "AAGTA")









    Out[19]:





[('AGAA-T-', '--AAGTA', 3.0, 0, 7),
 ('AGAA-T-', 'A--AGTA', 3.0, 0, 7),
 ('AGAA-T-', 'A-A-GTA', 3.0, 0, 7),
 ('AGAAT-', 'A-AGTA', 3.0, 0, 6),
 ('A-GAAT-', 'AAG--TA', 3.0, 0, 7),
 ('-AGAAT-', 'AAG--TA', 3.0, 0, 7),
 ('AGA--AT', 'A-AGTA-', 3.0, 0, 7),
 ('A-GA-AT', 'AAG-TA-', 3.0, 0, 7),
 ('-AGA-AT', 'AAG-TA-', 3.0, 0, 7),
 ('A-GAAT', 'AAGTA-', 3.0, 0, 6),
 ('-AGAAT', 'AAGTA-', 3.0, 0, 6),
 ('A-G-AAT', 'AAGTA--', 3.0, 0, 7),
 ('-AG-AAT', 'AAGTA--', 3.0, 0, 7)]



In [20]:

    
3.0/7









    Out[20]:





0.42857142857142855

Kind of close? 42 % to 37%....



In [21]:

    
from datasketch import MinHash
import random
nws = []
sims = []
sketches = []
counter = 0
outside_counter = 0

keys = list(word_vectors.vocab.keys())
for i in range(1,1000):
    #print(word_vectors.vocab.keys())
    a = random.choice(keys)
    b = random.choice(keys)
    if a == b: continue
    if len(a) != len(b) != 5: continue
    # get similarity via needleman-wunsch
    new_score = pairwise2.align.globalxx(a, b)[0]
    # ('CCAC--T', '-CA-AAT', 3.0, 0, 7)
    score, length = new_score[2], new_score[-1]
    score += 1

    nws.append(score)

    # get from dna2vec
    similarity = word_vectors.similarity(a, b) # 0.3
    sims.append(similarity)

    # get from minhash
    m1, m2 = MinHash(), MinHash()
    for d in a:
        m1.update(d.encode('utf8'))
    for d in b:
        m2.update(d.encode('utf8'))
    jaccard = m1.jaccard(m2)

    sketches.append(jaccard)



In [22]:

    
import seaborn as sns
import pandas as pd
df = pd.DataFrame({'Needleman-Wunsch':nws, 'word2vec':sims})
%matplotlib inline
sns.violinplot(x='Needleman-Wunsch', y='word2vec', data=df);



In [23]:

    
import scipy
print(scipy.stats.pearsonr(nws, sims), scipy.stats.pearsonr(sketches, sims), scipy.stats.pearsonr(nws, sketches))









    



(0.58097157250417764, 4.240821694880841e-75) (0.37885759750021414, 2.3789707907957134e-29) (0.58060023892518331, 5.5360242715689705e-75)

wat. was expecting something higher. Let's try t-SNE



In [24]:

    
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

X = word_vectors[word_vectors.vocab]
print(X[0:3,0:3])

tsne = TSNE(n_components=2)
X_tsne = tsne.fit_transform(X)
plt.scatter(X_tsne[:, 0], X_tsne[:, 1])
plt.show()









    



[[-0.17905501  1.31028903  0.25527099]
 [ 0.29710999  1.24941504 -0.51987797]
 [ 0.248716    1.53722298 -0.417088  ]]

Let's load the pretrained huge wordvectors from the paper (trained on human genome)



In [25]:

    
word_vectors = word2vec.Word2Vec.load_word2vec_format('./dna2vec/pretrained/dna2vec-20161219-0153-k3to8-100d-10c-29320Mbp-sliding-Xat.w2v')



In [34]:

    
from sklearn.metrics.pairwise import cosine_similarity

nws = []
sims = []
sketches = []
cosines = []


keys = list(word_vectors.vocab.keys())
for i in range(1,1000):
    #print(word_vectors.vocab.keys())
    a = random.choice(keys)
    b = random.choice(keys)
    if a == b:
        continue
    if len(a) + len(b) != 16:
        continue

    # get similarity via needleman-wunsch
    new_score = pairwise2.align.globalxx(a, b)[0]
    # ('CCAC--T', '-CA-AAT', 3.0, 0, 7)
    score, length = new_score[2], new_score[-1]
    new_score = score + 1
    nws.append(new_score)

    # get from cosine similarity
    cosine = cosine_similarity(word_vectors[a].reshape(1,-1), word_vectors[b].reshape(1,-1))[0][0]
    cosines.append(cosine)

    # get from dna2vec
    similarity = word_vectors.similarity(a, b) # 0.3
    sims.append(similarity)

    # get from minhash
    m1, m2 = MinHash(), MinHash()
    for d in a:
        m1.update(d.encode('utf8'))
    for d in b:
        m2.update(d.encode('utf8'))
    jaccard = m1.jaccard(m2)

    sketches.append(jaccard)
    counter += 1


print('''Needleman Wunsch vs word2vec similarities: %s, 
      MinHash vs word2vec similarities: %s,
      Needleman Wunsch vs MinHash: %s,
      MinHash vs. Cosine Similarity: %s,
      Needleman Wunsch vs Cosine Similarity: %s,
      word2vec vs Cosine Similarity: %s'''%(
      scipy.stats.pearsonr(nws, sims)[0], 
      scipy.stats.pearsonr(sketches, sims)[0], 
      scipy.stats.pearsonr(nws, sketches)[0], 
      scipy.stats.pearsonr(sketches, cosines)[0],
      scipy.stats.pearsonr(nws, cosines)[0],
      scipy.stats.pearsonr(sims, cosines)[0]
))

df = pd.DataFrame({'Needleman-Wunsch':nws, 'word2vec':sims})
sns.violinplot(x='Needleman-Wunsch', y='word2vec', data=df);









    



Needleman Wunsch vs word2vec similarities: 0.549490891705, 
      MinHash vs word2vec similarities: -0.0298490315182,
      Needleman Wunsch vs MinHash: 0.170673690321,
      MinHash vs. Cosine Similarity: -0.029849041001,
      Needleman Wunsch vs Cosine Similarity: 0.54949089627,
      word2vec vs Cosine Similarity: 0.999999996318

Now check the first NEIGHBOR only



In [27]:

    
counter = 0
nws = []
w2v = []
sketches = []

for a in word_vectors.vocab:
    if len(a) != 8: continue
        
    neighbors = word_vectors.most_similar_cosmul(a)
    neighbor = ''
    for  n in neighbors:
        # n = ('CTTTCCT', 0.929522693157196)
        if len(n[0]) == len(a):
            neighbor = n
            break
    if not neighbor: 
        continue
    
    new_score = pairwise2.align.globalxx(a, neighbor[0])[0]
    # ('CCAC--T', '-CA-AAT', 3.0, 0, 7)
    score, length = new_score[2], new_score[-1]
    score += 1
    if counter < 5:
        print('Sequence %s Neighbor %s'%(a, neighbor))
        print('Needleman Wunsch: %s'%score)
        print('-------')
    nws.append(score)
    other_score = neighbor[1]
    w2v.append(other_score)
    
    # get from minhash
    m1, m2 = MinHash(), MinHash()
    for d in a:
        m1.update(d.encode('utf8'))
    for d in neighbor[0]:
        m2.update(d.encode('utf8'))
    jaccard = m1.jaccard(m2)

    sketches.append(float('%.1f'%jaccard))
    
    if counter == 1000:
        break
    counter += 1

scipy.stats.spearmanr(nws, w2v)









    



Sequence AGATGCAC Neighbor ('GATGCACA', 0.8849484920501709)
Needleman Wunsch: 8.0
-------
Sequence CTGCATAC Neighbor ('CTGCATAT', 0.8692834377288818)
Needleman Wunsch: 8.0
-------
Sequence GTATGATA Neighbor ('AGTATGAT', 0.8735715746879578)
Needleman Wunsch: 8.0
-------
Sequence CGGGGGGG Neighbor ('CGGCGGGG', 0.9540440440177917)
Needleman Wunsch: 8.0
-------
Sequence ATAGGGGC Neighbor ('ATAGGGGG', 0.9357842206954956)
Needleman Wunsch: 8.0
-------






    Out[27]:





SpearmanrResult(correlation=0.0049777762026951312, pvalue=0.87501315933465107)



In [28]:

    
df = pd.DataFrame({'Needleman-Wunsch':nws, 'dna2vec cosine':w2v, 'MinHash':sketches})
sns.violinplot(x='Needleman-Wunsch', y='dna2vec cosine', data=df, scale='count');



In [29]:

    
g = sns.violinplot(x='MinHash', y='dna2vec cosine', data=df, scale='count');

Niiiiiiiiiice. So for close neighbors MinHash and Needleman-Wunsch seem to correlate, but not for distant neighbors, interesting



In [30]:

    
df.describe()









    Out[30]:






  
    
      
      MinHash
      Needleman-Wunsch
      dna2vec cosine
    
  
  
    
      count
      1001.000000
      1001.000000
      1001.000000
    
    
      mean
      0.973826
      7.949051
      0.892468
    
    
      std
      0.079714
      0.228914
      0.022032
    
    
      min
      0.600000
      6.000000
      0.844782
    
    
      25%
      1.000000
      8.000000
      0.875647
    
    
      50%
      1.000000
      8.000000
      0.890206
    
    
      75%
      1.000000
      8.000000
      0.907552
    
    
      max
      1.000000
      8.000000
      0.975248

Welp that's weird. By now I think that neighbors in the dna2vec space don't necessarily have to be related by their bases.

TO CHECK:

what are the neighbors that have low NW scores? Are they functionally related, or are their sequences conserved? Can I find conservation islands this way?
why is it that close neighbors have better NW/MinHash score correlation than distant neighbors?



In [31]:

    
weights = word_vectors.syn0

from keras.layers import Embedding
from keras.engine import Input

layer = Embedding(input_dim=weights.shape[0], output_dim=weights.shape[1], weights=[weights], trainable=False)
layer









    



Using TensorFlow backend.






    Out[31]:





<keras.layers.embeddings.Embedding at 0x7ff893925c88>

IMPORTANT to set trainable to False, else Keras is allowed to change our word2vec input weights!

We can either use an LSTM or convolutional layers here.

https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html



In [ ]:

	MinHash	Needleman-Wunsch	dna2vec cosine
count	1001.000000	1001.000000	1001.000000
mean	0.973826	7.949051	0.892468
std	0.079714	0.228914	0.022032
min	0.600000	6.000000	0.844782
25%	1.000000	8.000000	0.875647
50%	1.000000	8.000000	0.890206
75%	1.000000	8.000000	0.907552
max	1.000000	8.000000	0.975248