First, let's do word2vec with 'normal' text
In [1]:
from gensim.models.word2vec import Word2Vec
from gensim.models import word2vec
import pandas as pd
import re
import string
from nltk.corpus import stopwords
import numpy as np
STOP = set(stopwords.words("english"))
REMOVE = set(["!","(",")",":",".",";",",",'"',"?","-",">","_"])
df = pd.read_csv('../Goodreads_visualization/goodreads_export.csv')
cleaned_df = df[df["My Rating"] != 0]
html_clean = re.compile('<.*?>')
gr_clean = re.compile(r'\[.*?\]')
all_my_words = []
reviews = cleaned_df["My Review"]
num_reviews = 0
num_words = 0
for row in reviews:
    if pd.isnull(row):
        continue
    review = row.lower()
    if not review:
        # empty review
        continue
    # clean strings
    cleaned_review = re.sub(html_clean, '', review)
    cleaned_review = re.sub(gr_clean, '', cleaned_review)
    new_review = []
    for x in cleaned_review.split(' '):
        if x in STOP: continue
        if x in REMOVE: continue
        new_review.append(x)
    new_review = ' '.join(new_review)
    all_my_words += new_review.split('.')
    num_reviews += 1
Let's put these sentences into a plain text file for gensim's LineSentence helper, just as an example
In [2]:
with open('simple_text.txt', 'w') as text:
    for element in all_my_words:
        text.write('%s.\n' % element)
In [3]:
from gensim.models.word2vec import LineSentence
sentences = LineSentence(text.name)
model = Word2Vec(sentences, size=100, window=4, min_count=5)
In [4]:
vocab = list(model.vocab.keys())
vocab[:10]
Out[4]:
In [5]:
model['physics']
Out[5]:
In [6]:
model.most_similar_cosmul(positive=['nazis', 'japan'])
Out[6]:
In [7]:
model.similarity('book', 'novel')
Out[7]:
Exciting! Are other pairs more dissimilar?
In [8]:
import random
similarities = []
for i in range(1, 100):
    one, other = random.choice(vocab), random.choice(vocab)
    similarities.append(model.similarity(one, other))
In [9]:
np.mean(similarities), np.median(similarities)
Out[9]:
Welp, that's not good. I guess the training corpus is too small...
In [10]:
model.syn0
Out[10]:
The above are the weights you can use in an embedding layer
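To use them you also need to know which row of syn0 belongs to which word; in this (pre-1.0) gensim API, every entry in model.vocab carries its row index. A minimal sketch of my own, not part of the original notebook:

# Map each word to its row in model.syn0 (old-style gensim API, as used above)
word2idx = {word: voc.index for word, voc in model.vocab.items()}
idx = word2idx['physics']
# the row in syn0 is the same vector that model['physics'] returns above
assert np.allclose(model.syn0[idx], model['physics'])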
Now let's look at DNA!
In [11]:
import dna2vec
First, a look at how dna2vec does it:
In [12]:
class Learner:
    def __init__(self, out_fileroot, context_halfsize, gensim_iters, vec_dim):
        self.logger = logbook.Logger(self.__class__.__name__)
        assert(word2vec.FAST_VERSION >= 0)
        self.logger.info('word2vec.FAST_VERSION (should be >= 0): {}'.format(word2vec.FAST_VERSION))
        self.model = None
        self.out_fileroot = out_fileroot
        self.context_halfsize = context_halfsize
        self.gensim_iters = gensim_iters
        self.use_skipgram = 1
        self.vec_dim = vec_dim
        self.logger.info('Context window half size: {}'.format(self.context_halfsize))
        self.logger.info('Use skipgram: {}'.format(self.use_skipgram))
        self.logger.info('gensim_iters: {}'.format(self.gensim_iters))
        self.logger.info('vec_dim: {}'.format(self.vec_dim))

    def train(self, kmer_seq_generator):
        self.model = word2vec.Word2Vec(
            sentences=kmer_seq_generator,
            size=self.vec_dim,
            window=self.context_halfsize,
            min_count=5,
            workers=4,
            sg=self.use_skipgram,
            iter=self.gensim_iters)
        # self.logger.info(model.vocab)

    def write_vec(self):
        out_filename = '{}.w2v'.format(self.out_fileroot)
        self.model.save_word2vec_format(out_filename, binary=False)
The trained table looks like this (some random input data from my projects). The first line gives the vocabulary size and the vector dimension; each following row is a k-mer and its vector:
1344 12
AAA 0.798623 0.340167 -0.106002 0.479023 -0.512316 -0.204932 -0.909642 0.929776 -0.526895 -0.487418 0.652579 -0.041673
TTT -0.430355 0.507353 0.204868 -0.396864 0.594459 -0.879607 -0.070906 1.065970 -0.216547 0.540595 0.742848 -0.213119
AAAA 0.916474 0.360498 -0.201165 0.450726 -0.627372 -0.232655 -1.043633 1.079020 -0.585594 -0.505746 0.719241 0.046239
TTTT -0.604012 0.578508 0.240181 -0.476954 0.605583 -0.960840 -0.079009 1.184651 -0.243861 0.608393 0.795853 -0.286772
TTTTT -0.625304 0.602461 0.296689 -0.482694 0.649928 -0.997988 -0.065473 1.091690 -0.250700 0.741902 0.868796 -0.313275
AAAAA 1.029531 0.364190 -0.265436 0.437347 -0.723385 -0.299899 -1.087821 1.122777 -0.636950 -0.578345 0.761875 0.069213
ATT -0.490559 0.496848 -0.300972 -0.190906 0.170407 -0.613530 -0.456763 0.833760 -0.632226 0.541257 0.759477 -0.018878
AAT 0.028495 0.360433 -0.458956 0.125764 -0.013972 -0.417648 -0.925049 0.953009 -0.534955 -0.186829 0.600540 0.346013
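Just to make that format explicit, here is a tiny hand-rolled parser (an illustrative sketch of mine; gensim's load_word2vec_format below does the same job):

# Illustrative sketch: parse the word2vec text format shown above by hand
def read_w2v_text(path):
    vectors = {}
    with open(path) as fh:
        # header line: vocabulary size and vector dimension, e.g. "1344 12"
        n_words, dim = map(int, fh.readline().split())
        for line in fh:
            parts = line.rstrip().split(' ')
            vectors[parts[0]] = np.array(parts[1:], dtype=float)
    assert len(vectors) == n_words
    return vectors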
Now load this into Python, and later into Keras
In [13]:
from gensim.models import word2vec
word_vectors = word2vec.Word2Vec.load_word2vec_format('./dna2vec-20170621-0833-k3to5-12d-4c-6Mbp-sliding-nqR.w2v')
In [14]:
word_vectors.syn0
Out[14]:
Those are the weights for Keras!
In [15]:
word_vectors.syn0.shape
Out[15]:
In [16]:
word_vectors.most_similar(positive=['AAA'], negative=['TTT'])
Out[16]:
In [17]:
word_vectors.most_similar(positive=['AAA'])
Out[17]:
In [18]:
word_vectors.similarity('AGAAT','AAGTA')
Out[18]:
More promising! At least it's not 99% similarity everywhere, as above....
Let's compare with Needleman-Wunsch, an alignment algorithm
In [19]:
from Bio import pairwise2
pairwise2.align.globalxx("AGAAT", "AAGTA")
Out[19]:
In [20]:
3.0/7
Out[20]:
Kind of close? 42% vs. 37%....
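To put the Needleman-Wunsch score on a 0-1 scale roughly comparable to a cosine similarity, one option is to normalise the score by the alignment length. A small helper along those lines (my sketch, not part of the original analysis):

# Hypothetical helper: NW score divided by alignment length
def nw_identity(a, b):
    aln = pairwise2.align.globalxx(a, b)[0]
    # aln looks like ('CCAC--T', '-CA-AAT', 3.0, 0, 7):
    # aln[2] is the score, aln[-1] the alignment length
    return aln[2] / aln[-1]

nw_identity('AGAAT', 'AAGTA')  # ~0.43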
In [21]:
from datasketch import MinHash
import random
nws = []
sims = []
sketches = []
counter = 0
outside_counter = 0
keys = list(word_vectors.vocab.keys())
for i in range(1, 1000):
    a = random.choice(keys)
    b = random.choice(keys)
    if a == b: continue
    # only compare pairs of 5-mers
    if not (len(a) == len(b) == 5): continue
    # get similarity via needleman-wunsch
    new_score = pairwise2.align.globalxx(a, b)[0]
    # ('CCAC--T', '-CA-AAT', 3.0, 0, 7)
    score, length = new_score[2], new_score[-1]
    score += 1
    nws.append(score)
    # get from dna2vec
    similarity = word_vectors.similarity(a, b)  # e.g. 0.3
    sims.append(similarity)
    # get from minhash (hashing the individual bases of each k-mer)
    m1, m2 = MinHash(), MinHash()
    for d in a:
        m1.update(d.encode('utf8'))
    for d in b:
        m2.update(d.encode('utf8'))
    jaccard = m1.jaccard(m2)
    sketches.append(jaccard)
In [22]:
import seaborn as sns
import pandas as pd
df = pd.DataFrame({'Needleman-Wunsch':nws, 'word2vec':sims})
%matplotlib inline
sns.violinplot(x='Needleman-Wunsch', y='word2vec', data=df);
In [23]:
import scipy
print(scipy.stats.pearsonr(nws, sims), scipy.stats.pearsonr(sketches, sims), scipy.stats.pearsonr(nws, sketches))
Wat. I was expecting something higher. Let's try t-SNE
In [24]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
X = word_vectors[word_vectors.vocab]
print(X[0:3,0:3])
tsne = TSNE(n_components=2)
X_tsne = tsne.fit_transform(X)
plt.scatter(X_tsne[:, 0], X_tsne[:, 1])
plt.show()
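As a quick extra sketch of mine (not in the original run), the same layout can be coloured by k-mer length to see whether length dominates the structure:

# Colour the t-SNE points by k-mer length; the order matches X above
# because both iterate the same vocab dict
lengths = [len(kmer) for kmer in word_vectors.vocab]
plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=lengths, s=10)
plt.colorbar(label='k-mer length')
plt.show()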
Let's load the huge pretrained word vectors from the paper (trained on the human genome)
In [25]:
word_vectors = word2vec.Word2Vec.load_word2vec_format('./dna2vec/pretrained/dna2vec-20161219-0153-k3to8-100d-10c-29320Mbp-sliding-Xat.w2v')
In [34]:
from sklearn.metrics.pairwise import cosine_similarity
nws = []
sims = []
sketches = []
cosines = []
keys = list(word_vectors.vocab.keys())
for i in range(1, 1000):
    a = random.choice(keys)
    b = random.choice(keys)
    if a == b:
        continue
    # only keep pairs whose lengths sum to 16, i.e. two 8-mers (the model is k3to8)
    if len(a) + len(b) != 16:
        continue
    # get similarity via needleman-wunsch
    new_score = pairwise2.align.globalxx(a, b)[0]
    # ('CCAC--T', '-CA-AAT', 3.0, 0, 7)
    score, length = new_score[2], new_score[-1]
    new_score = score + 1
    nws.append(new_score)
    # get from cosine similarity
    cosine = cosine_similarity(word_vectors[a].reshape(1, -1), word_vectors[b].reshape(1, -1))[0][0]
    cosines.append(cosine)
    # get from dna2vec
    similarity = word_vectors.similarity(a, b)  # e.g. 0.3
    sims.append(similarity)
    # get from minhash (hashing the individual bases of each k-mer)
    m1, m2 = MinHash(), MinHash()
    for d in a:
        m1.update(d.encode('utf8'))
    for d in b:
        m2.update(d.encode('utf8'))
    jaccard = m1.jaccard(m2)
    sketches.append(jaccard)
    counter += 1
print('''Needleman Wunsch vs word2vec similarities: %s,
MinHash vs word2vec similarities: %s,
Needleman Wunsch vs MinHash: %s,
MinHash vs. Cosine Similarity: %s,
Needleman Wunsch vs Cosine Similarity: %s,
word2vec vs Cosine Similarity: %s'''%(
scipy.stats.pearsonr(nws, sims)[0],
scipy.stats.pearsonr(sketches, sims)[0],
scipy.stats.pearsonr(nws, sketches)[0],
scipy.stats.pearsonr(sketches, cosines)[0],
scipy.stats.pearsonr(nws, cosines)[0],
scipy.stats.pearsonr(sims, cosines)[0]
))
df = pd.DataFrame({'Needleman-Wunsch':nws, 'word2vec':sims})
sns.violinplot(x='Needleman-Wunsch', y='word2vec', data=df);
Now check the first NEIGHBOR only
In [27]:
counter = 0
nws = []
w2v = []
sketches = []
for a in word_vectors.vocab:
    if len(a) != 8: continue
    neighbors = word_vectors.most_similar_cosmul(a)
    neighbor = ''
    for n in neighbors:
        # n = ('CTTTCCT', 0.929522693157196)
        if len(n[0]) == len(a):
            neighbor = n
            break
    if not neighbor:
        continue
    new_score = pairwise2.align.globalxx(a, neighbor[0])[0]
    # ('CCAC--T', '-CA-AAT', 3.0, 0, 7)
    score, length = new_score[2], new_score[-1]
    score += 1
    if counter < 5:
        print('Sequence %s Neighbor %s' % (a, neighbor))
        print('Needleman Wunsch: %s' % score)
        print('-------')
    nws.append(score)
    other_score = neighbor[1]
    w2v.append(other_score)
    # get from minhash
    m1, m2 = MinHash(), MinHash()
    for d in a:
        m1.update(d.encode('utf8'))
    for d in neighbor[0]:
        m2.update(d.encode('utf8'))
    jaccard = m1.jaccard(m2)
    sketches.append(float('%.1f' % jaccard))
    if counter == 1000:
        break
    counter += 1

scipy.stats.spearmanr(nws, w2v)
Out[27]:
In [28]:
df = pd.DataFrame({'Needleman-Wunsch':nws, 'dna2vec cosine':w2v, 'MinHash':sketches})
sns.violinplot(x='Needleman-Wunsch', y='dna2vec cosine', data=df, scale='count');
In [29]:
g = sns.violinplot(x='MinHash', y='dna2vec cosine', data=df, scale='count');
Niiiiiiiiiice. So for close neighbors MinHash and Needleman-Wunsch seem to correlate, but not for distant neighbors. Interesting
In [30]:
df.describe()
Out[30]:
Welp, that's weird. By now I think that neighbors in the dna2vec space don't necessarily have to be related by their bases.
TO CHECK:
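One way to check this (a sketch of mine, not run here): for a sample of 8-mers, take each one's top equal-length dna2vec neighbour and see whether a high cosine similarity actually implies a similar sequence, e.g. via Hamming distance:

# Sketch: do nearest dna2vec neighbours actually share bases?
def hamming(a, b):
    # positions at which two equal-length k-mers differ
    return sum(x != y for x, y in zip(a, b))

hammings, cosims = [], []
for a in list(word_vectors.vocab)[:500]:
    if len(a) != 8:
        continue
    for neighbour, cosim in word_vectors.most_similar(a, topn=20):
        if len(neighbour) == len(a):
            hammings.append(hamming(a, neighbour))
            cosims.append(cosim)
            break
scipy.stats.spearmanr(hammings, cosims)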
In [31]:
weights = word_vectors.syn0
from keras.layers import Embedding
from keras.engine import Input
layer = Embedding(input_dim=weights.shape[0], output_dim=weights.shape[1], weights=[weights], trainable=False)
layer
Out[31]:
IMPORTANT to set trainable to False, else Keras is allowed to change our word2vec input weights!
We can use either an LSTM or convolutional layers here; a rough sketch of the former follows the link below.
https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html
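As a minimal sketch of my own (the task, sequence length and layer sizes are placeholders, assuming e.g. a binary classification problem over sequences of 50 k-mer tokens):

# Toy model on top of the frozen dna2vec embedding (illustrative only)
from keras.models import Model
from keras.layers import LSTM, Dense

seq_len = 50  # hypothetical number of k-mer tokens per input sequence
inputs = Input(shape=(seq_len,), dtype='int32')
x = layer(inputs)  # the frozen Embedding defined above
x = LSTM(32)(x)    # could equally be convolutional layers plus pooling
outputs = Dense(1, activation='sigmoid')(x)
model = Model(inputs, outputs)
model.compile(optimizer='adam', loss='binary_crossentropy')
model.summary()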