In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf

import pickle
import os
import random

from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

In [2]:
!! unzip data.zip


Out[2]:
['Archive:  data.zip',
 '   creating: data/',
 '  inflating: data/nice_embed_tsne.csv  ',
 '  inflating: data/acid_properties.csv  ',
 '  inflating: data/family_classification_sequences.tab  ',
 '  inflating: data/family_classification_metadata.tab  ']

In [3]:
seq_df = pd.read_table('data/family_classification_sequences.tab')
seq_df.head()


Out[3]:
Sequences
0 MAFSAEDVLKEYDRRRRMEALLLSLYYPNDRKLLDYKEWSPPRVQV...
1 MSIIGATRLQNDKSDTYSAGPCYAGGCSAFTPRGTCGKDWDLGEQT...
2 MQNPLPEVMSPEHDKRTTTPMSKEANKFIRELDKKPGDLAVVSDFV...
3 MDSLNEVCYEQIKGTFYKGLFGDFPLIVDKKTGCFNATKLCVLGGK...
4 MEAKNITIDNTTYNFFKFYNINQPLTNLKYLNSERLCFSNAVMGKI...

In [4]:
def make_codones(sseq):
    crop = len(sseq) % 3
    cropped_seq = sseq[:-crop] if crop > 0 else sseq

    return [cropped_seq[i:i+3] for i in range(0, len(cropped_seq), 3)]

def seq_to3(seq):
    splittings = [make_codones(seq[i:]) for i in range(3)]
    return splittings

def create_all_codones(df):
    codones = []

    for i in range(df.shape[0]):
        row = df.iloc[i, 0]  # the sequence string (first and only column)
        codones.extend(seq_to3(row))
    return codones
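
A quick illustration of the splitting on the first seven letters of the first sequence above: seq_to3 returns the three reading frames, each cut into 3-letter codons with any trailing remainder dropped.

frames = seq_to3('MAFSAED')
# frames[0] == ['MAF', 'SAE']
# frames[1] == ['AFS', 'AED']
# frames[2] == ['FSA']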

In [5]:
def read_or_create(read_path, producer):
    if os.path.isfile(read_path):
        print('reading', read_path)
        with open(read_path, 'rb') as fp:
            return pickle.load(fp)
    result = producer()
    print('saving', read_path)
    with open(read_path, 'wb') as fp:
        pickle.dump(result, fp)
    return result

In [ ]:
all_codones = read_or_create(read_path='data/all_codones.pickle',
                             producer= lambda: create_all_codones(seq_df))

In [ ]:
######################

In [9]:
def generate_sample(index_words_list, context_window_size):
    """ Form training pairs according to the skip-gram model. """
    for index_words in index_words_list:
        for index, center in enumerate(index_words):
            context = random.randint(1, context_window_size)
            # get a random target before the center word
            for target in index_words[max(0, index - context): index]:
                yield center, target
            # get a random target after the center word
            for target in index_words[index + 1: index + context + 1]:
                yield center, target


def get_batch(iterator, batch_size):
    """ Group a numerical stream into batches and yield them as Numpy arrays. """
    while True:
        center_batch = np.zeros(batch_size, dtype=np.int32)
        target_batch = np.zeros([batch_size, 1])
        for index in range(batch_size):
            center_batch[index], target_batch[index] = next(iterator)
        yield center_batch, target_batch


def flatten(x):
    return [item for sublist in x for item in sublist]


def cod_to_dict(cod, dictionary):
    # map a list of codons to their integer vocabulary indices
    return [dictionary[key] for key in cod]


def process_data(all_codones, batch_size, skip_window):
    flat_codones = flatten(all_codones)
    unique_codones = set(flat_codones)
    # NB: set order is not stable across Python sessions, so this index
    # assignment must be kept together with any checkpoints trained against it
    dictionary = {cod: i for i, cod in enumerate(unique_codones)}
    cod_dicts = [cod_to_dict(cod, dictionary) for cod in all_codones]
    single_gen = generate_sample(cod_dicts, context_window_size=skip_window)
    batch_gen = get_batch(single_gen, batch_size=batch_size)
    return batch_gen, dictionary
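
A small sanity check of the pair generator (illustrative toy input; with a window of 1 the randomly drawn context size is always 1, so the output is deterministic):

pairs = list(generate_sample([[0, 1, 2, 3]], context_window_size=1))
# pairs == [(0, 1), (1, 0), (1, 2), (2, 1), (2, 3), (3, 2)]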

In [10]:
BATCH_SIZE = 128
SKIP_WINDOW = 12  # the context window

batch_gen, dictionary = process_data(all_codones, BATCH_SIZE, SKIP_WINDOW)
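
To see what the model will be fed, one batch can be pulled from the generator (note that this advances it by BATCH_SIZE pairs):

centers, targets = next(batch_gen)
# centers.shape == (128,)     int32 indices of center codons
# targets.shape == (128, 1)   indices of their context codons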

In [ ]:
######################

In [12]:
class SkipGramModel:
    """ Build the graph for word2vec model """

    def __init__(self, vocab_size, embed_size, batch_size, num_sampled, learning_rate):
        self.vocab_size = vocab_size
        self.embed_size = embed_size
        self.batch_size = batch_size
        self.num_sampled = num_sampled
        self.lr = learning_rate
        self.global_step = tf.Variable(0, dtype=tf.int32, trainable=False, name='global_step')

    def _create_placeholders(self):
        with tf.name_scope("data"):
            self.center_words = tf.placeholder(tf.int32, shape=[self.batch_size], name='center_words')
            self.target_words = tf.placeholder(tf.int32, shape=[self.batch_size, 1], name='target_words')

    def _create_embedding(self):
        with tf.name_scope("embed"):
            self.embed_matrix = tf.Variable(tf.random_uniform([self.vocab_size,
                                                               self.embed_size], -1.0, 1.0),
                                            name='embed_matrix')

    def _create_loss(self):
        with tf.device('/cpu:0'):
            with tf.name_scope("loss"):
                embed = tf.nn.embedding_lookup(self.embed_matrix, self.center_words, name='embed')

                # construct variables for NCE loss
                nce_weight = tf.Variable(tf.truncated_normal([self.vocab_size, self.embed_size],
                                                             stddev=1.0 / (self.embed_size ** 0.5)),
                                         name='nce_weight')
                nce_bias = tf.Variable(tf.zeros([self.vocab_size]), name='nce_bias')

                # define loss function to be NCE loss function
                self.loss = tf.reduce_mean(tf.nn.nce_loss(weights=nce_weight,
                                                          biases=nce_bias,
                                                          labels=self.target_words,
                                                          inputs=embed,
                                                          num_sampled=self.num_sampled,
                                                          num_classes=self.vocab_size), name='loss')

    def _create_optimizer(self):
        self.optimizer = tf.train.GradientDescentOptimizer(self.lr).minimize(self.loss,
                                                                                 global_step=self.global_step)

    def _create_summaries(self):
        with tf.name_scope("summaries"):
            tf.summary.scalar("loss", self.loss)
            tf.summary.histogram("histogram loss", self.loss)
            # because we have several summaries, merge them all
            # into one op to make them easier to manage
            self.summary_op = tf.summary.merge_all()

    def build_graph(self):
        """ Build the graph for our model """
        self._create_placeholders()
        self._create_embedding()
        self._create_loss()
        self._create_optimizer()
        self._create_summaries()

In [13]:
VOCAB_SIZE = 9424
EMBED_SIZE = 100  # dimension of the word embedding vectors
NUM_SAMPLED = 5  # Number of negative examples to sample.
LEARNING_RATE = 0.8
NUM_TRAIN_STEPS = 100000
SKIP_STEP = 2000

model = SkipGramModel(VOCAB_SIZE, EMBED_SIZE, BATCH_SIZE, NUM_SAMPLED, LEARNING_RATE)
model.build_graph()


INFO:tensorflow:Summary name histogram loss is illegal; using histogram_loss instead.
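
VOCAB_SIZE is hard-coded above; it should equal the size of the dictionary built by process_data, which is easy to check:

# a more robust alternative to the hard-coded constant would be
# VOCAB_SIZE = len(dictionary)
assert VOCAB_SIZE == len(dictionary), (VOCAB_SIZE, len(dictionary))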

In [ ]:
######################

In [14]:
def make_dir(path):
    """ Create a directory if there isn't one already. """
    try:
        os.mkdir(path)
    except OSError:
        pass
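
On Python 3 the same effect is available as a one-liner, e.g. for the checkpoint directory used below:

os.makedirs('checkpoints', exist_ok=True)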

In [15]:
def train_model(model, batch_gen, num_train_steps, learning_rate, skip_step):
    saver = tf.train.Saver()  # defaults to saving all variables - in this case embed_matrix, nce_weight, nce_bias

    make_dir('checkpoints')
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        ckpt = tf.train.get_checkpoint_state(os.path.dirname('checkpoints/checkpoint'))
        # if that checkpoint exists, restore from checkpoint
        if ckpt and ckpt.model_checkpoint_path:
            saver.restore(sess, ckpt.model_checkpoint_path)

        total_loss = 0.0  # used to report the average loss over the last SKIP_STEP steps
        writer = tf.summary.FileWriter('improved_graph/lr' + str(learning_rate), sess.graph)
        initial_step = model.global_step.eval()
        for index in range(initial_step, initial_step + num_train_steps):
            centers, targets = next(batch_gen)
            feed_dict = {model.center_words: centers, model.target_words: targets}
            loss_batch, _, summary = sess.run([model.loss, model.optimizer, model.summary_op],
                                              feed_dict=feed_dict)
            writer.add_summary(summary, global_step=index)
            total_loss += loss_batch
            if (index + 1) % skip_step == 0:
                print('Average loss at step {}: {:5.1f}'.format(index, total_loss / skip_step))
                total_loss = 0.0
                saver.save(sess, 'checkpoints/skip-gram', index)

        final_embed_matrix = sess.run(model.embed_matrix)
        return final_embed_matrix

In [16]:
final_embed_matrix = train_model(model, batch_gen, NUM_TRAIN_STEPS, LEARNING_RATE, SKIP_STEP)


Average loss at step 307999:   2.6
Average loss at step 309999:   2.1
Average loss at step 311999:   1.5
Average loss at step 313999:   2.0
Average loss at step 315999:   1.2
Average loss at step 317999:   1.7
Average loss at step 319999:   1.0
Average loss at step 321999:   2.1
Average loss at step 323999:   1.9
Average loss at step 325999:   2.0
Average loss at step 327999:   2.0
Average loss at step 329999:   2.0
Average loss at step 331999:   1.9
Average loss at step 333999:   1.8
Average loss at step 335999:   1.8
Average loss at step 337999:   1.9
Average loss at step 339999:   1.9
Average loss at step 341999:   1.3
Average loss at step 343999:   1.9
Average loss at step 345999:   1.9
Average loss at step 347999:   1.8
Average loss at step 349999:   1.9
Average loss at step 351999:   1.6
Average loss at step 353999:   1.8
Average loss at step 355999:   1.8
Average loss at step 357999:   1.4
Average loss at step 359999:   1.3
Average loss at step 361999:   1.1
Average loss at step 363999:   1.7
Average loss at step 365999:   1.9
Average loss at step 367999:   0.9
Average loss at step 369999:   1.0
Average loss at step 371999:   0.6
Average loss at step 373999:   1.5
Average loss at step 375999:   2.0
Average loss at step 377999:   1.9
Average loss at step 379999:   1.9
Average loss at step 381999:   1.8
Average loss at step 383999:   1.7
Average loss at step 385999:   1.8
Average loss at step 387999:   1.9
Average loss at step 389999:   1.8
Average loss at step 391999:   1.8
Average loss at step 393999:   1.8
Average loss at step 395999:   1.9
Average loss at step 397999:   1.8
Average loss at step 399999:   1.8
Average loss at step 401999:   1.4
Average loss at step 403999:   1.8
Average loss at step 405999:   1.9

In [11]:
######################

In [17]:
tsne = TSNE(n_components=2, random_state=42)
XX = tsne.fit_transform(final_embed_matrix)
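
t-SNE on a roughly 9400 x 100 matrix can be slow; a common speed-up (an optional variation, not used here) is to pre-reduce the embeddings with PCA first:

from sklearn.decomposition import PCA

reduced = PCA(n_components=30, random_state=42).fit_transform(final_embed_matrix)
XX = TSNE(n_components=2, random_state=42).fit_transform(reduced)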

In [18]:
tsne_df = pd.DataFrame(XX, columns=['x0', 'x1'])
unique_codones = sorted(dictionary, key=dictionary.get)
tsne_df['codone'] = list(unique_codones)
tsne_df.head()


Out[18]:
x0 x1 codone
0 0.251213 -1.921739 GRW
1 0.053797 2.925586 LAK
2 -3.275461 -1.735125 PKQ
3 -2.305040 -1.823419 PIQ
4 -1.199928 0.021694 FSQ

In [19]:
def plot_tsne_df(df):
    plt.figure(figsize=(15, 10))
    plt.title('unlabeled encoding', fontsize=20)
    plt.scatter(df.x0, df.x1, s=10)
    plt.show()

In [20]:
plot_tsne_df(tsne_df)



In [21]:
filename = 'data/acid_properties.csv'
props = pd.read_csv(filename)

In [ ]:
######################

In [22]:
def acid_dict(some_c, props):
    # average the physico-chemical properties of the letters of a codon
    prop_by_letter = [props[props.acid == let].iloc[:, 1:] for let in some_c]
    df_concat = pd.concat(prop_by_letter)
    res = df_concat.mean()
    dres = dict(res)
    dres['acid'] = some_c
    return dres
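
Filtering props for every letter of every codon is slow over the full vocabulary; a faster equivalent (a sketch, assuming props has one row per single-letter acid) indexes props once:

letter_props = props.set_index('acid')

def acid_dict_fast(some_c):
    # mean of the per-letter property rows making up the codon
    res = letter_props.loc[list(some_c)].mean()
    dres = dict(res)
    dres['acid'] = some_c
    return dres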

In [23]:
save_path = 'data/all_acid_dicts.pickle'
producer = lambda: [acid_dict(some_c, props) for some_c in tsne_df.codone]
all_acid_dicts = read_or_create(save_path, producer)


reading data/all_acid_dicts.pickle

In [24]:
all_acid_df = pd.DataFrame(all_acid_dicts)
all_acid_df.head()


Out[24]:
acid hydrophobicity mass number_of_atoms volume
0 KDK -3.766667 123.810667 21.333333 149.433333
1 YMN -0.966667 136.157333 20.333333 156.866667
2 EGM -0.666667 105.787133 16.333333 120.466667
3 VRQ -1.266667 127.815333 21.666667 152.400000
4 TVL 2.433333 104.464200 19.333333 140.933333

In [25]:
final_df = all_acid_df.join(tsne_df.set_index('codone'), on='acid')
final_df.head()


Out[25]:
acid hydrophobicity mass number_of_atoms volume x0 x1
0 KDK -3.766667 123.810667 21.333333 149.433333 -3.675756 1.755588
1 YMN -0.966667 136.157333 20.333333 156.866667 -2.831734 0.179093
2 EGM -0.666667 105.787133 16.333333 120.466667 -1.577637 0.180259
3 VRQ -1.266667 127.815333 21.666667 152.400000 -0.009369 0.675519
4 TVL 2.433333 104.464200 19.333333 140.933333 -2.448395 0.328913

In [26]:
def plot_embedding_properties(final_df):
    plt.figure(figsize=(25, 20))
    for i, p in enumerate(['hydrophobicity', 'mass', 'number_of_atoms', 'volume']):
        plt.subplot(2,2,i+1)
        plt.title(p, fontsize=25)
        plt.scatter(final_df.x0, final_df.x1, c=final_df[p], s=10)
    plt.show()
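
An optional tweak (not in the original notebook): adding a colour bar to each panel makes the property scales readable.

def plot_embedding_properties_with_colorbar(final_df):
    plt.figure(figsize=(25, 20))
    for i, p in enumerate(['hydrophobicity', 'mass', 'number_of_atoms', 'volume']):
        plt.subplot(2, 2, i + 1)
        plt.title(p, fontsize=25)
        plt.scatter(final_df.x0, final_df.x1, c=final_df[p], s=10)
        plt.colorbar()  # show the numeric range of the property
    plt.show()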

In [27]:
plot_embedding_properties(final_df)



In [28]:
######################

In [29]:
filename = 'data/nice_embed_tsne.csv'
gensim_tsne_df = pd.read_csv(filename, index_col=0)
gensim_tsne_df.columns = ['x0', 'x1', 'codone']

In [30]:
plot_tsne_df(gensim_tsne_df)



In [31]:
final_df_nice = all_acid_df.join(gensim_tsne_df.set_index('codone'), on='acid')

In [32]:
plot_embedding_properties(final_df_nice)


Homework

Improve SkipGramModel to achieve a better embedding for the amino-acid codons. Visualize your space in a style similar to the example at the bottom. You are only allowed to use vanilla TensorFlow for this task.

Bonus task (no credit): visualize your embedding space in a similar manner to the MNIST example: https://www.tensorflow.org/versions/r0.12/how_tos/embedding_viz/
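
A rough sketch for the bonus task (TF 1.x contrib API; the log directory and metadata file are placeholders, not files produced above):

from tensorflow.contrib.tensorboard.plugins import projector

embedding_var = tf.Variable(final_embed_matrix, name='codone_embedding')
with tf.Session() as sess:
    sess.run(embedding_var.initializer)
    writer = tf.summary.FileWriter('projector_logs')  # placeholder log dir
    config = projector.ProjectorConfig()
    embedding = config.embeddings.add()
    embedding.tensor_name = embedding_var.name
    embedding.metadata_path = 'metadata.tsv'  # one codon label per line, placeholder file
    projector.visualize_embeddings(writer, config)
    tf.train.Saver([embedding_var]).save(sess, 'projector_logs/model.ckpt')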