In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import pickle
import os
import random
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
In [2]:
!! unzip data.zip
Out[2]:
In [3]:
seq_df = pd.read_table('data/family_classification_sequences.tab')
seq_df.head()
Out[3]:
In [4]:
def make_codones(sseq):
    # crop the tail so the length is a multiple of 3, then split into 3-letter "codons"
    crop = len(sseq) % 3
    cropped_seq = sseq[:-crop] if crop > 0 else sseq
    return [cropped_seq[i:i+3] for i in range(0, len(cropped_seq), 3)]

def seq_to3(seq):
    # split the sequence in each of the three possible reading frames
    splittings = [make_codones(seq[i:]) for i in range(3)]
    return splittings

def create_all_codones(df):
    codones = []
    for i in range(df.shape[0]):
        row = df.iloc[i, 0]
        codones.extend(seq_to3(row))
    return codones
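A quick sanity check of the splitting helpers on a made-up toy sequence (illustrative only; not part of the original data):
In [ ]:
# toy 10-letter sequence: three reading frames, each cropped to whole 3-letter codons
toy_seq = 'MKVLAAGIVT'
for frame in seq_to3(toy_seq):
    print(frame)
# expected:
# ['MKV', 'LAA', 'GIV']
# ['KVL', 'AAG', 'IVT']
# ['VLA', 'AGI']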
In [5]:
def read_or_create(read_path, producer):
    """ Load a pickled result from read_path if it exists, otherwise compute it with producer and cache it. """
    if os.path.isfile(read_path):
        print('reading', read_path)
        with open(read_path, 'rb') as fp:
            return pickle.load(fp)
    result = producer()
    print('saving', read_path)
    with open(read_path, 'wb') as fp:
        pickle.dump(result, fp)
    return result
In [ ]:
all_codones = read_or_create(read_path='data/all_codones.pickle',
                             producer=lambda: create_all_codones(seq_df))
In [ ]:
######################
In [9]:
def generate_sample(index_words_list, context_window_size):
    """ Form training pairs according to the skip-gram model. """
    for index_words in index_words_list:
        for index, center in enumerate(index_words):
            context = random.randint(1, context_window_size)
            # get the targets before the center word
            for target in index_words[max(0, index - context): index]:
                yield center, target
            # get the targets after the center word
            for target in index_words[index + 1: index + context + 1]:
                yield center, target

def get_batch(iterator, batch_size):
    """ Group a numerical stream into batches and yield them as NumPy arrays. """
    while True:
        center_batch = np.zeros(batch_size, dtype=np.int32)
        target_batch = np.zeros([batch_size, 1], dtype=np.int32)
        for index in range(batch_size):
            center_batch[index], target_batch[index] = next(iterator)
        yield center_batch, target_batch

def flatten(x):
    return [item for sublist in x for item in sublist]

def cod_to_dict(cod, dictionary):
    # map a list of codons to their integer indices
    return [dictionary[key] for key in cod]

def process_data(all_codones, batch_size, skip_window):
    flat_codones = flatten(all_codones)
    unique_codones = set(flat_codones)
    dictionary = {cod: i for i, cod in enumerate(unique_codones)}
    cod_dicts = [cod_to_dict(cod, dictionary) for cod in all_codones]
    single_gen = generate_sample(cod_dicts, context_window_size=skip_window)
    batch_gen = get_batch(single_gen, batch_size=batch_size)
    return batch_gen, dictionary
In [10]:
BATCH_SIZE = 128
SKIP_WINDOW = 12 # the context window
batch_gen, dictionary = process_data(all_codones, BATCH_SIZE, SKIP_WINDOW)
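It can be useful to peek at one batch before building the graph; a minimal check (note that this consumes one batch from the generator):
In [ ]:
# inspect one skip-gram batch: center codon indices and context target indices
centers, targets = next(batch_gen)
print(centers.shape, targets.shape)   # (128,) and (128, 1)
print(centers[:5], targets[:5].ravel())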
In [ ]:
######################
In [12]:
class SkipGramModel:
    """ Build the graph for the word2vec model. """
    def __init__(self, vocab_size, embed_size, batch_size, num_sampled, learning_rate):
        self.vocab_size = vocab_size
        self.embed_size = embed_size
        self.batch_size = batch_size
        self.num_sampled = num_sampled
        self.lr = learning_rate
        self.global_step = tf.Variable(0, dtype=tf.int32, trainable=False, name='global_step')

    def _create_placeholders(self):
        with tf.name_scope("data"):
            self.center_words = tf.placeholder(tf.int32, shape=[self.batch_size], name='center_words')
            self.target_words = tf.placeholder(tf.int32, shape=[self.batch_size, 1], name='target_words')

    def _create_embedding(self):
        with tf.name_scope("embed"):
            self.embed_matrix = tf.Variable(tf.random_uniform([self.vocab_size, self.embed_size],
                                                              -1.0, 1.0),
                                            name='embed_matrix')

    def _create_loss(self):
        with tf.device('/cpu:0'):
            with tf.name_scope("loss"):
                embed = tf.nn.embedding_lookup(self.embed_matrix, self.center_words, name='embed')
                # construct variables for the NCE loss
                nce_weight = tf.Variable(tf.truncated_normal([self.vocab_size, self.embed_size],
                                                             stddev=1.0 / (self.embed_size ** 0.5)),
                                         name='nce_weight')
                nce_bias = tf.Variable(tf.zeros([self.vocab_size]), name='nce_bias')
                # define the loss function to be the NCE loss
                self.loss = tf.reduce_mean(tf.nn.nce_loss(weights=nce_weight,
                                                          biases=nce_bias,
                                                          labels=self.target_words,
                                                          inputs=embed,
                                                          num_sampled=self.num_sampled,
                                                          num_classes=self.vocab_size), name='loss')

    def _create_optimizer(self):
        self.optimizer = tf.train.GradientDescentOptimizer(self.lr).minimize(self.loss,
                                                                             global_step=self.global_step)

    def _create_summaries(self):
        with tf.name_scope("summaries"):
            tf.summary.scalar("loss", self.loss)
            tf.summary.histogram("histogram loss", self.loss)
            # because there are several summaries, merge them all
            # into one op to make them easier to manage
            self.summary_op = tf.summary.merge_all()

    def build_graph(self):
        """ Build the graph for our model. """
        self._create_placeholders()
        self._create_embedding()
        self._create_loss()
        self._create_optimizer()
        self._create_summaries()
In [13]:
VOCAB_SIZE = 9424
EMBED_SIZE = 100 # dimension of the word embedding vectors
NUM_SAMPLED = 5 # Number of negative examples to sample.
LEARNING_RATE = 0.8
NUM_TRAIN_STEPS = 100000
SKIP_STEP = 2000
model = SkipGramModel(VOCAB_SIZE, EMBED_SIZE, BATCH_SIZE, NUM_SAMPLED, LEARNING_RATE)
model.build_graph()
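A common source of silent errors is a VOCAB_SIZE that does not match the dictionary produced by process_data; a small sanity check (not in the original notebook) guards against out-of-range indices in the embedding lookup:
In [ ]:
# every codon index must fit into the embedding matrix
print(len(dictionary))
assert len(dictionary) <= VOCAB_SIZE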
In [ ]:
######################
In [14]:
def make_dir(path):
    """ Create a directory if there isn't one already. """
    try:
        os.mkdir(path)
    except OSError:
        pass
In [15]:
def train_model(model, batch_gen, num_train_steps, learning_rate, skip_step):
    saver = tf.train.Saver()  # defaults to saving all variables - in this case embed_matrix, nce_weight, nce_bias
    make_dir('checkpoints')
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        ckpt = tf.train.get_checkpoint_state(os.path.dirname('checkpoints/checkpoint'))
        # if a checkpoint exists, restore from it
        if ckpt and ckpt.model_checkpoint_path:
            saver.restore(sess, ckpt.model_checkpoint_path)
        total_loss = 0.0  # used to calculate the average loss over the last SKIP_STEP steps
        writer = tf.summary.FileWriter('improved_graph/lr' + str(learning_rate), sess.graph)
        initial_step = model.global_step.eval()
        for index in range(initial_step, initial_step + num_train_steps):
            centers, targets = next(batch_gen)
            feed_dict = {model.center_words: centers, model.target_words: targets}
            loss_batch, _, summary = sess.run([model.loss, model.optimizer, model.summary_op],
                                              feed_dict=feed_dict)
            writer.add_summary(summary, global_step=index)
            total_loss += loss_batch
            if (index + 1) % skip_step == 0:
                print('Average loss at step {}: {:5.1f}'.format(index, total_loss / skip_step))
                total_loss = 0.0
                saver.save(sess, 'checkpoints/skip-gram', index)
        writer.close()
        final_embed_matrix = sess.run(model.embed_matrix)
        return final_embed_matrix
In [16]:
final_embed_matrix = train_model(model, batch_gen, NUM_TRAIN_STEPS, LEARNING_RATE, SKIP_STEP)
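Training takes a while, so it may be worth caching the learned matrix for the later cells; a minimal sketch (the .npy path is an assumption):
In [ ]:
# cache the trained embedding matrix so the visualization cells can be re-run without retraining
np.save('data/final_embed_matrix.npy', final_embed_matrix)   # hypothetical path
# later: final_embed_matrix = np.load('data/final_embed_matrix.npy')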
In [11]:
######################
In [17]:
tsne = TSNE(n_components=2, random_state=42)
XX = tsne.fit_transform(final_embed_matrix)
In [18]:
tsne_df = pd.DataFrame(XX, columns=['x0', 'x1'])
unique_codones = sorted(dictionary, key=dictionary.get)
tsne_df['codone'] = list(unique_codones)
tsne_df.head()
Out[18]:
In [19]:
def plot_tsne_df(df):
    plt.figure(figsize=(15, 10))
    plt.title('unlabeled encoding', fontsize=20)
    plt.scatter(df.x0, df.x1, s=10)
    plt.show()
In [20]:
plot_tsne_df(tsne_df)
In [21]:
filename = 'data/acid_properties.csv'
props = pd.read_csv(filename)
In [ ]:
######################
In [22]:
def acid_dict(some_c, props):
    # average the per-letter acid properties over the three letters of the codon
    prop_by_letter = [props[props.acid == let].iloc[:, 1:] for let in some_c]
    df_concat = pd.concat(prop_by_letter)
    res = df_concat.mean()
    dres = dict(res)
    dres['acid'] = some_c
    return dres
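A toy illustration of what acid_dict returns; the property column names are assumed from the plots further below, since acid_properties.csv itself is not shown here:
In [ ]:
# for a codon such as 'MKV', acid_dict averages the property rows of M, K and V
# and returns something like
# {'hydrophobicity': ..., 'mass': ..., 'number_of_atoms': ..., 'volume': ..., 'acid': 'MKV'}
print(acid_dict('MKV', props))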
In [23]:
save_path = 'data/all_acid_dicts.pickle'
producer = lambda: [acid_dict(some_c, props) for some_c in tsne_df.codone]
all_acid_dicts = read_or_create(save_path, producer)
In [24]:
all_acid_df = pd.DataFrame(all_acid_dicts)
all_acid_df.head()
Out[24]:
In [25]:
final_df = all_acid_df.join(tsne_df.set_index('codone'), on='acid')
final_df.head()
Out[25]:
In [26]:
def plot_embedding_properties(final_df):
    plt.figure(figsize=(25, 20))
    for i, p in enumerate(['hydrophobicity', 'mass', 'number_of_atoms', 'volume']):
        plt.subplot(2, 2, i + 1)
        plt.title(p, fontsize=25)
        plt.scatter(final_df.x0, final_df.x1, c=final_df[p], s=10)
    plt.show()
In [27]:
plot_embedding_properties(final_df)
In [28]:
######################
In [29]:
filename = 'data/nice_embed_tsne.csv'
gensim_tsne_df = pd.read_csv(filename, index_col=0)
gensim_tsne_df.columns = ['x0', 'x1', 'codone']
In [30]:
plot_tsne_df(gensim_tsne_df)
In [31]:
final_df_nice = all_acid_df.join(gensim_tsne_df.set_index('codone'), on='acid')
In [32]:
plot_embedding_properties(final_df_nice)
Improve SkipGramModel to achieve better embeddings for the amino-acid codons. Visualize your space in a style similar to the example above. You are only allowed to use vanilla TensorFlow for this task.
Bonus task (no credit): visualize your embedding space in a manner similar to the MNIST example: https://www.tensorflow.org/versions/r0.12/how_tos/embedding_viz/
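For the bonus task, a minimal sketch of wiring the trained matrix into the TensorBoard embedding projector (TF 1.x API; the log directory and metadata file name are assumptions, not part of the original notebook):
In [ ]:
from tensorflow.contrib.tensorboard.plugins import projector

# write one codon per line so the projector can label each embedding row
make_dir('visualization')
with open('visualization/metadata.tsv', 'w') as fp:
    for cod in sorted(dictionary, key=dictionary.get):
        fp.write(cod + '\n')

with tf.Session() as sess:
    # wrap the trained matrix in a variable the projector can point at
    embedding_var = tf.Variable(final_embed_matrix, name='codone_embedding')
    sess.run(embedding_var.initializer)

    config = projector.ProjectorConfig()
    embedding = config.embeddings.add()
    embedding.tensor_name = embedding_var.name
    embedding.metadata_path = 'metadata.tsv'   # relative to the log directory

    writer = tf.summary.FileWriter('visualization')
    projector.visualize_embeddings(writer, config)
    tf.train.Saver([embedding_var]).save(sess, 'visualization/embed.ckpt')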