In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import math
import os
import random
import zipfile
import numpy as np
from six.moves import urllib
from six.moves import xrange # pylint: disable=redefined-builtin
import tensorflow as tf
In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
import pickle
import os
import random
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import collections
In [3]:
# Load the sequence table from disk. create_all_codones (next cell) reads the
# sequences from its first column.
seq_df = pd.read_table('data/family_classification_sequences.tab')
# Preview the first rows as the cell output.
seq_df.head()
Out[3]:
In [4]:
def make_codones(sseq):
    """Cut a sequence into consecutive, non-overlapping chunks of length 3.

    The trailing one or two items that do not fill a complete chunk are
    dropped.

    Args:
        sseq: A sliceable sequence with a length (e.g. a string).

    Returns:
        A list of length-3 slices of ``sseq``, in order; empty when ``sseq``
        has fewer than three items.
    """
    # Largest multiple of 3 that fits inside the sequence.
    usable_length = len(sseq) - len(sseq) % 3
    chunks = []
    for start in range(0, usable_length, 3):
        chunks.append(sseq[start:start + 3])
    return chunks
def seq_to3(seq):
    """Chunk a sequence three times, once for each start offset 0, 1 and 2.

    Args:
        seq: A sliceable sequence (e.g. a string).

    Returns:
        A list of three lists; element ``k`` is ``make_codones(seq[k:])``.
    """
    return [make_codones(seq[offset:]) for offset in (0, 1, 2)]
def create_all_codones(df):
    """Collect the three offset chunkings of every sequence in a DataFrame.

    Args:
        df: A pandas DataFrame whose first column (by position) holds the
            sequences. Each value must be sliceable and have a length; the
            values are passed unchanged to ``seq_to3``.

    Returns:
        A flat list of chunk lists: for every row, in row order, the three
        lists returned by ``seq_to3`` are appended one after another, so the
        result has ``3 * len(df)`` entries.
    """
    codones = []
    # Select the first column once, by position. The previous
    # `df.iloc[i, :][0]` built a full row Series per iteration and relied on
    # integer-key positional fallback on a label-indexed Series, which newer
    # pandas versions deprecate.
    for sequence in df.iloc[:, 0]:
        codones.extend(seq_to3(sequence))
    return codones
In [5]:
def read_or_create(read_path, producer):
    """Load a pickled object from ``read_path``, or build and cache it.

    If ``read_path`` is an existing file it is unpickled and returned without
    calling ``producer``. Otherwise ``producer()`` is called, its result is
    pickled to ``read_path`` and then returned.

    Args:
        read_path: Path of the pickle cache file.
        producer: Zero-argument callable that computes the object when no
            cache file exists.

    Returns:
        The cached or freshly produced object.

    Raises:
        Whatever ``producer`` or ``pickle`` raise. If writing the cache fails,
        the partially written file is removed before the error propagates.
    """
    if os.path.isfile(read_path):
        print('reading', read_path)
        # NOTE(security): pickle.load can execute arbitrary code; only point
        # this at cache files you created yourself.
        with open(read_path, 'rb') as fp:
            return pickle.load(fp)

    result = producer()

    print('saving', read_path)
    # Create the cache directory if needed, so an expensive result is not
    # thrown away just because the directory does not exist yet.
    parent_dir = os.path.dirname(read_path)
    if parent_dir and not os.path.isdir(parent_dir):
        os.makedirs(parent_dir)
    try:
        with open(read_path, 'wb') as fp:
            pickle.dump(result, fp)
    except BaseException:
        # A truncated file would be picked up as a valid cache on the next
        # call and fail to unpickle, so remove it before re-raising.
        if os.path.isfile(read_path):
            os.remove(read_path)
        raise
    return result
In [6]:
# Build the chunk lists for every sequence in seq_df, or reuse the pickle
# written by a previous run. Delete data/all_codones.pickle to force a rebuild,
# because the cache is keyed by path only.
all_codones = read_or_create(read_path='data/all_codones.pickle',
producer= lambda: create_all_codones(seq_df))
In [7]:
# Number of chunk lists; create_all_codones adds three per input sequence.
len(all_codones)
Out[7]:
In [8]:
# Step 2: Build the dictionary and replace rare words with UNK token.
# Upper bound on the number of distinct token ids (id 0 is 'UNK'). It is passed
# to build_dataset as n_words and also sets the number of rows of the
# embedding matrix built in Step 4.
vocabulary_size = 9000
def flatten(x):
    """Concatenate the items of every sub-iterable of ``x`` into one list.

    Only one level of nesting is removed.
    """
    flat = []
    for sublist in x:
        flat.extend(sublist)
    return flat
def build_dataset(words, n_words):
    """Encode a token stream as integer ids over a frequency-capped vocabulary.

    Args:
        words: Sequence of hashable tokens. It is traversed twice, so it must
            be re-iterable (e.g. a list).
        n_words: Vocabulary size including the 'UNK' entry; the
            ``n_words - 1`` most frequent tokens get their own id.

    Returns:
        A tuple ``(data, count, dictionary, reversed_dictionary)``:
        data: list with the id of every token in ``words``; tokens outside
            the vocabulary are encoded as 0.
        count: ``[['UNK', unk_count]]`` followed by the ``(token, frequency)``
            pairs of the kept tokens, most frequent first.
        dictionary: token -> id mapping; 'UNK' is 0, kept tokens follow in
            frequency order.
        reversed_dictionary: id -> token mapping.
    """
    most_frequent = collections.Counter(words).most_common(n_words - 1)
    # The first entry is a mutable list so the UNK tally can be filled in later.
    count = [['UNK', -1]] + most_frequent

    dictionary = {}
    for token, _ in count:
        dictionary[token] = len(dictionary)

    data = []
    unk_count = 0
    for token in words:
        index = dictionary.get(token)
        if index is None:
            index = 0  # dictionary['UNK']
            unk_count += 1
        data.append(index)
    count[0][1] = unk_count

    reversed_dictionary = {index: token for token, index in dictionary.items()}
    return data, count, dictionary, reversed_dictionary
# Flatten every chunk list into one token stream and encode it as integer ids.
# NOTE: after flattening, the three offset chunkings of a sequence and the
# chunkings of different sequences sit back to back, so the sliding window in
# generate_batch also spans those boundaries.
data, count, dictionary, reverse_dictionary = build_dataset(flatten(all_codones),
vocabulary_size)
In [9]:
# NOTE: the `del` below is left disabled; no variable named `vocabulary` is
# created in the cells above.
#del vocabulary # Hint to reduce memory.
# Sanity check: the most frequent tokens, then the first encoded ids next to
# the tokens they decode to.
print('Most common words (+UNK)', count[:5])
print('Sample data', data[:10], [reverse_dictionary[i] for i in data[:10]])
# Read cursor into `data`; generate_batch advances it through `global data_index`.
data_index = 0
In [10]:
# Step 3: Function to generate a training batch for the skip-gram model.
def generate_batch(batch_size, num_skips, skip_window):
    """Produce one skip-gram training batch from the global id stream.

    Reads the module-level list ``data`` and advances the module-level cursor
    ``data_index``, so successive calls walk through ``data`` and wrap around
    at the end.

    Args:
        batch_size: Number of (input, label) pairs to emit; must be a
            multiple of ``num_skips``.
        num_skips: Number of context ids sampled for each centre id; at most
            ``2 * skip_window``.
        skip_window: Number of ids considered on each side of the centre.

    Returns:
        A tuple ``(batch, labels)`` of int32 numpy arrays with shapes
        ``(batch_size,)`` and ``(batch_size, 1)``. ``batch`` holds each centre
        id repeated ``num_skips`` times, and ``labels`` holds a distinct,
        randomly chosen context id from the same window for each repetition.

    Raises:
        AssertionError: If the argument constraints above are violated.
    """
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window
    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    span = 2 * skip_window + 1  # [ skip_window target skip_window ]
    # Sliding window over `data`; maxlen makes append/extend drop the oldest ids.
    buffer = collections.deque(maxlen=span)
    if data_index + span > len(data):
        data_index = 0
    buffer.extend(data[data_index:data_index + span])
    data_index += span
    for i in range(batch_size // num_skips):
        target = skip_window  # target label at the center of the buffer
        targets_to_avoid = [skip_window]
        for j in range(num_skips):
            # Redraw until we hit a window position that is neither the
            # centre nor already used for this centre.
            while target in targets_to_avoid:
                target = random.randint(0, span - 1)
            targets_to_avoid.append(target)
            batch[i * num_skips + j] = buffer[skip_window]
            labels[i * num_skips + j, 0] = buffer[target]
        if data_index == len(data):
            # Wrap around to the start of the stream. deque does not support
            # slice assignment (`buffer[:] = ...` raises TypeError); extending
            # a full bounded deque by `span` items replaces its whole content.
            buffer.extend(data[:span])
            data_index = span
        else:
            buffer.append(data[data_index])
            data_index += 1
    # Backtrack a little bit to avoid skipping words in the end of a batch
    data_index = (data_index + len(data) - span) % len(data)
    return batch, labels
In [13]:
# Step 4: Build and train a skip-gram model.

# Model / batching hyperparameters.
batch_size = 130
embedding_size = 100  # Dimension of the embedding vector.
skip_window = 2  # How many words to consider left and right.
num_skips = 2  # How many times to reuse an input to generate a label.
num_sampled = 32  # Number of negative examples to sample.

# Validation ids used for nearest-neighbour inspection. They are drawn from
# the lowest ids, which build_dataset assigns to the most frequent tokens.
valid_size = 16  # Random set of words to evaluate similarity on.
valid_window = 50  # Only pick dev samples in the head of the distribution.
valid_examples = np.random.choice(valid_window, valid_size, replace=False)

graph = tf.Graph()
with graph.as_default():
    # Placeholders fed with the arrays returned by generate_batch.
    train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
    train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
    valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

    # Embedding matrix (one row per vocabulary id) and the rows selected by
    # the current batch of input ids.
    embeddings = tf.Variable(
        tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
    embed = tf.nn.embedding_lookup(embeddings, train_inputs)

    # Output-layer parameters for the NCE loss.
    nce_weights = tf.Variable(
        tf.truncated_normal([vocabulary_size, embedding_size],
                            stddev=1.0 / math.sqrt(embedding_size)))
    nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

    # Mean NCE loss over the batch. tf.nn.nce_loss draws a fresh sample of
    # negative labels every time the loss is evaluated.
    loss = tf.reduce_mean(
        tf.nn.nce_loss(
            weights=nce_weights,
            biases=nce_biases,
            labels=train_labels,
            inputs=embed,
            num_sampled=num_sampled,
            num_classes=vocabulary_size))

    # Plain SGD with a learning rate of 0.5.
    optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)

    # Cosine similarity between the validation ids and every embedding:
    # rows are scaled to unit L2 norm, then multiplied together.
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
    normalized_embeddings = embeddings / norm
    valid_embeddings = tf.nn.embedding_lookup(
        normalized_embeddings, valid_dataset)
    similarity = tf.matmul(
        valid_embeddings, normalized_embeddings, transpose_b=True)

    # Op that initialises every variable created above.
    init = tf.global_variables_initializer()
In [24]:
# Step 5: Begin training.
num_steps = 1000001

# Restrict this process to a fraction (0.1) of the GPU memory.
gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.1)

with tf.Session(graph=graph, config=tf.ConfigProto(gpu_options=gpu_options)) as session:
    # Every variable has to be initialised before its first use.
    init.run()
    print('Initialized')

    average_loss = 0
    for step in xrange(num_steps):
        batch_inputs, batch_labels = generate_batch(
            batch_size, num_skips, skip_window)
        feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}

        # One update step: running the optimizer op applies the gradients,
        # and fetching `loss` alongside it returns the value for this batch.
        _, loss_val = session.run([optimizer, loss], feed_dict=feed_dict)
        average_loss += loss_val

        # Report only every 2000th step.
        if step % 2000 != 0:
            continue
        if step > 0:
            average_loss /= 2000
        # The average loss is an estimate of the loss over the last 2000 batches.
        print('Average loss at step ', step, ': ', average_loss)
        average_loss = 0

    # Materialise the unit-norm embeddings while the session is still open.
    final_embeddings = normalized_embeddings.eval()
In [25]:
# Show the trained embedding matrix: rows are vocabulary ids, each scaled to
# unit L2 norm by the graph's normalized_embeddings op.
final_embeddings
Out[25]:
In [26]:
# Project the embeddings to two dimensions with t-SNE for plotting;
# random_state is pinned so the projection uses a fixed seed.
tsne = TSNE(n_components=2, random_state=42)
XX = tsne.fit_transform(final_embeddings)
In [27]:
# One row per embedding row, holding its two t-SNE coordinates.
tsne_df = pd.DataFrame(XX, columns=['x0', 'x1'])
# Tokens sorted by their integer id, so position k is meant to line up with
# embedding row k.
unique_codones = sorted(dictionary, key=dictionary.get)
# NOTE(review): this assignment needs len(dictionary) == len(tsne_df). It fails
# if the corpus has fewer distinct tokens than vocabulary_size - 1 — TODO
# confirm for this dataset.
tsne_df['codone'] = list(unique_codones)
tsne_df.head()
Out[27]:
In [28]:
# Property table with one row per acid letter; acid_dict (next cell) matches
# rows on its `acid` column.
filename = 'data/acid_properties.csv'
props = pd.read_csv(filename)
In [29]:
def acid_dict(some_c, props):
    """Average the property rows of each letter in a token.

    Args:
        some_c: Token whose items (letters) are looked up one at a time.
        props: DataFrame with an ``acid`` column; every column after the
            first is treated as a property to average.

    Returns:
        A dict mapping each property column name to its mean over the rows
        matched by the letters of ``some_c``, plus the key ``'acid'`` holding
        ``some_c`` itself. A letter that occurs several times contributes its
        rows once per occurrence.
    """
    per_letter_rows = []
    for letter in some_c:
        matching = props[props.acid == letter]
        # Drop the first column and keep only the property columns.
        per_letter_rows.append(matching.iloc[:, 1:])
    averaged = pd.concat(per_letter_rows).mean()
    dres = dict(averaged)
    dres['acid'] = some_c
    return dres
In [30]:
# Average the per-letter properties for every token in tsne_df, or reuse the
# pickle from a previous run. The cache is keyed by path only, so delete
# data/all_acid_dicts.pickle whenever the vocabulary or the property table
# changes.
save_path = 'data/all_acid_dicts.pickle'
producer = lambda: [acid_dict(some_c, props) for some_c in tsne_df.codone]
all_acid_dicts = read_or_create(save_path, producer)
all_acid_df = pd.DataFrame(all_acid_dicts)
# Attach the t-SNE coordinates to each token's property row, matching the
# `acid` value against the `codone` index.
final_df = all_acid_df.join(tsne_df.set_index('codone'), on='acid')
In [31]:
def plot_embedding_properties(final_df):
    """Draw a 2x2 grid of scatter plots of the embedding projection.

    Every panel plots the ``x0`` / ``x1`` columns of ``final_df`` and colours
    the points by one property column: 'hydrophobicity', 'mass',
    'number_of_atoms' and 'volume', in that order.

    Args:
        final_df: DataFrame with columns ``x0``, ``x1`` and the four property
            columns listed above.
    """
    property_names = ['hydrophobicity', 'mass', 'number_of_atoms', 'volume']
    plt.figure(figsize=(25, 20))
    for panel, prop in enumerate(property_names, start=1):
        plt.subplot(2, 2, panel)
        plt.title(prop, fontsize=25)
        plt.scatter(final_df.x0, final_df.x1, c=final_df[prop], s=10)
    plt.show()
# Render the four property-coloured scatter plots for the joined frame.
plot_embedding_properties(final_df)
In [ ]: