In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from lazy import lazy
from attrdict import AttrDict
from collections import Counter
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import random
In [2]:
family_classification_metadata = pd.read_table('../seminar_5/data/family_classification_metadata.tab')
family_classification_sequences = pd.read_table('../seminar_5/data/family_classification_sequences.tab')
In [3]:
family_classification_metadata.head()
Out[3]:
In [4]:
family_classification_sequences.head()
Out[4]:
Use your ProtVec embeddings from homework 5 to perform protein family classification with an RNN.
In [5]:
table = pd.read_csv('data/protVec_100d_3grams_without_quotes.csv', sep='\t', header=None)
table = table.T
header = table.iloc[0] # grab the first row for the header
prot2vec = table[1:] # take the data less the header row
prot2vec.columns = header # set the header row as the df header
prot2vec["AAA"].head()
Out[5]:
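Indexing the transposed DataFrame once per 3-gram is relatively slow. An optional sketch (not part of the original assignment) that caches the table as a plain dict of NumPy vectors for O(1) lookups, assuming the column layout built above:
In [ ]:
# Hypothetical speed-up: one dict entry per 3-gram, each a 100-d float vector.
prot2vec_dict = {ngram: prot2vec[ngram].values.astype(np.float32)
                 for ngram in prot2vec.columns}
prot2vec_dict['AAA'][:5]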
In [ ]:
most_common_families = Counter(family_classification_metadata['FamilyID']).most_common(1000)
most_common_families = [family for (family, count) in most_common_families]
family2num = {f: i for (i, f) in enumerate(most_common_families)}
In [7]:
seq_lens = [len(i) for i in family_classification_sequences["Sequences"]]
sns.distplot(seq_lens);
In [8]:
sns.distplot(seq_lens, bins=100).set(xlim=(0, 5000));
In [9]:
MAX_PROTEIN_LEN = 501
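The 501-residue cutoff follows from the length distribution above. A quick sanity check of how much of the dataset it keeps (assuming seq_lens from the cell above):
In [ ]:
# Fraction of sequences short enough to survive the cutoff.
np.mean(np.array(seq_lens) <= MAX_PROTEIN_LEN)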
In [10]:
all_proteins = family_classification_sequences['Sequences']
all_families = family_classification_metadata['FamilyID']
selected_ids = [i for i in range(len(all_proteins))
                if all_families[i] in family2num and len(all_proteins[i]) <= MAX_PROTEIN_LEN]
random.shuffle(selected_ids)  # unseeded, so the train/test split differs between runs
train_ratio = 0.9
num_train = int(len(selected_ids) * train_ratio)
train_ids = selected_ids[:num_train]
test_ids = selected_ids[num_train:]
In [11]:
EMBED_LEN = 100
NUM_CLASSES = 1000
BATCH_SIZE = 128
In [12]:
def embedding(protein):
    # One row per non-overlapping 3-gram, zero-padded up to the maximum length.
    res = np.zeros((MAX_PROTEIN_LEN // 3, EMBED_LEN))
    for i in range(len(protein) // 3):  # every complete non-overlapping 3-gram
        try:
            res[i] = prot2vec[protein[i*3: i*3 + 3]]
        except KeyError:
            res[i] = prot2vec['<unk>']
    return res
embedding(all_proteins[selected_ids[0]])
Out[12]:
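For reference, the original ProtVec paper (Asgari & Mofrad, 2015) splits each sequence into three shifted reading frames of non-overlapping 3-grams rather than the single frame used above. A hedged sketch of that variant (not used below):
In [ ]:
def three_frame_ngrams(protein):
    # All three reading frames of non-overlapping 3-grams.
    frames = []
    for shift in range(3):
        frames.append([protein[i:i + 3]
                       for i in range(shift, len(protein) - 2, 3)])
    return frames
three_frame_ngrams('MKVLAAGIT')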
In [13]:
def batches(steps):
    # Yields `steps` random mini-batches of (embedded proteins, one-hot families).
    for i in range(steps):
        cur_ids = random.sample(train_ids, BATCH_SIZE)
        cur_proteins = [embedding(p) for p in all_proteins[cur_ids]]
        fam_ids = [family2num[f] for f in all_families[cur_ids]]
        cur_families = np.eye(NUM_CLASSES)[fam_ids]
        yield cur_proteins, cur_families
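A quick shape check on one generated batch: each batch should hold BATCH_SIZE embedded proteins of shape seq_length x embed_length, plus one-hot family labels:
In [ ]:
sample_proteins, sample_families = next(batches(1))
np.array(sample_proteins).shape, sample_families.shape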
In [14]:
class SequenceClassificationModel:
    def __init__(self, params):
        self.params = params
        self._create_placeholders()
        # touching the lazy properties here builds the whole graph up front
        self.prediction
        self.cost
        self.error
        self.optimize
        #self._create_summaries()
    def _create_placeholders(self):
        with tf.name_scope("data"):
            self.data = tf.placeholder(tf.float32, [None, self.params.seq_length, self.params.embed_length])
            self.target = tf.placeholder(tf.float32, [None, NUM_CLASSES])
    def _create_summaries(self):
        with tf.name_scope("summaries"):
            tf.summary.scalar('loss', self.cost)
            tf.summary.scalar('error', self.error)
            self.summary = tf.summary.merge_all()
        self.saver = tf.train.Saver()
    @lazy
    def length(self):
        # lengths of sequences in the current data batch
        with tf.name_scope("seq_length"):
            used = tf.sign(tf.reduce_max(tf.abs(self.data), reduction_indices=2))
            length = tf.reduce_sum(used, reduction_indices=1)
            length = tf.cast(length, tf.int32)
        return length
    @lazy
    def prediction(self):
        with tf.name_scope("recurrent_layer"):
            output, _ = tf.nn.dynamic_rnn(
                self.params.rnn_cell(self.params.rnn_hidden),
                self.data,
                dtype=tf.float32,
                sequence_length=self.length
            )
            last = self._last_relevant(output, self.length)
        with tf.name_scope("softmax_layer"):
            num_classes = int(self.target.get_shape()[1])
            weight = tf.Variable(tf.truncated_normal(
                [self.params.rnn_hidden, num_classes], stddev=0.01))
            bias = tf.Variable(tf.constant(0.1, shape=[num_classes]))
            prediction = tf.nn.softmax(tf.matmul(last, weight) + bias)
        return prediction
    @lazy
    def cost(self):
        # summed cross-entropy; the clip keeps log() away from zero
        cross_entropy = -tf.reduce_sum(
            self.target * tf.log(tf.clip_by_value(self.prediction, 1e-10, 1.0)))
        return cross_entropy
    @lazy
    def error(self):
        self.mistakes = tf.not_equal(
            tf.argmax(self.target, 1), tf.argmax(self.prediction, 1))
        return tf.reduce_mean(tf.cast(self.mistakes, tf.float32))
    @lazy
    def optimize(self):
        # clip gradients to bound the size of each weight update
        with tf.name_scope("optimization"):
            gradient = self.params.optimizer.compute_gradients(self.cost)
            if self.params.gradient_clipping:
                limit = self.params.gradient_clipping
                gradient = [
                    (tf.clip_by_value(g, -limit, limit), v)
                    if g is not None else (None, v)
                    for g, v in gradient]
            optimize = self.params.optimizer.apply_gradients(gradient)
        return optimize
    @staticmethod
    def _last_relevant(output, length):
        # pick the RNN output at the last valid timestep of each sequence
        with tf.name_scope("last_relevant"):
            batch_size = tf.shape(output)[0]
            max_length = int(output.get_shape()[1])
            output_size = int(output.get_shape()[2])
            index = tf.range(0, batch_size) * max_length + (length - 1)
            flat = tf.reshape(output, [-1, output_size])
            relevant = tf.gather(flat, index)
        return relevant
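The `_last_relevant` gather trick is easiest to see on a toy example. A minimal NumPy illustration of the same index arithmetic (not part of the original notebook; the shapes are made up):
In [ ]:
# Flatten (batch, time, units) to (batch * time, units); then row
# batch_index * max_length + (length - 1) is each sequence's last output.
toy = np.arange(2 * 3 * 4).reshape(2, 3, 4)  # batch=2, time=3, units=4
lengths = np.array([2, 3])                   # valid timesteps per sequence
index = np.arange(2) * 3 + (lengths - 1)
toy.reshape(-1, 4)[index]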
In [15]:
params = AttrDict(
    rnn_cell=tf.contrib.rnn.GRUCell,
    rnn_hidden=300,
    optimizer=tf.train.AdamOptimizer(0.002),
    batch_size=BATCH_SIZE,
    gradient_clipping=100,
    seq_length=MAX_PROTEIN_LEN // 3,
    embed_length=EMBED_LEN
)
In [ ]:
tf.reset_default_graph()
model = SequenceClassificationModel(params)
In [18]:
gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.3)
gpu_config = tf.ConfigProto(gpu_options=gpu_options)
NUM_STEPS = 1000
with tf.Session(config=gpu_config) as sess:
    sess.run(tf.global_variables_initializer())
    for index, batch in enumerate(batches(NUM_STEPS)):
        feed = {model.data: batch[0], model.target: batch[1]}
        error, _ = sess.run([model.error, model.optimize], feed)
        if index % 50 == 0:
            print('Accuracy on step {}: {:3.1f}%'.format(index + 1, 100 * (1 - error)))
    # Testing
    # I run the model on one sample at a time because larger test batches ran out of memory.
    sum_error = 0
    for i in range(len(test_ids)):
        if i % 1000 == 0:
            print(i, end='_')
        cur_id = [test_ids[i]]
        test_proteins = [embedding(p) for p in all_proteins[cur_id]]
        test_fam_ids = [family2num[f] for f in all_families[cur_id]]
        test_families = np.eye(NUM_CLASSES)[test_fam_ids]
        feed = {model.data: test_proteins, model.target: test_families}
        error = sess.run(model.error, feed)
        sum_error += error
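The per-sample test loop above is memory-safe but slow. A hedged alternative sketch that evaluates in small fixed-size chunks (it would have to run inside the session above; EVAL_BATCH is a made-up knob):
In [ ]:
EVAL_BATCH = 32
def test_error(sess):
    # Mean per-chunk error; the smaller final chunk is weighted
    # equally for simplicity.
    errors = []
    for start in range(0, len(test_ids), EVAL_BATCH):
        chunk = test_ids[start:start + EVAL_BATCH]
        proteins = [embedding(p) for p in all_proteins[chunk]]
        fam_ids = [family2num[f] for f in all_families[chunk]]
        families = np.eye(NUM_CLASSES)[fam_ids]
        errors.append(sess.run(model.error,
                               {model.data: proteins, model.target: families}))
    return np.mean(errors)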
In [20]:
print('Accuracy on test set: {:3.1f}%'.format(100 * (1 - sum_error/len(test_ids))))