Protein Family Classification


In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from lazy import lazy
from attrdict import AttrDict
from collections import Counter

import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import random

In [2]:
family_classification_metadata = pd.read_table('../seminar_5/data/family_classification_metadata.tab')
family_classification_sequences = pd.read_table('../seminar_5/data/family_classification_sequences.tab')

In [3]:
family_classification_metadata.head()


Out[3]:
SwissProtAccessionID LongID ProteinName FamilyID FamilyDescription
0 Q6GZX4 001R_FRG3G Putative transcription factor 001R Pox_VLTF3 Poxvirus Late Transcription Factor VLTF3 like
1 Q6GZX3 002L_FRG3G Uncharacterized protein 002L DUF230 Poxvirus proteins of unknown function
2 Q6GZX0 005R_FRG3G Uncharacterized protein 005R US22 US22 like
3 Q91G88 006L_IIV6 Putative KilA-N domain-containing protein 006L DUF3627 Protein of unknown function (DUF3627)
4 Q197F3 007R_IIV3 Uncharacterized protein 007R DUF2738 Protein of unknown function (DUF2738)

In [4]:
family_classification_sequences.head()


Out[4]:
Sequences
0 MAFSAEDVLKEYDRRRRMEALLLSLYYPNDRKLLDYKEWSPPRVQV...
1 MSIIGATRLQNDKSDTYSAGPCYAGGCSAFTPRGTCGKDWDLGEQT...
2 MQNPLPEVMSPEHDKRTTTPMSKEANKFIRELDKKPGDLAVVSDFV...
3 MDSLNEVCYEQIKGTFYKGLFGDFPLIVDKKTGCFNATKLCVLGGK...
4 MEAKNITIDNTTYNFFKFYNINQPLTNLKYLNSERLCFSNAVMGKI...

Task:

Use your ProtVec embedding from homework 5 to perform protein family classification with an RNN.

  • use the 1000 most frequent families for classification
  • validate your results on a train-test split
  • reduce the dimensionality of the protein space using Stochastic Neighbor Embedding and visualize the two most frequent classes (see the sketch at the end of this notebook)
  • compare your RNN results with an SVM (also sketched at the end)

Let's read the embedding matrix from the data published with the original ProtVec article.


In [5]:
table = pd.read_csv('data/protVec_100d_3grams_without_quotes.csv', sep='\t', header=None)
table = table.T
header = table.iloc[0] # grab the first row for the header
prot2vec = table[1:] # take the data less the header row
prot2vec.columns = header # set the header row as the df header
prot2vec["AAA"].head()


Out[5]:
1    -0.17406
2   -0.095756
3    0.059515
4    0.039673
5   -0.375934
Name: AAA, dtype: object
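
Note the object dtype above: the embedding values are parsed as strings. An optional one-time conversion (an added step, assuming all remaining rows are numeric) lets the lookups below return numeric vectors directly:


In [ ]:
prot2vec = prot2vec.astype(np.float32)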

Let's find the 1000 most frequent families.


In [ ]:
most_common_families = Counter(family_classification_metadata['FamilyID']).most_common(1000)
most_common_families = [family for (family, count) in most_common_families]
family2num = {f: i for (i, f) in enumerate(most_common_families)}

Let's determine the maximum protein length we should consider.


In [7]:
seq_lens = [len(i) for i in family_classification_sequences["Sequences"]]
sns.distplot(seq_lens);



In [8]:
sns.distplot(seq_lens, bins=100).set(xlim=(0, 5000));



In [9]:
MAX_PROTEIN_LEN = 501
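
The cutoff of 501 is a multiple of 3, so a protein splits evenly into at most 167 non-overlapping 3-grams. As a quick check (an added cell, not part of the original run), the fraction of proteins that survive the cutoff:


In [ ]:
np.mean(np.array(seq_lens) <= MAX_PROTEIN_LEN)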

Now we'll split the data and set up batch generation.


In [10]:
all_proteins = family_classification_sequences['Sequences']
all_families = family_classification_metadata['FamilyID']

selected_ids = [i for i in range(len(all_proteins)) 
                  if all_families[i] in family2num and len(all_proteins[i]) <= MAX_PROTEIN_LEN]

random.shuffle(selected_ids)

train_ratio = 0.9
num_train = int(len(selected_ids) * train_ratio)

train_ids = selected_ids[:num_train]
test_ids = selected_ids[num_train:]
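
As a sanity check (not in the original run), the sizes of the two splits:


In [ ]:
print('train: {}, test: {}'.format(len(train_ids), len(test_ids)))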

In [11]:
EMBED_LEN = 100
NUM_CLASSES = 1000
BATCH_SIZE = 128

In [12]:
def embedding(protein):
    # rows are the protein's non-overlapping 3-grams; unused rows stay zero-padded
    res = np.zeros((MAX_PROTEIN_LEN // 3, EMBED_LEN))
    # note: range((len(protein) - 3) // 3) would drop the last complete 3-gram
    for i in range(len(protein) // 3):
        try:
            res[i] = prot2vec[protein[i*3: i*3 + 3]]
        except KeyError:
            # 3-grams absent from the ProtVec vocabulary fall back to <unk>
            res[i] = prot2vec['<unk>']

    return res
    
embedding(all_proteins[selected_ids[0]])


Out[12]:
array([[-0.098772, -0.131638, -0.145519, ..., -0.065224, -0.037585,
        -0.187173],
       [ 0.01193 , -0.034541, -0.023654, ..., -0.081972,  0.061456,
         0.13207 ],
       [-0.048559,  0.017829,  0.043389, ...,  0.038055,  0.13303 ,
         0.111015],
       ..., 
       [ 0.      ,  0.      ,  0.      , ...,  0.      ,  0.      ,  0.      ],
       [ 0.      ,  0.      ,  0.      , ...,  0.      ,  0.      ,  0.      ],
       [ 0.      ,  0.      ,  0.      , ...,  0.      ,  0.      ,  0.      ]])

In [13]:
def batches(steps):
    for i in range(steps):
        # draw a fresh random batch of training proteins at every step
        cur_ids = random.sample(train_ids, BATCH_SIZE)
        
        cur_proteins = [embedding(p) for p in all_proteins[cur_ids]]
        
        # one-hot encode the family labels
        fam_ids = [family2num[f] for f in all_families[cur_ids]]
        cur_families = np.eye(NUM_CLASSES)[fam_ids]
        
        yield cur_proteins, cur_families
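
As a quick shape check (an added cell, not in the original run), one batch can be drawn from the generator:


In [ ]:
sample_proteins, sample_families = next(batches(1))
print(len(sample_proteins), sample_proteins[0].shape, sample_families.shape)
# expected: 128 (167, 100) (128, 1000)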

Model


In [14]:
class SequenceClassificationModel:
    
    def __init__(self, params):
        self.params = params
        self._create_placeholders()
        # touch the lazy properties so the whole graph is built at construction time
        self.prediction
        self.cost
        self.error
        self.optimize
        #self._create_summaries()
    
    def _create_placeholders(self):
        with tf.name_scope("data"):
            self.data = tf.placeholder(tf.float32, [None, self.params.seq_length, self.params.embed_length])
            self.target = tf.placeholder(tf.float32, [None, NUM_CLASSES])  # one-hot family labels

    def _create_summaries(self):
        with tf.name_scope("summaries"):
            tf.summary.scalar('loss', self.cost)
            tf.summary.scalar('error', self.error)
            self.summary = tf.summary.merge_all()
            saver = tf.train.Saver()     
            
    @lazy
    def length(self):
        # lengths of sequences in the current data batch
        with tf.name_scope("seq_length"):
            used = tf.sign(tf.reduce_max(tf.abs(self.data), axis=2))
            length = tf.reduce_sum(used, axis=1)
            length = tf.cast(length, tf.int32)
        return length
    
    @lazy
    def prediction(self):
        with tf.name_scope("recurrent_layer"):
            output, _ = tf.nn.dynamic_rnn(
                self.params.rnn_cell(self.params.rnn_hidden),
                self.data,
                dtype=tf.float32,
                sequence_length=self.length
            )
        last = self._last_relevant(output, self.length)

        with tf.name_scope("softmax_layer"):
            num_classes = int(self.target.get_shape()[1])
            weight = tf.Variable(tf.truncated_normal(
                [self.params.rnn_hidden, num_classes], stddev=0.01))
            bias = tf.Variable(tf.constant(0.1, shape=[num_classes]))
            prediction = tf.nn.softmax(tf.matmul(last, weight) + bias)
        return prediction
    
    @lazy
    def cost(self):
        cross_entropy = -tf.reduce_sum(self.target * tf.log(self.prediction))
        return cross_entropy
    
    @lazy
    def error(self):
        self.mistakes = tf.not_equal(
            tf.argmax(self.target, 1), tf.argmax(self.prediction, 1))
        return tf.reduce_mean(tf.cast(self.mistakes, tf.float32))
    
    @lazy
    def optimize(self):
        # clip gradients to limit the magnitude of weight updates
        with tf.name_scope("optimization"):
            gradient = self.params.optimizer.compute_gradients(self.cost)
            if self.params.gradient_clipping:
                limit = self.params.gradient_clipping
                gradient = [
                    (tf.clip_by_value(g, -limit, limit), v)
                    if g is not None else (None, v)
                    for g, v in gradient]
            optimize = self.params.optimizer.apply_gradients(gradient)
        return optimize
    
    @staticmethod
    def _last_relevant(output, length):
        with tf.name_scope("last_relevant"):
            batch_size = tf.shape(output)[0]
            max_length = int(output.get_shape()[1])
            output_size = int(output.get_shape()[2])
            index = tf.range(0, batch_size) * max_length + (length - 1)
            flat = tf.reshape(output, [-1, output_size])
            relevant = tf.gather(flat, index)
        return relevant
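
A note on _last_relevant: dynamic_rnn zero-pads outputs past each sequence's end, so the output at the final timestep is not the last valid state. The helper flattens the (batch, time, features) tensor and gathers row b * max_length + (length_b - 1) for each sequence b. A tiny NumPy illustration of the same indexing (a sketch, not part of the model):


In [ ]:
out = np.arange(6).reshape(2, 3, 1)      # batch=2, time=3, features=1
lengths = np.array([2, 3])               # valid length of each sequence
flat = out.reshape(-1, 1)                # (batch * time, features)
idx = np.arange(2) * 3 + (lengths - 1)   # rows of the last valid timesteps
flat[idx]                                # -> [[1], [5]]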

In [15]:
params = AttrDict(
    rnn_cell=tf.contrib.rnn.GRUCell,
    rnn_hidden=300,
    optimizer=tf.train.AdamOptimizer(0.002),
    batch_size=BATCH_SIZE,
    gradient_clipping=100,
    seq_length=MAX_PROTEIN_LEN//3,
    embed_length=EMBED_LEN
)

In [ ]:
tf.reset_default_graph()
model = SequenceClassificationModel(params)

Training and testing


In [18]:
gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.3)
gpu_config = tf.ConfigProto(gpu_options=gpu_options)

NUM_STEPS = 1000

with tf.Session(config=gpu_config) as sess:

    sess.run(tf.global_variables_initializer())
    
    for index, batch in enumerate(batches(NUM_STEPS)):
        feed = {model.data: batch[0], model.target: batch[1]}
        error, _ = sess.run([model.error, model.optimize], feed)
        
        if index % 50 == 0:
            print('Accuracy on step {}: {:3.1f}%'.format(index + 1, 100 * (1 - error)))
            
    # Testing
    # Run the model on one sample at a time; larger test batches ran into memory problems.
    sum_error = 0
    for i in range(len(test_ids)):
        if i % 1000 == 0:
            print(i, end='_')
        cur_id = [test_ids[i]]
        test_proteins = [embedding(p) for p in all_proteins[cur_id]]

        test_fam_ids = [family2num[f] for f in all_families[cur_id]]
        test_families = np.eye(NUM_CLASSES)[test_fam_ids]

        feed = {model.data: test_proteins, model.target: test_families}
        error = sess.run(model.error, feed)
        sum_error += error


Accuracy on step 1: 0.0%
Accuracy on step 51: 0.0%
Accuracy on step 101: 1.6%
Accuracy on step 151: 7.8%
Accuracy on step 201: 10.2%
Accuracy on step 251: 9.4%
Accuracy on step 301: 18.0%
Accuracy on step 351: 23.4%
Accuracy on step 401: 22.7%
Accuracy on step 451: 28.9%
Accuracy on step 501: 39.1%
Accuracy on step 551: 45.3%
Accuracy on step 601: 43.0%
Accuracy on step 651: 59.4%
Accuracy on step 701: 64.8%
Accuracy on step 751: 64.1%
Accuracy on step 801: 62.5%
Accuracy on step 851: 66.4%
Accuracy on step 901: 74.2%
Accuracy on step 951: 65.6%
0_1000_2000_3000_4000_5000_6000_7000_8000_9000_10000_11000_12000_13000_14000_15000_16000_17000_18000_19000_

In [20]:
print('Accuracy on test set: {:3.1f}%'.format(100 * (1 - sum_error/len(test_ids))))


Accuracy on test set: 70.5%
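
The task also asks to visualize the two most frequent classes with Stochastic Neighbor Embedding; that step is not carried out above. A minimal sketch, assuming each protein is represented by mean-pooling its 3-gram vectors (the mean_embedding helper is a hypothetical addition, not from the original notebook; t-SNE may take a while on this many points):


In [ ]:
from sklearn.manifold import TSNE

# hypothetical helper: average the non-overlapping 3-gram vectors
# into a single 100-dimensional representation per protein
def mean_embedding(protein):
    emb = embedding(protein)
    return emb[:max(len(protein) // 3, 1)].mean(axis=0)

# the two most frequent families have indices 0 and 1 in family2num
top2_ids = [i for i in selected_ids if family2num.get(all_families[i], NUM_CLASSES) < 2]

X = np.array([mean_embedding(all_proteins[i]) for i in top2_ids])
y = np.array([family2num[all_families[i]] for i in top2_ids])

X_2d = TSNE(n_components=2).fit_transform(X)
for label in (0, 1):
    mask = y == label
    plt.scatter(X_2d[mask, 0], X_2d[mask, 1], s=5, label=most_common_families[label])
plt.legend();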

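For the SVM comparison (also left undone above), a linear SVM on the same mean-pooled vectors is one reasonable baseline. LinearSVC is used here (an assumption) because a kernel SVM scales poorly to this many samples:


In [ ]:
from sklearn.svm import LinearSVC

X_train = np.array([mean_embedding(all_proteins[i]) for i in train_ids])
y_train = np.array([family2num[all_families[i]] for i in train_ids])
X_test = np.array([mean_embedding(all_proteins[i]) for i in test_ids])
y_test = np.array([family2num[all_families[i]] for i in test_ids])

svm = LinearSVC()
svm.fit(X_train, y_train)
print('SVM accuracy on test set: {:3.1f}%'.format(100 * svm.score(X_test, y_test)))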