Protein Family Classification


In [ ]:
import numpy as np
import pandas as pd
from collections import Counter

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.neighbors import NearestCentroid
import random
import pickle

In [ ]:
# Load the family labels (metadata) and the raw amino-acid sequences.
# The two tables are row-aligned: row i of the metadata describes row i
# of the sequences, which later cells rely on when indexing by position.
family_classification_metadata = pd.read_csv('../seminar_5/data/family_classification_metadata.tab', sep='\t')
family_classification_sequences = pd.read_csv('../seminar_5/data/family_classification_sequences.tab', sep='\t')

In [ ]:
# Load the pre-trained ProtVec table: each row of the raw file holds a
# 3-gram string followed by its 100-d embedding (tab-separated, no header).
table = pd.read_csv('data/protVec_100d_3grams_without_quotes.csv', sep='\t', header=None)
# Transpose so each 3-gram becomes a column; row 0 now holds the 3-gram strings.
table = table.T
header = table.iloc[0] # grab the first row for the header
prot2vec = table[1:] # take the data less the header row
prot2vec.columns = header # set the header row as the df header
# After this, prot2vec['XYZ'] yields the 100-d vector for 3-gram 'XYZ';
# the vocabulary presumably includes an '<unk>' column used as a fallback
# in embedding() below — TODO confirm against the file contents.

In [ ]:
# Restrict the task to the 1000 most frequent protein families and
# assign each of them a dense integer id.
family_counts = Counter(family_classification_metadata['FamilyID'])
most_common_families = [family for family, _count in family_counts.most_common(1000)]
family2num = {family: idx for idx, family in enumerate(most_common_families)}

In [ ]:
# Longest sequence kept for training; longer proteins are filtered out below.
MAX_PROTEIN_LEN = 501
# Dimensionality of the ProtVec vectors (the table file is 100-d per 3-gram).
EMBED_LEN = 100

In [ ]:
all_proteins = family_classification_sequences['Sequences']
all_families = family_classification_metadata['FamilyID']

# Keep only proteins that (a) belong to one of the 1000 most common
# families and (b) are short enough for the downstream model.
selected_ids = [i for i in range(len(all_proteins))
                  if all_families[i] in family2num and len(all_proteins[i]) <= MAX_PROTEIN_LEN]

# Fix: seed the RNG so the train/test split is reproducible across runs
# (previously every re-run produced a different split).
random.seed(42)
random.shuffle(selected_ids)

# 90/10 train/test split on the shuffled ids.
train_ratio = 0.9
num_train = int(len(selected_ids) * train_ratio)

train_ids = selected_ids[:num_train]
test_ids = selected_ids[num_train:]

In [ ]:
def embedding(protein, vec_table=None):
    """Embed a protein as the mean ProtVec vector of its non-overlapping 3-grams.

    Parameters
    ----------
    protein : str
        Amino-acid sequence.
    vec_table : mapping, optional
        Maps a 3-gram string to its 100-d vector and must contain an
        '<unk>' entry for out-of-vocabulary 3-grams.  Defaults to the
        notebook-level ``prot2vec`` table (backward compatible).

    Returns
    -------
    A 100-d vector: the average embedding of the 3-grams, or all zeros
    when the sequence is too short to contain a full 3-gram window.
    """
    if vec_table is None:
        vec_table = prot2vec

    # Number of non-overlapping 3-grams considered (same count the original
    # loop used; trailing residues beyond the last full window are ignored).
    n_grams = (len(protein) - 3) // 3

    # Bug fix: for sequences shorter than 6 residues n_grams is <= 0, and
    # the original code divided by zero; return the zero vector instead.
    if n_grams <= 0:
        return np.zeros(100)

    res = np.zeros(100)
    for i in range(n_grams):
        gram = protein[i * 3: i * 3 + 3]
        try:
            res = np.add(res, vec_table[gram])
        except KeyError:
            # Out-of-vocabulary 3-gram: fall back to the '<unk>' vector.
            res = np.add(res, vec_table['<unk>'])

    return np.divide(res, n_grams)

In [ ]:
# Embed every training protein (this is the slow part of the pipeline).
X_train = [embedding(all_proteins[cur_id]) for cur_id in train_ids]

In [ ]:
# Embed every test protein the same way as the training set.
X_test = [embedding(all_proteins[cur_id]) for cur_id in test_ids]

In [67]:
# Persist the embedded feature matrices so they can be reloaded later
# without recomputing the (slow) embeddings.
for path, features in (('data/X_train.pickle', X_train),
                       ('data/X_test.pickle', X_test)):
    with open(path, 'wb') as f:
        pickle.dump(features, f)

In [69]:
# Labels follow the same id ordering as the corresponding feature lists.
y_train = all_families[train_ids]
y_test = all_families[test_ids]

# Persist the label vectors alongside the features.
for path, labels in (('data/y_train.pickle', y_train),
                     ('data/y_test.pickle', y_test)):
    with open(path, 'wb') as f:
        pickle.dump(labels, f)

Nearest centroid classifier

I used it as a simple baseline because it is fast to train and evaluate.


In [79]:
# Nearest-centroid baseline: sweep the shrink threshold and report test
# accuracy for each setting.
for shrinkage in [None, .2, 5, 10]:
    clf = NearestCentroid(shrink_threshold=shrinkage)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    # Fix: corrected typo "shinkage" -> "shrinkage" in the report string.
    print('Accuracy for shrinkage {}: {:3.1f}%'.format(shrinkage, np.mean(y_test == y_pred) * 100))


Accuracy for shrinkage None: 45.0%
Accuracy for shrinkage 0.2: 44.8%
Accuracy for shrinkage 5: 28.3%
Accuracy for shrinkage 10: 17.2%

So my RNN model achieved 70.5% accuracy, versus 45.0% for the nearest centroid classifier.