In [ ]:
    
import numpy as np
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.neighbors import NearestCentroid
import random
import pickle
    
In [ ]:
    
family_classification_metadata = pd.read_table('../seminar_5/data/family_classification_metadata.tab')
family_classification_sequences = pd.read_table('../seminar_5/data/family_classification_sequences.tab')
    
In [ ]:
    
table = pd.read_csv('data/protVec_100d_3grams_without_quotes.csv', sep='\t', header=None)
table = table.T
header = table.iloc[0] # grab the first row for the header
prot2vec = table[1:] # take the data less the header row
prot2vec.columns = header # set the header row as the df header
    
In [ ]:
    
most_common_families = Counter(family_classification_metadata['FamilyID']).most_common(1000)
most_common_families = [family for (family, count) in most_common_families]
family2num = {f: i for (i, f) in enumerate(most_common_families)}
    
In [ ]:
    
MAX_PROTEIN_LEN = 501
EMBED_LEN = 100
    
In [ ]:
    
all_proteins = family_classification_sequences['Sequences']
all_families = family_classification_metadata['FamilyID']
selected_ids = [i for i in range(len(all_proteins)) 
                  if all_families[i] in family2num and len(all_proteins[i]) <= MAX_PROTEIN_LEN]
random.shuffle(selected_ids)
train_ratio = 0.9
num_train = int(len(selected_ids) * train_ratio)
train_ids = selected_ids[:num_train]
test_ids = selected_ids[num_train:]
    
In [ ]:
    
def embedding(protein):
    res = np.zeros(100)
    for i in range(0, (len(protein) - 3) // 3):
        try:
            res = np.add(res, prot2vec[protein[i*3: i*3 + 3]])
        except KeyError:
            res = np.add(res, prot2vec['<unk>'])
    return np.divide(res, ((len(protein) - 3) // 3))
#embedding(all_proteins[11])
    
In [ ]:
    
X_train = []
for i in range(len(train_ids)):
    #if i % 2000 == 0:
    #    print(i)
    cur_id = train_ids[i]
    X_train.append(embedding(all_proteins[cur_id]))
    
In [ ]:
    
X_test = []
for i in range(len(test_ids)):
    #if i % 2000 == 0:
    #    print(i)
    cur_id = test_ids[i]
    X_test.append(embedding(all_proteins[cur_id]))
    
In [67]:
    
with  open('data/X_train.pickle', 'wb') as f:
    pickle.dump(X_train, f)
with  open('data/X_test.pickle', 'wb') as f:
    pickle.dump(X_test, f)
    
In [69]:
    
y_train = all_families[train_ids]
y_test = all_families[test_ids]
with  open('data/y_train.pickle', 'wb') as f:
    pickle.dump(y_train, f)
    
with  open('data/y_test.pickle', 'wb') as f:
    pickle.dump(y_test, f)
    
In [79]:
    
for shrinkage in [None, .2, 5, 10]:
    clf = NearestCentroid(shrink_threshold=shrinkage)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print('Accuracy for shinkage {}: {:3.1f}%'.format(shrinkage, np.mean(y_test == y_pred) * 100))
    
    
So my RNN model gave me 70.5% of accuracy VS 45.0% by Nearest centroid classifier.