In [ ]:
import numpy as np
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.neighbors import NearestCentroid
import random
import pickle
In [ ]:
family_classification_metadata = pd.read_table('../seminar_5/data/family_classification_metadata.tab')
family_classification_sequences = pd.read_table('../seminar_5/data/family_classification_sequences.tab')
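A quick look at what was loaded (the column names `FamilyID` and `Sequences` are the ones used later in this notebook, and the two tables are assumed to align row-for-row):
In [ ]:
# Sanity check: the metadata and sequence tables should have the same number of rows.
print(family_classification_metadata.shape, family_classification_sequences.shape)
family_classification_metadata.head()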
In [ ]:
table = pd.read_csv('data/protVec_100d_3grams_without_quotes.csv', sep='\t', header=None)
table = table.T
header = table.iloc[0] # grab the first row for the header
prot2vec = table[1:] # take the data less the header row
prot2vec.columns = header # set the header row as the df header
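As a sanity check (this assumes the published ProtVec file, which includes an `<unk>` entry), each column of `prot2vec` should now be a 100-dimensional vector keyed by its 3-gram:
In [ ]:
# Each column lookup returns the 100-dimensional embedding of one 3-gram.
print(prot2vec.shape)            # (100, vocabulary size)
print(prot2vec['<unk>'].shape)   # (100,)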
In [ ]:
most_common_families = Counter(family_classification_metadata['FamilyID']).most_common(1000)
most_common_families = [family for (family, count) in most_common_families]
family2num = {f: i for (i, f) in enumerate(most_common_families)}
In [ ]:
MAX_PROTEIN_LEN = 501
EMBED_LEN = 100
In [ ]:
all_proteins = family_classification_sequences['Sequences']
all_families = family_classification_metadata['FamilyID']
selected_ids = [i for i in range(len(all_proteins))
                if all_families[i] in family2num and len(all_proteins[i]) <= MAX_PROTEIN_LEN]
random.seed(42)  # fix the shuffle so the train/test split is reproducible across runs
random.shuffle(selected_ids)
train_ratio = 0.9
num_train = int(len(selected_ids) * train_ratio)
train_ids = selected_ids[:num_train]
test_ids = selected_ids[num_train:]
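A quick check that the split came out roughly 90/10:
In [ ]:
print(len(train_ids), len(test_ids))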
In [ ]:
def embedding(protein):
    """Average the ProtVec vectors of a protein's non-overlapping 3-grams."""
    num_ngrams = (len(protein) - 3) // 3
    res = np.zeros(EMBED_LEN)
    for i in range(num_ngrams):
        try:
            res = np.add(res, prot2vec[protein[i*3: i*3 + 3]])
        except KeyError:
            # 3-grams missing from the ProtVec vocabulary fall back to <unk>
            res = np.add(res, prot2vec['<unk>'])
    return np.divide(res, max(num_ngrams, 1))  # max() guards against division by zero on very short sequences
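A quick usage check of the function above (protein index 11 here is arbitrary):
In [ ]:
vec = embedding(all_proteins[11])
print(vec.shape)  # expected: (100,)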
In [ ]:
# Embed every training sequence; this is the slowest step of the pipeline.
X_train = [embedding(all_proteins[i]) for i in train_ids]
In [ ]:
# Same embedding for the held-out test sequences.
X_test = [embedding(all_proteins[i]) for i in test_ids]
In [67]:
with open('data/X_train.pickle', 'wb') as f:
    pickle.dump(X_train, f)
with open('data/X_test.pickle', 'wb') as f:
    pickle.dump(X_test, f)
In [69]:
y_train = all_families.iloc[train_ids]
y_test = all_families.iloc[test_ids]
with open('data/y_train.pickle', 'wb') as f:
    pickle.dump(y_train, f)
with open('data/y_test.pickle', 'wb') as f:
    pickle.dump(y_test, f)
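The cached features and labels can be restored in a later session by mirroring the dumps above:
In [ ]:
with open('data/X_train.pickle', 'rb') as f:
    X_train = pickle.load(f)
with open('data/y_train.pickle', 'rb') as f:
    y_train = pickle.load(f)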
In [79]:
for shrinkage in [None, .2, 5, 10]:
    clf = NearestCentroid(shrink_threshold=shrinkage)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print('Accuracy for shrinkage {}: {:3.1f}%'.format(shrinkage, np.mean(y_test == y_pred) * 100))
So my RNN model reached 70.5% accuracy, versus 45.0% for the nearest-centroid classifier.
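To see where the centroid baseline falls short, here is a minimal follow-up sketch (hypothetical, assuming `y_test` and `y_pred` from the last run above) that reports accuracy separately for the ten most frequent test families:
In [ ]:
# Hypothetical follow-up: per-family accuracy for the ten largest test families.
results = pd.DataFrame({'family': np.asarray(y_test),
                        'correct': np.asarray(y_test) == y_pred})
top10 = results['family'].value_counts().index[:10]
print(results.groupby('family')['correct'].mean()[top10])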