In [2]:
import numpy as np
import pandas as pd
from collections import Counter
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.manifold import TSNE
In [3]:
# Both inputs are tab-separated files; an explicit separator on read_csv
# parses them the same way read_table does.
family_classification_metadata = pd.read_csv(
    '../seminar_5/data/family_classification_metadata.tab',
    sep='\t',
)
family_classification_sequences = pd.read_csv(
    '../seminar_5/data/family_classification_sequences.tab',
    sep='\t',
)
In [4]:
# Preview the first five rows of the family metadata table.
family_classification_metadata.head(n=5)
Out[4]:
In [5]:
# Preview the first five rows of the sequence table.
family_classification_sequences.head(n=5)
Out[5]:
Use your ProtVec embedding from homework 5 to perform protein family classification using an RNN.
In [6]:
# Load the ProtVec table. The file has no header row; after transposing, the
# first row holds the n-gram tokens (e.g. "AAA") and the remaining rows hold
# the embedding components, so each column becomes one token's vector.
table = pd.read_csv('data/protVec_100d_3grams_without_quotes.csv', sep='\t', header=None).T
header = table.iloc[0]  # token names, used as column labels below

# Work on an explicit copy so that relabelling the columns never touches
# (or warns about) the underlying `table` frame.
prot2vec = table.iloc[1:].copy()
prot2vec.columns = header

# Transposing mixed the string tokens with the numbers, which presumably
# leaves every column as generic Python objects; cast back to float so that
# downstream arithmetic is numeric. A non-numeric cell raises here instead of
# silently corrupting the embeddings later.
prot2vec = prot2vec.astype(float)

prot2vec["AAA"].head()
Out[6]:
The two most frequent families:
In [7]:
# Count how often each FamilyID occurs and keep the two most frequent ones,
# then give each kept family a consecutive integer label (0, 1).
family_counts = Counter(family_classification_metadata['FamilyID'])
most_common_families = [name for name, _ in family_counts.most_common(2)]
family2num = dict(zip(most_common_families, range(len(most_common_families))))
family2num
Out[7]:
In [8]:
# Length of one embedding vector (the ProtVec file name says "100d").
EMBED_LEN = 100
# Inclusive upper bound on sequence length; longer proteins are filtered out
# when the sequences are selected.
MAX_PROTEIN_LEN = 501
In [9]:
all_families = family_classification_metadata['FamilyID']
all_proteins = family_classification_sequences['Sequences']

# Row positions of proteins that belong to one of the selected families and
# are no longer than MAX_PROTEIN_LEN. Both columns are walked in parallel;
# this assumes the two tables are row-aligned with the default 0..n-1 index.
selected_ids = [
    row
    for row, (sequence, family) in enumerate(zip(all_proteins, all_families))
    if family in family2num and len(sequence) <= MAX_PROTEIN_LEN
]
In [13]:
def embedding(protein, vectors=None, ngram=3, embed_len=None):
    """Return the mean embedding of a protein's non-overlapping n-grams.

    The sequence is cut into consecutive, non-overlapping chunks of ``ngram``
    residues starting at position 0; an incomplete trailing chunk is ignored.
    Every chunk is looked up in ``vectors`` and chunks that are not present
    fall back to the ``'<unk>'`` entry. The result is the arithmetic mean of
    all looked-up vectors.

    Parameters
    ----------
    protein : str
        Amino-acid sequence.
    vectors : mapping or pandas.DataFrame, optional
        Lookup table supporting ``vectors[key]`` and raising ``KeyError`` for
        unknown keys. Defaults to the notebook-level ``prot2vec`` table.
    ngram : int, default 3
        Chunk length.
    embed_len : int, optional
        Length of one embedding vector. Defaults to the notebook-level
        ``EMBED_LEN`` constant.

    Returns
    -------
    numpy.ndarray
        Float vector of length ``embed_len``. All zeros when the sequence
        does not contain a single complete chunk.

    Raises
    ------
    KeyError
        If a chunk is unknown and ``vectors`` has no ``'<unk>'`` entry.
    """
    if vectors is None:
        vectors = prot2vec
    if embed_len is None:
        embed_len = EMBED_LEN

    # Count every complete chunk, including the last one, and use the same
    # count as the divisor so the mean is always over what was summed.
    n_chunks = len(protein) // ngram
    res = np.zeros(embed_len)
    if n_chunks == 0:
        # Too short for even one chunk: avoid dividing by zero.
        return res

    for start in range(0, n_chunks * ngram, ngram):
        chunk = protein[start:start + ngram]
        try:
            vec = vectors[chunk]
        except KeyError:
            vec = vectors['<unk>']
        # Coerce to a plain float array so the accumulation does not depend
        # on the container type or dtype of the lookup table.
        res += np.asarray(vec, dtype=float)
    return res / n_chunks
In [14]:
# One averaged embedding vector per selected protein, in selection order.
selected_proteins = list(map(embedding, all_proteins.loc[selected_ids]))
In [15]:
# Project the protein embeddings to two dimensions for plotting.
# NOTE(review): recent scikit-learn releases rename `n_iter` to `max_iter`;
# confirm against the installed version before upgrading.
tsne = TSNE(
    n_components=2,
    init='pca',
    angle=0.7,
    n_iter=500,
    random_state=42,
)
XX = tsne.fit_transform(selected_proteins)
In [16]:
# Name the two t-SNE output columns (n_components=2) for plotting.
tsne_df = pd.DataFrame({'x0': XX[:, 0], 'x1': XX[:, 1]})
In [17]:
# Scatter plot of the 2-D t-SNE projection, one colour per protein family.
# Each family is drawn as its own labelled series so the legend can map
# colours back to FamilyIDs, and the figure carries a title and axis labels.
fig, ax = plt.subplots(figsize=(10, 10))
colors = ['red', 'blue']

# Family of every plotted point, in the same order as the rows of tsne_df.
selected_families = all_families[selected_ids].values

for family, num in family2num.items():
    in_family = selected_families == family
    ax.scatter(
        tsne_df.loc[in_family, 'x0'],
        tsne_df.loc[in_family, 'x1'],
        color=colors[num],
        s=20,
        label=str(family),
    )

ax.set(
    title='t-SNE of mean ProtVec embeddings (two most frequent families)',
    xlabel='x0',
    ylabel='x1',
)
ax.legend(title='FamilyID');
In [ ]: