In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import glob
from small_script.myFunctions import *
import feather
import Bio.PDB as bio
from sklearn.metrics import confusion_matrix
d3_to_index = bio.Polypeptide.d3_to_index # we may want to adjust this in the future.
three_to_one = bio.Polypeptide.three_to_one
one_to_index = bio.Polypeptide.one_to_index
plt.rcParams['figure.figsize'] = [16.18033, 10]
%matplotlib inline
%load_ext autoreload
%autoreload 2
In [ ]:
def getFragPdb(pdbId, i, outFile=None):
    """Write residues i..i+8 of the given domain pdb as a standalone 9-residue fragment."""
    pdb = pdbId + ".pdb"
    if outFile is None:
        outFile = f"{i}_{pdb}"
    # pdb = "1igqB00.pdb"
    # pdbId = pdb.split('.')[0]
    pre = "/Users/weilu/Research/optimization/fragment/"
    database = "/Users/weilu/Research/optimization/fragment/database/dompdb/"
    parser = bio.PDBParser(QUIET=True)
    structure = parser.get_structure("x", os.path.join(database, pdb))
    for model in structure:
        for chain in model:
            all_residues = list(chain)
    io = bio.PDBIO()
    c = bio.Chain.Chain("A")
    c.child_list = all_residues[i:i+9]
    # for ii, res in enumerate(c):
    #     res.id = (' ', ii+1, ' ')
    io.set_structure(c)
    io.save(f'{pre}{outFile}')
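For example, the call below would write a 9-residue fragment starting at residue index 10 of domain 1igqB00 (the domain name comes from the commented-out lines above; the index and output name are arbitrary):

In [ ]:
getFragPdb("1igqB00", 10, outFile="example_fragment.pdb")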
In [130]:
data_original = feather.read_dataframe("/Users/weilu/Research/optimization/fragment/cluster100.feather")
In [178]:
t = data_original.query("cluster < 5").groupby("cluster").apply(pd.DataFrame.sample, 4)
In [181]:
for i, row in t.reset_index(drop=True).iterrows():
    print(i, row["pdb"], row["i"], row["cluster"])
    getFragPdb(row["pdb"], int(row["i"]), f"cluster0to4/{i}_cluster_{row['cluster']}.pdb")
In [191]:
# compute pairwise RMSD among the fragments sampled from clusters 0-4
pre = "/Users/weilu/Research/optimization/fragment/"
pdbList = glob.glob(f"{pre}cluster0to4/[0-9]*.pdb")
with open(pre+"cluster0to4_rmsd.csv", "w") as out:
    out.write("i,j,rmsd\n")
    for p1 in pdbList:
        print(p1)
        i1 = p1.split("/")[-1].split(".")[0]
        # if i1 != 0:
        #     continue
        print(i1)
        for p2 in pdbList:
            i2 = p2.split("/")[-1].split(".")[0]
            rmsd = float(getFromTerminal(f"calculate_rmsd.py {p1} {p2}"))
            out.write(f"{i1},{i2},{rmsd}\n")
In [192]:
cluster_rmsd = pd.read_csv(pre+"cluster0to4_rmsd.csv")
In [206]:
cluster_rmsd["rmsd"] = cluster_rmsd["rmsd"].round(3)
cluster_rmsd["ii"] = cluster_rmsd["i"].apply(lambda x: int(x.split("_")[0]))
cluster_rmsd["jj"] = cluster_rmsd["j"].apply(lambda x: int(x.split("_")[0]))
cluster_rmsd = cluster_rmsd.sort_values(["ii", "jj"])
t = cluster_rmsd.pivot(index="ii", columns="jj", values="rmsd")
In [218]:
plt.rcParams['figure.figsize'] = [16.18033, 10]
plt.imshow(t, cmap="Greys")
plt.colorbar()
Out[218]:
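An equivalent view with seaborn labels the ii/jj ticks automatically; a minimal sketch using the same pivoted table:

In [ ]:
sns.heatmap(t, cmap="Greys", square=True, cbar_kws={"label": "RMSD"})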
In [221]:
t = data_original.query("cluster < 5").groupby("cluster").head(4)
In [222]:
folder = "cluster0to4_center"
for i, row in t.reset_index(drop=True).iterrows():
    # print(i, row["pdb"], row["i"], row["cluster"])
    getFragPdb(row["pdb"], int(row["i"]), f"{folder}/{i}_cluster_{row['cluster']}.pdb")
In [223]:
# compute pairwise RMSD among the fragments closest to each cluster center
pre = "/Users/weilu/Research/optimization/fragment/"
pdbList = glob.glob(f"{pre}{folder}/[0-9]*.pdb")
with open(pre+f"{folder}_rmsd.csv", "w") as out:
    out.write("i,j,rmsd\n")
    for p1 in pdbList:
        print(p1)
        i1 = p1.split("/")[-1].split(".")[0]
        # if i1 != 0:
        #     continue
        print(i1)
        for p2 in pdbList:
            i2 = p2.split("/")[-1].split(".")[0]
            rmsd = float(getFromTerminal(f"calculate_rmsd.py {p1} {p2}"))
            out.write(f"{i1},{i2},{rmsd}\n")
In [224]:
cluster_rmsd = pd.read_csv(pre+f"{folder}_rmsd.csv")
cluster_rmsd["rmsd"] = cluster_rmsd["rmsd"].round(3)
cluster_rmsd["ii"] = cluster_rmsd["i"].apply(lambda x: int(x.split("_")[0]))
cluster_rmsd["jj"] = cluster_rmsd["j"].apply(lambda x: int(x.split("_")[0]))
cluster_rmsd = cluster_rmsd.sort_values(["ii", "jj"])
t = cluster_rmsd.pivot(index="ii", columns="jj", values="rmsd")
In [225]:
plt.rcParams['figure.figsize'] = [16.18033, 10]
plt.imshow(t, cmap="Greys")
plt.colorbar()
Out[225]:
In [238]:
data_original.shape
Out[238]:
In [239]:
data_original.head()
Out[239]:
In [219]:
data_original.query("cluster < 5")["cluster"].value_counts()
Out[219]:
In [161]:
data_original.head()
Out[161]:
In [3]:
data_original = feather.read_dataframe("/Users/weilu/Research/optimization/fragment/cluster100_v2.feather")
In [4]:
data = data_original[["pdb", "i", "seq", "cluster", "rmsd"]].reset_index(drop=True)
data["cluster"] = data["cluster"].astype(int)
# encode residue positions 1-9 of each fragment sequence as integer indices s1..s9
for i in range(1, 10):
    data[f"s{i}"] = data["seq"].apply(lambda x: one_to_index(x[i-1]))
In [5]:
data.to_feather("/Users/weilu/Research/optimization/fragment/cluster100_v2_processed.feather")
# data.to_feather("/Users/weilu/Research/optimization/fragment/cluster100_processed.feather")
In [226]:
data = feather.read_dataframe("/Users/weilu/Research/optimization/fragment/cluster100_processed.feather")
In [6]:
data = feather.read_dataframe("/Users/weilu/Research/optimization/fragment/cluster100_v2_processed.feather")
In [9]:
data.head()
Out[9]:
In [228]:
data = data.query("cluster < 2")
In [229]:
data["cluster"].value_counts()
Out[229]:
In [241]:
maxlen = 9
test = data.groupby("cluster").apply(pd.DataFrame.sample, 10000, replace=True)
# test = data.query("category > -1 and category < 10").sample(10000)
x_train = test.iloc[:, 5:14].values
y_train_value = test["cluster"].values
test = data.groupby("cluster").apply(pd.DataFrame.sample, 10000, replace=True)
# test = data.query("category > -1 and category < 10").sample(10000)
x_test = test.iloc[:, 5:14].values
y_test_value = test["cluster"].values
In [253]:
maxlen = 9
test = data.groupby("cluster").apply(pd.DataFrame.sample, 10000, replace=True)
# test = data.query("category > -1 and category < 10").sample(10000)
x_train = test.iloc[:, 5:14].values
y_train_value = test["cluster"].values
test = data.groupby("cluster").apply(pd.DataFrame.sample, 10000, replace=True)
# test = data.query("category > -1 and category < 10").sample(10000)
x_test = test.iloc[:, 5:14].values
y_test_value = test["cluster"].values
# print('Pad sequences (samples x time)')
# x_train1 = sequence.pad_sequences(x_train, maxlen=10)
# x_test1 = sequence.pad_sequences(x_test, maxlen=maxlen)
print('x_train shape:', x_train.shape)
y_train = to_categorical(np.array(y_train_value))
y_test = to_categorical(np.array(y_test_value))
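A quick shape check makes the setup explicit; a sketch of the expected output given the 2-cluster subset and 10000 samples per cluster above:

In [ ]:
print(x_train.shape, y_train.shape)   # expected: (20000, 9) (20000, 2)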
In [248]:
np.all(x_train1 == x_train)
Out[248]:
In [251]:
y_train.shape
Out[251]:
In [236]:
from keras.utils import to_categorical
max_features = 100
# cut texts after this number of words
# (among top max_features most common words)
batch_size = 1024*2
print(len(x_train), 'train sequences')
model = Sequential()
model.add(Embedding(max_features, 128, input_length=maxlen))
model.add(Bidirectional(LSTM(64)))
model.add(Dropout(0.5))
model.add(Dense(2, activation='softmax'))
# try using different optimizers and different optimizer configs
# model.compile('adam', 'binary_crossentropy', metrics=['accuracy'])
model.compile('adam', 'categorical_crossentropy', metrics=['accuracy'])
print('Train...')
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=40,
          validation_data=[x_test, y_test])
Out[236]:
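The same confusion-matrix check used for the 10-category model further down could also be applied here, to see how often the two clusters are confused on the held-out sample; a minimal sketch:

In [ ]:
y_pred = model.predict(x_test)
predicted = np.argmax(y_pred, axis=1)
confusion_matrix(y_test_value, predicted)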
In [237]:
model.summary()
In [234]:
model.summary()
In [160]:
from keras.utils import to_categorical
max_features = 20
# cut texts after this number of words
# (among top max_features most common words)
batch_size = 1024*2
print(len(x_train), 'train sequences')
model = Sequential()
model.add(Embedding(max_features, 128, input_length=maxlen))
model.add(Bidirectional(LSTM(64)))
model.add(Dropout(0.5))
model.add(Dense(100, activation='softmax'))
# try using different optimizers and different optimizer configs
# model.compile('adam', 'binary_crossentropy', metrics=['accuracy'])
model.compile('adam', 'categorical_crossentropy', metrics=['accuracy'])
print('Train...')
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=40,
          validation_data=[x_test, y_test])
Out[160]:
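With 100 clusters, plain accuracy understates how close the model gets; a hedged sketch of a top-5 accuracy check on the held-out sample (not run here):

In [ ]:
y_pred = model.predict(x_test)
top5 = np.argsort(y_pred, axis=1)[:, -5:]   # indices of the 5 highest-probability clusters per row
top5_acc = np.mean([y in row for y, row in zip(y_test_value, top5)])
print("top-5 accuracy:", top5_acc)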
In [8]:
data = feather.read_dataframe("/Users/weilu/Research/optimization/fragment/feather_cluster_data.feather")
In [15]:
data.head()
Out[15]:
In [56]:
data["dd"].value_counts()
Out[56]:
In [10]:
import numpy as np
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional
from keras.datasets import imdb
In [19]:
x_train.shape
Out[19]:
In [40]:
data.query("category == 11").shape
Out[40]:
In [59]:
data.query("category == 10 or category == 11").sample(10)
Out[59]:
In [58]:
data.query("category == 29 or category == 11").sample(10)
Out[58]:
In [55]:
data.query("category == 4 or category == 11").sample(10)
Out[55]:
In [28]:
test.head()
Out[28]:
In [102]:
data.query("category == 9").shape
Out[102]:
In [ ]:
data.sample(10, replace=True)  # quick check of sampling with replacement; sample size is arbitrary
In [117]:
test = data.query("category > -1 and category < 10").groupby("category").apply(pd.DataFrame.sample, 10000, replace=True)
# test = data.query("category > -1 and category < 10").sample(10000)
x_train = test.iloc[:, 6:15].values
y_train_value = test["category"].values
test = data.query("category > -1 and category < 10").groupby("category").apply(pd.DataFrame.sample, 10000, replace=True)
# test = data.query("category > -1 and category < 10").sample(10000)
x_test = test.iloc[:, 6:15].values
y_test_value = test["category"].values
# print('Pad sequences (samples x time)')
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
print('x_train shape:', x_train.shape)
y_train = to_categorical(np.array(y_train_value))
y_test = to_categorical(np.array(y_test_value))
In [118]:
y_train.sum(axis=0)
Out[118]:
In [119]:
from keras.utils import to_categorical
max_features = 20000
# cut texts after this number of words
# (among top max_features most common words)
maxlen = 9
batch_size = 32
print(len(x_train), 'train sequences')
model = Sequential()
model.add(Embedding(max_features, 128, input_length=maxlen))
model.add(Bidirectional(LSTM(64)))
model.add(Dropout(0.5))
model.add(Dense(10, activation='softmax'))
# try using different optimizers and different optimizer configs
# model.compile('adam', 'binary_crossentropy', metrics=['accuracy'])
model.compile('adam', 'categorical_crossentropy', metrics=['accuracy'])
print('Train...')
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=40,
          validation_data=[x_test, y_test])
Out[119]:
In [121]:
model.fit(x_train, y_train,
          batch_size=10240,
          epochs=2,
          validation_data=[x_test, y_test])
Out[121]:
In [122]:
y_pred = model.predict(x_test)
predicted = np.argmax(y_pred, axis=1)
confusion_matrix(y_test_value, predicted)
Out[122]:
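Row-normalizing the confusion matrix turns the raw counts into per-category recall, which is easier to read as an image; a minimal sketch:

In [ ]:
cm = confusion_matrix(y_test_value, predicted)
cm_norm = cm / cm.sum(axis=1, keepdims=True)   # each row sums to 1
plt.imshow(cm_norm, cmap="Greys")
plt.colorbar()
plt.xlabel("predicted category")
plt.ylabel("true category")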
In [123]:
y_pred = model.predict(x_test)
In [108]:
predicted = np.argmax(y_pred, axis=1)
In [110]:
confusion_matrix(y_test_value, predicted)
Out[110]:
In [79]:
y_test[-1]
Out[79]:
In [96]:
y_test.sum(axis=0)
Out[96]:
In [72]:
y_test
Out[72]:
In [48]:
max_features = 20000
# cut texts after this number of words
# (among top max_features most common words)
maxlen = 9
batch_size = 32
print(len(x_train), 'train sequences')
print('Pad sequences (samples x time)')
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
print('x_train shape:', x_train.shape)
y_train = np.array(y_train)
model = Sequential()
model.add(Embedding(max_features, 128, input_length=maxlen))
model.add(Bidirectional(LSTM(64)))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))
# try using different optimizers and different optimizer configs
model.compile('adam', 'binary_crossentropy', metrics=['accuracy'])
print('Train...')
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=40,
          validation_data=[x_train, y_train])  # note: this validates on the training set itself
Out[48]:
In [11]:
max_features = 20000
# cut texts after this number of words
# (among top max_features most common words)
maxlen = 100
batch_size = 32
print('Loading data...')
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')
print('Pad sequences (samples x time)')
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)
y_train = np.array(y_train)
y_test = np.array(y_test)
model = Sequential()
model.add(Embedding(max_features, 128, input_length=maxlen))
model.add(Bidirectional(LSTM(64)))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))
# try using different optimizers and different optimizer configs
model.compile('adam', 'binary_crossentropy', metrics=['accuracy'])
print('Train...')
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=4,
          validation_data=[x_test, y_test])
Out[11]:
In [17]:
data.head()
Out[17]:
In [125]:
"""Example of using Hierarchical RNN (HRNN) to classify MNIST digits.
HRNNs can learn across multiple levels
of temporal hierarchy over a complex sequence.
Usually, the first recurrent layer of an HRNN
encodes a sentence (e.g. of word vectors)
into a sentence vector.
The second recurrent layer then encodes a sequence of
such vectors (encoded by the first layer) into a document vector.
This document vector is considered to preserve both
the word-level and sentence-level structure of the context.
# References
- [A Hierarchical Neural Autoencoder for Paragraphs and Documents]
(https://arxiv.org/abs/1506.01057)
Encodes paragraphs and documents with HRNN.
Results have shown that HRNN outperforms standard
RNNs and may play some role in more sophisticated generation tasks like
summarization or question answering.
- [Hierarchical recurrent neural network for skeleton based action recognition]
(http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=7298714)
Achieved state-of-the-art results on
skeleton based action recognition with 3 levels
of bidirectional HRNN combined with fully connected layers.
In the MNIST example below, the first LSTM layer encodes every
column of pixels of shape (28, 1) into a column vector of shape (128,).
The second LSTM layer then encodes these 28 column vectors of shape (28, 128)
into an image vector representing the whole image.
A final Dense layer is added for prediction.
After 5 epochs: train acc: 0.9858, val acc: 0.9864
"""
from __future__ import print_function
import keras
from keras.datasets import mnist
from keras.models import Model
from keras.layers import Input, Dense, TimeDistributed
from keras.layers import LSTM
# Training parameters.
batch_size = 32
num_classes = 10
epochs = 5
# Embedding dimensions.
row_hidden = 128
col_hidden = 128
# The data, split between train and test sets.
(x_train, y_train), (x_test, y_test) = mnist.load_data()
# Reshapes data to 4D for Hierarchical RNN.
x_train = x_train.reshape(x_train.shape[0], 28, 28, 1)
x_test = x_test.reshape(x_test.shape[0], 28, 28, 1)
x_train = x_train.astype('float32')
x_test = x_test.astype('float32')
x_train /= 255
x_test /= 255
print('x_train shape:', x_train.shape)
print(x_train.shape[0], 'train samples')
print(x_test.shape[0], 'test samples')
# Converts class vectors to binary class matrices.
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)
row, col, pixel = x_train.shape[1:]
# 4D input.
x = Input(shape=(row, col, pixel))
# Encodes a row of pixels using TimeDistributed Wrapper.
encoded_rows = TimeDistributed(LSTM(row_hidden))(x)
# Encodes columns of encoded rows.
encoded_columns = LSTM(col_hidden)(encoded_rows)
# Final predictions and model.
prediction = Dense(num_classes, activation='softmax')(encoded_columns)
model = Model(x, prediction)
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])
# Training.
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          verbose=1,
          validation_data=(x_test, y_test))
# Evaluation.
scores = model.evaluate(x_test, y_test, verbose=0)
print('Test loss:', scores[0])
print('Test accuracy:', scores[1])
In [128]:
x_train[0]
Out[128]: