In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import glob
from small_script.myFunctions import *
import feather
import Bio.PDB as bio

from sklearn.metrics import confusion_matrix


d3_to_index = bio.Polypeptide.d3_to_index  # we may want to adjust this in the future.
three_to_one = bio.Polypeptide.three_to_one
one_to_index = bio.Polypeptide.one_to_index
plt.rcParams['figure.figsize'] = [16.18033, 10]

%matplotlib inline
%load_ext autoreload
%autoreload 2

In [ ]:
def getFragPdb(pdbId, i, outFile=None):
    # Extract the 9-residue fragment starting at residue index i from a
    # domain PDB in the database and write it out as a standalone PDB file.
    pdb = pdbId + ".pdb"
    if outFile is None:
        outFile = f"{i}_{pdb}"
    pre = "/Users/weilu/Research/optimization/fragment/"
    database = "/Users/weilu/Research/optimization/fragment/database/dompdb/"
    parser = bio.PDBParser(QUIET=True)
    structure = parser.get_structure("x", os.path.join(database, pdb))
    for model in structure:
        for chain in model:
            all_residues = list(chain)
            io = bio.PDBIO()
            # Copy the fragment into a fresh chain "A" so it saves cleanly.
            c = bio.Chain.Chain("A")
            c.child_list = all_residues[i:i+9]
#             for ii, res in enumerate(c):
#                 res.id = (' ', ii+1, ' ')
            io.set_structure(c)
            io.save(f'{pre}{outFile}')
            return  # domain PDBs hold a single chain; write once and stop
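
For reference, a direct call with hypothetical arguments (the real invocations appear in the cells below) would look like:

In [ ]:
# Hypothetical example: write residues 10-18 of domain 1igqB00 to a file.
getFragPdb("1igqB00", 10, "example_fragment.pdb")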

In [130]:
data_original = feather.read_dataframe("/Users/weilu/Research/optimization/fragment/cluster100.feather")

Random structures sampled from the first 5 clusters (4 per cluster).


In [178]:
t = data_original.query("cluster < 5").groupby("cluster").apply(pd.DataFrame.sample, 4)

In [181]:
for i, row in t.reset_index(drop=True).iterrows():
    print(i, row["pdb"], row["i"], row["cluster"])
    getFragPdb(row["pdb"], int(row["i"]), f"cluster0to4/{i}_cluster_{row['cluster']}.pdb")


0 3un9A01 55 0
1 2vovA00 163 0
2 3psfA03 228 0
3 4nn5B01 20 0
4 1zu0A03 84 1
5 1jplA00 86 1
6 4ovjA02 77 1
7 5lozA00 112 1
8 4g0aA01 3 2
9 3lppA02 115 2
10 1umhA00 32 2
11 2jgvB00 157 2
12 2c43A01 17 3
13 2c9eA00 13 3
14 4lr2A01 86 3
15 3zheB02 116 3
16 2taaA02 55 4
17 3doaA01 118 4
18 1ywkC00 217 4
19 3ty1A00 204 4

In [191]:
# compute the pairwise RMSD between all sampled fragment PDBs
pre = "/Users/weilu/Research/optimization/fragment/"
pdbList = glob.glob(f"{pre}cluster0to4/[0-9]*.pdb")
with open(pre+"cluster0to4_rmsd.csv", "w") as out:
    out.write("i,j,rmsd\n")
    for p1 in pdbList:
        print(p1)
        i1 = p1.split("/")[-1].split(".")[0]
#         if i1 != 0:
#             continue
        print(i1)
        for p2 in pdbList:
            i2 = p2.split("/")[-1].split(".")[0]
            rmsd = float(getFromTerminal(f"calculate_rmsd.py {p1} {p2}"))
            out.write(f"{i1},{i2},{rmsd}\n")


/Users/weilu/Research/optimization/fragment/cluster0to4/5_cluster_1.pdb
5_cluster_1
/Users/weilu/Research/optimization/fragment/cluster0to4/1_cluster_0.pdb
1_cluster_0
/Users/weilu/Research/optimization/fragment/cluster0to4/13_cluster_3.pdb
13_cluster_3
/Users/weilu/Research/optimization/fragment/cluster0to4/17_cluster_4.pdb
17_cluster_4
/Users/weilu/Research/optimization/fragment/cluster0to4/10_cluster_2.pdb
10_cluster_2
/Users/weilu/Research/optimization/fragment/cluster0to4/6_cluster_1.pdb
6_cluster_1
/Users/weilu/Research/optimization/fragment/cluster0to4/14_cluster_3.pdb
14_cluster_3
/Users/weilu/Research/optimization/fragment/cluster0to4/2_cluster_0.pdb
2_cluster_0
/Users/weilu/Research/optimization/fragment/cluster0to4/8_cluster_2.pdb
8_cluster_2
/Users/weilu/Research/optimization/fragment/cluster0to4/19_cluster_4.pdb
19_cluster_4
/Users/weilu/Research/optimization/fragment/cluster0to4/18_cluster_4.pdb
18_cluster_4
/Users/weilu/Research/optimization/fragment/cluster0to4/9_cluster_2.pdb
9_cluster_2
/Users/weilu/Research/optimization/fragment/cluster0to4/16_cluster_4.pdb
16_cluster_4
/Users/weilu/Research/optimization/fragment/cluster0to4/15_cluster_3.pdb
15_cluster_3
/Users/weilu/Research/optimization/fragment/cluster0to4/3_cluster_0.pdb
3_cluster_0
/Users/weilu/Research/optimization/fragment/cluster0to4/11_cluster_2.pdb
11_cluster_2
/Users/weilu/Research/optimization/fragment/cluster0to4/7_cluster_1.pdb
7_cluster_1
/Users/weilu/Research/optimization/fragment/cluster0to4/12_cluster_3.pdb
12_cluster_3
/Users/weilu/Research/optimization/fragment/cluster0to4/0_cluster_0.pdb
0_cluster_0
/Users/weilu/Research/optimization/fragment/cluster0to4/4_cluster_1.pdb
4_cluster_1
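
The loop computes all 20 x 20 ordered pairs (including each fragment against itself), so the pivoted matrix below is symmetric with zeros on the diagonal.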

In [192]:
cluster_rmsd = pd.read_csv(pre+"cluster0to4_rmsd.csv")

In [206]:
cluster_rmsd["rmsd"] = cluster_rmsd["rmsd"].round(3)
cluster_rmsd["ii"] = cluster_rmsd["i"].apply(lambda x: int(x.split("_")[0]))
cluster_rmsd["jj"] = cluster_rmsd["j"].apply(lambda x: int(x.split("_")[0]))
cluster_rmsd = cluster_rmsd.sort_values(["ii", "jj"])
t = cluster_rmsd.pivot(index="ii", columns="jj", values="rmsd")

In [218]:
plt.rcParams['figure.figsize'] = [16.18033, 10]
plt.imshow(t, cmap="Greys")
plt.colorbar()


Out[218]:
<matplotlib.colorbar.Colorbar at 0x1ad7417f0>

The 5 cluster centers: the four fragments with the smallest rmsd in each cluster.


In [221]:
t = data_original.query("cluster < 5").groupby("cluster").head(4)

In [222]:
folder = "cluster0to4_center"
for i, row in t.reset_index(drop=True).iterrows():
#     print(i, row["pdb"], row["i"], row["cluster"])
    getFragPdb(row["pdb"], int(row["i"]), f"{folder}/{i}_cluster_{row['cluster']}.pdb")


0 1g9mG00 206 0
1 1wyuB03 158 0
2 1gqiA02 246 0
3 4c8pA00 89 0
4 5cr9A02 26 1
5 1ic2A00 11 1
6 4r33A00 51 1
7 1oaoC03 163 1
8 2h7fX02 93 2
9 2w2iA00 179 2
10 1d7pM00 117 2
11 2q83A02 121 2
12 2q9oA03 200 3
13 1vchD00 41 3
14 1uqtA01 108 3
15 1ej6B00 574 3
16 4yokA01 40 4
17 2e8yA01 70 4
18 3pjyA00 7 4
19 3sluB01 51 4

In [223]:
# compute the pairwise RMSD between the fragments closest to each cluster center

pre = "/Users/weilu/Research/optimization/fragment/"
pdbList = glob.glob(f"{pre}{folder}/[0-9]*.pdb")
with open(pre+f"{folder}_rmsd.csv", "w") as out:
    out.write("i,j,rmsd\n")
    for p1 in pdbList:
        print(p1)
        i1 = p1.split("/")[-1].split(".")[0]
#         if i1 != 0:
#             continue
        print(i1)
        for p2 in pdbList:
            i2 = p2.split("/")[-1].split(".")[0]
            rmsd = float(getFromTerminal(f"calculate_rmsd.py {p1} {p2}"))
            out.write(f"{i1},{i2},{rmsd}\n")


/Users/weilu/Research/optimization/fragment/cluster0to4_center/5_cluster_1.pdb
5_cluster_1
/Users/weilu/Research/optimization/fragment/cluster0to4_center/1_cluster_0.pdb
1_cluster_0
/Users/weilu/Research/optimization/fragment/cluster0to4_center/13_cluster_3.pdb
13_cluster_3
/Users/weilu/Research/optimization/fragment/cluster0to4_center/17_cluster_4.pdb
17_cluster_4
/Users/weilu/Research/optimization/fragment/cluster0to4_center/10_cluster_2.pdb
10_cluster_2
/Users/weilu/Research/optimization/fragment/cluster0to4_center/6_cluster_1.pdb
6_cluster_1
/Users/weilu/Research/optimization/fragment/cluster0to4_center/14_cluster_3.pdb
14_cluster_3
/Users/weilu/Research/optimization/fragment/cluster0to4_center/2_cluster_0.pdb
2_cluster_0
/Users/weilu/Research/optimization/fragment/cluster0to4_center/8_cluster_2.pdb
8_cluster_2
/Users/weilu/Research/optimization/fragment/cluster0to4_center/19_cluster_4.pdb
19_cluster_4
/Users/weilu/Research/optimization/fragment/cluster0to4_center/18_cluster_4.pdb
18_cluster_4
/Users/weilu/Research/optimization/fragment/cluster0to4_center/9_cluster_2.pdb
9_cluster_2
/Users/weilu/Research/optimization/fragment/cluster0to4_center/16_cluster_4.pdb
16_cluster_4
/Users/weilu/Research/optimization/fragment/cluster0to4_center/15_cluster_3.pdb
15_cluster_3
/Users/weilu/Research/optimization/fragment/cluster0to4_center/3_cluster_0.pdb
3_cluster_0
/Users/weilu/Research/optimization/fragment/cluster0to4_center/11_cluster_2.pdb
11_cluster_2
/Users/weilu/Research/optimization/fragment/cluster0to4_center/7_cluster_1.pdb
7_cluster_1
/Users/weilu/Research/optimization/fragment/cluster0to4_center/12_cluster_3.pdb
12_cluster_3
/Users/weilu/Research/optimization/fragment/cluster0to4_center/0_cluster_0.pdb
0_cluster_0
/Users/weilu/Research/optimization/fragment/cluster0to4_center/4_cluster_1.pdb
4_cluster_1

In [224]:
cluster_rmsd = pd.read_csv(pre+f"{folder}_rmsd.csv")
cluster_rmsd["rmsd"] = cluster_rmsd["rmsd"].round(3)
cluster_rmsd["ii"] = cluster_rmsd["i"].apply(lambda x: int(x.split("_")[0]))
cluster_rmsd["jj"] = cluster_rmsd["j"].apply(lambda x: int(x.split("_")[0]))
cluster_rmsd = cluster_rmsd.sort_values(["ii", "jj"])
t = cluster_rmsd.pivot(index="ii", columns="jj", values="rmsd")

In [225]:
plt.rcParams['figure.figsize'] = [16.18033, 10]
plt.imshow(t, cmap="Greys")
plt.colorbar()


Out[225]:
<matplotlib.colorbar.Colorbar at 0x1c6f450b8>

In [238]:
data_original.shape


Out[238]:
(1901430, 89)

In [239]:
data_original.head()


Out[239]:
pdb i seq caca_1 caca_2 caca_3 caca_4 caca_5 caca_6 caca_7 ... cbcb_14 cbcb_15 cbcb_16 cbcb_17 cbcb_18 cbcb_19 cbcb_20 cbcb_21 rmsd cluster
0 1g9mG00 206 FFYCNSTQL 9.879721 13.365485 15.387287 18.605337 20.240051 19.853123 9.830854 ... 14.465818 14.041501 9.975571 9.917680 9.397504 7.394856 9.067275 4.491067 4.966916 0
1 1wyuB03 158 QLYYDGANL 9.870072 12.872975 15.433169 18.492691 19.469760 19.349613 9.869476 ... 13.653317 13.835575 9.690599 9.644212 8.670468 7.562226 9.338580 4.768490 5.051766 0
2 1gqiA02 246 THLAYQGPL 9.443632 12.430652 14.523600 17.825624 19.053972 18.688576 9.596665 ... 12.804110 13.845181 10.013397 9.532242 8.748777 6.784276 8.743169 4.850136 5.151431 0
3 4c8pA00 89 AVIFDVSEN 9.920808 13.207690 15.512877 18.683271 20.596830 19.606644 9.877905 ... 14.648695 13.808992 9.533399 10.704074 8.485997 8.005937 8.796261 5.409743 5.206578 0
4 3stoA01 171 KNTLDLVPT 10.185408 13.765726 15.791227 19.181437 20.328955 19.526373 10.073597 ... 13.647936 13.585122 9.760163 9.854342 8.159576 6.618244 8.337652 5.255162 5.255510 0

5 rows × 89 columns


In [219]:
data_original.query("cluster < 5")["cluster"].value_counts()


Out[219]:
1    382971
2     28523
4     22208
3     21066
0     10646
Name: cluster, dtype: int64
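
Cluster 1 dominates heavily (382,971 of the 465,414 fragments in clusters 0-4), which is presumably why the training cells below draw a balanced sample of 10,000 fragments per cluster, with replacement.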

In [161]:
data_original.head()


Out[161]:
pdb i seq caca_1 caca_2 caca_3 caca_4 caca_5 caca_6 caca_7 ... cbcb_14 cbcb_15 cbcb_16 cbcb_17 cbcb_18 cbcb_19 cbcb_20 cbcb_21 rmsd cluster
0 1g9mG00 206 FFYCNSTQL 9.879721 13.365485 15.387287 18.605337 20.240051 19.853123 9.830854 ... 14.465818 14.041501 9.975571 9.917680 9.397504 7.394856 9.067275 4.491067 4.966916 0
1 1wyuB03 158 QLYYDGANL 9.870072 12.872975 15.433169 18.492691 19.469760 19.349613 9.869476 ... 13.653317 13.835575 9.690599 9.644212 8.670468 7.562226 9.338580 4.768490 5.051766 0
2 1gqiA02 246 THLAYQGPL 9.443632 12.430652 14.523600 17.825624 19.053972 18.688576 9.596665 ... 12.804110 13.845181 10.013397 9.532242 8.748777 6.784276 8.743169 4.850136 5.151431 0
3 4c8pA00 89 AVIFDVSEN 9.920808 13.207690 15.512877 18.683271 20.596830 19.606644 9.877905 ... 14.648695 13.808992 9.533399 10.704074 8.485997 8.005937 8.796261 5.409743 5.206578 0
4 3stoA01 171 KNTLDLVPT 10.185408 13.765726 15.791227 19.181437 20.328955 19.526373 10.073597 ... 13.647936 13.585122 9.760163 9.854342 8.159576 6.618244 8.337652 5.255162 5.255510 0

5 rows × 89 columns


In [3]:
data_original = feather.read_dataframe("/Users/weilu/Research/optimization/fragment/cluster100_v2.feather")

In [4]:
data = data_original[["pdb", "i", "seq","cluster", "rmsd"]].reset_index(drop=True)
data["cluster"] = data["cluster"].astype(int)
for i in range(1,10):
    data[f"s{i}"] = data["seq"].apply(lambda x: one_to_index(x[i-1]))

In [5]:
data.to_feather("/Users/weilu/Research/optimization/fragment/cluster100_v2_processed.feather")
# data.to_feather("/Users/weilu/Research/optimization/fragment/cluster100_processed.feather")

In [226]:
data = feather.read_dataframe("/Users/weilu/Research/optimization/fragment/cluster100_processed.feather")

In [6]:
data = feather.read_dataframe("/Users/weilu/Research/optimization/fragment/cluster100_v2_processed.feather")

In [9]:
data.head()


Out[9]:
pdb i seq cluster rmsd s1 s2 s3 s4 s5 s6 s7 s8 s9
0 1bg6A02 107 RAVNVPTPL 0 5.756177 14 0 17 11 17 12 16 12 9
1 3g85A02 38 HKNGIKISE 0 6.329374 6 8 11 5 7 8 7 15 3
2 4je5C00 166 EAQGVITFP 0 6.365145 3 0 13 5 17 7 16 4 12
3 3q41A01 88 GFYRIPVLG 0 6.389094 5 4 19 14 7 12 17 9 5
4 1auqA00 144 KKKKVIVIP 0 6.450880 8 8 8 8 17 7 17 7 12

In [228]:
data = data.query("cluster < 2")

In [229]:
data["cluster"].value_counts()


Out[229]:
1    382971
0     10646
Name: cluster, dtype: int64

In [241]:
maxlen = 9
test = data.groupby("cluster").apply(pd.DataFrame.sample, 10000, replace=True)
# test = data.query("category > -1 and category < 10").sample(10000)
x_train = test.iloc[:, 5:14].values
y_train_value = test["cluster"].values
test = data.groupby("cluster").apply(pd.DataFrame.sample, 10000, replace=True)
# test = data.query("category > -1 and category < 10").sample(10000)
x_test = test.iloc[:, 5:14].values
y_test_value = test["cluster"].values

In [253]:
maxlen = 9
test = data.groupby("cluster").apply(pd.DataFrame.sample, 10000, replace=True)
# test = data.query("category > -1 and category < 10").sample(10000)
x_train = test.iloc[:, 5:14].values
y_train_value = test["cluster"].values
test = data.groupby("cluster").apply(pd.DataFrame.sample, 10000, replace=True)
# test = data.query("category > -1 and category < 10").sample(10000)
x_test = test.iloc[:, 5:14].values
y_test_value = test["cluster"].values

# print('Pad sequences (samples x time)')
# x_train1 = sequence.pad_sequences(x_train, maxlen=10)
# x_test1 = sequence.pad_sequences(x_test, maxlen=maxlen)
print('x_train shape:', x_train.shape)
y_train = to_categorical(np.array(y_train_value))
y_test = to_categorical(np.array(y_test_value))


x_train shape: (20000, 9)

In [248]:
np.all(x_train1 == x_train)


/Users/weilu/anaconda3/envs/py36/lib/python3.6/site-packages/ipykernel_launcher.py:1: DeprecationWarning: elementwise comparison failed; this will raise an error in the future.
  """Entry point for launching an IPython kernel.
Out[248]:
False

In [254]:



Out[254]:
(20000,)

In [251]:
y_train.shape


Out[251]:
(20000, 2)

In [236]:
from keras.utils import to_categorical
max_features = 100
# cut texts after this number of words
# (among top max_features most common words)

batch_size = 1024*2

print(len(x_train), 'train sequences')



model = Sequential()
model.add(Embedding(max_features, 128, input_length=maxlen))
model.add(Bidirectional(LSTM(64)))
model.add(Dropout(0.5))
model.add(Dense(2, activation='softmax'))

# try using different optimizers and different optimizer configs
# model.compile('adam', 'binary_crossentropy', metrics=['accuracy'])
model.compile('adam', 'categorical_crossentropy', metrics=['accuracy'])

print('Train...')
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=40,
          validation_data=[x_test, y_test])


20000 train sequences
Train...
Train on 20000 samples, validate on 20000 samples
Epoch 1/40
20000/20000 [==============================] - 9s 456us/step - loss: 0.6777 - acc: 0.6865 - val_loss: 0.6499 - val_acc: 0.7552
Epoch 2/40
20000/20000 [==============================] - 3s 163us/step - loss: 0.6063 - acc: 0.7463 - val_loss: 0.5310 - val_acc: 0.7519
Epoch 3/40
20000/20000 [==============================] - 3s 163us/step - loss: 0.5059 - acc: 0.7521 - val_loss: 0.4727 - val_acc: 0.7712
Epoch 4/40
20000/20000 [==============================] - 3s 173us/step - loss: 0.4595 - acc: 0.7837 - val_loss: 0.4319 - val_acc: 0.8012
Epoch 5/40
20000/20000 [==============================] - 3s 170us/step - loss: 0.4318 - acc: 0.8013 - val_loss: 0.4096 - val_acc: 0.8093
Epoch 6/40
20000/20000 [==============================] - 3s 169us/step - loss: 0.4102 - acc: 0.8122 - val_loss: 0.3900 - val_acc: 0.8214
Epoch 7/40
20000/20000 [==============================] - 4s 186us/step - loss: 0.3999 - acc: 0.8195 - val_loss: 0.3890 - val_acc: 0.8235
Epoch 8/40
20000/20000 [==============================] - 4s 177us/step - loss: 0.3970 - acc: 0.8205 - val_loss: 0.3868 - val_acc: 0.8243
Epoch 9/40
20000/20000 [==============================] - 4s 184us/step - loss: 0.3963 - acc: 0.8219 - val_loss: 0.3861 - val_acc: 0.8252
Epoch 10/40
20000/20000 [==============================] - 4s 177us/step - loss: 0.3940 - acc: 0.8230 - val_loss: 0.3841 - val_acc: 0.8250
Epoch 11/40
20000/20000 [==============================] - 4s 186us/step - loss: 0.3929 - acc: 0.8228 - val_loss: 0.3828 - val_acc: 0.8247
Epoch 12/40
20000/20000 [==============================] - 3s 172us/step - loss: 0.3901 - acc: 0.8237 - val_loss: 0.3804 - val_acc: 0.8262
Epoch 13/40
20000/20000 [==============================] - 3s 172us/step - loss: 0.3886 - acc: 0.8258 - val_loss: 0.3783 - val_acc: 0.8274
Epoch 14/40
20000/20000 [==============================] - 3s 163us/step - loss: 0.3845 - acc: 0.8261 - val_loss: 0.3748 - val_acc: 0.8300
Epoch 15/40
20000/20000 [==============================] - 3s 172us/step - loss: 0.3808 - acc: 0.8301 - val_loss: 0.3716 - val_acc: 0.8323
Epoch 16/40
20000/20000 [==============================] - 3s 175us/step - loss: 0.3779 - acc: 0.8298 - val_loss: 0.3693 - val_acc: 0.8326
Epoch 17/40
20000/20000 [==============================] - 3s 172us/step - loss: 0.3754 - acc: 0.8313 - val_loss: 0.3675 - val_acc: 0.8358
Epoch 18/40
20000/20000 [==============================] - 3s 164us/step - loss: 0.3738 - acc: 0.8320 - val_loss: 0.3680 - val_acc: 0.8335
Epoch 19/40
20000/20000 [==============================] - 3s 172us/step - loss: 0.3717 - acc: 0.8334 - val_loss: 0.3648 - val_acc: 0.8352
Epoch 20/40
20000/20000 [==============================] - 3s 166us/step - loss: 0.3705 - acc: 0.8349 - val_loss: 0.3635 - val_acc: 0.8369
Epoch 21/40
20000/20000 [==============================] - 4s 184us/step - loss: 0.3701 - acc: 0.8358 - val_loss: 0.3626 - val_acc: 0.8370
Epoch 22/40
20000/20000 [==============================] - 3s 175us/step - loss: 0.3695 - acc: 0.8352 - val_loss: 0.3622 - val_acc: 0.8371
Epoch 23/40
20000/20000 [==============================] - 4s 184us/step - loss: 0.3692 - acc: 0.8341 - val_loss: 0.3615 - val_acc: 0.8370
Epoch 24/40
20000/20000 [==============================] - 3s 170us/step - loss: 0.3675 - acc: 0.8350 - val_loss: 0.3611 - val_acc: 0.8396
Epoch 25/40
20000/20000 [==============================] - 3s 172us/step - loss: 0.3669 - acc: 0.8355 - val_loss: 0.3598 - val_acc: 0.8388
Epoch 26/40
20000/20000 [==============================] - 3s 173us/step - loss: 0.3667 - acc: 0.8361 - val_loss: 0.3597 - val_acc: 0.8389
Epoch 27/40
20000/20000 [==============================] - 3s 171us/step - loss: 0.3657 - acc: 0.8364 - val_loss: 0.3590 - val_acc: 0.8392
Epoch 28/40
20000/20000 [==============================] - 3s 165us/step - loss: 0.3641 - acc: 0.8366 - val_loss: 0.3587 - val_acc: 0.8397
Epoch 29/40
20000/20000 [==============================] - 3s 170us/step - loss: 0.3642 - acc: 0.8366 - val_loss: 0.3595 - val_acc: 0.8398
Epoch 30/40
20000/20000 [==============================] - 3s 166us/step - loss: 0.3653 - acc: 0.8361 - val_loss: 0.3598 - val_acc: 0.8392
Epoch 31/40
20000/20000 [==============================] - 3s 167us/step - loss: 0.3637 - acc: 0.8393 - val_loss: 0.3585 - val_acc: 0.8400
Epoch 32/40
20000/20000 [==============================] - 3s 164us/step - loss: 0.3633 - acc: 0.8380 - val_loss: 0.3571 - val_acc: 0.8400
Epoch 33/40
20000/20000 [==============================] - 3s 162us/step - loss: 0.3630 - acc: 0.8376 - val_loss: 0.3565 - val_acc: 0.8407
Epoch 34/40
20000/20000 [==============================] - 3s 163us/step - loss: 0.3626 - acc: 0.8380 - val_loss: 0.3564 - val_acc: 0.8414
Epoch 35/40
20000/20000 [==============================] - 3s 165us/step - loss: 0.3630 - acc: 0.8387 - val_loss: 0.3565 - val_acc: 0.8399
Epoch 36/40
20000/20000 [==============================] - 3s 162us/step - loss: 0.3605 - acc: 0.8390 - val_loss: 0.3555 - val_acc: 0.8418
Epoch 37/40
20000/20000 [==============================] - 3s 161us/step - loss: 0.3603 - acc: 0.8401 - val_loss: 0.3551 - val_acc: 0.8418
Epoch 38/40
20000/20000 [==============================] - 3s 163us/step - loss: 0.3587 - acc: 0.8401 - val_loss: 0.3549 - val_acc: 0.8416
Epoch 39/40
20000/20000 [==============================] - 4s 189us/step - loss: 0.3587 - acc: 0.8402 - val_loss: 0.3547 - val_acc: 0.8425
Epoch 40/40
20000/20000 [==============================] - 4s 185us/step - loss: 0.3575 - acc: 0.8399 - val_loss: 0.3537 - val_acc: 0.8430
Out[236]:
<keras.callbacks.History at 0x2239de438>
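
The 2-cluster model plateaus around 84% validation accuracy. A confusion matrix (computed the same way as for the 10-category model further down) would show how the remaining errors split between the two clusters; a minimal sketch, assuming the model and test arrays above are still in scope:

In [ ]:
# Sketch: per-cluster error breakdown for the 2-cluster model.
y_pred = model.predict(x_test)
predicted = np.argmax(y_pred, axis=1)
confusion_matrix(y_test_value, predicted)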

In [237]:
model.summary()


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
embedding_32 (Embedding)     (None, 9, 128)            12800     
_________________________________________________________________
bidirectional_31 (Bidirectio (None, 128)               98816     
_________________________________________________________________
dropout_31 (Dropout)         (None, 128)               0         
_________________________________________________________________
dense_33 (Dense)             (None, 2)                 258       
=================================================================
Total params: 111,874
Trainable params: 111,874
Non-trainable params: 0
_________________________________________________________________

In [234]:
model.summary()


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
embedding_30 (Embedding)     (None, 9, 128)            2560      
_________________________________________________________________
bidirectional_29 (Bidirectio (None, 128)               98816     
_________________________________________________________________
dropout_29 (Dropout)         (None, 128)               0         
_________________________________________________________________
dense_31 (Dense)             (None, 2)                 258       
=================================================================
Total params: 101,634
Trainable params: 101,634
Non-trainable params: 0
_________________________________________________________________

In [160]:
from keras.utils import to_categorical
max_features = 20
# cut texts after this number of words
# (among top max_features most common words)

batch_size = 1024*2

print(len(x_train), 'train sequences')



model = Sequential()
model.add(Embedding(max_features, 128, input_length=maxlen))
model.add(Bidirectional(LSTM(64)))
model.add(Dropout(0.5))
model.add(Dense(100, activation='softmax'))

# try using different optimizers and different optimizer configs
# model.compile('adam', 'binary_crossentropy', metrics=['accuracy'])
model.compile('adam', 'categorical_crossentropy', metrics=['accuracy'])

print('Train...')
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=40,
          validation_data=[x_test, y_test])


1000000 train sequences
Train...
Train on 1000000 samples, validate on 1000000 samples
Epoch 1/40
1000000/1000000 [==============================] - 203s 203us/step - loss: 4.3113 - acc: 0.0507 - val_loss: 4.1467 - val_acc: 0.0794
Epoch 2/40
1000000/1000000 [==============================] - 194s 194us/step - loss: 4.1239 - acc: 0.0827 - val_loss: 4.0079 - val_acc: 0.1031
Epoch 3/40
1000000/1000000 [==============================] - 196s 196us/step - loss: 4.0236 - acc: 0.1016 - val_loss: 3.9225 - val_acc: 0.1182
Epoch 4/40
1000000/1000000 [==============================] - 192s 192us/step - loss: 3.9654 - acc: 0.1115 - val_loss: 3.8749 - val_acc: 0.1254
Epoch 5/40
1000000/1000000 [==============================] - 184s 184us/step - loss: 3.9337 - acc: 0.1167 - val_loss: 3.8524 - val_acc: 0.1293
Epoch 6/40
1000000/1000000 [==============================] - 198s 198us/step - loss: 3.9151 - acc: 0.1201 - val_loss: 3.8367 - val_acc: 0.1316
Epoch 7/40
1000000/1000000 [==============================] - 189s 189us/step - loss: 3.9025 - acc: 0.1219 - val_loss: 3.8208 - val_acc: 0.1340
Epoch 8/40
1000000/1000000 [==============================] - 183s 183us/step - loss: 3.8927 - acc: 0.1234 - val_loss: 3.8179 - val_acc: 0.1354
Epoch 9/40
1000000/1000000 [==============================] - 190s 190us/step - loss: 3.8843 - acc: 0.1252 - val_loss: 3.8079 - val_acc: 0.1365
Epoch 10/40
1000000/1000000 [==============================] - 187s 187us/step - loss: 3.8768 - acc: 0.1265 - val_loss: 3.7970 - val_acc: 0.1379
Epoch 11/40
1000000/1000000 [==============================] - 180s 180us/step - loss: 3.8713 - acc: 0.1280 - val_loss: 3.7946 - val_acc: 0.1388
Epoch 12/40
1000000/1000000 [==============================] - 180s 180us/step - loss: 3.8652 - acc: 0.1288 - val_loss: 3.7874 - val_acc: 0.1396
Epoch 13/40
1000000/1000000 [==============================] - 180s 180us/step - loss: 3.8606 - acc: 0.1292 - val_loss: 3.7816 - val_acc: 0.1403
Epoch 14/40
1000000/1000000 [==============================] - 181s 181us/step - loss: 3.8561 - acc: 0.1306 - val_loss: 3.7760 - val_acc: 0.1412
Epoch 15/40
1000000/1000000 [==============================] - 182s 182us/step - loss: 3.8520 - acc: 0.1310 - val_loss: 3.7732 - val_acc: 0.1425
Epoch 16/40
1000000/1000000 [==============================] - 186s 186us/step - loss: 3.8487 - acc: 0.1316 - val_loss: 3.7684 - val_acc: 0.1426
Epoch 17/40
1000000/1000000 [==============================] - 186s 186us/step - loss: 3.8452 - acc: 0.1322 - val_loss: 3.7662 - val_acc: 0.1437
Epoch 18/40
1000000/1000000 [==============================] - 182s 182us/step - loss: 3.8410 - acc: 0.1332 - val_loss: 3.7633 - val_acc: 0.1435
Epoch 19/40
1000000/1000000 [==============================] - 169s 169us/step - loss: 3.8383 - acc: 0.1338 - val_loss: 3.7615 - val_acc: 0.1447
Epoch 20/40
1000000/1000000 [==============================] - 177s 177us/step - loss: 3.8347 - acc: 0.1344 - val_loss: 3.7594 - val_acc: 0.1449
Epoch 21/40
1000000/1000000 [==============================] - 176s 176us/step - loss: 3.8317 - acc: 0.1349 - val_loss: 3.7560 - val_acc: 0.1463
Epoch 22/40
1000000/1000000 [==============================] - 178s 178us/step - loss: 3.8296 - acc: 0.1356 - val_loss: 3.7551 - val_acc: 0.1462
Epoch 23/40
1000000/1000000 [==============================] - 178s 178us/step - loss: 3.8268 - acc: 0.1362 - val_loss: 3.7508 - val_acc: 0.1470
Epoch 24/40
1000000/1000000 [==============================] - 177s 177us/step - loss: 3.8244 - acc: 0.1365 - val_loss: 3.7499 - val_acc: 0.1466
Epoch 25/40
1000000/1000000 [==============================] - 175s 175us/step - loss: 3.8225 - acc: 0.1368 - val_loss: 3.7445 - val_acc: 0.1479
Epoch 26/40
1000000/1000000 [==============================] - 175s 175us/step - loss: 3.8205 - acc: 0.1371 - val_loss: 3.7461 - val_acc: 0.1483
Epoch 27/40
1000000/1000000 [==============================] - 175s 175us/step - loss: 3.8173 - acc: 0.1378 - val_loss: 3.7411 - val_acc: 0.1489
Epoch 28/40
1000000/1000000 [==============================] - 176s 176us/step - loss: 3.8163 - acc: 0.1381 - val_loss: 3.7374 - val_acc: 0.1494
Epoch 29/40
1000000/1000000 [==============================] - 177s 177us/step - loss: 3.8138 - acc: 0.1387 - val_loss: 3.7367 - val_acc: 0.1499
Epoch 30/40
1000000/1000000 [==============================] - 178s 178us/step - loss: 3.8121 - acc: 0.1394 - val_loss: 3.7363 - val_acc: 0.1498
Epoch 31/40
1000000/1000000 [==============================] - 178s 178us/step - loss: 3.8100 - acc: 0.1392 - val_loss: 3.7351 - val_acc: 0.1506
Epoch 32/40
1000000/1000000 [==============================] - 180s 180us/step - loss: 3.8078 - acc: 0.1399 - val_loss: 3.7331 - val_acc: 0.1505
Epoch 33/40
1000000/1000000 [==============================] - 179s 179us/step - loss: 3.8063 - acc: 0.1404 - val_loss: 3.7320 - val_acc: 0.1508
Epoch 34/40
1000000/1000000 [==============================] - 180s 180us/step - loss: 3.8045 - acc: 0.1406 - val_loss: 3.7296 - val_acc: 0.1516
Epoch 35/40
1000000/1000000 [==============================] - 185s 185us/step - loss: 3.8027 - acc: 0.1412 - val_loss: 3.7285 - val_acc: 0.1523
Epoch 36/40
1000000/1000000 [==============================] - 180s 180us/step - loss: 3.8002 - acc: 0.1417 - val_loss: 3.7272 - val_acc: 0.1522
Epoch 37/40
1000000/1000000 [==============================] - 186s 186us/step - loss: 3.7985 - acc: 0.1419 - val_loss: 3.7242 - val_acc: 0.1528
Epoch 38/40
1000000/1000000 [==============================] - 185s 185us/step - loss: 3.7971 - acc: 0.1422 - val_loss: 3.7233 - val_acc: 0.1532
Epoch 39/40
1000000/1000000 [==============================] - 185s 185us/step - loss: 3.7955 - acc: 0.1424 - val_loss: 3.7216 - val_acc: 0.1527
Epoch 40/40
1000000/1000000 [==============================] - 186s 186us/step - loss: 3.7947 - acc: 0.1427 - val_loss: 3.7252 - val_acc: 0.1536
Out[160]:
<keras.callbacks.History at 0x1c34440b8>

In [8]:
data = feather.read_dataframe("/Users/weilu/Research/optimization/fragment/feather_cluster_data.feather")

In [15]:
data.head()


Out[15]:
pdb i seq dd category count s1 s2 s3 s4 s5 s6 s7 s8 s9
0 1igqB00 0 DKLKKAIVQ 9,13,11,15,15,19,9,9,13,15,17,5,9,11,15,9,13,1... -1 -1 2 8 9 8 8 0 7 17 13
1 1igqB00 1 KLKKAIVQV 9,9,13,15,17,21,5,9,11,15,17,9,13,15,19,9,13,1... -1 -1 8 9 8 8 0 7 17 13 17
2 1igqB00 2 LKKAIVQVE 5,9,11,15,17,21,9,13,15,19,23,9,13,17,19,9,13,... -1 -1 9 8 8 0 7 17 13 17 3
3 1igqB00 3 KKAIVQVEH 9,13,15,19,23,25,9,13,17,19,21,9,13,17,19,9,13... 9987 13 8 8 0 7 17 13 17 3 6
4 1igqB00 4 KAIVQVEHD 9,13,17,19,21,25,9,13,17,19,23,9,13,15,19,11,1... 6835 18 8 0 7 17 13 17 3 6 2

In [56]:
data["dd"].value_counts()


Out[56]:
5,7,9,9,11,13,5,7,9,9,11,5,7,9,9,5,7,9,5,7,5                    39130
5,7,9,11,11,13,5,7,9,9,11,5,7,9,9,5,7,9,5,7,5                   20575
5,7,9,9,11,13,5,7,9,9,11,5,7,9,11,5,7,9,5,7,5                   18996
5,7,9,9,11,13,5,7,9,11,11,5,7,9,9,5,7,9,5,7,5                   18382
5,7,9,11,11,13,5,7,9,9,11,5,7,9,11,5,7,9,5,7,5                  14304
5,7,9,9,11,13,5,7,9,11,11,5,7,9,11,5,7,9,5,7,5                  12961
5,7,9,11,11,13,5,7,9,11,11,5,7,9,9,5,7,9,5,7,5                  12641
5,7,9,11,11,13,5,7,9,11,11,5,7,9,11,5,7,9,5,7,5                 12033
5,7,9,9,11,13,5,7,9,9,11,5,5,9,9,5,7,9,5,7,5                     7326
5,7,9,9,11,13,5,7,9,11,11,5,5,9,9,5,7,9,5,7,5                    6656
5,7,9,11,11,13,5,5,9,9,11,5,7,9,9,5,7,9,5,7,5                    5374
5,7,9,9,11,13,5,5,9,9,11,5,7,9,9,5,7,9,5,7,5                     5195
5,7,9,9,11,13,5,7,9,9,11,5,7,9,9,5,5,9,5,7,5                     4941
5,7,9,9,11,13,5,7,9,9,11,5,7,9,9,5,7,9,5,5,5                     4867
5,7,9,9,11,13,5,7,9,9,11,5,7,9,11,5,5,9,5,7,5                    4800
5,7,9,11,11,13,5,7,9,9,11,5,7,9,9,5,7,9,5,5,5                    4735
5,7,9,11,11,13,5,5,9,9,11,5,7,9,11,5,7,9,5,7,5                   4654
5,5,9,9,11,13,5,7,9,9,11,5,7,9,11,5,7,9,5,7,5                    4367
5,5,9,9,11,13,5,7,9,9,11,5,7,9,9,5,7,9,5,7,5                     4213
5,5,9,9,11,13,5,7,9,11,11,5,7,9,11,5,7,9,5,7,5                   3600
5,7,9,11,11,13,5,7,9,9,11,5,7,9,11,5,5,9,5,7,5                   3426
5,7,9,9,11,13,5,5,9,9,11,5,7,9,11,5,7,9,5,7,5                    3236
5,7,9,9,11,13,5,5,9,9,11,5,5,9,9,5,7,9,5,7,5                     3225
5,7,9,11,11,13,5,7,9,11,11,5,7,9,9,5,7,9,5,5,5                   3170
5,7,9,9,11,13,5,7,9,9,11,5,5,9,9,5,5,9,5,7,5                     3071
5,7,9,11,11,13,5,7,9,9,11,5,7,9,9,5,5,9,5,7,5                    2981
5,7,9,11,11,13,5,7,9,11,11,5,5,9,9,5,7,9,5,7,5                   2898
5,7,9,9,11,13,5,7,9,11,11,5,7,9,9,5,7,9,5,5,5                    2779
5,5,9,9,11,13,5,7,9,11,11,5,7,9,9,5,7,9,5,7,5                    2671
9,7,7,11,11,11,5,7,9,11,11,5,7,9,9,5,7,9,5,7,5                   2452
                                                                ...  
5,9,11,11,15,17,9,9,11,15,17,5,9,11,15,7,9,13,7,11,11               1
11,11,9,7,9,13,9,7,5,5,9,5,5,5,9,7,5,7,7,9,9                        1
7,9,9,11,9,5,9,11,11,9,7,9,11,9,7,7,5,7,5,7,5                       1
5,7,11,13,11,13,5,7,11,9,11,7,11,11,13,9,9,13,5,9,9                 1
5,7,9,11,11,9,9,11,13,13,11,9,11,11,9,9,7,7,5,7,5                   1
9,13,11,11,13,9,9,7,7,9,7,5,7,9,7,7,11,9,9,9,5                      1
9,11,15,13,17,15,9,11,9,13,11,9,7,11,9,5,9,9,9,7,5                  1
11,13,15,17,21,21,9,13,15,19,19,9,13,17,19,11,13,17,11,13,11        1
9,11,11,13,15,15,9,9,13,13,11,5,9,9,9,9,11,7,9,7,5                  1
7,9,13,15,19,19,11,13,17,19,21,9,13,17,17,9,13,15,11,11,9           1
7,9,11,15,17,17,9,11,13,17,17,9,11,13,13,9,11,13,9,11,7             1
5,9,11,13,13,17,9,11,11,13,15,9,11,11,15,7,9,11,5,9,7               1
9,13,15,15,17,15,9,13,13,15,15,9,11,13,11,9,11,9,9,7,5              1
5,7,9,11,13,11,5,9,11,11,9,7,9,11,9,9,11,11,9,9,9                   1
9,11,15,17,17,17,11,13,15,15,17,9,11,11,13,9,9,13,5,9,9             1
7,9,13,13,13,17,9,13,13,13,17,9,9,9,13,5,7,11,5,7,7                 1
7,9,9,13,17,19,7,11,15,17,19,11,13,15,17,11,13,15,9,11,11           1
5,9,9,9,11,13,9,9,11,13,15,5,7,11,11,5,7,11,5,7,5                   1
9,11,13,15,19,23,7,9,11,15,19,11,13,17,19,9,13,15,9,13,9            1
5,7,9,9,13,15,5,7,9,11,13,5,5,7,9,5,9,9,9,11,9                      1
5,7,9,11,15,17,5,7,11,15,15,5,9,11,13,7,11,13,11,11,9               1
9,11,15,19,21,23,9,11,15,17,19,9,13,17,17,9,13,15,11,11,9           1
9,11,11,15,15,17,7,9,11,13,15,9,13,15,17,9,13,17,11,13,11           1
11,13,17,21,21,23,9,13,17,17,21,9,13,13,17,11,11,15,9,13,11         1
7,11,13,13,15,11,9,11,13,13,11,9,11,11,7,7,9,5,7,5,5                1
7,9,11,11,9,11,9,11,9,9,13,9,5,7,11,5,7,11,5,7,7                    1
5,7,7,11,11,9,7,9,11,13,11,9,11,11,9,7,7,7,7,9,7                    1
9,11,11,13,13,11,9,11,13,15,13,7,9,11,11,9,13,11,9,9,9              1
9,9,11,13,17,17,7,9,13,15,17,9,11,13,15,7,11,13,11,13,9             1
5,7,11,13,13,11,5,9,13,13,11,7,11,11,11,9,7,7,9,7,5                 1
Name: dd, Length: 853589, dtype: int64
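
The dd column is the discretized distance signature of each fragment, and the category label groups identical signatures: the counts above line up with the count column (e.g. the pattern with 5,195 occurrences is category 11, matching the query below).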

In [10]:
import numpy as np

from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional
from keras.datasets import imdb


Using TensorFlow backend.

In [19]:
x_train.shape


Out[19]:
(25000, 100)

In [40]:
data.query("category == 11").shape


Out[40]:
(5195, 15)

Categories 10 and 11 are very similar (their dd patterns differ at a single position); they should probably be in the same group.
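
A minimal sketch of the merge (not applied anywhere in this notebook):

In [ ]:
# Sketch: relabel category 11 as 10 to merge the two near-identical groups.
data["category"] = data["category"].replace(11, 10)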


In [59]:
data.query("category == 10 or category == 11").sample(10)


Out[59]:
pdb i seq dd category count s1 s2 s3 s4 s5 s6 s7 s8 s9
299936 3tulA00 66 AKSVYDAAT 5,7,9,11,11,13,5,5,9,9,11,5,7,9,9,5,7,9,5,7,5 10 5374 0 8 15 17 19 2 0 0 16
1876837 1k1fA00 27 DIEQELERA 5,7,9,9,11,13,5,5,9,9,11,5,7,9,9,5,7,9,5,7,5 11 5195 2 7 3 13 3 9 3 14 0
418011 1ztpA01 107 AWAGIARAV 5,7,9,9,11,13,5,5,9,9,11,5,7,9,9,5,7,9,5,7,5 11 5195 0 18 0 5 7 0 14 0 17
1211494 3zkvA00 758 QFLTHFVMQ 5,7,9,9,11,13,5,5,9,9,11,5,7,9,9,5,7,9,5,7,5 11 5195 13 4 9 16 6 4 17 10 13
313392 1pp1X01 110 HCCSLLIGV 5,7,9,11,11,13,5,5,9,9,11,5,7,9,9,5,7,9,5,7,5 10 5374 6 1 1 15 9 9 7 5 17
429159 2j7qA00 169 DFTEAISAL 5,7,9,11,11,13,5,5,9,9,11,5,7,9,9,5,7,9,5,7,5 10 5374 2 4 16 3 0 7 15 0 9
1323194 1s7oB00 96 KISILTSID 5,7,9,9,11,13,5,5,9,9,11,5,7,9,9,5,7,9,5,7,5 11 5195 8 7 15 7 9 16 15 7 2
192023 2pftA00 27 YLGSMAKIQ 5,7,9,9,11,13,5,5,9,9,11,5,7,9,9,5,7,9,5,7,5 11 5195 19 9 5 15 10 0 8 7 13
1204807 4f3qA01 7 AKARQDAKR 5,7,9,9,11,13,5,5,9,9,11,5,7,9,9,5,7,9,5,7,5 11 5195 0 8 0 14 13 2 0 8 14
1324321 3aqpA02 21 LEKARTVLE 5,7,9,11,11,13,5,5,9,9,11,5,7,9,9,5,7,9,5,7,5 10 5374 9 3 8 0 14 16 17 9 3

In [58]:
data.query("category == 29 or category == 11").sample(10)


Out[58]:
pdb i seq dd category count s1 s2 s3 s4 s5 s6 s7 s8 s9
1211147 3zkvA00 395 LEACIYSFQ 5,7,9,9,11,13,5,5,9,9,11,5,7,9,9,5,7,9,5,7,5 11 5195 9 3 0 1 7 19 15 4 13
1711744 2itbA00 36 FKAASTALS 5,7,9,9,11,13,5,5,9,9,11,5,7,9,9,5,7,9,5,7,5 11 5195 4 8 0 0 15 16 0 9 15
413878 3rjvA01 15 RAQYYLADT 5,7,9,9,11,13,5,5,9,9,11,5,7,9,9,5,7,9,5,7,5 11 5195 14 0 13 19 19 9 0 2 16
1836419 5tkyA03 30 ALRRLRTAC 5,7,9,9,11,13,5,5,9,9,11,5,7,9,9,5,7,9,5,7,5 11 5195 0 9 14 14 9 14 16 0 1
1216811 1w9cA00 141 FFLLLQAVN 5,7,9,9,11,13,5,5,9,9,11,5,7,9,9,5,7,9,5,7,5 11 5195 4 4 9 9 9 13 0 17 11
249112 1ej6B00 233 INPTEIEWA 9,7,7,11,11,11,5,7,9,11,11,5,7,9,9,5,7,9,5,7,5 29 2452 7 11 12 16 3 7 3 18 0
1664651 2qtfA01 42 IQYDKLQQI 9,7,7,11,11,11,5,7,9,11,11,5,7,9,9,5,7,9,5,7,5 29 2452 7 13 19 2 8 9 13 13 7
950129 1xflA00 46 APFFADLAK 5,7,9,9,11,13,5,5,9,9,11,5,7,9,9,5,7,9,5,7,5 11 5195 0 12 4 4 0 2 9 0 8
310748 2yyuB00 138 VETVAHYAA 5,7,9,9,11,13,5,5,9,9,11,5,7,9,9,5,7,9,5,7,5 11 5195 17 3 16 17 0 6 19 0 0
822123 3qvsA01 236 PLILDIARF 5,7,9,9,11,13,5,5,9,9,11,5,7,9,9,5,7,9,5,7,5 11 5195 12 9 7 9 2 7 0 14 4

In [55]:
data.query("category == 4 or category == 11").sample(10)


Out[55]:
pdb i seq dd category count s1 s2 s3 s4 s5 s6 s7 s8 s9
1691660 3ar4A04 179 TGPVKEKIL 5,7,9,11,11,13,5,7,9,9,11,5,7,9,11,5,7,9,5,7,5 4 14304 16 5 12 17 8 3 8 7 9
606678 4fguB02 2 DVPLTIMKR 5,7,9,11,11,13,5,7,9,9,11,5,7,9,11,5,7,9,5,7,5 4 14304 2 17 12 9 16 7 10 8 14
1297613 4muoA02 229 GLARVNQAF 5,7,9,11,11,13,5,7,9,9,11,5,7,9,11,5,7,9,5,7,5 4 14304 5 9 0 14 17 11 13 0 4
395061 5aj3P00 95 TNAERLRRK 5,7,9,9,11,13,5,5,9,9,11,5,7,9,9,5,7,9,5,7,5 11 5195 16 11 0 3 14 9 14 14 8
1029953 2oryA00 207 NADFADYFD 5,7,9,9,11,13,5,5,9,9,11,5,7,9,9,5,7,9,5,7,5 11 5195 11 0 2 4 0 2 19 4 2
1765556 1zhcA00 45 VSHMKKQKL 5,7,9,11,11,13,5,7,9,9,11,5,7,9,11,5,7,9,5,7,5 4 14304 17 15 6 10 8 8 13 8 9
755750 4i9cA02 181 DLEDFAIDV 5,7,9,9,11,13,5,5,9,9,11,5,7,9,9,5,7,9,5,7,5 11 5195 2 9 3 2 4 0 7 2 17
1486275 1h9eA00 14 LKSELVANN 5,7,9,11,11,13,5,7,9,9,11,5,7,9,11,5,7,9,5,7,5 4 14304 9 8 15 3 9 17 0 11 11
1200946 3s1sA02 125 DIELGKVLS 5,7,9,11,11,13,5,7,9,9,11,5,7,9,11,5,7,9,5,7,5 4 14304 2 7 3 9 5 8 17 9 15
1072383 3it4B01 56 AQADLDEAV 5,7,9,11,11,13,5,7,9,9,11,5,7,9,11,5,7,9,5,7,5 4 14304 0 13 0 2 9 2 3 0 17

In [28]:
test.head()


Out[28]:
pdb i seq dd category count s1 s2 s3 s4 s5 s6 s7 s8 s9
929058 3i4rB00 40 DRAVTQISV 5,7,9,9,11,13,5,5,9,9,11,5,7,9,9,5,7,9,5,7,5 11 5195 2 14 0 17 16 13 7 15 17
1276353 4tl2A02 72 DWHDLAAFW 5,7,9,11,11,13,5,5,9,9,11,5,7,9,9,5,7,9,5,7,5 10 5374 2 18 6 2 9 0 0 4 18
869288 1xdpA01 31 GIYSNNLDE 5,7,9,9,11,13,5,5,9,9,11,5,7,9,9,5,7,9,5,7,5 11 5195 5 7 19 15 11 11 9 2 3
1358821 2g7lA00 132 LQDATATAA 5,7,9,11,11,13,5,5,9,9,11,5,7,9,9,5,7,9,5,7,5 10 5374 9 13 2 0 16 0 16 0 0
857911 2jbrA03 83 ARALLEKTW 5,7,9,9,11,13,5,5,9,9,11,5,7,9,9,5,7,9,5,7,5 11 5195 0 14 0 9 9 3 8 16 18

In [102]:
data.query("category == 9").shape


Out[102]:
(6656, 15)

In [ ]:
data.sample(10, replace=True)  # scratch: sampling with replacement

In [117]:
test = data.query("category > -1 and category < 10").groupby("category").apply(pd.DataFrame.sample, 10000, replace=True)
# test = data.query("category > -1 and category < 10").sample(10000)
x_train = test.iloc[:, 6:15].values
y_train_value = test["category"].values
test = data.query("category > -1 and category < 10").groupby("category").apply(pd.DataFrame.sample, 10000, replace=True)
# test = data.query("category > -1 and category < 10").sample(10000)
x_test = test.iloc[:, 6:15].values
y_test_value = test["category"].values

# print('Pad sequences (samples x time)')
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
print('x_train shape:', x_train.shape)
y_train = to_categorical(np.array(y_train_value))
y_test = to_categorical(np.array(y_test_value))


Pad sequences (samples x time)
x_train shape: (100000, 9)

In [118]:
y_train.sum(axis=0)


Out[118]:
array([10000., 10000., 10000., 10000., 10000., 10000., 10000., 10000.,
       10000., 10000.], dtype=float32)

In [119]:
from keras.utils import to_categorical
max_features = 20000
# cut texts after this number of words
# (among top max_features most common words)
maxlen = 9
batch_size = 32

print(len(x_train), 'train sequences')



model = Sequential()
model.add(Embedding(max_features, 128, input_length=maxlen))
model.add(Bidirectional(LSTM(64)))
model.add(Dropout(0.5))
model.add(Dense(10, activation='softmax'))

# try using different optimizers and different optimizer configs
# model.compile('adam', 'binary_crossentropy', metrics=['accuracy'])
model.compile('adam', 'categorical_crossentropy', metrics=['accuracy'])

print('Train...')
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=40,
          validation_data=[x_test, y_test])


100000 train sequences
Train...
Train on 100000 samples, validate on 100000 samples
Epoch 1/10
100000/100000 [==============================] - 116s 1ms/step - loss: 2.2778 - acc: 0.1338 - val_loss: 2.2583 - val_acc: 0.1490
Epoch 2/10
100000/100000 [==============================] - 107s 1ms/step - loss: 2.2539 - acc: 0.1527 - val_loss: 2.2470 - val_acc: 0.1550
Epoch 3/10
100000/100000 [==============================] - 107s 1ms/step - loss: 2.2460 - acc: 0.1573 - val_loss: 2.2391 - val_acc: 0.1637
Epoch 4/10
100000/100000 [==============================] - 108s 1ms/step - loss: 2.2376 - acc: 0.1638 - val_loss: 2.2305 - val_acc: 0.1668
Epoch 5/10
100000/100000 [==============================] - 117s 1ms/step - loss: 2.2283 - acc: 0.1683 - val_loss: 2.2232 - val_acc: 0.1720
Epoch 6/10
100000/100000 [==============================] - 108s 1ms/step - loss: 2.2173 - acc: 0.1771 - val_loss: 2.2177 - val_acc: 0.1788
Epoch 7/10
100000/100000 [==============================] - 108s 1ms/step - loss: 2.2060 - acc: 0.1819 - val_loss: 2.2097 - val_acc: 0.1814
Epoch 8/10
100000/100000 [==============================] - 108s 1ms/step - loss: 2.1924 - acc: 0.1889 - val_loss: 2.2030 - val_acc: 0.1861
Epoch 9/10
100000/100000 [==============================] - 108s 1ms/step - loss: 2.1748 - acc: 0.1985 - val_loss: 2.1945 - val_acc: 0.1907
Epoch 10/10
100000/100000 [==============================] - 113s 1ms/step - loss: 2.1579 - acc: 0.2080 - val_loss: 2.1833 - val_acc: 0.1987
Out[119]:
<keras.callbacks.History at 0x192d892b0>

In [121]:
model.fit(x_train, y_train,
          batch_size=10240,
          epochs=2,
          validation_data=[x_test, y_test])


Train on 100000 samples, validate on 100000 samples
Epoch 1/2
100000/100000 [==============================] - 18s 183us/step - loss: 1.7636 - acc: 0.3695 - val_loss: 2.0681 - val_acc: 0.2857
Epoch 2/2
100000/100000 [==============================] - 18s 176us/step - loss: 1.7558 - acc: 0.3744 - val_loss: 2.0647 - val_acc: 0.2868
Out[121]:
<keras.callbacks.History at 0x1a5e005c0>

In [122]:
y_pred = model.predict(x_test)
predicted = np.argmax(y_pred, axis=1)
confusion_matrix(y_test_value, predicted)


Out[122]:
array([[1152,  964, 1188, 1183,  896,  947,  895,  789, 1236,  750],
       [ 912, 1745,  802,  828, 1418,  633, 1308,  977,  861,  516],
       [ 905,  722, 2160,  904, 1224, 1186,  572,  825,  979,  523],
       [ 859,  633,  823, 2042,  660, 1214, 1071,  865,  876,  957],
       [ 599, 1074, 1174,  536, 2915,  735,  837, 1127,  640,  363],
       [ 701,  367, 1100, 1265,  719, 2588,  725, 1062,  744,  729],
       [ 597, 1088,  474, 1141,  808,  690, 2615, 1361,  658,  568],
       [ 495,  690,  727,  729, 1215, 1004, 1155, 2928,  547,  510],
       [ 577,  407,  652,  559,  355,  533,  348,  337, 5154, 1078],
       [ 336,  201,  380,  794,  187,  586,  454,  370, 1308, 5384]])
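
One way to read the matrix above is per-category accuracy (rows are true labels, columns are predictions); categories 8 and 9 are recovered far better than the rest. A minimal sketch:

In [ ]:
# Per-class accuracy: diagonal (correct) divided by row sums (true counts).
cm = confusion_matrix(y_test_value, predicted)
cm.diagonal() / cm.sum(axis=1)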

In [123]:
y_pred = model.predict(x_test)

In [108]:
predicted = np.argmax(y_pred, axis=1)

In [110]:
confusion_matrix(y_test_value, predicted)


Out[110]:
array([[107, 102, 141, 114,  57, 104,  74,  73, 138,  90],
       [ 82, 123,  97,  81, 108, 102, 100, 111, 118,  78],
       [ 88, 102, 133,  79,  88, 129,  48,  93, 123, 117],
       [ 81,  91, 122, 133,  68, 143,  87,  77,  97, 101],
       [ 66, 121, 128,  87, 114, 109,  80, 116, 105,  74],
       [ 89,  87, 115, 132,  72, 137,  71,  88, 109, 100],
       [ 70, 125,  94, 114, 113, 115, 107, 106,  89,  67],
       [ 64, 102, 119, 110, 105, 131,  94, 109,  86,  80],
       [ 96,  86, 124,  69,  66, 103,  40,  77, 219, 120],
       [ 82,  71, 126, 110,  61, 140,  45,  85, 128, 152]])

In [79]:
y_test[-1]


Out[79]:
array([0., 0., 0., 1., 0., 0., 0., 0., 0., 0.], dtype=float32)

In [96]:
y_test.sum(axis=0)


Out[96]:
array([2327., 1264., 1173., 1126.,  918.,  839.,  789.,  724.,  446.,
        394.], dtype=float32)

In [72]:
y_test


Out[72]:
array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.]], dtype=float32)

In [48]:
max_features = 20000
# cut texts after this number of words
# (among top max_features most common words)
maxlen = 9
batch_size = 32

print(len(x_train), 'train sequences')


print('Pad sequences (samples x time)')
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
print('x_train shape:', x_train.shape)
y_train = np.array(y_train)

model = Sequential()
model.add(Embedding(max_features, 128, input_length=maxlen))
model.add(Bidirectional(LSTM(64)))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

# try using different optimizers and different optimizer configs
model.compile('adam', 'binary_crossentropy', metrics=['accuracy'])

print('Train...')
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=40,
          validation_data=[x_train, y_train])  # note: validates on the training set itself


1000 train sequences
Pad sequences (samples x time)
x_train shape: (1000, 9)
Train...
Train on 1000 samples, validate on 1000 samples
Epoch 1/40
1000/1000 [==============================] - 4s 4ms/step - loss: 0.6922 - acc: 0.5350 - val_loss: 0.6884 - val_acc: 0.5350
Epoch 2/40
1000/1000 [==============================] - 1s 1ms/step - loss: 0.6875 - acc: 0.5500 - val_loss: 0.6816 - val_acc: 0.5980
Epoch 3/40
1000/1000 [==============================] - 1s 1ms/step - loss: 0.6810 - acc: 0.5840 - val_loss: 0.6734 - val_acc: 0.5810
Epoch 4/40
1000/1000 [==============================] - 1s 1ms/step - loss: 0.6768 - acc: 0.5630 - val_loss: 0.6698 - val_acc: 0.6030
Epoch 5/40
1000/1000 [==============================] - 1s 1ms/step - loss: 0.6679 - acc: 0.5940 - val_loss: 0.6592 - val_acc: 0.6030
Epoch 6/40
1000/1000 [==============================] - 1s 1ms/step - loss: 0.6623 - acc: 0.6020 - val_loss: 0.6546 - val_acc: 0.6100
Epoch 7/40
1000/1000 [==============================] - 1s 1ms/step - loss: 0.6570 - acc: 0.6090 - val_loss: 0.6504 - val_acc: 0.6180
Epoch 8/40
1000/1000 [==============================] - 1s 1ms/step - loss: 0.6569 - acc: 0.6080 - val_loss: 0.6492 - val_acc: 0.6120
Epoch 9/40
1000/1000 [==============================] - 1s 1ms/step - loss: 0.6554 - acc: 0.6130 - val_loss: 0.6447 - val_acc: 0.6150
Epoch 10/40
1000/1000 [==============================] - 1s 1ms/step - loss: 0.6488 - acc: 0.6150 - val_loss: 0.6462 - val_acc: 0.6300
Epoch 11/40
1000/1000 [==============================] - 1s 1ms/step - loss: 0.6467 - acc: 0.6280 - val_loss: 0.6388 - val_acc: 0.6190
Epoch 12/40
1000/1000 [==============================] - 1s 1ms/step - loss: 0.6450 - acc: 0.6130 - val_loss: 0.6339 - val_acc: 0.6210
Epoch 13/40
1000/1000 [==============================] - 1s 1ms/step - loss: 0.6444 - acc: 0.6250 - val_loss: 0.6289 - val_acc: 0.6300
Epoch 14/40
1000/1000 [==============================] - 1s 1ms/step - loss: 0.6347 - acc: 0.6230 - val_loss: 0.6256 - val_acc: 0.6320
Epoch 15/40
1000/1000 [==============================] - 1s 1ms/step - loss: 0.6363 - acc: 0.6310 - val_loss: 0.6226 - val_acc: 0.6390
Epoch 16/40
1000/1000 [==============================] - 1s 1ms/step - loss: 0.6333 - acc: 0.6280 - val_loss: 0.6233 - val_acc: 0.6400
Epoch 17/40
1000/1000 [==============================] - 1s 1ms/step - loss: 0.6294 - acc: 0.6280 - val_loss: 0.6164 - val_acc: 0.6430
Epoch 18/40
1000/1000 [==============================] - 1s 1ms/step - loss: 0.6281 - acc: 0.6360 - val_loss: 0.6119 - val_acc: 0.6510
Epoch 19/40
1000/1000 [==============================] - 1s 1ms/step - loss: 0.6159 - acc: 0.6440 - val_loss: 0.6100 - val_acc: 0.6500
Epoch 20/40
1000/1000 [==============================] - 1s 1ms/step - loss: 0.6204 - acc: 0.6440 - val_loss: 0.6064 - val_acc: 0.6640
Epoch 21/40
1000/1000 [==============================] - 1s 1ms/step - loss: 0.6137 - acc: 0.6510 - val_loss: 0.5981 - val_acc: 0.6720
Epoch 22/40
1000/1000 [==============================] - 1s 1ms/step - loss: 0.6029 - acc: 0.6650 - val_loss: 0.6042 - val_acc: 0.6740
Epoch 23/40
1000/1000 [==============================] - 1s 1ms/step - loss: 0.6011 - acc: 0.6570 - val_loss: 0.5867 - val_acc: 0.6880
Epoch 24/40
1000/1000 [==============================] - 1s 1ms/step - loss: 0.6029 - acc: 0.6660 - val_loss: 0.5810 - val_acc: 0.6800
Epoch 25/40
1000/1000 [==============================] - 1s 1ms/step - loss: 0.5945 - acc: 0.6680 - val_loss: 0.5717 - val_acc: 0.6830
Epoch 26/40
1000/1000 [==============================] - 1s 1ms/step - loss: 0.5866 - acc: 0.6730 - val_loss: 0.5715 - val_acc: 0.6930
Epoch 27/40
1000/1000 [==============================] - 1s 1ms/step - loss: 0.5787 - acc: 0.6860 - val_loss: 0.5592 - val_acc: 0.7060
Epoch 28/40
1000/1000 [==============================] - 1s 1ms/step - loss: 0.5786 - acc: 0.6810 - val_loss: 0.5499 - val_acc: 0.7150
Epoch 29/40
1000/1000 [==============================] - 1s 1ms/step - loss: 0.5653 - acc: 0.6930 - val_loss: 0.5393 - val_acc: 0.7250
Epoch 30/40
1000/1000 [==============================] - 1s 1ms/step - loss: 0.5626 - acc: 0.7070 - val_loss: 0.5420 - val_acc: 0.7310
Epoch 31/40
1000/1000 [==============================] - 1s 1ms/step - loss: 0.5627 - acc: 0.6910 - val_loss: 0.5292 - val_acc: 0.7230
Epoch 32/40
1000/1000 [==============================] - 1s 1ms/step - loss: 0.5412 - acc: 0.7140 - val_loss: 0.5129 - val_acc: 0.7360
Epoch 33/40
1000/1000 [==============================] - 1s 1ms/step - loss: 0.5381 - acc: 0.7130 - val_loss: 0.4962 - val_acc: 0.7400
Epoch 34/40
1000/1000 [==============================] - 1s 1ms/step - loss: 0.5263 - acc: 0.7250 - val_loss: 0.5032 - val_acc: 0.7510
Epoch 35/40
1000/1000 [==============================] - 1s 1ms/step - loss: 0.5227 - acc: 0.7290 - val_loss: 0.4846 - val_acc: 0.7590
Epoch 36/40
1000/1000 [==============================] - 1s 1ms/step - loss: 0.5069 - acc: 0.7220 - val_loss: 0.4678 - val_acc: 0.7650
Epoch 37/40
1000/1000 [==============================] - 1s 1ms/step - loss: 0.4847 - acc: 0.7540 - val_loss: 0.4518 - val_acc: 0.7790
Epoch 38/40
1000/1000 [==============================] - 1s 1ms/step - loss: 0.4756 - acc: 0.7510 - val_loss: 0.4586 - val_acc: 0.7650
Epoch 39/40
1000/1000 [==============================] - 1s 1ms/step - loss: 0.4926 - acc: 0.7450 - val_loss: 0.4354 - val_acc: 0.7820
Epoch 40/40
1000/1000 [==============================] - 1s 1ms/step - loss: 0.4866 - acc: 0.7470 - val_loss: 0.4413 - val_acc: 0.7820
Out[48]:
<keras.callbacks.History at 0x17a19d860>

In [11]:
max_features = 20000
# cut texts after this number of words
# (among top max_features most common words)
maxlen = 100
batch_size = 32

print('Loading data...')
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')

print('Pad sequences (samples x time)')
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)
y_train = np.array(y_train)
y_test = np.array(y_test)

model = Sequential()
model.add(Embedding(max_features, 128, input_length=maxlen))
model.add(Bidirectional(LSTM(64)))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

# try using different optimizers and different optimizer configs
model.compile('adam', 'binary_crossentropy', metrics=['accuracy'])

print('Train...')
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=4,
          validation_data=[x_test, y_test])


Loading data...
Downloading data from https://s3.amazonaws.com/text-datasets/imdb.npz
17465344/17464789 [==============================] - 2s 0us/step
25000 train sequences
25000 test sequences
Pad sequences (samples x time)
x_train shape: (25000, 100)
x_test shape: (25000, 100)
Train...
Train on 25000 samples, validate on 25000 samples
Epoch 1/4
25000/25000 [==============================] - 98s 4ms/step - loss: 0.4393 - acc: 0.7982 - val_loss: 0.3589 - val_acc: 0.8412
Epoch 2/4
25000/25000 [==============================] - 99s 4ms/step - loss: 0.2482 - acc: 0.9024 - val_loss: 0.4077 - val_acc: 0.8422
Epoch 3/4
25000/25000 [==============================] - 99s 4ms/step - loss: 0.1685 - acc: 0.9390 - val_loss: 0.4364 - val_acc: 0.8392
Epoch 4/4
25000/25000 [==============================] - 98s 4ms/step - loss: 0.1138 - acc: 0.9591 - val_loss: 0.5522 - val_acc: 0.8333
Out[11]:
<keras.callbacks.History at 0x12c087e10>

In [17]:
data.head()


Out[17]:
pdb i seq dd category count s1 s2 s3 s4 s5 s6 s7 s8 s9
0 1igqB00 0 DKLKKAIVQ 9,13,11,15,15,19,9,9,13,15,17,5,9,11,15,9,13,1... -1 -1 2 8 9 8 8 0 7 17 13
1 1igqB00 1 KLKKAIVQV 9,9,13,15,17,21,5,9,11,15,17,9,13,15,19,9,13,1... -1 -1 8 9 8 8 0 7 17 13 17
2 1igqB00 2 LKKAIVQVE 5,9,11,15,17,21,9,13,15,19,23,9,13,17,19,9,13,... -1 -1 9 8 8 0 7 17 13 17 3
3 1igqB00 3 KKAIVQVEH 9,13,15,19,23,25,9,13,17,19,21,9,13,17,19,9,13... 9987 13 8 8 0 7 17 13 17 3 6
4 1igqB00 4 KAIVQVEHD 9,13,17,19,21,25,9,13,17,19,23,9,13,15,19,11,1... 6835 18 8 0 7 17 13 17 3 6 2

In [125]:
"""Example of using Hierarchical RNN (HRNN) to classify MNIST digits.

HRNNs can learn across multiple levels
of temporal hierarchy over a complex sequence.
Usually, the first recurrent layer of an HRNN
encodes a sentence (e.g. of word vectors)
into a  sentence vector.
The second recurrent layer then encodes a sequence of
such vectors (encoded by the first layer) into a document vector.
This document vector is considered to preserve both
the word-level and sentence-level structure of the context.

# References

- [A Hierarchical Neural Autoencoder for Paragraphs and Documents]
    (https://arxiv.org/abs/1506.01057)
    Encodes paragraphs and documents with HRNN.
    Results have shown that HRNN outperforms standard
    RNNs and may play some role in more sophisticated generation tasks like
    summarization or question answering.
- [Hierarchical recurrent neural network for skeleton based action recognition]
    (http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=7298714)
    Achieved state-of-the-art results on
    skeleton based action recognition with 3 levels
    of bidirectional HRNN combined with fully connected layers.

In the below MNIST example the first LSTM layer first encodes every
column of pixels of shape (28, 1) to a column vector of shape (128,).
The second LSTM layer encodes then these 28 column vectors of shape (28, 128)
to a image vector representing the whole image.
A final Dense layer is added for prediction.

After 5 epochs: train acc: 0.9858, val acc: 0.9864
"""
from __future__ import print_function

import keras
from keras.datasets import mnist
from keras.models import Model
from keras.layers import Input, Dense, TimeDistributed
from keras.layers import LSTM

# Training parameters.
batch_size = 32
num_classes = 10
epochs = 5

# Embedding dimensions.
row_hidden = 128
col_hidden = 128

# The data, split between train and test sets.
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Reshapes data to 4D for Hierarchical RNN.
x_train = x_train.reshape(x_train.shape[0], 28, 28, 1)
x_test = x_test.reshape(x_test.shape[0], 28, 28, 1)
x_train = x_train.astype('float32')
x_test = x_test.astype('float32')
x_train /= 255
x_test /= 255
print('x_train shape:', x_train.shape)
print(x_train.shape[0], 'train samples')
print(x_test.shape[0], 'test samples')

# Converts class vectors to binary class matrices.
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)

row, col, pixel = x_train.shape[1:]

# 4D input.
x = Input(shape=(row, col, pixel))

# Encodes a row of pixels using TimeDistributed Wrapper.
encoded_rows = TimeDistributed(LSTM(row_hidden))(x)

# Encodes columns of encoded rows.
encoded_columns = LSTM(col_hidden)(encoded_rows)

# Final predictions and model.
prediction = Dense(num_classes, activation='softmax')(encoded_columns)
model = Model(x, prediction)
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])

# Training.
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          verbose=1,
          validation_data=(x_test, y_test))

# Evaluation.
scores = model.evaluate(x_test, y_test, verbose=0)
print('Test loss:', scores[0])
print('Test accuracy:', scores[1])


Downloading data from https://s3.amazonaws.com/img-datasets/mnist.npz
11493376/11490434 [==============================] - 1s 0us/step
x_train shape: (60000, 28, 28, 1)
60000 train samples
10000 test samples
Train on 60000 samples, validate on 10000 samples
Epoch 1/5
  416/60000 [..............................] - ETA: 15:49 - loss: 2.2967 - acc: 0.1298
---------------------------------------------------------------------------
KeyboardInterrupt: training was interrupted manually during epoch 1 (full traceback through the keras/tensorflow internals elided).
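
Since the run above was interrupted mid-epoch, a cheaper sanity check is to verify the hierarchical wiring without training at all. The sketch below is illustrative only: it rebuilds the same two-level architecture and feeds it a random dummy batch to confirm that the row encoder and the image encoder produce the expected shapes.

In [ ]:
import numpy as np
from keras.models import Model
from keras.layers import Input, Dense, TimeDistributed, LSTM

# Same two-level architecture as above: pixel rows -> row vectors -> image vector.
x = Input(shape=(28, 28, 1))
encoded_rows = TimeDistributed(LSTM(128))(x)    # one LSTM per row: (batch, 28, 128)
encoded_columns = LSTM(128)(encoded_rows)       # over the 28 row vectors: (batch, 128)
prediction = Dense(10, activation='softmax')(encoded_columns)
model = Model(x, prediction)

# Shape check on a dummy batch instead of a full training run.
dummy = np.random.rand(2, 28, 28, 1).astype('float32')
print(Model(x, encoded_rows).predict(dummy).shape)  # expected: (2, 28, 128)
print(model.predict(dummy).shape)                   # expected: (2, 10)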

In [128]:
x_train[0]


Out[128]:
array([[[0.        ],
        [0.        ],
        ...,
        [0.        ]]], dtype=float32)

(Full 28 × 28 × 1 dump elided: float32 pixel intensities normalized to [0, 1], mostly zeros except along the digit's stroke.)
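
Dumping the raw array is hard to read; since the values are just pixel intensities normalized to [0, 1], rendering the digit is a clearer check. A minimal sketch, assuming x_train and the one-hot-encoded y_train from the cell above are still in scope:

In [ ]:
import matplotlib.pyplot as plt

# Drop the trailing channel axis and render the 28x28 digit.
plt.imshow(x_train[0].reshape(28, 28), cmap='gray')
plt.title('first training digit, label %d' % y_train[0].argmax())
plt.show()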

In [ ]: