In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os, re, sys

# sklearn stuff
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.externals import joblib  # deprecated alias; modern scikit-learn uses `import joblib`

# keras stuff
import keras
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, Conv2D, MaxPooling2D, Flatten
from keras.optimizers import SGD
from keras.metrics import binary_accuracy
from keras import callbacks

# scipy stuff
from scipy.interpolate import interp1d

from preprocessingTR import *

%matplotlib inline


Using Theano backend.

Things to consider:

  • use multiple steps instead of one per training example
  • use LSTM instead of ConvNet (see the sketch below)
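
A minimal sketch of the LSTM idea, assuming each 900-feature example is treated as 300 time steps of 3 channels (the same (3, 300) layout the ConvNets below use) and that num_classes is defined as in the split-and-scale cell; layer sizes are illustrative, not tuned.


In [ ]:
from keras.layers import LSTM

# hedged sketch only; the flat 900-feature rows would first be reshaped
# into (time, channels), e.g.:
# xTr_seq = xTr.reshape(-1, 3, 300).transpose(0, 2, 1)   # -> (n, 300, 3)
lstm_model = Sequential([
    LSTM(128, input_shape=(300, 3)),   # one summary vector per sequence
    Dropout(0.5),
    Dense(num_classes, activation='softmax')
])
lstm_model.compile(loss='categorical_crossentropy', optimizer='rmsprop',
                   metrics=['accuracy'])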

Load data


In [3]:
data = pd.read_pickle('./Data/processed/full.pickle')

In [4]:
labels = np.load('./Data/processed/labels.npy')

Split and scale the data, convert labels to 0-based indexing


In [5]:
xTr, xTe, yTr, yTe = train_test_split(data.values, labels, test_size=0.2)
num_classes = int(labels.max())
ss = StandardScaler()
xTr = ss.fit_transform(xTr)
xTe = ss.transform(xTe)
# labels run 1..num_classes; the mod maps class num_classes to 0 and leaves the
# rest unchanged, giving the 0-based encoding to_categorical expects
yTr = np.mod(yTr, num_classes)
yTe = np.mod(yTe, num_classes)

Convert labels to categorical


In [6]:
yTr_cat = keras.utils.to_categorical(yTr, num_classes=num_classes)
yTe_cat = keras.utils.to_categorical(yTe, num_classes=num_classes)

Simple linear model


In [ ]:
lr_fname = './Data/processed/lr_trained.pkl'
if os.path.isfile(lr_fname):
    lr = joblib.load(lr_fname)
else:
    lr = LogisticRegression(verbose=1, n_jobs=-1)
    lr.fit(xTr, yTr)

In [ ]:
lr.score(xTe, yTe)

In [ ]:
# spot-check the first ~100 test predictions against the true labels
for ctr, (a, b) in enumerate(zip(lr.predict(xTe), yTe)):
    if a == b:
        print(a, b)
    else:
        print(a, b, 'FAIL')
    if ctr >= 100:
        break

In [ ]:
# sklearn's signature is confusion_matrix(y_true, y_pred); passing predictions
# first transposes the usual orientation (rows = predictions, columns = true labels)
a = confusion_matrix(lr.predict(xTe), yTe)
plt.figure(figsize=(6,6))
plt.imshow(a/a.sum(axis=0), cmap='hot', interpolation='nearest')  # column-normalized
plt.colorbar()
plt.show()

In [ ]:
joblib.dump(lr, lr_fname)

Neural Networks

Fully connected


In [ ]:
model = Sequential([
        Dense(512, input_dim=900, activation='relu'),
        Dropout(0.5),
        Dense(512, activation='relu'),
        Dropout(0.5),
        Dense(256, activation='relu'),
        Dropout(0.5),
        Dense(128, activation='relu'),
        Dropout(0.5),
        Dense(num_classes, activation='softmax')  # output width matches the label encoding above
    ])
model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

In [ ]:
history = model.fit(xTr, yTr_cat, epochs=50, batch_size=128, verbose=1)
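
The callbacks import in the first cell is never used; here is a hedged sketch of wiring EarlyStopping into this fit (the patience value is arbitrary):


In [ ]:
# hedged sketch: stop once training loss stops improving for 3 epochs
early_stop = callbacks.EarlyStopping(monitor='loss', patience=3)
history = model.fit(xTr, yTr_cat, epochs=50, batch_size=128, verbose=1,
                    callbacks=[early_stop])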

In [ ]:
preds = model.predict(xTe)

In [ ]:
(preds.argmax(axis=1) == yTe).mean()

In [ ]:
plt.plot(history.history['acc'])
plt.title('accuracy')
plt.show()
plt.plot(history.history['loss'])
plt.title('loss')
plt.show()

ConvNet A

All vs All


In [7]:
xTr_conv = xTr.reshape(-1, 3, 300, 1)   # 900 features -> 3 channels x 300 samples x 1
xTe_conv = xTe.reshape(-1, 3, 300, 1)

In [ ]:
model = Sequential([
        Conv2D(128, (3,50), activation='relu', input_shape=(3,300,1), padding='same'),
        Conv2D(128, (3,30), activation='relu', padding='same'),
        Conv2D(128, (3,10), activation='relu', padding='same'),
        Conv2D(128, (3,3), activation='relu', padding='same'),
        MaxPooling2D(pool_size=(2,2)),
        Flatten(),
        Dense(512, activation='relu'),
        Dropout(0.5),
        Dense(512, activation='relu'),
        Dropout(0.5),
        Dense(num_classes, activation='softmax')  # match the label encoding instead of hard-coding 50
    ])
sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
model.compile(loss='categorical_crossentropy', optimizer=sgd, metrics=['accuracy'])

In [ ]:
history = model.fit(xTr_conv, yTr_cat, epochs=10, batch_size=128, validation_data=(xTe_conv, yTe_cat))

In [ ]:
preds = model.predict(xTe_conv)
(preds.argmax(axis=1) == yTe).mean()

In [ ]:
plt.plot(history.history['acc'])
plt.title('accuracy')
plt.show()
plt.plot(history.history['loss'])
plt.title('loss')
plt.show()

ConvNet B

Frank vs Rest


In [21]:
(yTr == 0).sum(), (yTe == 0).sum()


Out[21]:
(31, 7)

In [27]:
def random_others(labels, target_class):
    """Build a boolean mask that keeps every example of target_class plus
    roughly an equal number of randomly chosen examples from other classes."""
    out = []
    ctr = 0
    n = (labels == target_class).sum()   # cap on how many "others" to keep
    for i in range(len(labels)):
        if labels[i] == target_class:
            out.append(True)
        else:
            # keep each other-class example with probability 0.5, up to the cap
            if np.random.rand() > 0.5 and ctr < n:
                ctr += 1
                out.append(True)
            else:
                out.append(False)
    return np.array(out)

In [82]:
tr_idx = random_others(yTr, 0)
te_idx = random_others(yTe, 0)

Binarize


In [83]:
conditionB = lambda y: y == 0                  # True for the target class (label 0 = Frank)
yTr_bin = conditionB(yTr[tr_idx]).astype(int)  # boolean -> 0/1 labels
yTe_bin = conditionB(yTe[te_idx]).astype(int)

In [84]:
model = Sequential([
        Conv2D(128, (3,50), activation='relu', input_shape=(3,300,1), padding='same'),
        Conv2D(128, (3,30), activation='relu', padding='same'),
        Conv2D(128, (3,3), activation='relu', padding='same'),
        MaxPooling2D(pool_size=(3,10)),
        Flatten(),
        Dense(512, activation='relu'),
        Dropout(0.5),
        Dense(512, activation='relu'),
        Dropout(0.5),
        Dense(1, activation='sigmoid')
    ])
sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
model.compile(loss='binary_crossentropy', optimizer=sgd, metrics=['accuracy'])

In [89]:
history = model.fit(xTr_conv[tr_idx, :, :, :],
                    yTr_bin,
                    epochs=4,
                    batch_size=128,
                    validation_data=(xTe_conv[te_idx, :, :, :], yTe_bin))


Train on 34 samples, validate on 8 samples
Epoch 1/4
34/34 [==============================] - 0s - loss: 0.2575 - acc: 1.0000 - val_loss: 0.2209 - val_acc: 1.0000
Epoch 2/4
34/34 [==============================] - 0s - loss: 0.2165 - acc: 0.9412 - val_loss: 0.1669 - val_acc: 1.0000
Epoch 3/4
34/34 [==============================] - 0s - loss: 0.1917 - acc: 0.9706 - val_loss: 0.1275 - val_acc: 1.0000
Epoch 4/4
34/34 [==============================] - 0s - loss: 0.1105 - acc: 1.0000 - val_loss: 0.0922 - val_acc: 1.0000

In [90]:
plt.plot(history.history['acc'])
plt.title('accuracy')
plt.show()
plt.plot(history.history['loss'])
plt.title('loss')
plt.show()



In [91]:
preds = model.predict(xTe_conv[te_idx, :, :, :])
(preds.flatten().round() == yTe_bin).mean()


Out[91]:
1.0

In [92]:
preds.flatten().round()


Out[92]:
array([ 0.,  0.,  0.,  0.,  1.,  1.,  1.,  1.], dtype=float32)
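
Caveat: after subsampling there are only 34 training and 8 validation examples here, so the perfect score above is not very informative; the three-class experiment below uses a larger sample.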

ConvNet C

Frank vs Mukund vs Rest


In [12]:
def random_others(labels, target_class):
    """Variant of random_others above: the cap on kept "other" examples is
    twice the target-class count rather than equal to it."""
    out = []
    ctr = 0
    n = 2*(labels == target_class).sum()
    for i in range(len(labels)):
        if labels[i] == target_class:
            out.append(True)
        else:
            # keep each other-class example with probability 0.5, up to the cap
            if np.random.rand() > 0.5 and ctr < n:
                ctr += 1
                out.append(True)
            else:
                out.append(False)
    return np.array(out)

In [8]:
xTr_conv = xTr.reshape(-1, 3, 300, 1)
xTe_conv = xTe.reshape(-1, 3, 300, 1)

In [13]:
muk_idx_tr = random_others(yTr, 51)   # class 51: Mukund
muk_idx_te = random_others(yTe, 51)

fra_idx_tr = random_others(yTr, 0)    # class 0: Frank
fra_idx_te = random_others(yTe, 0)

tr_idx = np.logical_or(muk_idx_tr, fra_idx_tr)
te_idx = np.logical_or(muk_idx_te, fra_idx_te)

In [14]:
def trinarize(labels, class1, class2):
    """Map class1 -> 1, class2 -> 2, and everything else -> 0."""
    out = []
    for l in labels:
        if l == class1:
            out.append(1)
        elif l == class2:
            out.append(2)
        else:
            out.append(0)
    return np.array(out)

In [15]:
yTr_tri = trinarize(yTr[tr_idx], 51, 0)
yTe_tri = trinarize(yTe[te_idx], 51, 0)

In [16]:
# note: this overwrites the all-class yTr_cat / yTe_cat from the earlier cells
yTr_cat = keras.utils.to_categorical(yTr_tri, num_classes=3)
yTe_cat = keras.utils.to_categorical(yTe_tri, num_classes=3)

In [17]:
model = Sequential([
        Conv2D(128, (3,50), activation='relu', input_shape=(3,300,1), padding='same'),
        Conv2D(128, (3,30), activation='relu', padding='same'),
        Conv2D(128, (3,3), activation='relu', padding='same'),
        MaxPooling2D(pool_size=(3,10)),
        Flatten(),
        Dense(512, activation='relu'),
        Dropout(0.5),
        Dense(512, activation='relu'),
        Dropout(0.5),
        Dense(3, activation='softmax')
    ])
sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
model.compile(loss='categorical_crossentropy', optimizer=sgd, metrics=['accuracy'])

In [18]:
history = model.fit(xTr_conv[tr_idx,:,:,:],
                    yTr_cat,
                    epochs=30,
                    batch_size=128,
                    validation_data=(xTe_conv[te_idx,:,:,:], yTe_cat))


Train on 1146 samples, validate on 267 samples
Epoch 1/30
1146/1146 [==============================] - 6s - loss: 0.8921 - acc: 0.6222 - val_loss: 0.6322 - val_acc: 0.6667
Epoch 2/30
1146/1146 [==============================] - 5s - loss: 0.6057 - acc: 0.7103 - val_loss: 0.4567 - val_acc: 0.8502
Epoch 3/30
1146/1146 [==============================] - 5s - loss: 0.4788 - acc: 0.7914 - val_loss: 0.3755 - val_acc: 0.8989
Epoch 4/30
1146/1146 [==============================] - 5s - loss: 0.3783 - acc: 0.8613 - val_loss: 0.3169 - val_acc: 0.9176
Epoch 5/30
1146/1146 [==============================] - 5s - loss: 0.3036 - acc: 0.9023 - val_loss: 0.2673 - val_acc: 0.9176
Epoch 6/30
1146/1146 [==============================] - 5s - loss: 0.2626 - acc: 0.9110 - val_loss: 0.2334 - val_acc: 0.9064
Epoch 7/30
1146/1146 [==============================] - 5s - loss: 0.2160 - acc: 0.9311 - val_loss: 0.1770 - val_acc: 0.9251
Epoch 8/30
1146/1146 [==============================] - 5s - loss: 0.1610 - acc: 0.9459 - val_loss: 0.1343 - val_acc: 0.9551
Epoch 9/30
1146/1146 [==============================] - 5s - loss: 0.1081 - acc: 0.9660 - val_loss: 0.1039 - val_acc: 0.9663
Epoch 10/30
1146/1146 [==============================] - 5s - loss: 0.0876 - acc: 0.9747 - val_loss: 0.0934 - val_acc: 0.9663
Epoch 11/30
1146/1146 [==============================] - 5s - loss: 0.0611 - acc: 0.9834 - val_loss: 0.0818 - val_acc: 0.9663
Epoch 12/30
1146/1146 [==============================] - 5s - loss: 0.0542 - acc: 0.9825 - val_loss: 0.0764 - val_acc: 0.9700
Epoch 13/30
1146/1146 [==============================] - 5s - loss: 0.0368 - acc: 0.9913 - val_loss: 0.0716 - val_acc: 0.9813
Epoch 14/30
1146/1146 [==============================] - 5s - loss: 0.0220 - acc: 0.9956 - val_loss: 0.0750 - val_acc: 0.9663
Epoch 15/30
1146/1146 [==============================] - 5s - loss: 0.0198 - acc: 0.9948 - val_loss: 0.0682 - val_acc: 0.9663
Epoch 16/30
1146/1146 [==============================] - 5s - loss: 0.0140 - acc: 0.9974 - val_loss: 0.0697 - val_acc: 0.9850
Epoch 17/30
1146/1146 [==============================] - 5s - loss: 0.0117 - acc: 0.9974 - val_loss: 0.0870 - val_acc: 0.9738
Epoch 18/30
1146/1146 [==============================] - 5s - loss: 0.0127 - acc: 0.9965 - val_loss: 0.0696 - val_acc: 0.9775
Epoch 19/30
1146/1146 [==============================] - 5s - loss: 0.0106 - acc: 0.9983 - val_loss: 0.0672 - val_acc: 0.9700
Epoch 20/30
1146/1146 [==============================] - 5s - loss: 0.0137 - acc: 0.9956 - val_loss: 0.0728 - val_acc: 0.9700
Epoch 21/30
1146/1146 [==============================] - 5s - loss: 0.0091 - acc: 0.9983 - val_loss: 0.0895 - val_acc: 0.9738
Epoch 22/30
1146/1146 [==============================] - 5s - loss: 0.0103 - acc: 0.9965 - val_loss: 0.0680 - val_acc: 0.9813
Epoch 23/30
1146/1146 [==============================] - 5s - loss: 0.0053 - acc: 0.9991 - val_loss: 0.0682 - val_acc: 0.9775
Epoch 24/30
1146/1146 [==============================] - 5s - loss: 0.0043 - acc: 0.9991 - val_loss: 0.0742 - val_acc: 0.9738
Epoch 25/30
1146/1146 [==============================] - 5s - loss: 0.0051 - acc: 0.9991 - val_loss: 0.0674 - val_acc: 0.9738
Epoch 26/30
1146/1146 [==============================] - 5s - loss: 0.0052 - acc: 0.9983 - val_loss: 0.0655 - val_acc: 0.9775
Epoch 27/30
1146/1146 [==============================] - 5s - loss: 0.0031 - acc: 0.9991 - val_loss: 0.0713 - val_acc: 0.9813
Epoch 28/30
1146/1146 [==============================] - 5s - loss: 0.0034 - acc: 0.9991 - val_loss: 0.0838 - val_acc: 0.9738
Epoch 29/30
1146/1146 [==============================] - 5s - loss: 0.0028 - acc: 0.9991 - val_loss: 0.0820 - val_acc: 0.9813
Epoch 30/30
1146/1146 [==============================] - 5s - loss: 0.0041 - acc: 0.9991 - val_loss: 0.0766 - val_acc: 0.9813

In [20]:
plt.plot(history.history['acc'])
plt.title('accuracy')
plt.show()
plt.plot(history.history['loss'])
plt.title('loss')
plt.show()



In [21]:
preds = model.predict(xTe_conv[te_idx, :, :, :])
# mean agreement over every entry of the one-hot matrix, which overstates
# classification accuracy; argmax accuracy: (preds.argmax(axis=1) == yTe_tri).mean()
(yTe_cat == preds.round()).mean()


Out[21]:
0.98751560549313355

In [30]:



Out[30]:
0.98124673598252865

In [32]:



Out[32]:
array([[176,   2],
       [  3,  86]])
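
The f1_score import from the first cell also goes unused; a hedged sketch of a macro-averaged F1 for the three-class predictions above:


In [ ]:
# hedged sketch: per-class F1 averaged over the three classes,
# using preds and yTe_tri from the cells above
f1_score(yTe_tri, preds.argmax(axis=1), average='macro')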

In [ ]: