Multi-Class Classifier on Particle Track Data


In [1]:
%matplotlib inline

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import os
import sys
import numpy as np
import math

Load the track parameters (including the angle values, phi)


In [3]:
# Load the track-parameter table (one row per training image) from the TRAIN directory.
track_params = pd.read_csv('../TRAIN/track_parms.csv')

In [4]:
# Display the last rows as a quick check of the columns and the row count.
track_params.tail()


Out[4]:
filename phi z phi_calc phi_regression sigma_regression
499995 img499995.png -4.509653 0.968584 -4.494071 -4.494040 0.014105
499996 img499996.png -1.595661 -7.397094 -1.645501 -1.645483 0.014181
499997 img499997.png 7.695264 -2.984060 7.799254 7.799186 0.013931
499998 img499998.png -1.898667 5.082713 -1.880849 -1.880834 0.014177
499999 img499999.png 4.275843 -2.266920 4.324348 4.324317 0.014112

Create our simple classification target


In [21]:
# Turn the continuous phi angle into a multi-class label by cutting it into
# 2-unit-wide intervals between -10 and 10. The bin edges are kept in
# `phi_bins` so that other data can be binned with exactly the same edges.
phi_intervals, phi_bins = pd.cut(
    track_params['phi'],
    bins=range(-10, 12, 2),
    retbins=True,
)
# Store the interval labels as plain strings (e.g. "(-2, 0]").
track_params['phi_binned'] = phi_intervals.astype(str)

In [22]:
# Display the first rows to confirm the new `phi_binned` column looks as expected.
track_params.head()


Out[22]:
filename phi z phi_calc phi_regression sigma_regression phi_binned
0 img000000.png -0.195900 -5.164839 -0.206930 -0.206928 0.014192 (-2, 0]
1 img000001.png -1.473349 5.784543 -1.409622 -1.409614 0.014184 (-2, 0]
2 img000002.png 9.206585 -2.295192 9.296442 9.296330 0.016293 (8, 10]
3 img000003.png 5.378890 4.685070 5.281532 5.281474 0.014072 (4, 6]
4 img000004.png -6.700401 -0.851756 -6.739551 -6.739504 0.013997 (-8, -6]

Create an image generator from this dataframe


In [23]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

In [24]:
# Shared image pipeline: multiply pixel values by 1/255 and reserve a quarter
# of whatever dataframe it is given for the "validation" subset.
DATAGEN = ImageDataGenerator(
    rescale=1.0 / 255.0,
    validation_split=0.25,
)

In [25]:
height = 100
width = 36

def create_generator(target, subset, class_mode,
                     idg=DATAGEN, df=track_params, N=1000):
    """
    Build a batched image iterator over the first ``N`` rows of ``df``.

    Arguments
    ---------
    target:     name of the dataframe column to use as the label (``y_col``)

    subset:     forwarded to ``flow_from_dataframe`` as ``subset``
                (e.g. "training" or "validation")

    class_mode: forwarded to ``flow_from_dataframe`` as ``class_mode``
                (e.g. "categorical")

    idg:        the ImageDataGenerator that produces the iterator

    df:         dataframe with a "filename" column (image paths relative to
                ../TRAIN) and the ``target`` column

    N:          number of leading rows of ``df`` to draw images from

    Returns
    -------
    The iterator returned by ``idg.flow_from_dataframe``; images are resized
    to (height, width) and served in shuffled batches of 32 with a fixed seed.
    """
    return idg.flow_from_dataframe(
        # Use the frame that was passed in; the global `track_params` is only
        # the default value of `df`.
        dataframe=df.head(N),
        directory="../TRAIN",
        x_col="filename",
        y_col=target,
        subset=subset,
        target_size=(height, width),
        batch_size=32,
        seed=314,
        shuffle=True,
        class_mode=class_mode,
    )

Create a very simple convolutional model from scratch


In [26]:
from tensorflow.keras import Sequential, Model
from tensorflow.keras.layers import (
    Conv2D, Activation, MaxPooling2D,
    Flatten, Dense, Dropout, Input
)

Okay, maybe that was too easy

  • I mean, if any pixels are lit up on the top half / bottom half, it's a smoking gun.
  • Let's make it harder with binned measurements and treat it as categorical.

In [27]:
# The training and validation iterators share every setting except the
# subset they draw from, so the common arguments are spelled out once.
mc_generator_settings = dict(
    target="phi_binned",
    class_mode="categorical",
    N=10000,
)
mc_train_generator = create_generator(subset="training", **mc_generator_settings)
mc_val_generator = create_generator(subset="validation", **mc_generator_settings)


Found 7500 validated image filenames belonging to 10 classes.
Found 2500 validated image filenames belonging to 10 classes.

Similar model, with some tweaks


In [28]:
width  = 36
height = 100
channels = 3

def multiclass_classifier(num_classes=10, input_shape=None):
    """
    Build and compile a small convolutional softmax classifier.

    Architecture: one 3x3 convolution (32 filters) + ReLU + 2x2 max-pooling,
    then a 32-unit dense layer + ReLU with 50% dropout, then a softmax output.

    Arguments
    ---------
    num_classes: width of the softmax output layer. Should equal the number
                 of classes reported by the data generator (10 phi bins here).

    input_shape: (rows, cols, channels) of the input images. Defaults to the
                 notebook-level (height, width, channels).

    Returns
    -------
    A compiled ``Sequential`` model (categorical cross-entropy loss, Adam
    optimizer, accuracy metric).
    """
    if input_shape is None:
        # Resolved at call time so the current notebook-level values are used.
        input_shape = (height, width, channels)

    model = Sequential()

    # Convolutional layer
    model.add(Conv2D(32, (3, 3), input_shape=input_shape))
    model.add(Activation('relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))

    # Dense classification head
    model.add(Flatten())
    model.add(Dense(32))
    model.add(Activation('relu'))
    model.add(Dropout(0.5))
    model.add(Dense(num_classes))
    model.add(Activation('softmax'))

    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    return model

In [29]:
# Number of *complete* batches each iterator can serve per pass
# (any trailing partial batch is not counted).
STEP_SIZE_TRAIN, STEP_SIZE_VAL = (
    gen.n // gen.batch_size
    for gen in (mc_train_generator, mc_val_generator)
)

In [30]:
mc_model = multiclass_classifier()
# `Model.fit` accepts the generator directly; `fit_generator` is a deprecated
# alias for the same behaviour.
mc_history = mc_model.fit(
    mc_train_generator,
    steps_per_epoch=STEP_SIZE_TRAIN,
    validation_data=mc_val_generator,
    validation_steps=STEP_SIZE_VAL,
    epochs=10,
)


Epoch 1/10
234/234 [==============================] - 8s 34ms/step - loss: 0.2956 - accuracy: 0.9138 - val_loss: 0.0671 - val_accuracy: 0.9736
Epoch 2/10
234/234 [==============================] - 7s 28ms/step - loss: 0.1325 - accuracy: 0.9530 - val_loss: 0.0523 - val_accuracy: 0.9788
Epoch 3/10
234/234 [==============================] - 7s 28ms/step - loss: 0.0960 - accuracy: 0.9657 - val_loss: 0.0428 - val_accuracy: 0.9836
Epoch 4/10
234/234 [==============================] - 7s 29ms/step - loss: 0.0875 - accuracy: 0.9680 - val_loss: 0.0397 - val_accuracy: 0.9848
Epoch 5/10
234/234 [==============================] - 7s 29ms/step - loss: 0.0824 - accuracy: 0.9697 - val_loss: 0.0370 - val_accuracy: 0.9844
Epoch 6/10
234/234 [==============================] - 7s 30ms/step - loss: 0.0783 - accuracy: 0.9695 - val_loss: 0.0334 - val_accuracy: 0.9856
Epoch 7/10
234/234 [==============================] - 7s 29ms/step - loss: 0.0766 - accuracy: 0.9716 - val_loss: 0.0270 - val_accuracy: 0.9900
Epoch 8/10
234/234 [==============================] - 7s 29ms/step - loss: 0.0658 - accuracy: 0.9730 - val_loss: 0.0287 - val_accuracy: 0.9876
Epoch 9/10
234/234 [==============================] - 7s 29ms/step - loss: 0.0637 - accuracy: 0.9767 - val_loss: 0.0265 - val_accuracy: 0.9880
Epoch 10/10
234/234 [==============================] - 7s 28ms/step - loss: 0.0644 - accuracy: 0.9754 - val_loss: 0.0267 - val_accuracy: 0.9900

In [31]:
# Training vs. validation accuracy per epoch, labelled so that the figure
# can be read on its own.
fig, ax = plt.subplots()
ax.plot(mc_history.history['accuracy'], label="Train Accuracy")
ax.plot(mc_history.history['val_accuracy'], label="Validation Accuracy")
ax.set(title="Multi-class classifier accuracy",
       xlabel="Epoch (0-indexed)",
       ylabel="Accuracy")
ax.legend()
plt.show()


Check out predictions on Holdout data


In [32]:
holdout_track_params = pd.read_csv('../VALIDATION/track_parms.csv')

# Bin the holdout phi values with the bin edges obtained from the training
# data (`phi_bins`) and store the interval labels as strings, mirroring how
# the training target was built.
holdout_track_params['phi_binned'] = pd.cut(
    holdout_track_params['phi'],
    bins=phi_bins,
).astype(str)

In [33]:
# Holdout iterator: every image, in dataframe order (shuffle=False) so that
# predictions line up row-for-row with `holdout_track_params`.
#
# The class list is pinned to the one the model was trained on. If left to be
# inferred from the holdout labels, a missing or extra label value would shift
# the class indices and silently misalign them with the model's outputs.
# NOTE(review): rows whose label is not a training class are excluded by
# Keras; check the "Found N validated image filenames" count below.
mc_holdout_generator = DATAGEN.flow_from_dataframe(
    dataframe=holdout_track_params,
    directory="../VALIDATION",
    x_col="filename",
    y_col="phi_binned",
    subset=None,
    target_size=(height, width),
    batch_size=32,
    seed=314,
    shuffle=False,
    class_mode="categorical",
    classes=sorted(mc_train_generator.class_indices,
                   key=mc_train_generator.class_indices.get),
)


Found 50000 validated image filenames belonging to 10 classes.

In [34]:
# `Sequential.predict_classes` is deprecated and has been removed from later
# TensorFlow 2.x releases. For a softmax output it is equivalent to taking the
# argmax of the predicted class probabilities.
holdout_track_params['y_pred'] = np.argmax(
    mc_model.predict(mc_holdout_generator),
    axis=1,
)

In [35]:
# Integer class index the generator assigned to each holdout image
# (see `mc_holdout_generator.class_indices` for the label -> index mapping).
holdout_track_params['y_true'] = mc_holdout_generator.classes

In [36]:
import numpy as np
from sklearn.metrics import confusion_matrix

def plot_confusion_matrix(cm,
                          target_names,
                          title='Confusion matrix',
                          cmap=None,
                          normalize=True):
    """
    Given a sklearn confusion matrix (cm), make a nice plot.

    Arguments
    ---------
    cm:           confusion matrix from sklearn.metrics.confusion_matrix
                  (any 2-D array-like of counts; rows are true classes)

    target_names: given classification classes such as [0, 1, 2]
                  the class names, for example: ['high', 'medium', 'low'].
                  Pass None to leave the default numeric ticks.

    title:        the text to display at the top of the matrix

    cmap:         the gradient of the values displayed from matplotlib.pyplot.cm
                  see http://matplotlib.org/examples/color/colormaps_reference.html
                  plt.get_cmap('jet') or plt.cm.Blues

    normalize:    If False, plot the raw numbers
                  If True, plot the proportions (each row divided by its sum;
                  rows with no samples are shown as zeros)

    Returns
    -------
    None. The figure is shown with plt.show().

    Usage
    -----
    plot_confusion_matrix(cm           = cm,                  # confusion matrix created by
                                                              # sklearn.metrics.confusion_matrix
                          normalize    = True,                # show proportions
                          target_names = y_labels_vals,       # list of names of the classes
                          title        = best_estimator_name) # title of graph

    Citation
    --------
    http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html

    """
    import matplotlib.pyplot as plt
    import numpy as np
    import itertools

    # Accept nested lists as well as ndarrays.
    cm = np.asarray(cm)

    # Accuracy is always derived from the raw counts, before any normalisation.
    total = float(np.sum(cm))
    accuracy = np.trace(cm) / total if total else float('nan')
    misclass = 1 - accuracy

    if cmap is None:
        cmap = plt.get_cmap('Blues')

    if normalize:
        # Normalise *before* drawing so the heatmap colours, the printed
        # numbers and the text-contrast threshold all describe the same
        # matrix. Rows without samples stay at zero instead of becoming NaN.
        row_totals = cm.sum(axis=1, keepdims=True)
        cm = np.divide(cm, row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)

    plt.figure(figsize=(10, 8))
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    if target_names is not None:
        tick_marks = np.arange(len(target_names))
        plt.xticks(tick_marks, target_names, rotation=45)
        plt.yticks(tick_marks, target_names)

    # Cells darker than the threshold get white text so they stay readable.
    thresh = cm.max() / 1.5 if normalize else cm.max() / 2
    cell_format = "{:0.4f}" if normalize else "{:,}"
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cell_format.format(cm[i, j]),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label\naccuracy={:0.4f}; misclass={:0.4f}'.format(accuracy, misclass))
    plt.show()

In [37]:
# Re-use the predictions and true class indices already stored on the holdout
# frame instead of running the model over all holdout images a second time.
y_pred = holdout_track_params['y_pred'].values
y_true = holdout_track_params['y_true'].values

# Keras assigns class indices in alphanumeric order of the label *strings*
# (e.g. "(-10, -8]", "(-2, 0]", "(-4, -2]", ...), not in numeric bin order.
# A hand-written, numerically ordered label list would therefore be attached
# to the wrong rows/columns. Take the label -> index mapping from the
# generator and order the matrix by each bin's lower edge.
def _bin_lower_edge(label):
    """Lower edge of an interval label such as "(-2, 0]"; unparsable labels sort last."""
    try:
        return float(label.strip('(]').split(',')[0])
    except ValueError:
        return float('inf')

ordered_classes = sorted(mc_holdout_generator.class_indices.items(),
                         key=lambda item: _bin_lower_edge(item[0]))
label_list = [label for label, _ in ordered_classes]
label_order = [index for _, index in ordered_classes]

plot_confusion_matrix(confusion_matrix(y_true, y_pred, labels=label_order),
                      target_names=label_list,
                      normalize=False)



In [ ]: