00-Data-Exploration


JLab ML Lunch 2 - Data Exploration

  • Second ML challenge hosted
  • On October 30th, a test dataset will be released, and predictions must be submitted within 24 hours
  • Let's take a look at the training data!

In [1]:
# IPython magic: select Matplotlib's "widget" backend for the figures below.
%matplotlib widget

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import axes3d
import imageio

Training Data


In [3]:
# Load the training set into a DataFrame.
X_train = pd.read_csv("MLchallenge2_training.csv")

# There are 150 columns, so preview only the position/momentum columns
# of the first two steps (the unsuffixed and the "1"-suffixed ones).
X_train.head()[
    ['x', 'y', 'z', 'px', 'py', 'pz',
     'x1', 'y1', 'z1', 'px1', 'py1', 'pz1']
]


Out[3]:
x y z px py pz x1 y1 z1 px1 py1 pz1
0 0.877079 1.32218 65 -0.243980 -0.053204 2.414260 -10.66900 0.330138 176.944 -0.253523 0.015487 2.408040
1 0.786361 -2.48294 65 0.103229 0.432216 2.592910 7.36649 15.502000 176.944 0.205638 0.393629 2.581810
2 -13.133900 -26.53090 65 0.064432 -0.020771 0.952952 -7.58617 -30.686700 176.944 0.026643 -0.051061 0.948479
3 18.454200 2.80469 65 -0.019384 0.069384 1.832590 18.04330 6.797470 176.944 0.013039 0.062029 1.824850
4 15.552100 -19.19600 65 -0.009768 -0.010642 2.366080 15.06810 -19.750200 176.944 -0.014308 -0.015936 2.351700

In [4]:
def plot_quiver_track(df, track_id, elev=None,
                      azim=None, dist=None):
    """Draw one track as a 3D quiver plot of its momentum vectors.

    The row ``df.loc[track_id]`` is read as consecutive groups of six
    values ``(x, y, z, px, py, pz)``, one group per step. One arrow is
    drawn per step, anchored at the position and pointing along the
    momentum. The data's ``z`` is plotted on the figure's first axis,
    ``x`` on the second and ``y`` on the third (see the axis labels).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns repeat the ``(x, y, z, px, py, pz)`` layout.
    track_id : label
        Index label of the row to plot; also used in the title.
    elev, azim, dist : float, optional
        Viewing attributes assigned to the 3D axes when not ``None``.

    Returns
    -------
    (fig, ax)
        The created figure and its 3D axes.
    """
    # Extract the track row and view it as one (x, y, z, px, py, pz)
    # group per step. The step count is derived from the row length
    # (150 columns -> 25 steps); a trailing partial group is ignored.
    track = np.asarray(df.loc[track_id].values, dtype=float)
    n_steps = len(track) // 6
    x, y, z, px, py, pz = track[:n_steps * 6].reshape(n_steps, 6).T

    # TODO: link the magnitude of the momentum,
    # np.sqrt(px**2 + py**2 + pz**2), to the arrow color.

    # Create our 3D figure. `fig.gca(projection=...)` is not accepted by
    # current Matplotlib, so the axes are added explicitly.
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    for axis in (ax.xaxis, ax.yaxis, ax.zaxis):
        axis.set_pane_color((1, 1, 1, 1))

    # Set the three 3D plot viewing attributes
    if elev is not None:
        ax.elev = elev
    if azim is not None:
        ax.azim = azim
    if dist is not None:
        # NOTE(review): newer Matplotlib releases deprecate Axes3D.dist;
        # confirm it still has an effect on the installed version.
        ax.dist = dist

    # Create our quiver plot
    ax.quiver(z, x, y, pz, px, py, length=14)

    # Labels for clarity
    ax.set_title("Track {}".format(track_id))
    ax.set_xlabel("z", fontweight="bold")
    ax.set_ylabel("x", fontweight="bold")
    ax.set_zlabel("y", fontweight="bold")
    plt.tight_layout()

    return fig, ax

In [5]:
# Plot the training row with index label 2 and display the figure.
fig, ax = plot_quiver_track(X_train, 2)
fig.show()



In [6]:
import os

gif_filename = "track-2-anim"

# Make sure the frame directory exists; savefig does not create it.
frame_dir = f'images/{gif_filename}'
os.makedirs(frame_dir, exist_ok=True)

# Starting camera position for the animation.
ax.elev = 50.
ax.azim = 90.
ax.dist = 9.

# Render 100 frames, lowering the elevation by 0.4 and advancing the
# azimuth by 1.5 between consecutive frames.
img_files = []
for n in range(0, 100):
    ax.elev = ax.elev-0.4
    ax.azim = ax.azim+1.5
    filename = f'{frame_dir}/img{n:03d}.png'
    img_files.append(filename)
    # Save the figure that owns `ax`, not whichever figure is current.
    ax.figure.savefig(filename, bbox_inches='tight')

In [7]:
# Read every rendered frame back in, in order, and write them out
# together as a single animated GIF.
images = [imageio.imread(frame_path) for frame_path in img_files]
imageio.mimsave('images/track-2.gif', images)

Now read in the example test data


In [8]:
# Naive first attempt: parse the test file using the training column names
# (presumably the file has no header row of its own — verify).
X_test = pd.read_csv("test_in.csv", names=X_train.columns)
# Spot-check the position columns of the unsuffixed, "15" and "23" steps.
X_test[['x', 'y', 'z', 'x15', 'y15', 'z15', 'x23', 'y23', 'z23']].head()


Out[8]:
x y z x15 y15 z15 x23 y23 z23
0 0.877 1.322 65.0 298.354 NaN NaN NaN NaN NaN
1 0.786 -2.483 65.0 298.354 NaN NaN NaN NaN NaN
2 -13.134 -26.531 65.0 -7.435 -37.461 298.354 NaN NaN NaN
3 18.454 2.805 65.0 19.788 10.501 298.354 NaN NaN NaN
4 15.552 -19.196 65.0 14.023 -20.481 298.354 13.669 -20.622 341.28

In [9]:
import missingno as mno
# Plot the missing-value matrix for the first 100 test rows.
# NOTE(review): this rebinds `ax`, replacing the 3D axes returned by
# plot_quiver_track earlier in the notebook.
ax = mno.matrix(X_test.head(100))


One caveat on the test data

  • The last value of each row is actually the z-value of the next step to be predicted, not the x-position
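  • ... but this value doesn't fall in the same column for every row, because the rows contain different numbers of recorded steps
  • Fix: insert two commas before the last number of each row, so that it skips the x and y columns and lands in the z column of the next step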

In [84]:
import re
from io import StringIO

In [93]:
# Read the whole raw test file into one string so it can be patched
# with a regex before it is parsed.
with open('test_in.csv', 'r') as f:
    data_str = f.read()

In [94]:
# Insert two empty fields (",,") in front of the last number of every row,
# then wrap the patched text in a file-like object for pandas. Anchoring
# on `$` with re.MULTILINE also patches a final row that has no trailing
# newline.
data_str_io = StringIO(
    re.sub(r"([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?)$", r",,\1",
           data_str, flags=re.MULTILINE)
)

In [95]:
# Parse the patched text, again labelling the columns with the training names.
X_test = pd.read_csv(data_str_io, names=X_train.columns)

In [96]:
# Preview the first rows of the re-parsed test data.
X_test.head()


Out[96]:
x y z px py pz x1 y1 z1 px1 ... z23 px23 py23 pz23 x24 y24 z24 px24 py24 pz24
0 0.877 1.322 65.0 -0.244 -0.053 2.414 -10.669 0.330 176.944 -0.254 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 0.786 -2.483 65.0 0.103 0.432 2.593 7.366 15.502 176.944 0.206 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2 -13.134 -26.531 65.0 0.064 -0.021 0.953 -7.586 -30.687 176.944 0.027 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 18.454 2.805 65.0 -0.019 0.069 1.833 18.043 6.797 176.944 0.013 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
4 15.552 -19.196 65.0 -0.010 -0.011 2.366 15.068 -19.750 176.944 -0.014 ... 341.28 -0.014 -0.002 2.351 NaN NaN 343.405 NaN NaN NaN

5 rows × 150 columns

Wrap these steps in a reusable function for later use


In [97]:
import re
from io import StringIO

def load_test_data(filename, columns=None):
    """Load a test CSV, moving each row's trailing value two columns right.

    Two empty fields are inserted in front of the last number of every
    row before parsing, so that value skips two columns (x and y in the
    training layout) and lands in the third one.

    Parameters
    ----------
    filename : str
        Path of the raw, header-less test CSV.
    columns : sequence of str, optional
        Column names for the result. Defaults to ``X_train.columns``,
        which must then already be defined in the notebook.

    Returns
    -------
    pandas.DataFrame
        The parsed data; fields absent from a row are NaN.
    """
    if columns is None:
        # Fall back to the training-set layout loaded earlier.
        columns = X_train.columns

    with open(filename, 'r') as f:
        data_str = f.read()

    # `$` with re.MULTILINE matches at every line end AND at the end of
    # the text, so the final row is patched even without a trailing "\n".
    patched = re.sub(
        r"([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?)$", r",,\1",
        data_str, flags=re.MULTILINE,
    )
    X_test = pd.read_csv(StringIO(patched), names=columns)

    return X_test

In [ ]: