JLab ML Lunch 2 - Data Exploration

Second ML challenge hosted
On October 30th, a test dataset will be released, and predictions must be submitted within 24 hours
Let's take a look at the training data!



In [1]:

    
# Use the interactive jupyter-matplotlib (ipympl) widget backend for figures
%matplotlib widget



In [2]:

    
import os

import imageio
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from mpl_toolkits.mplot3d import axes3d

Training Data

This shows the state vector ($x,y,z, p_x, p_y, p_z$) for the origin and 24 detector stations
The jupyter-matplotlib widget (https://github.com/matplotlib/jupyter-matplotlib) is used for interactive visualizations



In [3]:

    
# Load the full training table into memory
X_train = pd.read_csv("MLchallenge2_training.csv")

# The frame has 150 columns, so preview only the first two state vectors
# (presumably the origin and the first detector station)
X_train.loc[:, ['x', 'y', 'z', 'px', 'py', 'pz',
                'x1', 'y1', 'z1', 'px1', 'py1', 'pz1']].head()



In [4]:

    
def plot_quiver_track(df, track_id, elev=None,
                      azim=None, dist=None):
    """Draw a single track as a 3D quiver plot (position + momentum arrows).

    Parameters
    ----------
    df : pandas.DataFrame
        Track table whose rows are laid out as repeating
        (x, y, z, px, py, pz) groups; the first 25 groups are plotted.
    track_id : index label
        Label of the row to plot (looked up with ``df.loc``).
    elev, azim, dist : float, optional
        3D viewing attributes; each is only applied when not ``None``.

    Returns
    -------
    (fig, ax)
        The created figure and its 3D axes.
    """
    n_points = 25    # number of state vectors plotted per track
    n_features = 6   # x, y, z, px, py, pz

    # Extract the track row
    track = df.loc[track_id].values

    # Stride through the flat row to collect every value of each feature
    stop = n_points * n_features
    x, y, z, px, py, pz = (track[offset:stop:n_features]
                           for offset in range(n_features))

    # TODO: link the momentum magnitude sqrt(px^2 + py^2 + pz^2) to the
    # arrow color; an earlier attempt was buggy, so the (unused)
    # magnitude computation has been dropped for now.

    # Create our 3D figure. add_subplot is used instead of
    # fig.gca(projection='3d') because passing keyword arguments to gca()
    # is not supported by newer matplotlib releases.
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    for axis in (ax.xaxis, ax.yaxis, ax.zaxis):
        axis.set_pane_color((1, 1, 1, 1))

    # Set the three 3D plot viewing attributes
    if elev is not None:
        ax.elev = elev
    if azim is not None:
        ax.azim = azim
    if dist is not None:
        ax.dist = dist

    # Create our quiver plot; z is drawn along the first plot axis,
    # matching the axis labels below
    ax.quiver(z, x, y, pz, px, py, length=14)

    # Labels for clarity
    ax.set_title("Track {}".format(track_id))
    ax.set_xlabel("z", fontweight="bold")
    ax.set_ylabel("x", fontweight="bold")
    ax.set_zlabel("y", fontweight="bold")
    # Lay out this figure explicitly rather than "the current figure"
    fig.tight_layout()

    return fig, ax



In [5]:

    
# Render track 2 from the training set and keep the handles for later cells
fig, ax = plot_quiver_track(df=X_train, track_id=2)
fig.show()



In [6]:

    
gif_filename = "track-2-anim"

# savefig does not create missing directories, so make sure the frame
# folder exists before the first frame is written
frame_dir = f'images/{gif_filename}'
os.makedirs(frame_dir, exist_ok=True)

# Starting camera position for the animation
ax.elev = 50.
ax.azim = 90.
ax.dist = 9.

# Rotate the camera a little on each step and save one PNG per frame
img_files = []
for n in range(0, 100):
    ax.elev = ax.elev-0.4
    ax.azim = ax.azim+1.5
    filename = f'{frame_dir}/img{n:03d}.png'
    img_files.append(filename)
    # Save through the figure object so the right figure is written even
    # if another figure has become pyplot's "current" one
    fig.savefig(filename, bbox_inches='tight')



In [7]:

    
# Read the saved frames back in order and stitch them into one animated GIF
images = [imageio.imread(frame_path) for frame_path in img_files]
imageio.mimsave('images/track-2.gif', images)

Now read in the example test data



In [8]:

    
# Naive read: the test file is parsed as-is, labelled with the training
# frame's column names
X_test = pd.read_csv("test_in.csv", names=X_train.columns)

# Peek at the position columns of the first, 15th and 23rd numbered groups
X_test.loc[:, ['x', 'y', 'z', 'x15', 'y15', 'z15', 'x23', 'y23', 'z23']].head()



In [9]:

    
import missingno as mno

# Visualize which cells are missing in the first 100 test rows.
# NOTE: this re-binds `ax`, which earlier held the 3D track axes.
ax = mno.matrix(X_test.iloc[:100])

One caveat on the test data

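The last value of each row is actually the z-value of the next step to be predicted, but a naive read places it in an x-position column
... and because the rows have different lengths, this value does not land in the same column for each row
Fix: insert two commas (two empty fields, for the unknown x and y) before the last number of each row, so that the value lands in the matching z column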



In [84]:

    
import re
from io import StringIO



In [93]:

    
# Pull the whole test file into memory as one string so it can be edited
# with a regular expression before parsing
with open('test_in.csv') as raw_file:
    data_str = raw_file.read()



In [94]:

    
# Insert two empty fields (",,") in front of the final number of every row so
# that the value is shifted two columns to the right when parsed.
# The number is anchored with a multiline `$` instead of a literal "\n" so
# that the last row is also padded when the file has no trailing newline.
data_str_io = StringIO(
    re.sub(r"([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?)$", r",,\1", data_str,
           flags=re.MULTILINE)
)



In [95]:

    
# Parse the comma-padded text, labelling columns with the training header.
# NOTE: reading presumably leaves the StringIO buffer at its end, so rebuild
# data_str_io before re-running this cell on its own.
X_test = pd.read_csv(data_str_io, names=X_train.columns)



In [96]:

    
# Preview the first rows of the re-parsed test frame
X_test.head()









    Out[96]:







  
    
      
      x
      y
      z
      px
      py
      pz
      x1
      y1
      z1
      px1
      ...
      z23
      px23
      py23
      pz23
      x24
      y24
      z24
      px24
      py24
      pz24
    
  
  
    
      0
      0.877
      1.322
      65.0
      -0.244
      -0.053
      2.414
      -10.669
      0.330
      176.944
      -0.254
      ...
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
    
    
      1
      0.786
      -2.483
      65.0
      0.103
      0.432
      2.593
      7.366
      15.502
      176.944
      0.206
      ...
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
    
    
      2
      -13.134
      -26.531
      65.0
      0.064
      -0.021
      0.953
      -7.586
      -30.687
      176.944
      0.027
      ...
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
    
    
      3
      18.454
      2.805
      65.0
      -0.019
      0.069
      1.833
      18.043
      6.797
      176.944
      0.013
      ...
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
    
    
      4
      15.552
      -19.196
      65.0
      -0.010
      -0.011
      2.366
      15.068
      -19.750
      176.944
      -0.014
      ...
      341.28
      -0.014
      -0.002
      2.351
      NaN
      NaN
      343.405
      NaN
      NaN
      NaN
    
  

5 rows × 150 columns

Wrap this fix in a reusable function so the test data can be loaded the same way later



In [97]:

    
import re
from io import StringIO

def load_test_data(filename, columns=None):
    """Load a test CSV, shifting each row's final value two columns right.

    Two empty fields are inserted before the last number of every row
    (the notebook's fix for the trailing z-value), then the text is parsed
    with pandas.

    Parameters
    ----------
    filename : str
        Path of the test CSV file (no header row).
    columns : sequence of str, optional
        Column names for the resulting frame. Defaults to the columns of
        the notebook-level ``X_train`` frame, which must then exist.

    Returns
    -------
    pandas.DataFrame
        The parsed test data; fields that are absent in a row are NaN.
    """
    if columns is None:
        # Backward-compatible default: reuse the training header
        columns = X_train.columns

    with open(filename, 'r') as f:
        data_str = f.read()

    # Anchor on a multiline `$` rather than a literal "\n" so the last row
    # is padded even when the file does not end with a newline.
    padded_str = re.sub(
        r"([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?)$",
        r",,\1",
        data_str,
        flags=re.MULTILINE,
    )

    return pd.read_csv(StringIO(padded_str), names=columns)



In [ ]:

	x	y	z	px	py	pz	x1	y1	z1	px1	py1	pz1
0	0.877079	1.32218	65	-0.243980	-0.053204	2.414260	-10.66900	0.330138	176.944	-0.253523	0.015487	2.408040
1	0.786361	-2.48294	65	0.103229	0.432216	2.592910	7.36649	15.502000	176.944	0.205638	0.393629	2.581810
2	-13.133900	-26.53090	65	0.064432	-0.020771	0.952952	-7.58617	-30.686700	176.944	0.026643	-0.051061	0.948479
3	18.454200	2.80469	65	-0.019384	0.069384	1.832590	18.04330	6.797470	176.944	0.013039	0.062029	1.824850
4	15.552100	-19.19600	65	-0.009768	-0.010642	2.366080	15.06810	-19.750200	176.944	-0.014308	-0.015936	2.351700

	x	y	z	x15	y15	z15	x23	y23	z23
0	0.877	1.322	65.0	298.354	NaN	NaN	NaN	NaN	NaN
1	0.786	-2.483	65.0	298.354	NaN	NaN	NaN	NaN	NaN
2	-13.134	-26.531	65.0	-7.435	-37.461	298.354	NaN	NaN	NaN
3	18.454	2.805	65.0	19.788	10.501	298.354	NaN	NaN	NaN
4	15.552	-19.196	65.0	14.023	-20.481	298.354	13.669	-20.622	341.28