# JLab ML Lunch 2 - Data Exploration

• Second ML challenge hosted
• On October 30th, a test dataset will be released, and predictions must be submitted within 24 hours
• Let's take a look at the training data!
``````

In [1]:

# Use the interactive widget (ipympl) backend so figures can be rotated in place
%matplotlib widget

``````
``````

In [2]:

from pathlib import Path

import imageio
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from mpl_toolkits.mplot3d import axes3d

``````

## Training Data

``````

In [3]:

# The frame has 150 columns, so preview only the six features
# of the first two steps (no suffix, then suffix "1").
preview_columns = [
    f"{feature}{step}"
    for step in ("", "1")
    for feature in ("x", "y", "z", "px", "py", "pz")
]
X_train[preview_columns].head()

``````
``````

Out[3]:

x
y
z
px
py
pz
x1
y1
z1
px1
py1
pz1

0
0.877079
1.32218
65
-0.243980
-0.053204
2.414260
-10.66900
0.330138
176.944
-0.253523
0.015487
2.408040

1
0.786361
-2.48294
65
0.103229
0.432216
2.592910
7.36649
15.502000
176.944
0.205638
0.393629
2.581810

2
-13.133900
-26.53090
65
0.064432
-0.020771
0.952952
-7.58617
-30.686700
176.944
0.026643
-0.051061
0.948479

3
18.454200
2.80469
65
-0.019384
0.069384
1.832590
18.04330
6.797470
176.944
0.013039
0.062029
1.824850

4
15.552100
-19.19600
65
-0.009768
-0.010642
2.366080
15.06810
-19.750200
176.944
-0.014308
-0.015936
2.351700

``````
``````

In [4]:

def plot_quiver_track(df, track_id, elev=None,
                      azim=None, dist=None, n_steps=25):
    """Draw one track as a 3D quiver plot of its momentum vectors.

    The row ``df.loc[track_id]`` is read as ``n_steps`` consecutive groups
    of six values ordered (x, y, z, px, py, pz). An arrow is drawn at each
    (x, y, z) position pointing along (px, py, pz). The plot puts z on the
    first axis, x on the second and y on the third.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose rows hold the flattened per-step features.
    track_id : label
        Index label of the row to plot.
    elev, azim : float, optional
        Viewing angles of the 3D axes; left at the defaults when None.
    dist : float, optional
        Camera distance of the 3D axes; left at the default when None.
    n_steps : int, optional
        Number of six-value groups to read from the row (default 25).

    Returns
    -------
    (fig, ax) : the created figure and its 3D axes.

    Raises
    ------
    ValueError
        If the row holds fewer than ``6 * n_steps`` values.
    """
    n_features = 6  # x, y, z, px, py, pz

    # Extract the track row and fold it into one (step, feature) table,
    # then take one array per feature.
    track = np.asarray(df.loc[track_id].values[:n_features * n_steps],
                       dtype=float)
    x, y, z, px, py, pz = track.reshape(n_steps, n_features).T

    # Create our 3D figure. add_subplot replaces fig.gca(projection=...),
    # which recent matplotlib releases no longer accept.
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    for axis in (ax.xaxis, ax.yaxis, ax.zaxis):
        axis.set_pane_color((1, 1, 1, 1))

    # Set the three 3D plot viewing attributes
    if elev is not None or azim is not None:
        ax.view_init(elev=elev, azim=azim)
    if dist is not None:
        # NOTE(review): newer matplotlib deprecates Axes3D.dist in favour of
        # set_box_aspect(zoom=...) -- confirm against the installed version.
        ax.dist = dist

    # Create our quiver plot (arrows are not yet coloured by |p|)
    ax.quiver(z, x, y, pz, px, py, length=14)

    # Labels for clarity
    ax.set_title("Track {}".format(track_id))
    ax.set_xlabel("z", fontweight="bold")
    ax.set_ylabel("x", fontweight="bold")
    ax.set_zlabel("y", fontweight="bold")
    fig.tight_layout()

    return fig, ax

``````
``````

In [5]:

# Plot the row with index label 2 and keep the handles for the animation cells below
fig, ax = plot_quiver_track(df=X_train, track_id=2)
fig.show()

``````
``````

{"version_major": 2, "model_id": "27659a7a150347338267e73f82c154da", "version_minor": 0}

``````
``````

In [6]:

gif_filename = "track-2-anim"

# Create the frame directory first; saving fails if it does not exist
Path('images', gif_filename).mkdir(parents=True, exist_ok=True)

# Starting camera position for the animation
ax.elev = 50.
ax.azim = 90.
ax.dist = 9.

# Save 100 frames, tilting and rotating the camera a little on each one
img_files = []
for n in range(0, 100):
    ax.elev = ax.elev-0.4
    ax.azim = ax.azim+1.5
    filename = f'images/{gif_filename}/img{str(n).zfill(3)}.png'
    img_files.append(filename)
    # Save through the figure handle rather than pyplot's "current figure"
    fig.savefig(filename, bbox_inches='tight')

``````
``````

In [7]:

# Load every saved frame, then write them all out once as a single animated GIF
images = [imageio.imread(frame_path) for frame_path in img_files]
imageio.mimsave('images/track-2.gif', images)

``````

## Now read in the example test data

``````

In [8]:

# Preview the positions at three steps: no suffix, "15" and "23"
position_columns = [
    f"{axis}{step}"
    for step in ("", "15", "23")
    for axis in ("x", "y", "z")
]
X_test[position_columns].head()

``````
``````

Out[8]:

x
y
z
x15
y15
z15
x23
y23
z23

0
0.877
1.322
65.0
298.354
NaN
NaN
NaN
NaN
NaN

1
0.786
-2.483
65.0
298.354
NaN
NaN
NaN
NaN
NaN

2
-13.134
-26.531
65.0
-7.435
-37.461
298.354
NaN
NaN
NaN

3
18.454
2.805
65.0
19.788
10.501
298.354
NaN
NaN
NaN

4
15.552
-19.196
65.0
14.023
-20.481
298.354
13.669
-20.622
341.28

``````
``````

In [9]:

import missingno as mno

``````
``````

{"version_major": 2, "model_id": "d7bde131472143d3b9f199425df16441", "version_minor": 0}

``````

## One caveat on the test data

• The last value of each row is actually the z-value of the next step to be predicted, not the x-position
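• ... but this value does not sit in the same column for each row, because rows contain different numbers of recorded steps
• Fix: insert two commas before the last number of each row, so that it skips the x and y slots and lands in the z column of the step to be predicted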
``````

In [84]:

import re
from io import StringIO

``````
``````

In [93]:

# Read the raw file as one string so each row can be patched before parsing
with open('test_in.csv', 'r') as f:
    data_str = f.read()

``````
``````

In [94]:

# Match the final number on each line (optional sign, decimals, exponent)
# together with its newline, and put two commas in front of it.
last_number_on_line = r"([-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?\n)"
patched_text = re.sub(last_number_on_line, r",,\1", data_str)

# Wrap the patched text in a file-like object
data_str_io = StringIO(patched_text)

``````
``````

In [95]:

``````
``````

In [96]:

``````
``````

Out[96]:

x
y
z
px
py
pz
x1
y1
z1
px1
...
z23
px23
py23
pz23
x24
y24
z24
px24
py24
pz24

0
0.877
1.322
65.0
-0.244
-0.053
2.414
-10.669
0.330
176.944
-0.254
...
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN

1
0.786
-2.483
65.0
0.103
0.432
2.593
7.366
15.502
176.944
0.206
...
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN

2
-13.134
-26.531
65.0
0.064
-0.021
0.953
-7.586
-30.687
176.944
0.027
...
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN

3
18.454
2.805
65.0
-0.019
0.069
1.833
18.043
6.797
176.944
0.013
...
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN

4
15.552
-19.196
65.0
-0.010
-0.011
2.366
15.068
-19.750
176.944
-0.014
...
341.28
-0.014
-0.002
2.351
NaN
NaN
343.405
NaN
NaN
NaN

5 rows × 150 columns

``````

## This should be saved for later usage

``````

In [97]:

import re
from io import StringIO

def load_test_data(filename, n_steps=25):
    """Read a test CSV and return it as a DataFrame with aligned columns.

    The last number of every row is shifted two columns to the right by
    inserting two commas in front of it, so that it lands in a z column
    instead of an x column. Rows shorter than the full width are padded
    with NaN.

    Parameters
    ----------
    filename : str or path-like
        Path of the CSV file to read.
    n_steps : int, optional
        Number of steps per row; each step contributes the six columns
        x, y, z, px, py, pz (default 25, i.e. 150 columns).

    Returns
    -------
    pandas.DataFrame
        One row per line of the file, with columns named ``x, y, z, px,
        py, pz, x1, ..., pz<n_steps - 1>``.
    """
    with open(filename, 'r') as f:
        data_str = f.read()

    # The pattern below only matches a number followed by a newline, so make
    # sure the final row is terminated too.
    if data_str and not data_str.endswith("\n"):
        data_str += "\n"

    data_str_io = StringIO(
        re.sub(r"([-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?\n)", r",,\1", data_str)
    )

    # Step 0 has no suffix; later steps are suffixed with their number.
    suffixes = [""] + [str(step) for step in range(1, n_steps)]
    columns = [
        f"{feature}{suffix}"
        for suffix in suffixes
        for feature in ("x", "y", "z", "px", "py", "pz")
    ]

    # NOTE(review): assumes the file has no header row -- confirm against the
    # real test file. Explicit names also let rows of different lengths parse.
    X_test = pd.read_csv(data_str_io, header=None, names=columns)

    return X_test

``````
``````

In [ ]:

``````