0. Setup


In [1]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [285]:
from sklearn import preprocessing, svm
from sklearn.linear_model import LinearRegression, LogisticRegression

In [3]:
PATH = Path('data/datasets/paresh2047/uci-semcom')

In [4]:
df = pd.read_csv(PATH/'uci-secom.csv')

1. EDA


In [5]:
df.head() # -1 pass; +1 fail


Out[5]:
Time 0 1 2 3 4 5 6 7 8 ... 581 582 583 584 585 586 587 588 589 Pass/Fail
0 2008-07-19 11:55:00 3030.93 2564.00 2187.7333 1411.1265 1.3602 100.0 97.6133 0.1242 1.5005 ... NaN 0.5005 0.0118 0.0035 2.3630 NaN NaN NaN NaN -1
1 2008-07-19 12:32:00 3095.78 2465.14 2230.4222 1463.6606 0.8294 100.0 102.3433 0.1247 1.4966 ... 208.2045 0.5019 0.0223 0.0055 4.4447 0.0096 0.0201 0.0060 208.2045 -1
2 2008-07-19 13:17:00 2932.61 2559.94 2186.4111 1698.0172 1.5102 100.0 95.4878 0.1241 1.4436 ... 82.8602 0.4958 0.0157 0.0039 3.1745 0.0584 0.0484 0.0148 82.8602 1
3 2008-07-19 14:43:00 2988.72 2479.90 2199.0333 909.7926 1.3204 100.0 104.2367 0.1217 1.4882 ... 73.8432 0.4990 0.0103 0.0025 2.0544 0.0202 0.0149 0.0044 73.8432 -1
4 2008-07-19 15:22:00 3032.24 2502.87 2233.3667 1326.5200 1.5334 100.0 100.3967 0.1235 1.5031 ... NaN 0.4800 0.4766 0.1045 99.3032 0.0202 0.0149 0.0044 73.8432 -1

5 rows × 592 columns


In [6]:
df


Out[6]:
Time 0 1 2 3 4 5 6 7 8 ... 581 582 583 584 585 586 587 588 589 Pass/Fail
0 2008-07-19 11:55:00 3030.93 2564.00 2187.7333 1411.1265 1.3602 100.0 97.6133 0.1242 1.5005 ... NaN 0.5005 0.0118 0.0035 2.3630 NaN NaN NaN NaN -1
1 2008-07-19 12:32:00 3095.78 2465.14 2230.4222 1463.6606 0.8294 100.0 102.3433 0.1247 1.4966 ... 208.2045 0.5019 0.0223 0.0055 4.4447 0.0096 0.0201 0.0060 208.2045 -1
2 2008-07-19 13:17:00 2932.61 2559.94 2186.4111 1698.0172 1.5102 100.0 95.4878 0.1241 1.4436 ... 82.8602 0.4958 0.0157 0.0039 3.1745 0.0584 0.0484 0.0148 82.8602 1
3 2008-07-19 14:43:00 2988.72 2479.90 2199.0333 909.7926 1.3204 100.0 104.2367 0.1217 1.4882 ... 73.8432 0.4990 0.0103 0.0025 2.0544 0.0202 0.0149 0.0044 73.8432 -1
4 2008-07-19 15:22:00 3032.24 2502.87 2233.3667 1326.5200 1.5334 100.0 100.3967 0.1235 1.5031 ... NaN 0.4800 0.4766 0.1045 99.3032 0.0202 0.0149 0.0044 73.8432 -1
5 2008-07-19 17:53:00 2946.25 2432.84 2233.3667 1326.5200 1.5334 100.0 100.3967 0.1235 1.5287 ... 44.0077 0.4949 0.0189 0.0044 3.8276 0.0342 0.0151 0.0052 44.0077 -1
6 2008-07-19 19:44:00 3030.27 2430.12 2230.4222 1463.6606 0.8294 100.0 102.3433 0.1247 1.5816 ... NaN 0.5010 0.0143 0.0042 2.8515 0.0342 0.0151 0.0052 44.0077 -1
7 2008-07-19 19:45:00 3058.88 2690.15 2248.9000 1004.4692 0.7884 100.0 106.2400 0.1185 1.5153 ... 95.0310 0.4984 0.0106 0.0034 2.1261 0.0204 0.0194 0.0063 95.0310 -1
8 2008-07-19 20:24:00 2967.68 2600.47 2248.9000 1004.4692 0.7884 100.0 106.2400 0.1185 1.5358 ... 111.6525 0.4993 0.0172 0.0046 3.4456 0.0111 0.0124 0.0045 111.6525 -1
9 2008-07-19 21:35:00 3016.11 2428.37 2248.9000 1004.4692 0.7884 100.0 106.2400 0.1185 1.5381 ... 90.2294 0.4967 0.0152 0.0038 3.0687 0.0212 0.0191 0.0073 90.2294 -1
10 2008-07-19 21:57:00 2994.05 2548.21 2195.1222 1046.1468 1.3204 100.0 103.3400 0.1223 1.5144 ... 57.8122 0.4925 0.0158 0.0041 3.2115 0.0355 0.0205 0.0071 57.8122 1
11 2008-07-19 22:52:00 2928.84 2479.40 2196.2111 1605.7578 0.9959 100.0 97.9156 0.1257 1.4690 ... 75.5077 0.4987 0.0427 0.0092 8.5646 0.0370 0.0279 0.0081 75.5077 1
12 2008-07-20 03:35:00 2920.07 2507.40 2195.1222 1046.1468 1.3204 100.0 103.3400 0.1223 1.5310 ... 52.2039 0.4950 0.0153 0.0041 3.0926 0.0188 0.0098 0.0034 52.2039 -1
13 2008-07-21 08:21:00 3051.44 2529.27 2184.4333 877.6266 1.4668 100.0 107.8711 0.1240 1.5236 ... NaN 0.5034 0.0151 0.0038 3.0063 0.0188 0.0098 0.0034 52.2039 -1
14 2008-07-21 11:53:00 2963.97 2629.48 2224.6222 947.7739 1.2924 100.0 104.8489 0.1197 1.4474 ... 142.9080 0.5077 0.0094 0.0026 1.8483 0.0202 0.0289 0.0084 142.9080 1
15 2008-07-22 00:03:00 2988.31 2546.26 2224.6222 947.7739 1.2924 100.0 104.8489 0.1197 1.5465 ... 100.2745 0.5058 0.0078 0.0021 1.5352 0.0174 0.0174 0.0045 100.2745 -1
16 2008-07-22 02:59:00 3028.02 2560.87 2270.2556 1258.4558 1.3950 100.0 104.8078 0.1207 1.4368 ... 82.0989 0.5005 0.0108 0.0034 2.1574 0.0184 0.0151 0.0042 82.0989 -1
17 2008-07-22 08:41:00 3032.73 2517.79 2270.2556 1258.4558 1.3950 100.0 104.8078 0.1207 1.5537 ... NaN 0.5015 0.0105 0.0027 2.0979 0.0184 0.0151 0.0042 82.0989 -1
18 2008-07-22 11:47:00 3040.34 2501.16 2207.3889 962.5317 1.2043 100.0 104.0311 0.1210 1.5481 ... NaN 0.4948 0.0117 0.0034 2.3737 0.0184 0.0151 0.0042 82.0989 -1
19 2008-07-22 14:00:00 2988.30 2519.05 2208.8556 1157.7224 1.5509 100.0 107.8022 0.1233 1.5362 ... 47.1586 0.5036 0.0169 0.0039 3.3514 0.0229 0.0108 0.0032 47.1586 -1
20 2008-07-22 15:30:00 2987.32 2528.81 NaN NaN NaN NaN NaN 0.1195 1.6343 ... NaN 0.5011 0.0117 0.0033 2.3308 0.0229 0.0108 0.0032 47.1586 -1
21 2008-07-23 05:15:00 NaN 2481.85 2207.3889 962.5317 1.2043 100.0 104.0311 0.1210 1.5559 ... 34.4153 0.4947 0.0137 0.0041 2.7729 0.0175 0.0060 0.0023 34.4153 -1
22 2008-07-23 19:22:00 3002.27 2497.45 2207.3889 962.5317 1.2043 100.0 104.0311 0.1210 1.5465 ... 114.5979 0.4977 0.0114 0.0037 2.2849 0.0250 0.0286 0.0075 114.5979 -1
23 2008-07-25 15:23:00 2884.74 2514.54 2160.3667 899.9488 1.4022 100.0 105.4978 0.1240 1.5585 ... 216.8869 0.4982 0.0099 0.0027 1.9771 0.0098 0.0213 0.0077 216.8869 1
24 2008-07-27 04:18:00 3010.41 2632.80 2203.9000 1116.4129 1.2639 100.0 102.2733 0.1199 1.4227 ... 125.0600 0.5032 0.0159 0.0039 3.1576 0.0288 0.0361 0.0101 125.0600 -1
25 2008-07-27 09:37:00 2979.74 2446.56 2257.1667 1437.9565 1.4918 100.0 106.3400 0.1203 1.5136 ... NaN 0.5012 0.0336 0.0072 6.7053 0.0288 0.0361 0.0101 125.0600 -1
26 2008-07-27 11:10:00 3067.35 2456.33 2257.1667 1437.9565 1.4918 100.0 106.3400 0.1203 1.4860 ... NaN 0.5006 0.0083 0.0022 1.6593 0.0288 0.0361 0.0101 125.0600 -1
27 2008-07-27 15:46:00 2988.99 2607.63 2223.0333 1533.9934 1.3548 100.0 109.7067 0.1211 1.5582 ... 216.9552 0.5069 0.0158 0.0040 3.1232 0.0183 0.0397 0.0116 216.9552 -1
28 2008-07-27 16:06:00 2972.78 2431.57 2190.4889 1059.4390 0.8614 100.0 102.1178 0.1216 1.5438 ... 127.5067 0.5036 0.0137 0.0036 2.7205 0.0130 0.0165 0.0053 127.5067 -1
29 2008-07-27 16:49:00 2981.85 2529.11 2180.3778 1208.7411 1.2998 100.0 100.2789 0.1209 1.4200 ... 146.8715 0.5019 0.0139 0.0034 2.7747 0.0121 0.0178 0.0062 146.8715 -1
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1537 2008-10-15 18:16:00 3006.22 2525.20 2192.7889 1268.5852 1.9935 100.0 104.5867 0.1268 1.4522 ... NaN 0.4942 0.0175 0.0045 3.5322 0.0199 0.0097 0.0037 48.7045 -1
1538 2008-10-15 19:15:00 3128.11 2367.16 2223.5333 1352.1869 0.8714 100.0 97.2189 0.1196 1.3907 ... 47.3376 0.4977 0.0144 0.0037 2.9002 0.0237 0.0112 0.0038 47.3376 -1
1539 2008-10-15 19:24:00 2908.94 2560.99 2187.3444 2882.8558 1.5876 100.0 85.4189 0.1235 1.4167 ... NaN 0.4987 0.0118 0.0028 2.3583 0.0237 0.0112 0.0038 47.3376 -1
1540 2008-10-15 21:44:00 2996.04 2555.92 2190.7666 3530.2362 0.8017 100.0 83.8767 0.1249 1.4158 ... 67.6676 0.5011 0.0163 0.0035 3.2608 0.0181 0.0123 0.0046 67.6676 -1
1541 2008-10-15 22:45:00 3246.31 2499.79 2216.8111 1190.4067 2.5148 100.0 114.5533 0.1230 1.3966 ... 23.5979 0.5021 0.0103 0.0030 2.0418 0.0266 0.0063 0.0019 23.5979 -1
1542 2008-10-15 22:54:00 2965.57 2487.91 2210.3556 910.7177 1.6941 100.0 119.8822 0.1268 1.3109 ... 121.9426 0.4992 0.0136 0.0037 2.7204 0.0216 0.0263 0.0087 121.9426 -1
1543 2008-10-15 23:00:00 3109.18 2447.97 2210.3556 910.7177 1.6941 100.0 119.8822 0.1268 1.3502 ... 50.8827 0.4975 0.0109 0.0038 2.1905 0.0273 0.0139 0.0055 50.8827 -1
1544 2008-10-15 23:45:00 3108.98 2537.73 2210.3556 910.7177 1.6941 100.0 119.8822 0.1268 1.2901 ... 72.9676 0.4986 0.0192 0.0046 3.8442 0.0172 0.0126 0.0039 72.9676 -1
1545 2008-10-16 02:16:00 3100.19 2490.60 2212.8445 1068.5644 1.7835 100.0 113.8833 0.1249 1.4294 ... 402.6874 0.4951 0.0165 0.0051 3.3383 0.0063 0.0252 0.0070 402.6874 -1
1546 2008-10-16 02:16:00 3093.24 2488.18 2212.8445 1068.5644 1.7835 100.0 113.8833 0.1249 1.3482 ... 105.7142 0.5022 0.0249 0.0049 4.9623 0.0134 0.0142 0.0047 105.7142 -1
1547 2008-10-16 02:17:00 3008.77 2542.36 2167.4222 2837.8788 1.4892 100.0 83.8222 0.1255 1.2895 ... 36.2975 0.5012 0.0160 0.0044 3.2008 0.0222 0.0080 0.0022 36.2975 -1
1548 2008-10-16 02:22:00 3027.01 2464.98 2212.6334 1081.5662 1.0096 100.0 113.4278 0.1253 1.4410 ... 88.5812 0.4944 0.0171 0.0040 3.4656 0.0253 0.0224 0.0071 88.5812 -1
1549 2008-10-16 02:55:00 3183.63 2498.00 2195.4444 2914.1792 1.5978 100.0 85.1011 0.1235 1.4129 ... NaN 0.5037 0.0117 0.0030 2.3203 0.0253 0.0224 0.0071 88.5812 -1
1550 2008-10-16 03:56:00 3072.20 2406.47 2195.4444 2914.1792 1.5978 100.0 85.1011 0.1235 1.3148 ... 27.5514 0.5034 0.0178 0.0043 3.5459 0.0236 0.0065 0.0022 27.5514 -1
1551 2008-10-16 04:02:00 2958.43 2489.06 2192.7556 867.3027 1.7393 100.0 123.4244 0.1251 1.4386 ... 30.7574 0.4972 0.0157 0.0040 3.1578 0.0400 0.0123 0.0044 30.7574 -1
1552 2008-10-16 04:02:00 2939.35 2521.98 2195.1000 1526.4440 0.8279 100.0 96.3100 0.1203 1.4366 ... 247.6285 0.5031 0.0111 0.0028 2.2144 0.0085 0.0212 0.0074 247.6285 -1
1553 2008-10-16 04:04:00 3020.79 2500.19 2210.3556 910.7177 1.6941 100.0 119.8822 0.1268 1.3405 ... 134.3983 0.4985 0.0126 0.0036 2.5295 0.0174 0.0234 0.0077 134.3983 -1
1554 2008-10-16 04:47:00 3031.78 2528.55 2182.5555 1261.0898 1.2110 100.0 112.2922 0.1252 1.3485 ... 117.0945 0.5025 0.0138 0.0039 2.7512 0.0235 0.0275 0.0098 117.0945 -1
1555 2008-10-16 04:50:00 2902.96 2515.03 2181.1889 1338.8895 2.1195 100.0 108.1400 0.1263 1.3255 ... 184.8703 0.5002 0.0130 0.0035 2.5982 0.0086 0.0160 0.0051 184.8703 -1
1556 2008-10-16 04:54:00 3025.21 2503.30 2179.7333 3085.3781 1.4843 100.0 82.2467 0.1248 1.3687 ... 46.1076 0.5019 0.0158 0.0043 3.1428 0.0120 0.0055 0.0016 46.1076 -1
1557 2008-10-16 05:08:00 3072.10 2534.87 2177.4333 2945.8855 1.3321 100.0 83.1700 0.1253 1.4359 ... 26.7330 0.5010 0.0132 0.0035 2.6249 0.0277 0.0074 0.0023 26.7330 -1
1558 2008-10-16 05:13:00 3012.30 2466.84 2217.4111 1032.2836 1.4802 100.0 101.3511 0.1195 1.3832 ... 176.6783 0.4993 0.0130 0.0037 2.5976 0.0097 0.0172 0.0054 176.6783 -1
1559 2008-10-16 05:44:00 3076.33 2456.13 2217.4111 1032.2836 1.4802 100.0 101.3511 0.1195 1.3120 ... 117.4564 0.4960 0.0157 0.0036 3.1743 0.0150 0.0176 0.0057 117.4564 -1
1560 2008-10-16 05:58:00 2770.40 2549.42 2204.2889 2637.9989 1.5549 100.0 86.1089 0.1234 1.2811 ... 127.3154 0.5009 0.0155 0.0036 3.0997 0.0105 0.0133 0.0042 127.3154 -1
1561 2008-10-16 15:02:00 2951.14 2326.59 2212.6334 1081.5662 1.0096 100.0 113.4278 0.1253 1.4492 ... 46.4573 0.4965 0.0118 0.0032 2.3817 0.0320 0.0148 0.0051 46.4573 -1
1562 2008-10-16 15:13:00 2899.41 2464.36 2179.7333 3085.3781 1.4843 100.0 82.2467 0.1248 1.3424 ... 203.1720 0.4988 0.0143 0.0039 2.8669 0.0068 0.0138 0.0047 203.1720 -1
1563 2008-10-16 20:49:00 3052.31 2522.55 2198.5667 1124.6595 0.8763 100.0 98.4689 0.1205 1.4333 ... NaN 0.4975 0.0131 0.0036 2.6238 0.0068 0.0138 0.0047 203.1720 -1
1564 2008-10-17 05:26:00 2978.81 2379.78 2206.3000 1110.4967 0.8236 100.0 99.4122 0.1208 NaN ... 43.5231 0.4987 0.0153 0.0041 3.0590 0.0197 0.0086 0.0025 43.5231 -1
1565 2008-10-17 06:01:00 2894.92 2532.01 2177.0333 1183.7287 1.5726 100.0 98.7978 0.1213 1.4622 ... 93.4941 0.5004 0.0178 0.0038 3.5662 0.0262 0.0245 0.0075 93.4941 -1
1566 2008-10-17 06:07:00 2944.92 2450.76 2195.4444 2914.1792 1.5978 100.0 85.1011 0.1235 NaN ... 137.7844 0.4987 0.0181 0.0040 3.6275 0.0117 0.0162 0.0045 137.7844 -1

1567 rows × 592 columns


In [21]:
df.values.shape


Out[21]:
(1567, 592)

In [22]:
col = df.columns[-1]
col


Out[22]:
'Pass/Fail'

In [23]:
passes = df.loc[df[col]==-1]
fails  = df.loc[df[col]== 1]

In [24]:
plt.style.use('seaborn')

In [25]:
def plot_row(df, rows=0, show_nans=False, figsize=None, alpha=1.):
    """Plot one or more rows of sensor readings; optionally mark NaN positions."""
    if figsize is not None:
        plt.figure(figsize=figsize)
    if isinstance(rows, int):
        rows = [rows]
    for row in rows:
        vals = df.values[row][1:].astype(np.float64)  # drop 'Time', cast to float
        if show_nans:
            nans = np.where(pd.isnull(vals))[0]
            ymax, ymin = np.nanmax(vals)/5, -np.nanmax(vals)/5  # NaN-safe limits
            plt.vlines(nans, ymin=ymin, ymax=ymax, linewidth=.5, color='firebrick')
        plt.plot(range(len(vals)), vals, alpha=alpha)
    
plot_row(df, figsize=(12,8), show_nans=True)


50 random signals:


In [173]:
plot_row(df, np.random.randint(len(df), size=50), figsize=(12,8), alpha=0.1)


All failures (104)


In [26]:
plot_row(fails, rows=range(len(fails)), figsize=(12,8), alpha=0.1)


100 random passes


In [30]:
plot_row(passes, rows=np.random.randint(len(passes), size=100), figsize=(12,8), alpha=0.1)


Eyeballing the raw signals isn't going to work.

2. Data split

train / val : 80 / 20


In [44]:
def train_val_idxs(data, p=0.2):
    idxs = np.random.permutation(len(data))
    n_val = int(len(data)*p)
    return idxs[n_val:], idxs[:n_val]

In [160]:
train_idxs, val_idxs = train_val_idxs(df)

In [197]:
df.columns


Out[197]:
Index(['Time', '0', '1', '2', '3', '4', '5', '6', '7', '8',
       ...
       '581', '582', '583', '584', '585', '586', '587', '588', '589',
       'Pass/Fail'],
      dtype='object', length=592)

In [537]:
train = df.iloc[train_idxs]
valid = df.iloc[val_idxs]

# remove the first 'timestamp' column
train = train.drop(columns=['Time'])
valid = valid.drop(columns=['Time'])

In [538]:
len(train), len(valid)


Out[538]:
(1254, 313)

Since there are only 104 failing examples against 1463 passing ones, I want to ensure the train and validation splits preserve a similar fail ratio.


In [539]:
pos, neg =  len(passes), len(fails)
pos, neg, neg/pos


Out[539]:
(1463, 104, 0.0710868079289132)

In [540]:
pos, neg = len(valid.loc[valid[col]==-1]), len(valid.loc[valid[col]== 1])
pos, neg, neg/pos


Out[540]:
(291, 22, 0.07560137457044673)

In [541]:
pos, neg = len(train.loc[train[col]==-1]), len(train.loc[train[col]== 1])
pos, neg, neg/pos


Out[541]:
(1172, 82, 0.06996587030716724)
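
The ratios come out close here by luck of the permutation; a stratified split would guarantee it. A minimal sketch using sklearn's train_test_split (an alternative to the permutation split above, not what I ran; train_s / valid_s are hypothetical names):

from sklearn.model_selection import train_test_split

train_s, valid_s = train_test_split(df, test_size=0.2, stratify=df[col], random_state=42)
len(train_s), len(valid_s)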

I could resample the minority (fail) class to artificially balance the dataset, although I won't attempt to generate synthetic examples here.
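
For reference, simple oversampling with replacement could look like this (train_bal is a hypothetical name; this isn't applied below):

fail_rows = train.loc[train[col] == 1]                              # minority class
n_extra   = len(train.loc[train[col] == -1]) - len(fail_rows)       # gap to close
extra     = fail_rows.sample(n=n_extra, replace=True, random_state=42)
train_bal = pd.concat([train, extra])                               # balanced copy
train_bal[col].value_counts()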

2.1 Data preprocessing


In [542]:
# replacing NaNs with the mean of each row
for rdx in range(len(train)):
    train.iloc[rdx] = train.iloc[rdx].fillna(train.iloc[rdx].mean())
for rdx in range(len(valid)):
    valid.iloc[rdx] = valid.iloc[rdx].fillna(valid.iloc[rdx].mean())
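
The same row-wise imputation can be done without the Python loop; a sketch of an equivalent vectorized version, shown only as a reference (fillna with a Series matches on column labels, so the frame is transposed first):

# fill each row's NaNs with that row's mean, same as the loops above
train = train.T.fillna(train.mean(axis=1)).T
valid = valid.T.fillna(valid.mean(axis=1)).T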

Separate data into inputs and labels:


In [543]:
x_train = train.drop(columns=[col]).values
y_train = train[col].values

x_valid = valid.drop(columns=[col]).values
y_valid = valid[col].values

Preprocessing: center to zero mean and scale to unit variance


In [544]:
x_train = preprocessing.scale(x_train)
x_valid = preprocessing.scale(x_valid)
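
Note that preprocessing.scale standardizes each array with its own statistics, so the validation set is scaled using validation means and variances. An alternative sketch that fits the scaler on the training set only and reuses those statistics for validation (not what was run above):

# fit scaling parameters on train, apply the same transform to valid
scaler = preprocessing.StandardScaler().fit(x_train)
x_train = scaler.transform(x_train)
x_valid = scaler.transform(x_valid)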

3. Linear Models 1: Linear Regression

Classifier:


In [277]:
clsfr = LinearRegression()
clsfr.fit(x_train, y_train)
# clsfr.fit(x_valid, y_valid)


Out[277]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [279]:
clsfr.score(x_train, y_train)


Out[279]:
0.5224496775369216

In [278]:
clsfr.score(x_valid, y_valid)


Out[278]:
-81554330.9058108

An R² score (the default scoring metric for LinearRegression) is 1 for a perfect fit and 0 for a model that always predicts the mean of the targets; anything below zero is worse than simply predicting that mean.
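
For reference, R² can be computed directly from the predictions (a minimal sketch, assuming clsfr is the fitted LinearRegression above):

preds  = clsfr.predict(x_valid)
ss_res = np.sum((y_valid - preds) ** 2)
ss_tot = np.sum((y_valid - y_valid.mean()) ** 2)
1 - ss_res / ss_tot   # same value as clsfr.score(x_valid, y_valid)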

I suspect I was misusing this model: linear regression isn't a classifier, and fitting a single hyperplane to 590 noisy features was never likely to generalize.

4. Linear Models 2: Logistic Regression


In [280]:
clsfr = LogisticRegression()
clsfr.fit(x_train, y_train)


Out[280]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [281]:
clsfr.score(x_train, y_train)


Out[281]:
0.9681020733652312

In [282]:
clsfr.score(x_valid, y_valid)


Out[282]:
0.8690095846645367

This gives results more in line with expectations. Keep in mind, though, that roughly 93% of samples pass, so a model that always predicts "pass" would already score about 0.93 on accuracy; accuracy alone is a generous metric here.
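
Given that, it's worth checking how many actual failures the model catches (a minimal sketch, assuming clsfr is still the fitted LogisticRegression):

from sklearn.metrics import confusion_matrix, classification_report

preds = clsfr.predict(x_valid)
print(confusion_matrix(y_valid, preds))        # rows: true -1/+1, cols: predicted -1/+1
print(classification_report(y_valid, preds))   # per-class precision/recall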

5. Support Vector Machine


In [295]:
clsfr = svm.LinearSVC()
clsfr.fit(x_train, y_train)


Out[295]:
LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [296]:
clsfr.score(x_train, y_train)


Out[296]:
0.992822966507177

In [297]:
clsfr.score(x_valid, y_valid)


Out[297]:
0.7955271565495208

6. Simple Neural Network - exploring issues


In [545]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset  # base class for SignalDataset below
from fastai.learner import *

In [546]:
from fastai.dataloader import DataLoader

In [547]:
import torchvision

In [548]:
class SimpleNet(nn.Module):
    def __init__(self, in_size):
        super().__init__()
        self.fc0 = nn.Linear(in_size, 80)
        self.fc1 = nn.Linear(80, 2)
    def forward(self, x):
        x = F.relu(self.fc0(x))
        x = F.log_softmax(self.fc1(x), dim=-1)  # explicit dim avoids the implicit-dim warning
        return x

In [549]:
class SignalDataset(Dataset):
    def __init__(self, x, y, transform=None):
        self.x = np.copy(x)
        self.y = np.copy(y)
        self.transform = transform
    def __len__(self):
        return len(self.x)
    def __getitem__(self, i):            
        x = self.x[i]
        y = self.y[i]
        if self.transform is not None:
            x = self.transform(x)
        return (x, y)

One-Hot Encode -1/+1 pass/fail


In [550]:
y_train.shape


Out[550]:
(1254,)

In [551]:
def one_hot_y(y_data):
    # map label -1 (pass) -> column 0, label +1 (fail) -> column 1
    y = np.zeros((y_data.shape[0], 2))
    for i, yi in enumerate(y_data):
        y[i][int((yi + 1)/2)] = 1
    return y
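
A quick sanity check of the encoding (illustrative values only):

one_hot_y(np.array([-1, 1, -1]))
# array([[1., 0.],
#        [0., 1.],
#        [1., 0.]])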

Normalizing to [0,1]

... after sklearn scaling


In [560]:
train_dataset = SignalDataset(x_train, one_hot_y(y_train))
valid_dataset = SignalDataset(x_valid, one_hot_y(y_valid))

In [561]:
train_dataset.x


Out[561]:
array([[-0.07301,  0.61736, -0.03433, ..., -0.03144, -0.02933, -0.62915],
       [ 0.00391,  0.28586,  0.14544, ..., -0.0308 , -0.02898,  0.16875],
       [ 0.34918, -0.1458 , -0.00637, ..., -0.03247, -0.02967, -0.59601],
       ...,
       [-0.27318,  0.22363,  0.17287, ..., -0.0307 , -0.02908, -0.48313],
       [-0.05875,  0.18604,  0.06884, ..., -0.0335 , -0.02997, -0.82873],
       [ 0.02466,  1.08547,  0.17943, ..., -0.03109, -0.02933, -0.52329]])

In [562]:
minval = abs(np.min(train_dataset.x))
train_dataset.x += minval
train_dataset.x /= np.max(train_dataset.x)

minval = abs(np.min(valid_dataset.x))
valid_dataset.x += minval
valid_dataset.x /= np.max(valid_dataset.x)
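
A quick check that the shift-and-scale really lands in [0,1]:

train_dataset.x.min(), train_dataset.x.max()   # expected: (0.0, 1.0)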

In [563]:
train_dataloader = DataLoader(train_dataset)
valid_dataloader = DataLoader(valid_dataset)

In [564]:
mdata = ModelData(PATH, train_dataloader, valid_dataloader)

In [565]:
network = SimpleNet(len(train_dataset.x[0]))
network


Out[565]:
SimpleNet(
  (fc0): Linear(in_features=590, out_features=80, bias=True)
  (fc1): Linear(in_features=80, out_features=2, bias=True)
)

In [566]:
learner = Learner.from_model_data(network, mdata)

In [567]:
learner.lr_find()
learner.sched.plot()


 84%|████████▎ | 1050/1254 [00:01<00:00, 1041.33it/s, loss=1.67]
                                                                

In [568]:
learner.fit(1e-4, n_cycle=5, wds=1e-6)


epoch      trn_loss   val_loss                                  
    0      1.587977   1.593989  
    1      1.587965   1.593889                                  
    2      1.587954   1.59379                                   
    3      1.587944   1.593694                                  
    4      1.587933   1.5936                                    

Out[568]:
[array([1.5936])]

In [569]:
log_preds = learner.predict()

In [570]:
np.exp(log_preds)[:40]


Out[570]:
array([[0.61504, 0.38496],
       [0.61446, 0.38554],
       [0.61511, 0.38489],
       [0.6173 , 0.3827 ],
       [0.61736, 0.38264],
       [0.61689, 0.38311],
       [0.61911, 0.38089],
       [0.61752, 0.38248],
       [0.61516, 0.38484],
       [0.61796, 0.38204],
       [0.61753, 0.38247],
       [0.61559, 0.38441],
       [0.61656, 0.38344],
       [0.61769, 0.38231],
       [0.61477, 0.38523],
       [0.61603, 0.38397],
       [0.61487, 0.38513],
       [0.61881, 0.38119],
       [0.61636, 0.38364],
       [0.61741, 0.38259],
       [0.61794, 0.38206],
       [0.61695, 0.38305],
       [0.61589, 0.38411],
       [0.61718, 0.38282],
       [0.61696, 0.38304],
       [0.61807, 0.38193],
       [0.61885, 0.38115],
       [0.61737, 0.38263],
       [0.61755, 0.38245],
       [0.61581, 0.38419],
       [0.61801, 0.38199],
       [0.61651, 0.38349],
       [0.61946, 0.38054],
       [0.61892, 0.38108],
       [0.61901, 0.38099],
       [0.61759, 0.38241],
       [0.61554, 0.38446],
       [0.6223 , 0.3777 ],
       [0.63906, 0.36094],
       [0.61706, 0.38294]], dtype=float32)

Clipping to [0,1]


In [571]:
train_dataset = SignalDataset(x_train, one_hot_y(y_train))
valid_dataset = SignalDataset(x_valid, one_hot_y(y_valid))

In [572]:
train_dataset.x


Out[572]:
array([[-0.07301,  0.61736, -0.03433, ..., -0.03144, -0.02933, -0.62915],
       [ 0.00391,  0.28586,  0.14544, ..., -0.0308 , -0.02898,  0.16875],
       [ 0.34918, -0.1458 , -0.00637, ..., -0.03247, -0.02967, -0.59601],
       ...,
       [-0.27318,  0.22363,  0.17287, ..., -0.0307 , -0.02908, -0.48313],
       [-0.05875,  0.18604,  0.06884, ..., -0.0335 , -0.02997, -0.82873],
       [ 0.02466,  1.08547,  0.17943, ..., -0.03109, -0.02933, -0.52329]])

In [573]:
train_dataset.x = np.clip(train_dataset.x, 0.0, 1.0)
valid_dataset.x = np.clip(valid_dataset.x, 0.0, 1.0)

In [574]:
train_dataloader = DataLoader(train_dataset)
valid_dataloader = DataLoader(valid_dataset)

In [575]:
mdata = ModelData(PATH, train_dataloader, valid_dataloader)

In [576]:
network = SimpleNet(len(train_dataset.x[0]))
network


Out[576]:
SimpleNet(
  (fc0): Linear(in_features=590, out_features=80, bias=True)
  (fc1): Linear(in_features=80, out_features=2, bias=True)
)

In [577]:
learner = Learner.from_model_data(network, mdata)

In [578]:
learner.lr_find()
learner.sched.plot()


 82%|████████▏ | 1032/1254 [00:01<00:00, 1019.28it/s, loss=1.7]
                                                               

In [579]:
learner.fit(1e-4, n_cycle=5, wds=1e-6)


epoch      trn_loss   val_loss                                  
    0      1.592702   1.598058  
    1      1.589796   1.595568                                  
    2      1.588088   1.594202                                  
    3      1.586967   1.59338                                   
    4      1.586123   1.592871                                  

Out[579]:
[array([1.59287])]

In [580]:
log_preds = learner.predict()

In [581]:
np.exp(log_preds)[:40]


Out[581]:
array([[0.60003, 0.39997],
       [0.56597, 0.43403],
       [0.59632, 0.40368],
       [0.59666, 0.40334],
       [0.59496, 0.40504],
       [0.56945, 0.43055],
       [0.58142, 0.41858],
       [0.59304, 0.40696],
       [0.58224, 0.41776],
       [0.55389, 0.44611],
       [0.58977, 0.41023],
       [0.58686, 0.41314],
       [0.57487, 0.42513],
       [0.57851, 0.42149],
       [0.58578, 0.41422],
       [0.57528, 0.42472],
       [0.57515, 0.42485],
       [0.6024 , 0.3976 ],
       [0.56676, 0.43324],
       [0.57762, 0.42238],
       [0.57493, 0.42507],
       [0.59623, 0.40377],
       [0.55971, 0.44029],
       [0.56766, 0.43234],
       [0.57526, 0.42474],
       [0.6145 , 0.3855 ],
       [0.58286, 0.41714],
       [0.58328, 0.41672],
       [0.59114, 0.40886],
       [0.59063, 0.40937],
       [0.58937, 0.41063],
       [0.5592 , 0.4408 ],
       [0.59507, 0.40493],
       [0.58609, 0.41391],
       [0.59081, 0.40919],
       [0.60179, 0.39821],
       [0.56093, 0.43907],
       [0.59393, 0.40607],
       [0.61743, 0.38257],
       [0.59939, 0.40061]], dtype=float32)

No clipping; only sklearn scaling


In [582]:
train_dataset = SignalDataset(x_train, one_hot_y(y_train))
valid_dataset = SignalDataset(x_valid, one_hot_y(y_valid))

In [583]:
train_dataset.x


Out[583]:
array([[-0.07301,  0.61736, -0.03433, ..., -0.03144, -0.02933, -0.62915],
       [ 0.00391,  0.28586,  0.14544, ..., -0.0308 , -0.02898,  0.16875],
       [ 0.34918, -0.1458 , -0.00637, ..., -0.03247, -0.02967, -0.59601],
       ...,
       [-0.27318,  0.22363,  0.17287, ..., -0.0307 , -0.02908, -0.48313],
       [-0.05875,  0.18604,  0.06884, ..., -0.0335 , -0.02997, -0.82873],
       [ 0.02466,  1.08547,  0.17943, ..., -0.03109, -0.02933, -0.52329]])

In [584]:
train_dataloader = DataLoader(train_dataset)
valid_dataloader = DataLoader(valid_dataset)

In [585]:
mdata = ModelData(PATH, train_dataloader, valid_dataloader)

In [586]:
network = SimpleNet(len(train_dataset.x[0]))
network


Out[586]:
SimpleNet(
  (fc0): Linear(in_features=590, out_features=80, bias=True)
  (fc1): Linear(in_features=80, out_features=2, bias=True)
)

In [587]:
learner = Learner.from_model_data(network, mdata)

In [588]:
learner.lr_find()
learner.sched.plot()


 38%|███▊      | 481/1254 [00:00<00:00, 1195.81it/s, loss=1.62]
                                                               

In [589]:
learner.fit(5e-4, n_cycle=5, wds=1e-6)


epoch      trn_loss   val_loss                                 
    0      nan        nan       
    1      nan        nan                                     
    2      nan        nan                                      
    3      nan        nan                                      
    4      nan        nan                                      

Out[589]:
[array([nan])]

In [590]:
log_preds = learner.predict()

In [591]:
np.exp(log_preds)[:40]


Out[591]:
array([[nan, nan],
       [nan, nan],
       [nan, nan],
       [nan, nan],
       [nan, nan],
       [nan, nan],
       [nan, nan],
       [nan, nan],
       [nan, nan],
       [nan, nan],
       [nan, nan],
       [nan, nan],
       [nan, nan],
       [nan, nan],
       [nan, nan],
       [nan, nan],
       [nan, nan],
       [nan, nan],
       [nan, nan],
       [nan, nan],
       [nan, nan],
       [nan, nan],
       [nan, nan],
       [nan, nan],
       [nan, nan],
       [nan, nan],
       [nan, nan],
       [nan, nan],
       [nan, nan],
       [nan, nan],
       [nan, nan],
       [nan, nan],
       [nan, nan],
       [nan, nan],
       [nan, nan],
       [nan, nan],
       [nan, nan],
       [nan, nan],
       [nan, nan],
       [nan, nan]], dtype=float32)

The loss going to NaN at the higher learning rate, with inputs that are standardized but not bounded, suggests training is diverging here. More work is needed.
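
A first diagnostic might be to confirm the scaled-but-unclipped inputs are finite and to look at their dynamic range, since a few extreme feature values could plausibly be what blows the loss up (a minimal sketch, assuming x_train as above):

print(np.isnan(x_train).any(), np.isinf(x_train).any())   # any non-finite inputs?
print(np.abs(x_train).max())                               # largest magnitude after scaling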