In [1]:
%matplotlib inline
In [2]:
import matplotlib.pyplot as plt
import sklearn
from sklearn import datasets
In [3]:
import os
import sys

# Show where the notebook is running and which files sit next to it.
working_dir = os.getcwd()
print( working_dir )
print( os.listdir( working_dir ) )
/home/topolo/PropD/MLgrabbag/kaggle
['.ipynb_checkpoints', 'glass-classification.zip', 'kaggle.ipynb', 'glass.csv', 'train.h5.zip', 'train.h5']
In [4]:
import numpy as np
import scipy
In [5]:
import pandas as pd
In [6]:
# Re-list the working directory; the output matches the listing printed by the earlier cell.
print( os.listdir( os.getcwd() ))
['.ipynb_checkpoints', 'glass-classification.zip', 'kaggle.ipynb', 'glass.csv', 'train.h5.zip', 'train.h5']
In [7]:
# Load the training set from the HDF5 file; the path is relative to the working directory listed above.
timeseries_pd = pd.read_hdf( 'train.h5')
In [8]:
# Summary statistics per column. The `count` row already hints at missing data:
# columns whose count is below that of `id` contain NaNs.
timeseries_pd.describe()
Out[8]:
id
timestamp
derived_0
derived_1
derived_2
derived_3
derived_4
fundamental_0
fundamental_1
fundamental_2
...
technical_36
technical_37
technical_38
technical_39
technical_40
technical_41
technical_42
technical_43
technical_44
y
count
1.710756e+06
1.710756e+06
1.637797e+06
1.629727e+06
1.312105e+06
1.561285e+06
1.304298e+06
1.686809e+06
1.031686e+06
1.341916e+06
...
1.708204e+06
1.691591e+06
1.691591e+06
1.690740e+06
1.708520e+06
1.666567e+06
1.690755e+06
1.706070e+06
1.473977e+06
1.710756e+06
mean
1.093858e+03
9.456257e+02
-4.536046e+00
7.729436e+11
-3.320328e-01
-5.046012e-01
1.801661e+01
-2.040938e-02
-5.703754e+08
-1.622954e-01
...
-8.584833e-02
-9.103397e-02
-8.156685e-02
-7.287001e-02
4.908321e-02
5.236218e-03
-1.699966e-02
-9.735299e-01
3.881475e-04
2.217509e-04
std
6.308563e+02
5.195685e+02
2.497382e+02
7.620606e+13
6.519810e+01
1.020749e+02
9.258360e+02
2.494859e-01
7.502322e+10
3.668149e+00
...
6.125852e-01
2.471038e-01
2.346534e-01
2.235729e-01
3.102316e-01
1.133733e-01
2.116284e-01
9.605551e-01
3.011983e-02
2.240643e-02
min
0.000000e+00
0.000000e+00
-2.017497e+04
-7.375435e-02
-9.848880e+03
-3.434176e+04
-8.551914e+03
-2.344957e+00
-1.043737e+13
-1.077101e+03
...
-1.687572e+00
-1.000000e+00
-1.000000e+00
-1.000000e+00
-5.250904e-01
-4.449529e-01
-1.000000e+00
-2.000000e+00
-1.265686e-01
-8.609413e-02
25%
5.500000e+02
5.040000e+02
-1.449710e-01
-2.956479e-02
-5.967524e-02
-1.655826e-01
-1.057050e-01
-1.996543e-01
-1.960470e-01
-2.280967e-01
...
-4.050297e-01
-4.651562e-04
-1.992532e-04
-2.203252e-05
-1.521701e-01
-7.377038e-02
-3.887695e-15
-2.000000e+00
-1.998819e-02
-9.561389e-03
50%
1.098000e+03
9.560000e+02
-8.368272e-04
5.523058e-03
2.109505e-02
2.475614e-03
1.175234e-02
-4.064488e-02
-7.395084e-03
-3.029069e-02
...
-8.502064e-02
-3.951567e-12
-1.418487e-13
-1.591224e-16
-1.476793e-02
9.782702e-05
0.000000e+00
-6.597540e-01
1.117279e-05
-1.570681e-04
75%
1.657000e+03
1.401000e+03
1.199108e-01
1.078554e-01
1.952209e-01
3.037236e-01
1.556464e-01
1.303819e-01
1.832071e-01
1.764751e-01
...
1.909600e-01
-5.219879e-40
0.000000e+00
0.000000e+00
1.772415e-01
7.855728e-02
0.000000e+00
-5.188884e-08
2.047074e-02
9.520990e-03
max
2.158000e+03
1.812000e+03
3.252527e+03
1.068448e+16
3.823001e+03
1.239737e+03
6.785965e+04
1.378195e+00
5.203165e+02
7.677125e+01
...
4.957758e+01
0.000000e+00
0.000000e+00
0.000000e+00
1.569265e+00
6.844833e-01
1.000000e+00
0.000000e+00
1.435858e-01
9.349781e-02
8 rows × 111 columns
In [9]:
# Peek at the first five rows to see the column layout and where NaNs occur.
timeseries_pd.head()
Out[9]:
id
timestamp
derived_0
derived_1
derived_2
derived_3
derived_4
fundamental_0
fundamental_1
fundamental_2
...
technical_36
technical_37
technical_38
technical_39
technical_40
technical_41
technical_42
technical_43
technical_44
y
0
10
0
0.370326
-0.006316
0.222831
-0.213030
0.729277
-0.335633
0.113292
1.621238
...
0.775208
NaN
NaN
NaN
-0.414776
NaN
NaN
-2.0
NaN
-0.011753
1
11
0
0.014765
-0.038064
-0.017425
0.320652
-0.034134
0.004413
0.114285
-0.210185
...
0.025590
NaN
NaN
NaN
-0.273607
NaN
NaN
-2.0
NaN
-0.001240
2
12
0
-0.010622
-0.050577
3.379575
-0.157525
-0.068550
-0.155937
1.219439
-0.764516
...
0.151881
NaN
NaN
NaN
-0.175710
NaN
NaN
-2.0
NaN
-0.020940
3
25
0
NaN
NaN
NaN
NaN
NaN
0.178495
NaN
-0.007262
...
1.035936
NaN
NaN
NaN
-0.211506
NaN
NaN
-2.0
NaN
-0.015959
4
26
0
0.176693
-0.025284
-0.057680
0.015100
0.180894
0.139445
-0.125687
-0.018707
...
0.630232
NaN
NaN
NaN
-0.001957
NaN
NaN
0.0
NaN
-0.007338
5 rows × 111 columns
In [10]:
# Column labels; the repr is abbreviated with `...`, so the next cell prints every label.
timeseries_pd.columns
Out[10]:
Index([u'id', u'timestamp', u'derived_0', u'derived_1', u'derived_2',
u'derived_3', u'derived_4', u'fundamental_0', u'fundamental_1',
u'fundamental_2',
...
u'technical_36', u'technical_37', u'technical_38', u'technical_39',
u'technical_40', u'technical_41', u'technical_42', u'technical_43',
u'technical_44', u'y'],
dtype='object', length=111)
In [11]:
# Print the number of columns, then every column label on its own line.
# `print(col)` (call form) replaces the Python 2-only `print col` statement so the
# cell also parses under Python 3; the printed output is unchanged on Python 2.
print( len(timeseries_pd.columns) )
for col in timeseries_pd.columns:
    print(col)
111
id
timestamp
derived_0
derived_1
derived_2
derived_3
derived_4
fundamental_0
fundamental_1
fundamental_2
fundamental_3
fundamental_5
fundamental_6
fundamental_7
fundamental_8
fundamental_9
fundamental_10
fundamental_11
fundamental_12
fundamental_13
fundamental_14
fundamental_15
fundamental_16
fundamental_17
fundamental_18
fundamental_19
fundamental_20
fundamental_21
fundamental_22
fundamental_23
fundamental_24
fundamental_25
fundamental_26
fundamental_27
fundamental_28
fundamental_29
fundamental_30
fundamental_31
fundamental_32
fundamental_33
fundamental_34
fundamental_35
fundamental_36
fundamental_37
fundamental_38
fundamental_39
fundamental_40
fundamental_41
fundamental_42
fundamental_43
fundamental_44
fundamental_45
fundamental_46
fundamental_47
fundamental_48
fundamental_49
fundamental_50
fundamental_51
fundamental_52
fundamental_53
fundamental_54
fundamental_55
fundamental_56
fundamental_57
fundamental_58
fundamental_59
fundamental_60
fundamental_61
fundamental_62
fundamental_63
technical_0
technical_1
technical_2
technical_3
technical_5
technical_6
technical_7
technical_9
technical_10
technical_11
technical_12
technical_13
technical_14
technical_16
technical_17
technical_18
technical_19
technical_20
technical_21
technical_22
technical_24
technical_25
technical_27
technical_28
technical_29
technical_30
technical_31
technical_32
technical_33
technical_34
technical_35
technical_36
technical_37
technical_38
technical_39
technical_40
technical_41
technical_42
technical_43
technical_44
y
In [19]:
# The first statement is not the last expression in the cell, so nothing is displayed for it;
# it is kept as a note of the column's name and dtype.
timeseries_pd["timestamp"]; # Name: timestamp, dtype: int16
# Display just the id and timestamp columns side by side.
timeseries_pd[["id","timestamp"]]
Out[19]:
id
timestamp
0
10
0
1
11
0
2
12
0
3
25
0
4
26
0
5
27
0
6
31
0
7
38
0
8
39
0
9
40
0
10
41
0
11
43
0
12
44
0
13
49
0
14
54
0
15
59
0
16
60
0
17
62
0
18
63
0
19
68
0
20
69
0
21
70
0
22
76
0
23
79
0
24
80
0
25
82
0
26
83
0
27
85
0
28
87
0
29
90
0
...
...
...
1710726
2100
1812
1710727
2101
1812
1710728
2102
1812
1710729
2104
1812
1710730
2107
1812
1710731
2108
1812
1710732
2109
1812
1710733
2114
1812
1710734
2117
1812
1710735
2118
1812
1710736
2120
1812
1710737
2121
1812
1710738
2126
1812
1710739
2129
1812
1710740
2130
1812
1710741
2131
1812
1710742
2137
1812
1710743
2138
1812
1710744
2139
1812
1710745
2140
1812
1710746
2142
1812
1710747
2145
1812
1710748
2146
1812
1710749
2148
1812
1710750
2149
1812
1710751
2150
1812
1710752
2151
1812
1710753
2154
1812
1710754
2156
1812
1710755
2158
1812
1710756 rows × 2 columns
In [18]:
# Display the timestamp column on its own; the output footer reports its name and dtype (int16).
timeseries_pd["timestamp"]
Out[18]:
0 0
1 0
2 0
3 0
4 0
5 0
6 0
7 0
8 0
9 0
10 0
11 0
12 0
13 0
14 0
15 0
16 0
17 0
18 0
19 0
20 0
21 0
22 0
23 0
24 0
25 0
26 0
27 0
28 0
29 0
...
1710726 1812
1710727 1812
1710728 1812
1710729 1812
1710730 1812
1710731 1812
1710732 1812
1710733 1812
1710734 1812
1710735 1812
1710736 1812
1710737 1812
1710738 1812
1710739 1812
1710740 1812
1710741 1812
1710742 1812
1710743 1812
1710744 1812
1710745 1812
1710746 1812
1710747 1812
1710748 1812
1710749 1812
1710750 1812
1710751 1812
1710752 1812
1710753 1812
1710754 1812
1710755 1812
Name: timestamp, dtype: int16
Total number of data points in the time series: first the non-null count for each column, then the total number of cells (rows × columns).
In [45]:
# Non-null entries per column; any column with fewer than the 1710756 rows of `id` has missing values.
timeseries_pd.count()
Out[45]:
id 1710756
timestamp 1710756
derived_0 1637797
derived_1 1629727
derived_2 1312105
derived_3 1561285
derived_4 1304298
fundamental_0 1686809
fundamental_1 1031686
fundamental_2 1341916
fundamental_3 1256376
fundamental_5 748736
fundamental_6 1009131
fundamental_7 1684416
fundamental_8 1337590
fundamental_9 1145189
fundamental_10 1597779
fundamental_11 1341916
fundamental_12 1599885
fundamental_13 1355618
fundamental_14 1354672
fundamental_15 1355859
fundamental_16 1355618
fundamental_17 1613534
fundamental_18 1694923
fundamental_19 1656168
fundamental_20 1599885
fundamental_21 1656423
fundamental_22 1152268
fundamental_23 1354033
...
technical_13 1705992
technical_14 1696572
technical_16 1690775
technical_17 1706477
technical_18 1690740
technical_19 1708436
technical_20 1705992
technical_21 1708520
technical_22 1710756
technical_24 1639610
technical_25 1502700
technical_27 1708336
technical_28 1447840
technical_29 1649141
technical_30 1705992
technical_31 1528078
technical_32 1691591
technical_33 1696221
technical_34 1710756
technical_35 1707601
technical_36 1708204
technical_37 1691591
technical_38 1691591
technical_39 1690740
technical_40 1708520
technical_41 1666567
technical_42 1690755
technical_43 1706070
technical_44 1473977
y 1710756
dtype: int64
In [47]:
# Total number of cells (rows x columns), NaNs included: 1710756 x 111 = 189893916.
timeseries_pd.size
Out[47]:
189893916
cf. https://gallery.cortanaintelligence.com/Experiment/Methods-for-handling-missing-values-1
Replace missing values with the mean. For this age data, we assume that missing values are distributed similarly to the values that are present. The formal name for this assumption is Missing Completely at Random (MCAR). In this case, substituting values that represent the existing distribution, such as the mean, is a reasonable approach.
Replace missing values with the median. This is another justifiable way to handle missing-at-random data, although note that it gives a different answer. For categorical data, it's also common to use the mode, the most commonly occurring value.
cf. http://pandas.pydata.org/pandas-docs/stable/missing_data.html
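The sections that proved most useful were the ones on detecting missing values (`isnull`/`notnull`) and on filling them in, which the next cells apply.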
In [13]:
# Describe the boolean "is missing" mask: `unique` is 1 for columns without NaNs and 2 otherwise;
# `freq` is how often `top` (False, i.e. value present) occurs in each column.
timeseries_pd.isnull().describe()
Out[13]:
id
timestamp
derived_0
derived_1
derived_2
derived_3
derived_4
fundamental_0
fundamental_1
fundamental_2
...
technical_36
technical_37
technical_38
technical_39
technical_40
technical_41
technical_42
technical_43
technical_44
y
count
1710756
1710756
1710756
1710756
1710756
1710756
1710756
1710756
1710756
1710756
...
1710756
1710756
1710756
1710756
1710756
1710756
1710756
1710756
1710756
1710756
unique
1
1
2
2
2
2
2
2
2
2
...
2
2
2
2
2
2
2
2
2
1
top
False
False
False
False
False
False
False
False
False
False
...
False
False
False
False
False
False
False
False
False
False
freq
1710756
1710756
1637797
1629727
1312105
1561285
1304298
1686809
1031686
1341916
...
1708204
1691591
1691591
1690740
1708520
1666567
1690755
1706070
1473977
1710756
4 rows × 111 columns
In [14]:
# Mean imputation: every missing cell is replaced by the mean of its own column.
# mask(isnull, ...) is the mirror image of where(notnull, ...): it replaces the entries
# where the condition is True. axis='columns' lines the means up with the column labels.
# NOTE(review): fillna(timeseries_pd.mean()) is the more common spelling; confirm it
# produces the same dtypes before switching.
column_means = timeseries_pd.mean()
timeseries_pd_meanclean = timeseries_pd.mask( pd.isnull(timeseries_pd), column_means, axis='columns')
In [15]:
# Sanity check after imputation: every column's `count` now equals the full row count.
timeseries_pd_meanclean.describe()
Out[15]:
id
timestamp
derived_0
derived_1
derived_2
derived_3
derived_4
fundamental_0
fundamental_1
fundamental_2
...
technical_36
technical_37
technical_38
technical_39
technical_40
technical_41
technical_42
technical_43
technical_44
y
count
1.710756e+06
1.710756e+06
1.710756e+06
1.710756e+06
1.710756e+06
1.710756e+06
1.710756e+06
1.710756e+06
1.710756e+06
1.710756e+06
...
1.710756e+06
1.710756e+06
1.710756e+06
1.710756e+06
1.710756e+06
1.710756e+06
1.710756e+06
1.710756e+06
1.710756e+06
1.710756e+06
mean
1.093858e+03
9.456257e+02
-4.534660e+00
7.727040e+11
-3.316913e-01
-5.042495e-01
1.801324e+01
-2.040735e-02
-5.611223e+08
-1.615499e-01
...
-8.584631e-02
-9.103718e-02
-8.156337e-02
-7.286777e-02
4.908380e-02
5.236934e-03
-1.700031e-02
-9.735575e-01
3.879543e-04
2.217509e-04
std
6.308563e+02
5.195685e+02
2.443549e+02
7.437944e+13
5.709856e+01
9.751381e+01
8.084039e+02
2.477336e-01
5.826071e+10
3.248749e+00
...
6.121281e-01
2.457155e-01
2.333340e-01
2.222618e-01
3.100289e-01
1.118994e-01
2.103878e-01
9.592382e-01
2.795785e-02
2.240643e-02
min
0.000000e+00
0.000000e+00
-2.017497e+04
-7.375435e-02
-9.848880e+03
-3.434176e+04
-8.551914e+03
-2.344957e+00
-1.043737e+13
-1.077101e+03
...
-1.687572e+00
-1.000000e+00
-1.000000e+00
-1.000000e+00
-5.250904e-01
-4.449529e-01
-1.000000e+00
-2.000000e+00
-1.265686e-01
-8.609413e-02
25%
5.500000e+02
5.040000e+02
-1.849245e-01
-2.829417e-02
-3.320328e-01
-2.661774e-01
-5.365840e-02
-1.969607e-01
-5.703754e+08
-1.646032e-01
...
-4.043798e-01
-7.892920e-04
-3.068867e-04
-4.336545e-05
-1.519630e-01
-7.160673e-02
-3.515464e-14
-2.000000e+00
-1.638420e-02
-9.561389e-03
50%
1.098000e+03
9.560000e+02
-9.243710e-03
1.066268e-02
-3.129806e-02
-2.266392e-02
7.860678e-02
-3.671852e-02
-2.737567e-01
-1.286000e-01
...
-8.584833e-02
-6.932156e-12
-2.836974e-13
-3.655687e-16
-1.434174e-02
3.836489e-03
0.000000e+00
-6.804921e-01
3.881475e-04
-1.570681e-04
75%
1.657000e+03
1.401000e+03
1.128421e-01
1.460314e-01
1.219615e-01
2.595344e-01
1.802310e+00
1.273931e-01
5.088157e-02
1.083336e-01
...
1.903692e-01
-3.863118e-39
0.000000e+00
0.000000e+00
1.768057e-01
7.607020e-02
0.000000e+00
-5.451053e-08
1.680637e-02
9.520990e-03
max
2.158000e+03
1.812000e+03
3.252527e+03
1.068448e+16
3.823001e+03
1.239737e+03
6.785965e+04
1.378195e+00
5.203165e+02
7.677125e+01
...
4.957758e+01
0.000000e+00
0.000000e+00
0.000000e+00
1.569265e+00
6.844833e-01
1.000000e+00
0.000000e+00
1.435858e-01
9.349781e-02
8 rows × 111 columns
In [17]:
# Confirm no NaNs remain: in the "not null" mask every column has a single unique value (True).
timeseries_pd_meanclean.notnull().describe()
Out[17]:
id
timestamp
derived_0
derived_1
derived_2
derived_3
derived_4
fundamental_0
fundamental_1
fundamental_2
...
technical_36
technical_37
technical_38
technical_39
technical_40
technical_41
technical_42
technical_43
technical_44
y
count
1710756
1710756
1710756
1710756
1710756
1710756
1710756
1710756
1710756
1710756
...
1710756
1710756
1710756
1710756
1710756
1710756
1710756
1710756
1710756
1710756
unique
1
1
1
1
1
1
1
1
1
1
...
1
1
1
1
1
1
1
1
1
1
top
True
True
True
True
True
True
True
True
True
True
...
True
True
True
True
True
True
True
True
True
True
freq
1710756
1710756
1710756
1710756
1710756
1710756
1710756
1710756
1710756
1710756
...
1710756
1710756
1710756
1710756
1710756
1710756
1710756
1710756
1710756
1710756
4 rows × 111 columns
In [19]:
# Same first five rows as the raw frame; the cells that were NaN now hold the column means.
timeseries_pd_meanclean.head()
Out[19]:
id
timestamp
derived_0
derived_1
derived_2
derived_3
derived_4
fundamental_0
fundamental_1
fundamental_2
...
technical_36
technical_37
technical_38
technical_39
technical_40
technical_41
technical_42
technical_43
technical_44
y
0
10
0
0.370326
-6.316399e-03
0.222831
-0.213030
0.729277
-0.335633
1.132921e-01
1.621238
...
0.775208
-0.091034
-0.081567
-0.07287
-0.414776
0.005236
-0.017
-2.0
0.000388
-0.011753
1
11
0
0.014765
-3.806422e-02
-0.017425
0.320652
-0.034134
0.004413
1.142851e-01
-0.210185
...
0.025590
-0.091034
-0.081567
-0.07287
-0.273607
0.005236
-0.017
-2.0
0.000388
-0.001240
2
12
0
-0.010622
-5.057707e-02
3.379575
-0.157525
-0.068550
-0.155937
1.219439e+00
-0.764516
...
0.151881
-0.091034
-0.081567
-0.07287
-0.175710
0.005236
-0.017
-2.0
0.000388
-0.020940
3
25
0
-4.536046
7.729436e+11
-0.332033
-0.504601
18.016613
0.178495
-5.703754e+08
-0.007262
...
1.035936
-0.091034
-0.081567
-0.07287
-0.211506
0.005236
-0.017
-2.0
0.000388
-0.015959
4
26
0
0.176693
-2.528418e-02
-0.057680
0.015100
0.180894
0.139445
-1.256869e-01
-0.018707
...
0.630232
-0.091034
-0.081567
-0.07287
-0.001957
0.005236
-0.017
0.0
0.000388
-0.007338
5 rows × 111 columns
"Each (financial) instrument has an id. "
In [24]:
# Order rows by instrument id first, then by timestamp within each instrument,
# so that each instrument's observations sit together in time order.
timeseries_id = timeseries_pd_meanclean.sort_values( ["id", "timestamp"] )
In [25]:
# Summary statistics of the sorted frame.
# NOTE(review): some means/stds differ from the pre-sort describe() in the low digits
# (e.g. derived_0 mean -4.534488 vs -4.534660) although sorting only reorders rows —
# presumably floating-point summation order; confirm.
timeseries_id.describe()
Out[25]:
id
timestamp
derived_0
derived_1
derived_2
derived_3
derived_4
fundamental_0
fundamental_1
fundamental_2
...
technical_36
technical_37
technical_38
technical_39
technical_40
technical_41
technical_42
technical_43
technical_44
y
count
1.710756e+06
1.710756e+06
1.710756e+06
1.710756e+06
1.710756e+06
1.710756e+06
1.710756e+06
1.710756e+06
1.710756e+06
1.710756e+06
...
1.710756e+06
1.710756e+06
1.710756e+06
1.710756e+06
1.710756e+06
1.710756e+06
1.710756e+06
1.710756e+06
1.710756e+06
1.710756e+06
mean
1.093858e+03
9.456257e+02
-4.534488e+00
7.731216e+11
-3.314414e-01
-5.045264e-01
1.801171e+01
-2.040870e-02
-5.694613e+08
-1.612957e-01
...
-8.584719e-02
-9.105250e-02
-8.155458e-02
-7.286179e-02
4.907284e-02
5.236997e-03
-1.700201e-02
-9.735696e-01
3.883361e-04
2.217637e-04
std
6.308563e+02
5.195685e+02
2.443558e+02
7.437888e+13
5.710430e+01
9.752032e+01
8.084053e+02
2.477764e-01
5.826182e+10
3.247496e+00
...
6.119142e-01
2.456984e-01
2.332277e-01
2.214861e-01
3.101637e-01
1.119004e-01
2.103856e-01
9.613923e-01
2.795772e-02
2.240271e-02
min
0.000000e+00
0.000000e+00
-2.017497e+04
-7.375435e-02
-9.848880e+03
-3.434176e+04
-8.551914e+03
-2.344957e+00
-1.043737e+13
-1.077101e+03
...
-1.687572e+00
-1.000000e+00
-1.000000e+00
-1.000000e+00
-5.250904e-01
-4.449529e-01
-1.000000e+00
-2.000000e+00
-1.265686e-01
-8.609413e-02
25%
5.500000e+02
5.040000e+02
-1.849245e-01
-2.829417e-02
-3.320328e-01
-2.661774e-01
-5.365840e-02
-1.969607e-01
-5.703754e+08
-1.646032e-01
...
-4.043798e-01
-7.892920e-04
-3.068867e-04
-4.336545e-05
-1.519630e-01
-7.160673e-02
-3.515464e-14
-2.000000e+00
-1.638420e-02
-9.561389e-03
50%
1.098000e+03
9.560000e+02
-9.243710e-03
1.066268e-02
-3.129806e-02
-2.266392e-02
7.860678e-02
-3.671852e-02
-2.737567e-01
-1.286000e-01
...
-8.584833e-02
-6.932156e-12
-2.836974e-13
-3.655687e-16
-1.434174e-02
3.836489e-03
0.000000e+00
-6.804921e-01
3.881475e-04
-1.570681e-04
75%
1.657000e+03
1.401000e+03
1.128421e-01
1.460314e-01
1.219615e-01
2.595344e-01
1.802310e+00
1.273931e-01
5.088157e-02
1.083336e-01
...
1.903692e-01
-3.863118e-39
0.000000e+00
0.000000e+00
1.768057e-01
7.607020e-02
0.000000e+00
-5.451053e-08
1.680637e-02
9.520990e-03
max
2.158000e+03
1.812000e+03
3.252527e+03
1.068448e+16
3.823001e+03
1.239737e+03
6.785965e+04
1.378195e+00
5.203165e+02
7.677125e+01
...
4.957758e+01
0.000000e+00
0.000000e+00
0.000000e+00
1.569265e+00
6.844833e-01
1.000000e+00
0.000000e+00
1.435858e-01
9.349781e-02
8 rows × 111 columns
In [26]:
# First rows after sorting: instrument id 0, whose first timestamp in the data is 167.
timeseries_id.head()
Out[26]:
id
timestamp
derived_0
derived_1
derived_2
derived_3
derived_4
fundamental_0
fundamental_1
fundamental_2
...
technical_36
technical_37
technical_38
technical_39
technical_40
technical_41
technical_42
technical_43
technical_44
y
131062
0
167
-4.536046
7.729436e+11
-0.332033
-0.504601
18.016613
-0.020409
-570375360.0
-0.162295
...
-0.085848
-0.091034
-0.081567
-0.07287
0.049083
0.005236
-0.017
-0.97353
0.000388
-0.007108
131895
0
168
-4.536046
7.729436e+11
-0.332033
-0.504601
18.016613
-0.020409
-570375360.0
-0.162295
...
-0.085848
-0.091034
-0.081567
-0.07287
0.049083
0.005236
-0.017
-0.97353
0.000388
0.001950
132728
0
169
-4.536046
7.729436e+11
-0.332033
-0.504601
18.016613
-0.020409
-570375360.0
-0.162295
...
-0.085848
-0.091034
-0.081567
-0.07287
0.049083
0.005236
-0.017
-0.97353
0.000388
0.017724
133561
0
170
-0.230583
4.880956e-01
0.935920
0.028222
-0.083071
-0.240929
-570375360.0
0.212425
...
0.727659
0.000000
0.000000
0.00000
-0.160478
0.005236
0.000
0.00000
0.000388
0.012934
134393
0
171
-0.230583
4.880956e-01
0.935920
0.028222
-0.083071
-0.240929
-570375360.0
0.212425
...
0.727659
0.000000
0.000000
0.00000
-0.160478
0.005236
0.000
0.00000
0.000388
-0.025229
5 rows × 111 columns
In [31]:
# Distinct instrument ids, followed by how many there are.
# unique() is computed once and reused rather than scanning the ~1.7M-row column twice.
instrument_ids = timeseries_id['id'].unique()
print( instrument_ids )
print( len( instrument_ids ))
[ 0 6 7 ..., 2155 2156 2158]
1424
In [32]:
# Non-null entries per column for each instrument id. Every column shows the same number
# within a row because the NaNs were already imputed, so this is the number of rows per
# instrument; it varies widely between instruments (e.g. 116 vs 1813).
timeseries_id.groupby('id').count()
Out[32]:
timestamp
derived_0
derived_1
derived_2
derived_3
derived_4
fundamental_0
fundamental_1
fundamental_2
fundamental_3
...
technical_36
technical_37
technical_38
technical_39
technical_40
technical_41
technical_42
technical_43
technical_44
y
id
0
1646
1646
1646
1646
1646
1646
1646
1646
1646
1646
...
1646
1646
1646
1646
1646
1646
1646
1646
1646
1646
6
728
728
728
728
728
728
728
728
728
728
...
728
728
728
728
728
728
728
728
728
728
7
1543
1543
1543
1543
1543
1543
1543
1543
1543
1543
...
1543
1543
1543
1543
1543
1543
1543
1543
1543
1543
10
116
116
116
116
116
116
116
116
116
116
...
116
116
116
116
116
116
116
116
116
116
11
1813
1813
1813
1813
1813
1813
1813
1813
1813
1813
...
1813
1813
1813
1813
1813
1813
1813
1813
1813
1813
12
1813
1813
1813
1813
1813
1813
1813
1813
1813
1813
...
1813
1813
1813
1813
1813
1813
1813
1813
1813
1813
13
1543
1543
1543
1543
1543
1543
1543
1543
1543
1543
...
1543
1543
1543
1543
1543
1543
1543
1543
1543
1543
14
218
218
218
218
218
218
218
218
218
218
...
218
218
218
218
218
218
218
218
218
218
15
1340
1340
1340
1340
1340
1340
1340
1340
1340
1340
...
1340
1340
1340
1340
1340
1340
1340
1340
1340
1340
16
1745
1745
1745
1745
1745
1745
1745
1745
1745
1745
...
1745
1745
1745
1745
1745
1745
1745
1745
1745
1745
17
779
779
779
779
779
779
779
779
779
779
...
779
779
779
779
779
779
779
779
779
779
18
218
218
218
218
218
218
218
218
218
218
...
218
218
218
218
218
218
218
218
218
218
19
829
829
829
829
829
829
829
829
829
829
...
829
829
829
829
829
829
829
829
829
829
20
1646
1646
1646
1646
1646
1646
1646
1646
1646
1646
...
1646
1646
1646
1646
1646
1646
1646
1646
1646
1646
22
1442
1442
1442
1442
1442
1442
1442
1442
1442
1442
...
1442
1442
1442
1442
1442
1442
1442
1442
1442
1442
23
218
218
218
218
218
218
218
218
218
218
...
218
218
218
218
218
218
218
218
218
218
24
932
932
932
932
932
932
932
932
932
932
...
932
932
932
932
932
932
932
932
932
932
25
1813
1813
1813
1813
1813
1813
1813
1813
1813
1813
...
1813
1813
1813
1813
1813
1813
1813
1813
1813
1813
26
1813
1813
1813
1813
1813
1813
1813
1813
1813
1813
...
1813
1813
1813
1813
1813
1813
1813
1813
1813
1813
27
159
159
159
159
159
159
159
159
159
159
...
159
159
159
159
159
159
159
159
159
159
30
932
932
932
932
932
932
932
932
932
932
...
932
932
932
932
932
932
932
932
932
932
31
1813
1813
1813
1813
1813
1813
1813
1813
1813
1813
...
1813
1813
1813
1813
1813
1813
1813
1813
1813
1813
32
218
218
218
218
218
218
218
218
218
218
...
218
218
218
218
218
218
218
218
218
218
33
422
422
422
422
422
422
422
422
422
422
...
422
422
422
422
422
422
422
422
422
422
38
1813
1813
1813
1813
1813
1813
1813
1813
1813
1813
...
1813
1813
1813
1813
1813
1813
1813
1813
1813
1813
39
1813
1813
1813
1813
1813
1813
1813
1813
1813
1813
...
1813
1813
1813
1813
1813
1813
1813
1813
1813
1813
40
1813
1813
1813
1813
1813
1813
1813
1813
1813
1813
...
1813
1813
1813
1813
1813
1813
1813
1813
1813
1813
41
1813
1813
1813
1813
1813
1813
1813
1813
1813
1813
...
1813
1813
1813
1813
1813
1813
1813
1813
1813
1813
43
1813
1813
1813
1813
1813
1813
1813
1813
1813
1813
...
1813
1813
1813
1813
1813
1813
1813
1813
1813
1813
44
1813
1813
1813
1813
1813
1813
1813
1813
1813
1813
...
1813
1813
1813
1813
1813
1813
1813
1813
1813
1813
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
2115
363
363
363
363
363
363
363
363
363
363
...
363
363
363
363
363
363
363
363
363
363
2117
728
728
728
728
728
728
728
728
728
728
...
728
728
728
728
728
728
728
728
728
728
2118
829
829
829
829
829
829
829
829
829
829
...
829
829
829
829
829
829
829
829
829
829
2120
1745
1745
1745
1745
1745
1745
1745
1745
1745
1745
...
1745
1745
1745
1745
1745
1745
1745
1745
1745
1745
2121
422
422
422
422
422
422
422
422
422
422
...
422
422
422
422
422
422
422
422
422
422
2124
627
627
627
627
627
627
627
627
627
627
...
627
627
627
627
627
627
627
627
627
627
2125
1128
1128
1128
1128
1128
1128
1128
1128
1128
1128
...
1128
1128
1128
1128
1128
1128
1128
1128
1128
1128
2126
1813
1813
1813
1813
1813
1813
1813
1813
1813
1813
...
1813
1813
1813
1813
1813
1813
1813
1813
1813
1813
2129
1813
1813
1813
1813
1813
1813
1813
1813
1813
1813
...
1813
1813
1813
1813
1813
1813
1813
1813
1813
1813
2130
1745
1745
1745
1745
1745
1745
1745
1745
1745
1745
...
1745
1745
1745
1745
1745
1745
1745
1745
1745
1745
2131
1813
1813
1813
1813
1813
1813
1813
1813
1813
1813
...
1813
1813
1813
1813
1813
1813
1813
1813
1813
1813
2134
1203
1203
1203
1203
1203
1203
1203
1203
1203
1203
...
1203
1203
1203
1203
1203
1203
1203
1203
1203
1203
2135
960
960
960
960
960
960
960
960
960
960
...
960
960
960
960
960
960
960
960
960
960
2136
1290
1290
1290
1290
1290
1290
1290
1290
1290
1290
...
1290
1290
1290
1290
1290
1290
1290
1290
1290
1290
2137
1646
1646
1646
1646
1646
1646
1646
1646
1646
1646
...
1646
1646
1646
1646
1646
1646
1646
1646
1646
1646
2138
1813
1813
1813
1813
1813
1813
1813
1813
1813
1813
...
1813
1813
1813
1813
1813
1813
1813
1813
1813
1813
2139
880
880
880
880
880
880
880
880
880
880
...
880
880
880
880
880
880
880
880
880
880
2140
1442
1442
1442
1442
1442
1442
1442
1442
1442
1442
...
1442
1442
1442
1442
1442
1442
1442
1442
1442
1442
2142
1813
1813
1813
1813
1813
1813
1813
1813
1813
1813
...
1813
1813
1813
1813
1813
1813
1813
1813
1813
1813
2145
1646
1646
1646
1646
1646
1646
1646
1646
1646
1646
...
1646
1646
1646
1646
1646
1646
1646
1646
1646
1646
2146
1813
1813
1813
1813
1813
1813
1813
1813
1813
1813
...
1813
1813
1813
1813
1813
1813
1813
1813
1813
1813
2148
1813
1813
1813
1813
1813
1813
1813
1813
1813
1813
...
1813
1813
1813
1813
1813
1813
1813
1813
1813
1813
2149
1813
1813
1813
1813
1813
1813
1813
1813
1813
1813
...
1813
1813
1813
1813
1813
1813
1813
1813
1813
1813
2150
1646
1646
1646
1646
1646
1646
1646
1646
1646
1646
...
1646
1646
1646
1646
1646
1646
1646
1646
1646
1646
2151
1745
1745
1745
1745
1745
1745
1745
1745
1745
1745
...
1745
1745
1745
1745
1745
1745
1745
1745
1745
1745
2152
167
167
167
167
167
167
167
167
167
167
...
167
167
167
167
167
167
167
167
167
167
2154
1813
1813
1813
1813
1813
1813
1813
1813
1813
1813
...
1813
1813
1813
1813
1813
1813
1813
1813
1813
1813
2155
1657
1657
1657
1657
1657
1657
1657
1657
1657
1657
...
1657
1657
1657
1657
1657
1657
1657
1657
1657
1657
2156
1813
1813
1813
1813
1813
1813
1813
1813
1813
1813
...
1813
1813
1813
1813
1813
1813
1813
1813
1813
1813
2158
150
150
150
150
150
150
150
150
150
150
...
150
150
150
150
150
150
150
150
150
150
1424 rows × 110 columns
Let uid $\in \mathbb{Z}_{\geq 0}$ (the ids start at 0) represent a unique id for each financial instrument; in this case, the set of uids is obtained with the pandas command below. Each one of the uids will be a training example.
In [34]:
# Array of the distinct instrument ids, in order of first appearance
# (ascending here, because timeseries_id is sorted by id).
uids = timeseries_id['id'].unique()
print(uids)
[ 0 6 7 ..., 2155 2156 2158]
In [35]:
# All rows belonging to instrument id 0, via a boolean mask passed to .loc.
timeseries_id.loc[ timeseries_id['id'] == 0 ] # this selects rows based on values in id column
# cf. http://stackoverflow.com/questions/17071871/select-rows-from-a-dataframe-based-on-values-in-a-column-in-pandas
Out[35]:
id
timestamp
derived_0
derived_1
derived_2
derived_3
derived_4
fundamental_0
fundamental_1
fundamental_2
...
technical_36
technical_37
technical_38
technical_39
technical_40
technical_41
technical_42
technical_43
technical_44
y
131062
0
167
-4.536046
7.729436e+11
-0.332033
-0.504601
18.016613
-0.020409
-5.703754e+08
-0.162295
...
-0.085848
-9.103397e-02
-8.156685e-02
-7.287001e-02
0.049083
0.005236
-1.699966e-02
-9.735299e-01
0.000388
-0.007108
131895
0
168
-4.536046
7.729436e+11
-0.332033
-0.504601
18.016613
-0.020409
-5.703754e+08
-0.162295
...
-0.085848
-9.103397e-02
-8.156685e-02
-7.287001e-02
0.049083
0.005236
-1.699966e-02
-9.735299e-01
0.000388
0.001950
132728
0
169
-4.536046
7.729436e+11
-0.332033
-0.504601
18.016613
-0.020409
-5.703754e+08
-0.162295
...
-0.085848
-9.103397e-02
-8.156685e-02
-7.287001e-02
0.049083
0.005236
-1.699966e-02
-9.735299e-01
0.000388
0.017724
133561
0
170
-0.230583
4.880956e-01
0.935920
0.028222
-0.083071
-0.240929
-5.703754e+08
0.212425
...
0.727659
0.000000e+00
0.000000e+00
0.000000e+00
-0.160478
0.005236
0.000000e+00
0.000000e+00
0.000388
0.012934
134393
0
171
-0.230583
4.880956e-01
0.935920
0.028222
-0.083071
-0.240929
-5.703754e+08
0.212425
...
0.727659
0.000000e+00
0.000000e+00
0.000000e+00
-0.160478
0.005236
0.000000e+00
0.000000e+00
0.000388
-0.025229
135224
0
172
-0.230583
4.880956e-01
0.935920
0.028222
-0.083071
-0.240929
-5.703754e+08
0.212425
...
0.727659
0.000000e+00
0.000000e+00
0.000000e+00
-0.160478
0.005236
0.000000e+00
0.000000e+00
0.000388
-0.021411
136055
0
173
-0.230583
4.880956e-01
0.935920
0.028222
-0.083071
-0.240929
-5.703754e+08
0.212425
...
0.727659
0.000000e+00
0.000000e+00
0.000000e+00
-0.160478
0.005236
0.000000e+00
0.000000e+00
0.000388
-0.030042
136885
0
174
-0.230583
4.880956e-01
0.935920
0.028222
-0.083071
-0.240929
-5.703754e+08
0.212425
...
0.727659
0.000000e+00
0.000000e+00
0.000000e+00
-0.160478
0.005236
0.000000e+00
0.000000e+00
0.000388
-0.013961
137715
0
175
-0.230583
4.880956e-01
0.935920
0.028222
-0.083071
-0.240929
-5.703754e+08
0.212425
...
0.727659
0.000000e+00
0.000000e+00
0.000000e+00
-0.160478
0.005236
0.000000e+00
0.000000e+00
0.000388
0.015330
138545
0
176
-0.230583
4.880956e-01
0.935920
0.028222
-0.083071
-0.240929
-5.703754e+08
0.212425
...
0.727659
0.000000e+00
0.000000e+00
0.000000e+00
-0.160478
0.005236
0.000000e+00
0.000000e+00
0.000388
-0.011354
139374
0
177
-0.230583
4.880956e-01
0.935920
0.028222
-0.083071
-0.240929
-5.703754e+08
0.212425
...
0.727659
0.000000e+00
0.000000e+00
0.000000e+00
-0.160478
0.005236
0.000000e+00
0.000000e+00
0.000388
0.011996
140203
0
178
-0.230583
4.880956e-01
0.935920
0.028222
-0.083071
-0.240929
-5.703754e+08
0.212425
...
0.727659
0.000000e+00
0.000000e+00
0.000000e+00
-0.160478
0.005236
0.000000e+00
0.000000e+00
0.000388
0.010069
141032
0
179
-0.230583
4.880956e-01
0.935920
0.028222
-0.083071
-0.240929
-5.703754e+08
0.212425
...
0.727659
0.000000e+00
0.000000e+00
0.000000e+00
-0.160478
0.005236
0.000000e+00
0.000000e+00
0.000388
0.010060
141861
0
180
-0.230583
4.880956e-01
0.935920
0.028222
-0.083071
-0.240929
-5.703754e+08
0.212425
...
0.727659
0.000000e+00
0.000000e+00
0.000000e+00
-0.160478
0.005236
0.000000e+00
0.000000e+00
0.000388
0.027913
142690
0
181
-0.230583
4.880956e-01
0.935920
0.028222
-0.083071
-0.240929
-5.703754e+08
0.212425
...
0.727659
0.000000e+00
0.000000e+00
0.000000e+00
-0.160478
0.005236
0.000000e+00
0.000000e+00
0.000388
0.022216
143518
0
182
-0.230583
4.880956e-01
0.935920
0.028222
-0.083071
-0.240929
-5.703754e+08
0.212425
...
0.727659
0.000000e+00
0.000000e+00
0.000000e+00
-0.160478
0.005236
0.000000e+00
0.000000e+00
0.000388
0.025130
144346
0
183
-0.230583
4.880956e-01
0.935920
0.028222
-0.083071
-0.240929
-5.703754e+08
0.212425
...
0.727659
0.000000e+00
0.000000e+00
0.000000e+00
-0.160478
0.005236
0.000000e+00
0.000000e+00
0.000388
0.017332
145173
0
184
-0.230583
4.880956e-01
0.935920
0.028222
-0.083071
-0.240929
-5.703754e+08
0.212425
...
0.727659
0.000000e+00
0.000000e+00
0.000000e+00
-0.160478
0.005236
0.000000e+00
0.000000e+00
0.000388
0.008471
146000
0
185
-0.258631
4.642017e-01
0.629588
0.021282
-0.116380
-0.226894
-5.703754e+08
0.183160
...
0.821629
-3.529145e-01
-3.529145e-01
-3.529145e-01
-0.169180
0.005236
0.000000e+00
0.000000e+00
0.000388
0.008027
146827
0
186
-0.265503
4.583476e-01
0.554534
0.019581
-0.124540
-0.223455
-5.703754e+08
0.175990
...
0.844653
-4.393815e-01
-4.393815e-01
-4.393815e-01
-0.171312
0.005236
0.000000e+00
0.000000e+00
0.000388
0.016009
147655
0
187
-0.271432
4.532968e-01
0.489780
0.018114
-0.131581
-0.220488
-5.703754e+08
0.169803
...
0.864517
-5.139828e-01
-5.139828e-01
-5.139828e-01
-0.173152
0.005236
0.000000e+00
0.000000e+00
0.000388
-0.007890
148483
0
188
-0.276553
4.489338e-01
0.433844
0.016847
-0.137663
-0.217925
-5.703754e+08
0.164460
...
0.881675
-5.784236e-01
-5.784236e-01
-5.784236e-01
-0.174741
0.005236
0.000000e+00
0.000000e+00
0.000388
0.004876
149311
0
189
-0.280982
4.451612e-01
0.385478
0.015751
-0.142922
-0.215709
-5.703754e+08
0.159839
...
0.896512
-6.341452e-01
-6.341452e-01
-6.341452e-01
-0.176115
0.005236
0.000000e+00
0.000000e+00
0.000388
0.008867
150138
0
190
-0.284815
4.418962e-01
0.343618
0.014803
-0.147474
-0.213791
-5.703754e+08
0.155840
...
0.909353
-6.823705e-01
-6.823705e-01
-6.823705e-01
-0.177304
0.005236
0.000000e+00
0.000000e+00
0.000388
0.010612
150966
0
191
-0.288134
4.390682e-01
0.307361
0.013981
-0.151416
-0.212130
-5.703754e+08
0.152376
...
0.920475
-7.241401e-01
-7.241401e-01
-7.241401e-01
-0.178334
0.005236
0.000000e+00
0.000000e+00
0.000388
0.012024
151793
0
192
-0.291011
4.366171e-01
0.275937
0.013269
-0.154833
-0.210690
-5.703754e+08
0.149374
...
0.930115
-7.603424e-01
-7.603424e-01
-7.603424e-01
-0.179226
0.005236
0.000000e+00
0.000000e+00
0.000388
-0.014977
152619
0
193
-0.295671
4.326472e-01
0.225041
0.012116
-0.160367
-0.208358
-5.703754e+08
0.144512
...
0.945728
-8.189780e-01
-8.189780e-01
-8.189780e-01
-0.180672
0.005236
0.000000e+00
0.000000e+00
0.000388
0.010760
153444
0
194
-0.297551
4.310463e-01
0.204517
0.011651
-0.162599
-0.207418
-5.703754e+08
0.142551
...
0.952024
-8.426234e-01
-8.426234e-01
-8.426234e-01
-0.181255
0.005236
0.000000e+00
0.000000e+00
0.000388
0.012923
154270
0
195
-0.299183
4.296562e-01
0.186694
0.011247
-0.164536
-0.206602
-5.703754e+08
0.140849
...
0.957491
-8.631561e-01
-8.631561e-01
-8.631561e-01
-0.181762
0.005236
0.000000e+00
0.000000e+00
0.000388
0.022528
155095
0
196
-0.300600
4.284486e-01
0.171213
0.010897
-0.166220
-0.205892
-5.703754e+08
0.139370
...
0.962240
-8.809918e-01
-8.809918e-01
-8.809918e-01
-0.182201
0.005236
0.000000e+00
0.000000e+00
0.000388
-0.026797
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
1678413
0
1783
0.029335
-5.027509e-02
-0.003322
0.031089
-0.473021
-0.077385
3.716965e-01
-0.092991
...
0.294015
-1.739833e-27
-1.260800e-23
-1.008935e-42
-0.096673
-0.047351
1.530497e-08
-6.291904e-07
-0.004992
-0.011444
1679486
0
1784
0.029756
-5.024469e-02
-0.003379
0.033569
-0.476485
-0.084699
3.706125e-01
-0.092883
...
0.278514
-1.318547e-27
-9.555080e-24
-7.651090e-43
-0.100545
-0.036978
1.159900e-08
-4.768372e-07
-0.005937
0.011999
1680559
0
1785
0.029926
-5.023238e-02
-0.003402
0.034574
-0.477889
-0.087662
3.701733e-01
-0.092840
...
0.272234
-1.147862e-27
-8.318181e-24
-6.656168e-43
-0.102114
-0.091188
1.009751e-08
-4.151109e-07
-0.009025
0.020277
1681632
0
1786
0.030075
-5.022166e-02
-0.003422
0.035448
-0.479111
-0.090242
3.697910e-01
-0.092802
...
0.266767
-9.992718e-28
-7.241397e-24
-5.801376e-43
-0.103480
-0.094736
8.790395e-09
-3.613750e-07
-0.010514
0.048122
1682705
0
1787
0.030204
-5.021232e-02
-0.003439
0.036210
-0.480175
-0.092488
3.694582e-01
-0.092768
...
0.262008
-8.699167e-28
-6.304002e-24
-5.044674e-43
-0.104669
-0.105505
7.652483e-09
-3.145952e-07
-0.017143
-0.014449
1683778
0
1788
0.030317
-5.020420e-02
-0.003454
0.036873
-0.481101
-0.094443
3.691684e-01
-0.092740
...
0.257864
-7.573065e-28
-5.487953e-24
-4.400077e-43
-0.105704
-0.052454
6.661874e-09
-2.738710e-07
-0.017226
-0.004970
1684851
0
1789
0.030574
-5.018561e-02
-0.003489
0.038389
-0.483220
-0.098916
3.685054e-01
-0.092674
...
0.248384
-4.996359e-28
-3.620698e-24
-2.900688e-43
-0.108072
0.018029
4.395198e-09
-1.806875e-07
-0.025375
0.001802
1685923
0
1790
0.030639
-5.018094e-02
-0.003498
0.038770
-0.483751
-0.100039
3.683390e-01
-0.092657
...
0.246004
-4.349583e-28
-3.152001e-24
-2.522337e-43
-0.108666
0.020061
3.826242e-09
-1.572976e-07
-0.023762
0.003436
1686995
0
1791
0.030695
-5.017688e-02
-0.003505
0.039102
-0.484214
-0.101017
3.681941e-01
-0.092643
...
0.243932
-3.786532e-28
-2.743976e-24
-2.200039e-43
-0.109184
0.050118
3.330937e-09
-1.369355e-07
-0.024146
-0.004264
1688067
0
1792
0.030744
-5.017334e-02
-0.003512
0.039390
-0.484618
-0.101868
3.680680e-01
-0.092630
...
0.242129
-3.296368e-28
-2.388770e-24
-1.905766e-43
-0.109634
0.056490
2.899749e-09
-1.192093e-07
-0.023694
-0.006907
1689139
0
1793
0.030787
-5.017026e-02
-0.003517
0.039641
-0.484968
-0.102609
3.679582e-01
-0.092619
...
0.240559
-2.869655e-28
-2.079545e-24
-1.667545e-43
-0.110027
0.068677
2.524378e-09
-1.037777e-07
-0.021098
-0.004992
1690211
0
1794
0.030856
-5.016525e-02
-0.003527
0.040050
-0.485540
-0.103815
3.677794e-01
-0.092602
...
0.238002
-2.174792e-28
-1.576001e-24
-1.261169e-43
-0.110665
0.068767
1.913121e-09
-7.864880e-08
-0.021999
-0.002802
1691284
0
1795
0.030884
-5.016322e-02
-0.003531
0.040216
-0.485771
-0.104304
3.677069e-01
-0.092594
...
0.236966
-1.893266e-28
-1.371988e-24
-1.093013e-43
-0.110924
0.082079
1.665468e-09
-6.846776e-08
-0.019079
-0.000695
1692357
0
1796
0.030930
-5.015991e-02
-0.003537
0.040486
-0.486148
-0.105100
3.675890e-01
-0.092583
...
0.235279
-1.434827e-28
-1.039773e-24
-8.267661e-44
-0.111345
0.072209
1.262189e-09
-5.188886e-08
-0.025620
0.002240
1693430
0
1797
0.030948
-5.015857e-02
-0.003539
0.040595
-0.486301
-0.105422
3.675412e-01
-0.092578
...
0.234596
-1.249090e-28
-9.051746e-25
-7.286752e-44
-0.111516
0.077314
1.098799e-09
-4.517187e-08
-0.023317
-0.000972
1694503
0
1798
0.030965
-5.015740e-02
-0.003541
0.040691
-0.486434
-0.105703
3.674996e-01
-0.092574
...
0.234001
-1.087396e-28
-7.880003e-25
-6.305843e-44
-0.111665
0.062890
9.565604e-10
-3.932440e-08
-0.026073
0.002504
1695576
0
1799
0.030979
-5.015639e-02
-0.003543
0.044831
-0.486550
-0.101101
3.674634e-01
-0.094245
...
0.226356
-9.466331e-29
-6.859941e-25
-5.465064e-44
-0.116042
0.079037
8.327342e-10
-3.423388e-08
-0.020253
-0.001514
1696660
0
1800
0.030991
-5.015550e-02
-0.003545
0.048435
-0.486651
-0.097094
3.674318e-01
-0.095700
...
0.219701
-8.240919e-29
-5.971925e-25
-4.764415e-44
-0.119852
0.056845
7.249372e-10
-2.980232e-08
-0.022716
-0.014502
1697744
0
1801
0.031002
-5.015473e-02
-0.003546
0.051573
-0.486738
-0.093606
3.674044e-01
-0.096966
...
0.213907
-7.174137e-29
-5.198863e-25
-4.203895e-44
-0.123169
0.044515
6.310945e-10
-2.594443e-08
-0.025760
0.001356
1698828
0
1802
0.031011
-5.015406e-02
-0.003548
0.054305
-0.486815
-0.090569
3.673805e-01
-0.098068
...
0.208863
-6.245449e-29
-4.525873e-25
-3.643376e-44
-0.126056
0.039230
5.493997e-10
-2.258594e-08
-0.022167
-0.006496
1699912
0
1803
0.031019
-5.015348e-02
-0.003549
0.056682
-0.486881
-0.087926
3.673597e-01
-0.099028
...
0.204473
-5.436979e-29
-3.940001e-25
-3.222986e-44
-0.128570
0.034210
4.782802e-10
-1.966220e-08
-0.025424
0.000646
1700996
0
1804
0.031026
-5.015297e-02
-0.003550
0.058753
-0.486939
-0.085625
3.673416e-01
-0.099864
...
0.200650
-4.733165e-29
-3.429970e-25
-2.802597e-44
-0.130759
0.036258
4.163671e-10
-1.711694e-08
-0.027991
-0.007014
1702080
0
1805
0.031032
-5.015253e-02
-0.003550
0.060555
-0.486989
-0.083621
3.673258e-01
-0.100591
...
0.197323
-4.120460e-29
-2.985963e-25
-2.382207e-44
-0.132664
0.037274
3.624686e-10
-1.490116e-08
-0.029492
0.008276
1703164
0
1806
0.031037
-5.015215e-02
-0.003551
0.062124
-0.487033
-0.081877
3.673121e-01
-0.101224
...
0.194426
-3.587069e-29
-2.599432e-25
-2.101948e-44
-0.134322
0.013720
3.155473e-10
-1.297221e-08
-0.030470
-0.004655
1704248
0
1807
0.031042
-5.015181e-02
-0.003552
0.063489
-0.487071
-0.080359
3.673001e-01
-0.101776
...
0.191904
-3.122724e-29
-2.262936e-25
-1.821688e-44
-0.135766
0.027679
2.746998e-10
-1.129297e-08
-0.029216
-0.007162
1705332
0
1808
0.031046
-5.015152e-02
-0.003552
0.064678
-0.487105
-0.079037
3.672897e-01
-0.102255
...
0.189709
-2.718490e-29
-1.970001e-25
-1.541428e-44
-0.137023
-0.008870
2.391401e-10
-9.831099e-09
-0.027297
0.005913
1706416
0
1809
0.031053
-5.015104e-02
-0.003553
0.066615
-0.487159
-0.076885
3.672728e-01
-0.103037
...
0.186134
-2.060230e-29
-1.492981e-25
-1.261169e-44
-0.139070
-0.011822
1.812343e-10
-7.450581e-09
-0.027407
0.001084
1707500
0
1810
0.031055
-5.015085e-02
-0.003554
0.067399
-0.487181
-0.076013
3.672659e-01
-0.103353
...
0.184685
-1.793534e-29
-1.299716e-25
-9.809089e-45
-0.139899
0.108225
1.577736e-10
-6.486107e-09
-0.032493
0.014193
1708584
0
1811
0.031058
-5.015069e-02
-0.003554
0.068082
-0.487200
-0.075254
3.672599e-01
-0.103629
...
0.183424
-1.561362e-29
-1.131468e-25
-8.407791e-45
-0.140621
0.118004
1.373499e-10
-5.646484e-09
-0.032447
0.017506
1709670
0
1812
0.031061
-5.015041e-02
-0.003554
0.069194
-0.487231
-0.074018
3.672502e-01
-0.104078
...
0.181371
-1.183291e-29
-8.574926e-26
-7.006492e-45
-0.141797
0.155133
1.040918e-10
-4.279235e-09
-0.028572
-0.001499
1646 rows × 111 columns
In [36]:
# For every id in `uids`, collect the rows of `timeseries_id` carrying that id
# (one sub-frame per id, in the order the ids appear in `uids`).
train_data = [timeseries_id.loc[timeseries_id['id'] == uid] for uid in uids]
In [42]:
print(train_data[500].describe() )
train_data[90].describe()
id timestamp derived_0 derived_1 derived_2 derived_3 \
count 167.0 167.00000 167.000000 1.670000e+02 167.000000 167.000000
mean 760.0 1729.00000 2.665702 1.851362e+10 -1.412422 -0.542541
std 0.0 48.35287 1.296966 1.185383e+11 1.188876 0.098337
min 760.0 1646.00000 -4.536046 3.712765e-01 -2.474511 -0.737826
25% 760.0 1687.50000 2.084843 4.033775e-01 -2.108023 -0.576306
50% 760.0 1729.00000 2.729487 4.152939e-01 -1.935057 -0.503850
75% 760.0 1770.50000 3.381960 8.561365e-01 -1.265020 -0.464237
max 760.0 1812.00000 3.853158 7.729436e+11 0.880080 -0.437878
derived_4 fundamental_0 fundamental_1 fundamental_2 ... \
count 167.000000 167.000000 1.670000e+02 167.000000 ...
mean 0.640865 -0.211127 -5.703760e+08 0.570303 ...
std 2.730414 0.063717 6.419248e+02 0.230059 ...
min 0.100831 -0.304139 -5.703754e+08 -0.162295 ...
25% 0.206776 -0.267548 -5.703754e+08 0.428843 ...
50% 0.221496 -0.220222 -5.703754e+08 0.460351 ...
75% 0.247066 -0.166967 -5.703754e+08 0.715237 ...
max 18.016613 -0.020409 -5.703754e+08 1.033149 ...
technical_36 technical_37 technical_38 technical_39 technical_40 \
count 167.000000 1.670000e+02 1.670000e+02 1.670000e+02 167.000000
mean 0.042319 -1.052911e-01 -1.050643e-01 -1.048560e-01 0.153309
std 0.794744 2.472035e-01 2.472209e-01 2.472445e-01 0.080489
min -1.222611 -9.376735e-01 -9.376735e-01 -9.376735e-01 0.049083
25% -0.581855 -2.094509e-02 -2.094509e-02 -2.094509e-02 0.093031
50% -0.005949 -1.442857e-05 -1.442857e-05 -1.442857e-05 0.130116
75% 0.710346 -1.147246e-08 -1.147246e-08 -1.147246e-08 0.212634
max 1.350116 0.000000e+00 0.000000e+00 0.000000e+00 0.306145
technical_41 technical_42 technical_43 technical_44 y
count 167.000000 1.670000e+02 1.670000e+02 1.670000e+02 167.000000
mean -0.067410 -1.035178e-01 -2.434016e-01 3.881482e-04 0.001822
std 0.102435 2.475667e-01 5.809696e-01 7.005927e-10 0.027742
min -0.307961 -9.376735e-01 -2.000000e+00 3.881475e-04 -0.086094
25% -0.135019 -1.699966e-02 -2.407625e-02 3.881475e-04 -0.014582
50% -0.010967 -1.442857e-05 -1.903862e-05 3.881475e-04 0.002450
75% 0.005236 -1.147246e-08 -1.513801e-08 3.881475e-04 0.015327
max 0.074351 0.000000e+00 -1.197886e-11 3.881475e-04 0.093498
[8 rows x 111 columns]
Out[42]:
id
timestamp
derived_0
derived_1
derived_2
derived_3
derived_4
fundamental_0
fundamental_1
fundamental_2
...
technical_36
technical_37
technical_38
technical_39
technical_40
technical_41
technical_42
technical_43
technical_44
y
count
1813.0
1813.000000
1813.000000
1.813000e+03
1813.000000
1813.000000
1813.000000
1813.000000
1.813000e+03
1813.000000
...
1813.000000
1.813000e+03
1.813000e+03
1.813000e+03
1813.000000
1813.000000
1.813000e+03
1813.000000
1813.000000
1813.000000
mean
132.0
906.000000
-0.058425
8.953012e+09
0.146766
0.703138
-0.083834
0.190869
-5.703642e+08
-0.122751
...
-0.119251
-6.804787e-02
-7.749242e-02
-6.822543e-02
0.469998
0.007877
-1.069208e-02
-1.181274
-0.000033
0.000061
std
0.0
523.512337
0.213262
8.273049e+10
0.103685
2.230596
0.322826
0.089175
1.120296e+04
0.240167
...
0.817104
2.016541e-01
2.135926e-01
2.023427e-01
0.229366
0.111792
1.706771e-01
0.942628
0.025838
0.029507
min
132.0
0.000000
-0.338433
-3.131947e-02
-0.332033
-7.054357
-0.609691
-0.100612
-5.703754e+08
-0.429032
...
-1.360596
-9.527674e-01
-9.527674e-01
-9.527674e-01
0.043785
-0.327233
-9.457232e-01
-2.000000
-0.057559
-0.086094
25%
132.0
453.000000
-0.217740
1.147535e-01
0.074607
0.459400
-0.350128
0.120739
-5.703754e+08
-0.266960
...
-0.738286
-3.046177e-04
-1.399656e-03
-3.046177e-04
0.250156
-0.071838
-1.106077e-20
-2.000000
-0.020096
-0.016046
50%
132.0
906.000000
-0.161497
3.381517e-01
0.131323
0.547394
-0.076328
0.185428
-5.703754e+08
-0.212201
...
-0.244904
-1.264501e-10
-8.023565e-09
-2.607062e-12
0.552837
0.002232
0.000000e+00
-1.984375
-0.000609
-0.001044
75%
132.0
1359.000000
0.119724
1.438533e+00
0.233826
0.607379
0.177735
0.273472
-5.703754e+08
-0.033086
...
0.221536
-6.972643e-22
-2.164439e-18
-6.900835e-35
0.631811
0.093350
0.000000e+00
-0.000488
0.019493
0.013955
max
132.0
1812.000000
0.360814
7.729436e+11
0.321231
19.607140
0.427289
0.344082
-5.703754e+08
0.975261
...
2.643460
0.000000e+00
0.000000e+00
0.000000e+00
0.848786
0.313070
9.375000e-01
0.000000
0.065233
0.093498
8 rows × 111 columns
In [43]:
len(train_data)
Out[43]:
1424
In [44]:
Out[44]:
id
timestamp
derived_0
derived_1
derived_2
derived_3
derived_4
fundamental_0
fundamental_1
fundamental_2
...
technical_36
technical_37
technical_38
technical_39
technical_40
technical_41
technical_42
technical_43
technical_44
y
count
1646.0
1646.000000
1646.000000
1.646000e+03
1646.000000
1646.000000
1646.000000
1646.000000
1.646000e+03
1646.000000
...
1646.000000
1.646000e+03
1.646000e+03
1.646000e+03
1646.000000
1646.000000
1.646000e+03
1.646000e+03
1646.000000
1646.000000
mean
0.0
989.500000
-0.001228
1.408767e+09
-0.004749
-0.008533
0.111429
-0.182586
-4.262226e+07
0.176767
...
0.108057
-1.302352e-01
-1.226205e-01
-7.148141e-02
-0.071777
0.006662
1.168664e-02
-5.813643e-01
-0.014462
0.000207
std
0.0
475.303587
0.333821
3.297888e+10
0.103862
0.054967
0.816958
0.074010
1.500248e+08
0.112926
...
0.374263
2.689950e-01
2.764083e-01
2.133210e-01
0.070447
0.096715
1.869833e-01
8.603244e-01
0.024462
0.014373
min
0.0
167.000000
-4.536046
-5.430397e-02
-0.332033
-0.504601
-0.487231
-0.380160
-5.703754e+08
-0.162295
...
-0.553699
-9.970396e-01
-9.974228e-01
-9.974228e-01
-0.242448
-0.238166
-9.455906e-01
-2.000000e+00
-0.078894
-0.086094
25%
0.0
578.250000
-0.169424
-4.109198e-02
-0.040009
-0.050176
-0.177244
-0.250514
3.485617e-01
0.120773
...
-0.157843
-5.859375e-02
-1.710002e-02
-2.266130e-04
-0.122406
-0.056985
-2.438194e-28
-1.747775e+00
-0.031335
-0.007402
50%
0.0
989.500000
-0.064549
9.054013e-02
-0.012615
0.009540
0.131383
-0.181862
4.936251e-01
0.169546
...
0.038023
-4.492915e-05
-2.087333e-06
-2.759641e-10
-0.071140
0.004271
2.261187e-38
-1.062609e-04
-0.009130
0.000147
75%
0.0
1400.750000
0.246447
1.836513e-01
0.004717
0.035317
0.305429
-0.115067
5.513806e-01
0.228592
...
0.346111
-9.501463e-11
-3.333576e-11
-1.752180e-19
-0.016527
0.065895
7.723607e-10
-1.713779e-11
0.000700
0.007355
max
0.0
1812.000000
0.473889
7.729436e+11
0.935920
0.072241
18.016613
-0.020409
6.205065e-01
0.549132
...
1.404450
0.000000e+00
0.000000e+00
0.000000e+00
0.058438
0.325816
9.526339e-01
0.000000e+00
0.041618
0.093498
8 rows × 111 columns
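cf. the NumPy `dstack` documentation (https://docs.scipy.org/doc/numpy/reference/generated/numpy.dstack.html) and this Stack Overflow question on converting a list of 2D numpy arrays into one 3D numpy array (http://stackoverflow.com/questions/4341359/convert-a-list-of-2d-numpy-arrays-to-one-3d-numpy-array).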
We can also make this into a 3-dimensional numpy array:
In [48]:
# NOTE: this call raises the ValueError shown in the output below. np.dstack concatenates
# along the third axis and requires all other dimensions of its inputs to match, but the
# per-id frames in train_data have different numbers of rows (e.g. 167 vs 1813 in the
# describe() outputs above), so they cannot be stacked as they are.
train_data_3d = np.dstack(train_data)
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-48-eb3fb5792645> in <module>()
----> 1 train_data_3d = np.dstack(train_data)
/home/topolo/Public/anaconda2/lib/python2.7/site-packages/numpy/lib/shape_base.pyc in dstack(tup)
366
367 """
--> 368 return _nx.concatenate([atleast_3d(_m) for _m in tup], 2)
369
370 def _replace_zero_by_x_arrays(sub_arys):
ValueError: all the input array dimensions except for the concatenation axis must match exactly
In [ ]:
# clean with mean: keep every non-null entry and replace each NaN with the mean of its column
timeseries_pd_meanclean = timeseries_pd.where( pd.notnull(timeseries_pd),timeseries_pd.mean(),axis='columns')
In [ ]:
# order by id, and by timestamp within each id; sort_values returns a new frame,
# so timeseries_pd_meanclean itself keeps its original row order
timeseries_id=timeseries_pd_meanclean.sort_values(by=["id", "timestamp"])
In [ ]:
uids = timeseries_id['id'].unique()
In [ ]:
# One sub-frame per id: for each entry of `uids`, select the rows of
# `timeseries_id` whose 'id' column equals it.
train_data = [timeseries_id.loc[timeseries_id['id'] == uid] for uid in uids]
Simulating what kaggle uses to train on
In [10]:
# data input to train on
# keeps only the rows whose timestamp is strictly below 907
# NOTE(review): presumably 907 is meant as the midpoint of the 0..1812 timestamp range seen
# in the describe() outputs -- TODO confirm, and consider naming this cutoff as a constant
obs_trainon = timeseries_pd[timeseries_pd["timestamp"]<907]
In [34]:
# summary statistics; the trailing ';' suppresses the rendered output, so nothing is shown or kept
obs_trainon.describe();
# keep every non-null entry and replace each NaN with the mean of its column
obs_trainon_meanclean = obs_trainon.where(pd.notnull(obs_trainon),obs_trainon.mean(),axis='columns')
# NOTE(review): sort_values returns a new frame and its result is not assigned here, so
# obs_trainon_meanclean is left in its original row order after this line
obs_trainon_meanclean.sort_values(by=['id','timestamp']);
In [ ]:
In [ ]:
In [ ]:
In [35]:
def clean_tseries(timeseries_pd):
    """Mean-impute a time-series frame and split it into one DataFrame per id.

    NOTE: this notebook defines `clean_tseries` again in later cells; whichever
    definition was executed last is the one in effect.

    Parameters
    ----------
    timeseries_pd : pandas.DataFrame
        Frame containing at least the columns 'id' and 'timestamp'.

    Returns
    -------
    list of pandas.DataFrame
        One frame per distinct id, in ascending id order. Each frame holds the
        rows of that id in ascending timestamp order, with NaNs replaced by the
        mean of the corresponding column of `timeseries_pd`.
    """
    # clean the data
    # I chose to fill in missing values, NaN values, with the mean, due to the distribution of the data
    tseries_meanclean = timeseries_pd.where(
        pd.notnull(timeseries_pd), timeseries_pd.mean(), axis='columns')
    # order by id. We want the first index to be id=1,2,..m, m representing number of training examples
    tseries_id = tseries_meanclean.sort_values(by=['id', 'timestamp'])
    # groupby splits the sorted frame in a single pass (instead of one full boolean-mask
    # scan per id); it yields the groups in ascending id order and keeps the timestamp
    # ordering of the rows inside each group.
    return [rows_for_id for _, rows_for_id in tseries_id.groupby('id')]
In [36]:
res_clean_obs_trainon = clean_tseries(obs_trainon)
print(len(res_clean_obs_trainon)); res_clean_obs_trainon[0].describe()
1096
Out[36]:
id
timestamp
derived_0
derived_1
derived_2
derived_3
derived_4
fundamental_0
fundamental_1
fundamental_2
...
technical_36
technical_37
technical_38
technical_39
technical_40
technical_41
technical_42
technical_43
technical_44
y
count
740.0
740.000000
740.000000
7.400000e+02
740.000000
740.000000
740.000000
740.000000
7.400000e+02
740.000000
...
740.000000
740.000000
7.400000e+02
7.400000e+02
740.000000
740.000000
7.400000e+02
7.400000e+02
740.000000
740.000000
mean
0.0
536.500000
0.141298
1.294029e+09
0.029532
-0.059602
0.143820
-0.245960
-2.082033e+08
0.155819
...
0.327425
-0.196187
-1.561827e-01
-9.037122e-02
-0.133291
0.016009
8.418077e-04
-3.366559e-01
-0.012644
0.000317
std
0.0
213.763888
0.673985
2.029559e+10
0.152071
0.058863
2.384773
0.050656
4.666271e+08
0.063896
...
0.372466
0.311861
3.046142e-01
2.271253e-01
0.048187
0.104661
1.870003e-01
6.936044e-01
0.018649
0.015126
min
0.0
167.000000
-9.452177
1.022506e-01
-0.834537
-0.855791
-0.421443
-0.380160
-1.252604e+09
-0.249810
...
-0.270343
-0.997040
-9.970396e-01
-9.526339e-01
-0.242448
-0.212149
-9.455906e-01
-1.999907e+00
-0.057820
-0.060769
25%
0.0
351.750000
-0.034855
1.390509e-01
-0.013344
-0.074317
-0.340190
-0.265878
3.921026e-01
0.105054
...
0.020599
-0.291292
-8.936096e-02
-6.354555e-03
-0.170361
-0.051960
-5.884874e-13
-1.620358e-02
-0.026290
-0.007691
50%
0.0
536.500000
0.285073
1.961368e-01
0.000810
-0.054055
-0.117074
-0.253719
5.271003e-01
0.159256
...
0.339271
-0.006377
-1.661682e-04
-1.459134e-06
-0.126920
0.006004
-6.565064e-27
-2.764836e-09
-0.007597
0.000309
75%
0.0
721.250000
0.438258
2.748602e-01
0.021877
-0.040042
0.441190
-0.210890
5.566207e-01
0.212425
...
0.496720
-0.000029
-6.355221e-08
-1.268952e-11
-0.090822
0.086496
0.000000e+00
-3.330669e-16
0.000783
0.007995
max
0.0
906.000000
0.473889
3.191938e+11
0.935920
0.028222
37.087269
-0.005858
6.205065e-01
0.277171
...
1.404450
0.000000
0.000000e+00
0.000000e+00
0.031995
0.325816
9.526339e-01
0.000000e+00
0.027206
0.093498
8 rows × 111 columns
In [37]:
res_clean_obs_trainon[0].values.shape
Out[37]:
(740, 111)
In [40]:
res_clean_obs_trainon[0].drop(['y'],axis=1).describe()
Out[40]:
id
timestamp
derived_0
derived_1
derived_2
derived_3
derived_4
fundamental_0
fundamental_1
fundamental_2
...
technical_35
technical_36
technical_37
technical_38
technical_39
technical_40
technical_41
technical_42
technical_43
technical_44
count
740.0
740.000000
740.000000
7.400000e+02
740.000000
740.000000
740.000000
740.000000
7.400000e+02
740.000000
...
740.000000
740.000000
740.000000
7.400000e+02
7.400000e+02
740.000000
740.000000
7.400000e+02
7.400000e+02
740.000000
mean
0.0
536.500000
0.141298
1.294029e+09
0.029532
-0.059602
0.143820
-0.245960
-2.082033e+08
0.155819
...
0.379662
0.327425
-0.196187
-1.561827e-01
-9.037122e-02
-0.133291
0.016009
8.418077e-04
-3.366559e-01
-0.012644
std
0.0
213.763888
0.673985
2.029559e+10
0.152071
0.058863
2.384773
0.050656
4.666271e+08
0.063896
...
0.359621
0.372466
0.311861
3.046142e-01
2.271253e-01
0.048187
0.104661
1.870003e-01
6.936044e-01
0.018649
min
0.0
167.000000
-9.452177
1.022506e-01
-0.834537
-0.855791
-0.421443
-0.380160
-1.252604e+09
-0.249810
...
-0.126267
-0.270343
-0.997040
-9.970396e-01
-9.526339e-01
-0.242448
-0.212149
-9.455906e-01
-1.999907e+00
-0.057820
25%
0.0
351.750000
-0.034855
1.390509e-01
-0.013344
-0.074317
-0.340190
-0.265878
3.921026e-01
0.105054
...
0.093098
0.020599
-0.291292
-8.936096e-02
-6.354555e-03
-0.170361
-0.051960
-5.884874e-13
-1.620358e-02
-0.026290
50%
0.0
536.500000
0.285073
1.961368e-01
0.000810
-0.054055
-0.117074
-0.253719
5.271003e-01
0.159256
...
0.337073
0.339271
-0.006377
-1.661682e-04
-1.459134e-06
-0.126920
0.006004
-6.565064e-27
-2.764836e-09
-0.007597
75%
0.0
721.250000
0.438258
2.748602e-01
0.021877
-0.040042
0.441190
-0.210890
5.566207e-01
0.212425
...
0.570238
0.496720
-0.000029
-6.355221e-08
-1.268952e-11
-0.090822
0.086496
0.000000e+00
-3.330669e-16
0.000783
max
0.0
906.000000
0.473889
3.191938e+11
0.935920
0.028222
37.087269
-0.005858
6.205065e-01
0.277171
...
1.204843
1.404450
0.000000
0.000000e+00
0.000000e+00
0.031995
0.325816
9.526339e-01
0.000000e+00
0.027206
8 rows × 110 columns
In [44]:
print( type(res_clean_obs_trainon[0]['y']) )
print( type(res_clean_obs_trainon[0][['y']]))
print( res_clean_obs_trainon[0][['y']].values.shape)
<class 'pandas.core.series.Series'>
<class 'pandas.core.frame.DataFrame'>
(740, 1)
So we actually want the numpy array for calculations.
In [29]:
def clean_tseries(timeseries_pd):
    """Mean-impute a time-series frame and split it into one numpy array per id.

    NOTE: this notebook defines `clean_tseries` in several cells; whichever
    definition was executed last is the one in effect.

    Parameters
    ----------
    timeseries_pd : pandas.DataFrame
        Frame containing at least the columns 'id' and 'timestamp'.

    Returns
    -------
    list of numpy.ndarray
        One 2-d array per distinct id, in ascending id order. Each array is the
        `.values` of that id's rows (all columns), ordered by ascending
        timestamp, with NaNs replaced by the mean of the corresponding column
        of `timeseries_pd`.
    """
    # clean the data
    # I chose to fill in missing values, NaN values, with the mean, due to the distribution of the data
    tseries_meanclean = timeseries_pd.where(
        pd.notnull(timeseries_pd), timeseries_pd.mean(), axis='columns')
    # order by id. We want the first index to be id=1,2,..m, m representing number of training examples
    tseries_id = tseries_meanclean.sort_values(by=['id', 'timestamp'])
    # groupby splits the sorted frame in a single pass (instead of one full boolean-mask
    # scan per id); groups come out in ascending id order with their row order preserved.
    return [rows_for_id.values for _, rows_for_id in tseries_id.groupby('id')]
In [30]:
res_clean_obs_trainon = clean_tseries(obs_trainon)
print(len(res_clean_obs_trainon)); res_clean_obs_trainon[0].shape
1096
Out[30]:
(740, 111)
So we actually need to split up the input data $X$ from the output data $y$
In [47]:
def clean_tseries(timeseries_pd):
    """Mean-impute a time-series frame and split it into per-id (X, y) arrays.

    NOTE: this notebook defines `clean_tseries` in several cells; whichever
    definition was executed last is the one in effect.

    Parameters
    ----------
    timeseries_pd : pandas.DataFrame
        Frame containing at least the columns 'id', 'timestamp' and 'y'.

    Returns
    -------
    list of (numpy.ndarray, numpy.ndarray)
        One `(X, y)` tuple per distinct id, in ascending id order. `X` is the
        2-d array of every column except 'y'; `y` is the 'y' column as a 2-d
        array with a single column. Rows are in ascending timestamp order and
        NaNs are replaced by the mean of the corresponding column of
        `timeseries_pd`.
    """
    # clean the data
    # I chose to fill in missing values, NaN values, with the mean, due to the distribution of the data
    tseries_meanclean = timeseries_pd.where(
        pd.notnull(timeseries_pd), timeseries_pd.mean(), axis='columns')
    # order by id. We want the first index to be id=1,2,..m, m representing number of training examples
    tseries_id = tseries_meanclean.sort_values(by=['id', 'timestamp'])
    # groupby splits the sorted frame in a single pass (instead of one full boolean-mask
    # scan per id); groups come out in ascending id order with their row order preserved.
    # Selecting with [['y']] (a list) keeps y two-dimensional: shape (rows, 1).
    return [
        (rows_for_id.drop(['y'], axis=1).values, rows_for_id[['y']].values)
        for _, rows_for_id in tseries_id.groupby('id')
    ]
In [48]:
res_clean_obs_trainon = clean_tseries(obs_trainon)
In [51]:
print( len(res_clean_obs_trainon))
print( type( res_clean_obs_trainon[0] ) ); print(len(res_clean_obs_trainon[0]));
print( res_clean_obs_trainon[0][0].shape);print(res_clean_obs_trainon[0][1].shape)
1096
<type 'tuple'>
2
(740, 110)
(740, 1)
Simulating observations, features
In [39]:
np.array( [3]).shape
Out[39]:
(1,)
So we also need a function that cleans the test data $X$ we are going to predict on.
In [53]:
def clean_test(timeseries_pd):
    """Mean-impute a feature frame and split it into one numpy array per id.

    Parameters
    ----------
    timeseries_pd : pandas.DataFrame
        Frame containing at least the columns 'id' and 'timestamp'.

    Returns
    -------
    list of numpy.ndarray
        One 2-d array per distinct id, in ascending id order. Each array is the
        `.values` of that id's rows (all columns), ordered by ascending
        timestamp, with NaNs replaced by the mean of the corresponding column
        of `timeseries_pd`.
    """
    # clean the data
    # I chose to fill in missing values, NaN values, with the mean, due to the distribution of the data
    tseries_meanclean = timeseries_pd.where(
        pd.notnull(timeseries_pd), timeseries_pd.mean(), axis='columns')
    # order by id. We want the first index to be id=1,2,..m, m representing number of training examples
    tseries_id = tseries_meanclean.sort_values(by=['id', 'timestamp'])
    # groupby splits the sorted frame in a single pass (instead of one full boolean-mask
    # scan per id); groups come out in ascending id order with their row order preserved.
    return [rows_for_id.values for _, rows_for_id in tseries_id.groupby('id')]
In [65]:
#this corresponds to what kaggle calls FEATURES, features
# NOTE(review): obs_trainon keeps timestamp < 907 while this keeps timestamp == 908, so rows
# with timestamp 907 belong to neither set -- confirm that skipping 907 is intended
obs_predicton = timeseries_pd[timeseries_pd["timestamp"]==908]
# drop the target column 'y' so that only the input columns remain
obs_predictonX = obs_predicton.drop('y',axis=1)
In [66]:
obs_predicton_cleaned = clean_test( obs_predictonX)
In [69]:
print(type(obs_predicton_cleaned));print(type(obs_predicton_cleaned[0]));print(obs_predicton_cleaned[0].shape)
<type 'list'>
<type 'numpy.ndarray'>
(1, 110)
In [72]:
obs_predicton_cleaned[5][0][0]
Out[72]:
15.0
In [73]:
def id_only(cleaned_X_data):
    """Extract one integer id from every per-id block of cleaned data.

    Parameters
    ----------
    cleaned_X_data : sequence of 2-d array-likes
        Each element is indexed as `element[0][0]`, i.e. the first entry of
        its first row is taken to be the id.
        NOTE(review): this relies on 'id' being the first column -- verify
        against the frame the blocks were built from.

    Returns
    -------
    list of int
        `int(element[0][0])` for every element, in input order.
    """
    # Iterate the blocks directly rather than by index, and avoid shadowing the builtin `id`.
    return [int(block[0][0]) for block in cleaned_X_data]
In [74]:
res_id_only = id_only(obs_predicton_cleaned)
In [79]:
res_id_only[:10]
print(len(res_id_only))
968
In [ ]:
def just_id(cleaned_X_data, predicted_y):
    """Unfinished stub; the intended behaviour was never written down.

    The original cell contained only the commented-out length check below,
    which is not a valid function body. Until the function is implemented it
    raises instead of silently returning None.

    Raises
    ------
    NotImplementedError
        Always.
    """
    # assert len(cleaned_X_data) == len(predicted_y)
    raise NotImplementedError("just_id is an unfinished stub")
In [61]:
print( np.array( [[5],[3],[2]]).shape)
np.array( [[5]]).flatten()[0]
(3, 1)
Out[61]:
5
In [76]:
obs_predictedy = obs_predicton['y']
In [83]:
# simulate what I'm going to get
# -1 lets numpy infer the number of rows from the data, so this cell no longer
# depends on the slice holding exactly 968 observations
list( obs_predictedy.values.reshape(-1,1,1) )[0]
Out[83]:
array([[-0.00385292]], dtype=float32)
In [84]:
def y_only(predicted_y):
    """Pull the first scalar out of every predicted array.

    Parameters
    ----------
    predicted_y : iterable of numpy.ndarray
        Each element must provide `.flatten()`; only its first entry is used.

    Returns
    -------
    list
        The first flattened entry of every element, in input order.
    """
    return [prediction.flatten()[0] for prediction in predicted_y]
In [87]:
# reshape(-1,1,1): numpy infers the number of rows, so the cell does not break when the
# prediction slice holds a number of observations other than 968
res_y_only = y_only( list( obs_predictedy.values.reshape(-1,1,1) ) )
In [88]:
print(len(res_y_only)); res_y_only[:10]
968
Out[88]:
[-0.0038529162,
0.0028961198,
-0.00094434741,
-0.005380407,
-0.01242116,
-0.014861807,
-0.026671588,
-0.012705223,
0.0045782495,
-0.01963657]
In [89]:
pd_predictedon = pd.DataFrame.from_dict( dict(id=res_id_only,y=res_y_only))
In [90]:
pd_predictedon.describe()
Out[90]:
id
y
count
968.000000
968.000000
mean
1094.757231
-0.001006
std
632.309033
0.014307
min
0.000000
-0.086094
25%
550.250000
-0.007348
50%
1099.500000
-0.000869
75%
1660.500000
0.005278
max
2156.000000
0.093498
In [91]:
pd_predictedon.isnull().describe()
Out[91]:
id
y
count
968
968
unique
1
1
top
False
False
freq
968
968
In [92]:
t_all_cleaned = clean_tseries(timeseries_pd)
In [93]:
len(t_all_cleaned)
Out[93]:
1424
In [99]:
print( type(t_all_cleaned[0]));print( len(t_all_cleaned[0]));
print( t_all_cleaned[0][0].shape); print(t_all_cleaned[0][1].shape)
<type 'tuple'>
2
(1646, 110)
(1646, 1)
In [100]:
for i in range(13):
print( t_all_cleaned[i][0].shape, t_all_cleaned[i][1].shape )
((1646, 110), (1646, 1))
((728, 110), (728, 1))
((1543, 110), (1543, 1))
((116, 110), (116, 1))
((1813, 110), (1813, 1))
((1813, 110), (1813, 1))
((1543, 110), (1543, 1))
((218, 110), (218, 1))
((1340, 110), (1340, 1))
((1745, 110), (1745, 1))
((779, 110), (779, 1))
((218, 110), (218, 1))
((829, 110), (829, 1))
In [1]:
import theano
WARNING (theano.sandbox.cuda): The cuda backend is deprecated and will be removed in the next release (v0.10). Please switch to the gpuarray backend. You can get more information about how to switch at this URL:
https://github.com/Theano/Theano/wiki/Converting-to-the-new-gpu-back-end%28gpuarray%29
Using gpu device 0: GeForce GTX 980 Ti (CNMeM is enabled with initial size: 80.0% of memory, cuDNN 5105)
In [2]:
theano.version.full_version
Out[2]:
'0.9.0rc2.dev-19540d4e3064fe0dc0e1281f517bad0f355e46a2'
In [ ]:
Content source: ernestyalumni/MLgrabbag
Similar notebooks: