In [1]:
# IPython line magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
import matplotlib.pyplot as plt
import sklearn
from sklearn import datasets

In [3]:
import os
import sys

# Show where the notebook is running and which files sit next to it.
working_dir = os.getcwd()
print( working_dir )
print( os.listdir( working_dir ) )


/home/topolo/PropD/MLgrabbag/kaggle
['.ipynb_checkpoints', 'glass-classification.zip', 'kaggle.ipynb', 'glass.csv', 'train.h5.zip', 'train.h5']

In [4]:
import numpy as np
import scipy

In [5]:
import pandas as pd

Two Sigma: Financial Time Series


In [6]:
# Re-list the working directory; 'train.h5' (read in the next cell) should appear here.
print( os.listdir( os.getcwd() ))


['.ipynb_checkpoints', 'glass-classification.zip', 'kaggle.ipynb', 'glass.csv', 'train.h5.zip', 'train.h5']

In [7]:
# Load the training set from the HDF5 file in the working directory into a DataFrame.
# No `key` is passed - presumably the file holds a single pandas object; verify if the file changes.
timeseries_pd = pd.read_hdf( 'train.h5')

In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column.
# `count` excludes NaN, so it also shows how many values are missing per column.
timeseries_pd.describe()


Out[8]:
id timestamp derived_0 derived_1 derived_2 derived_3 derived_4 fundamental_0 fundamental_1 fundamental_2 ... technical_36 technical_37 technical_38 technical_39 technical_40 technical_41 technical_42 technical_43 technical_44 y
count 1.710756e+06 1.710756e+06 1.637797e+06 1.629727e+06 1.312105e+06 1.561285e+06 1.304298e+06 1.686809e+06 1.031686e+06 1.341916e+06 ... 1.708204e+06 1.691591e+06 1.691591e+06 1.690740e+06 1.708520e+06 1.666567e+06 1.690755e+06 1.706070e+06 1.473977e+06 1.710756e+06
mean 1.093858e+03 9.456257e+02 -4.536046e+00 7.729436e+11 -3.320328e-01 -5.046012e-01 1.801661e+01 -2.040938e-02 -5.703754e+08 -1.622954e-01 ... -8.584833e-02 -9.103397e-02 -8.156685e-02 -7.287001e-02 4.908321e-02 5.236218e-03 -1.699966e-02 -9.735299e-01 3.881475e-04 2.217509e-04
std 6.308563e+02 5.195685e+02 2.497382e+02 7.620606e+13 6.519810e+01 1.020749e+02 9.258360e+02 2.494859e-01 7.502322e+10 3.668149e+00 ... 6.125852e-01 2.471038e-01 2.346534e-01 2.235729e-01 3.102316e-01 1.133733e-01 2.116284e-01 9.605551e-01 3.011983e-02 2.240643e-02
min 0.000000e+00 0.000000e+00 -2.017497e+04 -7.375435e-02 -9.848880e+03 -3.434176e+04 -8.551914e+03 -2.344957e+00 -1.043737e+13 -1.077101e+03 ... -1.687572e+00 -1.000000e+00 -1.000000e+00 -1.000000e+00 -5.250904e-01 -4.449529e-01 -1.000000e+00 -2.000000e+00 -1.265686e-01 -8.609413e-02
25% 5.500000e+02 5.040000e+02 -1.449710e-01 -2.956479e-02 -5.967524e-02 -1.655826e-01 -1.057050e-01 -1.996543e-01 -1.960470e-01 -2.280967e-01 ... -4.050297e-01 -4.651562e-04 -1.992532e-04 -2.203252e-05 -1.521701e-01 -7.377038e-02 -3.887695e-15 -2.000000e+00 -1.998819e-02 -9.561389e-03
50% 1.098000e+03 9.560000e+02 -8.368272e-04 5.523058e-03 2.109505e-02 2.475614e-03 1.175234e-02 -4.064488e-02 -7.395084e-03 -3.029069e-02 ... -8.502064e-02 -3.951567e-12 -1.418487e-13 -1.591224e-16 -1.476793e-02 9.782702e-05 0.000000e+00 -6.597540e-01 1.117279e-05 -1.570681e-04
75% 1.657000e+03 1.401000e+03 1.199108e-01 1.078554e-01 1.952209e-01 3.037236e-01 1.556464e-01 1.303819e-01 1.832071e-01 1.764751e-01 ... 1.909600e-01 -5.219879e-40 0.000000e+00 0.000000e+00 1.772415e-01 7.855728e-02 0.000000e+00 -5.188884e-08 2.047074e-02 9.520990e-03
max 2.158000e+03 1.812000e+03 3.252527e+03 1.068448e+16 3.823001e+03 1.239737e+03 6.785965e+04 1.378195e+00 5.203165e+02 7.677125e+01 ... 4.957758e+01 0.000000e+00 0.000000e+00 0.000000e+00 1.569265e+00 6.844833e-01 1.000000e+00 0.000000e+00 1.435858e-01 9.349781e-02

8 rows × 111 columns


In [9]:
# Peek at the first five rows of the raw frame (NaN marks missing values).
timeseries_pd.head()


Out[9]:
id timestamp derived_0 derived_1 derived_2 derived_3 derived_4 fundamental_0 fundamental_1 fundamental_2 ... technical_36 technical_37 technical_38 technical_39 technical_40 technical_41 technical_42 technical_43 technical_44 y
0 10 0 0.370326 -0.006316 0.222831 -0.213030 0.729277 -0.335633 0.113292 1.621238 ... 0.775208 NaN NaN NaN -0.414776 NaN NaN -2.0 NaN -0.011753
1 11 0 0.014765 -0.038064 -0.017425 0.320652 -0.034134 0.004413 0.114285 -0.210185 ... 0.025590 NaN NaN NaN -0.273607 NaN NaN -2.0 NaN -0.001240
2 12 0 -0.010622 -0.050577 3.379575 -0.157525 -0.068550 -0.155937 1.219439 -0.764516 ... 0.151881 NaN NaN NaN -0.175710 NaN NaN -2.0 NaN -0.020940
3 25 0 NaN NaN NaN NaN NaN 0.178495 NaN -0.007262 ... 1.035936 NaN NaN NaN -0.211506 NaN NaN -2.0 NaN -0.015959
4 26 0 0.176693 -0.025284 -0.057680 0.015100 0.180894 0.139445 -0.125687 -0.018707 ... 0.630232 NaN NaN NaN -0.001957 NaN NaN 0.0 NaN -0.007338

5 rows × 111 columns


In [10]:
# Column labels of the loaded frame (the repr elides the middle of the index).
timeseries_pd.columns


Out[10]:
Index([u'id', u'timestamp', u'derived_0', u'derived_1', u'derived_2',
       u'derived_3', u'derived_4', u'fundamental_0', u'fundamental_1',
       u'fundamental_2',
       ...
       u'technical_36', u'technical_37', u'technical_38', u'technical_39',
       u'technical_40', u'technical_41', u'technical_42', u'technical_43',
       u'technical_44', u'y'],
      dtype='object', length=111)

In [11]:
# Print the number of columns, then every column name on its own line
# (the Index repr in the previous cell elides the middle of the list).
# print() is called as a function throughout so the cell runs on both
# Python 2 and Python 3; the bare `print col` statement is Python 2 only.
print( len(timeseries_pd.columns) )
for col in timeseries_pd.columns:
    print( col )


111
id
timestamp
derived_0
derived_1
derived_2
derived_3
derived_4
fundamental_0
fundamental_1
fundamental_2
fundamental_3
fundamental_5
fundamental_6
fundamental_7
fundamental_8
fundamental_9
fundamental_10
fundamental_11
fundamental_12
fundamental_13
fundamental_14
fundamental_15
fundamental_16
fundamental_17
fundamental_18
fundamental_19
fundamental_20
fundamental_21
fundamental_22
fundamental_23
fundamental_24
fundamental_25
fundamental_26
fundamental_27
fundamental_28
fundamental_29
fundamental_30
fundamental_31
fundamental_32
fundamental_33
fundamental_34
fundamental_35
fundamental_36
fundamental_37
fundamental_38
fundamental_39
fundamental_40
fundamental_41
fundamental_42
fundamental_43
fundamental_44
fundamental_45
fundamental_46
fundamental_47
fundamental_48
fundamental_49
fundamental_50
fundamental_51
fundamental_52
fundamental_53
fundamental_54
fundamental_55
fundamental_56
fundamental_57
fundamental_58
fundamental_59
fundamental_60
fundamental_61
fundamental_62
fundamental_63
technical_0
technical_1
technical_2
technical_3
technical_5
technical_6
technical_7
technical_9
technical_10
technical_11
technical_12
technical_13
technical_14
technical_16
technical_17
technical_18
technical_19
technical_20
technical_21
technical_22
technical_24
technical_25
technical_27
technical_28
technical_29
technical_30
technical_31
technical_32
technical_33
technical_34
technical_35
technical_36
technical_37
technical_38
technical_39
technical_40
technical_41
technical_42
technical_43
technical_44
y

In [19]:
# Show the (id, timestamp) pair of every row; pandas truncates the display.
# For reference, the timestamp column alone reports: Name: timestamp, dtype: int16
timeseries_pd[["id","timestamp"]]


Out[19]:
id timestamp
0 10 0
1 11 0
2 12 0
3 25 0
4 26 0
5 27 0
6 31 0
7 38 0
8 39 0
9 40 0
10 41 0
11 43 0
12 44 0
13 49 0
14 54 0
15 59 0
16 60 0
17 62 0
18 63 0
19 68 0
20 69 0
21 70 0
22 76 0
23 79 0
24 80 0
25 82 0
26 83 0
27 85 0
28 87 0
29 90 0
... ... ...
1710726 2100 1812
1710727 2101 1812
1710728 2102 1812
1710729 2104 1812
1710730 2107 1812
1710731 2108 1812
1710732 2109 1812
1710733 2114 1812
1710734 2117 1812
1710735 2118 1812
1710736 2120 1812
1710737 2121 1812
1710738 2126 1812
1710739 2129 1812
1710740 2130 1812
1710741 2131 1812
1710742 2137 1812
1710743 2138 1812
1710744 2139 1812
1710745 2140 1812
1710746 2142 1812
1710747 2145 1812
1710748 2146 1812
1710749 2148 1812
1710750 2149 1812
1710751 2150 1812
1710752 2151 1812
1710753 2154 1812
1710754 2156 1812
1710755 2158 1812

1710756 rows × 2 columns


In [18]:
# The timestamp column on its own, as a Series (its dtype is shown on the last output line).
timeseries_pd["timestamp"]


Out[18]:
0             0
1             0
2             0
3             0
4             0
5             0
6             0
7             0
8             0
9             0
10            0
11            0
12            0
13            0
14            0
15            0
16            0
17            0
18            0
19            0
20            0
21            0
22            0
23            0
24            0
25            0
26            0
27            0
28            0
29            0
           ... 
1710726    1812
1710727    1812
1710728    1812
1710729    1812
1710730    1812
1710731    1812
1710732    1812
1710733    1812
1710734    1812
1710735    1812
1710736    1812
1710737    1812
1710738    1812
1710739    1812
1710740    1812
1710741    1812
1710742    1812
1710743    1812
1710744    1812
1710745    1812
1710746    1812
1710747    1812
1710748    1812
1710749    1812
1710750    1812
1710751    1812
1710752    1812
1710753    1812
1710754    1812
1710755    1812
Name: timestamp, dtype: int16

Total number of data points in time series


In [45]:
# Non-null entries per column; a value below the row count means that column has missing data.
timeseries_pd.count()


Out[45]:
id                1710756
timestamp         1710756
derived_0         1637797
derived_1         1629727
derived_2         1312105
derived_3         1561285
derived_4         1304298
fundamental_0     1686809
fundamental_1     1031686
fundamental_2     1341916
fundamental_3     1256376
fundamental_5      748736
fundamental_6     1009131
fundamental_7     1684416
fundamental_8     1337590
fundamental_9     1145189
fundamental_10    1597779
fundamental_11    1341916
fundamental_12    1599885
fundamental_13    1355618
fundamental_14    1354672
fundamental_15    1355859
fundamental_16    1355618
fundamental_17    1613534
fundamental_18    1694923
fundamental_19    1656168
fundamental_20    1599885
fundamental_21    1656423
fundamental_22    1152268
fundamental_23    1354033
                   ...   
technical_13      1705992
technical_14      1696572
technical_16      1690775
technical_17      1706477
technical_18      1690740
technical_19      1708436
technical_20      1705992
technical_21      1708520
technical_22      1710756
technical_24      1639610
technical_25      1502700
technical_27      1708336
technical_28      1447840
technical_29      1649141
technical_30      1705992
technical_31      1528078
technical_32      1691591
technical_33      1696221
technical_34      1710756
technical_35      1707601
technical_36      1708204
technical_37      1691591
technical_38      1691591
technical_39      1690740
technical_40      1708520
technical_41      1666567
technical_42      1690755
technical_43      1706070
technical_44      1473977
y                 1710756
dtype: int64

In [47]:
# Total number of cells (rows x columns) in the frame; this figure includes NaN cells.
timeseries_pd.size


Out[47]:
189893916

Dealing with Missing Values, NaN

cf. https://gallery.cortanaintelligence.com/Experiment/Methods-for-handling-missing-values-1

  1. Replace missing values with the mean. For this age data, we assume that missing values are distributed similarly to the values that are present. The formal name for this assumption is Missing Completely at Random (MCAR). In this case, substituting values that represent the existing distribution, such as the mean, is a reasonable approach.

  2. Replace missing values with the median. This is another justifiable way to handle missing-at-random data, although note that it gives a different answer. For categorical data, it's also common to use the mode, the most commonly occurring value.

cf. http://pandas.pydata.org/pandas-docs/stable/missing_data.html

The sections of that page that proved most useful were:

Cleaning / filling missing data

Filling with a PandasObject


In [13]:
# Summarise the boolean missing-value mask. `top` is the most frequent value per column and
# `freq` its frequency, so where top is False, `freq` is the number of non-missing entries.
timeseries_pd.isnull().describe()


Out[13]:
id timestamp derived_0 derived_1 derived_2 derived_3 derived_4 fundamental_0 fundamental_1 fundamental_2 ... technical_36 technical_37 technical_38 technical_39 technical_40 technical_41 technical_42 technical_43 technical_44 y
count 1710756 1710756 1710756 1710756 1710756 1710756 1710756 1710756 1710756 1710756 ... 1710756 1710756 1710756 1710756 1710756 1710756 1710756 1710756 1710756 1710756
unique 1 1 2 2 2 2 2 2 2 2 ... 2 2 2 2 2 2 2 2 2 1
top False False False False False False False False False False ... False False False False False False False False False False
freq 1710756 1710756 1637797 1629727 1312105 1561285 1304298 1686809 1031686 1341916 ... 1708204 1691591 1691591 1690740 1708520 1666567 1690755 1706070 1473977 1710756

4 rows × 111 columns

Clean with the mean: fill in the missing values of each column with that column's mean.


In [14]:
# Mean imputation: replace every NaN with the mean of the column it sits in.
# fillna() given a Series fills column by column, matching the Series index to column labels.
# NOTE(review): the means are taken over all ids and timestamps and are outlier-sensitive
# (e.g. derived_1 has mean ~7.7e11 but median ~5.5e-03 in the describe() output above) -
# confirm that this is the intended fill value.
column_means = timeseries_pd.mean()
timeseries_pd_meanclean = timeseries_pd.fillna( column_means )

In [15]:
# Re-check the summary after imputation: every column's `count` should now equal the row count.
timeseries_pd_meanclean.describe()


Out[15]:
id timestamp derived_0 derived_1 derived_2 derived_3 derived_4 fundamental_0 fundamental_1 fundamental_2 ... technical_36 technical_37 technical_38 technical_39 technical_40 technical_41 technical_42 technical_43 technical_44 y
count 1.710756e+06 1.710756e+06 1.710756e+06 1.710756e+06 1.710756e+06 1.710756e+06 1.710756e+06 1.710756e+06 1.710756e+06 1.710756e+06 ... 1.710756e+06 1.710756e+06 1.710756e+06 1.710756e+06 1.710756e+06 1.710756e+06 1.710756e+06 1.710756e+06 1.710756e+06 1.710756e+06
mean 1.093858e+03 9.456257e+02 -4.534660e+00 7.727040e+11 -3.316913e-01 -5.042495e-01 1.801324e+01 -2.040735e-02 -5.611223e+08 -1.615499e-01 ... -8.584631e-02 -9.103718e-02 -8.156337e-02 -7.286777e-02 4.908380e-02 5.236934e-03 -1.700031e-02 -9.735575e-01 3.879543e-04 2.217509e-04
std 6.308563e+02 5.195685e+02 2.443549e+02 7.437944e+13 5.709856e+01 9.751381e+01 8.084039e+02 2.477336e-01 5.826071e+10 3.248749e+00 ... 6.121281e-01 2.457155e-01 2.333340e-01 2.222618e-01 3.100289e-01 1.118994e-01 2.103878e-01 9.592382e-01 2.795785e-02 2.240643e-02
min 0.000000e+00 0.000000e+00 -2.017497e+04 -7.375435e-02 -9.848880e+03 -3.434176e+04 -8.551914e+03 -2.344957e+00 -1.043737e+13 -1.077101e+03 ... -1.687572e+00 -1.000000e+00 -1.000000e+00 -1.000000e+00 -5.250904e-01 -4.449529e-01 -1.000000e+00 -2.000000e+00 -1.265686e-01 -8.609413e-02
25% 5.500000e+02 5.040000e+02 -1.849245e-01 -2.829417e-02 -3.320328e-01 -2.661774e-01 -5.365840e-02 -1.969607e-01 -5.703754e+08 -1.646032e-01 ... -4.043798e-01 -7.892920e-04 -3.068867e-04 -4.336545e-05 -1.519630e-01 -7.160673e-02 -3.515464e-14 -2.000000e+00 -1.638420e-02 -9.561389e-03
50% 1.098000e+03 9.560000e+02 -9.243710e-03 1.066268e-02 -3.129806e-02 -2.266392e-02 7.860678e-02 -3.671852e-02 -2.737567e-01 -1.286000e-01 ... -8.584833e-02 -6.932156e-12 -2.836974e-13 -3.655687e-16 -1.434174e-02 3.836489e-03 0.000000e+00 -6.804921e-01 3.881475e-04 -1.570681e-04
75% 1.657000e+03 1.401000e+03 1.128421e-01 1.460314e-01 1.219615e-01 2.595344e-01 1.802310e+00 1.273931e-01 5.088157e-02 1.083336e-01 ... 1.903692e-01 -3.863118e-39 0.000000e+00 0.000000e+00 1.768057e-01 7.607020e-02 0.000000e+00 -5.451053e-08 1.680637e-02 9.520990e-03
max 2.158000e+03 1.812000e+03 3.252527e+03 1.068448e+16 3.823001e+03 1.239737e+03 6.785965e+04 1.378195e+00 5.203165e+02 7.677125e+01 ... 4.957758e+01 0.000000e+00 0.000000e+00 0.000000e+00 1.569265e+00 6.844833e-01 1.000000e+00 0.000000e+00 1.435858e-01 9.349781e-02

8 rows × 111 columns


In [17]:
# Confirm no missing values remain: each column of the not-null mask should hold the single value True.
timeseries_pd_meanclean.notnull().describe()


Out[17]:
id timestamp derived_0 derived_1 derived_2 derived_3 derived_4 fundamental_0 fundamental_1 fundamental_2 ... technical_36 technical_37 technical_38 technical_39 technical_40 technical_41 technical_42 technical_43 technical_44 y
count 1710756 1710756 1710756 1710756 1710756 1710756 1710756 1710756 1710756 1710756 ... 1710756 1710756 1710756 1710756 1710756 1710756 1710756 1710756 1710756 1710756
unique 1 1 1 1 1 1 1 1 1 1 ... 1 1 1 1 1 1 1 1 1 1
top True True True True True True True True True True ... True True True True True True True True True True
freq 1710756 1710756 1710756 1710756 1710756 1710756 1710756 1710756 1710756 1710756 ... 1710756 1710756 1710756 1710756 1710756 1710756 1710756 1710756 1710756 1710756

4 rows × 111 columns


In [19]:
# First five rows after imputation; the NaNs seen in the raw head() are now column means.
timeseries_pd_meanclean.head()


Out[19]:
id timestamp derived_0 derived_1 derived_2 derived_3 derived_4 fundamental_0 fundamental_1 fundamental_2 ... technical_36 technical_37 technical_38 technical_39 technical_40 technical_41 technical_42 technical_43 technical_44 y
0 10 0 0.370326 -6.316399e-03 0.222831 -0.213030 0.729277 -0.335633 1.132921e-01 1.621238 ... 0.775208 -0.091034 -0.081567 -0.07287 -0.414776 0.005236 -0.017 -2.0 0.000388 -0.011753
1 11 0 0.014765 -3.806422e-02 -0.017425 0.320652 -0.034134 0.004413 1.142851e-01 -0.210185 ... 0.025590 -0.091034 -0.081567 -0.07287 -0.273607 0.005236 -0.017 -2.0 0.000388 -0.001240
2 12 0 -0.010622 -5.057707e-02 3.379575 -0.157525 -0.068550 -0.155937 1.219439e+00 -0.764516 ... 0.151881 -0.091034 -0.081567 -0.07287 -0.175710 0.005236 -0.017 -2.0 0.000388 -0.020940
3 25 0 -4.536046 7.729436e+11 -0.332033 -0.504601 18.016613 0.178495 -5.703754e+08 -0.007262 ... 1.035936 -0.091034 -0.081567 -0.07287 -0.211506 0.005236 -0.017 -2.0 0.000388 -0.015959
4 26 0 0.176693 -2.528418e-02 -0.057680 0.015100 0.180894 0.139445 -1.256869e-01 -0.018707 ... 0.630232 -0.091034 -0.081567 -0.07287 -0.001957 0.005236 -0.017 0.0 0.000388 -0.007338

5 rows × 111 columns

"Each (financial) instrument has an id. "


In [24]:
# Order rows by instrument id, then by ascending timestamp within each id, so that each
# instrument's rows are contiguous. sort_values returns a new frame and keeps the original
# row labels (there is no reset_index).
timeseries_id=timeseries_pd_meanclean.sort_values(by=["id", "timestamp"])

In [25]:
# Summary of the sorted frame; sorting only reorders rows, so the statistics should match the
# imputed frame up to floating-point summation order.
timeseries_id.describe()


Out[25]:
id timestamp derived_0 derived_1 derived_2 derived_3 derived_4 fundamental_0 fundamental_1 fundamental_2 ... technical_36 technical_37 technical_38 technical_39 technical_40 technical_41 technical_42 technical_43 technical_44 y
count 1.710756e+06 1.710756e+06 1.710756e+06 1.710756e+06 1.710756e+06 1.710756e+06 1.710756e+06 1.710756e+06 1.710756e+06 1.710756e+06 ... 1.710756e+06 1.710756e+06 1.710756e+06 1.710756e+06 1.710756e+06 1.710756e+06 1.710756e+06 1.710756e+06 1.710756e+06 1.710756e+06
mean 1.093858e+03 9.456257e+02 -4.534488e+00 7.731216e+11 -3.314414e-01 -5.045264e-01 1.801171e+01 -2.040870e-02 -5.694613e+08 -1.612957e-01 ... -8.584719e-02 -9.105250e-02 -8.155458e-02 -7.286179e-02 4.907284e-02 5.236997e-03 -1.700201e-02 -9.735696e-01 3.883361e-04 2.217637e-04
std 6.308563e+02 5.195685e+02 2.443558e+02 7.437888e+13 5.710430e+01 9.752032e+01 8.084053e+02 2.477764e-01 5.826182e+10 3.247496e+00 ... 6.119142e-01 2.456984e-01 2.332277e-01 2.214861e-01 3.101637e-01 1.119004e-01 2.103856e-01 9.613923e-01 2.795772e-02 2.240271e-02
min 0.000000e+00 0.000000e+00 -2.017497e+04 -7.375435e-02 -9.848880e+03 -3.434176e+04 -8.551914e+03 -2.344957e+00 -1.043737e+13 -1.077101e+03 ... -1.687572e+00 -1.000000e+00 -1.000000e+00 -1.000000e+00 -5.250904e-01 -4.449529e-01 -1.000000e+00 -2.000000e+00 -1.265686e-01 -8.609413e-02
25% 5.500000e+02 5.040000e+02 -1.849245e-01 -2.829417e-02 -3.320328e-01 -2.661774e-01 -5.365840e-02 -1.969607e-01 -5.703754e+08 -1.646032e-01 ... -4.043798e-01 -7.892920e-04 -3.068867e-04 -4.336545e-05 -1.519630e-01 -7.160673e-02 -3.515464e-14 -2.000000e+00 -1.638420e-02 -9.561389e-03
50% 1.098000e+03 9.560000e+02 -9.243710e-03 1.066268e-02 -3.129806e-02 -2.266392e-02 7.860678e-02 -3.671852e-02 -2.737567e-01 -1.286000e-01 ... -8.584833e-02 -6.932156e-12 -2.836974e-13 -3.655687e-16 -1.434174e-02 3.836489e-03 0.000000e+00 -6.804921e-01 3.881475e-04 -1.570681e-04
75% 1.657000e+03 1.401000e+03 1.128421e-01 1.460314e-01 1.219615e-01 2.595344e-01 1.802310e+00 1.273931e-01 5.088157e-02 1.083336e-01 ... 1.903692e-01 -3.863118e-39 0.000000e+00 0.000000e+00 1.768057e-01 7.607020e-02 0.000000e+00 -5.451053e-08 1.680637e-02 9.520990e-03
max 2.158000e+03 1.812000e+03 3.252527e+03 1.068448e+16 3.823001e+03 1.239737e+03 6.785965e+04 1.378195e+00 5.203165e+02 7.677125e+01 ... 4.957758e+01 0.000000e+00 0.000000e+00 0.000000e+00 1.569265e+00 6.844833e-01 1.000000e+00 0.000000e+00 1.435858e-01 9.349781e-02

8 rows × 111 columns


In [26]:
# First five rows of the sorted frame: the earliest timestamps of the smallest id.
timeseries_id.head()


Out[26]:
id timestamp derived_0 derived_1 derived_2 derived_3 derived_4 fundamental_0 fundamental_1 fundamental_2 ... technical_36 technical_37 technical_38 technical_39 technical_40 technical_41 technical_42 technical_43 technical_44 y
131062 0 167 -4.536046 7.729436e+11 -0.332033 -0.504601 18.016613 -0.020409 -570375360.0 -0.162295 ... -0.085848 -0.091034 -0.081567 -0.07287 0.049083 0.005236 -0.017 -0.97353 0.000388 -0.007108
131895 0 168 -4.536046 7.729436e+11 -0.332033 -0.504601 18.016613 -0.020409 -570375360.0 -0.162295 ... -0.085848 -0.091034 -0.081567 -0.07287 0.049083 0.005236 -0.017 -0.97353 0.000388 0.001950
132728 0 169 -4.536046 7.729436e+11 -0.332033 -0.504601 18.016613 -0.020409 -570375360.0 -0.162295 ... -0.085848 -0.091034 -0.081567 -0.07287 0.049083 0.005236 -0.017 -0.97353 0.000388 0.017724
133561 0 170 -0.230583 4.880956e-01 0.935920 0.028222 -0.083071 -0.240929 -570375360.0 0.212425 ... 0.727659 0.000000 0.000000 0.00000 -0.160478 0.005236 0.000 0.00000 0.000388 0.012934
134393 0 171 -0.230583 4.880956e-01 0.935920 0.028222 -0.083071 -0.240929 -570375360.0 0.212425 ... 0.727659 0.000000 0.000000 0.00000 -0.160478 0.005236 0.000 0.00000 0.000388 -0.025229

5 rows × 111 columns


In [31]:
# Distinct instrument ids and how many of them there are.
# unique() is evaluated once and reused rather than scanning the id column twice.
instrument_ids = timeseries_id['id'].unique()
print( instrument_ids )
print( len( instrument_ids ) )


[   0    6    7 ..., 2155 2156 2158]
1424

In [32]:
# Rows per instrument id. count() tallies non-null entries per column, and no NaNs remain
# after the mean fill, so every column shows the same number for a given id.
timeseries_id.groupby('id').count()


Out[32]:
timestamp derived_0 derived_1 derived_2 derived_3 derived_4 fundamental_0 fundamental_1 fundamental_2 fundamental_3 ... technical_36 technical_37 technical_38 technical_39 technical_40 technical_41 technical_42 technical_43 technical_44 y
id
0 1646 1646 1646 1646 1646 1646 1646 1646 1646 1646 ... 1646 1646 1646 1646 1646 1646 1646 1646 1646 1646
6 728 728 728 728 728 728 728 728 728 728 ... 728 728 728 728 728 728 728 728 728 728
7 1543 1543 1543 1543 1543 1543 1543 1543 1543 1543 ... 1543 1543 1543 1543 1543 1543 1543 1543 1543 1543
10 116 116 116 116 116 116 116 116 116 116 ... 116 116 116 116 116 116 116 116 116 116
11 1813 1813 1813 1813 1813 1813 1813 1813 1813 1813 ... 1813 1813 1813 1813 1813 1813 1813 1813 1813 1813
12 1813 1813 1813 1813 1813 1813 1813 1813 1813 1813 ... 1813 1813 1813 1813 1813 1813 1813 1813 1813 1813
13 1543 1543 1543 1543 1543 1543 1543 1543 1543 1543 ... 1543 1543 1543 1543 1543 1543 1543 1543 1543 1543
14 218 218 218 218 218 218 218 218 218 218 ... 218 218 218 218 218 218 218 218 218 218
15 1340 1340 1340 1340 1340 1340 1340 1340 1340 1340 ... 1340 1340 1340 1340 1340 1340 1340 1340 1340 1340
16 1745 1745 1745 1745 1745 1745 1745 1745 1745 1745 ... 1745 1745 1745 1745 1745 1745 1745 1745 1745 1745
17 779 779 779 779 779 779 779 779 779 779 ... 779 779 779 779 779 779 779 779 779 779
18 218 218 218 218 218 218 218 218 218 218 ... 218 218 218 218 218 218 218 218 218 218
19 829 829 829 829 829 829 829 829 829 829 ... 829 829 829 829 829 829 829 829 829 829
20 1646 1646 1646 1646 1646 1646 1646 1646 1646 1646 ... 1646 1646 1646 1646 1646 1646 1646 1646 1646 1646
22 1442 1442 1442 1442 1442 1442 1442 1442 1442 1442 ... 1442 1442 1442 1442 1442 1442 1442 1442 1442 1442
23 218 218 218 218 218 218 218 218 218 218 ... 218 218 218 218 218 218 218 218 218 218
24 932 932 932 932 932 932 932 932 932 932 ... 932 932 932 932 932 932 932 932 932 932
25 1813 1813 1813 1813 1813 1813 1813 1813 1813 1813 ... 1813 1813 1813 1813 1813 1813 1813 1813 1813 1813
26 1813 1813 1813 1813 1813 1813 1813 1813 1813 1813 ... 1813 1813 1813 1813 1813 1813 1813 1813 1813 1813
27 159 159 159 159 159 159 159 159 159 159 ... 159 159 159 159 159 159 159 159 159 159
30 932 932 932 932 932 932 932 932 932 932 ... 932 932 932 932 932 932 932 932 932 932
31 1813 1813 1813 1813 1813 1813 1813 1813 1813 1813 ... 1813 1813 1813 1813 1813 1813 1813 1813 1813 1813
32 218 218 218 218 218 218 218 218 218 218 ... 218 218 218 218 218 218 218 218 218 218
33 422 422 422 422 422 422 422 422 422 422 ... 422 422 422 422 422 422 422 422 422 422
38 1813 1813 1813 1813 1813 1813 1813 1813 1813 1813 ... 1813 1813 1813 1813 1813 1813 1813 1813 1813 1813
39 1813 1813 1813 1813 1813 1813 1813 1813 1813 1813 ... 1813 1813 1813 1813 1813 1813 1813 1813 1813 1813
40 1813 1813 1813 1813 1813 1813 1813 1813 1813 1813 ... 1813 1813 1813 1813 1813 1813 1813 1813 1813 1813
41 1813 1813 1813 1813 1813 1813 1813 1813 1813 1813 ... 1813 1813 1813 1813 1813 1813 1813 1813 1813 1813
43 1813 1813 1813 1813 1813 1813 1813 1813 1813 1813 ... 1813 1813 1813 1813 1813 1813 1813 1813 1813 1813
44 1813 1813 1813 1813 1813 1813 1813 1813 1813 1813 ... 1813 1813 1813 1813 1813 1813 1813 1813 1813 1813
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2115 363 363 363 363 363 363 363 363 363 363 ... 363 363 363 363 363 363 363 363 363 363
2117 728 728 728 728 728 728 728 728 728 728 ... 728 728 728 728 728 728 728 728 728 728
2118 829 829 829 829 829 829 829 829 829 829 ... 829 829 829 829 829 829 829 829 829 829
2120 1745 1745 1745 1745 1745 1745 1745 1745 1745 1745 ... 1745 1745 1745 1745 1745 1745 1745 1745 1745 1745
2121 422 422 422 422 422 422 422 422 422 422 ... 422 422 422 422 422 422 422 422 422 422
2124 627 627 627 627 627 627 627 627 627 627 ... 627 627 627 627 627 627 627 627 627 627
2125 1128 1128 1128 1128 1128 1128 1128 1128 1128 1128 ... 1128 1128 1128 1128 1128 1128 1128 1128 1128 1128
2126 1813 1813 1813 1813 1813 1813 1813 1813 1813 1813 ... 1813 1813 1813 1813 1813 1813 1813 1813 1813 1813
2129 1813 1813 1813 1813 1813 1813 1813 1813 1813 1813 ... 1813 1813 1813 1813 1813 1813 1813 1813 1813 1813
2130 1745 1745 1745 1745 1745 1745 1745 1745 1745 1745 ... 1745 1745 1745 1745 1745 1745 1745 1745 1745 1745
2131 1813 1813 1813 1813 1813 1813 1813 1813 1813 1813 ... 1813 1813 1813 1813 1813 1813 1813 1813 1813 1813
2134 1203 1203 1203 1203 1203 1203 1203 1203 1203 1203 ... 1203 1203 1203 1203 1203 1203 1203 1203 1203 1203
2135 960 960 960 960 960 960 960 960 960 960 ... 960 960 960 960 960 960 960 960 960 960
2136 1290 1290 1290 1290 1290 1290 1290 1290 1290 1290 ... 1290 1290 1290 1290 1290 1290 1290 1290 1290 1290
2137 1646 1646 1646 1646 1646 1646 1646 1646 1646 1646 ... 1646 1646 1646 1646 1646 1646 1646 1646 1646 1646
2138 1813 1813 1813 1813 1813 1813 1813 1813 1813 1813 ... 1813 1813 1813 1813 1813 1813 1813 1813 1813 1813
2139 880 880 880 880 880 880 880 880 880 880 ... 880 880 880 880 880 880 880 880 880 880
2140 1442 1442 1442 1442 1442 1442 1442 1442 1442 1442 ... 1442 1442 1442 1442 1442 1442 1442 1442 1442 1442
2142 1813 1813 1813 1813 1813 1813 1813 1813 1813 1813 ... 1813 1813 1813 1813 1813 1813 1813 1813 1813 1813
2145 1646 1646 1646 1646 1646 1646 1646 1646 1646 1646 ... 1646 1646 1646 1646 1646 1646 1646 1646 1646 1646
2146 1813 1813 1813 1813 1813 1813 1813 1813 1813 1813 ... 1813 1813 1813 1813 1813 1813 1813 1813 1813 1813
2148 1813 1813 1813 1813 1813 1813 1813 1813 1813 1813 ... 1813 1813 1813 1813 1813 1813 1813 1813 1813 1813
2149 1813 1813 1813 1813 1813 1813 1813 1813 1813 1813 ... 1813 1813 1813 1813 1813 1813 1813 1813 1813 1813
2150 1646 1646 1646 1646 1646 1646 1646 1646 1646 1646 ... 1646 1646 1646 1646 1646 1646 1646 1646 1646 1646
2151 1745 1745 1745 1745 1745 1745 1745 1745 1745 1745 ... 1745 1745 1745 1745 1745 1745 1745 1745 1745 1745
2152 167 167 167 167 167 167 167 167 167 167 ... 167 167 167 167 167 167 167 167 167 167
2154 1813 1813 1813 1813 1813 1813 1813 1813 1813 1813 ... 1813 1813 1813 1813 1813 1813 1813 1813 1813 1813
2155 1657 1657 1657 1657 1657 1657 1657 1657 1657 1657 ... 1657 1657 1657 1657 1657 1657 1657 1657 1657 1657
2156 1813 1813 1813 1813 1813 1813 1813 1813 1813 1813 ... 1813 1813 1813 1813 1813 1813 1813 1813 1813 1813
2158 150 150 150 150 150 150 150 150 150 150 ... 150 150 150 150 150 150 150 150 150 150

1424 rows × 110 columns

Let uid $\in \mathbb{Z}_{\geq 0}$ (the ids start at 0) represent the unique id of each financial instrument; in pandas, the array of all uids is obtained with the command below. Each one of the uids will be a training example.


In [34]:
# Array of distinct instrument ids in order of first appearance (ascending here, because
# timeseries_id is sorted by id).
uids = timeseries_id['id'].unique()
print(uids)


[   0    6    7 ..., 2155 2156 2158]

In [35]:
# All rows belonging to the instrument with id 0, selected with a boolean mask passed to .loc.
timeseries_id.loc[ timeseries_id['id'] == 0 ] # this selects rows based on values in id column
# cf. http://stackoverflow.com/questions/17071871/select-rows-from-a-dataframe-based-on-values-in-a-column-in-pandas


Out[35]:
id timestamp derived_0 derived_1 derived_2 derived_3 derived_4 fundamental_0 fundamental_1 fundamental_2 ... technical_36 technical_37 technical_38 technical_39 technical_40 technical_41 technical_42 technical_43 technical_44 y
131062 0 167 -4.536046 7.729436e+11 -0.332033 -0.504601 18.016613 -0.020409 -5.703754e+08 -0.162295 ... -0.085848 -9.103397e-02 -8.156685e-02 -7.287001e-02 0.049083 0.005236 -1.699966e-02 -9.735299e-01 0.000388 -0.007108
131895 0 168 -4.536046 7.729436e+11 -0.332033 -0.504601 18.016613 -0.020409 -5.703754e+08 -0.162295 ... -0.085848 -9.103397e-02 -8.156685e-02 -7.287001e-02 0.049083 0.005236 -1.699966e-02 -9.735299e-01 0.000388 0.001950
132728 0 169 -4.536046 7.729436e+11 -0.332033 -0.504601 18.016613 -0.020409 -5.703754e+08 -0.162295 ... -0.085848 -9.103397e-02 -8.156685e-02 -7.287001e-02 0.049083 0.005236 -1.699966e-02 -9.735299e-01 0.000388 0.017724
133561 0 170 -0.230583 4.880956e-01 0.935920 0.028222 -0.083071 -0.240929 -5.703754e+08 0.212425 ... 0.727659 0.000000e+00 0.000000e+00 0.000000e+00 -0.160478 0.005236 0.000000e+00 0.000000e+00 0.000388 0.012934
134393 0 171 -0.230583 4.880956e-01 0.935920 0.028222 -0.083071 -0.240929 -5.703754e+08 0.212425 ... 0.727659 0.000000e+00 0.000000e+00 0.000000e+00 -0.160478 0.005236 0.000000e+00 0.000000e+00 0.000388 -0.025229
135224 0 172 -0.230583 4.880956e-01 0.935920 0.028222 -0.083071 -0.240929 -5.703754e+08 0.212425 ... 0.727659 0.000000e+00 0.000000e+00 0.000000e+00 -0.160478 0.005236 0.000000e+00 0.000000e+00 0.000388 -0.021411
136055 0 173 -0.230583 4.880956e-01 0.935920 0.028222 -0.083071 -0.240929 -5.703754e+08 0.212425 ... 0.727659 0.000000e+00 0.000000e+00 0.000000e+00 -0.160478 0.005236 0.000000e+00 0.000000e+00 0.000388 -0.030042
136885 0 174 -0.230583 4.880956e-01 0.935920 0.028222 -0.083071 -0.240929 -5.703754e+08 0.212425 ... 0.727659 0.000000e+00 0.000000e+00 0.000000e+00 -0.160478 0.005236 0.000000e+00 0.000000e+00 0.000388 -0.013961
137715 0 175 -0.230583 4.880956e-01 0.935920 0.028222 -0.083071 -0.240929 -5.703754e+08 0.212425 ... 0.727659 0.000000e+00 0.000000e+00 0.000000e+00 -0.160478 0.005236 0.000000e+00 0.000000e+00 0.000388 0.015330
138545 0 176 -0.230583 4.880956e-01 0.935920 0.028222 -0.083071 -0.240929 -5.703754e+08 0.212425 ... 0.727659 0.000000e+00 0.000000e+00 0.000000e+00 -0.160478 0.005236 0.000000e+00 0.000000e+00 0.000388 -0.011354
139374 0 177 -0.230583 4.880956e-01 0.935920 0.028222 -0.083071 -0.240929 -5.703754e+08 0.212425 ... 0.727659 0.000000e+00 0.000000e+00 0.000000e+00 -0.160478 0.005236 0.000000e+00 0.000000e+00 0.000388 0.011996
140203 0 178 -0.230583 4.880956e-01 0.935920 0.028222 -0.083071 -0.240929 -5.703754e+08 0.212425 ... 0.727659 0.000000e+00 0.000000e+00 0.000000e+00 -0.160478 0.005236 0.000000e+00 0.000000e+00 0.000388 0.010069
141032 0 179 -0.230583 4.880956e-01 0.935920 0.028222 -0.083071 -0.240929 -5.703754e+08 0.212425 ... 0.727659 0.000000e+00 0.000000e+00 0.000000e+00 -0.160478 0.005236 0.000000e+00 0.000000e+00 0.000388 0.010060
141861 0 180 -0.230583 4.880956e-01 0.935920 0.028222 -0.083071 -0.240929 -5.703754e+08 0.212425 ... 0.727659 0.000000e+00 0.000000e+00 0.000000e+00 -0.160478 0.005236 0.000000e+00 0.000000e+00 0.000388 0.027913
142690 0 181 -0.230583 4.880956e-01 0.935920 0.028222 -0.083071 -0.240929 -5.703754e+08 0.212425 ... 0.727659 0.000000e+00 0.000000e+00 0.000000e+00 -0.160478 0.005236 0.000000e+00 0.000000e+00 0.000388 0.022216
143518 0 182 -0.230583 4.880956e-01 0.935920 0.028222 -0.083071 -0.240929 -5.703754e+08 0.212425 ... 0.727659 0.000000e+00 0.000000e+00 0.000000e+00 -0.160478 0.005236 0.000000e+00 0.000000e+00 0.000388 0.025130
144346 0 183 -0.230583 4.880956e-01 0.935920 0.028222 -0.083071 -0.240929 -5.703754e+08 0.212425 ... 0.727659 0.000000e+00 0.000000e+00 0.000000e+00 -0.160478 0.005236 0.000000e+00 0.000000e+00 0.000388 0.017332
145173 0 184 -0.230583 4.880956e-01 0.935920 0.028222 -0.083071 -0.240929 -5.703754e+08 0.212425 ... 0.727659 0.000000e+00 0.000000e+00 0.000000e+00 -0.160478 0.005236 0.000000e+00 0.000000e+00 0.000388 0.008471
146000 0 185 -0.258631 4.642017e-01 0.629588 0.021282 -0.116380 -0.226894 -5.703754e+08 0.183160 ... 0.821629 -3.529145e-01 -3.529145e-01 -3.529145e-01 -0.169180 0.005236 0.000000e+00 0.000000e+00 0.000388 0.008027
146827 0 186 -0.265503 4.583476e-01 0.554534 0.019581 -0.124540 -0.223455 -5.703754e+08 0.175990 ... 0.844653 -4.393815e-01 -4.393815e-01 -4.393815e-01 -0.171312 0.005236 0.000000e+00 0.000000e+00 0.000388 0.016009
147655 0 187 -0.271432 4.532968e-01 0.489780 0.018114 -0.131581 -0.220488 -5.703754e+08 0.169803 ... 0.864517 -5.139828e-01 -5.139828e-01 -5.139828e-01 -0.173152 0.005236 0.000000e+00 0.000000e+00 0.000388 -0.007890
148483 0 188 -0.276553 4.489338e-01 0.433844 0.016847 -0.137663 -0.217925 -5.703754e+08 0.164460 ... 0.881675 -5.784236e-01 -5.784236e-01 -5.784236e-01 -0.174741 0.005236 0.000000e+00 0.000000e+00 0.000388 0.004876
149311 0 189 -0.280982 4.451612e-01 0.385478 0.015751 -0.142922 -0.215709 -5.703754e+08 0.159839 ... 0.896512 -6.341452e-01 -6.341452e-01 -6.341452e-01 -0.176115 0.005236 0.000000e+00 0.000000e+00 0.000388 0.008867
150138 0 190 -0.284815 4.418962e-01 0.343618 0.014803 -0.147474 -0.213791 -5.703754e+08 0.155840 ... 0.909353 -6.823705e-01 -6.823705e-01 -6.823705e-01 -0.177304 0.005236 0.000000e+00 0.000000e+00 0.000388 0.010612
150966 0 191 -0.288134 4.390682e-01 0.307361 0.013981 -0.151416 -0.212130 -5.703754e+08 0.152376 ... 0.920475 -7.241401e-01 -7.241401e-01 -7.241401e-01 -0.178334 0.005236 0.000000e+00 0.000000e+00 0.000388 0.012024
151793 0 192 -0.291011 4.366171e-01 0.275937 0.013269 -0.154833 -0.210690 -5.703754e+08 0.149374 ... 0.930115 -7.603424e-01 -7.603424e-01 -7.603424e-01 -0.179226 0.005236 0.000000e+00 0.000000e+00 0.000388 -0.014977
152619 0 193 -0.295671 4.326472e-01 0.225041 0.012116 -0.160367 -0.208358 -5.703754e+08 0.144512 ... 0.945728 -8.189780e-01 -8.189780e-01 -8.189780e-01 -0.180672 0.005236 0.000000e+00 0.000000e+00 0.000388 0.010760
153444 0 194 -0.297551 4.310463e-01 0.204517 0.011651 -0.162599 -0.207418 -5.703754e+08 0.142551 ... 0.952024 -8.426234e-01 -8.426234e-01 -8.426234e-01 -0.181255 0.005236 0.000000e+00 0.000000e+00 0.000388 0.012923
154270 0 195 -0.299183 4.296562e-01 0.186694 0.011247 -0.164536 -0.206602 -5.703754e+08 0.140849 ... 0.957491 -8.631561e-01 -8.631561e-01 -8.631561e-01 -0.181762 0.005236 0.000000e+00 0.000000e+00 0.000388 0.022528
155095 0 196 -0.300600 4.284486e-01 0.171213 0.010897 -0.166220 -0.205892 -5.703754e+08 0.139370 ... 0.962240 -8.809918e-01 -8.809918e-01 -8.809918e-01 -0.182201 0.005236 0.000000e+00 0.000000e+00 0.000388 -0.026797
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1678413 0 1783 0.029335 -5.027509e-02 -0.003322 0.031089 -0.473021 -0.077385 3.716965e-01 -0.092991 ... 0.294015 -1.739833e-27 -1.260800e-23 -1.008935e-42 -0.096673 -0.047351 1.530497e-08 -6.291904e-07 -0.004992 -0.011444
1679486 0 1784 0.029756 -5.024469e-02 -0.003379 0.033569 -0.476485 -0.084699 3.706125e-01 -0.092883 ... 0.278514 -1.318547e-27 -9.555080e-24 -7.651090e-43 -0.100545 -0.036978 1.159900e-08 -4.768372e-07 -0.005937 0.011999
1680559 0 1785 0.029926 -5.023238e-02 -0.003402 0.034574 -0.477889 -0.087662 3.701733e-01 -0.092840 ... 0.272234 -1.147862e-27 -8.318181e-24 -6.656168e-43 -0.102114 -0.091188 1.009751e-08 -4.151109e-07 -0.009025 0.020277
1681632 0 1786 0.030075 -5.022166e-02 -0.003422 0.035448 -0.479111 -0.090242 3.697910e-01 -0.092802 ... 0.266767 -9.992718e-28 -7.241397e-24 -5.801376e-43 -0.103480 -0.094736 8.790395e-09 -3.613750e-07 -0.010514 0.048122
1682705 0 1787 0.030204 -5.021232e-02 -0.003439 0.036210 -0.480175 -0.092488 3.694582e-01 -0.092768 ... 0.262008 -8.699167e-28 -6.304002e-24 -5.044674e-43 -0.104669 -0.105505 7.652483e-09 -3.145952e-07 -0.017143 -0.014449
1683778 0 1788 0.030317 -5.020420e-02 -0.003454 0.036873 -0.481101 -0.094443 3.691684e-01 -0.092740 ... 0.257864 -7.573065e-28 -5.487953e-24 -4.400077e-43 -0.105704 -0.052454 6.661874e-09 -2.738710e-07 -0.017226 -0.004970
1684851 0 1789 0.030574 -5.018561e-02 -0.003489 0.038389 -0.483220 -0.098916 3.685054e-01 -0.092674 ... 0.248384 -4.996359e-28 -3.620698e-24 -2.900688e-43 -0.108072 0.018029 4.395198e-09 -1.806875e-07 -0.025375 0.001802
1685923 0 1790 0.030639 -5.018094e-02 -0.003498 0.038770 -0.483751 -0.100039 3.683390e-01 -0.092657 ... 0.246004 -4.349583e-28 -3.152001e-24 -2.522337e-43 -0.108666 0.020061 3.826242e-09 -1.572976e-07 -0.023762 0.003436
1686995 0 1791 0.030695 -5.017688e-02 -0.003505 0.039102 -0.484214 -0.101017 3.681941e-01 -0.092643 ... 0.243932 -3.786532e-28 -2.743976e-24 -2.200039e-43 -0.109184 0.050118 3.330937e-09 -1.369355e-07 -0.024146 -0.004264
1688067 0 1792 0.030744 -5.017334e-02 -0.003512 0.039390 -0.484618 -0.101868 3.680680e-01 -0.092630 ... 0.242129 -3.296368e-28 -2.388770e-24 -1.905766e-43 -0.109634 0.056490 2.899749e-09 -1.192093e-07 -0.023694 -0.006907
1689139 0 1793 0.030787 -5.017026e-02 -0.003517 0.039641 -0.484968 -0.102609 3.679582e-01 -0.092619 ... 0.240559 -2.869655e-28 -2.079545e-24 -1.667545e-43 -0.110027 0.068677 2.524378e-09 -1.037777e-07 -0.021098 -0.004992
1690211 0 1794 0.030856 -5.016525e-02 -0.003527 0.040050 -0.485540 -0.103815 3.677794e-01 -0.092602 ... 0.238002 -2.174792e-28 -1.576001e-24 -1.261169e-43 -0.110665 0.068767 1.913121e-09 -7.864880e-08 -0.021999 -0.002802
1691284 0 1795 0.030884 -5.016322e-02 -0.003531 0.040216 -0.485771 -0.104304 3.677069e-01 -0.092594 ... 0.236966 -1.893266e-28 -1.371988e-24 -1.093013e-43 -0.110924 0.082079 1.665468e-09 -6.846776e-08 -0.019079 -0.000695
1692357 0 1796 0.030930 -5.015991e-02 -0.003537 0.040486 -0.486148 -0.105100 3.675890e-01 -0.092583 ... 0.235279 -1.434827e-28 -1.039773e-24 -8.267661e-44 -0.111345 0.072209 1.262189e-09 -5.188886e-08 -0.025620 0.002240
1693430 0 1797 0.030948 -5.015857e-02 -0.003539 0.040595 -0.486301 -0.105422 3.675412e-01 -0.092578 ... 0.234596 -1.249090e-28 -9.051746e-25 -7.286752e-44 -0.111516 0.077314 1.098799e-09 -4.517187e-08 -0.023317 -0.000972
1694503 0 1798 0.030965 -5.015740e-02 -0.003541 0.040691 -0.486434 -0.105703 3.674996e-01 -0.092574 ... 0.234001 -1.087396e-28 -7.880003e-25 -6.305843e-44 -0.111665 0.062890 9.565604e-10 -3.932440e-08 -0.026073 0.002504
1695576 0 1799 0.030979 -5.015639e-02 -0.003543 0.044831 -0.486550 -0.101101 3.674634e-01 -0.094245 ... 0.226356 -9.466331e-29 -6.859941e-25 -5.465064e-44 -0.116042 0.079037 8.327342e-10 -3.423388e-08 -0.020253 -0.001514
1696660 0 1800 0.030991 -5.015550e-02 -0.003545 0.048435 -0.486651 -0.097094 3.674318e-01 -0.095700 ... 0.219701 -8.240919e-29 -5.971925e-25 -4.764415e-44 -0.119852 0.056845 7.249372e-10 -2.980232e-08 -0.022716 -0.014502
1697744 0 1801 0.031002 -5.015473e-02 -0.003546 0.051573 -0.486738 -0.093606 3.674044e-01 -0.096966 ... 0.213907 -7.174137e-29 -5.198863e-25 -4.203895e-44 -0.123169 0.044515 6.310945e-10 -2.594443e-08 -0.025760 0.001356
1698828 0 1802 0.031011 -5.015406e-02 -0.003548 0.054305 -0.486815 -0.090569 3.673805e-01 -0.098068 ... 0.208863 -6.245449e-29 -4.525873e-25 -3.643376e-44 -0.126056 0.039230 5.493997e-10 -2.258594e-08 -0.022167 -0.006496
1699912 0 1803 0.031019 -5.015348e-02 -0.003549 0.056682 -0.486881 -0.087926 3.673597e-01 -0.099028 ... 0.204473 -5.436979e-29 -3.940001e-25 -3.222986e-44 -0.128570 0.034210 4.782802e-10 -1.966220e-08 -0.025424 0.000646
1700996 0 1804 0.031026 -5.015297e-02 -0.003550 0.058753 -0.486939 -0.085625 3.673416e-01 -0.099864 ... 0.200650 -4.733165e-29 -3.429970e-25 -2.802597e-44 -0.130759 0.036258 4.163671e-10 -1.711694e-08 -0.027991 -0.007014
1702080 0 1805 0.031032 -5.015253e-02 -0.003550 0.060555 -0.486989 -0.083621 3.673258e-01 -0.100591 ... 0.197323 -4.120460e-29 -2.985963e-25 -2.382207e-44 -0.132664 0.037274 3.624686e-10 -1.490116e-08 -0.029492 0.008276
1703164 0 1806 0.031037 -5.015215e-02 -0.003551 0.062124 -0.487033 -0.081877 3.673121e-01 -0.101224 ... 0.194426 -3.587069e-29 -2.599432e-25 -2.101948e-44 -0.134322 0.013720 3.155473e-10 -1.297221e-08 -0.030470 -0.004655
1704248 0 1807 0.031042 -5.015181e-02 -0.003552 0.063489 -0.487071 -0.080359 3.673001e-01 -0.101776 ... 0.191904 -3.122724e-29 -2.262936e-25 -1.821688e-44 -0.135766 0.027679 2.746998e-10 -1.129297e-08 -0.029216 -0.007162
1705332 0 1808 0.031046 -5.015152e-02 -0.003552 0.064678 -0.487105 -0.079037 3.672897e-01 -0.102255 ... 0.189709 -2.718490e-29 -1.970001e-25 -1.541428e-44 -0.137023 -0.008870 2.391401e-10 -9.831099e-09 -0.027297 0.005913
1706416 0 1809 0.031053 -5.015104e-02 -0.003553 0.066615 -0.487159 -0.076885 3.672728e-01 -0.103037 ... 0.186134 -2.060230e-29 -1.492981e-25 -1.261169e-44 -0.139070 -0.011822 1.812343e-10 -7.450581e-09 -0.027407 0.001084
1707500 0 1810 0.031055 -5.015085e-02 -0.003554 0.067399 -0.487181 -0.076013 3.672659e-01 -0.103353 ... 0.184685 -1.793534e-29 -1.299716e-25 -9.809089e-45 -0.139899 0.108225 1.577736e-10 -6.486107e-09 -0.032493 0.014193
1708584 0 1811 0.031058 -5.015069e-02 -0.003554 0.068082 -0.487200 -0.075254 3.672599e-01 -0.103629 ... 0.183424 -1.561362e-29 -1.131468e-25 -8.407791e-45 -0.140621 0.118004 1.373499e-10 -5.646484e-09 -0.032447 0.017506
1709670 0 1812 0.031061 -5.015041e-02 -0.003554 0.069194 -0.487231 -0.074018 3.672502e-01 -0.104078 ... 0.181371 -1.183291e-29 -8.574926e-26 -7.006492e-45 -0.141797 0.155133 1.040918e-10 -4.279235e-09 -0.028572 -0.001499

1646 rows × 111 columns


In [36]:
# Build one sub-frame per id: for every value in `uids`, keep the rows of
# `timeseries_id` whose 'id' column equals that value.
train_data = [timeseries_id.loc[timeseries_id['id'] == uid] for uid in uids]

In [42]:
print(train_data[500].describe() )
train_data[90].describe()


          id   timestamp   derived_0     derived_1   derived_2   derived_3  \
count  167.0   167.00000  167.000000  1.670000e+02  167.000000  167.000000   
mean   760.0  1729.00000    2.665702  1.851362e+10   -1.412422   -0.542541   
std      0.0    48.35287    1.296966  1.185383e+11    1.188876    0.098337   
min    760.0  1646.00000   -4.536046  3.712765e-01   -2.474511   -0.737826   
25%    760.0  1687.50000    2.084843  4.033775e-01   -2.108023   -0.576306   
50%    760.0  1729.00000    2.729487  4.152939e-01   -1.935057   -0.503850   
75%    760.0  1770.50000    3.381960  8.561365e-01   -1.265020   -0.464237   
max    760.0  1812.00000    3.853158  7.729436e+11    0.880080   -0.437878   

        derived_4  fundamental_0  fundamental_1  fundamental_2     ...      \
count  167.000000     167.000000   1.670000e+02     167.000000     ...       
mean     0.640865      -0.211127  -5.703760e+08       0.570303     ...       
std      2.730414       0.063717   6.419248e+02       0.230059     ...       
min      0.100831      -0.304139  -5.703754e+08      -0.162295     ...       
25%      0.206776      -0.267548  -5.703754e+08       0.428843     ...       
50%      0.221496      -0.220222  -5.703754e+08       0.460351     ...       
75%      0.247066      -0.166967  -5.703754e+08       0.715237     ...       
max     18.016613      -0.020409  -5.703754e+08       1.033149     ...       

       technical_36  technical_37  technical_38  technical_39  technical_40  \
count    167.000000  1.670000e+02  1.670000e+02  1.670000e+02    167.000000   
mean       0.042319 -1.052911e-01 -1.050643e-01 -1.048560e-01      0.153309   
std        0.794744  2.472035e-01  2.472209e-01  2.472445e-01      0.080489   
min       -1.222611 -9.376735e-01 -9.376735e-01 -9.376735e-01      0.049083   
25%       -0.581855 -2.094509e-02 -2.094509e-02 -2.094509e-02      0.093031   
50%       -0.005949 -1.442857e-05 -1.442857e-05 -1.442857e-05      0.130116   
75%        0.710346 -1.147246e-08 -1.147246e-08 -1.147246e-08      0.212634   
max        1.350116  0.000000e+00  0.000000e+00  0.000000e+00      0.306145   

       technical_41  technical_42  technical_43  technical_44           y  
count    167.000000  1.670000e+02  1.670000e+02  1.670000e+02  167.000000  
mean      -0.067410 -1.035178e-01 -2.434016e-01  3.881482e-04    0.001822  
std        0.102435  2.475667e-01  5.809696e-01  7.005927e-10    0.027742  
min       -0.307961 -9.376735e-01 -2.000000e+00  3.881475e-04   -0.086094  
25%       -0.135019 -1.699966e-02 -2.407625e-02  3.881475e-04   -0.014582  
50%       -0.010967 -1.442857e-05 -1.903862e-05  3.881475e-04    0.002450  
75%        0.005236 -1.147246e-08 -1.513801e-08  3.881475e-04    0.015327  
max        0.074351  0.000000e+00 -1.197886e-11  3.881475e-04    0.093498  

[8 rows x 111 columns]
Out[42]:
id timestamp derived_0 derived_1 derived_2 derived_3 derived_4 fundamental_0 fundamental_1 fundamental_2 ... technical_36 technical_37 technical_38 technical_39 technical_40 technical_41 technical_42 technical_43 technical_44 y
count 1813.0 1813.000000 1813.000000 1.813000e+03 1813.000000 1813.000000 1813.000000 1813.000000 1.813000e+03 1813.000000 ... 1813.000000 1.813000e+03 1.813000e+03 1.813000e+03 1813.000000 1813.000000 1.813000e+03 1813.000000 1813.000000 1813.000000
mean 132.0 906.000000 -0.058425 8.953012e+09 0.146766 0.703138 -0.083834 0.190869 -5.703642e+08 -0.122751 ... -0.119251 -6.804787e-02 -7.749242e-02 -6.822543e-02 0.469998 0.007877 -1.069208e-02 -1.181274 -0.000033 0.000061
std 0.0 523.512337 0.213262 8.273049e+10 0.103685 2.230596 0.322826 0.089175 1.120296e+04 0.240167 ... 0.817104 2.016541e-01 2.135926e-01 2.023427e-01 0.229366 0.111792 1.706771e-01 0.942628 0.025838 0.029507
min 132.0 0.000000 -0.338433 -3.131947e-02 -0.332033 -7.054357 -0.609691 -0.100612 -5.703754e+08 -0.429032 ... -1.360596 -9.527674e-01 -9.527674e-01 -9.527674e-01 0.043785 -0.327233 -9.457232e-01 -2.000000 -0.057559 -0.086094
25% 132.0 453.000000 -0.217740 1.147535e-01 0.074607 0.459400 -0.350128 0.120739 -5.703754e+08 -0.266960 ... -0.738286 -3.046177e-04 -1.399656e-03 -3.046177e-04 0.250156 -0.071838 -1.106077e-20 -2.000000 -0.020096 -0.016046
50% 132.0 906.000000 -0.161497 3.381517e-01 0.131323 0.547394 -0.076328 0.185428 -5.703754e+08 -0.212201 ... -0.244904 -1.264501e-10 -8.023565e-09 -2.607062e-12 0.552837 0.002232 0.000000e+00 -1.984375 -0.000609 -0.001044
75% 132.0 1359.000000 0.119724 1.438533e+00 0.233826 0.607379 0.177735 0.273472 -5.703754e+08 -0.033086 ... 0.221536 -6.972643e-22 -2.164439e-18 -6.900835e-35 0.631811 0.093350 0.000000e+00 -0.000488 0.019493 0.013955
max 132.0 1812.000000 0.360814 7.729436e+11 0.321231 19.607140 0.427289 0.344082 -5.703754e+08 0.975261 ... 2.643460 0.000000e+00 0.000000e+00 0.000000e+00 0.848786 0.313070 9.375000e-01 0.000000 0.065233 0.093498

8 rows × 111 columns


In [43]:
len(train_data)


Out[43]:
1424

In [44]:



Out[44]:
id timestamp derived_0 derived_1 derived_2 derived_3 derived_4 fundamental_0 fundamental_1 fundamental_2 ... technical_36 technical_37 technical_38 technical_39 technical_40 technical_41 technical_42 technical_43 technical_44 y
count 1646.0 1646.000000 1646.000000 1.646000e+03 1646.000000 1646.000000 1646.000000 1646.000000 1.646000e+03 1646.000000 ... 1646.000000 1.646000e+03 1.646000e+03 1.646000e+03 1646.000000 1646.000000 1.646000e+03 1.646000e+03 1646.000000 1646.000000
mean 0.0 989.500000 -0.001228 1.408767e+09 -0.004749 -0.008533 0.111429 -0.182586 -4.262226e+07 0.176767 ... 0.108057 -1.302352e-01 -1.226205e-01 -7.148141e-02 -0.071777 0.006662 1.168664e-02 -5.813643e-01 -0.014462 0.000207
std 0.0 475.303587 0.333821 3.297888e+10 0.103862 0.054967 0.816958 0.074010 1.500248e+08 0.112926 ... 0.374263 2.689950e-01 2.764083e-01 2.133210e-01 0.070447 0.096715 1.869833e-01 8.603244e-01 0.024462 0.014373
min 0.0 167.000000 -4.536046 -5.430397e-02 -0.332033 -0.504601 -0.487231 -0.380160 -5.703754e+08 -0.162295 ... -0.553699 -9.970396e-01 -9.974228e-01 -9.974228e-01 -0.242448 -0.238166 -9.455906e-01 -2.000000e+00 -0.078894 -0.086094
25% 0.0 578.250000 -0.169424 -4.109198e-02 -0.040009 -0.050176 -0.177244 -0.250514 3.485617e-01 0.120773 ... -0.157843 -5.859375e-02 -1.710002e-02 -2.266130e-04 -0.122406 -0.056985 -2.438194e-28 -1.747775e+00 -0.031335 -0.007402
50% 0.0 989.500000 -0.064549 9.054013e-02 -0.012615 0.009540 0.131383 -0.181862 4.936251e-01 0.169546 ... 0.038023 -4.492915e-05 -2.087333e-06 -2.759641e-10 -0.071140 0.004271 2.261187e-38 -1.062609e-04 -0.009130 0.000147
75% 0.0 1400.750000 0.246447 1.836513e-01 0.004717 0.035317 0.305429 -0.115067 5.513806e-01 0.228592 ... 0.346111 -9.501463e-11 -3.333576e-11 -1.752180e-19 -0.016527 0.065895 7.723607e-10 -1.713779e-11 0.000700 0.007355
max 0.0 1812.000000 0.473889 7.729436e+11 0.935920 0.072241 18.016613 -0.020409 6.205065e-01 0.549132 ... 1.404450 0.000000e+00 0.000000e+00 0.000000e+00 0.058438 0.325816 9.526339e-01 0.000000e+00 0.041618 0.093498

8 rows × 111 columns


In [48]:
# NOTE: this call raises ValueError (traceback below). np.dstack needs every input
# to have the same shape except along the stacking axis, but the per-id frames in
# train_data have different row counts (e.g. 1646, 1813 and 167 rows in the outputs above).
train_data_3d = np.dstack(train_data)


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-48-eb3fb5792645> in <module>()
----> 1 train_data_3d = np.dstack(train_data)

/home/topolo/Public/anaconda2/lib/python2.7/site-packages/numpy/lib/shape_base.pyc in dstack(tup)
    366 
    367     """
--> 368     return _nx.concatenate([atleast_3d(_m) for _m in tup], 2)
    369 
    370 def _replace_zero_by_x_arrays(sub_arys):

ValueError: all the input array dimensions except for the concatenation axis must match exactly

In Summary


In [ ]:
# clean with mean
timeseries_pd_meanclean = timeseries_pd.where( pd.notnull(timeseries_pd),timeseries_pd.mean(),axis='columns')

In [ ]:
# order by id
timeseries_id=timeseries_pd_meanclean.sort_values(by=["id", "timestamp"])

In [ ]:
uids = timeseries_id['id'].unique()

In [ ]:
# One sub-frame per id, in order of first appearance in the id-sorted frame
# (the same order as timeseries_id['id'].unique()).
# A single groupby pass replaces one boolean scan of the whole frame per id.
train_data = [per_id for _, per_id in timeseries_id.groupby('id', sort=False)]

Practice: simulating the Kaggle gym and its "features" (the input data you want to predict on)

Simulating what kaggle uses to train on


In [10]:
# Data input to train on: keep only the rows whose timestamp is below 907.
# (The largest timestamp in the describe() outputs above is 1812, so this is
# roughly the first half of the time range.)
obs_trainon = timeseries_pd[timeseries_pd["timestamp"]<907]

In [34]:
# Summary statistics of the training slice; the trailing ';' suppresses the cell output.
obs_trainon.describe();
# Replace every NaN with the mean of its column.
obs_trainon_meanclean = obs_trainon.where(pd.notnull(obs_trainon),obs_trainon.mean(),axis='columns')
# NOTE(review): the sorted frame is neither assigned nor displayed, so this line
# leaves obs_trainon_meanclean unsorted -- confirm whether the result was meant to be kept.
obs_trainon_meanclean.sort_values(by=['id','timestamp']);

In [ ]:


In [ ]:


In [ ]:


In [35]:
def clean_tseries(timeseries_pd):
    """Mean-impute a time-series frame and split it into one frame per id.

    Parameters
    ----------
    timeseries_pd : pandas.DataFrame
        Frame containing at least the columns 'id' and 'timestamp'.

    Returns
    -------
    list of pandas.DataFrame
        One frame per distinct 'id', in ascending id order; the rows of each
        frame are in ascending 'timestamp' order and keep their original index.
    """
    # Missing values (NaN) are filled with the mean of their column; the fill is
    # applied to every column of the frame.
    column_means = timeseries_pd.mean()
    tseries_meanclean = timeseries_pd.where(
        pd.notnull(timeseries_pd), column_means, axis='columns')

    # Order by id, then by time within each id.
    tseries_id = tseries_meanclean.sort_values(by=['id', 'timestamp'])

    # A single groupby pass replaces one boolean scan of the whole frame per id.
    # sort=False keeps first-appearance order, which is already ascending here.
    return [per_id for _, per_id in tseries_id.groupby('id', sort=False)]

In [36]:
res_clean_obs_trainon = clean_tseries(obs_trainon)
print(len(res_clean_obs_trainon)); res_clean_obs_trainon[0].describe()


1096
Out[36]:
id timestamp derived_0 derived_1 derived_2 derived_3 derived_4 fundamental_0 fundamental_1 fundamental_2 ... technical_36 technical_37 technical_38 technical_39 technical_40 technical_41 technical_42 technical_43 technical_44 y
count 740.0 740.000000 740.000000 7.400000e+02 740.000000 740.000000 740.000000 740.000000 7.400000e+02 740.000000 ... 740.000000 740.000000 7.400000e+02 7.400000e+02 740.000000 740.000000 7.400000e+02 7.400000e+02 740.000000 740.000000
mean 0.0 536.500000 0.141298 1.294029e+09 0.029532 -0.059602 0.143820 -0.245960 -2.082033e+08 0.155819 ... 0.327425 -0.196187 -1.561827e-01 -9.037122e-02 -0.133291 0.016009 8.418077e-04 -3.366559e-01 -0.012644 0.000317
std 0.0 213.763888 0.673985 2.029559e+10 0.152071 0.058863 2.384773 0.050656 4.666271e+08 0.063896 ... 0.372466 0.311861 3.046142e-01 2.271253e-01 0.048187 0.104661 1.870003e-01 6.936044e-01 0.018649 0.015126
min 0.0 167.000000 -9.452177 1.022506e-01 -0.834537 -0.855791 -0.421443 -0.380160 -1.252604e+09 -0.249810 ... -0.270343 -0.997040 -9.970396e-01 -9.526339e-01 -0.242448 -0.212149 -9.455906e-01 -1.999907e+00 -0.057820 -0.060769
25% 0.0 351.750000 -0.034855 1.390509e-01 -0.013344 -0.074317 -0.340190 -0.265878 3.921026e-01 0.105054 ... 0.020599 -0.291292 -8.936096e-02 -6.354555e-03 -0.170361 -0.051960 -5.884874e-13 -1.620358e-02 -0.026290 -0.007691
50% 0.0 536.500000 0.285073 1.961368e-01 0.000810 -0.054055 -0.117074 -0.253719 5.271003e-01 0.159256 ... 0.339271 -0.006377 -1.661682e-04 -1.459134e-06 -0.126920 0.006004 -6.565064e-27 -2.764836e-09 -0.007597 0.000309
75% 0.0 721.250000 0.438258 2.748602e-01 0.021877 -0.040042 0.441190 -0.210890 5.566207e-01 0.212425 ... 0.496720 -0.000029 -6.355221e-08 -1.268952e-11 -0.090822 0.086496 0.000000e+00 -3.330669e-16 0.000783 0.007995
max 0.0 906.000000 0.473889 3.191938e+11 0.935920 0.028222 37.087269 -0.005858 6.205065e-01 0.277171 ... 1.404450 0.000000 0.000000e+00 0.000000e+00 0.031995 0.325816 9.526339e-01 0.000000e+00 0.027206 0.093498

8 rows × 111 columns


In [37]:
res_clean_obs_trainon[0].values.shape


Out[37]:
(740, 111)

In [40]:
res_clean_obs_trainon[0].drop(['y'],axis=1).describe()


Out[40]:
id timestamp derived_0 derived_1 derived_2 derived_3 derived_4 fundamental_0 fundamental_1 fundamental_2 ... technical_35 technical_36 technical_37 technical_38 technical_39 technical_40 technical_41 technical_42 technical_43 technical_44
count 740.0 740.000000 740.000000 7.400000e+02 740.000000 740.000000 740.000000 740.000000 7.400000e+02 740.000000 ... 740.000000 740.000000 740.000000 7.400000e+02 7.400000e+02 740.000000 740.000000 7.400000e+02 7.400000e+02 740.000000
mean 0.0 536.500000 0.141298 1.294029e+09 0.029532 -0.059602 0.143820 -0.245960 -2.082033e+08 0.155819 ... 0.379662 0.327425 -0.196187 -1.561827e-01 -9.037122e-02 -0.133291 0.016009 8.418077e-04 -3.366559e-01 -0.012644
std 0.0 213.763888 0.673985 2.029559e+10 0.152071 0.058863 2.384773 0.050656 4.666271e+08 0.063896 ... 0.359621 0.372466 0.311861 3.046142e-01 2.271253e-01 0.048187 0.104661 1.870003e-01 6.936044e-01 0.018649
min 0.0 167.000000 -9.452177 1.022506e-01 -0.834537 -0.855791 -0.421443 -0.380160 -1.252604e+09 -0.249810 ... -0.126267 -0.270343 -0.997040 -9.970396e-01 -9.526339e-01 -0.242448 -0.212149 -9.455906e-01 -1.999907e+00 -0.057820
25% 0.0 351.750000 -0.034855 1.390509e-01 -0.013344 -0.074317 -0.340190 -0.265878 3.921026e-01 0.105054 ... 0.093098 0.020599 -0.291292 -8.936096e-02 -6.354555e-03 -0.170361 -0.051960 -5.884874e-13 -1.620358e-02 -0.026290
50% 0.0 536.500000 0.285073 1.961368e-01 0.000810 -0.054055 -0.117074 -0.253719 5.271003e-01 0.159256 ... 0.337073 0.339271 -0.006377 -1.661682e-04 -1.459134e-06 -0.126920 0.006004 -6.565064e-27 -2.764836e-09 -0.007597
75% 0.0 721.250000 0.438258 2.748602e-01 0.021877 -0.040042 0.441190 -0.210890 5.566207e-01 0.212425 ... 0.570238 0.496720 -0.000029 -6.355221e-08 -1.268952e-11 -0.090822 0.086496 0.000000e+00 -3.330669e-16 0.000783
max 0.0 906.000000 0.473889 3.191938e+11 0.935920 0.028222 37.087269 -0.005858 6.205065e-01 0.277171 ... 1.204843 1.404450 0.000000 0.000000e+00 0.000000e+00 0.031995 0.325816 9.526339e-01 0.000000e+00 0.027206

8 rows × 110 columns


In [44]:
print( type(res_clean_obs_trainon[0]['y']) )
print( type(res_clean_obs_trainon[0][['y']]))
print( res_clean_obs_trainon[0][['y']].values.shape)


<class 'pandas.core.series.Series'>
<class 'pandas.core.frame.DataFrame'>
(740, 1)

So we actually want the numpy array for calculations.


In [29]:
def clean_tseries(timeseries_pd):
    """Mean-impute a time-series frame and split it into one array per id.

    Parameters
    ----------
    timeseries_pd : pandas.DataFrame
        Frame containing at least the columns 'id' and 'timestamp'.

    Returns
    -------
    list of numpy.ndarray
        One 2-D array (rows x all columns) per distinct 'id', in ascending id
        order; rows within each array are in ascending 'timestamp' order.
    """
    # Missing values (NaN) are filled with the mean of their column; the fill is
    # applied to every column of the frame.
    column_means = timeseries_pd.mean()
    tseries_meanclean = timeseries_pd.where(
        pd.notnull(timeseries_pd), column_means, axis='columns')

    # Order by id, then by time within each id.
    tseries_id = tseries_meanclean.sort_values(by=['id', 'timestamp'])

    # A single groupby pass replaces one boolean scan of the whole frame per id.
    # sort=False keeps first-appearance order, which is already ascending here.
    return [per_id.values for _, per_id in tseries_id.groupby('id', sort=False)]

In [30]:
res_clean_obs_trainon = clean_tseries(obs_trainon)
print(len(res_clean_obs_trainon)); res_clean_obs_trainon[0].shape


1096
Out[30]:
(740, 111)

So we actually need to split up the input data $X$ from the output data $y$


In [47]:
def clean_tseries(timeseries_pd):
    """Mean-impute a time-series frame and split it into per-id (X, y) pairs.

    Parameters
    ----------
    timeseries_pd : pandas.DataFrame
        Frame containing at least the columns 'id', 'timestamp' and 'y'.

    Returns
    -------
    list of (numpy.ndarray, numpy.ndarray)
        One tuple per distinct 'id', in ascending id order. The first array holds
        every column except 'y' (rows x n_columns - 1); the second holds the 'y'
        column as a 2-D array (rows x 1). Rows are in ascending 'timestamp' order.
    """
    # Missing values (NaN) are filled with the mean of their column; the fill is
    # applied to every column of the frame, 'y' included.
    column_means = timeseries_pd.mean()
    tseries_meanclean = timeseries_pd.where(
        pd.notnull(timeseries_pd), column_means, axis='columns')

    # Order by id, then by time within each id.
    tseries_id = tseries_meanclean.sort_values(by=['id', 'timestamp'])

    # A single groupby pass replaces one boolean scan of the whole frame per id,
    # and the input/output split is done in the same pass.
    # sort=False keeps first-appearance order, which is already ascending here.
    return [
        (per_id.drop(['y'], axis=1).values, per_id[['y']].values)
        for _, per_id in tseries_id.groupby('id', sort=False)
    ]

In [48]:
res_clean_obs_trainon = clean_tseries(obs_trainon)

In [51]:
print( len(res_clean_obs_trainon))
print( type( res_clean_obs_trainon[0] ) ); print(len(res_clean_obs_trainon[0]));
print( res_clean_obs_trainon[0][0].shape);print(res_clean_obs_trainon[0][1].shape)


1096
<type 'tuple'>
2
(740, 110)
(740, 1)

Simulating observations, features


In [39]:
np.array( [3]).shape


Out[39]:
(1,)

So we also need a function that cleans the test data $X$ we want to predict on


In [53]:
def clean_test(timeseries_pd):
    """Mean-impute a frame of test inputs and split it into one array per id.

    Parameters
    ----------
    timeseries_pd : pandas.DataFrame
        Frame containing at least the columns 'id' and 'timestamp'.

    Returns
    -------
    list of numpy.ndarray
        One 2-D array (rows x all columns) per distinct 'id', in ascending id
        order; rows within each array are in ascending 'timestamp' order.
    """
    # Missing values (NaN) are filled with the mean of their column, computed
    # over the frame passed in.
    column_means = timeseries_pd.mean()
    tseries_meanclean = timeseries_pd.where(
        pd.notnull(timeseries_pd), column_means, axis='columns')

    # Order by id, then by time within each id.
    tseries_id = tseries_meanclean.sort_values(by=['id', 'timestamp'])

    # A single groupby pass replaces one boolean scan of the whole frame per id.
    # sort=False keeps first-appearance order, which is already ascending here.
    return [per_id.values for _, per_id in tseries_id.groupby('id', sort=False)]

In [65]:
# This corresponds to what kaggle calls FEATURES, features:
# the rows of a single timestamp (908), with the target column 'y' removed.
# NOTE(review): the training slice above uses timestamp < 907, so timestamp 907
# falls in neither slice -- confirm that 908 (and not 907) is intended.
obs_predicton = timeseries_pd[timeseries_pd["timestamp"]==908]
obs_predictonX = obs_predicton.drop('y',axis=1)

In [66]:
obs_predicton_cleaned = clean_test( obs_predictonX)

In [69]:
print(type(obs_predicton_cleaned));print(type(obs_predicton_cleaned[0]));print(obs_predicton_cleaned[0].shape)


<type 'list'>
<type 'numpy.ndarray'>
(1, 110)

In [72]:
obs_predicton_cleaned[5][0][0]


Out[72]:
15.0

In [73]:
def id_only(cleaned_X_data):
    """Return the id of every block in `cleaned_X_data` as a list of ints.

    Each block is indexed as block[0][0] (first row, first entry) and that
    value is converted with int().
    Assumes the first column of each block is the id, as in the output of
    clean_test -- TODO confirm for other callers.
    """
    return [int(block[0][0]) for block in cleaned_X_data]

In [74]:
res_id_only = id_only(obs_predicton_cleaned)

In [79]:
res_id_only[:10]
print(len(res_id_only))


968

In [ ]:
def just_id(cleaned_X_data, predicted_y):
    """Unfinished stub: no behaviour was ever written for this function.

    The original cell held only the commented-out length check below, so the
    `def` had no body and the cell could not be executed. The stub now fails
    loudly instead of being a syntax error.

    Raises
    ------
    NotImplementedError
        Always.
    """
    # assert len(cleaned_X_data) == len(predicted_y)
    raise NotImplementedError("just_id is an unfinished stub")

In [61]:
print( np.array( [[5],[3],[2]]).shape)
np.array( [[5]]).flatten()[0]


(3, 1)
Out[61]:
5

In [76]:
obs_predictedy = obs_predicton['y']

In [83]:
# Simulate what I'm going to get: a list with one (1, 1) array per prediction.
# -1 lets numpy infer the number of predictions instead of hard-coding 968.
list( obs_predictedy.values.reshape(-1,1,1) )[0]


Out[83]:
array([[-0.00385292]], dtype=float32)

In [84]:
def y_only(predicted_y):
    """Return the first element of every array in `predicted_y` as a list.

    Each item is flattened and its element at position 0 is taken, so a
    (1, 1) array contributes its single value.
    """
    return [row.flatten()[0] for row in predicted_y]

In [87]:
# -1 lets numpy infer the number of predictions instead of hard-coding 968,
# so the cell keeps working when a different timestamp slice is selected.
res_y_only = y_only( list( obs_predictedy.values.reshape(-1,1,1) ) )

In [88]:
print(len(res_y_only)); res_y_only[:10]


968
Out[88]:
[-0.0038529162,
 0.0028961198,
 -0.00094434741,
 -0.005380407,
 -0.01242116,
 -0.014861807,
 -0.026671588,
 -0.012705223,
 0.0045782495,
 -0.01963657]

In [89]:
pd_predictedon = pd.DataFrame.from_dict( dict(id=res_id_only,y=res_y_only))

In [90]:
pd_predictedon.describe()


Out[90]:
id y
count 968.000000 968.000000
mean 1094.757231 -0.001006
std 632.309033 0.014307
min 0.000000 -0.086094
25% 550.250000 -0.007348
50% 1099.500000 -0.000869
75% 1660.500000 0.005278
max 2156.000000 0.093498

In [91]:
pd_predictedon.isnull().describe()


Out[91]:
id y
count 968 968
unique 1 1
top False False
freq 968 968

Making it work with the different series lengths $T$ (each id has a different number of timestamps)


In [92]:
t_all_cleaned = clean_tseries(timeseries_pd)

In [93]:
len(t_all_cleaned)


Out[93]:
1424

In [99]:
print( type(t_all_cleaned[0]));print( len(t_all_cleaned[0]));
print( t_all_cleaned[0][0].shape); print(t_all_cleaned[0][1].shape)


<type 'tuple'>
2
(1646, 110)
(1646, 1)

In [100]:
for i in range(13):
    print( t_all_cleaned[i][0].shape, t_all_cleaned[i][1].shape )


((1646, 110), (1646, 1))
((728, 110), (728, 1))
((1543, 110), (1543, 1))
((116, 110), (116, 1))
((1813, 110), (1813, 1))
((1813, 110), (1813, 1))
((1543, 110), (1543, 1))
((218, 110), (218, 1))
((1340, 110), (1340, 1))
((1745, 110), (1745, 1))
((779, 110), (779, 1))
((218, 110), (218, 1))
((829, 110), (829, 1))

In [1]:
import theano


WARNING (theano.sandbox.cuda): The cuda backend is deprecated and will be removed in the next release (v0.10).  Please switch to the gpuarray backend. You can get more information about how to switch at this URL:
 https://github.com/Theano/Theano/wiki/Converting-to-the-new-gpu-back-end%28gpuarray%29

Using gpu device 0: GeForce GTX 980 Ti (CNMeM is enabled with initial size: 80.0% of memory, cuDNN 5105)

In [2]:
theano.version.full_version


Out[2]:
'0.9.0rc2.dev-19540d4e3064fe0dc0e1281f517bad0f355e46a2'

In [ ]: