By: 顾 瞻 GU Zhan (Sam)

July 2017

[2] Data pre-porcessing

Explore and visualize data


In [1]:
# from __future__ import print_function, division
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
import pandas as pd
import operator
from scipy import interp
from itertools import cycle
from sklearn import svm
from sklearn.utils.validation import check_random_state
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_curve, auc
from statsmodels.graphics.mosaicplot import mosaic
print(__doc__)


Automatically created module for IPython interactive environment

In [2]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
# from sklearn import svm

Read raw data


In [338]:
df_history_ts = pd.read_csv('data/history_ts.csv') 
df_history_ts_process = df_history_ts.copy()
df_history_ts_process.head()


Out[338]:
ccyy-mm time bid-price
0 2015-01 11:29:00 74000
1 2015-01 11:29:01 74000
2 2015-01 11:29:02 74000
3 2015-01 11:29:03 74000
4 2015-01 11:29:04 74000

In [339]:
df_history_table = pd.read_csv('data/history_table.csv') 
df_history_table_process = df_history_table.copy()
df_history_table_process.head()


Out[339]:
ccyy-mm volume-plate deal-price-low deal-price-avg deal-early-second volume-bidder
0 2015-01 7990 74000 74216 48 98203
1 2015-02 7653 76500 76618 49 103224
2 2015-03 7406 74600 74830 16 132690
3 2015-04 8288 80600 80759 41 152298
4 2015-05 7482 79000 79099 53 156007

Prepare derived features


In [ ]:


In [ ]:


In [340]:
# convert an array of values into a dataset matrix
def create_dataset(dataset, look_back=1):
	dataX, dataY = [], []
	for i in range(len(dataset)-look_back-1):
		a = dataset[i:(i+look_back), 0]
		dataX.append(a)
		dataY.append(dataset[i + look_back, 0])
	return numpy.array(dataX), numpy.array(dataY)

Parameters


In [341]:
parm_ts_cycle = 61 # seconds/records per month
parm_ts_month = int(len(df_history_ts) / parm_ts_cycle)
print('parm_ts_month : ', parm_ts_month)

parm_calculate_prev_bp = 15 # Number of previous bid-price to include, i.e. previous 2sec, 3sec, 4sec, 5sec ... 15sec
parm_calculate_mv = 15 # Number of moving average to calculate, i.e. previous 2sec, 3sec, 4sec, 5sec ... 15sec


parm_ts_month :  30

In [342]:
# previous N sec ['bid-price']
gap = parm_calculate_prev_bp

for gap in range(1, gap+1):
    col_name = 'bid-price-prev'+str(gap)+'sec'
    col_data = pd.DataFrame(columns=[col_name])
    print('Creating : ', col_name)  

    for month in range(0, parm_ts_month):
    #     print('month : ', month)
        col_data.append(col_data_zeros)
        for i in range(0, gap):
            col_data.loc[month*parm_ts_cycle+i] = 0
        for i in range(gap, parm_ts_cycle):
            col_data.loc[month*parm_ts_cycle+i] = df_history_ts_process['bid-price'][month*parm_ts_cycle+i-gap]
  
    df_history_ts_process[col_name] = col_data

print('Total records processed : ', len(col_data))


Creating :  bid-price-prev1sec
Creating :  bid-price-prev2sec
Creating :  bid-price-prev3sec
Creating :  bid-price-prev4sec
Creating :  bid-price-prev5sec
Creating :  bid-price-prev6sec
Creating :  bid-price-prev7sec
Creating :  bid-price-prev8sec
Creating :  bid-price-prev9sec
Creating :  bid-price-prev10sec
Creating :  bid-price-prev11sec
Creating :  bid-price-prev12sec
Creating :  bid-price-prev13sec
Creating :  bid-price-prev14sec
Creating :  bid-price-prev15sec
Total records processed :  1830

In [343]:
df_history_ts_process[1768:]


Out[343]:
ccyy-mm time bid-price bid-price-prev1sec bid-price-prev2sec bid-price-prev3sec bid-price-prev4sec bid-price-prev5sec bid-price-prev6sec bid-price-prev7sec bid-price-prev8sec bid-price-prev9sec bid-price-prev10sec bid-price-prev11sec bid-price-prev12sec bid-price-prev13sec bid-price-prev14sec bid-price-prev15sec
1768 2017-05 11:30:00 90100 90100 90100 90000 89900 89800 89700 89600 89600 89500 89500 89400 89400 89300 89300 89200
1769 2017-06 11:29:00 88400 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
1770 2017-06 11:29:01 88500 88400 0 0 0 0 0 0 0 0 0 0 0 0 0 0
1771 2017-06 11:29:02 88500 88500 88400 0 0 0 0 0 0 0 0 0 0 0 0 0
1772 2017-06 11:29:03 88500 88500 88500 88400 0 0 0 0 0 0 0 0 0 0 0 0
1773 2017-06 11:29:04 88500 88500 88500 88500 88400 0 0 0 0 0 0 0 0 0 0 0
1774 2017-06 11:29:05 88500 88500 88500 88500 88500 88400 0 0 0 0 0 0 0 0 0 0
1775 2017-06 11:29:06 88500 88500 88500 88500 88500 88500 88400 0 0 0 0 0 0 0 0 0
1776 2017-06 11:29:07 88500 88500 88500 88500 88500 88500 88500 88400 0 0 0 0 0 0 0 0
1777 2017-06 11:29:08 88500 88500 88500 88500 88500 88500 88500 88500 88400 0 0 0 0 0 0 0
1778 2017-06 11:29:09 88500 88500 88500 88500 88500 88500 88500 88500 88500 88400 0 0 0 0 0 0
1779 2017-06 11:29:10 88500 88500 88500 88500 88500 88500 88500 88500 88500 88500 88400 0 0 0 0 0
1780 2017-06 11:29:11 88500 88500 88500 88500 88500 88500 88500 88500 88500 88500 88500 88400 0 0 0 0
1781 2017-06 11:29:12 88500 88500 88500 88500 88500 88500 88500 88500 88500 88500 88500 88500 88400 0 0 0
1782 2017-06 11:29:13 88500 88500 88500 88500 88500 88500 88500 88500 88500 88500 88500 88500 88500 88400 0 0
1783 2017-06 11:29:14 88500 88500 88500 88500 88500 88500 88500 88500 88500 88500 88500 88500 88500 88500 88400 0
1784 2017-06 11:29:15 88500 88500 88500 88500 88500 88500 88500 88500 88500 88500 88500 88500 88500 88500 88500 88400
1785 2017-06 11:29:16 88500 88500 88500 88500 88500 88500 88500 88500 88500 88500 88500 88500 88500 88500 88500 88500
1786 2017-06 11:29:17 88500 88500 88500 88500 88500 88500 88500 88500 88500 88500 88500 88500 88500 88500 88500 88500
1787 2017-06 11:29:18 88500 88500 88500 88500 88500 88500 88500 88500 88500 88500 88500 88500 88500 88500 88500 88500
1788 2017-06 11:29:19 88600 88500 88500 88500 88500 88500 88500 88500 88500 88500 88500 88500 88500 88500 88500 88500
1789 2017-06 11:29:20 88600 88600 88500 88500 88500 88500 88500 88500 88500 88500 88500 88500 88500 88500 88500 88500
1790 2017-06 11:29:21 88600 88600 88600 88500 88500 88500 88500 88500 88500 88500 88500 88500 88500 88500 88500 88500
1791 2017-06 11:29:22 88600 88600 88600 88600 88500 88500 88500 88500 88500 88500 88500 88500 88500 88500 88500 88500
1792 2017-06 11:29:23 88600 88600 88600 88600 88600 88500 88500 88500 88500 88500 88500 88500 88500 88500 88500 88500
1793 2017-06 11:29:24 88600 88600 88600 88600 88600 88600 88500 88500 88500 88500 88500 88500 88500 88500 88500 88500
1794 2017-06 11:29:25 88700 88600 88600 88600 88600 88600 88600 88500 88500 88500 88500 88500 88500 88500 88500 88500
1795 2017-06 11:29:26 88700 88700 88600 88600 88600 88600 88600 88600 88500 88500 88500 88500 88500 88500 88500 88500
1796 2017-06 11:29:27 88700 88700 88700 88600 88600 88600 88600 88600 88600 88500 88500 88500 88500 88500 88500 88500
1797 2017-06 11:29:28 88700 88700 88700 88700 88600 88600 88600 88600 88600 88600 88500 88500 88500 88500 88500 88500
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1800 2017-06 11:29:31 88800 88800 88700 88700 88700 88700 88700 88600 88600 88600 88600 88600 88600 88500 88500 88500
1801 2017-06 11:29:32 88800 88800 88800 88700 88700 88700 88700 88700 88600 88600 88600 88600 88600 88600 88500 88500
1802 2017-06 11:29:33 88800 88800 88800 88800 88700 88700 88700 88700 88700 88600 88600 88600 88600 88600 88600 88500
1803 2017-06 11:29:34 88800 88800 88800 88800 88800 88700 88700 88700 88700 88700 88600 88600 88600 88600 88600 88600
1804 2017-06 11:29:35 88800 88800 88800 88800 88800 88800 88700 88700 88700 88700 88700 88600 88600 88600 88600 88600
1805 2017-06 11:29:36 88800 88800 88800 88800 88800 88800 88800 88700 88700 88700 88700 88700 88600 88600 88600 88600
1806 2017-06 11:29:37 88800 88800 88800 88800 88800 88800 88800 88800 88700 88700 88700 88700 88700 88600 88600 88600
1807 2017-06 11:29:38 88800 88800 88800 88800 88800 88800 88800 88800 88800 88700 88700 88700 88700 88700 88600 88600
1808 2017-06 11:29:39 88800 88800 88800 88800 88800 88800 88800 88800 88800 88800 88700 88700 88700 88700 88700 88600
1809 2017-06 11:29:40 88800 88800 88800 88800 88800 88800 88800 88800 88800 88800 88800 88700 88700 88700 88700 88700
1810 2017-06 11:29:41 88900 88800 88800 88800 88800 88800 88800 88800 88800 88800 88800 88800 88700 88700 88700 88700
1811 2017-06 11:29:42 88900 88900 88800 88800 88800 88800 88800 88800 88800 88800 88800 88800 88800 88700 88700 88700
1812 2017-06 11:29:43 89000 88900 88900 88800 88800 88800 88800 88800 88800 88800 88800 88800 88800 88800 88700 88700
1813 2017-06 11:29:44 89000 89000 88900 88900 88800 88800 88800 88800 88800 88800 88800 88800 88800 88800 88800 88700
1814 2017-06 11:29:45 89100 89000 89000 88900 88900 88800 88800 88800 88800 88800 88800 88800 88800 88800 88800 88800
1815 2017-06 11:29:46 89100 89100 89000 89000 88900 88900 88800 88800 88800 88800 88800 88800 88800 88800 88800 88800
1816 2017-06 11:29:47 89100 89100 89100 89000 89000 88900 88900 88800 88800 88800 88800 88800 88800 88800 88800 88800
1817 2017-06 11:29:48 89100 89100 89100 89100 89000 89000 88900 88900 88800 88800 88800 88800 88800 88800 88800 88800
1818 2017-06 11:29:49 89100 89100 89100 89100 89100 89000 89000 88900 88900 88800 88800 88800 88800 88800 88800 88800
1819 2017-06 11:29:50 89100 89100 89100 89100 89100 89100 89000 89000 88900 88900 88800 88800 88800 88800 88800 88800
1820 2017-06 11:29:51 89100 89100 89100 89100 89100 89100 89100 89000 89000 88900 88900 88800 88800 88800 88800 88800
1821 2017-06 11:29:52 89100 89100 89100 89100 89100 89100 89100 89100 89000 89000 88900 88900 88800 88800 88800 88800
1822 2017-06 11:29:53 89200 89100 89100 89100 89100 89100 89100 89100 89100 89000 89000 88900 88900 88800 88800 88800
1823 2017-06 11:29:54 89200 89200 89100 89100 89100 89100 89100 89100 89100 89100 89000 89000 88900 88900 88800 88800
1824 2017-06 11:29:55 89200 89200 89200 89100 89100 89100 89100 89100 89100 89100 89100 89000 89000 88900 88900 88800
1825 2017-06 11:29:56 89200 89200 89200 89200 89100 89100 89100 89100 89100 89100 89100 89100 89000 89000 88900 88900
1826 2017-06 11:29:57 89300 89200 89200 89200 89200 89100 89100 89100 89100 89100 89100 89100 89100 89000 89000 88900
1827 2017-06 11:29:58 89300 89300 89200 89200 89200 89200 89100 89100 89100 89100 89100 89100 89100 89100 89000 89000
1828 2017-06 11:29:59 89300 89300 89300 89200 89200 89200 89200 89100 89100 89100 89100 89100 89100 89100 89100 89000
1829 2017-06 11:30:00 89400 89300 89300 89300 89200 89200 89200 89200 89100 89100 89100 89100 89100 89100 89100 89100

62 rows × 18 columns


In [ ]:


In [373]:
month


Out[373]:
29

In [374]:
parm_ts_cycle


Out[374]:
61

In [375]:
i


Out[375]:
60

In [384]:
gap=5
df_history_ts_process['bid-price'][month*parm_ts_cycle+i-gap:month*parm_ts_cycle+i]


Out[384]:
1824    89200
1825    89200
1826    89300
1827    89300
1828    89300
Name: bid-price, dtype: int64

In [385]:
np.mean(df_history_ts_process['bid-price'][month*parm_ts_cycle+i-gap:month*parm_ts_cycle+i])


Out[385]:
89260.0

In [ ]:


In [368]:
# previous 2 sec Moving Average ['bid-price']

gap = parm_calculate_mv

for gap in range(2, gap+1): # MV starts from 2 seconds, till parm_calculate_mv
    col_name = 'bid-price-mv'+str(gap)+'sec'
    col_data = pd.DataFrame(columns=[col_name])
    print('Creating : ', col_name)  

    for month in range(0, parm_ts_month):
    #     print('month : ', month)
        col_data.append(col_data_zeros)
        for i in range(0, gap):
            col_data.loc[month*parm_ts_cycle+i] = 0
        for i in range(gap, parm_ts_cycle):
            col_data.loc[month*parm_ts_cycle+i] = \
            np.mean(df_history_ts_process['bid-price'][month*parm_ts_cycle+i-gap:month*parm_ts_cycle+i])
  
    df_history_ts_process[col_name] = col_data

print('Total records processed : ', len(col_data))


Creating :  bid-price-mv2sec
Creating :  bid-price-mv3sec
Creating :  bid-price-mv4sec
Creating :  bid-price-mv5sec
Creating :  bid-price-mv6sec
Creating :  bid-price-mv7sec
Creating :  bid-price-mv8sec
Creating :  bid-price-mv9sec
Creating :  bid-price-mv10sec
Creating :  bid-price-mv11sec
Creating :  bid-price-mv12sec
Creating :  bid-price-mv13sec
Creating :  bid-price-mv14sec
Creating :  bid-price-mv15sec
Total records processed :  1830

In [369]:
df_history_ts_process[1768:]


Out[369]:
ccyy-mm time bid-price bid-price-prev1sec bid-price-prev2sec bid-price-prev3sec bid-price-prev4sec bid-price-prev5sec bid-price-prev6sec bid-price-prev7sec ... bid-price-mv6sec bid-price-mv7sec bid-price-mv8sec bid-price-mv9sec bid-price-mv10sec bid-price-mv11sec bid-price-mv12sec bid-price-mv13sec bid-price-mv14sec bid-price-mv15sec
1768 2017-05 11:30:00 90100 90100 90100 90000 89900 89800 89700 89600 ... 89933.3 89885.7 89850 89811.1 89780 89745.5 89716.7 89684.6 89657.1 89626.7
1769 2017-06 11:29:00 88400 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1770 2017-06 11:29:01 88500 88400 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1771 2017-06 11:29:02 88500 88500 88400 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1772 2017-06 11:29:03 88500 88500 88500 88400 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1773 2017-06 11:29:04 88500 88500 88500 88500 88400 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1774 2017-06 11:29:05 88500 88500 88500 88500 88500 88400 0 0 ... 0 0 0 0 0 0 0 0 0 0
1775 2017-06 11:29:06 88500 88500 88500 88500 88500 88500 88400 0 ... 88483.3 0 0 0 0 0 0 0 0 0
1776 2017-06 11:29:07 88500 88500 88500 88500 88500 88500 88500 88400 ... 88500 88485.7 0 0 0 0 0 0 0 0
1777 2017-06 11:29:08 88500 88500 88500 88500 88500 88500 88500 88500 ... 88500 88500 88487.5 0 0 0 0 0 0 0
1778 2017-06 11:29:09 88500 88500 88500 88500 88500 88500 88500 88500 ... 88500 88500 88500 88488.9 0 0 0 0 0 0
1779 2017-06 11:29:10 88500 88500 88500 88500 88500 88500 88500 88500 ... 88500 88500 88500 88500 88490 0 0 0 0 0
1780 2017-06 11:29:11 88500 88500 88500 88500 88500 88500 88500 88500 ... 88500 88500 88500 88500 88500 88490.9 0 0 0 0
1781 2017-06 11:29:12 88500 88500 88500 88500 88500 88500 88500 88500 ... 88500 88500 88500 88500 88500 88500 88491.7 0 0 0
1782 2017-06 11:29:13 88500 88500 88500 88500 88500 88500 88500 88500 ... 88500 88500 88500 88500 88500 88500 88500 88492.3 0 0
1783 2017-06 11:29:14 88500 88500 88500 88500 88500 88500 88500 88500 ... 88500 88500 88500 88500 88500 88500 88500 88500 88492.9 0
1784 2017-06 11:29:15 88500 88500 88500 88500 88500 88500 88500 88500 ... 88500 88500 88500 88500 88500 88500 88500 88500 88500 88493.3
1785 2017-06 11:29:16 88500 88500 88500 88500 88500 88500 88500 88500 ... 88500 88500 88500 88500 88500 88500 88500 88500 88500 88500
1786 2017-06 11:29:17 88500 88500 88500 88500 88500 88500 88500 88500 ... 88500 88500 88500 88500 88500 88500 88500 88500 88500 88500
1787 2017-06 11:29:18 88500 88500 88500 88500 88500 88500 88500 88500 ... 88500 88500 88500 88500 88500 88500 88500 88500 88500 88500
1788 2017-06 11:29:19 88600 88500 88500 88500 88500 88500 88500 88500 ... 88500 88500 88500 88500 88500 88500 88500 88500 88500 88500
1789 2017-06 11:29:20 88600 88600 88500 88500 88500 88500 88500 88500 ... 88516.7 88514.3 88512.5 88511.1 88510 88509.1 88508.3 88507.7 88507.1 88506.7
1790 2017-06 11:29:21 88600 88600 88600 88500 88500 88500 88500 88500 ... 88533.3 88528.6 88525 88522.2 88520 88518.2 88516.7 88515.4 88514.3 88513.3
1791 2017-06 11:29:22 88600 88600 88600 88600 88500 88500 88500 88500 ... 88550 88542.9 88537.5 88533.3 88530 88527.3 88525 88523.1 88521.4 88520
1792 2017-06 11:29:23 88600 88600 88600 88600 88600 88500 88500 88500 ... 88566.7 88557.1 88550 88544.4 88540 88536.4 88533.3 88530.8 88528.6 88526.7
1793 2017-06 11:29:24 88600 88600 88600 88600 88600 88600 88500 88500 ... 88583.3 88571.4 88562.5 88555.6 88550 88545.5 88541.7 88538.5 88535.7 88533.3
1794 2017-06 11:29:25 88700 88600 88600 88600 88600 88600 88600 88500 ... 88600 88585.7 88575 88566.7 88560 88554.5 88550 88546.2 88542.9 88540
1795 2017-06 11:29:26 88700 88700 88600 88600 88600 88600 88600 88600 ... 88616.7 88614.3 88600 88588.9 88580 88572.7 88566.7 88561.5 88557.1 88553.3
1796 2017-06 11:29:27 88700 88700 88700 88600 88600 88600 88600 88600 ... 88633.3 88628.6 88625 88611.1 88600 88590.9 88583.3 88576.9 88571.4 88566.7
1797 2017-06 11:29:28 88700 88700 88700 88700 88600 88600 88600 88600 ... 88650 88642.9 88637.5 88633.3 88620 88609.1 88600 88592.3 88585.7 88580
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1800 2017-06 11:29:31 88800 88800 88700 88700 88700 88700 88700 88600 ... 88716.7 88700 88687.5 88677.8 88670 88663.6 88658.3 88646.2 88635.7 88626.7
1801 2017-06 11:29:32 88800 88800 88800 88700 88700 88700 88700 88700 ... 88733.3 88728.6 88712.5 88700 88690 88681.8 88675 88669.2 88657.1 88646.7
1802 2017-06 11:29:33 88800 88800 88800 88800 88700 88700 88700 88700 ... 88750 88742.9 88737.5 88722.2 88710 88700 88691.7 88684.6 88678.6 88666.7
1803 2017-06 11:29:34 88800 88800 88800 88800 88800 88700 88700 88700 ... 88766.7 88757.1 88750 88744.4 88730 88718.2 88708.3 88700 88692.9 88686.7
1804 2017-06 11:29:35 88800 88800 88800 88800 88800 88800 88700 88700 ... 88783.3 88771.4 88762.5 88755.6 88750 88736.4 88725 88715.4 88707.1 88700
1805 2017-06 11:29:36 88800 88800 88800 88800 88800 88800 88800 88700 ... 88800 88785.7 88775 88766.7 88760 88754.5 88741.7 88730.8 88721.4 88713.3
1806 2017-06 11:29:37 88800 88800 88800 88800 88800 88800 88800 88800 ... 88800 88800 88787.5 88777.8 88770 88763.6 88758.3 88746.2 88735.7 88726.7
1807 2017-06 11:29:38 88800 88800 88800 88800 88800 88800 88800 88800 ... 88800 88800 88800 88788.9 88780 88772.7 88766.7 88761.5 88750 88740
1808 2017-06 11:29:39 88800 88800 88800 88800 88800 88800 88800 88800 ... 88800 88800 88800 88800 88790 88781.8 88775 88769.2 88764.3 88753.3
1809 2017-06 11:29:40 88800 88800 88800 88800 88800 88800 88800 88800 ... 88800 88800 88800 88800 88800 88790.9 88783.3 88776.9 88771.4 88766.7
1810 2017-06 11:29:41 88900 88800 88800 88800 88800 88800 88800 88800 ... 88800 88800 88800 88800 88800 88800 88791.7 88784.6 88778.6 88773.3
1811 2017-06 11:29:42 88900 88900 88800 88800 88800 88800 88800 88800 ... 88816.7 88814.3 88812.5 88811.1 88810 88809.1 88808.3 88800 88792.9 88786.7
1812 2017-06 11:29:43 89000 88900 88900 88800 88800 88800 88800 88800 ... 88833.3 88828.6 88825 88822.2 88820 88818.2 88816.7 88815.4 88807.1 88800
1813 2017-06 11:29:44 89000 89000 88900 88900 88800 88800 88800 88800 ... 88866.7 88857.1 88850 88844.4 88840 88836.4 88833.3 88830.8 88828.6 88820
1814 2017-06 11:29:45 89100 89000 89000 88900 88900 88800 88800 88800 ... 88900 88885.7 88875 88866.7 88860 88854.5 88850 88846.2 88842.9 88840
1815 2017-06 11:29:46 89100 89100 89000 89000 88900 88900 88800 88800 ... 88950 88928.6 88912.5 88900 88890 88881.8 88875 88869.2 88864.3 88860
1816 2017-06 11:29:47 89100 89100 89100 89000 89000 88900 88900 88800 ... 89000 88971.4 88950 88933.3 88920 88909.1 88900 88892.3 88885.7 88880
1817 2017-06 11:29:48 89100 89100 89100 89100 89000 89000 88900 88900 ... 89033.3 89014.3 88987.5 88966.7 88950 88936.4 88925 88915.4 88907.1 88900
1818 2017-06 11:29:49 89100 89100 89100 89100 89100 89000 89000 88900 ... 89066.7 89042.9 89025 89000 88980 88963.6 88950 88938.5 88928.6 88920
1819 2017-06 11:29:50 89100 89100 89100 89100 89100 89100 89000 89000 ... 89083.3 89071.4 89050 89033.3 89010 88990.9 88975 88961.5 88950 88940
1820 2017-06 11:29:51 89100 89100 89100 89100 89100 89100 89100 89000 ... 89100 89085.7 89075 89055.6 89040 89018.2 89000 88984.6 88971.4 88960
1821 2017-06 11:29:52 89100 89100 89100 89100 89100 89100 89100 89100 ... 89100 89100 89087.5 89077.8 89060 89045.5 89025 89007.7 88992.9 88980
1822 2017-06 11:29:53 89200 89100 89100 89100 89100 89100 89100 89100 ... 89100 89100 89100 89088.9 89080 89063.6 89050 89030.8 89014.3 89000
1823 2017-06 11:29:54 89200 89200 89100 89100 89100 89100 89100 89100 ... 89116.7 89114.3 89112.5 89111.1 89100 89090.9 89075 89061.5 89042.9 89026.7
1824 2017-06 11:29:55 89200 89200 89200 89100 89100 89100 89100 89100 ... 89133.3 89128.6 89125 89122.2 89120 89109.1 89100 89084.6 89071.4 89053.3
1825 2017-06 11:29:56 89200 89200 89200 89200 89100 89100 89100 89100 ... 89150 89142.9 89137.5 89133.3 89130 89127.3 89116.7 89107.7 89092.9 89080
1826 2017-06 11:29:57 89300 89200 89200 89200 89200 89100 89100 89100 ... 89166.7 89157.1 89150 89144.4 89140 89136.4 89133.3 89123.1 89114.3 89100
1827 2017-06 11:29:58 89300 89300 89200 89200 89200 89200 89100 89100 ... 89200 89185.7 89175 89166.7 89160 89154.5 89150 89146.2 89135.7 89126.7
1828 2017-06 11:29:59 89300 89300 89300 89200 89200 89200 89200 89100 ... 89233.3 89214.3 89200 89188.9 89180 89172.7 89166.7 89161.5 89157.1 89146.7
1829 2017-06 11:30:00 89400 89300 89300 89300 89200 89200 89200 89200 ... 89250 89242.9 89225 89211.1 89200 89190.9 89183.3 89176.9 89171.4 89166.7

62 rows × 32 columns


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [390]:
plt.plot(df_history_ts_process['bid-price'][1784:])
plt.plot(df_history_ts_process['bid-price-prev10sec'][1784:])
plt.plot(df_history_ts_process['bid-price-mv10sec'][1784:])


Out[390]:
[<matplotlib.lines.Line2D at 0x7fa233b40390>]
/home/user/env_py3/lib/python3.5/site-packages/matplotlib/font_manager.py:1297: UserWarning: findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans
  (prop.get_family(), self.defaultFamily[fontext]))

In [391]:
plt.plot(df_history_ts_process['bid-price'][1784:])
plt.plot(df_history_ts_process['bid-price-prev5sec'][1784:])
plt.plot(df_history_ts_process['bid-price-prev10sec'][1784:])
plt.plot(df_history_ts_process['bid-price-prev15sec'][1784:])


Out[391]:
[<matplotlib.lines.Line2D at 0x7fa2338d75f8>]
/home/user/env_py3/lib/python3.5/site-packages/matplotlib/font_manager.py:1297: UserWarning: findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans
  (prop.get_family(), self.defaultFamily[fontext]))

In [392]:
plt.plot(df_history_ts_process['bid-price'][1784:])
plt.plot(df_history_ts_process['bid-price-mv5sec'][1784:])
plt.plot(df_history_ts_process['bid-price-mv10sec'][1784:])
plt.plot(df_history_ts_process['bid-price-mv15sec'][1784:])


Out[392]:
[<matplotlib.lines.Line2D at 0x7fa2339c9908>]
/home/user/env_py3/lib/python3.5/site-packages/matplotlib/font_manager.py:1297: UserWarning: findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans
  (prop.get_family(), self.defaultFamily[fontext]))

In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:
# previous 2 sec Moving Average ['bid-price']

gap = parm_calculate_mv

for gap in range(1, gap+1):
    col_name = 'bid-price-mv'+str(gap)+'sec'
    col_data = pd.DataFrame(columns=[col_name])
    print('Creating : ', col_name)  

    for month in range(0, parm_ts_month):
    #     print('month : ', month)
        col_data.append(col_data_zeros)
        for i in range(0, gap):
            col_data.loc[month*parm_ts_cycle+i] = 0
        for i in range(gap, parm_ts_cycle):
            col_data.loc[month*parm_ts_cycle+i] = df_history_ts_process['bid-price'][month*parm_ts_cycle+i-gap]
  
    df_history_ts_process[col_name] = col_data

print('len : ', len(col_data))


Creating :  bid-price-prev1sec
Creating :  bid-price-prev2sec
Creating :  bid-price-prev3sec
Creating :  bid-price-prev4sec
Creating :  bid-price-prev5sec
Creating :  bid-price-prev6sec

In [ ]:


In [ ]:


In [318]:
# previous N sec
gap = 1
gap = 2
gap = 3
gap = 4
gap = 5
gap = 6
gap = 7
gap = 8
gap = 9
gap = 10

col_name = 'bid-price-prev'+str(gap)+'sec'
col_data = pd.DataFrame(columns=[col_name])

for month in range(0, parm_ts_month):
#     print('month : ', month)
    col_data.append(col_data_zeros)
    for i in range(0, gap):
        col_data.loc[month*parm_ts_cycle+i] = 0
    for i in range(gap, parm_ts_cycle):
        col_data.loc[month*parm_ts_cycle+i] = df_history_ts_process['bid-price'][month*parm_ts_cycle+i]
    
print('len : ', len(col_data))    
df_history_ts_process[col_name] = col_data


month :  0
len :  61
month :  1
len :  122
month :  2
len :  183
month :  3
len :  244
month :  4
len :  305
month :  5
len :  366
month :  6
len :  427
month :  7
len :  488
month :  8
len :  549
month :  9
len :  610
month :  10
len :  671
month :  11
len :  732
month :  12
len :  793
month :  13
len :  854
month :  14
len :  915
month :  15
len :  976
month :  16
len :  1037
month :  17
len :  1098
month :  18
len :  1159
month :  19
len :  1220
month :  20
len :  1281
month :  21
len :  1342
month :  22
len :  1403
month :  23
len :  1464
month :  24
len :  1525
month :  25
len :  1586
month :  26
len :  1647
month :  27
len :  1708
month :  28
len :  1769
month :  29
len :  1830

In [319]:
len(col_data)


Out[319]:
1830

In [ ]:


In [321]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [268]:
# previous 1 sec
gap = 10

col_data = pd.DataFrame({'bid-price-prev'+str(gap)+'sec': np.zeros(gap)})

# for i in range(gap, len(df_history_ts)-1768):
for i in range(gap, parm_ts_cycle):
#     print(df_history_ts['bid-price'][i])
    col_data.loc[i] = df_history_ts['bid-price'][i]

print(len(col_data))


61

In [ ]:


In [249]:
df_history_ts_process = df_history_ts.copy()

In [252]:
df_history_table_process['tmp'] = col_data['bid-price-prev'+str(gap)+'sec']

In [254]:
df_history_table_process.tail()


Out[254]:
ccyy-mm volume-plate deal-price-low deal-price-avg deal-early-second volume-bidder tmp
26 2017-03 10356 87800 87916 55 262010 74000.0
27 2017-04 12196 89800 89850 59 252273 74000.0
28 2017-05 10316 90100 90209 55 270197 74000.0
29 2017-06 10312 89400 89532 45 244349 74000.0
30 2017-07 10325 92200 92250 57 269189 74000.0

In [ ]:


In [ ]:


In [235]:
col_data


Out[235]:
bid-price-prev1sec
0 0.0
1 74000.0
2 74000.0
3 74000.0
4 74000.0
5 74000.0
6 74000.0
7 74000.0
8 74000.0
9 74000.0
10 74000.0
11 74000.0
12 74000.0
13 74000.0
14 74000.0
15 74000.0
16 74000.0
17 74000.0
18 74000.0
19 74000.0
20 74000.0
21 74000.0
22 74000.0
23 74000.0
24 74000.0
25 74000.0
26 74000.0
27 74000.0
28 74000.0
29 74000.0
... ...
31 74000.0
32 74000.0
33 74000.0
34 74000.0
35 74000.0
36 74000.0
37 74000.0
38 74000.0
39 74000.0
40 74000.0
41 74000.0
42 74000.0
43 74000.0
44 74000.0
45 74000.0
46 74000.0
47 74000.0
48 74000.0
49 74000.0
50 74000.0
51 74000.0
52 74000.0
53 74000.0
54 74000.0
55 74000.0
56 74000.0
57 74000.0
58 74000.0
59 74000.0
60 74000.0

61 rows × 1 columns


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:

[3] Modeling Part 2: Python scikit-learn

Models to use:

  • GradientBoostingClassifier
  • RandomForestClassifier
  • AdaBoostClassifier
  • ExtraTreesClassifier
  • BaggingClassifier
  • LogisticRegression
  • SVM kernal RBF
  • SVM kernal Linear
  • KNeighborsClassifier

Import pre-processed data


In [3]:
df_wnv_raw = pd.read_csv('bid_v007_ReScale.csv', encoding='utf-8') 
# df_wnv_raw = pd.read_csv('bid_v010_ts_target.csv', encoding='utf-8') 
# df_wnv_raw = pd.read_csv('bid_v010_ts_target_mean.csv', encoding='utf-8') 
df_wnv_raw.head()


Out[3]:
target.7sec ccyyy.mm.curr ccyyy.mm.prev bid.time bid.sec month bid.curr.mth increment.curr.mth MV.3sec.curr MV.5sec.curr ... R01_Volume.Bidder.curr.mth R01_success.ratio.curr.mth R01_Volume.Plate.prev.mth R01_Volume.Bidder.prev.mth R01_success.ratio.prev.mth R01_d.Volume.Plate R01_d.Volume.Bidder R01_d.success.ratio R01_anomaly R01_target.7sec
0 0 2016-01 2015-12 11:29:35 35 1 81800 0 0.0 0.0 ... 0.0 0.678037 0.0 0.0 0.290318 0.675811 0.611385 0.867524 0.0 0.000000
1 0 2016-01 2015-12 11:29:36 36 1 81800 0 0.0 0.0 ... 0.0 0.678037 0.0 0.0 0.290318 0.675811 0.611385 0.867524 0.0 0.000000
2 100 2016-01 2015-12 11:29:37 37 1 81800 0 0.0 0.0 ... 0.0 0.678037 0.0 0.0 0.290318 0.675811 0.611385 0.867524 0.0 0.071429
3 100 2016-01 2015-12 11:29:38 38 1 81800 0 0.0 0.0 ... 0.0 0.678037 0.0 0.0 0.290318 0.675811 0.611385 0.867524 0.0 0.071429
4 100 2016-01 2015-12 11:29:39 39 1 81800 0 0.0 0.0 ... 0.0 0.678037 0.0 0.0 0.290318 0.675811 0.611385 0.867524 0.0 0.071429

5 rows × 59 columns

Include relevant features


In [441]:
df_wnv_raw = df_history_ts_process
df_history_ts_process.columns


Out[441]:
Index(['ccyy-mm', 'time', 'bid-price', 'bid-price-prev1sec',
       'bid-price-prev2sec', 'bid-price-prev3sec', 'bid-price-prev4sec',
       'bid-price-prev5sec', 'bid-price-prev6sec', 'bid-price-prev7sec',
       'bid-price-prev8sec', 'bid-price-prev9sec', 'bid-price-prev10sec',
       'bid-price-prev11sec', 'bid-price-prev12sec', 'bid-price-prev13sec',
       'bid-price-prev14sec', 'bid-price-prev15sec', 'bid-price-mv2sec',
       'bid-price-mv3sec', 'bid-price-mv4sec', 'bid-price-mv5sec',
       'bid-price-mv6sec', 'bid-price-mv7sec', 'bid-price-mv8sec',
       'bid-price-mv9sec', 'bid-price-mv10sec', 'bid-price-mv11sec',
       'bid-price-mv12sec', 'bid-price-mv13sec', 'bid-price-mv14sec',
       'bid-price-mv15sec'],
      dtype='object')

In [459]:
X = df_wnv_raw[[
#     'ccyy-mm', 
#     'time', 
#     'bid-price', 
    'bid-price-prev1sec',
       'bid-price-prev2sec', 'bid-price-prev3sec', 'bid-price-prev4sec',
       'bid-price-prev5sec', 'bid-price-prev6sec', 'bid-price-prev7sec',
       'bid-price-prev8sec', 'bid-price-prev9sec', 'bid-price-prev10sec',
       'bid-price-prev11sec', 'bid-price-prev12sec', 'bid-price-prev13sec',
       'bid-price-prev14sec', 'bid-price-prev15sec', 
    'bid-price-mv2sec',
       'bid-price-mv3sec', 'bid-price-mv4sec', 'bid-price-mv5sec',
       'bid-price-mv6sec', 'bid-price-mv7sec', 'bid-price-mv8sec',
       'bid-price-mv9sec', 'bid-price-mv10sec', 'bid-price-mv11sec',
       'bid-price-mv12sec', 'bid-price-mv13sec', 'bid-price-mv14sec',
       'bid-price-mv15sec'
        ]]

X_col = X.columns # get the column list
# X = StandardScaler().fit_transform(X.as_matrix())
X = X.as_matrix()
# y = StandardScaler().fit_transform(df_wnv_raw[['bid-price']].as_matrix()).reshape(len(df_wnv_raw),)
y = df_wnv_raw[['bid-price']].as_matrix().reshape(len(df_wnv_raw),)

In [ ]:


In [96]:
X = df_wnv_raw[[
'R01_bid.sec',
'R01_month',
'R01_bid.curr.mth',
'R01_increment.curr.mth',
'R01_MV.3sec.curr',
'R01_MV.5sec.curr',
'R01_MV.3sec.curr.d.Avg.Low.prev.mth',
'R01_MV.5sec.curr.d.Avg.Low.prev.mth',
'R01_bid.prev.mth',
'R01_increment.prev.mth',
'R01_MV.3sec.prev',
'R01_MV.5sec.prev',
'R01_MV.3sec.prev.d.Avg.Low.prev.mth',
'R01_MV.5sec.prev.d.Avg.Low.prev.mth',
'R01_d.Avg.Low.prev.mth',
'R01_increment.curr.mth.d.Avg.Low.prev.mth',
'R01_d.earliest.success.time.sec.prev.mth',
# 'R01_Volume.Plate.curr.mth',
# 'R01_Volume.Bidder.curr.mth',
'R01_success.ratio.curr.mth',
# 'R01_Volume.Plate.prev.mth',
# 'R01_Volume.Bidder.prev.mth',
'R01_success.ratio.prev.mth',
# 'R01_d.Volume.Plate',
# 'R01_d.Volume.Bidder',
# 'R01_d.success.ratio',
'R01_anomaly',
# 'R01_target.7sec'
        ]]

X_col = X.columns # get the column list
# X = StandardScaler().fit_transform(X.as_matrix())
X = X.as_matrix()
y = df_wnv_raw[['R01_target.7sec']].as_matrix().reshape(len(df_wnv_raw),)

In [61]:
X = df_wnv_raw[[
'R01_bid.sec',
'R01_month',
'R01_bid.curr.mth',
'R01_increment.curr.mth',
'R01_MV.3sec.curr',
'R01_MV.5sec.curr',
'R01_MV.3sec.curr.d.Avg.Low.prev.mth',
'R01_MV.5sec.curr.d.Avg.Low.prev.mth',
'R01_bid.prev.mth',
'R01_increment.prev.mth',
'R01_MV.3sec.prev',
'R01_MV.5sec.prev',
'R01_MV.3sec.prev.d.Avg.Low.prev.mth',
'R01_MV.5sec.prev.d.Avg.Low.prev.mth',
'R01_d.Avg.Low.prev.mth',
'R01_increment.curr.mth.d.Avg.Low.prev.mth',
'R01_d.earliest.success.time.sec.prev.mth',
'R01_Volume.Plate.curr.mth',
'R01_Volume.Bidder.curr.mth',
'R01_success.ratio.curr.mth',
'R01_Volume.Plate.prev.mth',
'R01_Volume.Bidder.prev.mth',
'R01_success.ratio.prev.mth',
'R01_d.Volume.Plate',
'R01_d.Volume.Bidder',
'R01_d.success.ratio',
'R01_anomaly',
# 'R01_target.7sec'
        ]]

X_col = X.columns # get the column list
# X = StandardScaler().fit_transform(X.as_matrix())
X = X.as_matrix()
y = df_wnv_raw[['R01_target.7sec']].as_matrix().reshape(len(df_wnv_raw),)

In [60]:
X = df_wnv_raw[[
# 'R01_bid.sec',
# 'R01_month',
'R01_bid.curr.mth',
# 'R01_increment.curr.mth',
# 'R01_MV.3sec.curr',
# 'R01_MV.5sec.curr',
# 'R01_MV.3sec.curr.d.Avg.Low.prev.mth',
# 'R01_MV.5sec.curr.d.Avg.Low.prev.mth',
# 'R01_bid.prev.mth',
# 'R01_increment.prev.mth',
# 'R01_MV.3sec.prev',
# 'R01_MV.5sec.prev',
# 'R01_MV.3sec.prev.d.Avg.Low.prev.mth',
# 'R01_MV.5sec.prev.d.Avg.Low.prev.mth',
# 'R01_d.Avg.Low.prev.mth',
# 'R01_increment.curr.mth.d.Avg.Low.prev.mth',
# 'R01_d.earliest.success.time.sec.prev.mth',
# 'R01_Volume.Plate.curr.mth',
# 'R01_Volume.Bidder.curr.mth',
# 'R01_success.ratio.curr.mth',
# 'R01_Volume.Plate.prev.mth',
# 'R01_Volume.Bidder.prev.mth',
# 'R01_success.ratio.prev.mth',
# 'R01_d.Volume.Plate',
# 'R01_d.Volume.Bidder',
# 'R01_d.success.ratio',
# 'R01_anomaly',
# 'R01_target.7sec'
        ]]

X_col = X.columns # get the column list
# X = StandardScaler().fit_transform(X.as_matrix())
X = X.as_matrix()
y = df_wnv_raw[['R01_target.7sec']].as_matrix().reshape(len(df_wnv_raw),)

In [443]:
X_col


Out[443]:
Index(['time', 'bid-price-prev1sec', 'bid-price-prev2sec',
       'bid-price-prev3sec', 'bid-price-prev4sec', 'bid-price-prev5sec',
       'bid-price-prev6sec', 'bid-price-prev7sec', 'bid-price-prev8sec',
       'bid-price-prev9sec', 'bid-price-prev10sec', 'bid-price-prev11sec',
       'bid-price-prev12sec', 'bid-price-prev13sec', 'bid-price-prev14sec',
       'bid-price-prev15sec', 'bid-price-mv2sec', 'bid-price-mv3sec',
       'bid-price-mv4sec', 'bid-price-mv5sec', 'bid-price-mv6sec',
       'bid-price-mv7sec', 'bid-price-mv8sec', 'bid-price-mv9sec',
       'bid-price-mv10sec', 'bid-price-mv11sec', 'bid-price-mv12sec',
       'bid-price-mv13sec', 'bid-price-mv14sec', 'bid-price-mv15sec'],
      dtype='object')

In [460]:
# plt.plot(X)
plt.plot(y)


Out[460]:
[<matplotlib.lines.Line2D at 0x7fa231ccd4e0>]
/home/user/env_py3/lib/python3.5/site-packages/matplotlib/font_manager.py:1297: UserWarning: findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans
  (prop.get_family(), self.defaultFamily[fontext]))

In [ ]:

[4] Evaluation

K-fold Cross-Validation


In [461]:
rng = check_random_state(0)

In [462]:
# GB
classifier_GB = GradientBoostingRegressor(n_estimators=1500, # score: 0.94608 (AUC 0.81419), learning_rate=0.001, max_features=8 <<< Best
#                                    loss='deviance',
#                                    subsample=1,
#                                    max_depth=5,
#                                    min_samples_split=20,
                                   learning_rate=0.002,
#                                    max_features=10,
                                   random_state=rng)

In [463]:
# AB
classifier_AB = AdaBoostRegressor(n_estimators=1500, # score: 0.93948 (AUC 0.88339), learning_rate=0.004 <<< Best
                                   learning_rate=0.002,
                                   random_state=rng)

In [464]:
# RF
classifier_RF = RandomForestRegressor(n_estimators=1500, # score: 0.94207 (AUC 0.81870), max_depth=3, min_samples_split=20, <<< Best
#                                     max_features=10,
#                                     max_depth=3,
#                                     min_samples_split=20,
                                    random_state=rng)

In [465]:
# ET
classifier_ET = ExtraTreesRegressor(n_estimators=1000, # score: 0.94655 (AUC 0.84364), max_depth=3, min_samples_split=20, max_features=10 <<< Best
#                                     max_depth=3,
#                                     min_samples_split=20,
#                                     max_features=10,
                                    random_state=rng)

In [466]:
# BG
classifier_BG = BaggingRegressor(n_estimators=500, # score: 0.70725 (AUC 0.63729) <<< Best
#                                     max_features=10,
                                    random_state=rng)

LR


In [467]:
classifier_LR = LinearRegression() # score: 0.90199 (AUC 0.80569)

SVM Linear


In [468]:
# classifier_SVCL = svm.SVC(kernel='linear', probability=True, random_state=rng) # score: 0.89976 (AUC 0.70524)
classifier_SVRL = svm.SVR() # score: 0.89976 (AUC 0.70524)

SVM


In [469]:
classifier_SVCR = svm.SVR(kernel='rbf') # score: 0.80188 (AUC 0.50050)
# classifier_SVRR = svm.SVR(kernel='poly') # score: 0.80188 (AUC 0.50050)

KNN


In [470]:
classifier_KNN = KNeighborsRegressor(n_neighbors=2) # score: 0.94018 (AUC 0.72792)
cv = cross_val_score(classifier_KNN,
                            X,
                            y,
                            cv=StratifiedKFold(19))
print('KNN CV score: {0:.5f}'.format(cv.mean()))


/home/user/env_py3/lib/python3.5/site-packages/sklearn/model_selection/_split.py:581: Warning: The least populated class in y has only 1 members, which is too few. The minimum number of groups for any class cannot be less than n_splits=19.
  % (min_groups, self.n_splits)), Warning)
KNN CV score: 0.97155

In [ ]:

Select Model


In [471]:
# classifier = classifier_GB   # 324.632308296
# classifier = classifier_AB   # 429.646733221
# classifier = classifier_RF   # 175.504322802
# classifier = classifier_ET   # 172.097916817, 0.0724812030075
classifier = classifier_BG   # 175.451381872
# classifier = classifier_LR     # 128.465059749, 0.11
# classifier = classifier_SVRL # 3789.82169312
# classifier = classifier_SVRR # 3789.82169312, 0.10754224349

Split Data


In [472]:
n_splits=61 # 19 seconds/records for each bidding month
# n_splits=54 # 19 seconds/records for each bidding month
# n_splits=19 # 19 seconds/records for each bidding month
n_fold = 30

# X_train_1 = X[0:(len(X)-batch*n_splits)]
# y_train_1 = y[0:(len(X)-batch*n_splits)]

# X_test_1 = X[(len(X)-batch*n_splits):((len(X)-batch*n_splits)+n_splits)]
# y_test_1 = y[(len(X)-batch*n_splits):((len(X)-batch*n_splits)+n_splits)]

CV


In [473]:
y_pred = {}
y_test = {}

i = 0
for batch in range(1, n_fold):
    X_train_1 = X[0:(len(X)-batch*n_splits)]
    y_train_1 = y[0:(len(X)-batch*n_splits)]
    X_test_1  = X[(len(X)-batch*n_splits):((len(X)-batch*n_splits)+n_splits)]
    y_test_1  = y[(len(X)-batch*n_splits):((len(X)-batch*n_splits)+n_splits)]
    print(len(X_train_1))

    y_pred[i] = classifier.fit(X_train_1, y_train_1).predict(X_test_1)
#     y_pred[i] = classifier.fit(X_train_1, y_train_1).predict(X_test_1[0:10])
#     y_pred[i] = classifier.fit(X_train_1, y_train_1).predict(X_test_1[10:19])
    y_test[i] = y_test_1  
    plt.figure()
    plt.plot(y_train_1)
    plt.plot()
    plt.figure()
    plt.plot(y_test[i])
    plt.plot(y_pred[i])
    plt.plot()
    i += 1


1769
1708
1647
1586
1525
1464
1403
1342
1281
1220
1159
/home/user/env_py3/lib/python3.5/site-packages/matplotlib/pyplot.py:524: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).
  max_open_warning, RuntimeWarning)
1098
/home/user/env_py3/lib/python3.5/site-packages/matplotlib/pyplot.py:524: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).
  max_open_warning, RuntimeWarning)
1037
/home/user/env_py3/lib/python3.5/site-packages/matplotlib/pyplot.py:524: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).
  max_open_warning, RuntimeWarning)
976
/home/user/env_py3/lib/python3.5/site-packages/matplotlib/pyplot.py:524: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).
  max_open_warning, RuntimeWarning)
915
/home/user/env_py3/lib/python3.5/site-packages/matplotlib/pyplot.py:524: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).
  max_open_warning, RuntimeWarning)
854
/home/user/env_py3/lib/python3.5/site-packages/matplotlib/pyplot.py:524: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).
  max_open_warning, RuntimeWarning)
793
/home/user/env_py3/lib/python3.5/site-packages/matplotlib/pyplot.py:524: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).
  max_open_warning, RuntimeWarning)
732
/home/user/env_py3/lib/python3.5/site-packages/matplotlib/pyplot.py:524: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).
  max_open_warning, RuntimeWarning)
671
/home/user/env_py3/lib/python3.5/site-packages/matplotlib/pyplot.py:524: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).
  max_open_warning, RuntimeWarning)
610
/home/user/env_py3/lib/python3.5/site-packages/matplotlib/pyplot.py:524: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).
  max_open_warning, RuntimeWarning)
549
/home/user/env_py3/lib/python3.5/site-packages/matplotlib/pyplot.py:524: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).
  max_open_warning, RuntimeWarning)
488
/home/user/env_py3/lib/python3.5/site-packages/matplotlib/pyplot.py:524: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).
  max_open_warning, RuntimeWarning)
427
/home/user/env_py3/lib/python3.5/site-packages/matplotlib/pyplot.py:524: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).
  max_open_warning, RuntimeWarning)
366
/home/user/env_py3/lib/python3.5/site-packages/matplotlib/pyplot.py:524: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).
  max_open_warning, RuntimeWarning)
305
/home/user/env_py3/lib/python3.5/site-packages/matplotlib/pyplot.py:524: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).
  max_open_warning, RuntimeWarning)
244
/home/user/env_py3/lib/python3.5/site-packages/matplotlib/pyplot.py:524: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).
  max_open_warning, RuntimeWarning)
183
/home/user/env_py3/lib/python3.5/site-packages/matplotlib/pyplot.py:524: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).
  max_open_warning, RuntimeWarning)
122
/home/user/env_py3/lib/python3.5/site-packages/matplotlib/pyplot.py:524: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).
  max_open_warning, RuntimeWarning)
61
/home/user/env_py3/lib/python3.5/site-packages/matplotlib/pyplot.py:524: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).
  max_open_warning, RuntimeWarning)
/home/user/env_py3/lib/python3.5/site-packages/matplotlib/font_manager.py:1297: UserWarning: findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans
  (prop.get_family(), self.defaultFamily[fontext]))

In [474]:
k = []
for i in range(0, len(y_test)):
    k.append(np.mean(np.sqrt(np.square(y_test[i] - y_pred[i]))))

k_mean = np.mean(k)

print(k_mean)
print()
print(k)


551.925297394

[122.74447387653989, 154.49689211193177, 425.60032875885753, 96.189199806227052, 151.70372521486129, 277.13784656743684, 121.38881091349954, 206.4945318054875, 697.94537282291321, 126.30753974115225, 122.05361275931774, 926.56823226086794, 80.924110476240344, 88.712505296870575, 126.27182230868971, 53.772582251918749, 74.618117015477225, 122.77751918833347, 97.831762080095359, 216.84658543942194, 1568.7498790114798, 136.66413421813422, 177.204387875864, 1658.7734173931715, 262.17615948714501, 1364.7974322853875, 3769.6769447888396, 588.88110850897749, 2188.5245901639346]

In [478]:
k = []
for i in range(0, len(y_test)):
    k.append(np.mean(np.sqrt(np.square(y_test[i][45:55] - y_pred[i][45:55]))))

k_mean = np.mean(k)

print(k_mean)
print()
print(k)


548.026756351

[39.850000000000001, 35.759999999999124, 612.19999999999709, 41.95969726225885, 60.238071428568219, 60.679999999998834, 96.806141691481756, 148.15999999999912, 766.39999999999418, 97.848666666662027, 61.030000000000292, 1292.3999999999942, 20.846333333331859, 39.060357142855352, 32.819999999999709, 58.219999999999708, 81.87999999999883, 42.479999999998839, 30.976666666666279, 42.799999999999997, 1765.3999999999942, 32.199999999998546, 52.079999999998833, 2080.0, 376.68000000000319, 1138.6000000000058, 3885.3999999999942, 600.0, 2300.0]

In [475]:
k = []
for i in range(0, len(y_test)):
    k.append(np.mean(np.sqrt(np.square(y_test[i][13:16] - y_pred[i][13:16]))))

k_mean = np.mean(k)

print(k_mean)
print()
print(k)


436.232727408

[10.262275132306968, 53.75, 53.866666666663754, 3.815956339443801, 77.333333333338189, 154.53333333333043, 0.93333333333430346, 1.1067690642667003, 382.33333333333331, 21.755555555554263, 68.066666666670542, 447.39999999999901, 35.095555555560473, 59.200000000001943, 35.200000000001943, 6.333333333333333, 27.133333333336243, 153.03357142855626, 55.333333333333336, 304.20000000000192, 1045.6888888888934, 82.933333333334303, 228.0, 1104.1111111111143, 288.0, 1611.9333333333391, 3639.3960784313676, 600.0, 2100.0]

In [ ]:


In [122]:
y_test[1][13:]


Out[122]:
array([ 0.28571429,  0.28571429,  0.35714286,  0.35714286,  0.35714286,
        0.42857143])

In [123]:
y_pred[1][13:]


Out[123]:
array([ 0.36771429,  0.38057143,  0.39428571,  0.396     ,  0.39685714,
        0.4775    ])

In [127]:
np.mean(np.sqrt(np.square(y_test[4] - y_pred[4])))


Out[127]:
0.13372932330826978

In [128]:
np.mean(np.sqrt(np.square(y_test[4][13:16] - y_pred[4][13:16])))


Out[128]:
0.084761904761906912

In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [436]:
y_pred_df = pd.DataFrame.from_dict(y_pred)

In [437]:
y_pred_df.columns=['month 7','month 6','month 5','month 4','month 3','month 2','month 1']

In [367]:
y_pred_df.to_csv('bid_results_v001.csv', index=False)

In [438]:
y_pred_df


Out[438]:
month 7 month 6 month 5 month 4 month 3 month 2 month 1
0 0.178571 0.086286 0.101286 0.098714 0.064571 0.086000 0.078571
1 0.184571 0.127286 0.110143 0.127143 0.075857 0.095714 0.084857
2 0.246571 0.180286 0.191000 0.174571 0.097143 0.107000 0.140571
3 0.282571 0.189571 0.345143 0.174857 0.189286 0.147286 0.133714
4 0.337143 0.193286 0.374571 0.155857 0.231143 0.174571 0.146857
5 0.353429 0.204429 0.405286 0.158286 0.261714 0.199000 0.191571
6 0.404000 0.237000 0.426286 0.162571 0.289143 0.308143 0.195429
7 0.433571 0.239429 0.446571 0.233286 0.302143 0.416714 0.204571
8 0.443714 0.256143 0.452571 0.262714 0.323000 0.485143 0.205143
9 0.512571 0.253143 0.472143 0.311857 0.327143 0.531714 0.243571
10 0.538000 0.303286 0.499571 0.338143 0.334714 0.576714 0.245857
11 0.559429 0.322000 0.547429 0.353429 0.350143 0.616714 0.268714
12 0.602571 0.327286 0.579714 0.361857 0.356143 0.616857 0.343143
13 0.674429 0.342714 0.615714 0.405714 0.361857 0.680857 0.393857
14 0.717286 0.363429 0.634429 0.446714 0.391857 0.695429 0.449857
15 0.795143 0.379000 0.709143 0.484571 0.475143 0.750286 0.503143
16 0.802857 0.381714 0.754143 0.503571 0.529286 0.776571 0.531429
17 0.803571 0.385571 0.784857 0.522429 0.566571 0.775571 0.567857
18 0.816571 0.487571 0.860714 0.594571 0.647714 0.780571 0.591286

In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


The End