By: 顾瞻 GU Zhan (Sam)

July 2017

[2] Data pre-porcessing

Explore and visualize data



In [1]:

    
# from __future__ import print_function, division
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
import pandas as pd
import operator
from scipy import interp
from itertools import cycle
from sklearn import svm
from sklearn.utils.validation import check_random_state
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import roc_curve, auc
from statsmodels.graphics.mosaicplot import mosaic
print(__doc__)









    



Automatically created module for IPython interactive environment

Read raw data



In [2]:

    
df_history_ts = pd.read_csv('data/history_ts.csv') 
df_history_ts_process = df_history_ts.copy()
df_history_ts_process.tail()



In [3]:

    
df_history_table = pd.read_csv('data/history_table.csv') 
df_history_table_process = df_history_table.copy()
df_history_table_process.tail()









    Out[3]:







  
    
      
      ccyy-mm
      volume-plate
      deal-price-low
      deal-price-avg
      deal-early-second
      volume-bidder
    
  
  
    
      26
      2017-03
      10356
      87800
      87916
      55
      262010
    
    
      27
      2017-04
      12196
      89800
      89850
      59
      252273
    
    
      28
      2017-05
      10316
      90100
      90209
      55
      270197
    
    
      29
      2017-06
      10312
      89400
      89532
      45
      244349
    
    
      30
      2017-07
      10325
      92200
      92250
      57
      269189

Parameters



In [4]:

    
parm_ts_cycle = 61 # seconds/records per month
print('parm_ts_cycle : %d seconds' % parm_ts_cycle)
parm_ts_month = int(len(df_history_ts) / parm_ts_cycle)
print('parm_ts_month : %d months' %  parm_ts_month)

parm_calculate_base_price_second = 15 # Use the current month's bid-price as base-price at this seconds. Later to derive increment-price
parm_calculate_prev_bp = 15 # Number of previous price/increment to include, i.e. previous 2sec, 3sec, 4sec, 5sec ... 15sec
parm_calculate_mv = 15 # Number of  previous price/increment Moving Average to calculate, i.e. previous 2sec, 3sec, 4sec, 5sec ... 15sec
parm_calculate_target_second = 7 # How many seconds in future to predict: target variable
parm_calculate_prev_month = 3 # Number of previous month to include (need to remove earliest x month from training data)

parm_record_cut_row_head = max(parm_calculate_base_price_second, parm_calculate_prev_bp, parm_calculate_mv)
parm_record_cut_row_tail = parm_calculate_target_second
parm_record_cut_month_head = parm_calculate_prev_month + 1

parm_ts_valid_cycle = parm_ts_cycle - parm_record_cut_row_head - parm_record_cut_row_tail
print('parm_ts_valid_cycle : %d seconds' % parm_ts_valid_cycle)
parm_ts_valid_month = parm_ts_month - parm_record_cut_month_head
print('parm_ts_valid_month : %d months' % parm_ts_valid_month)

if parm_record_cut_month_head < 10:
    parm_record_cut_ccyy = pd.to_datetime('2015-0'+str(parm_record_cut_month_head))
else:
    parm_record_cut_ccyy = pd.to_datetime('2015-'+str(parm_record_cut_month_head))

print('parm_record_cut_ccyy : %s' % parm_record_cut_ccyy)

print('parm_record_cut_month_head : %d months' % parm_record_cut_month_head)
print('parm_record_cut_row_head :  %d seconds' % parm_record_cut_row_head)
print('parm_record_cut_row_tail :  %d seconds' % parm_record_cut_row_tail)
print(' : ' )
print(' : ' )
print(' : ' )









    



parm_ts_cycle : 61 seconds
parm_ts_month : 31 months
parm_ts_valid_cycle : 39 seconds
parm_ts_valid_month : 27 months
parm_record_cut_ccyy : 2015-04-01 00:00:00
parm_record_cut_month_head : 4 months
parm_record_cut_row_head :  15 seconds
parm_record_cut_row_tail :  7 seconds
 : 
 : 
 :



In [5]:

    
df_history_ts_process.head()



In [ ]:

Prepare derived features

Process: df_history_ts_process



In [6]:

    
# date of current month
df_history_ts_process['date-curr'] = df_history_ts_process.apply(lambda row: pd.to_datetime(row['ccyy-mm']), axis=1)

# date of previous month
df_history_ts_process['date-prev'] = df_history_ts_process.apply(lambda row: row['date-curr'] - pd.offsets.MonthBegin(1), axis=1)


# Year
df_history_ts_process['year'] = df_history_ts_process.apply(lambda row: row['ccyy-mm'][0:4], axis=1)

# Month
df_history_ts_process['month'] = df_history_ts_process.apply(lambda row: row['ccyy-mm'][5:7], axis=1)

# Hour
df_history_ts_process['hour'] = df_history_ts_process.apply(lambda row: row['time'][0:2], axis=1)

# Minute
df_history_ts_process['minute'] = df_history_ts_process.apply(lambda row: row['time'][3:5], axis=1)

# Second
df_history_ts_process['second'] = df_history_ts_process.apply(lambda row: row['time'][6:8], axis=1)



In [9]:

    
# df_history_ts_process
# df_history_ts_process[1768:]



In [10]:

    
# new ['base-price']
gap = 1 # only one new feature/column

for gap in range(1, gap+1):
    col_name = 'base-price'+str(parm_calculate_base_price_second)+'sec'
    col_name_base_price = col_name
    col_data = pd.DataFrame(columns=[col_name])
    print('Creating : ', col_name)  

    for month in range(0, parm_ts_month):
        for i in range(0, parm_ts_cycle):
            col_data.loc[month*parm_ts_cycle+i] = df_history_ts_process['bid-price'][month*parm_ts_cycle+parm_calculate_base_price_second]
  
    df_history_ts_process[col_name] = col_data

print('Total records processed : ', len(col_data))









    



Creating :  base-price15sec
Total records processed :  1891



In [11]:

    
# df_history_ts_process
# df_history_ts_process[1768:]



In [12]:

    
# new ['increment-price'] = ['bid-price'] - ['base-price']

df_history_ts_process['increment-price'] = df_history_ts_process.apply(lambda row: row['bid-price'] - row[col_name_base_price], axis=1)



In [13]:

    
# df_history_ts_process
# df_history_ts_process[1768:]



In [14]:

    
plt.figure()
plt.plot(df_history_ts_process['bid-price'])
plt.plot(df_history_ts_process[col_name_base_price])
plt.plot()
plt.figure()
plt.plot(df_history_ts_process['increment-price'])
plt.plot()









    Out[14]:





[]






    



/home/user/env_py3/lib/python3.5/site-packages/matplotlib/font_manager.py:1297: UserWarning: findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans
  (prop.get_family(), self.defaultFamily[fontext]))

['increment-price-target']



In [15]:

    
# previous N sec ['increment-price-target']

for gap in range(1, 2):
    col_name = 'increment-price-target'
    col_data = pd.DataFrame(columns=[col_name])
    print('Creating : ', col_name)  

    for month in range(0, parm_ts_month):
    #     print('month : ', month)
        for i in range(0, (parm_ts_cycle - parm_calculate_target_second)):
            col_data.loc[month*parm_ts_cycle+i] = df_history_ts_process['increment-price'][month*parm_ts_cycle+i+parm_calculate_target_second]
        for i in range((parm_ts_cycle - parm_calculate_target_second), parm_ts_cycle):
            col_data.loc[month*parm_ts_cycle+i] = 0
  
    df_history_ts_process[col_name] = col_data

print('Total records processed : ', len(col_data))









    



Creating :  increment-price-target
Total records processed :  1891



In [16]:

    
plt.figure()
plt.plot(df_history_ts_process['increment-price'])
plt.plot(df_history_ts_process['increment-price-target'])
plt.plot()

plt.figure()
plt.plot(df_history_ts_process['increment-price'][1768:])
plt.plot(df_history_ts_process['increment-price-target'][1768:])
plt.plot()









    Out[16]:





[]






    



/home/user/env_py3/lib/python3.5/site-packages/matplotlib/font_manager.py:1297: UserWarning: findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans
  (prop.get_family(), self.defaultFamily[fontext]))



In [ ]:



In [17]:

    
# previous 'parm_calculate_prev_bp' sec ['increment-price']
gap = parm_calculate_prev_bp

for gap in range(1, gap+1):
    col_name = 'increment-price-prev'+str(gap)+'sec'
    col_data = pd.DataFrame(columns=[col_name])
#     col_data_zeros = pd.DataFrame({col_name: np.zeros(gap)})
    print('Creating : ', col_name)  

    for month in range(0, parm_ts_month):
    #     print('month : ', month)
#         col_data.append(col_data_zeros)
        for i in range(0, gap):
            col_data.loc[month*parm_ts_cycle+i] = 0
        for i in range(gap, parm_ts_cycle):
            col_data.loc[month*parm_ts_cycle+i] = df_history_ts_process['increment-price'][month*parm_ts_cycle+i-gap]
  
    df_history_ts_process[col_name] = col_data

print('Total records processed : ', len(col_data))









    



Creating :  increment-price-prev1sec
Creating :  increment-price-prev2sec
Creating :  increment-price-prev3sec
Creating :  increment-price-prev4sec
Creating :  increment-price-prev5sec
Creating :  increment-price-prev6sec
Creating :  increment-price-prev7sec
Creating :  increment-price-prev8sec
Creating :  increment-price-prev9sec
Creating :  increment-price-prev10sec
Creating :  increment-price-prev11sec
Creating :  increment-price-prev12sec
Creating :  increment-price-prev13sec
Creating :  increment-price-prev14sec
Creating :  increment-price-prev15sec
Total records processed :  1891



In [18]:

    
# previous 'parm_calculate_mv' sec Moving Average ['increment-price']

gap = parm_calculate_mv

for gap in range(2, gap+1): # MV starts from 2 seconds, till parm_calculate_mv
    col_name = 'increment-price-mv'+str(gap)+'sec'
    col_data = pd.DataFrame(columns=[col_name])
    print('Creating : ', col_name)  

    for month in range(0, parm_ts_month):
    #     print('month : ', month)
        for i in range(0, gap):
            col_data.loc[month*parm_ts_cycle+i] = 0
        for i in range(gap, parm_ts_cycle):
            col_data.loc[month*parm_ts_cycle+i] = \
            np.mean(df_history_ts_process['increment-price'][month*parm_ts_cycle+i-gap:month*parm_ts_cycle+i])
  
    df_history_ts_process[col_name] = col_data

print('Total records processed : ', len(col_data))









    



Creating :  increment-price-mv2sec
Creating :  increment-price-mv3sec
Creating :  increment-price-mv4sec
Creating :  increment-price-mv5sec
Creating :  increment-price-mv6sec
Creating :  increment-price-mv7sec
Creating :  increment-price-mv8sec
Creating :  increment-price-mv9sec
Creating :  increment-price-mv10sec
Creating :  increment-price-mv11sec
Creating :  increment-price-mv12sec
Creating :  increment-price-mv13sec
Creating :  increment-price-mv14sec
Creating :  increment-price-mv15sec
Total records processed :  1891



In [ ]:

    
# df_history_ts_process[1768:]



In [19]:

    
plt.figure()
plt.plot(df_history_ts_process['increment-price'][1768:])
plt.plot(df_history_ts_process['increment-price-prev3sec'][1768:])
plt.plot(df_history_ts_process['increment-price-prev7sec'][1768:])
plt.plot(df_history_ts_process['increment-price-prev11sec'][1768:])
plt.plot(df_history_ts_process['increment-price-prev15sec'][1768:])
plt.plot()









    Out[19]:





[]






    



/home/user/env_py3/lib/python3.5/site-packages/matplotlib/font_manager.py:1297: UserWarning: findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans
  (prop.get_family(), self.defaultFamily[fontext]))



In [20]:

    
plt.figure()
plt.plot(df_history_ts_process['increment-price'][1768:])
plt.plot(df_history_ts_process['increment-price-mv3sec'][1768:])
plt.plot(df_history_ts_process['increment-price-mv7sec'][1768:])
plt.plot(df_history_ts_process['increment-price-mv11sec'][1768:])
plt.plot(df_history_ts_process['increment-price-mv15sec'][1768:])
plt.plot()









    Out[20]:





[]






    



/home/user/env_py3/lib/python3.5/site-packages/matplotlib/font_manager.py:1297: UserWarning: findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans
  (prop.get_family(), self.defaultFamily[fontext]))



In [ ]:

Process: df_history_table_process



In [21]:

    
df_history_table_process.tail()









    Out[21]:







  
    
      
      ccyy-mm
      volume-plate
      deal-price-low
      deal-price-avg
      deal-early-second
      volume-bidder
    
  
  
    
      26
      2017-03
      10356
      87800
      87916
      55
      262010
    
    
      27
      2017-04
      12196
      89800
      89850
      59
      252273
    
    
      28
      2017-05
      10316
      90100
      90209
      55
      270197
    
    
      29
      2017-06
      10312
      89400
      89532
      45
      244349
    
    
      30
      2017-07
      10325
      92200
      92250
      57
      269189



In [22]:

    
# date of current month
df_history_table_process['date-curr'] = df_history_table_process.apply(lambda row: pd.to_datetime(row['ccyy-mm']), axis=1)
df_history_table_process['d-avg-low-price'] = df_history_table_process.apply(lambda row: row['deal-price-avg'] - row['deal-price-low'], axis=1)
df_history_table_process['ratio-bid'] = df_history_table_process.apply(lambda row: row['volume-plate'] / row['volume-bidder'], axis=1)

Merge dataframe



In [23]:

    
df_history_ts_process = pd.merge(df_history_ts_process, df_history_table_process[['date-curr', 'volume-plate', 'ratio-bid']], how = 'left', left_on = 'date-curr', right_on = 'date-curr')



In [24]:

    
df_history_ts_process = pd.merge(df_history_ts_process, df_history_table_process[['date-curr', 'volume-plate', 'ratio-bid', 'deal-early-second', 'deal-price-avg']], how = 'left', left_on = 'date-prev', right_on = 'date-curr')



In [ ]:

Shift to copy previous 'parm_calculate_prev_month' month's data into current row



In [ ]:



In [ ]:



In [ ]:

Housekeeping to remove some invald data during pre-processing



In [26]:

    
df_history_ts_process.columns









    Out[26]:





Index(['ccyy-mm', 'time', 'bid-price', 'date-curr_x', 'date-prev', 'year',
       'month', 'hour', 'minute', 'second', 'base-price15sec',
       'increment-price', 'increment-price-target', 'increment-price-prev1sec',
       'increment-price-prev2sec', 'increment-price-prev3sec',
       'increment-price-prev4sec', 'increment-price-prev5sec',
       'increment-price-prev6sec', 'increment-price-prev7sec',
       'increment-price-prev8sec', 'increment-price-prev9sec',
       'increment-price-prev10sec', 'increment-price-prev11sec',
       'increment-price-prev12sec', 'increment-price-prev13sec',
       'increment-price-prev14sec', 'increment-price-prev15sec',
       'increment-price-mv2sec', 'increment-price-mv3sec',
       'increment-price-mv4sec', 'increment-price-mv5sec',
       'increment-price-mv6sec', 'increment-price-mv7sec',
       'increment-price-mv8sec', 'increment-price-mv9sec',
       'increment-price-mv10sec', 'increment-price-mv11sec',
       'increment-price-mv12sec', 'increment-price-mv13sec',
       'increment-price-mv14sec', 'increment-price-mv15sec', 'volume-plate_x',
       'ratio-bid_x', 'date-curr_y', 'volume-plate_y', 'ratio-bid_y',
       'deal-early-second', 'deal-price-avg', 'deal-price-avg'],
      dtype='object')



In [27]:

    
# housekeeping: delete some columns
df_history_ts_process.drop('date-curr_y', axis=1, inplace=True)



In [28]:

    
# remove first 'parm_record_cut_ccyy' months from dataset
df_history_ts_process = df_history_ts_process[df_history_ts_process['date-curr_x'] > parm_record_cut_ccyy]
df_history_ts_process = df_history_ts_process[df_history_ts_process['date-prev'] > parm_record_cut_ccyy]



In [29]:

    
# total 61 seconds/rows per month:
# remove first 'parm_record_cut_row_head' reconds
# remove last 'parm_record_cut_row_tail' reconds
df_history_ts_process = df_history_ts_process[df_history_ts_process['second'] >= str(parm_record_cut_row_head) ]
df_history_ts_process = df_history_ts_process[df_history_ts_process['second'] <= str(60 - parm_record_cut_row_tail) ]
# df_history_ts_process = df_history_ts_process[df_history_ts_process['second'] > parm_record_cut_row_head ]



In [30]:

    
# Reset index after housekeeping
df_history_ts_process = df_history_ts_process.reset_index(drop=True)



In [32]:

    
df_history_ts_process.tail()









    Out[32]:







  
    
      
      ccyy-mm
      time
      bid-price
      date-curr_x
      date-prev
      year
      month
      hour
      minute
      second
      ...
      increment-price-mv13sec
      increment-price-mv14sec
      increment-price-mv15sec
      volume-plate_x
      ratio-bid_x
      volume-plate_y
      ratio-bid_y
      deal-early-second
      deal-price-avg
      deal-price-avg
    
  
  
    
      1048
      2017-07
      11:29:49
      91400
      2017-07-01
      2017-06-01
      2017
      07
      11
      29
      49
      ...
      461.538
      442.857
      420
      10325
      0.038356
      10312.0
      0.042202
      45.0
      89532.0
      89532.0
    
    
      1049
      2017-07
      11:29:50
      91500
      2017-07-01
      2017-06-01
      2017
      07
      11
      29
      50
      ...
      507.692
      485.714
      466.667
      10325
      0.038356
      10312.0
      0.042202
      45.0
      89532.0
      89532.0
    
    
      1050
      2017-07
      11:29:51
      91600
      2017-07-01
      2017-06-01
      2017
      07
      11
      29
      51
      ...
      553.846
      535.714
      513.333
      10325
      0.038356
      10312.0
      0.042202
      45.0
      89532.0
      89532.0
    
    
      1051
      2017-07
      11:29:52
      91700
      2017-07-01
      2017-06-01
      2017
      07
      11
      29
      52
      ...
      600
      585.714
      566.667
      10325
      0.038356
      10312.0
      0.042202
      45.0
      89532.0
      89532.0
    
    
      1052
      2017-07
      11:29:53
      91800
      2017-07-01
      2017-06-01
      2017
      07
      11
      29
      53
      ...
      653.846
      635.714
      620
      10325
      0.038356
      10312.0
      0.042202
      45.0
      89532.0
      89532.0
    
  

5 rows × 49 columns



In [31]:

    
plt.figure()
plt.plot(df_history_ts_process['increment-price'][974:])
plt.plot(df_history_ts_process['increment-price-mv3sec'][974:])
plt.plot(df_history_ts_process['increment-price-mv7sec'][974:])
plt.plot(df_history_ts_process['increment-price-mv11sec'][974:])
plt.plot(df_history_ts_process['increment-price-mv15sec'][974:])
plt.plot()









    Out[31]:





[]






    



/home/user/env_py3/lib/python3.5/site-packages/matplotlib/font_manager.py:1297: UserWarning: findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans
  (prop.get_family(), self.defaultFamily[fontext]))



In [ ]:

[3] Modeling Part 2: Python scikit-learn

Models to use:

GradientBoostingClassifier
RandomForestClassifier
AdaBoostClassifier
ExtraTreesClassifier
BaggingClassifier
LogisticRegression
SVM kernal RBF
SVM kernal Linear
KNeighborsClassifier

Import pre-processed data



In [33]:

    
df_history_ts_process.head()









    Out[33]:







  
    
      
      ccyy-mm
      time
      bid-price
      date-curr_x
      date-prev
      year
      month
      hour
      minute
      second
      ...
      increment-price-mv13sec
      increment-price-mv14sec
      increment-price-mv15sec
      volume-plate_x
      ratio-bid_x
      volume-plate_y
      ratio-bid_y
      deal-early-second
      deal-price-avg
      deal-price-avg
    
  
  
    
      0
      2015-05
      11:29:15
      78400
      2015-05-01
      2015-04-01
      2015
      05
      11
      29
      15
      ...
      -92.3077
      -92.8571
      -93.3333
      7482
      0.047959
      8288.0
      0.05442
      41.0
      80759.0
      80759.0
    
    
      1
      2015-05
      11:29:16
      78400
      2015-05-01
      2015-04-01
      2015
      05
      11
      29
      16
      ...
      -84.6154
      -85.7143
      -86.6667
      7482
      0.047959
      8288.0
      0.05442
      41.0
      80759.0
      80759.0
    
    
      2
      2015-05
      11:29:17
      78400
      2015-05-01
      2015-04-01
      2015
      05
      11
      29
      17
      ...
      -76.9231
      -78.5714
      -80
      7482
      0.047959
      8288.0
      0.05442
      41.0
      80759.0
      80759.0
    
    
      3
      2015-05
      11:29:18
      78400
      2015-05-01
      2015-04-01
      2015
      05
      11
      29
      18
      ...
      -69.2308
      -71.4286
      -73.3333
      7482
      0.047959
      8288.0
      0.05442
      41.0
      80759.0
      80759.0
    
    
      4
      2015-05
      11:29:19
      78500
      2015-05-01
      2015-04-01
      2015
      05
      11
      29
      19
      ...
      -61.5385
      -64.2857
      -66.6667
      7482
      0.047959
      8288.0
      0.05442
      41.0
      80759.0
      80759.0
    
  

5 rows × 49 columns

Include relevant features



In [35]:

    
X = df_history_ts_process[[
# 'ccyy-mm', 'time', 'bid-price', 'date-curr_x', 'date-prev', 'year',
       'month', 
#     'hour', 'minute', 
    'second', 'base-price15sec',
       'increment-price', 
#     'increment-price-target', 
    'increment-price-prev1sec',
       'increment-price-prev2sec', 'increment-price-prev3sec',
       'increment-price-prev4sec', 'increment-price-prev5sec',
       'increment-price-prev6sec', 'increment-price-prev7sec',
       'increment-price-prev8sec', 'increment-price-prev9sec',
       'increment-price-prev10sec', 'increment-price-prev11sec',
       'increment-price-prev12sec', 'increment-price-prev13sec',
       'increment-price-prev14sec', 'increment-price-prev15sec',
       'increment-price-mv2sec', 'increment-price-mv3sec',
       'increment-price-mv4sec', 'increment-price-mv5sec',
       'increment-price-mv6sec', 'increment-price-mv7sec',
       'increment-price-mv8sec', 'increment-price-mv9sec',
       'increment-price-mv10sec', 'increment-price-mv11sec',
       'increment-price-mv12sec', 'increment-price-mv13sec',
       'increment-price-mv14sec', 'increment-price-mv15sec', 'volume-plate_x',
       'ratio-bid_x', 'volume-plate_y', 'ratio-bid_y', 'deal-early-second',
       'deal-price-avg', 'deal-price-avg'    
        ]]

X_col = X.columns # get the column list

# X = StandardScaler().fit_transform(X.as_matrix())
X = X.as_matrix()

# y = StandardScaler().fit_transform(df_wnv_raw[['increment-price-target']].as_matrix()).reshape(len(df_wnv_raw),)
y = df_history_ts_process[['increment-price-target']].as_matrix().reshape(len(df_history_ts_process),)



In [36]:

    
X_col









    Out[36]:





Index(['month', 'second', 'base-price15sec', 'increment-price',
       'increment-price-prev1sec', 'increment-price-prev2sec',
       'increment-price-prev3sec', 'increment-price-prev4sec',
       'increment-price-prev5sec', 'increment-price-prev6sec',
       'increment-price-prev7sec', 'increment-price-prev8sec',
       'increment-price-prev9sec', 'increment-price-prev10sec',
       'increment-price-prev11sec', 'increment-price-prev12sec',
       'increment-price-prev13sec', 'increment-price-prev14sec',
       'increment-price-prev15sec', 'increment-price-mv2sec',
       'increment-price-mv3sec', 'increment-price-mv4sec',
       'increment-price-mv5sec', 'increment-price-mv6sec',
       'increment-price-mv7sec', 'increment-price-mv8sec',
       'increment-price-mv9sec', 'increment-price-mv10sec',
       'increment-price-mv11sec', 'increment-price-mv12sec',
       'increment-price-mv13sec', 'increment-price-mv14sec',
       'increment-price-mv15sec', 'volume-plate_x', 'ratio-bid_x',
       'volume-plate_y', 'ratio-bid_y', 'deal-early-second', 'deal-price-avg',
       'deal-price-avg', 'deal-price-avg', 'deal-price-avg'],
      dtype='object')



In [39]:

    
plt.figure()
plt.plot(X)
plt.figure()
plt.plot(y)









    Out[39]:





[<matplotlib.lines.Line2D at 0x7f831d9dbcc0>]






    



/home/user/env_py3/lib/python3.5/site-packages/matplotlib/font_manager.py:1297: UserWarning: findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans
  (prop.get_family(), self.defaultFamily[fontext]))



In [ ]:

[4] Evaluation

K-fold Cross-Validation



In [40]:

    
rng = check_random_state(0)



In [41]:

    
# GB
classifier_GB = GradientBoostingRegressor(n_estimators=1500, # score: 0.94608 (AUC 0.81419), learning_rate=0.001, max_features=8 <<< Best
#                                    loss='deviance',
#                                    subsample=1,
#                                    max_depth=5,
#                                    min_samples_split=20,
                                   learning_rate=0.002,
#                                    max_features=10,
                                   random_state=rng)



In [42]:

    
# AB
classifier_AB = AdaBoostRegressor(n_estimators=1500, # score: 0.93948 (AUC 0.88339), learning_rate=0.004 <<< Best
                                   learning_rate=0.002,
                                   random_state=rng)



In [43]:

    
# RF
classifier_RF = RandomForestRegressor(n_estimators=1500, # score: 0.94207 (AUC 0.81870), max_depth=3, min_samples_split=20, <<< Best
#                                     max_features=10,
#                                     max_depth=3,
#                                     min_samples_split=20,
                                    random_state=rng)



In [44]:

    
# ET
classifier_ET = ExtraTreesRegressor(n_estimators=1000, # score: 0.94655 (AUC 0.84364), max_depth=3, min_samples_split=20, max_features=10 <<< Best
#                                     max_depth=3,
#                                     min_samples_split=20,
#                                     max_features=10,
                                    random_state=rng)



In [45]:

    
# BG
classifier_BG = BaggingRegressor(n_estimators=500, # score: 0.70725 (AUC 0.63729) <<< Best
#                                     max_features=10,
                                    random_state=rng)

LR



In [46]:

    
classifier_LR = LinearRegression() # score: 0.90199 (AUC 0.80569)

SVM Linear



In [47]:

    
# classifier_SVCL = svm.SVC(kernel='linear', probability=True, random_state=rng) # score: 0.89976 (AUC 0.70524)
classifier_SVRL = svm.SVR() # score: 0.89976 (AUC 0.70524)

SVM



In [48]:

    
classifier_SVRR = svm.SVR(kernel='rbf') # score: 0.80188 (AUC 0.50050)
# classifier_SVRR = svm.SVR(kernel='poly') # score: 0.80188 (AUC 0.50050)

KNN



In [49]:

    
classifier_KNN = KNeighborsRegressor(n_neighbors=2) # score: 0.94018 (AUC 0.72792)
cv = cross_val_score(classifier_KNN,
                            X,
                            y,
                            cv=StratifiedKFold(parm_ts_valid_month))
print('KNN CV score: {0:.5f}'.format(cv.mean()))









    



/home/user/env_py3/lib/python3.5/site-packages/sklearn/model_selection/_split.py:581: Warning: The least populated class in y has only 1 members, which is too few. The minimum number of groups for any class cannot be less than n_splits=27.
  % (min_groups, self.n_splits)), Warning)






    



KNN CV score: 0.96786



In [ ]:

Select Model



In [50]:

    
# classifier = classifier_GB   # 324.632308296
classifier = classifier_AB   # 429.646733221
# classifier = classifier_RF   # 175.504322802
# classifier = classifier_ET   # 172.097916817, 0.0724812030075
# classifier = classifier_BG   # 175.451381872
# classifier = classifier_LR     # 128.465059749, 0.11
# classifier = classifier_SVRL # 3789.82169312
# classifier = classifier_SVRR # 3789.82169312, 0.10754224349

Split Data



In [51]:

    
n_splits = parm_ts_valid_cycle
print(n_splits)
# n_splits=54 # 19 seconds/records for each bidding month
# n_splits=19 # 19 seconds/records for each bidding month
n_fold = parm_ts_valid_month
print(n_fold)


# X_train_1 = X[0:(len(X)-batch*n_splits)]
# y_train_1 = y[0:(len(X)-batch*n_splits)]

# X_test_1 = X[(len(X)-batch*n_splits):((len(X)-batch*n_splits)+n_splits)]
# y_test_1 = y[(len(X)-batch*n_splits):((len(X)-batch*n_splits)+n_splits)]

CV



In [52]:

    
n_fold=5



In [54]:

    
y_pred = {}
y_test = {}

y_pred_org = {}
y_test_org = {}

i = 0
for batch in range(1, n_fold):
    X_train_1 = X[0:(len(X)-batch*n_splits)]
    y_train_1 = y[0:(len(X)-batch*n_splits)]
    X_test_1  = X[(len(X)-batch*n_splits):((len(X)-batch*n_splits)+n_splits)]
    y_test_1  = y[(len(X)-batch*n_splits):((len(X)-batch*n_splits)+n_splits)]
    print(len(X_train_1))
    
    # ReScale
    ScalerX = StandardScaler()
    ScalerX.fit(X_train_1)
    X_train_1 = ScalerX.transform(X_train_1)
    X_test_1  = ScalerX.transform(X_test_1)
    
    ScalerY = StandardScaler()
    ScalerY.fit(y_train_1.reshape(-1, 1))
    y_train_1 = ScalerY.transform(y_train_1.reshape(-1, 1))
    y_test_1  = ScalerY.transform(y_test_1.reshape(-1, 1))
    
    y_pred[i] = classifier.fit(X_train_1, y_train_1).predict(X_test_1)
    y_test[i] = y_test_1  

    y_pred_org[i] = ScalerY.inverse_transform(y_pred[i])
    y_test_org[i] = ScalerY.inverse_transform(y_test[i])
    
    plt.figure()
    plt.plot(y_train_1)
    plt.plot()
    plt.figure()
    plt.plot(y_test[i])
    plt.plot(y_pred[i])
    plt.plot()
    i += 1









    



1014






    



/home/user/env_py3/lib/python3.5/site-packages/sklearn/utils/validation.py:429: DataConversionWarning: Data with input dtype object was converted to float64 by StandardScaler.
  warnings.warn(msg, _DataConversionWarning)
/home/user/env_py3/lib/python3.5/site-packages/sklearn/utils/validation.py:526: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
  y = column_or_1d(y, warn=True)






    



975






    



/home/user/env_py3/lib/python3.5/site-packages/sklearn/utils/validation.py:429: DataConversionWarning: Data with input dtype object was converted to float64 by StandardScaler.
  warnings.warn(msg, _DataConversionWarning)
/home/user/env_py3/lib/python3.5/site-packages/sklearn/utils/validation.py:526: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
  y = column_or_1d(y, warn=True)






    



936






    



/home/user/env_py3/lib/python3.5/site-packages/sklearn/utils/validation.py:429: DataConversionWarning: Data with input dtype object was converted to float64 by StandardScaler.
  warnings.warn(msg, _DataConversionWarning)
/home/user/env_py3/lib/python3.5/site-packages/sklearn/utils/validation.py:526: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
  y = column_or_1d(y, warn=True)






    



897






    



/home/user/env_py3/lib/python3.5/site-packages/sklearn/utils/validation.py:429: DataConversionWarning: Data with input dtype object was converted to float64 by StandardScaler.
  warnings.warn(msg, _DataConversionWarning)
/home/user/env_py3/lib/python3.5/site-packages/sklearn/utils/validation.py:526: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
  y = column_or_1d(y, warn=True)
/home/user/env_py3/lib/python3.5/site-packages/matplotlib/font_manager.py:1297: UserWarning: findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans
  (prop.get_family(), self.defaultFamily[fontext]))

no inverse-scale



In [55]:

    
k = []
for i in range(0, len(y_test)):
    k.append(np.mean(np.sqrt(np.square(y_test[i] - y_pred[i]))))

k_mean = np.mean(k)

print(k_mean)
print()
print(k)









    



1.03102733454

[1.3234597050962056, 0.76898934262907936, 1.1388271116717978, 0.89283317875862811]



In [56]:

    
k = []
for i in range(0, len(y_test)):
    k.append(np.mean(np.sqrt(np.square(y_test[i][35:37] - y_pred[i][35:37]))))

k_mean = np.mean(k)

print(k_mean)
print()
print(k)









    



0.482894226938

[0.69118943508004782, 0.3221723167772349, 0.37121916470713323, 0.54699599118880293]

inverse-scale



In [57]:

    
k = []
for i in range(0, len(y_test)):
    k.append(np.mean(np.sqrt(np.square(y_test_org[i] - y_pred_org[i]))))

k_mean = np.mean(k)

print(k_mean)
print()
print(k)









    



396.541098178

[504.71584611509144, 297.09564286398006, 437.97466877367339, 346.37823495876768]



In [62]:

    
k = []
for i in range(0, len(y_test)):
    k.append(np.mean(np.sqrt(np.square(y_test_org[i][35:37] - y_pred_org[i][35:37]))))

k_mean = np.mean(k)

print(k_mean)
print()
print(k)









    



185.759189948

[263.59265734265728, 124.46985446985434, 142.76494565217376, 212.20930232558101]



In [69]:

    
# 50 second predicts 57 second
k = []
for i in range(0, len(y_test)):
    k.append(np.mean(np.sqrt(np.square(y_test_org[i][35:36] - y_pred_org[i][35:36]))))

k_mean = np.mean(k)

print(k_mean)
print()
print(k)









    



182.353004936

[281.73076923076906, 124.32432432432415, 115.21739130434798, 208.1395348837209]



In [70]:

    
plt.plot(y_test_org[0])
plt.plot(y_pred_org[0])









    Out[70]:





[<matplotlib.lines.Line2D at 0x7f831d5007b8>]






    



/home/user/env_py3/lib/python3.5/site-packages/matplotlib/font_manager.py:1297: UserWarning: findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans
  (prop.get_family(), self.defaultFamily[fontext]))



In [71]:

    
plt.plot(k)









    Out[71]:





[<matplotlib.lines.Line2D at 0x7f831d71f0b8>]






    



/home/user/env_py3/lib/python3.5/site-packages/matplotlib/font_manager.py:1297: UserWarning: findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans
  (prop.get_family(), self.defaultFamily[fontext]))



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:

    
y_test[1][13:]



In [ ]:

    
y_pred[1][13:]



In [ ]:

    
np.mean(np.sqrt(np.square(y_test[4] - y_pred[4])))



In [ ]:

    
np.mean(np.sqrt(np.square(y_test[4][13:16] - y_pred[4][13:16])))



In [ ]:



In [ ]:

    
y_pred_df = pd.DataFrame.from_dict(y_pred)



In [ ]:

    
y_pred_df.columns=['month 7','month 6','month 5','month 4','month 3','month 2','month 1']



In [ ]:

    
y_pred_df.to_csv('bid_results_v001.csv', index=False)



In [ ]:

    
y_pred_df



In [ ]:



In [ ]:

    
# previous N sec ['bid-price']
gap = parm_calculate_prev_bp

for gap in range(1, gap+1):
    col_name = 'bid-price-prev'+str(gap)+'sec'
    col_data = pd.DataFrame(columns=[col_name])
    col_data_zeros = pd.DataFrame({col_name: np.zeros(gap)})
    print('Creating : ', col_name)  

    for month in range(0, parm_ts_month):
    #     print('month : ', month)
        col_data.append(col_data_zeros)
        for i in range(0, gap):
            col_data.loc[month*parm_ts_cycle+i] = 0
        for i in range(gap, parm_ts_cycle):
            col_data.loc[month*parm_ts_cycle+i] = df_history_ts_process['bid-price'][month*parm_ts_cycle+i-gap]
  
    df_history_ts_process[col_name] = col_data

print('Total records processed : ', len(col_data))



In [ ]:

    
# previous 2 sec Moving Average ['bid-price']

gap = parm_calculate_mv

for gap in range(2, gap+1): # MV starts from 2 seconds, till parm_calculate_mv
    col_name = 'bid-price-mv'+str(gap)+'sec'
    col_data = pd.DataFrame(columns=[col_name])
    col_data_zeros = pd.DataFrame({col_name: np.zeros(gap)})
    print('Creating : ', col_name)  

    for month in range(0, parm_ts_month):
    #     print('month : ', month)
        col_data.append(col_data_zeros)
        for i in range(0, gap):
            col_data.loc[month*parm_ts_cycle+i] = 0
        for i in range(gap, parm_ts_cycle):
            col_data.loc[month*parm_ts_cycle+i] = \
            np.mean(df_history_ts_process['bid-price'][month*parm_ts_cycle+i-gap:month*parm_ts_cycle+i])
  
    df_history_ts_process[col_name] = col_data

print('Total records processed : ', len(col_data))



In [ ]:

    
df_history_ts_process[1768:]



In [ ]:



In [ ]:



In [ ]:



In [ ]:

    
# previous 2 sec Moving Average ['bid-price']

gap = parm_calculate_mv

for gap in range(1, gap+1):
    col_name = 'bid-price-mv'+str(gap)+'sec'
    col_data = pd.DataFrame(columns=[col_name])
    print('Creating : ', col_name)  

    for month in range(0, parm_ts_month):
    #     print('month : ', month)
        col_data.append(col_data_zeros)
        for i in range(0, gap):
            col_data.loc[month*parm_ts_cycle+i] = 0
        for i in range(gap, parm_ts_cycle):
            col_data.loc[month*parm_ts_cycle+i] = df_history_ts_process['bid-price'][month*parm_ts_cycle+i-gap]
  
    df_history_ts_process[col_name] = col_data

print('len : ', len(col_data))



In [ ]:



In [ ]:



In [ ]:

    
# previous N sec
gap = 1
gap = 2
gap = 3
gap = 4
gap = 5
gap = 6
gap = 7
gap = 8
gap = 9
gap = 10

col_name = 'bid-price-prev'+str(gap)+'sec'
col_data = pd.DataFrame(columns=[col_name])

for month in range(0, parm_ts_month):
#     print('month : ', month)
    col_data.append(col_data_zeros)
    for i in range(0, gap):
        col_data.loc[month*parm_ts_cycle+i] = 0
    for i in range(gap, parm_ts_cycle):
        col_data.loc[month*parm_ts_cycle+i] = df_history_ts_process['bid-price'][month*parm_ts_cycle+i]
    
print('len : ', len(col_data))    
df_history_ts_process[col_name] = col_data



In [ ]:

    
len(col_data)



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:

    
# previous 1 sec
gap = 10

col_data = pd.DataFrame({'bid-price-prev'+str(gap)+'sec': np.zeros(gap)})

# for i in range(gap, len(df_history_ts)-1768):
for i in range(gap, parm_ts_cycle):
#     print(df_history_ts['bid-price'][i])
    col_data.loc[i] = df_history_ts['bid-price'][i]

print(len(col_data))



In [ ]:



In [ ]:

    
df_history_ts_process = df_history_ts.copy()



In [ ]:

    
df_history_table_process['tmp'] = col_data['bid-price-prev'+str(gap)+'sec']



In [ ]:

    
df_history_table_process.tail()



In [ ]:



In [ ]:



In [ ]:

    
col_data



In [ ]:



In [ ]:



In [ ]:

	ccyy-mm	time	bid-price
1886	2017-07	11:29:56	92100
1887	2017-07	11:29:57	92100
1888	2017-07	11:29:58	92100
1889	2017-07	11:29:59	92200
1890	2017-07	11:30:00	92200

	ccyy-mm	time	bid-price
0	2015-01	11:29:00	74000
1	2015-01	11:29:01	74000
2	2015-01	11:29:02	74000
3	2015-01	11:29:03	74000
4	2015-01	11:29:04	74000

	ccyy-mm	volume-plate	deal-price-low	deal-price-avg	deal-early-second	volume-bidder
26	2017-03	10356	87800	87916	55	262010
27	2017-04	12196	89800	89850	59	252273
28	2017-05	10316	90100	90209	55	270197
29	2017-06	10312	89400	89532	45	244349
30	2017-07	10325	92200	92250	57	269189

	ccyy-mm	time	bid-price	date-curr_x	date-prev	year	month	hour	minute	second	...	increment-price-mv13sec	increment-price-mv14sec	increment-price-mv15sec	volume-plate_x	ratio-bid_x	volume-plate_y	ratio-bid_y	deal-early-second	deal-price-avg	deal-price-avg
1048	2017-07	11:29:49	91400	2017-07-01	2017-06-01	2017	07	11	29	49	...	461.538	442.857	420	10325	0.038356	10312.0	0.042202	45.0	89532.0	89532.0
1049	2017-07	11:29:50	91500	2017-07-01	2017-06-01	2017	07	11	29	50	...	507.692	485.714	466.667	10325	0.038356	10312.0	0.042202	45.0	89532.0	89532.0
1050	2017-07	11:29:51	91600	2017-07-01	2017-06-01	2017	07	11	29	51	...	553.846	535.714	513.333	10325	0.038356	10312.0	0.042202	45.0	89532.0	89532.0
1051	2017-07	11:29:52	91700	2017-07-01	2017-06-01	2017	07	11	29	52	...	600	585.714	566.667	10325	0.038356	10312.0	0.042202	45.0	89532.0	89532.0
1052	2017-07	11:29:53	91800	2017-07-01	2017-06-01	2017	07	11	29	53	...	653.846	635.714	620	10325	0.038356	10312.0	0.042202	45.0	89532.0	89532.0

	ccyy-mm	time	bid-price	date-curr_x	date-prev	year	month	hour	minute	second	...	increment-price-mv13sec	increment-price-mv14sec	increment-price-mv15sec	volume-plate_x	ratio-bid_x	volume-plate_y	ratio-bid_y	deal-early-second	deal-price-avg	deal-price-avg
0	2015-05	11:29:15	78400	2015-05-01	2015-04-01	2015	05	11	29	15	...	-92.3077	-92.8571	-93.3333	7482	0.047959	8288.0	0.05442	41.0	80759.0	80759.0
1	2015-05	11:29:16	78400	2015-05-01	2015-04-01	2015	05	11	29	16	...	-84.6154	-85.7143	-86.6667	7482	0.047959	8288.0	0.05442	41.0	80759.0	80759.0
2	2015-05	11:29:17	78400	2015-05-01	2015-04-01	2015	05	11	29	17	...	-76.9231	-78.5714	-80	7482	0.047959	8288.0	0.05442	41.0	80759.0	80759.0
3	2015-05	11:29:18	78400	2015-05-01	2015-04-01	2015	05	11	29	18	...	-69.2308	-71.4286	-73.3333	7482	0.047959	8288.0	0.05442	41.0	80759.0	80759.0
4	2015-05	11:29:19	78500	2015-05-01	2015-04-01	2015	05	11	29	19	...	-61.5385	-64.2857	-66.6667	7482	0.047959	8288.0	0.05442	41.0	80759.0	80759.0

[2] Data pre-porcessing

Read raw data

Parameters

Prepare derived features

Process: df_history_ts_process

['increment-price-target']

Process: df_history_table_process

Merge dataframe

Shift to copy previous 'parm_calculate_prev_month' month's data into current row

Housekeeping to remove some invald data during pre-processing

[3] Modeling Part 2: Python scikit-learn

Models to use:

Import pre-processed data

Include relevant features

[4] Evaluation

K-fold Cross-Validation

LR

SVM Linear

SVM

KNN

Select Model

Split Data

CV

no inverse-scale

inverse-scale

The End