By: 顾 瞻 GU Zhan (Sam)
Sep 2017
In [1]:
import pandas as pd
In [14]:
# Load the per-record bid history. Every later cell reads this frame, so it
# has to be (re)loaded here when the notebook is run from a fresh kernel.
df_history_ts_process = pd.read_csv('data/history_ts.csv')
df_history_ts_process.tail()
Out[14]:
In [3]:
df_history_table_process = pd.read_csv('data/history_table.csv')
df_history_table_process.tail()
Out[3]:
In [59]:
df_parm_si = pd.read_csv('data/parm_si.csv')
df_parm_si.tail()
Out[59]:
In [86]:
df_parm_si[(df_parm_si['ccyy-mm'] == '2017-08') & (df_parm_si['time'] == '11:29:00')].iloc[0]['SI']
Out[86]:
In [87]:
# function to fetch Seasonality-Index
def fetech_si(ccyy_mm, time, df_parm_si):
    """Return the 'SI' value of the first row matching a month and a time.

    Args:
        ccyy_mm: value compared against the 'ccyy-mm' column.
        time: value compared against the 'time' column.
        df_parm_si: DataFrame with 'ccyy-mm', 'time' and 'SI' columns.

    Returns:
        The 'SI' entry of the first matching row.

    Raises:
        IndexError: if no row matches the requested month and time.
    """
    is_match = (df_parm_si['ccyy-mm'] == ccyy_mm) & (df_parm_si['time'] == time)
    si_values = df_parm_si.loc[is_match, 'SI']
    if si_values.empty:
        # Same exception type as the positional lookup would raise, but the
        # message names the missing key.
        raise IndexError(
            'no Seasonality-Index row for ccyy-mm=%r, time=%r' % (ccyy_mm, time))
    return si_values.iloc[0]
In [ ]:
In [88]:
# create global base price
global_parm_base_price = 10000000
# create prediction results dataframe: shl_pm
df_shl_pm = pd.DataFrame()
In [89]:
# Replay the bid history one record at a time and collect the pre-processed
# values in df_shl_pm (restarted at every 11:29:00 record).
# NOTE(review): the original comment says row 1830-1 starts "July 2015 data
# as simulation" -- confirm which month that row actually belongs to.
for i in range(1830-1, len(df_history_ts_process)):
    record_ccyy_mm = df_history_ts_process['ccyy-mm'][i]  # format: ccyy-mm
    record_time = df_history_ts_process['time'][i]  # format: hh:mm:ss
    record_bid = df_history_ts_process['bid-price'][i]  # format: integer

    print('\n<<<< Record No.: %5d >>>>' % i)
    print(record_ccyy_mm)
    print(record_time)
    print(record_bid)

    # capture & calculate 11:29:00 bid price - 1 = base price
    if record_time == '11:29:00':
        global_parm_base_price = record_bid - 1
        print('#### global_parm_base_price : %d ####' % global_parm_base_price)
        # Start a fresh prediction frame. The 11:29:00 record itself is
        # written at the bottom of this iteration, once the current_* values
        # below exist (building its row here would read them before they
        # are assigned).
        df_shl_pm = pd.DataFrame()

    print('---- Pre-Process ---')
    # pre-process: ccyy-mm hh:mm:ss
    current_datetime = record_ccyy_mm + ' ' + record_time
    current_price4pm = record_bid - global_parm_base_price
    print('#### current_datetime : %s ####' % current_datetime)
    print('#### current_price4pm : %d ####' % current_price4pm)

    # get Seasonality-Index
    current_si = fetech_si(record_ccyy_mm, record_time, df_parm_si)
    print('#### current_si : %0.10f ####' % current_si)

    # get de-seasoned price: price4pmsi
    current_price4pmsi = current_price4pm / current_si
    print('#### current_price4pmsi : %0.10f ####' % current_price4pmsi)

    print('---- call predicitno functino shl_pm ----')
    # call prediction function shl_pm (not implemented yet: the pred_* fields
    # below are filled with the placeholder -999)

    # write results to shl_pm dataframe
    df_shl_pm_current = {
        'ccyy-mm': record_ccyy_mm,
        'time': record_time,
        'bid': record_bid,
        'datetime': current_datetime,
        'price4pm': current_price4pm,
        'SI': current_si,
        'price4pmsi': current_price4pmsi,
        'pred_price': -999,
        'pred_price_rounded': -999,
        'pred_dynamic_increment': -999,  # +200 or + 300
        'pred_set_price_rounded': -999,  # pred_price_rounded + pred_dynamic_increment
    }
    # pd.concat is used because DataFrame.append no longer exists in
    # pandas >= 2.0.
    df_shl_pm = pd.concat(
        [df_shl_pm, pd.DataFrame([df_shl_pm_current])], ignore_index=True)
In [91]:
df_shl_pm.tail()
Out[91]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
# create global base price
# create prediction results dataframe: shl_pm
df_shl_pm = pd.DataFrame()
In [ ]:
# append into prediction results dataframe: shl_pm
In [ ]:
In [ ]:
df_shl_pm = pd.DataFrame()
In [40]:
d = {
'ccyy-mm' : df_history_ts_process['ccyy-mm'][1830]
,'time' : df_history_ts_process['time'][1830]
,'bid' : 1.8
}
In [42]:
df_shl_pm = df_shl_pm.append(d, ignore_index=True)
In [43]:
df_shl_pm
Out[43]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
# from __future__ import print_function, division
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
import pandas as pd
import operator
from scipy import interp
from itertools import cycle
from sklearn import svm
from sklearn.utils.validation import check_random_state
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_curve, auc
from statsmodels.graphics.mosaicplot import mosaic
print(__doc__)
In [ ]:
parm_calculate_base_price_second = 15 # Use the current month's bid-price as base-price at this seconds. Later to derive increment-price
parm_calculate_target_second = 7 # How many seconds in future to predict: target variable
parm_calculate_prev_bp = 15 # Number of previous price/increment to include, i.e. previous 2sec, 3sec, 4sec, 5sec ... 15sec
parm_calculate_mv = 15 # Number of previous price/increment Moving Average to calculate, i.e. previous 2sec, 3sec, 4sec, 5sec ... 15sec
parm_calculate_prev_month = 3 # Number of previous month to include (need to remove earliest x month from training data)
print('parm_calculate_base_price_second : %3d seconds' % parm_calculate_base_price_second)
print('parm_calculate_target_second : %3d seconds' % parm_calculate_target_second)
print('parm_calculate_prev_bp : %3d seconds' % parm_calculate_prev_bp)
print('parm_calculate_mv : %3d seconds' % parm_calculate_mv)
print('parm_calculate_prev_month : %3d months' % parm_calculate_prev_month)
print('' )
parm_ts_cycle = 61 # seconds/records per month
print('parm_ts_cycle : %3d seconds' % parm_ts_cycle)
parm_ts_month = int(len(df_history_ts_process) / parm_ts_cycle)
print('parm_ts_month : %3d months' % parm_ts_month)
parm_record_cut_row_head = max(parm_calculate_base_price_second, parm_calculate_prev_bp, parm_calculate_mv)
parm_record_cut_row_tail = parm_calculate_target_second
parm_record_cut_month_head = parm_calculate_prev_month + 1
parm_ts_valid_cycle = parm_ts_cycle - parm_record_cut_row_head - parm_record_cut_row_tail
print('parm_ts_valid_cycle : %3d seconds' % parm_ts_valid_cycle)
parm_ts_valid_month = parm_ts_month - parm_record_cut_month_head
print('parm_ts_valid_month : %3d months' % parm_ts_valid_month)
# Month boundary used to drop the earliest months: month number
# parm_record_cut_month_head counted from 2015-01. Adding a month offset gives
# the same timestamps as the former '2015-0N' / '2015-NN' strings for 1..12
# and keeps working when the count runs past December.
parm_record_cut_ccyy = pd.to_datetime('2015-01') + pd.DateOffset(months=parm_record_cut_month_head - 1)
print('' )
print('parm_record_cut_ccyy : %s' % parm_record_cut_ccyy)
print('parm_record_cut_month_head : %3d months' % parm_record_cut_month_head)
print('parm_record_cut_row_head : %3d seconds' % parm_record_cut_row_head)
print('parm_record_cut_row_tail : %3d seconds' % parm_record_cut_row_tail)
print('' )
In [ ]:
df_history_ts_process.head()
In [ ]:
In [ ]:
# Derive date/time helper columns from the 'ccyy-mm' and 'time' strings.
# Column-wise (vectorized) operations replace the former row-by-row apply().

# date of current month
df_history_ts_process['date-curr'] = pd.to_datetime(df_history_ts_process['ccyy-mm'])
# date of previous month
df_history_ts_process['date-prev'] = df_history_ts_process['date-curr'] - pd.offsets.MonthBegin(1)
# Year
df_history_ts_process['year'] = df_history_ts_process['ccyy-mm'].str[0:4]
# Month
df_history_ts_process['month'] = df_history_ts_process['ccyy-mm'].str[5:7]
# Hour
df_history_ts_process['hour'] = df_history_ts_process['time'].str[0:2]
# Minute
df_history_ts_process['minute'] = df_history_ts_process['time'].str[3:5]
# Second
df_history_ts_process['second'] = df_history_ts_process['time'].str[6:8]
# datetime of current month: '<date> 00:00:00 <time>', the text that
# str(Timestamp) + ' ' + time produces; used as a merge key further down
df_history_ts_process['datetime-curr'] = (
    df_history_ts_process['date-curr'].dt.strftime('%Y-%m-%d %H:%M:%S')
    + ' ' + df_history_ts_process['time'])
# datetime of previous month, same layout
df_history_ts_process['datetime-prev'] = (
    df_history_ts_process['date-prev'].dt.strftime('%Y-%m-%d %H:%M:%S')
    + ' ' + df_history_ts_process['time'])
In [ ]:
df_history_ts_process.tail()
In [ ]:
# df_history_ts_process
# df_history_ts_process[1768:]
In [ ]:
# new ['base-price<N>sec']: for every month (parm_ts_cycle consecutive rows),
# the bid-price found parm_calculate_base_price_second rows into that month,
# repeated on each row of the month.
# Assumes the frame still has its default 0..n-1 index, as the label-based
# lookups of the original loop did.
col_name = 'base-price'+str(parm_calculate_base_price_second)+'sec'
col_name_base_price = col_name
print('Creating : ', col_name)
# Position of the base-price row inside each month block.
base_rows = np.arange(parm_ts_month) * parm_ts_cycle + parm_calculate_base_price_second
base_prices = df_history_ts_process['bid-price'].values[base_rows]
# One value per month, repeated for every row of the month; built in one
# step instead of enlarging a frame row by row.
col_data = pd.DataFrame({col_name: np.repeat(base_prices, parm_ts_cycle)})
# Rows beyond parm_ts_month full cycles (if any) stay NaN via index alignment.
df_history_ts_process[col_name] = col_data[col_name]
print('Total records processed : ', len(col_data))
In [ ]:
# df_history_ts_process
# df_history_ts_process[1768:]
In [ ]:
# new ['increment-price'] = ['bid-price'] - ['base-price'], computed
# column-wise instead of row by row
df_history_ts_process['increment-price'] = df_history_ts_process['bid-price'] - df_history_ts_process[col_name_base_price]
In [ ]:
# df_history_ts_process
# df_history_ts_process[1768:]
In [ ]:
plt.figure()
plt.plot(df_history_ts_process['bid-price'])
plt.plot(df_history_ts_process[col_name_base_price])
plt.plot()
plt.figure()
plt.plot(df_history_ts_process['increment-price'])
plt.plot()
In [ ]:
# ['increment-price-target']: the increment-price found
# parm_calculate_target_second rows later within the same month; the last
# parm_calculate_target_second rows of each month have no such row and get 0.
# Assumes the frame still has its default 0..n-1 index.
col_name = 'increment-price-target'
print('Creating : ', col_name)
n_rows = parm_ts_month * parm_ts_cycle
# One row per month, one column per record of the month.
increments = df_history_ts_process['increment-price'].values[:n_rows].reshape(parm_ts_month, parm_ts_cycle)
targets = np.zeros_like(increments)
n_with_target = max(parm_ts_cycle - parm_calculate_target_second, 0)
targets[:, :n_with_target] = increments[:, parm_ts_cycle - n_with_target:]
col_data = pd.DataFrame({col_name: targets.ravel()})
df_history_ts_process[col_name] = col_data[col_name]
print('Total records processed : ', len(col_data))
In [ ]:
plt.figure()
plt.plot(df_history_ts_process['increment-price'])
plt.plot(df_history_ts_process['increment-price-target'])
plt.plot()
plt.figure()
plt.plot(df_history_ts_process['increment-price'][1768:])
plt.plot(df_history_ts_process['increment-price-target'][1768:])
plt.plot()
In [ ]:
In [ ]:
# previous 'parm_calculate_prev_bp' sec ['increment-price']:
# one column per lag, holding the increment-price found `gap` rows earlier in
# the same month; the first `gap` rows of each month get 0.
# Assumes the frame still has its default 0..n-1 index.
n_rows = parm_ts_month * parm_ts_cycle
# One row per month, one column per record of the month.
increments = df_history_ts_process['increment-price'].values[:n_rows].reshape(parm_ts_month, parm_ts_cycle)
for gap in range(1, parm_calculate_prev_bp+1):
    col_name = 'increment-price-prev'+str(gap)+'sec'
    print('Creating : ', col_name)
    lagged = np.zeros_like(increments)
    n_with_lag = max(parm_ts_cycle - gap, 0)
    # Shift every month's row right by `gap` positions.
    lagged[:, parm_ts_cycle - n_with_lag:] = increments[:, :n_with_lag]
    col_data = pd.DataFrame({col_name: lagged.ravel()})
    df_history_ts_process[col_name] = col_data[col_name]
    print('Total records processed : ', len(col_data))
In [ ]:
# previous 'parm_calculate_mv' sec Moving Average ['increment-price']:
# one column per window length `gap`, holding the mean of the `gap` rows that
# precede the current row within the same month; the first `gap` rows of each
# month get 0.
# Assumes the frame still has its default 0..n-1 index.
n_rows = parm_ts_month * parm_ts_cycle
# One row per month, one column per record of the month.
increments = np.asarray(
    df_history_ts_process['increment-price'].values[:n_rows], dtype=float
).reshape(parm_ts_month, parm_ts_cycle)
for gap in range(2, parm_calculate_mv+1):  # MV starts from 2 seconds, till parm_calculate_mv
    col_name = 'increment-price-mv'+str(gap)+'sec'
    print('Creating : ', col_name)
    moving_avg = np.zeros_like(increments)
    n_windows = max(parm_ts_cycle - gap, 0)
    if n_windows:
        # Window for position p (p >= gap) is positions p-gap .. p-1; adding
        # the `gap` shifted views sums every window of a month at once.
        window_sum = sum(increments[:, k:k + n_windows] for k in range(gap))
        moving_avg[:, gap:] = window_sum / gap
    col_data = pd.DataFrame({col_name: moving_avg.ravel()})
    df_history_ts_process[col_name] = col_data[col_name]
    print('Total records processed : ', len(col_data))
In [ ]:
# df_history_ts_process[1768:]
In [ ]:
plt.figure()
plt.plot(df_history_ts_process['increment-price'][1768:])
plt.plot(df_history_ts_process['increment-price-prev3sec'][1768:])
plt.plot(df_history_ts_process['increment-price-prev7sec'][1768:])
plt.plot(df_history_ts_process['increment-price-prev11sec'][1768:])
plt.plot(df_history_ts_process['increment-price-prev15sec'][1768:])
plt.plot()
In [ ]:
plt.figure()
plt.plot(df_history_ts_process['increment-price'][1768:])
plt.plot(df_history_ts_process['increment-price-mv3sec'][1768:])
plt.plot(df_history_ts_process['increment-price-mv7sec'][1768:])
plt.plot(df_history_ts_process['increment-price-mv11sec'][1768:])
plt.plot(df_history_ts_process['increment-price-mv15sec'][1768:])
plt.plot()
In [ ]:
In [ ]:
df_history_table_process.tail()
In [ ]:
# Monthly table features, computed column-wise instead of row by row.
# date of current month
df_history_table_process['date-curr'] = pd.to_datetime(df_history_table_process['ccyy-mm'])
# gap between the average and the lowest deal price
df_history_table_process['d-avg-low-price'] = df_history_table_process['deal-price-avg'] - df_history_table_process['deal-price-low']
# 'volume-plate' divided by 'volume-bidder'
df_history_table_process['ratio-bid'] = df_history_table_process['volume-plate'] / df_history_table_process['volume-bidder']
In [ ]:
df_history_table_process.tail()
In [ ]:
df_history_ts_process_tmp2 = df_history_ts_process.copy()
In [ ]:
df_history_ts_process = df_history_ts_process_tmp2.copy()
In [ ]:
# look up current month table data: 'volume-plate', 'ratio-bid'
df_history_ts_process = pd.merge(df_history_ts_process, df_history_table_process[['date-curr', 'volume-plate', 'ratio-bid']], how = 'left', left_on = 'date-curr', right_on = 'date-curr', suffixes=['', '_table'])
In [ ]:
for i in range(0, len(df_history_ts_process.columns)): print(df_history_ts_process.columns[i])
In [ ]:
# look up previous month table data: 'volume-plate', 'ratio-bid', 'deal-early-second', 'deal-price-avg', 'd-avg-low-price'
df_history_ts_process = pd.merge(df_history_ts_process, df_history_table_process[['date-curr', 'volume-plate', 'ratio-bid', 'deal-early-second', 'deal-price-avg', 'd-avg-low-price']], how = 'left', left_on = 'date-prev', right_on = 'date-curr', suffixes=['', '_m0'])
In [ ]:
df_history_ts_process['d-increment-avg-low-price_m0'] = df_history_ts_process.apply(lambda row: row['increment-price'] - row['d-avg-low-price'], axis=1)
In [ ]:
for i in range(0, len(df_history_ts_process.columns)): print(df_history_ts_process.columns[i])
In [ ]:
# df_history_ts_process = df_history_ts_process_lookup.copy()
In [ ]:
df_history_ts_process_lookup = df_history_ts_process.copy()
df_history_ts_process_lookup.tail()
In [ ]:
# _m1: attach, to every row, the lookup row whose 'datetime-curr' equals this
# row's 'datetime-prev'; clashing column names get the '_m1' suffix.
_lookup_cols_m1 = (
    ['datetime-curr', 'datetime-prev',
     'base-price15sec', 'increment-price', 'increment-price-target']
    + ['increment-price-prev%dsec' % sec for sec in range(1, 16)]
    + ['increment-price-mv%dsec' % sec for sec in range(2, 16)]
    + ['volume-plate_m0', 'ratio-bid_m0', 'deal-early-second',
       'deal-price-avg', 'd-avg-low-price', 'd-increment-avg-low-price_m0']
)
df_history_ts_process = pd.merge(
    df_history_ts_process,
    df_history_ts_process_lookup[_lookup_cols_m1],
    how='left',
    left_on='datetime-prev',
    right_on='datetime-curr',
    suffixes=['', '_m1'],
)
df_history_ts_process.tail()
In [ ]:
# _m2: attach, to every row, the lookup row whose 'datetime-curr' equals this
# row's 'datetime-prev_m1'; clashing column names get the '_m2' suffix.
_lookup_cols_m2 = (
    ['datetime-curr', 'datetime-prev',
     'base-price15sec', 'increment-price', 'increment-price-target']
    + ['increment-price-prev%dsec' % sec for sec in range(1, 16)]
    + ['increment-price-mv%dsec' % sec for sec in range(2, 16)]
    + ['volume-plate_m0', 'ratio-bid_m0', 'deal-early-second',
       'deal-price-avg', 'd-avg-low-price', 'd-increment-avg-low-price_m0']
)
df_history_ts_process = pd.merge(
    df_history_ts_process,
    df_history_ts_process_lookup[_lookup_cols_m2],
    how='left',
    left_on='datetime-prev_m1',
    right_on='datetime-curr',
    suffixes=['', '_m2'],
)
df_history_ts_process.tail()
In [ ]:
# _m3: attach, to every row, the lookup row whose 'datetime-curr' equals this
# row's 'datetime-prev_m2'; clashing column names get the '_m3' suffix.
_lookup_cols_m3 = (
    ['datetime-curr', 'datetime-prev',
     'base-price15sec', 'increment-price', 'increment-price-target']
    + ['increment-price-prev%dsec' % sec for sec in range(1, 16)]
    + ['increment-price-mv%dsec' % sec for sec in range(2, 16)]
    + ['volume-plate_m0', 'ratio-bid_m0', 'deal-early-second',
       'deal-price-avg', 'd-avg-low-price', 'd-increment-avg-low-price_m0']
)
df_history_ts_process = pd.merge(
    df_history_ts_process,
    df_history_ts_process_lookup[_lookup_cols_m3],
    how='left',
    left_on='datetime-prev_m2',
    right_on='datetime-curr',
    suffixes=['', '_m3'],
)
df_history_ts_process.tail()
In [ ]:
plt.figure()
plt.plot(df_history_ts_process['increment-price-mv10sec'][1768:])
plt.plot(df_history_ts_process['increment-price-mv10sec_m1'][1768:])
plt.plot(df_history_ts_process['increment-price-mv10sec_m2'][1768:])
plt.plot(df_history_ts_process['increment-price-mv10sec_m3'][1768:])
plt.figure()
plt.plot(df_history_ts_process['increment-price-prev10sec'][1768:])
plt.plot(df_history_ts_process['increment-price-prev10sec_m1'][1768:])
plt.plot(df_history_ts_process['increment-price-prev10sec_m2'][1768:])
plt.plot(df_history_ts_process['increment-price-prev10sec_m3'][1768:])
plt.figure()
plt.plot(df_history_ts_process['increment-price'][1768:])
plt.plot(df_history_ts_process['increment-price_m1'][1768:])
plt.plot(df_history_ts_process['increment-price_m2'][1768:])
plt.plot(df_history_ts_process['increment-price_m3'][1768:])
plt.figure()
plt.plot(df_history_ts_process['increment-price-target'][1768:])
plt.plot(df_history_ts_process['increment-price-target_m1'][1768:])
plt.plot(df_history_ts_process['increment-price-target_m2'][1768:])
plt.plot(df_history_ts_process['increment-price-target_m3'][1768:])
plt.plot()
In [ ]:
In [ ]:
for i in range(0, len(df_history_ts_process.columns)): print(df_history_ts_process.columns[i])
In [ ]:
# housekeeping: delete some columns
# df_history_ts_process.drop('date-curr_y', axis=1, inplace=True)
In [ ]:
parm_record_cut_ccyy
In [ ]:
# remove first 'parm_record_cut_ccyy' months from dataset
df_history_ts_process = df_history_ts_process[df_history_ts_process['date-curr'] > parm_record_cut_ccyy]
In [ ]:
# total 61 seconds/rows per month:
# remove first 'parm_record_cut_row_head' records
# remove last 'parm_record_cut_row_tail' records
# 'second' holds two-character strings; compare as integers so thresholds
# below 10 work too (as text, '15' >= '5' is False).
second_of_minute = df_history_ts_process['second'].astype(int)
rows_to_keep = (
    (second_of_minute >= parm_record_cut_row_head)
    & (second_of_minute <= 60 - parm_record_cut_row_tail)
)
df_history_ts_process = df_history_ts_process[rows_to_keep]
In [ ]:
# Reset index after housekeeping
df_history_ts_process = df_history_ts_process.reset_index(drop=True)
In [ ]:
df_history_ts_process.head()
In [ ]:
df_history_ts_process.tail()
In [ ]:
plt.figure()
plt.plot(df_history_ts_process['increment-price'][974:])
plt.plot(df_history_ts_process['increment-price-mv3sec'][974:])
plt.plot(df_history_ts_process['increment-price-mv7sec'][974:])
plt.plot(df_history_ts_process['increment-price-mv11sec'][974:])
plt.plot(df_history_ts_process['increment-price-mv15sec'][974:])
plt.figure()
plt.plot(df_history_ts_process['increment-price-mv15sec'][974:])
plt.plot(df_history_ts_process['increment-price-mv15sec_m1'][974:])
plt.plot(df_history_ts_process['increment-price-mv15sec_m2'][974:])
plt.plot(df_history_ts_process['increment-price-mv15sec_m3'][974:])
plt.plot()
In [ ]:
In [ ]:
# plt.plot(df_history_ts_process['d-avg-low-price'])
# plt.figure()
# plt.figure()
# plt.plot(df_history_ts_process['d-avg-low-price_m1'])
# plt.figure()
# plt.plot(df_history_ts_process['d-avg-low-price_m2'])
# plt.figure()
# plt.plot(df_history_ts_process['d-avg-low-price_m3'])
In [ ]:
for i in range(0, len(df_history_ts_process.columns)): print(df_history_ts_process.columns[i])
In [ ]:
# Assemble the feature matrix X and the target vector y.
# Left out on purpose: the raw identifier/date columns ('ccyy-mm', 'time',
# 'bid-price', 'date-*', 'datetime-*', 'year', 'hour', 'minute') and the
# current month's 'increment-price-target', which is the target itself.
_prev_cols = ['increment-price-prev%dsec' % sec for sec in range(1, 16)]
_mv_cols = ['increment-price-mv%dsec' % sec for sec in range(2, 16)]
_table_cols = [
    'volume-plate_m0',
    'ratio-bid_m0',
    'deal-early-second',
    'deal-price-avg',
    'd-avg-low-price',
    'd-increment-avg-low-price_m0',
]

# Current-month features.
_feature_cols = (
    ['month', 'second', 'base-price15sec', 'increment-price']
    + _prev_cols
    + _mv_cols
    + ['volume-plate', 'ratio-bid']
    + _table_cols
)
# Columns brought in by the _m1 / _m2 / _m3 merges. Their
# 'increment-price-target' columns are included as features.
for _suffix in ('_m1', '_m2', '_m3'):
    _feature_cols += [
        _name + _suffix
        for _name in (['base-price15sec', 'increment-price', 'increment-price-target']
                      + _prev_cols + _mv_cols + _table_cols)
    ]

X = df_history_ts_process[_feature_cols]
X_col = X.columns # get the column list
# .values replaces .as_matrix(), which was removed in pandas 1.0.
X = X.values
y = df_history_ts_process['increment-price-target'].values.reshape(len(df_history_ts_process),)
In [ ]:
X_col
In [ ]:
plt.figure()
plt.plot(X)
plt.figure()
plt.plot(y)
In [ ]:
rng = check_random_state(0)
In [ ]:
# GB
classifier_GB = GradientBoostingRegressor(n_estimators=1500, # score: 0.94608 (AUC 0.81419), learning_rate=0.001, max_features=8 <<< Best
# loss='deviance',
# subsample=1,
# max_depth=5,
# min_samples_split=20,
learning_rate=0.002,
# max_features=10,
random_state=rng)
In [ ]:
# AB
classifier_AB = AdaBoostRegressor(n_estimators=1500, # score: 0.93948 (AUC 0.88339), learning_rate=0.004 <<< Best
learning_rate=0.002,
random_state=rng)
In [ ]:
# RF
classifier_RF = RandomForestRegressor(n_estimators=1500, # score: 0.94207 (AUC 0.81870), max_depth=3, min_samples_split=20, <<< Best
# max_features=10,
# max_depth=3,
# min_samples_split=20,
random_state=rng)
In [ ]:
# ET
classifier_ET = ExtraTreesRegressor(n_estimators=1000, # score: 0.94655 (AUC 0.84364), max_depth=3, min_samples_split=20, max_features=10 <<< Best
# max_depth=3,
# min_samples_split=20,
# max_features=10,
random_state=rng)
In [ ]:
# BG
classifier_BG = BaggingRegressor(n_estimators=500, # score: 0.70725 (AUC 0.63729) <<< Best
# max_features=10,
random_state=rng)
In [ ]:
classifier_LR = LinearRegression() # score: 0.90199 (AUC 0.80569)
In [ ]:
# classifier_SVCL = svm.SVC(kernel='linear', probability=True, random_state=rng) # score: 0.89976 (AUC 0.70524)
classifier_SVRL = svm.SVR(kernel='linear') # score: 0.89976 (AUC 0.70524)
In [ ]:
classifier_SVRR = svm.SVR(kernel='rbf') # score: 0.80188 (AUC 0.50050)
# classifier_SVRR = svm.SVR(kernel='poly') # score: 0.80188 (AUC 0.50050)
In [ ]:
classifier_KNN = KNeighborsRegressor(n_neighbors=2)
# The target is continuous, so stratified splitting does not apply. An
# integer cv makes cross_val_score use plain K-fold for a regressor, with
# parm_ts_valid_month folds.
cv = cross_val_score(classifier_KNN,
                     X,
                     y,
                     cv=parm_ts_valid_month)
print('KNN CV score: {0:.5f}'.format(cv.mean()))
In [ ]:
In [ ]:
# classifier = classifier_GB # 219.099617786
# classifier = classifier_AB # 230.101439444
classifier = classifier_RF # 197.955555556
# classifier = classifier_ET #
# classifier = classifier_BG #
# classifier = classifier_LR #
# classifier = classifier_SVRL #
# classifier = classifier_SVRR #
In [ ]:
n_splits = parm_ts_valid_cycle
print('cycle seconds : %d' % n_splits)
# n_splits=54 # 19 seconds/records for each bidding month
# n_splits=19 # 19 seconds/records for each bidding month
n_fold = parm_ts_valid_month
print('cycle month : %d' % n_fold)
# X_train_1 = X[0:(len(X)-batch*n_splits)]
# y_train_1 = y[0:(len(X)-batch*n_splits)]
# X_test_1 = X[(len(X)-batch*n_splits):((len(X)-batch*n_splits)+n_splits)]
# y_test_1 = y[(len(X)-batch*n_splits):((len(X)-batch*n_splits)+n_splits)]
In [ ]:
n_fold=7
In [ ]:
# Walk-forward evaluation: for batch = 1 .. n_fold-1, hold out the block of
# n_splits rows that starts batch*n_splits rows before the end, train on
# everything before it, and keep predictions and truth (scaled and original
# units) per fold.
y_pred = {}
y_test = {}
y_pred_org = {}
y_test_org = {}

for i, batch in enumerate(range(1, n_fold)):
    test_start = len(X) - batch * n_splits
    test_end = test_start + n_splits
    X_train_1 = X[0:test_start]
    y_train_1 = y[0:test_start]
    X_test_1 = X[test_start:test_end]
    y_test_1 = y[test_start:test_end]
    print(len(X_train_1))

    # ReScale: fit the scalers on the training part only
    ScalerX = StandardScaler()
    ScalerX.fit(X_train_1)
    X_train_1 = ScalerX.transform(X_train_1)
    X_test_1 = ScalerX.transform(X_test_1)

    # StandardScaler works on 2-D input; flatten back to 1-D afterwards so
    # targets and predictions share the shape (n,). Mixing (n, 1) with (n,)
    # would broadcast their difference to (n, n).
    ScalerY = StandardScaler()
    ScalerY.fit(y_train_1.reshape(-1, 1))
    y_train_1 = ScalerY.transform(y_train_1.reshape(-1, 1)).ravel()
    y_test_1 = ScalerY.transform(y_test_1.reshape(-1, 1)).ravel()

    y_pred[i] = classifier.fit(X_train_1, y_train_1).predict(X_test_1)
    y_test[i] = y_test_1

    # Back to original units (inverse_transform also needs 2-D input).
    y_pred_org[i] = ScalerY.inverse_transform(y_pred[i].reshape(-1, 1)).ravel()
    y_test_org[i] = ScalerY.inverse_transform(y_test[i].reshape(-1, 1)).ravel()

    plt.figure()
    plt.plot(y_train_1)
    plt.figure()
    plt.plot(y_test[i])
    plt.plot(y_pred[i])
In [ ]:
# Mean absolute error per fold, in scaled units.
# Both arrays are flattened first so the difference is element-wise even if
# one is stored as an (n, 1) column and the other as an (n,) vector.
k = []
for i in range(0, len(y_test)):
    k.append(np.mean(np.abs(np.ravel(y_test[i]) - np.ravel(y_pred[i]))))
k_mean = np.mean(k)
print(k_mean)
print()
print(k)
In [ ]:
# 49~51 second predicts 56~58 second
# Rows 34, 35 and 36 of a fold, hence the slice 34:37 (34:36 stops one row
# short). Arrays are flattened first so the difference is element-wise.
k = []
for i in range(0, len(y_test)):
    k.append(np.mean(np.abs(np.ravel(y_test[i])[34:37] - np.ravel(y_pred[i])[34:37])))
k_mean = np.mean(k)
print(k_mean)
print()
print(k)
In [ ]:
# Mean absolute error per fold, in original (unscaled) units.
# Both arrays are flattened first so the difference is element-wise even if
# one is stored as an (n, 1) column and the other as an (n,) vector.
k = []
for i in range(0, len(y_test)):
    k.append(np.mean(np.abs(np.ravel(y_test_org[i]) - np.ravel(y_pred_org[i]))))
k_mean = np.mean(k)
print(k_mean)
print()
print(k)
In [ ]:
# 49~51 second predicts 56~58 second
# Rows 34, 35 and 36 of a fold, hence the slice 34:37 (34:36 stops one row
# short). Arrays are flattened first so the difference is element-wise.
k = []
for i in range(0, len(y_test)):
    k.append(np.mean(np.abs(np.ravel(y_test_org[i])[34:37] - np.ravel(y_pred_org[i])[34:37])))
k_mean = np.mean(k)
print(k_mean)
print()
print(k)
In [ ]:
# 48 second predicts 55 second (row 33 of each fold, original units)
k = [
    np.mean(np.sqrt(np.square(y_test_org[fold][33:34] - y_pred_org[fold][33:34])))
    for fold in range(len(y_test))
]
k_mean = np.mean(k)
print(k_mean)
print()
print(k)
In [ ]:
# 49 second predicts 56 second (row 34 of each fold, original units)
k = [
    np.mean(np.sqrt(np.square(y_test_org[fold][34:35] - y_pred_org[fold][34:35])))
    for fold in range(len(y_test))
]
k_mean = np.mean(k)
print(k_mean)
print()
print(k)
In [ ]:
# 50 second predicts 57 second (row 35 of each fold, original units)
k = [
    np.mean(np.sqrt(np.square(y_test_org[fold][35:36] - y_pred_org[fold][35:36])))
    for fold in range(len(y_test))
]
k_mean = np.mean(k)
print(k_mean)
print()
print(k)
In [ ]:
# 51 second predicts 58 second (row 36 of each fold, original units)
k = [
    np.mean(np.sqrt(np.square(y_test_org[fold][36:37] - y_pred_org[fold][36:37])))
    for fold in range(len(y_test))
]
k_mean = np.mean(k)
print(k_mean)
print()
print(k)
In [ ]:
# 52 second predicts 59 second (row 37 of each fold, original units)
k = [
    np.mean(np.sqrt(np.square(y_test_org[fold][37:38] - y_pred_org[fold][37:38])))
    for fold in range(len(y_test))
]
k_mean = np.mean(k)
print(k_mean)
print()
print(k)
In [ ]:
# 53 second predicts 60 second (row 38 of each fold, original units)
k = [
    np.mean(np.sqrt(np.square(y_test_org[fold][38:39] - y_pred_org[fold][38:39])))
    for fold in range(len(y_test))
]
k_mean = np.mean(k)
print(k_mean)
print()
print(k)
In [ ]:
plt.plot(y_test_org[0])
plt.plot(y_pred_org[0])
In [ ]:
plt.plot(k)
In [ ]:
In [ ]:
# plt.plot(df_history_ts_process['increment-price-target'][819:])
plt.plot(df_history_ts_process['increment-price'][819:])
plt.plot(df_history_ts_process['d-increment-avg-low-price_m0'][819:])
plt.plot(df_history_ts_process['increment-price'][819:] - df_history_ts_process['d-increment-avg-low-price_m0'][819:])
plt.figure()
plt.plot(df_history_ts_process['d-increment-avg-low-price_m0'][819:])
plt.plot(df_history_ts_process['d-increment-avg-low-price_m0_m1'][819:])
plt.plot(df_history_ts_process['d-increment-avg-low-price_m0_m2'][819:])
plt.plot(df_history_ts_process['d-increment-avg-low-price_m0_m3'][819:])
In [ ]:
def util_feature_importances(classifier):
    """Print the estimator and rank the feature columns by importance.

    Reads the module-level ``X_col`` (feature names, in the column order of
    the training matrix) and ``classifier.feature_importances_``.

    Args:
        classifier: estimator exposing ``feature_importances_``.

    Returns:
        List of ``(column_name, importance)`` tuples, highest importance
        first.
    """
    print(classifier)
    importances = classifier.feature_importances_
    # Pair each column with its importance by position. Sorting the pairs
    # directly, without an intermediate dict, keeps every column even if two
    # of them share a name.
    name_importance_pairs = [
        (col, importances[pos]) for pos, col in enumerate(X_col)
    ]
    return sorted(name_importance_pairs, key=operator.itemgetter(1), reverse=True)
In [ ]:
util_feature_importances(classifier_GB)
In [ ]:
util_feature_importances(classifier_RF)
In [ ]:
util_feature_importances(classifier_AB)
In [ ]:
util_feature_importances(classifier_ET)
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]: