By: 顾 瞻 GU Zhan (Sam)
July 2017
In [1]:
# from __future__ import print_function, division
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
import pandas as pd
import operator
from scipy import interp
from itertools import cycle
from sklearn import svm
from sklearn.utils.validation import check_random_state
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_curve, auc
from statsmodels.graphics.mosaicplot import mosaic
print(__doc__)
In [2]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
# from sklearn import svm
In [338]:
# Load the raw time-series history. The raw frame is kept untouched;
# derived feature columns are added to the `_process` copy instead.
df_history_ts = pd.read_csv('data/history_ts.csv')
df_history_ts_process = df_history_ts.copy()
# Preview the first rows.
df_history_ts_process.head()
Out[338]:
In [339]:
# Load the history table and work on a copy so the raw frame stays intact.
df_history_table = pd.read_csv('data/history_table.csv')
df_history_table_process = df_history_table.copy()
# Preview the first rows.
df_history_table_process.head()
Out[339]:
In [ ]:
In [ ]:
In [340]:
# convert an array of values into a dataset matrix
def create_dataset(dataset, look_back=1):
    """Slice a series into (window, next value) pairs for supervised learning.

    Parameters
    ----------
    dataset : 2-D array indexable as ``dataset[rows, 0]``
        Only column 0 is read.
    look_back : int, default 1
        Number of consecutive values in each input window.

    Returns
    -------
    (X, y) : tuple of numpy arrays
        ``X[k]`` holds ``dataset[k:k + look_back, 0]`` and ``y[k]`` holds the
        value that immediately follows that window.

    NOTE(review): the loop bound ``len(dataset) - look_back - 1`` stops one
    window early, so the final value of the series is never used as a target.
    The bound is kept unchanged so the output shape stays the same.
    """
    data_x, data_y = [], []
    for i in range(len(dataset) - look_back - 1):
        data_x.append(dataset[i:i + look_back, 0])
        data_y.append(dataset[i + look_back, 0])
    # The module is imported as `np`; the previous `numpy.array` raised NameError.
    return np.array(data_x), np.array(data_y)
In [341]:
# Length of one bidding month, in seconds/records.
parm_ts_cycle = 61
# Number of complete months contained in the history.
parm_ts_month = len(df_history_ts) // parm_ts_cycle
print('parm_ts_month : ', parm_ts_month)
# Largest lag for the previous-bid-price features (1 .. 15 sec).
parm_calculate_prev_bp = 15
# Largest window for the moving-average features (2 .. 15 sec).
parm_calculate_mv = 15
In [342]:
# previous N sec ['bid-price']
# For every lag 1..parm_calculate_prev_bp add a column holding the bid price
# `gap` records earlier within the same month. The first `gap` records of each
# month have no such predecessor inside the month and are filled with 0.
bid_price = df_history_ts_process['bid-price']
for gap in range(1, parm_calculate_prev_bp + 1):
    col_name = 'bid-price-prev' + str(gap) + 'sec'
    print('Creating : ', col_name)
    lagged = []
    for month in range(0, parm_ts_month):
        month_start = month * parm_ts_cycle
        for i in range(0, parm_ts_cycle):
            # Never look back across a month boundary.
            lagged.append(0 if i < gap else bid_price[month_start + i - gap])
    # Build the column in one go: growing a DataFrame row by row with .loc is
    # quadratic and produces an object-dtype column. The former
    # `col_data.append(col_data_zeros)` call referenced an undefined name and
    # discarded its result, so it is dropped.
    col_data = pd.DataFrame({col_name: lagged})
    # Rows are aligned on the index label; rows past the last complete month
    # (if any) stay NaN.
    df_history_ts_process[col_name] = col_data[col_name]
    print('Total records processed : ', len(col_data))
In [343]:
# Display the rows from position 1768 onward of the processed frame.
df_history_ts_process[1768:]
Out[343]:
In [ ]:
In [373]:
# Display the current value of `month` (bound as a loop variable in the feature cells).
month
Out[373]:
In [374]:
# Display the month length used by the feature cells.
parm_ts_cycle
Out[374]:
In [375]:
# Display the current value of `i` (bound as a loop variable in the feature cells).
i
Out[375]:
In [384]:
# Scratch check of a moving-average window: the `gap` records that immediately
# precede position month*parm_ts_cycle+i (the end of the slice is exclusive).
gap=5
df_history_ts_process['bid-price'][month*parm_ts_cycle+i-gap:month*parm_ts_cycle+i]
Out[384]:
In [385]:
# Mean of the `gap` records preceding position month*parm_ts_cycle+i.
np.mean(df_history_ts_process['bid-price'][month*parm_ts_cycle+i-gap:month*parm_ts_cycle+i])
Out[385]:
In [ ]:
In [368]:
# previous N sec Moving Average ['bid-price']
# For every window 2..parm_calculate_mv add a column holding the mean of the
# `gap` bid prices that precede each record within the same month. The first
# `gap` records of each month lack a full window and are filled with 0.
bid_price = df_history_ts_process['bid-price']
for gap in range(2, parm_calculate_mv + 1):  # MV starts from 2 seconds
    col_name = 'bid-price-mv' + str(gap) + 'sec'
    print('Creating : ', col_name)
    averaged = []
    for month in range(0, parm_ts_month):
        month_start = month * parm_ts_cycle
        for i in range(0, parm_ts_cycle):
            if i < gap:
                averaged.append(0)
            else:
                # Window covers the `gap` records before the current one;
                # the current record itself is excluded.
                window_end = month_start + i
                averaged.append(np.mean(bid_price[window_end - gap:window_end]))
    # Build the column in one go: growing a DataFrame row by row with .loc is
    # quadratic and produces an object-dtype column. The former
    # `col_data.append(col_data_zeros)` call referenced an undefined name and
    # discarded its result, so it is dropped.
    col_data = pd.DataFrame({col_name: averaged})
    df_history_ts_process[col_name] = col_data[col_name]
    print('Total records processed : ', len(col_data))
In [369]:
# Display the rows from position 1768 onward of the processed frame.
df_history_ts_process[1768:]
Out[369]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [390]:
# Overlay the bid price with its 10-second lag and 10-second moving average
# for the rows from position 1784 onward.
_tail_rows = df_history_ts_process[1784:]
plt.plot(_tail_rows['bid-price'])
plt.plot(_tail_rows['bid-price-prev10sec'])
plt.plot(_tail_rows['bid-price-mv10sec'])
Out[390]:
In [391]:
# Overlay the bid price with its 5-, 10- and 15-second lags for the rows
# from position 1784 onward.
_tail_rows = df_history_ts_process[1784:]
plt.plot(_tail_rows['bid-price'])
plt.plot(_tail_rows['bid-price-prev5sec'])
plt.plot(_tail_rows['bid-price-prev10sec'])
plt.plot(_tail_rows['bid-price-prev15sec'])
Out[391]:
In [392]:
# Overlay the bid price with its 5-, 10- and 15-second moving averages for
# the rows from position 1784 onward.
_tail_rows = df_history_ts_process[1784:]
plt.plot(_tail_rows['bid-price'])
plt.plot(_tail_rows['bid-price-mv5sec'])
plt.plot(_tail_rows['bid-price-mv10sec'])
plt.plot(_tail_rows['bid-price-mv15sec'])
Out[392]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
# previous N sec Moving Average ['bid-price'] (windows 1..parm_calculate_mv)
# This cell used to copy the plain lagged price into the 'bid-price-mv*'
# columns, which overwrote real moving averages with lag values. It now stores
# the mean of the `gap` records preceding each record within the same month;
# the first `gap` records of each month are filled with 0.
bid_price = df_history_ts_process['bid-price']
for gap in range(1, parm_calculate_mv + 1):
    col_name = 'bid-price-mv' + str(gap) + 'sec'
    print('Creating : ', col_name)
    averaged = []
    for month in range(0, parm_ts_month):
        month_start = month * parm_ts_cycle
        for i in range(0, parm_ts_cycle):
            if i < gap:
                averaged.append(0)
            else:
                window_end = month_start + i
                averaged.append(np.mean(bid_price[window_end - gap:window_end]))
    # The former `col_data.append(col_data_zeros)` call referenced an
    # undefined name and discarded its result, so it is dropped.
    col_data = pd.DataFrame({col_name: averaged})
    df_history_ts_process[col_name] = col_data[col_name]
    print('len : ', len(col_data))
In [ ]:
In [ ]:
In [318]:
# previous N sec (single lag; change `gap` to build another column)
# The chain of overwritten `gap = 1 .. 9` assignments was dead code; only the
# last value was ever used.
gap = 10
col_name = 'bid-price-prev' + str(gap) + 'sec'
bid_price = df_history_ts_process['bid-price']
lagged = []
for month in range(0, parm_ts_month):
    month_start = month * parm_ts_cycle
    for i in range(0, parm_ts_cycle):
        # Read the record `gap` positions earlier. The previous code read the
        # current record, so the "prev" column merely duplicated 'bid-price'.
        # The first `gap` records of each month are filled with 0.
        lagged.append(0 if i < gap else bid_price[month_start + i - gap])
# The former `col_data.append(col_data_zeros)` call referenced an undefined
# name and discarded its result, so it is dropped.
col_data = pd.DataFrame({col_name: lagged})
print('len : ', len(col_data))
df_history_ts_process[col_name] = col_data[col_name]
In [319]:
# Number of rows in the feature column built most recently.
len(col_data)
Out[319]:
In [ ]:
In [321]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [268]:
# previous N sec, first month only (prototype of the per-month lag feature)
gap = 10
col_name = 'bid-price-prev' + str(gap) + 'sec'
# The first `gap` records have no predecessor `gap` positions back: use 0.
lagged = [0.0] * gap
for i in range(gap, parm_ts_cycle):
    # Read the record `gap` positions earlier. The previous code copied the
    # current record, which is not a lag at all.
    lagged.append(df_history_ts['bid-price'][i - gap])
col_data = pd.DataFrame({col_name: lagged})
print(len(col_data))
In [ ]:
In [249]:
# Reset the working frame to a fresh copy of the raw history; any feature
# columns previously added to the working frame are discarded.
df_history_ts_process = df_history_ts.copy()
In [252]:
# Copy the current lag column into a scratch 'tmp' column of the table frame.
df_history_table_process['tmp'] = col_data['bid-price-prev'+str(gap)+'sec']
In [254]:
# Preview the last rows of the table frame.
df_history_table_process.tail()
Out[254]:
In [ ]:
In [ ]:
In [235]:
# Display the feature column built most recently.
col_data
Out[235]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [3]:
# Load the modelling table from CSV; the commented lines are alternative
# input files.
df_wnv_raw = pd.read_csv('bid_v007_ReScale.csv', encoding='utf-8')
# df_wnv_raw = pd.read_csv('bid_v010_ts_target.csv', encoding='utf-8')
# df_wnv_raw = pd.read_csv('bid_v010_ts_target_mean.csv', encoding='utf-8')
# Preview the first rows.
df_wnv_raw.head()
Out[3]:
In [441]:
# Point the modelling input at the processed time-series frame. This is a
# plain assignment, not a copy: both names refer to the same DataFrame.
df_wnv_raw = df_history_ts_process
# List the available columns.
df_history_ts_process.columns
Out[441]:
In [459]:
# Feature matrix: the lagged bid prices and the moving averages. The
# commented-out names (identifier columns and the target itself) are excluded.
X = df_wnv_raw[[
    # 'ccyy-mm',
    # 'time',
    # 'bid-price',
    'bid-price-prev1sec',
    'bid-price-prev2sec', 'bid-price-prev3sec', 'bid-price-prev4sec',
    'bid-price-prev5sec', 'bid-price-prev6sec', 'bid-price-prev7sec',
    'bid-price-prev8sec', 'bid-price-prev9sec', 'bid-price-prev10sec',
    'bid-price-prev11sec', 'bid-price-prev12sec', 'bid-price-prev13sec',
    'bid-price-prev14sec', 'bid-price-prev15sec',
    'bid-price-mv2sec',
    'bid-price-mv3sec', 'bid-price-mv4sec', 'bid-price-mv5sec',
    'bid-price-mv6sec', 'bid-price-mv7sec', 'bid-price-mv8sec',
    'bid-price-mv9sec', 'bid-price-mv10sec', 'bid-price-mv11sec',
    'bid-price-mv12sec', 'bid-price-mv13sec', 'bid-price-mv14sec',
    'bid-price-mv15sec',
]]
X_col = X.columns  # keep the feature names; the ndarray below loses them
# DataFrame.as_matrix() was removed in pandas 1.0; .values returns the same
# ndarray on old and new pandas alike.
# X = StandardScaler().fit_transform(X.values)
X = X.values
# y = StandardScaler().fit_transform(df_wnv_raw[['bid-price']].values).reshape(len(df_wnv_raw),)
y = df_wnv_raw['bid-price'].values  # 1-D target vector, one entry per row
In [ ]:
In [96]:
# Feature matrix from the 'R01_*' columns; the commented-out names (the
# volume columns, 'R01_d.success.ratio' and the target) are excluded.
X = df_wnv_raw[[
    'R01_bid.sec',
    'R01_month',
    'R01_bid.curr.mth',
    'R01_increment.curr.mth',
    'R01_MV.3sec.curr',
    'R01_MV.5sec.curr',
    'R01_MV.3sec.curr.d.Avg.Low.prev.mth',
    'R01_MV.5sec.curr.d.Avg.Low.prev.mth',
    'R01_bid.prev.mth',
    'R01_increment.prev.mth',
    'R01_MV.3sec.prev',
    'R01_MV.5sec.prev',
    'R01_MV.3sec.prev.d.Avg.Low.prev.mth',
    'R01_MV.5sec.prev.d.Avg.Low.prev.mth',
    'R01_d.Avg.Low.prev.mth',
    'R01_increment.curr.mth.d.Avg.Low.prev.mth',
    'R01_d.earliest.success.time.sec.prev.mth',
    # 'R01_Volume.Plate.curr.mth',
    # 'R01_Volume.Bidder.curr.mth',
    'R01_success.ratio.curr.mth',
    # 'R01_Volume.Plate.prev.mth',
    # 'R01_Volume.Bidder.prev.mth',
    'R01_success.ratio.prev.mth',
    # 'R01_d.Volume.Plate',
    # 'R01_d.Volume.Bidder',
    # 'R01_d.success.ratio',
    'R01_anomaly',
    # 'R01_target.7sec'
]]
X_col = X.columns  # keep the feature names; the ndarray below loses them
# DataFrame.as_matrix() was removed in pandas 1.0; .values returns the same
# ndarray on old and new pandas alike.
# X = StandardScaler().fit_transform(X.values)
X = X.values
y = df_wnv_raw['R01_target.7sec'].values  # 1-D target vector, one entry per row
In [61]:
# Feature matrix from every 'R01_*' column except the target.
X = df_wnv_raw[[
    'R01_bid.sec',
    'R01_month',
    'R01_bid.curr.mth',
    'R01_increment.curr.mth',
    'R01_MV.3sec.curr',
    'R01_MV.5sec.curr',
    'R01_MV.3sec.curr.d.Avg.Low.prev.mth',
    'R01_MV.5sec.curr.d.Avg.Low.prev.mth',
    'R01_bid.prev.mth',
    'R01_increment.prev.mth',
    'R01_MV.3sec.prev',
    'R01_MV.5sec.prev',
    'R01_MV.3sec.prev.d.Avg.Low.prev.mth',
    'R01_MV.5sec.prev.d.Avg.Low.prev.mth',
    'R01_d.Avg.Low.prev.mth',
    'R01_increment.curr.mth.d.Avg.Low.prev.mth',
    'R01_d.earliest.success.time.sec.prev.mth',
    'R01_Volume.Plate.curr.mth',
    'R01_Volume.Bidder.curr.mth',
    'R01_success.ratio.curr.mth',
    'R01_Volume.Plate.prev.mth',
    'R01_Volume.Bidder.prev.mth',
    'R01_success.ratio.prev.mth',
    'R01_d.Volume.Plate',
    'R01_d.Volume.Bidder',
    'R01_d.success.ratio',
    'R01_anomaly',
    # 'R01_target.7sec'
]]
X_col = X.columns  # keep the feature names; the ndarray below loses them
# DataFrame.as_matrix() was removed in pandas 1.0; .values returns the same
# ndarray on old and new pandas alike.
# X = StandardScaler().fit_transform(X.values)
X = X.values
y = df_wnv_raw['R01_target.7sec'].values  # 1-D target vector, one entry per row
In [60]:
# Single-feature baseline: only 'R01_bid.curr.mth' is active; every other
# candidate column is commented out.
X = df_wnv_raw[[
    # 'R01_bid.sec',
    # 'R01_month',
    'R01_bid.curr.mth',
    # 'R01_increment.curr.mth',
    # 'R01_MV.3sec.curr',
    # 'R01_MV.5sec.curr',
    # 'R01_MV.3sec.curr.d.Avg.Low.prev.mth',
    # 'R01_MV.5sec.curr.d.Avg.Low.prev.mth',
    # 'R01_bid.prev.mth',
    # 'R01_increment.prev.mth',
    # 'R01_MV.3sec.prev',
    # 'R01_MV.5sec.prev',
    # 'R01_MV.3sec.prev.d.Avg.Low.prev.mth',
    # 'R01_MV.5sec.prev.d.Avg.Low.prev.mth',
    # 'R01_d.Avg.Low.prev.mth',
    # 'R01_increment.curr.mth.d.Avg.Low.prev.mth',
    # 'R01_d.earliest.success.time.sec.prev.mth',
    # 'R01_Volume.Plate.curr.mth',
    # 'R01_Volume.Bidder.curr.mth',
    # 'R01_success.ratio.curr.mth',
    # 'R01_Volume.Plate.prev.mth',
    # 'R01_Volume.Bidder.prev.mth',
    # 'R01_success.ratio.prev.mth',
    # 'R01_d.Volume.Plate',
    # 'R01_d.Volume.Bidder',
    # 'R01_d.success.ratio',
    # 'R01_anomaly',
    # 'R01_target.7sec'
]]
X_col = X.columns  # keep the feature names; the ndarray below loses them
# DataFrame.as_matrix() was removed in pandas 1.0; .values returns the same
# ndarray on old and new pandas alike.
# X = StandardScaler().fit_transform(X.values)
X = X.values
y = df_wnv_raw['R01_target.7sec'].values  # 1-D target vector, one entry per row
In [443]:
# Display the names of the selected feature columns.
X_col
Out[443]:
In [460]:
# Plot the target vector; plotting the feature matrix is left disabled.
# plt.plot(X)
plt.plot(y)
Out[460]:
In [ ]:
In [461]:
# Random state built from seed 0 and handed to the estimators below.
# NOTE(review): every estimator receives this same object, so results
# presumably depend on the order in which the models are fitted — confirm;
# passing the integer seed to each estimator would avoid that.
rng = check_random_state(0)
In [462]:
# GB: gradient-boosting regressor.
# Tuning note kept from an earlier run:
#   score: 0.94608 (AUC 0.81419), learning_rate=0.001, max_features=8 <<< Best
# Settings tried before and currently not passed: loss='deviance',
# subsample=1, max_depth=5, min_samples_split=20, max_features=10.
classifier_GB = GradientBoostingRegressor(
    learning_rate=0.002,
    n_estimators=1500,
    random_state=rng,
)
In [463]:
# AB: AdaBoost regressor.
# Tuning note kept from an earlier run:
#   score: 0.93948 (AUC 0.88339), learning_rate=0.004 <<< Best
classifier_AB = AdaBoostRegressor(
    learning_rate=0.002,
    n_estimators=1500,
    random_state=rng,
)
In [464]:
# RF: random-forest regressor.
# Tuning note kept from an earlier run:
#   score: 0.94207 (AUC 0.81870), max_depth=3, min_samples_split=20, <<< Best
# Settings tried before and currently not passed: max_features=10,
# max_depth=3, min_samples_split=20.
classifier_RF = RandomForestRegressor(
    random_state=rng,
    n_estimators=1500,
)
In [465]:
# ET: extra-trees regressor.
# Tuning note kept from an earlier run:
#   score: 0.94655 (AUC 0.84364), max_depth=3, min_samples_split=20, max_features=10 <<< Best
# Settings tried before and currently not passed: max_depth=3,
# min_samples_split=20, max_features=10.
classifier_ET = ExtraTreesRegressor(
    random_state=rng,
    n_estimators=1000,
)
In [466]:
# BG: bagging regressor.
# Tuning note kept from an earlier run: score: 0.70725 (AUC 0.63729) <<< Best
# Setting tried before and currently not passed: max_features=10.
classifier_BG = BaggingRegressor(
    random_state=rng,
    n_estimators=500,
)
In [467]:
# LR: linear regression with default settings.
# NOTE(review): the trailing score/AUC figures presumably come from an
# earlier run — confirm before relying on them.
classifier_LR = LinearRegression() # score: 0.90199 (AUC 0.80569)
In [468]:
# SVRL: support-vector regression with a linear kernel.
# svm.SVR() defaults to kernel='rbf', so the bare constructor made this model
# identical to the RBF model defined in the next cell; the linear kernel is
# now requested explicitly.
# Earlier classification variant:
# classifier_SVCL = svm.SVC(kernel='linear', probability=True, random_state=rng) # score: 0.89976 (AUC 0.70524)
classifier_SVRL = svm.SVR(kernel='linear')
In [469]:
# SVRR: support-vector regression with an RBF kernel.
# The estimator-selection cell refers to this model as `classifier_SVRR`,
# which was never bound; bind it here and keep `classifier_SVCR` as an alias
# so existing references keep working.
classifier_SVRR = svm.SVR(kernel='rbf') # score: 0.80188 (AUC 0.50050)
classifier_SVCR = classifier_SVRR
# Polynomial-kernel alternative:
# classifier_SVRR = svm.SVR(kernel='poly') # score: 0.80188 (AUC 0.50050)
In [470]:
# KNN: 2-nearest-neighbours regressor with a 19-fold cross-validation score.
# StratifiedKFold balances class labels and is meant for classification
# targets; the target here is a regression value, so plain KFold is used
# (contiguous, unshuffled folds).
from sklearn.model_selection import KFold

classifier_KNN = KNeighborsRegressor(n_neighbors=2) # score: 0.94018 (AUC 0.72792)
cv = cross_val_score(classifier_KNN,
                     X,
                     y,
                     cv=KFold(n_splits=19))
# cross_val_score uses the estimator's default scorer here.
print('KNN CV score: {0:.5f}'.format(cv.mean()))
In [ ]:
In [471]:
# Choose the estimator used by the walk-forward evaluation; leave exactly one
# assignment active.
# NOTE(review): the trailing numbers look like error figures recorded from
# earlier runs — confirm which metric they are.
# classifier = classifier_GB # 324.632308296
# classifier = classifier_AB # 429.646733221
# classifier = classifier_RF # 175.504322802
# classifier = classifier_ET # 172.097916817, 0.0724812030075
classifier = classifier_BG # 175.451381872
# classifier = classifier_LR # 128.465059749, 0.11
# classifier = classifier_SVRL # 3789.82169312
# classifier = classifier_SVRR # 3789.82169312, 0.10754224349
In [472]:
# n_splits is the size, in records, of each held-out test block in the
# evaluation loop. NOTE(review): 61 equals parm_ts_cycle (records per month);
# the 54 and 19 alternatives presumably belong to other datasets — confirm.
n_splits=61 # records per held-out block
# n_splits=54
# n_splits=19
# The evaluation loop iterates range(1, n_fold), i.e. n_fold-1 blocks.
n_fold = 30
# Single-split prototype of the slicing used in the evaluation loop:
# X_train_1 = X[0:(len(X)-batch*n_splits)]
# y_train_1 = y[0:(len(X)-batch*n_splits)]
# X_test_1 = X[(len(X)-batch*n_splits):((len(X)-batch*n_splits)+n_splits)]
# y_test_1 = y[(len(X)-batch*n_splits):((len(X)-batch*n_splits)+n_splits)]
In [473]:
# Walk-forward evaluation. For batch = 1 .. n_fold-1 the block of n_splits
# records that starts batch*n_splits records before the end is held out, and
# the estimator is refitted on every record before that block.
# y_pred[k] / y_test[k] hold predictions / targets of the k-th block, where
# k = 0 is the block closest to the end of the data.
y_pred = {}
y_test = {}
i = 0
for batch in range(1, n_fold):
    split = len(X) - batch * n_splits  # first held-out row of this batch
    if split <= 0:
        # A non-positive split would either leave nothing to train on or wrap
        # around through negative slicing and silently pick wrong rows.
        print('Stopping at batch', batch, ': no training rows left')
        break
    X_train_1, y_train_1 = X[:split], y[:split]
    X_test_1 = X[split:split + n_splits]
    y_test_1 = y[split:split + n_splits]
    print(len(X_train_1))
    y_pred[i] = classifier.fit(X_train_1, y_train_1).predict(X_test_1)
    y_test[i] = y_test_1
    # One figure with the training target ...
    plt.figure()
    plt.plot(y_train_1)
    # ... and one comparing held-out targets with predictions.
    plt.figure()
    plt.plot(y_test[i])
    plt.plot(y_pred[i])
    i += 1
In [474]:
# Mean absolute error of every held-out block (sqrt of the squared residual
# is its absolute value), followed by the average over the blocks.
k = []
for i in range(len(y_test)):
    residual = y_test[i] - y_pred[i]
    k.append(np.sqrt(np.square(residual)).mean())
k_mean = np.mean(k)
print(k_mean)
print()
print(k)
In [478]:
# Mean absolute error of every held-out block restricted to records 45..54,
# followed by the average over the blocks.
k = []
for i in range(len(y_test)):
    residual = y_test[i][45:55] - y_pred[i][45:55]
    k.append(np.sqrt(np.square(residual)).mean())
k_mean = np.mean(k)
print(k_mean)
print()
print(k)
In [475]:
# Mean absolute error of every held-out block restricted to records 13..15,
# followed by the average over the blocks.
k = []
for i in range(len(y_test)):
    residual = y_test[i][13:16] - y_pred[i][13:16]
    k.append(np.sqrt(np.square(residual)).mean())
k_mean = np.mean(k)
print(k_mean)
print()
print(k)
In [ ]:
In [122]:
# Held-out targets of block 1 from record 13 onward.
y_test[1][13:]
Out[122]:
In [123]:
# Predictions for block 1 from record 13 onward.
y_pred[1][13:]
Out[123]:
In [127]:
# Mean absolute error of block 4 over all of its records.
np.mean(np.sqrt(np.square(y_test[4] - y_pred[4])))
Out[127]:
In [128]:
# Mean absolute error of block 4 restricted to records 13..15.
np.mean(np.sqrt(np.square(y_test[4][13:16] - y_pred[4][13:16])))
Out[128]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [436]:
# Tabulate the predictions: each key of y_pred (one per evaluated block)
# becomes a column.
y_pred_df = pd.DataFrame.from_dict(y_pred)
In [437]:
# Label the prediction columns 'month N' .. 'month 1', counting down from the
# first column. The labels are derived from the actual number of columns: the
# former hard-coded list of seven names raised a length mismatch whenever the
# evaluation produced a different number of blocks. With seven columns the
# result is identical to the old list.
n_months = len(y_pred_df.columns)
y_pred_df.columns = ['month ' + str(n_months - idx) for idx in range(n_months)]
In [367]:
# Write the prediction table to CSV without the row index.
y_pred_df.to_csv('bid_results_v001.csv', index=False)
In [438]:
# Display the prediction table.
y_pred_df
Out[438]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]: