By: 顾 瞻 GU Zhan (Sam)
July 2017
In [1]:
# from __future__ import print_function, division
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
import pandas as pd
import operator
from scipy import interp
from itertools import cycle
from sklearn import svm
from sklearn.utils.validation import check_random_state
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_curve, auc
from statsmodels.graphics.mosaicplot import mosaic
print(__doc__)
In [2]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
# from sklearn import svm
In [3]:
# Load the rescaled bidding data (the df_wnv_raw name looks carried over from an earlier notebook)
df_wnv_raw = pd.read_csv('bid_v007_ReScale.csv', encoding='utf-8')
df_wnv_raw.head()
Out[3]:
In [76]:
X = df_wnv_raw[[
'R01_bid.sec',
'R01_month',
'R01_bid.curr.mth',
'R01_increment.curr.mth',
'R01_MV.3sec.curr',
'R01_MV.5sec.curr',
'R01_MV.3sec.curr.d.Avg.Low.prev.mth',
'R01_MV.5sec.curr.d.Avg.Low.prev.mth',
'R01_bid.prev.mth',
'R01_increment.prev.mth',
'R01_MV.3sec.prev',
'R01_MV.5sec.prev',
'R01_MV.3sec.prev.d.Avg.Low.prev.mth',
'R01_MV.5sec.prev.d.Avg.Low.prev.mth',
'R01_d.Avg.Low.prev.mth',
'R01_increment.curr.mth.d.Avg.Low.prev.mth',
'R01_d.earliest.success.time.sec.prev.mth',
'R01_Volume.Plate.curr.mth',
'R01_Volume.Bidder.curr.mth',
'R01_success.ratio.curr.mth',
'R01_Volume.Plate.prev.mth',
'R01_Volume.Bidder.prev.mth',
'R01_success.ratio.prev.mth',
'R01_d.Volume.Plate',
'R01_d.Volume.Bidder',
'R01_d.success.ratio',
'R01_anomaly',
# 'R01_target.7sec'
]]
X_col = X.columns  # keep the feature names for later inspection
# Scaling was tried but left disabled; note that the SVR models further down are scale-sensitive.
# X = StandardScaler().fit_transform(X.values)
X = X.values  # .as_matrix() is deprecated in pandas; .values is the equivalent
y = df_wnv_raw['R01_target.7sec'].values  # 1-D target vector
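A quick sanity check on the matrices before modelling (a sketch, assuming all selected columns are numeric):
In [ ]:
# Confirm shapes line up and that no NaNs slipped through the rescaling
print(X.shape, y.shape)
print(np.isnan(X).sum(), np.isnan(y).sum())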
In [77]:
X_col
Out[77]:
In [78]:
rng = check_random_state(0)  # fixed seed so the model comparisons below are reproducible
In [79]:
# GB
# classifier_GB = GradientBoostingClassifier(n_estimators=500, # score: (AUC 0.77018), learning_rate=0.005, max_features=5
# classifier_GB = GradientBoostingClassifier(n_estimators=500, # score: (AUC 0.80), learning_rate=0.003, max_features=8
classifier_GB = GradientBoostingClassifier(n_estimators=500, # score: 0.94608 (AUC 0.81419), learning_rate=0.001, max_features=8 <<< Best
# classifier_GB = GradientBoostingClassifier(n_estimators=500, # score: (AUC 0.82), learning_rate=0.0005, max_features=8
# classifier_GB = GradientBoostingClassifier(n_estimators=500, # score: (AUC 0.82), learning_rate=0.0001, max_features=8
# classifier_GB = GradientBoostingClassifier(n_estimators=1000, # score: (AUC 0.82), learning_rate=0.0001, max_features=8
# classifier_GB = GradientBoostingClassifier(n_estimators=1000, # score: (AUC 0.82), learning_rate=0.0005, max_features=8
# classifier_GB = GradientBoostingClassifier(n_estimators=1000, # score: 0.78735 (AUC 0.76840), learning_rate=0.0005, max_features=8
# classifier_GB = GradientBoostingClassifier(n_estimators=200, # score: 0.94608 (AUC 0.78), learning_rate=0.0005, max_depth=4, min_samples_split=30, max_features=5
# classifier_GB = GradientBoostingClassifier(n_estimators=500, # score: 0.66030, default learning_rate=0.1
# classifier_GB = GradientBoostingClassifier(n_estimators=500, # score: 0.88464 (AUC 0.80), learning_rate=0.0035
# classifier_GB = GradientBoostingClassifier(n_estimators=500, # score: 0.85208 (AUC 0.72), learning_rate=0.0035, max_depth=5, min_samples_split=20, max_features=8
# classifier_GB = GradientBoostingClassifier(n_estimators=500, # score: 0.90691 (AUC 0.80), learning_rate=0.002
# classifier_GB = GradientBoostingClassifier(n_estimators=500, # score: 0.94608, learning_rate=0.0005, max_depth=5, min_samples_split=20
# classifier_GB = GradientBoostingClassifier(n_estimators=500, # score: 0.94608, learning_rate=0.0005, max_depth=5, min_samples_split=30
# classifier_GB = GradientBoostingClassifier(n_estimators=500, # score: 0.94608, learning_rate=0.0005, max_depth=5, min_samples_split=30, max_features=10
# classifier_GB = GradientBoostingClassifier(n_estimators=500, # score: 0.94608, learning_rate=0.0005, max_depth=5, min_samples_split=30, max_features=5
# classifier_GB = GradientBoostingClassifier(n_estimators=500, # score: 0.94608, learning_rate=0.0005, max_depth=4, min_samples_split=30, max_features=5
# classifier_GB = GradientBoostingClassifier(n_estimators=1000, # score: 0.79112 (AUC 0.70), learning_rate=0.0035, max_depth=5, min_samples_split=30, max_features=10
# classifier_GB = GradientBoostingClassifier(n_estimators=1000, # score: 0.94608, learning_rate=0.0005, max_depth=5, min_samples_split=30, max_features=5
# classifier_GB = GradientBoostingClassifier(n_estimators=1000, # score: 0.63623, default learning_rate=0.1
# classifier_GB = GradientBoostingClassifier(n_estimators=2000, # score: 0.77567, learning_rate=0.0035
    # loss='deviance',
    # subsample=1,
    # max_depth=5,
    # min_samples_split=20,
    learning_rate=0.001,
    max_features=8,
    random_state=rng)
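The commented-out lines above are a hand-kept grid-search log. A minimal sketch of the same search automated with scikit-learn's GridSearchCV, with candidate values taken from the log (the 3-fold CV and roc_auc scoring are assumptions, not part of the original run):
In [ ]:
from sklearn.model_selection import GridSearchCV

# Candidate values lifted from the manual tuning log above
param_grid = {
    'n_estimators': [200, 500, 1000],
    'learning_rate': [0.0005, 0.001, 0.002, 0.0035],
    'max_features': [5, 8, 10],
}
grid_GB = GridSearchCV(GradientBoostingClassifier(random_state=rng),
                       param_grid, scoring='roc_auc', cv=3)
# grid_GB.fit(X, y); grid_GB.best_params_ then replaces the manual bookkeeping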
In [144]:
# GB (regressor version; the score/AUC notes in this cell are carried over from the classifier tuning above)
# classifier_GB = GradientBoostingClassifier(n_estimators=500, # score: (AUC 0.77018), learning_rate=0.005, max_features=5
# classifier_GB = GradientBoostingClassifier(n_estimators=500, # score: (AUC 0.80), learning_rate=0.003, max_features=8
classifier_GB = GradientBoostingRegressor(n_estimators=1500, # classifier best: 0.94608 (AUC 0.81419), learning_rate=0.001, max_features=8 <<< Best
# classifier_GB = GradientBoostingClassifier(n_estimators=500, # score: (AUC 0.82), learning_rate=0.0005, max_features=8
# classifier_GB = GradientBoostingClassifier(n_estimators=500, # score: (AUC 0.82), learning_rate=0.0001, max_features=8
# classifier_GB = GradientBoostingClassifier(n_estimators=1000, # score: (AUC 0.82), learning_rate=0.0001, max_features=8
# classifier_GB = GradientBoostingClassifier(n_estimators=1000, # score: (AUC 0.82), learning_rate=0.0005, max_features=8
# classifier_GB = GradientBoostingClassifier(n_estimators=1000, # score: 0.78735 (AUC 0.76840), learning_rate=0.0005, max_features=8
# classifier_GB = GradientBoostingClassifier(n_estimators=200, # score: 0.94608 (AUC 0.78), learning_rate=0.0005, max_depth=4, min_samples_split=30, max_features=5
# classifier_GB = GradientBoostingClassifier(n_estimators=500, # score: 0.66030, default learning_rate=0.1
# classifier_GB = GradientBoostingClassifier(n_estimators=500, # score: 0.88464 (AUC 0.80), learning_rate=0.0035
# classifier_GB = GradientBoostingClassifier(n_estimators=500, # score: 0.85208 (AUC 0.72), learning_rate=0.0035, max_depth=5, min_samples_split=20, max_features=8
# classifier_GB = GradientBoostingClassifier(n_estimators=500, # score: 0.90691 (AUC 0.80), learning_rate=0.002
# classifier_GB = GradientBoostingClassifier(n_estimators=500, # score: 0.94608, learning_rate=0.0005, max_depth=5, min_samples_split=20
# classifier_GB = GradientBoostingClassifier(n_estimators=500, # score: 0.94608, learning_rate=0.0005, max_depth=5, min_samples_split=30
# classifier_GB = GradientBoostingClassifier(n_estimators=500, # score: 0.94608, learning_rate=0.0005, max_depth=5, min_samples_split=30, max_features=10
# classifier_GB = GradientBoostingClassifier(n_estimators=500, # score: 0.94608, learning_rate=0.0005, max_depth=5, min_samples_split=30, max_features=5
# classifier_GB = GradientBoostingClassifier(n_estimators=500, # score: 0.94608, learning_rate=0.0005, max_depth=4, min_samples_split=30, max_features=5
# classifier_GB = GradientBoostingClassifier(n_estimators=1000, # score: 0.79112 (AUC 0.70), learning_rate=0.0035, max_depth=5, min_samples_split=30, max_features=10
# classifier_GB = GradientBoostingClassifier(n_estimators=1000, # score: 0.94608, learning_rate=0.0005, max_depth=5, min_samples_split=30, max_features=5
# classifier_GB = GradientBoostingClassifier(n_estimators=1000, # score: 0.63623, default learning_rate=0.1
# classifier_GB = GradientBoostingClassifier(n_estimators=2000, # score: 0.77567, learning_rate=0.0035
    # loss='deviance',
    # subsample=1,
    # max_depth=5,
    # min_samples_split=20,
    learning_rate=0.002,
    # max_features=10,
    random_state=rng)
In [139]:
# AB
# classifier_AB = AdaBoostClassifier(n_estimators=100, # score: (AUC 0.81618), learning_rate=0.004
# classifier_AB = AdaBoostClassifier(n_estimators=500, # score: 0.94608 (AUC 0.88), learning_rate=0.005
# classifier_AB = AdaBoostClassifier(n_estimators=1000, # score: (AUC 0.87), learning_rate=0.01
# classifier_AB = AdaBoostClassifier(n_estimators=1000, # score: (AUC 0.88), learning_rate=0.0075
# classifier_AB = AdaBoostClassifier(n_estimators=1000, # score: (AUC 0.75), learning_rate=0.0001 <<< pre-matured
# classifier_AB = AdaBoostClassifier(n_estimators=1000, # score: (AUC 0.88), learning_rate=0.005
# classifier_AB = AdaBoostClassifier(n_estimators=1000, # score: (AUC 0.88), learning_rate=0.0025
# classifier_AB = AdaBoostClassifier(n_estimators=500, # score: 0.94608 (AUC 0.88), learning_rate=0.0035
# classifier_AB = AdaBoostClassifier(n_estimators=100, # score: 0.94608 (AUC 0.77), learning_rate=0.002
# classifier_AB = AdaBoostClassifier(n_estimators=500, # score: 0.94608 (AUC 0.85), learning_rate=0.002
# classifier_AB = AdaBoostClassifier(n_estimators=500, # score: 0.94608, learning_rate=0.0005
# classifier_AB = AdaBoostClassifier(n_estimators=1000, # score: 0.943130082 (AUC 0.88207), learning_rate=0.0035
classifier_AB = AdaBoostRegressor(n_estimators=1500, # score: 0.93948 (AUC 0.88339), learning_rate=0.004 <<< Best
# classifier_AB = AdaBoostClassifier(n_estimators=1500, # score: 0.92686 (AUC 0.88), learning_rate=0.0035
# classifier_AB = AdaBoostClassifier(n_estimators=2000, # score: 0.63941, default learning_rate=0.1
# classifier_AB = AdaBoostClassifier(n_estimators=2000, # score: 0.90117 (AUC 0.87984), learning_rate=0.0035
# classifier_AB = AdaBoostClassifier(n_estimators=2000, # score: 0.89056 (AUC 0.88), learning_rate=0.004
# classifier_AB = AdaBoostClassifier(n_estimators=2000, # score: 0.83374 (AUC 0.83), learning_rate=0.01
    learning_rate=0.002,
    random_state=rng)
In [138]:
# RF
# classifier_RF = RandomForestClassifier(n_estimators=100, # score: (AUC 0.66888), max_depth=8, min_samples_split=20,
# classifier_RF = RandomForestClassifier(n_estimators=500, # score: 0.90469, max_depth=5, min_samples_split=20,
# classifier_RF = RandomForestClassifier(n_estimators=500, # score: 0.90540, max_depth=5, min_samples_split=30,
# classifier_RF = RandomForestClassifier(n_estimators=500, # score: 0.93005, max_depth=4, min_samples_split=30,
# classifier_RF = RandomForestClassifier(n_estimators=500, # score: 0.93099, max_depth=4, min_samples_split=40,
# classifier_RF = RandomForestClassifier(n_estimators=500, # score: 0.93794 (AUC 0.82), max_depth=3, min_samples_split=20,
# classifier_RF = RandomForestClassifier(n_estimators=200, # score: 0.93771, max_depth=3, min_samples_split=20,
# classifier_RF = RandomForestClassifier(n_estimators=1000, # score: 0.90493, max_depth=5, min_samples_split=30,
classifier_RF = RandomForestRegressor(n_estimators=1500, # score: 0.94207 (AUC 0.81870), max_depth=3, min_samples_split=20, <<< Best
# classifier_RF = RandomForestClassifier(n_estimators=500, # score: 0.88900, max_depth=5, min_samples_split=30, max_features=10
# classifier_RF = RandomForestClassifier(n_estimators=1000, # score: 0.88864, max_depth=5, min_samples_split=30, max_features=10
# classifier_RF = RandomForestClassifier(n_estimators=500, # score: 0.77154
# classifier_RF = RandomForestClassifier(n_estimators=1000, # score: 0.76469
# classifier_RF = RandomForestClassifier(n_estimators=2000, # score: 0.76564
    # max_features=10,
    # max_depth=3,
    # min_samples_split=20,
    random_state=rng)
In [165]:
# ET
# classifier_ET = ExtraTreesClassifier(n_estimators=500, # score: 0.70973
# classifier_ET = ExtraTreesClassifier(n_estimators=500, # score: 0.93382 (AUC 0.81), max_depth=5, min_samples_split=30, max_features=10
classifier_ET = ExtraTreesRegressor(n_estimators=1500, # score: 0.94655 (AUC 0.84364), max_depth=3, min_samples_split=20, max_features=10 <<< Best
# classifier_ET = ExtraTreesClassifier(n_estimators=1000, # score: 0.93276, max_depth=5, min_samples_split=30, max_features=10
# classifier_ET = ExtraTreesClassifier(n_estimators=1000, # score: 0.94572, max_depth=4, min_samples_split=30, max_features=5
# classifier_ET = ExtraTreesClassifier(n_estimators=1000, # score: 0.94608 (AUC 0.82), max_depth=3, min_samples_split=30, max_features=5
# classifier_ET = ExtraTreesClassifier(n_estimators=1000, # score: (AUC 0.84077), max_depth=3, min_samples_split=20, max_features=10
# classifier_ET = ExtraTreesClassifier(n_estimators=1000, # score: 0.93241, max_depth=5, min_samples_split=20, max_features=10
# classifier_ET = ExtraTreesClassifier(n_estimators=1000, # score: 0.71067
# classifier_ET = ExtraTreesClassifier(n_estimators=2000, # score: 0.71149
    # max_depth=3,
    # min_samples_split=20,
    # max_features=10,
    random_state=rng)
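Since the extra-trees model ends up being the one selected below, its feature_importances_ attribute (available after fitting) can be paired with X_col to see which bidding features drive the prediction. A sketch, using the operator module imported at the top:
In [ ]:
# Rank features by importance once classifier_ET has been fit
importance_ranking = sorted(zip(X_col, classifier_ET.feature_importances_),
                            key=operator.itemgetter(1), reverse=True)
# for name, score in importance_ranking: print(name, round(score, 4))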
In [166]:
# BG
# classifier_BG = BaggingClassifier(n_estimators=500, # score: 0.77035, max_features=20
# classifier_BG = BaggingClassifier(n_estimators=500, # score: 0.78085 (AUC 0.57037), max_features=10
classifier_BG = BaggingRegressor(n_estimators=500, # score: 0.70725 (AUC 0.63729) <<< Best
# classifier_BG = BaggingClassifier(n_estimators=200, # score: 0.77707, max_features=10
# classifier_BG = BaggingClassifier(n_estimators=1000, # score: 0.78096 (AUC 0.56), max_features=10
# classifier_BG = BaggingClassifier(n_estimators=2000, # score: (AUC 0.51506), max_features=10
# classifier_BG = BaggingClassifier(n_estimators=500, # score: 0.76553, max_features=5
# classifier_BG = BaggingClassifier(n_estimators=500, # score: 0.70181
# classifier_BG = BaggingClassifier(n_estimators=1000, # score: 0.69779
# classifier_BG = BaggingClassifier(n_estimators=2000, # score: 0.70004
    # max_features=10,
    random_state=rng)
In [167]:
classifier_LR = LinearRegression() # score: 0.90199 (AUC 0.80569)
# plot_roc(classifier_LR)
In [178]:
# classifier_SVCL = svm.SVC(kernel='linear', probability=True, random_state=rng) # score: 0.89976 (AUC 0.70524)
classifier_SVRL = svm.SVR(kernel='linear') # linear SVR, mirroring the SVC line above (SVR defaults to rbf, so the kernel must be set explicitly); score: 0.89976 (AUC 0.70524)
# plot_roc(classifier_SVCL)
In [169]:
# classifier_SVCR = svm.SVC(kernel='rbf', probability=True, random_state=rng) # score: 0.80188 (AUC 0.50050)
classifier_SVRR = svm.SVR(kernel='rbf') # score: 0.80188 (AUC 0.50050)
# plot_roc(classifier_SVCR)
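Unlike the tree ensembles, SVR is sensitive to feature scale, and the StandardScaler line commented out earlier was never applied. A sketch (not part of the original run) that scales inside a Pipeline so each training fold fits its own scaler:
In [ ]:
from sklearn.pipeline import make_pipeline

# Scaling happens inside the pipeline, so the test fold never leaks into the scaler
classifier_SVRR_scaled = make_pipeline(StandardScaler(), svm.SVR(kernel='rbf'))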
In [187]:
# Pick the model used in the backtest below. The trailing numbers appear to be
# each model's mean per-fold error (k_mean, computed further down); lower is better.
# classifier = classifier_GB # 0.07854768301
# classifier = classifier_AB # 0.0846678136924
# classifier = classifier_RF # 0.0791521661296
classifier = classifier_ET # 0.0743745076978 <<< lowest error
# classifier = classifier_BG # 0.0787926960258
# classifier = classifier_LR # 0.0846378641917 (171079173108.0)
# classifier = classifier_SVRL # 0.0964908184664
# classifier = classifier_SVRR # 0.0964908184664
In [189]:
n_splits = 19  # 19 records (one per second) in each bidding month
n_fold = 19
# Manual single-batch version of the backtest loop below; batch in range [1, 18]
# X_train_1 = X[0:(len(X)-batch*n_splits)]
# y_train_1 = y[0:(len(X)-batch*n_splits)]
# X_test_1 = X[(len(X)-batch*n_splits):((len(X)-batch*n_splits)+n_splits)]
# y_test_1 = y[(len(X)-batch*n_splits):((len(X)-batch*n_splits)+n_splits)]
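The loop in the next cell is a rolling-origin backtest: each fold trains on all data before a cutoff and predicts the following 19-record month. For reference, newer scikit-learn releases (0.24+) can produce the same kind of splits with TimeSeriesSplit; a sketch, not used in the original run:
In [ ]:
from sklearn.model_selection import TimeSeriesSplit

# One 19-record bidding month per test fold, with the training window growing each fold
tscv = TimeSeriesSplit(n_splits=n_fold - 1, test_size=n_splits)
# for train_idx, test_idx in tscv.split(X): ...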
In [190]:
y_pred = {}
y_test = {}
i = 0
# Rolling-origin backtest: train on everything before the cutoff month,
# then predict the next 19-record month.
for batch in range(1, n_fold):
    X_train_1 = X[0:(len(X) - batch * n_splits)]
    y_train_1 = y[0:(len(X) - batch * n_splits)]
    X_test_1 = X[(len(X) - batch * n_splits):((len(X) - batch * n_splits) + n_splits)]
    y_test_1 = y[(len(X) - batch * n_splits):((len(X) - batch * n_splits) + n_splits)]
    print(len(X_train_1))
    y_pred[i] = classifier.fit(X_train_1, y_train_1).predict(X_test_1)
    # y_pred[i] = classifier.fit(X_train_1, y_train_1).predict(X_test_1[0:10])
    # y_pred[i] = classifier.fit(X_train_1, y_train_1).predict(X_test_1[10:19])
    y_test[i] = y_test_1
    plt.figure()
    plt.plot(y_train_1)
    plt.show()
    plt.figure()
    plt.plot(y_test[i])
    plt.plot(y_pred[i])
    plt.show()
    i += 1
In [191]:
k = []
for i in range(0, len(y_test)):
    # np.sqrt(np.square(e)) == np.abs(e), so k[i] is the fold's mean absolute error
    k.append(np.mean(np.sqrt(np.square(y_test[i] - y_pred[i]))))
k_mean = np.mean(k)
print(k_mean)
print()
print(k)
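Since np.sqrt(np.square(e)) is simply np.abs(e), each k[i] above is the fold's mean absolute error. If root-mean-square error was intended instead, the square root must be taken after averaging; a sketch:
In [ ]:
# Per-fold RMSE: average the squared errors first, then take the square root
rmse = [np.sqrt(np.mean(np.square(y_test[i] - y_pred[i])))
        for i in range(len(y_test))]
print(np.mean(rmse))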
In [174]:
y_pred_df = pd.DataFrame.from_dict(y_pred)
In [175]:
# The column list must match the number of folds collected in y_pred above
# (seven here, one per predicted bidding month).
y_pred_df.columns = ['month 7', 'month 6', 'month 5', 'month 4', 'month 3', 'month 2', 'month 1']
In [176]:
y_pred_df.to_csv('bid_results_v001.csv', index=False)
In [177]:
y_pred_df
Out[177]:
In [ ]:
cv = StratifiedKFold(n_splits=2, shuffle=False)  # superseded by the 3-fold split in the next cell
In [ ]:
# n_splits=6
# Run the chosen model with cross-validation and plot actual vs predicted values per fold
cv = StratifiedKFold(n_splits=3, shuffle=False)
# colors = cycle(['cyan', 'indigo', 'seagreen', 'yellow', 'blue', 'darkorange'])
y_pred = {}
y_test = {}
i = 0
for (train, test) in cv.split(X, y):
    # if i==0: print(X[train])
    # probas_ = classifier.fit(X[train], y[train]).predict_proba(X[test])
    y_pred[i] = classifier.fit(X[train], y[train]).predict(X[test])
    y_test[i] = y[test]
    plt.figure()
    plt.plot(y[train])
    plt.show()
    plt.figure()
    plt.plot(y_test[i])
    plt.plot(y_pred[i])
    plt.show()
    i += 1
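StratifiedKFold preserves class proportions and therefore expects a discrete target; with the regressors used here and a continuous target, plain KFold is the drop-in replacement (a sketch; the stratified version presumably dates from when the binary classifiers were in use):
In [ ]:
from sklearn.model_selection import KFold

cv = KFold(n_splits=3, shuffle=False)  # no stratification needed for a continuous target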
In [ ]:
k = []
for i in range(0, len(y_test)):
    k.append(np.mean(np.sqrt(np.square(y_test[i] - y_pred[i]))))  # per-fold mean absolute error
k_mean = np.mean(k)
print(k_mean)
print()
print(k)
In [ ]:
plt.plot(y_test[0])
plt.plot(y_pred[0])
plt.show()