In [37]:
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
In [3]:
# Toy dataset: one row per calendar day from 2019-01-01 through 2019-01-30,
# with a constant string feature ('dummy') and a constant target ('label').
date_index = pd.date_range(start='2019-01-01', end='2019-01-30', freq='1D')
X = pd.DataFrame({'date': date_index}).assign(dummy='a', label=1)
In [7]:
# Keep the target column as its own Series before it is removed from X.
y = X['label']
In [13]:
# Remove the target column from X in place so X holds only feature columns.
del X['label']
In [14]:
# Preview the first rows of the feature frame.
X.head()
Out[14]:
In [156]:
# Assumes the data is already sorted by date.
# Time-ordered cross-validation with 5 splits: print every fold's indices and
# a short preview of its train / test feature rows.
tscv = TimeSeriesSplit(n_splits=5)
for train_index, test_index in tscv.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    # Select rows by the positional indices the splitter returned instead of
    # rebuilding a slice from their endpoints: .iloc is unambiguous for the
    # integer-indexed Series y, and it stays correct if the splitter is ever
    # configured with a gap between the train and test ranges.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    print("X_train")
    print(X_train.head(3))
    print()
    print("X_test")
    print(X_test.head(3))
In [154]:
# Same time-ordered split with 3 folds, to show how the fold boundaries move
# when n_splits changes.
tscv = TimeSeriesSplit(n_splits=3)
for train_index, test_index in tscv.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    # Positional selection with the indices the splitter returned; avoids
    # rebuilding slices from endpoints and the label-vs-position ambiguity of
    # [] slicing on the integer-indexed Series y.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
In [155]:
# Same time-ordered split with 10 folds, to show how the fold boundaries move
# when n_splits changes.
tscv = TimeSeriesSplit(n_splits=10)
for train_index, test_index in tscv.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    # Positional selection with the indices the splitter returned; avoids
    # rebuilding slices from endpoints and the label-vs-position ambiguity of
    # [] slicing on the integer-indexed Series y.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
In [23]:
# Encoder used below to map each distinct 'date' value to an integer code.
le = LabelEncoder()
In [24]:
# Show the first two feature rows.
X.head(2)
Out[24]:
In [25]:
# Fit the encoder on the values of the 'date' column.
le.fit(X['date'])
Out[25]:
In [40]:
# Replace each date with its integer code from the fitted encoder and overwrite
# the string 'dummy' column with a numeric constant.
# The dtype guard keeps this cell re-runnable: once 'date' already holds integer
# codes, calling transform again would raise ValueError because those values
# were not seen during fit.
if pd.api.types.is_datetime64_any_dtype(X['date']):
    X['date'] = le.transform(X['date'])
X['dummy'] = 0
In [42]:
# Show the first two feature rows.
X.head(2)
Out[42]:
In [153]:
# Train one XGBoost regressor per time-ordered fold and collect each fold's
# test-set predictions in xgb_preds (one list of predictions per fold).

# NOTE(review): xgb_params is never passed to the estimator, so the model runs
# with XGBRegressor defaults apart from n_estimators. The dict is loop-invariant
# and is therefore built once here; it is kept so the name stays defined.
# Confirm whether these settings were meant to be applied.
xgb_params = {'eta': 0.02, 'max_depth': 4, 'subsample': 0.9, 'colsample_bytree': 0.9, 'objective': 'binary:logistic', 'eval_metric': 'auc', 'seed': 99, 'silent': True}

xgb_preds = []
tscv = TimeSeriesSplit(n_splits=4)
for train_index, test_index in tscv.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    # Select rows by the positional indices the splitter returned instead of
    # rebuilding a slice from their endpoints: .iloc is unambiguous for the
    # integer-indexed Series y, and it stays correct if the splitter is ever
    # configured with a gap between the train and test ranges.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    print(len(X_train), len(y_train), len(X_test))
    # A fresh estimator per fold so nothing learned on one fold carries over.
    xgb_regressor = xgb.XGBRegressor(n_estimators=1000)
    xgb_model = xgb_regressor.fit(X_train, y_train, verbose=False)
    xgb_pred = xgb_model.predict(X_test)
    xgb_preds.append(list(xgb_pred))
    # Disabled experiment (cross_val_score is not imported in this notebook):
    # print('cv', cross_val_score(xgb_model, X_train, y_train, cv=tscv, scoring='accuracy'))
In [144]:
# Element-wise average of the per-fold prediction lists: preds[i] is the mean
# of the i-th prediction taken from every fold in xgb_preds.
# The fold count comes from xgb_preds itself rather than a hard-coded 4, and the
# builtin sum() is used instead of a variable named `sum`, which would shadow
# the builtin for every later cell in the notebook.
# NOTE(review): position i refers to a different test row in each fold, so
# confirm that averaging across folds by position is the intended result.
n_folds = len(xgb_preds)
preds = [sum(fold_values) / n_folds for fold_values in zip(*xgb_preds)]
output = pd.DataFrame({'id': 'unknown', 'target': preds})
In [145]:
# Display the averaged predictions table.
output
Out[145]:
In [ ]: