Sliding window hold

  • K-Fold와 달리, 시간 순서(기간)가 있는 데이터에서 Train 구간을 누적해서 늘려 가며 검증하는 Holdout 방식 (expanding window)
  • 주로 시계열 데이터를 다룰 때 사용
  • 참고 문서

In [37]:
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb

In [3]:
# Toy data set: one row per calendar day for 2019-01-01 .. 2019-01-30, with a
# constant string feature and a constant label of 1.
date_index = pd.date_range('2019-01-01', '2019-01-30', freq='1D')
X = pd.DataFrame({'date': date_index, 'dummy': 'a', 'label': 1})

In [7]:
# Target vector: the `label` column of the feature frame.
y = X.loc[:, 'label']

In [13]:
# Remove the target from the feature frame so it cannot be used as a feature.
# `errors='ignore'` keeps this cell re-runnable: a second run is a no-op
# instead of raising KeyError because the column is already gone.
X = X.drop(columns=['label'], errors='ignore')

In [14]:
# Preview the first rows of the feature frame.
X.head()


Out[14]:
date dummy
0 2019-01-01 a
1 2019-01-02 a
2 2019-01-03 a
3 2019-01-04 a
4 2019-01-05 a

In [156]:
# Assumes the rows are already sorted by date: TimeSeriesSplit splits purely by
# row position, so chronological order must be established beforehand.
tscv = TimeSeriesSplit(n_splits=5)
for train_index, test_index in tscv.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    # Select rows by the positions the splitter returned rather than by
    # first/last slice arithmetic, so the selection stays correct even when
    # the test block does not start right after the train block.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    print("X_train")
    print(X_train.head(3))
    print()
    print("X_test")
    print(X_test.head(3))


TRAIN: [0 1 2 3 4] TEST: [5 6 7 8 9]
X_train
   date  dummy  date_encoding
0     0      0              0
1     1      0              1
2     2      0              2

X_test
   date  dummy  date_encoding
5     5      0              5
6     6      0              6
7     7      0              7
TRAIN: [0 1 2 3 4 5 6 7 8 9] TEST: [10 11 12 13 14]
X_train
   date  dummy  date_encoding
0     0      0              0
1     1      0              1
2     2      0              2

X_test
    date  dummy  date_encoding
10    10      0             10
11    11      0             11
12    12      0             12
TRAIN: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14] TEST: [15 16 17 18 19]
X_train
   date  dummy  date_encoding
0     0      0              0
1     1      0              1
2     2      0              2

X_test
    date  dummy  date_encoding
15    15      0             15
16    16      0             16
17    17      0             17
TRAIN: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19] TEST: [20 21 22 23 24]
X_train
   date  dummy  date_encoding
0     0      0              0
1     1      0              1
2     2      0              2

X_test
    date  dummy  date_encoding
20    20      0             20
21    21      0             21
22    22      0             22
TRAIN: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24] TEST: [25 26 27 28 29]
X_train
   date  dummy  date_encoding
0     0      0              0
1     1      0              1
2     2      0              2

X_test
    date  dummy  date_encoding
25    25      0             25
26    26      0             26
27    27      0             27

In [154]:
# Same split with 3 folds, to show how the fold count changes the window sizes.
tscv = TimeSeriesSplit(n_splits=3)
for train_index, test_index in tscv.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    # Positional selection with the exact indices returned by the splitter.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]


TRAIN: [0 1 2 3 4 5 6 7 8] TEST: [ 9 10 11 12 13 14 15]
TRAIN: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15] TEST: [16 17 18 19 20 21 22]
TRAIN: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22] TEST: [23 24 25 26 27 28 29]

In [155]:
# Same split with 10 folds, to show how the fold count changes the window sizes.
tscv = TimeSeriesSplit(n_splits=10)
for train_index, test_index in tscv.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    # Positional selection with the exact indices returned by the splitter.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]


TRAIN: [0 1 2 3 4 5 6 7 8 9] TEST: [10 11]
TRAIN: [ 0  1  2  3  4  5  6  7  8  9 10 11] TEST: [12 13]
TRAIN: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13] TEST: [14 15]
TRAIN: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15] TEST: [16 17]
TRAIN: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17] TEST: [18 19]
TRAIN: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19] TEST: [20 21]
TRAIN: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21] TEST: [22 23]
TRAIN: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23] TEST: [24 25]
TRAIN: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25] TEST: [26 27]
TRAIN: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27] TEST: [28 29]

Simple Modeling


In [23]:
# Encoder used below to turn the `date` column into integer codes.
le = LabelEncoder()

In [24]:
# Look at the frame before encoding.
X.head(2)


Out[24]:
date dummy
0 2019-01-01 a
1 2019-01-02 a

In [25]:
# Fit the encoder on the `date` column so each distinct date gets an integer code.
# NOTE(review): scikit-learn documents LabelEncoder as intended for target
# labels; OrdinalEncoder is the feature-side equivalent — consider switching.
le.fit(X['date'])


Out[25]:
LabelEncoder()

In [40]:
# Replace the datetime column with its integer code so the frame handed to the
# model is numeric. The dtype check (numpy kind 'M' == datetime64) makes the
# cell safe to re-run: once `date` already holds integer codes, transforming
# again would raise because the encoder was fitted on timestamps.
if X['date'].dtype.kind == 'M':
    X['date'] = le.transform(X['date'])
# Store the constant feature as a numeric 0.
X['dummy'] = 0

In [42]:
# Check the frame after encoding.
# NOTE(review): the saved output also lists a `date_encoding` column that no
# cell in this notebook creates — presumably left over from a deleted or edited
# cell. Re-run from a fresh kernel to confirm the real column set.
X.head(2)


Out[42]:
date dummy date_encoding
0 0 0 0
1 1 0 1

In [153]:
# For every fold, fit a fresh XGBoost regressor on the training rows and store
# its predictions for that fold's test rows (one list per fold in `xgb_preds`).
# Rows are assumed to be in chronological order, since TimeSeriesSplit splits
# by row position.
xgb_preds = []
tscv = TimeSeriesSplit(n_splits=4)
for train_index, test_index in tscv.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    # Positional selection with the exact indices returned by the splitter.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    print(len(X_train), len(y_train), len(X_test))
    # The model uses library defaults apart from the number of trees; no
    # custom hyper-parameter dict is passed to it.
    xgb_regressor = xgb.XGBRegressor(n_estimators=1000)
    xgb_model = xgb_regressor.fit(X_train, y_train, verbose=False)

    xgb_pred = xgb_model.predict(X_test)
    xgb_preds.append(list(xgb_pred))


TRAIN: [0 1 2 3 4 5] TEST: [ 6  7  8  9 10 11]
6 6 6
TRAIN: [ 0  1  2  3  4  5  6  7  8  9 10 11] TEST: [12 13 14 15 16 17]
12 12 6
TRAIN: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17] TEST: [18 19 20 21 22 23]
18 18 6
TRAIN: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23] TEST: [24 25 26 27 28 29]
24 24 6

In [144]:
# Average the fold predictions position by position: element i of `preds` is
# the mean of the i-th prediction of every fold. The fold count is taken from
# `xgb_preds` itself, and no local variable named `sum` is created, so the
# builtin stays usable in later cells.
# NOTE(review): each fold predicts a different set of dates, so this mixes
# predictions for different rows — confirm that is the intended aggregation.
n_folds = len(xgb_preds)
# zip(*...) walks the folds in parallel and stops at the shortest fold.
preds = [sum(fold_values) / n_folds for fold_values in zip(*xgb_preds)]

output = pd.DataFrame({'id': 'unknown', 'target': preds})

In [145]:
# Display the averaged predictions.
output


Out[145]:
id target
0 unknown 1.0
1 unknown 1.0
2 unknown 1.0
3 unknown 1.0
4 unknown 1.0
5 unknown 1.0

In [ ]: