In [1]:
import operator
import numpy as np
import pandas as pd
import xgboost as xgb
from ta import add_all_ta_features
import settings  # local module with the PERCENT_UP / PERCENT_DOWN thresholds
import utils     # local helpers: dropna, timestamptodate, split_df2, metrics
In [3]:
df = pd.read_csv('data/datas-Hourly.csv', sep=',')
In [4]:
print('Number of rows: {}, Number of columns: {}'.format(*df.shape))
In [5]:
df = utils.dropna(df)
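utils.dropna is a local helper; a minimal sketch of the assumed behavior (the real implementation may also filter extreme values):

def dropna(df):
    # Drop rows containing NaN values so the indicator calculations get clean inputs
    return df.dropna()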
In [6]:
print('Number of rows: {}, Number of columns: {}'.format(*df.shape))
In [7]:
# Use the next hour's close as the prediction target
df['Close_target'] = df['Close'].shift(-1)
df = df.dropna()  # the last row has no next close
Create a Target column with the classes [UP, KEEP, DOWN]
In [8]:
df["Target"] = 0 # 'KEEP'
df.loc[df.Close + (df.Close * settings.PERCENT_UP) < df.Close_target, "Target"] = 1 # 'UP'
df.loc[df.Close - (df.Close * settings.PERCENT_DOWN) > df.Close_target, "Target"] = 2 # 'DOWN'
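The PERCENT_UP and PERCENT_DOWN thresholds come from the local settings module. As a worked example, assuming PERCENT_UP = PERCENT_DOWN = 0.01 (hypothetical values; the real settings may differ):

# Close = 100.0 -> UP threshold 101.0, DOWN threshold 99.0
# Close_target = 101.5 -> Target = 1 (UP)
# Close_target = 98.7  -> Target = 2 (DOWN)
# Close_target = 100.4 -> Target = 0 (KEEP)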
In [9]:
print('Number of rows: {}, Number of columns: {}'.format(*df.shape))
print('Number of UP rows: {}, Number of DOWN rows: {}'.format(len(df[df.Target == 1]), len(df[df.Target == 2])))
Create Date, Year, Month, Week, Weekday, Day and Hour columns from the Timestamp
In [10]:
df['Date'] = df['Timestamp'].apply(utils.timestamptodate)
df['Date'] = pd.to_datetime(df['Date'])
df['Year'] = df['Date'].dt.year
df['Month'] = df['Date'].dt.month
df['Week'] = df['Date'].dt.isocalendar().week.astype(int)  # weekofyear is deprecated in pandas
df['Weekday'] = df['Date'].dt.weekday
df['Day'] = df['Date'].dt.day
df['Hour'] = df['Date'].dt.hour
In [11]:
print('Number of rows: {}, Number of columns: {}'.format(*df.shape))
In [12]:
# Append the ta library's volume, volatility, trend and momentum indicator columns
df = add_all_ta_features(df, "Open", "High", "Low", "Close", "Volume_BTC", fillna=True)
In [13]:
print('Number of rows: {}, Number of columns: {}'.format(*df.shape))
df = df.dropna()
print('Number of rows: {}, Number of columns: {}'.format(*df.shape))
In [14]:
train, test = utils.split_df2(df)
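utils.split_df2 is a local helper; since the rows are ordered in time, the split should be chronological rather than shuffled so the model never trains on future data. A minimal sketch, assuming an 80/20 time-ordered split (the ratio is an assumption):

def split_df2(df, train_fraction=0.8):
    # Chronological split: earlier rows become the train set, later rows the test set
    cut = int(len(df) * train_fraction)
    return df.iloc[:cut], df.iloc[cut:]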
In [15]:
excl = ['Close_target', 'Target', 'Date', 'Timestamp']  # targets and raw date columns are not features
cols = [c for c in df.columns if c not in excl]
In [16]:
y_train = train['Target']
xgb_params = {
    'eta': 0.0045,                 # small learning rate
    'max_depth': 20,
    'subsample': 1,
    'colsample_bytree': 0.95,
    'colsample_bylevel': 0.95,
    'objective': 'multi:softmax',  # predict the class label directly
    'num_class': 3,                # KEEP / UP / DOWN
    'eval_metric': 'mlogloss',
    'base_score': 0,
    'verbosity': 0                 # replaces the deprecated 'silent' flag
}
dtrain = xgb.DMatrix(train[cols], y_train)
dtest = xgb.DMatrix(test[cols])
# xgboost cross-validation: uncomment to search for the best number of boosting rounds
"""
cv_result = xgb.cv(xgb_params,
                   dtrain,
                   num_boost_round=5000,
                   early_stopping_rounds=50,
                   verbose_eval=50,
                   show_stdv=False
                   )
num_boost_rounds = len(cv_result)
"""
num_boost_rounds = 705  # value from an earlier cross-validation run
print(num_boost_rounds)
# train
model = xgb.train(xgb_params, dtrain, num_boost_round=num_boost_rounds)
# predict: with multi:softmax, predict() returns the class label for each row
y_pred = model.predict(dtest)
y_true = test['Target']
utils.metrics(y_true, y_pred)
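utils.metrics is another local helper; a minimal sketch using scikit-learn (an assumption, not the notebook's actual implementation):

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

def metrics(y_true, y_pred):
    # multi:softmax returns the labels as floats, so cast before comparing
    y_pred = np.asarray(y_pred).astype(int)
    print('Accuracy: {:.4f}'.format(accuracy_score(y_true, y_pred)))
    print(confusion_matrix(y_true, y_pred))
    print(classification_report(y_true, y_pred, target_names=['KEEP', 'UP', 'DOWN']))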
print("\n \n \n \n \n \n ********** WEIGHT ************")
importance = model.get_fscore()
importance = sorted(importance.items(), key=operator.itemgetter(1))
for i in importance:
print (i)
print("\n \n \n \n \n \n ********** GAIN ************")
importance = model.get_score(fmap='', importance_type='gain')
importance = sorted(importance.items(), key=operator.itemgetter(1))
for i in importance:
print (i)
print("\n \n \n \n \n \n ********** COVER ************")
importance = model.get_score(fmap='', importance_type='cover')
importance = sorted(importance.items(), key=operator.itemgetter(1))
for i in importance:
print (i)